diff -r f345bda72bc4 -r 43e37759235e Symbian3/Examples/guid-6013a680-57f9-415b-8851-c4fa63356636/chartrans_8c_source.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Symbian3/Examples/guid-6013a680-57f9-415b-8851-c4fa63356636/chartrans_8c_source.html Tue Mar 30 16:16:55 2010 +0100 @@ -0,0 +1,736 @@ + + +
+ +00001 /* +00002 * chartrans.c +00003 * Copyright (C) 1999-2004 A.J. van Os; Released under GNU GPL +00004 * +00005 * Description: +00006 * Translate Word characters to local representation +00007 */ +00008 +00009 #include <stdlib.h> +00010 #include <string.h> +00011 #include <ctype.h> +00012 #if defined(__STDC_ISO_10646__) +00013 #include <wctype.h> +00014 #endif /* __STDC_ISO_10646__ */ +00015 #include "antiword.h" +00016 +00017 static const USHORT usCp850[] = { /* DOS implementation of Latin1 */ +00018 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, +00019 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, +00020 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, +00021 0x00ff, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x0192, +00022 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, +00023 0x00bf, 0x00ae, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, +00024 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x00c0, +00025 0x00a9, 0x2563, 0x2551, 0x2557, 0x255d, 0x00a2, 0x00a5, 0x2510, +00026 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x00e3, 0x00c3, +00027 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, +00028 0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x0131, 0x00cd, 0x00ce, +00029 0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580, +00030 0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe, +00031 0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4, +00032 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8, +00033 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0, +00034 }; +00035 +00036 static const USHORT usCp1250[] = { /* Windows implementation of Latin2 */ +00037 0x20ac, 0x003f, 0x201a, 0x003f, 0x201e, 0x2026, 0x2020, 0x2021, +00038 0x003f, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179, +00039 0x003f, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, +00040 0x003f, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a, +00041 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7, +00042 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b, +00043 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7, +00044 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c, +00045 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, +00046 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, +00047 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, +00048 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, +00049 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, +00050 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, +00051 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, +00052 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, +00053 }; +00054 +00055 static const USHORT usCp1251[] = { /* Windows implementation of Cyrillic */ +00056 0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021, +00057 0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f, +00058 0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, +00059 0x00f3, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f, +00060 0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7, +00061 0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407, +00062 0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7, +00063 0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457, +00064 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, +00065 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, +00066 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, +00067 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, +00068 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, +00069 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, +00070 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, +00071 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, +00072 }; +00073 +00074 static const USHORT usCp1252[] = { /* Windows implementation of Latin1 */ +00075 0x20ac, 0x003f, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021, +00076 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x003f, 0x017d, 0x003f, +00077 0x003f, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, +00078 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x003f, 0x017e, 0x0178, +00079 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, +00080 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, +00081 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, +00082 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, +00083 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, +00084 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, +00085 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, +00086 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, +00087 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, +00088 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, +00089 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, +00090 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, +00091 }; +00092 +00093 static const USHORT usMacRoman[] = { /* Apple implementation of Latin1 */ +00094 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1, +00095 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8, +00096 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3, +00097 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc, +00098 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df, +00099 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8, +00100 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211, +00101 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x2126, 0x00e6, 0x00f8, +00102 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab, +00103 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153, +00104 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca, +00105 0x00ff, 0x0178, 0x2044, 0x00a4, 0x2039, 0x203a, 0xfb01, 0xfb02, +00106 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1, +00107 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4, +00108 0x003f, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc, +00109 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7, +00110 }; +00111 +00112 static const USHORT usPrivateArea[] = { +00113 0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220d, +00114 0x0028, 0x0029, 0x2217, 0x002b, 0x002c, 0x2212, 0x002e, 0x002f, +00115 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, +00116 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x2019, 0x003e, 0x003f, +00117 0x201d, 0x201c, 0x0392, 0x03a7, 0x0394, 0x0395, 0x03a6, 0x0393, +00118 0x0397, 0x0399, 0x03d1, 0x039a, 0x039b, 0x039c, 0x039d, 0x039f, +00119 0x03a0, 0x0398, 0x03a1, 0x03a3, 0x03a4, 0x03a5, 0x03c2, 0x03a9, +00120 0x039e, 0x03a8, 0x0396, 0x005b, 0x2234, 0x005d, 0x22a5, 0x005f, +00121 0x003f, 0x03b1, 0x03b2, 0x03c7, 0x03b4, 0x03b5, 0x03c6, 0x03b3, +00122 0x03b7, 0x03b9, 0x03d5, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03bf, +00123 0x03c0, 0x03b8, 0x03c1, 0x03c3, 0x03c4, 0x03c5, 0x03d6, 0x03c9, +00124 0x03be, 0x03c8, 0x03b6, 0x007b, 0x007c, 0x007d, 0x223c, 0x003f, +00125 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, +00126 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, +00127 0x003f, 0x003f, 0x003f, 0x2022, 0x003f, 0x003f, 0x003f, 0x003f, +00128 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, +00129 0x20ac, 0x03d2, 0x2032, 0x2264, 0x2044, 0x221e, 0x0192, 0x2663, +00130 0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193, +00131 0x00b0, 0x00b1, 0x2033, 0x2265, 0x00d7, 0x221d, 0x2202, 0x2022, +00132 0x00f7, 0x2260, 0x2261, 0x2248, 0x2026, 0x007c, 0x23af, 0x21b5, +00133 0x2135, 0x2111, 0x211c, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229, +00134 0x222a, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209, +00135 0x2220, 0x2207, 0x00ae, 0x00a9, 0x2122, 0x220f, 0x221a, 0x22c5, +00136 0x00ac, 0x2227, 0x2228, 0x21d4, 0x21d0, 0x21d1, 0x21d2, 0x21d3, +00137 0x22c4, 0x3008, 0x00ae, 0x00a9, 0x2122, 0x2211, 0x239b, 0x239c, +00138 0x239d, 0x23a1, 0x23a2, 0x23a3, 0x23a7, 0x23a8, 0x23a9, 0x23aa, +00139 0x003f, 0x3009, 0x222b, 0x2320, 0x23ae, 0x2321, 0x239e, 0x239f, +00140 0x23a0, 0x23a4, 0x23a5, 0x23a6, 0x23ab, 0x23ac, 0x23ad, 0x003f, +00141 }; +00142 +00143 typedef struct char_table_tag { +00144 UCHAR ucLocal; +00145 USHORT usUnicode; +00146 } char_table_type; +00147 +00148 static char_table_type atCharTable[256]; +00149 static size_t tNextPosFree = 0; +00150 +00151 +00152 /* +00153 * iCompare - compare two records +00154 * +00155 * Compares two records. For use by qsort(3C) and bsearch(3C). +00156 * +00157 * returns -1 if rec1 < rec2, 0 if rec1 == rec2, 1 if rec1 > rec2 +00158 */ +00159 static int +00160 iCompare(const void *pvRecord1, const void *pvRecord2) +00161 { +00162 USHORT usUnicode1, usUnicode2; +00163 +00164 usUnicode1 = ((char_table_type *)pvRecord1)->usUnicode; +00165 usUnicode2 = ((char_table_type *)pvRecord2)->usUnicode; +00166 +00167 if (usUnicode1 < usUnicode2) { +00168 return -1; +00169 } +00170 if (usUnicode1 > usUnicode2) { +00171 return 1; +00172 } +00173 return 0; +00174 } /* end of iCompare */ +00175 +00176 /* +00177 * pGetCharTableRecord - get the character table record +00178 * +00179 * returns a pointer to the record when found, otherwise NULL +00180 */ +00181 static const char_table_type * +00182 pGetCharTableRecord(USHORT usUnicode) +00183 { +00184 char_table_type tKey; +00185 +00186 if (tNextPosFree == 0) { +00187 return NULL; +00188 } +00189 tKey.usUnicode = usUnicode; +00190 tKey.ucLocal = 0; +00191 return (char_table_type *)bsearch(&tKey, +00192 atCharTable, +00193 tNextPosFree, sizeof(atCharTable[0]), +00194 iCompare); +00195 } /* end of pGetCharTableRecord */ +00196 +00197 /* +00198 * ucGetBulletCharacter - get the local representation of the bullet +00199 */ +00200 UCHAR +00201 ucGetBulletCharacter(conversion_type eConversionType, encoding_type eEncoding) +00202 { +00203 #if defined(__riscos) +00204 return 0x8f; +00205 #else +00206 const char_table_type *pRec; +00207 +00208 fail(eEncoding == encoding_utf_8); +00209 +00210 if (eEncoding == encoding_latin_1 && +00211 (eConversionType == conversion_ps || +00212 eConversionType == conversion_pdf)) { +00213 /* Ugly, but it makes the PostScript and PDF look better */ +00214 return (UCHAR)143; +00215 } +00216 if (eConversionType != conversion_text && +00217 eConversionType != conversion_fmt_text) { +00218 pRec = pGetCharTableRecord(UNICODE_BULLET); +00219 if (pRec != NULL) { +00220 return pRec->ucLocal; +00221 } +00222 pRec = pGetCharTableRecord(UNICODE_BULLET_OPERATOR); +00223 if (pRec != NULL) { +00224 return pRec->ucLocal; +00225 } +00226 pRec = pGetCharTableRecord(UNICODE_MIDDLE_DOT); +00227 if (pRec != NULL) { +00228 return pRec->ucLocal; +00229 } +00230 } +00231 return (UCHAR)'.'; +00232 #endif /* __riscos */ +00233 } /* end of ucGetBulletCharacter */ +00234 +00235 /* +00236 * ucGetNbspCharacter - get the local representation of the non-breaking space +00237 */ +00238 UCHAR +00239 ucGetNbspCharacter(void) +00240 { +00241 const char_table_type *pRec; +00242 +00243 pRec = pGetCharTableRecord(0x00a0); /* Unicode non-breaking space */ +00244 if (pRec == NULL) { +00245 DBG_MSG("Non-breaking space record not found"); +00246 /* No value found, use the best guess */ +00247 return (UCHAR)0xa0; +00248 } +00249 return pRec->ucLocal; +00250 } /* end of ucGetNbspCharacter */ +00251 +00252 /* +00253 * bReadCharacterMappingTable - read the mapping table +00254 * +00255 * Read the character mapping table from file and have the contents sorted +00256 * +00257 * returns TRUE if successful, otherwise FALSE +00258 */ +00259 BOOL +00260 bReadCharacterMappingTable(FILE *pFile) +00261 { +00262 char *pcTmp; +00263 ULONG ulUnicode; +00264 UINT uiLocal; +00265 int iFields; +00266 char szLine[81]; +00267 +00268 if (pFile == NULL) { +00269 return FALSE; +00270 } +00271 +00272 /* Clean the table first */ +00273 (void)memset(atCharTable, 0, sizeof(atCharTable)); +00274 +00275 /* Fill the table */ +00276 while (fgets(szLine, (int)sizeof(szLine), pFile)) { +00277 if (szLine[0] == '#' || +00278 szLine[0] == '\r' || +00279 szLine[0] == '\n') { +00280 /* Comment or empty line */ +00281 continue; +00282 } +00283 iFields = sscanf(szLine, "%x %lx %*s", &uiLocal, &ulUnicode); +00284 if (iFields != 2) { +00285 pcTmp = strchr(szLine, '\r'); +00286 if (pcTmp != NULL) { +00287 *pcTmp = '\0'; +00288 } +00289 pcTmp = strchr(szLine, '\n'); +00290 if (pcTmp != NULL) { +00291 *pcTmp = '\0'; +00292 } +00293 werr(0, "Syntax error in: '%s'", szLine); +00294 continue; +00295 } +00296 if (uiLocal > 0xff || ulUnicode > 0xffff) { +00297 werr(0, "Syntax error in: '%02x %04lx'", +00298 uiLocal, ulUnicode); +00299 continue; +00300 } +00301 /* Store only the relevant entries */ +00302 if (uiLocal != ulUnicode || uiLocal >= 0x80) { +00303 atCharTable[tNextPosFree].ucLocal = (UCHAR)uiLocal; +00304 atCharTable[tNextPosFree].usUnicode = (USHORT)ulUnicode; +00305 tNextPosFree++; +00306 } +00307 if (tNextPosFree >= elementsof(atCharTable)) { +00308 werr(0, "Too many entries in the character mapping " +00309 "file. Ignoring the rest."); +00310 break; +00311 } +00312 } +00313 +00314 if (tNextPosFree != 0) { +00315 DBG_HEX(atCharTable[0].usUnicode); +00316 DBG_HEX(atCharTable[tNextPosFree - 1].usUnicode); +00317 +00318 qsort(atCharTable, +00319 tNextPosFree, sizeof(atCharTable[0]), +00320 iCompare); +00321 +00322 DBG_HEX(atCharTable[0].usUnicode); +00323 DBG_HEX(atCharTable[tNextPosFree - 1].usUnicode); +00324 } +00325 +00326 return TRUE; +00327 } /* end of bReadCharacterMappingTable */ +00328 +00329 /* +00330 * ulTranslateCharacters - Translate characters to local representation +00331 * +00332 * Translate all characters to local representation +00333 * +00334 * returns the translated character +00335 */ +00336 ULONG +00337 ulTranslateCharacters(USHORT usChar, ULONG ulFileOffset, int iWordVersion, +00338 conversion_type eConversionType, encoding_type eEncoding, +00339 BOOL bUseMacCharSet) +00340 { +00341 const char_table_type *pTmp; +00342 const USHORT *usCharSet; +00343 +00344 usCharSet = NULL; +00345 if (bUseMacCharSet) { +00346 /* Macintosh character set */ +00347 usCharSet = usMacRoman; +00348 } else if (iWordVersion == 0) { +00349 /* DOS character set */ +00350 usCharSet = usCp850; +00351 } else { +00352 /* Windows character set */ +00353 switch (eEncoding) { +00354 case encoding_latin_2: +00355 usCharSet = usCp1250; +00356 break; +00357 case encoding_cyrillic: +00358 usCharSet = usCp1251; +00359 break; +00360 case encoding_latin_1: +00361 default: +00362 usCharSet = usCp1252; +00363 break; +00364 } +00365 } +00366 fail(usCharSet == NULL); +00367 if (usChar >= 0x80 && usChar <= 0x9f) { +00368 /* Translate implementation defined characters */ +00369 usChar = usCharSet[usChar - 0x80]; +00370 } else if (iWordVersion < 8 && usChar >= 0xa0 && usChar <= 0xff) { +00371 /* Translate old character set to Unixcode */ +00372 usChar = usCharSet[usChar - 0x80]; +00373 } +00374 +00375 /* Microsoft Unicode to real Unicode */ +00376 if (usChar >= 0xf020 && usChar <= 0xf0ff) { +00377 DBG_HEX_C(usPrivateArea[usChar - 0xf020] == 0x003f, usChar); +00378 usChar = usPrivateArea[usChar - 0xf020]; +00379 } +00380 +00381 /* Characters with a special meaning in Word */ +00382 switch (usChar) { +00383 case IGNORE_CHARACTER: +00384 case FOOTNOTE_SEPARATOR: +00385 case FOOTNOTE_CONTINUATION: +00386 case ANNOTATION: +00387 case FRAME: +00388 case LINE_FEED: +00389 case WORD_SOFT_HYPHEN: +00390 case UNICODE_HYPHENATION_POINT: +00391 return IGNORE_CHARACTER; +00392 case PICTURE: +00393 case TABLE_SEPARATOR: +00394 case TAB: +00395 case HARD_RETURN: +00396 case PAGE_BREAK: +00397 case PAR_END: +00398 case COLUMN_FEED: +00399 return (ULONG)usChar; +00400 case FOOTNOTE_OR_ENDNOTE: +00401 NO_DBG_HEX(ulFileOffset); +00402 switch (eGetNotetype(ulFileOffset)) { +00403 case notetype_is_footnote: +00404 return FOOTNOTE_CHAR; +00405 case notetype_is_endnote: +00406 return ENDNOTE_CHAR; +00407 default: +00408 return UNKNOWN_NOTE_CHAR; +00409 } +00410 case WORD_UNBREAKABLE_JOIN: +00411 return (ULONG)OUR_UNBREAKABLE_JOIN; +00412 default: +00413 break; +00414 } +00415 +00416 if (eEncoding != encoding_utf_8) { +00417 /* Latin characters in an oriental text */ +00418 if (usChar >= 0xff01 && usChar <= 0xff5e) { +00419 usChar -= 0xfee0; +00420 } +00421 } +00422 +00423 if (eEncoding == encoding_latin_1 && +00424 (eConversionType == conversion_ps || +00425 eConversionType == conversion_pdf)) { +00426 /* Ugly, but it makes the PostScript and PDF look better */ +00427 switch (usChar) { +00428 case UNICODE_ELLIPSIS: +00429 return 140; +00430 case UNICODE_TRADEMARK_SIGN: +00431 return 141; +00432 case UNICODE_PER_MILLE_SIGN: +00433 return 142; +00434 case UNICODE_BULLET: +00435 case UNICODE_BULLET_OPERATOR: +00436 case UNICODE_BLACK_CLUB_SUIT: +00437 return 143; +00438 case UNICODE_LEFT_SINGLE_QMARK: +00439 return 144; +00440 case UNICODE_RIGHT_SINGLE_QMARK: +00441 return 145; +00442 case UNICODE_SINGLE_LEFT_ANGLE_QMARK: +00443 return 146; +00444 case UNICODE_SINGLE_RIGHT_ANGLE_QMARK: +00445 return 147; +00446 case UNICODE_LEFT_DOUBLE_QMARK: +00447 return 148; +00448 case UNICODE_RIGHT_DOUBLE_QMARK: +00449 return 149; +00450 case UNICODE_DOUBLE_LOW_9_QMARK: +00451 return 150; +00452 case UNICODE_EN_DASH: +00453 return 151; +00454 case UNICODE_EM_DASH: +00455 return 152; +00456 case UNICODE_MINUS_SIGN: +00457 return 153; +00458 case UNICODE_CAPITAL_LIGATURE_OE: +00459 return 154; +00460 case UNICODE_SMALL_LIGATURE_OE: +00461 return 155; +00462 case UNICODE_DAGGER: +00463 return 156; +00464 case UNICODE_DOUBLE_DAGGER: +00465 return 157; +00466 case UNICODE_SMALL_LIGATURE_FI: +00467 return 158; +00468 case UNICODE_SMALL_LIGATURE_FL: +00469 return 159; +00470 default: +00471 break; +00472 } +00473 } +00474 +00475 if (eConversionType == conversion_pdf) { +00476 if (eEncoding == encoding_latin_1) { +00477 switch (usChar) { +00478 case UNICODE_EURO_SIGN: +00479 return 128; +00480 default: +00481 break; +00482 } +00483 } else if (eEncoding == encoding_latin_2) { +00484 switch (usChar) { +00485 case UNICODE_CAPITAL_D_WITH_STROKE: +00486 case UNICODE_SMALL_D_WITH_STROKE: +00487 return 0x3f; +00488 default: +00489 break; +00490 } +00491 } +00492 } +00493 +00494 if (usChar < 0x80) { +00495 /* US ASCII */ +00496 if (usChar < 0x20 || usChar == 0x7f) { +00497 /* Ignore control characters */ +00498 DBG_HEX(usChar); +00499 DBG_FIXME(); +00500 return IGNORE_CHARACTER; +00501 } +00502 return (ULONG)usChar; +00503 } +00504 +00505 if (eEncoding == encoding_utf_8) { +00506 /* No need to convert Unicode characters */ +00507 return (ULONG)usChar; +00508 } +00509 +00510 /* Unicode to local representation */ +00511 pTmp = pGetCharTableRecord(usChar); +00512 if (pTmp != NULL) { +00513 DBG_HEX_C(usChar >= 0x7f && usChar <= 0x9f, usChar); +00514 return (ULONG)pTmp->ucLocal; +00515 } +00516 +00517 /* Fancy characters to simple US ASCII */ +00518 switch (usChar) { +00519 case UNICODE_SMALL_F_HOOK: +00520 return (ULONG)'f'; +00521 case UNICODE_GREEK_CAPITAL_CHI: +00522 return (ULONG)'X'; +00523 case UNICODE_GREEK_SMALL_UPSILON: +00524 return (ULONG)'v'; +00525 case UNICODE_MODIFIER_CIRCUMFLEX: +00526 case UNICODE_UPWARDS_ARROW: +00527 return (ULONG)'^'; +00528 case UNICODE_SMALL_TILDE: +00529 case UNICODE_TILDE_OPERATOR: +00530 return (ULONG)'~'; +00531 case UNICODE_EN_QUAD: +00532 case UNICODE_EM_QUAD: +00533 case UNICODE_EN_SPACE: +00534 case UNICODE_EM_SPACE: +00535 case UNICODE_THREE_PER_EM_SPACE: +00536 case UNICODE_FOUR_PER_EM_SPACE: +00537 case UNICODE_SIX_PER_EM_SPACE: +00538 case UNICODE_FIGURE_SPACE: +00539 case UNICODE_PUNCTUATION_SPACE: +00540 case UNICODE_THIN_SPACE: +00541 case UNICODE_NARROW_NO_BREAK_SPACE: +00542 case UNICODE_LIGHT_SHADE: +00543 case UNICODE_MEDIUM_SHADE: +00544 case UNICODE_DARK_SHADE: +00545 return (ULONG)' '; +00546 case UNICODE_LEFT_DOUBLE_QMARK: +00547 case UNICODE_RIGHT_DOUBLE_QMARK: +00548 case UNICODE_DOUBLE_LOW_9_QMARK: +00549 case UNICODE_DOUBLE_HIGH_REV_9_QMARK: +00550 case UNICODE_DOUBLE_PRIME: +00551 return (ULONG)'"'; +00552 case UNICODE_LEFT_SINGLE_QMARK: +00553 case UNICODE_RIGHT_SINGLE_QMARK: +00554 case UNICODE_SINGLE_LOW_9_QMARK: +00555 case UNICODE_SINGLE_HIGH_REV_9_QMARK: +00556 case UNICODE_PRIME: +00557 return (ULONG)'\''; +00558 case UNICODE_HYPHEN: +00559 case UNICODE_NON_BREAKING_HYPHEN: +00560 case UNICODE_FIGURE_DASH: +00561 case UNICODE_EN_DASH: +00562 case UNICODE_EM_DASH: +00563 case UNICODE_HORIZONTAL_BAR: +00564 case UNICODE_MINUS_SIGN: +00565 case UNICODE_BD_LIGHT_HORIZONTAL: +00566 case UNICODE_BD_DOUBLE_HORIZONTAL: +00567 return (ULONG)'-'; +00568 case UNICODE_DOUBLE_VERTICAL_LINE: +00569 case UNICODE_BD_LIGHT_VERTICAL: +00570 case UNICODE_BD_DOUBLE_VERTICAL: +00571 return (ULONG)'|'; +00572 case UNICODE_DOUBLE_LOW_LINE: +00573 return (ULONG)'_'; +00574 case UNICODE_DAGGER: +00575 return (ULONG)'+'; +00576 case UNICODE_DOUBLE_DAGGER: +00577 return (ULONG)'#'; +00578 case UNICODE_BULLET: +00579 case UNICODE_BULLET_OPERATOR: +00580 case UNICODE_BLACK_CLUB_SUIT: +00581 return (ULONG)ucGetBulletCharacter(eConversionType, eEncoding); +00582 case UNICODE_ONE_DOT_LEADER: +00583 case UNICODE_TWO_DOT_LEADER: +00584 return (ULONG)'.'; +00585 case UNICODE_ELLIPSIS: +00586 #if defined(__riscos) +00587 return (ULONG)OUR_ELLIPSIS; +00588 #else +00589 if (ulFileOffset == 0) { +00590 return (ULONG)OUR_ELLIPSIS; +00591 } +00592 return UNICODE_ELLIPSIS; +00593 #endif /* __riscos */ +00594 case UNICODE_DOUBLE_LEFT_ANGLE_QMARK: +00595 case UNICODE_TRIANGULAR_BULLET: +00596 case UNICODE_SINGLE_LEFT_ANGLE_QMARK: +00597 case UNICODE_LEFTWARDS_ARROW: +00598 return (ULONG)'<'; +00599 case UNICODE_DOUBLE_RIGHT_ANGLE_QMARK: +00600 case UNICODE_SINGLE_RIGHT_ANGLE_QMARK: +00601 case UNICODE_RIGHTWARDS_ARROW: +00602 return (ULONG)'>'; +00603 case UNICODE_UNDERTIE: +00604 return (ULONG)'-'; +00605 case UNICODE_N_ARY_SUMMATION: +00606 return (ULONG)'S'; +00607 case UNICODE_EURO_SIGN: +00608 return (ULONG)'E'; +00609 case UNICODE_CIRCLE: +00610 case UNICODE_SQUARE: +00611 return (ULONG)'O'; +00612 case UNICODE_DIAMOND: +00613 return (ULONG)OUR_DIAMOND; +00614 case UNICODE_NUMERO_SIGN: +00615 return (ULONG)'N'; +00616 case UNICODE_KELVIN_SIGN: +00617 return (ULONG)'K'; +00618 case UNICODE_DOWNWARDS_ARROW: +00619 return (ULONG)'v'; +00620 case UNICODE_FRACTION_SLASH: +00621 case UNICODE_DIVISION_SLASH: +00622 return (ULONG)'/'; +00623 case UNICODE_ASTERISK_OPERATOR: +00624 return (ULONG)'*'; +00625 case UNICODE_RATIO: +00626 return (ULONG)':'; +00627 case UNICODE_BD_LIGHT_DOWN_RIGHT: +00628 case UNICODE_BD_LIGHT_DOWN_AND_LEFT: +00629 case UNICODE_BD_LIGHT_UP_AND_RIGHT: +00630 case UNICODE_BD_LIGHT_UP_AND_LEFT: +00631 case UNICODE_BD_LIGHT_VERTICAL_AND_RIGHT: +00632 case UNICODE_BD_LIGHT_VERTICAL_AND_LEFT: +00633 case UNICODE_BD_LIGHT_DOWN_AND_HORIZONTAL: +00634 case UNICODE_BD_LIGHT_UP_AND_HORIZONTAL: +00635 case UNICODE_BD_LIGHT_VERTICAL_AND_HORIZONTAL: +00636 case UNICODE_BD_DOUBLE_DOWN_AND_RIGHT: +00637 case UNICODE_BD_DOUBLE_DOWN_AND_LEFT: +00638 case UNICODE_BD_DOUBLE_UP_AND_RIGHT: +00639 case UNICODE_BD_DOUBLE_UP_AND_LEFT: +00640 case UNICODE_BD_DOUBLE_VERTICAL_AND_RIGHT: +00641 case UNICODE_BD_DOUBLE_VERTICAL_AND_LEFT: +00642 case UNICODE_BD_DOUBLE_DOWN_AND_HORIZONTAL: +00643 case UNICODE_BD_DOUBLE_UP_AND_HORIZONTAL: +00644 case UNICODE_BD_DOUBLE_VERTICAL_AND_HORIZONTAL: +00645 case UNICODE_BLACK_SQUARE: +00646 return (ULONG)'+'; +00647 case UNICODE_HAIR_SPACE: +00648 case UNICODE_ZERO_WIDTH_SPACE: +00649 case UNICODE_ZERO_WIDTH_NON_JOINER: +00650 case UNICODE_ZERO_WIDTH_JOINER: +00651 case UNICODE_LEFT_TO_RIGHT_MARK: +00652 case UNICODE_RIGHT_TO_LEFT_MARK: +00653 case UNICODE_LEFT_TO_RIGHT_EMBEDDING: +00654 case UNICODE_RIGHT_TO_LEFT_EMBEDDING: +00655 case UNICODE_POP_DIRECTIONAL_FORMATTING: +00656 case UNICODE_LEFT_TO_RIGHT_OVERRIDE: +00657 case UNICODE_RIGHT_TO_LEFT_OVERRIDE: +00658 case UNICODE_ZERO_WIDTH_NO_BREAK_SPACE: +00659 return IGNORE_CHARACTER; +00660 default: +00661 break; +00662 } +00663 +00664 if (usChar == UNICODE_TRADEMARK_SIGN) { +00665 /* +00666 * No local representation, it doesn't look like anything in +00667 * US-ASCII and a question mark does more harm than good. +00668 */ +00669 return IGNORE_CHARACTER; +00670 } +00671 +00672 if (usChar >= 0xa0 && usChar <= 0xff) { +00673 /* Before Word 97, Word did't use Unicode */ +00674 return (ULONG)usChar; +00675 } +00676 +00677 DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, ulFileOffset); +00678 DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, usChar); +00679 DBG_MSG_C(usChar >= 0xe000 && usChar < 0xf900, "Private Use Area"); +00680 +00681 /* Untranslated Unicode character */ +00682 return 0x3f; +00683 } /* end of ulTranslateCharacters */ +00684 +00685 /* +00686 * ulToUpper - convert letter to upper case +00687 * +00688 * This function converts a letter to upper case. Unlike toupper(3) this +00689 * function is independent from the settings of locale. This comes in handy +00690 * for people who have to read Word documents in more than one language or +00691 * contain more than one language. +00692 * +00693 * returns the converted letter, or ulChar if the conversion was not possible. +00694 */ +00695 ULONG +00696 ulToUpper(ULONG ulChar) +00697 { +00698 if (ulChar < 0x80) { +00699 /* US ASCII: use standard function */ +00700 return (ULONG)toupper((int)ulChar); +00701 } +00702 if (ulChar >= 0xe0 && ulChar <= 0xfe && ulChar != 0xf7) { +00703 /* +00704 * Lower case accented characters +00705 * 0xf7 is Division sign; 0xd7 is Multiplication sign +00706 * 0xff is y with diaeresis; 0xdf is Sharp s +00707 */ +00708 return ulChar & ~0x20; +00709 } +00710 #if defined(__STDC_ISO_10646__) +00711 /* +00712 * If this is ISO C99 and all locales have wchar_t = ISO 10646 +00713 * (e.g., glibc 2.2 or newer), then use standard function +00714 */ +00715 if (ulChar > 0xff) { +00716 return (ULONG)towupper((wint_t)ulChar); +00717 } +00718 #endif /* __STDC_ISO_10646__ */ +00719 return ulChar; +00720 } /* end of ulToUpper */ +