|
1 // Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
2 // All rights reserved. |
|
3 // This component and the accompanying materials are made available |
|
4 // under the terms of the License "Eclipse Public License v1.0" |
|
5 // which accompanies this distribution, and is available |
|
6 // at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
7 // |
|
8 // Initial Contributors: |
|
9 // Nokia Corporation - initial contribution. |
|
10 // |
|
11 // Contributors: |
|
12 // |
|
13 // Description: |
|
14 // Reads a Unicode character type data file (such as UnicodeData-3.0.0.txt or a file containing locale-specific overrides) |
|
15 // and writes C++ definitions of tables containing the information. |
|
16 // Usage: readtype <input-file> <output-file> { <locale-name> }. |
|
17 // <input-file>: either the standard Unicode character data file (e.g., UnicodeData-3.0.0.txt) or a file containing |
|
18 // overriding information for a certain locale, in the same format as the standard file, but with ranges for which |
|
19 // there is no data given in the form: |
|
20 // 0041;;;;;;;;;;;;;; |
|
21 // 006A;<No Data First>;;;;;;;;;;;;; |
|
22 // FFFF;<No Data Last>;;;;;;;;;;;;; |
|
23 // (in this example, these entries show that there is no overriding data for the character 0041 and range |
|
24 // 006A..FFFF inclusive). |
|
25 // Both single entries with no data and ranges with no data must have nothing in the third field (category). |
|
26 // <output-file>: the C++ source file to be output: this file becomes \e32\unicode\unitable.cpp, or an overriding |
|
27 // file in \e32\lsrc; there are none of these yet. |
|
// <locale-name>: an optional name to be inserted into identifiers in the output file: omit this for the standard
|
29 // data set; use names like 'Turkish', 'Japanese', etc., for locales. |
|
30 // |
|
31 // |
|
32 |
|
33 |
|
34 #include <stdio.h> |
|
35 #include <stdlib.h> |
|
36 #include <string.h> |
|
37 |
|
38 #ifndef _UNICODE |
|
39 #define _UNICODE |
|
40 #endif |
|
41 |
|
42 #include <unicode.h> |
|
43 |
|
44 // don't use unicode.h::TUnicodeDataRange, since its for 16-bit, and deprecated |
|
45 struct TUnicodeDataRange32 // Only used inside this cpp. |
|
46 { |
|
47 TUint32 iRangeStart; // Unicode value of the start of the range of characters |
|
48 TInt16 iIndex; // index into an array of character information structures (-1 means data no available) |
|
49 }; |
|
50 |
|
51 const int PlaneCount = 17; |
|
52 TUnicodePlane ThePlanesInReadType[PlaneCount]; |
|
53 |
|
54 // Tables to convert names used in the data file to categories defined in TChar. |
|
55 struct CatInfo |
|
56 { |
|
57 const char* iName; |
|
58 TChar::TCategory iCat; |
|
59 }; |
|
60 |
|
61 static const CatInfo TheCatInfo[] = |
|
62 { |
|
63 { "Lu", TChar::ELuCategory }, |
|
64 { "Ll", TChar::ELlCategory }, |
|
65 { "Lt", TChar::ELtCategory }, |
|
66 { "Lo", TChar::ELoCategory }, |
|
67 { "Lm", TChar::ELmCategory }, |
|
68 { "Mn", TChar::EMnCategory }, |
|
69 { "Mc", TChar::EMcCategory }, |
|
70 { "Me", TChar::EMeCategory }, |
|
71 { "Nd", TChar::ENdCategory }, |
|
72 { "Nl", TChar::ENlCategory }, |
|
73 { "No", TChar::ENoCategory }, |
|
74 { "Pc", TChar::EPcCategory }, |
|
75 { "Pd", TChar::EPdCategory }, |
|
76 { "Ps", TChar::EPsCategory }, |
|
77 { "Pe", TChar::EPeCategory }, |
|
78 { "Pi", TChar::EPiCategory }, |
|
79 { "Pf", TChar::EPfCategory }, |
|
80 { "Po", TChar::EPoCategory }, |
|
81 { "Sm", TChar::ESmCategory }, |
|
82 { "Sc", TChar::EScCategory }, |
|
83 { "Sk", TChar::ESkCategory }, |
|
84 { "So", TChar::ESoCategory }, |
|
85 { "Zs", TChar::EZsCategory }, |
|
86 { "Zl", TChar::EZlCategory }, |
|
87 { "Zp", TChar::EZpCategory }, |
|
88 { "Cc", TChar::ECcCategory }, |
|
89 { "Cf", TChar::ECfCategory }, |
|
90 { "Cs", TChar::ECsCategory }, |
|
91 { "Co", TChar::ECoCategory }, |
|
92 { "Cn", TChar::ECnCategory } |
|
93 }; |
|
94 const int TheCategories = sizeof(TheCatInfo) / sizeof(TheCatInfo[0]); |
|
95 |
|
96 struct BdCatInfo |
|
97 { |
|
98 const char* iName; |
|
99 TChar::TBdCategory iBdCat; |
|
100 }; |
|
101 |
|
102 static const BdCatInfo TheBdCatInfo[] = |
|
103 { |
|
104 { "L", TChar::ELeftToRight }, |
|
105 { "LRE", TChar::ELeftToRightEmbedding }, |
|
106 { "LRO", TChar::ELeftToRightOverride }, |
|
107 { "R", TChar::ERightToLeft }, |
|
108 { "AL", TChar::ERightToLeftArabic }, |
|
109 { "RLE", TChar::ERightToLeftEmbedding }, |
|
110 { "RLO", TChar::ERightToLeftOverride }, |
|
111 { "PDF", TChar::EPopDirectionalFormat }, |
|
112 { "EN", TChar::EEuropeanNumber }, |
|
113 { "ES", TChar::EEuropeanNumberSeparator }, |
|
114 { "ET", TChar::EEuropeanNumberTerminator }, |
|
115 { "AN", TChar::EArabicNumber }, |
|
116 { "CS", TChar::ECommonNumberSeparator }, |
|
117 { "NSM", TChar::ENonSpacingMark }, |
|
118 { "BN", TChar::EBoundaryNeutral }, |
|
119 { "B", TChar::EParagraphSeparator }, |
|
120 { "S", TChar::ESegmentSeparator }, |
|
121 { "WS", TChar::EWhitespace }, |
|
122 { "ON", TChar::EOtherNeutral }, |
|
123 }; |
|
124 const int TheBdCategories = sizeof(TheBdCatInfo) / sizeof(TheBdCatInfo[0]); |
|
125 |
|
126 // Class derived from TUnicodeData to provide constructor etc. |
|
127 class Data: public TUnicodeData |
|
128 { |
|
129 public: |
|
130 Data(); |
|
131 TBool operator==(const Data& c) const; |
|
132 TBool operator!=(const Data& c) const { return !(*this == c); } |
|
133 void Write(); |
|
134 }; |
|
135 |
|
136 // The character information table. |
|
137 const int MaxDatas = 1000; |
|
138 Data TheData[MaxDatas]; |
|
139 int Datas = 0; |
|
140 |
|
141 // The range table, containing indices to the character information table. |
|
142 const int MaxRanges = 4000; |
|
143 TUnicodeDataRange32 TheRange[MaxRanges]; |
|
144 int Ranges = 0; |
|
145 |
|
146 // The exhaustive index table, containing indices from every 16-bit value to the character information table. |
|
147 int TheIndex[0x110000]; |
|
148 |
|
149 // The special tables for characters in the range 0..255. |
|
150 TUint16 LowerCaseTable[256]; |
|
151 TUint16 FoldTable[256]; |
|
152 |
|
153 // The special table for characters in the range 0xFF00..0xFFFF |
|
154 TUint16 CjkWidthFoldTable[256]; |
|
155 |
|
156 /* |
|
157 The composition table. The compositions are stored as a word made up from the composition tag (high byte) and |
|
158 the number of components (low byte), the Unicode value of the composed character, then the Unicode values of |
|
159 the components. |
|
160 |
|
161 Two tables are created containing the indices of compositions. One of these is sorted by |
|
162 composed character, one by decomposition. This enables quick conversions to be made in both directions. |
|
163 */ |
|
164 const int MaxCompositionWords = 14000; |
|
165 TUint32 CompositionBuffer[MaxCompositionWords]; |
|
166 int CompositionWords = 0; |
|
167 const int MaxCompositions = 8000; |
|
168 TInt16 Compose[MaxCompositions]; // composition buffer indices, sorted by composed character |
|
169 TInt16 Decompose[MaxCompositions]; // composition buffer indices, sorted by decomposition |
|
170 int Compositions = 0; |
|
171 int trie_data[0x110000]; // used to build the trie |
|
172 |
|
173 FILE *input_file; |
|
174 FILE *output_file; |
|
175 const char *input_filename; |
|
176 const char *output_filename; |
|
177 |
|
// Convert the leading run of hexadecimal digits in 's' (either case) to an
// integer. Conversion stops at the first character that is not a hex digit or
// at the end of the string; a string with no leading hex digit yields 0.
static int hex(const char *s)
{
    int value = 0;
    for (const char *cursor = s; *cursor; ++cursor)
    {
        const int ch = *cursor;
        int digit;
        if (ch >= '0' && ch <= '9')
            digit = ch - '0';
        else if (ch >= 'A' && ch <= 'F')
            digit = ch - 'A' + 10;
        else if (ch >= 'a' && ch <= 'f')
            digit = ch - 'a' + 10;
        else
            break;      // first non-hex character ends the number
        value = value * 16 + digit;
    }
    return value;
}
|
199 |
|
200 static TChar::TCategory Category(const char* aName,bool aWarn) |
|
201 { |
|
202 for (int i = 0; i < TheCategories; i++) |
|
203 if (!strcmp(aName,TheCatInfo[i].iName)) |
|
204 return TheCatInfo[i].iCat; |
|
205 if (aWarn) |
|
206 fprintf(stderr,"unknown category %s\n",aName); |
|
207 return (TChar::TCategory)(-1); |
|
208 } |
|
209 |
|
210 static TChar::TBdCategory BdCategory(const char* aName,bool aWarn) |
|
211 { |
|
212 for (int i = 0; i < TheBdCategories; i++) |
|
213 if (!strcmp(aName,TheBdCatInfo[i].iName)) |
|
214 return TheBdCatInfo[i].iBdCat; |
|
215 if (aWarn) |
|
216 fprintf(stderr,"unknown bidirectional category %s\n",aName); |
|
217 return (TChar::TBdCategory)(-1); |
|
218 } |
|
219 |
|
220 // Write an aggregate initialiser for a Data object to the output file. |
|
221 void Data::Write() |
|
222 { |
|
223 fprintf(output_file,"{ %d, %d, %d, %d, %d, %d }", |
|
224 (int)iCategory, |
|
225 (int)iBdCategory, |
|
226 (int)iCombiningClass, |
|
227 (int)iDigitOffset, |
|
228 (int)iCaseOffset, |
|
229 (int)iFlags); |
|
230 } |
|
231 |
|
232 /* |
|
233 Add a new entry to the range table. If the category is the illegal value -1 store -1 as the |
|
234 index; this feature is used when creating character data for specific locales, which mostly |
|
235 consists of ranges for which the data is held in the main table, and is marked in this way |
|
236 as unspecified in the locale table. |
|
237 */ |
|
238 void add_range(Data& info,TInt code) |
|
239 { |
|
240 // Get an index to the character info; add a new entry if necessary. |
|
241 int index = -1; |
|
242 if (info.iCategory != TChar::TCategory(0xFF)) |
|
243 { |
|
244 for (int i = 0; i < Datas && index == -1; i++) |
|
245 if (TheData[i] == info) |
|
246 index = i; |
|
247 if (index == -1) |
|
248 { |
|
249 if (Datas >= MaxDatas) |
|
250 { |
|
251 fprintf(stderr,"too many Datas: > %d\n",MaxDatas); |
|
252 exit(1); |
|
253 } |
|
254 TheData[index = Datas++] = info; |
|
255 } |
|
256 } |
|
257 |
|
258 // Add the entry to the range table. |
|
259 if (Ranges >= MaxRanges) |
|
260 { |
|
261 fprintf(stderr,"too many Ranges: > %d, when processing U+%x\n", MaxRanges, code); |
|
262 exit(1); |
|
263 } |
|
264 TheRange[Ranges].iRangeStart = code; |
|
265 TheRange[Ranges].iIndex = (TInt16)index; |
|
266 Ranges++; |
|
267 } |
|
268 |
|
269 // Write a table of "entries" integers each of "entry_size" bytes. |
|
270 int write_table(const void *table,const char *name, |
|
271 int entries,int input_entry_size,int output_entry_size, |
|
272 int entry_signed,int entries_per_row,int write_array_size) |
|
273 { |
|
274 const char *type = entry_signed ? "TInt" : "TUint"; |
|
275 const int bits = output_entry_size * 8; |
|
276 |
|
277 /* |
|
278 There is a choice here whether or not the number of entries in the array is written: |
|
279 either <name>[<size>] or <name>[] is written. The latter method is used where the header |
|
280 says <name>[] so that compilers like GCC don't moan about type mismatches. |
|
281 */ |
|
282 if (entries == 0) |
|
283 { |
|
284 // In case that given plane has no character. |
|
285 fprintf(output_file,"const %s%d * const %s = NULL;\n",type,bits,name); |
|
286 return 0; |
|
287 } |
|
288 if (write_array_size) |
|
289 fprintf(output_file,"const %s%d %s[%d] = \n\t{",type,bits,name,entries); |
|
290 else |
|
291 fprintf(output_file,"const %s%d %s[] = \n\t{ // %d entries",type,bits,name,entries); |
|
292 |
|
293 const unsigned char *p = (const unsigned char *)table; |
|
294 for (int i = 0; i < entries; i++, p += input_entry_size) |
|
295 { |
|
296 if (i % entries_per_row == 0) |
|
297 fprintf(output_file,"\n\t"); |
|
298 if (output_entry_size == 1) |
|
299 fprintf(output_file,"0x%02x",(int)(*p)); |
|
300 else if (output_entry_size == 2) |
|
301 fprintf(output_file,"0x%04x",(int)(*((TUint16 *)p))); |
|
302 else if (output_entry_size == 4) |
|
303 fprintf(output_file,"0x%08x",(int)(*((TUint32 *)p))); |
|
304 else |
|
305 { |
|
306 fprintf(stderr,"illegal output entry size: %d\n",output_entry_size); |
|
307 exit(1); |
|
308 } |
|
309 if (i < entries - 1) |
|
310 fputc(',',output_file); |
|
311 // comment for easy read |
|
312 //if ((i+1) % entries_per_row == 0) |
|
313 // fprintf(output_file, "\t// U+%X-U+%X (%d-%d)", i+1-entries_per_row, i, i+1-entries_per_row, i); |
|
314 } |
|
315 fprintf(output_file,"\n\t};\n"); |
|
316 |
|
317 return entries * output_entry_size; |
|
318 } |
|
319 |
|
320 /* |
|
321 Create and write a trie representing the data in 'aTheIndex' |
|
322 The trie is of two levels, the first level indexed by the high 'aBlockBits' bits of the |
|
323 character code, the second by the low bits. There is one wrinkle; if the index value, which is 16 bits, |
|
324 has its top bit set, it is not an index but the actual data value for all entries in that block. |
|
325 |
|
326 Thus the way to get the value for a code is: |
|
327 |
|
328 int index = trie_index[code >> aBlockBits]; |
|
329 if (index & 0x8000) |
|
330 value = index & ~0x8000; |
|
331 else |
|
332 value = aTrieData[code & (1 << (16 - aBlockBits))]; |
|
333 |
|
334 The data size in bytes is returned. |
|
335 The argument 'aWrite' determines whether the data is written or not. |
|
336 The arguments 'aTrie1Name' and 'aTrie2Name' are used as variable names in generated unitable.cpp. |
|
337 */ |
|
338 int write_trie(int aOutputEntrySize,int aBlockBits,bool aWrite, int *aTheIndex, int *aTrieData, char *aTrie1Name, char *aTrie2Name) |
|
339 { |
|
340 int n = 0; // number of entries used in trie_data |
|
341 |
|
342 int block_size = 1 << aBlockBits; |
|
343 int blocks = 1 << (16 - aBlockBits); |
|
344 |
|
345 int* trie_index = new int[blocks]; |
|
346 int* block = new int[block_size]; |
|
347 |
|
348 for (int block_index = 0; block_index < blocks; block_index++) |
|
349 { |
|
350 // Write the data for the current block. |
|
351 int block_start = block_index * block_size; |
|
352 bool all_the_same = true; |
|
353 for (int code = 0; code < block_size; code++) |
|
354 { |
|
355 block[code] = aTheIndex[block_start + code]; |
|
356 if (block[code] != block[0]) |
|
357 all_the_same = false; |
|
358 } |
|
359 |
|
360 // Try to find a match for it. |
|
361 int insert_at; |
|
362 if (all_the_same) |
|
363 trie_index[block_index] = block[0] | 0x8000; |
|
364 else |
|
365 { |
|
366 for (insert_at = 0; insert_at < n; insert_at++) |
|
367 { |
|
368 int entries = n - insert_at; |
|
369 if (entries > block_size) |
|
370 entries = block_size; |
|
371 int bytes = entries * sizeof(int); |
|
372 if (memcmp(block,aTrieData + insert_at,bytes) == 0) |
|
373 break; |
|
374 } |
|
375 |
|
376 memcpy(aTrieData + insert_at,block,block_size * sizeof(int)); |
|
377 if (insert_at + block_size > n) |
|
378 n = insert_at + block_size; |
|
379 trie_index[block_index] = insert_at; |
|
380 } |
|
381 } |
|
382 |
|
383 if (aWrite) |
|
384 { |
|
385 write_table(trie_index,aTrie1Name,blocks,4,2,false,16,true); |
|
386 write_table(aTrieData,aTrie2Name,n,4,aOutputEntrySize,false,32,true); |
|
387 } |
|
388 |
|
389 delete [] trie_index; |
|
390 delete [] block; |
|
391 |
|
392 return blocks * 2 + n * aOutputEntrySize; |
|
393 } |
|
394 |
|
395 // Write the best possible 2-level trie for all planes, trying block sizes of 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096 and 8192 |
|
396 // @return Data size in bytes. |
|
397 int write_trie() |
|
398 { |
|
399 int byteCount = 0; |
|
400 for (int plane=0; plane<PlaneCount; plane++) |
|
401 { |
|
402 int best_data_size = 1 << 30; |
|
403 int best_bits = 0; |
|
404 |
|
405 int outputEntrySize = 2; |
|
406 char trie1Name[255]; |
|
407 char trie2Name[255]; |
|
408 sprintf(trie1Name, "ThePlane%02dTrieIndex1", plane); |
|
409 sprintf(trie2Name, "ThePlane%02dTrieIndex2", plane); |
|
410 int *theIndex = TheIndex + plane * 0x10000; |
|
411 int *trieData = trie_data + plane * 0x10000; |
|
412 |
|
413 for (int cur_bits = 3; cur_bits < 14; cur_bits++) |
|
414 { |
|
415 int cur_data_size = write_trie(outputEntrySize, cur_bits, false, theIndex, trieData, trie1Name, trie2Name); |
|
416 if (cur_data_size < best_data_size) |
|
417 { |
|
418 best_bits = cur_bits; |
|
419 best_data_size = cur_data_size; |
|
420 } |
|
421 } |
|
422 |
|
423 byteCount += write_trie(outputEntrySize, best_bits, true, theIndex, trieData, trie1Name, trie2Name); |
|
424 ThePlanesInReadType[plane].iCodesPerBlock = (TUint8) best_bits; |
|
425 ThePlanesInReadType[plane].iMaskForCodePoint = (TUint16) ((1 << (best_bits)) - 1); |
|
426 ThePlanesInReadType[plane].iMaskForBlock = (TUint16) (~(ThePlanesInReadType[plane].iMaskForCodePoint)); |
|
427 } |
|
428 return byteCount; |
|
429 } |
|
430 |
|
431 /* |
|
432 Compare entries in the decompose table for the purpose of sorting them. The entries are indices |
|
433 into the starting words of compositions stored in the composition buffer. |
|
434 */ |
|
435 int compare_decompositions(const void *p,const void *q) |
|
436 { |
|
437 // Get the indexes. |
|
438 TInt16 index1 = *((const TInt16 *)p); |
|
439 TInt16 index2 = *((const TInt16 *)q); |
|
440 |
|
441 // Compare the two composition strings. |
|
442 return TUnicode::Compare((TUint16 *)&CompositionBuffer[index1 + 2], CompositionBuffer[index1 + 1]*2, |
|
443 (TUint16 *)&CompositionBuffer[index2 + 2], CompositionBuffer[index2 + 1]*2); |
|
444 } |
|
445 |
|
446 // Write the output file. |
|
447 void write_output() |
|
448 { |
|
449 int data_bytes = 0; |
|
450 |
|
451 // Write the comment at the top of the file |
|
452 fprintf(output_file, "// Copyright (c) 2007-2009 Nokia Corporation and/or its subsidiary(-ies).\n"); |
|
453 fprintf(output_file, "// All rights reserved.\n"); |
|
454 fprintf(output_file, "// This component and the accompanying materials are made available\n"); |
|
455 fprintf(output_file, "// under the terms of the License \"Eclipse Public License v1.0\"\n"); |
|
456 fprintf(output_file, "// which accompanies this distribution, and is available\n"); |
|
457 fprintf(output_file, "// at the URL \"http://www.eclipse.org/legal/epl-v10.html\".\n"); |
|
458 fprintf(output_file, "//\n"); |
|
459 fprintf(output_file, "// Initial Contributors:\n"); |
|
460 fprintf(output_file, "// Nokia Corporation - initial contribution.\n"); |
|
461 fprintf(output_file, "//\n"); |
|
462 fprintf(output_file, "// Contributors:\n"); |
|
463 fprintf(output_file, "//\n"); |
|
464 fprintf(output_file, "// Description:\n"); |
|
465 |
|
466 fprintf(output_file, |
|
467 "// Unicode character information tables.\n" |
|
468 "// Written by the READTYPE program.\n" |
|
469 "// Please read the 'Unicode Character Data and Line Break data Update History.doc' file for detailed history of updates to this file.\n" |
|
470 "// This file was generated by the READTYPE tool using UCD 5.0.\n" |
|
471 "// The contents of this file were generated automatically. Please do not edit this manually.\n" |
|
472 "//\n" |
|
473 "//\n" |
|
474 "\n"); |
|
475 |
|
476 // Write the directive to include the header file. |
|
477 fprintf(output_file,"#include <unicode.h>\n\n"); |
|
478 |
|
479 // Export two variables for unicode.cpp. |
|
480 fprintf(output_file, "\n"); |
|
481 fprintf(output_file, "// Declarations for tables held in unitable.cpp and used by unicode.cpp.\n"); |
|
482 fprintf(output_file, "extern const TStandardUnicodeDataSet TheStandardUnicodeDataSet[];\n"); |
|
483 fprintf(output_file, "extern const TUnicodePlane ThePlanes[17];\n\n\n"); |
|
484 |
|
485 // Write the trie data. |
|
486 data_bytes += write_trie(); |
|
487 |
|
488 // Write the character information table. |
|
489 fprintf(output_file,"static const TUnicodeData TheUnicodeData[] =\n\t{ // %d entries\n", Datas); |
|
490 int i; |
|
491 for (i = 0; i < Datas; i++) |
|
492 { |
|
493 fputc('\t',output_file); |
|
494 TheData[i].Write(); |
|
495 if (i < Datas - 1) |
|
496 fputc(',',output_file); |
|
497 fprintf(output_file, "\t// 0x%X (%d)", i, i); |
|
498 fputc('\n',output_file); |
|
499 } |
|
500 fprintf(output_file,"\t};\n\n"); |
|
501 data_bytes += Datas * sizeof(Data); |
|
502 |
|
503 // write plane properties |
|
504 fprintf(output_file, "const TUnicodePlane ThePlanes[%d] =\n\t{\n", PlaneCount); |
|
505 int plane; |
|
506 for (plane=0; plane<=16; plane++) |
|
507 { |
|
508 fprintf(output_file, "\t{%d, 0x%04X, 0x%04X }", |
|
509 ThePlanesInReadType[plane].iCodesPerBlock, ThePlanesInReadType[plane].iMaskForBlock, ThePlanesInReadType[plane].iMaskForCodePoint); |
|
510 if (plane < 16) |
|
511 fprintf(output_file, ",\n"); |
|
512 } |
|
513 fprintf(output_file, "\n\t};\n\n"); |
|
514 data_bytes += 5*PlaneCount; |
|
515 |
|
516 // Write a data structure referring to the trie data. |
|
517 fprintf(output_file,"const TStandardUnicodeDataSet TheStandardUnicodeDataSet[] =\n\t{ // %d entries\n", PlaneCount); |
|
518 for (plane=0; plane<=16; plane++) |
|
519 { |
|
520 fprintf(output_file,"\t{ ThePlane%02dTrieIndex1, ThePlane%02dTrieIndex2, TheUnicodeData }", plane, plane); |
|
521 if (plane < 16) |
|
522 fprintf(output_file, ",\n"); |
|
523 } |
|
524 fprintf(output_file, "\n\t};\n\n"); |
|
525 data_bytes += 12*PlaneCount; |
|
526 |
|
527 // Convert the fold table to lower case. |
|
528 for (i = 0; i < 256; i++) |
|
529 FoldTable[i] = LowerCaseTable[FoldTable[i]]; |
|
530 |
|
531 // Make 00A0 (non-break space) fold to space. |
|
532 FoldTable[0xA0] = 0x20; |
|
533 |
|
534 // Make unassigned characters in the CJK width fold table fold to themselves. |
|
535 for (i = 0; i < 256; i++) |
|
536 if (CjkWidthFoldTable[i] == 0) |
|
537 CjkWidthFoldTable[i] = (TUint16)(0xFF00 + i); |
|
538 |
|
539 // Write the special tables |
|
540 data_bytes += write_table(FoldTable,"TUnicode::FoldTable",256,2,2,false,16,true); |
|
541 data_bytes += write_table(CjkWidthFoldTable,"TUnicode::CjkWidthFoldTable",256,2,2,false,16,true); |
|
542 |
|
543 // Write the number of data bytes at the end of the file. |
|
544 fprintf(output_file,"\n// The tables and structures contain %d bytes of data.\n",data_bytes); |
|
545 } |
|
546 |
|
547 int main(int argc,char **argv) |
|
548 { |
|
549 if (argc < 2) |
|
550 { |
|
551 fputs("usage: readtype <input-file> <output-file>",stderr); |
|
552 exit(1); |
|
553 } |
|
554 |
|
555 input_filename = argv[1]; |
|
556 output_filename = argv[2]; |
|
557 |
|
558 // Locale support in previous version is deprecated. |
|
559 |
|
560 input_file = fopen(input_filename,"r"); |
|
561 if (!input_file) |
|
562 { |
|
563 fprintf(stderr,"cannot open input file %s\n",input_filename); |
|
564 exit(1); |
|
565 } |
|
566 output_file = fopen(output_filename,"w"); |
|
567 if (!output_file) |
|
568 { |
|
569 fprintf(stderr,"cannot open output file %s\n",output_filename); |
|
570 exit(1); |
|
571 } |
|
572 |
|
573 Data range_info; // attributes of the current range |
|
574 Data unassigned_info; // attributes used for unassigned characters; the default constructor |
|
575 // sets the category to Cn, bidirectional category to L, everything else to 0. |
|
576 TBool first = true; |
|
577 |
|
578 char line[1024]; |
|
579 const int Fields = 15; |
|
580 char *field[Fields]; |
|
581 TInt prev_code = 0; |
|
582 while (fgets(line,sizeof(line),input_file)) |
|
583 { |
|
584 // Strip trailing newline if any. |
|
585 int length = strlen(line); |
|
586 if (length && line[length - 1] == '\n') |
|
587 line[length - 1] = 0; |
|
588 |
|
589 // Parse into fields. |
|
590 int n = 1; |
|
591 field[0] = line; |
|
592 for (char *p = line; *p; p++) |
|
593 if (*p == ';' && n < Fields) |
|
594 { |
|
595 *p = 0; |
|
596 field[n++] = p + 1; |
|
597 } |
|
598 |
|
599 // Ignore the line if there is only one field. |
|
600 if (n == 1) |
|
601 continue; |
|
602 |
|
603 // Extract fields of interest. |
|
604 |
|
605 // Field 0: Unicode value in hexadecimal. |
|
606 int code = hex(field[0]); |
|
607 |
|
608 // Field 2: Category. |
|
609 Data cur_info; |
|
610 cur_info.iCategory = (TUint8)Category(field[2], true); |
|
611 |
|
612 // Field 3: Combining class. |
|
613 cur_info.iCombiningClass = (TUint8)atoi(field[3]); |
|
614 |
|
615 // Field 4: Bidirectional category. |
|
616 cur_info.iBdCategory = (TUint8)BdCategory(field[4], true); |
|
617 |
|
618 // Prepare to determine the folded version (converted to lower case, stripped of accents). |
|
619 int folded_code = code; |
|
620 |
|
621 // Field 5: Character decomposition. |
|
622 if (field[5][0]) |
|
623 { |
|
624 int components = 0; |
|
625 const int MaxComponents = 18; // FDFA; ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM has 18 components! |
|
626 TUint32 component[MaxComponents]; |
|
627 |
|
628 // Extract the tag if any. |
|
629 char *p = field[5]; |
|
630 const char *tag = NULL; |
|
631 if (field[5][0] == '<') |
|
632 { |
|
633 tag = ++p; |
|
634 while (*p && *p != '>') |
|
635 p++; |
|
636 if (!*p) |
|
637 { |
|
638 fprintf(stderr,"syntax error: missing > on the line for code %x\n",code); |
|
639 exit(1); |
|
640 } |
|
641 *p++ = 0; |
|
642 } |
|
643 |
|
644 // Read the components. |
|
645 while (*p) |
|
646 { |
|
647 while (*p == ' ') |
|
648 p++; |
|
649 if (components >= MaxComponents) |
|
650 { |
|
651 fprintf(stderr,"decomposition of %x has too many components: increase MaxComponents\n",code); |
|
652 exit(1); |
|
653 } |
|
654 component[components++] = hex(p); |
|
655 while (*p && *p != ' ') |
|
656 p++; |
|
657 } |
|
658 |
|
659 // Store the composition if it has a null tag and is therefore canonical. |
|
660 if (tag == NULL) |
|
661 { |
|
662 // Put its index into the tables. |
|
663 if (Compositions >= MaxCompositions) |
|
664 { |
|
665 fprintf(stderr,"too many compositions (at code %x): increase MaxCompositions\n",code); |
|
666 exit(1); |
|
667 } |
|
668 if (CompositionWords >= 65535) |
|
669 { |
|
670 fprintf(stderr, "too many compositions (at code %x): need 32 bit!?\n", code); |
|
671 exit(1); |
|
672 } |
|
673 Compose[Compositions] = Decompose[Compositions] = (TInt16)CompositionWords; |
|
674 Compositions++; |
|
675 |
|
676 // Put it into the composition buffer. |
|
677 if (CompositionWords + 2 + components >= MaxCompositionWords) |
|
678 { |
|
679 fprintf(stderr,"too many compositions (at code %x): increase MaxCompositionWords\n",code); |
|
680 exit(1); |
|
681 } |
|
682 CompositionBuffer[CompositionWords++] = code; |
|
683 CompositionBuffer[CompositionWords++] = components; |
|
684 for (int i = 0; i < components; i++) |
|
685 CompositionBuffer[CompositionWords++] = component[i]; |
|
686 } |
|
687 |
|
688 // Store the code used in the ordinary and CJK fold tables. |
|
689 if (components > 0) |
|
690 { |
|
691 if (code < 256) |
|
692 { |
|
693 if (tag == NULL) |
|
694 folded_code = component[0]; |
|
695 } |
|
696 else if (code >= 0xFF00 && code <= 0xFFEE) // tag will always be <wide> or <narrow> |
|
697 folded_code = component[0]; |
|
698 } |
|
699 } |
|
700 |
|
701 // Field 8. Numeric value. |
|
702 if (field[8][0]) |
|
703 { |
|
704 if (field[8][1] == '/' || field[8][2] == '/') // fractions |
|
705 cur_info.iFlags |= TUnicodeData::EFraction; |
|
706 else |
|
707 { |
|
708 int value = atoi(field[8]); |
|
709 if (value >= 0 && value <= 255) |
|
710 { |
|
711 cur_info.iDigitOffset = (TUint8)((value - (code & 255)) & 255); |
|
712 cur_info.iFlags |= TUnicodeData::ESmallNumeric; |
|
713 } |
|
714 else if (value == 500) |
|
715 cur_info.iFlags |= TUnicodeData::EFiveHundred; |
|
716 else if (value == 1000) |
|
717 cur_info.iFlags |= TUnicodeData::EOneThousand; |
|
718 else if (value == 5000) |
|
719 cur_info.iFlags |= TUnicodeData::EFiveThousand; |
|
720 else if (value == 10000) |
|
721 cur_info.iFlags |= TUnicodeData::ETenThousand; |
|
722 else if (value == 100000) |
|
723 cur_info.iFlags |= TUnicodeData::EHundredThousand; |
|
724 else |
|
725 fprintf(stderr,"Warning: U+%X has a large numeric property with unrepresentable value %d. Ignored.\n",code,value); |
|
726 } |
|
727 } |
|
728 |
|
729 // Field 9: Mirrored property. |
|
730 if (field[9][0] == 'Y') |
|
731 cur_info.iFlags |= TUnicodeData::EMirrored; |
|
732 |
|
733 // Fields 12, 13, 14: Case variants. |
|
734 int uc = code, lc = code, tc = code; |
|
735 if (field[12][0]) |
|
736 { |
|
737 uc = hex(field[12]); |
|
738 int uc_offset = uc - code; |
|
739 if (abs(uc_offset) > 32767) |
|
740 { |
|
741 fprintf(stderr, "Warning: offset to upper case is too large: code %X, upper case %X, offset %X. Ignored!\n", code, uc, uc_offset); |
|
742 } |
|
743 else |
|
744 { |
|
745 cur_info.iFlags |= TUnicodeData::EHasUpperCase; |
|
746 cur_info.iCaseOffset = (TInt16)(-uc_offset); |
|
747 if (code<0x10000 && uc>0x10000 || code>0x10000 && uc<0x10000) |
|
748 fprintf(stderr, "Info: %X and its upper case %X locate at different planes.\n"); |
|
749 } |
|
750 } |
|
751 if (field[13][0]) |
|
752 { |
|
753 lc = hex(field[13]); |
|
754 int lc_offset = lc - code; |
|
755 if (abs(lc_offset) > 32767) |
|
756 { |
|
757 fprintf(stderr, "Warning: offset to lower case is too large: code %X, lower case %X, offset %X. Ignored!\n", code, lc, lc_offset); |
|
758 } |
|
759 else |
|
760 { |
|
761 cur_info.iFlags |= TUnicodeData::EHasLowerCase; |
|
762 cur_info.iCaseOffset = (TInt16)lc_offset; |
|
763 if (code<0x10000 && lc>0x10000 || code>0x10000 && lc<0x10000) |
|
764 fprintf(stderr, "Info: %X and its lower case %X locate at different planes.\n"); |
|
765 } |
|
766 } |
|
767 if (field[14][0]) |
|
768 tc = hex(field[14]); |
|
769 if (tc != lc && tc != uc) |
|
770 cur_info.iFlags |= TUnicodeData::EHasTitleCase; |
|
771 |
|
772 // If this code is < 256 fill in the entries in the special tables. |
|
773 if (code < 256) |
|
774 { |
|
775 LowerCaseTable[code] = (TUint16)lc; |
|
776 FoldTable[code] = (TUint16)folded_code; |
|
777 } |
|
778 |
|
779 // If the code is >= 0xFF00 fill in the entry in the CJK width folding table. |
|
780 else if (code >= 0xFF00 && code <= 0xFFFF) |
|
781 CjkWidthFoldTable[code & 0xFF] = (TUint16)folded_code; |
|
782 |
|
783 /* |
|
784 If there was a gap between this code and the previous one, write an 'unassigned' range, |
|
785 unless this character is actually the end of a range not fully listed (like the CJK ideographs |
|
786 from 4E00 to 9FA5 inclusive), in which case the character name will end in ' Last>'. |
|
787 */ |
|
788 if (code - prev_code > 1) |
|
789 { |
|
790 TBool last_in_range = false; |
|
791 int name_length = strlen(field[1]); |
|
792 if (name_length >= 6 && !strcmp(field[1] + name_length - 6," Last>")) |
|
793 last_in_range = TRUE; |
|
794 if (!last_in_range) |
|
795 { |
|
796 add_range(unassigned_info,prev_code + 1); |
|
797 range_info = unassigned_info; |
|
798 } |
|
799 } |
|
800 |
|
801 // Write the range. |
|
802 if (first || cur_info != range_info) |
|
803 { |
|
804 add_range(cur_info,code); |
|
805 range_info = cur_info; |
|
806 } |
|
807 |
|
808 first = false; |
|
809 prev_code = code; |
|
810 } |
|
811 |
|
812 /* |
|
813 If there was a gap at the end of the encoding (there is at present; FFFE and FFFF are not Unicode characters) |
|
814 write an 'unassigned' range. |
|
815 */ |
|
816 if (prev_code < 0xFFFF) |
|
817 add_range(unassigned_info,prev_code + 1); |
|
818 |
|
819 // Write an array of indices from Unicode character values to character data sets. |
|
820 for (int i = 0; i < Ranges; i++) |
|
821 { |
|
822 TUint32 end = i < Ranges - 1 ? TheRange[i + 1].iRangeStart : 0x110000; |
|
823 for (TUint32 j = TheRange[i].iRangeStart; j < end; j++) |
|
824 TheIndex[j] = TheRange[i].iIndex; |
|
825 } |
|
826 |
|
827 // Write the output file. |
|
828 write_output(); |
|
829 printf("\nDone.\n"); |
|
830 |
|
831 return 0; |
|
832 } |
|
833 |
|
834 Data::Data() |
|
835 { |
|
836 iCategory = TChar::ECnCategory; |
|
837 iBdCategory = TChar::ELeftToRight; |
|
838 iCombiningClass = 0; |
|
839 iDigitOffset = 0; |
|
840 iCaseOffset = 0; |
|
841 iFlags = 0; |
|
842 } |
|
843 |
|
844 TBool Data::operator==(const Data& c) const |
|
845 { |
|
846 return iCategory == c.iCategory && |
|
847 iBdCategory == c.iBdCategory && |
|
848 iCombiningClass == c.iCombiningClass && |
|
849 iDigitOffset == c.iDigitOffset && |
|
850 iCaseOffset == c.iCaseOffset && |
|
851 iFlags == c.iFlags; |
|
852 } |
|
853 |
|
854 /* |
|
855 This function is copied from unicode.cpp: having it here saves me having to link in unicode.cpp and |
|
856 unitable.cpp, which is probably the file we're trying to write! |
|
857 */ |
|
858 TInt TUnicode::Compare(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2) |
|
859 { |
|
860 for (TInt i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++) |
|
861 { |
|
862 TInt x = i < aLength1 ? *aString1 : -1; |
|
863 TInt y = i < aLength2 ? *aString2 : -1; |
|
864 if (x != y) |
|
865 return x - y; |
|
866 } |
|
867 return 0; |
|
868 } |