|
1 /**************************************************************************** |
|
2 ** |
|
3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). |
|
4 ** All rights reserved. |
|
5 ** Contact: Nokia Corporation (qt-info@nokia.com) |
|
6 ** |
|
7 ** This file is part of the utils of the Qt Toolkit. |
|
8 ** |
|
9 ** $QT_BEGIN_LICENSE:LGPL$ |
|
10 ** No Commercial Usage |
|
11 ** This file contains pre-release code and may not be distributed. |
|
12 ** You may use this file in accordance with the terms and conditions |
|
13 ** contained in the Technology Preview License Agreement accompanying |
|
14 ** this package. |
|
15 ** |
|
16 ** GNU Lesser General Public License Usage |
|
17 ** Alternatively, this file may be used under the terms of the GNU Lesser |
|
18 ** General Public License version 2.1 as published by the Free Software |
|
19 ** Foundation and appearing in the file LICENSE.LGPL included in the |
|
20 ** packaging of this file. Please review the following information to |
|
21 ** ensure the GNU Lesser General Public License version 2.1 requirements |
|
22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. |
|
23 ** |
|
24 ** In addition, as a special exception, Nokia gives you certain additional |
|
25 ** rights. These rights are described in the Nokia Qt LGPL Exception |
|
26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. |
|
27 ** |
|
28 ** If you have questions regarding the use of this file, please contact |
|
29 ** Nokia at qt-info@nokia.com. |
|
30 ** |
|
31 ** |
|
32 ** |
|
33 ** |
|
34 ** |
|
35 ** |
|
36 ** |
|
37 ** |
|
38 ** $QT_END_LICENSE$ |
|
39 ** |
|
40 ****************************************************************************/ |
|
41 #include <qlist.h> |
|
42 #include <qhash.h> |
|
43 #include <qfile.h> |
|
44 #include <qstring.h> |
|
45 #include <qchar.h> |
|
46 #include <private/qunicodetables_p.h> |
|
47 #include <qvector.h> |
|
48 #include <qdebug.h> |
|
49 |
|
50 |
|
51 static struct AgeMap { |
|
52 const char *age; |
|
53 const QChar::UnicodeVersion version; |
|
54 } ageMap [] = { |
|
55 { "1.1", QChar::Unicode_1_1 }, |
|
56 { "2.0", QChar::Unicode_2_0 }, |
|
57 { "2.1", QChar::Unicode_2_1_2 }, |
|
58 { "3.0", QChar::Unicode_3_0 }, |
|
59 { "3.1", QChar::Unicode_3_1 }, |
|
60 { "3.2", QChar::Unicode_3_2 }, |
|
61 { "4.0", QChar::Unicode_4_0 }, |
|
62 { "4.1", QChar::Unicode_4_1 }, |
|
63 { "5.0", QChar::Unicode_5_0 }, |
|
64 { 0, QChar::Unicode_Unassigned } |
|
65 }; |
|
66 #define CURRENT_UNICODE_VERSION "QChar::Unicode_5_0" |
|
67 |
|
// Text of the GraphemeBreak enum definition, kept as a string; it lists the
// same enumerators, in the same order, as the GraphemeBreak enum in this file.
static const char *grapheme_break_string =
    " enum GraphemeBreak {\n"
    " GraphemeBreakOther, \n"
    " GraphemeBreakCR,\n GraphemeBreakLF,\n"
    " GraphemeBreakControl,\n GraphemeBreakExtend,\n"
    " GraphemeBreakL,\n GraphemeBreakV,\n GraphemeBreakT,\n"
    " GraphemeBreakLV,\n GraphemeBreakLVT\n"
    " };\n\n";
|
81 |
|
// Grapheme cluster break classes. Values are consecutive starting at 0.
enum GraphemeBreak {
    GraphemeBreakOther = 0,
    GraphemeBreakCR,
    GraphemeBreakLF,
    GraphemeBreakControl,
    GraphemeBreakExtend,
    GraphemeBreakL,     // Hangul syllable types follow
    GraphemeBreakV,
    GraphemeBreakT,
    GraphemeBreakLV,
    GraphemeBreakLVT
};
|
94 |
|
95 QHash<QByteArray, GraphemeBreak> grapheme_break_map; |
|
96 |
|
97 static void initGraphemeBreak() |
|
98 { |
|
99 struct GraphemeBreakList { |
|
100 GraphemeBreak brk; |
|
101 const char *name; |
|
102 } breaks[] = { |
|
103 { GraphemeBreakOther, "Other" }, |
|
104 { GraphemeBreakCR, "CR" }, |
|
105 { GraphemeBreakLF, "LF" }, |
|
106 { GraphemeBreakControl, "Control" }, |
|
107 { GraphemeBreakExtend, "Extend" }, |
|
108 { GraphemeBreakL, "L" }, |
|
109 { GraphemeBreakV, "V" }, |
|
110 { GraphemeBreakT, "T" }, |
|
111 { GraphemeBreakLV, "LV" }, |
|
112 { GraphemeBreakLVT, "LVT" }, |
|
113 { GraphemeBreakOther, 0 } |
|
114 }; |
|
115 GraphemeBreakList *d = breaks; |
|
116 while (d->name) { |
|
117 grapheme_break_map.insert(d->name, d->brk); |
|
118 ++d; |
|
119 } |
|
120 } |
|
121 |
|
// Text of the WordBreak enum definition, kept as a string; it lists the same
// enumerators, in the same order, as the WordBreak enum in this file.
const char *word_break_string =
    " enum WordBreak {\n"
    " WordBreakOther,\n WordBreakFormat,\n"
    " WordBreakKatakana,\n WordBreakALetter,\n"
    " WordBreakMidLetter,\n WordBreakMidNum,\n"
    " WordBreakNumeric,\n WordBreakExtendNumLet\n"
    " };\n\n";
|
133 |
|
// Word break classes. Values are consecutive starting at 0.
enum WordBreak {
    WordBreakOther = 0,
    WordBreakFormat,
    WordBreakKatakana,
    WordBreakALetter,
    WordBreakMidLetter,
    WordBreakMidNum,
    WordBreakNumeric,
    WordBreakExtendNumLet
};
|
144 |
|
145 |
|
146 QHash<QByteArray, WordBreak> word_break_map; |
|
147 |
|
148 static void initWordBreak() |
|
149 { |
|
150 struct WordBreakList { |
|
151 WordBreak brk; |
|
152 const char *name; |
|
153 } breaks[] = { |
|
154 { WordBreakFormat, "Format" }, |
|
155 { WordBreakFormat, "Extend" }, // these are copied in from GraphemeBreakProperty.txt |
|
156 { WordBreakKatakana, "Katakana" }, |
|
157 { WordBreakALetter, "ALetter" }, |
|
158 { WordBreakMidLetter, "MidLetter" }, |
|
159 { WordBreakMidNum, "MidNum" }, |
|
160 { WordBreakNumeric, "Numeric" }, |
|
161 { WordBreakExtendNumLet, "ExtendNumLet" }, |
|
162 { WordBreakFormat, 0 } |
|
163 }; |
|
164 WordBreakList *d = breaks; |
|
165 while (d->name) { |
|
166 word_break_map.insert(d->name, d->brk); |
|
167 ++d; |
|
168 } |
|
169 } |
|
170 |
|
171 |
|
// Text of the SentenceBreak enum definition, kept as a string; it lists the
// same enumerators, in the same order, as the SentenceBreak enum in this file.
static const char *sentence_break_string =
    " enum SentenceBreak {\n"
    " SentenceBreakOther,\n SentenceBreakSep,\n"
    " SentenceBreakFormat,\n SentenceBreakSp,\n"
    " SentenceBreakLower,\n SentenceBreakUpper,\n"
    " SentenceBreakOLetter,\n SentenceBreakNumeric,\n"
    " SentenceBreakATerm,\n SentenceBreakSTerm,\n"
    " SentenceBreakClose\n"
    " };\n\n";
|
186 |
|
// Sentence break classes. Values are consecutive starting at 0.
enum SentenceBreak {
    SentenceBreakOther = 0,
    SentenceBreakSep,
    SentenceBreakFormat,
    SentenceBreakSp,
    SentenceBreakLower,
    SentenceBreakUpper,
    SentenceBreakOLetter,
    SentenceBreakNumeric,
    SentenceBreakATerm,
    SentenceBreakSTerm,
    SentenceBreakClose
};
|
200 |
|
201 |
|
202 QHash<QByteArray, SentenceBreak> sentence_break_map; |
|
203 |
|
204 static void initSentenceBreak() |
|
205 { |
|
206 struct SentenceBreakList { |
|
207 SentenceBreak brk; |
|
208 const char *name; |
|
209 } breaks[] = { |
|
210 { SentenceBreakOther, "Other" }, |
|
211 { SentenceBreakSep, "Sep" }, |
|
212 { SentenceBreakFormat, "Format" }, |
|
213 { SentenceBreakSp, "Sp" }, |
|
214 { SentenceBreakLower, "Lower" }, |
|
215 { SentenceBreakUpper, "Upper" }, |
|
216 { SentenceBreakOLetter, "OLetter" }, |
|
217 { SentenceBreakNumeric, "Numeric" }, |
|
218 { SentenceBreakATerm, "ATerm" }, |
|
219 { SentenceBreakSTerm, "STerm" }, |
|
220 { SentenceBreakClose, "Close" }, |
|
221 { SentenceBreakOther, 0 } |
|
222 }; |
|
223 SentenceBreakList *d = breaks; |
|
224 while (d->name) { |
|
225 sentence_break_map.insert(d->name, d->brk); |
|
226 ++d; |
|
227 } |
|
228 } |
|
229 |
|
230 |
|
// Text of the Properties struct and its accessor declarations, kept as a string.
// Keep this one in sync with the code in createPropertyInfo
const char *property_string =
    " struct Properties {\n"
    // general classification
    " ushort category : 8;\n"
    " ushort line_break_class : 8;\n"
    " ushort direction : 8;\n"
    " ushort combiningClass :8;\n"
    " ushort joining : 2;\n"
    " signed short digitValue : 6; /* 5 needed */\n"
    " ushort unicodeVersion : 4;\n"
    // special-casing flags
    " ushort lowerCaseSpecial : 1;\n"
    " ushort upperCaseSpecial : 1;\n"
    " ushort titleCaseSpecial : 1;\n"
    " ushort caseFoldSpecial : 1; /* currently unused */\n"
    // signed offsets
    " signed short mirrorDiff : 16;\n"
    " signed short lowerCaseDiff : 16;\n"
    " signed short upperCaseDiff : 16;\n"
    " signed short titleCaseDiff : 16;\n"
    " signed short caseFoldDiff : 16;\n"
    // text segmentation classes
    " ushort graphemeBreak : 8;\n"
    " ushort wordBreak : 8;\n"
    " ushort sentenceBreak : 8;\n"
    " };\n"
    " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n"
    " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n";
|
256 |
|
// Text of the LineBreakClass enum definition (with its leading comment lines),
// kept as a string.
const char *lineBreakClass =
    // explanatory comment lines that are part of the string
    " // see http://www.unicode.org/reports/tr14/tr14-19.html\n"
    " // we don't use the XX, AI and CB properties and map them to AL instead.\n"
    " // as we don't support any EBDIC based OS'es, NL is ignored and mapped to AL as well.\n"
    // the enum itself
    " enum LineBreakClass {\n"
    " LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,\n"
    " LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO,\n"
    " LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY,\n"
    " LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM,\n"
    " LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,\n"
    " LineBreak_JT, LineBreak_SA, LineBreak_SG,\n"
    " LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n"
    " };\n\n";
|
270 |
|
// Text of the lineBreakClass()/script() declarations and their QChar
// convenience overloads, kept as a string.
const char *methods =
    // lineBreakClass
    " Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n"
    " inline int lineBreakClass(const QChar &ch) {\n"
    " return QUnicodeTables::lineBreakClass(ch.unicode());\n"
    " }\n"
    "\n"
    // script
    " Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4);\n"
    " Q_CORE_EXPORT_INLINE int QT_FASTCALL script(const QChar &ch) {\n"
    " return script(ch.unicode());\n"
    " }\n\n";
|
281 |
|
282 |
|
283 struct PropertyFlags { |
|
284 bool operator ==(const PropertyFlags &o) { |
|
285 return (combiningClass == o.combiningClass |
|
286 && category == o.category |
|
287 && direction == o.direction |
|
288 && joining == o.joining |
|
289 && age == o.age |
|
290 && digitValue == o.digitValue |
|
291 && line_break_class == o.line_break_class |
|
292 && mirrorDiff == o.mirrorDiff |
|
293 && lowerCaseDiff == o.lowerCaseDiff |
|
294 && upperCaseDiff == o.upperCaseDiff |
|
295 && titleCaseDiff == o.titleCaseDiff |
|
296 && caseFoldDiff == o.caseFoldDiff |
|
297 && lowerCaseSpecial == o.lowerCaseSpecial |
|
298 && upperCaseSpecial == o.upperCaseSpecial |
|
299 && titleCaseSpecial == o.titleCaseSpecial |
|
300 && caseFoldSpecial == o.caseFoldSpecial |
|
301 && graphemeBreak == o.graphemeBreak |
|
302 && wordBreak == o.wordBreak |
|
303 && sentenceBreak == o.sentenceBreak |
|
304 ); |
|
305 } |
|
306 // from UnicodeData.txt |
|
307 uchar combiningClass : 8; |
|
308 QChar::Category category : 5; |
|
309 QChar::Direction direction : 5; |
|
310 // from ArabicShaping.txt |
|
311 QChar::Joining joining : 2; |
|
312 // from DerivedAge.txt |
|
313 QChar::UnicodeVersion age : 4; |
|
314 int digitValue; |
|
315 uint line_break_class : 5; |
|
316 |
|
317 int mirrorDiff : 16; |
|
318 |
|
319 int lowerCaseDiff; |
|
320 int upperCaseDiff; |
|
321 int titleCaseDiff; |
|
322 int caseFoldDiff; |
|
323 bool lowerCaseSpecial; |
|
324 bool upperCaseSpecial; |
|
325 bool titleCaseSpecial; |
|
326 bool caseFoldSpecial; |
|
327 GraphemeBreak graphemeBreak; |
|
328 WordBreak wordBreak; |
|
329 SentenceBreak sentenceBreak; |
|
330 }; |
|
331 |
|
332 QList<int> specialCaseMap; |
|
333 int specialCaseMaxLen = 0; |
|
334 |
|
335 static int appendToSpecialCaseMap(const QList<int> &map) |
|
336 { |
|
337 QList<int> utf16map; |
|
338 for (int i = 0; i < map.size(); ++i) { |
|
339 int val = map.at(i); |
|
340 if (val > 0xffff) { |
|
341 utf16map << QChar::highSurrogate(val); |
|
342 utf16map << QChar::lowSurrogate(val); |
|
343 } else { |
|
344 utf16map << val; |
|
345 } |
|
346 } |
|
347 specialCaseMaxLen = qMax(specialCaseMaxLen, utf16map.size()); |
|
348 utf16map << 0; |
|
349 |
|
350 for (int i = 0; i < specialCaseMap.size() - utf16map.size() - 1; ++i) { |
|
351 int j; |
|
352 for (j = 0; j < utf16map.size(); ++j) { |
|
353 if (specialCaseMap.at(i+j) != utf16map.at(j)) |
|
354 break; |
|
355 } |
|
356 if (j == utf16map.size()) |
|
357 return i; |
|
358 } |
|
359 |
|
360 int pos = specialCaseMap.size(); |
|
361 specialCaseMap << utf16map; |
|
362 return pos; |
|
363 } |
|
364 |
|
365 struct UnicodeData { |
|
366 UnicodeData(int codepoint = 0) { |
|
367 p.category = QChar::NoCategory; |
|
368 p.combiningClass = 0; |
|
369 |
|
370 p.direction = QChar::DirL; |
|
371 // DirR for: U+0590..U+05FF, U+07C0..U+08FF, U+FB1D..U+FB4F, U+10800..U+10FFF |
|
372 if ((codepoint >= 0x590 && codepoint <= 0x5ff) |
|
373 || (codepoint >= 0x7c0 && codepoint <= 0x8ff) |
|
374 || (codepoint >= 0xfb1d && codepoint <= 0xfb4f) |
|
375 || (codepoint >= 0x10800 && codepoint <= 0x10fff)) |
|
376 p.direction = QChar::DirR; |
|
377 // DirAL for: U+0600..U+07BF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFE |
|
378 if ((codepoint >= 0x600 && codepoint <= 0x7bf) |
|
379 || (codepoint >= 0xfb50 && codepoint <= 0xfdcf) |
|
380 || (codepoint >= 0xfdf0 && codepoint <= 0xfdff) |
|
381 || (codepoint >= 0xfe70 && codepoint <= 0xfefe)) |
|
382 p.direction = QChar::DirAL; |
|
383 |
|
384 mirroredChar = 0; |
|
385 decompositionType = QChar::NoDecomposition; |
|
386 p.joining = QChar::OtherJoining; |
|
387 p.age = QChar::Unicode_Unassigned; |
|
388 p.mirrorDiff = 0; |
|
389 p.digitValue = -1; |
|
390 p.line_break_class = QUnicodeTables::LineBreak_AL; |
|
391 p.lowerCaseDiff = 0; |
|
392 p.upperCaseDiff = 0; |
|
393 p.titleCaseDiff = 0; |
|
394 p.caseFoldDiff = 0; |
|
395 p.lowerCaseSpecial = 0; |
|
396 p.upperCaseSpecial = 0; |
|
397 p.titleCaseSpecial = 0; |
|
398 p.caseFoldSpecial = 0; |
|
399 p.graphemeBreak = GraphemeBreakOther; |
|
400 p.wordBreak = WordBreakOther; |
|
401 p.sentenceBreak = SentenceBreakOther; |
|
402 propertyIndex = -1; |
|
403 excludedComposition = false; |
|
404 } |
|
405 PropertyFlags p; |
|
406 |
|
407 // from UnicodeData.txt |
|
408 QChar::Decomposition decompositionType; |
|
409 QList<int> decomposition; |
|
410 |
|
411 QList<int> specialFolding; |
|
412 |
|
413 // from BidiMirroring.txt |
|
414 int mirroredChar; |
|
415 |
|
416 // CompositionExclusions.txt |
|
417 bool excludedComposition; |
|
418 |
|
419 // computed position of unicode property set |
|
420 int propertyIndex; |
|
421 }; |
|
422 |
|
// Indices of the ';'-separated fields of a UnicodeData.txt line, in file
// order. Values are consecutive starting at 0.
enum UniDataFields {
    UD_Value = 0,
    UD_Name,
    UD_Category,
    UD_CombiningClass,
    UD_BidiCategory,
    UD_Decomposition,
    UD_DecimalDigitValue,
    UD_DigitValue,
    UD_NumericValue,
    UD_Mirrored,
    UD_OldName,
    UD_Comment,
    UD_UpperCase,
    UD_LowerCase,
    UD_TitleCase
};
|
440 |
|
441 QHash<QByteArray, QChar::Category> categoryMap; |
|
442 |
|
443 static void initCategoryMap() |
|
444 { |
|
445 struct Cat { |
|
446 QChar::Category cat; |
|
447 const char *name; |
|
448 } categories [] = { |
|
449 { QChar::Mark_NonSpacing, "Mn" }, |
|
450 { QChar::Mark_SpacingCombining, "Mc" }, |
|
451 { QChar::Mark_Enclosing, "Me" }, |
|
452 |
|
453 { QChar::Number_DecimalDigit, "Nd" }, |
|
454 { QChar::Number_Letter, "Nl" }, |
|
455 { QChar::Number_Other, "No" }, |
|
456 |
|
457 { QChar::Separator_Space, "Zs" }, |
|
458 { QChar::Separator_Line, "Zl" }, |
|
459 { QChar::Separator_Paragraph, "Zp" }, |
|
460 |
|
461 { QChar::Other_Control, "Cc" }, |
|
462 { QChar::Other_Format, "Cf" }, |
|
463 { QChar::Other_Surrogate, "Cs" }, |
|
464 { QChar::Other_PrivateUse, "Co" }, |
|
465 { QChar::Other_NotAssigned, "Cn" }, |
|
466 |
|
467 { QChar::Letter_Uppercase, "Lu" }, |
|
468 { QChar::Letter_Lowercase, "Ll" }, |
|
469 { QChar::Letter_Titlecase, "Lt" }, |
|
470 { QChar::Letter_Modifier, "Lm" }, |
|
471 { QChar::Letter_Other, "Lo" }, |
|
472 |
|
473 { QChar::Punctuation_Connector, "Pc" }, |
|
474 { QChar::Punctuation_Dash, "Pd" }, |
|
475 { QChar::Punctuation_Open, "Ps" }, |
|
476 { QChar::Punctuation_Close, "Pe" }, |
|
477 { QChar::Punctuation_InitialQuote, "Pi" }, |
|
478 { QChar::Punctuation_FinalQuote, "Pf" }, |
|
479 { QChar::Punctuation_Other, "Po" }, |
|
480 |
|
481 { QChar::Symbol_Math, "Sm" }, |
|
482 { QChar::Symbol_Currency, "Sc" }, |
|
483 { QChar::Symbol_Modifier, "Sk" }, |
|
484 { QChar::Symbol_Other, "So" }, |
|
485 { QChar::NoCategory, 0 } |
|
486 }; |
|
487 Cat *c = categories; |
|
488 while (c->cat != QChar::NoCategory) { |
|
489 categoryMap.insert(c->name, c->cat); |
|
490 ++c; |
|
491 } |
|
492 } |
|
493 |
|
494 QHash<QByteArray, QChar::Direction> directionMap; |
|
495 |
|
496 static void initDirectionMap() |
|
497 { |
|
498 struct Dir { |
|
499 QChar::Direction dir; |
|
500 const char *name; |
|
501 } directions[] = { |
|
502 { QChar::DirL, "L" }, |
|
503 { QChar::DirR, "R" }, |
|
504 { QChar::DirEN, "EN" }, |
|
505 { QChar::DirES, "ES" }, |
|
506 { QChar::DirET, "ET" }, |
|
507 { QChar::DirAN, "AN" }, |
|
508 { QChar::DirCS, "CS" }, |
|
509 { QChar::DirB, "B" }, |
|
510 { QChar::DirS, "S" }, |
|
511 { QChar::DirWS, "WS" }, |
|
512 { QChar::DirON, "ON" }, |
|
513 { QChar::DirLRE, "LRE" }, |
|
514 { QChar::DirLRO, "LRO" }, |
|
515 { QChar::DirAL, "AL" }, |
|
516 { QChar::DirRLE, "RLE" }, |
|
517 { QChar::DirRLO, "RLO" }, |
|
518 { QChar::DirPDF, "PDF" }, |
|
519 { QChar::DirNSM, "NSM" }, |
|
520 { QChar::DirBN, "BN" }, |
|
521 { QChar::DirL, 0 } |
|
522 }; |
|
523 Dir *d = directions; |
|
524 while (d->name) { |
|
525 directionMap.insert(d->name, d->dir); |
|
526 ++d; |
|
527 } |
|
528 } |
|
529 |
|
530 |
|
531 QHash<QByteArray, QChar::Decomposition> decompositionMap; |
|
532 |
|
533 static void initDecompositionMap() |
|
534 { |
|
535 struct Dec { |
|
536 QChar::Decomposition dec; |
|
537 const char *name; |
|
538 } decompositions[] = { |
|
539 { QChar::Canonical, "<canonical>" }, |
|
540 { QChar::Font, "<font>" }, |
|
541 { QChar::NoBreak, "<noBreak>" }, |
|
542 { QChar::Initial, "<initial>" }, |
|
543 { QChar::Medial, "<medial>" }, |
|
544 { QChar::Final, "<final>" }, |
|
545 { QChar::Isolated, "<isolated>" }, |
|
546 { QChar::Circle, "<circle>" }, |
|
547 { QChar::Super, "<super>" }, |
|
548 { QChar::Sub, "<sub>" }, |
|
549 { QChar::Vertical, "<vertical>" }, |
|
550 { QChar::Wide, "<wide>" }, |
|
551 { QChar::Narrow, "<narrow>" }, |
|
552 { QChar::Small, "<small>" }, |
|
553 { QChar::Square, "<square>" }, |
|
554 { QChar::Compat, "<compat>" }, |
|
555 { QChar::Fraction, "<fraction>" }, |
|
556 { QChar::NoDecomposition, 0 } |
|
557 }; |
|
558 Dec *d = decompositions; |
|
559 while (d->name) { |
|
560 decompositionMap.insert(d->name, d->dec); |
|
561 ++d; |
|
562 } |
|
563 } |
|
564 |
|
565 |
|
566 QHash<int, UnicodeData> unicodeData; |
|
567 QList<PropertyFlags> uniqueProperties; |
|
568 |
|
569 |
|
570 QHash<int, int> decompositionLength; |
|
571 int highestComposedCharacter = 0; |
|
572 int numLigatures = 0; |
|
573 int highestLigature = 0; |
|
574 |
|
575 struct Ligature {ushort u1; ushort u2; ushort ligature;}; |
|
576 // we need them sorted after the first component for fast lookup |
|
577 bool operator < (const Ligature &l1, const Ligature &l2) { |
|
578 return l1.u1 < l2.u1; |
|
579 } |
|
580 |
|
581 QHash<ushort, QList<Ligature> > ligatureHashes; |
|
582 |
|
583 QHash<int, int> combiningClassUsage; |
|
584 |
|
585 int maxLowerCaseDiff = 0; |
|
586 int maxUpperCaseDiff = 0; |
|
587 int maxTitleCaseDiff = 0; |
|
588 |
|
589 static void readUnicodeData() |
|
590 { |
|
591 QFile f("data/UnicodeData.txt"); |
|
592 if (!f.exists()) |
|
593 qFatal("Couldn't find UnicodeData.txt"); |
|
594 |
|
595 f.open(QFile::ReadOnly); |
|
596 |
|
597 while (!f.atEnd()) { |
|
598 QByteArray line; |
|
599 line.resize(1024); |
|
600 int len = f.readLine(line.data(), 1024); |
|
601 line.truncate(len-1); |
|
602 |
|
603 int comment = line.indexOf('#'); |
|
604 if (comment >= 0) |
|
605 line = line.left(comment); |
|
606 if (line.isEmpty()) |
|
607 continue; |
|
608 |
|
609 QList<QByteArray> properties = line.split(';'); |
|
610 bool ok; |
|
611 int codepoint = properties[UD_Value].toInt(&ok, 16); |
|
612 int lastCodepoint = codepoint; |
|
613 |
|
614 QByteArray name = properties[UD_Name]; |
|
615 if (name.startsWith('<') && name.contains("First")) { |
|
616 QByteArray nextLine; |
|
617 nextLine.resize(1024); |
|
618 f.readLine(nextLine.data(), 1024); |
|
619 QList<QByteArray> properties = nextLine.split(';'); |
|
620 lastCodepoint = properties[UD_Value].toInt(&ok, 16); |
|
621 } |
|
622 |
|
623 UnicodeData data(codepoint); |
|
624 data.p.category = categoryMap.value(properties[UD_Category], QChar::NoCategory); |
|
625 data.p.combiningClass = properties[UD_CombiningClass].toInt(); |
|
626 |
|
627 if (!combiningClassUsage.contains(data.p.combiningClass)) |
|
628 combiningClassUsage[data.p.combiningClass] = 1; |
|
629 else |
|
630 ++combiningClassUsage[data.p.combiningClass]; |
|
631 |
|
632 data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction); |
|
633 |
|
634 if (!properties[UD_UpperCase].isEmpty()) { |
|
635 int upperCase = properties[UD_UpperCase].toInt(&ok, 16); |
|
636 Q_ASSERT(ok); |
|
637 data.p.upperCaseDiff = upperCase - codepoint; |
|
638 maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(data.p.upperCaseDiff)); |
|
639 if (codepoint > 0xffff) { |
|
640 // if the condition below doesn't hold anymore we need to modify our case folding code |
|
641 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0)); |
|
642 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase)); |
|
643 } |
|
644 } |
|
645 if (!properties[UD_LowerCase].isEmpty()) { |
|
646 int lowerCase = properties[UD_LowerCase].toInt(&ok, 16); |
|
647 Q_ASSERT (ok); |
|
648 data.p.lowerCaseDiff = lowerCase - codepoint; |
|
649 maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(data.p.lowerCaseDiff)); |
|
650 if (codepoint > 0xffff) { |
|
651 // if the condition below doesn't hold anymore we need to modify our case folding code |
|
652 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0)); |
|
653 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(lowerCase)); |
|
654 } |
|
655 } |
|
656 // we want toTitleCase to map to ToUpper in case we don't have any titlecase. |
|
657 if (properties[UD_TitleCase].isEmpty()) |
|
658 properties[UD_TitleCase] = properties[UD_UpperCase]; |
|
659 if (!properties[UD_TitleCase].isEmpty()) { |
|
660 int titleCase = properties[UD_TitleCase].toInt(&ok, 16); |
|
661 Q_ASSERT (ok); |
|
662 data.p.titleCaseDiff = titleCase - codepoint; |
|
663 maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(data.p.titleCaseDiff)); |
|
664 if (codepoint > 0xffff) { |
|
665 // if the condition below doesn't hold anymore we need to modify our case folding code |
|
666 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0)); |
|
667 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(titleCase)); |
|
668 } |
|
669 } |
|
670 |
|
671 if (!properties[UD_DigitValue].isEmpty()) |
|
672 data.p.digitValue = properties[UD_DigitValue].toInt(); |
|
673 |
|
674 // decompositition |
|
675 QByteArray decomposition = properties[UD_Decomposition]; |
|
676 if (!decomposition.isEmpty()) { |
|
677 highestComposedCharacter = qMax(highestComposedCharacter, codepoint); |
|
678 QList<QByteArray> d = decomposition.split(' '); |
|
679 if (d[0].contains('<')) { |
|
680 data.decompositionType = decompositionMap.value(d[0], QChar::Canonical); |
|
681 d.takeFirst(); |
|
682 } else { |
|
683 data.decompositionType = QChar::Canonical; |
|
684 } |
|
685 for (int i = 0; i < d.size(); ++i) |
|
686 data.decomposition.append(d[i].toInt(&ok, 16)); |
|
687 if (!decompositionLength.contains(data.decomposition.size())) |
|
688 decompositionLength[data.decomposition.size()] = 1; |
|
689 else |
|
690 ++decompositionLength[data.decomposition.size()]; |
|
691 } |
|
692 |
|
693 for (int i = codepoint; i <= lastCodepoint; ++i) |
|
694 unicodeData.insert(i, data); |
|
695 } |
|
696 |
|
697 } |
|
698 |
|
699 static int maxMirroredDiff = 0; |
|
700 |
|
701 static void readBidiMirroring() |
|
702 { |
|
703 QFile f("data/BidiMirroring.txt"); |
|
704 if (!f.exists()) |
|
705 qFatal("Couldn't find BidiMirroring.txt"); |
|
706 |
|
707 f.open(QFile::ReadOnly); |
|
708 |
|
709 while (!f.atEnd()) { |
|
710 QByteArray line; |
|
711 line.resize(1024); |
|
712 int len = f.readLine(line.data(), 1024); |
|
713 line.resize(len-1); |
|
714 |
|
715 int comment = line.indexOf('#'); |
|
716 if (comment >= 0) |
|
717 line = line.left(comment); |
|
718 |
|
719 if (line.isEmpty()) |
|
720 continue; |
|
721 line = line.replace(" ", ""); |
|
722 |
|
723 QList<QByteArray> pair = line.split(';'); |
|
724 Q_ASSERT(pair.size() == 2); |
|
725 |
|
726 bool ok; |
|
727 int codepoint = pair[0].toInt(&ok, 16); |
|
728 int mirror = pair[1].toInt(&ok, 16); |
|
729 |
|
730 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); |
|
731 d.mirroredChar = mirror; |
|
732 if (qAbs(codepoint-d.mirroredChar) > maxMirroredDiff) |
|
733 maxMirroredDiff = qAbs(codepoint - d.mirroredChar); |
|
734 |
|
735 d.p.mirrorDiff = d.mirroredChar - codepoint; |
|
736 unicodeData.insert(codepoint, d); |
|
737 } |
|
738 } |
|
739 |
|
740 static void readArabicShaping() |
|
741 { |
|
742 QFile f("data/ArabicShaping.txt"); |
|
743 if (!f.exists()) |
|
744 qFatal("Couldn't find ArabicShaping.txt"); |
|
745 |
|
746 f.open(QFile::ReadOnly); |
|
747 |
|
748 while (!f.atEnd()) { |
|
749 QByteArray line; |
|
750 line.resize(1024); |
|
751 int len = f.readLine(line.data(), 1024); |
|
752 line.resize(len-1); |
|
753 |
|
754 int comment = line.indexOf('#'); |
|
755 if (comment >= 0) |
|
756 line = line.left(comment); |
|
757 line = line.trimmed(); |
|
758 |
|
759 if (line.isEmpty()) |
|
760 continue; |
|
761 |
|
762 QList<QByteArray> shaping = line.split(';'); |
|
763 Q_ASSERT(shaping.size() == 4); |
|
764 |
|
765 bool ok; |
|
766 int codepoint = shaping[0].toInt(&ok, 16); |
|
767 QChar::Joining j = QChar::OtherJoining; |
|
768 QByteArray shape = shaping[2].trimmed(); |
|
769 if (shape == "R") |
|
770 j = QChar::Right; |
|
771 else if (shape == "D") |
|
772 j = QChar::Dual; |
|
773 else if (shape == "C") |
|
774 j = QChar::Center; |
|
775 |
|
776 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); |
|
777 d.p.joining = j; |
|
778 unicodeData.insert(codepoint, d); |
|
779 } |
|
780 } |
|
781 |
|
782 static void readDerivedAge() |
|
783 { |
|
784 QFile f("data/DerivedAge.txt"); |
|
785 if (!f.exists()) |
|
786 qFatal("Couldn't find DerivedAge.txt"); |
|
787 |
|
788 f.open(QFile::ReadOnly); |
|
789 |
|
790 while (!f.atEnd()) { |
|
791 QByteArray line; |
|
792 line.resize(1024); |
|
793 int len = f.readLine(line.data(), 1024); |
|
794 line.resize(len-1); |
|
795 |
|
796 int comment = line.indexOf('#'); |
|
797 if (comment >= 0) |
|
798 line = line.left(comment); |
|
799 line.replace(" ", ""); |
|
800 |
|
801 if (line.isEmpty()) |
|
802 continue; |
|
803 |
|
804 QList<QByteArray> l = line.split(';'); |
|
805 Q_ASSERT(l.size() == 2); |
|
806 |
|
807 QByteArray codes = l[0]; |
|
808 codes.replace("..", "."); |
|
809 QList<QByteArray> cl = codes.split('.'); |
|
810 |
|
811 bool ok; |
|
812 int from = cl[0].toInt(&ok, 16); |
|
813 int to = from; |
|
814 if (cl.size() == 2) |
|
815 to = cl[1].toInt(&ok, 16); |
|
816 |
|
817 QChar::UnicodeVersion age = QChar::Unicode_Unassigned; |
|
818 QByteArray ba = l[1]; |
|
819 AgeMap *map = ageMap; |
|
820 while (map->age) { |
|
821 if (ba == map->age) { |
|
822 age = map->version; |
|
823 break; |
|
824 } |
|
825 ++map; |
|
826 } |
|
827 //qDebug() << hex << from << ".." << to << ba << age; |
|
828 Q_ASSERT(age != QChar::Unicode_Unassigned); |
|
829 |
|
830 for (int codepoint = from; codepoint <= to; ++codepoint) { |
|
831 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); |
|
832 d.p.age = age; |
|
833 unicodeData.insert(codepoint, d); |
|
834 } |
|
835 } |
|
836 } |
|
837 |
|
838 |
|
839 static void readCompositionExclusion() |
|
840 { |
|
841 QFile f("data/CompositionExclusions.txt"); |
|
842 if (!f.exists()) |
|
843 qFatal("Couldn't find CompositionExclusions.txt"); |
|
844 |
|
845 f.open(QFile::ReadOnly); |
|
846 |
|
847 while (!f.atEnd()) { |
|
848 QByteArray line; |
|
849 line.resize(1024); |
|
850 int len = f.readLine(line.data(), 1024); |
|
851 line.resize(len-1); |
|
852 |
|
853 int comment = line.indexOf('#'); |
|
854 if (comment >= 0) |
|
855 line = line.left(comment); |
|
856 line.replace(" ", ""); |
|
857 |
|
858 if (line.isEmpty()) |
|
859 continue; |
|
860 |
|
861 Q_ASSERT(!line.contains("..")); |
|
862 |
|
863 bool ok; |
|
864 int codepoint = line.toInt(&ok, 16); |
|
865 |
|
866 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); |
|
867 d.excludedComposition = true; |
|
868 unicodeData.insert(codepoint, d); |
|
869 } |
|
870 |
|
871 for (int i = 0; i < 0x110000; ++i) { |
|
872 UnicodeData data = unicodeData.value(i, UnicodeData(i)); |
|
873 if (!data.excludedComposition |
|
874 && data.decompositionType == QChar::Canonical |
|
875 && data.decomposition.size() > 1) { |
|
876 Q_ASSERT(data.decomposition.size() == 2); |
|
877 |
|
878 uint part1 = data.decomposition.at(0); |
|
879 uint part2 = data.decomposition.at(1); |
|
880 UnicodeData first = unicodeData.value(part1, UnicodeData(part1)); |
|
881 if (first.p.combiningClass != 0) |
|
882 continue; |
|
883 |
|
884 ++numLigatures; |
|
885 highestLigature = qMax(highestLigature, (int)part1); |
|
886 Ligature l = {(ushort)part1, (ushort)part2, i}; |
|
887 ligatureHashes[part2].append(l); |
|
888 } |
|
889 } |
|
890 } |
|
891 |
|
892 struct NormalizationCorrection { |
|
893 uint codepoint; |
|
894 uint mapped; |
|
895 uint version; |
|
896 }; |
|
897 |
|
898 static QByteArray createNormalizationCorrections() |
|
899 { |
|
900 QFile f("data/NormalizationCorrections.txt"); |
|
901 if (!f.exists()) |
|
902 qFatal("Couldn't find NormalizationCorrections.txt"); |
|
903 |
|
904 f.open(QFile::ReadOnly); |
|
905 |
|
906 QByteArray out; |
|
907 |
|
908 out += "struct NormalizationCorrection {\n" |
|
909 " uint ucs4;\n" |
|
910 " uint old_mapping;\n" |
|
911 " int version;\n" |
|
912 "};\n\n" |
|
913 |
|
914 "static const NormalizationCorrection uc_normalization_corrections[] = {\n"; |
|
915 |
|
916 int numCorrections = 0; |
|
917 while (!f.atEnd()) { |
|
918 QByteArray line; |
|
919 line.resize(1024); |
|
920 int len = f.readLine(line.data(), 1024); |
|
921 line.resize(len-1); |
|
922 |
|
923 int comment = line.indexOf('#'); |
|
924 if (comment >= 0) |
|
925 line = line.left(comment); |
|
926 line.replace(" ", ""); |
|
927 |
|
928 if (line.isEmpty()) |
|
929 continue; |
|
930 |
|
931 Q_ASSERT(!line.contains("..")); |
|
932 |
|
933 QList<QByteArray> fields = line.split(';'); |
|
934 Q_ASSERT(fields.size() == 4); |
|
935 |
|
936 NormalizationCorrection c; |
|
937 bool ok; |
|
938 c.codepoint = fields.at(0).toInt(&ok, 16); |
|
939 c.mapped = fields.at(1).toInt(&ok, 16); |
|
940 if (fields.at(3) == "3.2.0") |
|
941 c.version = QChar::Unicode_3_2; |
|
942 else if (fields.at(3) == "4.0.0") |
|
943 c.version = QChar::Unicode_4_0; |
|
944 else |
|
945 qFatal("unknown unicode version in NormalizationCorrection.txt"); |
|
946 |
|
947 out += " { 0x" + QByteArray::number(c.codepoint, 16) + ", 0x" + QByteArray::number(c.mapped, 16) |
|
948 + ", " + QString::number(c.version) + " },\n"; |
|
949 ++numCorrections; |
|
950 } |
|
951 |
|
952 out += "};\n\n" |
|
953 |
|
954 "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n\n"; |
|
955 |
|
956 |
|
957 return out; |
|
958 } |
|
959 |
|
960 |
|
961 static void computeUniqueProperties() |
|
962 { |
|
963 qDebug("computeUniqueProperties:"); |
|
964 for (int uc = 0; uc < 0x110000; ++uc) { |
|
965 UnicodeData d = unicodeData.value(uc, UnicodeData(uc)); |
|
966 |
|
967 int index = uniqueProperties.indexOf(d.p); |
|
968 if (index == -1) { |
|
969 index = uniqueProperties.size(); |
|
970 uniqueProperties.append(d.p); |
|
971 } |
|
972 d.propertyIndex = index; |
|
973 unicodeData.insert(uc, d); |
|
974 } |
|
975 qDebug(" %d unicode properties found", uniqueProperties.size()); |
|
976 } |
|
977 |
|
978 |
|
979 static void readLineBreak() |
|
980 { |
|
981 QFile f("data/LineBreak.txt"); |
|
982 if (!f.exists()) |
|
983 qFatal("Couldn't find LineBreak.txt"); |
|
984 |
|
985 f.open(QFile::ReadOnly); |
|
986 |
|
987 while (!f.atEnd()) { |
|
988 QByteArray line; |
|
989 line.resize(1024); |
|
990 int len = f.readLine(line.data(), 1024); |
|
991 line.resize(len-1); |
|
992 |
|
993 int comment = line.indexOf('#'); |
|
994 if (comment >= 0) |
|
995 line = line.left(comment); |
|
996 line.replace(" ", ""); |
|
997 |
|
998 if (line.isEmpty()) |
|
999 continue; |
|
1000 |
|
1001 QList<QByteArray> l = line.split(';'); |
|
1002 Q_ASSERT(l.size() == 2); |
|
1003 |
|
1004 QByteArray codes = l[0]; |
|
1005 codes.replace("..", "."); |
|
1006 QList<QByteArray> cl = codes.split('.'); |
|
1007 |
|
1008 bool ok; |
|
1009 int from = cl[0].toInt(&ok, 16); |
|
1010 int to = from; |
|
1011 if (cl.size() == 2) |
|
1012 to = cl[1].toInt(&ok, 16); |
|
1013 |
|
1014 // ### Classes XX and AI are left out and mapped to AL for now |
|
1015 QUnicodeTables::LineBreakClass lb = QUnicodeTables::LineBreak_AL; |
|
1016 QByteArray ba = l[1]; |
|
1017 |
|
1018 if (ba == "AI") lb = QUnicodeTables::LineBreak_AL; |
|
1019 else if (ba == "XX") lb = QUnicodeTables::LineBreak_AL; |
|
1020 else if (ba == "NL") lb = QUnicodeTables::LineBreak_AL; |
|
1021 else if (ba == "OP") lb = QUnicodeTables::LineBreak_OP; |
|
1022 else if (ba == "CL") lb = QUnicodeTables::LineBreak_CL; |
|
1023 else if (ba == "QU") lb = QUnicodeTables::LineBreak_QU; |
|
1024 else if (ba == "GL") lb = QUnicodeTables::LineBreak_GL; |
|
1025 else if (ba == "NS") lb = QUnicodeTables::LineBreak_NS; |
|
1026 else if (ba == "EX") lb = QUnicodeTables::LineBreak_EX; |
|
1027 else if (ba == "SY") lb = QUnicodeTables::LineBreak_SY; |
|
1028 else if (ba == "IS") lb = QUnicodeTables::LineBreak_IS; |
|
1029 else if (ba == "PR") lb = QUnicodeTables::LineBreak_PR; |
|
1030 else if (ba == "PO") lb = QUnicodeTables::LineBreak_PO; |
|
1031 else if (ba == "NU") lb = QUnicodeTables::LineBreak_NU; |
|
1032 else if (ba == "AL") lb = QUnicodeTables::LineBreak_AL; |
|
1033 else if (ba == "ID") lb = QUnicodeTables::LineBreak_ID; |
|
1034 else if (ba == "IN") lb = QUnicodeTables::LineBreak_IN; |
|
1035 else if (ba == "HY") lb = QUnicodeTables::LineBreak_HY; |
|
1036 else if (ba == "BA") lb = QUnicodeTables::LineBreak_BA; |
|
1037 else if (ba == "BB") lb = QUnicodeTables::LineBreak_BB; |
|
1038 else if (ba == "B2") lb = QUnicodeTables::LineBreak_B2; |
|
1039 else if (ba == "ZW") lb = QUnicodeTables::LineBreak_ZW; |
|
1040 else if (ba == "CM") lb = QUnicodeTables::LineBreak_CM; |
|
1041 else if (ba == "SA") lb = QUnicodeTables::LineBreak_SA; |
|
1042 else if (ba == "BK") lb = QUnicodeTables::LineBreak_BK; |
|
1043 else if (ba == "CR") lb = QUnicodeTables::LineBreak_CR; |
|
1044 else if (ba == "LF") lb = QUnicodeTables::LineBreak_LF; |
|
1045 else if (ba == "SG") lb = QUnicodeTables::LineBreak_SG; |
|
1046 else if (ba == "CB") lb = QUnicodeTables::LineBreak_AL; |
|
1047 else if (ba == "SP") lb = QUnicodeTables::LineBreak_SP; |
|
1048 else if (ba == "WJ") lb = QUnicodeTables::LineBreak_WJ; |
|
1049 else if (ba == "H2") lb = QUnicodeTables::LineBreak_H2; |
|
1050 else if (ba == "H3") lb = QUnicodeTables::LineBreak_H3; |
|
1051 else if (ba == "JL") lb = QUnicodeTables::LineBreak_JL; |
|
1052 else if (ba == "JV") lb = QUnicodeTables::LineBreak_JV; |
|
1053 else if (ba == "JT") lb = QUnicodeTables::LineBreak_JT; |
|
1054 else { |
|
1055 qDebug() << "unhandled line break class:" << ba; |
|
1056 } |
|
1057 |
|
1058 for (int codepoint = from; codepoint <= to; ++codepoint) { |
|
1059 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); |
|
1060 d.p.line_break_class = lb; |
|
1061 unicodeData.insert(codepoint, d); |
|
1062 } |
|
1063 } |
|
1064 } |
|
1065 |
|
1066 |
|
1067 static void readSpecialCasing() |
|
1068 { |
|
1069 // qDebug() << "Reading SpecialCasing.txt"; |
|
1070 QFile f("data/SpecialCasing.txt"); |
|
1071 if (!f.exists()) |
|
1072 qFatal("Couldn't find SpecialCasing.txt"); |
|
1073 |
|
1074 f.open(QFile::ReadOnly); |
|
1075 |
|
1076 while (!f.atEnd()) { |
|
1077 QByteArray line; |
|
1078 line.resize(1024); |
|
1079 int len = f.readLine(line.data(), 1024); |
|
1080 line.resize(len-1); |
|
1081 |
|
1082 int comment = line.indexOf('#'); |
|
1083 if (comment >= 0) |
|
1084 line = line.left(comment); |
|
1085 |
|
1086 if (line.isEmpty()) |
|
1087 continue; |
|
1088 |
|
1089 QList<QByteArray> l = line.split(';'); |
|
1090 |
|
1091 QByteArray condition = l.size() < 5 ? QByteArray() : l[4].trimmed(); |
|
1092 if (!condition.isEmpty()) |
|
1093 // ##### |
|
1094 continue; |
|
1095 |
|
1096 bool ok; |
|
1097 int codepoint = l[0].trimmed().toInt(&ok, 16); |
|
1098 Q_ASSERT(ok); |
|
1099 Q_ASSERT(codepoint <= 0xffff); |
|
1100 |
|
1101 // qDebug() << "codepoint" << hex << codepoint; |
|
1102 // qDebug() << line; |
|
1103 |
|
1104 QList<QByteArray> lower = l[1].trimmed().split(' '); |
|
1105 QList<int> lowerMap; |
|
1106 for (int i = 0; i < lower.size(); ++i) { |
|
1107 bool ok; |
|
1108 lowerMap.append(lower.at(i).toInt(&ok, 16)); |
|
1109 Q_ASSERT(ok); |
|
1110 } |
|
1111 |
|
1112 QList<QByteArray> title = l[2].trimmed().split(' '); |
|
1113 QList<int> titleMap; |
|
1114 for (int i = 0; i < title.size(); ++i) { |
|
1115 bool ok; |
|
1116 titleMap.append(title.at(i).toInt(&ok, 16)); |
|
1117 if (!ok) |
|
1118 qDebug() << line << title.at(i); |
|
1119 Q_ASSERT(ok); |
|
1120 } |
|
1121 |
|
1122 QList<QByteArray> upper = l[3].trimmed().split(' '); |
|
1123 QList<int> upperMap; |
|
1124 for (int i = 0; i < upper.size(); ++i) { |
|
1125 bool ok; |
|
1126 upperMap.append(upper.at(i).toInt(&ok, 16)); |
|
1127 Q_ASSERT(ok); |
|
1128 } |
|
1129 |
|
1130 |
|
1131 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint)); |
|
1132 |
|
1133 Q_ASSERT(lowerMap.size() > 1 || lowerMap.at(0) == codepoint + ud.p.lowerCaseDiff); |
|
1134 Q_ASSERT(titleMap.size() > 1 || titleMap.at(0) == codepoint + ud.p.titleCaseDiff); |
|
1135 Q_ASSERT(upperMap.size() > 1 || upperMap.at(0) == codepoint + ud.p.upperCaseDiff); |
|
1136 |
|
1137 if (lowerMap.size() > 1) { |
|
1138 ud.p.lowerCaseSpecial = true; |
|
1139 ud.p.lowerCaseDiff = appendToSpecialCaseMap(lowerMap); |
|
1140 } |
|
1141 if (titleMap.size() > 1) { |
|
1142 ud.p.titleCaseSpecial = true; |
|
1143 ud.p.titleCaseDiff = appendToSpecialCaseMap(titleMap); |
|
1144 } |
|
1145 if (upperMap.size() > 1) { |
|
1146 ud.p.upperCaseSpecial = true; |
|
1147 ud.p.upperCaseDiff = appendToSpecialCaseMap(upperMap);; |
|
1148 } |
|
1149 |
|
1150 unicodeData.insert(codepoint, ud); |
|
1151 } |
|
1152 } |
|
1153 |
|
1154 int maxCaseFoldDiff = 0; |
|
1155 |
|
1156 static void readCaseFolding() |
|
1157 { |
|
1158 qDebug() << "Reading CaseFolding.txt"; |
|
1159 QFile f("data/CaseFolding.txt"); |
|
1160 if (!f.exists()) |
|
1161 qFatal("Couldn't find CaseFolding.txt"); |
|
1162 |
|
1163 f.open(QFile::ReadOnly); |
|
1164 |
|
1165 while (!f.atEnd()) { |
|
1166 QByteArray line; |
|
1167 line.resize(1024); |
|
1168 int len = f.readLine(line.data(), 1024); |
|
1169 line.resize(len-1); |
|
1170 |
|
1171 int comment = line.indexOf('#'); |
|
1172 if (comment >= 0) |
|
1173 line = line.left(comment); |
|
1174 |
|
1175 if (line.isEmpty()) |
|
1176 continue; |
|
1177 |
|
1178 QList<QByteArray> l = line.split(';'); |
|
1179 |
|
1180 bool ok; |
|
1181 uint codepoint = l[0].trimmed().toInt(&ok, 16); |
|
1182 Q_ASSERT(ok); |
|
1183 |
|
1184 |
|
1185 l[1] = l[1].trimmed(); |
|
1186 if (l[1] == "F" || l[1] == "T") |
|
1187 continue; |
|
1188 |
|
1189 // qDebug() << "codepoint" << hex << codepoint; |
|
1190 // qDebug() << line; |
|
1191 QList<QByteArray> fold = l[2].trimmed().split(' '); |
|
1192 QList<int> foldMap; |
|
1193 for (int i = 0; i < fold.size(); ++i) { |
|
1194 bool ok; |
|
1195 foldMap.append(fold.at(i).toInt(&ok, 16)); |
|
1196 Q_ASSERT(ok); |
|
1197 } |
|
1198 |
|
1199 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint)); |
|
1200 if (foldMap.size() == 1) { |
|
1201 ud.p.caseFoldDiff = foldMap.at(0) - codepoint; |
|
1202 maxCaseFoldDiff = qMax(maxCaseFoldDiff, ud.p.caseFoldDiff); |
|
1203 if (codepoint > 0xffff) { |
|
1204 // if the condition below doesn't hold anymore we need to modify our case folding code |
|
1205 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0)); |
|
1206 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(foldMap.at(0))); |
|
1207 } |
|
1208 if (foldMap.at(0) != codepoint + ud.p.lowerCaseDiff) |
|
1209 qDebug() << hex << codepoint; |
|
1210 } else { |
|
1211 Q_ASSERT(false); // we currently don't support full case foldings |
|
1212 // qDebug() << "special" << hex << foldMap; |
|
1213 ud.p.caseFoldSpecial = true; |
|
1214 ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap); |
|
1215 } |
|
1216 unicodeData.insert(codepoint, ud); |
|
1217 } |
|
1218 } |
|
1219 |
|
1220 static void readGraphemeBreak() |
|
1221 { |
|
1222 qDebug() << "Reading GraphemeBreakProperty.txt"; |
|
1223 QFile f("data/GraphemeBreakProperty.txt"); |
|
1224 if (!f.exists()) |
|
1225 qFatal("Couldn't find GraphemeBreakProperty.txt"); |
|
1226 |
|
1227 f.open(QFile::ReadOnly); |
|
1228 |
|
1229 while (!f.atEnd()) { |
|
1230 QByteArray line; |
|
1231 line.resize(1024); |
|
1232 int len = f.readLine(line.data(), 1024); |
|
1233 line.resize(len-1); |
|
1234 |
|
1235 int comment = line.indexOf('#'); |
|
1236 if (comment >= 0) |
|
1237 line = line.left(comment); |
|
1238 |
|
1239 if (line.isEmpty()) |
|
1240 continue; |
|
1241 |
|
1242 QList<QByteArray> l = line.split(';'); |
|
1243 |
|
1244 QByteArray codes = l[0].trimmed(); |
|
1245 codes.replace("..", "."); |
|
1246 QList<QByteArray> cl = codes.split('.'); |
|
1247 |
|
1248 bool ok; |
|
1249 int from = cl[0].toInt(&ok, 16); |
|
1250 Q_ASSERT(ok); |
|
1251 int to = from; |
|
1252 if (cl.size() == 2) { |
|
1253 to = cl[1].toInt(&ok, 16); |
|
1254 Q_ASSERT(ok); |
|
1255 } |
|
1256 |
|
1257 GraphemeBreak brk = grapheme_break_map.value(l[1].trimmed(), GraphemeBreakOther); |
|
1258 |
|
1259 for (int codepoint = from; codepoint <= to; ++codepoint) { |
|
1260 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint)); |
|
1261 ud.p.graphemeBreak = brk; |
|
1262 unicodeData.insert(codepoint, ud); |
|
1263 } |
|
1264 } |
|
1265 } |
|
1266 |
|
1267 static void readWordBreak() |
|
1268 { |
|
1269 qDebug() << "Reading WordBreakProperty.txt"; |
|
1270 QFile f("data/WordBreakProperty.txt"); |
|
1271 if (!f.exists()) |
|
1272 qFatal("Couldn't find WordBreakProperty.txt"); |
|
1273 |
|
1274 f.open(QFile::ReadOnly); |
|
1275 |
|
1276 while (!f.atEnd()) { |
|
1277 QByteArray line; |
|
1278 line.resize(1024); |
|
1279 int len = f.readLine(line.data(), 1024); |
|
1280 line.resize(len-1); |
|
1281 |
|
1282 int comment = line.indexOf('#'); |
|
1283 if (comment >= 0) |
|
1284 line = line.left(comment); |
|
1285 |
|
1286 if (line.isEmpty()) |
|
1287 continue; |
|
1288 |
|
1289 QList<QByteArray> l = line.split(';'); |
|
1290 |
|
1291 QByteArray codes = l[0].trimmed(); |
|
1292 codes.replace("..", "."); |
|
1293 QList<QByteArray> cl = codes.split('.'); |
|
1294 |
|
1295 bool ok; |
|
1296 int from = cl[0].toInt(&ok, 16); |
|
1297 Q_ASSERT(ok); |
|
1298 int to = from; |
|
1299 if (cl.size() == 2) { |
|
1300 to = cl[1].toInt(&ok, 16); |
|
1301 Q_ASSERT(ok); |
|
1302 } |
|
1303 |
|
1304 WordBreak brk = word_break_map.value(l[1].trimmed(), WordBreakOther); |
|
1305 Q_ASSERT(brk != WordBreakOther); |
|
1306 |
|
1307 for (int codepoint = from; codepoint <= to; ++codepoint) { |
|
1308 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint)); |
|
1309 ud.p.wordBreak = brk; |
|
1310 unicodeData.insert(codepoint, ud); |
|
1311 } |
|
1312 } |
|
1313 } |
|
1314 |
|
1315 static void readSentenceBreak() |
|
1316 { |
|
1317 qDebug() << "Reading SentenceBreakProperty.txt"; |
|
1318 QFile f("data/SentenceBreakProperty.txt"); |
|
1319 if (!f.exists()) |
|
1320 qFatal("Couldn't find SentenceBreakProperty.txt"); |
|
1321 |
|
1322 f.open(QFile::ReadOnly); |
|
1323 |
|
1324 while (!f.atEnd()) { |
|
1325 QByteArray line; |
|
1326 line.resize(1024); |
|
1327 int len = f.readLine(line.data(), 1024); |
|
1328 line.resize(len-1); |
|
1329 |
|
1330 int comment = line.indexOf('#'); |
|
1331 if (comment >= 0) |
|
1332 line = line.left(comment); |
|
1333 |
|
1334 if (line.isEmpty()) |
|
1335 continue; |
|
1336 |
|
1337 QList<QByteArray> l = line.split(';'); |
|
1338 |
|
1339 QByteArray codes = l[0].trimmed(); |
|
1340 codes.replace("..", "."); |
|
1341 QList<QByteArray> cl = codes.split('.'); |
|
1342 |
|
1343 bool ok; |
|
1344 int from = cl[0].toInt(&ok, 16); |
|
1345 Q_ASSERT(ok); |
|
1346 int to = from; |
|
1347 if (cl.size() == 2) { |
|
1348 to = cl[1].toInt(&ok, 16); |
|
1349 Q_ASSERT(ok); |
|
1350 } |
|
1351 |
|
1352 SentenceBreak brk = sentence_break_map.value(l[1].trimmed(), SentenceBreakOther); |
|
1353 Q_ASSERT(brk != SentenceBreakOther); |
|
1354 |
|
1355 for (int codepoint = from; codepoint <= to; ++codepoint) { |
|
1356 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint)); |
|
1357 ud.p.sentenceBreak = brk; |
|
1358 unicodeData.insert(codepoint, ud); |
|
1359 } |
|
1360 } |
|
1361 } |
|
1362 |
|
#if 0
// this piece of code does full case folding and comparison. We currently
// don't use it, since this gives lots of issues with things as case insensitive
// search and replace.

// Writes the case folding of ch to out as a zero terminated sequence: either
// the single unit ch + caseFoldDiff, or the zero terminated entry of
// specialCaseMap that caseFoldDiff points to.
static inline void foldCase(uint ch, ushort *out)
{
    const QUnicodeTables::Properties *p = qGetProp(ch);
    if (!p->caseFoldSpecial) {
        *(out++) = ch + p->caseFoldDiff;
    } else {
        const ushort *folded = specialCaseMap + p->caseFoldDiff;
        while (*folded)
            *out++ = *folded++;
    }
    *out = 0;
}

// Compares the case folded forms of [a, ae) and [b, be).
// Returns 0 when they match, otherwise the difference of the first folded
// units that differ, or -1 / 1 when a / b ends first. A null a yields 1 and a
// null b yields -1 (unless both pointers are equal, which yields 0).
static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be)
{
    if (a == b)
        return 0;
    if (a == 0)
        return 1;
    if (b == 0)
        return -1;

    // fast path: runs as long as neither side needs a special folding
    while (a != ae && b != be) {
        const QUnicodeTables::Properties *pa = qGetProp(*a);
        const QUnicodeTables::Properties *pb = qGetProp(*b);
        if (pa->caseFoldSpecial | pb->caseFoldSpecial)
            goto special;
        int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
        if ((diff))
            return diff;
        ++a;
        ++b;
    }
    if (a == ae) {
        if (b == be)
            return 0;
        return -1;
    }
    return 1;
special:
    // slow path: fold one input unit at a time into a buffer and compare the
    // buffers unit by unit, refilling whichever one has been used up
    ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
    ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
    abuf[0] = bbuf[0] = 0;
    ushort *ap = abuf;
    ushort *bp = bbuf;
    while (1) {
        if (!*ap) {
            if (a == ae) {
                if (!*bp && b == be)
                    return 0;
                return -1;
            }
            foldCase(*(a++), abuf);
            ap = abuf;
        }
        if (!*bp) {
            if (b == be)
                return 1;
            foldCase(*(b++), bbuf);
            bp = bbuf;
        }
        if (*ap != *bp)
            return (int)*ap - (int)*bp;
        ++ap;
        ++bp;
    }
}


// Same comparison with a zero terminated string of uchar values as the second
// operand. A null a yields 1 and a null b yields -1.
static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
{
    if (a == 0)
        return 1;
    if (b == 0)
        return -1;

    // fast path: runs as long as neither side needs a special folding
    while (a != ae && *b) {
        const QUnicodeTables::Properties *pa = qGetProp(*a);
        const QUnicodeTables::Properties *pb = qGetProp((ushort)*b);
        if (pa->caseFoldSpecial | pb->caseFoldSpecial)
            goto special;
        int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
        if ((diff))
            return diff;
        ++a;
        ++b;
    }
    if (a == ae) {
        if (!*b)
            return 0;
        return -1;
    }
    return 1;

special:
    // slow path: fold one input unit at a time into a buffer and compare the
    // buffers unit by unit, refilling whichever one has been used up
    ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
    ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
    abuf[0] = bbuf[0] = 0;
    ushort *ap = abuf;
    ushort *bp = bbuf;
    while (1) {
        if (!*ap) {
            if (a == ae) {
                if (!*bp && !*b)
                    return 0;
                return -1;
            }
            foldCase(*(a++), abuf);
            ap = abuf;
        }
        if (!*bp) {
            if (!*b)
                return 1;
            foldCase(*(b++), bbuf);
            bp = bbuf;
        }
        if (*ap != *bp)
            return (int)*ap - (int)*bp;
        ++ap;
        ++bp;
    }
}
#endif
|
1491 |
|
#if 0
// Names of the blocks found in Blocks.txt, in order of first appearance.
static QList<QByteArray> blockNames;

// One range of Blocks.txt: an index into blockNames plus the inclusive code
// point range of the block.
struct BlockInfo
{
    int blockIndex;
    int firstCodePoint;
    int lastCodePoint;
};
static QList<BlockInfo> blockInfoList;

// Reads data/Blocks.txt into blockNames and blockInfoList.
// A data line has the form "<first>..<last>; <name>"; text following a '#' is
// a comment and all spaces (including those inside the name) are removed.
static void readBlocks()
{
    QFile f("data/Blocks.txt");
    if (!f.exists())
        qFatal("Couldn't find Blocks.txt");

    if (!f.open(QFile::ReadOnly))
        qFatal("Couldn't open Blocks.txt");

    while (!f.atEnd()) {
        QByteArray line = f.readLine();
        // Remove the line terminator only if it is really there, so that a
        // final line without a trailing newline keeps its last character.
        while (line.endsWith('\n') || line.endsWith('\r'))
            line.chop(1);

        int comment = line.indexOf("#");
        if (comment >= 0)
            line = line.left(comment);

        line.replace(" ", "");

        if (line.isEmpty())
            continue;

        int semicolon = line.indexOf(';');
        Q_ASSERT(semicolon >= 0);
        QByteArray codePoints = line.left(semicolon);
        QByteArray blockName = line.mid(semicolon + 1);

        int blockIndex = blockNames.indexOf(blockName);
        if (blockIndex < 0) {
            blockNames.append(blockName);
            blockIndex = blockNames.indexOf(blockName);
            Q_ASSERT(blockIndex >= 0);
        }

        int dotdot = codePoints.indexOf("..");
        Q_ASSERT(dotdot >= 0);
        bool unused;
        int first = codePoints.left(dotdot).toInt(&unused, 16);
        int last = codePoints.mid(dotdot + 2).toInt(&unused, 16);

        BlockInfo blockInfo = { blockIndex, first, last };
        blockInfoList.append(blockInfo);
    }
}
#endif
|
1546 |
|
1547 static QList<QByteArray> scriptNames; |
|
1548 static QHash<int, int> scriptAssignment; |
|
1549 static QHash<int, int> scriptHash; |
|
1550 |
|
1551 struct ExtraBlock { |
|
1552 int block; |
|
1553 QVector<int> vector; |
|
1554 }; |
|
1555 |
|
1556 static QList<ExtraBlock> extraBlockList; |
|
1557 |
|
1558 |
|
1559 static void readScripts() |
|
1560 { |
|
1561 scriptNames.append("Common"); |
|
1562 |
|
1563 static const char *files[] = { |
|
1564 "data/ScriptsInitial.txt", |
|
1565 "data/Scripts.txt", |
|
1566 "data/ScriptsCorrections.txt" |
|
1567 }; |
|
1568 enum { fileCount = sizeof(files) / sizeof(const char *) }; |
|
1569 |
|
1570 for (int i = 0; i < fileCount; ++i) { |
|
1571 QFile f(files[i]); |
|
1572 if (!f.exists()) |
|
1573 qFatal("Couldn't find %s", files[i]); |
|
1574 |
|
1575 |
|
1576 f.open(QFile::ReadOnly); |
|
1577 |
|
1578 while (!f.atEnd()) { |
|
1579 QByteArray line = f.readLine(); |
|
1580 line.resize(line.size() - 1); |
|
1581 |
|
1582 int comment = line.indexOf("#"); |
|
1583 if (comment >= 0) |
|
1584 line = line.left(comment); |
|
1585 |
|
1586 line.replace(" ", ""); |
|
1587 line.replace("_", ""); |
|
1588 |
|
1589 if (line.isEmpty()) |
|
1590 continue; |
|
1591 |
|
1592 int semicolon = line.indexOf(';'); |
|
1593 Q_ASSERT(semicolon >= 0); |
|
1594 QByteArray codePoints = line.left(semicolon); |
|
1595 QByteArray scriptName = line.mid(semicolon + 1); |
|
1596 |
|
1597 int scriptIndex = scriptNames.indexOf(scriptName); |
|
1598 if (scriptIndex < 0) { |
|
1599 scriptNames.append(scriptName); |
|
1600 scriptIndex = scriptNames.indexOf(scriptName); |
|
1601 Q_ASSERT(scriptIndex >= 0); |
|
1602 } |
|
1603 |
|
1604 int dotdot = codePoints.indexOf(".."); |
|
1605 bool unused; |
|
1606 int first = -1, last = -1; |
|
1607 if (dotdot >= 0) { |
|
1608 first = codePoints.left(dotdot).toInt(&unused, 16); |
|
1609 last = codePoints.mid(dotdot + 2).toInt(&unused, 16); |
|
1610 } else { |
|
1611 first = codePoints.toInt(&unused, 16); |
|
1612 } |
|
1613 |
|
1614 if (last != -1) { |
|
1615 for (int i = first; i <= last; ++i) |
|
1616 scriptAssignment[i] = scriptIndex; |
|
1617 } else { |
|
1618 scriptAssignment[first] = scriptIndex; |
|
1619 } |
|
1620 } |
|
1621 } |
|
1622 } |
|
1623 |
|
1624 |
|
1625 static int scriptSentinel = 0; |
|
1626 |
|
1627 QByteArray createScriptEnumDeclaration() |
|
1628 { |
|
1629 static const char *specialScripts[] = { |
|
1630 "Common", |
|
1631 "Arabic", |
|
1632 "Armenian", |
|
1633 "Bengali", |
|
1634 "Cyrillic", |
|
1635 "Devanagari", |
|
1636 "Georgian", |
|
1637 "Greek", |
|
1638 "Gujarati", |
|
1639 "Gurmukhi", |
|
1640 "Hangul", |
|
1641 "Hebrew", |
|
1642 "Kannada", |
|
1643 "Khmer", |
|
1644 "Lao", |
|
1645 "Malayalam", |
|
1646 "Myanmar", |
|
1647 "Ogham", |
|
1648 "Oriya", |
|
1649 "Runic", |
|
1650 "Sinhala", |
|
1651 "Syriac", |
|
1652 "Tamil", |
|
1653 "Telugu", |
|
1654 "Thaana", |
|
1655 "Thai", |
|
1656 "Tibetan", |
|
1657 "Inherited" |
|
1658 }; |
|
1659 const int specialScriptsCount = sizeof(specialScripts) / sizeof(const char *); |
|
1660 |
|
1661 // generate script enum |
|
1662 QByteArray declaration; |
|
1663 |
|
1664 declaration += " // See http://www.unicode.org/reports/tr24/tr24-5.html\n\n"; |
|
1665 declaration += " enum Script {\n Common"; |
|
1666 |
|
1667 int uniqueScripts = 1; // Common |
|
1668 |
|
1669 // output the ones with special processing first |
|
1670 for (int i = 1; i < scriptNames.size(); ++i) { |
|
1671 QByteArray scriptName = scriptNames.at(i); |
|
1672 // does the script require special processing? |
|
1673 bool special = false; |
|
1674 for (int s = 0; !special && s < specialScriptsCount; ++s) { |
|
1675 if (scriptName == specialScripts[s]) |
|
1676 special = true; |
|
1677 } |
|
1678 if (!special) { |
|
1679 scriptHash[i] = 0; // alias for 'Common' |
|
1680 continue; |
|
1681 } else { |
|
1682 ++uniqueScripts; |
|
1683 scriptHash[i] = i; |
|
1684 } |
|
1685 |
|
1686 declaration += ",\n "; |
|
1687 declaration += scriptName; |
|
1688 } |
|
1689 declaration += ",\n ScriptCount = Inherited"; |
|
1690 |
|
1691 // output the ones that are an alias for 'Common' |
|
1692 for (int i = 1; i < scriptNames.size(); ++i) { |
|
1693 if (scriptHash.value(i) != 0) |
|
1694 continue; |
|
1695 QByteArray scriptName = scriptNames.at(i); |
|
1696 scriptName += " = Common"; |
|
1697 declaration += ",\n "; |
|
1698 declaration += scriptName; |
|
1699 } |
|
1700 |
|
1701 declaration += "\n };\n"; |
|
1702 |
|
1703 scriptSentinel = ((uniqueScripts + 16) / 32) * 32; // a multiple of 32 |
|
1704 declaration += " enum { ScriptSentinel = "; |
|
1705 declaration += QByteArray::number(scriptSentinel); |
|
1706 declaration += " };\n\n"; |
|
1707 return declaration; |
|
1708 } |
|
1709 |
|
1710 QByteArray createScriptTableDeclaration() |
|
1711 { |
|
1712 Q_ASSERT(scriptSentinel > 0); |
|
1713 |
|
1714 QByteArray declaration; |
|
1715 |
|
1716 const int unicodeBlockCount = 512; // number of unicode blocks |
|
1717 const int unicodeBlockSize = 128; // size of each block |
|
1718 declaration = "enum { UnicodeBlockCount = "; |
|
1719 declaration += QByteArray::number(unicodeBlockCount); |
|
1720 declaration += " }; // number of unicode blocks\n"; |
|
1721 declaration += "enum { UnicodeBlockSize = "; |
|
1722 declaration += QByteArray::number(unicodeBlockSize); |
|
1723 declaration += " }; // size of each block\n\n"; |
|
1724 |
|
1725 // script table |
|
1726 declaration += "namespace QUnicodeTables {\n\nstatic const unsigned char uc_scripts[] = {\n"; |
|
1727 for (int i = 0; i < unicodeBlockCount; ++i) { |
|
1728 int block = (((i << 7) & 0xff00) | ((i & 1) * 0x80)); |
|
1729 int blockAssignment[unicodeBlockSize]; |
|
1730 for (int x = 0; x < unicodeBlockSize; ++x) { |
|
1731 int codePoint = (i << 7) | x; |
|
1732 blockAssignment[x] = scriptAssignment.value(codePoint, 0); |
|
1733 } |
|
1734 bool allTheSame = true; |
|
1735 const int originalScript = blockAssignment[0]; |
|
1736 const int script = scriptHash.value(originalScript); |
|
1737 for (int x = 1; allTheSame && x < unicodeBlockSize; ++x) { |
|
1738 const int s = scriptHash.value(blockAssignment[x]); |
|
1739 if (s != script) |
|
1740 allTheSame = false; |
|
1741 } |
|
1742 |
|
1743 if (allTheSame) { |
|
1744 declaration += " "; |
|
1745 declaration += scriptNames.value(originalScript); |
|
1746 declaration += ", /* U+"; |
|
1747 declaration += QByteArray::number(block, 16).rightJustified(4, '0'); |
|
1748 declaration += '-'; |
|
1749 declaration += |
|
1750 QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0'); |
|
1751 declaration += " */\n"; |
|
1752 } else { |
|
1753 const int value = extraBlockList.size() + scriptSentinel; |
|
1754 const int offset = |
|
1755 ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount; |
|
1756 |
|
1757 declaration += " "; |
|
1758 declaration += QByteArray::number(value); |
|
1759 declaration += ", /* U+"; |
|
1760 declaration += QByteArray::number(block, 16).rightJustified(4, '0'); |
|
1761 declaration += '-'; |
|
1762 declaration += |
|
1763 QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0'); |
|
1764 declaration += " at offset "; |
|
1765 declaration += QByteArray::number(offset); |
|
1766 declaration += " */\n"; |
|
1767 |
|
1768 ExtraBlock extraBlock; |
|
1769 extraBlock.block = block; |
|
1770 extraBlock.vector.resize(unicodeBlockSize); |
|
1771 for (int x = 0; x < unicodeBlockSize; ++x) |
|
1772 extraBlock.vector[x] = blockAssignment[x]; |
|
1773 |
|
1774 extraBlockList.append(extraBlock); |
|
1775 } |
|
1776 } |
|
1777 |
|
1778 for (int i = 0; i < extraBlockList.size(); ++i) { |
|
1779 const int value = i + scriptSentinel; |
|
1780 const int offset = |
|
1781 ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount; |
|
1782 const ExtraBlock &extraBlock = extraBlockList.at(i); |
|
1783 const int block = extraBlock.block; |
|
1784 |
|
1785 declaration += "\n\n /* U+"; |
|
1786 declaration += QByteArray::number(block, 16).rightJustified(4, '0'); |
|
1787 declaration += '-'; |
|
1788 declaration += |
|
1789 QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0'); |
|
1790 declaration += " at offset "; |
|
1791 declaration += QByteArray::number(offset); |
|
1792 declaration += " */\n "; |
|
1793 |
|
1794 for (int x = 0; x < extraBlock.vector.size(); ++x) { |
|
1795 const int o = extraBlock.vector.at(x); |
|
1796 |
|
1797 declaration += scriptNames.value(o); |
|
1798 if (x < extraBlock.vector.size() - 1 || i < extraBlockList.size() - 1) |
|
1799 declaration += ','; |
|
1800 if ((x & 7) == 7 && x < extraBlock.vector.size() - 1) |
|
1801 declaration += "\n "; |
|
1802 else |
|
1803 declaration += ' '; |
|
1804 } |
|
1805 } |
|
1806 declaration += "\n};\n\n} // namespace QUnicodeTables\n\n"; |
|
1807 |
|
1808 qDebug("createScriptTableDeclaration: table size is %d bytes", |
|
1809 unicodeBlockCount + (extraBlockList.size() * unicodeBlockSize)); |
|
1810 |
|
1811 return declaration; |
|
1812 } |
|
1813 |
|
#if 0
// Debugging aid: prints the collected data of every code point in [from, to]
// through qDebug(), with an extra line for code points that have a
// decomposition.
static void dump(int from, int to)
{
    for (int codePoint = from; codePoint <= to; ++codePoint) {
        UnicodeData entry = unicodeData.value(codePoint, UnicodeData(codePoint));
        qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d",
               codePoint, entry.p.category, entry.p.combiningClass, entry.p.direction,
               entry.otherCase, entry.mirroredChar, entry.p.joining, entry.p.age);
        if (entry.decompositionType == QChar::NoDecomposition)
            continue;
        qDebug(" decomposition: type=%d, length=%d, first=%x",
               entry.decompositionType, entry.decomposition.size(), entry.decomposition[0]);
    }
    qDebug(" ");
}
#endif
|
1829 |
|
1830 struct PropertyBlock { |
|
1831 PropertyBlock() { index = -1; } |
|
1832 int index; |
|
1833 QList<int> properties; |
|
1834 bool operator ==(const PropertyBlock &other) { return properties == other.properties; } |
|
1835 }; |
|
1836 |
|
1837 static QByteArray createPropertyInfo() |
|
1838 { |
|
1839 qDebug("createPropertyInfo:"); |
|
1840 |
|
1841 const int BMP_BLOCKSIZE=32; |
|
1842 const int BMP_SHIFT = 5; |
|
1843 const int BMP_END = 0x11000; |
|
1844 const int SMP_END = 0x110000; |
|
1845 const int SMP_BLOCKSIZE = 256; |
|
1846 const int SMP_SHIFT = 8; |
|
1847 |
|
1848 QList<PropertyBlock> blocks; |
|
1849 QList<int> blockMap; |
|
1850 |
|
1851 int used = 0; |
|
1852 |
|
1853 for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) { |
|
1854 PropertyBlock b; |
|
1855 for (int i = 0; i < BMP_BLOCKSIZE; ++i) { |
|
1856 int uc = block*BMP_BLOCKSIZE + i; |
|
1857 UnicodeData d = unicodeData.value(uc, UnicodeData(uc)); |
|
1858 b.properties.append(d.propertyIndex); |
|
1859 } |
|
1860 int index = blocks.indexOf(b); |
|
1861 if (index == -1) { |
|
1862 index = blocks.size(); |
|
1863 b.index = used; |
|
1864 used += BMP_BLOCKSIZE; |
|
1865 blocks.append(b); |
|
1866 } |
|
1867 blockMap.append(blocks.at(index).index); |
|
1868 } |
|
1869 |
|
1870 int bmp_blocks = blocks.size(); |
|
1871 Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE); |
|
1872 |
|
1873 for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) { |
|
1874 PropertyBlock b; |
|
1875 for (int i = 0; i < SMP_BLOCKSIZE; ++i) { |
|
1876 int uc = block*SMP_BLOCKSIZE + i; |
|
1877 UnicodeData d = unicodeData.value(uc, UnicodeData(uc)); |
|
1878 b.properties.append(d.propertyIndex); |
|
1879 } |
|
1880 int index = blocks.indexOf(b); |
|
1881 if (index == -1) { |
|
1882 index = blocks.size(); |
|
1883 b.index = used; |
|
1884 used += SMP_BLOCKSIZE; |
|
1885 blocks.append(b); |
|
1886 } |
|
1887 blockMap.append(blocks.at(index).index); |
|
1888 } |
|
1889 |
|
1890 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2; |
|
1891 int bmp_trie = BMP_END/BMP_BLOCKSIZE*2; |
|
1892 int bmp_mem = bmp_block_data + bmp_trie; |
|
1893 qDebug(" %d unique blocks in BMP.",blocks.size()); |
|
1894 qDebug(" block data uses: %d bytes", bmp_block_data); |
|
1895 qDebug(" trie data uses : %d bytes", bmp_trie); |
|
1896 |
|
1897 int smp_block_data = (blocks.size()- bmp_blocks)*SMP_BLOCKSIZE*2; |
|
1898 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2; |
|
1899 int smp_mem = smp_block_data + smp_trie; |
|
1900 qDebug(" %d unique blocks in SMP.",blocks.size()-bmp_blocks); |
|
1901 qDebug(" block data uses: %d bytes", smp_block_data); |
|
1902 qDebug(" trie data uses : %d bytes", smp_trie); |
|
1903 |
|
1904 qDebug("\n properties use : %d bytes", uniqueProperties.size()*20); |
|
1905 qDebug(" memory usage: %d bytes", bmp_mem+smp_mem + uniqueProperties.size()*20); |
|
1906 |
|
1907 QByteArray out; |
|
1908 out += "static const unsigned short uc_property_trie[] = {\n"; |
|
1909 |
|
1910 // first write the map |
|
1911 out += " // 0x" + QByteArray::number(BMP_END, 16); |
|
1912 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) { |
|
1913 if (!(i % 8)) { |
|
1914 if (out.endsWith(' ')) |
|
1915 out.chop(1); |
|
1916 if (!((i*BMP_BLOCKSIZE) % 0x1000)) |
|
1917 out += "\n"; |
|
1918 out += "\n "; |
|
1919 } |
|
1920 out += QByteArray::number(blockMap.at(i) + blockMap.size()); |
|
1921 out += ", "; |
|
1922 } |
|
1923 if (out.endsWith(' ')) |
|
1924 out.chop(1); |
|
1925 out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";; |
|
1926 for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) { |
|
1927 if (!(i % 8)) { |
|
1928 if (out.endsWith(' ')) |
|
1929 out.chop(1); |
|
1930 if (!(i % (0x10000/SMP_BLOCKSIZE))) |
|
1931 out += "\n"; |
|
1932 out += "\n "; |
|
1933 } |
|
1934 out += QByteArray::number(blockMap.at(i) + blockMap.size()); |
|
1935 out += ", "; |
|
1936 } |
|
1937 if (out.endsWith(' ')) |
|
1938 out.chop(1); |
|
1939 out += "\n"; |
|
1940 // write the data |
|
1941 for (int i = 0; i < blocks.size(); ++i) { |
|
1942 if (out.endsWith(' ')) |
|
1943 out.chop(1); |
|
1944 out += "\n"; |
|
1945 const PropertyBlock &b = blocks.at(i); |
|
1946 for (int j = 0; j < b.properties.size(); ++j) { |
|
1947 if (!(j % 8)) { |
|
1948 if (out.endsWith(' ')) |
|
1949 out.chop(1); |
|
1950 out += "\n "; |
|
1951 } |
|
1952 out += QByteArray::number(b.properties.at(j)); |
|
1953 out += ", "; |
|
1954 } |
|
1955 } |
|
1956 |
|
1957 // we reserve one bit more than in the assert below for the sign |
|
1958 Q_ASSERT(maxMirroredDiff < (1<<12)); |
|
1959 Q_ASSERT(maxLowerCaseDiff < (1<<14)); |
|
1960 Q_ASSERT(maxUpperCaseDiff < (1<<14)); |
|
1961 Q_ASSERT(maxTitleCaseDiff < (1<<14)); |
|
1962 Q_ASSERT(maxCaseFoldDiff < (1<<14)); |
|
1963 |
|
1964 if (out.endsWith(' ')) |
|
1965 out.chop(1); |
|
1966 out += "\n};\n\n" |
|
1967 |
|
1968 "#define GET_PROP_INDEX(ucs4) \\\n" |
|
1969 " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n" |
|
1970 " ? (uc_property_trie[uc_property_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) + |
|
1971 "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n" |
|
1972 " : (uc_property_trie[uc_property_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) + |
|
1973 ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]" |
|
1974 " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]))\n\n" |
|
1975 "#define GET_PROP_INDEX_UCS2(ucs2) \\\n" |
|
1976 "(uc_property_trie[uc_property_trie[ucs2>>" + QByteArray::number(BMP_SHIFT) + |
|
1977 "] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")])\n\n" |
|
1978 |
|
1979 |
|
1980 "static const QUnicodeTables::Properties uc_properties [] = {\n"; |
|
1981 |
|
1982 // keep in sync with the property declaration |
|
1983 for (int i = 0; i < uniqueProperties.size(); ++i) { |
|
1984 PropertyFlags p = uniqueProperties.at(i); |
|
1985 out += " { "; |
|
1986 // " ushort category : 8;\n" |
|
1987 out += QByteArray::number( p.category ); |
|
1988 out += ", "; |
|
1989 // " ushort line_break_class : 8;\n" |
|
1990 out += QByteArray::number( p.line_break_class ); |
|
1991 out += ", "; |
|
1992 // " ushort direction : 8;\n" |
|
1993 out += QByteArray::number( p.direction ); |
|
1994 out += ", "; |
|
1995 // " ushort combiningClass :8;\n" |
|
1996 out += QByteArray::number( p.combiningClass ); |
|
1997 out += ", "; |
|
1998 // " ushort joining : 2;\n" |
|
1999 out += QByteArray::number( p.joining ); |
|
2000 out += ", "; |
|
2001 // " signed short digitValue : 6;\n /* 5 needed */" |
|
2002 out += QByteArray::number( p.digitValue ); |
|
2003 out += ", "; |
|
2004 // " ushort unicodeVersion : 4;\n" |
|
2005 out += QByteArray::number( p.age ); |
|
2006 out += ", "; |
|
2007 // " ushort lowerCaseSpecial : 1;\n" |
|
2008 // " ushort upperCaseSpecial : 1;\n" |
|
2009 // " ushort titleCaseSpecial : 1;\n" |
|
2010 // " ushort caseFoldSpecial : 1;\n" |
|
2011 out += QByteArray::number( p.lowerCaseSpecial ); |
|
2012 out += ", "; |
|
2013 out += QByteArray::number( p.upperCaseSpecial ); |
|
2014 out += ", "; |
|
2015 out += QByteArray::number( p.titleCaseSpecial ); |
|
2016 out += ", "; |
|
2017 out += QByteArray::number( p.caseFoldSpecial ); |
|
2018 out += ", "; |
|
2019 // " signed short mirrorDiff : 16;\n" |
|
2020 // " signed short lowerCaseDiff : 16;\n" |
|
2021 // " signed short upperCaseDiff : 16;\n" |
|
2022 // " signed short titleCaseDiff : 16;\n" |
|
2023 // " signed short caseFoldDiff : 16;\n" |
|
2024 out += QByteArray::number( p.mirrorDiff ); |
|
2025 out += ", "; |
|
2026 out += QByteArray::number( p.lowerCaseDiff ); |
|
2027 out += ", "; |
|
2028 out += QByteArray::number( p.upperCaseDiff ); |
|
2029 out += ", "; |
|
2030 out += QByteArray::number( p.titleCaseDiff ); |
|
2031 out += ", "; |
|
2032 out += QByteArray::number( p.caseFoldDiff ); |
|
2033 out += ", "; |
|
2034 out += QByteArray::number( p.graphemeBreak ); |
|
2035 out += ", "; |
|
2036 out += QByteArray::number( p.wordBreak ); |
|
2037 out += ", "; |
|
2038 out += QByteArray::number( p.sentenceBreak ); |
|
2039 out += "},\n"; |
|
2040 } |
|
2041 out += "};\n\n"; |
|
2042 |
|
2043 out += "static inline const QUnicodeTables::Properties *qGetProp(uint ucs4)\n" |
|
2044 "{\n" |
|
2045 " int index = GET_PROP_INDEX(ucs4);\n" |
|
2046 " return uc_properties + index;\n" |
|
2047 "}\n" |
|
2048 "\n" |
|
2049 "static inline const QUnicodeTables::Properties *qGetProp(ushort ucs2)\n" |
|
2050 "{\n" |
|
2051 " int index = GET_PROP_INDEX_UCS2(ucs2);\n" |
|
2052 " return uc_properties + index;\n" |
|
2053 "}\n" |
|
2054 "\n" |
|
2055 "Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(uint ucs4)\n" |
|
2056 "{\n" |
|
2057 " int index = GET_PROP_INDEX(ucs4);\n" |
|
2058 " return uc_properties + index;\n" |
|
2059 "}\n" |
|
2060 "\n" |
|
2061 "Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(ushort ucs2)\n" |
|
2062 "{\n" |
|
2063 " int index = GET_PROP_INDEX_UCS2(ucs2);\n" |
|
2064 " return uc_properties + index;\n" |
|
2065 "}\n\n"; |
|
2066 |
|
2067 out += "#define CURRENT_VERSION "CURRENT_UNICODE_VERSION"\n\n"; |
|
2068 |
|
2069 out += "static const ushort specialCaseMap [] = {"; |
|
2070 for (int i = 0; i < specialCaseMap.size(); ++i) { |
|
2071 if (!(i % 16)) |
|
2072 out += "\n "; |
|
2073 out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i), 16); |
|
2074 if (i < specialCaseMap.size() - 1) |
|
2075 out += ","; |
|
2076 } |
|
2077 out += "\n};\n"; |
|
2078 out += "#define SPECIAL_CASE_MAX_LEN " + QByteArray::number(specialCaseMaxLen) + "\n\n"; |
|
2079 |
|
2080 qDebug() << "Special case map uses " << specialCaseMap.size()*2 << "bytes"; |
|
2081 |
|
2082 return out; |
|
2083 } |
|
2084 |
|
2085 |
|
2086 struct DecompositionBlock { |
|
2087 DecompositionBlock() { index = -1; } |
|
2088 int index; |
|
2089 QList<int> decompositionPositions; |
|
2090 bool operator ==(const DecompositionBlock &other) |
|
2091 { return decompositionPositions == other.decompositionPositions; } |
|
2092 }; |
|
2093 |
|
2094 static QByteArray createCompositionInfo() |
|
2095 { |
|
2096 qDebug("createCompositionInfo:"); |
|
2097 |
|
2098 const int BMP_BLOCKSIZE=16; |
|
2099 const int BMP_SHIFT = 4; |
|
2100 const int BMP_END = 0x3400; // start of Han |
|
2101 const int SMP_END = 0x30000; |
|
2102 const int SMP_BLOCKSIZE = 256; |
|
2103 const int SMP_SHIFT = 8; |
|
2104 |
|
2105 if(SMP_END <= highestComposedCharacter) |
|
2106 qFatal("end of table smaller than highest composed character at %x", highestComposedCharacter); |
|
2107 |
|
2108 QList<DecompositionBlock> blocks; |
|
2109 QList<int> blockMap; |
|
2110 QList<unsigned short> decompositions; |
|
2111 |
|
2112 int used = 0; |
|
2113 int tableIndex = 0; |
|
2114 |
|
2115 for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) { |
|
2116 DecompositionBlock b; |
|
2117 for (int i = 0; i < BMP_BLOCKSIZE; ++i) { |
|
2118 int uc = block*BMP_BLOCKSIZE + i; |
|
2119 UnicodeData d = unicodeData.value(uc, UnicodeData(uc)); |
|
2120 if (!d.decomposition.isEmpty()) { |
|
2121 int utf16Chars = 0; |
|
2122 for (int j = 0; j < d.decomposition.size(); ++j) |
|
2123 utf16Chars += d.decomposition.at(j) > 0x10000 ? 2 : 1; |
|
2124 decompositions.append(d.decompositionType + (utf16Chars<<8)); |
|
2125 for (int j = 0; j < d.decomposition.size(); ++j) { |
|
2126 int code = d.decomposition.at(j); |
|
2127 if (code > 0x10000) { |
|
2128 // save as surrogate pair |
|
2129 code -= 0x10000; |
|
2130 ushort high = code/0x400 + 0xd800; |
|
2131 ushort low = code%0x400 + 0xdc00; |
|
2132 decompositions.append(high); |
|
2133 decompositions.append(low); |
|
2134 } else { |
|
2135 decompositions.append(code); |
|
2136 } |
|
2137 } |
|
2138 b.decompositionPositions.append(tableIndex); |
|
2139 tableIndex += utf16Chars + 1; |
|
2140 } else { |
|
2141 b.decompositionPositions.append(0xffff); |
|
2142 } |
|
2143 } |
|
2144 int index = blocks.indexOf(b); |
|
2145 if (index == -1) { |
|
2146 index = blocks.size(); |
|
2147 b.index = used; |
|
2148 used += BMP_BLOCKSIZE; |
|
2149 blocks.append(b); |
|
2150 } |
|
2151 blockMap.append(blocks.at(index).index); |
|
2152 } |
|
2153 |
|
2154 int bmp_blocks = blocks.size(); |
|
2155 Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE); |
|
2156 |
|
2157 for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) { |
|
2158 DecompositionBlock b; |
|
2159 for (int i = 0; i < SMP_BLOCKSIZE; ++i) { |
|
2160 int uc = block*SMP_BLOCKSIZE + i; |
|
2161 UnicodeData d = unicodeData.value(uc, UnicodeData(uc)); |
|
2162 if (!d.decomposition.isEmpty()) { |
|
2163 int utf16Chars = 0; |
|
2164 for (int j = 0; j < d.decomposition.size(); ++j) |
|
2165 utf16Chars += d.decomposition.at(j) > 0x10000 ? 2 : 1; |
|
2166 decompositions.append(d.decompositionType + (utf16Chars<<8)); |
|
2167 for (int j = 0; j < d.decomposition.size(); ++j) { |
|
2168 int code = d.decomposition.at(j); |
|
2169 if (code > 0x10000) { |
|
2170 // save as surrogate pair |
|
2171 code -= 0x10000; |
|
2172 ushort high = code/0x400 + 0xd800; |
|
2173 ushort low = code%0x400 + 0xdc00; |
|
2174 decompositions.append(high); |
|
2175 decompositions.append(low); |
|
2176 } else { |
|
2177 decompositions.append(code); |
|
2178 } |
|
2179 } |
|
2180 b.decompositionPositions.append(tableIndex); |
|
2181 tableIndex += utf16Chars + 1; |
|
2182 } else { |
|
2183 b.decompositionPositions.append(0xffff); |
|
2184 } |
|
2185 } |
|
2186 int index = blocks.indexOf(b); |
|
2187 if (index == -1) { |
|
2188 index = blocks.size(); |
|
2189 b.index = used; |
|
2190 used += SMP_BLOCKSIZE; |
|
2191 blocks.append(b); |
|
2192 } |
|
2193 blockMap.append(blocks.at(index).index); |
|
2194 } |
|
2195 |
|
2196 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2; |
|
2197 int bmp_trie = BMP_END/BMP_BLOCKSIZE*2; |
|
2198 int bmp_mem = bmp_block_data + bmp_trie; |
|
2199 qDebug(" %d unique blocks in BMP.",blocks.size()); |
|
2200 qDebug(" block data uses: %d bytes", bmp_block_data); |
|
2201 qDebug(" trie data uses : %d bytes", bmp_trie); |
|
2202 qDebug(" memory usage: %d bytes", bmp_mem); |
|
2203 |
|
2204 int smp_block_data = (blocks.size()- bmp_blocks)*SMP_BLOCKSIZE*2; |
|
2205 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2; |
|
2206 int smp_mem = smp_block_data + smp_trie; |
|
2207 qDebug(" %d unique blocks in SMP.",blocks.size()-bmp_blocks); |
|
2208 qDebug(" block data uses: %d bytes", smp_block_data); |
|
2209 qDebug(" trie data uses : %d bytes", smp_trie); |
|
2210 |
|
2211 qDebug("\n decomposition table use : %d bytes", decompositions.size()*2); |
|
2212 qDebug(" memory usage: %d bytes", bmp_mem+smp_mem + decompositions.size()*2); |
|
2213 |
|
2214 QByteArray out; |
|
2215 |
|
2216 out += "static const unsigned short uc_decomposition_trie[] = {\n"; |
|
2217 |
|
2218 // first write the map |
|
2219 out += " // 0 - 0x" + QByteArray::number(BMP_END, 16); |
|
2220 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) { |
|
2221 if (!(i % 8)) { |
|
2222 if (out.endsWith(' ')) |
|
2223 out.chop(1); |
|
2224 if (!((i*BMP_BLOCKSIZE) % 0x1000)) |
|
2225 out += "\n"; |
|
2226 out += "\n "; |
|
2227 } |
|
2228 out += QByteArray::number(blockMap.at(i) + blockMap.size()); |
|
2229 out += ", "; |
|
2230 } |
|
2231 if (out.endsWith(' ')) |
|
2232 out.chop(1); |
|
2233 out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";; |
|
2234 for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) { |
|
2235 if (!(i % 8)) { |
|
2236 if (out.endsWith(' ')) |
|
2237 out.chop(1); |
|
2238 if (!(i % (0x10000/SMP_BLOCKSIZE))) |
|
2239 out += "\n"; |
|
2240 out += "\n "; |
|
2241 } |
|
2242 out += QByteArray::number(blockMap.at(i) + blockMap.size()); |
|
2243 out += ", "; |
|
2244 } |
|
2245 if (out.endsWith(' ')) |
|
2246 out.chop(1); |
|
2247 out += "\n"; |
|
2248 // write the data |
|
2249 for (int i = 0; i < blocks.size(); ++i) { |
|
2250 if (out.endsWith(' ')) |
|
2251 out.chop(1); |
|
2252 out += "\n"; |
|
2253 const DecompositionBlock &b = blocks.at(i); |
|
2254 for (int j = 0; j < b.decompositionPositions.size(); ++j) { |
|
2255 if (!(j % 8)) { |
|
2256 if (out.endsWith(' ')) |
|
2257 out.chop(1); |
|
2258 out += "\n "; |
|
2259 } |
|
2260 out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16); |
|
2261 out += ", "; |
|
2262 } |
|
2263 } |
|
2264 |
|
2265 if (out.endsWith(' ')) |
|
2266 out.chop(1); |
|
2267 out += "\n};\n\n" |
|
2268 |
|
2269 "#define GET_DECOMPOSITION_INDEX(ucs4) \\\n" |
|
2270 " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n" |
|
2271 " ? (uc_decomposition_trie[uc_decomposition_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) + |
|
2272 "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n" |
|
2273 " : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n" |
|
2274 " ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) + |
|
2275 ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]" |
|
2276 " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n" |
|
2277 " : 0xffff))\n\n" |
|
2278 |
|
2279 "static const unsigned short uc_decomposition_map[] = {\n"; |
|
2280 |
|
2281 for (int i = 0; i < decompositions.size(); ++i) { |
|
2282 if (!(i % 8)) { |
|
2283 if (out.endsWith(' ')) |
|
2284 out.chop(1); |
|
2285 out += "\n "; |
|
2286 } |
|
2287 out += "0x" + QByteArray::number(decompositions.at(i), 16); |
|
2288 out += ", "; |
|
2289 } |
|
2290 |
|
2291 if (out.endsWith(' ')) |
|
2292 out.chop(1); |
|
2293 out += "\n};\n\n"; |
|
2294 |
|
2295 return out; |
|
2296 } |
|
2297 |
|
2298 static QByteArray createLigatureInfo() |
|
2299 { |
|
2300 qDebug("createLigatureInfo: numLigatures=%d", numLigatures); |
|
2301 |
|
2302 QList<DecompositionBlock> blocks; |
|
2303 QList<int> blockMap; |
|
2304 QList<unsigned short> ligatures; |
|
2305 |
|
2306 const int BMP_BLOCKSIZE = 32; |
|
2307 const int BMP_SHIFT = 5; |
|
2308 const int BMP_END = 0x3100; |
|
2309 Q_ASSERT(highestLigature < BMP_END); |
|
2310 |
|
2311 int used = 0; |
|
2312 int tableIndex = 0; |
|
2313 |
|
2314 for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) { |
|
2315 DecompositionBlock b; |
|
2316 for (int i = 0; i < BMP_BLOCKSIZE; ++i) { |
|
2317 int uc = block*BMP_BLOCKSIZE + i; |
|
2318 QList<Ligature> l = ligatureHashes.value(uc); |
|
2319 if (!l.isEmpty()) { |
|
2320 b.decompositionPositions.append(tableIndex); |
|
2321 qSort(l); |
|
2322 |
|
2323 ligatures.append(l.size()); |
|
2324 for (int i = 0; i < l.size(); ++i) { |
|
2325 Q_ASSERT(l.at(i).u2 == uc); |
|
2326 ligatures.append(l.at(i).u1); |
|
2327 ligatures.append(l.at(i).ligature); |
|
2328 } |
|
2329 tableIndex += 2*l.size() + 1; |
|
2330 } else { |
|
2331 b.decompositionPositions.append(0xffff); |
|
2332 } |
|
2333 } |
|
2334 int index = blocks.indexOf(b); |
|
2335 if (index == -1) { |
|
2336 index = blocks.size(); |
|
2337 b.index = used; |
|
2338 used += BMP_BLOCKSIZE; |
|
2339 blocks.append(b); |
|
2340 } |
|
2341 blockMap.append(blocks.at(index).index); |
|
2342 } |
|
2343 |
|
2344 int bmp_blocks = blocks.size(); |
|
2345 Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE); |
|
2346 |
|
2347 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2; |
|
2348 int bmp_trie = BMP_END/BMP_BLOCKSIZE*2; |
|
2349 int bmp_mem = bmp_block_data + bmp_trie; |
|
2350 qDebug(" %d unique blocks in BMP.",blocks.size()); |
|
2351 qDebug(" block data uses: %d bytes", bmp_block_data); |
|
2352 qDebug(" trie data uses : %d bytes", bmp_trie); |
|
2353 qDebug(" ligature data uses : %d bytes", ligatures.size()*2); |
|
2354 qDebug(" memory usage: %d bytes", bmp_mem + ligatures.size() * 2); |
|
2355 |
|
2356 QByteArray out; |
|
2357 |
|
2358 |
|
2359 out += "static const unsigned short uc_ligature_trie[] = {\n"; |
|
2360 |
|
2361 // first write the map |
|
2362 out += " // 0 - 0x" + QByteArray::number(BMP_END, 16); |
|
2363 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) { |
|
2364 if (!(i % 8)) { |
|
2365 if (out.endsWith(' ')) |
|
2366 out.chop(1); |
|
2367 if (!((i*BMP_BLOCKSIZE) % 0x1000)) |
|
2368 out += "\n"; |
|
2369 out += "\n "; |
|
2370 } |
|
2371 out += QByteArray::number(blockMap.at(i) + blockMap.size()); |
|
2372 out += ", "; |
|
2373 } |
|
2374 if (out.endsWith(' ')) |
|
2375 out.chop(1); |
|
2376 out += "\n"; |
|
2377 // write the data |
|
2378 for (int i = 0; i < blocks.size(); ++i) { |
|
2379 if (out.endsWith(' ')) |
|
2380 out.chop(1); |
|
2381 out += "\n"; |
|
2382 const DecompositionBlock &b = blocks.at(i); |
|
2383 for (int j = 0; j < b.decompositionPositions.size(); ++j) { |
|
2384 if (!(j % 8)) { |
|
2385 if (out.endsWith(' ')) |
|
2386 out.chop(1); |
|
2387 out += "\n "; |
|
2388 } |
|
2389 out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16); |
|
2390 out += ", "; |
|
2391 } |
|
2392 } |
|
2393 if (out.endsWith(' ')) |
|
2394 out.chop(1); |
|
2395 out += "\n};\n\n" |
|
2396 |
|
2397 "#define GET_LIGATURE_INDEX(u2) " |
|
2398 "(u2 < 0x" + QByteArray::number(BMP_END, 16) + " ? " |
|
2399 "uc_ligature_trie[uc_ligature_trie[u2>>" + QByteArray::number(BMP_SHIFT) + |
|
2400 "] + (u2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")] : 0xffff);\n\n" |
|
2401 |
|
2402 "static const unsigned short uc_ligature_map [] = {\n"; |
|
2403 |
|
2404 for (int i = 0; i < ligatures.size(); ++i) { |
|
2405 if (!(i % 8)) { |
|
2406 if (out.endsWith(' ')) |
|
2407 out.chop(1); |
|
2408 out += "\n "; |
|
2409 } |
|
2410 out += "0x" + QByteArray::number(ligatures.at(i), 16); |
|
2411 out += ", "; |
|
2412 } |
|
2413 |
|
2414 if (out.endsWith(' ')) |
|
2415 out.chop(1); |
|
2416 out += "\n};\n\n"; |
|
2417 |
|
2418 return out; |
|
2419 } |
|
2420 |
|
2421 QByteArray createCasingInfo() |
|
2422 { |
|
2423 QByteArray out; |
|
2424 |
|
2425 out += "struct CasingInfo {\n" |
|
2426 " uint codePoint : 16;\n" |
|
2427 " uint flags : 8;\n" |
|
2428 " uint offset : 8;\n" |
|
2429 "};\n\n"; |
|
2430 |
|
2431 return out; |
|
2432 } |
|
2433 |
|
2434 int main(int, char **) |
|
2435 { |
|
2436 initCategoryMap(); |
|
2437 initDirectionMap(); |
|
2438 initDecompositionMap(); |
|
2439 initGraphemeBreak(); |
|
2440 initWordBreak(); |
|
2441 initSentenceBreak(); |
|
2442 |
|
2443 readUnicodeData(); |
|
2444 readBidiMirroring(); |
|
2445 readArabicShaping(); |
|
2446 readDerivedAge(); |
|
2447 readCompositionExclusion(); |
|
2448 readLineBreak(); |
|
2449 readSpecialCasing(); |
|
2450 readCaseFolding(); |
|
2451 // readBlocks(); |
|
2452 readScripts(); |
|
2453 readGraphemeBreak(); |
|
2454 readWordBreak(); |
|
2455 readSentenceBreak(); |
|
2456 |
|
2457 computeUniqueProperties(); |
|
2458 QByteArray properties = createPropertyInfo(); |
|
2459 QByteArray compositions = createCompositionInfo(); |
|
2460 QByteArray ligatures = createLigatureInfo(); |
|
2461 QByteArray normalizationCorrections = createNormalizationCorrections(); |
|
2462 QByteArray scriptEnumDeclaration = createScriptEnumDeclaration(); |
|
2463 QByteArray scriptTableDeclaration = createScriptTableDeclaration(); |
|
2464 |
|
2465 QFile f("../../src/corelib/tools/qunicodetables.cpp"); |
|
2466 f.open(QFile::WriteOnly|QFile::Truncate); |
|
2467 |
|
2468 QByteArray header = |
|
2469 "/****************************************************************************\n" |
|
2470 "**\n" |
|
2471 "** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).\n" |
|
2472 "** All rights reserved.\n" |
|
2473 "** Contact: Nokia Corporation (qt-info@nokia.com)\n" |
|
2474 "**\n" |
|
2475 "** This file is part of the QtCore module of the Qt Toolkit.\n" |
|
2476 "**\n" |
|
2477 "** $QT_BEGIN_LICENSE:LGPL$\n" |
|
2478 "** No Commercial Usage\n" |
|
2479 "** This file contains pre-release code and may not be distributed.\n" |
|
2480 "** You may use this file in accordance with the terms and conditions\n" |
|
2481 "** contained in the Technology Preview License Agreement accompanying\n" |
|
2482 "** this package.\n" |
|
2483 "**\n" |
|
2484 "** GNU Lesser General Public License Usage\n" |
|
2485 "** Alternatively, this file may be used under the terms of the GNU Lesser\n" |
|
2486 "** General Public License version 2.1 as published by the Free Software\n" |
|
2487 "** Foundation and appearing in the file LICENSE.LGPL included in the\n" |
|
2488 "** packaging of this file. Please review the following information to\n" |
|
2489 "** ensure the GNU Lesser General Public License version 2.1 requirements\n" |
|
2490 "** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.\n" |
|
2491 "**\n" |
|
2492 "** In addition, as a special exception, Nokia gives you certain additional\n" |
|
2493 "** rights. These rights are described in the Nokia Qt LGPL Exception\n" |
|
2494 "** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.\n" |
|
2495 "**\n" |
|
2496 "** If you have questions regarding the use of this file, please contact\n" |
|
2497 "** Nokia at qt-info@nokia.com.\n" |
|
2498 "**\n" |
|
2499 "**\n" |
|
2500 "**\n" |
|
2501 "**\n" |
|
2502 "**\n" |
|
2503 "**\n" |
|
2504 "**\n" |
|
2505 "**\n" |
|
2506 "** $QT_END_LICENSE$\n" |
|
2507 "**\n" |
|
2508 "****************************************************************************/\n\n" |
|
2509 |
|
2510 "/* This file is autogenerated from the Unicode 5.0 database. Do not edit */\n\n"; |
|
2511 |
|
2512 QByteArray warning = |
|
2513 "//\n" |
|
2514 "// W A R N I N G\n" |
|
2515 "// -------------\n" |
|
2516 "//\n" |
|
2517 "// This file is not part of the Qt API. It exists for the convenience\n" |
|
2518 "// of internal files. This header file may change from version to version\n" |
|
2519 "// without notice, or even be removed.\n" |
|
2520 "//\n" |
|
2521 "// We mean it.\n" |
|
2522 "//\n\n"; |
|
2523 |
|
2524 f.write(header); |
|
2525 f.write("QT_BEGIN_NAMESPACE\n\n"); |
|
2526 f.write(properties); |
|
2527 f.write(compositions); |
|
2528 f.write(ligatures); |
|
2529 f.write(normalizationCorrections); |
|
2530 f.write(scriptTableDeclaration); |
|
2531 f.write("\nQT_END_NAMESPACE\n"); |
|
2532 f.close(); |
|
2533 |
|
2534 f.setFileName("../../src/corelib/tools/qunicodetables_p.h"); |
|
2535 f.open(QFile::WriteOnly | QFile::Truncate); |
|
2536 f.write(header); |
|
2537 f.write(warning); |
|
2538 f.write("#ifndef QUNICODETABLES_P_H\n" |
|
2539 "#define QUNICODETABLES_P_H\n\n" |
|
2540 "#include <QtCore/qchar.h>\n\n" |
|
2541 "QT_BEGIN_NAMESPACE\n\n"); |
|
2542 f.write("namespace QUnicodeTables {\n"); |
|
2543 f.write(property_string); |
|
2544 f.write("\n"); |
|
2545 f.write(scriptEnumDeclaration); |
|
2546 f.write("\n"); |
|
2547 f.write(lineBreakClass); |
|
2548 f.write("\n"); |
|
2549 f.write(methods); |
|
2550 f.write("\n"); |
|
2551 f.write(grapheme_break_string); |
|
2552 f.write("\n"); |
|
2553 f.write(word_break_string); |
|
2554 f.write("\n"); |
|
2555 f.write(sentence_break_string); |
|
2556 f.write("\n}\n\n" |
|
2557 "QT_END_NAMESPACE\n\n" |
|
2558 "#endif\n"); |
|
2559 f.close(); |
|
2560 |
|
2561 qDebug() << "maxMirroredDiff = " << hex << maxMirroredDiff; |
|
2562 qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff; |
|
2563 qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff; |
|
2564 qDebug() << "maxTitleCaseDiff = " << hex << maxTitleCaseDiff; |
|
2565 qDebug() << "maxCaseFoldDiff = " << hex << maxCaseFoldDiff; |
|
2566 #if 0 |
|
2567 // dump(0, 0x7f); |
|
2568 // dump(0x620, 0x640); |
|
2569 // dump(0x10000, 0x10020); |
|
2570 // dump(0x10800, 0x10820); |
|
2571 |
|
2572 qDebug("decompositionLength used:"); |
|
2573 int totalcompositions = 0; |
|
2574 int sum = 0; |
|
2575 for (int i = 1; i < 20; ++i) { |
|
2576 qDebug(" length %d used %d times", i, decompositionLength.value(i, 0)); |
|
2577 totalcompositions += i*decompositionLength.value(i, 0); |
|
2578 sum += decompositionLength.value(i, 0); |
|
2579 } |
|
2580 qDebug(" len decomposition map %d, average length %f, num composed chars %d", |
|
2581 totalcompositions, (float)totalcompositions/(float)sum, sum); |
|
2582 qDebug("highest composed character %x", highestComposedCharacter); |
|
2583 qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature); |
|
2584 |
|
2585 qBubbleSort(ligatures); |
|
2586 for (int i = 0; i < ligatures.size(); ++i) |
|
2587 qDebug("%s", ligatures.at(i).data()); |
|
2588 |
|
2589 // qDebug("combiningClass usage:"); |
|
2590 // int numClasses = 0; |
|
2591 // for (int i = 0; i < 255; ++i) { |
|
2592 // int num = combiningClassUsage.value(i, 0); |
|
2593 // if (num) { |
|
2594 // ++numClasses; |
|
2595 // qDebug(" combiningClass %d used %d times", i, num); |
|
2596 // } |
|
2597 // } |
|
2598 // qDebug("total of %d combining classes used", numClasses); |
|
2599 |
|
2600 #endif |
|
2601 } |
|
2602 |