36 ** |
36 ** |
37 ** |
37 ** |
38 ** $QT_END_LICENSE$ |
38 ** $QT_END_LICENSE$ |
39 ** |
39 ** |
40 ****************************************************************************/ |
40 ****************************************************************************/ |
|
41 |
41 #include <qlist.h> |
42 #include <qlist.h> |
42 #include <qhash.h> |
43 #include <qhash.h> |
43 #include <qfile.h> |
44 #include <qfile.h> |
|
45 #include <qbytearray.h> |
44 #include <qstring.h> |
46 #include <qstring.h> |
45 #include <qchar.h> |
47 #include <qchar.h> |
46 #include <private/qunicodetables_p.h> |
|
47 #include <qvector.h> |
48 #include <qvector.h> |
48 #include <qdebug.h> |
49 #include <qdebug.h> |
49 |
50 #if 0 |
50 |
51 #include <private/qunicodetables_p.h> |
51 static struct AgeMap { |
52 #endif |
52 const char *age; |
53 |
53 const QChar::UnicodeVersion version; |
54 #define DATA_VERSION_S "5.0" |
54 } ageMap [] = { |
55 #define DATA_VERSION_STR "QChar::Unicode_5_0" |
55 { "1.1", QChar::Unicode_1_1 }, |
56 |
56 { "2.0", QChar::Unicode_2_0 }, |
57 #define LAST_CODEPOINT 0x10ffff |
57 { "2.1", QChar::Unicode_2_1_2 }, |
58 #define LAST_CODEPOINT_STR "0x10ffff" |
58 { "3.0", QChar::Unicode_3_0 }, |
59 |
59 { "3.1", QChar::Unicode_3_1 }, |
60 |
60 { "3.2", QChar::Unicode_3_2 }, |
61 static QHash<QByteArray, QChar::UnicodeVersion> age_map; |
61 { "4.0", QChar::Unicode_4_0 }, |
62 |
62 { "4.1", QChar::Unicode_4_1 }, |
63 static void initAgeMap() |
63 { "5.0", QChar::Unicode_5_0 }, |
64 { |
64 { 0, QChar::Unicode_Unassigned } |
65 struct AgeMap { |
65 }; |
66 const QChar::UnicodeVersion version; |
66 #define CURRENT_UNICODE_VERSION "QChar::Unicode_5_0" |
67 const char *age; |
|
68 } ageMap[] = { |
|
69 { QChar::Unicode_1_1, "1.1" }, |
|
70 { QChar::Unicode_2_0, "2.0" }, |
|
71 { QChar::Unicode_2_1_2, "2.1" }, |
|
72 { QChar::Unicode_3_0, "3.0" }, |
|
73 { QChar::Unicode_3_1, "3.1" }, |
|
74 { QChar::Unicode_3_2, "3.2" }, |
|
75 { QChar::Unicode_4_0, "4.0" }, |
|
76 { QChar::Unicode_4_1, "4.1" }, |
|
77 { QChar::Unicode_5_0, "5.0" }, |
|
78 { QChar::Unicode_Unassigned, 0 } |
|
79 }; |
|
80 AgeMap *d = ageMap; |
|
81 while (d->age) { |
|
82 age_map.insert(d->age, d->version); |
|
83 ++d; |
|
84 } |
|
85 } |
|
86 |
67 |
87 |
68 static const char *grapheme_break_string = |
88 static const char *grapheme_break_string = |
69 " enum GraphemeBreak {\n" |
89 " enum GraphemeBreak {\n" |
70 " GraphemeBreakOther, \n" |
90 " GraphemeBreakOther,\n" |
71 " GraphemeBreakCR,\n" |
91 " GraphemeBreakCR,\n" |
72 " GraphemeBreakLF,\n" |
92 " GraphemeBreakLF,\n" |
73 " GraphemeBreakControl,\n" |
93 " GraphemeBreakControl,\n" |
74 " GraphemeBreakExtend,\n" |
94 " GraphemeBreakExtend,\n" |
75 " GraphemeBreakL,\n" |
95 " GraphemeBreakL,\n" |
108 { GraphemeBreakL, "L" }, |
130 { GraphemeBreakL, "L" }, |
109 { GraphemeBreakV, "V" }, |
131 { GraphemeBreakV, "V" }, |
110 { GraphemeBreakT, "T" }, |
132 { GraphemeBreakT, "T" }, |
111 { GraphemeBreakLV, "LV" }, |
133 { GraphemeBreakLV, "LV" }, |
112 { GraphemeBreakLVT, "LVT" }, |
134 { GraphemeBreakLVT, "LVT" }, |
113 { GraphemeBreakOther, 0 } |
135 { GraphemeBreak_Unassigned, 0 } |
114 }; |
136 }; |
115 GraphemeBreakList *d = breaks; |
137 GraphemeBreakList *d = breaks; |
116 while (d->name) { |
138 while (d->name) { |
117 grapheme_break_map.insert(d->name, d->brk); |
139 grapheme_break_map.insert(d->name, d->brk); |
118 ++d; |
140 ++d; |
119 } |
141 } |
120 } |
142 } |
121 |
143 |
122 const char *word_break_string = |
144 |
|
145 static const char *word_break_string = |
123 " enum WordBreak {\n" |
146 " enum WordBreak {\n" |
124 " WordBreakOther,\n" |
147 " WordBreakOther,\n" |
125 " WordBreakFormat,\n" |
148 " WordBreakFormat,\n" |
126 " WordBreakKatakana,\n" |
149 " WordBreakKatakana,\n" |
127 " WordBreakALetter,\n" |
150 " WordBreakALetter,\n" |
216 { SentenceBreakOLetter, "OLetter" }, |
241 { SentenceBreakOLetter, "OLetter" }, |
217 { SentenceBreakNumeric, "Numeric" }, |
242 { SentenceBreakNumeric, "Numeric" }, |
218 { SentenceBreakATerm, "ATerm" }, |
243 { SentenceBreakATerm, "ATerm" }, |
219 { SentenceBreakSTerm, "STerm" }, |
244 { SentenceBreakSTerm, "STerm" }, |
220 { SentenceBreakClose, "Close" }, |
245 { SentenceBreakClose, "Close" }, |
221 { SentenceBreakOther, 0 } |
246 { SentenceBreak_Unassigned, 0 } |
222 }; |
247 }; |
223 SentenceBreakList *d = breaks; |
248 SentenceBreakList *d = breaks; |
224 while (d->name) { |
249 while (d->name) { |
225 sentence_break_map.insert(d->name, d->brk); |
250 sentence_break_map.insert(d->name, d->brk); |
226 ++d; |
251 ++d; |
227 } |
252 } |
228 } |
253 } |
229 |
254 |
230 |
255 |
231 // Keep this one in sync with the code in createPropertyInfo |
256 static const char *lineBreakClass = |
232 const char *property_string = |
|
233 " struct Properties {\n" |
|
234 " ushort category : 8;\n" |
|
235 " ushort line_break_class : 8;\n" |
|
236 " ushort direction : 8;\n" |
|
237 " ushort combiningClass :8;\n" |
|
238 " ushort joining : 2;\n" |
|
239 " signed short digitValue : 6; /* 5 needed */\n" |
|
240 " ushort unicodeVersion : 4;\n" |
|
241 " ushort lowerCaseSpecial : 1;\n" |
|
242 " ushort upperCaseSpecial : 1;\n" |
|
243 " ushort titleCaseSpecial : 1;\n" |
|
244 " ushort caseFoldSpecial : 1; /* currently unused */\n" |
|
245 " signed short mirrorDiff : 16;\n" |
|
246 " signed short lowerCaseDiff : 16;\n" |
|
247 " signed short upperCaseDiff : 16;\n" |
|
248 " signed short titleCaseDiff : 16;\n" |
|
249 " signed short caseFoldDiff : 16;\n" |
|
250 " ushort graphemeBreak : 8;\n" |
|
251 " ushort wordBreak : 8;\n" |
|
252 " ushort sentenceBreak : 8;\n" |
|
253 " };\n" |
|
254 " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n" |
|
255 " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n"; |
|
256 |
|
257 const char *lineBreakClass = |
|
258 " // see http://www.unicode.org/reports/tr14/tr14-19.html\n" |
257 " // see http://www.unicode.org/reports/tr14/tr14-19.html\n" |
259 " // we don't use the XX, AI and CB properties and map them to AL instead.\n" |
258 " // we don't use the XX, AI and CB properties and map them to AL instead.\n" |
260 " // as we don't support any EBDIC based OS'es, NL is ignored and mapped to AL as well.\n" |
259 " // as we don't support any EBDIC based OS'es, NL is ignored and mapped to AL as well.\n" |
261 " enum LineBreakClass {\n" |
260 " enum LineBreakClass {\n" |
262 " LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,\n" |
261 " LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,\n" |
266 " LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,\n" |
265 " LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,\n" |
267 " LineBreak_JT, LineBreak_SA, LineBreak_SG,\n" |
266 " LineBreak_JT, LineBreak_SA, LineBreak_SG,\n" |
268 " LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n" |
267 " LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n" |
269 " };\n\n"; |
268 " };\n\n"; |
270 |
269 |
271 const char *methods = |
270 enum LineBreakClass { |
|
271 LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS, |
|
272 LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO, |
|
273 LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY, |
|
274 LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM, |
|
275 LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV, |
|
276 LineBreak_JT, LineBreak_SA, LineBreak_SG, |
|
277 LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK |
|
278 |
|
279 , LineBreak_Unassigned |
|
280 }; |
|
281 |
|
282 static QHash<QByteArray, LineBreakClass> line_break_map; |
|
283 |
|
284 static void initLineBreak() |
|
285 { |
|
286 // ### Classes XX and AI are left out and mapped to AL for now; |
|
287 // ### Class NL is ignored and mapped to AL as well. |
|
288 struct LineBreakList { |
|
289 LineBreakClass brk; |
|
290 const char *name; |
|
291 } breaks[] = { |
|
292 { LineBreak_BK, "BK" }, |
|
293 { LineBreak_CR, "CR" }, |
|
294 { LineBreak_LF, "LF" }, |
|
295 { LineBreak_CM, "CM" }, |
|
296 { LineBreak_AL, "NL" }, |
|
297 { LineBreak_SG, "SG" }, |
|
298 { LineBreak_WJ, "WJ" }, |
|
299 { LineBreak_ZW, "ZW" }, |
|
300 { LineBreak_GL, "GL" }, |
|
301 { LineBreak_SP, "SP" }, |
|
302 { LineBreak_B2, "B2" }, |
|
303 { LineBreak_BA, "BA" }, |
|
304 { LineBreak_BB, "BB" }, |
|
305 { LineBreak_HY, "HY" }, |
|
306 { LineBreak_AL, "CB" }, // ### |
|
307 { LineBreak_CL, "CL" }, |
|
308 { LineBreak_EX, "EX" }, |
|
309 { LineBreak_IN, "IN" }, |
|
310 { LineBreak_NS, "NS" }, |
|
311 { LineBreak_OP, "OP" }, |
|
312 { LineBreak_QU, "QU" }, |
|
313 { LineBreak_IS, "IS" }, |
|
314 { LineBreak_NU, "NU" }, |
|
315 { LineBreak_PO, "PO" }, |
|
316 { LineBreak_PR, "PR" }, |
|
317 { LineBreak_SY, "SY" }, |
|
318 { LineBreak_AL, "AI" }, |
|
319 { LineBreak_AL, "AL" }, |
|
320 { LineBreak_H2, "H2" }, |
|
321 { LineBreak_H3, "H3" }, |
|
322 { LineBreak_ID, "ID" }, |
|
323 { LineBreak_JL, "JL" }, |
|
324 { LineBreak_JV, "JV" }, |
|
325 { LineBreak_JT, "JT" }, |
|
326 { LineBreak_SA, "SA" }, |
|
327 { LineBreak_AL, "XX" }, |
|
328 { LineBreak_Unassigned, 0 } |
|
329 }; |
|
330 LineBreakList *d = breaks; |
|
331 while (d->name) { |
|
332 line_break_map.insert(d->name, d->brk); |
|
333 ++d; |
|
334 } |
|
335 } |
|
336 |
|
337 |
|
338 // Keep this one in sync with the code in createPropertyInfo |
|
339 static const char *property_string = |
|
340 " struct Properties {\n" |
|
341 " ushort category : 8; /* 5 needed */\n" |
|
342 " ushort line_break_class : 8; /* 6 needed */\n" |
|
343 " ushort direction : 8; /* 5 needed */\n" |
|
344 " ushort combiningClass : 8;\n" |
|
345 " ushort joining : 2;\n" |
|
346 " signed short digitValue : 6; /* 5 needed */\n" |
|
347 " ushort unicodeVersion : 4;\n" |
|
348 " ushort lowerCaseSpecial : 1;\n" |
|
349 " ushort upperCaseSpecial : 1;\n" |
|
350 " ushort titleCaseSpecial : 1;\n" |
|
351 " ushort caseFoldSpecial : 1; /* currently unused */\n" |
|
352 " signed short mirrorDiff : 16;\n" |
|
353 " signed short lowerCaseDiff : 16;\n" |
|
354 " signed short upperCaseDiff : 16;\n" |
|
355 " signed short titleCaseDiff : 16;\n" |
|
356 " signed short caseFoldDiff : 16;\n" |
|
357 " ushort graphemeBreak : 8; /* 4 needed */\n" |
|
358 " ushort wordBreak : 8; /* 4 needed */\n" |
|
359 " ushort sentenceBreak : 8; /* 4 needed */\n" |
|
360 " };\n" |
|
361 " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n" |
|
362 " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n"; |
|
363 |
|
364 static const char *methods = |
272 " Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n" |
365 " Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n" |
273 " inline int lineBreakClass(const QChar &ch) {\n" |
366 " inline int lineBreakClass(const QChar &ch)\n" |
274 " return QUnicodeTables::lineBreakClass(ch.unicode());\n" |
367 " { return lineBreakClass(ch.unicode()); }\n" |
275 " }\n" |
|
276 "\n" |
368 "\n" |
277 " Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4);\n" |
369 " Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4);\n" |
278 " Q_CORE_EXPORT_INLINE int QT_FASTCALL script(const QChar &ch) {\n" |
370 " inline int script(const QChar &ch)\n" |
279 " return script(ch.unicode());\n" |
371 " { return script(ch.unicode()); }\n\n"; |
280 " }\n\n"; |
|
281 |
372 |
282 |
373 |
283 struct PropertyFlags { |
374 struct PropertyFlags { |
284 bool operator ==(const PropertyFlags &o) { |
375 bool operator ==(const PropertyFlags &o) { |
285 return (combiningClass == o.combiningClass |
376 return (combiningClass == o.combiningClass |
551 { QChar::Narrow, "<narrow>" }, |
645 { QChar::Narrow, "<narrow>" }, |
552 { QChar::Small, "<small>" }, |
646 { QChar::Small, "<small>" }, |
553 { QChar::Square, "<square>" }, |
647 { QChar::Square, "<square>" }, |
554 { QChar::Compat, "<compat>" }, |
648 { QChar::Compat, "<compat>" }, |
555 { QChar::Fraction, "<fraction>" }, |
649 { QChar::Fraction, "<fraction>" }, |
556 { QChar::NoDecomposition, 0 } |
650 { QChar::NoDecomposition, 0 } |
557 }; |
651 }; |
558 Dec *d = decompositions; |
652 Dec *d = decompositions; |
559 while (d->name) { |
653 while (d->name) { |
560 decompositionMap.insert(d->name, d->dec); |
654 decompositionMap.insert(d->name, d->dec); |
561 ++d; |
655 ++d; |
562 } |
656 } |
563 } |
657 } |
564 |
658 |
565 |
659 |
566 QHash<int, UnicodeData> unicodeData; |
660 static QHash<int, UnicodeData> unicodeData; |
567 QList<PropertyFlags> uniqueProperties; |
661 static QList<PropertyFlags> uniqueProperties; |
568 |
662 |
569 |
663 |
570 QHash<int, int> decompositionLength; |
664 static QHash<int, int> decompositionLength; |
571 int highestComposedCharacter = 0; |
665 static int highestComposedCharacter = 0; |
572 int numLigatures = 0; |
666 static int numLigatures = 0; |
573 int highestLigature = 0; |
667 static int highestLigature = 0; |
574 |
668 |
575 struct Ligature {ushort u1; ushort u2; ushort ligature;}; |
669 struct Ligature { |
|
670 ushort u1; |
|
671 ushort u2; |
|
672 ushort ligature; |
|
673 }; |
576 // we need them sorted after the first component for fast lookup |
674 // we need them sorted after the first component for fast lookup |
577 bool operator < (const Ligature &l1, const Ligature &l2) { |
675 bool operator < (const Ligature &l1, const Ligature &l2) |
578 return l1.u1 < l2.u1; |
676 { return l1.u1 < l2.u1; } |
579 } |
677 |
580 |
678 static QHash<ushort, QList<Ligature> > ligatureHashes; |
581 QHash<ushort, QList<Ligature> > ligatureHashes; |
679 |
582 |
680 static QHash<int, int> combiningClassUsage; |
583 QHash<int, int> combiningClassUsage; |
681 |
584 |
682 static int maxLowerCaseDiff = 0; |
585 int maxLowerCaseDiff = 0; |
683 static int maxUpperCaseDiff = 0; |
586 int maxUpperCaseDiff = 0; |
684 static int maxTitleCaseDiff = 0; |
587 int maxTitleCaseDiff = 0; |
|
588 |
685 |
589 static void readUnicodeData() |
686 static void readUnicodeData() |
590 { |
687 { |
591 QFile f("data/UnicodeData.txt"); |
688 QFile f("data/UnicodeData.txt"); |
592 if (!f.exists()) |
689 if (!f.exists()) |
607 continue; |
704 continue; |
608 |
705 |
609 QList<QByteArray> properties = line.split(';'); |
706 QList<QByteArray> properties = line.split(';'); |
610 bool ok; |
707 bool ok; |
611 int codepoint = properties[UD_Value].toInt(&ok, 16); |
708 int codepoint = properties[UD_Value].toInt(&ok, 16); |
|
709 Q_ASSERT(ok); |
|
710 Q_ASSERT(codepoint <= LAST_CODEPOINT); |
612 int lastCodepoint = codepoint; |
711 int lastCodepoint = codepoint; |
613 |
712 |
614 QByteArray name = properties[UD_Name]; |
713 QByteArray name = properties[UD_Name]; |
615 if (name.startsWith('<') && name.contains("First")) { |
714 if (name.startsWith('<') && name.contains("First")) { |
616 QByteArray nextLine; |
715 QByteArray nextLine; |
617 nextLine.resize(1024); |
716 nextLine.resize(1024); |
618 f.readLine(nextLine.data(), 1024); |
717 f.readLine(nextLine.data(), 1024); |
619 QList<QByteArray> properties = nextLine.split(';'); |
718 QList<QByteArray> properties = nextLine.split(';'); |
|
719 Q_ASSERT(properties[UD_Name].startsWith('<') && properties[UD_Name].contains("Last")); |
620 lastCodepoint = properties[UD_Value].toInt(&ok, 16); |
720 lastCodepoint = properties[UD_Value].toInt(&ok, 16); |
|
721 Q_ASSERT(ok); |
|
722 Q_ASSERT(lastCodepoint <= LAST_CODEPOINT); |
621 } |
723 } |
622 |
724 |
623 UnicodeData data(codepoint); |
725 UnicodeData data(codepoint); |
624 data.p.category = categoryMap.value(properties[UD_Category], QChar::NoCategory); |
726 data.p.category = categoryMap.value(properties[UD_Category], QChar::NoCategory); |
|
727 if (data.p.category == QChar::NoCategory) |
|
728 qFatal("unassigned char category: %s", properties[UD_Category].constData()); |
625 data.p.combiningClass = properties[UD_CombiningClass].toInt(); |
729 data.p.combiningClass = properties[UD_CombiningClass].toInt(); |
626 |
730 |
627 if (!combiningClassUsage.contains(data.p.combiningClass)) |
731 if (!combiningClassUsage.contains(data.p.combiningClass)) |
628 combiningClassUsage[data.p.combiningClass] = 1; |
732 combiningClassUsage[data.p.combiningClass] = 1; |
629 else |
733 else |
632 data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction); |
736 data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction); |
633 |
737 |
634 if (!properties[UD_UpperCase].isEmpty()) { |
738 if (!properties[UD_UpperCase].isEmpty()) { |
635 int upperCase = properties[UD_UpperCase].toInt(&ok, 16); |
739 int upperCase = properties[UD_UpperCase].toInt(&ok, 16); |
636 Q_ASSERT(ok); |
740 Q_ASSERT(ok); |
|
741 if (qAbs(upperCase - codepoint) >= (1<<14)) |
|
742 qWarning() << "upperCaseDiff exceeded (" << hex << codepoint << "->" << upperCase << ")"; |
637 data.p.upperCaseDiff = upperCase - codepoint; |
743 data.p.upperCaseDiff = upperCase - codepoint; |
638 maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(data.p.upperCaseDiff)); |
744 maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(data.p.upperCaseDiff)); |
639 if (codepoint > 0xffff) { |
745 if (codepoint > 0xffff) { |
640 // if the condition below doesn't hold anymore we need to modify our case folding code |
746 // if the condition below doesn't hold anymore we need to modify our case folding code |
641 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0)); |
747 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0)); |
642 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase)); |
748 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase)); |
643 } |
749 } |
644 } |
750 } |
645 if (!properties[UD_LowerCase].isEmpty()) { |
751 if (!properties[UD_LowerCase].isEmpty()) { |
646 int lowerCase = properties[UD_LowerCase].toInt(&ok, 16); |
752 int lowerCase = properties[UD_LowerCase].toInt(&ok, 16); |
647 Q_ASSERT (ok); |
753 Q_ASSERT(ok); |
|
754 if (qAbs(lowerCase - codepoint) >= (1<<14)) |
|
755 qWarning() << "lowerCaseDiff exceeded (" << hex << codepoint << "->" << lowerCase << ")"; |
648 data.p.lowerCaseDiff = lowerCase - codepoint; |
756 data.p.lowerCaseDiff = lowerCase - codepoint; |
649 maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(data.p.lowerCaseDiff)); |
757 maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(data.p.lowerCaseDiff)); |
650 if (codepoint > 0xffff) { |
758 if (codepoint > 0xffff) { |
651 // if the condition below doesn't hold anymore we need to modify our case folding code |
759 // if the condition below doesn't hold anymore we need to modify our case folding code |
652 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0)); |
760 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0)); |
808 codes.replace("..", "."); |
924 codes.replace("..", "."); |
809 QList<QByteArray> cl = codes.split('.'); |
925 QList<QByteArray> cl = codes.split('.'); |
810 |
926 |
811 bool ok; |
927 bool ok; |
812 int from = cl[0].toInt(&ok, 16); |
928 int from = cl[0].toInt(&ok, 16); |
|
929 Q_ASSERT(ok); |
813 int to = from; |
930 int to = from; |
814 if (cl.size() == 2) |
931 if (cl.size() == 2) { |
815 to = cl[1].toInt(&ok, 16); |
932 to = cl[1].toInt(&ok, 16); |
816 |
933 Q_ASSERT(ok); |
817 QChar::UnicodeVersion age = QChar::Unicode_Unassigned; |
934 } |
818 QByteArray ba = l[1]; |
935 |
819 AgeMap *map = ageMap; |
936 QChar::UnicodeVersion age = age_map.value(l[1].trimmed(), QChar::Unicode_Unassigned); |
820 while (map->age) { |
|
821 if (ba == map->age) { |
|
822 age = map->version; |
|
823 break; |
|
824 } |
|
825 ++map; |
|
826 } |
|
827 //qDebug() << hex << from << ".." << to << ba << age; |
937 //qDebug() << hex << from << ".." << to << ba << age; |
828 Q_ASSERT(age != QChar::Unicode_Unassigned); |
938 if (age == QChar::Unicode_Unassigned) |
|
939 qFatal("unassigned or unhandled age value: %s", l[1].constData()); |
829 |
940 |
830 for (int codepoint = from; codepoint <= to; ++codepoint) { |
941 for (int codepoint = from; codepoint <= to; ++codepoint) { |
831 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); |
942 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); |
832 d.p.age = age; |
943 d.p.age = age; |
833 unicodeData.insert(codepoint, d); |
944 unicodeData.insert(codepoint, d); |
834 } |
945 } |
835 } |
946 } |
836 } |
947 } |
837 |
948 |
838 |
949 |
839 static void readCompositionExclusion() |
950 static void readDerivedNormalizationProps() |
840 { |
951 { |
841 QFile f("data/CompositionExclusions.txt"); |
952 QFile f("data/DerivedNormalizationProps.txt"); |
842 if (!f.exists()) |
953 if (!f.exists()) |
843 qFatal("Couldn't find CompositionExclusions.txt"); |
954 qFatal("Couldn't find DerivedNormalizationProps.txt"); |
844 |
955 |
845 f.open(QFile::ReadOnly); |
956 f.open(QFile::ReadOnly); |
846 |
957 |
847 while (!f.atEnd()) { |
958 while (!f.atEnd()) { |
848 QByteArray line; |
959 QByteArray line; |
851 line.resize(len-1); |
962 line.resize(len-1); |
852 |
963 |
853 int comment = line.indexOf('#'); |
964 int comment = line.indexOf('#'); |
854 if (comment >= 0) |
965 if (comment >= 0) |
855 line = line.left(comment); |
966 line = line.left(comment); |
856 line.replace(" ", ""); |
967 |
857 |
968 if (line.trimmed().isEmpty()) |
858 if (line.isEmpty()) |
|
859 continue; |
969 continue; |
860 |
970 |
861 Q_ASSERT(!line.contains("..")); |
971 QList<QByteArray> l = line.split(';'); |
|
972 Q_ASSERT(l.size() >= 2); |
|
973 |
|
974 QByteArray propName = l[1].trimmed(); |
|
975 if (propName != "Full_Composition_Exclusion") |
|
976 // ### |
|
977 continue; |
|
978 |
|
979 QByteArray codes = l[0].trimmed(); |
|
980 codes.replace("..", "."); |
|
981 QList<QByteArray> cl = codes.split('.'); |
862 |
982 |
863 bool ok; |
983 bool ok; |
864 int codepoint = line.toInt(&ok, 16); |
984 int from = cl[0].toInt(&ok, 16); |
865 |
985 Q_ASSERT(ok); |
|
986 int to = from; |
|
987 if (cl.size() == 2) { |
|
988 to = cl[1].toInt(&ok, 16); |
|
989 Q_ASSERT(ok); |
|
990 } |
|
991 |
|
992 for (int codepoint = from; codepoint <= to; ++codepoint) { |
|
993 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); |
|
994 d.excludedComposition = true; |
|
995 unicodeData.insert(codepoint, d); |
|
996 } |
|
997 } |
|
998 |
|
999 for (int codepoint = 0; codepoint <= LAST_CODEPOINT; ++codepoint) { |
866 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); |
1000 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); |
867 d.excludedComposition = true; |
1001 if (!d.excludedComposition |
868 unicodeData.insert(codepoint, d); |
1002 && d.decompositionType == QChar::Canonical |
869 } |
1003 && d.decomposition.size() > 1) { |
870 |
1004 Q_ASSERT(d.decomposition.size() == 2); |
871 for (int i = 0; i < 0x110000; ++i) { |
1005 |
872 UnicodeData data = unicodeData.value(i, UnicodeData(i)); |
1006 uint part1 = d.decomposition.at(0); |
873 if (!data.excludedComposition |
1007 uint part2 = d.decomposition.at(1); |
874 && data.decompositionType == QChar::Canonical |
1008 |
875 && data.decomposition.size() > 1) { |
1009 // all non-starters are listed in DerivedNormalizationProps.txt |
876 Q_ASSERT(data.decomposition.size() == 2); |
1010 // and already excluded from composition |
877 |
1011 Q_ASSERT(unicodeData.value(part1, UnicodeData(part1)).p.combiningClass == 0); |
878 uint part1 = data.decomposition.at(0); |
|
879 uint part2 = data.decomposition.at(1); |
|
880 UnicodeData first = unicodeData.value(part1, UnicodeData(part1)); |
|
881 if (first.p.combiningClass != 0) |
|
882 continue; |
|
883 |
1012 |
884 ++numLigatures; |
1013 ++numLigatures; |
885 highestLigature = qMax(highestLigature, (int)part1); |
1014 highestLigature = qMax(highestLigature, (int)part1); |
886 Ligature l = {(ushort)part1, (ushort)part2, i}; |
1015 Ligature l = {(ushort)part1, (ushort)part2, codepoint}; |
887 ligatureHashes[part2].append(l); |
1016 ligatureHashes[part2].append(l); |
888 } |
1017 } |
889 } |
1018 } |
890 } |
1019 } |
|
1020 |
891 |
1021 |
892 struct NormalizationCorrection { |
1022 struct NormalizationCorrection { |
893 uint codepoint; |
1023 uint codepoint; |
894 uint mapped; |
1024 uint mapped; |
895 uint version; |
1025 uint version; |
1005 codes.replace("..", "."); |
1136 codes.replace("..", "."); |
1006 QList<QByteArray> cl = codes.split('.'); |
1137 QList<QByteArray> cl = codes.split('.'); |
1007 |
1138 |
1008 bool ok; |
1139 bool ok; |
1009 int from = cl[0].toInt(&ok, 16); |
1140 int from = cl[0].toInt(&ok, 16); |
|
1141 Q_ASSERT(ok); |
1010 int to = from; |
1142 int to = from; |
1011 if (cl.size() == 2) |
1143 if (cl.size() == 2) { |
1012 to = cl[1].toInt(&ok, 16); |
1144 to = cl[1].toInt(&ok, 16); |
1013 |
1145 Q_ASSERT(ok); |
1014 // ### Classes XX and AI are left out and mapped to AL for now |
1146 } |
1015 QUnicodeTables::LineBreakClass lb = QUnicodeTables::LineBreak_AL; |
1147 |
1016 QByteArray ba = l[1]; |
1148 LineBreakClass lb = line_break_map.value(l[1].trimmed(), LineBreak_Unassigned); |
1017 |
1149 if (lb == LineBreak_Unassigned) |
1018 if (ba == "AI") lb = QUnicodeTables::LineBreak_AL; |
1150 qFatal("unassigned line break class: %s", l[1].constData()); |
1019 else if (ba == "XX") lb = QUnicodeTables::LineBreak_AL; |
|
1020 else if (ba == "NL") lb = QUnicodeTables::LineBreak_AL; |
|
1021 else if (ba == "OP") lb = QUnicodeTables::LineBreak_OP; |
|
1022 else if (ba == "CL") lb = QUnicodeTables::LineBreak_CL; |
|
1023 else if (ba == "QU") lb = QUnicodeTables::LineBreak_QU; |
|
1024 else if (ba == "GL") lb = QUnicodeTables::LineBreak_GL; |
|
1025 else if (ba == "NS") lb = QUnicodeTables::LineBreak_NS; |
|
1026 else if (ba == "EX") lb = QUnicodeTables::LineBreak_EX; |
|
1027 else if (ba == "SY") lb = QUnicodeTables::LineBreak_SY; |
|
1028 else if (ba == "IS") lb = QUnicodeTables::LineBreak_IS; |
|
1029 else if (ba == "PR") lb = QUnicodeTables::LineBreak_PR; |
|
1030 else if (ba == "PO") lb = QUnicodeTables::LineBreak_PO; |
|
1031 else if (ba == "NU") lb = QUnicodeTables::LineBreak_NU; |
|
1032 else if (ba == "AL") lb = QUnicodeTables::LineBreak_AL; |
|
1033 else if (ba == "ID") lb = QUnicodeTables::LineBreak_ID; |
|
1034 else if (ba == "IN") lb = QUnicodeTables::LineBreak_IN; |
|
1035 else if (ba == "HY") lb = QUnicodeTables::LineBreak_HY; |
|
1036 else if (ba == "BA") lb = QUnicodeTables::LineBreak_BA; |
|
1037 else if (ba == "BB") lb = QUnicodeTables::LineBreak_BB; |
|
1038 else if (ba == "B2") lb = QUnicodeTables::LineBreak_B2; |
|
1039 else if (ba == "ZW") lb = QUnicodeTables::LineBreak_ZW; |
|
1040 else if (ba == "CM") lb = QUnicodeTables::LineBreak_CM; |
|
1041 else if (ba == "SA") lb = QUnicodeTables::LineBreak_SA; |
|
1042 else if (ba == "BK") lb = QUnicodeTables::LineBreak_BK; |
|
1043 else if (ba == "CR") lb = QUnicodeTables::LineBreak_CR; |
|
1044 else if (ba == "LF") lb = QUnicodeTables::LineBreak_LF; |
|
1045 else if (ba == "SG") lb = QUnicodeTables::LineBreak_SG; |
|
1046 else if (ba == "CB") lb = QUnicodeTables::LineBreak_AL; |
|
1047 else if (ba == "SP") lb = QUnicodeTables::LineBreak_SP; |
|
1048 else if (ba == "WJ") lb = QUnicodeTables::LineBreak_WJ; |
|
1049 else if (ba == "H2") lb = QUnicodeTables::LineBreak_H2; |
|
1050 else if (ba == "H3") lb = QUnicodeTables::LineBreak_H3; |
|
1051 else if (ba == "JL") lb = QUnicodeTables::LineBreak_JL; |
|
1052 else if (ba == "JV") lb = QUnicodeTables::LineBreak_JV; |
|
1053 else if (ba == "JT") lb = QUnicodeTables::LineBreak_JT; |
|
1054 else { |
|
1055 qDebug() << "unhandled line break class:" << ba; |
|
1056 } |
|
1057 |
1151 |
1058 for (int codepoint = from; codepoint <= to; ++codepoint) { |
1152 for (int codepoint = from; codepoint <= to; ++codepoint) { |
1059 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); |
1153 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); |
1060 d.p.line_break_class = lb; |
1154 d.p.line_break_class = lb; |
1061 unicodeData.insert(codepoint, d); |
1155 unicodeData.insert(codepoint, d); |
1196 Q_ASSERT(ok); |
1288 Q_ASSERT(ok); |
1197 } |
1289 } |
1198 |
1290 |
1199 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint)); |
1291 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint)); |
1200 if (foldMap.size() == 1) { |
1292 if (foldMap.size() == 1) { |
|
1293 if (qAbs(foldMap.at(0) - codepoint) >= (1<<14)) |
|
1294 qWarning() << "caseFoldDiff exceeded (" << hex << codepoint << "->" << foldMap.at(0) << ")"; |
1201 ud.p.caseFoldDiff = foldMap.at(0) - codepoint; |
1295 ud.p.caseFoldDiff = foldMap.at(0) - codepoint; |
1202 maxCaseFoldDiff = qMax(maxCaseFoldDiff, ud.p.caseFoldDiff); |
1296 maxCaseFoldDiff = qMax(maxCaseFoldDiff, qAbs(ud.p.caseFoldDiff)); |
1203 if (codepoint > 0xffff) { |
1297 if (codepoint > 0xffff) { |
1204 // if the condition below doesn't hold anymore we need to modify our case folding code |
1298 // if the condition below doesn't hold anymore we need to modify our case folding code |
1205 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0)); |
1299 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0)); |
1206 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(foldMap.at(0))); |
1300 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(foldMap.at(0))); |
1207 } |
1301 } |
1208 if (foldMap.at(0) != codepoint + ud.p.lowerCaseDiff) |
1302 if (foldMap.at(0) != codepoint + ud.p.lowerCaseDiff) |
1209 qDebug() << hex << codepoint; |
1303 qDebug() << hex << codepoint; |
1210 } else { |
1304 } else { |
1211 Q_ASSERT(false); // we currently don't support full case foldings |
1305 qFatal("we currently don't support full case foldings"); |
1212 // qDebug() << "special" << hex << foldMap; |
1306 // qDebug() << "special" << hex << foldMap; |
1213 ud.p.caseFoldSpecial = true; |
1307 ud.p.caseFoldSpecial = true; |
1214 ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap); |
1308 ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap); |
1215 } |
1309 } |
1216 unicodeData.insert(codepoint, ud); |
1310 unicodeData.insert(codepoint, ud); |
1659 const int specialScriptsCount = sizeof(specialScripts) / sizeof(const char *); |
1758 const int specialScriptsCount = sizeof(specialScripts) / sizeof(const char *); |
1660 |
1759 |
1661 // generate script enum |
1760 // generate script enum |
1662 QByteArray declaration; |
1761 QByteArray declaration; |
1663 |
1762 |
1664 declaration += " // See http://www.unicode.org/reports/tr24/tr24-5.html\n\n"; |
1763 declaration += " // See http://www.unicode.org/reports/tr24/tr24-5.html\n"; |
1665 declaration += " enum Script {\n Common"; |
1764 declaration += " enum Script {\n Common"; |
1666 |
1765 |
1667 int uniqueScripts = 1; // Common |
1766 int uniqueScripts = 1; // Common |
1668 |
1767 |
1669 // output the ones with special processing first |
1768 // output the ones with special processing first |
1670 for (int i = 1; i < scriptNames.size(); ++i) { |
1769 for (int i = 1; i < scriptNames.size(); ++i) { |
1671 QByteArray scriptName = scriptNames.at(i); |
1770 QByteArray scriptName = scriptNames.at(i); |
1672 // does the script require special processing? |
1771 // does the script require special processing? |
1673 bool special = false; |
1772 bool special = false; |
1674 for (int s = 0; !special && s < specialScriptsCount; ++s) { |
1773 for (int s = 0; s < specialScriptsCount; ++s) { |
1675 if (scriptName == specialScripts[s]) |
1774 if (scriptName == specialScripts[s]) { |
1676 special = true; |
1775 special = true; |
|
1776 break; |
|
1777 } |
1677 } |
1778 } |
1678 if (!special) { |
1779 if (!special) { |
1679 scriptHash[i] = 0; // alias for 'Common' |
1780 scriptHash[i] = 0; // alias for 'Common' |
1680 continue; |
1781 continue; |
1681 } else { |
1782 } else { |
1682 ++uniqueScripts; |
1783 ++uniqueScripts; |
1683 scriptHash[i] = i; |
1784 scriptHash[i] = i; |
1684 } |
1785 } |
1685 |
1786 |
1686 declaration += ",\n "; |
1787 if (scriptName != "Inherited") { |
1687 declaration += scriptName; |
1788 declaration += ",\n "; |
1688 } |
1789 declaration += scriptName; |
|
1790 } |
|
1791 } |
|
1792 declaration += ",\n Inherited"; |
1689 declaration += ",\n ScriptCount = Inherited"; |
1793 declaration += ",\n ScriptCount = Inherited"; |
1690 |
1794 |
1691 // output the ones that are an alias for 'Common' |
1795 // output the ones that are an alias for 'Common' |
1692 for (int i = 1; i < scriptNames.size(); ++i) { |
1796 for (int i = 1; i < scriptNames.size(); ++i) { |
1693 if (scriptHash.value(i) != 0) |
1797 if (scriptHash.value(i) != 0) |
1694 continue; |
1798 continue; |
1695 QByteArray scriptName = scriptNames.at(i); |
|
1696 scriptName += " = Common"; |
|
1697 declaration += ",\n "; |
1799 declaration += ",\n "; |
1698 declaration += scriptName; |
1800 declaration += scriptNames.at(i); |
|
1801 declaration += " = Common"; |
1699 } |
1802 } |
1700 |
1803 |
1701 declaration += "\n };\n"; |
1804 declaration += "\n };\n"; |
1702 |
1805 |
1703 scriptSentinel = ((uniqueScripts + 16) / 32) * 32; // a multiple of 32 |
1806 scriptSentinel = ((uniqueScripts + 16) / 32) * 32; // a multiple of 32 |
1888 } |
1992 } |
1889 |
1993 |
1890 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2; |
1994 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2; |
1891 int bmp_trie = BMP_END/BMP_BLOCKSIZE*2; |
1995 int bmp_trie = BMP_END/BMP_BLOCKSIZE*2; |
1892 int bmp_mem = bmp_block_data + bmp_trie; |
1996 int bmp_mem = bmp_block_data + bmp_trie; |
1893 qDebug(" %d unique blocks in BMP.",blocks.size()); |
1997 qDebug(" %d unique blocks in BMP.", blocks.size()); |
1894 qDebug(" block data uses: %d bytes", bmp_block_data); |
1998 qDebug(" block data uses: %d bytes", bmp_block_data); |
1895 qDebug(" trie data uses : %d bytes", bmp_trie); |
1999 qDebug(" trie data uses : %d bytes", bmp_trie); |
1896 |
2000 |
1897 int smp_block_data = (blocks.size()- bmp_blocks)*SMP_BLOCKSIZE*2; |
2001 int smp_block_data = (blocks.size() - bmp_blocks)*SMP_BLOCKSIZE*2; |
1898 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2; |
2002 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2; |
1899 int smp_mem = smp_block_data + smp_trie; |
2003 int smp_mem = smp_block_data + smp_trie; |
1900 qDebug(" %d unique blocks in SMP.",blocks.size()-bmp_blocks); |
2004 qDebug(" %d unique blocks in SMP.", blocks.size()-bmp_blocks); |
1901 qDebug(" block data uses: %d bytes", smp_block_data); |
2005 qDebug(" block data uses: %d bytes", smp_block_data); |
1902 qDebug(" trie data uses : %d bytes", smp_trie); |
2006 qDebug(" trie data uses : %d bytes", smp_trie); |
1903 |
2007 |
1904 qDebug("\n properties use : %d bytes", uniqueProperties.size()*20); |
2008 qDebug("\n properties use : %d bytes", uniqueProperties.size()*20); |
1905 qDebug(" memory usage: %d bytes", bmp_mem+smp_mem + uniqueProperties.size()*20); |
2009 qDebug(" memory usage: %d bytes", bmp_mem+smp_mem + uniqueProperties.size()*20); |
1906 |
2010 |
1907 QByteArray out; |
2011 QByteArray out; |
1908 out += "static const unsigned short uc_property_trie[] = {\n"; |
2012 out += "static const unsigned short uc_property_trie[] = {\n"; |
1909 |
2013 |
1910 // first write the map |
2014 // first write the map |
1911 out += " // 0x" + QByteArray::number(BMP_END, 16); |
2015 out += " // 0 - 0x" + QByteArray::number(BMP_END, 16); |
1912 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) { |
2016 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) { |
1913 if (!(i % 8)) { |
2017 if (!(i % 8)) { |
1914 if (out.endsWith(' ')) |
2018 if (out.endsWith(' ')) |
1915 out.chop(1); |
2019 out.chop(1); |
1916 if (!((i*BMP_BLOCKSIZE) % 0x1000)) |
2020 if (!((i*BMP_BLOCKSIZE) % 0x1000)) |
2062 "{\n" |
2166 "{\n" |
2063 " int index = GET_PROP_INDEX_UCS2(ucs2);\n" |
2167 " int index = GET_PROP_INDEX_UCS2(ucs2);\n" |
2064 " return uc_properties + index;\n" |
2168 " return uc_properties + index;\n" |
2065 "}\n\n"; |
2169 "}\n\n"; |
2066 |
2170 |
2067 out += "#define CURRENT_VERSION "CURRENT_UNICODE_VERSION"\n\n"; |
2171 out += "static const ushort specialCaseMap[] = {\n "; |
2068 |
|
2069 out += "static const ushort specialCaseMap [] = {"; |
|
2070 for (int i = 0; i < specialCaseMap.size(); ++i) { |
2172 for (int i = 0; i < specialCaseMap.size(); ++i) { |
2071 if (!(i % 16)) |
|
2072 out += "\n "; |
|
2073 out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i), 16); |
2173 out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i), 16); |
2074 if (i < specialCaseMap.size() - 1) |
2174 if (i < specialCaseMap.size() - 1) |
2075 out += ","; |
2175 out += ","; |
|
2176 if (!specialCaseMap.at(i)) |
|
2177 out += "\n "; |
2076 } |
2178 } |
2077 out += "\n};\n"; |
2179 out += "\n};\n"; |
2078 out += "#define SPECIAL_CASE_MAX_LEN " + QByteArray::number(specialCaseMaxLen) + "\n\n"; |
2180 out += "#define SPECIAL_CASE_MAX_LEN " + QByteArray::number(specialCaseMaxLen) + "\n\n"; |
2079 |
2181 |
2080 qDebug() << "Special case map uses " << specialCaseMap.size()*2 << "bytes"; |
2182 qDebug("Special case map uses : %d bytes", specialCaseMap.size()*2); |
2081 |
2183 |
2082 return out; |
2184 return out; |
2083 } |
2185 } |
2084 |
2186 |
2085 |
2187 |
2086 struct DecompositionBlock { |
2188 struct DecompositionBlock { |
2087 DecompositionBlock() { index = -1; } |
2189 DecompositionBlock() { index = -1; } |
2088 int index; |
2190 int index; |
2089 QList<int> decompositionPositions; |
2191 QList<int> decompositionPositions; |
2090 bool operator ==(const DecompositionBlock &other) |
2192 bool operator ==(const DecompositionBlock &other) |
2091 { return decompositionPositions == other.decompositionPositions; } |
2193 { return decompositionPositions == other.decompositionPositions; } |
2092 }; |
2194 }; |
2093 |
2195 |
2094 static QByteArray createCompositionInfo() |
2196 static QByteArray createCompositionInfo() |
2095 { |
2197 { |
2096 qDebug("createCompositionInfo:"); |
2198 qDebug("createCompositionInfo:"); |
2097 |
2199 |
2098 const int BMP_BLOCKSIZE=16; |
2200 const int BMP_BLOCKSIZE = 16; |
2099 const int BMP_SHIFT = 4; |
2201 const int BMP_SHIFT = 4; |
2100 const int BMP_END = 0x3400; // start of Han |
2202 const int BMP_END = 0x3400; // start of Han |
2101 const int SMP_END = 0x30000; |
2203 const int SMP_END = 0x30000; |
2102 const int SMP_BLOCKSIZE = 256; |
2204 const int SMP_BLOCKSIZE = 256; |
2103 const int SMP_SHIFT = 8; |
2205 const int SMP_SHIFT = 8; |
2194 } |
2294 } |
2195 |
2295 |
2196 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2; |
2296 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2; |
2197 int bmp_trie = BMP_END/BMP_BLOCKSIZE*2; |
2297 int bmp_trie = BMP_END/BMP_BLOCKSIZE*2; |
2198 int bmp_mem = bmp_block_data + bmp_trie; |
2298 int bmp_mem = bmp_block_data + bmp_trie; |
2199 qDebug(" %d unique blocks in BMP.",blocks.size()); |
2299 qDebug(" %d unique blocks in BMP.", blocks.size()); |
2200 qDebug(" block data uses: %d bytes", bmp_block_data); |
2300 qDebug(" block data uses: %d bytes", bmp_block_data); |
2201 qDebug(" trie data uses : %d bytes", bmp_trie); |
2301 qDebug(" trie data uses : %d bytes", bmp_trie); |
2202 qDebug(" memory usage: %d bytes", bmp_mem); |
2302 qDebug(" memory usage: %d bytes", bmp_mem); |
2203 |
2303 |
2204 int smp_block_data = (blocks.size()- bmp_blocks)*SMP_BLOCKSIZE*2; |
2304 int smp_block_data = (blocks.size() - bmp_blocks)*SMP_BLOCKSIZE*2; |
2205 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2; |
2305 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2; |
2206 int smp_mem = smp_block_data + smp_trie; |
2306 int smp_mem = smp_block_data + smp_trie; |
2207 qDebug(" %d unique blocks in SMP.",blocks.size()-bmp_blocks); |
2307 qDebug(" %d unique blocks in SMP.", blocks.size()-bmp_blocks); |
2208 qDebug(" block data uses: %d bytes", smp_block_data); |
2308 qDebug(" block data uses: %d bytes", smp_block_data); |
2209 qDebug(" trie data uses : %d bytes", smp_trie); |
2309 qDebug(" trie data uses : %d bytes", smp_trie); |
2210 |
2310 |
2211 qDebug("\n decomposition table use : %d bytes", decompositions.size()*2); |
2311 qDebug("\n decomposition table use : %d bytes", decompositions.size()*2); |
2212 qDebug(" memory usage: %d bytes", bmp_mem+smp_mem + decompositions.size()*2); |
2312 qDebug(" memory usage: %d bytes", bmp_mem+smp_mem + decompositions.size()*2); |
2431 return out; |
2531 return out; |
2432 } |
2532 } |
2433 |
2533 |
2434 int main(int, char **) |
2534 int main(int, char **) |
2435 { |
2535 { |
|
2536 initAgeMap(); |
2436 initCategoryMap(); |
2537 initCategoryMap(); |
2437 initDirectionMap(); |
2538 initDirectionMap(); |
2438 initDecompositionMap(); |
2539 initDecompositionMap(); |
2439 initGraphemeBreak(); |
2540 initGraphemeBreak(); |
2440 initWordBreak(); |
2541 initWordBreak(); |
2441 initSentenceBreak(); |
2542 initSentenceBreak(); |
2442 |
2543 initLineBreak(); |
|
2544 |
2443 readUnicodeData(); |
2545 readUnicodeData(); |
2444 readBidiMirroring(); |
2546 readBidiMirroring(); |
2445 readArabicShaping(); |
2547 readArabicShaping(); |
2446 readDerivedAge(); |
2548 readDerivedAge(); |
2447 readCompositionExclusion(); |
2549 readDerivedNormalizationProps(); |
2448 readLineBreak(); |
|
2449 readSpecialCasing(); |
2550 readSpecialCasing(); |
2450 readCaseFolding(); |
2551 readCaseFolding(); |
2451 // readBlocks(); |
2552 // readBlocks(); |
2452 readScripts(); |
2553 readScripts(); |
2453 readGraphemeBreak(); |
2554 readGraphemeBreak(); |
2454 readWordBreak(); |
2555 readWordBreak(); |
2455 readSentenceBreak(); |
2556 readSentenceBreak(); |
|
2557 readLineBreak(); |
2456 |
2558 |
2457 computeUniqueProperties(); |
2559 computeUniqueProperties(); |
2458 QByteArray properties = createPropertyInfo(); |
2560 QByteArray properties = createPropertyInfo(); |
2459 QByteArray compositions = createCompositionInfo(); |
2561 QByteArray compositions = createCompositionInfo(); |
2460 QByteArray ligatures = createLigatureInfo(); |
2562 QByteArray ligatures = createLigatureInfo(); |
2461 QByteArray normalizationCorrections = createNormalizationCorrections(); |
2563 QByteArray normalizationCorrections = createNormalizationCorrections(); |
2462 QByteArray scriptEnumDeclaration = createScriptEnumDeclaration(); |
2564 QByteArray scriptEnumDeclaration = createScriptEnumDeclaration(); |
2463 QByteArray scriptTableDeclaration = createScriptTableDeclaration(); |
2565 QByteArray scriptTableDeclaration = createScriptTableDeclaration(); |
2464 |
|
2465 QFile f("../../src/corelib/tools/qunicodetables.cpp"); |
|
2466 f.open(QFile::WriteOnly|QFile::Truncate); |
|
2467 |
2566 |
2468 QByteArray header = |
2567 QByteArray header = |
2469 "/****************************************************************************\n" |
2568 "/****************************************************************************\n" |
2470 "**\n" |
2569 "**\n" |
2471 "** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).\n" |
2570 "** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).\n" |
2519 "// without notice, or even be removed.\n" |
2619 "// without notice, or even be removed.\n" |
2520 "//\n" |
2620 "//\n" |
2521 "// We mean it.\n" |
2621 "// We mean it.\n" |
2522 "//\n\n"; |
2622 "//\n\n"; |
2523 |
2623 |
|
2624 QFile f("../../src/corelib/tools/qunicodetables.cpp"); |
|
2625 f.open(QFile::WriteOnly|QFile::Truncate); |
2524 f.write(header); |
2626 f.write(header); |
|
2627 f.write(note); |
2525 f.write("QT_BEGIN_NAMESPACE\n\n"); |
2628 f.write("QT_BEGIN_NAMESPACE\n\n"); |
2526 f.write(properties); |
2629 f.write(properties); |
2527 f.write(compositions); |
2630 f.write(compositions); |
2528 f.write(ligatures); |
2631 f.write(ligatures); |
2529 f.write(normalizationCorrections); |
2632 f.write(normalizationCorrections); |
2530 f.write(scriptTableDeclaration); |
2633 f.write(scriptTableDeclaration); |
2531 f.write("\nQT_END_NAMESPACE\n"); |
2634 f.write("QT_END_NAMESPACE\n"); |
2532 f.close(); |
2635 f.close(); |
2533 |
2636 |
2534 f.setFileName("../../src/corelib/tools/qunicodetables_p.h"); |
2637 f.setFileName("../../src/corelib/tools/qunicodetables_p.h"); |
2535 f.open(QFile::WriteOnly | QFile::Truncate); |
2638 f.open(QFile::WriteOnly | QFile::Truncate); |
2536 f.write(header); |
2639 f.write(header); |
|
2640 f.write(note); |
2537 f.write(warning); |
2641 f.write(warning); |
2538 f.write("#ifndef QUNICODETABLES_P_H\n" |
2642 f.write("#ifndef QUNICODETABLES_P_H\n" |
2539 "#define QUNICODETABLES_P_H\n\n" |
2643 "#define QUNICODETABLES_P_H\n\n" |
2540 "#include <QtCore/qchar.h>\n\n" |
2644 "#include <QtCore/qchar.h>\n\n" |
2541 "QT_BEGIN_NAMESPACE\n\n"); |
2645 "QT_BEGIN_NAMESPACE\n\n"); |
2542 f.write("namespace QUnicodeTables {\n"); |
2646 f.write("#define UNICODE_DATA_VERSION "DATA_VERSION_STR"\n\n"); |
|
2647 f.write("#define UNICODE_LAST_CODEPOINT "LAST_CODEPOINT_STR"\n\n"); |
|
2648 f.write("namespace QUnicodeTables {\n\n"); |
2543 f.write(property_string); |
2649 f.write(property_string); |
2544 f.write("\n"); |
2650 f.write("\n"); |
2545 f.write(scriptEnumDeclaration); |
2651 f.write(scriptEnumDeclaration); |
2546 f.write("\n"); |
2652 f.write("\n"); |
2547 f.write(lineBreakClass); |
2653 f.write(lineBreakClass); |
2548 f.write("\n"); |
2654 f.write("\n"); |
2549 f.write(methods); |
|
2550 f.write("\n"); |
|
2551 f.write(grapheme_break_string); |
2655 f.write(grapheme_break_string); |
2552 f.write("\n"); |
2656 f.write("\n"); |
2553 f.write(word_break_string); |
2657 f.write(word_break_string); |
2554 f.write("\n"); |
2658 f.write("\n"); |
2555 f.write(sentence_break_string); |
2659 f.write(sentence_break_string); |
2556 f.write("\n}\n\n" |
2660 f.write("\n"); |
|
2661 f.write(methods); |
|
2662 f.write("} // namespace QUnicodeTables\n\n" |
2557 "QT_END_NAMESPACE\n\n" |
2663 "QT_END_NAMESPACE\n\n" |
2558 "#endif\n"); |
2664 "#endif // QUNICODETABLES_P_H\n"); |
2559 f.close(); |
2665 f.close(); |
2560 |
2666 |
2561 qDebug() << "maxMirroredDiff = " << hex << maxMirroredDiff; |
2667 qDebug() << "maxMirroredDiff = " << hex << maxMirroredDiff; |
2562 qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff; |
2668 qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff; |
2563 qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff; |
2669 qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff; |
2576 qDebug(" length %d used %d times", i, decompositionLength.value(i, 0)); |
2682 qDebug(" length %d used %d times", i, decompositionLength.value(i, 0)); |
2577 totalcompositions += i*decompositionLength.value(i, 0); |
2683 totalcompositions += i*decompositionLength.value(i, 0); |
2578 sum += decompositionLength.value(i, 0); |
2684 sum += decompositionLength.value(i, 0); |
2579 } |
2685 } |
2580 qDebug(" len decomposition map %d, average length %f, num composed chars %d", |
2686 qDebug(" len decomposition map %d, average length %f, num composed chars %d", |
2581 totalcompositions, (float)totalcompositions/(float)sum, sum); |
2687 totalcompositions, (float)totalcompositions/(float)sum, sum); |
2582 qDebug("highest composed character %x", highestComposedCharacter); |
2688 qDebug("highest composed character %x", highestComposedCharacter); |
2583 qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature); |
2689 qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature); |
2584 |
2690 |
2585 qBubbleSort(ligatures); |
2691 qBubbleSort(ligatures); |
2586 for (int i = 0; i < ligatures.size(); ++i) |
2692 for (int i = 0; i < ligatures.size(); ++i) |