util/unicode/main.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
equal deleted inserted replaced
-1:000000000000 0:1918ee327afb
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the utils of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 #include <qlist.h>
       
    42 #include <qhash.h>
       
    43 #include <qfile.h>
       
    44 #include <qstring.h>
       
    45 #include <qchar.h>
       
    46 #include <private/qunicodetables_p.h>
       
    47 #include <qvector.h>
       
    48 #include <qdebug.h>
       
    49 
       
    50 
       
    51 static struct AgeMap {
       
    52     const char *age;
       
    53     const QChar::UnicodeVersion version;
       
    54 } ageMap [] = {
       
    55     { "1.1", QChar::Unicode_1_1 },
       
    56     { "2.0", QChar::Unicode_2_0 },
       
    57     { "2.1", QChar::Unicode_2_1_2 },
       
    58     { "3.0", QChar::Unicode_3_0 },
       
    59     { "3.1", QChar::Unicode_3_1 },
       
    60     { "3.2", QChar::Unicode_3_2 },
       
    61     { "4.0", QChar::Unicode_4_0 },
       
    62     { "4.1", QChar::Unicode_4_1 },
       
    63     { "5.0", QChar::Unicode_5_0 },
       
    64     { 0, QChar::Unicode_Unassigned }
       
    65 };
       
    66 #define CURRENT_UNICODE_VERSION "QChar::Unicode_5_0"
       
    67 
       
    68 static const char *grapheme_break_string =
       
    69     "    enum GraphemeBreak {\n"
       
    70     "        GraphemeBreakOther, \n"
       
    71     "        GraphemeBreakCR,\n"
       
    72     "        GraphemeBreakLF,\n"
       
    73     "        GraphemeBreakControl,\n"
       
    74     "        GraphemeBreakExtend,\n"
       
    75     "        GraphemeBreakL,\n"
       
    76     "        GraphemeBreakV,\n"
       
    77     "        GraphemeBreakT,\n"
       
    78     "        GraphemeBreakLV,\n"
       
    79     "        GraphemeBreakLVT\n"
       
    80     "    };\n\n";
       
    81 
       
    82 enum GraphemeBreak {
       
    83     GraphemeBreakOther,
       
    84     GraphemeBreakCR,
       
    85     GraphemeBreakLF,
       
    86     GraphemeBreakControl,
       
    87     GraphemeBreakExtend,
       
    88     GraphemeBreakL,
       
    89     GraphemeBreakV,
       
    90     GraphemeBreakT,
       
    91     GraphemeBreakLV,
       
    92     GraphemeBreakLVT
       
    93 };
       
    94 
       
    95 QHash<QByteArray, GraphemeBreak> grapheme_break_map;
       
    96 
       
    97 static void initGraphemeBreak()
       
    98 {
       
    99     struct GraphemeBreakList {
       
   100         GraphemeBreak brk;
       
   101         const char *name;
       
   102     } breaks[] = {
       
   103         { GraphemeBreakOther, "Other" },
       
   104         { GraphemeBreakCR, "CR" },
       
   105         { GraphemeBreakLF, "LF" },
       
   106         { GraphemeBreakControl, "Control" },
       
   107         { GraphemeBreakExtend, "Extend" },
       
   108         { GraphemeBreakL, "L" },
       
   109         { GraphemeBreakV, "V" },
       
   110         { GraphemeBreakT, "T" },
       
   111         { GraphemeBreakLV, "LV" },
       
   112         { GraphemeBreakLVT, "LVT" },
       
   113         { GraphemeBreakOther, 0 }
       
   114     };
       
   115     GraphemeBreakList *d = breaks;
       
   116     while (d->name) {
       
   117         grapheme_break_map.insert(d->name, d->brk);
       
   118         ++d;
       
   119     }
       
   120 }
       
   121 
       
   122 const char *word_break_string =
       
   123     "    enum WordBreak {\n"
       
   124     "        WordBreakOther,\n"
       
   125     "        WordBreakFormat,\n"
       
   126     "        WordBreakKatakana,\n"
       
   127     "        WordBreakALetter,\n"
       
   128     "        WordBreakMidLetter,\n"
       
   129     "        WordBreakMidNum,\n"
       
   130     "        WordBreakNumeric,\n"
       
   131     "        WordBreakExtendNumLet\n"
       
   132     "    };\n\n";
       
   133 
       
   134 enum WordBreak {
       
   135     WordBreakOther,
       
   136     WordBreakFormat,
       
   137     WordBreakKatakana,
       
   138     WordBreakALetter,
       
   139     WordBreakMidLetter,
       
   140     WordBreakMidNum,
       
   141     WordBreakNumeric,
       
   142     WordBreakExtendNumLet
       
   143 };
       
   144 
       
   145 
       
   146 QHash<QByteArray, WordBreak> word_break_map;
       
   147 
       
   148 static void initWordBreak()
       
   149 {
       
   150     struct WordBreakList {
       
   151         WordBreak brk;
       
   152         const char *name;
       
   153     } breaks[] = {
       
   154         { WordBreakFormat, "Format" },
       
   155         { WordBreakFormat, "Extend" }, // these are copied in from GraphemeBreakProperty.txt
       
   156         { WordBreakKatakana, "Katakana" },
       
   157         { WordBreakALetter, "ALetter" },
       
   158         { WordBreakMidLetter, "MidLetter" },
       
   159         { WordBreakMidNum, "MidNum" },
       
   160         { WordBreakNumeric, "Numeric" },
       
   161         { WordBreakExtendNumLet, "ExtendNumLet" },
       
   162         { WordBreakFormat,  0 }
       
   163     };
       
   164     WordBreakList *d = breaks;
       
   165     while (d->name) {
       
   166         word_break_map.insert(d->name, d->brk);
       
   167         ++d;
       
   168     }
       
   169 }
       
   170 
       
   171 
       
   172 static const char *sentence_break_string =
       
   173     "    enum SentenceBreak {\n"
       
   174     "        SentenceBreakOther,\n"
       
   175     "        SentenceBreakSep,\n"
       
   176     "        SentenceBreakFormat,\n"
       
   177     "        SentenceBreakSp,\n"
       
   178     "        SentenceBreakLower,\n"
       
   179     "        SentenceBreakUpper,\n"
       
   180     "        SentenceBreakOLetter,\n"
       
   181     "        SentenceBreakNumeric,\n"
       
   182     "        SentenceBreakATerm,\n"
       
   183     "        SentenceBreakSTerm,\n"
       
   184     "        SentenceBreakClose\n"
       
   185     "    };\n\n";
       
   186 
       
   187 enum SentenceBreak {
       
   188     SentenceBreakOther,
       
   189     SentenceBreakSep,
       
   190     SentenceBreakFormat,
       
   191     SentenceBreakSp,
       
   192     SentenceBreakLower,
       
   193     SentenceBreakUpper,
       
   194     SentenceBreakOLetter,
       
   195     SentenceBreakNumeric,
       
   196     SentenceBreakATerm,
       
   197     SentenceBreakSTerm,
       
   198     SentenceBreakClose
       
   199 };
       
   200 
       
   201 
       
   202 QHash<QByteArray, SentenceBreak> sentence_break_map;
       
   203 
       
   204 static void initSentenceBreak()
       
   205 {
       
   206     struct SentenceBreakList {
       
   207         SentenceBreak brk;
       
   208         const char *name;
       
   209     } breaks[] = {
       
   210         { SentenceBreakOther, "Other" },
       
   211         { SentenceBreakSep, "Sep" },
       
   212         { SentenceBreakFormat, "Format" },
       
   213         { SentenceBreakSp, "Sp" },
       
   214         { SentenceBreakLower, "Lower" },
       
   215         { SentenceBreakUpper, "Upper" },
       
   216         { SentenceBreakOLetter, "OLetter" },
       
   217         { SentenceBreakNumeric, "Numeric" },
       
   218         { SentenceBreakATerm, "ATerm" },
       
   219         { SentenceBreakSTerm, "STerm" },
       
   220         { SentenceBreakClose, "Close" },
       
   221         { SentenceBreakOther,  0 }
       
   222     };
       
   223     SentenceBreakList *d = breaks;
       
   224     while (d->name) {
       
   225         sentence_break_map.insert(d->name, d->brk);
       
   226         ++d;
       
   227     }
       
   228 }
       
   229 
       
   230 
       
   231 // Keep this one in sync with the code in createPropertyInfo
       
   232 const char *property_string =
       
   233     "    struct Properties {\n"
       
   234     "        ushort category : 8;\n"
       
   235     "        ushort line_break_class : 8;\n"
       
   236     "        ushort direction : 8;\n"
       
   237     "        ushort combiningClass :8;\n"
       
   238     "        ushort joining : 2;\n"
       
   239     "        signed short digitValue : 6; /* 5 needed */\n"
       
   240     "        ushort unicodeVersion : 4;\n"
       
   241     "        ushort lowerCaseSpecial : 1;\n"
       
   242     "        ushort upperCaseSpecial : 1;\n"
       
   243     "        ushort titleCaseSpecial : 1;\n"
       
   244     "        ushort caseFoldSpecial : 1; /* currently unused */\n"
       
   245     "        signed short mirrorDiff : 16;\n"
       
   246     "        signed short lowerCaseDiff : 16;\n"
       
   247     "        signed short upperCaseDiff : 16;\n"
       
   248     "        signed short titleCaseDiff : 16;\n"
       
   249     "        signed short caseFoldDiff : 16;\n"
       
   250     "        ushort graphemeBreak : 8;\n"
       
   251     "        ushort wordBreak : 8;\n"
       
   252     "        ushort sentenceBreak : 8;\n"
       
   253     "    };\n"
       
   254     "    Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n"
       
   255     "    Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n";
       
   256 
       
   257 const char *lineBreakClass =
       
   258     "    // see http://www.unicode.org/reports/tr14/tr14-19.html\n"
       
   259     "    // we don't use the XX, AI and CB properties and map them to AL instead.\n"
       
   260     "    // as we don't support any EBDIC based OS'es, NL is ignored and mapped to AL as well.\n"
       
   261     "    enum LineBreakClass {\n"
       
   262     "        LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,\n"
       
   263     "        LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO,\n"
       
   264     "        LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY,\n"
       
   265     "        LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM,\n"
       
   266     "        LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,\n"
       
   267     "        LineBreak_JT, LineBreak_SA, LineBreak_SG,\n"
       
   268     "        LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n"
       
   269     "    };\n\n";
       
   270 
       
   271 const char *methods =
       
   272     "    Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n"
       
   273     "    inline int lineBreakClass(const QChar &ch) {\n"
       
   274     "        return QUnicodeTables::lineBreakClass(ch.unicode());\n"
       
   275     "    }\n"
       
   276     "\n"
       
   277     "    Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4);\n"
       
   278     "    Q_CORE_EXPORT_INLINE int QT_FASTCALL script(const QChar &ch) {\n"
       
   279     "        return script(ch.unicode());\n"
       
   280     "    }\n\n";
       
   281 
       
   282 
       
   283 struct PropertyFlags {
       
   284     bool operator ==(const PropertyFlags &o) {
       
   285         return (combiningClass == o.combiningClass
       
   286                 && category == o.category
       
   287                 && direction == o.direction
       
   288                 && joining == o.joining
       
   289                 && age == o.age
       
   290                 && digitValue == o.digitValue
       
   291                 && line_break_class == o.line_break_class
       
   292                 && mirrorDiff == o.mirrorDiff
       
   293                 && lowerCaseDiff == o.lowerCaseDiff
       
   294                 && upperCaseDiff == o.upperCaseDiff
       
   295                 && titleCaseDiff == o.titleCaseDiff
       
   296                 && caseFoldDiff == o.caseFoldDiff
       
   297                 && lowerCaseSpecial == o.lowerCaseSpecial
       
   298                 && upperCaseSpecial == o.upperCaseSpecial
       
   299                 && titleCaseSpecial == o.titleCaseSpecial
       
   300                 && caseFoldSpecial == o.caseFoldSpecial
       
   301                 && graphemeBreak == o.graphemeBreak
       
   302                 && wordBreak == o.wordBreak
       
   303                 && sentenceBreak == o.sentenceBreak
       
   304             );
       
   305     }
       
   306     // from UnicodeData.txt
       
   307     uchar combiningClass : 8;
       
   308     QChar::Category category : 5;
       
   309     QChar::Direction direction : 5;
       
   310     // from ArabicShaping.txt
       
   311     QChar::Joining joining : 2;
       
   312     // from DerivedAge.txt
       
   313     QChar::UnicodeVersion age : 4;
       
   314     int digitValue;
       
   315     uint line_break_class : 5;
       
   316 
       
   317     int mirrorDiff : 16;
       
   318 
       
   319     int lowerCaseDiff;
       
   320     int upperCaseDiff;
       
   321     int titleCaseDiff;
       
   322     int caseFoldDiff;
       
   323     bool lowerCaseSpecial;
       
   324     bool upperCaseSpecial;
       
   325     bool titleCaseSpecial;
       
   326     bool caseFoldSpecial;
       
   327     GraphemeBreak graphemeBreak;
       
   328     WordBreak wordBreak;
       
   329     SentenceBreak sentenceBreak;
       
   330 };
       
   331 
       
   332 QList<int> specialCaseMap;
       
   333 int specialCaseMaxLen = 0;
       
   334 
       
   335 static int appendToSpecialCaseMap(const QList<int> &map)
       
   336 {
       
   337     QList<int> utf16map;
       
   338     for (int i = 0; i < map.size(); ++i) {
       
   339         int val = map.at(i);
       
   340         if (val > 0xffff) {
       
   341             utf16map << QChar::highSurrogate(val);
       
   342             utf16map << QChar::lowSurrogate(val);
       
   343         } else {
       
   344             utf16map << val;
       
   345         }
       
   346     }
       
   347     specialCaseMaxLen = qMax(specialCaseMaxLen, utf16map.size());
       
   348     utf16map << 0;
       
   349 
       
   350     for (int i = 0; i < specialCaseMap.size() - utf16map.size() - 1; ++i) {
       
   351         int j;
       
   352         for (j = 0; j < utf16map.size(); ++j) {
       
   353             if (specialCaseMap.at(i+j) != utf16map.at(j))
       
   354                 break;
       
   355         }
       
   356         if (j == utf16map.size())
       
   357             return i;
       
   358     }
       
   359 
       
   360     int pos = specialCaseMap.size();
       
   361     specialCaseMap << utf16map;
       
   362     return pos;
       
   363 }
       
   364 
       
   365 struct UnicodeData {
       
   366     UnicodeData(int codepoint = 0) {
       
   367         p.category = QChar::NoCategory;
       
   368         p.combiningClass = 0;
       
   369 
       
   370         p.direction = QChar::DirL;
       
   371         // DirR for:  U+0590..U+05FF, U+07C0..U+08FF, U+FB1D..U+FB4F, U+10800..U+10FFF
       
   372         if ((codepoint >= 0x590 && codepoint <= 0x5ff)
       
   373             || (codepoint >= 0x7c0 && codepoint <= 0x8ff)
       
   374             || (codepoint >= 0xfb1d && codepoint <= 0xfb4f)
       
   375             || (codepoint >= 0x10800 && codepoint <= 0x10fff))
       
   376             p.direction = QChar::DirR;
       
   377         // DirAL for: U+0600..U+07BF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFE
       
   378         if ((codepoint >= 0x600 && codepoint <= 0x7bf)
       
   379             || (codepoint >= 0xfb50 && codepoint <= 0xfdcf)
       
   380             || (codepoint >= 0xfdf0 && codepoint <= 0xfdff)
       
   381             || (codepoint >= 0xfe70 && codepoint <= 0xfefe))
       
   382             p.direction = QChar::DirAL;
       
   383 
       
   384         mirroredChar = 0;
       
   385         decompositionType = QChar::NoDecomposition;
       
   386         p.joining = QChar::OtherJoining;
       
   387         p.age = QChar::Unicode_Unassigned;
       
   388         p.mirrorDiff = 0;
       
   389         p.digitValue = -1;
       
   390         p.line_break_class = QUnicodeTables::LineBreak_AL;
       
   391         p.lowerCaseDiff = 0;
       
   392         p.upperCaseDiff = 0;
       
   393         p.titleCaseDiff = 0;
       
   394         p.caseFoldDiff = 0;
       
   395         p.lowerCaseSpecial = 0;
       
   396         p.upperCaseSpecial = 0;
       
   397         p.titleCaseSpecial = 0;
       
   398         p.caseFoldSpecial = 0;
       
   399         p.graphemeBreak = GraphemeBreakOther;
       
   400         p.wordBreak = WordBreakOther;
       
   401         p.sentenceBreak = SentenceBreakOther;
       
   402         propertyIndex = -1;
       
   403         excludedComposition = false;
       
   404     }
       
   405     PropertyFlags p;
       
   406 
       
   407     // from UnicodeData.txt
       
   408     QChar::Decomposition decompositionType;
       
   409     QList<int> decomposition;
       
   410 
       
   411     QList<int> specialFolding;
       
   412 
       
   413     // from BidiMirroring.txt
       
   414     int mirroredChar;
       
   415 
       
   416     // CompositionExclusions.txt
       
   417     bool excludedComposition;
       
   418 
       
   419     // computed position of unicode property set
       
   420     int propertyIndex;
       
   421 };
       
   422 
       
   423 enum UniDataFields {
       
   424     UD_Value,
       
   425     UD_Name,
       
   426     UD_Category,
       
   427     UD_CombiningClass,
       
   428     UD_BidiCategory,
       
   429     UD_Decomposition,
       
   430     UD_DecimalDigitValue,
       
   431     UD_DigitValue,
       
   432     UD_NumericValue,
       
   433     UD_Mirrored,
       
   434     UD_OldName,
       
   435     UD_Comment,
       
   436     UD_UpperCase,
       
   437     UD_LowerCase,
       
   438     UD_TitleCase
       
   439 };
       
   440 
       
   441 QHash<QByteArray, QChar::Category> categoryMap;
       
   442 
       
   443 static void initCategoryMap()
       
   444 {
       
   445     struct Cat {
       
   446         QChar::Category cat;
       
   447         const char *name;
       
   448     } categories [] = {
       
   449         { QChar::Mark_NonSpacing,          "Mn" },
       
   450         { QChar::Mark_SpacingCombining,    "Mc" },
       
   451         { QChar::Mark_Enclosing,           "Me" },
       
   452 
       
   453         { QChar::Number_DecimalDigit,      "Nd" },
       
   454         { QChar::Number_Letter,            "Nl" },
       
   455         { QChar::Number_Other,             "No" },
       
   456 
       
   457         { QChar::Separator_Space,          "Zs" },
       
   458         { QChar::Separator_Line,           "Zl" },
       
   459         { QChar::Separator_Paragraph,      "Zp" },
       
   460 
       
   461         { QChar::Other_Control,            "Cc" },
       
   462         { QChar::Other_Format,             "Cf" },
       
   463         { QChar::Other_Surrogate,          "Cs" },
       
   464         { QChar::Other_PrivateUse,         "Co" },
       
   465         { QChar::Other_NotAssigned,        "Cn" },
       
   466 
       
   467         { QChar::Letter_Uppercase,         "Lu" },
       
   468         { QChar::Letter_Lowercase,         "Ll" },
       
   469         { QChar::Letter_Titlecase,         "Lt" },
       
   470         { QChar::Letter_Modifier,          "Lm" },
       
   471         { QChar::Letter_Other,             "Lo" },
       
   472 
       
   473         { QChar::Punctuation_Connector,    "Pc" },
       
   474         { QChar::Punctuation_Dash,         "Pd" },
       
   475         { QChar::Punctuation_Open,         "Ps" },
       
   476         { QChar::Punctuation_Close,        "Pe" },
       
   477         { QChar::Punctuation_InitialQuote, "Pi" },
       
   478         { QChar::Punctuation_FinalQuote,   "Pf" },
       
   479         { QChar::Punctuation_Other,        "Po" },
       
   480 
       
   481         { QChar::Symbol_Math,              "Sm" },
       
   482         { QChar::Symbol_Currency,          "Sc" },
       
   483         { QChar::Symbol_Modifier,          "Sk" },
       
   484         { QChar::Symbol_Other,             "So" },
       
   485         { QChar::NoCategory, 0 }
       
   486     };
       
   487     Cat *c = categories;
       
   488     while (c->cat != QChar::NoCategory) {
       
   489         categoryMap.insert(c->name, c->cat);
       
   490         ++c;
       
   491     }
       
   492 }
       
   493 
       
   494 QHash<QByteArray, QChar::Direction> directionMap;
       
   495 
       
   496 static void initDirectionMap()
       
   497 {
       
   498     struct Dir {
       
   499         QChar::Direction dir;
       
   500         const char *name;
       
   501     } directions[] = {
       
   502         { QChar::DirL, "L" },
       
   503         { QChar::DirR, "R" },
       
   504         { QChar::DirEN, "EN" },
       
   505         { QChar::DirES, "ES" },
       
   506         { QChar::DirET, "ET" },
       
   507         { QChar::DirAN, "AN" },
       
   508         { QChar::DirCS, "CS" },
       
   509         { QChar::DirB, "B" },
       
   510         { QChar::DirS, "S" },
       
   511         { QChar::DirWS, "WS" },
       
   512         { QChar::DirON, "ON" },
       
   513         { QChar::DirLRE, "LRE" },
       
   514         { QChar::DirLRO, "LRO" },
       
   515         { QChar::DirAL, "AL" },
       
   516         { QChar::DirRLE, "RLE" },
       
   517         { QChar::DirRLO, "RLO" },
       
   518         { QChar::DirPDF, "PDF" },
       
   519         { QChar::DirNSM, "NSM" },
       
   520         { QChar::DirBN, "BN" },
       
   521         { QChar::DirL, 0 }
       
   522     };
       
   523     Dir *d = directions;
       
   524     while (d->name) {
       
   525         directionMap.insert(d->name, d->dir);
       
   526         ++d;
       
   527     }
       
   528 }
       
   529 
       
   530 
       
   531 QHash<QByteArray, QChar::Decomposition> decompositionMap;
       
   532 
       
   533 static void initDecompositionMap()
       
   534 {
       
   535     struct Dec {
       
   536         QChar::Decomposition dec;
       
   537         const char *name;
       
   538     } decompositions[] = {
       
   539         { QChar::Canonical, "<canonical>" },
       
   540         { QChar::Font, "<font>" },
       
   541         { QChar::NoBreak, "<noBreak>" },
       
   542         { QChar::Initial, "<initial>" },
       
   543         { QChar::Medial, "<medial>" },
       
   544         { QChar::Final, "<final>" },
       
   545         { QChar::Isolated, "<isolated>" },
       
   546         { QChar::Circle, "<circle>" },
       
   547         { QChar::Super, "<super>" },
       
   548         { QChar::Sub, "<sub>" },
       
   549         { QChar::Vertical, "<vertical>" },
       
   550         { QChar::Wide, "<wide>" },
       
   551         { QChar::Narrow, "<narrow>" },
       
   552         { QChar::Small, "<small>" },
       
   553         { QChar::Square, "<square>" },
       
   554         { QChar::Compat, "<compat>" },
       
   555         { QChar::Fraction, "<fraction>" },
       
   556         { QChar::NoDecomposition,  0 }
       
   557     };
       
   558     Dec *d = decompositions;
       
   559     while (d->name) {
       
   560         decompositionMap.insert(d->name, d->dec);
       
   561         ++d;
       
   562     }
       
   563 }
       
   564 
       
   565 
       
   566 QHash<int, UnicodeData> unicodeData;
       
   567 QList<PropertyFlags> uniqueProperties;
       
   568 
       
   569 
       
   570 QHash<int, int> decompositionLength;
       
   571 int highestComposedCharacter = 0;
       
   572 int numLigatures = 0;
       
   573 int highestLigature = 0;
       
   574 
       
   575 struct Ligature {ushort u1; ushort u2; ushort ligature;};
       
   576 // we need them sorted after the first component for fast lookup
       
   577 bool operator < (const Ligature &l1, const Ligature &l2) {
       
   578     return l1.u1 < l2.u1;
       
   579 }
       
   580 
       
   581 QHash<ushort, QList<Ligature> > ligatureHashes;
       
   582 
       
   583 QHash<int, int> combiningClassUsage;
       
   584 
       
   585 int maxLowerCaseDiff = 0;
       
   586 int maxUpperCaseDiff = 0;
       
   587 int maxTitleCaseDiff = 0;
       
   588 
       
   589 static void readUnicodeData()
       
   590 {
       
   591     QFile f("data/UnicodeData.txt");
       
   592     if (!f.exists())
       
   593         qFatal("Couldn't find UnicodeData.txt");
       
   594 
       
   595     f.open(QFile::ReadOnly);
       
   596 
       
   597     while (!f.atEnd()) {
       
   598         QByteArray line;
       
   599         line.resize(1024);
       
   600         int len = f.readLine(line.data(), 1024);
       
   601         line.truncate(len-1);
       
   602 
       
   603         int comment = line.indexOf('#');
       
   604         if (comment >= 0)
       
   605             line = line.left(comment);
       
   606         if (line.isEmpty())
       
   607             continue;
       
   608 
       
   609         QList<QByteArray> properties = line.split(';');
       
   610         bool ok;
       
   611         int codepoint = properties[UD_Value].toInt(&ok, 16);
       
   612         int lastCodepoint = codepoint;
       
   613 
       
   614         QByteArray name = properties[UD_Name];
       
   615         if (name.startsWith('<') && name.contains("First")) {
       
   616             QByteArray nextLine;
       
   617             nextLine.resize(1024);
       
   618             f.readLine(nextLine.data(), 1024);
       
   619             QList<QByteArray> properties = nextLine.split(';');
       
   620             lastCodepoint = properties[UD_Value].toInt(&ok, 16);
       
   621         }
       
   622 
       
   623         UnicodeData data(codepoint);
       
   624         data.p.category = categoryMap.value(properties[UD_Category], QChar::NoCategory);
       
   625         data.p.combiningClass = properties[UD_CombiningClass].toInt();
       
   626 
       
   627         if (!combiningClassUsage.contains(data.p.combiningClass))
       
   628             combiningClassUsage[data.p.combiningClass] = 1;
       
   629         else
       
   630             ++combiningClassUsage[data.p.combiningClass];
       
   631 
       
   632         data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction);
       
   633 
       
   634         if (!properties[UD_UpperCase].isEmpty()) {
       
   635             int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
       
   636             Q_ASSERT(ok);
       
   637             data.p.upperCaseDiff = upperCase - codepoint;
       
   638             maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(data.p.upperCaseDiff));
       
   639             if (codepoint > 0xffff) {
       
   640                 // if the condition below doesn't hold anymore we need to modify our case folding code
       
   641                 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
       
   642                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase));
       
   643             }
       
   644         }
       
   645         if (!properties[UD_LowerCase].isEmpty()) {
       
   646             int lowerCase = properties[UD_LowerCase].toInt(&ok, 16);
       
   647             Q_ASSERT (ok);
       
   648             data.p.lowerCaseDiff = lowerCase - codepoint;
       
   649             maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(data.p.lowerCaseDiff));
       
   650             if (codepoint > 0xffff) {
       
   651                 // if the condition below doesn't hold anymore we need to modify our case folding code
       
   652                 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
       
   653                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(lowerCase));
       
   654             }
       
   655         }
       
   656         // we want toTitleCase to map to ToUpper in case we don't have any titlecase.
       
   657         if (properties[UD_TitleCase].isEmpty())
       
   658             properties[UD_TitleCase] = properties[UD_UpperCase];
       
   659         if (!properties[UD_TitleCase].isEmpty()) {
       
   660             int titleCase = properties[UD_TitleCase].toInt(&ok, 16);
       
   661             Q_ASSERT (ok);
       
   662             data.p.titleCaseDiff = titleCase - codepoint;
       
   663             maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(data.p.titleCaseDiff));
       
   664             if (codepoint > 0xffff) {
       
   665                 // if the condition below doesn't hold anymore we need to modify our case folding code
       
   666                 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
       
   667                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(titleCase));
       
   668             }
       
   669         }
       
   670 
       
   671         if (!properties[UD_DigitValue].isEmpty())
       
   672             data.p.digitValue = properties[UD_DigitValue].toInt();
       
   673 
       
   674         // decompositition
       
   675         QByteArray decomposition = properties[UD_Decomposition];
       
   676         if (!decomposition.isEmpty()) {
       
   677             highestComposedCharacter = qMax(highestComposedCharacter, codepoint);
       
   678             QList<QByteArray> d = decomposition.split(' ');
       
   679             if (d[0].contains('<')) {
       
   680                 data.decompositionType = decompositionMap.value(d[0], QChar::Canonical);
       
   681                 d.takeFirst();
       
   682             } else {
       
   683                 data.decompositionType = QChar::Canonical;
       
   684             }
       
   685             for (int i = 0; i < d.size(); ++i)
       
   686                 data.decomposition.append(d[i].toInt(&ok, 16));
       
   687             if (!decompositionLength.contains(data.decomposition.size()))
       
   688                 decompositionLength[data.decomposition.size()] = 1;
       
   689             else
       
   690                 ++decompositionLength[data.decomposition.size()];
       
   691         }
       
   692 
       
   693         for (int i = codepoint; i <= lastCodepoint; ++i)
       
   694             unicodeData.insert(i, data);
       
   695     }
       
   696 
       
   697 }
       
   698 
       
   699 static int maxMirroredDiff = 0;
       
   700 
       
   701 static void readBidiMirroring()
       
   702 {
       
   703     QFile f("data/BidiMirroring.txt");
       
   704     if (!f.exists())
       
   705         qFatal("Couldn't find BidiMirroring.txt");
       
   706 
       
   707     f.open(QFile::ReadOnly);
       
   708 
       
   709     while (!f.atEnd()) {
       
   710         QByteArray line;
       
   711         line.resize(1024);
       
   712         int len = f.readLine(line.data(), 1024);
       
   713         line.resize(len-1);
       
   714 
       
   715         int comment = line.indexOf('#');
       
   716         if (comment >= 0)
       
   717             line = line.left(comment);
       
   718 
       
   719         if (line.isEmpty())
       
   720             continue;
       
   721         line = line.replace(" ", "");
       
   722 
       
   723         QList<QByteArray> pair = line.split(';');
       
   724         Q_ASSERT(pair.size() == 2);
       
   725 
       
   726         bool ok;
       
   727         int codepoint = pair[0].toInt(&ok, 16);
       
   728         int mirror = pair[1].toInt(&ok, 16);
       
   729 
       
   730         UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
       
   731         d.mirroredChar = mirror;
       
   732         if (qAbs(codepoint-d.mirroredChar) > maxMirroredDiff)
       
   733             maxMirroredDiff = qAbs(codepoint - d.mirroredChar);
       
   734 
       
   735         d.p.mirrorDiff = d.mirroredChar - codepoint;
       
   736         unicodeData.insert(codepoint, d);
       
   737     }
       
   738 }
       
   739 
       
   740 static void readArabicShaping()
       
   741 {
       
   742     QFile f("data/ArabicShaping.txt");
       
   743     if (!f.exists())
       
   744         qFatal("Couldn't find ArabicShaping.txt");
       
   745 
       
   746     f.open(QFile::ReadOnly);
       
   747 
       
   748     while (!f.atEnd()) {
       
   749         QByteArray line;
       
   750         line.resize(1024);
       
   751         int len = f.readLine(line.data(), 1024);
       
   752         line.resize(len-1);
       
   753 
       
   754         int comment = line.indexOf('#');
       
   755         if (comment >= 0)
       
   756             line = line.left(comment);
       
   757         line = line.trimmed();
       
   758 
       
   759         if (line.isEmpty())
       
   760             continue;
       
   761 
       
   762         QList<QByteArray> shaping = line.split(';');
       
   763         Q_ASSERT(shaping.size() == 4);
       
   764 
       
   765         bool ok;
       
   766         int codepoint = shaping[0].toInt(&ok, 16);
       
   767         QChar::Joining j = QChar::OtherJoining;
       
   768         QByteArray shape = shaping[2].trimmed();
       
   769         if (shape == "R")
       
   770             j = QChar::Right;
       
   771         else if (shape == "D")
       
   772             j = QChar::Dual;
       
   773         else if (shape == "C")
       
   774             j = QChar::Center;
       
   775 
       
   776         UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
       
   777         d.p.joining = j;
       
   778         unicodeData.insert(codepoint, d);
       
   779     }
       
   780 }
       
   781 
       
   782 static void readDerivedAge()
       
   783 {
       
   784     QFile f("data/DerivedAge.txt");
       
   785     if (!f.exists())
       
   786         qFatal("Couldn't find DerivedAge.txt");
       
   787 
       
   788     f.open(QFile::ReadOnly);
       
   789 
       
   790     while (!f.atEnd()) {
       
   791         QByteArray line;
       
   792         line.resize(1024);
       
   793         int len = f.readLine(line.data(), 1024);
       
   794         line.resize(len-1);
       
   795 
       
   796         int comment = line.indexOf('#');
       
   797         if (comment >= 0)
       
   798             line = line.left(comment);
       
   799         line.replace(" ", "");
       
   800 
       
   801         if (line.isEmpty())
       
   802             continue;
       
   803 
       
   804         QList<QByteArray> l = line.split(';');
       
   805         Q_ASSERT(l.size() == 2);
       
   806 
       
   807         QByteArray codes = l[0];
       
   808         codes.replace("..", ".");
       
   809         QList<QByteArray> cl = codes.split('.');
       
   810 
       
   811         bool ok;
       
   812         int from = cl[0].toInt(&ok, 16);
       
   813         int to = from;
       
   814         if (cl.size() == 2)
       
   815             to = cl[1].toInt(&ok, 16);
       
   816 
       
   817         QChar::UnicodeVersion age = QChar::Unicode_Unassigned;
       
   818         QByteArray ba = l[1];
       
   819         AgeMap *map = ageMap;
       
   820         while (map->age) {
       
   821             if (ba == map->age) {
       
   822                 age = map->version;
       
   823                 break;
       
   824             }
       
   825             ++map;
       
   826         }
       
   827         //qDebug() << hex << from << ".." << to << ba << age;
       
   828         Q_ASSERT(age != QChar::Unicode_Unassigned);
       
   829 
       
   830         for (int codepoint = from; codepoint <= to; ++codepoint) {
       
   831             UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
       
   832             d.p.age = age;
       
   833             unicodeData.insert(codepoint, d);
       
   834         }
       
   835     }
       
   836 }
       
   837 
       
   838 
       
   839 static void readCompositionExclusion()
       
   840 {
       
   841     QFile f("data/CompositionExclusions.txt");
       
   842     if (!f.exists())
       
   843         qFatal("Couldn't find CompositionExclusions.txt");
       
   844 
       
   845     f.open(QFile::ReadOnly);
       
   846 
       
   847     while (!f.atEnd()) {
       
   848         QByteArray line;
       
   849         line.resize(1024);
       
   850         int len = f.readLine(line.data(), 1024);
       
   851         line.resize(len-1);
       
   852 
       
   853         int comment = line.indexOf('#');
       
   854         if (comment >= 0)
       
   855             line = line.left(comment);
       
   856         line.replace(" ", "");
       
   857 
       
   858         if (line.isEmpty())
       
   859             continue;
       
   860 
       
   861         Q_ASSERT(!line.contains(".."));
       
   862 
       
   863         bool ok;
       
   864         int codepoint = line.toInt(&ok, 16);
       
   865 
       
   866         UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
       
   867         d.excludedComposition = true;
       
   868         unicodeData.insert(codepoint, d);
       
   869     }
       
   870 
       
   871     for (int i = 0; i < 0x110000; ++i) {
       
   872         UnicodeData data = unicodeData.value(i, UnicodeData(i));
       
   873         if (!data.excludedComposition
       
   874             && data.decompositionType == QChar::Canonical
       
   875             && data.decomposition.size() > 1) {
       
   876             Q_ASSERT(data.decomposition.size() == 2);
       
   877 
       
   878             uint part1 = data.decomposition.at(0);
       
   879             uint part2 = data.decomposition.at(1);
       
   880             UnicodeData first = unicodeData.value(part1, UnicodeData(part1));
       
   881             if (first.p.combiningClass != 0)
       
   882                 continue;
       
   883 
       
   884             ++numLigatures;
       
   885             highestLigature = qMax(highestLigature, (int)part1);
       
   886             Ligature l = {(ushort)part1, (ushort)part2, i};
       
   887             ligatureHashes[part2].append(l);
       
   888         }
       
   889     }
       
   890 }
       
   891 
       
   892 struct NormalizationCorrection {
       
   893     uint codepoint;
       
   894     uint mapped;
       
   895     uint version;
       
   896 };
       
   897 
       
   898 static QByteArray createNormalizationCorrections()
       
   899 {
       
   900     QFile f("data/NormalizationCorrections.txt");
       
   901     if (!f.exists())
       
   902         qFatal("Couldn't find NormalizationCorrections.txt");
       
   903 
       
   904     f.open(QFile::ReadOnly);
       
   905 
       
   906     QByteArray out;
       
   907 
       
   908     out += "struct NormalizationCorrection {\n"
       
   909            "    uint ucs4;\n"
       
   910            "    uint old_mapping;\n"
       
   911            "    int version;\n"
       
   912            "};\n\n"
       
   913 
       
   914            "static const NormalizationCorrection uc_normalization_corrections[] = {\n";
       
   915 
       
   916     int numCorrections = 0;
       
   917     while (!f.atEnd()) {
       
   918         QByteArray line;
       
   919         line.resize(1024);
       
   920         int len = f.readLine(line.data(), 1024);
       
   921         line.resize(len-1);
       
   922 
       
   923         int comment = line.indexOf('#');
       
   924         if (comment >= 0)
       
   925             line = line.left(comment);
       
   926         line.replace(" ", "");
       
   927 
       
   928         if (line.isEmpty())
       
   929             continue;
       
   930 
       
   931         Q_ASSERT(!line.contains(".."));
       
   932 
       
   933         QList<QByteArray> fields = line.split(';');
       
   934         Q_ASSERT(fields.size() == 4);
       
   935 
       
   936         NormalizationCorrection c;
       
   937         bool ok;
       
   938         c.codepoint = fields.at(0).toInt(&ok, 16);
       
   939         c.mapped = fields.at(1).toInt(&ok, 16);
       
   940         if (fields.at(3) == "3.2.0")
       
   941             c.version = QChar::Unicode_3_2;
       
   942         else if (fields.at(3) == "4.0.0")
       
   943             c.version = QChar::Unicode_4_0;
       
   944         else
       
   945             qFatal("unknown unicode version in NormalizationCorrection.txt");
       
   946 
       
   947         out += "    { 0x" + QByteArray::number(c.codepoint, 16) + ", 0x" + QByteArray::number(c.mapped, 16)
       
   948              + ", " + QString::number(c.version) + " },\n";
       
   949         ++numCorrections;
       
   950     }
       
   951 
       
   952     out += "};\n\n"
       
   953 
       
   954            "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n\n";
       
   955 
       
   956 
       
   957     return out;
       
   958 }
       
   959 
       
   960 
       
   961 static void computeUniqueProperties()
       
   962 {
       
   963     qDebug("computeUniqueProperties:");
       
   964     for (int uc = 0; uc < 0x110000; ++uc) {
       
   965         UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
       
   966 
       
   967         int index = uniqueProperties.indexOf(d.p);
       
   968         if (index == -1) {
       
   969             index = uniqueProperties.size();
       
   970             uniqueProperties.append(d.p);
       
   971         }
       
   972         d.propertyIndex = index;
       
   973         unicodeData.insert(uc, d);
       
   974     }
       
   975     qDebug("    %d unicode properties found", uniqueProperties.size());
       
   976 }
       
   977 
       
   978 
       
   979 static void readLineBreak()
       
   980 {
       
   981     QFile f("data/LineBreak.txt");
       
   982     if (!f.exists())
       
   983         qFatal("Couldn't find LineBreak.txt");
       
   984 
       
   985     f.open(QFile::ReadOnly);
       
   986 
       
   987     while (!f.atEnd()) {
       
   988         QByteArray line;
       
   989         line.resize(1024);
       
   990         int len = f.readLine(line.data(), 1024);
       
   991         line.resize(len-1);
       
   992 
       
   993         int comment = line.indexOf('#');
       
   994         if (comment >= 0)
       
   995             line = line.left(comment);
       
   996         line.replace(" ", "");
       
   997 
       
   998         if (line.isEmpty())
       
   999             continue;
       
  1000 
       
  1001         QList<QByteArray> l = line.split(';');
       
  1002         Q_ASSERT(l.size() == 2);
       
  1003 
       
  1004         QByteArray codes = l[0];
       
  1005         codes.replace("..", ".");
       
  1006         QList<QByteArray> cl = codes.split('.');
       
  1007 
       
  1008         bool ok;
       
  1009         int from = cl[0].toInt(&ok, 16);
       
  1010         int to = from;
       
  1011         if (cl.size() == 2)
       
  1012             to = cl[1].toInt(&ok, 16);
       
  1013 
       
  1014         // ### Classes XX and AI are left out and mapped to AL for now
       
  1015         QUnicodeTables::LineBreakClass lb = QUnicodeTables::LineBreak_AL;
       
  1016         QByteArray ba = l[1];
       
  1017 
       
  1018         if (ba == "AI") lb = QUnicodeTables::LineBreak_AL;
       
  1019         else if (ba == "XX") lb = QUnicodeTables::LineBreak_AL;
       
  1020         else if (ba == "NL") lb = QUnicodeTables::LineBreak_AL;
       
  1021         else if (ba == "OP") lb = QUnicodeTables::LineBreak_OP;
       
  1022         else if (ba == "CL") lb = QUnicodeTables::LineBreak_CL;
       
  1023         else if (ba == "QU") lb = QUnicodeTables::LineBreak_QU;
       
  1024         else if (ba == "GL") lb = QUnicodeTables::LineBreak_GL;
       
  1025         else if (ba == "NS") lb = QUnicodeTables::LineBreak_NS;
       
  1026         else if (ba == "EX") lb = QUnicodeTables::LineBreak_EX;
       
  1027         else if (ba == "SY") lb = QUnicodeTables::LineBreak_SY;
       
  1028         else if (ba == "IS") lb = QUnicodeTables::LineBreak_IS;
       
  1029         else if (ba == "PR") lb = QUnicodeTables::LineBreak_PR;
       
  1030         else if (ba == "PO") lb = QUnicodeTables::LineBreak_PO;
       
  1031         else if (ba == "NU") lb = QUnicodeTables::LineBreak_NU;
       
  1032         else if (ba == "AL") lb = QUnicodeTables::LineBreak_AL;
       
  1033         else if (ba == "ID") lb = QUnicodeTables::LineBreak_ID;
       
  1034         else if (ba == "IN") lb = QUnicodeTables::LineBreak_IN;
       
  1035         else if (ba == "HY") lb = QUnicodeTables::LineBreak_HY;
       
  1036         else if (ba == "BA") lb = QUnicodeTables::LineBreak_BA;
       
  1037         else if (ba == "BB") lb = QUnicodeTables::LineBreak_BB;
       
  1038         else if (ba == "B2") lb = QUnicodeTables::LineBreak_B2;
       
  1039         else if (ba == "ZW") lb = QUnicodeTables::LineBreak_ZW;
       
  1040         else if (ba == "CM") lb = QUnicodeTables::LineBreak_CM;
       
  1041         else if (ba == "SA") lb = QUnicodeTables::LineBreak_SA;
       
  1042         else if (ba == "BK") lb = QUnicodeTables::LineBreak_BK;
       
  1043         else if (ba == "CR") lb = QUnicodeTables::LineBreak_CR;
       
  1044         else if (ba == "LF") lb = QUnicodeTables::LineBreak_LF;
       
  1045         else if (ba == "SG") lb = QUnicodeTables::LineBreak_SG;
       
  1046         else if (ba == "CB") lb = QUnicodeTables::LineBreak_AL;
       
  1047         else if (ba == "SP") lb = QUnicodeTables::LineBreak_SP;
       
  1048         else if (ba == "WJ") lb = QUnicodeTables::LineBreak_WJ;
       
  1049         else if (ba == "H2") lb = QUnicodeTables::LineBreak_H2;
       
  1050         else if (ba == "H3") lb = QUnicodeTables::LineBreak_H3;
       
  1051         else if (ba == "JL") lb = QUnicodeTables::LineBreak_JL;
       
  1052         else if (ba == "JV") lb = QUnicodeTables::LineBreak_JV;
       
  1053         else if (ba == "JT") lb = QUnicodeTables::LineBreak_JT;
       
  1054         else {
       
  1055             qDebug() << "unhandled line break class:" << ba;
       
  1056         }
       
  1057 
       
  1058         for (int codepoint = from; codepoint <= to; ++codepoint) {
       
  1059             UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
       
  1060             d.p.line_break_class = lb;
       
  1061             unicodeData.insert(codepoint, d);
       
  1062         }
       
  1063     }
       
  1064 }
       
  1065 
       
  1066 
       
  1067 static void readSpecialCasing()
       
  1068 {
       
  1069 //     qDebug() << "Reading SpecialCasing.txt";
       
  1070     QFile f("data/SpecialCasing.txt");
       
  1071     if (!f.exists())
       
  1072         qFatal("Couldn't find SpecialCasing.txt");
       
  1073 
       
  1074     f.open(QFile::ReadOnly);
       
  1075 
       
  1076     while (!f.atEnd()) {
       
  1077         QByteArray line;
       
  1078         line.resize(1024);
       
  1079         int len = f.readLine(line.data(), 1024);
       
  1080         line.resize(len-1);
       
  1081 
       
  1082         int comment = line.indexOf('#');
       
  1083         if (comment >= 0)
       
  1084             line = line.left(comment);
       
  1085 
       
  1086         if (line.isEmpty())
       
  1087             continue;
       
  1088 
       
  1089         QList<QByteArray> l = line.split(';');
       
  1090 
       
  1091         QByteArray condition = l.size() < 5 ? QByteArray() : l[4].trimmed();
       
  1092         if (!condition.isEmpty())
       
  1093             // #####
       
  1094             continue;
       
  1095 
       
  1096         bool ok;
       
  1097         int codepoint = l[0].trimmed().toInt(&ok, 16);
       
  1098         Q_ASSERT(ok);
       
  1099         Q_ASSERT(codepoint <= 0xffff);
       
  1100 
       
  1101 //         qDebug() << "codepoint" << hex << codepoint;
       
  1102 //         qDebug() << line;
       
  1103 
       
  1104         QList<QByteArray> lower = l[1].trimmed().split(' ');
       
  1105         QList<int> lowerMap;
       
  1106         for (int i = 0; i < lower.size(); ++i) {
       
  1107             bool ok;
       
  1108             lowerMap.append(lower.at(i).toInt(&ok, 16));
       
  1109             Q_ASSERT(ok);
       
  1110         }
       
  1111 
       
  1112         QList<QByteArray> title = l[2].trimmed().split(' ');
       
  1113         QList<int> titleMap;
       
  1114         for (int i = 0; i < title.size(); ++i) {
       
  1115             bool ok;
       
  1116             titleMap.append(title.at(i).toInt(&ok, 16));
       
  1117             if (!ok)
       
  1118                 qDebug() << line << title.at(i);
       
  1119             Q_ASSERT(ok);
       
  1120         }
       
  1121 
       
  1122         QList<QByteArray> upper = l[3].trimmed().split(' ');
       
  1123         QList<int> upperMap;
       
  1124         for (int i = 0; i < upper.size(); ++i) {
       
  1125             bool ok;
       
  1126             upperMap.append(upper.at(i).toInt(&ok, 16));
       
  1127             Q_ASSERT(ok);
       
  1128         }
       
  1129 
       
  1130 
       
  1131         UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
       
  1132 
       
  1133         Q_ASSERT(lowerMap.size() > 1 || lowerMap.at(0) == codepoint + ud.p.lowerCaseDiff);
       
  1134         Q_ASSERT(titleMap.size() > 1 || titleMap.at(0) == codepoint + ud.p.titleCaseDiff);
       
  1135         Q_ASSERT(upperMap.size() > 1 || upperMap.at(0) == codepoint + ud.p.upperCaseDiff);
       
  1136 
       
  1137         if (lowerMap.size() > 1) {
       
  1138             ud.p.lowerCaseSpecial = true;
       
  1139             ud.p.lowerCaseDiff = appendToSpecialCaseMap(lowerMap);
       
  1140         }
       
  1141         if (titleMap.size() > 1) {
       
  1142             ud.p.titleCaseSpecial = true;
       
  1143             ud.p.titleCaseDiff = appendToSpecialCaseMap(titleMap);
       
  1144         }
       
  1145         if (upperMap.size() > 1) {
       
  1146             ud.p.upperCaseSpecial = true;
       
  1147             ud.p.upperCaseDiff = appendToSpecialCaseMap(upperMap);;
       
  1148         }
       
  1149 
       
  1150         unicodeData.insert(codepoint, ud);
       
  1151     }
       
  1152 }
       
  1153 
       
  1154 int maxCaseFoldDiff = 0;
       
  1155 
       
  1156 static void readCaseFolding()
       
  1157 {
       
  1158     qDebug() << "Reading CaseFolding.txt";
       
  1159     QFile f("data/CaseFolding.txt");
       
  1160     if (!f.exists())
       
  1161         qFatal("Couldn't find CaseFolding.txt");
       
  1162 
       
  1163     f.open(QFile::ReadOnly);
       
  1164 
       
  1165     while (!f.atEnd()) {
       
  1166         QByteArray line;
       
  1167         line.resize(1024);
       
  1168         int len = f.readLine(line.data(), 1024);
       
  1169         line.resize(len-1);
       
  1170 
       
  1171         int comment = line.indexOf('#');
       
  1172         if (comment >= 0)
       
  1173             line = line.left(comment);
       
  1174 
       
  1175         if (line.isEmpty())
       
  1176             continue;
       
  1177 
       
  1178         QList<QByteArray> l = line.split(';');
       
  1179 
       
  1180         bool ok;
       
  1181         uint codepoint = l[0].trimmed().toInt(&ok, 16);
       
  1182         Q_ASSERT(ok);
       
  1183 
       
  1184 
       
  1185         l[1] = l[1].trimmed();
       
  1186         if (l[1] == "F" || l[1] == "T")
       
  1187             continue;
       
  1188 
       
  1189 //         qDebug() << "codepoint" << hex << codepoint;
       
  1190 //         qDebug() << line;
       
  1191         QList<QByteArray> fold = l[2].trimmed().split(' ');
       
  1192         QList<int> foldMap;
       
  1193         for (int i = 0; i < fold.size(); ++i) {
       
  1194             bool ok;
       
  1195             foldMap.append(fold.at(i).toInt(&ok, 16));
       
  1196             Q_ASSERT(ok);
       
  1197         }
       
  1198 
       
  1199         UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
       
  1200         if (foldMap.size() == 1) {
       
  1201             ud.p.caseFoldDiff = foldMap.at(0) - codepoint;
       
  1202             maxCaseFoldDiff = qMax(maxCaseFoldDiff, ud.p.caseFoldDiff);
       
  1203             if (codepoint > 0xffff) {
       
  1204                 // if the condition below doesn't hold anymore we need to modify our case folding code
       
  1205                 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
       
  1206                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(foldMap.at(0)));
       
  1207             }
       
  1208             if (foldMap.at(0) != codepoint + ud.p.lowerCaseDiff)
       
  1209                 qDebug() << hex << codepoint;
       
  1210         } else {
       
  1211             Q_ASSERT(false); // we currently don't support full case foldings
       
  1212 //             qDebug() << "special" << hex << foldMap;
       
  1213             ud.p.caseFoldSpecial = true;
       
  1214             ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
       
  1215         }
       
  1216         unicodeData.insert(codepoint, ud);
       
  1217     }
       
  1218 }
       
  1219 
       
  1220 static void readGraphemeBreak()
       
  1221 {
       
  1222     qDebug() << "Reading GraphemeBreakProperty.txt";
       
  1223     QFile f("data/GraphemeBreakProperty.txt");
       
  1224     if (!f.exists())
       
  1225         qFatal("Couldn't find GraphemeBreakProperty.txt");
       
  1226 
       
  1227     f.open(QFile::ReadOnly);
       
  1228 
       
  1229     while (!f.atEnd()) {
       
  1230         QByteArray line;
       
  1231         line.resize(1024);
       
  1232         int len = f.readLine(line.data(), 1024);
       
  1233         line.resize(len-1);
       
  1234 
       
  1235         int comment = line.indexOf('#');
       
  1236         if (comment >= 0)
       
  1237             line = line.left(comment);
       
  1238 
       
  1239         if (line.isEmpty())
       
  1240             continue;
       
  1241 
       
  1242         QList<QByteArray> l = line.split(';');
       
  1243 
       
  1244         QByteArray codes = l[0].trimmed();
       
  1245         codes.replace("..", ".");
       
  1246         QList<QByteArray> cl = codes.split('.');
       
  1247 
       
  1248         bool ok;
       
  1249         int from = cl[0].toInt(&ok, 16);
       
  1250         Q_ASSERT(ok);
       
  1251         int to = from;
       
  1252         if (cl.size() == 2) {
       
  1253             to = cl[1].toInt(&ok, 16);
       
  1254             Q_ASSERT(ok);
       
  1255         }
       
  1256 
       
  1257         GraphemeBreak brk = grapheme_break_map.value(l[1].trimmed(), GraphemeBreakOther);
       
  1258 
       
  1259         for (int codepoint = from; codepoint <= to; ++codepoint) {
       
  1260             UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
       
  1261             ud.p.graphemeBreak = brk;
       
  1262             unicodeData.insert(codepoint, ud);
       
  1263         }
       
  1264     }
       
  1265 }
       
  1266 
       
  1267 static void readWordBreak()
       
  1268 {
       
  1269     qDebug() << "Reading WordBreakProperty.txt";
       
  1270     QFile f("data/WordBreakProperty.txt");
       
  1271     if (!f.exists())
       
  1272         qFatal("Couldn't find WordBreakProperty.txt");
       
  1273 
       
  1274     f.open(QFile::ReadOnly);
       
  1275 
       
  1276     while (!f.atEnd()) {
       
  1277         QByteArray line;
       
  1278         line.resize(1024);
       
  1279         int len = f.readLine(line.data(), 1024);
       
  1280         line.resize(len-1);
       
  1281 
       
  1282         int comment = line.indexOf('#');
       
  1283         if (comment >= 0)
       
  1284             line = line.left(comment);
       
  1285 
       
  1286         if (line.isEmpty())
       
  1287             continue;
       
  1288 
       
  1289         QList<QByteArray> l = line.split(';');
       
  1290 
       
  1291         QByteArray codes = l[0].trimmed();
       
  1292         codes.replace("..", ".");
       
  1293         QList<QByteArray> cl = codes.split('.');
       
  1294 
       
  1295         bool ok;
       
  1296         int from = cl[0].toInt(&ok, 16);
       
  1297         Q_ASSERT(ok);
       
  1298         int to = from;
       
  1299         if (cl.size() == 2) {
       
  1300             to = cl[1].toInt(&ok, 16);
       
  1301             Q_ASSERT(ok);
       
  1302         }
       
  1303 
       
  1304         WordBreak brk = word_break_map.value(l[1].trimmed(), WordBreakOther);
       
  1305         Q_ASSERT(brk != WordBreakOther);
       
  1306 
       
  1307         for (int codepoint = from; codepoint <= to; ++codepoint) {
       
  1308             UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
       
  1309             ud.p.wordBreak = brk;
       
  1310             unicodeData.insert(codepoint, ud);
       
  1311         }
       
  1312     }
       
  1313 }
       
  1314 
       
  1315 static void readSentenceBreak()
       
  1316 {
       
  1317     qDebug() << "Reading SentenceBreakProperty.txt";
       
  1318     QFile f("data/SentenceBreakProperty.txt");
       
  1319     if (!f.exists())
       
  1320         qFatal("Couldn't find SentenceBreakProperty.txt");
       
  1321 
       
  1322     f.open(QFile::ReadOnly);
       
  1323 
       
  1324     while (!f.atEnd()) {
       
  1325         QByteArray line;
       
  1326         line.resize(1024);
       
  1327         int len = f.readLine(line.data(), 1024);
       
  1328         line.resize(len-1);
       
  1329 
       
  1330         int comment = line.indexOf('#');
       
  1331         if (comment >= 0)
       
  1332             line = line.left(comment);
       
  1333 
       
  1334         if (line.isEmpty())
       
  1335             continue;
       
  1336 
       
  1337         QList<QByteArray> l = line.split(';');
       
  1338 
       
  1339         QByteArray codes = l[0].trimmed();
       
  1340         codes.replace("..", ".");
       
  1341         QList<QByteArray> cl = codes.split('.');
       
  1342 
       
  1343         bool ok;
       
  1344         int from = cl[0].toInt(&ok, 16);
       
  1345         Q_ASSERT(ok);
       
  1346         int to = from;
       
  1347         if (cl.size() == 2) {
       
  1348             to = cl[1].toInt(&ok, 16);
       
  1349             Q_ASSERT(ok);
       
  1350         }
       
  1351 
       
  1352         SentenceBreak brk = sentence_break_map.value(l[1].trimmed(), SentenceBreakOther);
       
  1353         Q_ASSERT(brk != SentenceBreakOther);
       
  1354 
       
  1355         for (int codepoint = from; codepoint <= to; ++codepoint) {
       
  1356             UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
       
  1357             ud.p.sentenceBreak = brk;
       
  1358             unicodeData.insert(codepoint, ud);
       
  1359         }
       
  1360     }
       
  1361 }
       
  1362 
       
  1363 #if 0
       
// This piece of code does full case folding and comparison. We currently
// don't use it, since it causes lots of issues with things such as
// case-insensitive search and replace.
       
  1367 static inline void foldCase(uint ch, ushort *out)
       
  1368 {
       
  1369     const QUnicodeTables::Properties *p = qGetProp(ch);
       
  1370     if (!p->caseFoldSpecial) {
       
  1371         *(out++) = ch + p->caseFoldDiff;
       
  1372     } else {
       
  1373         const ushort *folded = specialCaseMap + p->caseFoldDiff;
       
  1374         while (*folded)
       
  1375             *out++ = *folded++;
       
  1376     }
       
  1377     *out = 0;
       
  1378 }
       
  1379 
       
  1380 static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be)
       
  1381 {
       
  1382     if (a == b)
       
  1383         return 0;
       
  1384     if (a == 0)
       
  1385         return 1;
       
  1386     if (b == 0)
       
  1387         return -1;
       
  1388 
       
  1389     while (a != ae && b != be) {
       
  1390         const QUnicodeTables::Properties *pa = qGetProp(*a);
       
  1391         const QUnicodeTables::Properties *pb = qGetProp(*b);
       
  1392         if (pa->caseFoldSpecial | pb->caseFoldSpecial)
       
  1393             goto special;
       
  1394             int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
       
  1395         if ((diff))
       
  1396             return diff;
       
  1397         ++a;
       
  1398         ++b;
       
  1399         }
       
  1400     }
       
  1401     if (a == ae) {
       
  1402         if (b == be)
       
  1403             return 0;
       
  1404         return -1;
       
  1405     }
       
  1406     return 1;
       
  1407 special:
       
  1408     ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
       
  1409     ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
       
  1410     abuf[0] = bbuf[0] = 0;
       
  1411     ushort *ap = abuf;
       
  1412     ushort *bp = bbuf;
       
  1413     while (1) {
       
  1414         if (!*ap) {
       
  1415             if (a == ae) {
       
  1416                 if (!*bp && b == be)
       
  1417                     return 0;
       
  1418                 return -1;
       
  1419             }
       
  1420             foldCase(*(a++), abuf);
       
  1421             ap = abuf;
       
  1422         }
       
  1423         if (!*bp) {
       
  1424             if (b == be)
       
  1425                 return 1;
       
  1426             foldCase(*(b++), bbuf);
       
  1427             bp = bbuf;
       
  1428         }
       
  1429         if (*ap != *bp)
       
  1430             return (int)*ap - (int)*bp;
       
  1431         ++ap;
       
  1432         ++bp;
       
  1433     }
       
  1434 }
       
  1435 
       
  1436 
       
  1437 static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
       
  1438 {
       
  1439     if (a == 0)
       
  1440         return 1;
       
  1441     if (b == 0)
       
  1442         return -1;
       
  1443 
       
  1444     while (a != ae && *b) {
       
  1445         const QUnicodeTables::Properties *pa = qGetProp(*a);
       
  1446         const QUnicodeTables::Properties *pb = qGetProp((ushort)*b);
       
  1447         if (pa->caseFoldSpecial | pb->caseFoldSpecial)
       
  1448             goto special;
       
  1449         int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
       
  1450         if ((diff))
       
  1451             return diff;
       
  1452         ++a;
       
  1453         ++b;
       
  1454     }
       
  1455     if (a == ae) {
       
  1456         if (!*b)
       
  1457             return 0;
       
  1458         return -1;
       
  1459     }
       
  1460     return 1;
       
  1461 
       
  1462 special:
       
  1463     ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
       
  1464     ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
       
  1465     abuf[0] = bbuf[0] = 0;
       
  1466     ushort *ap = abuf;
       
  1467     ushort *bp = bbuf;
       
  1468     while (1) {
       
  1469         if (!*ap) {
       
  1470             if (a == ae) {
       
  1471                 if (!*bp && !*b)
       
  1472                     return 0;
       
  1473                 return -1;
       
  1474             }
       
  1475             foldCase(*(a++), abuf);
       
  1476             ap = abuf;
       
  1477         }
       
  1478         if (!*bp) {
       
  1479             if (!*b)
       
  1480                 return 1;
       
  1481             foldCase(*(b++), bbuf);
       
  1482             bp = bbuf;
       
  1483         }
       
  1484         if (*ap != *bp)
       
  1485             return (int)*ap - (int)*bp;
       
  1486         ++ap;
       
  1487         ++bp;
       
  1488     }
       
  1489 }
       
  1490 #endif
       
  1491 
       
  1492 #if 0
       
  1493 static QList<QByteArray> blockNames; // distinct block names, in the order readBlocks() first sees them
       
       // One data line of Blocks.txt as parsed by readBlocks(): the index of the
       // block's name in blockNames plus the inclusive code point range taken
       // from the "first..last" field.
  1494 struct BlockInfo
       
  1495 {
       
  1496     int blockIndex;
       
  1497     int firstCodePoint;
       
  1498     int lastCodePoint;
       
  1499 };
       
       // every BlockInfo produced by readBlocks(), in file order
  1500 static QList<BlockInfo> blockInfoList;
       
  1501 
       
  1502 static void readBlocks()
       
  1503 {
       
  1504     QFile f("data/Blocks.txt");
       
  1505     if (!f.exists())
       
  1506         qFatal("Couldn't find Blocks.txt");
       
  1507 
       
  1508     f.open(QFile::ReadOnly);
       
  1509 
       
  1510     while (!f.atEnd()) {
       
  1511         QByteArray line = f.readLine();
       
  1512         line.resize(line.size() - 1);
       
  1513 
       
  1514         int comment = line.indexOf("#");
       
  1515         if (comment >= 0)
       
  1516             line = line.left(comment);
       
  1517 
       
  1518         line.replace(" ", "");
       
  1519 
       
  1520         if (line.isEmpty())
       
  1521             continue;
       
  1522 
       
  1523         int semicolon = line.indexOf(';');
       
  1524         Q_ASSERT(semicolon >= 0);
       
  1525         QByteArray codePoints = line.left(semicolon);
       
  1526         QByteArray blockName = line.mid(semicolon + 1);
       
  1527 
       
  1528         int blockIndex = blockNames.indexOf(blockName);
       
  1529         if (blockIndex < 0) {
       
  1530             blockNames.append(blockName);
       
  1531             blockIndex = blockNames.indexOf(blockName);
       
  1532             Q_ASSERT(blockIndex >= 0);
       
  1533         }
       
  1534 
       
  1535         int dotdot = codePoints.indexOf("..");
       
  1536         Q_ASSERT(dotdot >= 0);
       
  1537         bool unused;
       
  1538         int first = codePoints.left(dotdot).toInt(&unused, 16);
       
  1539         int last = codePoints.mid(dotdot + 2).toInt(&unused, 16);
       
  1540 
       
  1541         BlockInfo blockInfo = { blockIndex, first, last };
       
  1542         blockInfoList.append(blockInfo);
       
  1543     }
       
  1544 }
       
  1545 #endif
       
  1546 
       
  1547 static QList<QByteArray> scriptNames; // script names; readScripts() puts "Common" at index 0
       
       // code point -> index into scriptNames, filled by readScripts()
  1548 static QHash<int, int> scriptAssignment;
       
       // index into scriptNames -> the same index for scripts that get their own
       // enum value, or 0 for scripts emitted as an alias of 'Common'; filled by
       // createScriptEnumDeclaration()
  1549 static QHash<int, int> scriptHash;
       
  1550 
       
       // A block of code points that do not all map to the same script.
       // createScriptTableDeclaration() stores the first code point of the block
       // in 'block' and one scriptNames index per code point in 'vector'.
  1551 struct ExtraBlock {
       
  1552     int block;
       
  1553     QVector<int> vector;
       
  1554 };
       
  1555 
       
       // the mixed-script blocks, in the order createScriptTableDeclaration() found them
  1556 static QList<ExtraBlock> extraBlockList;
       
  1557 
       
  1558 
       
  1559 static void readScripts()
       
  1560 {
       
  1561     scriptNames.append("Common");
       
  1562 
       
  1563     static const char *files[] = {
       
  1564         "data/ScriptsInitial.txt",
       
  1565         "data/Scripts.txt",
       
  1566         "data/ScriptsCorrections.txt"
       
  1567     };
       
  1568     enum { fileCount = sizeof(files) / sizeof(const char *) };
       
  1569 
       
  1570     for (int i = 0; i < fileCount; ++i) {
       
  1571         QFile f(files[i]);
       
  1572         if (!f.exists())
       
  1573             qFatal("Couldn't find %s", files[i]);
       
  1574 
       
  1575 
       
  1576         f.open(QFile::ReadOnly);
       
  1577 
       
  1578         while (!f.atEnd()) {
       
  1579             QByteArray line = f.readLine();
       
  1580             line.resize(line.size() - 1);
       
  1581 
       
  1582             int comment = line.indexOf("#");
       
  1583             if (comment >= 0)
       
  1584                 line = line.left(comment);
       
  1585 
       
  1586             line.replace(" ", "");
       
  1587             line.replace("_", "");
       
  1588 
       
  1589             if (line.isEmpty())
       
  1590                 continue;
       
  1591 
       
  1592             int semicolon = line.indexOf(';');
       
  1593             Q_ASSERT(semicolon >= 0);
       
  1594             QByteArray codePoints = line.left(semicolon);
       
  1595             QByteArray scriptName = line.mid(semicolon + 1);
       
  1596 
       
  1597             int scriptIndex = scriptNames.indexOf(scriptName);
       
  1598             if (scriptIndex < 0) {
       
  1599                 scriptNames.append(scriptName);
       
  1600                 scriptIndex = scriptNames.indexOf(scriptName);
       
  1601                 Q_ASSERT(scriptIndex >= 0);
       
  1602             }
       
  1603 
       
  1604             int dotdot = codePoints.indexOf("..");
       
  1605             bool unused;
       
  1606             int first = -1, last = -1;
       
  1607             if (dotdot >= 0) {
       
  1608                 first = codePoints.left(dotdot).toInt(&unused, 16);
       
  1609                 last = codePoints.mid(dotdot + 2).toInt(&unused, 16);
       
  1610             } else {
       
  1611                 first = codePoints.toInt(&unused, 16);
       
  1612             }
       
  1613 
       
  1614             if (last != -1) {
       
  1615                 for (int i = first; i <= last; ++i)
       
  1616                     scriptAssignment[i] = scriptIndex;
       
  1617             } else {
       
  1618                 scriptAssignment[first] = scriptIndex;
       
  1619             }
       
  1620         }
       
  1621     }
       
  1622 }
       
  1623 
       
  1624 
       
  1625 static int scriptSentinel = 0; // set by createScriptEnumDeclaration()
       // createScriptTableDeclaration() asserts that this is > 0 and writes
       // scriptSentinel + n into the first-level script table for the n-th
       // mixed-script block, so it has to be computed before that function runs.
       
  1626 
       
  1627 QByteArray createScriptEnumDeclaration()
       
  1628 {
       
  1629     static const char *specialScripts[] = {
       
  1630         "Common",
       
  1631         "Arabic",
       
  1632         "Armenian",
       
  1633         "Bengali",
       
  1634         "Cyrillic",
       
  1635         "Devanagari",
       
  1636         "Georgian",
       
  1637         "Greek",
       
  1638         "Gujarati",
       
  1639         "Gurmukhi",
       
  1640         "Hangul",
       
  1641         "Hebrew",
       
  1642         "Kannada",
       
  1643         "Khmer",
       
  1644         "Lao",
       
  1645         "Malayalam",
       
  1646         "Myanmar",
       
  1647         "Ogham",
       
  1648         "Oriya",
       
  1649         "Runic",
       
  1650         "Sinhala",
       
  1651         "Syriac",
       
  1652         "Tamil",
       
  1653         "Telugu",
       
  1654         "Thaana",
       
  1655         "Thai",
       
  1656         "Tibetan",
       
  1657         "Inherited"
       
  1658     };
       
  1659     const int specialScriptsCount = sizeof(specialScripts) / sizeof(const char *);
       
  1660 
       
  1661     // generate script enum
       
  1662     QByteArray declaration;
       
  1663 
       
  1664     declaration += "    // See http://www.unicode.org/reports/tr24/tr24-5.html\n\n";
       
  1665     declaration += "    enum Script {\n        Common";
       
  1666 
       
  1667     int uniqueScripts = 1; // Common
       
  1668 
       
  1669     // output the ones with special processing first
       
  1670     for (int i = 1; i < scriptNames.size(); ++i) {
       
  1671         QByteArray scriptName = scriptNames.at(i);
       
  1672         // does the script require special processing?
       
  1673         bool special = false;
       
  1674         for (int s = 0; !special && s < specialScriptsCount; ++s) {
       
  1675             if (scriptName == specialScripts[s])
       
  1676                 special = true;
       
  1677         }
       
  1678         if (!special) {
       
  1679             scriptHash[i] =  0; // alias for 'Common'
       
  1680             continue;
       
  1681         } else {
       
  1682             ++uniqueScripts;
       
  1683             scriptHash[i] = i;
       
  1684         }
       
  1685 
       
  1686         declaration += ",\n        ";
       
  1687         declaration += scriptName;
       
  1688     }
       
  1689     declaration += ",\n        ScriptCount = Inherited";
       
  1690 
       
  1691     // output the ones that are an alias for 'Common'
       
  1692     for (int i = 1; i < scriptNames.size(); ++i) {
       
  1693         if (scriptHash.value(i) != 0)
       
  1694             continue;
       
  1695         QByteArray scriptName = scriptNames.at(i);
       
  1696         scriptName += " = Common";
       
  1697         declaration += ",\n        ";
       
  1698         declaration += scriptName;
       
  1699     }
       
  1700 
       
  1701     declaration += "\n    };\n";
       
  1702 
       
  1703     scriptSentinel = ((uniqueScripts + 16) / 32) * 32; // a multiple of 32
       
  1704     declaration += "    enum { ScriptSentinel = ";
       
  1705     declaration += QByteArray::number(scriptSentinel);
       
  1706     declaration += " };\n\n";
       
  1707     return declaration;
       
  1708 }
       
  1709 
       
  1710 QByteArray createScriptTableDeclaration()
       
  1711 {
       
  1712     Q_ASSERT(scriptSentinel > 0);
       
  1713 
       
  1714     QByteArray declaration;
       
  1715 
       
  1716     const int unicodeBlockCount = 512; // number of unicode blocks
       
  1717     const int unicodeBlockSize = 128; // size of each block
       
  1718     declaration = "enum { UnicodeBlockCount = ";
       
  1719     declaration += QByteArray::number(unicodeBlockCount);
       
  1720     declaration += " }; // number of unicode blocks\n";
       
  1721     declaration += "enum { UnicodeBlockSize = ";
       
  1722     declaration += QByteArray::number(unicodeBlockSize);
       
  1723     declaration += " }; // size of each block\n\n";
       
  1724 
       
  1725     // script table
       
  1726     declaration += "namespace QUnicodeTables {\n\nstatic const unsigned char uc_scripts[] = {\n";
       
  1727     for (int i = 0; i < unicodeBlockCount; ++i) {
       
  1728         int block = (((i << 7) & 0xff00) | ((i & 1) * 0x80));
       
  1729         int blockAssignment[unicodeBlockSize];
       
  1730         for (int x = 0; x < unicodeBlockSize; ++x) {
       
  1731             int codePoint = (i << 7) | x;
       
  1732             blockAssignment[x] = scriptAssignment.value(codePoint, 0);
       
  1733         }
       
  1734         bool allTheSame = true;
       
  1735         const int originalScript = blockAssignment[0];
       
  1736         const int script = scriptHash.value(originalScript);
       
  1737         for (int x = 1; allTheSame && x < unicodeBlockSize; ++x) {
       
  1738             const int s = scriptHash.value(blockAssignment[x]);
       
  1739             if (s != script)
       
  1740                 allTheSame = false;
       
  1741         }
       
  1742 
       
  1743         if (allTheSame) {
       
  1744             declaration += "    ";
       
  1745             declaration += scriptNames.value(originalScript);
       
  1746             declaration += ", /* U+";
       
  1747             declaration += QByteArray::number(block, 16).rightJustified(4, '0');
       
  1748             declaration += '-';
       
  1749             declaration +=
       
  1750                 QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
       
  1751             declaration += " */\n";
       
  1752         } else {
       
  1753             const int value = extraBlockList.size() + scriptSentinel;
       
  1754             const int offset =
       
  1755                 ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount;
       
  1756 
       
  1757             declaration += "    ";
       
  1758             declaration += QByteArray::number(value);
       
  1759             declaration += ", /* U+";
       
  1760             declaration += QByteArray::number(block, 16).rightJustified(4, '0');
       
  1761             declaration += '-';
       
  1762             declaration +=
       
  1763                 QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
       
  1764             declaration += " at offset ";
       
  1765             declaration += QByteArray::number(offset);
       
  1766             declaration += " */\n";
       
  1767 
       
  1768             ExtraBlock extraBlock;
       
  1769             extraBlock.block = block;
       
  1770             extraBlock.vector.resize(unicodeBlockSize);
       
  1771             for (int x = 0; x < unicodeBlockSize; ++x)
       
  1772                 extraBlock.vector[x] = blockAssignment[x];
       
  1773 
       
  1774             extraBlockList.append(extraBlock);
       
  1775         }
       
  1776     }
       
  1777 
       
  1778     for (int i = 0; i < extraBlockList.size(); ++i) {
       
  1779         const int value = i + scriptSentinel;
       
  1780         const int offset =
       
  1781             ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount;
       
  1782         const ExtraBlock &extraBlock = extraBlockList.at(i);
       
  1783         const int block = extraBlock.block;
       
  1784 
       
  1785         declaration += "\n\n    /* U+";
       
  1786         declaration += QByteArray::number(block, 16).rightJustified(4, '0');
       
  1787         declaration += '-';
       
  1788         declaration +=
       
  1789             QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
       
  1790         declaration += " at offset ";
       
  1791         declaration += QByteArray::number(offset);
       
  1792         declaration += " */\n    ";
       
  1793 
       
  1794         for (int x = 0; x < extraBlock.vector.size(); ++x) {
       
  1795             const int o = extraBlock.vector.at(x);
       
  1796 
       
  1797             declaration += scriptNames.value(o);
       
  1798             if (x < extraBlock.vector.size() - 1 || i < extraBlockList.size() - 1)
       
  1799                 declaration += ',';
       
  1800             if ((x & 7) == 7 && x < extraBlock.vector.size() - 1)
       
  1801                 declaration += "\n    ";
       
  1802             else
       
  1803                 declaration += ' ';
       
  1804         }
       
  1805     }
       
  1806     declaration += "\n};\n\n} // namespace QUnicodeTables\n\n";
       
  1807 
       
  1808     qDebug("createScriptTableDeclaration: table size is %d bytes",
       
  1809            unicodeBlockCount + (extraBlockList.size() * unicodeBlockSize));
       
  1810 
       
  1811     return declaration;
       
  1812 }
       
  1813 
       
  1814 #if 0
       
  1815 static void dump(int from, int to)
       
  1816 {
       
  1817     for (int i = from; i <= to; ++i) {
       
  1818         UnicodeData d = unicodeData.value(i, UnicodeData(i));
       
  1819         qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d",
       
  1820                i, d.p.category, d.p.combiningClass, d.p.direction, d.otherCase, d.mirroredChar, d.p.joining, d.p.age);
       
  1821         if (d.decompositionType != QChar::NoDecomposition) {
       
  1822             qDebug("    decomposition: type=%d, length=%d, first=%x", d.decompositionType, d.decomposition.size(),
       
  1823                    d.decomposition[0]);
       
  1824         }
       
  1825     }
       
  1826     qDebug(" ");
       
  1827 }
       
  1828 #endif
       
  1829 
       
  1830 struct PropertyBlock {
       
  1831     PropertyBlock() { index = -1; }
       
  1832     int index;
       
  1833     QList<int> properties;
       
  1834     bool operator ==(const PropertyBlock &other) { return properties == other.properties; }
       
  1835 };
       
  1836 
       
  1837 static QByteArray createPropertyInfo()
       
  1838 {
       
  1839     qDebug("createPropertyInfo:");
       
  1840 
       
  1841     const int BMP_BLOCKSIZE=32;
       
  1842     const int BMP_SHIFT = 5;
       
  1843     const int BMP_END = 0x11000;
       
  1844     const int SMP_END = 0x110000;
       
  1845     const int SMP_BLOCKSIZE = 256;
       
  1846     const int SMP_SHIFT = 8;
       
  1847 
       
  1848     QList<PropertyBlock> blocks;
       
  1849     QList<int> blockMap;
       
  1850 
       
  1851     int used = 0;
       
  1852 
       
  1853     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
       
  1854         PropertyBlock b;
       
  1855         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
       
  1856             int uc = block*BMP_BLOCKSIZE + i;
       
  1857             UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
       
  1858             b.properties.append(d.propertyIndex);
       
  1859         }
       
  1860         int index = blocks.indexOf(b);
       
  1861         if (index == -1) {
       
  1862             index = blocks.size();
       
  1863             b.index = used;
       
  1864             used += BMP_BLOCKSIZE;
       
  1865             blocks.append(b);
       
  1866         }
       
  1867         blockMap.append(blocks.at(index).index);
       
  1868     }
       
  1869 
       
  1870     int bmp_blocks = blocks.size();
       
  1871     Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
       
  1872 
       
  1873     for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
       
  1874         PropertyBlock b;
       
  1875         for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
       
  1876             int uc = block*SMP_BLOCKSIZE + i;
       
  1877             UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
       
  1878             b.properties.append(d.propertyIndex);
       
  1879         }
       
  1880         int index = blocks.indexOf(b);
       
  1881         if (index == -1) {
       
  1882             index = blocks.size();
       
  1883             b.index = used;
       
  1884             used += SMP_BLOCKSIZE;
       
  1885             blocks.append(b);
       
  1886         }
       
  1887         blockMap.append(blocks.at(index).index);
       
  1888     }
       
  1889 
       
  1890     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
       
  1891     int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
       
  1892     int bmp_mem = bmp_block_data + bmp_trie;
       
  1893     qDebug("    %d unique blocks in BMP.",blocks.size());
       
  1894     qDebug("        block data uses: %d bytes", bmp_block_data);
       
  1895     qDebug("        trie data uses : %d bytes", bmp_trie);
       
  1896 
       
  1897     int smp_block_data = (blocks.size()- bmp_blocks)*SMP_BLOCKSIZE*2;
       
  1898     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
       
  1899     int smp_mem = smp_block_data + smp_trie;
       
  1900     qDebug("    %d unique blocks in SMP.",blocks.size()-bmp_blocks);
       
  1901     qDebug("        block data uses: %d bytes", smp_block_data);
       
  1902     qDebug("        trie data uses : %d bytes", smp_trie);
       
  1903 
       
  1904     qDebug("\n        properties use : %d bytes", uniqueProperties.size()*20);
       
  1905     qDebug("    memory usage: %d bytes", bmp_mem+smp_mem + uniqueProperties.size()*20);
       
  1906 
       
  1907     QByteArray out;
       
  1908     out += "static const unsigned short uc_property_trie[] = {\n";
       
  1909 
       
  1910     // first write the map
       
  1911     out += "    // 0x" + QByteArray::number(BMP_END, 16);
       
  1912     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
       
  1913         if (!(i % 8)) {
       
  1914             if (out.endsWith(' '))
       
  1915                 out.chop(1);
       
  1916             if (!((i*BMP_BLOCKSIZE) % 0x1000))
       
  1917                 out += "\n";
       
  1918             out += "\n    ";
       
  1919         }
       
  1920         out += QByteArray::number(blockMap.at(i) + blockMap.size());
       
  1921         out += ", ";
       
  1922     }
       
  1923     if (out.endsWith(' '))
       
  1924         out.chop(1);
       
  1925     out += "\n\n    // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";;
       
  1926     for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
       
  1927         if (!(i % 8)) {
       
  1928             if (out.endsWith(' '))
       
  1929                 out.chop(1);
       
  1930             if (!(i % (0x10000/SMP_BLOCKSIZE)))
       
  1931                 out += "\n";
       
  1932             out += "\n    ";
       
  1933         }
       
  1934         out += QByteArray::number(blockMap.at(i) + blockMap.size());
       
  1935         out += ", ";
       
  1936     }
       
  1937     if (out.endsWith(' '))
       
  1938         out.chop(1);
       
  1939     out += "\n";
       
  1940     // write the data
       
  1941     for (int i = 0; i < blocks.size(); ++i) {
       
  1942         if (out.endsWith(' '))
       
  1943             out.chop(1);
       
  1944         out += "\n";
       
  1945         const PropertyBlock &b = blocks.at(i);
       
  1946         for (int j = 0; j < b.properties.size(); ++j) {
       
  1947             if (!(j % 8)) {
       
  1948                 if (out.endsWith(' '))
       
  1949                     out.chop(1);
       
  1950                 out += "\n    ";
       
  1951             }
       
  1952             out += QByteArray::number(b.properties.at(j));
       
  1953             out += ", ";
       
  1954         }
       
  1955     }
       
  1956 
       
  1957     // we reserve one bit more than in the assert below for the sign
       
  1958     Q_ASSERT(maxMirroredDiff < (1<<12));
       
  1959     Q_ASSERT(maxLowerCaseDiff < (1<<14));
       
  1960     Q_ASSERT(maxUpperCaseDiff < (1<<14));
       
  1961     Q_ASSERT(maxTitleCaseDiff < (1<<14));
       
  1962     Q_ASSERT(maxCaseFoldDiff < (1<<14));
       
  1963 
       
  1964     if (out.endsWith(' '))
       
  1965         out.chop(1);
       
  1966     out += "\n};\n\n"
       
  1967 
       
  1968            "#define GET_PROP_INDEX(ucs4) \\\n"
       
  1969            "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
       
  1970            "        ? (uc_property_trie[uc_property_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
       
  1971            "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
       
  1972            "        : (uc_property_trie[uc_property_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
       
  1973            ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
       
  1974            " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]))\n\n"
       
  1975            "#define GET_PROP_INDEX_UCS2(ucs2) \\\n"
       
  1976            "(uc_property_trie[uc_property_trie[ucs2>>" + QByteArray::number(BMP_SHIFT) +
       
  1977            "] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")])\n\n"
       
  1978 
       
  1979 
       
  1980            "static const QUnicodeTables::Properties uc_properties [] = {\n";
       
  1981 
       
  1982     // keep in sync with the property declaration
       
  1983     for (int i = 0; i < uniqueProperties.size(); ++i) {
       
  1984         PropertyFlags p = uniqueProperties.at(i);
       
  1985         out += "    { ";
       
  1986 //     "        ushort category : 8;\n"
       
  1987         out += QByteArray::number( p.category );
       
  1988         out += ", ";
       
  1989 //     "        ushort line_break_class : 8;\n"
       
  1990         out += QByteArray::number( p.line_break_class );
       
  1991         out += ", ";
       
  1992 //     "        ushort direction : 8;\n"
       
  1993         out += QByteArray::number( p.direction );
       
  1994         out += ", ";
       
  1995 //     "        ushort combiningClass :8;\n"
       
  1996         out += QByteArray::number( p.combiningClass );
       
  1997         out += ", ";
       
  1998 //     "        ushort joining : 2;\n"
       
  1999         out += QByteArray::number( p.joining );
       
  2000         out += ", ";
       
  2001 //     "        signed short digitValue : 6;\n /* 5 needed */"
       
  2002         out += QByteArray::number( p.digitValue );
       
  2003         out += ", ";
       
  2004 //     "        ushort unicodeVersion : 4;\n"
       
  2005         out += QByteArray::number( p.age );
       
  2006         out += ", ";
       
  2007 //     "        ushort lowerCaseSpecial : 1;\n"
       
  2008 //     "        ushort upperCaseSpecial : 1;\n"
       
  2009 //     "        ushort titleCaseSpecial : 1;\n"
       
  2010 //     "        ushort caseFoldSpecial : 1;\n"
       
  2011         out += QByteArray::number( p.lowerCaseSpecial );
       
  2012         out += ", ";
       
  2013         out += QByteArray::number( p.upperCaseSpecial );
       
  2014         out += ", ";
       
  2015         out += QByteArray::number( p.titleCaseSpecial );
       
  2016         out += ", ";
       
  2017         out += QByteArray::number( p.caseFoldSpecial );
       
  2018         out += ", ";
       
  2019 //     "        signed short mirrorDiff : 16;\n"
       
  2020 //     "        signed short lowerCaseDiff : 16;\n"
       
  2021 //     "        signed short upperCaseDiff : 16;\n"
       
  2022 //     "        signed short titleCaseDiff : 16;\n"
       
  2023 //     "        signed short caseFoldDiff : 16;\n"
       
  2024         out += QByteArray::number( p.mirrorDiff );
       
  2025         out += ", ";
       
  2026         out += QByteArray::number( p.lowerCaseDiff );
       
  2027         out += ", ";
       
  2028         out += QByteArray::number( p.upperCaseDiff );
       
  2029         out += ", ";
       
  2030         out += QByteArray::number( p.titleCaseDiff );
       
  2031         out += ", ";
       
  2032         out += QByteArray::number( p.caseFoldDiff );
       
  2033         out += ", ";
       
  2034         out += QByteArray::number( p.graphemeBreak );
       
  2035         out += ", ";
       
  2036         out += QByteArray::number( p.wordBreak );
       
  2037         out += ", ";
       
  2038         out += QByteArray::number( p.sentenceBreak );
       
  2039         out += "},\n";
       
  2040     }
       
  2041     out += "};\n\n";
       
  2042 
       
  2043     out += "static inline const QUnicodeTables::Properties *qGetProp(uint ucs4)\n"
       
  2044            "{\n"
       
  2045            "    int index = GET_PROP_INDEX(ucs4);\n"
       
  2046            "    return uc_properties + index;\n"
       
  2047            "}\n"
       
  2048            "\n"
       
  2049            "static inline const QUnicodeTables::Properties *qGetProp(ushort ucs2)\n"
       
  2050            "{\n"
       
  2051            "    int index = GET_PROP_INDEX_UCS2(ucs2);\n"
       
  2052            "    return uc_properties + index;\n"
       
  2053            "}\n"
       
  2054            "\n"
       
  2055            "Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(uint ucs4)\n"
       
  2056            "{\n"
       
  2057            "    int index = GET_PROP_INDEX(ucs4);\n"
       
  2058            "    return uc_properties + index;\n"
       
  2059            "}\n"
       
  2060            "\n"
       
  2061            "Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(ushort ucs2)\n"
       
  2062            "{\n"
       
  2063            "    int index = GET_PROP_INDEX_UCS2(ucs2);\n"
       
  2064            "    return uc_properties + index;\n"
       
  2065            "}\n\n";
       
  2066 
       
  2067     out += "#define CURRENT_VERSION "CURRENT_UNICODE_VERSION"\n\n";
       
  2068 
       
  2069     out += "static const ushort specialCaseMap [] = {";
       
  2070     for (int i = 0; i < specialCaseMap.size(); ++i) {
       
  2071         if (!(i % 16))
       
  2072             out += "\n   ";
       
  2073         out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i), 16);
       
  2074         if (i < specialCaseMap.size() - 1)
       
  2075             out += ",";
       
  2076     }
       
  2077     out += "\n};\n";
       
  2078     out += "#define SPECIAL_CASE_MAX_LEN " + QByteArray::number(specialCaseMaxLen) + "\n\n";
       
  2079 
       
  2080     qDebug() << "Special case map uses " << specialCaseMap.size()*2 << "bytes";
       
  2081 
       
  2082     return out;
       
  2083 }
       
  2084 
       
  2085 
       
  2086 struct DecompositionBlock {
       
       // One block of the lookup trie built by the table generators below.
       // decompositionPositions holds one entry per code point of the block
       // (an offset into the data table, or 0xffff when there is no entry);
       // index is the offset at which the block itself is stored and stays
       // -1 until the block has been added to the list of unique blocks.
  2087     DecompositionBlock() { index = -1; }
       
  2088     int index;
       
  2089     QList<int> decompositionPositions;
       
       // Two blocks are equal when their per-code-point positions match;
       // index is deliberately ignored so that duplicate blocks can be
       // shared.  Declared const so that const blocks can be compared too.
  2090     bool operator ==(const DecompositionBlock &other) const
       
  2091         { return decompositionPositions == other.decompositionPositions; }
       
  2092 };
       
  2093 
       
  2094 static QByteArray createCompositionInfo()
       
  2095 {
       
       // Builds the C source text for the uc_decomposition_trie and
       // uc_decomposition_map tables and the GET_DECOMPOSITION_INDEX macro.
       // Code points below BMP_END are grouped in blocks of BMP_BLOCKSIZE,
       // code points in [BMP_END, SMP_END) in blocks of SMP_BLOCKSIZE;
       // identical blocks are stored only once.  Returns the generated text.
  2096     qDebug("createCompositionInfo:");
       
  2097 
       
  2098     const int BMP_BLOCKSIZE=16;
       
  2099     const int BMP_SHIFT = 4;
       
  2100     const int BMP_END = 0x3400; // start of Han
       
  2101     const int SMP_END = 0x30000;
       
  2102     const int SMP_BLOCKSIZE = 256;
       
  2103     const int SMP_SHIFT = 8;
       
  2104 
       
  2105     if(SMP_END <= highestComposedCharacter)
       
  2106         qFatal("end of table smaller than highest composed character at %x", highestComposedCharacter);
       
  2107 
       
  2108     QList<DecompositionBlock> blocks;
       
  2109     QList<int> blockMap;
       
  2110     QList<unsigned short> decompositions;
       
  2111 
       
       // used: running offset at which the next unique block will be stored.
       // tableIndex: position in 'decompositions' of the next entry.
  2112     int used = 0;
       
  2113     int tableIndex = 0;
       
  2114 
       
  2115     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
       
  2116         DecompositionBlock b;
       
  2117         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
       
  2118             int uc = block*BMP_BLOCKSIZE + i;
       
  2119             UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
       
  2120             if (!d.decomposition.isEmpty()) {
       
  2121                 int utf16Chars = 0;
       
                       // Code points from 0x10000 upwards (inclusive) do not fit
                       // into one ushort and take two UTF-16 units.
  2122                 for (int j = 0; j < d.decomposition.size(); ++j)
       
  2123                     utf16Chars += d.decomposition.at(j) >= 0x10000 ? 2 : 1;
       
                       // Entry header: decomposition type plus the UTF-16 length
                       // shifted left by 8, followed by the UTF-16 units.
  2124                 decompositions.append(d.decompositionType + (utf16Chars<<8));
       
  2125                 for (int j = 0; j < d.decomposition.size(); ++j) {
       
  2126                     int code = d.decomposition.at(j);
       
  2127                     if (code >= 0x10000) {
       
  2128                         // save as surrogate pair
       
  2129                         code -= 0x10000;
       
  2130                         ushort high = code/0x400 + 0xd800;
       
  2131                         ushort low = code%0x400 + 0xdc00;
       
  2132                         decompositions.append(high);
       
  2133                         decompositions.append(low);
       
  2134                     } else {
       
  2135                         decompositions.append(code);
       
  2136                     }
       
  2137                 }
       
  2138                 b.decompositionPositions.append(tableIndex);
       
  2139                 tableIndex += utf16Chars + 1;
       
  2140             } else {
       
                       // 0xffff marks a code point without decomposition.
  2141                 b.decompositionPositions.append(0xffff);
       
  2142             }
       
  2143         }
       
               // Reuse an identical block if one was already stored.
  2144         int index = blocks.indexOf(b);
       
  2145         if (index == -1) {
       
  2146             index = blocks.size();
       
  2147             b.index = used;
       
  2148             used += BMP_BLOCKSIZE;
       
  2149             blocks.append(b);
       
  2150         }
       
  2151         blockMap.append(blocks.at(index).index);
       
  2152     }
       
  2153 
       
  2154     int bmp_blocks = blocks.size();
       
  2155     Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
       
  2156 
       
           // Same procedure for [BMP_END, SMP_END), using the larger block size.
  2157     for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
       
  2158         DecompositionBlock b;
       
  2159         for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
       
  2160             int uc = block*SMP_BLOCKSIZE + i;
       
  2161             UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
       
  2162             if (!d.decomposition.isEmpty()) {
       
  2163                 int utf16Chars = 0;
       
  2164                 for (int j = 0; j < d.decomposition.size(); ++j)
       
  2165                     utf16Chars += d.decomposition.at(j) >= 0x10000 ? 2 : 1;
       
  2166                 decompositions.append(d.decompositionType + (utf16Chars<<8));
       
  2167                 for (int j = 0; j < d.decomposition.size(); ++j) {
       
  2168                     int code = d.decomposition.at(j);
       
  2169                     if (code >= 0x10000) {
       
  2170                         // save as surrogate pair
       
  2171                         code -= 0x10000;
       
  2172                         ushort high = code/0x400 + 0xd800;
       
  2173                         ushort low = code%0x400 + 0xdc00;
       
  2174                         decompositions.append(high);
       
  2175                         decompositions.append(low);
       
  2176                     } else {
       
  2177                         decompositions.append(code);
       
  2178                     }
       
  2179                 }
       
  2180                 b.decompositionPositions.append(tableIndex);
       
  2181                 tableIndex += utf16Chars + 1;
       
  2182             } else {
       
  2183                 b.decompositionPositions.append(0xffff);
       
  2184             }
       
  2185         }
       
  2186         int index = blocks.indexOf(b);
       
  2187         if (index == -1) {
       
  2188             index = blocks.size();
       
  2189             b.index = used;
       
  2190             used += SMP_BLOCKSIZE;
       
  2191             blocks.append(b);
       
  2192         }
       
  2193         blockMap.append(blocks.at(index).index);
       
  2194     }
       
  2195 
       
           // Size statistics; every table element is written as an unsigned
           // short, hence the factor 2.
  2196     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
       
  2197     int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
       
  2198     int bmp_mem = bmp_block_data + bmp_trie;
       
           // Report bmp_blocks here: 'blocks' also contains the SMP blocks by now.
  2199     qDebug("    %d unique blocks in BMP.",bmp_blocks);
       
  2200     qDebug("        block data uses: %d bytes", bmp_block_data);
       
  2201     qDebug("        trie data uses : %d bytes", bmp_trie);
       
  2202     qDebug("        memory usage: %d bytes", bmp_mem);
       
  2203 
       
  2204     int smp_block_data = (blocks.size()- bmp_blocks)*SMP_BLOCKSIZE*2;
       
  2205     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
       
  2206     int smp_mem = smp_block_data + smp_trie;
       
  2207     qDebug("    %d unique blocks in SMP.",blocks.size()-bmp_blocks);
       
  2208     qDebug("        block data uses: %d bytes", smp_block_data);
       
  2209     qDebug("        trie data uses : %d bytes", smp_trie);
       
  2210 
       
  2211     qDebug("\n        decomposition table use : %d bytes", decompositions.size()*2);
       
  2212     qDebug("    memory usage: %d bytes", bmp_mem+smp_mem + decompositions.size()*2);
       
  2213 
       
  2214     QByteArray out;
       
  2215 
       
  2216     out += "static const unsigned short uc_decomposition_trie[] = {\n";
       
  2217 
       
  2218     // first write the map
       
           // Block offsets are shifted by blockMap.size() because the block data
           // follows the map inside the same array.
  2219     out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
       
  2220     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
       
  2221         if (!(i % 8)) {
       
  2222             if (out.endsWith(' '))
       
  2223                 out.chop(1);
       
  2224             if (!((i*BMP_BLOCKSIZE) % 0x1000))
       
  2225                 out += "\n";
       
  2226             out += "\n    ";
       
  2227         }
       
  2228         out += QByteArray::number(blockMap.at(i) + blockMap.size());
       
  2229         out += ", ";
       
  2230     }
       
  2231     if (out.endsWith(' '))
       
  2232         out.chop(1);
       
  2233     out += "\n\n    // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
       
  2234     for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
       
  2235         if (!(i % 8)) {
       
  2236             if (out.endsWith(' '))
       
  2237                 out.chop(1);
       
  2238             if (!(i % (0x10000/SMP_BLOCKSIZE)))
       
  2239                 out += "\n";
       
  2240             out += "\n    ";
       
  2241         }
       
  2242         out += QByteArray::number(blockMap.at(i) + blockMap.size());
       
  2243         out += ", ";
       
  2244     }
       
  2245     if (out.endsWith(' '))
       
  2246         out.chop(1);
       
  2247     out += "\n";
       
  2248     // write the data
       
  2249     for (int i = 0; i < blocks.size(); ++i) {
       
  2250         if (out.endsWith(' '))
       
  2251             out.chop(1);
       
  2252         out += "\n";
       
  2253         const DecompositionBlock &b = blocks.at(i);
       
  2254         for (int j = 0; j < b.decompositionPositions.size(); ++j) {
       
  2255             if (!(j % 8)) {
       
  2256                 if (out.endsWith(' '))
       
  2257                     out.chop(1);
       
  2258                 out += "\n    ";
       
  2259             }
       
  2260             out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16);
       
  2261             out += ", ";
       
  2262         }
       
  2263     }
       
  2264 
       
  2265     if (out.endsWith(' '))
       
  2266         out.chop(1);
       
           // The lookup macro mirrors the layout above: a two-step lookup with
           // the BMP parameters below BMP_END, with the SMP parameters below
           // SMP_END, and 0xffff (no decomposition) for everything else.
  2267     out += "\n};\n\n"
       
  2268 
       
  2269            "#define GET_DECOMPOSITION_INDEX(ucs4) \\\n"
       
  2270            "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
       
  2271            "        ? (uc_decomposition_trie[uc_decomposition_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
       
  2272            "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
       
  2273            "        : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n"
       
  2274            "           ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
       
  2275            ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
       
  2276            " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n"
       
  2277            "           : 0xffff))\n\n"
       
  2278 
       
  2279            "static const unsigned short uc_decomposition_map[] = {\n";
       
  2280 
       
  2281     for (int i = 0; i < decompositions.size(); ++i) {
       
  2282         if (!(i % 8)) {
       
  2283             if (out.endsWith(' '))
       
  2284                 out.chop(1);
       
  2285             out += "\n    ";
       
  2286         }
       
  2287         out += "0x" + QByteArray::number(decompositions.at(i), 16);
       
  2288         out += ", ";
       
  2289     }
       
  2290 
       
  2291     if (out.endsWith(' '))
       
  2292         out.chop(1);
       
  2293     out += "\n};\n\n";
       
  2294 
       
  2295     return out;
       
  2296 }
       
  2297 
       
  2298 static QByteArray createLigatureInfo()
       
  2299 {
       
       // Builds the C source text for the uc_ligature_trie and uc_ligature_map
       // tables and the GET_LIGATURE_INDEX macro.  Code points below BMP_END
       // are grouped in blocks of BMP_BLOCKSIZE and identical blocks are
       // stored only once.  Returns the generated text.
  2300     qDebug("createLigatureInfo: numLigatures=%d", numLigatures);
       
  2301 
       
  2302     QList<DecompositionBlock> blocks;
       
  2303     QList<int> blockMap;
       
  2304     QList<unsigned short> ligatures;
       
  2305 
       
  2306     const int BMP_BLOCKSIZE = 32;
       
  2307     const int BMP_SHIFT = 5;
       
  2308     const int BMP_END = 0x3100;
       
           // Checked with qFatal rather than Q_ASSERT so that a table that is too
           // small is also caught when assertions are compiled out.
  2309     if (highestLigature >= BMP_END)
       
               qFatal("end of table smaller than highest ligature at %x", highestLigature);
       
  2310 
       
           // used: running offset at which the next unique block will be stored.
           // tableIndex: position in 'ligatures' of the next entry.
  2311     int used = 0;
       
  2312     int tableIndex = 0;
       
  2313 
       
  2314     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
       
  2315         DecompositionBlock b;
       
  2316         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
       
  2317             int uc = block*BMP_BLOCKSIZE + i;
       
  2318             QList<Ligature> l = ligatureHashes.value(uc);
       
  2319             if (!l.isEmpty()) {
       
  2320                 b.decompositionPositions.append(tableIndex);
       
  2321                 qSort(l);
       
  2322 
       
                       // Entry layout: number of ligatures, then one
                       // (u1, ligature) pair per ligature, in sorted order.
  2323                 ligatures.append(l.size());
       
  2324                 for (int k = 0; k < l.size(); ++k) {
       
  2325                     Q_ASSERT(l.at(k).u2 == uc);
       
  2326                     ligatures.append(l.at(k).u1);
       
  2327                     ligatures.append(l.at(k).ligature);
       
  2328                 }
       
  2329                 tableIndex += 2*l.size() + 1;
       
  2330             } else {
       
                       // 0xffff marks a code point without ligatures.
  2331                 b.decompositionPositions.append(0xffff);
       
  2332             }
       
  2333         }
       
               // Reuse an identical block if one was already stored.
  2334         int index = blocks.indexOf(b);
       
  2335         if (index == -1) {
       
  2336             index = blocks.size();
       
  2337             b.index = used;
       
  2338             used += BMP_BLOCKSIZE;
       
  2339             blocks.append(b);
       
  2340         }
       
  2341         blockMap.append(blocks.at(index).index);
       
  2342     }
       
  2343 
       
  2344     int bmp_blocks = blocks.size();
       
  2345     Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
       
  2346 
       
           // Size statistics; every table element is written as an unsigned
           // short, hence the factor 2.
  2347     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
       
  2348     int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
       
  2349     int bmp_mem = bmp_block_data + bmp_trie;
       
  2350     qDebug("    %d unique blocks in BMP.",blocks.size());
       
  2351     qDebug("        block data uses: %d bytes", bmp_block_data);
       
  2352     qDebug("        trie data uses : %d bytes", bmp_trie);
       
  2353     qDebug("        ligature data uses : %d bytes", ligatures.size()*2);
       
  2354     qDebug("        memory usage: %d bytes", bmp_mem + ligatures.size() * 2);
       
  2355 
       
  2356     QByteArray out;
       
  2357 
       
  2358 
       
  2359     out += "static const unsigned short uc_ligature_trie[] = {\n";
       
  2360 
       
  2361     // first write the map
       
           // Block offsets are shifted by blockMap.size() because the block data
           // follows the map inside the same array.
  2362     out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
       
  2363     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
       
  2364         if (!(i % 8)) {
       
  2365             if (out.endsWith(' '))
       
  2366                 out.chop(1);
       
  2367             if (!((i*BMP_BLOCKSIZE) % 0x1000))
       
  2368                 out += "\n";
       
  2369             out += "\n    ";
       
  2370         }
       
  2371         out += QByteArray::number(blockMap.at(i) + blockMap.size());
       
  2372         out += ", ";
       
  2373     }
       
  2374     if (out.endsWith(' '))
       
  2375         out.chop(1);
       
  2376     out += "\n";
       
  2377     // write the data
       
  2378     for (int i = 0; i < blocks.size(); ++i) {
       
  2379         if (out.endsWith(' '))
       
  2380             out.chop(1);
       
  2381         out += "\n";
       
  2382         const DecompositionBlock &b = blocks.at(i);
       
  2383         for (int j = 0; j < b.decompositionPositions.size(); ++j) {
       
  2384             if (!(j % 8)) {
       
  2385                 if (out.endsWith(' '))
       
  2386                     out.chop(1);
       
  2387                 out += "\n    ";
       
  2388             }
       
  2389             out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16);
       
  2390             out += ", ";
       
  2391         }
       
  2392     }
       
  2393     if (out.endsWith(' '))
       
  2394         out.chop(1);
       
           // NOTE(review): the generated macro text ends with ';'.  It is kept
           // as is because the code using the macro is not visible from here.
  2395     out += "\n};\n\n"
       
  2396 
       
  2397            "#define GET_LIGATURE_INDEX(u2) "
       
  2398            "(u2 < 0x" + QByteArray::number(BMP_END, 16) + " ? "
       
  2399            "uc_ligature_trie[uc_ligature_trie[u2>>" + QByteArray::number(BMP_SHIFT) +
       
  2400            "] + (u2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")] : 0xffff);\n\n"
       
  2401 
       
  2402            "static const unsigned short uc_ligature_map [] = {\n";
       
  2403 
       
  2404     for (int i = 0; i < ligatures.size(); ++i) {
       
  2405         if (!(i % 8)) {
       
  2406             if (out.endsWith(' '))
       
  2407                 out.chop(1);
       
  2408             out += "\n    ";
       
  2409         }
       
  2410         out += "0x" + QByteArray::number(ligatures.at(i), 16);
       
  2411         out += ", ";
       
  2412     }
       
  2413 
       
  2414     if (out.endsWith(' '))
       
  2415         out.chop(1);
       
  2416     out += "\n};\n\n";
       
  2417 
       
  2418     return out;
       
  2419 }
       
  2420 
       
  2421 QByteArray createCasingInfo()
       
  2422 {
       
           // Returns the C source text declaring the CasingInfo bit-field struct
           // (16-bit codePoint, 8-bit flags, 8-bit offset).
  2423     static const char casingInfoDeclaration[] =
       
  2425         "struct CasingInfo {\n"
       
  2426         "    uint codePoint : 16;\n"
       
  2427         "    uint flags : 8;\n"
       
  2428         "    uint offset : 8;\n"
       
  2429         "};\n\n";
       
  2430 
       
  2431     return QByteArray(casingInfoDeclaration);
       
  2432 }
       
  2433 
       
  2434 int main(int, char **)
       
  2435 {
       
       // Generator entry point: initialises the lookup maps, reads the Unicode
       // data files, builds the table text and writes qunicodetables.cpp and
       // qunicodetables_p.h.  Aborts via qFatal if an output file cannot be
       // opened for writing.
  2436     initCategoryMap();
       
  2437     initDirectionMap();
       
  2438     initDecompositionMap();
       
  2439     initGraphemeBreak();
       
  2440     initWordBreak();
       
  2441     initSentenceBreak();
       
  2442     
       
  2443     readUnicodeData();
       
  2444     readBidiMirroring();
       
  2445     readArabicShaping();
       
  2446     readDerivedAge();
       
  2447     readCompositionExclusion();
       
  2448     readLineBreak();
       
  2449     readSpecialCasing();
       
  2450     readCaseFolding();
       
  2451     // readBlocks();
       
  2452     readScripts();
       
  2453     readGraphemeBreak();
       
  2454     readWordBreak();
       
  2455     readSentenceBreak();
       
  2456 
       
  2457     computeUniqueProperties();
       
  2458     QByteArray properties = createPropertyInfo();
       
  2459     QByteArray compositions = createCompositionInfo();
       
  2460     QByteArray ligatures = createLigatureInfo();
       
  2461     QByteArray normalizationCorrections = createNormalizationCorrections();
       
  2462     QByteArray scriptEnumDeclaration = createScriptEnumDeclaration();
       
  2463     QByteArray scriptTableDeclaration = createScriptTableDeclaration();
       
  2464 
       
  2465     QFile f("../../src/corelib/tools/qunicodetables.cpp");
       
           // Fail loudly instead of silently writing nothing when the output
           // path is missing or not writable.
  2466     if (!f.open(QFile::WriteOnly|QFile::Truncate))
       
               qFatal("Cannot open %s for writing: %s", qPrintable(f.fileName()), qPrintable(f.errorString()));
       
  2467 
       
  2468     QByteArray header =
       
  2469         "/****************************************************************************\n"
       
  2470         "**\n"
       
  2471         "** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).\n"
       
  2472         "** All rights reserved.\n"
       
  2473         "** Contact: Nokia Corporation (qt-info@nokia.com)\n"
       
  2474         "**\n"
       
  2475         "** This file is part of the QtCore module of the Qt Toolkit.\n"
       
  2476         "**\n"
       
  2477         "** $QT_BEGIN_LICENSE:LGPL$\n"
       
  2478         "** No Commercial Usage\n"
       
  2479         "** This file contains pre-release code and may not be distributed.\n"
       
  2480         "** You may use this file in accordance with the terms and conditions\n"
       
  2481         "** contained in the Technology Preview License Agreement accompanying\n"
       
  2482         "** this package.\n"
       
  2483         "**\n"
       
  2484         "** GNU Lesser General Public License Usage\n"
       
  2485         "** Alternatively, this file may be used under the terms of the GNU Lesser\n"
       
  2486         "** General Public License version 2.1 as published by the Free Software\n"
       
  2487         "** Foundation and appearing in the file LICENSE.LGPL included in the\n"
       
  2488         "** packaging of this file.  Please review the following information to\n"
       
  2489         "** ensure the GNU Lesser General Public License version 2.1 requirements\n"
       
  2490         "** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.\n"
       
  2491         "**\n"
       
  2492         "** In addition, as a special exception, Nokia gives you certain additional\n"
       
  2493         "** rights.  These rights are described in the Nokia Qt LGPL Exception\n"
       
  2494         "** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.\n"
       
  2495         "**\n"
       
  2496         "** If you have questions regarding the use of this file, please contact\n"
       
  2497         "** Nokia at qt-info@nokia.com.\n"
       
  2498         "**\n"
       
  2499         "**\n"
       
  2500         "**\n"
       
  2501         "**\n"
       
  2502         "**\n"
       
  2503         "**\n"
       
  2504         "**\n"
       
  2505         "**\n"
       
  2506         "** $QT_END_LICENSE$\n"
       
  2507         "**\n"
       
  2508         "****************************************************************************/\n\n"
       
  2509 
       
  2510         "/* This file is autogenerated from the Unicode 5.0 database. Do not edit */\n\n";
       
  2511 
       
  2512     QByteArray warning =
       
  2513         "//\n"
       
  2514         "//  W A R N I N G\n"
       
  2515         "//  -------------\n"
       
  2516         "//\n"
       
  2517         "// This file is not part of the Qt API.  It exists for the convenience\n"
       
  2518         "// of internal files.  This header file may change from version to version\n"
       
  2519         "// without notice, or even be removed.\n"
       
  2520         "//\n"
       
  2521         "// We mean it.\n"
       
  2522         "//\n\n";
       
  2523 
       
           // qunicodetables.cpp: license header followed by all generated tables.
  2524     f.write(header);
       
  2525     f.write("QT_BEGIN_NAMESPACE\n\n");
       
  2526     f.write(properties);
       
  2527     f.write(compositions);
       
  2528     f.write(ligatures);
       
  2529     f.write(normalizationCorrections);
       
  2530     f.write(scriptTableDeclaration);
       
  2531     f.write("\nQT_END_NAMESPACE\n");
       
  2532     f.close();
       
  2533 
       
           // qunicodetables_p.h: declarations inside namespace QUnicodeTables.
  2534     f.setFileName("../../src/corelib/tools/qunicodetables_p.h");
       
  2535     if (!f.open(QFile::WriteOnly | QFile::Truncate))
       
               qFatal("Cannot open %s for writing: %s", qPrintable(f.fileName()), qPrintable(f.errorString()));
       
  2536     f.write(header);
       
  2537     f.write(warning);
       
  2538     f.write("#ifndef QUNICODETABLES_P_H\n"
       
  2539             "#define QUNICODETABLES_P_H\n\n"
       
  2540             "#include <QtCore/qchar.h>\n\n"
       
  2541             "QT_BEGIN_NAMESPACE\n\n");
       
  2542     f.write("namespace QUnicodeTables {\n");
       
  2543     f.write(property_string);
       
  2544     f.write("\n");
       
  2545     f.write(scriptEnumDeclaration);
       
  2546     f.write("\n");
       
  2547     f.write(lineBreakClass);
       
  2548     f.write("\n");
       
  2549     f.write(methods);
       
  2550     f.write("\n");
       
  2551     f.write(grapheme_break_string);
       
  2552     f.write("\n");
       
  2553     f.write(word_break_string);
       
  2554     f.write("\n");
       
  2555     f.write(sentence_break_string);
       
  2556     f.write("\n}\n\n"
       
  2557             "QT_END_NAMESPACE\n\n"
       
  2558             "#endif\n");
       
  2559     f.close();
       
  2560 
       
  2561     qDebug() << "maxMirroredDiff  = " << hex << maxMirroredDiff;
       
  2562     qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff;
       
  2563     qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff;
       
  2564     qDebug() << "maxTitleCaseDiff = " << hex << maxTitleCaseDiff;
       
  2565     qDebug() << "maxCaseFoldDiff  = " << hex << maxCaseFoldDiff;
       
       // Disabled diagnostic output, never compiled.
  2566 #if 0
       
  2567 //     dump(0, 0x7f);
       
  2568 //     dump(0x620, 0x640);
       
  2569 //     dump(0x10000, 0x10020);
       
  2570 //     dump(0x10800, 0x10820);
       
  2571 
       
  2572     qDebug("decompositionLength used:");
       
  2573     int totalcompositions = 0;
       
  2574     int sum = 0;
       
  2575     for (int i = 1; i < 20; ++i) {
       
  2576         qDebug("    length %d used %d times", i, decompositionLength.value(i, 0));
       
  2577         totalcompositions += i*decompositionLength.value(i, 0);
       
  2578         sum += decompositionLength.value(i, 0);
       
  2579     }
       
  2580     qDebug("    len decomposition map %d, average length %f, num composed chars %d",
       
  2581            totalcompositions, (float)totalcompositions/(float)sum,  sum);
       
  2582     qDebug("highest composed character %x", highestComposedCharacter);
       
  2583     qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature);
       
  2584 
       
  2585     qBubbleSort(ligatures);
       
  2586     for (int i = 0; i < ligatures.size(); ++i)
       
  2587         qDebug("%s", ligatures.at(i).data());
       
  2588 
       
  2589 //     qDebug("combiningClass usage:");
       
  2590 //     int numClasses = 0;
       
  2591 //     for (int i = 0; i < 255; ++i) {
       
  2592 //         int num = combiningClassUsage.value(i, 0);
       
  2593 //         if (num) {
       
  2594 //             ++numClasses;
       
  2595 //             qDebug("    combiningClass %d used %d times", i, num);
       
  2596 //         }
       
  2597 //     }
       
  2598 //     qDebug("total of %d combining classes used", numClasses);
       
  2599 
       
  2600 #endif
       
  2601 }
       
  2602