util/unicode/main.cpp
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Mon, 15 Mar 2010 12:43:09 +0200
branchRCL_3
changeset 6 dee5afe5301f
parent 4 3b1da2848fc7
permissions -rw-r--r--
Revision: 201008 Kit: 201010

/****************************************************************************
**
** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
** All rights reserved.
** Contact: Nokia Corporation (qt-info@nokia.com)
**
** This file is part of the utils of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** No Commercial Usage
** This file contains pre-release code and may not be distributed.
** You may use this file in accordance with the terms and conditions
** contained in the Technology Preview License Agreement accompanying
** this package.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 2.1 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL included in the
** packaging of this file.  Please review the following information to
** ensure the GNU Lesser General Public License version 2.1 requirements
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Nokia gives you certain additional
** rights.  These rights are described in the Nokia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** If you have questions regarding the use of this file, please contact
** Nokia at qt-info@nokia.com.
**
**
**
**
**
**
**
**
** $QT_END_LICENSE$
**
****************************************************************************/
#include <qlist.h>
#include <qhash.h>
#include <qfile.h>
#include <qstring.h>
#include <qchar.h>
#include <private/qunicodetables_p.h>
#include <qvector.h>
#include <qdebug.h>


static struct AgeMap {
    const char *age;
    const QChar::UnicodeVersion version;
} ageMap [] = {
    { "1.1", QChar::Unicode_1_1 },
    { "2.0", QChar::Unicode_2_0 },
    { "2.1", QChar::Unicode_2_1_2 },
    { "3.0", QChar::Unicode_3_0 },
    { "3.1", QChar::Unicode_3_1 },
    { "3.2", QChar::Unicode_3_2 },
    { "4.0", QChar::Unicode_4_0 },
    { "4.1", QChar::Unicode_4_1 },
    { "5.0", QChar::Unicode_5_0 },
    { 0, QChar::Unicode_Unassigned }
};
#define CURRENT_UNICODE_VERSION "QChar::Unicode_5_0"

static const char *grapheme_break_string =
    "    enum GraphemeBreak {\n"
    "        GraphemeBreakOther, \n"
    "        GraphemeBreakCR,\n"
    "        GraphemeBreakLF,\n"
    "        GraphemeBreakControl,\n"
    "        GraphemeBreakExtend,\n"
    "        GraphemeBreakL,\n"
    "        GraphemeBreakV,\n"
    "        GraphemeBreakT,\n"
    "        GraphemeBreakLV,\n"
    "        GraphemeBreakLVT\n"
    "    };\n\n";

enum GraphemeBreak {
    GraphemeBreakOther,
    GraphemeBreakCR,
    GraphemeBreakLF,
    GraphemeBreakControl,
    GraphemeBreakExtend,
    GraphemeBreakL,
    GraphemeBreakV,
    GraphemeBreakT,
    GraphemeBreakLV,
    GraphemeBreakLVT
};

QHash<QByteArray, GraphemeBreak> grapheme_break_map;

static void initGraphemeBreak()
{
    struct GraphemeBreakList {
        GraphemeBreak brk;
        const char *name;
    } breaks[] = {
        { GraphemeBreakOther, "Other" },
        { GraphemeBreakCR, "CR" },
        { GraphemeBreakLF, "LF" },
        { GraphemeBreakControl, "Control" },
        { GraphemeBreakExtend, "Extend" },
        { GraphemeBreakL, "L" },
        { GraphemeBreakV, "V" },
        { GraphemeBreakT, "T" },
        { GraphemeBreakLV, "LV" },
        { GraphemeBreakLVT, "LVT" },
        { GraphemeBreakOther, 0 }
    };
    GraphemeBreakList *d = breaks;
    while (d->name) {
        grapheme_break_map.insert(d->name, d->brk);
        ++d;
    }
}

const char *word_break_string =
    "    enum WordBreak {\n"
    "        WordBreakOther,\n"
    "        WordBreakFormat,\n"
    "        WordBreakKatakana,\n"
    "        WordBreakALetter,\n"
    "        WordBreakMidLetter,\n"
    "        WordBreakMidNum,\n"
    "        WordBreakNumeric,\n"
    "        WordBreakExtendNumLet\n"
    "    };\n\n";

enum WordBreak {
    WordBreakOther,
    WordBreakFormat,
    WordBreakKatakana,
    WordBreakALetter,
    WordBreakMidLetter,
    WordBreakMidNum,
    WordBreakNumeric,
    WordBreakExtendNumLet
};


QHash<QByteArray, WordBreak> word_break_map;

static void initWordBreak()
{
    struct WordBreakList {
        WordBreak brk;
        const char *name;
    } breaks[] = {
        { WordBreakFormat, "Format" },
        { WordBreakFormat, "Extend" }, // these are copied in from GraphemeBreakProperty.txt
        { WordBreakKatakana, "Katakana" },
        { WordBreakALetter, "ALetter" },
        { WordBreakMidLetter, "MidLetter" },
        { WordBreakMidNum, "MidNum" },
        { WordBreakNumeric, "Numeric" },
        { WordBreakExtendNumLet, "ExtendNumLet" },
        { WordBreakFormat,  0 }
    };
    WordBreakList *d = breaks;
    while (d->name) {
        word_break_map.insert(d->name, d->brk);
        ++d;
    }
}


static const char *sentence_break_string =
    "    enum SentenceBreak {\n"
    "        SentenceBreakOther,\n"
    "        SentenceBreakSep,\n"
    "        SentenceBreakFormat,\n"
    "        SentenceBreakSp,\n"
    "        SentenceBreakLower,\n"
    "        SentenceBreakUpper,\n"
    "        SentenceBreakOLetter,\n"
    "        SentenceBreakNumeric,\n"
    "        SentenceBreakATerm,\n"
    "        SentenceBreakSTerm,\n"
    "        SentenceBreakClose\n"
    "    };\n\n";

enum SentenceBreak {
    SentenceBreakOther,
    SentenceBreakSep,
    SentenceBreakFormat,
    SentenceBreakSp,
    SentenceBreakLower,
    SentenceBreakUpper,
    SentenceBreakOLetter,
    SentenceBreakNumeric,
    SentenceBreakATerm,
    SentenceBreakSTerm,
    SentenceBreakClose
};


QHash<QByteArray, SentenceBreak> sentence_break_map;

static void initSentenceBreak()
{
    struct SentenceBreakList {
        SentenceBreak brk;
        const char *name;
    } breaks[] = {
        { SentenceBreakOther, "Other" },
        { SentenceBreakSep, "Sep" },
        { SentenceBreakFormat, "Format" },
        { SentenceBreakSp, "Sp" },
        { SentenceBreakLower, "Lower" },
        { SentenceBreakUpper, "Upper" },
        { SentenceBreakOLetter, "OLetter" },
        { SentenceBreakNumeric, "Numeric" },
        { SentenceBreakATerm, "ATerm" },
        { SentenceBreakSTerm, "STerm" },
        { SentenceBreakClose, "Close" },
        { SentenceBreakOther,  0 }
    };
    SentenceBreakList *d = breaks;
    while (d->name) {
        sentence_break_map.insert(d->name, d->brk);
        ++d;
    }
}


// Keep this one in sync with the code in createPropertyInfo
const char *property_string =
    "    struct Properties {\n"
    "        ushort category : 8;\n"
    "        ushort line_break_class : 8;\n"
    "        ushort direction : 8;\n"
    "        ushort combiningClass :8;\n"
    "        ushort joining : 2;\n"
    "        signed short digitValue : 6; /* 5 needed */\n"
    "        ushort unicodeVersion : 4;\n"
    "        ushort lowerCaseSpecial : 1;\n"
    "        ushort upperCaseSpecial : 1;\n"
    "        ushort titleCaseSpecial : 1;\n"
    "        ushort caseFoldSpecial : 1; /* currently unused */\n"
    "        signed short mirrorDiff : 16;\n"
    "        signed short lowerCaseDiff : 16;\n"
    "        signed short upperCaseDiff : 16;\n"
    "        signed short titleCaseDiff : 16;\n"
    "        signed short caseFoldDiff : 16;\n"
    "        ushort graphemeBreak : 8;\n"
    "        ushort wordBreak : 8;\n"
    "        ushort sentenceBreak : 8;\n"
    "    };\n"
    "    Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n"
    "    Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n";

const char *lineBreakClass =
    "    // see http://www.unicode.org/reports/tr14/tr14-19.html\n"
    "    // we don't use the XX, AI and CB properties and map them to AL instead.\n"
    "    // as we don't support any EBDIC based OS'es, NL is ignored and mapped to AL as well.\n"
    "    enum LineBreakClass {\n"
    "        LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,\n"
    "        LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO,\n"
    "        LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY,\n"
    "        LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM,\n"
    "        LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,\n"
    "        LineBreak_JT, LineBreak_SA, LineBreak_SG,\n"
    "        LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n"
    "    };\n\n";

const char *methods =
    "    Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n"
    "    inline int lineBreakClass(const QChar &ch) {\n"
    "        return QUnicodeTables::lineBreakClass(ch.unicode());\n"
    "    }\n"
    "\n"
    "    Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4);\n"
    "    Q_CORE_EXPORT_INLINE int QT_FASTCALL script(const QChar &ch) {\n"
    "        return script(ch.unicode());\n"
    "    }\n\n";


struct PropertyFlags {
    bool operator ==(const PropertyFlags &o) {
        return (combiningClass == o.combiningClass
                && category == o.category
                && direction == o.direction
                && joining == o.joining
                && age == o.age
                && digitValue == o.digitValue
                && line_break_class == o.line_break_class
                && mirrorDiff == o.mirrorDiff
                && lowerCaseDiff == o.lowerCaseDiff
                && upperCaseDiff == o.upperCaseDiff
                && titleCaseDiff == o.titleCaseDiff
                && caseFoldDiff == o.caseFoldDiff
                && lowerCaseSpecial == o.lowerCaseSpecial
                && upperCaseSpecial == o.upperCaseSpecial
                && titleCaseSpecial == o.titleCaseSpecial
                && caseFoldSpecial == o.caseFoldSpecial
                && graphemeBreak == o.graphemeBreak
                && wordBreak == o.wordBreak
                && sentenceBreak == o.sentenceBreak
            );
    }
    // from UnicodeData.txt
    uchar combiningClass : 8;
    QChar::Category category : 5;
    QChar::Direction direction : 5;
    // from ArabicShaping.txt
    QChar::Joining joining : 2;
    // from DerivedAge.txt
    QChar::UnicodeVersion age : 4;
    int digitValue;
    uint line_break_class : 5;

    int mirrorDiff : 16;

    int lowerCaseDiff;
    int upperCaseDiff;
    int titleCaseDiff;
    int caseFoldDiff;
    bool lowerCaseSpecial;
    bool upperCaseSpecial;
    bool titleCaseSpecial;
    bool caseFoldSpecial;
    GraphemeBreak graphemeBreak;
    WordBreak wordBreak;
    SentenceBreak sentenceBreak;
};

QList<int> specialCaseMap;
int specialCaseMaxLen = 0;

static int appendToSpecialCaseMap(const QList<int> &map)
{
    QList<int> utf16map;
    for (int i = 0; i < map.size(); ++i) {
        int val = map.at(i);
        if (val > 0xffff) {
            utf16map << QChar::highSurrogate(val);
            utf16map << QChar::lowSurrogate(val);
        } else {
            utf16map << val;
        }
    }
    specialCaseMaxLen = qMax(specialCaseMaxLen, utf16map.size());
    utf16map << 0;

    for (int i = 0; i < specialCaseMap.size() - utf16map.size() - 1; ++i) {
        int j;
        for (j = 0; j < utf16map.size(); ++j) {
            if (specialCaseMap.at(i+j) != utf16map.at(j))
                break;
        }
        if (j == utf16map.size())
            return i;
    }

    int pos = specialCaseMap.size();
    specialCaseMap << utf16map;
    return pos;
}

struct UnicodeData {
    UnicodeData(int codepoint = 0) {
        p.category = QChar::NoCategory;
        p.combiningClass = 0;

        p.direction = QChar::DirL;
        // DirR for:  U+0590..U+05FF, U+07C0..U+08FF, U+FB1D..U+FB4F, U+10800..U+10FFF
        if ((codepoint >= 0x590 && codepoint <= 0x5ff)
            || (codepoint >= 0x7c0 && codepoint <= 0x8ff)
            || (codepoint >= 0xfb1d && codepoint <= 0xfb4f)
            || (codepoint >= 0x10800 && codepoint <= 0x10fff))
            p.direction = QChar::DirR;
        // DirAL for: U+0600..U+07BF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFE
        if ((codepoint >= 0x600 && codepoint <= 0x7bf)
            || (codepoint >= 0xfb50 && codepoint <= 0xfdcf)
            || (codepoint >= 0xfdf0 && codepoint <= 0xfdff)
            || (codepoint >= 0xfe70 && codepoint <= 0xfefe))
            p.direction = QChar::DirAL;

        mirroredChar = 0;
        decompositionType = QChar::NoDecomposition;
        p.joining = QChar::OtherJoining;
        p.age = QChar::Unicode_Unassigned;
        p.mirrorDiff = 0;
        p.digitValue = -1;
        p.line_break_class = QUnicodeTables::LineBreak_AL;
        p.lowerCaseDiff = 0;
        p.upperCaseDiff = 0;
        p.titleCaseDiff = 0;
        p.caseFoldDiff = 0;
        p.lowerCaseSpecial = 0;
        p.upperCaseSpecial = 0;
        p.titleCaseSpecial = 0;
        p.caseFoldSpecial = 0;
        p.graphemeBreak = GraphemeBreakOther;
        p.wordBreak = WordBreakOther;
        p.sentenceBreak = SentenceBreakOther;
        propertyIndex = -1;
        excludedComposition = false;
    }
    PropertyFlags p;

    // from UnicodeData.txt
    QChar::Decomposition decompositionType;
    QList<int> decomposition;

    QList<int> specialFolding;

    // from BidiMirroring.txt
    int mirroredChar;

    // CompositionExclusions.txt
    bool excludedComposition;

    // computed position of unicode property set
    int propertyIndex;
};

enum UniDataFields {
    UD_Value,
    UD_Name,
    UD_Category,
    UD_CombiningClass,
    UD_BidiCategory,
    UD_Decomposition,
    UD_DecimalDigitValue,
    UD_DigitValue,
    UD_NumericValue,
    UD_Mirrored,
    UD_OldName,
    UD_Comment,
    UD_UpperCase,
    UD_LowerCase,
    UD_TitleCase
};

QHash<QByteArray, QChar::Category> categoryMap;

static void initCategoryMap()
{
    struct Cat {
        QChar::Category cat;
        const char *name;
    } categories [] = {
        { QChar::Mark_NonSpacing,          "Mn" },
        { QChar::Mark_SpacingCombining,    "Mc" },
        { QChar::Mark_Enclosing,           "Me" },

        { QChar::Number_DecimalDigit,      "Nd" },
        { QChar::Number_Letter,            "Nl" },
        { QChar::Number_Other,             "No" },

        { QChar::Separator_Space,          "Zs" },
        { QChar::Separator_Line,           "Zl" },
        { QChar::Separator_Paragraph,      "Zp" },

        { QChar::Other_Control,            "Cc" },
        { QChar::Other_Format,             "Cf" },
        { QChar::Other_Surrogate,          "Cs" },
        { QChar::Other_PrivateUse,         "Co" },
        { QChar::Other_NotAssigned,        "Cn" },

        { QChar::Letter_Uppercase,         "Lu" },
        { QChar::Letter_Lowercase,         "Ll" },
        { QChar::Letter_Titlecase,         "Lt" },
        { QChar::Letter_Modifier,          "Lm" },
        { QChar::Letter_Other,             "Lo" },

        { QChar::Punctuation_Connector,    "Pc" },
        { QChar::Punctuation_Dash,         "Pd" },
        { QChar::Punctuation_Open,         "Ps" },
        { QChar::Punctuation_Close,        "Pe" },
        { QChar::Punctuation_InitialQuote, "Pi" },
        { QChar::Punctuation_FinalQuote,   "Pf" },
        { QChar::Punctuation_Other,        "Po" },

        { QChar::Symbol_Math,              "Sm" },
        { QChar::Symbol_Currency,          "Sc" },
        { QChar::Symbol_Modifier,          "Sk" },
        { QChar::Symbol_Other,             "So" },
        { QChar::NoCategory, 0 }
    };
    Cat *c = categories;
    while (c->cat != QChar::NoCategory) {
        categoryMap.insert(c->name, c->cat);
        ++c;
    }
}

QHash<QByteArray, QChar::Direction> directionMap;

static void initDirectionMap()
{
    struct Dir {
        QChar::Direction dir;
        const char *name;
    } directions[] = {
        { QChar::DirL, "L" },
        { QChar::DirR, "R" },
        { QChar::DirEN, "EN" },
        { QChar::DirES, "ES" },
        { QChar::DirET, "ET" },
        { QChar::DirAN, "AN" },
        { QChar::DirCS, "CS" },
        { QChar::DirB, "B" },
        { QChar::DirS, "S" },
        { QChar::DirWS, "WS" },
        { QChar::DirON, "ON" },
        { QChar::DirLRE, "LRE" },
        { QChar::DirLRO, "LRO" },
        { QChar::DirAL, "AL" },
        { QChar::DirRLE, "RLE" },
        { QChar::DirRLO, "RLO" },
        { QChar::DirPDF, "PDF" },
        { QChar::DirNSM, "NSM" },
        { QChar::DirBN, "BN" },
        { QChar::DirL, 0 }
    };
    Dir *d = directions;
    while (d->name) {
        directionMap.insert(d->name, d->dir);
        ++d;
    }
}


QHash<QByteArray, QChar::Decomposition> decompositionMap;

static void initDecompositionMap()
{
    struct Dec {
        QChar::Decomposition dec;
        const char *name;
    } decompositions[] = {
        { QChar::Canonical, "<canonical>" },
        { QChar::Font, "<font>" },
        { QChar::NoBreak, "<noBreak>" },
        { QChar::Initial, "<initial>" },
        { QChar::Medial, "<medial>" },
        { QChar::Final, "<final>" },
        { QChar::Isolated, "<isolated>" },
        { QChar::Circle, "<circle>" },
        { QChar::Super, "<super>" },
        { QChar::Sub, "<sub>" },
        { QChar::Vertical, "<vertical>" },
        { QChar::Wide, "<wide>" },
        { QChar::Narrow, "<narrow>" },
        { QChar::Small, "<small>" },
        { QChar::Square, "<square>" },
        { QChar::Compat, "<compat>" },
        { QChar::Fraction, "<fraction>" },
        { QChar::NoDecomposition,  0 }
    };
    Dec *d = decompositions;
    while (d->name) {
        decompositionMap.insert(d->name, d->dec);
        ++d;
    }
}


QHash<int, UnicodeData> unicodeData;
QList<PropertyFlags> uniqueProperties;


QHash<int, int> decompositionLength;
int highestComposedCharacter = 0;
int numLigatures = 0;
int highestLigature = 0;

struct Ligature {ushort u1; ushort u2; ushort ligature;};
// we need them sorted after the first component for fast lookup
bool operator < (const Ligature &l1, const Ligature &l2) {
    return l1.u1 < l2.u1;
}

QHash<ushort, QList<Ligature> > ligatureHashes;

QHash<int, int> combiningClassUsage;

int maxLowerCaseDiff = 0;
int maxUpperCaseDiff = 0;
int maxTitleCaseDiff = 0;

static void readUnicodeData()
{
    QFile f("data/UnicodeData.txt");
    if (!f.exists())
        qFatal("Couldn't find UnicodeData.txt");

    f.open(QFile::ReadOnly);

    while (!f.atEnd()) {
        QByteArray line;
        line.resize(1024);
        int len = f.readLine(line.data(), 1024);
        line.truncate(len-1);

        int comment = line.indexOf('#');
        if (comment >= 0)
            line = line.left(comment);
        if (line.isEmpty())
            continue;

        QList<QByteArray> properties = line.split(';');
        bool ok;
        int codepoint = properties[UD_Value].toInt(&ok, 16);
        int lastCodepoint = codepoint;

        QByteArray name = properties[UD_Name];
        if (name.startsWith('<') && name.contains("First")) {
            QByteArray nextLine;
            nextLine.resize(1024);
            f.readLine(nextLine.data(), 1024);
            QList<QByteArray> properties = nextLine.split(';');
            lastCodepoint = properties[UD_Value].toInt(&ok, 16);
        }

        UnicodeData data(codepoint);
        data.p.category = categoryMap.value(properties[UD_Category], QChar::NoCategory);
        data.p.combiningClass = properties[UD_CombiningClass].toInt();

        if (!combiningClassUsage.contains(data.p.combiningClass))
            combiningClassUsage[data.p.combiningClass] = 1;
        else
            ++combiningClassUsage[data.p.combiningClass];

        data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction);

        if (!properties[UD_UpperCase].isEmpty()) {
            int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
            Q_ASSERT(ok);
            data.p.upperCaseDiff = upperCase - codepoint;
            maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(data.p.upperCaseDiff));
            if (codepoint > 0xffff) {
                // if the condition below doesn't hold anymore we need to modify our case folding code
                //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
                Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase));
            }
        }
        if (!properties[UD_LowerCase].isEmpty()) {
            int lowerCase = properties[UD_LowerCase].toInt(&ok, 16);
            Q_ASSERT (ok);
            data.p.lowerCaseDiff = lowerCase - codepoint;
            maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(data.p.lowerCaseDiff));
            if (codepoint > 0xffff) {
                // if the condition below doesn't hold anymore we need to modify our case folding code
                //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
                Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(lowerCase));
            }
        }
        // we want toTitleCase to map to ToUpper in case we don't have any titlecase.
        if (properties[UD_TitleCase].isEmpty())
            properties[UD_TitleCase] = properties[UD_UpperCase];
        if (!properties[UD_TitleCase].isEmpty()) {
            int titleCase = properties[UD_TitleCase].toInt(&ok, 16);
            Q_ASSERT (ok);
            data.p.titleCaseDiff = titleCase - codepoint;
            maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(data.p.titleCaseDiff));
            if (codepoint > 0xffff) {
                // if the condition below doesn't hold anymore we need to modify our case folding code
                //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
                Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(titleCase));
            }
        }

        if (!properties[UD_DigitValue].isEmpty())
            data.p.digitValue = properties[UD_DigitValue].toInt();

        // decompositition
        QByteArray decomposition = properties[UD_Decomposition];
        if (!decomposition.isEmpty()) {
            highestComposedCharacter = qMax(highestComposedCharacter, codepoint);
            QList<QByteArray> d = decomposition.split(' ');
            if (d[0].contains('<')) {
                data.decompositionType = decompositionMap.value(d[0], QChar::Canonical);
                d.takeFirst();
            } else {
                data.decompositionType = QChar::Canonical;
            }
            for (int i = 0; i < d.size(); ++i)
                data.decomposition.append(d[i].toInt(&ok, 16));
            if (!decompositionLength.contains(data.decomposition.size()))
                decompositionLength[data.decomposition.size()] = 1;
            else
                ++decompositionLength[data.decomposition.size()];
        }

        for (int i = codepoint; i <= lastCodepoint; ++i)
            unicodeData.insert(i, data);
    }

}

static int maxMirroredDiff = 0;

static void readBidiMirroring()
{
    QFile f("data/BidiMirroring.txt");
    if (!f.exists())
        qFatal("Couldn't find BidiMirroring.txt");

    f.open(QFile::ReadOnly);

    while (!f.atEnd()) {
        QByteArray line;
        line.resize(1024);
        int len = f.readLine(line.data(), 1024);
        line.resize(len-1);

        int comment = line.indexOf('#');
        if (comment >= 0)
            line = line.left(comment);

        if (line.isEmpty())
            continue;
        line = line.replace(" ", "");

        QList<QByteArray> pair = line.split(';');
        Q_ASSERT(pair.size() == 2);

        bool ok;
        int codepoint = pair[0].toInt(&ok, 16);
        int mirror = pair[1].toInt(&ok, 16);

        UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
        d.mirroredChar = mirror;
        if (qAbs(codepoint-d.mirroredChar) > maxMirroredDiff)
            maxMirroredDiff = qAbs(codepoint - d.mirroredChar);

        d.p.mirrorDiff = d.mirroredChar - codepoint;
        unicodeData.insert(codepoint, d);
    }
}

static void readArabicShaping()
{
    QFile f("data/ArabicShaping.txt");
    if (!f.exists())
        qFatal("Couldn't find ArabicShaping.txt");

    f.open(QFile::ReadOnly);

    while (!f.atEnd()) {
        QByteArray line;
        line.resize(1024);
        int len = f.readLine(line.data(), 1024);
        line.resize(len-1);

        int comment = line.indexOf('#');
        if (comment >= 0)
            line = line.left(comment);
        line = line.trimmed();

        if (line.isEmpty())
            continue;

        QList<QByteArray> shaping = line.split(';');
        Q_ASSERT(shaping.size() == 4);

        bool ok;
        int codepoint = shaping[0].toInt(&ok, 16);
        QChar::Joining j = QChar::OtherJoining;
        QByteArray shape = shaping[2].trimmed();
        if (shape == "R")
            j = QChar::Right;
        else if (shape == "D")
            j = QChar::Dual;
        else if (shape == "C")
            j = QChar::Center;

        UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
        d.p.joining = j;
        unicodeData.insert(codepoint, d);
    }
}

static void readDerivedAge()
{
    QFile f("data/DerivedAge.txt");
    if (!f.exists())
        qFatal("Couldn't find DerivedAge.txt");

    f.open(QFile::ReadOnly);

    while (!f.atEnd()) {
        QByteArray line;
        line.resize(1024);
        int len = f.readLine(line.data(), 1024);
        line.resize(len-1);

        int comment = line.indexOf('#');
        if (comment >= 0)
            line = line.left(comment);
        line.replace(" ", "");

        if (line.isEmpty())
            continue;

        QList<QByteArray> l = line.split(';');
        Q_ASSERT(l.size() == 2);

        QByteArray codes = l[0];
        codes.replace("..", ".");
        QList<QByteArray> cl = codes.split('.');

        bool ok;
        int from = cl[0].toInt(&ok, 16);
        int to = from;
        if (cl.size() == 2)
            to = cl[1].toInt(&ok, 16);

        QChar::UnicodeVersion age = QChar::Unicode_Unassigned;
        QByteArray ba = l[1];
        AgeMap *map = ageMap;
        while (map->age) {
            if (ba == map->age) {
                age = map->version;
                break;
            }
            ++map;
        }
        //qDebug() << hex << from << ".." << to << ba << age;
        Q_ASSERT(age != QChar::Unicode_Unassigned);

        for (int codepoint = from; codepoint <= to; ++codepoint) {
            UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
            d.p.age = age;
            unicodeData.insert(codepoint, d);
        }
    }
}


static void readCompositionExclusion()
{
    QFile f("data/CompositionExclusions.txt");
    if (!f.exists())
        qFatal("Couldn't find CompositionExclusions.txt");

    f.open(QFile::ReadOnly);

    while (!f.atEnd()) {
        QByteArray line;
        line.resize(1024);
        int len = f.readLine(line.data(), 1024);
        line.resize(len-1);

        int comment = line.indexOf('#');
        if (comment >= 0)
            line = line.left(comment);
        line.replace(" ", "");

        if (line.isEmpty())
            continue;

        Q_ASSERT(!line.contains(".."));

        bool ok;
        int codepoint = line.toInt(&ok, 16);

        UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
        d.excludedComposition = true;
        unicodeData.insert(codepoint, d);
    }

    for (int i = 0; i < 0x110000; ++i) {
        UnicodeData data = unicodeData.value(i, UnicodeData(i));
        if (!data.excludedComposition
            && data.decompositionType == QChar::Canonical
            && data.decomposition.size() > 1) {
            Q_ASSERT(data.decomposition.size() == 2);

            uint part1 = data.decomposition.at(0);
            uint part2 = data.decomposition.at(1);
            UnicodeData first = unicodeData.value(part1, UnicodeData(part1));
            if (first.p.combiningClass != 0)
                continue;

            ++numLigatures;
            highestLigature = qMax(highestLigature, (int)part1);
            Ligature l = {(ushort)part1, (ushort)part2, i};
            ligatureHashes[part2].append(l);
        }
    }
}

struct NormalizationCorrection {
    uint codepoint;
    uint mapped;
    uint version;
};

static QByteArray createNormalizationCorrections()
{
    QFile f("data/NormalizationCorrections.txt");
    if (!f.exists())
        qFatal("Couldn't find NormalizationCorrections.txt");

    f.open(QFile::ReadOnly);

    QByteArray out;

    out += "struct NormalizationCorrection {\n"
           "    uint ucs4;\n"
           "    uint old_mapping;\n"
           "    int version;\n"
           "};\n\n"

           "static const NormalizationCorrection uc_normalization_corrections[] = {\n";

    int numCorrections = 0;
    while (!f.atEnd()) {
        QByteArray line;
        line.resize(1024);
        int len = f.readLine(line.data(), 1024);
        line.resize(len-1);

        int comment = line.indexOf('#');
        if (comment >= 0)
            line = line.left(comment);
        line.replace(" ", "");

        if (line.isEmpty())
            continue;

        Q_ASSERT(!line.contains(".."));

        QList<QByteArray> fields = line.split(';');
        Q_ASSERT(fields.size() == 4);

        NormalizationCorrection c;
        bool ok;
        c.codepoint = fields.at(0).toInt(&ok, 16);
        c.mapped = fields.at(1).toInt(&ok, 16);
        if (fields.at(3) == "3.2.0")
            c.version = QChar::Unicode_3_2;
        else if (fields.at(3) == "4.0.0")
            c.version = QChar::Unicode_4_0;
        else
            qFatal("unknown unicode version in NormalizationCorrection.txt");

        out += "    { 0x" + QByteArray::number(c.codepoint, 16) + ", 0x" + QByteArray::number(c.mapped, 16)
             + ", " + QString::number(c.version) + " },\n";
        ++numCorrections;
    }

    out += "};\n\n"

           "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n\n";


    return out;
}


static void computeUniqueProperties()
{
    qDebug("computeUniqueProperties:");
    for (int uc = 0; uc < 0x110000; ++uc) {
        UnicodeData d = unicodeData.value(uc, UnicodeData(uc));

        int index = uniqueProperties.indexOf(d.p);
        if (index == -1) {
            index = uniqueProperties.size();
            uniqueProperties.append(d.p);
        }
        d.propertyIndex = index;
        unicodeData.insert(uc, d);
    }
    qDebug("    %d unicode properties found", uniqueProperties.size());
}


static void readLineBreak()
{
    QFile f("data/LineBreak.txt");
    if (!f.exists())
        qFatal("Couldn't find LineBreak.txt");

    f.open(QFile::ReadOnly);

    while (!f.atEnd()) {
        QByteArray line;
        line.resize(1024);
        int len = f.readLine(line.data(), 1024);
        line.resize(len-1);

        int comment = line.indexOf('#');
        if (comment >= 0)
            line = line.left(comment);
        line.replace(" ", "");

        if (line.isEmpty())
            continue;

        QList<QByteArray> l = line.split(';');
        Q_ASSERT(l.size() == 2);

        QByteArray codes = l[0];
        codes.replace("..", ".");
        QList<QByteArray> cl = codes.split('.');

        bool ok;
        int from = cl[0].toInt(&ok, 16);
        int to = from;
        if (cl.size() == 2)
            to = cl[1].toInt(&ok, 16);

        // ### Classes XX and AI are left out and mapped to AL for now
        QUnicodeTables::LineBreakClass lb = QUnicodeTables::LineBreak_AL;
        QByteArray ba = l[1];

        if (ba == "AI") lb = QUnicodeTables::LineBreak_AL;
        else if (ba == "XX") lb = QUnicodeTables::LineBreak_AL;
        else if (ba == "NL") lb = QUnicodeTables::LineBreak_AL;
        else if (ba == "OP") lb = QUnicodeTables::LineBreak_OP;
        else if (ba == "CL") lb = QUnicodeTables::LineBreak_CL;
        else if (ba == "QU") lb = QUnicodeTables::LineBreak_QU;
        else if (ba == "GL") lb = QUnicodeTables::LineBreak_GL;
        else if (ba == "NS") lb = QUnicodeTables::LineBreak_NS;
        else if (ba == "EX") lb = QUnicodeTables::LineBreak_EX;
        else if (ba == "SY") lb = QUnicodeTables::LineBreak_SY;
        else if (ba == "IS") lb = QUnicodeTables::LineBreak_IS;
        else if (ba == "PR") lb = QUnicodeTables::LineBreak_PR;
        else if (ba == "PO") lb = QUnicodeTables::LineBreak_PO;
        else if (ba == "NU") lb = QUnicodeTables::LineBreak_NU;
        else if (ba == "AL") lb = QUnicodeTables::LineBreak_AL;
        else if (ba == "ID") lb = QUnicodeTables::LineBreak_ID;
        else if (ba == "IN") lb = QUnicodeTables::LineBreak_IN;
        else if (ba == "HY") lb = QUnicodeTables::LineBreak_HY;
        else if (ba == "BA") lb = QUnicodeTables::LineBreak_BA;
        else if (ba == "BB") lb = QUnicodeTables::LineBreak_BB;
        else if (ba == "B2") lb = QUnicodeTables::LineBreak_B2;
        else if (ba == "ZW") lb = QUnicodeTables::LineBreak_ZW;
        else if (ba == "CM") lb = QUnicodeTables::LineBreak_CM;
        else if (ba == "SA") lb = QUnicodeTables::LineBreak_SA;
        else if (ba == "BK") lb = QUnicodeTables::LineBreak_BK;
        else if (ba == "CR") lb = QUnicodeTables::LineBreak_CR;
        else if (ba == "LF") lb = QUnicodeTables::LineBreak_LF;
        else if (ba == "SG") lb = QUnicodeTables::LineBreak_SG;
        else if (ba == "CB") lb = QUnicodeTables::LineBreak_AL;
        else if (ba == "SP") lb = QUnicodeTables::LineBreak_SP;
        else if (ba == "WJ") lb = QUnicodeTables::LineBreak_WJ;
        else if (ba == "H2") lb = QUnicodeTables::LineBreak_H2;
        else if (ba == "H3") lb = QUnicodeTables::LineBreak_H3;
        else if (ba == "JL") lb = QUnicodeTables::LineBreak_JL;
        else if (ba == "JV") lb = QUnicodeTables::LineBreak_JV;
        else if (ba == "JT") lb = QUnicodeTables::LineBreak_JT;
        else {
            qDebug() << "unhandled line break class:" << ba;
        }

        for (int codepoint = from; codepoint <= to; ++codepoint) {
            UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
            d.p.line_break_class = lb;
            unicodeData.insert(codepoint, d);
        }
    }
}


static void readSpecialCasing()
{
//     qDebug() << "Reading SpecialCasing.txt";
    QFile f("data/SpecialCasing.txt");
    if (!f.exists())
        qFatal("Couldn't find SpecialCasing.txt");

    f.open(QFile::ReadOnly);

    while (!f.atEnd()) {
        QByteArray line;
        line.resize(1024);
        int len = f.readLine(line.data(), 1024);
        line.resize(len-1);

        int comment = line.indexOf('#');
        if (comment >= 0)
            line = line.left(comment);

        if (line.isEmpty())
            continue;

        QList<QByteArray> l = line.split(';');

        QByteArray condition = l.size() < 5 ? QByteArray() : l[4].trimmed();
        if (!condition.isEmpty())
            // #####
            continue;

        bool ok;
        int codepoint = l[0].trimmed().toInt(&ok, 16);
        Q_ASSERT(ok);
        Q_ASSERT(codepoint <= 0xffff);

//         qDebug() << "codepoint" << hex << codepoint;
//         qDebug() << line;

        QList<QByteArray> lower = l[1].trimmed().split(' ');
        QList<int> lowerMap;
        for (int i = 0; i < lower.size(); ++i) {
            bool ok;
            lowerMap.append(lower.at(i).toInt(&ok, 16));
            Q_ASSERT(ok);
        }

        QList<QByteArray> title = l[2].trimmed().split(' ');
        QList<int> titleMap;
        for (int i = 0; i < title.size(); ++i) {
            bool ok;
            titleMap.append(title.at(i).toInt(&ok, 16));
            if (!ok)
                qDebug() << line << title.at(i);
            Q_ASSERT(ok);
        }

        QList<QByteArray> upper = l[3].trimmed().split(' ');
        QList<int> upperMap;
        for (int i = 0; i < upper.size(); ++i) {
            bool ok;
            upperMap.append(upper.at(i).toInt(&ok, 16));
            Q_ASSERT(ok);
        }


        UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));

        Q_ASSERT(lowerMap.size() > 1 || lowerMap.at(0) == codepoint + ud.p.lowerCaseDiff);
        Q_ASSERT(titleMap.size() > 1 || titleMap.at(0) == codepoint + ud.p.titleCaseDiff);
        Q_ASSERT(upperMap.size() > 1 || upperMap.at(0) == codepoint + ud.p.upperCaseDiff);

        if (lowerMap.size() > 1) {
            ud.p.lowerCaseSpecial = true;
            ud.p.lowerCaseDiff = appendToSpecialCaseMap(lowerMap);
        }
        if (titleMap.size() > 1) {
            ud.p.titleCaseSpecial = true;
            ud.p.titleCaseDiff = appendToSpecialCaseMap(titleMap);
        }
        if (upperMap.size() > 1) {
            ud.p.upperCaseSpecial = true;
            ud.p.upperCaseDiff = appendToSpecialCaseMap(upperMap);;
        }

        unicodeData.insert(codepoint, ud);
    }
}

int maxCaseFoldDiff = 0;

static void readCaseFolding()
{
    qDebug() << "Reading CaseFolding.txt";
    QFile f("data/CaseFolding.txt");
    if (!f.exists())
        qFatal("Couldn't find CaseFolding.txt");

    f.open(QFile::ReadOnly);

    while (!f.atEnd()) {
        QByteArray line;
        line.resize(1024);
        int len = f.readLine(line.data(), 1024);
        line.resize(len-1);

        int comment = line.indexOf('#');
        if (comment >= 0)
            line = line.left(comment);

        if (line.isEmpty())
            continue;

        QList<QByteArray> l = line.split(';');

        bool ok;
        uint codepoint = l[0].trimmed().toInt(&ok, 16);
        Q_ASSERT(ok);


        l[1] = l[1].trimmed();
        if (l[1] == "F" || l[1] == "T")
            continue;

//         qDebug() << "codepoint" << hex << codepoint;
//         qDebug() << line;
        QList<QByteArray> fold = l[2].trimmed().split(' ');
        QList<int> foldMap;
        for (int i = 0; i < fold.size(); ++i) {
            bool ok;
            foldMap.append(fold.at(i).toInt(&ok, 16));
            Q_ASSERT(ok);
        }

        UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
        if (foldMap.size() == 1) {
            ud.p.caseFoldDiff = foldMap.at(0) - codepoint;
            maxCaseFoldDiff = qMax(maxCaseFoldDiff, ud.p.caseFoldDiff);
            if (codepoint > 0xffff) {
                // if the condition below doesn't hold anymore we need to modify our case folding code
                //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
                Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(foldMap.at(0)));
            }
            if (foldMap.at(0) != codepoint + ud.p.lowerCaseDiff)
                qDebug() << hex << codepoint;
        } else {
            Q_ASSERT(false); // we currently don't support full case foldings
//             qDebug() << "special" << hex << foldMap;
            ud.p.caseFoldSpecial = true;
            ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
        }
        unicodeData.insert(codepoint, ud);
    }
}

static void readGraphemeBreak()
{
    qDebug() << "Reading GraphemeBreakProperty.txt";
    QFile f("data/GraphemeBreakProperty.txt");
    if (!f.exists())
        qFatal("Couldn't find GraphemeBreakProperty.txt");

    f.open(QFile::ReadOnly);

    while (!f.atEnd()) {
        QByteArray line;
        line.resize(1024);
        int len = f.readLine(line.data(), 1024);
        line.resize(len-1);

        int comment = line.indexOf('#');
        if (comment >= 0)
            line = line.left(comment);

        if (line.isEmpty())
            continue;

        QList<QByteArray> l = line.split(';');

        QByteArray codes = l[0].trimmed();
        codes.replace("..", ".");
        QList<QByteArray> cl = codes.split('.');

        bool ok;
        int from = cl[0].toInt(&ok, 16);
        Q_ASSERT(ok);
        int to = from;
        if (cl.size() == 2) {
            to = cl[1].toInt(&ok, 16);
            Q_ASSERT(ok);
        }

        GraphemeBreak brk = grapheme_break_map.value(l[1].trimmed(), GraphemeBreakOther);

        for (int codepoint = from; codepoint <= to; ++codepoint) {
            UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
            ud.p.graphemeBreak = brk;
            unicodeData.insert(codepoint, ud);
        }
    }
}

static void readWordBreak()
{
    qDebug() << "Reading WordBreakProperty.txt";
    QFile f("data/WordBreakProperty.txt");
    if (!f.exists())
        qFatal("Couldn't find WordBreakProperty.txt");

    f.open(QFile::ReadOnly);

    while (!f.atEnd()) {
        QByteArray line;
        line.resize(1024);
        int len = f.readLine(line.data(), 1024);
        line.resize(len-1);

        int comment = line.indexOf('#');
        if (comment >= 0)
            line = line.left(comment);

        if (line.isEmpty())
            continue;

        QList<QByteArray> l = line.split(';');

        QByteArray codes = l[0].trimmed();
        codes.replace("..", ".");
        QList<QByteArray> cl = codes.split('.');

        bool ok;
        int from = cl[0].toInt(&ok, 16);
        Q_ASSERT(ok);
        int to = from;
        if (cl.size() == 2) {
            to = cl[1].toInt(&ok, 16);
            Q_ASSERT(ok);
        }

        WordBreak brk = word_break_map.value(l[1].trimmed(), WordBreakOther);
        Q_ASSERT(brk != WordBreakOther);

        for (int codepoint = from; codepoint <= to; ++codepoint) {
            UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
            ud.p.wordBreak = brk;
            unicodeData.insert(codepoint, ud);
        }
    }
}

static void readSentenceBreak()
{
    qDebug() << "Reading SentenceBreakProperty.txt";
    QFile f("data/SentenceBreakProperty.txt");
    if (!f.exists())
        qFatal("Couldn't find SentenceBreakProperty.txt");

    f.open(QFile::ReadOnly);

    while (!f.atEnd()) {
        QByteArray line;
        line.resize(1024);
        int len = f.readLine(line.data(), 1024);
        line.resize(len-1);

        int comment = line.indexOf('#');
        if (comment >= 0)
            line = line.left(comment);

        if (line.isEmpty())
            continue;

        QList<QByteArray> l = line.split(';');

        QByteArray codes = l[0].trimmed();
        codes.replace("..", ".");
        QList<QByteArray> cl = codes.split('.');

        bool ok;
        int from = cl[0].toInt(&ok, 16);
        Q_ASSERT(ok);
        int to = from;
        if (cl.size() == 2) {
            to = cl[1].toInt(&ok, 16);
            Q_ASSERT(ok);
        }

        SentenceBreak brk = sentence_break_map.value(l[1].trimmed(), SentenceBreakOther);
        Q_ASSERT(brk != SentenceBreakOther);

        for (int codepoint = from; codepoint <= to; ++codepoint) {
            UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
            ud.p.sentenceBreak = brk;
            unicodeData.insert(codepoint, ud);
        }
    }
}

#if 0
// this piece of code does full case folding and comparison. We currently
// don't use it, since this gives lots of issues with things as case insensitive
// search and replace.
static inline void foldCase(uint ch, ushort *out)
{
    const QUnicodeTables::Properties *p = qGetProp(ch);
    if (!p->caseFoldSpecial) {
        *(out++) = ch + p->caseFoldDiff;
    } else {
        const ushort *folded = specialCaseMap + p->caseFoldDiff;
        while (*folded)
            *out++ = *folded++;
    }
    *out = 0;
}

static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be)
{
    if (a == b)
        return 0;
    if (a == 0)
        return 1;
    if (b == 0)
        return -1;

    while (a != ae && b != be) {
        const QUnicodeTables::Properties *pa = qGetProp(*a);
        const QUnicodeTables::Properties *pb = qGetProp(*b);
        if (pa->caseFoldSpecial | pb->caseFoldSpecial)
            goto special;
            int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
        if ((diff))
            return diff;
        ++a;
        ++b;
        }
    }
    if (a == ae) {
        if (b == be)
            return 0;
        return -1;
    }
    return 1;
special:
    ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
    ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
    abuf[0] = bbuf[0] = 0;
    ushort *ap = abuf;
    ushort *bp = bbuf;
    while (1) {
        if (!*ap) {
            if (a == ae) {
                if (!*bp && b == be)
                    return 0;
                return -1;
            }
            foldCase(*(a++), abuf);
            ap = abuf;
        }
        if (!*bp) {
            if (b == be)
                return 1;
            foldCase(*(b++), bbuf);
            bp = bbuf;
        }
        if (*ap != *bp)
            return (int)*ap - (int)*bp;
        ++ap;
        ++bp;
    }
}


static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
{
    if (a == 0)
        return 1;
    if (b == 0)
        return -1;

    while (a != ae && *b) {
        const QUnicodeTables::Properties *pa = qGetProp(*a);
        const QUnicodeTables::Properties *pb = qGetProp((ushort)*b);
        if (pa->caseFoldSpecial | pb->caseFoldSpecial)
            goto special;
        int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
        if ((diff))
            return diff;
        ++a;
        ++b;
    }
    if (a == ae) {
        if (!*b)
            return 0;
        return -1;
    }
    return 1;

special:
    ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
    ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
    abuf[0] = bbuf[0] = 0;
    ushort *ap = abuf;
    ushort *bp = bbuf;
    while (1) {
        if (!*ap) {
            if (a == ae) {
                if (!*bp && !*b)
                    return 0;
                return -1;
            }
            foldCase(*(a++), abuf);
            ap = abuf;
        }
        if (!*bp) {
            if (!*b)
                return 1;
            foldCase(*(b++), bbuf);
            bp = bbuf;
        }
        if (*ap != *bp)
            return (int)*ap - (int)*bp;
        ++ap;
        ++bp;
    }
}
#endif

#if 0
static QList<QByteArray> blockNames;
struct BlockInfo
{
    int blockIndex;
    int firstCodePoint;
    int lastCodePoint;
};
static QList<BlockInfo> blockInfoList;

static void readBlocks()
{
    QFile f("data/Blocks.txt");
    if (!f.exists())
        qFatal("Couldn't find Blocks.txt");

    f.open(QFile::ReadOnly);

    while (!f.atEnd()) {
        QByteArray line = f.readLine();
        line.resize(line.size() - 1);

        int comment = line.indexOf("#");
        if (comment >= 0)
            line = line.left(comment);

        line.replace(" ", "");

        if (line.isEmpty())
            continue;

        int semicolon = line.indexOf(';');
        Q_ASSERT(semicolon >= 0);
        QByteArray codePoints = line.left(semicolon);
        QByteArray blockName = line.mid(semicolon + 1);

        int blockIndex = blockNames.indexOf(blockName);
        if (blockIndex < 0) {
            blockNames.append(blockName);
            blockIndex = blockNames.indexOf(blockName);
            Q_ASSERT(blockIndex >= 0);
        }

        int dotdot = codePoints.indexOf("..");
        Q_ASSERT(dotdot >= 0);
        bool unused;
        int first = codePoints.left(dotdot).toInt(&unused, 16);
        int last = codePoints.mid(dotdot + 2).toInt(&unused, 16);

        BlockInfo blockInfo = { blockIndex, first, last };
        blockInfoList.append(blockInfo);
    }
}
#endif

static QList<QByteArray> scriptNames;
static QHash<int, int> scriptAssignment;
static QHash<int, int> scriptHash;

struct ExtraBlock {
    int block;
    QVector<int> vector;
};

static QList<ExtraBlock> extraBlockList;


static void readScripts()
{
    scriptNames.append("Common");

    static const char *files[] = {
        "data/ScriptsInitial.txt",
        "data/Scripts.txt",
        "data/ScriptsCorrections.txt"
    };
    enum { fileCount = sizeof(files) / sizeof(const char *) };

    for (int i = 0; i < fileCount; ++i) {
        QFile f(files[i]);
        if (!f.exists())
            qFatal("Couldn't find %s", files[i]);


        f.open(QFile::ReadOnly);

        while (!f.atEnd()) {
            QByteArray line = f.readLine();
            line.resize(line.size() - 1);

            int comment = line.indexOf("#");
            if (comment >= 0)
                line = line.left(comment);

            line.replace(" ", "");
            line.replace("_", "");

            if (line.isEmpty())
                continue;

            int semicolon = line.indexOf(';');
            Q_ASSERT(semicolon >= 0);
            QByteArray codePoints = line.left(semicolon);
            QByteArray scriptName = line.mid(semicolon + 1);

            int scriptIndex = scriptNames.indexOf(scriptName);
            if (scriptIndex < 0) {
                scriptNames.append(scriptName);
                scriptIndex = scriptNames.indexOf(scriptName);
                Q_ASSERT(scriptIndex >= 0);
            }

            int dotdot = codePoints.indexOf("..");
            bool unused;
            int first = -1, last = -1;
            if (dotdot >= 0) {
                first = codePoints.left(dotdot).toInt(&unused, 16);
                last = codePoints.mid(dotdot + 2).toInt(&unused, 16);
            } else {
                first = codePoints.toInt(&unused, 16);
            }

            if (last != -1) {
                for (int i = first; i <= last; ++i)
                    scriptAssignment[i] = scriptIndex;
            } else {
                scriptAssignment[first] = scriptIndex;
            }
        }
    }
}


static int scriptSentinel = 0;

QByteArray createScriptEnumDeclaration()
{
    static const char *specialScripts[] = {
        "Common",
        "Arabic",
        "Armenian",
        "Bengali",
        "Cyrillic",
        "Devanagari",
        "Georgian",
        "Greek",
        "Gujarati",
        "Gurmukhi",
        "Hangul",
        "Hebrew",
        "Kannada",
        "Khmer",
        "Lao",
        "Malayalam",
        "Myanmar",
        "Ogham",
        "Oriya",
        "Runic",
        "Sinhala",
        "Syriac",
        "Tamil",
        "Telugu",
        "Thaana",
        "Thai",
        "Tibetan",
        "Inherited"
    };
    const int specialScriptsCount = sizeof(specialScripts) / sizeof(const char *);

    // generate script enum
    QByteArray declaration;

    declaration += "    // See http://www.unicode.org/reports/tr24/tr24-5.html\n\n";
    declaration += "    enum Script {\n        Common";

    int uniqueScripts = 1; // Common

    // output the ones with special processing first
    for (int i = 1; i < scriptNames.size(); ++i) {
        QByteArray scriptName = scriptNames.at(i);
        // does the script require special processing?
        bool special = false;
        for (int s = 0; !special && s < specialScriptsCount; ++s) {
            if (scriptName == specialScripts[s])
                special = true;
        }
        if (!special) {
            scriptHash[i] =  0; // alias for 'Common'
            continue;
        } else {
            ++uniqueScripts;
            scriptHash[i] = i;
        }

        declaration += ",\n        ";
        declaration += scriptName;
    }
    declaration += ",\n        ScriptCount = Inherited";

    // output the ones that are an alias for 'Common'
    for (int i = 1; i < scriptNames.size(); ++i) {
        if (scriptHash.value(i) != 0)
            continue;
        QByteArray scriptName = scriptNames.at(i);
        scriptName += " = Common";
        declaration += ",\n        ";
        declaration += scriptName;
    }

    declaration += "\n    };\n";

    scriptSentinel = ((uniqueScripts + 16) / 32) * 32; // a multiple of 32
    declaration += "    enum { ScriptSentinel = ";
    declaration += QByteArray::number(scriptSentinel);
    declaration += " };\n\n";
    return declaration;
}

QByteArray createScriptTableDeclaration()
{
    Q_ASSERT(scriptSentinel > 0);

    QByteArray declaration;

    const int unicodeBlockCount = 512; // number of unicode blocks
    const int unicodeBlockSize = 128; // size of each block
    declaration = "enum { UnicodeBlockCount = ";
    declaration += QByteArray::number(unicodeBlockCount);
    declaration += " }; // number of unicode blocks\n";
    declaration += "enum { UnicodeBlockSize = ";
    declaration += QByteArray::number(unicodeBlockSize);
    declaration += " }; // size of each block\n\n";

    // script table
    declaration += "namespace QUnicodeTables {\n\nstatic const unsigned char uc_scripts[] = {\n";
    for (int i = 0; i < unicodeBlockCount; ++i) {
        int block = (((i << 7) & 0xff00) | ((i & 1) * 0x80));
        int blockAssignment[unicodeBlockSize];
        for (int x = 0; x < unicodeBlockSize; ++x) {
            int codePoint = (i << 7) | x;
            blockAssignment[x] = scriptAssignment.value(codePoint, 0);
        }
        bool allTheSame = true;
        const int originalScript = blockAssignment[0];
        const int script = scriptHash.value(originalScript);
        for (int x = 1; allTheSame && x < unicodeBlockSize; ++x) {
            const int s = scriptHash.value(blockAssignment[x]);
            if (s != script)
                allTheSame = false;
        }

        if (allTheSame) {
            declaration += "    ";
            declaration += scriptNames.value(originalScript);
            declaration += ", /* U+";
            declaration += QByteArray::number(block, 16).rightJustified(4, '0');
            declaration += '-';
            declaration +=
                QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
            declaration += " */\n";
        } else {
            const int value = extraBlockList.size() + scriptSentinel;
            const int offset =
                ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount;

            declaration += "    ";
            declaration += QByteArray::number(value);
            declaration += ", /* U+";
            declaration += QByteArray::number(block, 16).rightJustified(4, '0');
            declaration += '-';
            declaration +=
                QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
            declaration += " at offset ";
            declaration += QByteArray::number(offset);
            declaration += " */\n";

            ExtraBlock extraBlock;
            extraBlock.block = block;
            extraBlock.vector.resize(unicodeBlockSize);
            for (int x = 0; x < unicodeBlockSize; ++x)
                extraBlock.vector[x] = blockAssignment[x];

            extraBlockList.append(extraBlock);
        }
    }

    for (int i = 0; i < extraBlockList.size(); ++i) {
        const int value = i + scriptSentinel;
        const int offset =
            ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount;
        const ExtraBlock &extraBlock = extraBlockList.at(i);
        const int block = extraBlock.block;

        declaration += "\n\n    /* U+";
        declaration += QByteArray::number(block, 16).rightJustified(4, '0');
        declaration += '-';
        declaration +=
            QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
        declaration += " at offset ";
        declaration += QByteArray::number(offset);
        declaration += " */\n    ";

        for (int x = 0; x < extraBlock.vector.size(); ++x) {
            const int o = extraBlock.vector.at(x);

            declaration += scriptNames.value(o);
            if (x < extraBlock.vector.size() - 1 || i < extraBlockList.size() - 1)
                declaration += ',';
            if ((x & 7) == 7 && x < extraBlock.vector.size() - 1)
                declaration += "\n    ";
            else
                declaration += ' ';
        }
    }
    declaration += "\n};\n\n} // namespace QUnicodeTables\n\n";

    qDebug("createScriptTableDeclaration: table size is %d bytes",
           unicodeBlockCount + (extraBlockList.size() * unicodeBlockSize));

    return declaration;
}

#if 0
static void dump(int from, int to)
{
    for (int i = from; i <= to; ++i) {
        UnicodeData d = unicodeData.value(i, UnicodeData(i));
        qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d",
               i, d.p.category, d.p.combiningClass, d.p.direction, d.otherCase, d.mirroredChar, d.p.joining, d.p.age);
        if (d.decompositionType != QChar::NoDecomposition) {
            qDebug("    decomposition: type=%d, length=%d, first=%x", d.decompositionType, d.decomposition.size(),
                   d.decomposition[0]);
        }
    }
    qDebug(" ");
}
#endif

struct PropertyBlock {
    PropertyBlock() { index = -1; }
    int index;
    QList<int> properties;
    bool operator ==(const PropertyBlock &other) { return properties == other.properties; }
};

static QByteArray createPropertyInfo()
{
    qDebug("createPropertyInfo:");

    const int BMP_BLOCKSIZE=32;
    const int BMP_SHIFT = 5;
    const int BMP_END = 0x11000;
    const int SMP_END = 0x110000;
    const int SMP_BLOCKSIZE = 256;
    const int SMP_SHIFT = 8;

    QList<PropertyBlock> blocks;
    QList<int> blockMap;

    int used = 0;

    for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
        PropertyBlock b;
        for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
            int uc = block*BMP_BLOCKSIZE + i;
            UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
            b.properties.append(d.propertyIndex);
        }
        int index = blocks.indexOf(b);
        if (index == -1) {
            index = blocks.size();
            b.index = used;
            used += BMP_BLOCKSIZE;
            blocks.append(b);
        }
        blockMap.append(blocks.at(index).index);
    }

    int bmp_blocks = blocks.size();
    Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);

    for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
        PropertyBlock b;
        for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
            int uc = block*SMP_BLOCKSIZE + i;
            UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
            b.properties.append(d.propertyIndex);
        }
        int index = blocks.indexOf(b);
        if (index == -1) {
            index = blocks.size();
            b.index = used;
            used += SMP_BLOCKSIZE;
            blocks.append(b);
        }
        blockMap.append(blocks.at(index).index);
    }

    int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
    int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
    int bmp_mem = bmp_block_data + bmp_trie;
    qDebug("    %d unique blocks in BMP.",blocks.size());
    qDebug("        block data uses: %d bytes", bmp_block_data);
    qDebug("        trie data uses : %d bytes", bmp_trie);

    int smp_block_data = (blocks.size()- bmp_blocks)*SMP_BLOCKSIZE*2;
    int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
    int smp_mem = smp_block_data + smp_trie;
    qDebug("    %d unique blocks in SMP.",blocks.size()-bmp_blocks);
    qDebug("        block data uses: %d bytes", smp_block_data);
    qDebug("        trie data uses : %d bytes", smp_trie);

    qDebug("\n        properties use : %d bytes", uniqueProperties.size()*20);
    qDebug("    memory usage: %d bytes", bmp_mem+smp_mem + uniqueProperties.size()*20);

    QByteArray out;
    out += "static const unsigned short uc_property_trie[] = {\n";

    // first write the map
    out += "    // 0x" + QByteArray::number(BMP_END, 16);
    for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            if (!((i*BMP_BLOCKSIZE) % 0x1000))
                out += "\n";
            out += "\n    ";
        }
        out += QByteArray::number(blockMap.at(i) + blockMap.size());
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(1);
    out += "\n\n    // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";;
    for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            if (!(i % (0x10000/SMP_BLOCKSIZE)))
                out += "\n";
            out += "\n    ";
        }
        out += QByteArray::number(blockMap.at(i) + blockMap.size());
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(1);
    out += "\n";
    // write the data
    for (int i = 0; i < blocks.size(); ++i) {
        if (out.endsWith(' '))
            out.chop(1);
        out += "\n";
        const PropertyBlock &b = blocks.at(i);
        for (int j = 0; j < b.properties.size(); ++j) {
            if (!(j % 8)) {
                if (out.endsWith(' '))
                    out.chop(1);
                out += "\n    ";
            }
            out += QByteArray::number(b.properties.at(j));
            out += ", ";
        }
    }

    // we reserve one bit more than in the assert below for the sign
    Q_ASSERT(maxMirroredDiff < (1<<12));
    Q_ASSERT(maxLowerCaseDiff < (1<<14));
    Q_ASSERT(maxUpperCaseDiff < (1<<14));
    Q_ASSERT(maxTitleCaseDiff < (1<<14));
    Q_ASSERT(maxCaseFoldDiff < (1<<14));

    if (out.endsWith(' '))
        out.chop(1);
    out += "\n};\n\n"

           "#define GET_PROP_INDEX(ucs4) \\\n"
           "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
           "        ? (uc_property_trie[uc_property_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
           "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
           "        : (uc_property_trie[uc_property_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
           ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
           " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]))\n\n"
           "#define GET_PROP_INDEX_UCS2(ucs2) \\\n"
           "(uc_property_trie[uc_property_trie[ucs2>>" + QByteArray::number(BMP_SHIFT) +
           "] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")])\n\n"


           "static const QUnicodeTables::Properties uc_properties [] = {\n";

    // keep in sync with the property declaration
    for (int i = 0; i < uniqueProperties.size(); ++i) {
        PropertyFlags p = uniqueProperties.at(i);
        out += "    { ";
//     "        ushort category : 8;\n"
        out += QByteArray::number( p.category );
        out += ", ";
//     "        ushort line_break_class : 8;\n"
        out += QByteArray::number( p.line_break_class );
        out += ", ";
//     "        ushort direction : 8;\n"
        out += QByteArray::number( p.direction );
        out += ", ";
//     "        ushort combiningClass :8;\n"
        out += QByteArray::number( p.combiningClass );
        out += ", ";
//     "        ushort joining : 2;\n"
        out += QByteArray::number( p.joining );
        out += ", ";
//     "        signed short digitValue : 6;\n /* 5 needed */"
        out += QByteArray::number( p.digitValue );
        out += ", ";
//     "        ushort unicodeVersion : 4;\n"
        out += QByteArray::number( p.age );
        out += ", ";
//     "        ushort lowerCaseSpecial : 1;\n"
//     "        ushort upperCaseSpecial : 1;\n"
//     "        ushort titleCaseSpecial : 1;\n"
//     "        ushort caseFoldSpecial : 1;\n"
        out += QByteArray::number( p.lowerCaseSpecial );
        out += ", ";
        out += QByteArray::number( p.upperCaseSpecial );
        out += ", ";
        out += QByteArray::number( p.titleCaseSpecial );
        out += ", ";
        out += QByteArray::number( p.caseFoldSpecial );
        out += ", ";
//     "        signed short mirrorDiff : 16;\n"
//     "        signed short lowerCaseDiff : 16;\n"
//     "        signed short upperCaseDiff : 16;\n"
//     "        signed short titleCaseDiff : 16;\n"
//     "        signed short caseFoldDiff : 16;\n"
        out += QByteArray::number( p.mirrorDiff );
        out += ", ";
        out += QByteArray::number( p.lowerCaseDiff );
        out += ", ";
        out += QByteArray::number( p.upperCaseDiff );
        out += ", ";
        out += QByteArray::number( p.titleCaseDiff );
        out += ", ";
        out += QByteArray::number( p.caseFoldDiff );
        out += ", ";
        out += QByteArray::number( p.graphemeBreak );
        out += ", ";
        out += QByteArray::number( p.wordBreak );
        out += ", ";
        out += QByteArray::number( p.sentenceBreak );
        out += "},\n";
    }
    out += "};\n\n";

    out += "static inline const QUnicodeTables::Properties *qGetProp(uint ucs4)\n"
           "{\n"
           "    int index = GET_PROP_INDEX(ucs4);\n"
           "    return uc_properties + index;\n"
           "}\n"
           "\n"
           "static inline const QUnicodeTables::Properties *qGetProp(ushort ucs2)\n"
           "{\n"
           "    int index = GET_PROP_INDEX_UCS2(ucs2);\n"
           "    return uc_properties + index;\n"
           "}\n"
           "\n"
           "Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(uint ucs4)\n"
           "{\n"
           "    int index = GET_PROP_INDEX(ucs4);\n"
           "    return uc_properties + index;\n"
           "}\n"
           "\n"
           "Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(ushort ucs2)\n"
           "{\n"
           "    int index = GET_PROP_INDEX_UCS2(ucs2);\n"
           "    return uc_properties + index;\n"
           "}\n\n";

    out += "#define CURRENT_VERSION "CURRENT_UNICODE_VERSION"\n\n";

    out += "static const ushort specialCaseMap [] = {";
    for (int i = 0; i < specialCaseMap.size(); ++i) {
        if (!(i % 16))
            out += "\n   ";
        out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i), 16);
        if (i < specialCaseMap.size() - 1)
            out += ",";
    }
    out += "\n};\n";
    out += "#define SPECIAL_CASE_MAX_LEN " + QByteArray::number(specialCaseMaxLen) + "\n\n";

    qDebug() << "Special case map uses " << specialCaseMap.size()*2 << "bytes";

    return out;
}


struct DecompositionBlock {
    DecompositionBlock() { index = -1; }
    int index;
    QList<int> decompositionPositions;
    bool operator ==(const DecompositionBlock &other)
        { return decompositionPositions == other.decompositionPositions; }
};

static QByteArray createCompositionInfo()
{
    qDebug("createCompositionInfo:");

    const int BMP_BLOCKSIZE=16;
    const int BMP_SHIFT = 4;
    const int BMP_END = 0x3400; // start of Han
    const int SMP_END = 0x30000;
    const int SMP_BLOCKSIZE = 256;
    const int SMP_SHIFT = 8;

    if(SMP_END <= highestComposedCharacter)
        qFatal("end of table smaller than highest composed character at %x", highestComposedCharacter);

    QList<DecompositionBlock> blocks;
    QList<int> blockMap;
    QList<unsigned short> decompositions;

    int used = 0;
    int tableIndex = 0;

    for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
        DecompositionBlock b;
        for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
            int uc = block*BMP_BLOCKSIZE + i;
            UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
            if (!d.decomposition.isEmpty()) {
                int utf16Chars = 0;
                for (int j = 0; j < d.decomposition.size(); ++j)
                    utf16Chars += d.decomposition.at(j) > 0x10000 ? 2 : 1;
                decompositions.append(d.decompositionType + (utf16Chars<<8));
                for (int j = 0; j < d.decomposition.size(); ++j) {
                    int code = d.decomposition.at(j);
                    if (code > 0x10000) {
                        // save as surrogate pair
                        code -= 0x10000;
                        ushort high = code/0x400 + 0xd800;
                        ushort low = code%0x400 + 0xdc00;
                        decompositions.append(high);
                        decompositions.append(low);
                    } else {
                        decompositions.append(code);
                    }
                }
                b.decompositionPositions.append(tableIndex);
                tableIndex += utf16Chars + 1;
            } else {
                b.decompositionPositions.append(0xffff);
            }
        }
        int index = blocks.indexOf(b);
        if (index == -1) {
            index = blocks.size();
            b.index = used;
            used += BMP_BLOCKSIZE;
            blocks.append(b);
        }
        blockMap.append(blocks.at(index).index);
    }

    int bmp_blocks = blocks.size();
    Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);

    for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
        DecompositionBlock b;
        for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
            int uc = block*SMP_BLOCKSIZE + i;
            UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
            if (!d.decomposition.isEmpty()) {
                int utf16Chars = 0;
                for (int j = 0; j < d.decomposition.size(); ++j)
                    utf16Chars += d.decomposition.at(j) > 0x10000 ? 2 : 1;
                decompositions.append(d.decompositionType + (utf16Chars<<8));
                for (int j = 0; j < d.decomposition.size(); ++j) {
                    int code = d.decomposition.at(j);
                    if (code > 0x10000) {
                        // save as surrogate pair
                        code -= 0x10000;
                        ushort high = code/0x400 + 0xd800;
                        ushort low = code%0x400 + 0xdc00;
                        decompositions.append(high);
                        decompositions.append(low);
                    } else {
                        decompositions.append(code);
                    }
                }
                b.decompositionPositions.append(tableIndex);
                tableIndex += utf16Chars + 1;
            } else {
                b.decompositionPositions.append(0xffff);
            }
        }
        int index = blocks.indexOf(b);
        if (index == -1) {
            index = blocks.size();
            b.index = used;
            used += SMP_BLOCKSIZE;
            blocks.append(b);
        }
        blockMap.append(blocks.at(index).index);
    }

    int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
    int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
    int bmp_mem = bmp_block_data + bmp_trie;
    qDebug("    %d unique blocks in BMP.",blocks.size());
    qDebug("        block data uses: %d bytes", bmp_block_data);
    qDebug("        trie data uses : %d bytes", bmp_trie);
    qDebug("        memory usage: %d bytes", bmp_mem);

    int smp_block_data = (blocks.size()- bmp_blocks)*SMP_BLOCKSIZE*2;
    int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
    int smp_mem = smp_block_data + smp_trie;
    qDebug("    %d unique blocks in SMP.",blocks.size()-bmp_blocks);
    qDebug("        block data uses: %d bytes", smp_block_data);
    qDebug("        trie data uses : %d bytes", smp_trie);

    qDebug("\n        decomposition table use : %d bytes", decompositions.size()*2);
    qDebug("    memory usage: %d bytes", bmp_mem+smp_mem + decompositions.size()*2);

    QByteArray out;

    out += "static const unsigned short uc_decomposition_trie[] = {\n";

    // first write the map
    out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
    for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            if (!((i*BMP_BLOCKSIZE) % 0x1000))
                out += "\n";
            out += "\n    ";
        }
        out += QByteArray::number(blockMap.at(i) + blockMap.size());
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(1);
    out += "\n\n    // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";;
    for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            if (!(i % (0x10000/SMP_BLOCKSIZE)))
                out += "\n";
            out += "\n    ";
        }
        out += QByteArray::number(blockMap.at(i) + blockMap.size());
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(1);
    out += "\n";
    // write the data
    for (int i = 0; i < blocks.size(); ++i) {
        if (out.endsWith(' '))
            out.chop(1);
        out += "\n";
        const DecompositionBlock &b = blocks.at(i);
        for (int j = 0; j < b.decompositionPositions.size(); ++j) {
            if (!(j % 8)) {
                if (out.endsWith(' '))
                    out.chop(1);
                out += "\n    ";
            }
            out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16);
            out += ", ";
        }
    }

    if (out.endsWith(' '))
        out.chop(1);
    out += "\n};\n\n"

           "#define GET_DECOMPOSITION_INDEX(ucs4) \\\n"
           "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
           "        ? (uc_decomposition_trie[uc_decomposition_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
           "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
           "        : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n"
           "           ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
           ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
           " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n"
           "           : 0xffff))\n\n"

           "static const unsigned short uc_decomposition_map[] = {\n";

    for (int i = 0; i < decompositions.size(); ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            out += "\n    ";
        }
        out += "0x" + QByteArray::number(decompositions.at(i), 16);
        out += ", ";
    }

    if (out.endsWith(' '))
        out.chop(1);
    out += "\n};\n\n";

    return out;
}

static QByteArray createLigatureInfo()
{
    qDebug("createLigatureInfo: numLigatures=%d", numLigatures);

    QList<DecompositionBlock> blocks;
    QList<int> blockMap;
    QList<unsigned short> ligatures;

    const int BMP_BLOCKSIZE = 32;
    const int BMP_SHIFT = 5;
    const int BMP_END = 0x3100;
    Q_ASSERT(highestLigature < BMP_END);

    int used = 0;
    int tableIndex = 0;

    for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
        DecompositionBlock b;
        for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
            int uc = block*BMP_BLOCKSIZE + i;
            QList<Ligature> l = ligatureHashes.value(uc);
            if (!l.isEmpty()) {
                b.decompositionPositions.append(tableIndex);
                qSort(l);

                ligatures.append(l.size());
                for (int i = 0; i < l.size(); ++i) {
                    Q_ASSERT(l.at(i).u2 == uc);
                    ligatures.append(l.at(i).u1);
                    ligatures.append(l.at(i).ligature);
                }
                tableIndex += 2*l.size() + 1;
            } else {
                b.decompositionPositions.append(0xffff);
            }
        }
        int index = blocks.indexOf(b);
        if (index == -1) {
            index = blocks.size();
            b.index = used;
            used += BMP_BLOCKSIZE;
            blocks.append(b);
        }
        blockMap.append(blocks.at(index).index);
    }

    int bmp_blocks = blocks.size();
    Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);

    int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
    int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
    int bmp_mem = bmp_block_data + bmp_trie;
    qDebug("    %d unique blocks in BMP.",blocks.size());
    qDebug("        block data uses: %d bytes", bmp_block_data);
    qDebug("        trie data uses : %d bytes", bmp_trie);
    qDebug("        ligature data uses : %d bytes", ligatures.size()*2);
    qDebug("        memory usage: %d bytes", bmp_mem + ligatures.size() * 2);

    QByteArray out;


    out += "static const unsigned short uc_ligature_trie[] = {\n";

    // first write the map
    out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
    for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            if (!((i*BMP_BLOCKSIZE) % 0x1000))
                out += "\n";
            out += "\n    ";
        }
        out += QByteArray::number(blockMap.at(i) + blockMap.size());
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(1);
    out += "\n";
    // write the data
    for (int i = 0; i < blocks.size(); ++i) {
        if (out.endsWith(' '))
            out.chop(1);
        out += "\n";
        const DecompositionBlock &b = blocks.at(i);
        for (int j = 0; j < b.decompositionPositions.size(); ++j) {
            if (!(j % 8)) {
                if (out.endsWith(' '))
                    out.chop(1);
                out += "\n    ";
            }
            out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16);
            out += ", ";
        }
    }
    if (out.endsWith(' '))
        out.chop(1);
    out += "\n};\n\n"

           "#define GET_LIGATURE_INDEX(u2) "
           "(u2 < 0x" + QByteArray::number(BMP_END, 16) + " ? "
           "uc_ligature_trie[uc_ligature_trie[u2>>" + QByteArray::number(BMP_SHIFT) +
           "] + (u2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")] : 0xffff);\n\n"

           "static const unsigned short uc_ligature_map [] = {\n";

    for (int i = 0; i < ligatures.size(); ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            out += "\n    ";
        }
        out += "0x" + QByteArray::number(ligatures.at(i), 16);
        out += ", ";
    }

    if (out.endsWith(' '))
        out.chop(1);
    out += "\n};\n\n";

    return out;
}

QByteArray createCasingInfo()
{
    QByteArray out;

    out += "struct CasingInfo {\n"
           "    uint codePoint : 16;\n"
           "    uint flags : 8;\n"
           "    uint offset : 8;\n"
           "};\n\n";

    return out;
}

int main(int, char **)
{
    initCategoryMap();
    initDirectionMap();
    initDecompositionMap();
    initGraphemeBreak();
    initWordBreak();
    initSentenceBreak();
    
    readUnicodeData();
    readBidiMirroring();
    readArabicShaping();
    readDerivedAge();
    readCompositionExclusion();
    readLineBreak();
    readSpecialCasing();
    readCaseFolding();
    // readBlocks();
    readScripts();
    readGraphemeBreak();
    readWordBreak();
    readSentenceBreak();

    computeUniqueProperties();
    QByteArray properties = createPropertyInfo();
    QByteArray compositions = createCompositionInfo();
    QByteArray ligatures = createLigatureInfo();
    QByteArray normalizationCorrections = createNormalizationCorrections();
    QByteArray scriptEnumDeclaration = createScriptEnumDeclaration();
    QByteArray scriptTableDeclaration = createScriptTableDeclaration();

    QFile f("../../src/corelib/tools/qunicodetables.cpp");
    f.open(QFile::WriteOnly|QFile::Truncate);

    QByteArray header =
        "/****************************************************************************\n"
        "**\n"
        "** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).\n"
        "** All rights reserved.\n"
        "** Contact: Nokia Corporation (qt-info@nokia.com)\n"
        "**\n"
        "** This file is part of the QtCore module of the Qt Toolkit.\n"
        "**\n"
        "** $QT_BEGIN_LICENSE:LGPL$\n"
        "** No Commercial Usage\n"
        "** This file contains pre-release code and may not be distributed.\n"
        "** You may use this file in accordance with the terms and conditions\n"
        "** contained in the Technology Preview License Agreement accompanying\n"
        "** this package.\n"
        "**\n"
        "** GNU Lesser General Public License Usage\n"
        "** Alternatively, this file may be used under the terms of the GNU Lesser\n"
        "** General Public License version 2.1 as published by the Free Software\n"
        "** Foundation and appearing in the file LICENSE.LGPL included in the\n"
        "** packaging of this file.  Please review the following information to\n"
        "** ensure the GNU Lesser General Public License version 2.1 requirements\n"
        "** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.\n"
        "**\n"
        "** In addition, as a special exception, Nokia gives you certain additional\n"
        "** rights.  These rights are described in the Nokia Qt LGPL Exception\n"
        "** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.\n"
        "**\n"
        "** If you have questions regarding the use of this file, please contact\n"
        "** Nokia at qt-info@nokia.com.\n"
        "**\n"
        "**\n"
        "**\n"
        "**\n"
        "**\n"
        "**\n"
        "**\n"
        "**\n"
        "** $QT_END_LICENSE$\n"
        "**\n"
        "****************************************************************************/\n\n"

        "/* This file is autogenerated from the Unicode 5.0 database. Do not edit */\n\n";

    QByteArray warning =
        "//\n"
        "//  W A R N I N G\n"
        "//  -------------\n"
        "//\n"
        "// This file is not part of the Qt API.  It exists for the convenience\n"
        "// of internal files.  This header file may change from version to version\n"
        "// without notice, or even be removed.\n"
        "//\n"
        "// We mean it.\n"
        "//\n\n";

    f.write(header);
    f.write("QT_BEGIN_NAMESPACE\n\n");
    f.write(properties);
    f.write(compositions);
    f.write(ligatures);
    f.write(normalizationCorrections);
    f.write(scriptTableDeclaration);
    f.write("\nQT_END_NAMESPACE\n");
    f.close();

    f.setFileName("../../src/corelib/tools/qunicodetables_p.h");
    f.open(QFile::WriteOnly | QFile::Truncate);
    f.write(header);
    f.write(warning);
    f.write("#ifndef QUNICODETABLES_P_H\n"
            "#define QUNICODETABLES_P_H\n\n"
            "#include <QtCore/qchar.h>\n\n"
            "QT_BEGIN_NAMESPACE\n\n");
    f.write("namespace QUnicodeTables {\n");
    f.write(property_string);
    f.write("\n");
    f.write(scriptEnumDeclaration);
    f.write("\n");
    f.write(lineBreakClass);
    f.write("\n");
    f.write(methods);
    f.write("\n");
    f.write(grapheme_break_string);
    f.write("\n");
    f.write(word_break_string);
    f.write("\n");
    f.write(sentence_break_string);
    f.write("\n}\n\n"
            "QT_END_NAMESPACE\n\n"
            "#endif\n");
    f.close();

    qDebug() << "maxMirroredDiff  = " << hex << maxMirroredDiff;
    qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff;
    qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff;
    qDebug() << "maxTitleCaseDiff = " << hex << maxTitleCaseDiff;
    qDebug() << "maxCaseFoldDiff  = " << hex << maxCaseFoldDiff;
#if 0
//     dump(0, 0x7f);
//     dump(0x620, 0x640);
//     dump(0x10000, 0x10020);
//     dump(0x10800, 0x10820);

    qDebug("decompositionLength used:");
    int totalcompositions = 0;
    int sum = 0;
    for (int i = 1; i < 20; ++i) {
        qDebug("    length %d used %d times", i, decompositionLength.value(i, 0));
        totalcompositions += i*decompositionLength.value(i, 0);
        sum += decompositionLength.value(i, 0);
    }
    qDebug("    len decomposition map %d, average length %f, num composed chars %d",
           totalcompositions, (float)totalcompositions/(float)sum,  sum);
    qDebug("highest composed character %x", highestComposedCharacter);
    qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature);

    qBubbleSort(ligatures);
    for (int i = 0; i < ligatures.size(); ++i)
        qDebug("%s", ligatures.at(i).data());

//     qDebug("combiningClass usage:");
//     int numClasses = 0;
//     for (int i = 0; i < 255; ++i) {
//         int num = combiningClassUsage.value(i, 0);
//         if (num) {
//             ++numClasses;
//             qDebug("    combiningClass %d used %d times", i, num);
//         }
//     }
//     qDebug("total of %d combining classes used", numClasses);

#endif
}