--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/localisation/localesupport/coltab/COLTAB.CPP Mon Oct 19 15:55:17 2009 +0100
@@ -0,0 +1,1278 @@
+// Copyright (c) 1999-2009 Nokia Corporation and/or its subsidiary(-ies).
+// All rights reserved.
+// This component and the accompanying materials are made available
+// under the terms of the License "Eclipse Public License v1.0"
+// which accompanies this distribution, and is available
+// at the URL "http://www.eclipse.org/legal/epl-v10.html".
+//
+// Initial Contributors:
+// Nokia Corporation - initial contribution.
+//
+// Contributors:
+//
+// Description:
+// Reads and parses the Unicode collation value table and writes out a C++ source file
+// containing the data in a form that can be used by the EPOC collation system.
+//
+// The program reads three files or one compositive files:
+//
+// Three files (by default):
+// 1. Base keys (maps single Unicode values to single collation key values): must be in the same format as
+// basekeys.txt, supplied with the Standard Unicode Collation system
+//
+// 2. Composite keys (maps single Unicode values to strings of collation keys): must be in the same format as
+// compkeys.txt, supplied with the Standard Unicode Collation system
+//
+// 3. Strings (maps strings of Unicode values to single collation keys OR strings of collation keys): must be in the
+// same format as compkeys.txt, except that there can be any number of Unicode characters at the start of the line,
+// space-separated and each exactly 4 hex digits.
+//
+// One compositive files (with option /a):
+// 1. All Keys (combine above three files into one file): must be in the same format as allkeys.txt, supplied with the Standard Unicode Collation system (after Unicode 3.0).
+//
+//
+
+
+#include <assert.h>
+#include <ctype.h>
+
+#ifdef __MSVCDOTNET__
+#include <fstream>
+#include <iostream>
+using namespace std;
+#else //!__MSVCDOTNET__
+#include <fstream.h>
+#include <iostream.h>
+#endif //__MSVCDOTNET__
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+/*
+Constants constraining the range of level-1 and level-2 keys so that they can be packed.
+Non-zero values are reduced by one less than the minimum value.
+*/
+const unsigned int KLevel1Bits = 8;
+const unsigned int KLevel1Min = 0x20;
+const unsigned int KLevel1Max = KLevel1Min + (1 << KLevel1Bits) - 2;
+const unsigned int KLevel2Bits = 6;
+const unsigned int KLevel2Min = 1;
+const unsigned int KLevel2Max = KLevel2Min + (1 << KLevel2Bits) - 2;
+
+/*
+Table of characters in the WGL4 set, plus characters in canonical decompositions of
+those characters, plus commonly used control characters and space characters,
+given as ranges of Unicode characters. In each pair, the first code is the first in the range,
+and the second is the first code NOT in the range.
+
+The extra characters are added mainly to ensure that control characters and spaces are
+normally ignored. The extra characters are:
+
+0x0000-0x001F: ASCII control characters
+0x2000-0x2012: spaces, hyphen variants, figure dash
+0x2028-0x202E: line and paragraph separator, bidirectional control characters
+0xFEFF : byte-order mark
+0xFFFC-0xFFFD: object replacement character, replacement character
+*/
+const unsigned int Wgl4Range[] =
+ {
+ 0x00, 0x7f, // All ASCII
+ 0xa0, 0x180, // Non-breaking space, Latin-1, Latin Extended-A
+ 0x192,0x193, // Latin f with hook
+ 0x1fa,0x200, // A-ring, a-ring, AE, ae, O slash, o slash all with acute accent
+ 0x2c6,0x2c8, // non-combining circumflex and caron
+ 0x2c9,0x2ca, // non-combining macron
+ 0x2d8,0x2dc, // non-combining breve, dot above, ring above, ogonek
+ 0x2dd,0x2de, // non-combining double acute
+ 0x300,0x305, // combining grave, acute, circumflex, tilde, macron
+ 0x306,0x309, // combining breve, dot above, double dot above
+ 0x30a,0x30e, // combining ring above, double acute, caron, vertical line above
+ 0x327,0x329, // combining cedilla, ogonek
+ 0x384,0x38b, // Greek
+ 0x38c,0x38d, // Greek
+ 0x38e,0x3a2, // Greek
+ 0x3a3,0x3cf, // Greek
+ 0x401,0x40d, // Cyrillic
+ 0x40e,0x450, // Cyrillic
+ 0x451,0x45d, // Cyrillic
+ 0x45e,0x460, // Cyrillic
+ 0x490,0x492, // Cyrillic
+ 0x1e80,0x1e86, // Both W and w with each of grave, acute and diaeresis
+ 0x1ef2,0x1ef4, // Y with grave, y with grave
+ 0x2000,0x2016, // various space and horizontal lines
+ 0x2017,0x201f, //double vertical line, double low line, various quotation marks
+ 0x2020,0x2023, // dagger, double dagger, bullet
+ 0x2026,0x2027, //ellipsis
+ 0x2028,0x202F, // line & paragraph separators and directional formatting
+ 0x2030,0x2031, // per mille
+ 0x2032,0x2034, // prime
+ 0x2039,0x203b, // single angle quotation marks
+ 0x203c,0x203d, // double exclamation mark
+ 0x203e,0x203f, // non-combining overscore
+ 0x2044,0x2045, // fraction slash
+ 0x207f,0x2080, // superscript n
+ 0x20a3,0x20a5, // French Franc, Italian/Turkish Lira
+ 0x20a7,0x20a8, // Spanish Peseta
+ 0x20ac,0x20ad, // Euro symbol
+ 0x2105,0x2106, // care of
+ 0x2113,0x2114, // script l
+ 0x2116,0x2117, // numero
+ 0x2122,0x2123, // trade mark
+ 0x2126,0x2127, // ohm
+ 0x212e,0x212f, // estimated (net weight)
+ 0x215b,0x215f, // 1/8, 3/8, 5/8, 7/8
+ 0x2190,0x2196, // horizontal and vertical arrows
+ 0x21a8,0x21a9, // up down arrow with base
+ 0x2202,0x2203, // partial differential
+ 0x2206,0x2207, // increment (delta)
+ 0x220f,0x2210, // n-ary product (pi)
+ 0x2211,0x2213, // n-ary sum (sigma), minus
+ 0x2215,0x2216, // division (slash)
+ 0x2219,0x221b, // bullet operator, square root
+ 0x221e,0x2220, // infinity, right angle
+ 0x2229,0x222a, // intersection
+ 0x222b,0x222c, // union
+ 0x2248,0x2249, // almost equal to
+ 0x2260,0x2262, // not equal to, identical to
+ 0x2264,0x2266, // less-than-or-equal-to, greater-than-or-equal-to
+ 0x2302,0x2303, // house
+ 0x2310,0x2311, // rversed not sign
+ 0x2320,0x2322, // top and bottom of integral
+ 0x2500,0x2501, // box drawing
+ 0x2502,0x2503, // box drawing
+ 0x250c,0x250d, // box drawing
+ 0x2510,0x2511, // box drawing
+ 0x2514,0x2515, // box drawing
+ 0x2518,0x2519, // box drawing
+ 0x251c,0x251d, // box drawing
+ 0x2524,0x2525, // box drawing
+ 0x252c,0x252d, // box drawing
+ 0x2534,0x2535, // box drawing
+ 0x253c,0x253d, // box drawing
+ 0x2550,0x256d, // box drawing
+ 0x2580,0x2581, // block element
+ 0x2584,0x2585, // block element
+ 0x2588,0x2589, // block element
+ 0x258c,0x258d, // block element
+ 0x2590,0x2594, // block element
+ 0x25a0,0x25a2, // geometric shapes
+ 0x25aa,0x25ad, // geometric shapes
+ 0x25b2,0x25b3, // geometric shapes
+ 0x25ba,0x25bb, // geometric shapes
+ 0x25bc,0x25bd, // geometric shapes
+ 0x25c4,0x25c5, // geometric shapes
+ 0x25ca,0x25cc, // geometric shapes
+ 0x25cf,0x25d0, // geometric shapes
+ 0x25d8,0x25da, // geometric shapes
+ 0x25e6,0x25e7, // geometric shapes
+ 0x263a,0x263d, // smilies, sun
+ 0x2640,0x2641, // female
+ 0x2642,0x2643, // male
+ 0x2660,0x2661, // spade
+ 0x2663,0x2664, // club
+ 0x2665,0x2667, // heart
+ 0x266a,0x266c, // quaver, beamed quavers
+ 0xfb01,0xfb03, // fi, fl ligatures
+ 0xfeff,0xff00, // zero-width non-breaking space
+ 0xfffc, 0xfffe // object replacement character and replacement character
+ };
+const int Wgl4Ranges = sizeof(Wgl4Range) / sizeof(Wgl4Range[0]) / 2;
+
+int CompareWgl4Ranges(const void* aRange1,const void* aRange2)
+ {
+ unsigned int* p = (unsigned int*)aRange1;
+ unsigned int* q = (unsigned int*)aRange2;
+ if (q[0] == q[1])
+ {
+ unsigned int* temp = p;
+ p = q;
+ q = temp;
+ }
+ if (*p < *q)
+ return -1;
+ else if (*p >= q[1])
+ return 1;
+ else
+ return 0;
+ }
+
+// Determine if a character is in the WGL4 character repertoire.
+static bool InWgl4(unsigned int aChar)
+ {
+ unsigned int key[2];
+ key[0] = key[1] = aChar;
+ return bsearch(key,Wgl4Range,Wgl4Ranges,sizeof(Wgl4Range[0]) * 2,CompareWgl4Ranges) != NULL;
+ }
+
+// A collation key.
+class CollationKey
+ {
+public:
+ bool operator==(const CollationKey& k) const
+ { return iLevel[0] == k.iLevel[0] && iLevel[1] == k.iLevel[1] && iLevel[2] == k.iLevel[2] &&
+ iIgnorable == k.iIgnorable && iStop == k.iStop; }
+
+ enum
+ {
+ ELevels = 3
+ };
+ int iLevel[ELevels];// the keys at the various levels
+ bool iIgnorable; // TRUE if this key can normally be ignored
+ bool iStop; // TRUE if this is the last key in a string of keys
+ };
+
+// The collation index for a single Unicode value.
+class CollationIndex
+ {
+public:
+ static int Compare(const void* aIndex1,const void* aIndex2);
+
+ int iCode; // Unicode value
+ int iIndex; // index into the key table
+ };
+
+class Reader
+ {
+public:
+ Reader(bool aWgl4,bool aStandard,const char* aLocaleName, const char* aUidString);
+ ~Reader();
+ void ReadBaseKeys(const char* aFileName);
+ void ReadCompKeys(const char* aFileName);
+ void ReadStrings(const char* aFileName);
+ void ReadAllKeys(const char* aFileName);
+ void WriteOutput(const char* aFileName, bool aCopyrightMessage);
+ int CompareStringIndices(int aIndex1,int aIndex2) const;
+
+private:
+ Reader(const Reader&);
+ int Hex(const char *aString, int &aCharConsumed, bool aTolerate = false);
+ void GetCollationKey(const char* aString, int& aCharConsumed, CollationKey* aKey=NULL);
+ void GetMultipleCollationKeys(const char* aString);
+ unsigned int PackKey(const CollationKey& aValue);
+ int PackIndex(const CollationIndex& aValue, unsigned int result[2]);
+ bool ParseLine(const char* aLine, int aCode[16], int& aCodeCount, int& aKeyStart, int& aKeyCount);
+ void AddKeyOneToOne(const char* aLine, const int aCode, const int aKeyStart);
+ void AddKeyOneToMuch(const char* aLine, const int aCode, const int aKeyStart);
+ void AddKeyMuchToMuch(const char* aLine, const int aCode[16], const int aCodeCount, const int aKeyStart);
+
+ enum
+ {
+ EMaxCollationKeys = 0x110000 * 2, /*more elements considering composite keys */
+ EMaxCollationIndices = 0x110000,
+ EMaxStringElements = 65536,
+ EMaxStringIndices = 65536
+ };
+ CollationKey iCollationKey[EMaxCollationKeys];
+ int iKeys;
+ CollationIndex iCollationIndex[EMaxCollationIndices];
+ int iIndices;
+ int iStringElement[EMaxStringElements];
+ int iStringElements;
+ unsigned int iStringIndex[EMaxStringIndices];
+ int iStringIndices;
+ const char* iInputFileName;
+ int iLineNumber;
+ bool iSuppressCanonseqWarning; // have we issued the canonseq warning yet?
+ bool iWgl4; // true if writing keys for wgl4 characters only
+ bool iStandard; // true if reading standard files, not tailoring files
+ const char* iLocaleName;
+ const char* iUidString;
+ char* iCPlusPlusIdentifier; // iLocaleName in title case with difficult characters removed
+ };
+
+bool isValidHexDigit(char c)
+ {
+ if ('0' <= c && c <= '9')
+ return true;
+ if ('a' <= c && c <= 'f')
+ return true;
+ if ('A' <= c && c <= 'F')
+ return true;
+ return false;
+ }
+
+void PrintUsage()
+ {
+ cout << "Usage: coltab [/u<uid>] [/c] [/a] [/h<topic>] <locale>\n";
+ cout << "By Default (without /a option), for the locales 'standard' and 'wgl4' coltab reads basekeys.txt & compkeys.txt\n";
+ cout << "For any other locale name <name> coltab reads <name>_basekeys.txt,\n";
+ cout << "<name>_compkeys.txt and <name>_strings.txt.\n";
+ cout << "Use the /a option, for the locales 'standard' and 'wgl4' coltab reads allkeys.txt\n";
+ cout << "For any other locale name <name> coltab reads <name>_allkeys.txt.\n";
+ cout << "The output file is always ls_<name>.cpp.\n";
+ cout << "Use the /u option to specify the UID that the collation table should have.\n";
+ cout << "A hex number must follow /u immediately, for example /u800ACBDE\n";
+ cout << "this hex number must not exceed eight digits. If this is not specified,\n";
+ cout << "the output file will have to be edited to make it compilable.\n";
+ cout << "Specify /c to prefix the output with a Nokia copyright message.\n";
+ cout << "Specify /h for in-depth help.";
+ }
+
+void UsageError()
+ {
+ PrintUsage();
+ exit(1);
+ }
+
+void PrintHelp(char* aTopic)
+ {
+ int topic = 0;
+ while ('0' <= *aTopic && *aTopic <= '9')
+ {
+ topic = topic * 10 + (*aTopic - '0');
+ ++aTopic;
+ }
+ switch(topic)
+ {
+ case 1:
+ cout << "How Coltab interprets CANONSEQ:\n\n"\
+ "If the CANONSEQ specifier is used in a line, Coltab will ignore the mapping.\n"\
+ "This because, on the Symbian platform, any canonically composed character is\n"\
+ "decomposed before the key mapping is applied, so characters with canonical\n"\
+ "decompositions do not need keys. In files supplied by the Unicode Consortium,\n"\
+ "all mappings for composed characters are flagged by CANONSEQ, so it is useful\n"\
+ "if Coltab can just ignore these so that Unicode Consortium files can be used\n"\
+ "unedited.\n\n"\
+ "This can cause problems if a localizer copies a line from a Unicode file into,\n"\
+ "say, the <lang>_strings.txt file, in order to give a mapping for an accented\n"\
+ "character. The localizer replaces the composed character code with the\n"\
+ "decomposition and changes the keys but forgets to remove the CANONSEQ\n"\
+ "specifier. In this case the key would be ignored. Coltab provides a warning so\n"\
+ "that this can be put right.\n\n"\
+ "Coltab will only warn about the first CANONSEQ in each file, and does not warn\n"\
+ "if the 'standard' or 'wgl4' options are used.";
+ exit(1);
+ break;
+ case 2:
+ cout << "How to ensure coltab's output files are compilable.\n\n"\
+ "By default, Coltab's files for locales need to be edited before they are\n"\
+ "compilable. The UID for the collation method needs to be filled in. This UID\n"\
+ "is added so that the collation table can be searched for later. At present,\n"\
+ "this UID is not necessary for the correct functioning of the Symbian platform\n"\
+ "and so a value of 0 can be safely used.\n\n"\
+ "To insert this value into the file directly, use the /u option, for example\n"\
+ "coltab /u0 french\n"\
+ "If the /u option is used, the file should be compilable as is. If it is not,\n"\
+ "please raise it as a defect with Symbian's internationalization team,\n"\
+ "supplying the files that caused the problem if this is possible.\n"\
+ "If the 'standard' or 'wgl4' options are used, no UID is output, so the /u\n"\
+ "option is not required.";
+ exit(1);
+ break;
+ case 3:
+ cout << "How to ensure collation key values are inside the supported range. \n\n"\
+ "According to Unicode Standard, the range suppored by tool COLTAB:\n"\
+ " Level 0 (primary): 0000 - FFFF, \n"\
+ " Level 1 (Secondary): 0020 - 011E, \n"\
+ " Level 2 (Tertiary): 0001 - 003F. \n"\
+ "Please edit your collation files and make sure key values are inside the above range";
+ exit(1);
+ break;
+ default:
+ PrintUsage();
+ cout << "\n\nSpecify /h1 for help on the use of CANONSEQ\n";
+ cout << "Specify /h2 for help on making compilable files that do not need editing\n";
+ exit(1);
+ break;
+ }
+ }
+
+short HighSurrogate(int aCode)
+ {
+ return static_cast<short>(0xD7C0 + (aCode >> 10));
+ }
+
+short LowSurrogate(int aCode)
+ {
+ return static_cast<short>(0xDC00 | (aCode & 0x3FF));
+ }
+
+int main(int argc,char** argv)
+ {
+ bool copyright = false;
+ bool wgl4 = false;
+ bool allKeys = false;
+ const char* prefix = "";
+ const char* infix = "";
+ const char* locale = "";
+ char* localeArg = 0;
+ char* uidArg = 0;
+ for (int i = 1; i < argc; ++i)
+ {
+ if (argv[i][0] == '/' || argv[i][0] == '-')
+ {
+ switch (argv[i][1])
+ {
+ case 'u':
+ case 'U':
+ {
+ uidArg = argv[i] + 2;
+ const char* uidCheck = uidArg;
+ while (*uidCheck)
+ {
+ if (!isValidHexDigit(*uidCheck))
+ UsageError();
+ ++uidCheck;
+ }
+ if (uidCheck == uidArg || 8 < uidCheck - uidArg)
+ UsageError();
+ break;
+ }
+ case 'c':
+ case 'C':
+ copyright = true;
+ break;
+ case 'a':
+ allKeys = true;
+ break;
+ case 'h':
+ case 'H':
+ PrintHelp(argv[i] + 2);
+ break;
+ default:
+ UsageError();
+ break;
+ }
+ }
+ else if (!localeArg)
+ localeArg = argv[i];
+ else
+ UsageError();
+ }
+ if (!localeArg)
+ UsageError();
+ bool standard = false;
+ if (!_stricmp(localeArg, "standard"))
+ {
+ locale = "Standard";
+ standard = true;
+ }
+ else if (!_stricmp(localeArg, "wgl4"))
+ {
+ locale = "Wgl4";
+ wgl4 = true;
+ standard = true;
+ }
+ else
+ {
+ locale = prefix = localeArg;
+ infix = "_";
+ }
+
+ Reader* reader = new Reader(wgl4, standard, locale, uidArg);
+ if (!reader)
+ {
+ cout << "out of memory\n";
+ exit(1);
+ }
+ char* filename = new char[strlen(prefix) + strlen(infix) + 64];
+ if (allKeys == false)
+ {
+ sprintf(filename,"%s%scompkeys.txt",prefix,infix);
+ reader->ReadCompKeys(filename);
+ if (!standard)
+ {
+ sprintf(filename,"%s%sstrings.txt",prefix,infix);
+ reader->ReadStrings(filename);
+ }
+ sprintf(filename,"%s%sbasekeys.txt",prefix,infix);
+ reader->ReadBaseKeys(filename);
+ }
+ else
+ {
+ sprintf(filename,"%s%sAllKeys.txt",prefix,infix);
+ reader->ReadAllKeys(filename);
+ }
+ sprintf(filename,"ls_%s.cpp", localeArg);
+ reader->WriteOutput(filename, copyright);
+
+ delete reader;
+ delete [] filename;
+ return 0;
+ }
+
+Reader::Reader(bool aWgl4, bool aStandard,
+ const char* aLocaleName, const char* aUidString):
+ iKeys(0),
+ iIndices(0),
+ iStringElements(0),
+ iStringIndices(0),
+ iInputFileName(NULL),
+ iLineNumber(0),
+ iSuppressCanonseqWarning(false),
+ iWgl4(aWgl4),
+ iStandard(aStandard),
+ iLocaleName(aLocaleName),
+ iUidString(aUidString)
+ {
+ if (iStandard)
+ {
+ iCPlusPlusIdentifier = new char[9];
+ strcpy(iCPlusPlusIdentifier, "Standard");
+ return;
+ }
+ char* p = iCPlusPlusIdentifier = new char[strlen(aLocaleName) + 2];
+ int current = toupper(aLocaleName[0]);
+ if (current < 'A' || 'Z' < current)
+ *p++ = 'C';
+ else
+ {
+ *p++ = static_cast<char>(current);
+ ++aLocaleName;
+ }
+ bool inUnderScore = false;
+ while (*aLocaleName)
+ {
+ current = tolower(*aLocaleName++);
+ if (current < 'a' || 'z' < current)
+ {
+ if (!inUnderScore)
+ {
+ inUnderScore = true;
+ *p++ = '_';
+ }
+ }
+ else
+ {
+ inUnderScore = false;
+ *p++ = static_cast<char>(current);
+ }
+ }
+ *p = 0;
+ }
+
+Reader::~Reader()
+ {
+ delete [] iCPlusPlusIdentifier;
+ }
+
+// Get a hex number of exactly four digits from aString. Return -1 if none is found and aTolerate is true.
+int Reader::Hex(const char *aString, int &aCharConsumed, bool aTolerate)
+ {
+ char *end;
+ unsigned long x = strtoul(aString,&end,16);
+ aCharConsumed = end - aString;
+ if ((aCharConsumed != 4) && (aCharConsumed != 5) && (aCharConsumed != 6))
+ {
+ if (!aTolerate)
+ {
+ cout << "bad hex number on line " << iLineNumber << " of file " << iInputFileName << '\n';
+ exit(1);
+ }
+ return -1;
+ }
+ return x;
+ }
+
+// Get a collation value from a string of the form [.xxxx.xxxx.xxxx.xxxx]
+void Reader::GetCollationKey(const char* aString, int& aCharConsumed, CollationKey* aKey)
+ {
+ aCharConsumed = 0;
+ const char *end = strchr(aString, ']');
+ if (end != NULL){
+ aCharConsumed = end - aString;
+ }
+
+ if (aString[0] != '[' || (aCharConsumed != 21 && aCharConsumed != 22 && aCharConsumed != 23))
+ {
+ cout << "syntax error on line " << iLineNumber << " of file " << iInputFileName << '\n';
+ exit(1);
+ }
+ if (aKey == NULL)
+ {
+ if (iKeys >= EMaxCollationKeys)
+ {
+ cout << "too many keys";
+ exit(1);
+ }
+ aKey = &iCollationKey[iKeys++];
+ }
+ aKey->iIgnorable = aString[1] == '*'; // asterisk means that this character is normally ignored
+ int charConsumed = 0;
+ for (int i = 0; i < CollationKey::ELevels; i++)
+ aKey->iLevel[i] = Hex(aString + 2 + i * 5, charConsumed);
+
+ if (aKey->iLevel[1] > 0 && (aKey->iLevel[1] < KLevel1Min || aKey->iLevel[1] > KLevel1Max))
+ {
+ aKey->iLevel[1] = KLevel1Max;
+ cout << "illegal level-1 key value on line " << iLineNumber << "; outside the range " << KLevel1Min << ".." << KLevel1Max << "\n";
+ cout << "Error: illegal key value in file, please see coltab /h3 for details.\n";
+ exit(1);
+ }
+
+ if (aKey->iLevel[2] > 0 && (aKey->iLevel[2] < KLevel2Min || aKey->iLevel[2] > KLevel2Max))
+ {
+ cout << "illegal level-2 key value on line " << iLineNumber << "; outside the range " << KLevel2Min << ".." << KLevel2Max << "\n";
+ cout << "Error: illegal key value in file, please see coltab /h3 for details.\n";
+ exit(1);
+ }
+
+ aKey->iStop = true;
+ }
+
+void Reader::GetMultipleCollationKeys(const char* aString)
+ {
+ int keyCount = 0;
+ int charConsumed =0;
+ while (aString[0] == '[')
+ {
+ GetCollationKey(aString, charConsumed);
+
+ keyCount++;
+ iCollationKey[iKeys - 1].iStop = false;
+ int length = strlen(aString);
+ if (length <= charConsumed + 1)
+ break;
+ aString += charConsumed + 1;
+
+ if (aString[0] == ' ') //a space is put between collation keys in keys files provided by previous Unicode Standard (i.e 3.1)
+ aString++;
+
+ }
+ iCollationKey[iKeys - 1].iStop = true;
+ }
+
+/*
+Partially parse a line, returning its key code and the start of its first block of key data.
+Return false if it is not a data line, or not relevant.
+*/
+bool Reader::ParseLine(const char* aLine, int aCode[16], int& aCodeCount, int& aKeyStart, int& aKeyCount)
+ {
+ int lineLength = strlen(aLine);
+ int charConsumed = 0;
+ aCodeCount = 0;
+ aCode[0] = Hex(aLine,charConsumed,true);
+
+ /*
+ A data line must start with a hex number and be at least 27 characters long.
+ Canonically decomposable Unicode characters are skipped.
+ Skip non-WGL4 characters if doing WGL4 only.
+ */
+ if (aCode[0] != -1)
+ {
+ aCodeCount = 1;
+ if (!strcmp(aLine + lineLength - 8,"CANONSEQ"))
+ {
+ if (!iSuppressCanonseqWarning)
+ {
+ cout << "Warning: CANONSEQ used in file " << iInputFileName
+ << " on line " << iLineNumber << ".\nWarning: All mappings specifying CANONSEQ are ignored.\n"
+ << "Warning: Use coltab /h1 for more details.";
+ iSuppressCanonseqWarning = true;
+ }
+ aCodeCount = 0;
+ }
+ else if (lineLength < 27 ||
+ (iWgl4 && !InWgl4((unsigned int)aCode)))
+ aCodeCount = 0;
+ }
+
+ if (aCode[0] != -1)
+ {
+ // find '['
+ aKeyStart = charConsumed;
+ while (aKeyStart < lineLength && aLine[aKeyStart] != '[')
+ aKeyStart++;
+
+ // read all hex before '['
+ int index = charConsumed + 1;
+ while (index < aKeyStart)
+ {
+ aCode[aCodeCount] = Hex(aLine+index, charConsumed, true);
+ if (aCode[aCodeCount] == -1)
+ break;
+
+ index += charConsumed + 1;
+ aCodeCount++;
+ }
+
+ // find number of collation keys
+ aKeyCount = 0;
+ index = aKeyStart;
+ while (index < lineLength && aLine[index] != '%' && aLine[index] != '#')
+ {
+ if (aLine[index] == '[')
+ aKeyCount++;
+ index++;
+ }
+ }
+
+ return aCodeCount > 0;
+ }
+
+void Reader::AddKeyOneToOne(const char* aLine, const int aCode, const int aKeyStart)
+ {
+ if (iIndices >= EMaxCollationIndices)
+ {
+ cout << "too many Unicode values";
+ exit(1);
+ }
+ CollationIndex& index = iCollationIndex[iIndices++];
+ index.iCode = aCode;
+ index.iIndex = -1;
+
+ /*
+ First try to find the key in the array of keys found so far.
+ Search backwards to use the fact that runs of the same key occur together.
+ */
+ CollationKey key;
+ int charConsumed = 0;
+ GetCollationKey(aLine + aKeyStart, charConsumed, &key);
+ for (int i = iKeys - 1; i >= 0 && index.iIndex == -1; i--)
+ if (iCollationKey[i] == key)
+ index.iIndex = i;
+
+ // If that fails, add a new key.
+ if (index.iIndex == -1)
+ {
+ index.iIndex = iKeys++;
+ if (iKeys > EMaxCollationKeys)
+ {
+ cout << "too many keys";
+ exit(1);
+ }
+ iCollationKey[index.iIndex] = key;
+ }
+ }
+/*
+Read 1-to-1 mapping. Sample:
+02B9 ; [*02A5.0020.0002.02B9] % MODIFIER LETTER PRIME
+
+aCombinedFile = true: aFileName is combined file, which contains base keys, comp keys, and string keys.
+*/
+void Reader::ReadBaseKeys(const char* aFileName)
+ {
+ iSuppressCanonseqWarning = iStandard || iWgl4;
+ iLineNumber = 0;
+ iInputFileName = aFileName;
+ ifstream input_file;
+
+#ifdef __MSVCDOTNET__
+ input_file.open(iInputFileName, ios::in);
+#else //!__MSVCDOTNET__
+ input_file.open(iInputFileName, ios::in | ios::nocreate);
+#endif //__MSVCDOTNET__
+
+ if (input_file.fail())
+ {
+ cout << "cannot open input file '" << iInputFileName << "'\n";
+ exit(1);
+ }
+ cout << "reading base keys from '" << iInputFileName << "'\n";
+
+ char line[1024];
+ for (;;)
+ {
+ input_file.getline(line,sizeof(line));
+ if (input_file.eof())
+ break;
+ iLineNumber++;
+ // line number counting
+ if (iLineNumber % 100 == 0)
+ {
+ cout << "line " << iLineNumber << '\n';
+ cout.flush();
+ }
+ int code[16];
+ int codeCount = 0;
+ int key_start = 0;
+ int keyCount = 0;
+ if (ParseLine(line, code, codeCount, key_start, keyCount))
+ {
+ if (codeCount != 1 || keyCount != 1)
+ continue; // goto next line
+ AddKeyOneToOne(line, code[0], key_start);
+ }
+ }
+
+ input_file.close();
+ }
+
+void Reader::AddKeyOneToMuch(const char* aLine, const int aCode, const int aKeyStart)
+ {
+ if (iIndices >= EMaxCollationIndices)
+ {
+ cout << "too many Unicode values";
+ exit(1);
+ }
+ CollationIndex& index = iCollationIndex[iIndices++];
+ index.iCode = aCode;
+ index.iIndex = iKeys;
+ GetMultipleCollationKeys(aLine + aKeyStart);
+ }
+/*
+Read 1-to-much mapping.
+3303 ; [.279F.0020.001C.3303][.1114.0020.001C.3303][.27C7.0020.001F.3303] # SQUARE AARU; QQKN
+*/
+void Reader::ReadCompKeys(const char* aFileName)
+ {
+ iSuppressCanonseqWarning = iStandard || iWgl4;
+ iLineNumber = 0;
+ iInputFileName = aFileName;
+ ifstream input_file;
+
+#ifdef __MSVCDOTNET__
+ input_file.open(iInputFileName, ios::in);
+#else //!__MSVCDOTNET__
+ input_file.open(iInputFileName, ios::in | ios::nocreate);
+#endif //__MSVCDOTNET__
+
+ if (input_file.fail())
+ {
+ cout << "there are no composite keys; '" << iInputFileName << "' not found\n";
+ return;
+ }
+ cout << "reading composite keys from '" << iInputFileName << "'\n";
+
+ char line[1024];
+ for (;;)
+ {
+ input_file.getline(line,sizeof(line));
+ if (input_file.eof())
+ break;
+ iLineNumber++;
+ // line number counting
+ if (iLineNumber % 100 == 0)
+ {
+ cout << "line " << iLineNumber << '\n';
+ cout.flush();
+ }
+ int code[16];
+ int codeCount = 0;
+ int key_start = 0;
+ int keyCount = 0;
+ if (ParseLine(line, code, codeCount, key_start, keyCount))
+ {
+ if (codeCount != 1 || keyCount < 2)
+ continue; // goto next line
+ AddKeyOneToMuch(line, code[0], key_start);
+ }
+ }
+
+ input_file.close();
+ }
+
+
+void Reader::AddKeyMuchToMuch(const char* aLine, const int aCode[16], const int aCodeCount, const int aKeyStart)
+ {
+
+ // Store the index to the Unicode string and the key sequence.
+ if (iStringIndices > EMaxStringIndices)
+ {
+ cout << "too many string indices";
+ exit(1);
+ }
+ iStringIndex[iStringIndices++] = (iStringElements << 16) | iKeys;
+
+ // Reserve space for the length.
+ if (iStringElements >= EMaxStringElements)
+ {
+ cout << "too many string elements";
+ exit(1);
+ }
+ iStringElements++;
+
+ // Read the Unicode string.
+ int length = 0; // in unit of int16
+ int charCount = 0; // in unit of char. for debug.
+
+ for (int i=0; i<aCodeCount; i++)
+ {
+ if (iStringElements >= EMaxStringElements)
+ {
+ cout << "too many string elements";
+ exit(1);
+ }
+
+ if (aCode[i] > 0xFFFF)
+ {
+ // UCS4 --> UTF-16
+ iStringElement[iStringElements++] = 0xD7C0 + (aCode[i] >> 10);
+ iStringElement[iStringElements++] = 0xDC00 | (aCode[i] & 0x3FF);
+ length += 2;
+ }
+ else
+ {
+ iStringElement[iStringElements++] = aCode[i];
+ length++;
+ }
+ charCount++;
+ }
+
+ iStringElement[iStringElements - length - 1] = (unsigned int)length;
+
+ // Read the key sequence.
+ GetMultipleCollationKeys(aLine + aKeyStart);
+ }
+/*
+Read much-to-much mapping. Sample:
+004F 0338 [.08EA.0020.0008.00D8] % capital O-stroke
+0E40 0E08 ; [.1E2B.0020.0002.0E08][.1E5E.0020.001F.0E40] # <THAI CHARACTER SARA E, THAI CHARACTER CHO CHAN>
+*/
+void Reader::ReadStrings(const char* aFileName)
+ {
+ iSuppressCanonseqWarning = iStandard || iWgl4;
+ iLineNumber = 0;
+ iInputFileName = aFileName;
+ ifstream input_file;
+
+#ifdef __MSVCDOTNET__
+ input_file.open(iInputFileName, ios::in);
+#else //!__MSVCDOTNET__
+ input_file.open(iInputFileName, ios::in | ios::nocreate);
+#endif //__MSVCDOTNET__
+
+ if (input_file.fail())
+ {
+ cout << "there are no strings; '" << iInputFileName << "' not found\n";
+ return;
+ }
+ cout << "reading strings from '" << iInputFileName << "'\n";
+
+ char line[1024];
+ for (;;)
+ {
+ input_file.getline(line,sizeof(line));
+ if (input_file.eof())
+ break;
+ iLineNumber++;
+ // line number counting
+ if (iLineNumber % 100 == 0)
+ {
+ cout << "line " << iLineNumber << '\n';
+ cout.flush();
+ }
+ int code[16];
+ int codeCount = 0;
+ int key_start = 0;
+ int keyCount = 0;
+ if (ParseLine(line, code, codeCount, key_start, keyCount))
+ {
+ if (codeCount < 2 || keyCount < 1)
+ continue; // goto next line
+ AddKeyMuchToMuch(line, code, codeCount, key_start);
+ }
+ }
+
+ input_file.close();
+ }
+
+/*
+Read combined key table. Sample:
+1-to-1 mapping:
+02B9 ; [*02A5.0020.0002.02B9] % MODIFIER LETTER PRIME
+
+1-to-much mapping:
+3303 ; [.279F.0020.001C.3303][.1114.0020.001C.3303][.27C7.0020.001F.3303] # SQUARE AARU; QQKN
+
+much-to-much mapping:
+004F 0338 [.08EA.0020.0008.00D8] % capital O-stroke
+0E40 0E08 ; [.1E2B.0020.0002.0E08][.1E5E.0020.001F.0E40] # <THAI CHARACTER SARA E, THAI CHARACTER CHO CHAN>
+*/
+void Reader::ReadAllKeys(const char* aFileName)
+ {
+ iSuppressCanonseqWarning = iStandard || iWgl4;
+ iLineNumber = 0;
+ iInputFileName = aFileName;
+ ifstream input_file;
+
+#ifdef __MSVCDOTNET__
+ input_file.open(iInputFileName, ios::in);
+#else //!__MSVCDOTNET__
+ input_file.open(iInputFileName, ios::in | ios::nocreate);
+#endif //__MSVCDOTNET__
+
+ if (input_file.fail())
+ {
+ cout << "there are no keys; '" << iInputFileName << "' not found\n";
+ return;
+ }
+ cout << "reading all keys from '" << iInputFileName << "'\n";
+
+ char line[1024];
+ for (;;)
+ {
+ if (input_file.eof())
+ break;
+ input_file.getline(line,sizeof(line));
+ iLineNumber++;
+
+ int code[16];
+ int codeCount = 0;
+ int key_start = 0;
+ int keyCount = 0;
+ if (ParseLine(line, code, codeCount, key_start, keyCount))
+ {
+ if (codeCount == 1 && keyCount == 1)
+ AddKeyOneToOne(line, code[0], key_start);
+ else if (codeCount == 1 && keyCount > 1)
+ AddKeyOneToMuch(line, code[0], key_start);
+ else if (codeCount > 1 && keyCount > 0)
+ AddKeyMuchToMuch(line, code, codeCount, key_start);
+ else
+ cout << "ignore line: " << line << "\n";
+ }
+ }
+
+ input_file.close();
+ }
+
+
+// Pack the 3 collation key levels into a single 32-bit integer.
+unsigned int Reader::PackKey(const CollationKey& aValue)
+ {
+ unsigned int level0 = aValue.iLevel[0];
+ unsigned int level1 = aValue.iLevel[1];
+ if (level1 > 0)
+ level1 -= (KLevel1Min - 1);
+ unsigned int level2 = aValue.iLevel[2];
+ if (level2 > 0)
+ level2 -= (KLevel2Min - 1);
+ unsigned int key = level0 << 16 | level1 << 8 | level2 << 2;
+ if (aValue.iIgnorable)
+ key |= 2;
+ if (aValue.iStop)
+ key |= 1;
+ return key;
+ }
+
+// Pack a collation index value into a single 32-bit integer.
+int Reader::PackIndex(const CollationIndex& aValue, unsigned int result[2])
+ {
+ unsigned int code = aValue.iCode;
+ unsigned int index = aValue.iIndex;
+ if (code <= 0xFFFF)
+ {
+ result[0] = (code << 16 | index);
+ return 1;
+ }
+ else
+ {
+ result[0] = (::HighSurrogate(code) << 16 | index);
+ result[1] = (::LowSurrogate(code) << 16 | index);
+ return 2;
+ }
+ }
+
+const Reader* TheReader;
+static int CompareStringIndices(const void* aIndex1,const void* aIndex2)
+ {
+ return TheReader->CompareStringIndices(*(unsigned int*)aIndex1 >> 16,*(unsigned int*)aIndex2 >> 16);
+ }
+
+int CompareUnicodeStrings(const int *aString1,int aLength1,const int *aString2,int aLength2)
+ {
+ for (int i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++)
+ {
+ int x = i < aLength1 ? *aString1 : -1;
+ int y = i < aLength2 ? *aString2 : -1;
+ if (x != y)
+ return x - y;
+ }
+ return 0;
+ }
+
+int Reader::CompareStringIndices(int aIndex1,int aIndex2) const
+ {
+ return CompareUnicodeStrings(iStringElement + aIndex1 + 1,iStringElement[aIndex1],
+ iStringElement + aIndex2 + 1,iStringElement[aIndex2]);
+ }
+
+void Reader::WriteOutput(const char* aFileName, bool aCopyright)
+ {
+ int i;
+ ofstream output_file;
+ output_file.open(aFileName);
+ if (output_file.fail())
+ {
+ cout << "cannot open output file '" << aFileName << "'\n";
+ exit(1);
+ }
+ cout << "writing output to '" << aFileName << "'\n";
+
+ char *locale = NULL;
+ if (iStandard)
+ locale = _strdup("Standard");
+ else
+ locale = _strdup(iLocaleName);
+
+ if (!iStandard)
+ {
+ _strlwr(locale);
+ locale[0] = (char)toupper(locale[0]);
+ if (aCopyright)
+ {
+ char* capsFileName = new char[strlen(aFileName) + 1];
+ strcpy(capsFileName, aFileName);
+ _strupr(capsFileName);
+ output_file << "/*\n" << capsFileName << "\n\nCopyright (C) 2000-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.\n*/\n";
+ delete [] capsFileName;
+ output_file << "\n/*\nThe LCharSet object used by the " << locale << " locale.\n";
+ output_file << "Generated by COLTAB.\n*/\n";
+ }
+
+ output_file << "\n#include \"ls_std.h\"\n#include <collate.h>\n";
+ output_file << "\nconst TUint KUid" << iCPlusPlusIdentifier << "CollationMethod = ";
+ if (iUidString)
+ output_file << "0x" << iUidString << ";\n";
+ else
+ {
+ output_file << "/* FILL THIS IN */;\n";
+ cout << "Warning: File will need editing\nWarning: see coltab /h2 for details.\n";
+ }
+ }
+
+ /*
+ Write the unique collation keys.
+ Each one has the format, going from highest to lowest bit:
+
+ 16 bits: level-0 key
+ 8 bits: level-1 key
+ 6 bits: level-2 key
+ 1 bit: set if this key is optionally ignorable
+ 1 bit: set if this is the last key in the string of keys for a single Unicode value
+
+ */
+ if (iKeys != 0)
+ {
+ output_file << "\nstatic const TUint32 The" << iCPlusPlusIdentifier << "Key[] = \n\t{";
+ CollationKey* ck = iCollationKey;
+ output_file << "\t // " << iKeys << " keys";
+ output_file << hex;
+ for (i = 0; i < iKeys; i++, ck++)
+ {
+ unsigned int key = PackKey(*ck);
+ if (i % 8 == 0)
+ output_file << "\n\t";
+ output_file << "0x";
+ output_file << key << ",";
+ }
+ output_file << dec;
+ output_file << "\n\t};\n\n";
+ }
+
+ if (iIndices != 0)
+ {
+ // Sort then write the collation index values - these relate Unicode values to collation keys.
+ qsort(iCollationIndex,iIndices,sizeof(CollationIndex),CollationIndex::Compare);
+ output_file << "static const TUint32 The" << iCPlusPlusIdentifier << "Index[] = \n\t{";
+ CollationIndex* ci = iCollationIndex;
+ int entry=0;
+ output_file << "\t // " << iIndices << " indices";
+ output_file << hex;
+ for (i = 0; i < iIndices; i++, ci++, entry++)
+ {
+ unsigned int key[2];
+ int bytecount = PackIndex(*ci, key);
+
+ if (entry % 8 == 0)
+ output_file << "\n\t";
+ output_file << "0x";
+ output_file << key[0] << ",";
+
+ if (bytecount == 2)
+ {
+ entry++;
+ if (entry % 8 == 0)
+ output_file << "\n\t";
+ output_file << "0x";
+ output_file << key[1] << ",";
+ }
+ }
+ output_file << dec;
+ output_file << "\n\t};";
+ output_file << "\t // " << entry << " entries";
+ output_file << "\n\n";
+ iIndices = entry; //One surrogate pair occupies 2 entries
+ }
+
+ if (iStringElements)
+ {
+ // Write the Unicode strings; these are preceded by their lengths.
+ output_file << "static const TUint16 The" << iCPlusPlusIdentifier << "StringElement[] = \n\t{";
+ output_file << hex;
+ for (i = 0; i < iStringElements; i++)
+ {
+ if (i % 8 == 0)
+ output_file << "\n\t";
+ output_file << "0x" << iStringElement[i] << ",";
+ }
+ output_file << dec;
+ if (iStringElements==0)
+ output_file << "0";
+ output_file << "\n\t};\n\n";
+
+ /*
+ Sort then write the string index values - these relate Unicode strings to collation keys.
+ Each one has the string index in the upper word and the key index in the lower word.
+ */
+ TheReader = this;
+ qsort(iStringIndex,iStringIndices,sizeof(iStringIndex[0]),::CompareStringIndices);
+ output_file << "static const TUint32 The" << iCPlusPlusIdentifier << "StringIndex[] = \n\t{";
+ output_file << hex;
+ for (i = 0; i < iStringIndices; i++)
+ {
+ if (i % 8 == 0)
+ output_file << "\n\t";
+ output_file << "0x" << iStringIndex[i] << ",";
+ }
+ output_file << dec;
+ if (iStringIndices ==0)
+ output_file << "0";
+ output_file << "\n\t};\n\n";
+ }
+
+ // Write the collation table structure.
+ output_file << "static const TCollationKeyTable The" << iCPlusPlusIdentifier << "Table = \n\t{ ";
+ if (iKeys)
+ output_file << "The" << iCPlusPlusIdentifier << "Key";
+ else
+ output_file << "0";
+ if (iIndices)
+ output_file << ", The" << iCPlusPlusIdentifier << "Index, " << iIndices;
+ else
+ output_file << ", 0, 0";
+ if (iStringElements)
+ output_file << ", The" << iCPlusPlusIdentifier << "StringElement, The" << iCPlusPlusIdentifier << "StringIndex, " << iStringIndices << " };\n";
+ else
+ output_file << ", 0, 0, 0 };\n";
+
+ if (!iStandard)
+ output_file << "\nstatic const TCollationMethod TheCollationMethod[] = \n"\
+ " {\n"\
+ " {\n"\
+ " KUid" << iCPlusPlusIdentifier << "CollationMethod, // the method for the locale\n"\
+ " NULL, // use the standard table as the main table\n"\
+ " &The" << iCPlusPlusIdentifier << "Table, // the locale values override the standard values\n"\
+ " 0 // the flags are standard\n"\
+ " },\n"\
+ " {\n"\
+ " KUidBasicCollationMethod, // the standard unlocalised method\n"\
+ " NULL, // null means use the standard table\n"\
+ " NULL, // there's no override table\n"\
+ " 0 // the flags are standard\n"\
+ " }\n"\
+ " };\n"\
+ "\n"\
+ "static const TCollationDataSet TheCollationDataSet =\n"\
+ " {\n"\
+ " TheCollationMethod,\n"\
+ " 2\n"\
+ " };"\
+ "\n\n"\
+ "// The one and only locale character set object.\n"\
+ "const LCharSet TheCharSet =\n"\
+ " {\n"\
+ " NULL,\n"\
+ " &TheCollationDataSet\n"\
+ " };\n";
+
+ output_file.close();
+ delete [] locale;
+ }
+
+int CollationIndex::Compare(const void* aIndex1,const void* aIndex2)
+ {
+ return ((CollationIndex*)aIndex1)->iCode - ((CollationIndex*)aIndex2)->iCode;
+ }