localisation/localesupport/coltab/COLTAB.CPP
changeset 0 1fb32624e06b
equal deleted inserted replaced
-1:000000000000 0:1fb32624e06b
       
     1 // Copyright (c) 1999-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     2 // All rights reserved.
       
     3 // This component and the accompanying materials are made available
       
     4 // under the terms of "Eclipse Public License v1.0"
       
     5 // which accompanies this distribution, and is available
       
     6 // at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     7 //
       
     8 // Initial Contributors:
       
     9 // Nokia Corporation - initial contribution.
       
    10 //
       
    11 // Contributors:
       
    12 //
       
    13 // Description:
       
    14 // Reads and parses the Unicode collation value table and writes out a C++ source file
       
    15 // containing the data in a form that can be used by the EPOC collation system.
       
    16 //
       
     17 // The program reads either three separate files or one combined file:
       
    18 //
       
    19 // Three files (by default):
       
    20 // 1. Base keys (maps single Unicode values to single collation key values): must be in the same format as
       
    21 // basekeys.txt, supplied with the Standard Unicode Collation system
       
    22 //
       
    23 // 2. Composite keys (maps single Unicode values to strings of collation keys): must be in the same format as
       
    24 // compkeys.txt, supplied with the Standard Unicode Collation system
       
    25 //
       
    26 // 3. Strings (maps strings of Unicode values to single collation keys OR strings of collation keys): must be in the
       
    27 // same format as compkeys.txt, except that there can be any number of Unicode characters at the start of the line,
       
    28 // space-separated and each exactly 4 hex digits.
       
    29 //
       
     30 // One combined file (with option /a):
       
    31 // 1. All Keys (combine above three files into one file): must be in the same format as allkeys.txt, supplied with the Standard Unicode Collation system (after Unicode 3.0).
       
    32 //
       
    33 //
       
    34 
       
    35 
       
    36 #include <assert.h>
       
    37 #include <ctype.h>
       
    38 
       
    39 #ifdef __MSVCDOTNET__
       
    40 #include <fstream>
       
    41 #include <iostream>
       
    42 using namespace std;
       
    43 #else //!__MSVCDOTNET__
       
    44 #include <fstream.h>
       
    45 #include <iostream.h>
       
    46 #endif //__MSVCDOTNET__
       
    47 
       
    48 #include <stdlib.h>
       
    49 #include <string.h>
       
    50 #include <stdio.h>
       
    51 
       
    52 /*
       
    53 Constants constraining the range of level-1 and level-2 keys so that they can be packed.
       
    54 Non-zero values are reduced by one less than the minimum value.
       
    55 */
       
    56 const unsigned int KLevel1Bits = 8;
       
    57 const unsigned int KLevel1Min = 0x20;
       
    58 const unsigned int KLevel1Max = KLevel1Min + (1 << KLevel1Bits) - 2;
       
    59 const unsigned int KLevel2Bits = 6;
       
    60 const unsigned int KLevel2Min = 1;
       
    61 const unsigned int KLevel2Max = KLevel2Min + (1 << KLevel2Bits) - 2;
       
    62 
       
    63 /*
       
    64 Table of characters in the WGL4 set, plus characters in canonical decompositions of
       
    65 those characters, plus commonly used control characters and space characters,
       
    66 given as ranges of Unicode characters. In each pair, the first code is the first in the range,
       
    67 and the second is the first code NOT in the range.
       
    68 
       
    69 The extra characters are added mainly to ensure that control characters and spaces are
       
    70 normally ignored. The extra characters are:
       
    71 
       
    72 0x0000-0x001F: ASCII control characters
       
    73 0x2000-0x2012: spaces, hyphen variants, figure dash
       
    74 0x2028-0x202E: line and paragraph separator, bidirectional control characters
       
    75 0xFEFF		 : byte-order mark
       
    76 0xFFFC-0xFFFD: object replacement character, replacement character
       
    77 */
       
    78 const unsigned int Wgl4Range[] =
       
    79 	{
       
    80 	0x00, 0x7f,		// All ASCII
       
    81 	0xa0, 0x180,		// Non-breaking space, Latin-1, Latin Extended-A
       
    82 	0x192,0x193,		// Latin f with hook
       
    83 	0x1fa,0x200,		// A-ring, a-ring, AE, ae, O slash, o slash all with acute accent
       
    84 	0x2c6,0x2c8,		// non-combining circumflex and caron
       
    85 	0x2c9,0x2ca,		// non-combining macron
       
    86 	0x2d8,0x2dc,		// non-combining breve, dot above, ring above, ogonek
       
    87 	0x2dd,0x2de,		// non-combining double acute
       
    88 	0x300,0x305,		// combining grave, acute, circumflex, tilde, macron
       
    89 	0x306,0x309,		// combining breve, dot above, double dot above
       
    90 	0x30a,0x30e,		// combining ring above, double acute, caron, vertical line above
       
    91 	0x327,0x329,		// combining cedilla, ogonek
       
    92 	0x384,0x38b,		// Greek
       
    93 	0x38c,0x38d,		// Greek
       
    94 	0x38e,0x3a2,		// Greek
       
    95 	0x3a3,0x3cf,		// Greek
       
    96 	0x401,0x40d,		// Cyrillic
       
    97 	0x40e,0x450,		// Cyrillic
       
    98 	0x451,0x45d,		// Cyrillic
       
    99 	0x45e,0x460,		// Cyrillic
       
   100 	0x490,0x492,		// Cyrillic
       
   101 	0x1e80,0x1e86,		// Both W and w with each of grave, acute and diaeresis
       
   102 	0x1ef2,0x1ef4,		// Y with grave, y with grave
       
   103 	0x2000,0x2016,		// various space and horizontal lines
       
   104 	0x2017,0x201f,		//double vertical line, double low line, various quotation marks
       
   105 	0x2020,0x2023,		// dagger, double dagger, bullet
       
   106 	0x2026,0x2027,		//ellipsis
       
   107 	0x2028,0x202F,		// line & paragraph separators and directional formatting
       
   108 	0x2030,0x2031,		// per mille
       
   109 	0x2032,0x2034,		// prime
       
   110 	0x2039,0x203b,		// single angle quotation marks
       
   111 	0x203c,0x203d,		// double exclamation mark
       
   112 	0x203e,0x203f,		// non-combining overscore
       
   113 	0x2044,0x2045,		// fraction slash
       
   114 	0x207f,0x2080,		// superscript n
       
   115 	0x20a3,0x20a5,		// French Franc, Italian/Turkish Lira
       
   116 	0x20a7,0x20a8,		// Spanish Peseta
       
   117 	0x20ac,0x20ad,		// Euro symbol
       
   118 	0x2105,0x2106,		// care of
       
   119 	0x2113,0x2114,		// script l
       
   120 	0x2116,0x2117,		// numero
       
   121 	0x2122,0x2123,		// trade mark
       
   122 	0x2126,0x2127,		// ohm
       
   123 	0x212e,0x212f,		// estimated (net weight)
       
   124 	0x215b,0x215f,		// 1/8, 3/8, 5/8, 7/8
       
   125 	0x2190,0x2196,		// horizontal and vertical arrows
       
   126 	0x21a8,0x21a9,		// up down arrow with base
       
   127 	0x2202,0x2203,		// partial differential
       
   128 	0x2206,0x2207,		// increment (delta)
       
   129 	0x220f,0x2210,		// n-ary product (pi)
       
   130 	0x2211,0x2213,		// n-ary sum (sigma), minus
       
   131 	0x2215,0x2216,		// division (slash)
       
   132 	0x2219,0x221b,		// bullet operator, square root
       
   133 	0x221e,0x2220,		// infinity, right angle
       
   134 	0x2229,0x222a,		// intersection
       
   135 	0x222b,0x222c,		// union
       
   136 	0x2248,0x2249,		// almost equal to
       
   137 	0x2260,0x2262,		// not equal to, identical to
       
   138 	0x2264,0x2266,		// less-than-or-equal-to, greater-than-or-equal-to
       
   139 	0x2302,0x2303,		// house
       
   140 	0x2310,0x2311,		// rversed not sign
       
   141 	0x2320,0x2322,		// top and bottom of integral
       
   142 	0x2500,0x2501,		// box drawing
       
   143 	0x2502,0x2503,		// box drawing
       
   144 	0x250c,0x250d,		// box drawing
       
   145 	0x2510,0x2511,		// box drawing
       
   146 	0x2514,0x2515,		// box drawing
       
   147 	0x2518,0x2519,		// box drawing
       
   148 	0x251c,0x251d,		// box drawing
       
   149 	0x2524,0x2525,		// box drawing
       
   150 	0x252c,0x252d,		// box drawing
       
   151 	0x2534,0x2535,		// box drawing
       
   152 	0x253c,0x253d,		// box drawing
       
   153 	0x2550,0x256d,		// box drawing
       
   154 	0x2580,0x2581,		// block element
       
   155 	0x2584,0x2585,		// block element
       
   156 	0x2588,0x2589,		// block element
       
   157 	0x258c,0x258d,		// block element
       
   158 	0x2590,0x2594,		// block element
       
   159 	0x25a0,0x25a2,		// geometric shapes
       
   160 	0x25aa,0x25ad,		// geometric shapes
       
   161 	0x25b2,0x25b3,		// geometric shapes
       
   162 	0x25ba,0x25bb,		// geometric shapes
       
   163 	0x25bc,0x25bd,		// geometric shapes
       
   164 	0x25c4,0x25c5,		// geometric shapes
       
   165 	0x25ca,0x25cc,		// geometric shapes
       
   166 	0x25cf,0x25d0,		// geometric shapes
       
   167 	0x25d8,0x25da,		// geometric shapes
       
   168 	0x25e6,0x25e7,		// geometric shapes
       
   169 	0x263a,0x263d,		// smilies, sun
       
   170 	0x2640,0x2641,		// female
       
   171 	0x2642,0x2643,		// male
       
   172 	0x2660,0x2661,		// spade
       
   173 	0x2663,0x2664,		// club
       
   174 	0x2665,0x2667,		// heart
       
   175 	0x266a,0x266c,		// quaver, beamed quavers
       
   176 	0xfb01,0xfb03,		// fi, fl ligatures
       
   177 	0xfeff,0xff00,		// zero-width non-breaking space
       
   178 	0xfffc, 0xfffe		// object replacement character and replacement character
       
   179 	};
       
   180 const int Wgl4Ranges = sizeof(Wgl4Range) / sizeof(Wgl4Range[0]) / 2;
       
   181 
       
   182 int CompareWgl4Ranges(const void* aRange1,const void* aRange2)
       
   183 	{
       
   184 	unsigned int* p = (unsigned int*)aRange1;
       
   185 	unsigned int* q = (unsigned int*)aRange2;
       
   186 	if (q[0] == q[1])
       
   187 		{
       
   188 		unsigned int* temp = p;
       
   189 		p = q;
       
   190 		q = temp;
       
   191 		}
       
   192 	if (*p < *q)
       
   193 		return -1;
       
   194 	else if (*p >= q[1])
       
   195 		return 1;
       
   196 	else
       
   197 		return 0;
       
   198 	}
       
   199 
       
   200 // Determine if a character is in the WGL4 character repertoire.
       
   201 static bool InWgl4(unsigned int aChar)
       
   202 	{
       
   203 	unsigned int key[2];
       
   204 	key[0] = key[1] = aChar;
       
   205 	return bsearch(key,Wgl4Range,Wgl4Ranges,sizeof(Wgl4Range[0]) * 2,CompareWgl4Ranges) != NULL;
       
   206 	}
       
   207 
       
   208 // A collation key.
       
   209 class CollationKey
       
   210 	{
       
   211 public:
       
   212 	bool operator==(const CollationKey& k) const
       
   213 		{ return iLevel[0] == k.iLevel[0] && iLevel[1] == k.iLevel[1] && iLevel[2] == k.iLevel[2] &&
       
   214 		  iIgnorable == k.iIgnorable && iStop == k.iStop; }
       
   215 
       
   216 	enum
       
   217 		{
       
   218 		ELevels = 3
       
   219 		};
       
   220 	int iLevel[ELevels];// the keys at the various levels
       
   221 	bool iIgnorable;	// TRUE if this key can normally be ignored
       
   222 	bool iStop;			// TRUE if this is the last key in a string of keys
       
   223 	};
       
   224 
       
   225 // The collation index for a single Unicode value.
       
   226 class CollationIndex
       
   227 	{
       
   228 public:
       
   229 	static int Compare(const void* aIndex1,const void* aIndex2);
       
   230 
       
   231 	int iCode;			// Unicode value
       
   232 	int iIndex;			// index into the key table
       
   233 	};
       
   234 
       
   235 class Reader
       
   236 	{
       
   237 public:
       
   238 	Reader(bool aWgl4,bool aStandard,const char* aLocaleName, const char* aUidString);
       
   239 	~Reader();
       
   240 	void ReadBaseKeys(const char* aFileName);
       
   241 	void ReadCompKeys(const char* aFileName);
       
   242 	void ReadStrings(const char* aFileName);
       
   243 	void ReadAllKeys(const char* aFileName);
       
   244 	void WriteOutput(const char* aFileName, bool aCopyrightMessage);
       
   245 	int CompareStringIndices(int aIndex1,int aIndex2) const;
       
   246 
       
   247 private:
       
   248 	Reader(const Reader&);
       
   249 	int Hex(const char *aString, int &aCharConsumed, bool aTolerate = false);
       
   250 	void GetCollationKey(const char* aString, int& aCharConsumed, CollationKey* aKey=NULL);
       
   251 	void GetMultipleCollationKeys(const char* aString);
       
   252 	unsigned int PackKey(const CollationKey& aValue);
       
   253 	int PackIndex(const CollationIndex& aValue, unsigned int result[2]);
       
   254 	bool ParseLine(const char* aLine, int aCode[16], int& aCodeCount, int& aKeyStart, int& aKeyCount);
       
   255 	void AddKeyOneToOne(const char* aLine, const int aCode, const int aKeyStart);
       
   256 	void AddKeyOneToMuch(const char* aLine, const int aCode, const int aKeyStart);
       
   257 	void AddKeyMuchToMuch(const char* aLine, const int aCode[16], const int aCodeCount, const int aKeyStart);
       
   258 
       
   259 	enum
       
   260 		{
       
   261 		EMaxCollationKeys = 0x110000 * 2, /*more elements considering composite keys */
       
   262 		EMaxCollationIndices = 0x110000,
       
   263 		EMaxStringElements = 65536,
       
   264 		EMaxStringIndices = 65536
       
   265 		};
       
   266 	CollationKey iCollationKey[EMaxCollationKeys];
       
   267 	int iKeys;
       
   268 	CollationIndex iCollationIndex[EMaxCollationIndices];
       
   269 	int iIndices;
       
   270 	int iStringElement[EMaxStringElements];
       
   271 	int iStringElements;
       
   272 	unsigned int iStringIndex[EMaxStringIndices];
       
   273 	int iStringIndices;
       
   274 	const char* iInputFileName;
       
   275 	int iLineNumber;
       
   276 	bool iSuppressCanonseqWarning;		// have we issued the canonseq warning yet?
       
   277 	bool iWgl4;				// true if writing keys for wgl4 characters only
       
   278 	bool iStandard;			// true if reading standard files, not tailoring files
       
   279 	const char* iLocaleName;
       
   280 	const char* iUidString;
       
   281 	char* iCPlusPlusIdentifier;		// iLocaleName in title case with difficult characters removed
       
   282 	};
       
   283 
       
   284 bool isValidHexDigit(char c)
       
   285 	{
       
   286 	if ('0' <= c && c <= '9')
       
   287 		return true;
       
   288 	if ('a' <= c && c <= 'f')
       
   289 		return true;
       
   290 	if ('A' <= c && c <= 'F')
       
   291 		return true;
       
   292 	return false;
       
   293 	}
       
   294 
       
   295 void PrintUsage()
       
   296 	{
       
   297 	cout << "Usage: coltab [/u<uid>] [/c] [/a] [/h<topic>] <locale>\n";
       
   298 	cout << "By Default (without /a option), for the locales 'standard' and 'wgl4' coltab reads basekeys.txt & compkeys.txt\n";
       
   299 	cout << "For any other locale name <name> coltab reads <name>_basekeys.txt,\n";
       
   300 	cout << "<name>_compkeys.txt and <name>_strings.txt.\n";
       
   301 	cout << "Use the /a option, for the locales 'standard' and 'wgl4' coltab reads allkeys.txt\n";
       
   302 	cout << "For any other locale name <name> coltab reads <name>_allkeys.txt.\n"; 
       
   303 	cout << "The output file is always ls_<name>.cpp.\n";
       
   304 	cout << "Use the /u option to specify the UID that the collation table should have.\n";
       
   305 	cout << "A hex number must follow /u immediately, for example /u800ACBDE\n";
       
   306 	cout << "this hex number must not exceed eight digits. If this is not specified,\n";
       
   307 	cout << "the output file will have to be edited to make it compilable.\n";
       
   308 	cout << "Specify /c to prefix the output with a Nokia copyright message.\n";
       
   309 	cout << "Specify /h for in-depth help.";
       
   310 	}
       
   311 
       
   312 void UsageError()
       
   313 	{
       
   314 	PrintUsage();
       
   315 	exit(1);
       
   316 	}
       
   317 
       
   318 void PrintHelp(char* aTopic)
       
   319 	{
       
   320 	int topic = 0;
       
   321 	while ('0' <= *aTopic && *aTopic <= '9')
       
   322 		{
       
   323 		topic = topic * 10 + (*aTopic - '0');
       
   324 		++aTopic;
       
   325 		}
       
   326 	switch(topic)
       
   327 		{
       
   328 	case 1:
       
   329 		cout << "How Coltab interprets CANONSEQ:\n\n"\
       
   330 			"If the CANONSEQ specifier is used in a line, Coltab will ignore the mapping.\n"\
       
   331 			"This because, on the Symbian platform, any canonically composed character is\n"\
       
   332 			"decomposed before the key mapping is applied, so characters with canonical\n"\
       
   333 			"decompositions do not need keys. In files supplied by the Unicode Consortium,\n"\
       
   334 			"all mappings for composed characters are flagged by CANONSEQ, so it is useful\n"\
       
   335 			"if Coltab can just ignore these so that Unicode Consortium files can be used\n"\
       
   336 			"unedited.\n\n"\
       
   337 			"This can cause problems if a localizer copies a line from a Unicode file into,\n"\
       
   338 			"say, the <lang>_strings.txt file, in order to give a mapping for an accented\n"\
       
   339 			"character. The localizer replaces the composed character code with the\n"\
       
   340 			"decomposition and changes the keys but forgets to remove the CANONSEQ\n"\
       
   341 			"specifier. In this case the key would be ignored. Coltab provides a warning so\n"\
       
   342 			"that this can be put right.\n\n"\
       
   343 			"Coltab will only warn about the first CANONSEQ in each file, and does not warn\n"\
       
   344 			"if the 'standard' or 'wgl4' options are used.";
       
   345 		exit(1);
       
   346 		break;
       
   347 	case 2:
       
   348 		cout << "How to ensure coltab's output files are compilable.\n\n"\
       
   349 			"By default, Coltab's files for locales need to be edited before they are\n"\
       
   350 			"compilable. The UID for the collation method needs to be filled in. This UID\n"\
       
   351 			"is added so that the collation table can be searched for later. At present,\n"\
       
   352 			"this UID is not necessary for the correct functioning of the Symbian platform\n"\
       
   353 			"and so a value of 0 can be safely used.\n\n"\
       
   354 			"To insert this value into the file directly, use the /u option, for example\n"\
       
   355 			"coltab /u0 french\n"\
       
   356 			"If the /u option is used, the file should be compilable as is. If it is not,\n"\
       
   357 			"please raise it as a defect with Symbian's internationalization team,\n"\
       
   358 			"supplying the files that caused the problem if this is possible.\n"\
       
   359 			"If the 'standard' or 'wgl4' options are used, no UID is output, so the /u\n"\
       
   360 			"option is not required.";
       
   361 		exit(1);
       
   362 		break;
       
   363 	case 3:
       
   364 		cout << "How to ensure collation key values are inside the supported range. \n\n"\
       
   365 			"According to Unicode Standard, the range suppored by tool COLTAB:\n"\
       
   366 			" Level 0 (primary):   0000 - FFFF, \n"\
       
   367 			" Level 1 (Secondary): 0020 - 011E, \n"\
       
   368 			" Level 2 (Tertiary):  0001 - 003F. \n"\
       
   369 			"Please edit your collation files and make sure key values are inside the above range";
       
   370 		exit(1);
       
   371 		break;
       
   372 	default:
       
   373 		PrintUsage();
       
   374 		cout << "\n\nSpecify /h1 for help on the use of CANONSEQ\n";
       
   375 		cout << "Specify /h2 for help on making compilable files that do not need editing\n";
       
   376 		exit(1);
       
   377 		break;
       
   378 		}
       
   379 	}
       
   380 
       
   381 short HighSurrogate(int aCode)
       
   382 	{
       
   383 	return static_cast<short>(0xD7C0 + (aCode >> 10));
       
   384 	}
       
   385 	
       
   386 short LowSurrogate(int aCode)
       
   387 	{
       
   388 	return static_cast<short>(0xDC00 | (aCode & 0x3FF));
       
   389 	}
       
   390 
       
   391 int main(int argc,char** argv)
       
   392 	{
       
   393 	bool copyright = false;
       
   394 	bool wgl4 = false;
       
   395 	bool allKeys = false;
       
   396 	const char* prefix = "";
       
   397 	const char* infix = "";
       
   398 	const char* locale = "";
       
   399 	char* localeArg = 0;
       
   400 	char* uidArg = 0;
       
   401 	for (int i = 1; i < argc; ++i)
       
   402 		{
       
   403 		if (argv[i][0] == '/' || argv[i][0] == '-')
       
   404 			{
       
   405 			switch (argv[i][1])
       
   406 				{
       
   407 			case 'u':
       
   408 			case 'U':
       
   409 				{
       
   410 				uidArg = argv[i] + 2;
       
   411 				const char* uidCheck = uidArg;
       
   412 				while (*uidCheck)
       
   413 					{
       
   414 					if (!isValidHexDigit(*uidCheck))
       
   415 						UsageError();
       
   416 					++uidCheck;
       
   417 					}
       
   418 				if (uidCheck == uidArg || 8 < uidCheck - uidArg)
       
   419 					UsageError();
       
   420 				break;
       
   421 				}
       
   422 			case 'c':
       
   423 			case 'C':
       
   424 				copyright = true;
       
   425 				break;
       
   426 			case 'a':
       
   427 				allKeys = true;
       
   428 				break;
       
   429 			case 'h':
       
   430 			case 'H':
       
   431 				PrintHelp(argv[i] + 2);
       
   432 				break;
       
   433 			default:
       
   434 				UsageError();
       
   435 				break;
       
   436 				}
       
   437 			}
       
   438 		else if (!localeArg)
       
   439 			localeArg = argv[i];
       
   440 		else
       
   441 			UsageError();
       
   442 		}
       
   443 	if (!localeArg)
       
   444 		UsageError();
       
   445 	bool standard = false;
       
   446 	if (!_stricmp(localeArg, "standard"))
       
   447 		{
       
   448 		locale = "Standard";
       
   449 		standard = true;
       
   450 		}
       
   451 	else if (!_stricmp(localeArg, "wgl4"))
       
   452 		{
       
   453 		locale = "Wgl4";
       
   454 		wgl4 = true;
       
   455 		standard = true;
       
   456 		}
       
   457 	else
       
   458 		{
       
   459 		locale = prefix = localeArg;
       
   460 		infix = "_";
       
   461 		}
       
   462 
       
   463 	Reader* reader = new Reader(wgl4, standard, locale, uidArg);
       
   464 	if (!reader)
       
   465 		{
       
   466 		cout << "out of memory\n";
       
   467 		exit(1);
       
   468 		}
       
   469 	char* filename = new char[strlen(prefix) + strlen(infix) + 64];
       
   470 	if (allKeys == false)
       
   471 		{
       
   472 		sprintf(filename,"%s%scompkeys.txt",prefix,infix);
       
   473 		reader->ReadCompKeys(filename);
       
   474 		if (!standard)
       
   475 			{
       
   476 			sprintf(filename,"%s%sstrings.txt",prefix,infix);
       
   477 			reader->ReadStrings(filename);
       
   478 			}
       
   479 		sprintf(filename,"%s%sbasekeys.txt",prefix,infix);
       
   480 		reader->ReadBaseKeys(filename);
       
   481 		}
       
   482 	else
       
   483 		{
       
   484 		sprintf(filename,"%s%sAllKeys.txt",prefix,infix);
       
   485 		reader->ReadAllKeys(filename);
       
   486 		}
       
   487 	sprintf(filename,"ls_%s.cpp", localeArg);
       
   488 	reader->WriteOutput(filename, copyright);
       
   489 
       
   490 	delete reader;
       
   491 	delete [] filename;
       
   492 	return 0;
       
   493 	}
       
   494 
       
   495 Reader::Reader(bool aWgl4, bool aStandard,
       
   496 	const char* aLocaleName, const char* aUidString):
       
   497 	iKeys(0),
       
   498 	iIndices(0),
       
   499 	iStringElements(0),
       
   500 	iStringIndices(0),
       
   501 	iInputFileName(NULL),
       
   502 	iLineNumber(0),
       
   503 	iSuppressCanonseqWarning(false),
       
   504 	iWgl4(aWgl4),
       
   505 	iStandard(aStandard),
       
   506 	iLocaleName(aLocaleName),
       
   507 	iUidString(aUidString)
       
   508 	{
       
   509 	if (iStandard)
       
   510 		{
       
   511 		iCPlusPlusIdentifier = new char[9];
       
   512 		strcpy(iCPlusPlusIdentifier, "Standard");
       
   513 		return;
       
   514 		}
       
   515 	char* p = iCPlusPlusIdentifier = new char[strlen(aLocaleName) + 2];
       
   516 	int current = toupper(aLocaleName[0]);
       
   517 	if (current < 'A' || 'Z' < current)
       
   518 		*p++ = 'C';
       
   519 	else
       
   520 		{
       
   521 		*p++ = static_cast<char>(current);
       
   522 		++aLocaleName;
       
   523 		}
       
   524 	bool inUnderScore = false;
       
   525 	while (*aLocaleName)
       
   526 		{
       
   527 		current = tolower(*aLocaleName++);
       
   528 		if (current < 'a' || 'z' < current)
       
   529 			{
       
   530 			if (!inUnderScore)
       
   531 				{
       
   532 				inUnderScore = true;
       
   533 				*p++ = '_';
       
   534 				}
       
   535 			}
       
   536 		else
       
   537 			{
       
   538 			inUnderScore = false;
       
   539 			*p++ = static_cast<char>(current);
       
   540 			}
       
   541 		}
       
   542 	*p = 0;
       
   543 	}
       
   544 
       
   545 Reader::~Reader()
       
   546 	{
       
   547 	delete [] iCPlusPlusIdentifier;
       
   548 	}
       
   549 
       
   550 // Get a hex number of exactly four digits from aString. Return -1 if none is found and aTolerate is true.
       
   551 int Reader::Hex(const char *aString, int &aCharConsumed, bool aTolerate)
       
   552 	{
       
   553 	char *end;
       
   554 	unsigned long x = strtoul(aString,&end,16);
       
   555 	aCharConsumed = end - aString;
       
   556 	if ((aCharConsumed != 4) && (aCharConsumed != 5) && (aCharConsumed != 6))
       
   557 		{
       
   558 		if (!aTolerate)
       
   559 			{
       
   560 			cout << "bad hex number on line " << iLineNumber << " of file " << iInputFileName << '\n';
       
   561 			exit(1);
       
   562 			}
       
   563 		return -1;
       
   564 		}
       
   565 	return x;
       
   566 	}
       
   567 
       
   568 // Get a collation value from a string of the form [.xxxx.xxxx.xxxx.xxxx]
       
   569 void Reader::GetCollationKey(const char* aString, int& aCharConsumed, CollationKey* aKey)
       
   570 	{
       
   571 	aCharConsumed = 0;
       
   572 	const char *end = strchr(aString, ']');
       
   573 	if (end != NULL){
       
   574 		aCharConsumed = end - aString;
       
   575 	}
       
   576 	
       
   577 	if (aString[0] != '[' || (aCharConsumed != 21 && aCharConsumed != 22 && aCharConsumed != 23))
       
   578 		{
       
   579 		cout << "syntax error on line " << iLineNumber << " of file " << iInputFileName << '\n';
       
   580 		exit(1);
       
   581 		}
       
   582 	if (aKey == NULL)
       
   583 		{
       
   584 		if (iKeys >= EMaxCollationKeys)
       
   585 			{
       
   586 			cout << "too many keys";
       
   587 			exit(1);
       
   588 			}
       
   589 		aKey = &iCollationKey[iKeys++];
       
   590 		}
       
   591 	aKey->iIgnorable = aString[1] == '*'; // asterisk means that this character is normally ignored
       
   592 	int charConsumed = 0;
       
   593 	for (int i = 0; i < CollationKey::ELevels; i++)
       
   594 		aKey->iLevel[i] = Hex(aString + 2 + i * 5, charConsumed);
       
   595 
       
   596 	if (aKey->iLevel[1] > 0 && (aKey->iLevel[1] < KLevel1Min || aKey->iLevel[1] > KLevel1Max))
       
   597 		{
       
   598 		aKey->iLevel[1] = KLevel1Max;
       
   599 		cout << "illegal level-1 key value on line " << iLineNumber << "; outside the range " << KLevel1Min << ".." << KLevel1Max << "\n";
       
   600 		cout << "Error: illegal key value in file, please see coltab /h3 for details.\n";
       
   601 		exit(1);
       
   602 		}
       
   603 	
       
   604 	if (aKey->iLevel[2] > 0 && (aKey->iLevel[2] < KLevel2Min || aKey->iLevel[2] > KLevel2Max))
       
   605 		{
       
   606 		cout << "illegal level-2 key value on line " << iLineNumber << "; outside the range " << KLevel2Min << ".." << KLevel2Max << "\n";
       
   607 		cout << "Error: illegal key value in file, please see coltab /h3 for details.\n";
       
   608 		exit(1);
       
   609 		}
       
   610 
       
   611 	aKey->iStop = true;
       
   612 	}
       
   613 
       
   614 void Reader::GetMultipleCollationKeys(const char* aString)
       
   615 	{
       
   616 	int keyCount = 0;
       
   617 	int charConsumed =0;
       
   618 	while (aString[0] == '[')
       
   619 		{
       
   620 		GetCollationKey(aString, charConsumed);
       
   621 
       
   622 		keyCount++;
       
   623 		iCollationKey[iKeys - 1].iStop = false;
       
   624 		int length = strlen(aString);
       
   625 		if (length <= charConsumed + 1)
       
   626 			break;
       
   627 		aString += charConsumed + 1;
       
   628 		
       
   629 		if (aString[0] == ' ') //a space is put between collation keys in keys files provided by previous Unicode Standard (i.e 3.1)
       
   630 			aString++;
       
   631 		
       
   632 		}
       
   633 	iCollationKey[iKeys - 1].iStop = true;
       
   634 	}
       
   635 
       
   636 /*
       
   637 Partially parse a line, returning its key code and the start of its first block of key data.
       
   638 Return false if it is not a data line, or not relevant.
       
   639 */
       
   640 bool Reader::ParseLine(const char* aLine, int aCode[16], int& aCodeCount, int& aKeyStart, int& aKeyCount)
       
   641 	{
       
   642 	int lineLength = strlen(aLine);
       
   643 	int charConsumed = 0;
       
   644 	aCodeCount = 0;
       
   645 	aCode[0] = Hex(aLine,charConsumed,true);
       
   646 
       
   647 	/*
       
   648 	A data line must start with a hex number and be at least 27 characters long.
       
   649 	Canonically decomposable Unicode characters are skipped.
       
   650 	Skip non-WGL4 characters if doing WGL4 only.
       
   651 	*/
       
   652 	if (aCode[0] != -1)
       
   653 		{
       
   654 		aCodeCount = 1;
       
   655 		if (!strcmp(aLine + lineLength - 8,"CANONSEQ"))
       
   656 			{
       
   657 			if (!iSuppressCanonseqWarning)
       
   658 				{
       
   659 				cout << "Warning: CANONSEQ used in file " << iInputFileName
       
   660 					<< " on line " << iLineNumber << ".\nWarning: All mappings specifying CANONSEQ are ignored.\n"
       
   661 					<< "Warning: Use coltab /h1 for more details.";
       
   662 				iSuppressCanonseqWarning = true;
       
   663 				}
       
   664 			aCodeCount = 0;
       
   665 			}
       
   666 		else if (lineLength < 27 ||
       
   667 			(iWgl4 && !InWgl4((unsigned int)aCode))) 
       
   668 			aCodeCount = 0;
       
   669 		}
       
   670 
       
   671 	if (aCode[0] != -1)
       
   672 		{
       
   673 		// find '['
       
   674 		aKeyStart = charConsumed;
       
   675 		while (aKeyStart < lineLength && aLine[aKeyStart] != '[')
       
   676 			aKeyStart++;
       
   677 
       
   678 		// read all hex before '['
       
   679 		int index = charConsumed + 1;
       
   680 		while (index < aKeyStart)
       
   681 			{
       
   682 			aCode[aCodeCount] = Hex(aLine+index, charConsumed, true);
       
   683 			if (aCode[aCodeCount] == -1)
       
   684 				break;
       
   685 
       
   686 			index += charConsumed + 1;
       
   687 			aCodeCount++;
       
   688 			}
       
   689 
       
   690 		// find number of collation keys
       
   691 		aKeyCount = 0;
       
   692 		index = aKeyStart;
       
   693 		while (index < lineLength && aLine[index] != '%' && aLine[index] != '#')
       
   694 			{
       
   695 			if (aLine[index] == '[')
       
   696 				aKeyCount++;
       
   697 			index++;
       
   698 			}
       
   699 		}
       
   700 
       
   701 	return aCodeCount > 0;
       
   702 	}
       
   703 
       
   704 void Reader::AddKeyOneToOne(const char* aLine, const int aCode, const int aKeyStart)
       
   705 	{
       
   706 	if (iIndices >= EMaxCollationIndices)
       
   707 		{
       
   708 		cout << "too many Unicode values";
       
   709 		exit(1);
       
   710 		}
       
   711 	CollationIndex& index = iCollationIndex[iIndices++];
       
   712 	index.iCode = aCode;
       
   713 	index.iIndex = -1;
       
   714 
       
   715 	/*
       
   716 	First try to find the key in the array of keys found so far.
       
   717 	Search backwards to use the fact that runs of the same key occur together.
       
   718 	*/
       
   719 	CollationKey key;
       
   720 	int charConsumed = 0;
       
   721 	GetCollationKey(aLine + aKeyStart, charConsumed, &key);
       
   722 	for (int i = iKeys - 1; i >= 0 && index.iIndex == -1; i--)
       
   723 		if (iCollationKey[i] == key)
       
   724 			index.iIndex = i;
       
   725 
       
   726 	// If that fails, add a new key.
       
   727 	if (index.iIndex == -1)
       
   728 		{
       
   729 		index.iIndex = iKeys++;
       
   730 		if (iKeys > EMaxCollationKeys)
       
   731 			{
       
   732 			cout << "too many keys";
       
   733 			exit(1);
       
   734 			} 
       
   735 		iCollationKey[index.iIndex] = key;
       
   736 		}
       
   737 	}
       
   738 /*
       
   739 Read 1-to-1 mapping. Sample:
       
   740 02B9 ; [*02A5.0020.0002.02B9] % MODIFIER LETTER PRIME
       
   741 
       
   742 aCombinedFile = true: aFileName is combined file, which contains base keys, comp keys, and string keys.
       
   743 */
       
   744 void Reader::ReadBaseKeys(const char* aFileName)
       
   745 	{
       
   746 	iSuppressCanonseqWarning = iStandard || iWgl4;
       
   747 	iLineNumber = 0;
       
   748 	iInputFileName = aFileName;
       
   749 	ifstream input_file;
       
   750 
       
   751 #ifdef __MSVCDOTNET__
       
   752 	input_file.open(iInputFileName, ios::in);
       
   753 #else //!__MSVCDOTNET__
       
   754 	input_file.open(iInputFileName, ios::in | ios::nocreate);
       
   755 #endif //__MSVCDOTNET__
       
   756 
       
   757 	if (input_file.fail())
       
   758 		{
       
   759 		cout << "cannot open input file '" << iInputFileName << "'\n";
       
   760 		exit(1);
       
   761 		}
       
   762 	cout << "reading base keys from '" << iInputFileName << "'\n";
       
   763 
       
   764 	char line[1024];
       
   765 	for (;;)
       
   766 		{
       
   767 		input_file.getline(line,sizeof(line));
       
   768 		if (input_file.eof())
       
   769 			break;
       
   770 		iLineNumber++;
       
   771 		// line number counting
       
   772 		if (iLineNumber % 100 == 0)
       
   773 			{
       
   774 			cout << "line " << iLineNumber << '\n';
       
   775 			cout.flush();
       
   776 			}
       
   777 		int code[16];
       
   778 		int codeCount = 0;
       
   779 		int key_start = 0;
       
   780 		int keyCount = 0;
       
   781 		if (ParseLine(line, code, codeCount, key_start, keyCount)) 
       
   782 			{
       
   783 			if (codeCount != 1 || keyCount != 1)
       
   784 				continue;	// goto next line
       
   785 			AddKeyOneToOne(line, code[0], key_start);
       
   786 			}
       
   787 		}
       
   788 
       
   789 	input_file.close();
       
   790 	}
       
   791 
       
   792 void Reader::AddKeyOneToMuch(const char* aLine, const int aCode, const int aKeyStart)
       
   793 	{
       
   794 	if (iIndices >= EMaxCollationIndices)
       
   795 		{
       
   796 		cout << "too many Unicode values";
       
   797 		exit(1);
       
   798 		}
       
   799 	CollationIndex& index = iCollationIndex[iIndices++];
       
   800 	index.iCode = aCode;
       
   801 	index.iIndex = iKeys;
       
   802 	GetMultipleCollationKeys(aLine + aKeyStart);
       
   803 	}
       
   804 /*
       
   805 Read 1-to-much mapping.
       
   806 3303  ; [.279F.0020.001C.3303][.1114.0020.001C.3303][.27C7.0020.001F.3303] # SQUARE AARU; QQKN
       
   807 */
       
   808 void Reader::ReadCompKeys(const char* aFileName)
       
   809 	{
       
   810 	iSuppressCanonseqWarning = iStandard || iWgl4;
       
   811 	iLineNumber = 0;
       
   812 	iInputFileName = aFileName;
       
   813 	ifstream input_file;
       
   814 
       
   815 #ifdef __MSVCDOTNET__
       
   816 	input_file.open(iInputFileName, ios::in);
       
   817 #else //!__MSVCDOTNET__
       
   818 	input_file.open(iInputFileName, ios::in | ios::nocreate);
       
   819 #endif //__MSVCDOTNET__
       
   820 
       
   821 	if (input_file.fail())
       
   822 		{
       
   823 		cout << "there are no composite keys; '" << iInputFileName << "' not found\n";
       
   824 		return;
       
   825 		}
       
   826 	cout << "reading composite keys from '" << iInputFileName << "'\n";
       
   827 
       
   828 	char line[1024];
       
   829 	for (;;)
       
   830 		{
       
   831 		input_file.getline(line,sizeof(line));
       
   832 		if (input_file.eof())
       
   833 			break;
       
   834 		iLineNumber++;
       
   835 		// line number counting
       
   836 		if (iLineNumber % 100 == 0)
       
   837 			{
       
   838 			cout << "line " << iLineNumber << '\n';
       
   839 			cout.flush();
       
   840 			}
       
   841 		int code[16];
       
   842 		int codeCount = 0;
       
   843 		int key_start = 0;
       
   844 		int keyCount = 0;
       
   845 		if (ParseLine(line, code, codeCount, key_start, keyCount)) 
       
   846 			{
       
   847 			if (codeCount != 1 || keyCount < 2)
       
   848 				continue;	// goto next line
       
   849 			AddKeyOneToMuch(line, code[0], key_start);
       
   850 			}
       
   851 		}
       
   852 
       
   853 	input_file.close();
       
   854 	}
       
   855 
       
   856 
       
   857 void Reader::AddKeyMuchToMuch(const char* aLine, const int aCode[16], const int aCodeCount, const int aKeyStart)
       
   858 	{
       
   859 
       
   860 	// Store the index to the Unicode string and the key sequence.
       
   861 	if (iStringIndices > EMaxStringIndices)
       
   862 		{
       
   863 		cout << "too many string indices";
       
   864 		exit(1);
       
   865 		}
       
   866 	iStringIndex[iStringIndices++] = (iStringElements << 16) | iKeys;
       
   867 
       
   868 	// Reserve space for the length.
       
   869 	if (iStringElements >= EMaxStringElements)
       
   870 		{
       
   871 		cout << "too many string elements";
       
   872 		exit(1);
       
   873 		}
       
   874 	iStringElements++;
       
   875 
       
   876 	// Read the Unicode string.
       
   877 	int length = 0;		// in unit of int16
       
   878 	int charCount = 0;	// in unit of char. for debug.
       
   879 
       
   880 	for (int i=0; i<aCodeCount; i++)
       
   881 		{	
       
   882 		if (iStringElements >= EMaxStringElements)
       
   883 			{
       
   884 			cout << "too many string elements";
       
   885 			exit(1);
       
   886 			}
       
   887 	
       
   888 		if (aCode[i] > 0xFFFF)
       
   889 			{
       
   890 			// UCS4 --> UTF-16
       
   891 			iStringElement[iStringElements++] = 0xD7C0 + (aCode[i] >> 10);
       
   892 			iStringElement[iStringElements++] = 0xDC00 | (aCode[i] & 0x3FF);
       
   893 			length += 2;
       
   894 			}
       
   895 		else
       
   896 			{
       
   897 			iStringElement[iStringElements++] = aCode[i];
       
   898 			length++;
       
   899 			}
       
   900 		charCount++;
       
   901 		}
       
   902 
       
   903 	iStringElement[iStringElements - length - 1] = (unsigned int)length;
       
   904 
       
   905 	// Read the key sequence.
       
   906 	GetMultipleCollationKeys(aLine + aKeyStart);
       
   907 	}
       
   908 /*
       
   909 Read much-to-much mapping. Sample:
       
   910 004F 0338 [.08EA.0020.0008.00D8] % capital O-stroke
       
   911 0E40 0E08 ; [.1E2B.0020.0002.0E08][.1E5E.0020.001F.0E40] # <THAI CHARACTER SARA E, THAI CHARACTER CHO CHAN>
       
   912 */
       
   913 void Reader::ReadStrings(const char* aFileName)
       
   914 	{
       
   915 	iSuppressCanonseqWarning = iStandard || iWgl4;
       
   916 	iLineNumber = 0;
       
   917 	iInputFileName = aFileName;
       
   918 	ifstream input_file;
       
   919 
       
   920 #ifdef __MSVCDOTNET__
       
   921 	input_file.open(iInputFileName, ios::in);
       
   922 #else //!__MSVCDOTNET__
       
   923 	input_file.open(iInputFileName, ios::in | ios::nocreate);
       
   924 #endif //__MSVCDOTNET__
       
   925 
       
   926 	if (input_file.fail())
       
   927 		{
       
   928 		cout << "there are no strings; '" << iInputFileName << "' not found\n";
       
   929 		return;
       
   930 		}
       
   931 	cout << "reading strings from '" << iInputFileName << "'\n";
       
   932 
       
   933 	char line[1024];
       
   934 	for (;;)
       
   935 		{
       
   936 		input_file.getline(line,sizeof(line));
       
   937 		if (input_file.eof())
       
   938 			break;
       
   939 		iLineNumber++;
       
   940 		// line number counting
       
   941 		if (iLineNumber % 100 == 0)
       
   942 			{
       
   943 			cout << "line " << iLineNumber << '\n';
       
   944 			cout.flush();
       
   945 			}
       
   946 		int code[16];
       
   947 		int codeCount = 0;
       
   948 		int key_start = 0;
       
   949 		int keyCount = 0;
       
   950 		if (ParseLine(line, code, codeCount, key_start, keyCount)) 
       
   951 			{
       
   952 			if (codeCount < 2 || keyCount < 1)
       
   953 				continue;	// goto next line
       
   954 			AddKeyMuchToMuch(line, code, codeCount, key_start);
       
   955 			}
       
   956 		}
       
   957 
       
   958 	input_file.close();
       
   959 	}
       
   960 
       
   961 /*
       
   962 Read combined key table. Sample:
       
   963 1-to-1 mapping:
       
   964 02B9 ; [*02A5.0020.0002.02B9] % MODIFIER LETTER PRIME
       
   965 
       
   966 1-to-much mapping:
       
   967 3303  ; [.279F.0020.001C.3303][.1114.0020.001C.3303][.27C7.0020.001F.3303] # SQUARE AARU; QQKN
       
   968 
       
   969 much-to-much mapping:
       
   970 004F 0338 [.08EA.0020.0008.00D8] % capital O-stroke
       
   971 0E40 0E08 ; [.1E2B.0020.0002.0E08][.1E5E.0020.001F.0E40] # <THAI CHARACTER SARA E, THAI CHARACTER CHO CHAN>
       
   972 */
       
   973 void Reader::ReadAllKeys(const char* aFileName)
       
   974 	{
       
   975 	iSuppressCanonseqWarning = iStandard || iWgl4;
       
   976 	iLineNumber = 0;
       
   977 	iInputFileName = aFileName;
       
   978 	ifstream input_file;
       
   979 
       
   980 #ifdef __MSVCDOTNET__
       
   981 	input_file.open(iInputFileName, ios::in);
       
   982 #else //!__MSVCDOTNET__
       
   983 	input_file.open(iInputFileName, ios::in | ios::nocreate);
       
   984 #endif //__MSVCDOTNET__
       
   985 
       
   986 	if (input_file.fail())
       
   987 		{
       
   988 		cout << "there are no keys; '" << iInputFileName << "' not found\n";
       
   989 		return;
       
   990 		}
       
   991 	cout << "reading all keys from '" << iInputFileName << "'\n";
       
   992 
       
   993 	char line[1024];
       
   994 	for (;;)
       
   995 		{
       
   996 		if (input_file.eof())
       
   997 			break;
       
   998 		input_file.getline(line,sizeof(line));
       
   999 		iLineNumber++;
       
  1000 
       
  1001 		int code[16];
       
  1002 		int codeCount = 0;
       
  1003 		int key_start = 0;
       
  1004 		int keyCount = 0;
       
  1005 		if (ParseLine(line, code, codeCount, key_start, keyCount)) 
       
  1006 			{
       
  1007 			if (codeCount == 1 && keyCount == 1)
       
  1008 				AddKeyOneToOne(line, code[0], key_start);
       
  1009 			else if (codeCount == 1 && keyCount > 1)
       
  1010 				AddKeyOneToMuch(line, code[0], key_start);
       
  1011 			else if (codeCount > 1 && keyCount > 0)
       
  1012 			AddKeyMuchToMuch(line, code, codeCount, key_start);
       
  1013 			else
       
  1014 				cout << "ignore line: " << line << "\n";
       
  1015 			}
       
  1016 		}
       
  1017 
       
  1018 	input_file.close();
       
  1019 	}
       
  1020 
       
  1021 
       
  1022 // Pack the 3 collation key levels into a single 32-bit integer.
       
  1023 unsigned int Reader::PackKey(const CollationKey& aValue)
       
  1024 	{
       
  1025 	unsigned int level0 = aValue.iLevel[0];
       
  1026 	unsigned int level1 = aValue.iLevel[1];
       
  1027 	if (level1 > 0)
       
  1028 		level1 -= (KLevel1Min - 1);
       
  1029 	unsigned int level2 = aValue.iLevel[2];
       
  1030 	if (level2 > 0)
       
  1031 		level2 -= (KLevel2Min - 1);
       
  1032 	unsigned int key = level0 << 16 | level1 << 8 | level2 << 2;
       
  1033 	if (aValue.iIgnorable)
       
  1034 		key |= 2;
       
  1035 	if (aValue.iStop)
       
  1036 		key |= 1;
       
  1037 	return key;
       
  1038 	}
       
  1039 
       
  1040 // Pack a collation index value into a single 32-bit integer.
       
  1041 int Reader::PackIndex(const CollationIndex& aValue, unsigned int result[2])
       
  1042 	{
       
  1043 	unsigned int code = aValue.iCode;
       
  1044 	unsigned int index = aValue.iIndex;
       
  1045 	if (code <= 0xFFFF)
       
  1046 		{
       
  1047 		result[0] = (code << 16 | index);
       
  1048 		return 1;
       
  1049 		}
       
  1050 	else
       
  1051 		{
       
  1052 		result[0] = (::HighSurrogate(code) << 16 | index);
       
  1053 		result[1] = (::LowSurrogate(code) << 16 | index);
       
  1054 		return 2;
       
  1055 		}
       
  1056 	}
       
  1057 
       
  1058 const Reader* TheReader;
       
  1059 static int CompareStringIndices(const void* aIndex1,const void* aIndex2)
       
  1060 	{
       
  1061 	return TheReader->CompareStringIndices(*(unsigned int*)aIndex1 >> 16,*(unsigned int*)aIndex2 >> 16);
       
  1062 	}
       
  1063 
       
  1064 int CompareUnicodeStrings(const int *aString1,int aLength1,const int *aString2,int aLength2)
       
  1065 	{
       
  1066 	for (int i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++)
       
  1067 		{
       
  1068 		int x = i < aLength1 ? *aString1 : -1;
       
  1069 		int y = i < aLength2 ? *aString2 : -1;
       
  1070 		if (x != y)
       
  1071 			return x - y;
       
  1072 		}
       
  1073 	return 0;
       
  1074 	}
       
  1075 
       
  1076 int Reader::CompareStringIndices(int aIndex1,int aIndex2) const
       
  1077 	{
       
  1078 	return CompareUnicodeStrings(iStringElement + aIndex1 + 1,iStringElement[aIndex1],
       
  1079 								 iStringElement + aIndex2 + 1,iStringElement[aIndex2]);
       
  1080 	}
       
  1081 
       
  1082 void Reader::WriteOutput(const char* aFileName, bool aCopyright)
       
  1083 	{
       
  1084 	int i;
       
  1085 	ofstream output_file;
       
  1086 	output_file.open(aFileName);
       
  1087 	if (output_file.fail())
       
  1088 		{
       
  1089 		cout << "cannot open output file '" << aFileName << "'\n";
       
  1090 		exit(1);
       
  1091 		}
       
  1092 	cout << "writing output to '" << aFileName << "'\n";
       
  1093 
       
  1094 	char *locale = NULL;
       
  1095 	if (iStandard)
       
  1096 		locale = _strdup("Standard");
       
  1097 	else
       
  1098 		locale = _strdup(iLocaleName);
       
  1099 
       
  1100 	if (!iStandard)
       
  1101 		{
       
  1102 		_strlwr(locale);
       
  1103 		locale[0] = (char)toupper(locale[0]);
       
  1104 		if (aCopyright)
       
  1105 			{
       
  1106 			char* capsFileName = new char[strlen(aFileName) + 1];
       
  1107 			strcpy(capsFileName, aFileName);
       
  1108 			_strupr(capsFileName);
       
  1109 			output_file << "/*\n" << capsFileName << "\n\nCopyright (C) 2000-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.\n*/\n";
       
  1110 			delete [] capsFileName;
       
  1111 			output_file << "\n/*\nThe LCharSet object used by the " << locale << " locale.\n";
       
  1112 			output_file << "Generated by COLTAB.\n*/\n";
       
  1113 			}
       
  1114 
       
  1115 		output_file << "\n#include \"ls_std.h\"\n#include <collate.h>\n";
       
  1116 		output_file << "\nconst TUint KUid" << iCPlusPlusIdentifier << "CollationMethod = ";
       
  1117 		if (iUidString)
       
  1118 			output_file << "0x" << iUidString << ";\n";
       
  1119 		else
       
  1120 			{
       
  1121 			output_file << "/* FILL THIS IN */;\n";
       
  1122 			cout << "Warning: File will need editing\nWarning: see coltab /h2 for details.\n";
       
  1123 			}
       
  1124 		}
       
  1125 
       
  1126 	/*
       
  1127 	Write the unique collation keys.
       
  1128 	Each one has the format, going from highest to lowest bit:
       
  1129 
       
  1130 	16 bits:	level-0 key
       
  1131 	8 bits:		level-1 key
       
  1132 	6 bits:		level-2 key
       
  1133 	1 bit:		set if this key is optionally ignorable
       
  1134 	1 bit:		set if this is the last key in the string of keys for a single Unicode value
       
  1135 
       
  1136 	*/
       
  1137 	if (iKeys != 0)
       
  1138 		{
       
  1139 		output_file << "\nstatic const TUint32 The" << iCPlusPlusIdentifier << "Key[] = \n\t{";
       
  1140 		CollationKey* ck = iCollationKey;
       
  1141 		output_file << "\t // " << iKeys << " keys";
       
  1142 		output_file << hex;
       
  1143 		for (i = 0; i < iKeys; i++, ck++)
       
  1144 			{
       
  1145 			unsigned int key = PackKey(*ck);
       
  1146 			if (i % 8 == 0)
       
  1147 				output_file << "\n\t";
       
  1148 			output_file << "0x";
       
  1149 			output_file << key << ",";
       
  1150 			}
       
  1151 		output_file << dec;
       
  1152 		output_file << "\n\t};\n\n";
       
  1153 		}
       
  1154 
       
  1155 	if (iIndices != 0)
       
  1156 		{
       
  1157 		// Sort then write the collation index values - these relate Unicode values to collation keys.
       
  1158 		qsort(iCollationIndex,iIndices,sizeof(CollationIndex),CollationIndex::Compare);
       
  1159 		output_file << "static const TUint32 The" << iCPlusPlusIdentifier << "Index[] = \n\t{";
       
  1160 		CollationIndex* ci = iCollationIndex;
       
  1161 		int entry=0;
       
  1162 		output_file << "\t // " << iIndices << " indices";
       
  1163 		output_file << hex;
       
  1164 		for (i = 0; i < iIndices; i++, ci++, entry++)
       
  1165 			{
       
  1166 			unsigned int key[2];
       
  1167 			int bytecount = PackIndex(*ci, key);
       
  1168 
       
  1169 			if (entry % 8 == 0)
       
  1170 				output_file << "\n\t";
       
  1171 			output_file << "0x";
       
  1172 			output_file << key[0] << ",";
       
  1173 
       
  1174 			if (bytecount == 2)
       
  1175 				{
       
  1176 				entry++;
       
  1177 				if (entry % 8 == 0)
       
  1178 					output_file << "\n\t";
       
  1179 				output_file << "0x";
       
  1180 				output_file << key[1] << ",";
       
  1181 				}
       
  1182 			}
       
  1183 		output_file << dec;
       
  1184 		output_file << "\n\t};";
       
  1185 		output_file << "\t // " << entry << " entries";
       
  1186 		output_file << "\n\n";
       
  1187 		iIndices = entry; //One surrogate pair occupies 2 entries 
       
  1188 		}
       
  1189 
       
  1190 	if (iStringElements)
       
  1191 		{
       
  1192 		// Write the Unicode strings; these are preceded by their lengths.
       
  1193 		output_file << "static const TUint16 The" << iCPlusPlusIdentifier << "StringElement[] = \n\t{";
       
  1194 		output_file << hex;
       
  1195 		for (i = 0; i < iStringElements; i++)
       
  1196 			{
       
  1197 			if (i % 8 == 0)
       
  1198 				output_file << "\n\t";
       
  1199 			output_file << "0x" << iStringElement[i] << ",";
       
  1200 			}
       
  1201 		output_file << dec;
       
  1202 		if (iStringElements==0)
       
  1203 			output_file << "0";
       
  1204 		output_file << "\n\t};\n\n";
       
  1205 
       
  1206 		/*
       
  1207 		Sort then write the string index values - these relate Unicode strings to collation keys.
       
  1208 		Each one has the string index in the upper word and the key index in the lower word.
       
  1209 		*/
       
  1210 		TheReader = this;
       
  1211 		qsort(iStringIndex,iStringIndices,sizeof(iStringIndex[0]),::CompareStringIndices);
       
  1212 		output_file << "static const TUint32 The" << iCPlusPlusIdentifier << "StringIndex[] = \n\t{";
       
  1213 		output_file << hex;
       
  1214 		for (i = 0; i < iStringIndices; i++)
       
  1215 			{
       
  1216 			if (i % 8 == 0)
       
  1217 				output_file << "\n\t";
       
  1218 			output_file << "0x" << iStringIndex[i] << ",";
       
  1219 			}
       
  1220 		output_file << dec;
       
  1221 		if (iStringIndices ==0)
       
  1222 			output_file << "0";
       
  1223 		output_file << "\n\t};\n\n";
       
  1224 		}
       
  1225 
       
  1226 	// Write the collation table structure.
       
  1227 	output_file << "static const TCollationKeyTable The" << iCPlusPlusIdentifier << "Table = \n\t{ ";
       
  1228 	if (iKeys)
       
  1229 		output_file << "The" << iCPlusPlusIdentifier << "Key";
       
  1230 	else
       
  1231 		output_file << "0";
       
  1232 	if (iIndices)
       
  1233 		output_file << ", The" << iCPlusPlusIdentifier << "Index, " << iIndices;
       
  1234 	else
       
  1235 		output_file << ", 0, 0";
       
  1236 	if (iStringElements)
       
  1237 		output_file << ", The" << iCPlusPlusIdentifier << "StringElement, The" << iCPlusPlusIdentifier << "StringIndex, " << iStringIndices << " };\n";
       
  1238 	else
       
  1239 		output_file << ", 0, 0, 0 };\n";
       
  1240 
       
  1241 	if (!iStandard)
       
  1242 		output_file << "\nstatic const TCollationMethod TheCollationMethod[] = \n"\
       
  1243 			"	{\n"\
       
  1244 			"		{\n"\
       
  1245 			"		KUid" << iCPlusPlusIdentifier << "CollationMethod, // the method for the locale\n"\
       
  1246 			"		NULL, // use the standard table as the main table\n"\
       
  1247 			"		&The" << iCPlusPlusIdentifier << "Table, // the locale values override the standard values\n"\
       
  1248 			"		0 // the flags are standard\n"\
       
  1249 			"		},\n"\
       
  1250 			"		{\n"\
       
  1251 			"		KUidBasicCollationMethod, // the standard unlocalised method\n"\
       
  1252 			"		NULL, // null means use the standard table\n"\
       
  1253 			"		NULL, // there's no override table\n"\
       
  1254 			"		0 // the flags are standard\n"\
       
  1255 			"		}\n"\
       
  1256 			"	};\n"\
       
  1257 			"\n"\
       
  1258 			"static const TCollationDataSet TheCollationDataSet =\n"\
       
  1259 			"	{\n"\
       
  1260 			"	TheCollationMethod,\n"\
       
  1261 			"	2\n"\
       
  1262 			"	};"\
       
  1263 			"\n\n"\
       
  1264 			"// The one and only locale character set object.\n"\
       
  1265 			"const LCharSet TheCharSet =\n"\
       
  1266 			"	{\n"\
       
  1267 			"	NULL,\n"\
       
  1268 			"	&TheCollationDataSet\n"\
       
  1269 			"	};\n";
       
  1270 
       
  1271 	output_file.close();
       
  1272 	delete [] locale;
       
  1273 	}
       
  1274 
       
  1275 int CollationIndex::Compare(const void* aIndex1,const void* aIndex2)
       
  1276 	{
       
  1277 	return ((CollationIndex*)aIndex1)->iCode - ((CollationIndex*)aIndex2)->iCode;
       
  1278 	}