locales/loce32/Coltab/Coltab.cpp
changeset 0 05e9090e2422
equal deleted inserted replaced
-1:000000000000 0:05e9090e2422
       
     1 /*
       
     2 * Copyright (c) 1999 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 
       
    18 
       
    19 
       
    20 /*
       
    21 
       
    22 Reads and parses the Unicode collation value table and writes out a C++ source file
       
    23 containing the data in a form that can be used by the EPOC collation system.
       
    24 
       
    25 The program reads three files:
       
    26 
       
    27 1. Base keys (maps single Unicode values to single collation key values): must be in the same format as
       
    28 basekeys.txt, supplied with the Standard Unicode Collation system
       
    29 
       
    30 2. Composite keys (maps single Unicode values to strings of collation keys): must be in the same format as
       
    31 compkeys.txt, supplied with the Standard Unicode Collation system
       
    32 
       
    33 3. Strings (maps strings of Unicode values to single collation keys OR strings of collation keys): must be in the
       
    34 same format as compkeys.txt, except that there can be any number of Unicode characters at the start of the line,
       
    35 space-separated and each exactly 4 hex digits.
       
    36 */
       
    37 
       
    38 #include <assert.h>
       
    39 #include <ctype.h>
       
    40 #include <fstream.h>
       
    41 #include <stdlib.h>
       
    42 #include <string.h>
       
    43 #include <stdio.h>
       
    44 
       
    45 /*
       
    46 Constants constraining the range of level-1 and level-2 keys so that they can be packed.
       
    47 Non-zero values are reduced by one less than the minimum value.
       
    48 */
       
    49 const unsigned int KLevel1Bits = 8;
       
    50 const unsigned int KLevel1Min = 0x20;
       
    51 const unsigned int KLevel1Max = KLevel1Min + (1 << KLevel1Bits) - 2;
       
    52 const unsigned int KLevel2Bits = 6;
       
    53 const unsigned int KLevel2Min = 1;
       
    54 const unsigned int KLevel2Max = KLevel2Min + (1 << KLevel2Bits) - 2;
       
    55 
       
    56 /*
       
    57 Table of characters in the WGL4 set, plus characters in canonical decompositions of
       
    58 those characters, plus commonly used control characters and space characters,
       
    59 given as ranges of Unicode characters. In each pair, the first code is the first in the range,
       
    60 and the second is the first code NOT in the range.
       
    61 
       
    62 The extra characters are added mainly to ensure that control characters and spaces are
       
    63 normally ignored. The extra characters are:
       
    64 
       
    65 0x0000-0x001F: ASCII control characters
       
    66 0x2000-0x2012: spaces, hyphen variants, figure dash
       
    67 0x2028-0x202E: line and paragraph separator, bidirectional control characters
       
    68 0xFEFF		 : byte-order mark
       
    69 0xFFFC-0xFFFD: object replacement character, replacement character
       
    70 */
       
    71 const unsigned short Wgl4Range[] =
       
    72 	{
       
    73 	0x00, 0x7f,		// All ASCII
       
    74 	0xa0, 0x180,		// Non-breaking space, Latin-1, Latin Extended-A
       
    75 	0x192,0x193,		// Latin f with hook
       
    76 	0x1fa,0x200,		// A-ring, a-ring, AE, ae, O slash, o slash all with acute accent
       
    77 	0x2c6,0x2c8,		// non-combining circumflex and caron
       
    78 	0x2c9,0x2ca,		// non-combining macron
       
    79 	0x2d8,0x2dc,		// non-combining breve, dot above, ring above, ogonek
       
    80 	0x2dd,0x2de,		// non-combining double acute
       
    81 	0x300,0x305,		// combining grave, acute, circumflex, tilde, macron
       
    82 	0x306,0x309,		// combining breve, dot above, double dot above
       
    83 	0x30a,0x30e,		// combining ring above, double acute, caron, vertical line above
       
    84 	0x327,0x329,		// combining cedilla, ogonek
       
    85 	0x384,0x38b,		// Greek
       
    86 	0x38c,0x38d,		// Greek
       
    87 	0x38e,0x3a2,		// Greek
       
    88 	0x3a3,0x3cf,		// Greek
       
    89 	0x401,0x40d,		// Cyrillic
       
    90 	0x40e,0x450,		// Cyrillic
       
    91 	0x451,0x45d,		// Cyrillic
       
    92 	0x45e,0x460,		// Cyrillic
       
    93 	0x490,0x492,		// Cyrillic
       
    94 	0x1e80,0x1e86,		// Both W and w with each of grave, acute and diaeresis
       
    95 	0x1ef2,0x1ef4,		// Y with grave, y with grave
       
    96 	0x2000,0x2016,		// various space and horizontal lines
       
    97 	0x2017,0x201f,		//double vertical line, double low line, various quotation marks
       
    98 	0x2020,0x2023,		// dagger, double dagger, bullet
       
    99 	0x2026,0x2027,		//ellipsis
       
   100 	0x2028,0x202F,		// line & paragraph separators and directional formatting
       
   101 	0x2030,0x2031,		// per mille
       
   102 	0x2032,0x2034,		// prime
       
   103 	0x2039,0x203b,		// single angle quotation marks
       
   104 	0x203c,0x203d,		// double exclamation mark
       
   105 	0x203e,0x203f,		// non-combining overscore
       
   106 	0x2044,0x2045,		// fraction slash
       
   107 	0x207f,0x2080,		// superscript n
       
   108 	0x20a3,0x20a5,		// French Franc, Italian/Turkish Lira
       
   109 	0x20a7,0x20a8,		// Spanish Peseta
       
   110 	0x20ac,0x20ad,		// Euro symbol
       
   111 	0x2105,0x2106,		// care of
       
   112 	0x2113,0x2114,		// script l
       
   113 	0x2116,0x2117,		// numero
       
   114 	0x2122,0x2123,		// trade mark
       
   115 	0x2126,0x2127,		// ohm
       
   116 	0x212e,0x212f,		// estimated (net weight)
       
   117 	0x215b,0x215f,		// 1/8, 3/8, 5/8, 7/8
       
   118 	0x2190,0x2196,		// horizontal and vertical arrows
       
   119 	0x21a8,0x21a9,		// up down arrow with base
       
   120 	0x2202,0x2203,		// partial differential
       
   121 	0x2206,0x2207,		// increment (delta)
       
   122 	0x220f,0x2210,		// n-ary product (pi)
       
   123 	0x2211,0x2213,		// n-ary sum (sigma), minus
       
   124 	0x2215,0x2216,		// division (slash)
       
   125 	0x2219,0x221b,		// bullet operator, square root
       
   126 	0x221e,0x2220,		// infinity, right angle
       
   127 	0x2229,0x222a,		// intersection
       
   128 	0x222b,0x222c,		// union
       
   129 	0x2248,0x2249,		// almost equal to
       
   130 	0x2260,0x2262,		// not equal to, identical to
       
   131 	0x2264,0x2266,		// less-than-or-equal-to, greater-than-or-equal-to
       
   132 	0x2302,0x2303,		// house
       
   133 	0x2310,0x2311,		// rversed not sign
       
   134 	0x2320,0x2322,		// top and bottom of integral
       
   135 	0x2500,0x2501,		// box drawing
       
   136 	0x2502,0x2503,		// box drawing
       
   137 	0x250c,0x250d,		// box drawing
       
   138 	0x2510,0x2511,		// box drawing
       
   139 	0x2514,0x2515,		// box drawing
       
   140 	0x2518,0x2519,		// box drawing
       
   141 	0x251c,0x251d,		// box drawing
       
   142 	0x2524,0x2525,		// box drawing
       
   143 	0x252c,0x252d,		// box drawing
       
   144 	0x2534,0x2535,		// box drawing
       
   145 	0x253c,0x253d,		// box drawing
       
   146 	0x2550,0x256d,		// box drawing
       
   147 	0x2580,0x2581,		// block element
       
   148 	0x2584,0x2585,		// block element
       
   149 	0x2588,0x2589,		// block element
       
   150 	0x258c,0x258d,		// block element
       
   151 	0x2590,0x2594,		// block element
       
   152 	0x25a0,0x25a2,		// geometric shapes
       
   153 	0x25aa,0x25ad,		// geometric shapes
       
   154 	0x25b2,0x25b3,		// geometric shapes
       
   155 	0x25ba,0x25bb,		// geometric shapes
       
   156 	0x25bc,0x25bd,		// geometric shapes
       
   157 	0x25c4,0x25c5,		// geometric shapes
       
   158 	0x25ca,0x25cc,		// geometric shapes
       
   159 	0x25cf,0x25d0,		// geometric shapes
       
   160 	0x25d8,0x25da,		// geometric shapes
       
   161 	0x25e6,0x25e7,		// geometric shapes
       
   162 	0x263a,0x263d,		// smilies, sun
       
   163 	0x2640,0x2641,		// female
       
   164 	0x2642,0x2643,		// male
       
   165 	0x2660,0x2661,		// spade
       
   166 	0x2663,0x2664,		// club
       
   167 	0x2665,0x2667,		// heart
       
   168 	0x266a,0x266c,		// quaver, beamed quavers
       
   169 	0xfb01,0xfb03,		// fi, fl ligatures
       
   170 	0xfeff,0xff00,		// zero-width non-breaking space
       
   171 	0xfffc, 0xfffe		// object replacement character and replacement character
       
   172 	};
       
   173 const int Wgl4Ranges = sizeof(Wgl4Range) / sizeof(Wgl4Range[0]) / 2;
       
   174 
       
   175 int CompareWgl4Ranges(const void* aRange1,const void* aRange2)
       
   176 	{
       
   177 	unsigned short* p = (unsigned short*)aRange1;
       
   178 	unsigned short* q = (unsigned short*)aRange2;
       
   179 	if (q[0] == q[1])
       
   180 		{
       
   181 		unsigned short* temp = p;
       
   182 		p = q;
       
   183 		q = temp;
       
   184 		}
       
   185 	if (*p < *q)
       
   186 		return -1;
       
   187 	else if (*p >= q[1])
       
   188 		return 1;
       
   189 	else
       
   190 		return 0;
       
   191 	}
       
   192 
       
   193 // Determine if a character is in the WGL4 character repertoire.
       
   194 static bool InWgl4(unsigned short aChar)
       
   195 	{
       
   196 	unsigned short key[2];
       
   197 	key[0] = key[1] = aChar;
       
   198 	return bsearch(key,Wgl4Range,Wgl4Ranges,sizeof(Wgl4Range[0]) * 2,CompareWgl4Ranges) != NULL;
       
   199 	}
       
   200 
       
   201 // A collation key.
       
   202 class CollationKey
       
   203 	{
       
   204 	public:
       
   205 	bool operator==(const CollationKey& k) const
       
   206 		{ return iLevel[0] == k.iLevel[0] && iLevel[1] == k.iLevel[1] && iLevel[2] == k.iLevel[2] &&
       
   207 		  iIgnorable == k.iIgnorable && iStop == k.iStop; }
       
   208 
       
   209 	enum
       
   210 		{
       
   211 		ELevels = 3
       
   212 		};
       
   213 	int iLevel[ELevels];// the keys at the various levels
       
   214 	bool iIgnorable;	// TRUE if this key can normally be ignored
       
   215 	bool iStop;			// TRUE if this is the last key in a string of keys
       
   216 	};
       
   217 
       
   218 // The collation index for a single Unicode value.
       
   219 class CollationIndex
       
   220 	{
       
   221 	public:
       
   222 	static int Compare(const void* aIndex1,const void* aIndex2);
       
   223 
       
   224 	int iCode;			// Unicode value
       
   225 	int iIndex;			// index into the key table
       
   226 	};
       
   227 
       
   228 class Reader
       
   229 	{
       
   230 	public:
       
   231 	Reader(bool aWgl4,bool aStandard,const char* aLocaleName);
       
   232 	~Reader();
       
   233 	void ReadBaseKeys(const char* aFileName);
       
   234 	void ReadCompKeys(const char* aFileName);
       
   235 	void ReadStrings(const char* aFileName);
       
   236 	void WriteOutput(const char* aFileName);
       
   237 	int CompareStringIndices(int aIndex1,int aIndex2) const;
       
   238 
       
   239 	private:
       
   240 	int Hex(const char *aString,bool aTolerate = false);
       
   241 	void GetCollationKey(const char* aString,CollationKey* aKey = NULL);
       
   242 	void GetMultipleCollationKeys(const char* aString);
       
   243 	unsigned int PackKey(const CollationKey& aValue);
       
   244 	unsigned int PackIndex(const CollationIndex& aValue);
       
   245 	bool ParseLine(const char* aLine,int& aCode,int& aKeyStart);
       
   246 
       
   247 	enum
       
   248 		{
       
   249 		EMaxCollationKeys = 65536,
       
   250 		EMaxCollationIndices = 65536,
       
   251 		EMaxStringElements = 65536,
       
   252 		EMaxStringIndices = 65536
       
   253 		};
       
   254 	CollationKey iCollationKey[EMaxCollationKeys];
       
   255 	int iKeys;
       
   256 	CollationIndex iCollationIndex[EMaxCollationIndices];
       
   257 	int iIndices;
       
   258 	unsigned short iStringElement[EMaxStringElements];
       
   259 	int iStringElements;
       
   260 	unsigned int iStringIndex[EMaxStringIndices];
       
   261 	int iStringIndices;
       
   262 	const char* iInputFileName;
       
   263 	int iLineNumber;
       
   264 	bool iWgl4;				// true if writing keys for wgl4 characters only
       
   265 	bool iStandard;			// true if reading standard files, not tailoring files
       
   266 	const char* iLocaleName;
       
   267 	};
       
   268 
       
   269 void UsageError()
       
   270 	{
       
   271 	cout << "Usage: coltab <locale>\n";
       
   272 	cout << "For the locales 'standard' and 'wgl4' coltab reads basekeys.txt & compkeys.txt\n";
       
   273 	cout << "For any other locale name <name> coltab reads <name>_basekeys.txt,\n";
       
   274 	cout << "<name>_compkeys.txt and <name>_strings.txt.\n";
       
   275 	cout << "The output file is always ls_<name>.cpp.";
       
   276 	exit(1);
       
   277 	}
       
   278 
       
   279 int main(int argc,char** argv)
       
   280 	{
       
   281 	if (argc != 2)
       
   282 		UsageError();
       
   283 	bool wgl4 = false;
       
   284 	const char* prefix = "";
       
   285 	const char* infix = "";
       
   286 	const char* locale = "";
       
   287 	bool standard = false;
       
   288 	if (!_stricmp(argv[1],"standard"))
       
   289 		{
       
   290 		locale = "Standard";
       
   291 		standard = true;
       
   292 		}
       
   293 	else if (!_stricmp(argv[1],"wgl4"))
       
   294 		{
       
   295 		locale = "Wgl4";
       
   296 		wgl4 = true;
       
   297 		standard = true;
       
   298 		}
       
   299 	else
       
   300 		{
       
   301 		locale = prefix = argv[1];
       
   302 		infix = "_";
       
   303 		}
       
   304 
       
   305 	Reader* reader = new Reader(wgl4,standard,locale);
       
   306 	if (!reader)
       
   307 		{
       
   308 		cout << "out of memory\n";
       
   309 		exit(1);
       
   310 		}
       
   311 	char* filename = new char[strlen(prefix) + 64];
       
   312 	sprintf(filename,"%s%scompkeys.txt",prefix,infix);
       
   313 	reader->ReadCompKeys(filename);
       
   314 	if (!standard)
       
   315 		{
       
   316 		sprintf(filename,"%s%sstrings.txt",prefix,infix);
       
   317 		reader->ReadStrings(filename);
       
   318 		}
       
   319 	sprintf(filename,"%s%sbasekeys.txt",prefix,infix);
       
   320 	reader->ReadBaseKeys(filename);
       
   321 	sprintf(filename,"ls_%s.cpp",argv[1]);
       
   322 	reader->WriteOutput(filename);
       
   323 
       
   324 	delete reader;
       
   325 	delete [] filename;
       
   326 	return 0;
       
   327 	}
       
   328 
       
   329 Reader::Reader(bool aWgl4,bool aStandard,const char* aLocaleName):
       
   330 	iKeys(0),
       
   331 	iIndices(0),
       
   332 	iStringElements(0),
       
   333 	iStringIndices(0),
       
   334 	iInputFileName(NULL),
       
   335 	iLineNumber(0),
       
   336 	iWgl4(aWgl4),
       
   337 	iStandard(aStandard),
       
   338 	iLocaleName(aLocaleName)
       
   339 	{
       
   340 	}
       
   341 
       
   342 Reader::~Reader()
       
   343 	{
       
   344 	}
       
   345 
       
   346 // Get a hex number of exactly four digits from aString. Return -1 if none is found and aTolerate is true.
       
   347 int Reader::Hex(const char *aString,bool aTolerate)
       
   348 	{
       
   349 	char *end;
       
   350 	unsigned long x = strtoul(aString,&end,16);
       
   351 	if (end != aString + 4)
       
   352 		{
       
   353 		if (!aTolerate)
       
   354 			{
       
   355 			cout << "bad hex number on line " << iLineNumber << " of file " << iInputFileName << '\n';
       
   356 			exit(1);
       
   357 			}
       
   358 		return -1;
       
   359 		}
       
   360 	return x;
       
   361 	}
       
   362 
       
   363 // Get a collation value from a string of the form [.xxxx.xxxx.xxxx.xxxx]
       
   364 void Reader::GetCollationKey(const char* aString,CollationKey* aKey)
       
   365 	{
       
   366 	if (aString[0] != '[' || aString[21] != ']')
       
   367 		{
       
   368 		cout << "syntax error on line " << iLineNumber << " of file " << iInputFileName << '\n';
       
   369 		exit(1);
       
   370 		}
       
   371 	if (aKey == NULL)
       
   372 		{
       
   373 		if (iKeys >= EMaxCollationKeys)
       
   374 			{
       
   375 			cout << "too many keys";
       
   376 			exit(1);
       
   377 			}
       
   378 		aKey = &iCollationKey[iKeys++];
       
   379 		}
       
   380 	aKey->iIgnorable = aString[1] == '*'; // asterisk means that this character is normally ignored
       
   381 	for (int i = 0; i < CollationKey::ELevels; i++)
       
   382 		aKey->iLevel[i] = Hex(aString + 2 + i * 5);
       
   383 
       
   384 	if (aKey->iLevel[1] > 0 && (aKey->iLevel[1] < KLevel1Min || aKey->iLevel[1] > KLevel1Max))
       
   385 		{
       
   386 		cout << "illegal level-1 key value on line " << iLineNumber << "; outside the range " << KLevel1Min << ".." << KLevel1Max;
       
   387 		exit(1);
       
   388 		}
       
   389 	if (aKey->iLevel[2] > 0 && (aKey->iLevel[2] < KLevel2Min || aKey->iLevel[2] > KLevel2Max))
       
   390 		{
       
   391 		cout << "illegal level-2 key value on line " << iLineNumber << "; outside the range " << KLevel2Min << ".." << KLevel2Max;
       
   392 		exit(1);
       
   393 		}
       
   394 
       
   395 	aKey->iStop = true;
       
   396 	}
       
   397 
       
   398 void Reader::GetMultipleCollationKeys(const char* aString)
       
   399 	{
       
   400 	while (aString[0] == '[')
       
   401 		{
       
   402 		GetCollationKey(aString);
       
   403 		iCollationKey[iKeys - 1].iStop = false;
       
   404 		if (strlen(aString) <= 23)
       
   405 			break;
       
   406 		aString += 23;
       
   407 		}
       
   408 	iCollationKey[iKeys - 1].iStop = true;
       
   409 	}
       
   410 
       
   411 /*
       
   412 Partially parse a line, returning its key code and the start of its first block of key data.
       
   413 Return false if it is not a data line, or not relevant.
       
   414 */
       
   415 bool Reader::ParseLine(const char* aLine,int& aCode,int& aKeyStart)
       
   416 	{
       
   417 	int line_length = strlen(aLine);
       
   418 	aCode = Hex(aLine,true);
       
   419 
       
   420 	/*
       
   421 	A data line must start with a hex number and be at least 27 characters long.
       
   422 	Canonically decomposable Unicode characters are skipped.
       
   423 	Skip non-WGL4 characters if doing WGL4 only.
       
   424 	*/
       
   425 	if (aCode != -1)
       
   426 		{
       
   427 		if (line_length < 27 ||
       
   428 			!strcmp(aLine + line_length - 8,"CANONSEQ") ||
       
   429 			(iWgl4 && !InWgl4((unsigned short)aCode))) 
       
   430 			aCode = -1;
       
   431 		}
       
   432 
       
   433 	if (aCode != -1)
       
   434 		{
       
   435 		aKeyStart = 4;
       
   436 		while (aKeyStart < line_length && aLine[aKeyStart] != '[')
       
   437 			aKeyStart++;
       
   438 		}
       
   439 
       
   440 	return aCode != -1;
       
   441 	}
       
   442 
       
   443 void Reader::ReadBaseKeys(const char* aFileName)
       
   444 	{
       
   445 	iLineNumber = 0;
       
   446 	iInputFileName = aFileName;
       
   447 	ifstream input_file;
       
   448 	input_file.open(iInputFileName,ios::in | ios::nocreate);
       
   449 	if (input_file.fail())
       
   450 		{
       
   451 		cout << "cannot open input file '" << iInputFileName << "'\n";
       
   452 		exit(1);
       
   453 		}
       
   454 	cout << "reading base keys from '" << iInputFileName << "'\n";
       
   455 
       
   456 	char line[1024];
       
   457 	for (;;)
       
   458 		{
       
   459 		input_file.getline(line,sizeof(line));
       
   460 		if (input_file.eof())
       
   461 			break;
       
   462 		iLineNumber++;
       
   463 		if (iLineNumber % 100 == 0)
       
   464 			{
       
   465 			cout << "line " << iLineNumber << '\n';
       
   466 			cout.flush();
       
   467 			}
       
   468 		int code = 0;
       
   469 		int key_start = 0;
       
   470 		if (ParseLine(line,code,key_start)) 
       
   471 			{
       
   472 			if (iIndices >= EMaxCollationIndices)
       
   473 				{
       
   474 				cout << "too many Unicode values";
       
   475 				exit(1);
       
   476 				}
       
   477 			CollationIndex& index = iCollationIndex[iIndices++];
       
   478 			index.iCode = code;
       
   479 			index.iIndex = -1;
       
   480 
       
   481 			/*
       
   482 			First try to find the key in the array of keys found so far.
       
   483 			Search backwards to use the fact that runs of the same key occur together.
       
   484 			*/
       
   485 			CollationKey key;
       
   486 			GetCollationKey(line + key_start,&key);
       
   487 			for (int i = iKeys - 1; i >= 0 && index.iIndex == -1; i--)
       
   488 				if (iCollationKey[i] == key)
       
   489 					index.iIndex = i;
       
   490 
       
   491 			// If that fails, add a new key.
       
   492 			if (index.iIndex == -1)
       
   493 				{
       
   494 				index.iIndex = iKeys++;
       
   495 				if (iKeys > EMaxCollationKeys)
       
   496 					{
       
   497 					cout << "too many keys";
       
   498 					exit(1);
       
   499 					} 
       
   500 				iCollationKey[index.iIndex] = key;
       
   501 				}
       
   502 			}
       
   503 		}
       
   504 
       
   505 	input_file.close();
       
   506 	}
       
   507 
       
   508 void Reader::ReadCompKeys(const char* aFileName)
       
   509 	{
       
   510 	iLineNumber = 0;
       
   511 	iInputFileName = aFileName;
       
   512 	ifstream input_file;
       
   513 	input_file.open(iInputFileName,ios::in | ios::nocreate);
       
   514 	if (input_file.fail())
       
   515 		{
       
   516 		cout << "there are no composite keys; '" << iInputFileName << "' not found\n";
       
   517 		return;
       
   518 		}
       
   519 	cout << "reading composite keys from '" << iInputFileName << "'\n";
       
   520 
       
   521 	char line[1024];
       
   522 	for (;;)
       
   523 		{
       
   524 		input_file.getline(line,sizeof(line));
       
   525 		if (input_file.eof())
       
   526 			break;
       
   527 		iLineNumber++;
       
   528 		if (iLineNumber % 100 == 0)
       
   529 			{
       
   530 			cout << "line " << iLineNumber << '\n';
       
   531 			cout.flush();
       
   532 			}
       
   533 		int code = 0;
       
   534 		int key_start = 0;
       
   535 		if (ParseLine(line,code,key_start)) 
       
   536 			{
       
   537 			if (iIndices >= EMaxCollationIndices)
       
   538 				{
       
   539 				cout << "too many Unicode values";
       
   540 				exit(1);
       
   541 				}
       
   542 			CollationIndex& index = iCollationIndex[iIndices++];
       
   543 			index.iCode = code;
       
   544 			index.iIndex = iKeys;
       
   545 			GetMultipleCollationKeys(line + key_start);
       
   546 			}
       
   547 		}
       
   548 
       
   549 	input_file.close();
       
   550 	}
       
   551 
       
   552 
       
   553 void Reader::ReadStrings(const char* aFileName)
       
   554 	{
       
   555 	iLineNumber = 0;
       
   556 	iInputFileName = aFileName;
       
   557 	ifstream input_file;
       
   558 	input_file.open(iInputFileName,ios::in | ios::nocreate);
       
   559 	if (input_file.fail())
       
   560 		{
       
   561 		cout << "there are no strings; '" << iInputFileName << "' not found\n";
       
   562 		return;
       
   563 		}
       
   564 	cout << "reading strings from '" << iInputFileName << "'\n";
       
   565 
       
   566 	char line[1024];
       
   567 	for (;;)
       
   568 		{
       
   569 		input_file.getline(line,sizeof(line));
       
   570 		if (input_file.eof())
       
   571 			break;
       
   572 		iLineNumber++;
       
   573 		if (iLineNumber % 100 == 0)
       
   574 			{
       
   575 			cout << "line " << iLineNumber << '\n';
       
   576 			cout.flush();
       
   577 			}
       
   578 		int code = 0;
       
   579 		int key_start = 0;
       
   580 		if (ParseLine(line,code,key_start))
       
   581 			{
       
   582 			// Store the index to the Unicode string and the key sequence.
       
   583 			if (iStringIndices > EMaxStringIndices)
       
   584 				{
       
   585 				cout << "too many string indices";
       
   586 				exit(1);
       
   587 				}
       
   588 			iStringIndex[iStringIndices++] = (iStringElements << 16) | iKeys;
       
   589 
       
   590 			// Reserve space for the length.
       
   591 			if (iStringElements >= EMaxStringElements)
       
   592 				{
       
   593 				cout << "too many string elements";
       
   594 				exit(1);
       
   595 				}
       
   596 			iStringElements++;
       
   597 
       
   598 			// Read the Unicode string.
       
   599 			int index = 0;
       
   600 			int length = 0;
       
   601 			while (index < key_start)
       
   602 				{
       
   603 				if (iStringElements >= EMaxStringElements)
       
   604 					{
       
   605 					cout << "too many string elements";
       
   606 					exit(1);
       
   607 					}
       
   608 				iStringElement[iStringElements++] = (unsigned short)Hex(line + index);
       
   609 				index += 5;
       
   610 				length++;
       
   611 				}
       
   612 			iStringElement[iStringElements - length - 1] = (unsigned short)length;
       
   613 
       
   614 			// Read the key sequence.
       
   615 			GetMultipleCollationKeys(line + key_start);
       
   616 			}
       
   617 		}
       
   618 
       
   619 	input_file.close();
       
   620 	}
       
   621 
       
   622 // Pack the 3 collation key levels into a single 32-bit integer.
       
   623 unsigned int Reader::PackKey(const CollationKey& aValue)
       
   624 	{
       
   625 	unsigned int level0 = aValue.iLevel[0];
       
   626 	unsigned int level1 = aValue.iLevel[1];
       
   627 	if (level1 > 0)
       
   628 		level1 -= (KLevel1Min - 1);
       
   629 	unsigned int level2 = aValue.iLevel[2];
       
   630 	if (level2 > 0)
       
   631 		level2 -= (KLevel2Min - 1);
       
   632 	unsigned int key = level0 << 16 | level1 << 8 | level2 << 2;
       
   633 	if (aValue.iIgnorable)
       
   634 		key |= 2;
       
   635 	if (aValue.iStop)
       
   636 		key |= 1;
       
   637 	return key;
       
   638 	}
       
   639 
       
   640 // Pack a collation index value into a single 32-bit integer.
       
   641 unsigned int Reader::PackIndex(const CollationIndex& aValue)
       
   642 	{
       
   643 	unsigned int code = aValue.iCode;
       
   644 	unsigned int index = aValue.iIndex;
       
   645 	return code << 16 | index;
       
   646 	}
       
   647 
       
   648 const Reader* TheReader;
       
   649 static int CompareStringIndices(const void* aIndex1,const void* aIndex2)
       
   650 	{
       
   651 	return TheReader->CompareStringIndices(*(unsigned int*)aIndex1 >> 16,*(unsigned int*)aIndex2 >> 16);
       
   652 	}
       
   653 
       
   654 int CompareUnicodeStrings(const unsigned short *aString1,int aLength1,const unsigned short *aString2,int aLength2)
       
   655 	{
       
   656 	for (int i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++)
       
   657 		{
       
   658 		int x = i < aLength1 ? *aString1 : -1;
       
   659 		int y = i < aLength2 ? *aString2 : -1;
       
   660 		if (x != y)
       
   661 			return x - y;
       
   662 		}
       
   663 	return 0;
       
   664 	}
       
   665 
       
   666 int Reader::CompareStringIndices(int aIndex1,int aIndex2) const
       
   667 	{
       
   668 	return CompareUnicodeStrings(iStringElement + aIndex1 + 1,iStringElement[aIndex1],
       
   669 								 iStringElement + aIndex2 + 1,iStringElement[aIndex2]);
       
   670 	}
       
   671 
       
   672 void Reader::WriteOutput(const char* aFileName)
       
   673 	{
       
   674 	ofstream output_file;
       
   675 	output_file.open(aFileName);
       
   676 	if (output_file.fail())
       
   677 		{
       
   678 		cout << "cannot open output file '" << aFileName << "'\n";
       
   679 		exit(1);
       
   680 		}
       
   681 	cout << "writing output to '" << aFileName << "'\n";
       
   682 
       
   683 	char *locale = NULL;
       
   684 	if (iStandard)
       
   685 		locale = _strdup("Standard");
       
   686 	else
       
   687 		locale = _strdup(iLocaleName);
       
   688 
       
   689 	if (!iStandard)
       
   690 		{
       
   691 		_strupr(locale);
       
   692 		output_file << "/*\nLS_" << locale << ".CPP\n\nCopyright (C) 2000 Symbian Ltd. All rights reserved.\n*/\n";
       
   693 		_strlwr(locale);
       
   694 		locale[0] = (char)toupper(locale[0]);
       
   695 		output_file << "\n/*\nThe LCharSet object used by the " << locale << " locale.\n";
       
   696 		output_file << "Generated by COLTAB.\n*/\n";
       
   697 
       
   698 		output_file << "\n#include \"ls_std.h\"\n#include <collate.h>\n";
       
   699 		output_file << "\nconst TUint KUid" << locale << "CollationMethod = /* FILL THIS IN */;\n";
       
   700 		}
       
   701 
       
   702 	/*
       
   703 	Write the unique collation keys.
       
   704 	Each one has the format, going from highest to lowest bit:
       
   705 
       
   706 	16 bits:	level-0 key
       
   707 	8 bits:		level-1 key
       
   708 	6 bits:		level-2 key
       
   709 	1 bit:		set if this key is optionally ignorable
       
   710 	1 bit:		set if this is the last key in the string of keys for a single Unicode value
       
   711 
       
   712 	*/
       
   713 	if (iKeys != 0)
       
   714 		{
       
   715 		output_file << "\nstatic const TUint32 The" << locale << "Key[] = \n\t{";
       
   716 		CollationKey* ck = iCollationKey;
       
   717 		output_file << hex;
       
   718 		for (int i = 0; i < iKeys; i++, ck++)
       
   719 			{
       
   720 			unsigned int key = PackKey(*ck);
       
   721 			if (i % 8 == 0)
       
   722 				output_file << "\n\t";
       
   723 			output_file << "0x" << key << ",";
       
   724 			}
       
   725 		output_file << dec;
       
   726 		output_file << "\n\t};\n\n";
       
   727 		}
       
   728 	
       
   729 	if (iIndices != 0)
       
   730 		{
       
   731 		// Sort then write the collation index values - these relate Unicode values to collation keys.
       
   732 		qsort(iCollationIndex,iIndices,sizeof(CollationIndex),CollationIndex::Compare);
       
   733 		output_file << "static const TUint32 The" << locale << "Index[] = \n\t{";
       
   734 		CollationIndex* ci = iCollationIndex;
       
   735 		output_file << hex;
       
   736 		for (int i = 0; i < iIndices; i++, ci++)
       
   737 			{
       
   738 			unsigned int key = PackIndex(*ci);
       
   739 			if (i % 8 == 0)
       
   740 				output_file << "\n\t";
       
   741 			output_file << "0x" << key << ",";
       
   742 			}
       
   743 		output_file << dec;
       
   744 		output_file << "\n\t};\n\n";
       
   745 		}
       
   746 
       
   747 	if (iStringElements)
       
   748 		{
       
   749 		// Write the Unicode strings; these are preceded by their lengths.
       
   750 		output_file << "static const TUint16 The" << locale << "StringElement[] = \n\t{";
       
   751 		output_file << hex;
       
   752 		for (int i = 0; i < iStringElements; i++)
       
   753 			{
       
   754 			if (i % 8 == 0)
       
   755 				output_file << "\n\t";
       
   756 			output_file << "0x" << iStringElement[i] << ",";
       
   757 			}
       
   758 		output_file << dec;
       
   759 		if (iStringElements==0)
       
   760 			output_file << "0";
       
   761 		output_file << "\n\t};\n\n";
       
   762 		/*
       
   763 		Sort then write the string index values - these relate Unicode strings to collation keys.
       
   764 		Each one has the string index in the upper word and the key index in the lower word.
       
   765 		*/
       
   766 		TheReader = this;
       
   767 		qsort(iStringIndex,iStringIndices,sizeof(iStringIndex[0]),::CompareStringIndices);
       
   768 		output_file << "static const TUint32 The" << locale << "StringIndex[] = \n\t{";
       
   769 		output_file << hex;
       
   770 		for (i = 0; i < iStringIndices; i++)
       
   771 			{
       
   772 			if (i % 8 == 0)
       
   773 				output_file << "\n\t";
       
   774 			output_file << "0x" << iStringIndex[i] << ",";
       
   775 			}
       
   776 		output_file << dec;
       
   777 		if (iStringIndices ==0)
       
   778 			output_file << "0";
       
   779 		output_file << "\n\t};\n\n";
       
   780 		}
       
   781 
       
   782 	// Write the collation table structure.
       
   783 	output_file << "static const TCollationKeyTable The" << locale << "Table = \n\t{ ";
       
   784 	if (iKeys)
       
   785 		output_file << "The" << locale << "Key";
       
   786 	else
       
   787 		output_file << "0";
       
   788 	if (iIndices)
       
   789 		output_file << ", The" << locale << "Index, " << iIndices;
       
   790 	else
       
   791 		output_file << ", 0, 0";
       
   792 	if (iStringElements)
       
   793 		output_file << ", The" << locale << "StringElement, The" << locale << "StringIndex, " << iStringIndices << " };\n";
       
   794 	else
       
   795 		output_file << ", 0, 0, 0 };\n";
       
   796 
       
   797 	if (!iStandard)
       
   798 		output_file << "\nstatic const TCollationMethod TheCollationMethod[] = \n"\
       
   799 			"	{\n"\
       
   800 			"		{\n"\
       
   801 			"		KUid" << locale << "CollationMethod, // the method for the locale\n"\
       
   802 			"		NULL, // use the standard table as the main table\n"\
       
   803 			"		&The" << locale << "Table, // the locale values override the standard values\n"\
       
   804 			"		0 // the flags are standard\n"\
       
   805 			"		},\n"\
       
   806 			"		{\n"\
       
   807 			"		KUidBasicCollationMethod, // the standard unlocalised method\n"\
       
   808 			"		NULL, // null means use the standard table\n"\
       
   809 			"		NULL, // there's no override table\n"\
       
   810 			"		0 // the flags are standard\n"\
       
   811 			"		}\n"\
       
   812 			"	};\n"\
       
   813 			"\n"\
       
   814 			"static const TCollationDataSet TheCollationDataSet =\n"\
       
   815 			"	{\n"\
       
   816 			"	TheCollationMethod,\n"\
       
   817 			"	2\n"\
       
   818 			"	};"\
       
   819 			"\n\n"\
       
   820 			"// The one and only locale character set object.\n"\
       
   821 			"const LCharSet TheCharSet =\n"\
       
   822 			"	{\n"\
       
   823 			"	NULL,\n"\
       
   824 			"	&TheCollationDataSet\n"\
       
   825 			"	};\n";
       
   826 
       
   827 	output_file.close();
       
   828 	delete [] locale;
       
   829 	}
       
   830 
       
   831 int CollationIndex::Compare(const void* aIndex1,const void* aIndex2)
       
   832 	{
       
   833 	return ((CollationIndex*)aIndex1)->iCode - ((CollationIndex*)aIndex2)->iCode;
       
   834 	}