toolsandutils/e32tools/readtype/readtype.cpp
changeset 0 83f4b4db085c
child 2 99082257a271
equal deleted inserted replaced
-1:000000000000 0:83f4b4db085c
       
     1 // Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     2 // All rights reserved.
       
     3 // This component and the accompanying materials are made available
       
     4 // under the terms of the License "Eclipse Public License v1.0"
       
     5 // which accompanies this distribution, and is available
       
     6 // at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     7 //
       
     8 // Initial Contributors:
       
     9 // Nokia Corporation - initial contribution.
       
    10 //
       
    11 // Contributors:
       
    12 //
       
    13 // Description:
       
    14 // Reads a Unicode character type data file (such as UnicodeData-3.0.0.txt or a file containing locale-specific overrides)
       
    15 // and writes C++ definitions of tables containing the information.
       
    16 // Usage: readtype <input-file> <output-file> { <locale-name> }.
       
    17 // <input-file>: either the standard Unicode character data file (e.g., UnicodeData-3.0.0.txt) or a file containing
       
    18 // overriding information for a certain locale, in the same format as the standard file, but with ranges for which
       
    19 // there is no data given in the form:
       
    20 // 0041;;;;;;;;;;;;;;
       
    21 // 006A;<No Data First>;;;;;;;;;;;;;
       
    22 // FFFF;<No Data Last>;;;;;;;;;;;;;
       
    23 // (in this example, these entries show that there is no overriding data for the character 0041 and range
       
    24 // 006A..FFFF inclusive).
       
    25 // Both single entries with no data and ranges with no data must have nothing in the third field (category).
       
    26 // <output-file>: the C++ source file to be output: this file becomes \e32\unicode\unitable.cpp, or an overriding
       
    27 // file in \e32\lsrc; there are none of these yet.
       
     28 // <locale-name>: an optional name to be inserted into identifiers in the output file: omit this for the standard
       
    29 // data set; use names like 'Turkish', 'Japanese', etc., for locales.
       
    30 // 
       
    31 //
       
    32 
       
    33 
       
    34 #include <stdio.h>
       
    35 #include <stdlib.h>
       
    36 #include <string.h>
       
    37 
       
    38 #ifndef _UNICODE
       
    39 #define _UNICODE
       
    40 #endif
       
    41 
       
    42 #include <unicode.h>
       
    43 
       
// Don't use TUnicodeDataRange from unicode.h: it is for 16-bit values, and is deprecated.
// One entry of the range table (TheRange); entries are appended by add_range().
struct TUnicodeDataRange32	// Only used inside this cpp.
	{
	TUint32 iRangeStart;	// Unicode value of the start of the range of characters
	TInt16 iIndex;			// index into an array of character information structures (-1 means data not available)
	};
       
    50 
       
// Number of planes handled; the index tables below hold 0x10000 entries for each plane.
const int PlaneCount = 17;
// Per-plane trie parameters: filled in by write_trie() and emitted by write_output() as "ThePlanes".
TUnicodePlane ThePlanesInReadType[PlaneCount];
       
    53 
       
// Tables to convert names used in the data file to categories defined in TChar.

// Pairs a category name, as spelled in the data file, with the matching TChar::TCategory value.
struct CatInfo
	{
	const char* iName;			// category name as it appears in the data file
	TChar::TCategory iCat;		// corresponding category defined in TChar
	};

// Name-to-category table; searched linearly by Category().
static const CatInfo TheCatInfo[] =
	{
	{ "Lu", TChar::ELuCategory },
	{ "Ll", TChar::ELlCategory },
	{ "Lt", TChar::ELtCategory },
	{ "Lo", TChar::ELoCategory },
	{ "Lm", TChar::ELmCategory },
	{ "Mn", TChar::EMnCategory },
	{ "Mc", TChar::EMcCategory },
	{ "Me", TChar::EMeCategory },
	{ "Nd", TChar::ENdCategory },
	{ "Nl", TChar::ENlCategory },
	{ "No", TChar::ENoCategory },
	{ "Pc", TChar::EPcCategory },
	{ "Pd", TChar::EPdCategory },
	{ "Ps", TChar::EPsCategory },
	{ "Pe", TChar::EPeCategory },
	{ "Pi", TChar::EPiCategory },
	{ "Pf", TChar::EPfCategory },
	{ "Po", TChar::EPoCategory },
	{ "Sm", TChar::ESmCategory },
	{ "Sc", TChar::EScCategory },
	{ "Sk", TChar::ESkCategory },
	{ "So", TChar::ESoCategory },
	{ "Zs", TChar::EZsCategory },
	{ "Zl", TChar::EZlCategory },
	{ "Zp", TChar::EZpCategory },
	{ "Cc", TChar::ECcCategory },
	{ "Cf", TChar::ECfCategory },
	{ "Cs", TChar::ECsCategory },
	{ "Co", TChar::ECoCategory },
	{ "Cn", TChar::ECnCategory }
	};
// Number of entries in TheCatInfo.
const int TheCategories = sizeof(TheCatInfo) / sizeof(TheCatInfo[0]);
       
    95 
       
// Pairs a bidirectional category name, as spelled in the data file, with the matching
// TChar::TBdCategory value.
struct BdCatInfo
	{
	const char* iName;				// bidirectional category name as it appears in the data file
	TChar::TBdCategory iBdCat;		// corresponding bidirectional category defined in TChar
	};

// Name-to-bidirectional-category table; searched linearly by BdCategory().
static const BdCatInfo TheBdCatInfo[] =
	{
	{ "L", TChar::ELeftToRight },
	{ "LRE", TChar::ELeftToRightEmbedding },
	{ "LRO", TChar::ELeftToRightOverride },
	{ "R", TChar::ERightToLeft },
	{ "AL", TChar::ERightToLeftArabic },
	{ "RLE", TChar::ERightToLeftEmbedding },
	{ "RLO", TChar::ERightToLeftOverride },
	{ "PDF", TChar::EPopDirectionalFormat },
	{ "EN", TChar::EEuropeanNumber },
	{ "ES", TChar::EEuropeanNumberSeparator },
	{ "ET", TChar::EEuropeanNumberTerminator },
	{ "AN", TChar::EArabicNumber },
	{ "CS", TChar::ECommonNumberSeparator },
	{ "NSM", TChar::ENonSpacingMark },
	{ "BN", TChar::EBoundaryNeutral },
	{ "B", TChar::EParagraphSeparator },
	{ "S", TChar::ESegmentSeparator },
	{ "WS", TChar::EWhitespace },
	{ "ON", TChar::EOtherNeutral },
	};
// Number of entries in TheBdCatInfo.
const int TheBdCategories = sizeof(TheBdCatInfo) / sizeof(TheBdCatInfo[0]);
       
   125 
       
// Class derived from TUnicodeData to provide constructor etc.
// Objects of this class are the entries of the character information table (TheData).
class Data: public TUnicodeData
	{
	public:
	// NOTE(review): the definition is not in view here; a comment in main() says the default
	// constructor sets the category to Cn, the bidirectional category to L and everything else to 0.
	Data();
	// Equality test used by add_range() to find an existing identical table entry.
	TBool operator==(const Data& c) const;
	TBool operator!=(const Data& c) const { return !(*this == c); }
	// Write an aggregate initialiser for this object to the output file.
	void Write();
	};
       
   135 
       
// The character information table.
// add_range() appends an entry only when no identical entry is already present.
const int MaxDatas = 1000;
Data TheData[MaxDatas];
int Datas = 0;		// number of entries of TheData in use

// The range table, containing indices to the character information table.
const int MaxRanges = 4000;
TUnicodeDataRange32 TheRange[MaxRanges];
int Ranges = 0;		// number of entries of TheRange in use

// The exhaustive index table, containing indices to the character information table.
// It has 0x110000 entries, i.e. 0x10000 for each of the PlaneCount (17) planes, not just one
// entry per 16-bit value; write_trie() processes it one 0x10000-entry plane at a time.
int TheIndex[0x110000];

// The special tables for characters in the range 0..255.
TUint16 LowerCaseTable[256];
TUint16 FoldTable[256];

// The special table for characters in the range 0xFF00..0xFFFF
TUint16 CjkWidthFoldTable[256];

/*
The composition table.

NOTE(review): an earlier version of this comment described each composition as starting with a
word made up from the composition tag (high byte) and the number of components (low byte). The
code visible in main() instead stores, for each composition that has no tag, three parts in
consecutive TUint32 words: the Unicode value of the composed character, the number of components,
then the Unicode values of the components. compare_decompositions() relies on that layout
(count at index + 1, components from index + 2).

Two tables are created containing the indices of compositions. One of these is sorted by
composed character, one by decomposition. This enables quick conversions to be made in both directions.
*/
const int MaxCompositionWords = 14000;
TUint32 CompositionBuffer[MaxCompositionWords];
int CompositionWords = 0;				// number of words of CompositionBuffer in use
const int MaxCompositions = 8000;
TInt16 Compose[MaxCompositions];		// composition buffer indices, sorted by composed character
TInt16 Decompose[MaxCompositions];		// composition buffer indices, sorted by decomposition
int Compositions = 0;					// number of entries of Compose and Decompose in use
int trie_data[0x110000];					// used to build the trie; write_trie() uses one 0x10000-entry slice per plane

// Input and output streams and their names; set up in main().
FILE *input_file;
FILE *output_file;
const char *input_filename;
const char *output_filename;
       
   177 
       
   178 // Convert a hex string to an integer.
       
   179 static int hex(const char *s)
       
   180 	{
       
   181 	int x = 0;
       
   182 	while (*s)
       
   183 		{
       
   184 		int n = *s;
       
   185 		if (n >= '0' && n <= '9')
       
   186 			n -= '0';
       
   187 		else if (n >= 'A' && n <= 'F')
       
   188 			n -= 'A' - 10;
       
   189 		else if (n >= 'a' && n <= 'f')
       
   190 			n -= 'a' - 10;
       
   191 		else
       
   192 			break;
       
   193 		x = x * 16 + n;
       
   194 
       
   195 		s++;
       
   196 		}
       
   197 	return x;
       
   198 	}
       
   199 
       
   200 static TChar::TCategory Category(const char* aName,bool aWarn)
       
   201 	{
       
   202 	for (int i = 0; i < TheCategories; i++)
       
   203 		if (!strcmp(aName,TheCatInfo[i].iName))
       
   204 			return TheCatInfo[i].iCat;
       
   205 	if (aWarn)
       
   206 		fprintf(stderr,"unknown category %s\n",aName);
       
   207 	return (TChar::TCategory)(-1);
       
   208 	}
       
   209 
       
   210 static TChar::TBdCategory BdCategory(const char* aName,bool aWarn)
       
   211 	{
       
   212 	for (int i = 0; i < TheBdCategories; i++)
       
   213 		if (!strcmp(aName,TheBdCatInfo[i].iName))
       
   214 			return TheBdCatInfo[i].iBdCat;
       
   215 	if (aWarn)
       
   216 		fprintf(stderr,"unknown bidirectional category %s\n",aName);
       
   217 	return (TChar::TBdCategory)(-1);
       
   218 	}
       
   219 
       
   220 // Write an aggregate initialiser for a Data object to the output file.
       
   221 void Data::Write()
       
   222 	{
       
   223 	fprintf(output_file,"{ %d, %d, %d, %d, %d, %d }",
       
   224 			(int)iCategory,
       
   225 			(int)iBdCategory,
       
   226 			(int)iCombiningClass,
       
   227 			(int)iDigitOffset,
       
   228 			(int)iCaseOffset,
       
   229 			(int)iFlags);
       
   230 	}
       
   231 
       
   232 /*
       
   233 Add a new entry to the range table. If the category is the illegal value -1 store -1 as the
       
   234 index; this feature is used when creating character data for specific locales, which mostly
       
   235 consists of ranges for which the data is held in the main table, and is marked in this way
       
   236 as unspecified in the locale table.
       
   237 */
       
   238 void add_range(Data& info,TInt code)
       
   239 	{
       
   240 	// Get an index to the character info; add a new entry if necessary.
       
   241 	int index = -1;
       
   242 	if (info.iCategory != TChar::TCategory(0xFF))
       
   243 		{
       
   244 		for (int i = 0; i < Datas && index == -1; i++)
       
   245 			if (TheData[i] == info)
       
   246 				index = i;
       
   247 		if (index == -1)
       
   248 			{
       
   249 			if (Datas >= MaxDatas)
       
   250 				{
       
   251 				fprintf(stderr,"too many Datas: > %d\n",MaxDatas);
       
   252 				exit(1);
       
   253 				}
       
   254 			TheData[index = Datas++] = info;
       
   255 			}
       
   256 		}
       
   257 
       
   258 	// Add the entry to the range table.
       
   259 	if (Ranges >= MaxRanges)
       
   260 		{
       
   261 		fprintf(stderr,"too many Ranges: > %d, when processing U+%x\n", MaxRanges, code);
       
   262 		exit(1);
       
   263 		}
       
   264 	TheRange[Ranges].iRangeStart = code;
       
   265 	TheRange[Ranges].iIndex = (TInt16)index;
       
   266 	Ranges++;
       
   267 	}
       
   268 
       
   269 // Write a table of "entries" integers each of "entry_size" bytes.
       
   270 int write_table(const void *table,const char *name,
       
   271 				int entries,int input_entry_size,int output_entry_size,
       
   272 				int entry_signed,int entries_per_row,int write_array_size)
       
   273 	{
       
   274 	const char *type = entry_signed ? "TInt" : "TUint";
       
   275 	const int bits = output_entry_size * 8;
       
   276 
       
   277 	/*
       
   278 	There is a choice here whether or not the number of entries in the array is written:
       
   279 	either <name>[<size>] or <name>[] is written. The latter method is used where the header
       
   280 	says <name>[] so that compilers like GCC don't moan about type mismatches.
       
   281 	*/
       
   282 	if (entries == 0)
       
   283 		{
       
   284 		// In case that given plane has no character.
       
   285 		fprintf(output_file,"const %s%d * const %s = NULL;\n",type,bits,name);
       
   286 		return 0;
       
   287 		}
       
   288 	if (write_array_size)
       
   289 		fprintf(output_file,"const %s%d %s[%d] = \n\t{",type,bits,name,entries);
       
   290 	else
       
   291 		fprintf(output_file,"const %s%d %s[] = \n\t{ // %d entries",type,bits,name,entries);
       
   292 
       
   293 	const unsigned char *p = (const unsigned char *)table;
       
   294 	for (int i = 0; i < entries; i++, p += input_entry_size)
       
   295 		{
       
   296 		if (i % entries_per_row == 0)
       
   297 			fprintf(output_file,"\n\t");
       
   298 		if (output_entry_size == 1)
       
   299 			fprintf(output_file,"0x%02x",(int)(*p));
       
   300 		else if (output_entry_size == 2)
       
   301 			fprintf(output_file,"0x%04x",(int)(*((TUint16 *)p)));
       
   302 		else if (output_entry_size == 4)
       
   303 			fprintf(output_file,"0x%08x",(int)(*((TUint32 *)p)));
       
   304 		else
       
   305 			{
       
   306 			fprintf(stderr,"illegal output entry size: %d\n",output_entry_size);
       
   307 			exit(1);
       
   308 			}
       
   309 		if (i < entries - 1)
       
   310 			fputc(',',output_file);
       
   311 		// comment for easy read
       
   312 		//if ((i+1) % entries_per_row == 0)
       
   313 		//	fprintf(output_file, "\t// U+%X-U+%X (%d-%d)", i+1-entries_per_row, i, i+1-entries_per_row, i);
       
   314 		}
       
   315 	fprintf(output_file,"\n\t};\n");
       
   316 
       
   317 	return entries * output_entry_size;
       
   318 	}
       
   319 
       
   320 /*
       
   321 Create and write a trie representing the data in 'aTheIndex'
       
   322 The trie is of two levels, the first level indexed by the high 'aBlockBits' bits of the
       
   323 character code, the second by the low bits. There is one wrinkle; if the index value, which is 16 bits,
       
   324 has its top bit set, it is not an index but the actual data value for all entries in that block.
       
   325 
       
   326 Thus the way to get the value for a code is:
       
   327 
       
   328 int index = trie_index[code >> aBlockBits];
       
   329 if (index & 0x8000)
       
   330 	value = index & ~0x8000;
       
   331 else
       
   332 	value = aTrieData[code & (1 << (16 - aBlockBits))];
       
   333 
       
   334 The data size in bytes is returned.
       
   335 The argument 'aWrite' determines whether the data is written or not.
       
   336 The arguments 'aTrie1Name' and 'aTrie2Name' are used as variable names in generated unitable.cpp.
       
   337 */
       
   338 int write_trie(int aOutputEntrySize,int aBlockBits,bool aWrite, int *aTheIndex, int *aTrieData, char *aTrie1Name, char *aTrie2Name)
       
   339 	{
       
   340 	int n = 0; // number of entries used in trie_data
       
   341 
       
   342 	int block_size = 1 << aBlockBits;
       
   343 	int blocks = 1 << (16 - aBlockBits);
       
   344 
       
   345 	int* trie_index = new int[blocks];
       
   346 	int* block = new int[block_size];
       
   347 
       
   348 	for (int block_index = 0; block_index < blocks; block_index++)
       
   349 		{
       
   350 		// Write the data for the current block.
       
   351 		int block_start = block_index * block_size;
       
   352 		bool all_the_same = true;
       
   353 		for (int code = 0; code < block_size; code++)
       
   354 			{
       
   355 			block[code] = aTheIndex[block_start + code];
       
   356 			if (block[code] != block[0])
       
   357 				all_the_same = false;
       
   358 			}
       
   359 
       
   360 		// Try to find a match for it.
       
   361 		int insert_at;
       
   362 		if (all_the_same)
       
   363 			trie_index[block_index] = block[0] | 0x8000;
       
   364 		else
       
   365 			{
       
   366 			for (insert_at = 0; insert_at < n; insert_at++)
       
   367 				{
       
   368 				int entries = n - insert_at;
       
   369 				if (entries > block_size)
       
   370 					entries = block_size;
       
   371 				int bytes = entries * sizeof(int);
       
   372 				if (memcmp(block,aTrieData + insert_at,bytes) == 0)
       
   373 					break;
       
   374 				}
       
   375 
       
   376 			memcpy(aTrieData + insert_at,block,block_size * sizeof(int));
       
   377 			if (insert_at + block_size > n)
       
   378 				n = insert_at + block_size;
       
   379 			trie_index[block_index] = insert_at;
       
   380 			}
       
   381 		}
       
   382 
       
   383 	if (aWrite)
       
   384 		{
       
   385 		write_table(trie_index,aTrie1Name,blocks,4,2,false,16,true);
       
   386 		write_table(aTrieData,aTrie2Name,n,4,aOutputEntrySize,false,32,true);
       
   387 		}
       
   388 
       
   389 	delete [] trie_index;
       
   390 	delete [] block;
       
   391 
       
   392 	return blocks * 2 + n * aOutputEntrySize;
       
   393 	}
       
   394 
       
   395 // Write the best possible 2-level trie for all planes, trying block sizes of 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096 and 8192
       
   396 // @return Data size in bytes.
       
   397 int write_trie()
       
   398 	{
       
   399 	int byteCount = 0;
       
   400 	for (int plane=0; plane<PlaneCount; plane++)
       
   401 		{
       
   402 		int best_data_size = 1 << 30;
       
   403 		int best_bits = 0;
       
   404 
       
   405 		int outputEntrySize = 2;
       
   406 		char trie1Name[255];
       
   407 		char trie2Name[255];
       
   408 		sprintf(trie1Name, "ThePlane%02dTrieIndex1", plane);
       
   409 		sprintf(trie2Name, "ThePlane%02dTrieIndex2", plane);
       
   410 		int *theIndex = TheIndex + plane * 0x10000;
       
   411 		int *trieData = trie_data + plane * 0x10000;
       
   412 
       
   413 		for (int cur_bits = 3; cur_bits < 14; cur_bits++)
       
   414 			{
       
   415 			int cur_data_size = write_trie(outputEntrySize, cur_bits, false, theIndex, trieData, trie1Name, trie2Name);
       
   416 			if (cur_data_size < best_data_size)
       
   417 				{
       
   418 				best_bits = cur_bits;
       
   419 				best_data_size = cur_data_size;
       
   420 				}
       
   421 			}
       
   422 
       
   423 		byteCount += write_trie(outputEntrySize, best_bits, true, theIndex, trieData, trie1Name, trie2Name);
       
   424 		ThePlanesInReadType[plane].iCodesPerBlock = (TUint8) best_bits;
       
   425 		ThePlanesInReadType[plane].iMaskForCodePoint = (TUint16) ((1 << (best_bits)) - 1);
       
   426 		ThePlanesInReadType[plane].iMaskForBlock = (TUint16) (~(ThePlanesInReadType[plane].iMaskForCodePoint));
       
   427 		}
       
   428 	return byteCount;
       
   429 	}
       
   430 
       
   431 /*
       
   432 Compare entries in the decompose table for the purpose of sorting them. The entries are indices
       
   433 into the starting words of compositions stored in the composition buffer.
       
   434 */
       
   435 int compare_decompositions(const void *p,const void *q)
       
   436 	{
       
   437 	// Get the indexes.
       
   438 	TInt16 index1 = *((const TInt16 *)p);
       
   439 	TInt16 index2 = *((const TInt16 *)q);
       
   440 
       
   441 	// Compare the two composition strings.
       
   442 	return TUnicode::Compare((TUint16 *)&CompositionBuffer[index1 + 2], CompositionBuffer[index1 + 1]*2,
       
   443 							 (TUint16 *)&CompositionBuffer[index2 + 2], CompositionBuffer[index2 + 1]*2);
       
   444 	}
       
   445 
       
   446 // Write the output file.
       
   447 void write_output()
       
   448 	{
       
   449 	int data_bytes = 0;
       
   450 
       
   451 	// Write the comment at the top of the file
       
   452 	fprintf(output_file, "// Copyright (c) 2007-2009 Nokia Corporation and/or its subsidiary(-ies).\n");
       
   453 	fprintf(output_file, "// All rights reserved.\n");
       
   454 	fprintf(output_file, "// This component and the accompanying materials are made available\n");
       
   455 	fprintf(output_file, "// under the terms of the License \"Eclipse Public License v1.0\"\n");
       
   456 	fprintf(output_file, "// which accompanies this distribution, and is available\n");
       
   457 	fprintf(output_file, "// at the URL \"http://www.eclipse.org/legal/epl-v10.html\".\n");
       
   458 	fprintf(output_file, "//\n");
       
   459 	fprintf(output_file, "// Initial Contributors:\n");
       
   460 	fprintf(output_file, "// Nokia Corporation - initial contribution.\n");
       
   461 	fprintf(output_file, "//\n");
       
   462 	fprintf(output_file, "// Contributors:\n");
       
   463 	fprintf(output_file, "//\n");
       
   464 	fprintf(output_file, "// Description:\n");
       
   465 
       
   466 	fprintf(output_file,
       
   467 			"// Unicode character information tables.\n"
       
   468 			"// Written by the READTYPE program.\n"
       
   469 			"// Please read the 'Unicode Character Data and Line Break data Update History.doc' file for detailed history of updates to this file.\n"
       
   470 			"// This file was generated by the READTYPE tool using UCD 5.0.\n"
       
   471 			"// The contents of this file were generated automatically. Please do not edit this manually.\n"
       
   472 			"//\n"
       
   473 			"//\n"
       
   474 			"\n");
       
   475 
       
   476 	// Write the directive to include the header file.
       
   477 	fprintf(output_file,"#include <unicode.h>\n\n");
       
   478 
       
   479 	// Export two variables for unicode.cpp.
       
   480 	fprintf(output_file, "\n");
       
   481 	fprintf(output_file, "// Declarations for tables held in unitable.cpp and used by unicode.cpp.\n");
       
   482 	fprintf(output_file, "extern const TStandardUnicodeDataSet TheStandardUnicodeDataSet[];\n");
       
   483 	fprintf(output_file, "extern const TUnicodePlane ThePlanes[17];\n\n\n");
       
   484 
       
   485 	// Write the trie data.
       
   486 	data_bytes += write_trie();
       
   487 
       
   488 	// Write the character information table.
       
   489 	fprintf(output_file,"static const TUnicodeData TheUnicodeData[] =\n\t{ // %d entries\n", Datas);
       
   490 	int i;
       
   491 	for (i = 0; i < Datas; i++)
       
   492 		{
       
   493 		fputc('\t',output_file);
       
   494 		TheData[i].Write();
       
   495 		if (i < Datas - 1)
       
   496 			fputc(',',output_file);
       
   497 		fprintf(output_file, "\t// 0x%X (%d)", i, i);
       
   498 		fputc('\n',output_file);
       
   499 		}
       
   500 	fprintf(output_file,"\t};\n\n");
       
   501 	data_bytes += Datas * sizeof(Data);
       
   502 
       
   503 	// write plane properties
       
   504 	fprintf(output_file, "const TUnicodePlane ThePlanes[%d] =\n\t{\n", PlaneCount);
       
   505 	int plane;
       
   506 	for (plane=0; plane<=16; plane++)
       
   507 		{
       
   508 		fprintf(output_file, "\t{%d, 0x%04X, 0x%04X }",
       
   509 			ThePlanesInReadType[plane].iCodesPerBlock, ThePlanesInReadType[plane].iMaskForBlock, ThePlanesInReadType[plane].iMaskForCodePoint);
       
   510 		if (plane < 16)
       
   511 			fprintf(output_file, ",\n");
       
   512 		}
       
   513 	fprintf(output_file, "\n\t};\n\n");
       
   514 	data_bytes += 5*PlaneCount;
       
   515 
       
   516 	// Write a data structure referring to the trie data.
       
   517 	fprintf(output_file,"const TStandardUnicodeDataSet TheStandardUnicodeDataSet[] =\n\t{ // %d entries\n", PlaneCount);
       
   518 	for (plane=0; plane<=16; plane++)
       
   519 		{
       
   520 		fprintf(output_file,"\t{ ThePlane%02dTrieIndex1, ThePlane%02dTrieIndex2, TheUnicodeData }", plane, plane);
       
   521 		if (plane < 16)
       
   522 			fprintf(output_file, ",\n");
       
   523 		}
       
   524 	fprintf(output_file, "\n\t};\n\n");
       
   525 	data_bytes += 12*PlaneCount;
       
   526 
       
   527 	// Convert the fold table to lower case.
       
   528 	for (i = 0; i < 256; i++)
       
   529 		FoldTable[i] = LowerCaseTable[FoldTable[i]];
       
   530 
       
   531 	// Make 00A0 (non-break space) fold to space.
       
   532 	FoldTable[0xA0] = 0x20;
       
   533 
       
   534 	// Make unassigned characters in the CJK width fold table fold to themselves.
       
   535 	for (i = 0; i < 256; i++)
       
   536 		if (CjkWidthFoldTable[i] == 0)
       
   537 			CjkWidthFoldTable[i] = (TUint16)(0xFF00 + i);
       
   538 
       
   539 	// Write the special tables
       
   540 	data_bytes += write_table(FoldTable,"TUnicode::FoldTable",256,2,2,false,16,true);
       
   541 	data_bytes += write_table(CjkWidthFoldTable,"TUnicode::CjkWidthFoldTable",256,2,2,false,16,true);
       
   542 
       
   543 	// Write the number of data bytes at the end of the file.
       
   544 	fprintf(output_file,"\n// The tables and structures contain %d bytes of data.\n",data_bytes);
       
   545 	}
       
   546 
       
   547 int main(int argc,char **argv)
       
   548 	{
       
   549 	if (argc < 2)
       
   550 		{
       
   551 		fputs("usage: readtype <input-file> <output-file>",stderr);
       
   552 		exit(1);
       
   553 		}
       
   554 
       
   555 	input_filename = argv[1];
       
   556 	output_filename = argv[2];
       
   557 
       
   558 	// Locale support in previous version is deprecated.
       
   559 
       
   560 	input_file = fopen(input_filename,"r");
       
   561 	if (!input_file)
       
   562 		{
       
   563 		fprintf(stderr,"cannot open input file %s\n",input_filename);
       
   564 		exit(1);
       
   565 		}
       
   566 	output_file = fopen(output_filename,"w");
       
   567 	if (!output_file)
       
   568 		{
       
   569 		fprintf(stderr,"cannot open output file %s\n",output_filename);
       
   570 		exit(1);
       
   571 		}
       
   572 
       
   573 	Data range_info;		// attributes of the current range
       
   574 	Data unassigned_info;	// attributes used for unassigned characters; the default constructor
       
   575 							// sets the category to Cn, bidirectional category to L, everything else to 0.
       
   576 	TBool first = true;
       
   577 	
       
   578 	char line[1024];
       
   579 	const int Fields = 15;
       
   580 	char *field[Fields];
       
   581 	TInt prev_code = 0;
       
   582 	while (fgets(line,sizeof(line),input_file))
       
   583 		{
       
   584 		// Strip trailing newline if any.
       
   585 		int length = strlen(line);
       
   586 		if (length && line[length - 1] == '\n')
       
   587 			line[length - 1] = 0;
       
   588 
       
   589 		// Parse into fields.
       
   590 		int n = 1;
       
   591 		field[0] = line;
       
   592 		for (char *p = line; *p; p++)
       
   593 			if (*p == ';' && n < Fields)
       
   594 				{
       
   595 				*p = 0;
       
   596 				field[n++] = p + 1;
       
   597 				}
       
   598 
       
   599 		// Ignore the line if there is only one field.
       
   600 		if (n == 1)
       
   601 			continue;
       
   602 
       
   603 		// Extract fields of interest.
       
   604 
       
   605 		// Field 0: Unicode value in hexadecimal.
       
   606 		int code = hex(field[0]);
       
   607 
       
   608 		// Field 2: Category.
       
   609 		Data cur_info;
       
   610 		cur_info.iCategory = (TUint8)Category(field[2], true);
       
   611 
       
   612 		// Field 3: Combining class.
       
   613 		cur_info.iCombiningClass = (TUint8)atoi(field[3]);
       
   614 		
       
   615 		// Field 4: Bidirectional category.
       
   616 		cur_info.iBdCategory = (TUint8)BdCategory(field[4], true);
       
   617 
       
   618 		// Prepare to determine the folded version (converted to lower case, stripped of accents).
       
   619 		int folded_code = code;
       
   620 
       
   621 		// Field 5: Character decomposition.
       
   622 		if (field[5][0])
       
   623 			{
       
   624 			int components = 0;
       
   625 			const int MaxComponents = 18;		// FDFA; ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM has 18 components!
       
   626 			TUint32 component[MaxComponents];
       
   627 
       
   628 			// Extract the tag if any.
       
   629 			char *p = field[5];
       
   630 			const char *tag = NULL;
       
   631 			if (field[5][0] == '<')
       
   632 				{
       
   633 				tag = ++p;
       
   634 				while (*p && *p != '>')
       
   635 					p++;
       
   636 				if (!*p)
       
   637 					{
       
   638 					fprintf(stderr,"syntax error: missing > on the line for code %x\n",code);
       
   639 					exit(1);
       
   640 					}
       
   641 				*p++ = 0;
       
   642 				}
       
   643 
       
   644 			// Read the components.
       
   645 			while (*p)
       
   646 				{
       
   647 				while (*p == ' ')
       
   648 					p++;
       
   649 				if (components >= MaxComponents)
       
   650 					{
       
   651 					fprintf(stderr,"decomposition of %x has too many components: increase MaxComponents\n",code);
       
   652 					exit(1);
       
   653 					}
       
   654 				component[components++] = hex(p);
       
   655 				while (*p && *p != ' ')
       
   656 					p++;
       
   657 				}
       
   658 
       
   659 			// Store the composition if it has a null tag and is therefore canonical.
       
   660 			if (tag == NULL)
       
   661 				{
       
   662 				// Put its index into the tables.
       
   663 				if (Compositions >= MaxCompositions)
       
   664 					{
       
   665 					fprintf(stderr,"too many compositions (at code %x): increase MaxCompositions\n",code);
       
   666 					exit(1);
       
   667 					}
       
   668 				if (CompositionWords >= 65535)
       
   669 					{
       
   670 					fprintf(stderr, "too many compositions (at code %x): need 32 bit!?\n", code);
       
   671 					exit(1);
       
   672 					}
       
   673 				Compose[Compositions] = Decompose[Compositions] = (TInt16)CompositionWords;
       
   674 				Compositions++;
       
   675 
       
   676 				// Put it into the composition buffer.
       
   677 				if (CompositionWords + 2 + components >= MaxCompositionWords)
       
   678 					{
       
   679 					fprintf(stderr,"too many compositions (at code %x): increase MaxCompositionWords\n",code);
       
   680 					exit(1);
       
   681 					}
       
   682 				CompositionBuffer[CompositionWords++] = code;
       
   683 				CompositionBuffer[CompositionWords++] = components;
       
   684 				for (int i = 0; i < components; i++)
       
   685 					CompositionBuffer[CompositionWords++] = component[i];
       
   686 				}
       
   687 			
       
   688 			// Store the code used in the ordinary and CJK fold tables.
       
   689 			if (components > 0)
       
   690 				{
       
   691 				if (code < 256)
       
   692 					{
       
   693 					if (tag == NULL)
       
   694 						folded_code = component[0];
       
   695 					}
       
   696 				else if (code >= 0xFF00 && code <= 0xFFEE)	// tag will always be <wide> or <narrow>
       
   697 					folded_code = component[0];
       
   698 				}
       
   699 			}
       
   700 
       
   701 		// Field 8. Numeric value.
       
   702 		if (field[8][0])
       
   703 			{
       
   704 			if (field[8][1] == '/' || field[8][2] == '/')		// fractions
       
   705 				cur_info.iFlags |= TUnicodeData::EFraction;
       
   706 			else
       
   707 				{
       
   708 				int value = atoi(field[8]);
       
   709 				if (value >= 0 && value <= 255)
       
   710 					{
       
   711 					cur_info.iDigitOffset = (TUint8)((value - (code & 255)) & 255);
       
   712 					cur_info.iFlags |= TUnicodeData::ESmallNumeric;
       
   713 					}
       
   714 				else if (value == 500)
       
   715 					cur_info.iFlags |= TUnicodeData::EFiveHundred;
       
   716 				else if (value == 1000)
       
   717 					cur_info.iFlags |= TUnicodeData::EOneThousand;
       
   718 				else if (value == 5000)
       
   719 					cur_info.iFlags |= TUnicodeData::EFiveThousand;
       
   720 				else if (value == 10000)
       
   721 					cur_info.iFlags |= TUnicodeData::ETenThousand;
       
   722 				else if (value == 100000)
       
   723 					cur_info.iFlags |= TUnicodeData::EHundredThousand;
       
   724 				else
       
   725 					fprintf(stderr,"Warning: U+%X has a large numeric property with unrepresentable value %d. Ignored.\n",code,value);
       
   726 				}
       
   727 			}
       
   728 
       
   729 		// Field 9: Mirrored property.
       
   730 		if (field[9][0] == 'Y')
       
   731 			cur_info.iFlags |= TUnicodeData::EMirrored;
       
   732 
       
   733 		// Fields 12, 13, 14: Case variants.
       
   734 		int uc = code, lc = code, tc = code;
       
   735 		if (field[12][0])
       
   736 			{
       
   737 			uc = hex(field[12]);
       
   738 			int uc_offset = uc - code;
       
   739 			if (abs(uc_offset) > 32767)
       
   740 				{
       
   741 				fprintf(stderr, "Warning: offset to upper case is too large: code %X, upper case %X, offset %X. Ignored!\n", code, uc, uc_offset);
       
   742 				}
       
   743 			else
       
   744 				{
       
   745 				cur_info.iFlags |= TUnicodeData::EHasUpperCase;
       
   746 				cur_info.iCaseOffset = (TInt16)(-uc_offset);
       
   747 				if (code<0x10000 && uc>0x10000 || code>0x10000 && uc<0x10000)
       
   748 					fprintf(stderr, "Info: %X and its upper case %X locate at different planes.\n");
       
   749 				}
       
   750 			}
       
   751 		if (field[13][0])
       
   752 			{
       
   753 			lc = hex(field[13]);
       
   754 			int lc_offset = lc - code;
       
   755 			if (abs(lc_offset) > 32767)
       
   756 				{
       
   757 				fprintf(stderr, "Warning: offset to lower case is too large: code %X, lower case %X, offset %X. Ignored!\n", code, lc, lc_offset);
       
   758 				}
       
   759 			else
       
   760 				{
       
   761 				cur_info.iFlags |= TUnicodeData::EHasLowerCase;
       
   762 				cur_info.iCaseOffset = (TInt16)lc_offset;
       
   763 				if (code<0x10000 && lc>0x10000 || code>0x10000 && lc<0x10000)
       
   764 					fprintf(stderr, "Info: %X and its lower case %X locate at different planes.\n");
       
   765 				}
       
   766 			}
       
   767 		if (field[14][0])
       
   768 			tc = hex(field[14]);
       
   769 		if (tc != lc && tc != uc)
       
   770 			cur_info.iFlags |= TUnicodeData::EHasTitleCase;
       
   771 
       
   772 		// If this code is < 256 fill in the entries in the special tables.
       
   773 		if (code < 256)
       
   774 			{
       
   775 			LowerCaseTable[code] = (TUint16)lc;
       
   776 			FoldTable[code] = (TUint16)folded_code;
       
   777 			}
       
   778 
       
   779 		// If the code is >= 0xFF00 fill in the entry in the CJK width folding table.
       
   780 		else if (code >= 0xFF00 && code <= 0xFFFF)
       
   781 			CjkWidthFoldTable[code & 0xFF] = (TUint16)folded_code;
       
   782 
       
   783 		/*
       
   784 		If there was a gap between this code and the previous one, write an 'unassigned' range,
       
   785 		unless this character is actually the end of a range not fully listed (like the CJK ideographs
       
   786 		from 4E00 to 9FA5 inclusive), in which case the character name will end in ' Last>'.
       
   787 		*/
       
   788 		if (code - prev_code > 1)
       
   789 			{
       
   790 			TBool last_in_range = false;
       
   791 			int name_length = strlen(field[1]);
       
   792 			if (name_length >= 6 && !strcmp(field[1] + name_length - 6," Last>"))
       
   793 				last_in_range = TRUE;
       
   794 			if (!last_in_range)
       
   795 				{
       
   796 				add_range(unassigned_info,prev_code + 1);
       
   797 				range_info = unassigned_info;
       
   798 				}
       
   799 			}
       
   800 
       
   801 		// Write the range.
       
   802 		if (first || cur_info != range_info)
       
   803 			{
       
   804 			add_range(cur_info,code);
       
   805 			range_info = cur_info;
       
   806 			}
       
   807 
       
   808 		first = false;
       
   809 		prev_code = code;
       
   810 		}
       
   811 
       
   812 	/*
       
   813 	If there was a gap at the end of the encoding (there is at present; FFFE and FFFF are not Unicode characters)
       
   814 	write an 'unassigned' range.
       
   815 	*/
       
   816 	if (prev_code < 0xFFFF)
       
   817 		add_range(unassigned_info,prev_code + 1);
       
   818 
       
   819 	// Write an array of indices from Unicode character values to character data sets.
       
   820 	for (int i = 0; i < Ranges; i++)
       
   821 		{
       
   822 		TUint32 end = i < Ranges - 1 ? TheRange[i + 1].iRangeStart : 0x110000;
       
   823 		for (TUint32 j = TheRange[i].iRangeStart; j < end; j++)
       
   824 			TheIndex[j] = TheRange[i].iIndex;
       
   825 		}
       
   826 
       
   827 	// Write the output file.
       
   828 	write_output();
       
   829 	printf("\nDone.\n");
       
   830 
       
   831 	return 0;
       
   832 	}
       
   833 
       
   834 Data::Data()
       
   835 	{
       
   836 	iCategory = TChar::ECnCategory;
       
   837 	iBdCategory = TChar::ELeftToRight;
       
   838 	iCombiningClass = 0;
       
   839 	iDigitOffset = 0;
       
   840 	iCaseOffset = 0;
       
   841 	iFlags = 0;
       
   842 	}
       
   843 
       
   844 TBool Data::operator==(const Data& c) const
       
   845 	{
       
   846 	return iCategory == c.iCategory &&
       
   847 		   iBdCategory == c.iBdCategory &&
       
   848 		   iCombiningClass == c.iCombiningClass &&
       
   849 		   iDigitOffset == c.iDigitOffset &&
       
   850 		   iCaseOffset == c.iCaseOffset &&
       
   851 		   iFlags == c.iFlags;
       
   852 	}
       
   853 
       
   854 /*
       
   855 This function is copied from unicode.cpp: having it here saves me having to link in unicode.cpp and
       
   856 unitable.cpp, which is probably the file we're trying to write!
       
   857 */
       
   858 TInt TUnicode::Compare(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2)
       
   859 	{
       
   860 	for (TInt i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++)
       
   861 		{
       
   862 		TInt x = i < aLength1 ? *aString1 : -1;
       
   863 		TInt y = i < aLength2 ? *aString2 : -1;
       
   864 		if (x != y)
       
   865 			return x - y;
       
   866 		}
       
   867 	return 0;
       
   868 	}