kernel/eka/euser/unicode/perl/FoldAndDecompTables.pl
changeset 0 a41df078684a
equal deleted inserted replaced
-1:000000000000 0:a41df078684a
       
     1 # Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     2 # All rights reserved.
       
     3 # This component and the accompanying materials are made available
       
     4 # under the terms of the License "Eclipse Public License v1.0"
       
     5 # which accompanies this distribution, and is available
       
     6 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     7 #
       
     8 # Initial Contributors:
       
     9 # Nokia Corporation - initial contribution.
       
    10 #
       
    11 # Contributors:
       
    12 #
       
    13 # Description:
       
    14 # Creates C++ code describing how to decompose, compose and fold each character.
       
    15 # Usage:
       
    16 # perl -w FoldAndDecompTables.pl < <output-from-UnicodeMaxDecompose>
       
    17 # Tables we want to create:
       
    18 # A: Ordered list of non-excluded decompositions
       
    19 # B: List of folded decompositions matching A
       
    20 # C: List of decompositions not listed in A of length > 1
       
    21 # D: List of folded decompositions matching C
       
    22 # E: List of decompositions of length = 1 whose matching folded decompositions
       
    23 # are of length > 1
       
    24 # F: List of folded decompositions matching E
       
    25 # G: List of decompositions of length = 1 with matching folded decompositions
       
    26 # H: List of folded decompostions matching G
       
    27 # I: List of folded decompositions that do not have matching decompositions
       
    28 # J: List of decompositions (folding and otherwise) of length > 2
       
    29 # K: Hash table mapping Unicode value to its folded decomposition value in the
       
    30 # concatenated list B-D-F-H-I
       
    31 # L: List of hash slots in K matching A (providing a mapping from non-excluded
       
    32 # decompositions to Unicode value)
       
    33 # [all lengths are of UTF16 strings]
       
    34 # 
       
    35 #
       
    36 
       
    37 use strict;
       
    38 
       
    39 #
       
    40 # Hash table:
       
    41 #
       
    42 
       
    43 # Size of hashing table = 1 to the power $LgHashTableSize
       
    44 my $LgHashTableSize = 12;
       
    45 
       
    46 # Do not change these next two values!
       
    47 my $HashTableSize = 1 << $LgHashTableSize;
       
    48 my $HashTableBitmaskCpp = sprintf('0x%x', $HashTableSize - 1);
       
    49 
       
    50 # Hashing function in Perl: Getting the initial search position
       
    51 sub HashStart
       
    52 	{
       
    53 	return $_[0] & ($HashTableSize - 1);
       
    54 	}
       
    55 # How far to step through each time
       
    56 sub HashStep
       
    57 	{
       
    58 	my ($code) = @_;
       
    59 	$code *= $code >> $LgHashTableSize;
       
    60 	return ($code * 2 + 1) & ($HashTableSize - 1);
       
    61 	}
       
    62 
       
    63 # Make sure input string is all hex numbers separated by single spaces with
       
    64 # each hex number having 4 digits and decomposed into UTF16
       
    65 sub Normalize
       
    66 	{
       
    67 	my ($string) = @_;
       
    68 	if ($string =~ /^([0-9A-F]{4}( [0-9A-F]{4})*)?$/)
       
    69 		{
       
    70 		return $string;
       
    71 		}
       
    72 	my $norm = '';
       
    73 	foreach my $elt (split(' ', $string))
       
    74 		{
       
    75 		if ($elt)
       
    76 			{
       
    77 			die "'$elt' is not a hex number"
       
    78 				unless $elt =~ /[0-9a-fA-F]+/;
       
    79 			$norm = $norm.' '
       
    80 				unless $norm eq '';
       
    81 			$elt = hex $elt;
       
    82 			if ($elt < 0x10000)
       
    83 				{
       
    84 				$norm = $norm.(sprintf('%04X', $elt));
       
    85 				}
       
    86 			else
       
    87 				{
       
    88 				# Add a surrogate pair
       
    89 				$norm = $norm.(sprintf('%04X %04X',
       
    90 					($elt / 0x400) + 0xD7C0, ($elt % 0x400) + 0xDC00));
       
    91 				}
       
    92 			}
       
    93 		}
       
    94 	#print STDERR "'$string' normalized to '$norm'\n";
       
    95 	return $norm;
       
    96 	}
       
    97 
       
    98 # First stage:
       
    99 # Hash of Unicode values to normalised decomposition and folded strings
       
   100 my %Decomp = ();
       
   101 my %Folded = ();
       
   102 # Mapping from decomposition->char, if not excluded
       
   103 my %Composition = ();
       
   104 # characters with non-excluded decompositions
       
   105 my @IncludedDecomps = ();
       
   106 # characters with long (>1 UTF16) excluded decompositions
       
   107 my @LongExcludedDecomps = ();
       
   108 # characters with singleton decompositions but long folds
       
   109 my @ShortDecompsLongFolds = ();
       
   110 # characters with singleton folds and singleton
       
   111 my @ShortDecompsShortFolds = ();
       
   112 # characters with singleton folds but no decomps
       
   113 my @ShortFoldsOnly = ();
       
   114 
       
   115 # A mapping from decompositions of length greater than two
       
   116 # to the code that produced them.
       
   117 my %VeryLongDecompositions = ();
       
   118 
       
   119 # A list of characters containing all decompositions of length >2 as slices
       
   120 my @VeryLongDecompData = ();
       
   121 # Mapping from decomposition->index into VeryLongDecompData
       
   122 my %VeryLongDecompMap = ();
       
   123 
       
   124 # There will be a hash table mapping Unicode values to indices into the other
       
   125 # tables. %Index maps the same thing in Perl.
       
   126 my %Index = ();
       
   127 # %HashTableEntryContents maps the table entries to the Unicode values they
       
   128 # contain.
       
   129 my %HashTableEntryContents = ();
       
   130 # %HashTableEntry maps Unicode value to the entry in the hash table
       
   131 my %HashTableEntry = ();
       
   132 
       
   133 # Bind a unicode value to an index into the tables
       
   134 sub AddHashValue
       
   135 	{
       
   136 	my ($unicode, $index) = @_;
       
   137 	$Index{$unicode} = $index;
       
   138 	my $pos = HashStart($unicode);
       
   139 	my $step = HashStep($unicode);
       
   140 	while (exists $HashTableEntryContents{$pos})
       
   141 		{
       
   142 		$pos += $step;
       
   143 		if ($HashTableSize <= $pos)
       
   144 			{
       
   145 			$pos %= $HashTableSize;
       
   146 			}
       
   147 		}
       
   148 	$HashTableEntryContents{$pos} = $unicode;
       
   149 	$HashTableEntry{$unicode} = $pos;
       
   150 	}
       
   151 
       
   152 # Bind a whole array to the indices starting from that given as the first
       
   153 # argument. Returns the index of the next slot to be filled.
       
   154 sub AddListToHash
       
   155 	{
       
   156 	my ($index, @unicodes) = @_;
       
   157 	while (@unicodes)
       
   158 		{
       
   159 		AddHashValue(shift @unicodes, $index);
       
   160 		$index++;
       
   161 		}
       
   162 	return $index;
       
   163 	}
       
   164 
       
   165 # put the results of a read line into the data structures
       
   166 sub AddCode
       
   167 	{
       
   168 	my ($code, $excluded, $decomposition, $folded) = @_;
       
   169 	return if ($decomposition eq '' && $folded eq '');
       
   170 	$Decomp{$code} = $decomposition;
       
   171 	$Folded{$code} = $folded;
       
   172 
       
   173 	if (!$excluded && $decomposition ne '')
       
   174 		{
       
   175 		push @IncludedDecomps, $code;
       
   176 		$Composition{$decomposition} = $code;
       
   177 		}
       
   178 	elsif (4 < length $decomposition)
       
   179 		{
       
   180 		push @LongExcludedDecomps, $code;
       
   181 		}
       
   182 	elsif (4 < length $folded)
       
   183 		{
       
   184 		push @ShortDecompsLongFolds, $code;
       
   185 		}
       
   186 	elsif ($decomposition ne '')
       
   187 		{
       
   188 		push @ShortDecompsShortFolds, $code;
       
   189 		}
       
   190 	elsif ($folded ne '')
       
   191 		{
       
   192 		push @ShortFoldsOnly, $code;
       
   193 		}
       
   194 
       
   195 	$VeryLongDecompositions{$decomposition} = $code
       
   196 		if (9 < length $decomposition);
       
   197 	$VeryLongDecompositions{$folded} = $code
       
   198 		if (9 < length $folded);
       
   199 	}
       
   200 
       
   201 if (scalar(@ARGV) != 0)
       
   202 	{
       
   203 	print (STDERR "Usage:\nperl -w FoldAndDecompTables.pl < <output-from-UnicodeMaxDecompose>\n");
       
   204 	exit 1;
       
   205 	}
       
   206 
       
   207 my $lineNo = 0;
       
   208 my $inBlock = 0;
       
   209 while(<STDIN>)
       
   210 	{
       
   211 	$lineNo++;
       
   212 	if (/^(1?[0-9a-fA-F]{4,5});([^;]*);.*symbian:(E?);[^;]*;([0-9a-fA-F \t]*);([0-9a-fA-F \t]*)[ \t]*$/i)
       
   213 		{
       
   214 		my $code = hex $1;
       
   215 		my $description = $2;
       
   216 		my $excluded = $3;
       
   217 		my $decomposition = Normalize($4);
       
   218 		my $folded = Normalize($5);
       
   219 
       
   220 		die ("Value $1 too large to be Unicode at line $lineNo.")
       
   221 			if (0x110000 <= $code);
       
   222 
       
   223 		die("Normalisation failed with '$decomposition' at line $lineNo.")
       
   224 			unless (length $decomposition) == 0 || (length $decomposition) % 5 == 4;
       
   225 		die("Normalisation failed with '$folded' at line $lineNo.")
       
   226 			unless (length $folded) == 0 || (length $folded) % 5 == 4;
       
   227 
       
   228 		AddCode($code, $excluded, $decomposition, $folded);
       
   229 
       
   230 		if ($description =~ /^<.*Last>$/i)
       
   231 			{
       
   232 			die("End of block without start at line $lineNo!")
       
   233 				if !$inBlock;
       
   234 			while ($inBlock <= $code)
       
   235 				{
       
   236 				AddCode($inBlock, $excluded, $decomposition, $folded);
       
   237 				$inBlock++;
       
   238 				}
       
   239 			$inBlock = 0;
       
   240 			}
       
   241 		elsif ($description =~ /^<.*First>$/i)
       
   242 			{
       
   243 			die("Block within block at line $lineNo!")
       
   244 				if $inBlock;
       
   245 			$inBlock = $code + 1;
       
   246 			}
       
   247 		}
       
   248 	elsif (!/^[ \t]*$/)
       
   249 		{
       
   250 		die("Did not understand line $lineNo.");
       
   251 		}
       
   252 	}
       
   253 
       
   254 # We need to construct the data for the table of decompositions of length > 2.
       
   255 foreach my $decomp (sort {length $::b <=> length $::a} keys %VeryLongDecompositions)
       
   256 	{
       
   257 	if (!exists $VeryLongDecompMap{$decomp})
       
   258 		{
       
   259 		# Does not already exist
       
   260 		my $newPos = scalar @VeryLongDecompData;
       
   261 		$VeryLongDecompMap{$decomp} = $newPos;
       
   262 		foreach my $code (split(' ', $decomp))
       
   263 			{
       
   264 			push @VeryLongDecompData, $code;
       
   265 			}
       
   266 		while ($decomp =~ /^([0-9A-F]{4}( [0-9A-F]{4}){2,}) [0-9A-F]{4}$/)
       
   267 			{
       
   268 			$decomp = $1;
       
   269 			$VeryLongDecompMap{$decomp} = $newPos;
       
   270 			}
       
   271 		}
       
   272 	}
       
   273 
       
   274 # We need to sort the codes for included decompositions into lexicographic
       
   275 # order of their decompositions.
       
   276 # This, luckily, is the same as sorting the strings that represent their
       
   277 # decompositions in hex lexicographically.
       
   278 @IncludedDecomps = sort {$Decomp{$::a} cmp $Decomp{$::b}} @IncludedDecomps;
       
   279 
       
   280 print (STDERR 'Included: ', scalar(@IncludedDecomps), "\nLong: ", scalar(@LongExcludedDecomps));
       
   281 print(STDERR "\nLongFolds: ", scalar(@ShortDecompsLongFolds), "\nShort: ", scalar(@ShortDecompsShortFolds));
       
   282 print(STDERR "\nShortFoldsOnly: ", scalar(@ShortFoldsOnly), "\nTOTAL: ");
       
   283 print STDERR (scalar(@IncludedDecomps) + scalar(@LongExcludedDecomps) + scalar(@ShortDecompsLongFolds) + scalar(@ShortDecompsShortFolds) + scalar(@ShortFoldsOnly));
       
   284 print STDERR "\n";
       
   285 
       
   286 # Analyse the hash table to find out the maximum and average time
       
   287 # taken to find each ASCII character
       
   288 my $maxAsciiTime = 0;
       
   289 my $totalAsciiTime = 0;
       
   290 my $mostDifficultCode = undef;
       
   291 my $asciiFoundWithoutStepCount = 0;
       
   292 for (32..126)
       
   293 	{
       
   294 	my $code = $_;
       
   295 	my $pos = HashStart($code);
       
   296 	my $step = HashStep($code);
       
   297 	my $stepCount = 1;
       
   298 	if ($HashTableEntry{$code})
       
   299 		{
       
   300 		my $posRequired = $HashTableEntry{$code};
       
   301 		while ($pos != $posRequired)
       
   302 			{
       
   303 			$pos = ($pos + $step) % $HashTableSize;
       
   304 			$stepCount++;
       
   305 			}
       
   306 		}
       
   307 	$totalAsciiTime += $stepCount;
       
   308 	if ($maxAsciiTime < $stepCount)
       
   309 		{
       
   310 		$maxAsciiTime = $stepCount;
       
   311 		$mostDifficultCode = $code;
       
   312 		}
       
   313 	if ($stepCount == 1)
       
   314 		{
       
   315 		$asciiFoundWithoutStepCount++;
       
   316 		}
       
   317 	}
       
   318 printf (STDERR "Average ASCII search: %f\n", $totalAsciiTime / 95);
       
   319 printf (STDERR "Maximum ASCII search %d for %x: '%c'.\n", $maxAsciiTime, $mostDifficultCode, $mostDifficultCode);
       
   320 
       
   321 # Now we populate the hash table
       
   322 my $index = 0;
       
   323 
       
   324 $index = AddListToHash($index, @IncludedDecomps);
       
   325 my $hashIndexAfterIncludedDecomps = $index;
       
   326 printf (STDERR "after IncludedDecomps index= %d\n", $hashIndexAfterIncludedDecomps);
       
   327 
       
   328 $index = AddListToHash($index, @LongExcludedDecomps);
       
   329 my $hashIndexAfterLongExcludeDecomps = $index;
       
   330 printf (STDERR "after LongExcludedDecomps index= %d\n", $hashIndexAfterLongExcludeDecomps);
       
   331 
       
   332 $index = AddListToHash($index, @ShortDecompsLongFolds);
       
   333 my $hashIndexAfterShortDecompsLongFolds = $index;
       
   334 printf (STDERR "after ShortDecompsLongFolds index= %d\n", $hashIndexAfterShortDecompsLongFolds);
       
   335 
       
   336 $index = AddListToHash($index, @ShortDecompsShortFolds);
       
   337 my $hashIndexAfterShortDecompsShortFolds = $index;
       
   338 printf (STDERR "after ShortDecompsShortFolds index= %d\n", $hashIndexAfterShortDecompsShortFolds);
       
   339 
       
   340 $index = AddListToHash($index, @ShortFoldsOnly);
       
   341 my $hashIndexAfterShortFoldsOnly = $index;
       
   342 printf (STDERR "after ShortFoldsOnly index= %d\n", $hashIndexAfterShortFoldsOnly);
       
   343 
       
   344 #
       
   345 # Output C++ File
       
   346 #
       
   347 my $totalBytes = 0;
       
   348 
       
   349 print "// Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).\n";
       
   350 print "// All rights reserved.\n";
       
   351 print "// This component and the accompanying materials are made available\n";
       
   352 print "// under the terms of the License \"Eclipse Public License v1.0\"\n";
       
   353 print "// which accompanies this distribution, and is available\n";
       
   354 print "// at the URL \"http://www.eclipse.org/legal/epl-v10.html\".\n";
       
   355 print "//\n";
       
   356 print "// Initial Contributors:\n";
       
   357 print "// Nokia Corporation - initial contribution.\n";
       
   358 print "//\n";
       
   359 print "// Contributors:\n";
       
   360 print "//\n";
       
   361 print "// Description:\n";
       
   362 print "//\n";
       
   363 print "// Fold and decomposition tables.\n";
       
   364 print "//\n";
       
   365 print "// These tables are linked in the following way:\n";
       
   366 print "// KUnicodeToIndexHash is a hash table using double hashing for\n";
       
   367 print "// conflict resolution. The functions DecompositionHashStart and\n";
       
   368 print "// DecompositionHashStep give the start and step values for accessing\n";
       
   369 print "// the table. The first probe is at DecompositionHashStart and each\n";
       
   370 print "// subsequent probe is offset by DecompositionHashStep. Probes\n";
       
   371 print "// continue until either 0 is found (indicating that the Unicode value\n";
       
   372 print "// sought has no decompostion (i.e. decomposes to itself)) or a value\n";
       
   373 print "// is found that has the sought Unicode value in its lower 20 bits.\n";
       
   374 print "//\n";
       
   375 print "// In this latter case, the upper 12 bits contain an index into\n";
       
   376 print "// one of the following tables, according to the following rules:\n";
       
   377 print "//\n";
       
   378 print "// In the case of folding:\n";
       
   379 print "// If the Index is less than the length of KNonSingletonFolds / 2,\n";
       
   380 print "// it is an index into KNonSingletonFolds. If the Index is\n";
       
   381 print "// greater than the length of KNonSingletonFolds / 2, then it is an\n";
       
   382 print "// index into KSingletonFolds.\n";
       
   383 print "//\n";
       
   384 print "// In the case of decomposition:\n";
       
   385 print "// If the Index is less than the length of KNonSingletonDecompositions / 2,\n";
       
   386 print "// it is an index into KNonSingletonDecompositions. If the Index is\n";
       
   387 print "// greater than the length of KNonSingletonDecompositions / 2, then it is an\n";
       
   388 print "// index into KSingletonDecompositions.\n";
       
   389 print "//\n";
       
   390 print "// In summary:\n";
       
   391 print "// Let Knsf be the length of KNonSingletonFolds / 2,\n";
       
   392 print "// let Knsd be the length of KNonSingletonDecompositions / 2,\n";
       
   393 print "// let Ksd be the length of KSingletonDecompositions and\n";
       
   394 print "// let Ksf be the length of KSingletonFolds.\n";
       
   395 print "// Now if you want to fold a character and you have found\n";
       
   396 print "// its index 'i' from the KUnicodeToIndexHash, then;\n";
       
   397 print "// if (i < Knsf) then look up\n";
       
   398 print "//\t\tKNonSingletonFolds[i * 2] and KNonSingletonFolds[i * 2 + 1]\n";
       
   399 print "// else if (Knsf <= i < Knsf + Ksf) look up KSingletonFolds[i - Knsf]\n";
       
   400 print "// else there is no fold for this character.\n";
       
   401 print "//\n";
       
   402 print "// Or if you want to decompose the same character, then;\n";
       
   403 print "// if (i < Knsd) then look up KNonSingletonDecompositions[i * 2]\n";
       
   404 print "//\t\tand KNonSingletonDecompositions[i * 2 + 1]\n";
       
   405 print "// else if (Knsd <= i < Knsd + Ksd) look up KSingletonDecompositions[i - Knsd]\n";
       
   406 print "// else there is no decomposition for this character.\n";
       
   407 print "//\n";
       
   408 print "// Your index into KSingletonDecompositions or KSingletonFolds\n";
       
   409 print "// yields a single value which is the decomposition or fold.\n";
       
   410 print "//\n";
       
   411 print "// The KNonSingletonFolds and KNonSingletonDecomposition\n";
       
   412 print "// tables are made up of pairs of values. Each pair is either a pair\n";
       
   413 print "// of Unicode values that constitute the fold or decomposition, or\n";
       
   414 print "// the first value is KLongD and the second has its top 4 bits as the\n";
       
   415 print "// length of the decomposition (or folded decomposition) minus 3,\n";
       
   416 print "// and its bottom 12 bits as the index into KLongDecompositions\n";
       
   417 print "// of where you can find this decomposition.\n";
       
   418 print "//\n";
       
   419 print "// KLongDecompositions simply contains UTF-16 (Unicode) for\n";
       
   420 print "// all the decomposed and folded sequences longer than 4 bytes long.\n";
       
   421 print "\n";
       
   422 print "// Hash table mapping unicode values to indices into the other tables\n";
       
   423 print "// in use = ".$hashIndexAfterShortFoldsOnly." entries\n";
       
   424 print "const unsigned long KUnicodeToIndexHash[$HashTableSize] =\n\t{\n\t";
       
   425 my @HashTableOutput;
       
   426 for (0..($HashTableSize - 1))
       
   427 	{
       
   428 	my $v = 0;
       
   429 	if (exists $HashTableEntryContents{$_})
       
   430 		{
       
   431 		$v = $HashTableEntryContents{$_};
       
   432 		die ('Did not expect a Unicode value > 0xFFFFF')
       
   433 			if 0xFFFFF < $v;
       
   434 		$v |= ($Index{$v}) << 20;
       
   435 		}
       
   436 	push @HashTableOutput, sprintf('0x%08x', $v);
       
   437 	$totalBytes += 4;
       
   438 	}
       
   439 print (shift @HashTableOutput);
       
   440 my $valueCount = 0;
       
   441 foreach my $v (@HashTableOutput)
       
   442 	{
       
   443 	print (((++$valueCount & 7) == 0)? ",\n\t" : ', ');
       
   444 	print $v;
       
   445 	}
       
   446 print "\n\t};\n\n";
       
   447 print "// Hash table access functions\n";
       
   448 print "const int KDecompositionHashBitmask = $HashTableBitmaskCpp;\n\n";
       
   449 print "inline int DecompositionHashStart(long a)\n";
       
   450 print "\t{\n\treturn a & $HashTableBitmaskCpp;\n\t}\n\n";
       
   451 print "inline int DecompositionHashStep(long a)\n";
       
   452 print "\t{\n\ta *= a >> $LgHashTableSize;\n";
       
   453 print "\treturn ((a<<1) + 1) & $HashTableBitmaskCpp;\n\t}\n\n";
       
   454 
       
   455 print "// Table mapping KNonSingletonDecompositions to the hash table entry that\n";
       
   456 print "// indexes it\n";
       
   457 print "const unsigned short KCompositionMapping[] =\n\t{\n\t";
       
   458 for (0..(scalar(@IncludedDecomps - 1)))
       
   459 	{
       
   460 	if ($_ != 0)
       
   461 		{print (($_ & 7) == 0? ",\n\t" : ', ')}
       
   462 	printf( '0x%04x', $HashTableEntry{$IncludedDecomps[$_]} );
       
   463 	$totalBytes += 2;
       
   464 	}
       
   465 print "\n\t};\n\n";
       
   466 
       
   467 print "// Table containing all the decomposition and folding strings longer\n";
       
   468 print "// than 2 UTF16 characters\n";
       
   469 print "const unsigned short KLongDecompositions[] =\n\t{\n\t0x";
       
   470 for(0..(scalar(@VeryLongDecompData) - 1))
       
   471 	{
       
   472 	if ($_ != 0)
       
   473 		{print (($_ & 7) == 0?",\n\t0x" : ', 0x')}
       
   474 	print $VeryLongDecompData[$_];
       
   475 	$totalBytes += 2;
       
   476 	}
       
   477 print "\n\t};\n\n";
       
   478 
       
   479 print "// Table containing decompositions longer than one UTF16 character.\n";
       
   480 print "// The top of the table contains all compositions, sorted lexicographically.\n";
       
   481 print "// Any decompositions of length 2 are in the table as a pair of values,\n";
       
   482 print "// decompositions longer than that are represented by a KLongD followed by\n";
       
   483 print "// a value whose top four bits indicate the length of the decomposition minus\n";
       
   484 print "// three and whose bottom 12 bits indicate an index into the KLongDecompositions\n";
       
   485 print "// array where the decomposition starts.\n";
       
   486 print "const long KLongD = 0;\n";
       
   487 print "// sizeof/2 = ".$hashIndexAfterLongExcludeDecomps."\n";
       
   488 print "const unsigned short KNonSingletonDecompositions[] =\n\t{\n\t";
       
   489 
       
   490 sub PrintNonsingletonDecompTableEntry
       
   491 	{
       
   492 	my ($decomp) = @_;
       
   493 	if (length $decomp < 10)
       
   494 		{
       
   495 		if ($decomp =~ /([0-9A-F]{4}) ([0-9A-F]{4})/)
       
   496 			{
       
   497 			print '0x'.$1.', 0x'.$2;
       
   498 			}
       
   499 		else
       
   500 			{
       
   501 			die("$decomp expected to be normalized and of length 1 or 2")
       
   502 				if $decomp !~ /[0-9A-F]{4}/;
       
   503 			print '0x'.$decomp.', 0xFFFF';
       
   504 			}
       
   505 		}
       
   506 	else
       
   507 		{
       
   508 		printf ('KLongD, 0x%1X%03X', ((length $decomp) - 14)/5, $VeryLongDecompMap{$decomp});
       
   509 		}
       
   510 	}
       
   511 
       
   512 {my $entryNo = 0;
       
   513 foreach my $code (@IncludedDecomps)
       
   514 	{
       
   515 	if ($entryNo != 0)
       
   516 		{print (($entryNo & 3) == 0?",\n\t" : ', ')}
       
   517 	PrintNonsingletonDecompTableEntry($Decomp{$code});
       
   518 	$entryNo++;
       
   519 	$totalBytes += 4;
       
   520 	}
       
   521 foreach my $code (@LongExcludedDecomps)
       
   522 	{
       
   523 	print (($entryNo & 3) == 0?",\n\t" : ', ');
       
   524 	PrintNonsingletonDecompTableEntry($Decomp{$code});
       
   525 	$entryNo++;
       
   526 	$totalBytes += 4;
       
   527 	}
       
   528 }
       
   529 print "\n\t};\n\n";
       
   530 
       
   531 print "// Table of folded decompositions which either have more than one UTF16, or\n";
       
   532 print "// their normal decompositions have more than one UTF16\n";
       
   533 print "// sizeof/2 = ".$hashIndexAfterShortDecompsLongFolds."\n";
       
   534 print "const unsigned short KNonSingletonFolds[] =\n\t{\n\t";
       
   535 {my $entryNo = 0;
       
   536 foreach my $code (@IncludedDecomps)
       
   537 	{
       
   538 	if ($entryNo != 0)
       
   539 		{print (($entryNo & 3) == 0?",\n\t" : ', ')}
       
   540 	PrintNonsingletonDecompTableEntry($Folded{$code});
       
   541 	$entryNo++;
       
   542 	$totalBytes += 4;
       
   543 	}
       
   544 foreach my $code (@LongExcludedDecomps)
       
   545 	{
       
   546 	print (($entryNo & 3) == 0?",\n\t" : ', ');
       
   547 	PrintNonsingletonDecompTableEntry($Folded{$code});
       
   548 	$entryNo++;
       
   549 	$totalBytes += 4;
       
   550 	}
       
   551 foreach my $code (@ShortDecompsLongFolds)
       
   552 	{
       
   553 	print (($entryNo & 3) == 0?",\n\t" : ', ');
       
   554 	PrintNonsingletonDecompTableEntry($Folded{$code});
       
   555 	$entryNo++;
       
   556 	$totalBytes += 4;
       
   557 	}
       
   558 }
       
   559 print "\n\t};\n\n";
       
   560 
       
   561 print "// Table of singleton decompositions and characters with singleton folds\n";
       
   562 print "// Note for Unicode 5.0:\n";
       
   563 print "// Unicode 5.0 contains some non-BMP characters have non-BMP \"singleton\" folds.\n";
       
   564 print "// As per the algorithm of this file, the non-BMP character should be stored in \n";
       
   565 print "// this table. \"Unsigned short\" is not big enough to hold them. However, this \n";
       
   566 print "// \"character\" information is not useful. So we just store 0xFFFF instead. \n";
       
   567 print "// Please do check 0xFFFF when access this table. If meet 0xFFFF, that means \n";
       
   568 print "// your character has no decomposition.\n";
       
   569 print "// See the variable \"ShortDecompsLongFolds\" in FoldAndDecompTables.pl if you \n";
       
   570 print "// want to know more.\n";
       
   571 print "// sizeof = ".($hashIndexAfterShortDecompsShortFolds-$hashIndexAfterLongExcludeDecomps)."\n";
       
   572 print "const unsigned short KSingletonDecompositions[] =\n\t{\n\t0x";
       
   573 {my $entryNo = 0;
       
   574 foreach my $code (@ShortDecompsLongFolds)
       
   575 	{
       
   576 	if ($entryNo != 0)
       
   577 		{print (($entryNo & 7) == 0?",\n\t0x" : ', 0x')}
       
   578 	if (exists $Decomp{$code} && $Decomp{$code} ne '')
       
   579 		{
       
   580 		print $Decomp{$code};
       
   581 		}
       
   582 	else
       
   583 		{
       
   584 		# Don't take these 0xFFFF as character.
       
   585 		#printf ('%04X', $code);
       
   586 		printf ("FFFF");
       
   587 		}
       
   588 	$entryNo++;
       
   589 	$totalBytes += 4;
       
   590 	}
       
   591 foreach my $code (@ShortDecompsShortFolds)
       
   592 	{
       
   593 	if ($entryNo != 0)
       
   594 		{print (($entryNo & 7) == 0?",\n\t0x" : ', 0x')}
       
   595 	print $Decomp{$code};
       
   596 	$entryNo++;
       
   597 	$totalBytes += 4;
       
   598 	}
       
   599 }
       
   600 print "\n\t};\n\n";
       
   601 
       
   602 print "// Table of singleton folds\n";
       
   603 print "// sizeof = ".($hashIndexAfterShortFoldsOnly-$hashIndexAfterShortDecompsLongFolds)."\n";
       
   604 print "const unsigned short KSingletonFolds[] =\n\t{\n\t0x";
       
   605 {my $entryNo = 0;
       
   606 foreach my $code (@ShortDecompsShortFolds)
       
   607 	{
       
   608 	if ($entryNo != 0)
       
   609 		{print (($entryNo & 7) == 0?",\n\t0x" : ', 0x')}
       
   610 	print $Folded{$code};
       
   611 	$entryNo++;
       
   612 	$totalBytes += 4;
       
   613 	}
       
   614 foreach my $code (@ShortFoldsOnly)
       
   615 	{
       
   616 	print (($entryNo & 7) == 0?",\n\t0x" : ', 0x');
       
   617 	print $Folded{$code};
       
   618 	$entryNo++;
       
   619 	$totalBytes += 4;
       
   620 	}
       
   621 }
       
   622 print "\n\t};\n";
       
   623 
       
   624 print "\n// Total size: $totalBytes bytes\n";
       
   625 print STDERR $totalBytes, " bytes\n";