|
1 # Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
2 # All rights reserved. |
|
3 # This component and the accompanying materials are made available |
|
4 # under the terms of the License "Eclipse Public License v1.0" |
|
5 # which accompanies this distribution, and is available |
|
6 # at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
7 # |
|
8 # Initial Contributors: |
|
9 # Nokia Corporation - initial contribution. |
|
10 # |
|
11 # Contributors: |
|
12 # |
|
13 # Description: |
|
14 # Creates C++ code describing how to decompose, compose and fold each character. |
|
15 # Usage: |
|
16 # perl -w FoldAndDecompTables.pl < <output-from-UnicodeMaxDecompose> |
|
17 # Tables we want to create: |
|
18 # A: Ordered list of non-excluded decompositions |
|
19 # B: List of folded decompositions matching A |
|
20 # C: List of decompositions not listed in A of length > 1 |
|
21 # D: List of folded decompositions matching C |
|
22 # E: List of decompositions of length = 1 whose matching folded decompositions |
|
23 # are of length > 1 |
|
24 # F: List of folded decompositions matching E |
|
25 # G: List of decompositions of length = 1 with matching folded decompositions |
|
26 # H: List of folded decompositions matching G |
|
27 # I: List of folded decompositions that do not have matching decompositions |
|
28 # J: List of decompositions (folding and otherwise) of length > 2 |
|
29 # K: Hash table mapping Unicode value to its folded decomposition value in the |
|
30 # concatenated list B-D-F-H-I |
|
31 # L: List of hash slots in K matching A (providing a mapping from non-excluded |
|
32 # decompositions to Unicode value) |
|
33 # [all lengths are of UTF16 strings] |
|
34 # |
|
35 # |
|
36 |
|
37 use strict; |
|
38 |
|
39 # |
|
40 # Hash table: |
|
41 # |
|
42 |
|
# Base-2 logarithm of the hash table size: the table has
# 2 to the power $LgHashTableSize slots.
my $LgHashTableSize = 12;

# Both values below are derived from $LgHashTableSize. Do not change them!
# Number of slots in the hash table.
my $HashTableSize = 1 << $LgHashTableSize;
# Bitmask selecting a slot number, formatted as a hexadecimal literal for
# use in the generated C++ code.
my $HashTableBitmaskCpp = '0x' . sprintf('%x', $HashTableSize - 1);
|
49 |
|
# Hashing function in Perl: returns the slot at which the search for the
# given Unicode value starts, i.e. the value masked down to a slot number.
sub HashStart
{
    my ($code) = @_;
    my $slotMask = $HashTableSize - 1;
    return $code & $slotMask;
}
|
# How far to step through the table after each unsuccessful probe for the
# given Unicode value. The value is multiplied by its own bits above the
# slot number, then doubled and incremented before masking.
sub HashStep
{
    my ($code) = @_;
    my $mixed = $code * ($code >> $LgHashTableSize);
    my $slotMask = $HashTableSize - 1;
    return ($mixed * 2 + 1) & $slotMask;
}
|
62 |
|
# Make sure the input string is all hex numbers separated by single spaces,
# with each hex number having 4 upper-case digits and every value above
# 0xFFFF decomposed into a UTF16 surrogate pair.
#
# Takes a string of whitespace-separated hexadecimal code points and returns
# the normalised string (the empty string for empty input).
# Dies if an element contains anything other than hexadecimal digits.
sub Normalize
{
    my ($string) = @_;
    # Fast path: the string is already in normal form.
    return $string
        if $string =~ /^([0-9A-F]{4}( [0-9A-F]{4})*)?$/;

    my @units = ();
    # split(' ', ...) skips leading whitespace and never yields empty
    # elements, so every element is a candidate number (including '0').
    foreach my $elt (split(' ', $string))
    {
        # The pattern is anchored so that strings which merely contain a hex
        # digit are rejected rather than silently converted by hex().
        die "'$elt' is not a hex number"
            unless $elt =~ /^[0-9a-fA-F]+$/;
        my $value = hex $elt;
        if ($value < 0x10000)
        {
            push @units, sprintf('%04X', $value);
        }
        else
        {
            # Add a surrogate pair: high surrogate from the bits above the
            # low ten, low surrogate from the low ten bits.
            push @units, sprintf('%04X', ($value >> 10) + 0xD7C0);
            push @units, sprintf('%04X', ($value & 0x3FF) + 0xDC00);
        }
    }
    #print STDERR "'$string' normalized to '" . join(' ', @units) . "'\n";
    return join(' ', @units);
}
|
97 |
|
# First stage: data gathered while reading the input.
# Normalised decomposition string of each Unicode value
my %Decomp;
# Normalised folded string of each Unicode value
my %Folded;
# Mapping from decomposition->char, if not excluded
my %Composition;

# AddCode files each character under one of the following lists.
# Characters with non-excluded decompositions
my @IncludedDecomps;
# Characters with long (>1 UTF16) excluded decompositions
my @LongExcludedDecomps;
# Characters with singleton (or no) decompositions but long folds
my @ShortDecompsLongFolds;
# Characters with singleton folds and singleton decompositions
my @ShortDecompsShortFolds;
# Characters with singleton folds but no decompositions
my @ShortFoldsOnly;

# A mapping from decompositions (and folds) of length greater than two
# to the code that produced them.
my %VeryLongDecompositions;

# A list of characters containing all decompositions of length >2 as slices
my @VeryLongDecompData;
# Mapping from decomposition->index into VeryLongDecompData
my %VeryLongDecompMap;

# There will be a hash table mapping Unicode values to indices into the other
# tables. %Index maps the same thing in Perl.
my %Index;
# %HashTableEntryContents maps the table entries (slot numbers) to the
# Unicode values they contain.
my %HashTableEntryContents;
# %HashTableEntry maps Unicode value to the entry (slot number) in the hash
# table
my %HashTableEntry;
|
132 |
|
# Bind a Unicode value to an index into the tables, and claim a slot for it
# in the hash table.
#
# $unicode is the Unicode value, $index the table index to record for it.
# Starting at HashStart($unicode), slots are probed HashStep($unicode) apart
# (wrapping round the table) until a free one is found. The slot is recorded
# in both %HashTableEntryContents and %HashTableEntry.
# Dies if the hash table has no free slot left.
sub AddHashValue
{
    my ($unicode, $index) = @_;
    # The probe loop below only stops at a free slot, so it would spin
    # forever on a full table; fail loudly instead.
    die("Hash table is full ($HashTableSize slots): cannot add value $unicode")
        if $HashTableSize <= scalar(keys %HashTableEntryContents);
    $Index{$unicode} = $index;
    my $pos = HashStart($unicode);
    my $step = HashStep($unicode);
    while (exists $HashTableEntryContents{$pos})
    {
        $pos = ($pos + $step) % $HashTableSize;
    }
    $HashTableEntryContents{$pos} = $unicode;
    $HashTableEntry{$unicode} = $pos;
}
|
151 |
|
# Bind every Unicode value of a list to consecutive indices, the first of
# which is given as the first argument. Returns the index of the next slot
# to be filled, i.e. the first index not handed out.
sub AddListToHash
{
    my ($index, @unicodes) = @_;
    foreach my $unicode (@unicodes)
    {
        AddHashValue($unicode, $index);
        $index++;
    }
    return $index;
}
|
164 |
|
# Put the results of a read line into the data structures.
#
# $code is the Unicode value, $excluded is true if its decomposition is
# excluded, $decomposition and $folded are normalised strings (4 characters
# per UTF16 value, separated by single spaces).
# Characters with neither a decomposition nor a fold are ignored.
sub AddCode
{
    my ($code, $excluded, $decomposition, $folded) = @_;
    my $decompLength = length $decomposition;
    my $foldedLength = length $folded;
    return if ($decompLength == 0 && $foldedLength == 0);

    $Decomp{$code} = $decomposition;
    $Folded{$code} = $folded;

    # Choose the list this character belongs to; a length above 4 means more
    # than one UTF16 value.
    my $list;
    if ($decompLength != 0 && !$excluded)
    {
        $Composition{$decomposition} = $code;
        $list = \@IncludedDecomps;
    }
    elsif ($decompLength > 4)
    {
        $list = \@LongExcludedDecomps;
    }
    elsif ($foldedLength > 4)
    {
        $list = \@ShortDecompsLongFolds;
    }
    elsif ($decompLength != 0)
    {
        $list = \@ShortDecompsShortFolds;
    }
    elsif ($foldedLength != 0)
    {
        $list = \@ShortFoldsOnly;
    }
    push @$list, $code
        if defined $list;

    # A length above 9 means more than two UTF16 values.
    foreach my $string ($decomposition, $folded)
    {
        $VeryLongDecompositions{$string} = $code
            if length($string) > 9;
    }
}
|
200 |
|
# The tool takes no arguments: the data is read from standard input.
if (scalar(@ARGV) != 0)
{
    print (STDERR "Usage:\nperl -w FoldAndDecompTables.pl < <output-from-UnicodeMaxDecompose>\n");
    exit 1;
}

# Read the input line by line. Each non-blank line must consist of
# semicolon-separated fields, of which the code, the description, the
# "symbian:" exclusion flag and the last two fields (decomposition and fold)
# are used. Anything else is a fatal error.
my $lineNo = 0;
# Zero outside a <..., First>/<..., Last> range. Inside a range: the next
# code of the range that has not been added yet.
my $inBlock = 0;
while (my $line = <STDIN>)
{
    $lineNo++;
    if ($line =~ /^(1?[0-9a-fA-F]{4,5});([^;]*);.*symbian:(E?);[^;]*;([0-9a-fA-F \t]*);([0-9a-fA-F \t]*)[ \t]*$/i)
    {
        my ($codeText, $description, $excluded, $rawDecomposition, $rawFolded)
            = ($1, $2, $3, $4, $5);
        my $code = hex $codeText;
        my $decomposition = Normalize($rawDecomposition);
        my $folded = Normalize($rawFolded);

        die ("Value $codeText too large to be Unicode at line $lineNo.")
            if (0x110000 <= $code);

        # A normalised string is either empty or 4 characters per UTF16
        # value with single separating spaces.
        die("Normalisation failed with '$decomposition' at line $lineNo.")
            unless (length $decomposition) == 0 || (length $decomposition) % 5 == 4;
        die("Normalisation failed with '$folded' at line $lineNo.")
            unless (length $folded) == 0 || (length $folded) % 5 == 4;

        AddCode($code, $excluded, $decomposition, $folded);

        if ($description =~ /^<.*Last>$/i)
        {
            die("End of block without start at line $lineNo!")
                if !$inBlock;
            # Fill in the interior of the range. The last code itself was
            # added just above, so the loop stops short of it; adding it
            # again would give it two entries in the tables.
            while ($inBlock < $code)
            {
                AddCode($inBlock, $excluded, $decomposition, $folded);
                $inBlock++;
            }
            $inBlock = 0;
        }
        elsif ($description =~ /^<.*First>$/i)
        {
            die("Block within block at line $lineNo!")
                if $inBlock;
            # The first code was added above; the range continues after it.
            $inBlock = $code + 1;
        }
    }
    elsif ($line !~ /^[ \t]*$/)
    {
        die("Did not understand line $lineNo.");
    }
}
# A range that was opened must have been closed before the input ended.
die("Block not closed at end of input (line $lineNo)!")
    if $inBlock;
|
253 |
|
# We need to construct the data for the table of decompositions of length > 2.
# The longest strings are stored first, so that a shorter string which is a
# prefix of one already stored can share its data instead of being stored
# again. Strings of equal length are ordered by string comparison: hash key
# order can vary from run to run, so sorting on length alone could produce a
# different table on each run for the same input.
foreach my $decomp (sort { length($b) <=> length($a) or $a cmp $b } keys %VeryLongDecompositions)
{
    # Already available as a prefix of a longer string
    next if exists $VeryLongDecompMap{$decomp};

    my $newPos = scalar @VeryLongDecompData;
    $VeryLongDecompMap{$decomp} = $newPos;
    push @VeryLongDecompData, split(' ', $decomp);

    # Every prefix of at least three values starts at the same position.
    my $prefix = $decomp;
    while ($prefix =~ /^([0-9A-F]{4}( [0-9A-F]{4}){2,}) [0-9A-F]{4}$/)
    {
        $prefix = $1;
        $VeryLongDecompMap{$prefix} = $newPos;
    }
}
|
273 |
|
# The codes for included decompositions have to be in lexicographic order of
# their decompositions. Comparing the normalised hex strings as strings
# gives exactly that order.
@IncludedDecomps = sort { $Decomp{$a} cmp $Decomp{$b} } @IncludedDecomps;

# Report the size of each list, and the overall total, on STDERR.
{
    my $includedCount = scalar(@IncludedDecomps);
    my $longCount = scalar(@LongExcludedDecomps);
    my $longFoldsCount = scalar(@ShortDecompsLongFolds);
    my $shortCount = scalar(@ShortDecompsShortFolds);
    my $shortFoldsOnlyCount = scalar(@ShortFoldsOnly);
    my $total = $includedCount + $longCount + $longFoldsCount + $shortCount + $shortFoldsOnlyCount;
    print STDERR 'Included: ', $includedCount, "\nLong: ", $longCount,
        "\nLongFolds: ", $longFoldsCount, "\nShort: ", $shortCount,
        "\nShortFoldsOnly: ", $shortFoldsOnlyCount, "\nTOTAL: ",
        $total, "\n";
}
|
285 |
|
# Now we populate the hash table. Each list is bound to a run of consecutive
# indices; the index reached after each list is kept for the size comments
# written into the generated file.
# This has to happen before the analysis below, which follows the probe
# sequences through the populated table.
my $index = 0;

$index = AddListToHash($index, @IncludedDecomps);
my $hashIndexAfterIncludedDecomps = $index;
printf (STDERR "after IncludedDecomps index= %d\n", $hashIndexAfterIncludedDecomps);

$index = AddListToHash($index, @LongExcludedDecomps);
my $hashIndexAfterLongExcludeDecomps = $index;
printf (STDERR "after LongExcludedDecomps index= %d\n", $hashIndexAfterLongExcludeDecomps);

$index = AddListToHash($index, @ShortDecompsLongFolds);
my $hashIndexAfterShortDecompsLongFolds = $index;
printf (STDERR "after ShortDecompsLongFolds index= %d\n", $hashIndexAfterShortDecompsLongFolds);

$index = AddListToHash($index, @ShortDecompsShortFolds);
my $hashIndexAfterShortDecompsShortFolds = $index;
printf (STDERR "after ShortDecompsShortFolds index= %d\n", $hashIndexAfterShortDecompsShortFolds);

$index = AddListToHash($index, @ShortFoldsOnly);
my $hashIndexAfterShortFoldsOnly = $index;
printf (STDERR "after ShortFoldsOnly index= %d\n", $hashIndexAfterShortFoldsOnly);

# Analyse the hash table to find out the maximum and average time
# taken to find each ASCII character (codes 32 to 126), measured in probes.
# NOTE(review): a character that is not in the table is counted as a single
# probe, although a real lookup probes until it reaches an empty slot.
my $maxAsciiTime = 0;
my $totalAsciiTime = 0;
my $mostDifficultCode = undef;
my $asciiFoundWithoutStepCount = 0;
for my $code (32..126)
{
    my $pos = HashStart($code);
    my $step = HashStep($code);
    my $stepCount = 1;
    # exists, not truth: slot 0 is a valid position
    if (exists $HashTableEntry{$code})
    {
        my $posRequired = $HashTableEntry{$code};
        while ($pos != $posRequired)
        {
            $pos = ($pos + $step) % $HashTableSize;
            $stepCount++;
        }
    }
    $totalAsciiTime += $stepCount;
    if ($maxAsciiTime < $stepCount)
    {
        $maxAsciiTime = $stepCount;
        $mostDifficultCode = $code;
    }
    if ($stepCount == 1)
    {
        $asciiFoundWithoutStepCount++;
    }
}
printf (STDERR "Average ASCII search: %f\n", $totalAsciiTime / 95);
printf (STDERR "Maximum ASCII search %d for %x: '%c'.\n", $maxAsciiTime, $mostDifficultCode, $mostDifficultCode);
|
343 |
|
#
# Output C++ File
#
# Running total of the size in bytes of the tables written out
my $totalBytes = 0;

# Licence header and a description of how the generated tables fit together,
# followed by the opening of the KUnicodeToIndexHash table.
print
    "// Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).\n",
    "// All rights reserved.\n",
    "// This component and the accompanying materials are made available\n",
    "// under the terms of the License \"Eclipse Public License v1.0\"\n",
    "// which accompanies this distribution, and is available\n",
    "// at the URL \"http://www.eclipse.org/legal/epl-v10.html\".\n",
    "//\n",
    "// Initial Contributors:\n",
    "// Nokia Corporation - initial contribution.\n",
    "//\n",
    "// Contributors:\n",
    "//\n",
    "// Description:\n",
    "//\n",
    "// Fold and decomposition tables.\n",
    "//\n",
    "// These tables are linked in the following way:\n",
    "// KUnicodeToIndexHash is a hash table using double hashing for\n",
    "// conflict resolution. The functions DecompositionHashStart and\n",
    "// DecompositionHashStep give the start and step values for accessing\n",
    "// the table. The first probe is at DecompositionHashStart and each\n",
    "// subsequent probe is offset by DecompositionHashStep. Probes\n",
    "// continue until either 0 is found (indicating that the Unicode value\n",
    "// sought has no decompostion (i.e. decomposes to itself)) or a value\n",
    "// is found that has the sought Unicode value in its lower 20 bits.\n",
    "//\n",
    "// In this latter case, the upper 12 bits contain an index into\n",
    "// one of the following tables, according to the following rules:\n",
    "//\n",
    "// In the case of folding:\n",
    "// If the Index is less than the length of KNonSingletonFolds / 2,\n",
    "// it is an index into KNonSingletonFolds. If the Index is\n",
    "// greater than the length of KNonSingletonFolds / 2, then it is an\n",
    "// index into KSingletonFolds.\n",
    "//\n",
    "// In the case of decomposition:\n",
    "// If the Index is less than the length of KNonSingletonDecompositions / 2,\n",
    "// it is an index into KNonSingletonDecompositions. If the Index is\n",
    "// greater than the length of KNonSingletonDecompositions / 2, then it is an\n",
    "// index into KSingletonDecompositions.\n",
    "//\n",
    "// In summary:\n",
    "// Let Knsf be the length of KNonSingletonFolds / 2,\n",
    "// let Knsd be the length of KNonSingletonDecompositions / 2,\n",
    "// let Ksd be the length of KSingletonDecompositions and\n",
    "// let Ksf be the length of KSingletonFolds.\n",
    "// Now if you want to fold a character and you have found\n",
    "// its index 'i' from the KUnicodeToIndexHash, then;\n",
    "// if (i < Knsf) then look up\n",
    "//\t\tKNonSingletonFolds[i * 2] and KNonSingletonFolds[i * 2 + 1]\n",
    "// else if (Knsf <= i < Knsf + Ksf) look up KSingletonFolds[i - Knsf]\n",
    "// else there is no fold for this character.\n",
    "//\n",
    "// Or if you want to decompose the same character, then;\n",
    "// if (i < Knsd) then look up KNonSingletonDecompositions[i * 2]\n",
    "//\t\tand KNonSingletonDecompositions[i * 2 + 1]\n",
    "// else if (Knsd <= i < Knsd + Ksd) look up KSingletonDecompositions[i - Knsd]\n",
    "// else there is no decomposition for this character.\n",
    "//\n",
    "// Your index into KSingletonDecompositions or KSingletonFolds\n",
    "// yields a single value which is the decomposition or fold.\n",
    "//\n",
    "// The KNonSingletonFolds and KNonSingletonDecomposition\n",
    "// tables are made up of pairs of values. Each pair is either a pair\n",
    "// of Unicode values that constitute the fold or decomposition, or\n",
    "// the first value is KLongD and the second has its top 4 bits as the\n",
    "// length of the decomposition (or folded decomposition) minus 3,\n",
    "// and its bottom 12 bits as the index into KLongDecompositions\n",
    "// of where you can find this decomposition.\n",
    "//\n",
    "// KLongDecompositions simply contains UTF-16 (Unicode) for\n",
    "// all the decomposed and folded sequences longer than 4 bytes long.\n",
    "\n",
    "// Hash table mapping unicode values to indices into the other tables\n",
    "// in use = ".$hashIndexAfterShortFoldsOnly." entries\n",
    "const unsigned long KUnicodeToIndexHash[$HashTableSize] =\n\t{\n\t";
|
# Format every slot of the hash table: an empty slot is 0, an occupied slot
# holds the Unicode value with its table index shifted in above bit 19.
my @HashTableOutput;
foreach my $slot (0..($HashTableSize - 1))
{
    my $entry = 0;
    if (exists $HashTableEntryContents{$slot})
    {
        my $unicode = $HashTableEntryContents{$slot};
        die ('Did not expect a Unicode value > 0xFFFFF')
            if 0xFFFFF < $unicode;
        $entry = $unicode | (($Index{$unicode}) << 20);
    }
    push @HashTableOutput, sprintf('0x%08x', $entry);
    $totalBytes += 4;
}
# Write the slots out eight to a line.
my $valueCount = 0;
foreach my $entry (@HashTableOutput)
{
    if ($valueCount != 0)
    {
        my $separator = ($valueCount & 7) == 0 ? ",\n\t" : ', ';
        print $separator;
    }
    print $entry;
    $valueCount++;
}
print "\n\t};\n\n";

# C++ counterparts of HashStart and HashStep
print
    "// Hash table access functions\n",
    "const int KDecompositionHashBitmask = $HashTableBitmaskCpp;\n\n",
    "inline int DecompositionHashStart(long a)\n",
    "\t{\n\treturn a & $HashTableBitmaskCpp;\n\t}\n\n",
    "inline int DecompositionHashStep(long a)\n",
    "\t{\n\ta *= a >> $LgHashTableSize;\n",
    "\treturn ((a<<1) + 1) & $HashTableBitmaskCpp;\n\t}\n\n";
|
454 |
|
# KCompositionMapping: for each included decomposition, in sorted order, the
# hash table slot of its character. Eight values to a line.
print
    "// Table mapping KNonSingletonDecompositions to the hash table entry that\n",
    "// indexes it\n",
    "const unsigned short KCompositionMapping[] =\n\t{\n\t";
{
    my $entryNo = 0;
    foreach my $code (@IncludedDecomps)
    {
        if ($entryNo != 0)
        {
            my $separator = ($entryNo & 7) == 0 ? ",\n\t" : ', ';
            print $separator;
        }
        printf( '0x%04x', $HashTableEntry{$code} );
        $entryNo++;
        $totalBytes += 2;
    }
}
print "\n\t};\n\n";
|
466 |
|
# KLongDecompositions: the UTF16 values collected in @VeryLongDecompData,
# eight to a line. Each separator carries the "0x" prefix of the value that
# follows it; the prefix of the first value is part of the table opening.
print
    "// Table containing all the decomposition and folding strings longer\n",
    "// than 2 UTF16 characters\n",
    "const unsigned short KLongDecompositions[] =\n\t{\n\t0x";
{
    my $entryNo = 0;
    foreach my $utf16 (@VeryLongDecompData)
    {
        if ($entryNo != 0)
        {
            my $separator = ($entryNo & 7) == 0 ? ",\n\t0x" : ', 0x';
            print $separator;
        }
        print $utf16;
        $entryNo++;
        $totalBytes += 2;
    }
}
print "\n\t};\n\n";
|
478 |
|
# Explanatory comment, the KLongD marker constant and the opening of the
# KNonSingletonDecompositions table.
print
    "// Table containing decompositions longer than one UTF16 character.\n",
    "// The top of the table contains all compositions, sorted lexicographically.\n",
    "// Any decompositions of length 2 are in the table as a pair of values,\n",
    "// decompositions longer than that are represented by a KLongD followed by\n",
    "// a value whose top four bits indicate the length of the decomposition minus\n",
    "// three and whose bottom 12 bits indicate an index into the KLongDecompositions\n",
    "// array where the decomposition starts.\n",
    "const long KLongD = 0;\n",
    "// sizeof/2 = ".$hashIndexAfterLongExcludeDecomps."\n",
    "const unsigned short KNonSingletonDecompositions[] =\n\t{\n\t";
|
489 |
|
# Print one entry (a pair of values) of a non-singleton table for the given
# normalised string:
#  - more than two UTF16 values: KLongD, then a value holding the length
#    minus three in its top hex digit and the position recorded in
#    %VeryLongDecompMap in its bottom three hex digits;
#  - two UTF16 values: the two values;
#  - otherwise: the string itself as the first value and 0xFFFF as the
#    second. Dies if the string contains no four-digit hex number.
sub PrintNonsingletonDecompTableEntry
{
    my ($decomp) = @_;
    my $stringLength = length $decomp;
    if ($stringLength >= 10)
    {
        # three UTF16 values take 14 characters, each further one 5 more
        printf ('KLongD, 0x%1X%03X', ($stringLength - 14)/5, $VeryLongDecompMap{$decomp});
    }
    elsif ($decomp =~ /([0-9A-F]{4}) ([0-9A-F]{4})/)
    {
        print '0x'.$1.', 0x'.$2;
    }
    else
    {
        die("$decomp expected to be normalized and of length 1 or 2")
            if $decomp !~ /[0-9A-F]{4}/;
        print '0x'.$decomp.', 0xFFFF';
    }
}
|
511 |
|
# Entries of KNonSingletonDecompositions, four pairs to a line: the included
# decompositions first, then the long excluded ones.
{
    my $entryNo = 0;
    foreach my $code (@IncludedDecomps)
    {
        # no separator in front of the very first entry
        if ($entryNo != 0)
        {
            my $separator = ($entryNo & 3) == 0 ? ",\n\t" : ', ';
            print $separator;
        }
        PrintNonsingletonDecompTableEntry($Decomp{$code});
        $entryNo++;
        $totalBytes += 4;
    }
    foreach my $code (@LongExcludedDecomps)
    {
        # entries of this list are always preceded by a separator
        my $separator = ($entryNo & 3) == 0 ? ",\n\t" : ', ';
        print $separator;
        PrintNonsingletonDecompTableEntry($Decomp{$code});
        $entryNo++;
        $totalBytes += 4;
    }
}
print "\n\t};\n\n";
|
530 |
|
# KNonSingletonFolds: the folds of the included decompositions, of the long
# excluded decompositions and of the characters with short decompositions
# but long folds, in that order, four pairs to a line.
print
    "// Table of folded decompositions which either have more than one UTF16, or\n",
    "// their normal decompositions have more than one UTF16\n",
    "// sizeof/2 = ".$hashIndexAfterShortDecompsLongFolds."\n",
    "const unsigned short KNonSingletonFolds[] =\n\t{\n\t";
{
    my $entryNo = 0;
    foreach my $code (@IncludedDecomps)
    {
        # no separator in front of the very first entry
        if ($entryNo != 0)
        {
            my $separator = ($entryNo & 3) == 0 ? ",\n\t" : ', ';
            print $separator;
        }
        PrintNonsingletonDecompTableEntry($Folded{$code});
        $entryNo++;
        $totalBytes += 4;
    }
    # entries of the remaining two lists are always preceded by a separator
    foreach my $code (@LongExcludedDecomps, @ShortDecompsLongFolds)
    {
        my $separator = ($entryNo & 3) == 0 ? ",\n\t" : ', ';
        print $separator;
        PrintNonsingletonDecompTableEntry($Folded{$code});
        $entryNo++;
        $totalBytes += 4;
    }
}
print "\n\t};\n\n";
|
560 |
|
# KSingletonDecompositions: one UTF16 value per character, eight to a line.
# The characters with short decompositions but long folds come first, then
# those with short decompositions and short folds. Each separator carries
# the "0x" prefix of the value that follows it; the prefix of the first
# value is part of the table opening.
print "// Table of singleton decompositions and characters with singleton folds\n";
print "// Note for Unicode 5.0:\n";
print "// Unicode 5.0 contains some non-BMP characters have non-BMP \"singleton\" folds.\n";
print "// As per the algorithm of this file, the non-BMP character should be stored in \n";
print "// this table. \"Unsigned short\" is not big enough to hold them. However, this \n";
print "// \"character\" information is not useful. So we just store 0xFFFF instead. \n";
print "// Please do check 0xFFFF when access this table. If meet 0xFFFF, that means \n";
print "// your character has no decomposition.\n";
print "// See the variable \"ShortDecompsLongFolds\" in FoldAndDecompTables.pl if you \n";
print "// want to know more.\n";
print "// sizeof = ".($hashIndexAfterShortDecompsShortFolds-$hashIndexAfterLongExcludeDecomps)."\n";
print "const unsigned short KSingletonDecompositions[] =\n\t{\n\t0x";
{
    my $entryNo = 0;
    foreach my $code (@ShortDecompsLongFolds, @ShortDecompsShortFolds)
    {
        if ($entryNo != 0)
        {
            my $separator = ($entryNo & 7) == 0 ? ",\n\t0x" : ', 0x';
            print $separator;
        }
        if (exists $Decomp{$code} && $Decomp{$code} ne '')
        {
            print $Decomp{$code};
        }
        else
        {
            # No decomposition to store: write the 0xFFFF marker described
            # in the generated comment. Don't take these 0xFFFF as character.
            print 'FFFF';
        }
        $entryNo++;
        # one unsigned short per entry
        $totalBytes += 2;
    }
}
print "\n\t};\n\n";
|
601 |
|
# KSingletonFolds: one UTF16 value per character, eight to a line. The
# characters with short decompositions and short folds come first, then
# those that only have a short fold. Each separator carries the "0x" prefix
# of the value that follows it; the prefix of the first value is part of the
# table opening.
print "// Table of singleton folds\n";
print "// sizeof = ".($hashIndexAfterShortFoldsOnly-$hashIndexAfterShortDecompsLongFolds)."\n";
print "const unsigned short KSingletonFolds[] =\n\t{\n\t0x";
{
    my $entryNo = 0;
    foreach my $code (@ShortDecompsShortFolds)
    {
        # no separator in front of the very first entry
        if ($entryNo != 0)
        {
            my $separator = ($entryNo & 7) == 0 ? ",\n\t0x" : ', 0x';
            print $separator;
        }
        print $Folded{$code};
        $entryNo++;
        # one unsigned short per entry
        $totalBytes += 2;
    }
    foreach my $code (@ShortFoldsOnly)
    {
        # entries of this list are always preceded by a separator
        my $separator = ($entryNo & 7) == 0 ? ",\n\t0x" : ', 0x';
        print $separator;
        print $Folded{$code};
        $entryNo++;
        # one unsigned short per entry
        $totalBytes += 2;
    }
}
print "\n\t};\n";

# Report the accumulated size of all tables, in the generated file and on
# STDERR.
print "\n// Total size: $totalBytes bytes\n";
print STDERR $totalBytes, " bytes\n";