author | John Imhofe <john.imhofe@nokia.com> |
Mon, 21 Dec 2009 16:14:42 +0000 | |
changeset 2 | 4122176ea935 |
parent 0 | a41df078684a |
permissions | -rw-r--r-- |
2
4122176ea935
Revision: 200948 + Removing redundant base integration tests and fixing build errors
John Imhofe <john.imhofe@nokia.com>
parents:
0
diff
changeset
|
1 |
# |
0 | 2 |
# Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies). |
3 |
# All rights reserved. |
|
4 |
# This component and the accompanying materials are made available |
|
2
4122176ea935
Revision: 200948 + Removing redundant base integration tests and fixing build errors
John Imhofe <john.imhofe@nokia.com>
parents:
0
diff
changeset
|
5 |
# under the terms of "Eclipse Public License v1.0" |
0 | 6 |
# which accompanies this distribution, and is available |
7 |
# at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 |
# |
|
9 |
# Initial Contributors: |
|
10 |
# Nokia Corporation - initial contribution. |
|
11 |
# |
|
12 |
# Contributors: |
|
13 |
# |
|
2
4122176ea935
Revision: 200948 + Removing redundant base integration tests and fixing build errors
John Imhofe <john.imhofe@nokia.com>
parents:
0
diff
changeset
|
14 |
# Description: |
0 | 15 |
# Case Equivalence |
16 |
# Given the unicode data file, work out the case equivalence classes |
|
17 |
# i.e. the equivalence classes for the transitive closure of ~ defined as |
|
18 |
# follows: |
|
19 |
# a~b if Uppercase(a) == b || Lowercase(a) == b || Titlecase(a) == b |
|
20 |
# Usage: perl CaseEquivalence <UnicodeData.txt |
|
21 |
# |
|
22 |
||
23 |
use strict; |
|
24 |
my @Name = (); |
|
25 |
my @Upper = (); |
|
26 |
my @Lower = (); |
|
27 |
my @Title = (); |
|
28 |
# $DecompositionValue[$code] is undefined if $code has no decomposition |
|
29 |
# sequence, if it has a single value decomposition sequence, then this is it, |
|
30 |
# if it has a longer sequence, the value is -1 |
|
31 |
my @DecompositionValue = (); |
|
32 |
# 1 for each code that has a differently-cased version, |
|
33 |
# 2 for each code that is a lower-case version of something else. |
|
34 |
my %Codes = (); |
|
35 |
my %CaseClass = (); |
|
36 |
||
37 |
# Command-line options |
|
38 |
my $OptionOutputTrie = 1; |
|
39 |
my $OptionOutputForwardMapping = 0; |
|
40 |
my $OptionOutputReverseMapping = 0; |
|
41 |
my $OptionIgnoreOneToOneReverseMappings = 0; |
|
42 |
my $OptionIncludeExtraMappings = 1; |
|
43 |
||
44 |
foreach my $optionString (@ARGV) |
|
45 |
{ |
|
46 |
if ($optionString =~ m![/-]o[tfrm]!) |
|
47 |
{ |
|
48 |
$OptionOutputTrie = 0; |
|
49 |
my $option = substr($optionString, 2, 1); |
|
50 |
if ($option eq 'f') |
|
51 |
{ |
|
52 |
$OptionOutputForwardMapping = 1; |
|
53 |
} |
|
54 |
elsif ($option eq 'r') |
|
55 |
{ |
|
56 |
$OptionOutputReverseMapping = 1; |
|
57 |
} |
|
58 |
elsif ($option eq 'm') |
|
59 |
{ |
|
60 |
$OptionOutputReverseMapping = 1; |
|
61 |
$OptionIgnoreOneToOneReverseMappings = 1; |
|
62 |
} |
|
63 |
else |
|
64 |
{ |
|
65 |
$OptionOutputTrie = 1; |
|
66 |
} |
|
67 |
} |
|
68 |
elsif ($optionString =~ m![/-]s!) |
|
69 |
{ |
|
70 |
$OptionIncludeExtraMappings = 0; |
|
71 |
} |
|
72 |
else |
|
73 |
{ |
|
74 |
print STDERR "Usage: perl CaseEquivalence [-o<mapping>] [-s]\nusing standard input and output streams.\n"; |
|
75 |
print STDERR "<mapping> is one of:\nt: output C++ code giving a trie for folding case. Each trie level is 4 bits.\n"; |
|
76 |
print STDERR "f: Give a list of all codes that need mapping and what they map to.\n"; |
|
77 |
print STDERR "r: Give a list of all codes are mapped to and what maps to them.\n"; |
|
78 |
print STDERR "m: Give a list of all codes are mapped to by more than one code.\n"; |
|
79 |
print STDERR "\nOmitting the -s option adds the following case-equivalence:\nSpace = Non-breaking space\n"; |
|
80 |
exit; |
|
81 |
} |
|
82 |
} |
|
83 |
||
84 |
# set a code as being part of a non-unitary case-equivalence class. |
|
85 |
sub add |
|
86 |
{ |
|
87 |
my ($addition) = @_; |
|
88 |
if (!$Codes{$addition}) |
|
89 |
{ |
|
90 |
$Codes{$addition} = 1; |
|
91 |
} |
|
92 |
} |
|
93 |
||
94 |
# make a code point to its final case varient |
|
95 |
sub chaseDown |
|
96 |
{ |
|
97 |
my ($codeVal) = @_; |
|
98 |
my $class = $codeVal; |
|
99 |
while ($CaseClass{$class}) |
|
100 |
{ |
|
101 |
$class = $CaseClass{$class}; |
|
102 |
} |
|
103 |
$CaseClass{$codeVal} = $class unless $codeVal == $class; |
|
104 |
return $class; |
|
105 |
} |
|
106 |
||
107 |
# link two codes together as being part of the same case-equivalence class |
|
108 |
sub makeEquivalent |
|
109 |
{ |
|
110 |
my ($left, $right) = @_; |
|
111 |
if (!$left || !$right) |
|
112 |
{ |
|
113 |
return; |
|
114 |
} |
|
115 |
$left = chaseDown($left); |
|
116 |
$right = chaseDown($right); |
|
117 |
if ($Codes{$left} < $Codes{$right}) |
|
118 |
{ |
|
119 |
$CaseClass{$left} = $right; |
|
120 |
return; |
|
121 |
} |
|
122 |
if ($Codes{$right} < $Codes{$left}) |
|
123 |
{ |
|
124 |
$CaseClass{$right} = $left; |
|
125 |
return; |
|
126 |
} |
|
127 |
if ($left < $right) |
|
128 |
{ |
|
129 |
$CaseClass{$right} = $left; |
|
130 |
return; |
|
131 |
} |
|
132 |
if ($right < $left) |
|
133 |
{ |
|
134 |
$CaseClass{$left} = $right; |
|
135 |
return; |
|
136 |
} |
|
137 |
# $left == $right.. do nothing |
|
138 |
return; |
|
139 |
} |
|
140 |
||
141 |
# Link possibly unmentioned codes together. The first one is considered lower-case |
|
142 |
sub addEquivalenceClass |
|
143 |
{ |
|
144 |
my ($lower, @rest) = @_; |
|
145 |
$Codes{$lower} = 2; |
|
146 |
foreach my $one (@rest) |
|
147 |
{ |
|
148 |
$Codes{$one} = 1; |
|
149 |
makeEquivalent($lower, $one); |
|
150 |
} |
|
151 |
} |
|
152 |
||
153 |
# Firstly we read in the data |
|
154 |
while(<STDIN>) |
|
155 |
{ |
|
156 |
my @line = split('#', $_, 1); |
|
157 |
my @fields = split(/;/, $line[0]); |
|
158 |
my @decomposition = split(' ', $fields[5]); |
|
159 |
if (1 < scalar(@fields)) |
|
160 |
{ |
|
161 |
my $codeVal = hex($fields[0]); |
|
162 |
# if the character has a non-compatibility decomposition sequence, record this fact. |
|
163 |
if (0 < scalar(@decomposition)) |
|
164 |
{ |
|
165 |
my $decompositionType = ""; |
|
166 |
if ($decomposition[0] =~ m/<[a-zA-Z0-9]+>/) |
|
167 |
{ |
|
168 |
$decompositionType = shift @decomposition; |
|
169 |
} |
|
170 |
if ($decompositionType !~ m/compat/i) |
|
171 |
{ |
|
172 |
$DecompositionValue[$codeVal] = scalar(@decomposition) == 1? hex($decomposition[0]) : -1; |
|
173 |
} |
|
174 |
} |
|
175 |
$Name[$codeVal] = $fields[1]; |
|
176 |
my $upperval = $fields[12]; |
|
177 |
my $lowerval = $fields[13]; |
|
178 |
my $titleval = $fields[14]; |
|
179 |
||
180 |
# strip whitespace from the end of the string |
|
181 |
$titleval =~ s/\s+$//; |
|
182 |
if ($upperval) |
|
183 |
{ |
|
184 |
$upperval = hex($upperval); |
|
185 |
$Upper[$codeVal] = $upperval; |
|
186 |
add $codeVal; |
|
187 |
add $upperval; |
|
188 |
} |
|
189 |
if ($titleval) |
|
190 |
{ |
|
191 |
$titleval = hex($titleval); |
|
192 |
$Title[$codeVal] = $titleval; |
|
193 |
add $codeVal; |
|
194 |
add $titleval; |
|
195 |
} |
|
196 |
if ($lowerval) |
|
197 |
{ |
|
198 |
$lowerval = hex($lowerval); |
|
199 |
$Lower[$codeVal] = $lowerval; |
|
200 |
add $codeVal; |
|
201 |
$Codes{$lowerval} = 2; |
|
202 |
} |
|
203 |
} |
|
204 |
} |
|
205 |
||
206 |
# Remove all codes that decompose to a sequence |
|
207 |
foreach my $codeVal (keys(%Codes)) |
|
208 |
{ |
|
209 |
my $current = $DecompositionValue[$codeVal]; |
|
210 |
while ($current && 0 < $current) |
|
211 |
{ |
|
212 |
$current = $DecompositionValue[$current]; |
|
213 |
} |
|
214 |
if ($current && $current == -1) |
|
215 |
{ |
|
216 |
delete $Codes{$codeVal}; |
|
217 |
} |
|
218 |
} |
|
219 |
||
220 |
# Next we form the equivalence classes. |
|
221 |
if ($OptionIncludeExtraMappings) |
|
222 |
{ |
|
223 |
# space = non-breaking space |
|
224 |
addEquivalenceClass(0x20, 0xA0); |
|
225 |
} |
|
226 |
# We try to end up with everything being equivalent to a lower case letter |
|
227 |
foreach my $codeVal (keys(%Codes)) |
|
228 |
{ |
|
229 |
makeEquivalent($codeVal, $Lower[$codeVal]); |
|
230 |
makeEquivalent($codeVal, $Upper[$codeVal]); |
|
231 |
makeEquivalent($codeVal, $Title[$codeVal]); |
|
232 |
} |
|
233 |
||
234 |
# Next we chase each pointer in CaseClass down to its final result |
|
235 |
foreach my $codeVal (keys(%CaseClass)) |
|
236 |
{ |
|
237 |
chaseDown($codeVal); |
|
238 |
} |
|
239 |
||
240 |
# Now output the results in order, and collect the raw data |
|
241 |
my @Offset = (); |
|
242 |
my $oldCodeCount = 0; |
|
243 |
foreach my $codeVal (sort {$a <=> $b} keys(%CaseClass)) |
|
244 |
{ |
|
245 |
my $class = $CaseClass{$codeVal}; |
|
246 |
my $offset = $class - $codeVal; |
|
247 |
if ($OptionOutputForwardMapping) |
|
248 |
{ |
|
249 |
printf "%x %d\t\t%s => %s\n", $codeVal, $offset, $Name[$codeVal], $Name[$class]; |
|
250 |
} |
|
251 |
while ($oldCodeCount != $codeVal) |
|
252 |
{ |
|
253 |
$Offset[$oldCodeCount] = 0; |
|
254 |
$oldCodeCount++; |
|
255 |
} |
|
256 |
$oldCodeCount++; |
|
257 |
$Offset[$codeVal] = $offset; |
|
258 |
} |
|
259 |
||
260 |
if ($OptionOutputReverseMapping) |
|
261 |
{ |
|
262 |
my %ReverseMapping = (); |
|
263 |
foreach my $codeVal (keys(%CaseClass)) |
|
264 |
{ |
|
265 |
my $mapsTo = $CaseClass{$codeVal}; |
|
266 |
if (!$ReverseMapping{$mapsTo}) |
|
267 |
{ |
|
268 |
$ReverseMapping{$mapsTo} = [$codeVal]; |
|
269 |
} |
|
270 |
else |
|
271 |
{ |
|
272 |
push (@{ $ReverseMapping{$mapsTo} }, $codeVal); |
|
273 |
} |
|
274 |
} |
|
275 |
foreach my $mapVal (sort {$a <=> $b} keys(%ReverseMapping)) |
|
276 |
{ |
|
277 |
next if ($OptionIgnoreOneToOneReverseMappings && scalar(@{$ReverseMapping{$mapVal}}) == 1); |
|
278 |
printf("%x: %s <=", $mapVal, $Name[$mapVal]); |
|
279 |
my $firstTime = 1; |
|
280 |
foreach my $val ( @{ $ReverseMapping{$mapVal} } ) |
|
281 |
{ |
|
282 |
if (!$firstTime) |
|
283 |
{ |
|
284 |
print ','; |
|
285 |
} |
|
286 |
$firstTime = 0; |
|
287 |
printf(" %s:%x", $Name[$val], $val); |
|
288 |
} |
|
289 |
print "\n"; |
|
290 |
} |
|
291 |
} |
|
292 |
||
293 |
# does the array 2 match array 1? Match the shorter array against the prefix of |
|
294 |
# the other array |
|
295 |
sub arraysMatch |
|
296 |
{ |
|
297 |
my ($left, $right, $leftpos) = @_; |
|
298 |
my $last = scalar(@$left) - $leftpos; |
|
299 |
if (scalar(@$right) < $last) |
|
300 |
{ |
|
301 |
$last = scalar(@$right); |
|
302 |
} |
|
303 |
my $pos = 0; |
|
304 |
while ($pos < $last) |
|
305 |
{ |
|
306 |
if ($$left[$pos + $leftpos] != $$right[$pos]) |
|
307 |
{ |
|
308 |
return 0; |
|
309 |
} |
|
310 |
$pos++; |
|
311 |
} |
|
312 |
return 1; |
|
313 |
} |
|
314 |
||
315 |
# find a match for array 2 in array 1, allowing values past the end of array 1 |
|
316 |
# to match anything in array 1 |
|
317 |
sub findMatch |
|
318 |
{ |
|
319 |
my ($candidate, $term) = @_; |
|
320 |
my $pos = 0; |
|
321 |
while (!arraysMatch($candidate, $term, $pos)) |
|
322 |
{ |
|
323 |
$pos++; |
|
324 |
} |
|
325 |
return $pos; |
|
326 |
} |
|
327 |
||
328 |
# add the data in array 2 to array 1, returning the position they went in. |
|
329 |
sub addArray |
|
330 |
{ |
|
331 |
my ($candidate, $addition) = @_; |
|
332 |
my $pos = findMatch($candidate, $addition); |
|
333 |
# add any required on to the end of the candidate block |
|
334 |
my $last = $pos + scalar(@$addition); |
|
335 |
my $additionPos = scalar(@$candidate) - $pos; |
|
336 |
while ($pos + $additionPos < $last) |
|
337 |
{ |
|
338 |
$$candidate[$pos + $additionPos] = $$addition[$additionPos]; |
|
339 |
$additionPos++; |
|
340 |
} |
|
341 |
return $pos; |
|
342 |
} |
|
343 |
||
344 |
# create data block 1 and indices 2 from data 3 and block size 4 |
|
345 |
sub createTrieLevel |
|
346 |
{ |
|
347 |
my ($data, $indices, $input, $blockSize) = @_; |
|
348 |
my $block = 0; |
|
349 |
while ($block * $blockSize < scalar(@$input)) |
|
350 |
{ |
|
351 |
my $start = $block * $blockSize; |
|
352 |
my $end = $start + $blockSize; |
|
353 |
my $currentBlockSize = $blockSize; |
|
354 |
if (scalar(@$input) < $end) |
|
355 |
{ |
|
356 |
$end = scalar(@$input); |
|
357 |
$currentBlockSize = $end - $start; |
|
358 |
} |
|
359 |
my @currentBlock = @$input[$start..($end - 1)]; |
|
360 |
while ($currentBlockSize != $blockSize) |
|
361 |
{ |
|
362 |
$currentBlock[$currentBlockSize] = 0; |
|
363 |
$currentBlockSize++; |
|
364 |
} |
|
365 |
$$indices[$block] = addArray($data, \@currentBlock); |
|
366 |
$block++; |
|
367 |
} |
|
368 |
} |
|
369 |
||
370 |
sub OutputArray |
|
371 |
{ |
|
372 |
my $index = 0; |
|
373 |
my $firstTime = 1; |
|
374 |
while ($index != scalar(@_)) |
|
375 |
{ |
|
376 |
if (!$firstTime) |
|
377 |
{ |
|
378 |
if ($index % 8) |
|
379 |
{ |
|
380 |
print ', '; |
|
381 |
} |
|
382 |
else |
|
383 |
{ |
|
384 |
print ",\n\t"; |
|
385 |
} |
|
386 |
} |
|
387 |
else |
|
388 |
{ |
|
389 |
print "\t"; |
|
390 |
$firstTime = 0; |
|
391 |
} |
|
392 |
print($_[$index]); |
|
393 |
$index++; |
|
394 |
} |
|
395 |
print "\n"; |
|
396 |
} |
|
397 |
||
398 |
if ($OptionOutputTrie) |
|
399 |
{ |
|
400 |
my @Trie0 = (); |
|
401 |
my @Index0 = (); |
|
402 |
my @Trie1 = (); |
|
403 |
my @Index1 = (); |
|
404 |
my @Trie2 = (); |
|
405 |
my @Index2 = (); |
|
406 |
createTrieLevel(\@Trie0, \@Index0, \@Offset, 16); |
|
407 |
createTrieLevel(\@Trie1, \@Index1, \@Index0, 16); |
|
408 |
createTrieLevel(\@Trie2, \@Index2, \@Index1, 16); |
|
409 |
print "// Use the bits from 12 up from your character to index CaseFoldTable0.\n"; |
|
410 |
print "// Use the result of this plus bits 8-11 to index CaseFoldTable1.\n"; |
|
411 |
print "// Use the result of this plus bits 4-7 to index CaseFoldTable2.\n"; |
|
412 |
print "// Use the result of this plus bits 0-3 to index CaseFoldTable3.\n"; |
|
413 |
print "// Add the result of this to your character to fold it.\n\n"; |
|
414 |
print "static const short CaseFoldTable3[] =\n\t{\n"; |
|
415 |
OutputArray(@Trie0); |
|
416 |
print "\t};\n\nstatic const unsigned short CaseFoldTable2[] =\n\t{\n"; |
|
417 |
OutputArray(@Trie1); |
|
418 |
print "\t};\n\nstatic const unsigned char CaseFoldTable1[] =\n\t{\n"; |
|
419 |
OutputArray(@Trie2); |
|
420 |
print "\t};\n\nstatic const unsigned char CaseFoldTable0[] =\n\t{\n"; |
|
421 |
OutputArray(@Index2); |
|
422 |
print "\t};\n"; |
|
423 |
} |