localisation/localesupport/OtherTools/CaseEquivalence.pl
changeset 31 56f325a607ea
parent 15 4122176ea935
child 32 c9417927a896
child 33 0173bcd7697c
equal deleted inserted replaced
15:4122176ea935 31:56f325a607ea
     1 #
       
     2 # Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     3 # All rights reserved.
       
     4 # This component and the accompanying materials are made available
       
     5 # under the terms of "Eclipse Public License v1.0"
       
     6 # which accompanies this distribution, and is available
       
     7 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 #
       
     9 # Initial Contributors:
       
    10 # Nokia Corporation - initial contribution.
       
    11 #
       
    12 # Contributors:
       
    13 #
       
    14 # Description: 
       
    15 # Case Equivalence
       
    16 # Given the unicode data file, work out the case equivalence classes
       
    17 # i.e. the equivalence classes for the transitive closure of ~ defined as
       
    18 # follows:
       
    19 # a~b if Uppercase(a) == b || Lowercase(a) == b || Titlecase(a) == b
       
    20 # Usage: perl CaseEquivalence <UnicodeData.txt
       
    21 #
       
    22 
       
    23 use strict;
       
    24 my @Name = ();
       
    25 my @Upper = ();
       
    26 my @Lower = ();
       
    27 my @Title = ();
       
    28 # $DecompositionValue[$code] is undefined if $code has no decomposition
       
    29 # sequence, if it has a single value decomposition sequence, then this is it,
       
    30 # if it has a longer sequence, the value is -1
       
    31 my @DecompositionValue = ();
       
    32 # 1 for each code that has a differently-cased version,
       
    33 # 2 for each code that is a lower-case version of something else.
       
    34 my %Codes = ();
       
    35 my %CaseClass = ();
       
    36 
       
    37 # Command-line options
       
    38 my $OptionOutputTrie = 1;
       
    39 my $OptionOutputForwardMapping = 0;
       
    40 my $OptionOutputReverseMapping = 0;
       
    41 my $OptionIgnoreOneToOneReverseMappings = 0;
       
    42 my $OptionIncludeExtraMappings = 1;
       
    43 
       
    44 foreach my $optionString (@ARGV)
       
    45 	{
       
    46 	if ($optionString =~ m![/-]o[tfrm]!)
       
    47 		{
       
    48 		$OptionOutputTrie = 0;
       
    49 		my $option = substr($optionString, 2, 1);
       
    50 		if ($option eq 'f')
       
    51 			{
       
    52 			$OptionOutputForwardMapping = 1;
       
    53 			}
       
    54 		elsif ($option eq 'r')
       
    55 			{
       
    56 			$OptionOutputReverseMapping = 1;
       
    57 			}
       
    58 		elsif ($option eq 'm')
       
    59 			{
       
    60 			$OptionOutputReverseMapping = 1;
       
    61 			$OptionIgnoreOneToOneReverseMappings = 1;
       
    62 			}
       
    63 		else
       
    64 			{
       
    65 			$OptionOutputTrie = 1;
       
    66 			}
       
    67 		}
       
    68 	elsif ($optionString =~ m![/-]s!)
       
    69 		{
       
    70 		$OptionIncludeExtraMappings = 0;
       
    71 		}
       
    72 	else
       
    73 		{
       
    74 		print STDERR "Usage: perl CaseEquivalence [-o<mapping>] [-s]\nusing standard input and output streams.\n";
       
    75 		print STDERR "<mapping> is one of:\nt: output C++ code giving a trie for folding case. Each trie level is 4 bits.\n";
       
    76 		print STDERR "f: Give a list of all codes that need mapping and what they map to.\n";
       
    77 		print STDERR "r: Give a list of all codes are mapped to and what maps to them.\n";
       
    78 		print STDERR "m: Give a list of all codes are mapped to by more than one code.\n";
       
    79 		print STDERR "\nOmitting the -s option adds the following case-equivalence:\nSpace = Non-breaking space\n";
       
    80 		exit;
       
    81 		}
       
    82 	}
       
    83 
       
    84 # set a code as being part of a non-unitary case-equivalence class.
       
    85 sub add
       
    86 	{
       
    87 	my ($addition) = @_;
       
    88 	if (!$Codes{$addition})
       
    89 		{
       
    90 		$Codes{$addition} = 1;
       
    91 		}
       
    92 	}
       
    93 
       
    94 # make a code point to its final case varient
       
    95 sub chaseDown
       
    96 	{
       
    97 	my ($codeVal) = @_;
       
    98 	my $class = $codeVal;
       
    99 	while ($CaseClass{$class})
       
   100 		{
       
   101 		$class = $CaseClass{$class};
       
   102 		}
       
   103 	$CaseClass{$codeVal} = $class unless $codeVal == $class;
       
   104 	return $class;
       
   105 	}
       
   106 
       
   107 # link two codes together as being part of the same case-equivalence class
       
   108 sub makeEquivalent
       
   109 	{
       
   110 	my ($left, $right) = @_;
       
   111 	if (!$left || !$right)
       
   112 		{
       
   113 		return;
       
   114 		}
       
   115 	$left = chaseDown($left);
       
   116 	$right = chaseDown($right);
       
   117 	if ($Codes{$left} < $Codes{$right})
       
   118 		{
       
   119 		$CaseClass{$left} = $right;
       
   120 		return;
       
   121 		}
       
   122 	if ($Codes{$right} < $Codes{$left})
       
   123 		{
       
   124 		$CaseClass{$right} = $left;
       
   125 		return;
       
   126 		}
       
   127 	if ($left < $right)
       
   128 		{
       
   129 		$CaseClass{$right} = $left;
       
   130 		return;
       
   131 		}
       
   132 	if ($right < $left)
       
   133 		{
       
   134 		$CaseClass{$left} = $right;
       
   135 		return;
       
   136 		}
       
   137 	# $left == $right.. do nothing
       
   138 	return;
       
   139 	}
       
   140 
       
   141 # Link possibly unmentioned codes together. The first one is considered lower-case
       
   142 sub addEquivalenceClass
       
   143 	{
       
   144 	my ($lower, @rest) = @_;
       
   145 	$Codes{$lower} = 2;
       
   146 	foreach my $one (@rest)
       
   147 		{
       
   148 		$Codes{$one} = 1;
       
   149 		makeEquivalent($lower, $one);
       
   150 		}
       
   151 	}
       
   152 
       
   153 # Firstly we read in the data
       
   154 while(<STDIN>)
       
   155 	{
       
   156 	my @line = split('#', $_, 1);
       
   157 	my @fields = split(/;/, $line[0]);
       
   158 	my @decomposition = split(' ', $fields[5]);
       
   159 	if (1 < scalar(@fields))
       
   160 		{
       
   161 		my $codeVal = hex($fields[0]);
       
   162 		# if the character has a non-compatibility decomposition sequence, record this fact.
       
   163 		if (0 < scalar(@decomposition))
       
   164 			{
       
   165 			my $decompositionType = "";
       
   166 			if ($decomposition[0] =~ m/<[a-zA-Z0-9]+>/)
       
   167 				{
       
   168 				$decompositionType = shift @decomposition;
       
   169 				}
       
   170 			if ($decompositionType !~ m/compat/i)
       
   171 				{
       
   172 				$DecompositionValue[$codeVal] = scalar(@decomposition) == 1? hex($decomposition[0]) : -1;
       
   173 				}
       
   174 			}
       
   175 		$Name[$codeVal] = $fields[1];
       
   176 		my $upperval = $fields[12];
       
   177 		my $lowerval = $fields[13];
       
   178 		my $titleval = $fields[14];
       
   179 
       
   180 		# strip whitespace from the end of the string
       
   181 		$titleval =~ s/\s+$//;
       
   182 		if ($upperval)
       
   183 			{
       
   184 			$upperval = hex($upperval);
       
   185 			$Upper[$codeVal] = $upperval;
       
   186 			add $codeVal;
       
   187 			add $upperval;
       
   188 			}
       
   189 		if ($titleval)
       
   190 			{
       
   191 			$titleval = hex($titleval);
       
   192 			$Title[$codeVal] = $titleval;
       
   193 			add $codeVal;
       
   194 			add $titleval;
       
   195 			}
       
   196 		if ($lowerval)
       
   197 			{
       
   198 			$lowerval = hex($lowerval);
       
   199 			$Lower[$codeVal] = $lowerval;
       
   200 			add $codeVal;
       
   201 			$Codes{$lowerval} = 2;
       
   202 			}
       
   203 		}
       
   204 	}
       
   205 
       
   206 # Remove all codes that decompose to a sequence
       
   207 foreach my $codeVal (keys(%Codes))
       
   208 	{
       
   209 	my $current = $DecompositionValue[$codeVal];
       
   210 	while ($current && 0 < $current)
       
   211 		{
       
   212 		$current = $DecompositionValue[$current];
       
   213 		}
       
   214 	if ($current && $current == -1)
       
   215 		{
       
   216 		delete $Codes{$codeVal};
       
   217 		}
       
   218 	}
       
   219 
       
   220 # Next we form the equivalence classes.
       
   221 if ($OptionIncludeExtraMappings)
       
   222 	{
       
   223 	# space = non-breaking space
       
   224 	addEquivalenceClass(0x20, 0xA0);
       
   225 	}
       
   226 # We try to end up with everything being equivalent to a lower case letter
       
   227 foreach my $codeVal (keys(%Codes))
       
   228 	{
       
   229 	makeEquivalent($codeVal, $Lower[$codeVal]);
       
   230 	makeEquivalent($codeVal, $Upper[$codeVal]);
       
   231 	makeEquivalent($codeVal, $Title[$codeVal]);
       
   232 	}
       
   233 
       
   234 # Next we chase each pointer in CaseClass down to its final result
       
   235 foreach my $codeVal (keys(%CaseClass))
       
   236 	{
       
   237 	chaseDown($codeVal);
       
   238 	}
       
   239 
       
   240 # Now output the results in order, and collect the raw data
       
   241 my @Offset = ();
       
   242 my $oldCodeCount = 0;
       
   243 foreach my $codeVal (sort {$a <=> $b} keys(%CaseClass))
       
   244 	{
       
   245 	my $class = $CaseClass{$codeVal};
       
   246 	my $offset = $class - $codeVal;
       
   247 	if ($OptionOutputForwardMapping)
       
   248 		{
       
   249 		printf "%x %d\t\t%s => %s\n", $codeVal, $offset, $Name[$codeVal], $Name[$class];
       
   250 		}
       
   251 	while ($oldCodeCount != $codeVal)
       
   252 		{
       
   253 		$Offset[$oldCodeCount] = 0;
       
   254 		$oldCodeCount++;
       
   255 		}
       
   256 	$oldCodeCount++;
       
   257 	$Offset[$codeVal] = $offset;
       
   258 	}
       
   259 
       
   260 if ($OptionOutputReverseMapping)
       
   261 	{
       
   262 	my %ReverseMapping = ();
       
   263 	foreach my $codeVal (keys(%CaseClass))
       
   264 		{
       
   265 		my $mapsTo = $CaseClass{$codeVal};
       
   266 		if (!$ReverseMapping{$mapsTo})
       
   267 			{
       
   268 			$ReverseMapping{$mapsTo} = [$codeVal];
       
   269 			}
       
   270 		else
       
   271 			{
       
   272 			push (@{ $ReverseMapping{$mapsTo} }, $codeVal);
       
   273 			}
       
   274 		}
       
   275 	foreach my $mapVal (sort {$a <=> $b} keys(%ReverseMapping))
       
   276 		{
       
   277 		next if ($OptionIgnoreOneToOneReverseMappings && scalar(@{$ReverseMapping{$mapVal}}) == 1);
       
   278 		printf("%x: %s <=", $mapVal, $Name[$mapVal]);
       
   279 		my $firstTime = 1;
       
   280 		foreach my $val ( @{ $ReverseMapping{$mapVal} } )
       
   281 			{
       
   282 			if (!$firstTime)
       
   283 				{
       
   284 				print ',';
       
   285 				}
       
   286 			$firstTime = 0;
       
   287 			printf(" %s:%x", $Name[$val], $val);
       
   288 			}
       
   289 		print "\n";
       
   290 		}
       
   291 	}
       
   292 
       
   293 # does the array 2 match array 1? Match the shorter array against the prefix of
       
   294 # the other array
       
   295 sub arraysMatch
       
   296 	{
       
   297 	my ($left, $right, $leftpos) = @_;
       
   298 	my $last = scalar(@$left) - $leftpos;
       
   299 	if (scalar(@$right) < $last)
       
   300 		{
       
   301 		$last = scalar(@$right);
       
   302 		}
       
   303 	my $pos = 0;
       
   304 	while ($pos < $last)
       
   305 		{
       
   306 		if ($$left[$pos + $leftpos] != $$right[$pos])
       
   307 			{
       
   308 			return 0;
       
   309 			}
       
   310 		$pos++;
       
   311 		}
       
   312 	return 1;
       
   313 	}
       
   314 
       
   315 # find a match for array 2 in array 1, allowing values past the end of array 1
       
   316 # to match anything in array 1
       
   317 sub findMatch
       
   318 	{
       
   319 	my ($candidate, $term) = @_;
       
   320 	my $pos = 0;
       
   321 	while (!arraysMatch($candidate, $term, $pos))
       
   322 		{
       
   323 		$pos++;
       
   324 		}
       
   325 	return $pos;
       
   326 	}
       
   327 
       
   328 # add the data in array 2 to array 1, returning the position they went in.
       
   329 sub addArray
       
   330 	{
       
   331 	my ($candidate, $addition) = @_;
       
   332 	my $pos = findMatch($candidate, $addition);
       
   333 	# add any required on to the end of the candidate block
       
   334 	my $last = $pos + scalar(@$addition);
       
   335 	my $additionPos = scalar(@$candidate) - $pos;
       
   336 	while ($pos + $additionPos < $last)
       
   337 		{
       
   338 		$$candidate[$pos + $additionPos] = $$addition[$additionPos];
       
   339 		$additionPos++;
       
   340 		}
       
   341 	return $pos;
       
   342 	}
       
   343 
       
   344 # create data block 1 and indices 2 from data 3 and block size 4
       
   345 sub createTrieLevel
       
   346 	{
       
   347 	my ($data, $indices, $input, $blockSize) = @_;
       
   348 	my $block = 0;
       
   349 	while ($block * $blockSize < scalar(@$input))
       
   350 		{
       
   351 		my $start = $block * $blockSize;
       
   352 		my $end = $start + $blockSize;
       
   353 		my $currentBlockSize = $blockSize;
       
   354 		if (scalar(@$input) < $end)
       
   355 			{
       
   356 			$end = scalar(@$input);
       
   357 			$currentBlockSize = $end - $start;
       
   358 			}
       
   359 		my @currentBlock = @$input[$start..($end - 1)];
       
   360 		while ($currentBlockSize != $blockSize)
       
   361 			{
       
   362 			$currentBlock[$currentBlockSize] = 0;
       
   363 			$currentBlockSize++;
       
   364 			}
       
   365 		$$indices[$block] = addArray($data, \@currentBlock);
       
   366 		$block++;
       
   367 		}
       
   368 	}
       
   369 
       
   370 sub OutputArray
       
   371 	{
       
   372 	my $index = 0;
       
   373 	my $firstTime = 1;
       
   374 	while ($index != scalar(@_))
       
   375 		{
       
   376 		if (!$firstTime)
       
   377 			{
       
   378 			if ($index % 8)
       
   379 				{
       
   380 				print ', ';
       
   381 				}
       
   382 			else
       
   383 				{
       
   384 				print ",\n\t";
       
   385 				}
       
   386 			}
       
   387 		else
       
   388 			{
       
   389 			print "\t";
       
   390 			$firstTime = 0;
       
   391 			}
       
   392 		print($_[$index]);
       
   393 		$index++;
       
   394 		}
       
   395 	print "\n";
       
   396 	}
       
   397 
       
   398 if ($OptionOutputTrie)
       
   399 	{
       
   400 	my @Trie0 = ();
       
   401 	my @Index0 = ();
       
   402 	my @Trie1 = ();
       
   403 	my @Index1 = ();
       
   404 	my @Trie2 = ();
       
   405 	my @Index2 = ();
       
   406 	createTrieLevel(\@Trie0, \@Index0, \@Offset, 16);
       
   407 	createTrieLevel(\@Trie1, \@Index1, \@Index0, 16);
       
   408 	createTrieLevel(\@Trie2, \@Index2, \@Index1, 16);
       
   409 	print "// Use the bits from 12 up from your character to index CaseFoldTable0.\n";
       
   410 	print "// Use the result of this plus bits 8-11 to index CaseFoldTable1.\n";
       
   411 	print "// Use the result of this plus bits 4-7 to index CaseFoldTable2.\n";
       
   412 	print "// Use the result of this plus bits 0-3 to index CaseFoldTable3.\n";
       
   413 	print "// Add the result of this to your character to fold it.\n\n";
       
   414 	print "static const short CaseFoldTable3[] =\n\t{\n";
       
   415 	OutputArray(@Trie0);
       
   416 	print "\t};\n\nstatic const unsigned short CaseFoldTable2[] =\n\t{\n";
       
   417 	OutputArray(@Trie1);
       
   418 	print "\t};\n\nstatic const unsigned char CaseFoldTable1[] =\n\t{\n";
       
   419 	OutputArray(@Trie2);
       
   420 	print "\t};\n\nstatic const unsigned char CaseFoldTable0[] =\n\t{\n";
       
   421 	OutputArray(@Index2);
       
   422 	print "\t};\n";
       
   423 	}