localisation/localesupport/OtherTools/CaseEquivalence.pl
changeset 0 a41df078684a
child 2 4122176ea935
equal deleted inserted replaced
-1:000000000000 0:a41df078684a
       
     1 
       
     2 # Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     3 # All rights reserved.
       
     4 # This component and the accompanying materials are made available
       
     5 # under the terms of the License "Eclipse Public License v1.0"
       
     6 # which accompanies this distribution, and is available
       
     7 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 #
       
     9 # Initial Contributors:
       
    10 # Nokia Corporation - initial contribution.
       
    11 #
       
    12 # Contributors:
       
    13 #
       
    14 # Description:
       
    15 # Case Equivalence
       
    16 # Given the unicode data file, work out the case equivalence classes
       
    17 # i.e. the equivalence classes for the transitive closure of ~ defined as
       
    18 # follows:
       
    19 # a~b if Uppercase(a) == b || Lowercase(a) == b || Titlecase(a) == b
       
    20 # Usage: perl CaseEquivalence <UnicodeData.txt
       
    21 # 
       
    22 #
       
    23 
       
    24 use strict;
       
    25 my @Name = ();
       
    26 my @Upper = ();
       
    27 my @Lower = ();
       
    28 my @Title = ();
       
    29 # $DecompositionValue[$code] is undefined if $code has no decomposition
       
    30 # sequence, if it has a single value decomposition sequence, then this is it,
       
    31 # if it has a longer sequence, the value is -1
       
    32 my @DecompositionValue = ();
       
    33 # 1 for each code that has a differently-cased version,
       
    34 # 2 for each code that is a lower-case version of something else.
       
    35 my %Codes = ();
       
    36 my %CaseClass = ();
       
    37 
       
    38 # Command-line options
       
    39 my $OptionOutputTrie = 1;
       
    40 my $OptionOutputForwardMapping = 0;
       
    41 my $OptionOutputReverseMapping = 0;
       
    42 my $OptionIgnoreOneToOneReverseMappings = 0;
       
    43 my $OptionIncludeExtraMappings = 1;
       
    44 
       
    45 foreach my $optionString (@ARGV)
       
    46 	{
       
    47 	if ($optionString =~ m![/-]o[tfrm]!)
       
    48 		{
       
    49 		$OptionOutputTrie = 0;
       
    50 		my $option = substr($optionString, 2, 1);
       
    51 		if ($option eq 'f')
       
    52 			{
       
    53 			$OptionOutputForwardMapping = 1;
       
    54 			}
       
    55 		elsif ($option eq 'r')
       
    56 			{
       
    57 			$OptionOutputReverseMapping = 1;
       
    58 			}
       
    59 		elsif ($option eq 'm')
       
    60 			{
       
    61 			$OptionOutputReverseMapping = 1;
       
    62 			$OptionIgnoreOneToOneReverseMappings = 1;
       
    63 			}
       
    64 		else
       
    65 			{
       
    66 			$OptionOutputTrie = 1;
       
    67 			}
       
    68 		}
       
    69 	elsif ($optionString =~ m![/-]s!)
       
    70 		{
       
    71 		$OptionIncludeExtraMappings = 0;
       
    72 		}
       
    73 	else
       
    74 		{
       
    75 		print STDERR "Usage: perl CaseEquivalence [-o<mapping>] [-s]\nusing standard input and output streams.\n";
       
    76 		print STDERR "<mapping> is one of:\nt: output C++ code giving a trie for folding case. Each trie level is 4 bits.\n";
       
    77 		print STDERR "f: Give a list of all codes that need mapping and what they map to.\n";
       
    78 		print STDERR "r: Give a list of all codes are mapped to and what maps to them.\n";
       
    79 		print STDERR "m: Give a list of all codes are mapped to by more than one code.\n";
       
    80 		print STDERR "\nOmitting the -s option adds the following case-equivalence:\nSpace = Non-breaking space\n";
       
    81 		exit;
       
    82 		}
       
    83 	}
       
    84 
       
    85 # set a code as being part of a non-unitary case-equivalence class.
       
    86 sub add
       
    87 	{
       
    88 	my ($addition) = @_;
       
    89 	if (!$Codes{$addition})
       
    90 		{
       
    91 		$Codes{$addition} = 1;
       
    92 		}
       
    93 	}
       
    94 
       
    95 # make a code point to its final case varient
       
    96 sub chaseDown
       
    97 	{
       
    98 	my ($codeVal) = @_;
       
    99 	my $class = $codeVal;
       
   100 	while ($CaseClass{$class})
       
   101 		{
       
   102 		$class = $CaseClass{$class};
       
   103 		}
       
   104 	$CaseClass{$codeVal} = $class unless $codeVal == $class;
       
   105 	return $class;
       
   106 	}
       
   107 
       
   108 # link two codes together as being part of the same case-equivalence class
       
   109 sub makeEquivalent
       
   110 	{
       
   111 	my ($left, $right) = @_;
       
   112 	if (!$left || !$right)
       
   113 		{
       
   114 		return;
       
   115 		}
       
   116 	$left = chaseDown($left);
       
   117 	$right = chaseDown($right);
       
   118 	if ($Codes{$left} < $Codes{$right})
       
   119 		{
       
   120 		$CaseClass{$left} = $right;
       
   121 		return;
       
   122 		}
       
   123 	if ($Codes{$right} < $Codes{$left})
       
   124 		{
       
   125 		$CaseClass{$right} = $left;
       
   126 		return;
       
   127 		}
       
   128 	if ($left < $right)
       
   129 		{
       
   130 		$CaseClass{$right} = $left;
       
   131 		return;
       
   132 		}
       
   133 	if ($right < $left)
       
   134 		{
       
   135 		$CaseClass{$left} = $right;
       
   136 		return;
       
   137 		}
       
   138 	# $left == $right.. do nothing
       
   139 	return;
       
   140 	}
       
   141 
       
   142 # Link possibly unmentioned codes together. The first one is considered lower-case
       
   143 sub addEquivalenceClass
       
   144 	{
       
   145 	my ($lower, @rest) = @_;
       
   146 	$Codes{$lower} = 2;
       
   147 	foreach my $one (@rest)
       
   148 		{
       
   149 		$Codes{$one} = 1;
       
   150 		makeEquivalent($lower, $one);
       
   151 		}
       
   152 	}
       
   153 
       
   154 # Firstly we read in the data
       
   155 while(<STDIN>)
       
   156 	{
       
   157 	my @line = split('#', $_, 1);
       
   158 	my @fields = split(/;/, $line[0]);
       
   159 	my @decomposition = split(' ', $fields[5]);
       
   160 	if (1 < scalar(@fields))
       
   161 		{
       
   162 		my $codeVal = hex($fields[0]);
       
   163 		# if the character has a non-compatibility decomposition sequence, record this fact.
       
   164 		if (0 < scalar(@decomposition))
       
   165 			{
       
   166 			my $decompositionType = "";
       
   167 			if ($decomposition[0] =~ m/<[a-zA-Z0-9]+>/)
       
   168 				{
       
   169 				$decompositionType = shift @decomposition;
       
   170 				}
       
   171 			if ($decompositionType !~ m/compat/i)
       
   172 				{
       
   173 				$DecompositionValue[$codeVal] = scalar(@decomposition) == 1? hex($decomposition[0]) : -1;
       
   174 				}
       
   175 			}
       
   176 		$Name[$codeVal] = $fields[1];
       
   177 		my $upperval = $fields[12];
       
   178 		my $lowerval = $fields[13];
       
   179 		my $titleval = $fields[14];
       
   180 
       
   181 		# strip whitespace from the end of the string
       
   182 		$titleval =~ s/\s+$//;
       
   183 		if ($upperval)
       
   184 			{
       
   185 			$upperval = hex($upperval);
       
   186 			$Upper[$codeVal] = $upperval;
       
   187 			add $codeVal;
       
   188 			add $upperval;
       
   189 			}
       
   190 		if ($titleval)
       
   191 			{
       
   192 			$titleval = hex($titleval);
       
   193 			$Title[$codeVal] = $titleval;
       
   194 			add $codeVal;
       
   195 			add $titleval;
       
   196 			}
       
   197 		if ($lowerval)
       
   198 			{
       
   199 			$lowerval = hex($lowerval);
       
   200 			$Lower[$codeVal] = $lowerval;
       
   201 			add $codeVal;
       
   202 			$Codes{$lowerval} = 2;
       
   203 			}
       
   204 		}
       
   205 	}
       
   206 
       
   207 # Remove all codes that decompose to a sequence
       
   208 foreach my $codeVal (keys(%Codes))
       
   209 	{
       
   210 	my $current = $DecompositionValue[$codeVal];
       
   211 	while ($current && 0 < $current)
       
   212 		{
       
   213 		$current = $DecompositionValue[$current];
       
   214 		}
       
   215 	if ($current && $current == -1)
       
   216 		{
       
   217 		delete $Codes{$codeVal};
       
   218 		}
       
   219 	}
       
   220 
       
   221 # Next we form the equivalence classes.
       
   222 if ($OptionIncludeExtraMappings)
       
   223 	{
       
   224 	# space = non-breaking space
       
   225 	addEquivalenceClass(0x20, 0xA0);
       
   226 	}
       
   227 # We try to end up with everything being equivalent to a lower case letter
       
   228 foreach my $codeVal (keys(%Codes))
       
   229 	{
       
   230 	makeEquivalent($codeVal, $Lower[$codeVal]);
       
   231 	makeEquivalent($codeVal, $Upper[$codeVal]);
       
   232 	makeEquivalent($codeVal, $Title[$codeVal]);
       
   233 	}
       
   234 
       
   235 # Next we chase each pointer in CaseClass down to its final result
       
   236 foreach my $codeVal (keys(%CaseClass))
       
   237 	{
       
   238 	chaseDown($codeVal);
       
   239 	}
       
   240 
       
   241 # Now output the results in order, and collect the raw data
       
   242 my @Offset = ();
       
   243 my $oldCodeCount = 0;
       
   244 foreach my $codeVal (sort {$a <=> $b} keys(%CaseClass))
       
   245 	{
       
   246 	my $class = $CaseClass{$codeVal};
       
   247 	my $offset = $class - $codeVal;
       
   248 	if ($OptionOutputForwardMapping)
       
   249 		{
       
   250 		printf "%x %d\t\t%s => %s\n", $codeVal, $offset, $Name[$codeVal], $Name[$class];
       
   251 		}
       
   252 	while ($oldCodeCount != $codeVal)
       
   253 		{
       
   254 		$Offset[$oldCodeCount] = 0;
       
   255 		$oldCodeCount++;
       
   256 		}
       
   257 	$oldCodeCount++;
       
   258 	$Offset[$codeVal] = $offset;
       
   259 	}
       
   260 
       
   261 if ($OptionOutputReverseMapping)
       
   262 	{
       
   263 	my %ReverseMapping = ();
       
   264 	foreach my $codeVal (keys(%CaseClass))
       
   265 		{
       
   266 		my $mapsTo = $CaseClass{$codeVal};
       
   267 		if (!$ReverseMapping{$mapsTo})
       
   268 			{
       
   269 			$ReverseMapping{$mapsTo} = [$codeVal];
       
   270 			}
       
   271 		else
       
   272 			{
       
   273 			push (@{ $ReverseMapping{$mapsTo} }, $codeVal);
       
   274 			}
       
   275 		}
       
   276 	foreach my $mapVal (sort {$a <=> $b} keys(%ReverseMapping))
       
   277 		{
       
   278 		next if ($OptionIgnoreOneToOneReverseMappings && scalar(@{$ReverseMapping{$mapVal}}) == 1);
       
   279 		printf("%x: %s <=", $mapVal, $Name[$mapVal]);
       
   280 		my $firstTime = 1;
       
   281 		foreach my $val ( @{ $ReverseMapping{$mapVal} } )
       
   282 			{
       
   283 			if (!$firstTime)
       
   284 				{
       
   285 				print ',';
       
   286 				}
       
   287 			$firstTime = 0;
       
   288 			printf(" %s:%x", $Name[$val], $val);
       
   289 			}
       
   290 		print "\n";
       
   291 		}
       
   292 	}
       
   293 
       
   294 # does the array 2 match array 1? Match the shorter array against the prefix of
       
   295 # the other array
       
   296 sub arraysMatch
       
   297 	{
       
   298 	my ($left, $right, $leftpos) = @_;
       
   299 	my $last = scalar(@$left) - $leftpos;
       
   300 	if (scalar(@$right) < $last)
       
   301 		{
       
   302 		$last = scalar(@$right);
       
   303 		}
       
   304 	my $pos = 0;
       
   305 	while ($pos < $last)
       
   306 		{
       
   307 		if ($$left[$pos + $leftpos] != $$right[$pos])
       
   308 			{
       
   309 			return 0;
       
   310 			}
       
   311 		$pos++;
       
   312 		}
       
   313 	return 1;
       
   314 	}
       
   315 
       
   316 # find a match for array 2 in array 1, allowing values past the end of array 1
       
   317 # to match anything in array 1
       
   318 sub findMatch
       
   319 	{
       
   320 	my ($candidate, $term) = @_;
       
   321 	my $pos = 0;
       
   322 	while (!arraysMatch($candidate, $term, $pos))
       
   323 		{
       
   324 		$pos++;
       
   325 		}
       
   326 	return $pos;
       
   327 	}
       
   328 
       
   329 # add the data in array 2 to array 1, returning the position they went in.
       
   330 sub addArray
       
   331 	{
       
   332 	my ($candidate, $addition) = @_;
       
   333 	my $pos = findMatch($candidate, $addition);
       
   334 	# add any required on to the end of the candidate block
       
   335 	my $last = $pos + scalar(@$addition);
       
   336 	my $additionPos = scalar(@$candidate) - $pos;
       
   337 	while ($pos + $additionPos < $last)
       
   338 		{
       
   339 		$$candidate[$pos + $additionPos] = $$addition[$additionPos];
       
   340 		$additionPos++;
       
   341 		}
       
   342 	return $pos;
       
   343 	}
       
   344 
       
   345 # create data block 1 and indices 2 from data 3 and block size 4
       
   346 sub createTrieLevel
       
   347 	{
       
   348 	my ($data, $indices, $input, $blockSize) = @_;
       
   349 	my $block = 0;
       
   350 	while ($block * $blockSize < scalar(@$input))
       
   351 		{
       
   352 		my $start = $block * $blockSize;
       
   353 		my $end = $start + $blockSize;
       
   354 		my $currentBlockSize = $blockSize;
       
   355 		if (scalar(@$input) < $end)
       
   356 			{
       
   357 			$end = scalar(@$input);
       
   358 			$currentBlockSize = $end - $start;
       
   359 			}
       
   360 		my @currentBlock = @$input[$start..($end - 1)];
       
   361 		while ($currentBlockSize != $blockSize)
       
   362 			{
       
   363 			$currentBlock[$currentBlockSize] = 0;
       
   364 			$currentBlockSize++;
       
   365 			}
       
   366 		$$indices[$block] = addArray($data, \@currentBlock);
       
   367 		$block++;
       
   368 		}
       
   369 	}
       
   370 
       
   371 sub OutputArray
       
   372 	{
       
   373 	my $index = 0;
       
   374 	my $firstTime = 1;
       
   375 	while ($index != scalar(@_))
       
   376 		{
       
   377 		if (!$firstTime)
       
   378 			{
       
   379 			if ($index % 8)
       
   380 				{
       
   381 				print ', ';
       
   382 				}
       
   383 			else
       
   384 				{
       
   385 				print ",\n\t";
       
   386 				}
       
   387 			}
       
   388 		else
       
   389 			{
       
   390 			print "\t";
       
   391 			$firstTime = 0;
       
   392 			}
       
   393 		print($_[$index]);
       
   394 		$index++;
       
   395 		}
       
   396 	print "\n";
       
   397 	}
       
   398 
       
   399 if ($OptionOutputTrie)
       
   400 	{
       
   401 	my @Trie0 = ();
       
   402 	my @Index0 = ();
       
   403 	my @Trie1 = ();
       
   404 	my @Index1 = ();
       
   405 	my @Trie2 = ();
       
   406 	my @Index2 = ();
       
   407 	createTrieLevel(\@Trie0, \@Index0, \@Offset, 16);
       
   408 	createTrieLevel(\@Trie1, \@Index1, \@Index0, 16);
       
   409 	createTrieLevel(\@Trie2, \@Index2, \@Index1, 16);
       
   410 	print "// Use the bits from 12 up from your character to index CaseFoldTable0.\n";
       
   411 	print "// Use the result of this plus bits 8-11 to index CaseFoldTable1.\n";
       
   412 	print "// Use the result of this plus bits 4-7 to index CaseFoldTable2.\n";
       
   413 	print "// Use the result of this plus bits 0-3 to index CaseFoldTable3.\n";
       
   414 	print "// Add the result of this to your character to fold it.\n\n";
       
   415 	print "static const short CaseFoldTable3[] =\n\t{\n";
       
   416 	OutputArray(@Trie0);
       
   417 	print "\t};\n\nstatic const unsigned short CaseFoldTable2[] =\n\t{\n";
       
   418 	OutputArray(@Trie1);
       
   419 	print "\t};\n\nstatic const unsigned char CaseFoldTable1[] =\n\t{\n";
       
   420 	OutputArray(@Trie2);
       
   421 	print "\t};\n\nstatic const unsigned char CaseFoldTable0[] =\n\t{\n";
       
   422 	OutputArray(@Index2);
       
   423 	print "\t};\n";
       
   424 	}