localisation/localesupport/OtherTools/CaseEquivalence.pl
changeset 0 a41df078684a
child 15 4122176ea935
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/localisation/localesupport/OtherTools/CaseEquivalence.pl	Mon Oct 19 15:55:17 2009 +0100
@@ -0,0 +1,424 @@
+
+# Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
+# All rights reserved.
+# This component and the accompanying materials are made available
+# under the terms of the License "Eclipse Public License v1.0"
+# which accompanies this distribution, and is available
+# at the URL "http://www.eclipse.org/legal/epl-v10.html".
+#
+# Initial Contributors:
+# Nokia Corporation - initial contribution.
+#
+# Contributors:
+#
+# Description:
+# Case Equivalence
+# Given the unicode data file, work out the case equivalence classes
+# i.e. the equivalence classes for the transitive closure of ~ defined as
+# follows:
+# a~b if Uppercase(a) == b || Lowercase(a) == b || Titlecase(a) == b
+# Usage: perl CaseEquivalence <UnicodeData.txt
+# 
+#
+
+use strict;
+my @Name = ();
+my @Upper = ();
+my @Lower = ();
+my @Title = ();
+# $DecompositionValue[$code] is undefined if $code has no decomposition
+# sequence, if it has a single value decomposition sequence, then this is it,
+# if it has a longer sequence, the value is -1
+my @DecompositionValue = ();
+# 1 for each code that has a differently-cased version,
+# 2 for each code that is a lower-case version of something else.
+my %Codes = ();
+my %CaseClass = ();
+
+# Command-line options
+my $OptionOutputTrie = 1;
+my $OptionOutputForwardMapping = 0;
+my $OptionOutputReverseMapping = 0;
+my $OptionIgnoreOneToOneReverseMappings = 0;
+my $OptionIncludeExtraMappings = 1;
+
+foreach my $optionString (@ARGV)
+	{
+	if ($optionString =~ m![/-]o[tfrm]!)
+		{
+		$OptionOutputTrie = 0;
+		my $option = substr($optionString, 2, 1);
+		if ($option eq 'f')
+			{
+			$OptionOutputForwardMapping = 1;
+			}
+		elsif ($option eq 'r')
+			{
+			$OptionOutputReverseMapping = 1;
+			}
+		elsif ($option eq 'm')
+			{
+			$OptionOutputReverseMapping = 1;
+			$OptionIgnoreOneToOneReverseMappings = 1;
+			}
+		else
+			{
+			$OptionOutputTrie = 1;
+			}
+		}
+	elsif ($optionString =~ m![/-]s!)
+		{
+		$OptionIncludeExtraMappings = 0;
+		}
+	else
+		{
+		print STDERR "Usage: perl CaseEquivalence [-o<mapping>] [-s]\nusing standard input and output streams.\n";
+		print STDERR "<mapping> is one of:\nt: output C++ code giving a trie for folding case. Each trie level is 4 bits.\n";
+		print STDERR "f: Give a list of all codes that need mapping and what they map to.\n";
+		print STDERR "r: Give a list of all codes are mapped to and what maps to them.\n";
+		print STDERR "m: Give a list of all codes are mapped to by more than one code.\n";
+		print STDERR "\nOmitting the -s option adds the following case-equivalence:\nSpace = Non-breaking space\n";
+		exit;
+		}
+	}
+
+# set a code as being part of a non-unitary case-equivalence class.
+sub add
+	{
+	my ($addition) = @_;
+	if (!$Codes{$addition})
+		{
+		$Codes{$addition} = 1;
+		}
+	}
+
+# make a code point to its final case varient
+sub chaseDown
+	{
+	my ($codeVal) = @_;
+	my $class = $codeVal;
+	while ($CaseClass{$class})
+		{
+		$class = $CaseClass{$class};
+		}
+	$CaseClass{$codeVal} = $class unless $codeVal == $class;
+	return $class;
+	}
+
+# link two codes together as being part of the same case-equivalence class
+sub makeEquivalent
+	{
+	my ($left, $right) = @_;
+	if (!$left || !$right)
+		{
+		return;
+		}
+	$left = chaseDown($left);
+	$right = chaseDown($right);
+	if ($Codes{$left} < $Codes{$right})
+		{
+		$CaseClass{$left} = $right;
+		return;
+		}
+	if ($Codes{$right} < $Codes{$left})
+		{
+		$CaseClass{$right} = $left;
+		return;
+		}
+	if ($left < $right)
+		{
+		$CaseClass{$right} = $left;
+		return;
+		}
+	if ($right < $left)
+		{
+		$CaseClass{$left} = $right;
+		return;
+		}
+	# $left == $right.. do nothing
+	return;
+	}
+
+# Link possibly unmentioned codes together. The first one is considered lower-case
+sub addEquivalenceClass
+	{
+	my ($lower, @rest) = @_;
+	$Codes{$lower} = 2;
+	foreach my $one (@rest)
+		{
+		$Codes{$one} = 1;
+		makeEquivalent($lower, $one);
+		}
+	}
+
+# Firstly we read in the data
+while(<STDIN>)
+	{
+	my @line = split('#', $_, 1);
+	my @fields = split(/;/, $line[0]);
+	my @decomposition = split(' ', $fields[5]);
+	if (1 < scalar(@fields))
+		{
+		my $codeVal = hex($fields[0]);
+		# if the character has a non-compatibility decomposition sequence, record this fact.
+		if (0 < scalar(@decomposition))
+			{
+			my $decompositionType = "";
+			if ($decomposition[0] =~ m/<[a-zA-Z0-9]+>/)
+				{
+				$decompositionType = shift @decomposition;
+				}
+			if ($decompositionType !~ m/compat/i)
+				{
+				$DecompositionValue[$codeVal] = scalar(@decomposition) == 1? hex($decomposition[0]) : -1;
+				}
+			}
+		$Name[$codeVal] = $fields[1];
+		my $upperval = $fields[12];
+		my $lowerval = $fields[13];
+		my $titleval = $fields[14];
+
+		# strip whitespace from the end of the string
+		$titleval =~ s/\s+$//;
+		if ($upperval)
+			{
+			$upperval = hex($upperval);
+			$Upper[$codeVal] = $upperval;
+			add $codeVal;
+			add $upperval;
+			}
+		if ($titleval)
+			{
+			$titleval = hex($titleval);
+			$Title[$codeVal] = $titleval;
+			add $codeVal;
+			add $titleval;
+			}
+		if ($lowerval)
+			{
+			$lowerval = hex($lowerval);
+			$Lower[$codeVal] = $lowerval;
+			add $codeVal;
+			$Codes{$lowerval} = 2;
+			}
+		}
+	}
+
+# Remove all codes that decompose to a sequence
+foreach my $codeVal (keys(%Codes))
+	{
+	my $current = $DecompositionValue[$codeVal];
+	while ($current && 0 < $current)
+		{
+		$current = $DecompositionValue[$current];
+		}
+	if ($current && $current == -1)
+		{
+		delete $Codes{$codeVal};
+		}
+	}
+
+# Next we form the equivalence classes.
+if ($OptionIncludeExtraMappings)
+	{
+	# space = non-breaking space
+	addEquivalenceClass(0x20, 0xA0);
+	}
+# We try to end up with everything being equivalent to a lower case letter
+foreach my $codeVal (keys(%Codes))
+	{
+	makeEquivalent($codeVal, $Lower[$codeVal]);
+	makeEquivalent($codeVal, $Upper[$codeVal]);
+	makeEquivalent($codeVal, $Title[$codeVal]);
+	}
+
+# Next we chase each pointer in CaseClass down to its final result
+foreach my $codeVal (keys(%CaseClass))
+	{
+	chaseDown($codeVal);
+	}
+
+# Now output the results in order, and collect the raw data
+my @Offset = ();
+my $oldCodeCount = 0;
+foreach my $codeVal (sort {$a <=> $b} keys(%CaseClass))
+	{
+	my $class = $CaseClass{$codeVal};
+	my $offset = $class - $codeVal;
+	if ($OptionOutputForwardMapping)
+		{
+		printf "%x %d\t\t%s => %s\n", $codeVal, $offset, $Name[$codeVal], $Name[$class];
+		}
+	while ($oldCodeCount != $codeVal)
+		{
+		$Offset[$oldCodeCount] = 0;
+		$oldCodeCount++;
+		}
+	$oldCodeCount++;
+	$Offset[$codeVal] = $offset;
+	}
+
+if ($OptionOutputReverseMapping)
+	{
+	my %ReverseMapping = ();
+	foreach my $codeVal (keys(%CaseClass))
+		{
+		my $mapsTo = $CaseClass{$codeVal};
+		if (!$ReverseMapping{$mapsTo})
+			{
+			$ReverseMapping{$mapsTo} = [$codeVal];
+			}
+		else
+			{
+			push (@{ $ReverseMapping{$mapsTo} }, $codeVal);
+			}
+		}
+	foreach my $mapVal (sort {$a <=> $b} keys(%ReverseMapping))
+		{
+		next if ($OptionIgnoreOneToOneReverseMappings && scalar(@{$ReverseMapping{$mapVal}}) == 1);
+		printf("%x: %s <=", $mapVal, $Name[$mapVal]);
+		my $firstTime = 1;
+		foreach my $val ( @{ $ReverseMapping{$mapVal} } )
+			{
+			if (!$firstTime)
+				{
+				print ',';
+				}
+			$firstTime = 0;
+			printf(" %s:%x", $Name[$val], $val);
+			}
+		print "\n";
+		}
+	}
+
+# does the array 2 match array 1? Match the shorter array against the prefix of
+# the other array
+sub arraysMatch
+	{
+	my ($left, $right, $leftpos) = @_;
+	my $last = scalar(@$left) - $leftpos;
+	if (scalar(@$right) < $last)
+		{
+		$last = scalar(@$right);
+		}
+	my $pos = 0;
+	while ($pos < $last)
+		{
+		if ($$left[$pos + $leftpos] != $$right[$pos])
+			{
+			return 0;
+			}
+		$pos++;
+		}
+	return 1;
+	}
+
+# find a match for array 2 in array 1, allowing values past the end of array 1
+# to match anything in array 1
+sub findMatch
+	{
+	my ($candidate, $term) = @_;
+	my $pos = 0;
+	while (!arraysMatch($candidate, $term, $pos))
+		{
+		$pos++;
+		}
+	return $pos;
+	}
+
+# add the data in array 2 to array 1, returning the position they went in.
+sub addArray
+	{
+	my ($candidate, $addition) = @_;
+	my $pos = findMatch($candidate, $addition);
+	# add any required on to the end of the candidate block
+	my $last = $pos + scalar(@$addition);
+	my $additionPos = scalar(@$candidate) - $pos;
+	while ($pos + $additionPos < $last)
+		{
+		$$candidate[$pos + $additionPos] = $$addition[$additionPos];
+		$additionPos++;
+		}
+	return $pos;
+	}
+
+# create data block 1 and indices 2 from data 3 and block size 4
+sub createTrieLevel
+	{
+	my ($data, $indices, $input, $blockSize) = @_;
+	my $block = 0;
+	while ($block * $blockSize < scalar(@$input))
+		{
+		my $start = $block * $blockSize;
+		my $end = $start + $blockSize;
+		my $currentBlockSize = $blockSize;
+		if (scalar(@$input) < $end)
+			{
+			$end = scalar(@$input);
+			$currentBlockSize = $end - $start;
+			}
+		my @currentBlock = @$input[$start..($end - 1)];
+		while ($currentBlockSize != $blockSize)
+			{
+			$currentBlock[$currentBlockSize] = 0;
+			$currentBlockSize++;
+			}
+		$$indices[$block] = addArray($data, \@currentBlock);
+		$block++;
+		}
+	}
+
+sub OutputArray
+	{
+	my $index = 0;
+	my $firstTime = 1;
+	while ($index != scalar(@_))
+		{
+		if (!$firstTime)
+			{
+			if ($index % 8)
+				{
+				print ', ';
+				}
+			else
+				{
+				print ",\n\t";
+				}
+			}
+		else
+			{
+			print "\t";
+			$firstTime = 0;
+			}
+		print($_[$index]);
+		$index++;
+		}
+	print "\n";
+	}
+
+if ($OptionOutputTrie)
+	{
+	my @Trie0 = ();
+	my @Index0 = ();
+	my @Trie1 = ();
+	my @Index1 = ();
+	my @Trie2 = ();
+	my @Index2 = ();
+	createTrieLevel(\@Trie0, \@Index0, \@Offset, 16);
+	createTrieLevel(\@Trie1, \@Index1, \@Index0, 16);
+	createTrieLevel(\@Trie2, \@Index2, \@Index1, 16);
+	print "// Use the bits from 12 up from your character to index CaseFoldTable0.\n";
+	print "// Use the result of this plus bits 8-11 to index CaseFoldTable1.\n";
+	print "// Use the result of this plus bits 4-7 to index CaseFoldTable2.\n";
+	print "// Use the result of this plus bits 0-3 to index CaseFoldTable3.\n";
+	print "// Add the result of this to your character to fold it.\n\n";
+	print "static const short CaseFoldTable3[] =\n\t{\n";
+	OutputArray(@Trie0);
+	print "\t};\n\nstatic const unsigned short CaseFoldTable2[] =\n\t{\n";
+	OutputArray(@Trie1);
+	print "\t};\n\nstatic const unsigned char CaseFoldTable1[] =\n\t{\n";
+	OutputArray(@Trie2);
+	print "\t};\n\nstatic const unsigned char CaseFoldTable0[] =\n\t{\n";
+	OutputArray(@Index2);
+	print "\t};\n";
+	}