localisation/localesupport/OtherTools/CaseEquivalence.pl
changeset 31 56f325a607ea
parent 15 4122176ea935
child 32 c9417927a896
child 33 0173bcd7697c
--- a/localisation/localesupport/OtherTools/CaseEquivalence.pl	Mon Dec 21 16:14:42 2009 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,423 +0,0 @@
-#
-# Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
-# All rights reserved.
-# This component and the accompanying materials are made available
-# under the terms of "Eclipse Public License v1.0"
-# which accompanies this distribution, and is available
-# at the URL "http://www.eclipse.org/legal/epl-v10.html".
-#
-# Initial Contributors:
-# Nokia Corporation - initial contribution.
-#
-# Contributors:
-#
-# Description: 
-# Case Equivalence
-# Given the unicode data file, work out the case equivalence classes
-# i.e. the equivalence classes for the transitive closure of ~ defined as
-# follows:
-# a~b if Uppercase(a) == b || Lowercase(a) == b || Titlecase(a) == b
-# Usage: perl CaseEquivalence <UnicodeData.txt
-#
-
-use strict;
-my @Name = ();
-my @Upper = ();
-my @Lower = ();
-my @Title = ();
-# $DecompositionValue[$code] is undefined if $code has no decomposition
-# sequence, if it has a single value decomposition sequence, then this is it,
-# if it has a longer sequence, the value is -1
-my @DecompositionValue = ();
-# 1 for each code that has a differently-cased version,
-# 2 for each code that is a lower-case version of something else.
-my %Codes = ();
-my %CaseClass = ();
-
-# Command-line options
-my $OptionOutputTrie = 1;
-my $OptionOutputForwardMapping = 0;
-my $OptionOutputReverseMapping = 0;
-my $OptionIgnoreOneToOneReverseMappings = 0;
-my $OptionIncludeExtraMappings = 1;
-
-foreach my $optionString (@ARGV)
-	{
-	if ($optionString =~ m![/-]o[tfrm]!)
-		{
-		$OptionOutputTrie = 0;
-		my $option = substr($optionString, 2, 1);
-		if ($option eq 'f')
-			{
-			$OptionOutputForwardMapping = 1;
-			}
-		elsif ($option eq 'r')
-			{
-			$OptionOutputReverseMapping = 1;
-			}
-		elsif ($option eq 'm')
-			{
-			$OptionOutputReverseMapping = 1;
-			$OptionIgnoreOneToOneReverseMappings = 1;
-			}
-		else
-			{
-			$OptionOutputTrie = 1;
-			}
-		}
-	elsif ($optionString =~ m![/-]s!)
-		{
-		$OptionIncludeExtraMappings = 0;
-		}
-	else
-		{
-		print STDERR "Usage: perl CaseEquivalence [-o<mapping>] [-s]\nusing standard input and output streams.\n";
-		print STDERR "<mapping> is one of:\nt: output C++ code giving a trie for folding case. Each trie level is 4 bits.\n";
-		print STDERR "f: Give a list of all codes that need mapping and what they map to.\n";
-		print STDERR "r: Give a list of all codes are mapped to and what maps to them.\n";
-		print STDERR "m: Give a list of all codes are mapped to by more than one code.\n";
-		print STDERR "\nOmitting the -s option adds the following case-equivalence:\nSpace = Non-breaking space\n";
-		exit;
-		}
-	}
-
-# set a code as being part of a non-unitary case-equivalence class.
-sub add
-	{
-	my ($addition) = @_;
-	if (!$Codes{$addition})
-		{
-		$Codes{$addition} = 1;
-		}
-	}
-
-# make a code point to its final case varient
-sub chaseDown
-	{
-	my ($codeVal) = @_;
-	my $class = $codeVal;
-	while ($CaseClass{$class})
-		{
-		$class = $CaseClass{$class};
-		}
-	$CaseClass{$codeVal} = $class unless $codeVal == $class;
-	return $class;
-	}
-
-# link two codes together as being part of the same case-equivalence class
-sub makeEquivalent
-	{
-	my ($left, $right) = @_;
-	if (!$left || !$right)
-		{
-		return;
-		}
-	$left = chaseDown($left);
-	$right = chaseDown($right);
-	if ($Codes{$left} < $Codes{$right})
-		{
-		$CaseClass{$left} = $right;
-		return;
-		}
-	if ($Codes{$right} < $Codes{$left})
-		{
-		$CaseClass{$right} = $left;
-		return;
-		}
-	if ($left < $right)
-		{
-		$CaseClass{$right} = $left;
-		return;
-		}
-	if ($right < $left)
-		{
-		$CaseClass{$left} = $right;
-		return;
-		}
-	# $left == $right.. do nothing
-	return;
-	}
-
-# Link possibly unmentioned codes together. The first one is considered lower-case
-sub addEquivalenceClass
-	{
-	my ($lower, @rest) = @_;
-	$Codes{$lower} = 2;
-	foreach my $one (@rest)
-		{
-		$Codes{$one} = 1;
-		makeEquivalent($lower, $one);
-		}
-	}
-
-# Firstly we read in the data
-while(<STDIN>)
-	{
-	my @line = split('#', $_, 1);
-	my @fields = split(/;/, $line[0]);
-	my @decomposition = split(' ', $fields[5]);
-	if (1 < scalar(@fields))
-		{
-		my $codeVal = hex($fields[0]);
-		# if the character has a non-compatibility decomposition sequence, record this fact.
-		if (0 < scalar(@decomposition))
-			{
-			my $decompositionType = "";
-			if ($decomposition[0] =~ m/<[a-zA-Z0-9]+>/)
-				{
-				$decompositionType = shift @decomposition;
-				}
-			if ($decompositionType !~ m/compat/i)
-				{
-				$DecompositionValue[$codeVal] = scalar(@decomposition) == 1? hex($decomposition[0]) : -1;
-				}
-			}
-		$Name[$codeVal] = $fields[1];
-		my $upperval = $fields[12];
-		my $lowerval = $fields[13];
-		my $titleval = $fields[14];
-
-		# strip whitespace from the end of the string
-		$titleval =~ s/\s+$//;
-		if ($upperval)
-			{
-			$upperval = hex($upperval);
-			$Upper[$codeVal] = $upperval;
-			add $codeVal;
-			add $upperval;
-			}
-		if ($titleval)
-			{
-			$titleval = hex($titleval);
-			$Title[$codeVal] = $titleval;
-			add $codeVal;
-			add $titleval;
-			}
-		if ($lowerval)
-			{
-			$lowerval = hex($lowerval);
-			$Lower[$codeVal] = $lowerval;
-			add $codeVal;
-			$Codes{$lowerval} = 2;
-			}
-		}
-	}
-
-# Remove all codes that decompose to a sequence
-foreach my $codeVal (keys(%Codes))
-	{
-	my $current = $DecompositionValue[$codeVal];
-	while ($current && 0 < $current)
-		{
-		$current = $DecompositionValue[$current];
-		}
-	if ($current && $current == -1)
-		{
-		delete $Codes{$codeVal};
-		}
-	}
-
-# Next we form the equivalence classes.
-if ($OptionIncludeExtraMappings)
-	{
-	# space = non-breaking space
-	addEquivalenceClass(0x20, 0xA0);
-	}
-# We try to end up with everything being equivalent to a lower case letter
-foreach my $codeVal (keys(%Codes))
-	{
-	makeEquivalent($codeVal, $Lower[$codeVal]);
-	makeEquivalent($codeVal, $Upper[$codeVal]);
-	makeEquivalent($codeVal, $Title[$codeVal]);
-	}
-
-# Next we chase each pointer in CaseClass down to its final result
-foreach my $codeVal (keys(%CaseClass))
-	{
-	chaseDown($codeVal);
-	}
-
-# Now output the results in order, and collect the raw data
-my @Offset = ();
-my $oldCodeCount = 0;
-foreach my $codeVal (sort {$a <=> $b} keys(%CaseClass))
-	{
-	my $class = $CaseClass{$codeVal};
-	my $offset = $class - $codeVal;
-	if ($OptionOutputForwardMapping)
-		{
-		printf "%x %d\t\t%s => %s\n", $codeVal, $offset, $Name[$codeVal], $Name[$class];
-		}
-	while ($oldCodeCount != $codeVal)
-		{
-		$Offset[$oldCodeCount] = 0;
-		$oldCodeCount++;
-		}
-	$oldCodeCount++;
-	$Offset[$codeVal] = $offset;
-	}
-
-if ($OptionOutputReverseMapping)
-	{
-	my %ReverseMapping = ();
-	foreach my $codeVal (keys(%CaseClass))
-		{
-		my $mapsTo = $CaseClass{$codeVal};
-		if (!$ReverseMapping{$mapsTo})
-			{
-			$ReverseMapping{$mapsTo} = [$codeVal];
-			}
-		else
-			{
-			push (@{ $ReverseMapping{$mapsTo} }, $codeVal);
-			}
-		}
-	foreach my $mapVal (sort {$a <=> $b} keys(%ReverseMapping))
-		{
-		next if ($OptionIgnoreOneToOneReverseMappings && scalar(@{$ReverseMapping{$mapVal}}) == 1);
-		printf("%x: %s <=", $mapVal, $Name[$mapVal]);
-		my $firstTime = 1;
-		foreach my $val ( @{ $ReverseMapping{$mapVal} } )
-			{
-			if (!$firstTime)
-				{
-				print ',';
-				}
-			$firstTime = 0;
-			printf(" %s:%x", $Name[$val], $val);
-			}
-		print "\n";
-		}
-	}
-
-# does the array 2 match array 1? Match the shorter array against the prefix of
-# the other array
-sub arraysMatch
-	{
-	my ($left, $right, $leftpos) = @_;
-	my $last = scalar(@$left) - $leftpos;
-	if (scalar(@$right) < $last)
-		{
-		$last = scalar(@$right);
-		}
-	my $pos = 0;
-	while ($pos < $last)
-		{
-		if ($$left[$pos + $leftpos] != $$right[$pos])
-			{
-			return 0;
-			}
-		$pos++;
-		}
-	return 1;
-	}
-
-# find a match for array 2 in array 1, allowing values past the end of array 1
-# to match anything in array 1
-sub findMatch
-	{
-	my ($candidate, $term) = @_;
-	my $pos = 0;
-	while (!arraysMatch($candidate, $term, $pos))
-		{
-		$pos++;
-		}
-	return $pos;
-	}
-
-# add the data in array 2 to array 1, returning the position they went in.
-sub addArray
-	{
-	my ($candidate, $addition) = @_;
-	my $pos = findMatch($candidate, $addition);
-	# add any required on to the end of the candidate block
-	my $last = $pos + scalar(@$addition);
-	my $additionPos = scalar(@$candidate) - $pos;
-	while ($pos + $additionPos < $last)
-		{
-		$$candidate[$pos + $additionPos] = $$addition[$additionPos];
-		$additionPos++;
-		}
-	return $pos;
-	}
-
-# create data block 1 and indices 2 from data 3 and block size 4
-sub createTrieLevel
-	{
-	my ($data, $indices, $input, $blockSize) = @_;
-	my $block = 0;
-	while ($block * $blockSize < scalar(@$input))
-		{
-		my $start = $block * $blockSize;
-		my $end = $start + $blockSize;
-		my $currentBlockSize = $blockSize;
-		if (scalar(@$input) < $end)
-			{
-			$end = scalar(@$input);
-			$currentBlockSize = $end - $start;
-			}
-		my @currentBlock = @$input[$start..($end - 1)];
-		while ($currentBlockSize != $blockSize)
-			{
-			$currentBlock[$currentBlockSize] = 0;
-			$currentBlockSize++;
-			}
-		$$indices[$block] = addArray($data, \@currentBlock);
-		$block++;
-		}
-	}
-
-sub OutputArray
-	{
-	my $index = 0;
-	my $firstTime = 1;
-	while ($index != scalar(@_))
-		{
-		if (!$firstTime)
-			{
-			if ($index % 8)
-				{
-				print ', ';
-				}
-			else
-				{
-				print ",\n\t";
-				}
-			}
-		else
-			{
-			print "\t";
-			$firstTime = 0;
-			}
-		print($_[$index]);
-		$index++;
-		}
-	print "\n";
-	}
-
-if ($OptionOutputTrie)
-	{
-	my @Trie0 = ();
-	my @Index0 = ();
-	my @Trie1 = ();
-	my @Index1 = ();
-	my @Trie2 = ();
-	my @Index2 = ();
-	createTrieLevel(\@Trie0, \@Index0, \@Offset, 16);
-	createTrieLevel(\@Trie1, \@Index1, \@Index0, 16);
-	createTrieLevel(\@Trie2, \@Index2, \@Index1, 16);
-	print "// Use the bits from 12 up from your character to index CaseFoldTable0.\n";
-	print "// Use the result of this plus bits 8-11 to index CaseFoldTable1.\n";
-	print "// Use the result of this plus bits 4-7 to index CaseFoldTable2.\n";
-	print "// Use the result of this plus bits 0-3 to index CaseFoldTable3.\n";
-	print "// Add the result of this to your character to fold it.\n\n";
-	print "static const short CaseFoldTable3[] =\n\t{\n";
-	OutputArray(@Trie0);
-	print "\t};\n\nstatic const unsigned short CaseFoldTable2[] =\n\t{\n";
-	OutputArray(@Trie1);
-	print "\t};\n\nstatic const unsigned char CaseFoldTable1[] =\n\t{\n";
-	OutputArray(@Trie2);
-	print "\t};\n\nstatic const unsigned char CaseFoldTable0[] =\n\t{\n";
-	OutputArray(@Index2);
-	print "\t};\n";
-	}