diff -r 000000000000 -r 1fb32624e06b localisation/localesupport/OtherTools/CaseEquivalence.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/localisation/localesupport/OtherTools/CaseEquivalence.pl Tue Feb 02 02:02:46 2010 +0200 @@ -0,0 +1,423 @@ +# +# Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies). +# All rights reserved. +# This component and the accompanying materials are made available +# under the terms of "Eclipse Public License v1.0" +# which accompanies this distribution, and is available +# at the URL "http://www.eclipse.org/legal/epl-v10.html". +# +# Initial Contributors: +# Nokia Corporation - initial contribution. +# +# Contributors: +# +# Description: +# Case Equivalence +# Given the unicode data file, work out the case equivalence classes +# i.e. the equivalence classes for the transitive closure of ~ defined as +# follows: +# a~b if Uppercase(a) == b || Lowercase(a) == b || Titlecase(a) == b +# Usage: perl CaseEquivalence ] [-s]\nusing standard input and output streams.\n"; + print STDERR " is one of:\nt: output C++ code giving a trie for folding case. Each trie level is 4 bits.\n"; + print STDERR "f: Give a list of all codes that need mapping and what they map to.\n"; + print STDERR "r: Give a list of all codes are mapped to and what maps to them.\n"; + print STDERR "m: Give a list of all codes are mapped to by more than one code.\n"; + print STDERR "\nOmitting the -s option adds the following case-equivalence:\nSpace = Non-breaking space\n"; + exit; + } + } + +# set a code as being part of a non-unitary case-equivalence class. 
sub add {
    # Registers $addition in %Codes with rank 1 unless it already has a
    # (true) rank.  Rank 2 is given elsewhere to lower-case targets, and
    # makeEquivalent prefers the higher rank as class representative, so an
    # existing rank must never be downgraded here.
    my ($addition) = @_;
    $Codes{$addition} = 1 unless $Codes{$addition};
    return;
}

# make a code point to its final case variant
#
# Follows the %CaseClass links from $codeVal until a code with no link is
# reached, stores that final code directly against $codeVal (so later
# look-ups are a single step) and returns it.
sub chaseDown {
    my ($codeVal) = @_;
    my $class = $codeVal;
    while ($CaseClass{$class}) {
        $class = $CaseClass{$class};
    }
    # never link a code to itself: the loop above would not terminate
    $CaseClass{$codeVal} = $class unless $codeVal == $class;
    return $class;
}

# link two codes together as being part of the same case-equivalence class
#
# Either argument may be false (no mapping), in which case nothing happens.
# The representative of the merged class is the code with the higher %Codes
# rank; when the ranks are equal the numerically smaller code wins.
sub makeEquivalent {
    my ($left, $right) = @_;
    return if !$left || !$right;

    $left  = chaseDown($left);
    $right = chaseDown($right);
    # already in the same class: do nothing
    return if $left == $right;

    # A code that was removed from %Codes has no rank; treat that as 0
    # (which is what the numeric comparison did implicitly).
    my $leftRank  = $Codes{$left}  || 0;
    my $rightRank = $Codes{$right} || 0;

    my $leftWins = $leftRank != $rightRank
        ? $rightRank < $leftRank
        : $left < $right;
    if ($leftWins) {
        $CaseClass{$right} = $left;
    }
    else {
        $CaseClass{$left} = $right;
    }
    return;
}

# Link possibly unmentioned codes together. The first one is considered lower-case
sub addEquivalenceClass {
    my ($lower, @rest) = @_;
    $Codes{$lower} = 2;
    foreach my $one (@rest) {
        $Codes{$one} = 1;
        makeEquivalent($lower, $one);
    }
    return;
}

# Firstly we read in the data
#
# The data is read from standard input.  <STDIN> is used rather than <>
# so that command-line options are never mistaken for input file names.
while (my $record = <STDIN>) {
    # Keep only the text before the first '#'.  The limit must be 2: a limit
    # of 1 makes split return the whole line untouched.
    my ($content) = split(/#/, $record, 2);
    next unless defined $content;

    my @fields = split(/;/, $content);
    # blank and comment-only lines carry no fields
    next unless 1 < scalar(@fields);

    my $codeVal = hex($fields[0]);
    my @decomposition = defined $fields[5] ? split(' ', $fields[5]) : ();

    # if the character has a non-compatibility decomposition sequence, record this fact.
    if (0 < scalar(@decomposition)) {
        my $decompositionType = "";
        if ($decomposition[0] =~ m/<[a-zA-Z0-9]+>/) {
            $decompositionType = shift @decomposition;
        }
        if ($decompositionType !~ m/compat/i) {
            # a single code is stored as itself; a sequence is flagged as -1
            $DecompositionValue[$codeVal] =
                scalar(@decomposition) == 1 ? hex($decomposition[0]) : -1;
        }
    }

    $Name[$codeVal] = $fields[1];
    my $upperval = $fields[12];
    my $lowerval = $fields[13];
    my $titleval = $fields[14];

    # strip whitespace from the end of the string
    $titleval =~ s/\s+$// if defined $titleval;

    if ($upperval) {
        $upperval = hex($upperval);
        $Upper[$codeVal] = $upperval;
        add($codeVal);
        add($upperval);
    }
    if ($titleval) {
        $titleval = hex($titleval);
        $Title[$codeVal] = $titleval;
        add($codeVal);
        add($titleval);
    }
    if ($lowerval) {
        $lowerval = hex($lowerval);
        $Lower[$codeVal] = $lowerval;
        add($codeVal);
        # lower-case targets get rank 2 so that they become representatives
        $Codes{$lowerval} = 2;
    }
}

# Remove all codes that decompose to a sequence
foreach my $codeVal (keys(%Codes)) {
    my $current = $DecompositionValue[$codeVal];
    # follow single-code decompositions until they end or hit a sequence (-1)
    while ($current && 0 < $current) {
        $current = $DecompositionValue[$current];
    }
    if ($current && $current == -1) {
        delete $Codes{$codeVal};
    }
}

# Next we form the equivalence classes.
if ($OptionIncludeExtraMappings) {
    # space = non-breaking space
    addEquivalenceClass(0x20, 0xA0);
}
# We try to end up with everything being equivalent to a lower case letter
foreach my $codeVal (keys(%Codes)) {
    makeEquivalent($codeVal, $Lower[$codeVal]);
    makeEquivalent($codeVal, $Upper[$codeVal]);
    makeEquivalent($codeVal, $Title[$codeVal]);
}

# Next we chase each pointer in CaseClass down to its final result
foreach my $codeVal (keys(%CaseClass)) {
    chaseDown($codeVal);
}

# Now output the results in order, and collect the raw data
#
# @Offset ends up indexed by code, holding (representative - code) for every
# code in %CaseClass and 0 for every code in the gaps below the highest one.
my @Offset = ();
my $oldCodeCount = 0;
foreach my $codeVal (sort {$a <=> $b} keys(%CaseClass)) {
    my $class = $CaseClass{$codeVal};
    my $offset = $class - $codeVal;
    if ($OptionOutputForwardMapping) {
        printf "%x %d\t\t%s => %s\n", $codeVal, $offset, $Name[$codeVal], $Name[$class];
    }
    # zero-fill the codes skipped since the previous entry
    while ($oldCodeCount != $codeVal) {
        $Offset[$oldCodeCount] = 0;
        $oldCodeCount++;
    }
    $oldCodeCount++;
    $Offset[$codeVal] = $offset;
}

if ($OptionOutputReverseMapping) {
    my %ReverseMapping = ();
    # Walk the codes in numeric order so that the members of each class are
    # listed in the same order on every run (hash order is not stable).
    foreach my $codeVal (sort {$a <=> $b} keys(%CaseClass)) {
        my $mapsTo = $CaseClass{$codeVal};
        push(@{ $ReverseMapping{$mapsTo} }, $codeVal);
    }
    foreach my $mapVal (sort {$a <=> $b} keys(%ReverseMapping)) {
        next if ($OptionIgnoreOneToOneReverseMappings
            && scalar(@{ $ReverseMapping{$mapVal} }) == 1);
        printf("%x: %s <=", $mapVal, $Name[$mapVal]);
        my $firstTime = 1;
        foreach my $val (@{ $ReverseMapping{$mapVal} }) {
            if (!$firstTime) {
                print ',';
            }
            $firstTime = 0;
            printf(" %s:%x", $Name[$val], $val);
        }
        print "\n";
    }
}

# does the array 2 match array 1? Match the shorter array against the prefix of
# the other array
#
# $left and $right are array references; $leftpos is the index in @$left at
# which the comparison starts.  Only the overlap of @$right with the part of
# @$left from $leftpos onwards is compared (numerically).  Returns 1 when
# the overlap agrees (an empty overlap always agrees), otherwise 0.
sub arraysMatch {
    my ($left, $right, $leftpos) = @_;
    my $last = scalar(@$left) - $leftpos;
    $last = scalar(@$right) if scalar(@$right) < $last;
    foreach my $pos (0 .. $last - 1) {
        return 0 if $$left[$pos + $leftpos] != $$right[$pos];
    }
    return 1;
}

# find a match for array 2 in array 1, allowing values past the end of array 1
# to match anything in array 1
#
# Returns the smallest position in @$candidate at which @$term matches.  The
# search always ends, because a position equal to the length of @$candidate
# leaves nothing to compare and therefore matches.
sub findMatch {
    my ($candidate, $term) = @_;
    my $pos = 0;
    while (!arraysMatch($candidate, $term, $pos)) {
        $pos++;
    }
    return $pos;
}

# add the data in array 2 to array 1, returning the position they went in.
sub addArray {
    # $candidate is the array (by reference) being built up and $addition
    # the array (by reference) to place in it.  The addition is overlaid on
    # the earliest position where it agrees with what is already there;
    # only the part that sticks out past the end of @$candidate is appended.
    my ($candidate, $addition) = @_;
    my $where = findMatch($candidate, $addition);
    # number of leading values of the addition that are already present
    my $alreadyPresent = scalar(@$candidate) - $where;
    push(@$candidate, @$addition[$alreadyPresent .. $#$addition]);
    return $where;
}

# create data block 1 and indices 2 from data 3 and block size 4
#
# @$input is cut into consecutive blocks of $blockSize values, the final
# block being padded with zeros.  Each block is merged into @$data with
# addArray, and the position it was given is stored in @$indices under the
# block's number.
sub createTrieLevel {
    my ($data, $indices, $input, $blockSize) = @_;
    for (my $blockNumber = 0;
         $blockNumber * $blockSize < scalar(@$input);
         $blockNumber++) {
        my $first = $blockNumber * $blockSize;
        my $limit = $first + $blockSize;
        $limit = scalar(@$input) if scalar(@$input) < $limit;
        my @chunk = @$input[$first .. $limit - 1];
        # pad a short final block up to the full block size
        push(@chunk, (0) x ($blockSize - scalar(@chunk)));
        $$indices[$blockNumber] = addArray($data, \@chunk);
    }
}

# Print the arguments as the body of a C array initialiser: a leading tab,
# eight values per line separated by ", ", and a newline after the last
# value.
sub OutputArray {
    my $position = 0;
    foreach my $value (@_) {
        my $lead;
        if ($position == 0) {
            $lead = "\t";
        }
        elsif ($position % 8 == 0) {
            $lead = ",\n\t";
        }
        else {
            $lead = ', ';
        }
        print $lead;
        print($value);
        $position++;
    }
    print "\n";
}

if ($OptionOutputTrie) {
    # Build three successive levels of 16 entries (4 bits) each: the first
    # compresses the per-code offsets, every later one compresses the index
    # produced by the level before it.
    my (@offsetBlocks, @offsetIndex);
    my (@middleBlocks, @middleIndex);
    my (@upperBlocks, @upperIndex);
    createTrieLevel(\@offsetBlocks, \@offsetIndex, \@Offset, 16);
    createTrieLevel(\@middleBlocks, \@middleIndex, \@offsetIndex, 16);
    createTrieLevel(\@upperBlocks, \@upperIndex, \@middleIndex, 16);

    # instructions for the reader of the generated C++ code
    foreach my $instruction (
        "// Use the bits from 12 up from your character to index CaseFoldTable0.\n",
        "// Use the result of this plus bits 8-11 to index CaseFoldTable1.\n",
        "// Use the result of this plus bits 4-7 to index CaseFoldTable2.\n",
        "// Use the result of this plus bits 0-3 to index CaseFoldTable3.\n",
        "// Add the result of this to your character to fold it.\n\n",
    ) {
        print $instruction;
    }

    # the tables themselves, innermost (offsets) first
    print "static const short CaseFoldTable3[] =\n\t{\n";
    OutputArray(@offsetBlocks);
    print "\t};\n\nstatic const unsigned short CaseFoldTable2[] =\n\t{\n";
    OutputArray(@middleBlocks);
    print "\t};\n\nstatic const unsigned char CaseFoldTable1[] =\n\t{\n";
    OutputArray(@upperBlocks);
    print "\t};\n\nstatic const unsigned char CaseFoldTable0[] =\n\t{\n";
    OutputArray(@upperIndex);
    print "\t};\n";
}