diff -r 4122176ea935 -r 56f325a607ea localisation/localesupport/OtherTools/CaseEquivalence.pl --- a/localisation/localesupport/OtherTools/CaseEquivalence.pl Mon Dec 21 16:14:42 2009 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,423 +0,0 @@ -# -# Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies). -# All rights reserved. -# This component and the accompanying materials are made available -# under the terms of "Eclipse Public License v1.0" -# which accompanies this distribution, and is available -# at the URL "http://www.eclipse.org/legal/epl-v10.html". -# -# Initial Contributors: -# Nokia Corporation - initial contribution. -# -# Contributors: -# -# Description: -# Case Equivalence -# Given the unicode data file, work out the case equivalence classes -# i.e. the equivalence classes for the transitive closure of ~ defined as -# follows: -# a~b if Uppercase(a) == b || Lowercase(a) == b || Titlecase(a) == b -# Usage: perl CaseEquivalence ] [-s]\nusing standard input and output streams.\n"; - print STDERR " is one of:\nt: output C++ code giving a trie for folding case. Each trie level is 4 bits.\n"; - print STDERR "f: Give a list of all codes that need mapping and what they map to.\n"; - print STDERR "r: Give a list of all codes are mapped to and what maps to them.\n"; - print STDERR "m: Give a list of all codes are mapped to by more than one code.\n"; - print STDERR "\nOmitting the -s option adds the following case-equivalence:\nSpace = Non-breaking space\n"; - exit; - } - } - -# set a code as being part of a non-unitary case-equivalence class. 
# add($code)
# Record $code as a member of a non-unitary case-equivalence class. A code that
# is already recorded keeps its current weight (weight 2 is given to codes that
# appear as a lower-case mapping target, weight 1 to everything else).
sub add {
    my ($addition) = @_;
    $Codes{$addition} = 1 unless $Codes{$addition};
    return;
}

# chaseDown($code)
# Follow the %CaseClass links from $code to the representative of its class,
# re-point $code directly at that representative, and return the representative.
# A code with no %CaseClass entry is its own representative.
sub chaseDown {
    my ($codeVal) = @_;
    my $class = $codeVal;
    while ($CaseClass{$class}) {
        $class = $CaseClass{$class};
    }
    $CaseClass{$codeVal} = $class unless $codeVal == $class;
    return $class;
}

# makeEquivalent($left, $right)
# Merge the case-equivalence classes of two codes. Nothing happens when either
# argument is false (undefined or 0) or when both are already in one class.
# The surviving representative is the one with the larger %Codes weight; on a
# tie it is the numerically smaller code.
sub makeEquivalent {
    my ($left, $right) = @_;
    return if !$left || !$right;

    $left  = chaseDown($left);
    $right = chaseDown($right);
    return if $left == $right;

    # A code that was never recorded in %Codes counts as weight 0.
    my $leftWeight  = $Codes{$left}  || 0;
    my $rightWeight = $Codes{$right} || 0;

    my $leftWins = $leftWeight != $rightWeight
        ? $leftWeight > $rightWeight
        : $left < $right;

    if ($leftWins) {
        $CaseClass{$right} = $left;
    }
    else {
        $CaseClass{$left} = $right;
    }
    return;
}

# addEquivalenceClass($lower, @rest)
# Link codes that the input data may never have mentioned. The first code is
# treated as the lower-case form (weight 2); every other code gets weight 1 and
# is merged into the first code's class.
sub addEquivalenceClass {
    my ($lower, @rest) = @_;
    $Codes{$lower} = 2;
    foreach my $one (@rest) {
        $Codes{$one} = 1;
        makeEquivalent($lower, $one);
    }
    return;
}

# readUnicodeData($fh)
# Read semicolon-separated records from the file handle $fh and fill in
# @Name, @Upper, @Lower, @Title, @DecompositionValue and %Codes.
# Text from the first '#' to the end of a line is treated as a comment.
# Records with fewer than two fields are ignored.
sub readUnicodeData {
    my ($fh) = @_;
    while (my $record = <$fh>) {
        # LIMIT 2 splits off the comment; a LIMIT of 1 would return the whole
        # line unsplit and leave the comment in place.
        my ($data) = split(/#/, $record, 2);
        $data = '' unless defined $data;

        my @fields = split(/;/, $data);
        next unless 1 < scalar(@fields);

        my $codeVal = hex($fields[0]);

        # Field 5 holds the decomposition: an optional <tag> followed by one or
        # more hexadecimal code points.
        my @decomposition = split(' ', defined $fields[5] ? $fields[5] : '');
        if (0 < scalar(@decomposition)) {
            my $decompositionType = "";
            if ($decomposition[0] =~ m/<[a-zA-Z0-9]+>/) {
                $decompositionType = shift @decomposition;
            }
            # NOTE(review): only a tag containing "compat" is skipped here; any
            # other bracketed tag is recorded exactly like an untagged
            # decomposition - confirm that this is intended.
            if ($decompositionType !~ m/compat/i) {
                # A single code point is stored as itself; a sequence is
                # flagged with -1.
                $DecompositionValue[$codeVal] =
                    scalar(@decomposition) == 1 ? hex($decomposition[0]) : -1;
            }
        }

        $Name[$codeVal] = $fields[1];
        my $upperval = $fields[12];
        my $lowerval = $fields[13];
        my $titleval = $fields[14];

        # Strip trailing whitespace (the last field carries the line ending)
        # and turn missing fields into empty strings.
        foreach my $mapping ($upperval, $lowerval, $titleval) {
            $mapping = '' unless defined $mapping;
            $mapping =~ s/\s+$//;
        }

        if ($upperval) {
            $upperval = hex($upperval);
            $Upper[$codeVal] = $upperval;
            add($codeVal);
            add($upperval);
        }
        if ($titleval) {
            $titleval = hex($titleval);
            $Title[$codeVal] = $titleval;
            add($codeVal);
            add($titleval);
        }
        if ($lowerval) {
            $lowerval = hex($lowerval);
            $Lower[$codeVal] = $lowerval;
            add($codeVal);
            # Lower-case targets get the higher weight so that they become the
            # representative of their class.
            $Codes{$lowerval} = 2;
        }
    }
    return;
}

# Firstly we read in the data from standard input.
readUnicodeData(\*STDIN);

# Remove all codes that decompose to a sequence: follow the chain of
# single-code decompositions and drop the code if the chain ends in -1.
foreach my $codeVal (keys(%Codes)) {
    my $current = $DecompositionValue[$codeVal];
    while ($current && 0 < $current) {
        $current = $DecompositionValue[$current];
    }
    if ($current && $current == -1) {
        delete $Codes{$codeVal};
    }
}

# Next we form the equivalence classes.
if ($OptionIncludeExtraMappings) {
    # space = non-breaking space
    addEquivalenceClass(0x20, 0xA0);
}

# We try to end up with everything being equivalent to a lower case letter.
foreach my $codeVal (keys(%Codes)) {
    makeEquivalent($codeVal, $Lower[$codeVal]);
    makeEquivalent($codeVal, $Upper[$codeVal]);
    makeEquivalent($codeVal, $Title[$codeVal]);
}

# Next we chase each pointer in CaseClass down to its final result.
foreach my $codeVal (keys(%CaseClass)) {
    chaseDown($codeVal);
}

# Now output the results in order, and collect the raw data.
# @Offset ends up holding, for every code up to the highest mapped one, the
# signed distance from the code to its class representative (0 = unmapped).
my @Offset = ();
my $oldCodeCount = 0;
foreach my $codeVal (sort { $a <=> $b } keys(%CaseClass)) {
    my $class  = $CaseClass{$codeVal};
    my $offset = $class - $codeVal;
    if ($OptionOutputForwardMapping) {
        printf "%x %d\t\t%s => %s\n", $codeVal, $offset, $Name[$codeVal], $Name[$class];
    }
    # Zero-fill the gap between the previous mapped code and this one.
    while ($oldCodeCount != $codeVal) {
        $Offset[$oldCodeCount] = 0;
        $oldCodeCount++;
    }
    $oldCodeCount++;
    $Offset[$codeVal] = $offset;
}

if ($OptionOutputReverseMapping) {
    # Group every mapped code under the representative it maps to.
    my %ReverseMapping = ();
    foreach my $codeVal (keys(%CaseClass)) {
        push @{ $ReverseMapping{ $CaseClass{$codeVal} } }, $codeVal;
    }
    foreach my $mapVal (sort { $a <=> $b } keys(%ReverseMapping)) {
        my @sources = @{ $ReverseMapping{$mapVal} };
        next if ($OptionIgnoreOneToOneReverseMappings && scalar(@sources) == 1);
        printf("%x: %s <=", $mapVal, $Name[$mapVal]);
        my $firstTime = 1;
        foreach my $val (@sources) {
            print ',' unless $firstTime;
            $firstTime = 0;
            printf(" %s:%x", $Name[$val], $val);
        }
        print "\n";
    }
}

# arraysMatch($left, $right, $leftpos)
# Does @$right match @$left starting at index $leftpos? Only the overlapping
# part is compared, so whatever runs past the end of either array counts as a
# match. Returns 1 on a match and 0 otherwise.
sub arraysMatch {
    my ($left, $right, $leftpos) = @_;
    my $last = scalar(@$left) - $leftpos;
    if (scalar(@$right) < $last) {
        $last = scalar(@$right);
    }
    for (my $pos = 0; $pos < $last; $pos++) {
        return 0 if $$left[$pos + $leftpos] != $$right[$pos];
    }
    return 1;
}

# findMatch($candidate, $term)
# Return the smallest index in @$candidate at which @$term matches, allowing
# the part of @$term that runs past the end of @$candidate to match anything.
# The index equals scalar(@$candidate) when there is no overlap at all.
sub findMatch {
    my ($candidate, $term) = @_;
    my $pos = 0;
    while (!arraysMatch($candidate, $term, $pos)) {
        $pos++;
    }
    return $pos;
}

# add the data in array 2 to array 1, returning the position they went in.
# addArray($candidate, $addition)
# Place @$addition in @$candidate at the first position where it matches
# (see findMatch), appending whatever part of it runs past the current end of
# @$candidate. Returns the position at which @$addition now sits.
sub addArray {
    my ($candidate, $addition) = @_;
    my $pos = findMatch($candidate, $addition);

    # Elements before $overlap already exist in @$candidate; only the rest of
    # @$addition has to be appended.
    my $overlap = scalar(@$candidate) - $pos;
    push @$candidate, @$addition[$overlap .. $#$addition];
    return $pos;
}

# createTrieLevel($data, $indices, $input, $blockSize)
# Cut @$input into blocks of $blockSize entries (the last block is padded with
# zeros), store each block in @$data via addArray, and record in @$indices the
# position within @$data at which each block was stored.
sub createTrieLevel {
    my ($data, $indices, $input, $blockSize) = @_;
    my $total = scalar(@$input);
    for (my $block = 0; $block * $blockSize < $total; $block++) {
        my $start = $block * $blockSize;
        my $end   = $start + $blockSize;
        $end = $total if $total < $end;

        my @currentBlock = @$input[$start .. ($end - 1)];
        # Pad a short final block up to the full block size.
        push @currentBlock, (0) x ($blockSize - scalar(@currentBlock));

        $$indices[$block] = addArray($data, \@currentBlock);
    }
    return;
}

# OutputArray(@values)
# Print the values as the body of a C++ array initialiser: tab-indented,
# comma-separated, eight values per line, finished with a newline.
sub OutputArray {
    my @values = @_;
    foreach my $index (0 .. $#values) {
        if ($index == 0) {
            print "\t";
        }
        elsif ($index % 8) {
            print ', ';
        }
        else {
            print ",\n\t";
        }
        print($values[$index]);
    }
    print "\n";
    return;
}

if ($OptionOutputTrie) {
    # Three levels of 16-entry blocks are built on top of the offset table;
    # the index of the last level becomes the top-level table.
    my @Trie0  = ();
    my @Index0 = ();
    my @Trie1  = ();
    my @Index1 = ();
    my @Trie2  = ();
    my @Index2 = ();
    createTrieLevel(\@Trie0, \@Index0, \@Offset, 16);
    createTrieLevel(\@Trie1, \@Index1, \@Index0, 16);
    createTrieLevel(\@Trie2, \@Index2, \@Index1, 16);

    print "// Use the bits from 12 up from your character to index CaseFoldTable0.\n";
    print "// Use the result of this plus bits 8-11 to index CaseFoldTable1.\n";
    print "// Use the result of this plus bits 4-7 to index CaseFoldTable2.\n";
    print "// Use the result of this plus bits 0-3 to index CaseFoldTable3.\n";
    # This sentence has to stay on a single output line so that all of it is
    # inside the generated C++ comment.
    print "// Add the result of this to your character to fold it.\n\n";

    print "static const short CaseFoldTable3[] =\n\t{\n";
    OutputArray(@Trie0);
    print "\t};\n\nstatic const unsigned short CaseFoldTable2[] =\n\t{\n";
    OutputArray(@Trie1);
    print "\t};\n\nstatic const unsigned char CaseFoldTable1[] =\n\t{\n";
    OutputArray(@Trie2);
    print "\t};\n\nstatic const unsigned char CaseFoldTable0[] =\n\t{\n";
    OutputArray(@Index2);
    print "\t};\n";
}