# Copyright (c) 2002-2009 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved.
# This component and the accompanying materials are made available
# under the terms of the License "Eclipse Public License v1.0"
# which accompanies this distribution, and is available
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
#
# Initial Contributors:
# Nokia Corporation - initial contribution.
#
# Contributors:
#
# Description:
# Adds folding information to Unicode data
# Added as the third field after the 'Symbian:' marker in the following format:
# Symbian:<field-1>;<field-2>;<folded-form>
# where <folded-form> is null or a sequence of hex unicode values
# separated by spaces representing the folded form of the character.
# Usage:
# perl -w UnicodeAddFolded.pl CaseFolding.txt < <UnicodeCompositionEx-output>
#
# The case folding table is read from the file named on the command line;
# the Unicode data to be annotated is read from standard input and the
# annotated data is written to standard output.
#

use strict;
use warnings;

if (scalar(@ARGV) != 1) {
    print STDERR "Usage:\nperl -w UnicodeAddFolded.pl CaseFolding.txt < <UnicodeCompositionEx-output>\n";
    exit 1;
}

# Three-argument open with a lexical handle: the file name comes straight
# from the command line and must never be interpreted as a mode or a pipe.
my $foldingFile = $ARGV[0];
open(my $foldingFh, '<', $foldingFile)
    or die("Could not open file $foldingFile: $!\n");

# %Fold maps a numeric code point to its folded form (a hex string).
# %MappingLine remembers which line of the folding file supplied each
# mapping, so that a duplicate can be reported with both line numbers.
my %Fold = ();
my %MappingLine = ();
my $lineNo = 0;
while (my $entry = <$foldingFh>) {
    $lineNo++;
    # Everything from the first '#' onwards is a comment.
    my ($line) = split(/#/, $entry, 2);
    if ($line =~ /^[ \t]*(1?[0-9a-fA-F]{4,5});[ \t]*([LEICSFT]);[ \t]*([0-9a-fA-F][0-9a-fA-F \t]*);[ \t]*$/) {
        my $code = hex($1);
        my $type = $2;
        my $folded = $3;
        # We'll deal with Turkic mappings with our own hack.
        # F = Full mappings (fold is longer than one character)
        # T = I = Turkic mapping
        # Only single-character, non-Turkic foldings are recorded.
        if ($type !~ /[FTI]/ && $folded !~ /[ \t]/) {
            die("$code has two mappings: lines $MappingLine{$code} and $lineNo.")
                if (exists $Fold{$code});
            $Fold{$code} = $folded;
            $MappingLine{$code} = $lineNo;
        }
    }
    elsif ($line !~ /^[ \t]*$/) {
        # Anything that is neither a mapping nor blank/comment-only is fatal.
        die("Did not understand line $lineNo of $foldingFile");
    }
}

close($foldingFh);

# Turkic hack:
# Map dotted capital I and dotless small I to lower case i.
# This makes all the 'i's fold the same, which isn't very nice for Turkic
# languages, but it at least gives us behaviour consistent across locales
# which does at least map dotted I, and i to the same value, as well
# as mapping I and dotless i to the same value, and mapping I and i
# to the same value.
$Fold{0x49} = '0069';
$Fold{0x130} = '0069';
$Fold{0x131} = '0069';

# Second pass: copy the Unicode data from standard input to standard
# output, appending the folded form to every data line.
$lineNo = 0;
while (my $line = <STDIN>) {
    chomp $line;
    $lineNo++;
    # Split into fields: make sure trailing null strings are not
    # deleted by adding a dummy final field
    my @attribute = split(/;/, $line.';dummy');
    # Delete the dummy field
    pop @attribute;
    die("Line $lineNo is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?")
        if (scalar(@attribute) == 15);
    if (scalar(@attribute) == 16) {
        die("Line $lineNo is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?")
            if ($attribute[15] !~ /^[ \t]*symbian:/i);
        my $code = $attribute[0];
        die("First attribute '$code' not a valid Unicode codepoint at line $lineNo")
            unless $code =~ /^1?[0-9a-fA-F]{4,5}$/;
        $code = hex($code);
        # Append the folded form (empty when the character has no recorded
        # folding) as a new final field.
        # NOTE(review): the header describes the folded form as the third
        # field after 'Symbian:', yet exactly 16 fields are accepted here and
        # the value is written at index 16 -- confirm against the output
        # format of the preceding script.
        $attribute[16] = exists $Fold{$code} ? $Fold{$code} : '';
        print join(';', @attribute);
    }
    elsif ($line !~ /^[ \t]*$/) {
        die 'Do not understand line '.$lineNo;
    }
    else {
        # Blank lines are passed through unchanged.
        print $line;
    }
    print "\n";
}