kernel/eka/euser/unicode/perl/UnicodeAddFolded.pl
changeset 0 a41df078684a
equal deleted inserted replaced
-1:000000000000 0:a41df078684a
       
     1 # Copyright (c) 2002-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     2 # All rights reserved.
       
     3 # This component and the accompanying materials are made available
       
     4 # under the terms of the License "Eclipse Public License v1.0"
       
     5 # which accompanies this distribution, and is available
       
     6 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     7 #
       
     8 # Initial Contributors:
       
     9 # Nokia Corporation - initial contribution.
       
    10 #
       
    11 # Contributors:
       
    12 #
       
    13 # Description:
       
    14 # Adds folding information to Unicode data
       
    15 # Added as the third field after the 'Symbian:' marker in the following format:
       
    16 # Symbian:<grapheme-role>;<excluded-from-composition>;<folded-form>
       
    17 # where <folded-form> is null or a sequence of hex unicode values
       
    18 # separated by spaces representing the folded form of the character.
       
    19 # Usage:
       
    20 # perl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>
       
    21 # 
       
    22 #
       
    23 
       
    24 use strict;
       
    25 
       
    26 if (scalar(@ARGV) != 1)
       
    27 	{
       
    28 	print (STDERR "Usage:\nperl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>\n");
       
    29 	exit 1;
       
    30 	}
       
    31 
       
    32 open(FOLDING, $ARGV[0]) or die("Could not open file $ARGV[0]\n");
       
    33 
       
    34 my %Fold = ();
       
    35 my %MappingLine = ();
       
    36 my $lineNo = 0;
       
    37 while (<FOLDING>)
       
    38 	{
       
    39 	$lineNo++;
       
    40 	my ($line, $comment) = split(/#/, $_, 2);
       
    41 	if ($line =~ /^[ \t]*(1?[0-9a-fA-F]{4,5});[ \t]*([LEICSFT]);[ \t]*([0-9a-fA-F][0-9a-fA-F \t]*);[ \t]*$/)
       
    42 		{
       
    43 		my $code = hex($1);
       
    44 		my $type = $2;
       
    45 		my $folded = $3;
       
    46 		# We'll deal with Turkic mappings with our own hack.
       
    47 		# F = Full mappings (fold is longer than one character)
       
    48 		# T = I = Turkic mapping
       
    49 		if ($type !~ /[FTI]/ && $folded !~ /[ \t]/)
       
    50 			{
       
    51 			die ("$code has two mappings: lines $MappingLine{$code} and $lineNo.")
       
    52 				if (exists $Fold{$code});
       
    53 			$Fold{$code} = $folded;
       
    54 			$MappingLine{$code} = $lineNo;
       
    55 			}
       
    56 		}
       
    57 	elsif ($line !~ /^[ \t]*$/)
       
    58 		{
       
    59 		die ("Did not understand line $lineNo of $ARGV[0]");
       
    60 		}
       
    61 	}
       
    62 
       
    63 close FOLDING;
       
    64 
       
    65 # Turkic hack:
       
    66 # Map dotted capital I and dotless small I to lower case i.
       
    67 # This makes all the 'i's fold the same, which isn't very nice for Turkic
       
    68 # languages, but it at least gives us behaviour consistent across locales
       
    69 # which does at least map dotted I, and i to the same value, as well
       
    70 # as mapping I and dotless i to the same value, and mapping I and i
       
    71 # to the same value.
       
    72 $Fold{0x49} = '0069';
       
    73 $Fold{0x130} = '0069';
       
    74 $Fold{0x131} = '0069';
       
    75 
       
    76 $lineNo = 0;
       
    77 while (my $line = <STDIN>)
       
    78 	{
       
    79 	chomp $line;
       
    80 	$lineNo++;
       
    81 	# Split into fields: make sure trailing null strings are not
       
    82 	# deleted by adding a dummy final field
       
    83 	my @attribute = split(/;/, $line.';dummy');
       
    84 	# Delete the dummy field
       
    85 	pop @attribute;
       
    86 	die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?")
       
    87 		if (scalar(@attribute) == 15);
       
    88 	if (scalar(@attribute) == 16)
       
    89 		{
       
    90 		die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?")
       
    91 			if ($attribute[15] !~ /^[ \t]*symbian:/i);
       
    92 		my $code = $attribute[0];
       
    93 		die("First attribute '$code' not a valid Unicode codepoint at line $lineNo")
       
    94 			unless $code =~ /^1?[0-9a-fA-F]{4,5}$/;
       
    95 		$code = hex($code);
       
    96 		$attribute[16] = exists $Fold{$code}? $Fold{$code} : '';
       
    97 		print join(';', @attribute);
       
    98 		}
       
    99 	elsif ($line !~ /^[ \t]*$/)
       
   100 		{
       
   101 		die 'Do not understand line '.$lineNo;
       
   102 		}
       
   103 	else
       
   104 		{
       
   105 		print $line;
       
   106 		}
       
   107 	print "\n";
       
   108 	}