kernel/eka/euser/unicode/perl/UnicodeMaxDecompose.pl
changeset 9 96e5fb8b040d
equal deleted inserted replaced
-1:000000000000 9:96e5fb8b040d
       
     1 #
       
     2 # Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     3 # All rights reserved.
       
     4 # This component and the accompanying materials are made available
       
     5 # under the terms of the License "Eclipse Public License v1.0"
       
     6 # which accompanies this distribution, and is available
       
     7 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 #
       
     9 # Initial Contributors:
       
    10 # Nokia Corporation - initial contribution.
       
    11 #
       
    12 # Contributors:
       
    13 #
       
    14 # Description:
       
    15 #
       
    16 # UnicodeMaxDecompose.pl
       
    17 #
       
    18 # Adds maximal decompositions of the character and maximal decompositions of
       
    19 # its folded variant to the Unicode data.
       
    20 #
       
    21 # Added as the fourth and fifth fields after the 'Symbian:' marker in the following format:
       
    22 #
       
    23 # Symbian:<grapheme-role>;<excluded>;<folded>;<max-decomposition>;<folded-decomposition>
       
    24 # where each of <max-decomposition> and <folded-decomposition> are strings
       
    25 # of hex numbers separated by spaces, representing the complete decomposition
       
    26 # of the character and its folded equivalent respectively.
       
    27 #
       
    28 # Usage:
       
    29 # perl -w UnicodeMaxDecompose.pl < <output-of-UnicodeAddFolded>
       
    30 
       
    31 use strict;
       
    32 
       
    # This script is a pure filter: its input arrives on STDIN, so any
    # command-line argument means it was invoked incorrectly. Print the
    # usage text to STDERR and exit with a failure status.
    if (@ARGV) {
        print STDERR "Usage:\nperl -w UnicodeMaxDecompose.pl < <output-of-UnicodeAddFolded>\n";
        exit 1;
    }
       
    38 
       
    # Single-level decompositions exactly as stated in the input, keyed by
    # code point string. Decompose() removes each entry once it has been
    # expanded, so this hash doubles as the "still to do" set.
    my %StatedDecomposition;

    # Fully expanded decompositions, keyed by code point string. Each value
    # is a space-separated string of hex code points.
    my %CompleteDecomposition;

    # Decompose($code)
    #
    # Expands the stated decomposition of $code into its complete form and
    # stores the result in %CompleteDecomposition{$code}. Each element of the
    # stated decomposition is itself expanded first (recursively); elements
    # with no decomposition of their own are kept unchanged.
    #
    # Does nothing if $code has no entry in %StatedDecomposition, which is
    # the case both for code points that never had one and for code points
    # that have already been (or are currently being) expanded.
    sub Decompose {
        my ($code) = @_;
        exists $StatedDecomposition{$code} or return;

        # Remove the entry before recursing so that each code point is
        # expanded at most once and a cyclic reference cannot recurse forever.
        my $stated = delete $StatedDecomposition{$code};

        my @complete;
        for my $hexelt (grep { $_ } split(' ', $stated)) {
            Decompose($hexelt);
            push @complete,
                exists $CompleteDecomposition{$hexelt}
                    ? $CompleteDecomposition{$hexelt}
                    : $hexelt;
        }
        $CompleteDecomposition{$code} = join(' ', @complete);
    }
       
    66 
       
    # Folded form of each code point, keyed by code point string. Only
    # recorded when the folded field contains at least one hex digit.
    my %Folded = ();
    # Maps a zero-based input line index to the code point found on that line.
    my %LineToCode = ();
    # Every input line (chomped), in input order, so it can be re-emitted.
    my @RawLine = ();

    # Read the whole of STDIN, validating each data line and collecting the
    # stated decompositions and folded values. Blank lines are passed through;
    # anything else that does not have exactly 17 fields is a fatal error.
    my $lineNo = 0;
    while (my $line = <STDIN>) {
        chomp $line;
        $lineNo++;
        # Split into fields. A dummy final field is appended and then removed
        # so that split() does not silently drop trailing empty fields.
        my @attribute = split(/;/, $line.';dummy');
        pop @attribute;
        my $fieldCount = scalar(@attribute);

        die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeAddFolded been run?")
            if ($fieldCount == 16);
        if ($fieldCount == 17) {
            die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeAddFolded been run?")
                if ($attribute[15] !~ /^[ \t]*symbian:/i);
            my $code = $attribute[0];
            die("First attribute '$code' not a valid Unicode codepoint at line $lineNo")
                unless ($code =~ /^1?[0-9a-fA-F]{4,5}$/ && hex($code) < 0x110000);
            my $decomposition = $attribute[5];
            die("Decomposition '$decomposition' at line $lineNo is not a valid Unicode decomposition.")
                unless $decomposition =~ /^[ \t]*(<.*>[ \t]*[0-9a-fA-F])?[0-9a-fA-F \t]*$/;
            my $folded = $attribute[16];
            # The pattern must be anchored at both ends: an unanchored
            # character class with '*' matches every string, which would make
            # this validation a no-op.
            die ("'$folded' not a valid string of hex values at line $lineNo.")
                unless $folded =~ /^[0-9a-fA-F \t]*$/;
            # Store all decompositions that have no tag and at least one value
            if ($decomposition =~ /^[ \t]*[0-9a-fA-F]/) {
                $StatedDecomposition{$code} = $decomposition;
            }
            if ($folded =~ /[0-9a-fA-F]/) {
                $Folded{$code} = $folded;
            }
            $LineToCode{$lineNo-1} = $code;
        }
        elsif ($line !~ /^[ \t]*$/) {
            die 'Do not understand line '.$lineNo;
        }
        $RawLine[$lineNo-1] = $line;
    }
       
   113 
       
   # Expand every stated decomposition into its complete form. Decompose()
   # deletes entries from %StatedDecomposition as it works; that is safe
   # here because keys() builds the full list before the loop starts, and
   # Decompose() simply returns for a key that has already been handled.
   Decompose($_) for keys %StatedDecomposition;
       
   119 
       
   # Rewrite each folded string in place so that every element which has a
   # complete decomposition is replaced by that decomposition's code points;
   # elements without one are kept as they are.
   for my $code (keys %Folded) {
       my @expanded = map {
           my $element = $_;
           exists $CompleteDecomposition{$element}
               ? split(' ', $CompleteDecomposition{$element})
               : $element;
       } split(' ', $Folded{$code});
       $Folded{$code} = join(' ', @expanded);
   }
       
   137 
       
   # Re-emit every input line in its original order. Lines that carried a
   # code point get two extra ';'-separated fields appended: the complete
   # decomposition (empty if there is none) and the decomposed folded form.
   for my $index (0 .. $#RawLine) {
       my $output = $RawLine[$index];
       if (exists $LineToCode{$index}) {
           my $code = $LineToCode{$index};
           my $decomp = exists $CompleteDecomposition{$code}
               ? $CompleteDecomposition{$code}
               : '';
           # If there is no folded value, but there is a decomposition
           # sequence, the character must fold to the decomposition
           # sequence too.
           my $foldedDecomp = exists $Folded{$code} ? $Folded{$code} : $decomp;
           $output .= ';' . $decomp . ';' . $foldedDecomp;
       }
       print $output, "\n";
   }