kernel/eka/euser/unicode/perl/UnicodeCompositionEx.pl
changeset 9 96e5fb8b040d
equal deleted inserted replaced
-1:000000000000 9:96e5fb8b040d
       
     1 #
       
     2 # Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     3 # All rights reserved.
       
     4 # This component and the accompanying materials are made available
       
     5 # under the terms of the License "Eclipse Public License v1.0"
       
     6 # which accompanies this distribution, and is available
       
     7 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 #
       
     9 # Initial Contributors:
       
    10 # Nokia Corporation - initial contribution.
       
    11 #
       
    12 # Contributors:
       
    13 #
       
    14 # Description:
       
    15 #
       
    16 # UnicodeCompositionEx
       
    17 # adds composition exclusion information to unicode data
       
    18 #
       
    19 # Added as a new field:
       
    20 # Symbian:<excluded-from-composition>
       
    21 # where <excluded-from-composition> is E or null.
       
    22 #
       
    23 # Usage:
       
     24 # perl -w UnicodeCompositionEx.pl CompositionExclusions.txt < <Unicode-data-file>
       
    25 
       
    26 use strict;
       
    27 
       
    28 if (scalar(@ARGV) != 1)
       
    29 	{
       
    30 	print (STDERR "Usage:\nperl -w UnicodeAddComposeEx.pl CompositionExclusions.txt < <Unicode-data-file>\n");
       
    31 	exit 1;
       
    32 	}
       
    33 
       
    34 open(EXCLUSIONS, $ARGV[0]) or die("Could not open file $ARGV[0]\n");
       
    35 
       
    36 my $lineNo = 0;
       
    37 my %Excluded = ();
       
    38 while (<EXCLUSIONS>)
       
    39 	{
       
    40 	$lineNo++;
       
    41 	# try to parse the line if there is some non-whitespace before the comment
       
    42 	if (!/^[ \t]*([#].*)?$/)
       
    43 		{
       
    44 		/^[ \t]*([0-9A-Fa-f]{4,6})[ \t]*([#].*)?$/ or die("Did not understand line $lineNo of $ARGV[0]");
       
    45 		my $code = hex($1);
       
    46 		die ("Value $code outside Unicode range at line $lineNo of $ARGV[0]")
       
    47 			unless ($code < 0x110000);
       
    48 		$Excluded{$code} = 1;
       
    49 		#printf("Excluding %X because it is in the exclusion list\n", $code);
       
    50 		}
       
    51 	}
       
    52 
       
    53 close EXCLUSIONS;
       
    54 # This is a two-pass operation, so we must store the lines ready for output later.
       
    55 my @DataFileLines = ();
       
    56 my %DataFileLineCodes = ();
       
    57 # The first pass will collect all the relevant data:
       
    58 # The first character of the decomposition if there is more than one
       
    59 my %FirstOfDecompositionString = ();
       
    60 # The singleton decomposition if it is a singleton
       
    61 my %SingletonDecomposition = ();
       
    62 # The decompositions tag, if any
       
    63 my %DecompTag = ();
       
    64 # The combining class
       
    65 my %CombiningClass = ();
       
    66 # We will also be marking all singleton decompositions for exclusion
       
    67 $lineNo = 0;
       
    68 while (my $line = <STDIN>)
       
    69 	{
       
    70 	chomp $line;
       
    71 	$DataFileLines[$lineNo] = $line;
       
    72 	$lineNo++;
       
    73 	# Split into fields: make sure trailing null strings are not
       
    74 	# deleted by adding a dummy final field
       
    75 	my @attribute = split(/;/, $line.';dummy');
       
    76 	# Delete the dummy field
       
    77 	pop @attribute;
       
    78 
       
    79 	if (scalar(@attribute) == 15)
       
    80 		{
       
    81 		my $code = $attribute[0];
       
    82 		die("First attribute '$code' not a valid Unicode codepoint at line $lineNo")
       
    83 			unless $code =~ /^1?[0-9a-fA-F]{4,5}$/;
       
    84 		$code = hex($code);
       
    85 		my $combiningClass = $attribute[3];
       
    86 		die("Fourth attribute '$combiningClass' is not a valid Unicode combining class at line $lineNo")
       
    87 			unless (0 <= $combiningClass && $combiningClass < 256);
       
    88 		my $decompositionString = $attribute[5];
       
    89 		die ("Sixth attribute '$decompositionString' is not a valid decomposition string at line $lineNo")
       
    90 			unless ($decompositionString =~ /^(<.*>)?[0-9a-fA-F \t]*$/);
       
    91 		my @decomposition = split(/[ \t]+/, $decompositionString);
       
    92 		if (@decomposition && $decomposition[0] =~ /^<.*>$/)
       
    93 			{
       
    94 			$DecompTag{$code} = shift @decomposition;
       
    95 			}
       
    96 		if (scalar(@decomposition) == 1)
       
    97 			{
       
    98 			# We want to exclude codes such as these, with a singleton
       
    99 			# decomposition mapping, but at the moment we don't know if the
       
   100 			# character mapped to has a decomposition mapping, so we will
       
   101 			# defer this to another stage.
       
   102 			die("Decomposition $decomposition[0] not understood at line $lineNo")
       
   103 				unless ($decomposition[0] =~ /^[0-9A-Fa-f]+$/);
       
   104 			$SingletonDecomposition{$code} = hex($decomposition[0]);
       
   105 			}
       
   106 		elsif (1 < scalar(@decomposition))
       
   107 			{
       
   108 			die("Decomposition $decomposition[0] not understood at line $lineNo")
       
   109 				unless ($decomposition[0] =~ /^[0-9A-Fa-f]+$/);
       
   110 			$FirstOfDecompositionString{$code} = hex($decomposition[0]);
       
   111 			}
       
   112 		$CombiningClass{$code} = $combiningClass;
       
   113 		$DataFileLineCodes{$lineNo-1} = $code;
       
   114 		}
       
   115 	elsif ($line !~ /^[ \t]*$/)
       
   116 		{
       
   117 		die 'Do not understand line '.$lineNo;
       
   118 		}
       
   119 	}
       
   120 
       
   121 # Each code that has a decomposition string longer than one character
       
   122 # where the first character has non-zero combining class is excluded
       
   123 foreach my $code (keys %FirstOfDecompositionString)
       
   124 	{
       
   125 	my $decomp = $FirstOfDecompositionString{$code};
       
   126 	if (exists($CombiningClass{$decomp}))
       
   127 		{
       
   128 		if ($CombiningClass{$decomp} != 0)
       
   129 			{
       
   130 			$Excluded{$code} = 1;
       
   131 			#printf("Excluding %X because its decomposition starts with a non-starter(%X)\n", $code, $decomp);
       
   132 			}
       
   133 		}
       
   134 	}
       
   135 
       
   136 # Each code that has a singleton decomposition string may be excluded if
       
   137 # that code has only a singleton mapping itself.
       
   138 foreach my $code (sort (keys %SingletonDecomposition))
       
   139 	{
       
   140 	my $mapsTo = $code;
       
   141 	while (exists $SingletonDecomposition{$mapsTo} && !exists $DecompTag{$code})
       
   142 		{
       
   143 		$mapsTo = $SingletonDecomposition{$mapsTo};
       
   144 		}
       
   145 	if (!exists $FirstOfDecompositionString{$mapsTo})
       
   146 		{
       
   147 		#printf("Excluding %X because its decomposition is a singleton(%X)\n", $code, $mapsTo);
       
   148 		$Excluded{$code} = 1;
       
   149 		}
       
   150 	}
       
   151 
       
   152 # Now we output the file with the extra filed appended to each line
       
   153 for(my $i = 0; $i != scalar(@DataFileLines); $i++)
       
   154 	{
       
   155 	print $DataFileLines[$i];
       
   156 	if (exists($DataFileLineCodes{$i}))
       
   157 		{
       
   158 		print ';Symbian:';
       
   159 		if (exists($Excluded{ $DataFileLineCodes{$i} }))
       
   160 			{
       
   161 			print 'E';
       
   162 			}
       
   163 		}
       
   164 	print "\n";
       
   165 	}