#
# Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved.
# This component and the accompanying materials are made available
# under the terms of the License "Eclipse Public License v1.0"
# which accompanies this distribution, and is available
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
#
# Initial Contributors:
# Nokia Corporation - initial contribution.
#
# Contributors:
#
# Description:
#
# UnicodeCompositionEx
# adds composition exclusion information to unicode data
#
# Added as a new field:
# Symbian:<excluded-from-composition>
# where <excluded-from-composition> is E or null.
#
# Usage:
# perl -w UnicodeAddComposeEx.pl CompositionExclusions.txt < <Unicode-data-file>

use strict;
use warnings;

# Exactly one command-line argument is required: the path of the composition
# exclusions file. The Unicode data file itself is read from standard input.
if (scalar(@ARGV) != 1) {
    print(STDERR "Usage:\nperl -w UnicodeAddComposeEx.pl CompositionExclusions.txt < <Unicode-data-file>\n");
    exit 1;
}

# Three-argument open with a lexical handle: the file name is always treated
# as a plain path to read, never as a mode or pipe specification.
open(my $exclusionsFile, '<', $ARGV[0])
    or die("Could not open file $ARGV[0]: $!\n");

# %Excluded is keyed by numeric code point; a key is present for every
# character that is to be marked as excluded from composition.
my $lineNo = 0;
my %Excluded = ();
while (my $entry = <$exclusionsFile>) {
    $lineNo++;
    # Lines that are blank or hold only a comment carry no data.
    next if $entry =~ /^[ \t]*([#].*)?$/;

    # Anything else must be a single 4-6 digit hex code point, optionally
    # followed by a comment.
    $entry =~ /^[ \t]*([0-9A-Fa-f]{4,6})[ \t]*([#].*)?$/
        or die("Did not understand line $lineNo of $ARGV[0]");
    my $code = hex($1);
    die("Value $code outside Unicode range at line $lineNo of $ARGV[0]")
        unless ($code < 0x110000);
    $Excluded{$code} = 1;
    #printf("Excluding %X because it is in the exclusion list\n", $code);
}

close($exclusionsFile);

# This is a two-pass operation, so we must store the lines ready for output later.
# @DataFileLines holds every input line (chomped) in input order.
my @DataFileLines = ();
# Maps an index into @DataFileLines to the numeric code point described by
# that line; only lines that parsed as 15-field data lines get an entry.
my %DataFileLineCodes = ();
# The first pass will collect all the relevant data, keyed by code point:
# The first character of the decomposition if there is more than one
my %FirstOfDecompositionString = ();
# The singleton decomposition if it is a singleton
my %SingletonDecomposition = ();
# The decomposition's tag (such as <compat>), if any
my %DecompTag = ();
# The combining class
my %CombiningClass = ();

$lineNo = 0;
while (my $line = <STDIN>) {
    chomp $line;
    push @DataFileLines, $line;
    $lineNo++;

    # Split into fields; the negative limit keeps trailing empty fields.
    my @attribute = split(/;/, $line, -1);

    if (scalar(@attribute) == 15) {
        my $code = $attribute[0];
        die("First attribute '$code' not a valid Unicode codepoint at line $lineNo")
            unless $code =~ /^1?[0-9a-fA-F]{4,5}$/;
        $code = hex($code);
        # The pattern above still admits 110000..1FFFFF, so apply the same
        # range limit that is used for the exclusions file.
        die("First attribute '$attribute[0]' not a valid Unicode codepoint at line $lineNo")
            unless ($code < 0x110000);

        # The combining class must be a decimal number below 256. Checking
        # the digits explicitly stops empty or non-numeric text from being
        # silently treated as class 0.
        my $combiningClass = $attribute[3];
        die("Fourth attribute '$combiningClass' is not a valid Unicode combining class at line $lineNo")
            unless ($combiningClass =~ /^[0-9]+$/ && $combiningClass < 256);

        my $decompositionString = $attribute[5];
        die ("Sixth attribute '$decompositionString' is not a valid decomposition string at line $lineNo")
            unless ($decompositionString =~ /^(<.*>)?[0-9a-fA-F \t]*$/);
        my @decomposition = split(/[ \t]+/, $decompositionString);

        # A leading <tag> is recorded separately and removed from the list,
        # leaving only the code points of the mapping.
        if (@decomposition && $decomposition[0] =~ /^<.*>$/) {
            $DecompTag{$code} = shift @decomposition;
        }

        if (scalar(@decomposition) == 1) {
            # We want to exclude codes such as these, with a singleton
            # decomposition mapping, but at the moment we don't know if the
            # character mapped to has a decomposition mapping, so we will
            # defer this to another stage.
            die("Decomposition $decomposition[0] not understood at line $lineNo")
                unless ($decomposition[0] =~ /^[0-9A-Fa-f]+$/);
            $SingletonDecomposition{$code} = hex($decomposition[0]);
        }
        elsif (1 < scalar(@decomposition)) {
            die("Decomposition $decomposition[0] not understood at line $lineNo")
                unless ($decomposition[0] =~ /^[0-9A-Fa-f]+$/);
            $FirstOfDecompositionString{$code} = hex($decomposition[0]);
        }

        $CombiningClass{$code} = $combiningClass;
        # Remember which stored line this code point came from.
        $DataFileLineCodes{$#DataFileLines} = $code;
    }
    elsif ($line !~ /^[ \t]*$/) {
        # Only blank lines may deviate from the 15-field layout.
        die 'Do not understand line '.$lineNo;
    }
}

# A code whose decomposition is longer than one character is excluded when
# the first character of that decomposition has a known, non-zero combining
# class. First characters with no recorded combining class are left alone.
for my $composite (keys %FirstOfDecompositionString) {
    my $leading = $FirstOfDecompositionString{$composite};
    next unless exists($CombiningClass{$leading});
    next if $CombiningClass{$leading} == 0;
    $Excluded{$composite} = 1;
    #printf("Excluding %X because its decomposition starts with a non-starter(%X)\n", $composite, $leading);
}

# Decide, for every code that has a singleton (one-character) decomposition,
# whether it is excluded from composition:
#  - If the code's own decomposition carries a tag, the chain of mappings is
#    not followed and the final test below is applied to the code itself.
#  - Otherwise the chain of singleton mappings is followed to its end.
# In both cases the code is excluded unless the character finally reached
# has a decomposition of more than one character.
# NOTE(review): the tag test looks only at the starting code, not at the
# characters reached along the chain -- confirm this is what is intended.
foreach my $code (keys %SingletonDecomposition) {
    my $mapsTo = $code;
    unless (exists $DecompTag{$code}) {
        # Track visited characters so that malformed data containing a
        # cycle of singleton mappings is reported instead of looping forever.
        my %visited = ($mapsTo => 1);
        while (exists $SingletonDecomposition{$mapsTo}) {
            $mapsTo = $SingletonDecomposition{$mapsTo};
            die(sprintf("Singleton decomposition of %X loops back to %X\n", $code, $mapsTo))
                if $visited{$mapsTo}++;
        }
    }
    if (!exists $FirstOfDecompositionString{$mapsTo}) {
        #printf("Excluding %X because its decomposition is a singleton(%X)\n", $code, $mapsTo);
        $Excluded{$code} = 1;
    }
}

# Second pass: write every stored line back out in its original order.
# Lines that were recognised as data lines get the extra field
# ";Symbian:" appended, followed by "E" when the code is excluded from
# composition; all other lines are reproduced unchanged.
for my $index (0 .. $#DataFileLines) {
    my $suffix = '';
    if (exists($DataFileLineCodes{$index})) {
        $suffix = ';Symbian:';
        $suffix .= 'E' if exists($Excluded{ $DataFileLineCodes{$index} });
    }
    print $DataFileLines[$index], $suffix, "\n";
}
165 } |