#
# Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved.
# This component and the accompanying materials are made available
# under the terms of the License "Eclipse Public License v1.0"
# which accompanies this distribution, and is available
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
#
# Initial Contributors:
# Nokia Corporation - initial contribution.
#
# Contributors:
#
# Description:
#
# UnicodeMaxDecompose.pl
#
# Adds maximal decompositions of the character and maximal decompositions of
# its folded variant to the Unicode data.
#
# Added as the fourth and fifth fields after the 'Symbian:' marker in the
# following format:
#
# Symbian:<grapheme-role>;<excluded>;<folded>;<max-decomposition>;<folded-decomposition>
# where <max-decomposition> and <folded-decomposition> are each strings
# of hex numbers separated by spaces, representing the complete decomposition
# of the character and its folded equivalent respectively.
#
# Usage:
# perl -w UnicodeMaxDecompose.pl < <output-of-UnicodeAddFolded>

use strict;

# The script is a pure filter: data arrives on STDIN and results go to STDOUT.
# Any command-line argument is therefore a usage error.
if (@ARGV) {
    print STDERR "Usage:\nperl -w UnicodeMaxDecompose.pl < <output-of-UnicodeAddFolded>\n";
    exit 1;
}

# Code point => decomposition string exactly as stated on its input line.
# Entries are removed by Decompose() as they are expanded.
my %StatedDecomposition;

# Code point => space-separated, fully expanded decomposition.
# Filled in by Decompose().
my %CompleteDecomposition;

# Decompose($code)
#
# Expands the stated decomposition of $code into its complete decomposition
# and records it in %CompleteDecomposition{$code} as a space-separated string
# of hex values.
#
# The entry for $code is removed from %StatedDecomposition before any of its
# elements are expanded. A code point is therefore processed at most once,
# and the recursion terminates even if the input data is cyclic.
# Does nothing when $code has no entry in %StatedDecomposition, either because
# it never had one or because it has already been expanded.
sub Decompose {
    my ($code) = @_;

    return unless exists $StatedDecomposition{$code};

    # delete() hands back the value it removed.
    my $pending = delete $StatedDecomposition{$code};

    my @parts;
    for my $element (split ' ', $pending) {
        # Skip elements that are false in boolean context; split(' ') never
        # yields an empty string, so in practice this is only a bare '0'.
        next unless $element;

        # Make sure the element itself has been expanded first.
        Decompose($element);

        # Use the element's full expansion if it has one, else the element.
        push @parts,
            exists($CompleteDecomposition{$element})
            ? $CompleteDecomposition{$element}
            : $element;
    }

    $CompleteDecomposition{$code} = join ' ', @parts;
}

# ---------------------------------------------------------------------------
# Main program.
#
# 1. Read every line from STDIN, validate it and remember its stated
#    decomposition and folded value.
# 2. Expand every stated decomposition with Decompose().
# 3. Expand every folded value using the complete decompositions.
# 4. Print each input line again; lines that define a code point get
#    ';<max-decomposition>;<folded-decomposition>' appended.
#
# Dies with a message on malformed input.
# ---------------------------------------------------------------------------

# Code point => folded value (space-separated hex); rewritten in step 3 to
# hold the decomposition of the folded value.
my %Folded = ();
# Zero-based input line index => code point defined on that line.
my %LineToCode = ();
# Every input line in order, without its trailing newline.
my @RawLine = ();

my $lineNo = 0;
while (my $line = <STDIN>) {
    chomp $line;
    $lineNo++;

    # A negative limit makes split keep trailing empty fields, so the field
    # count is reliable even when the last fields are empty.
    my @attribute = split(/;/, $line, -1);

    die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeAddFolded been run?")
        if (scalar(@attribute) == 16);

    if (scalar(@attribute) == 17) {
        die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeAddFolded been run?")
            if ($attribute[15] !~ /^[ \t]*symbian:/i);

        my $code = $attribute[0];
        die("First attribute '$code' not a valid Unicode codepoint at line $lineNo")
            unless ($code =~ /^1?[0-9a-fA-F]{4,5}$/ && hex($code) < 0x110000);

        my $decomposition = $attribute[5];
        die("Decomposition '$decomposition' at line $lineNo is not a valid Unicode decomposition.")
            unless $decomposition =~ /^[ \t]*(<.*>[ \t]*[0-9a-fA-F])?[0-9a-fA-F \t]*$/;

        my $folded = $attribute[16];
        # The pattern must be anchored: unanchored, '[...]*' matches the empty
        # string inside any input and the check could never fail.
        die ("'$folded' not a valid string of hex values at line $lineNo.")
            unless $folded =~ /^[0-9a-fA-F \t]*$/;

        # Store all decompositions that have no tag and at least one value
        if ($decomposition =~ /^[ \t]*[0-9a-fA-F]/) {
            $StatedDecomposition{$code} = $decomposition;
        }
        if ($folded =~ /[0-9a-fA-F]/) {
            $Folded{$code} = $folded;
        }
        $LineToCode{$lineNo - 1} = $code;
    }
    elsif ($line !~ /^[ \t]*$/) {
        # Neither a data line nor a blank line.
        die 'Do not understand line '.$lineNo;
    }

    # Every line, blank ones included, is echoed back later.
    push @RawLine, $line;
}

# Completely decompose all strings in %StatedDecomposition.
# keys() is evaluated once up front, so it is safe for Decompose() to delete
# entries from the hash while this loop runs.
foreach my $code (keys %StatedDecomposition) {
    Decompose($code);
}

# Now decompose all the folded versions
foreach my $code (keys %Folded) {
    my @result = ();
    foreach my $hexelt (split(' ', $Folded{$code})) {
        if (exists $CompleteDecomposition{$hexelt}) {
            push @result, split(' ', $CompleteDecomposition{$hexelt});
        }
        else {
            push @result, $hexelt;
        }
    }
    $Folded{$code} = join(' ', @result);
}

# Now output all the results
for my $i (0 .. $#RawLine) {
    my $out = $RawLine[$i];
    if (exists $LineToCode{$i}) {
        my $code = $LineToCode{$i};

        my $decomp = exists($CompleteDecomposition{$code})
            ? $CompleteDecomposition{$code}
            : '';

        # If there is no folded value, but there is a decomposition
        # sequence, the character must fold to the decomposition
        # sequence too.
        my $foldedDecomp = exists($Folded{$code}) ? $Folded{$code} : $decomp;

        $out .= ';' . $decomp . ';' . $foldedDecomp;
    }
    print $out . "\n";
}