|
1 # Copyright (c) 2002-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
2 # All rights reserved. |
|
3 # This component and the accompanying materials are made available |
|
4 # under the terms of the License "Eclipse Public License v1.0" |
|
5 # which accompanies this distribution, and is available |
|
6 # at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
7 # |
|
8 # Initial Contributors: |
|
9 # Nokia Corporation - initial contribution. |
|
10 # |
|
11 # Contributors: |
|
12 # |
|
13 # Description: |
|
14 # Adds folding information to Unicode data |
|
15 # Added as the third field after the 'Symbian:' marker in the following format: |
|
16 # Symbian:<grapheme-role>;<excluded-from-composition>;<folded-form> |
|
17 # where <folded-form> is null or a sequence of hex unicode values |
|
18 # separated by spaces representing the folded form of the character. |
|
19 # Usage: |
|
20 # perl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx> |
|
21 # |
|
22 # |
|
23 |
|
24 use strict; |
|
25 |
|
26 if (scalar(@ARGV) != 1) |
|
27 { |
|
28 print (STDERR "Usage:\nperl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>\n"); |
|
29 exit 1; |
|
30 } |
|
31 |
|
32 open(FOLDING, $ARGV[0]) or die("Could not open file $ARGV[0]\n"); |
|
33 |
|
34 my %Fold = (); |
|
35 my %MappingLine = (); |
|
36 my $lineNo = 0; |
|
37 while (<FOLDING>) |
|
38 { |
|
39 $lineNo++; |
|
40 my ($line, $comment) = split(/#/, $_, 2); |
|
41 if ($line =~ /^[ \t]*(1?[0-9a-fA-F]{4,5});[ \t]*([LEICSFT]);[ \t]*([0-9a-fA-F][0-9a-fA-F \t]*);[ \t]*$/) |
|
42 { |
|
43 my $code = hex($1); |
|
44 my $type = $2; |
|
45 my $folded = $3; |
|
46 # We'll deal with Turkic mappings with our own hack. |
|
47 # F = Full mappings (fold is longer than one character) |
|
48 # T = I = Turkic mapping |
|
49 if ($type !~ /[FTI]/ && $folded !~ /[ \t]/) |
|
50 { |
|
51 die ("$code has two mappings: lines $MappingLine{$code} and $lineNo.") |
|
52 if (exists $Fold{$code}); |
|
53 $Fold{$code} = $folded; |
|
54 $MappingLine{$code} = $lineNo; |
|
55 } |
|
56 } |
|
57 elsif ($line !~ /^[ \t]*$/) |
|
58 { |
|
59 die ("Did not understand line $lineNo of $ARGV[0]"); |
|
60 } |
|
61 } |
|
62 |
|
63 close FOLDING; |
|
64 |
|
65 # Turkic hack: |
|
66 # Map dotted capital I and dotless small I to lower case i. |
|
67 # This makes all the 'i's fold the same, which isn't very nice for Turkic |
|
68 # languages, but it at least gives us behaviour consistent across locales |
|
69 # which does at least map dotted I, and i to the same value, as well |
|
70 # as mapping I and dotless i to the same value, and mapping I and i |
|
71 # to the same value. |
|
72 $Fold{0x49} = '0069'; |
|
73 $Fold{0x130} = '0069'; |
|
74 $Fold{0x131} = '0069'; |
|
75 |
|
76 $lineNo = 0; |
|
77 while (my $line = <STDIN>) |
|
78 { |
|
79 chomp $line; |
|
80 $lineNo++; |
|
81 # Split into fields: make sure trailing null strings are not |
|
82 # deleted by adding a dummy final field |
|
83 my @attribute = split(/;/, $line.';dummy'); |
|
84 # Delete the dummy field |
|
85 pop @attribute; |
|
86 die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?") |
|
87 if (scalar(@attribute) == 15); |
|
88 if (scalar(@attribute) == 16) |
|
89 { |
|
90 die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?") |
|
91 if ($attribute[15] !~ /^[ \t]*symbian:/i); |
|
92 my $code = $attribute[0]; |
|
93 die("First attribute '$code' not a valid Unicode codepoint at line $lineNo") |
|
94 unless $code =~ /^1?[0-9a-fA-F]{4,5}$/; |
|
95 $code = hex($code); |
|
96 $attribute[16] = exists $Fold{$code}? $Fold{$code} : ''; |
|
97 print join(';', @attribute); |
|
98 } |
|
99 elsif ($line !~ /^[ \t]*$/) |
|
100 { |
|
101 die 'Do not understand line '.$lineNo; |
|
102 } |
|
103 else |
|
104 { |
|
105 print $line; |
|
106 } |
|
107 print "\n"; |
|
108 } |