|
1 # |
|
2 # Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
3 # All rights reserved. |
|
4 # This component and the accompanying materials are made available |
|
5 # under the terms of "Eclipse Public License v1.0" |
|
6 # which accompanies this distribution, and is available |
|
7 # at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 # |
|
9 # Initial Contributors: |
|
10 # Nokia Corporation - initial contribution. |
|
11 # |
|
12 # Contributors: |
|
13 # |
|
14 # Description: |
|
15 # |
|
16 |
|
use strict;
use warnings;
use integer;

# Conversion between UTF-8 byte strings and packed strings of 16-bit
# Unicode (UTF-16) code units.  Supplementary characters are carried as
# surrogate pairs on the Unicode side and as 4-byte sequences on the
# UTF-8 side.
package UTF;
require Exporter;
our @ISA    = qw(Exporter);
our @EXPORT = qw(Utf8ToUnicode UnicodeToUtf8);

# Value returned by both converters when the input is ill-formed.
my $KErrorIllFormedInput = -1;

# Returns the 6 payload bits of the UTF-8 continuation byte (10xxxxxx)
# found at $index of the array referenced by $bytes.  Returns undef when
# $index lies past the end of the array or the byte at that position is
# not a continuation byte.  Always call in scalar context.
sub _ContinuationBits {
    my ($bytes, $index) = @_;

    # Explicit bounds check: never read (and bit-mask) an undefined element.
    return if $index > $#{$bytes};

    my $byte = $bytes->[$index];
    return if ($byte & 0xc0) != 0x80;
    return $byte & 0x3f;
}

# Utf8ToUnicode(\$Unicode, $Utf8, $UnicodeTemplate)
#
# Decodes the UTF-8 byte string $Utf8 into 16-bit code units and stores
# them, packed with the template "$UnicodeTemplate*", in the scalar that
# $Unicode refers to.
#
#   $Unicode         - reference to the scalar receiving the packed units
#   $Utf8            - string of UTF-8 bytes
#   $UnicodeTemplate - pack() template letter for one code unit; it is
#                      used with a '*' repeat count appended
#
# Returns the number of code units produced, or $KErrorIllFormedInput
# (-1) if the input contains a truncated sequence, a missing or invalid
# continuation byte, an invalid lead byte, or a 4-byte sequence encoding
# a value below U+10000 or above U+10FFFF.  On error the target scalar
# is left untouched.
#
# NOTE(review): overlong 2- and 3-byte forms and 3-byte encodings of
# surrogate values are accepted, not rejected; this matches the previous
# behaviour of this routine and is kept for compatibility.
sub Utf8ToUnicode {
    my ($Unicode, $Utf8, $UnicodeTemplate) = @_;

    my @bytes = unpack "C*", $Utf8;
    my @units;
    my $i = 0;

    while ($i <= $#bytes) {
        my $lead = $bytes[$i];

        if (($lead & 0x80) == 0x00) {
            # 0xxxxxxx: single byte.
            push @units, $lead;
            $i += 1;
        }
        elsif (($lead & 0xe0) == 0xc0) {
            # 110xxxxx 10xxxxxx
            my $b1 = _ContinuationBits(\@bytes, $i + 1);
            return $KErrorIllFormedInput unless defined($b1);

            push @units, (($lead & 0x1f) << 6) | $b1;
            $i += 2;
        }
        elsif (($lead & 0xf0) == 0xe0) {
            # 1110xxxx 10xxxxxx 10xxxxxx
            my $b1 = _ContinuationBits(\@bytes, $i + 1);
            my $b2 = _ContinuationBits(\@bytes, $i + 2);
            return $KErrorIllFormedInput
                unless defined($b1) && defined($b2);

            push @units, (($lead & 0x0f) << 12) | ($b1 << 6) | $b2;
            $i += 3;
        }
        elsif (($lead & 0xf8) == 0xf0) {
            # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -> surrogate pair.
            my $b1 = _ContinuationBits(\@bytes, $i + 1);
            my $b2 = _ContinuationBits(\@bytes, $i + 2);
            my $b3 = _ContinuationBits(\@bytes, $i + 3);
            return $KErrorIllFormedInput
                unless defined($b1) && defined($b2) && defined($b3);

            # Bits 20..12 of the code point, positioned as bits 10..2 of
            # the (not yet offset-corrected) high-surrogate payload.
            my $high = (($lead & 0x07) << 8) | ($b1 << 2);

            # Below 0x40 the code point is < U+10000 (overlong form).
            return $KErrorIllFormedInput if $high < 0x0040;
            $high -= 0x0040;

            # 0x400 or more means the code point is > U+10FFFF.
            return $KErrorIllFormedInput if $high >= 0x0400;

            # The top two payload bits of the third byte complete the
            # high surrogate; its low four bits start the low surrogate.
            $high |= $b2 >> 4;
            my $low = (($b2 & 0x0f) << 6) | $b3;

            push @units, (0xd800 | $high), (0xdc00 | $low);
            $i += 4;
        }
        else {
            # Stray continuation byte or lead byte 0xf8..0xff.
            return $KErrorIllFormedInput;
        }
    }

    $$Unicode = pack "$UnicodeTemplate*", @units;
    return scalar @units;
}

# UnicodeToUtf8(\$Utf8, $Unicode, $UnicodeTemplate)
#
# Encodes the code units packed in $Unicode as UTF-8 and stores the
# resulting byte string in the scalar that $Utf8 refers to.
#
#   $Utf8            - reference to the scalar receiving the UTF-8 bytes
#   $Unicode         - string of packed code units
#   $UnicodeTemplate - unpack() template letter for one code unit; it is
#                      used with a '*' repeat count appended
#
# Returns the number of UTF-8 bytes produced, or $KErrorIllFormedInput
# (-1) if a high surrogate is not immediately followed by a low
# surrogate, or if a unit lies outside 0..0xFFFF.  On error the target
# scalar is left untouched.
#
# NOTE(review): a low surrogate that does not follow a high surrogate is
# encoded as an ordinary 3-byte sequence, not rejected; this matches the
# previous behaviour of this routine and is kept for compatibility.
sub UnicodeToUtf8 {
    my ($Utf8, $Unicode, $UnicodeTemplate) = @_;

    my @units = unpack "$UnicodeTemplate*", $Unicode;
    my @bytes;
    my $i = 0;

    while ($i <= $#units) {
        my $unit = $units[$i];

        # Only 16-bit code units can be encoded by the branches below;
        # anything else would yield bytes that wrap in pack "C*".
        return $KErrorIllFormedInput if $unit < 0 || $unit > 0xffff;

        if ($unit < 0x0080) {
            push @bytes, $unit;
            $i += 1;
        }
        elsif ($unit < 0x0800) {
            push @bytes,
                0xc0 | ($unit >> 6),
                0x80 | ($unit & 0x3f);
            $i += 1;
        }
        elsif (($unit & 0xfc00) == 0xd800) {
            # High surrogate: the next unit must exist and be a low one.
            return $KErrorIllFormedInput if $i + 1 > $#units;
            my $low = $units[$i + 1];
            return $KErrorIllFormedInput
                unless $low >= 0xdc00 && $low <= 0xdfff;

            # Adding 0x40 to the high surrogate is the same as adding
            # 0x10000 to the code point the pair represents.
            my $high = $unit + 0x0040;

            push @bytes,
                0xf0 | (($high >> 8) & 0x07),
                0x80 | (($high >> 2) & 0x3f),
                0x80 | (($high & 0x03) << 4) | (($low >> 6) & 0x0f),
                0x80 | ($low & 0x3f);
            $i += 2;
        }
        else {
            push @bytes,
                0xe0 | ($unit >> 12),
                0x80 | (($unit >> 6) & 0x3f),
                0x80 | ($unit & 0x3f);
            $i += 1;
        }
    }

    $$Utf8 = pack "C*", @bytes;
    return scalar @bytes;
}

# A module file must end with a true value for require/use to succeed.
1;