|
1 // Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
2 // All rights reserved. |
|
3 // This component and the accompanying materials are made available |
|
4 // under the terms of the License "Eclipse Public License v1.0" |
|
5 // which accompanies this distribution, and is available |
|
6 // at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
7 // |
|
8 // Initial Contributors: |
|
9 // Nokia Corporation - initial contribution. |
|
10 // |
|
11 // Contributors: |
|
12 // |
|
13 // Description: |
|
14 // |
|
// There are 2 reasons not to use the existing unicodeconv.cpp:
|
16 // 1) "unicode->foreign" in existing unicodeconv.cpp is quite slow, especially |
|
17 // for huge code pages (e.g, Asia code pages). See INC127598. |
|
18 // |
|
19 // 2) GB18030 has 32-bit code that existing unicodeconv.cpp cannot handle. |
|
20 // |
|
21 // The algorithm of this special version unicodeconv.cpp is straightforward: |
|
22 // 1) foreign->unicode: |
|
23 // 1.1) 1 byte/2 byte->unicode bmp: use existing mechanism; mapping table in |
|
24 // "cp54936_2byte_tounicode.cpp", which is generated with command |
|
25 // "perl -w ..\group\FatConversionTable.pl cp54936_2byte.txt". |
|
26 // |
|
27 // 1.2) 4 byte->unicode bmp: convert the 4-byte code to a 16-bit index, then |
|
28 // search into the mapping table in "cp54936_4byte_tounicode.cpp", |
|
29 // which is generated with command |
|
30 // "perl -w ..\group\cp54936_4byte_tounicode.pl cp54936_4byte.txt". |
|
31 // |
|
32 // 1.3) 4 byte->unicode non-bmp: calculate with formula in this file. |
|
33 // |
|
34 // 2) unicode->foreign: |
|
35 // 2.1) unicode bmp->1/2/4 byte: the huge table in "cp54936_allbmp_fromunicode.cpp" |
|
36 // can map directly, which is generated with command |
|
37 // "perl -w ..\group\cp54936_allbmp_fromunicode.pl cp54936_2byte.txt cp54936_4byte.txt". |
|
38 // |
|
39 // 2.2) unicode non-bmp->4 byte: calculate with formula in this file. |
|
40 // |
|
// The function cp54936_2byte_tounicode.cpp::TConvDataStruct::
// ConvertSingleUnicode() is no longer used. It is kept only so that the
// tool FatConversionTable.pl does not have to be changed.
|
44 // |
|
45 // About the mapping table "cp54936_2byte.txt" and "cp54936_4byte.txt": |
|
// 1) All Private Use Area (PUA) code points are reserved.
// 2) All GB18030 code points that map to undefined Unicode are reserved.
|
48 // |
|
49 // |
|
50 // About the formula for non-bmp calculation: |
|
51 // 1) All code points from 0x10000 to 0x10FFFF are supported. |
|
52 // 2) Code points in 0x10000-0x1FFFF and 0x30000-0x10FFFF are summarized from |
|
53 // the GB18030 standard, since the standard does not define the mapping for |
|
54 // code points out of 0x20000-0x2FFFF. |
|
55 |
|
56 |
|
57 #include <e32std.h> |
|
58 #include <e32def.h> |
|
59 #include <e32des8.h> |
|
60 #include "unicodeconv.h" |
|
61 #include "cp54936.h" |
|
62 |
|
63 |
|
/** Reason codes passed to Panic() by the cp54936 converter. */
enum TFccPanic
    {
    EBadForeignCode       = 0,  // a mapped foreign code has an unexpected byte layout
    E4ByteIndexOutOfRange = 1,  // a 4-byte code produced an index outside the mapping table
    EPanicBadIndices1     = 2,  // no use visible in this file
    EInavlidUnicodeValue  = 3   // a computed Unicode value is outside 0x10000-0x10FFFF
    };
|
71 void Panic(TFccPanic aPanic) |
|
72 { |
|
73 |
|
74 User::Panic(_L("FatCharsetConv"),aPanic); |
|
75 } |
|
76 |
|
77 |
|
78 //replacement character to be used when unicode cannot be converted |
|
79 const TUint8 KForeignReplacement = 0x5F; |
|
80 |
|
81 const TUint8 KU10000Byte1 = 0x90; |
|
82 const TUint8 KU10000Byte2 = 0x30; |
|
83 const TUint8 KU10000Byte3 = 0x81; |
|
84 const TUint8 KU10000Byte4 = 0x30; |
|
85 |
|
86 inline TBool IsSupplementary(TUint aChar) |
|
87 /** |
|
88 @param aChar The 32-bit code point value of a Unicode character. |
|
89 |
|
90 @return True, if aChar is supplementary character; false, otherwise. |
|
91 */ |
|
92 { |
|
93 return (aChar > 0xFFFF); |
|
94 } |
|
95 |
|
96 inline TBool IsSurrogate(TText16 aInt16) |
|
97 /** |
|
98 @return True, if aText16 is high surrogate or low surrogate; false, otherwise. |
|
99 */ |
|
100 { |
|
101 return (aInt16 & 0xF800) == 0xD800; |
|
102 } |
|
103 |
|
104 inline TBool IsHighSurrogate(TText16 aInt16) |
|
105 /** |
|
106 @return True, if aText16 is high surrogate; false, otherwise. |
|
107 */ |
|
108 { |
|
109 return (aInt16 & 0xFC00) == 0xD800; |
|
110 } |
|
111 |
|
112 inline TBool IsLowSurrogate(TText16 aInt16) |
|
113 /** |
|
114 @return True, if aText16 is low surrogate; false, otherwise. |
|
115 */ |
|
116 { |
|
117 return (aInt16 & 0xFC00) == 0xDC00; |
|
118 } |
|
119 |
|
120 inline TUint JoinSurrogate(TText16 aHighSurrogate, TText16 aLowSurrogate) |
|
121 /** |
|
122 Combine a high surrogate and a low surrogate into a supplementary character. |
|
123 |
|
124 @return The 32-bit code point value of the generated Unicode supplementary |
|
125 character. |
|
126 */ |
|
127 { |
|
128 return ((aHighSurrogate - 0xD7F7) << 10) + aLowSurrogate; |
|
129 } |
|
130 |
|
131 inline TText16 GetHighSurrogate(TUint aChar) |
|
132 /** |
|
133 Retrieve the high surrogate of a supplementary character. |
|
134 |
|
135 @param aChar The 32-bit code point value of a Unicode character. |
|
136 |
|
137 @return High surrogate of aChar, if aChar is a supplementary character; |
|
138 aChar itself, if aChar is not a supplementary character. |
|
139 */ |
|
140 { |
|
141 return STATIC_CAST(TText16, 0xD7C0 + (aChar >> 10)); |
|
142 } |
|
143 |
|
144 inline TText16 GetLowSurrogate(TUint aChar) |
|
145 /** |
|
146 Retrieve the low surrogate of a supplementary character. |
|
147 |
|
148 @param aChar The 32-bit code point value of a Unicode character. |
|
149 |
|
150 @return Low surrogate of aChar, if aChar is a supplementary character; |
|
151 zero, if aChar is not a supplementary character. |
|
152 */ |
|
153 { |
|
154 return STATIC_CAST(TText16, 0xDC00 | (aChar & 0x3FF)); |
|
155 } |
|
156 |
|
157 //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor |
|
158 EXPORT_C void UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode) |
|
159 { |
|
160 UnicodeConv::ConvertFromUnicodeL(aForeign, aUnicode, ETrue); |
|
161 } |
|
162 |
|
163 //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor |
|
164 EXPORT_C TInt UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode, TBool leaveWhenOverflow) |
|
165 { |
|
166 const TInt length = aUnicode.Length(); |
|
167 const TUint16* unicode = aUnicode.Ptr(); |
|
168 const TUint16* guard = unicode + length; |
|
169 |
|
170 TUint8* foreign = const_cast<TUint8*>(aForeign.Ptr()); |
|
171 TUint8* foreignguard = foreign + aForeign.MaxLength(); |
|
172 |
|
173 //loop going through the character of the unicode descriptor |
|
174 while (unicode < guard) |
|
175 { |
|
176 TUint32 unicodeChar = *unicode++; |
|
177 if (IsHighSurrogate(unicodeChar)) |
|
178 { |
|
179 if (unicode >= guard || !IsLowSurrogate(*unicode)) |
|
180 { |
|
181 if (foreign >= foreignguard) |
|
182 { |
|
183 aForeign.SetLength(foreign-aForeign.Ptr()); |
|
184 if (leaveWhenOverflow) |
|
185 User::Leave(KErrOverflow); |
|
186 else |
|
187 return KErrOverflow; |
|
188 } |
|
189 *foreign++ = KForeignReplacement; |
|
190 continue; |
|
191 } |
|
192 unicodeChar = JoinSurrogate(unicodeChar, *unicode++); |
|
193 } |
|
194 if (IsLowSurrogate(unicodeChar)) |
|
195 { |
|
196 if (foreign >= foreignguard) |
|
197 { |
|
198 aForeign.SetLength(foreign-aForeign.Ptr()); |
|
199 if (leaveWhenOverflow) |
|
200 User::Leave(KErrOverflow); |
|
201 else |
|
202 return KErrOverflow; |
|
203 } |
|
204 *foreign++ = KForeignReplacement; |
|
205 continue; |
|
206 } |
|
207 |
|
208 TUint8 b1, b2, b3, b4; // byte 1,2,3,4 of result GB18030 code. |
|
209 TInt count; // byte count of result GB18030 code; can be 1, 2 or 4. |
|
210 |
|
211 // unicode to cp54936 |
|
212 if (IsSupplementary(unicodeChar)) |
|
213 { |
|
214 unicodeChar -= 0x10000; |
|
215 b4 = unicodeChar % 10 + KU10000Byte4; |
|
216 unicodeChar /= 10; |
|
217 b3 = unicodeChar % 126 + KU10000Byte3; |
|
218 unicodeChar /= 126; |
|
219 b2 = unicodeChar % 10 + KU10000Byte2; |
|
220 b1 = unicodeChar / 10 + KU10000Byte1; |
|
221 count = 4; |
|
222 } |
|
223 else |
|
224 { |
|
225 TUint32 foreignChar; |
|
226 foreignChar = KMappingTableUnicodeBmp2CP54936[unicodeChar]; |
|
227 b1 = ((foreignChar >> 24) & 0xFF); |
|
228 b2 = ((foreignChar >> 16) & 0xFF); |
|
229 b3 = ((foreignChar >> 8) & 0xFF); |
|
230 b4 = (foreignChar & 0xFF); |
|
231 count = 1; |
|
232 if (b1) |
|
233 { |
|
234 count = 4; |
|
235 } |
|
236 else |
|
237 { |
|
238 __ASSERT_DEBUG(b2==0, Panic(EBadForeignCode)); |
|
239 if (b3) |
|
240 { |
|
241 count = 2; |
|
242 } |
|
243 } |
|
244 } |
|
245 |
|
246 if (foreign + count > foreignguard) |
|
247 { |
|
248 aForeign.SetLength(foreign-aForeign.Ptr()); |
|
249 if (leaveWhenOverflow) |
|
250 User::Leave(KErrOverflow); |
|
251 else |
|
252 return KErrOverflow; |
|
253 } |
|
254 if (count == 4) |
|
255 { |
|
256 *foreign++ = b1; |
|
257 *foreign++ = b2; |
|
258 } |
|
259 if (count >= 2) |
|
260 *foreign++ = b3; |
|
261 *foreign++ = b4; |
|
262 } |
|
263 aForeign.SetLength(foreign-aForeign.Ptr()); |
|
264 return KErrNone; |
|
265 } |
|
266 |
|
267 |
|
268 //This function converts from foreign characters into unicode and adds them into a descriptor |
|
269 EXPORT_C void UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign) |
|
270 { |
|
271 UnicodeConv::ConvertToUnicodeL(aUnicode, aForeign, ETrue); |
|
272 } |
|
273 |
|
274 //This function converts from foreign characters into unicode and adds them into a descriptor |
|
275 EXPORT_C TInt UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign, TBool leaveWhenOverflow) |
|
276 { |
|
277 const TInt foreignLength = aForeign.Length(); |
|
278 const TUint8* foreign = aForeign.Ptr(); |
|
279 const TUint8* guard = foreign + foreignLength; |
|
280 |
|
281 TUint16* unicode = const_cast<TUint16*>(aUnicode.Ptr()); |
|
282 TUint16* unicodeguard = unicode + aUnicode.MaxLength(); |
|
283 |
|
284 TUint8 b1, b2, b3, b4; |
|
285 enum TCodeType |
|
286 { |
|
287 E1Byte = 0, |
|
288 E2Byte, |
|
289 E4ByteBmp, |
|
290 E4ByteSupplementary, |
|
291 EError, |
|
292 }; |
|
293 TCodeType codetype; |
|
294 TUint32 unicodeChar; |
|
295 |
|
296 //loop going through the characters of the foreign descriptor |
|
297 while (foreign < guard) |
|
298 { |
|
299 // roughly, detect which area the foreign code belongs to |
|
300 b1 = *foreign++; |
|
301 if (b1 <= 0x7F) |
|
302 codetype = E1Byte; |
|
303 else if (b1 == 0x80 || b1 > 0xFE) |
|
304 codetype = EError; |
|
305 else if (foreign >= guard) |
|
306 codetype = EError; |
|
307 else |
|
308 { |
|
309 b2 = *foreign++; |
|
310 if (b2 >= 0x40 && b2 <= 0xFE && b2 != 0x7F) |
|
311 codetype = E2Byte; |
|
312 else if (b2 < 0x30 || b2 > 0x39) |
|
313 codetype = EError; |
|
314 else if (foreign+1 >= guard) |
|
315 codetype = EError; |
|
316 else |
|
317 { |
|
318 b3 = *foreign++; |
|
319 if (b3 < 0x81 || b3 > 0xFE) |
|
320 codetype = EError; |
|
321 else |
|
322 { |
|
323 b4 = *foreign++; |
|
324 if (b4 < 0x30 || b4 > 0x39) |
|
325 codetype = EError; |
|
326 else if (b1 >= 0x81 && b1 <= 0x84) // 0x81308130-0x8439FE39 |
|
327 codetype = E4ByteBmp; |
|
328 else if (b1 >= 0x90 && b1 <= 0xE3) // 0x90308130-0xE339FE39 |
|
329 codetype = E4ByteSupplementary; |
|
330 else |
|
331 codetype = EError; // others are reserved |
|
332 } |
|
333 } |
|
334 } |
|
335 |
|
336 // cp54936 to unicode |
|
337 if (codetype == E1Byte) |
|
338 { |
|
339 unicodeChar = b1; |
|
340 } |
|
341 else if (codetype == E2Byte) |
|
342 { |
|
343 // conventional algorithm used in FatCharsetConv |
|
344 const TLeadOrSingle* structPtr = TConvDataStruct::KFirstByteConversions + (b1-0x80); |
|
345 if (structPtr->iUnicodeIfSingle) |
|
346 unicodeChar = structPtr->iUnicodeIfSingle; |
|
347 else if (TConvDataStruct::KMinTrailByte <= b2 && b2 <= TConvDataStruct::KMaxTrailByte) |
|
348 unicodeChar = TConvDataStruct::KDoubleByteConversions[structPtr->iDoubleByteIndex + (b2 - TConvDataStruct::KMinTrailByte)]; |
|
349 else |
|
350 unicodeChar = 0xFFFD; |
|
351 } |
|
352 else if (codetype == E4ByteBmp) |
|
353 { |
|
354 TUint index = (b1-0x81)*12600 + (b2-0x30)*1260 + (b3-0x81)*10 + (b4-0x30); |
|
355 __ASSERT_DEBUG(index<39420, Panic(E4ByteIndexOutOfRange)); |
|
356 unicodeChar = KMappingTable4ByteBmp2Unicode[index]; |
|
357 } |
|
358 else if (codetype == E4ByteSupplementary) |
|
359 { |
|
360 unicodeChar = 0x10000 + (b1 - KU10000Byte1) * 12600 + |
|
361 (b2 - KU10000Byte2) * 1260 + |
|
362 (b3 - KU10000Byte3) * 10 + |
|
363 (b4 - KU10000Byte4); |
|
364 __ASSERT_DEBUG(unicodeChar >= 0x10000 && unicodeChar <= 0x10FFFF, Panic(EInavlidUnicodeValue)); |
|
365 } |
|
366 else |
|
367 { |
|
368 unicodeChar = 0xFFFD; |
|
369 } |
|
370 |
|
371 // append to output buffer |
|
372 if (IsSupplementary(unicodeChar)) |
|
373 { |
|
374 if (unicode + 1 >= unicodeguard) |
|
375 { |
|
376 aUnicode.SetLength(unicode-aUnicode.Ptr()); |
|
377 if (leaveWhenOverflow) |
|
378 User::Leave(KErrOverflow); |
|
379 else |
|
380 return KErrOverflow; |
|
381 } |
|
382 *unicode++ = GetHighSurrogate(unicodeChar); |
|
383 *unicode++ = GetLowSurrogate(unicodeChar); |
|
384 } |
|
385 else |
|
386 { |
|
387 if (unicode >= unicodeguard) |
|
388 { |
|
389 aUnicode.SetLength(unicode-aUnicode.Ptr()); |
|
390 if (leaveWhenOverflow) |
|
391 User::Leave(KErrOverflow); |
|
392 else |
|
393 return KErrOverflow; |
|
394 } |
|
395 *unicode++ = unicodeChar; |
|
396 } |
|
397 } |
|
398 aUnicode.SetLength(unicode-aUnicode.Ptr()); |
|
399 return KErrNone; |
|
400 } |
|
401 |
|
402 EXPORT_C TBool UnicodeConv::IsLegalShortNameCharacter (TUint aCharacter) |
|
403 { |
|
404 //1. aCharacter >= 0x0080 |
|
405 if (aCharacter>=0x0080) |
|
406 { |
|
407 // Since all Unicode characters can be mapped to GB18030, so no need to |
|
408 // test the converting. |
|
409 if (aCharacter <= 0x10FFFF && !IsSurrogate(aCharacter)) |
|
410 return ETrue; |
|
411 else |
|
412 return EFalse; |
|
413 } |
|
414 |
|
415 // For most common cases: |
|
416 // Note: lower case characters are considered legal DOS char here. |
|
417 if ((aCharacter>='a' && aCharacter<='z') || |
|
418 (aCharacter>='A' && aCharacter<='Z') || |
|
419 (aCharacter>='0' && aCharacter<='9')) |
|
420 { |
|
421 return ETrue; |
|
422 } |
|
423 // Checking for illegal chars: |
|
424 // 2. aCharacter <= 0x20 |
|
425 // Note: leading 0x05 byte should be guarded by callers of this function |
|
426 // as the information of the position of the character is required. |
|
427 if (aCharacter < 0x20) |
|
428 return EFalse; |
|
429 // Space (' ') is not considered as a legal DOS char here. |
|
430 if (aCharacter == 0x20) |
|
431 return EFalse; |
|
432 |
|
433 // 3. 0x20 < aCharacter < 0x80 |
|
434 // According to FAT Spec, "following characters are not legal in any bytes of DIR_Name": |
|
435 switch (aCharacter) |
|
436 { |
|
437 case 0x22: // '"' |
|
438 case 0x2A: // '*' |
|
439 case 0x2B: // '+' |
|
440 case 0x2C: // ',' |
|
441 //case 0x2E: // '.' // Although '.' is not allowed in any bytes of DIR_Name, it |
|
442 // is a valid character in short file names. |
|
443 case 0x2F: // '/' |
|
444 case 0x3A: // ':' |
|
445 case 0x3B: // ';' |
|
446 case 0x3C: // '<' |
|
447 case 0x3D: // '=' |
|
448 case 0x3E: // '>' |
|
449 case 0x3F: // '?' |
|
450 case 0x5B: // '[' |
|
451 case 0x5C: // '\' |
|
452 case 0x5D: // ']' |
|
453 case 0x7C: // '|' |
|
454 return EFalse; |
|
455 default: |
|
456 return ETrue; |
|
457 } |
|
458 } |
|
459 |