1 /* |
|
2 * Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
// There are 2 reasons why the existing unicodeconv.cpp is not used:
// 1) "unicode->foreign" in the existing unicodeconv.cpp is quite slow, especially
//    for huge code pages (e.g., Asian code pages). See INC127598.
//
// 2) GB18030 has 32-bit codes that the existing unicodeconv.cpp cannot handle.
|
22 // |
|
23 // The algorithm of this special version unicodeconv.cpp is straightforward: |
|
24 // 1) foreign->unicode: |
|
25 // 1.1) 1 byte/2 byte->unicode bmp: use existing mechanism; mapping table in |
|
26 // "cp54936_2byte_tounicode.cpp", which is generated with command |
|
27 // "perl -w ..\group\FatConversionTable.pl cp54936_2byte.txt". |
|
28 // |
|
29 // 1.2) 4 byte->unicode bmp: convert the 4-byte code to a 16-bit index, then |
|
30 // search into the mapping table in "cp54936_4byte_tounicode.cpp", |
|
31 // which is generated with command |
|
32 // "perl -w ..\group\cp54936_4byte_tounicode.pl cp54936_4byte.txt". |
|
33 // |
|
34 // 1.3) 4 byte->unicode non-bmp: calculate with formula in this file. |
|
35 // |
|
36 // 2) unicode->foreign: |
|
37 // 2.1) unicode bmp->1/2/4 byte: the huge table in "cp54936_allbmp_fromunicode.cpp" |
|
38 // can map directly, which is generated with command |
|
39 // "perl -w ..\group\cp54936_allbmp_fromunicode.pl cp54936_2byte.txt cp54936_4byte.txt". |
|
40 // |
|
41 // 2.2) unicode non-bmp->4 byte: calculate with formula in this file. |
|
42 // |
|
// The function cp54936_2byte_tounicode.cpp::TConvDataStruct::
// ConvertSingleUnicode() is no longer used. It is retained only to avoid
// changing the tool FatConversionTable.pl.
|
46 // |
|
// About the mapping tables "cp54936_2byte.txt" and "cp54936_4byte.txt":
// 1) All Private Use Area (PUA) code points are reserved.
// 2) All GB18030 code points that map to undefined Unicode code points are reserved.
|
50 // |
|
51 // |
|
52 // About the formula for non-bmp calculation: |
|
53 // 1) All code points from 0x10000 to 0x10FFFF are supported. |
|
54 // 2) Code points in 0x10000-0x1FFFF and 0x30000-0x10FFFF are summarized from |
|
55 // the GB18030 standard, since the standard does not define the mapping for |
|
56 // code points out of 0x20000-0x2FFFF. |
|
57 |
|
58 |
|
59 #include <e32std.h> |
|
60 #include <e32def.h> |
|
61 #include <e32des8.h> |
|
62 #include "unicodeconv.h" |
|
63 #include "cp54936.h" |
|
64 |
|
65 |
|
// Panic reason codes used with the "FatCharsetConv" panic category.
enum TFccPanic
    {
    // A converted foreign (GB18030) code has an unexpected byte layout.
    EBadForeignCode = 0,
    // The table index computed from a 4-byte code is out of range.
    E4ByteIndexOutOfRange = 1,
    // NOTE(review): not referenced in the visible part of this file.
    EPanicBadIndices1 = 2,
    // A computed Unicode value lies outside the valid range.
    EInavlidUnicodeValue = 3
    };
|
73 void Panic(TFccPanic aPanic) |
|
74 { |
|
75 |
|
76 User::Panic(_L("FatCharsetConv"),aPanic); |
|
77 } |
|
78 |
|
79 |
|
80 //replacement character to be used when unicode cannot be converted |
|
81 const TUint8 KForeignReplacement = 0x5F; |
|
82 |
|
83 const TUint8 KU10000Byte1 = 0x90; |
|
84 const TUint8 KU10000Byte2 = 0x30; |
|
85 const TUint8 KU10000Byte3 = 0x81; |
|
86 const TUint8 KU10000Byte4 = 0x30; |
|
87 |
|
88 inline TBool IsSupplementary(TUint aChar) |
|
89 /** |
|
90 @param aChar The 32-bit code point value of a Unicode character. |
|
91 |
|
92 @return True, if aChar is supplementary character; false, otherwise. |
|
93 */ |
|
94 { |
|
95 return (aChar > 0xFFFF); |
|
96 } |
|
97 |
|
98 inline TBool IsSurrogate(TText16 aInt16) |
|
99 /** |
|
100 @return True, if aText16 is high surrogate or low surrogate; false, otherwise. |
|
101 */ |
|
102 { |
|
103 return (aInt16 & 0xF800) == 0xD800; |
|
104 } |
|
105 |
|
106 inline TBool IsHighSurrogate(TText16 aInt16) |
|
107 /** |
|
108 @return True, if aText16 is high surrogate; false, otherwise. |
|
109 */ |
|
110 { |
|
111 return (aInt16 & 0xFC00) == 0xD800; |
|
112 } |
|
113 |
|
114 inline TBool IsLowSurrogate(TText16 aInt16) |
|
115 /** |
|
116 @return True, if aText16 is low surrogate; false, otherwise. |
|
117 */ |
|
118 { |
|
119 return (aInt16 & 0xFC00) == 0xDC00; |
|
120 } |
|
121 |
|
122 inline TUint JoinSurrogate(TText16 aHighSurrogate, TText16 aLowSurrogate) |
|
123 /** |
|
124 Combine a high surrogate and a low surrogate into a supplementary character. |
|
125 |
|
126 @return The 32-bit code point value of the generated Unicode supplementary |
|
127 character. |
|
128 */ |
|
129 { |
|
130 return ((aHighSurrogate - 0xD7F7) << 10) + aLowSurrogate; |
|
131 } |
|
132 |
|
133 inline TText16 GetHighSurrogate(TUint aChar) |
|
134 /** |
|
135 Retrieve the high surrogate of a supplementary character. |
|
136 |
|
137 @param aChar The 32-bit code point value of a Unicode character. |
|
138 |
|
139 @return High surrogate of aChar, if aChar is a supplementary character; |
|
140 aChar itself, if aChar is not a supplementary character. |
|
141 */ |
|
142 { |
|
143 return STATIC_CAST(TText16, 0xD7C0 + (aChar >> 10)); |
|
144 } |
|
145 |
|
146 inline TText16 GetLowSurrogate(TUint aChar) |
|
147 /** |
|
148 Retrieve the low surrogate of a supplementary character. |
|
149 |
|
150 @param aChar The 32-bit code point value of a Unicode character. |
|
151 |
|
152 @return Low surrogate of aChar, if aChar is a supplementary character; |
|
153 zero, if aChar is not a supplementary character. |
|
154 */ |
|
155 { |
|
156 return STATIC_CAST(TText16, 0xDC00 | (aChar & 0x3FF)); |
|
157 } |
|
158 |
|
159 //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor |
|
160 EXPORT_C void UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode) |
|
161 { |
|
162 UnicodeConv::ConvertFromUnicodeL(aForeign, aUnicode, ETrue); |
|
163 } |
|
164 |
|
165 //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor |
|
166 EXPORT_C TInt UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode, TBool leaveWhenOverflow) |
|
167 { |
|
168 const TInt length = aUnicode.Length(); |
|
169 const TUint16* unicode = aUnicode.Ptr(); |
|
170 const TUint16* guard = unicode + length; |
|
171 |
|
172 TUint8* foreign = const_cast<TUint8*>(aForeign.Ptr()); |
|
173 TUint8* foreignguard = foreign + aForeign.MaxLength(); |
|
174 |
|
175 //loop going through the character of the unicode descriptor |
|
176 while (unicode < guard) |
|
177 { |
|
178 TUint32 unicodeChar = *unicode++; |
|
179 if (IsHighSurrogate(unicodeChar)) |
|
180 { |
|
181 if (unicode >= guard || !IsLowSurrogate(*unicode)) |
|
182 { |
|
183 if (foreign >= foreignguard) |
|
184 { |
|
185 aForeign.SetLength(foreign-aForeign.Ptr()); |
|
186 if (leaveWhenOverflow) |
|
187 User::Leave(KErrOverflow); |
|
188 else |
|
189 return KErrOverflow; |
|
190 } |
|
191 *foreign++ = KForeignReplacement; |
|
192 continue; |
|
193 } |
|
194 unicodeChar = JoinSurrogate(unicodeChar, *unicode++); |
|
195 } |
|
196 if (IsLowSurrogate(unicodeChar)) |
|
197 { |
|
198 if (foreign >= foreignguard) |
|
199 { |
|
200 aForeign.SetLength(foreign-aForeign.Ptr()); |
|
201 if (leaveWhenOverflow) |
|
202 User::Leave(KErrOverflow); |
|
203 else |
|
204 return KErrOverflow; |
|
205 } |
|
206 *foreign++ = KForeignReplacement; |
|
207 continue; |
|
208 } |
|
209 |
|
210 TUint8 b1, b2, b3, b4; // byte 1,2,3,4 of result GB18030 code. |
|
211 TInt count; // byte count of result GB18030 code; can be 1, 2 or 4. |
|
212 |
|
213 // unicode to cp54936 |
|
214 if (IsSupplementary(unicodeChar)) |
|
215 { |
|
216 unicodeChar -= 0x10000; |
|
217 b4 = unicodeChar % 10 + KU10000Byte4; |
|
218 unicodeChar /= 10; |
|
219 b3 = unicodeChar % 126 + KU10000Byte3; |
|
220 unicodeChar /= 126; |
|
221 b2 = unicodeChar % 10 + KU10000Byte2; |
|
222 b1 = unicodeChar / 10 + KU10000Byte1; |
|
223 count = 4; |
|
224 } |
|
225 else |
|
226 { |
|
227 TUint32 foreignChar; |
|
228 foreignChar = KMappingTableUnicodeBmp2CP54936[unicodeChar]; |
|
229 b1 = ((foreignChar >> 24) & 0xFF); |
|
230 b2 = ((foreignChar >> 16) & 0xFF); |
|
231 b3 = ((foreignChar >> 8) & 0xFF); |
|
232 b4 = (foreignChar & 0xFF); |
|
233 count = 1; |
|
234 if (b1) |
|
235 { |
|
236 count = 4; |
|
237 } |
|
238 else |
|
239 { |
|
240 __ASSERT_DEBUG(b2==0, Panic(EBadForeignCode)); |
|
241 if (b3) |
|
242 { |
|
243 count = 2; |
|
244 } |
|
245 } |
|
246 } |
|
247 |
|
248 if (foreign + count > foreignguard) |
|
249 { |
|
250 aForeign.SetLength(foreign-aForeign.Ptr()); |
|
251 if (leaveWhenOverflow) |
|
252 User::Leave(KErrOverflow); |
|
253 else |
|
254 return KErrOverflow; |
|
255 } |
|
256 if (count == 4) |
|
257 { |
|
258 *foreign++ = b1; |
|
259 *foreign++ = b2; |
|
260 } |
|
261 if (count >= 2) |
|
262 *foreign++ = b3; |
|
263 *foreign++ = b4; |
|
264 } |
|
265 aForeign.SetLength(foreign-aForeign.Ptr()); |
|
266 return KErrNone; |
|
267 } |
|
268 |
|
269 |
|
270 //This function converts from foreign characters into unicode and adds them into a descriptor |
|
271 EXPORT_C void UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign) |
|
272 { |
|
273 UnicodeConv::ConvertToUnicodeL(aUnicode, aForeign, ETrue); |
|
274 } |
|
275 |
|
276 //This function converts from foreign characters into unicode and adds them into a descriptor |
|
277 EXPORT_C TInt UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign, TBool leaveWhenOverflow) |
|
278 { |
|
279 const TInt foreignLength = aForeign.Length(); |
|
280 const TUint8* foreign = aForeign.Ptr(); |
|
281 const TUint8* guard = foreign + foreignLength; |
|
282 |
|
283 TUint16* unicode = const_cast<TUint16*>(aUnicode.Ptr()); |
|
284 TUint16* unicodeguard = unicode + aUnicode.MaxLength(); |
|
285 |
|
286 TUint8 b1, b2, b3, b4; |
|
287 enum TCodeType |
|
288 { |
|
289 E1Byte = 0, |
|
290 E2Byte, |
|
291 E4ByteBmp, |
|
292 E4ByteSupplementary, |
|
293 EError, |
|
294 }; |
|
295 TCodeType codetype; |
|
296 TUint32 unicodeChar; |
|
297 |
|
298 //loop going through the characters of the foreign descriptor |
|
299 while (foreign < guard) |
|
300 { |
|
301 // roughly, detect which area the foreign code belongs to |
|
302 b1 = *foreign++; |
|
303 if (b1 <= 0x7F) |
|
304 codetype = E1Byte; |
|
305 else if (b1 == 0x80 || b1 > 0xFE) |
|
306 codetype = EError; |
|
307 else if (foreign >= guard) |
|
308 codetype = EError; |
|
309 else |
|
310 { |
|
311 b2 = *foreign++; |
|
312 if (b2 >= 0x40 && b2 <= 0xFE && b2 != 0x7F) |
|
313 codetype = E2Byte; |
|
314 else if (b2 < 0x30 || b2 > 0x39) |
|
315 codetype = EError; |
|
316 else if (foreign+1 >= guard) |
|
317 codetype = EError; |
|
318 else |
|
319 { |
|
320 b3 = *foreign++; |
|
321 if (b3 < 0x81 || b3 > 0xFE) |
|
322 codetype = EError; |
|
323 else |
|
324 { |
|
325 b4 = *foreign++; |
|
326 if (b4 < 0x30 || b4 > 0x39) |
|
327 codetype = EError; |
|
328 else if (b1 >= 0x81 && b1 <= 0x84) // 0x81308130-0x8439FE39 |
|
329 codetype = E4ByteBmp; |
|
330 else if (b1 >= 0x90 && b1 <= 0xE3) // 0x90308130-0xE339FE39 |
|
331 codetype = E4ByteSupplementary; |
|
332 else |
|
333 codetype = EError; // others are reserved |
|
334 } |
|
335 } |
|
336 } |
|
337 |
|
338 // cp54936 to unicode |
|
339 if (codetype == E1Byte) |
|
340 { |
|
341 unicodeChar = b1; |
|
342 } |
|
343 else if (codetype == E2Byte) |
|
344 { |
|
345 // conventional algorithm used in FatCharsetConv |
|
346 const TLeadOrSingle* structPtr = TConvDataStruct::KFirstByteConversions + (b1-0x80); |
|
347 if (structPtr->iUnicodeIfSingle) |
|
348 unicodeChar = structPtr->iUnicodeIfSingle; |
|
349 else if (TConvDataStruct::KMinTrailByte <= b2 && b2 <= TConvDataStruct::KMaxTrailByte) |
|
350 unicodeChar = TConvDataStruct::KDoubleByteConversions[structPtr->iDoubleByteIndex + (b2 - TConvDataStruct::KMinTrailByte)]; |
|
351 else |
|
352 unicodeChar = 0xFFFD; |
|
353 } |
|
354 else if (codetype == E4ByteBmp) |
|
355 { |
|
356 TUint index = (b1-0x81)*12600 + (b2-0x30)*1260 + (b3-0x81)*10 + (b4-0x30); |
|
357 __ASSERT_DEBUG(index<39420, Panic(E4ByteIndexOutOfRange)); |
|
358 unicodeChar = KMappingTable4ByteBmp2Unicode[index]; |
|
359 } |
|
360 else if (codetype == E4ByteSupplementary) |
|
361 { |
|
362 unicodeChar = 0x10000 + (b1 - KU10000Byte1) * 12600 + |
|
363 (b2 - KU10000Byte2) * 1260 + |
|
364 (b3 - KU10000Byte3) * 10 + |
|
365 (b4 - KU10000Byte4); |
|
366 __ASSERT_DEBUG(unicodeChar >= 0x10000 && unicodeChar <= 0x10FFFF, Panic(EInavlidUnicodeValue)); |
|
367 } |
|
368 else |
|
369 { |
|
370 unicodeChar = 0xFFFD; |
|
371 } |
|
372 |
|
373 // append to output buffer |
|
374 if (IsSupplementary(unicodeChar)) |
|
375 { |
|
376 if (unicode + 1 >= unicodeguard) |
|
377 { |
|
378 aUnicode.SetLength(unicode-aUnicode.Ptr()); |
|
379 if (leaveWhenOverflow) |
|
380 User::Leave(KErrOverflow); |
|
381 else |
|
382 return KErrOverflow; |
|
383 } |
|
384 *unicode++ = GetHighSurrogate(unicodeChar); |
|
385 *unicode++ = GetLowSurrogate(unicodeChar); |
|
386 } |
|
387 else |
|
388 { |
|
389 if (unicode >= unicodeguard) |
|
390 { |
|
391 aUnicode.SetLength(unicode-aUnicode.Ptr()); |
|
392 if (leaveWhenOverflow) |
|
393 User::Leave(KErrOverflow); |
|
394 else |
|
395 return KErrOverflow; |
|
396 } |
|
397 *unicode++ = unicodeChar; |
|
398 } |
|
399 } |
|
400 aUnicode.SetLength(unicode-aUnicode.Ptr()); |
|
401 return KErrNone; |
|
402 } |
|
403 |
|
404 EXPORT_C TBool UnicodeConv::IsLegalShortNameCharacter (TUint aCharacter) |
|
405 { |
|
406 //1. aCharacter >= 0x0080 |
|
407 if (aCharacter>=0x0080) |
|
408 { |
|
409 // Since all Unicode characters can be mapped to GB18030, so no need to |
|
410 // test the converting. |
|
411 if (aCharacter <= 0x10FFFF && !IsSurrogate(aCharacter)) |
|
412 return ETrue; |
|
413 else |
|
414 return EFalse; |
|
415 } |
|
416 |
|
417 // For most common cases: |
|
418 // Note: lower case characters are considered legal DOS char here. |
|
419 if ((aCharacter>='a' && aCharacter<='z') || |
|
420 (aCharacter>='A' && aCharacter<='Z') || |
|
421 (aCharacter>='0' && aCharacter<='9')) |
|
422 { |
|
423 return ETrue; |
|
424 } |
|
425 // Checking for illegal chars: |
|
426 // 2. aCharacter <= 0x20 |
|
427 // Note: leading 0x05 byte should be guarded by callers of this function |
|
428 // as the information of the position of the character is required. |
|
429 if (aCharacter < 0x20) |
|
430 return EFalse; |
|
431 // Space (' ') is not considered as a legal DOS char here. |
|
432 if (aCharacter == 0x20) |
|
433 return EFalse; |
|
434 |
|
435 // 3. 0x20 < aCharacter < 0x80 |
|
436 // According to FAT Spec, "following characters are not legal in any bytes of DIR_Name": |
|
437 switch (aCharacter) |
|
438 { |
|
439 case 0x22: // '"' |
|
440 case 0x2A: // '*' |
|
441 case 0x2B: // '+' |
|
442 case 0x2C: // ',' |
|
443 //case 0x2E: // '.' // Although '.' is not allowed in any bytes of DIR_Name, it |
|
444 // is a valid character in short file names. |
|
445 case 0x2F: // '/' |
|
446 case 0x3A: // ':' |
|
447 case 0x3B: // ';' |
|
448 case 0x3C: // '<' |
|
449 case 0x3D: // '=' |
|
450 case 0x3E: // '>' |
|
451 case 0x3F: // '?' |
|
452 case 0x5B: // '[' |
|
453 case 0x5C: // '\' |
|
454 case 0x5D: // ']' |
|
455 case 0x7C: // '|' |
|
456 return EFalse; |
|
457 default: |
|
458 return ETrue; |
|
459 } |
|
460 } |
|
461 |
|