|
1 /* |
|
2 * Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 |
|
19 #include <e32std.h> |
|
20 #include <e32base.h> |
|
21 #include <utf.h> |
|
22 |
|
// Sentinel value returned by Base64Decoding() for a byte that is not a member of the base-64 alphabet.
const TUint KNotInBase64Alphabet=KMaxTUint;
|
24 |
|
// Panic reasons raised by this file through Panic() (category KLitPanicText).
// Numbering starts at 1 and follows declaration order, so inserting or reordering
// enumerators changes the numeric panic codes.
enum TPanic
	{
	EPanicBad6BitNumber=1,
	EPanicBadUtf7Pointers1,
	EPanicBadUtf7Pointers2,
	EPanicBadUtf7Pointers3,
	EPanicBadUtf7Pointers4,
	EPanicBadUtf7Pointers5,
	EPanicBadUtf7Pointers6,
	EPanicBadUtf7Pointers7,
	EPanicBadUtf7Pointers8,
	EPanicBadUtf7Pointers9,
	EPanicBadUtf7Pointers10,
	EPanicBadUtf7Pointers11,
	EPanicNotInBase64Block,
	EPanicBadUnicodePointers1,
	EPanicBadUnicodePointers2,
	EPanicBadUnicodePointers3,
	EPanicBadUnicodePointers4,
	EPanicBadUnicodePointers5,
	EPanicBadUnicodePointers6,
	EPanicBadUnicodePointers7,
	EPanicBadUnicodePointers8,
	EPanicBadUnicodePointers9,
	EPanicBadUnicodePointers10,
	EPanicBadBitBufferState1,
	EPanicBadBitBufferState2,
	EPanicBadBitBufferState3,
	EPanicBadBitBufferState4,
	EPanicBadBitBufferState5,
	EPanicBadBitBufferState6,
	EPanicBadBitBufferState7,
	EPanicBadBitBufferState8,
	EPanicBadBitBufferState9,
	EPanicBadBitBufferState10,
	EPanicBadBitBufferState11,
	EPanicBadBitBufferState12,
	EPanicBadBitBufferState13,
	EPanicBadBitBufferState14,
	EPanicBadBitBufferState15,
	EPanicBadBitBufferState16,
	EPanicBadBitBufferState17,
	EPanicUnexpectedNumberOfLoopIterations,
	EPanicInitialEscapeCharacterButNoBase64,
	EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,
	EPanicBadUtf8Pointers1,
	EPanicBadUtf8Pointers2,
	EPanicBadUtf8Pointers3,
	EPanicBadUtf8Pointers4,
	EPanicBadUtf8Pointers5,
	EPanicBadUtf8Pointers6,
	EPanicBadUtf8Pointers7,
	EPanicOutOfSyncUtf7Byte1,
	EPanicOutOfSyncUtf7Byte2,
	EPanicOutOfSyncBase64Decoding
	};
|
81 |
|
// Panic category text used for every panic raised from this file.
_LIT(KLitPanicText, "CHARCONV-UTF");

// Raises a panic in category KLitPanicText, using aPanic as the panic number.
LOCAL_C void Panic(TPanic aPanic)
	{
	User::Panic(KLitPanicText, aPanic);
	}
|
88 |
|
89 inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';} |
|
90 |
|
91 LOCAL_C TUint Base64Decoding(TUint aMemberOfBase64Alphabet, TBool aIsImapUtf7) |
|
92 { |
|
93 if ((aMemberOfBase64Alphabet>='A') && (aMemberOfBase64Alphabet<='Z')) |
|
94 { |
|
95 return aMemberOfBase64Alphabet-'A'; |
|
96 } |
|
97 if ((aMemberOfBase64Alphabet>='a') && (aMemberOfBase64Alphabet<='z')) |
|
98 { |
|
99 return aMemberOfBase64Alphabet-('a'-26); |
|
100 } |
|
101 if ((aMemberOfBase64Alphabet>='0') && (aMemberOfBase64Alphabet<='9')) |
|
102 { |
|
103 return aMemberOfBase64Alphabet+((26*2)-'0'); |
|
104 } |
|
105 if (aMemberOfBase64Alphabet=='+') |
|
106 { |
|
107 return 62; |
|
108 } |
|
109 if (aMemberOfBase64Alphabet==STATIC_CAST(TUint, aIsImapUtf7? ',': '/')) |
|
110 { |
|
111 return 63; |
|
112 } |
|
113 return KNotInBase64Alphabet; |
|
114 } |
|
115 |
|
116 LOCAL_C TUint Base64Encoding(TUint a6BitNumber, TBool aIsImapUtf7) |
|
117 { |
|
118 __ASSERT_DEBUG(a6BitNumber<64, Panic(EPanicBad6BitNumber)); |
|
119 if ((a6BitNumber==63) && aIsImapUtf7) |
|
120 { |
|
121 return ','; |
|
122 } |
|
123 static const TUint8 base64Alphabet[64]={'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'}; |
|
124 return base64Alphabet[a6BitNumber]; |
|
125 } |
|
126 |
|
127 LOCAL_C TUint8* PointerToEscapeCharacterStartingBase64Block(TUint8* aPointerToUtf7Byte, const TUint8* aPointerToFirstUtf7Byte, TBool aIsImapUtf7) |
|
128 { |
|
129 __ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers1)); |
|
130 TUint8* pointerToCandidateEscapeCharacter=NULL; |
|
131 FOREVER |
|
132 { |
|
133 const TUint utf7Byte=*aPointerToUtf7Byte; |
|
134 if (utf7Byte==EscapeCharacterForStartingBase64Block(aIsImapUtf7)) |
|
135 { |
|
136 pointerToCandidateEscapeCharacter=aPointerToUtf7Byte; |
|
137 } |
|
138 else if (Base64Decoding(utf7Byte, aIsImapUtf7)==KNotInBase64Alphabet) |
|
139 { |
|
140 break; |
|
141 } |
|
142 __ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers2)); |
|
143 if (aPointerToUtf7Byte<=aPointerToFirstUtf7Byte) |
|
144 { |
|
145 break; |
|
146 } |
|
147 --aPointerToUtf7Byte; |
|
148 } |
|
149 __ASSERT_DEBUG(pointerToCandidateEscapeCharacter!=NULL, Panic(EPanicNotInBase64Block)); |
|
150 return pointerToCandidateEscapeCharacter; |
|
151 } |
|
152 |
|
153 LOCAL_C TBool EncodeInUtf7Directly(TUint aUnicodeCharacter, TBool aIsImapUtf7, TBool aEncodeOptionalDirectCharactersInBase64) |
|
154 { |
|
155 if (aIsImapUtf7) |
|
156 { |
|
157 return (aUnicodeCharacter>=0x0020) && (aUnicodeCharacter<=0x007e); |
|
158 } |
|
159 if ((aUnicodeCharacter>=0x0021) && (aUnicodeCharacter<=0x007d)) |
|
160 { |
|
161 if (aEncodeOptionalDirectCharactersInBase64) |
|
162 { |
|
163 return (((aUnicodeCharacter>=0x0041) && (aUnicodeCharacter<=0x005a)) || |
|
164 ((aUnicodeCharacter>=0x0061) && (aUnicodeCharacter<=0x007a)) || |
|
165 ((aUnicodeCharacter>=0x0027) && (aUnicodeCharacter<=0x0029)) || |
|
166 ((aUnicodeCharacter>=0x002b) && (aUnicodeCharacter<=0x003a)) || |
|
167 (aUnicodeCharacter==0x003f)); |
|
168 } |
|
169 return aUnicodeCharacter!=0x005c; |
|
170 } |
|
171 return (aUnicodeCharacter==0x0020) || (aUnicodeCharacter==0x0009) || (aUnicodeCharacter==0x000d) || (aUnicodeCharacter==0x000a); |
|
172 } |
|
173 |
|
174 inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer) |
|
175 { |
|
176 return (aBitBuffer&((1<<aNumberOfBitsInBuffer)-1))!=0; |
|
177 } |
|
178 |
|
179 |
|
180 |
|
181 /** Converts Unicode text into UTF-7 encoding. The fucntion leaves with |
|
182 KErrCorrupt if the input string is corrupt. |
|
183 |
|
184 @param aUnicode A UCS-2 encoded input string. |
|
185 @param aEncodeOptionalDirectCharactersInBase64 If ETrue then |
|
186 characters from UTF-7 set O (optional direct characters) are encoded in |
|
187 Modified Base64. If EFalse the characters are encoded directly, |
|
188 as their ASCII equivalents. |
|
189 @return A descriptor containing the UTF-7 encoded output string. */ |
|
190 EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf7L( |
|
191 const TDesC16& aUnicode, |
|
192 TBool aEncodeOptionalDirectCharactersInBase64) |
|
193 { |
|
194 // If aUnicode is Null string, return an empty HBufC |
|
195 if (aUnicode.Length() == 0) |
|
196 { |
|
197 HBufC8* hBuf8 = HBufC8::NewL(1); |
|
198 return hBuf8; |
|
199 } |
|
200 |
|
201 // Otherwise, convert and store result in a buffer, reallocating that buffer if needed. |
|
202 TInt length = aUnicode.Length(); |
|
203 const TInt bufsize = 100; |
|
204 |
|
205 TPtrC16 unicode (aUnicode); |
|
206 TBuf8<bufsize> buf; |
|
207 HBufC8* hBuf8 = HBufC8::NewLC(length); |
|
208 TPtr8 utf7 = hBuf8->Des(); |
|
209 |
|
210 FOREVER |
|
211 { |
|
212 TInt unconverted = ConvertFromUnicodeToUtf7(buf, unicode, aEncodeOptionalDirectCharactersInBase64); |
|
213 if( unconverted == EErrorIllFormedInput || unconverted < 0) |
|
214 User::Leave(KErrCorrupt); |
|
215 |
|
216 if (utf7.Length() + buf.Length() > utf7.MaxLength()) |
|
217 { |
|
218 // Reallocate the hBuf8 |
|
219 hBuf8 = hBuf8->ReAllocL(utf7.Length() + buf.Length()); |
|
220 CleanupStack::Pop(); |
|
221 CleanupStack::PushL(hBuf8); |
|
222 utf7.Set(hBuf8->Des()); |
|
223 } |
|
224 utf7.Append(buf); |
|
225 if (unconverted ==0) |
|
226 break; |
|
227 unicode.Set(unicode.Right(unconverted)); |
|
228 } |
|
229 CleanupStack::Pop(); |
|
230 return hBuf8; |
|
231 |
|
232 } |
|
233 |
|
/** Converts Unicode text into UTF-7 encoding.

This overload forwards to the four-argument overload with aIsImapUtf7 set to
EFalse.

@param aUtf7 On return, contains the UTF-7 encoded output string.
@param aUnicode A UCS-2 encoded input string.
@param aEncodeOptionalDirectCharactersInBase64 If ETrue then characters from
UTF-7 set O (optional direct characters) are encoded in Modified Base64. If
EFalse the characters are encoded directly, as their ASCII equivalents.
@return The number of unconverted characters left at the end of the input
descriptor, or one of the error values defined in TError. */
EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(
	TDes8& aUtf7,
	const TDesC16& aUnicode,
	TBool aEncodeOptionalDirectCharactersInBase64)
	{
	return ConvertFromUnicodeToUtf7(aUtf7, aUnicode, EFalse, aEncodeOptionalDirectCharactersInBase64);
	}
|
250 |
|
/** Converts Unicode text into UTF-7, in either the '+'/'/' form or, when
aIsImapUtf7 is set, the '&'/',' form.

Characters accepted by EncodeInUtf7Directly() are copied as single bytes (a
literal escape character is followed by '-'). All other characters are packed,
16 bits each, into a base-64 block that starts with the escape character and is
closed with '-'. Conversion stops when the output descriptor has no room for
the next character; a block that is still open at that point is closed, if
necessary by rewinding to an earlier Unicode character boundary.

@param aUtf7 On return, contains the UTF-7 encoded output string. It is left
untouched if its maximum length is zero and aUnicode is not empty.
@param aUnicode The input string, read as 16-bit units.
@param aIsImapUtf7 Selects the escape character and base-64 alphabet variant.
@param aEncodeOptionalDirectCharactersInBase64 Passed to EncodeInUtf7Directly().
@return The number of unconverted characters left at the end of the input
descriptor. */
TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(TDes8& aUtf7,
	const TDesC16& aUnicode,
	TBool aIsImapUtf7,
	TBool aEncodeOptionalDirectCharactersInBase64)
	{
	if (aUnicode.Length()==0)
		{
		aUtf7.SetLength(0);
		return 0;
		}
	if (aUtf7.MaxLength()==0)
		{
		return aUnicode.Length();
		}
	const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7);
	// both cursors are "previous element" pointers: they start one element before the data and are pre-incremented before each read/write
	TUint8* pointerToPreviousUtf7Byte=CONST_CAST(TUint8*, aUtf7.Ptr()-1);
	const TUint8* const pointerToLastUtf7Byte=pointerToPreviousUtf7Byte+aUtf7.MaxLength();
	const TUint16* pointerToPreviousUnicodeCharacter=aUnicode.Ptr()-1;
	const TUint16* const pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.Length();
	// the top bit of bitBuffer doubles as the "currently inside a base-64 block" flag; the low bits hold data not yet written out
	const TUint KIsInBase64Block=0x80000000u;
	TUint bitBuffer=0;
	TInt numberOfBitsInBuffer=0;
	FOREVER
		{
		__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers3));
		__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers1));
		// at the end of the input there is no character to read, so 0 is used as a placeholder
		TUint currentUnicodeCharacter=(pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter)? 0: *(pointerToPreviousUnicodeCharacter+1);
		// this branch handles both "end of input" and "next character is written directly": either way any open base-64 block must be closed first
		if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || EncodeInUtf7Directly(currentUnicodeCharacter, aIsImapUtf7, aEncodeOptionalDirectCharactersInBase64))
			{
			__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState1));
			__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState2));
			if (bitBuffer&KIsInBase64Block)
				{
				if (numberOfBitsInBuffer!=0)
					{
					if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<2) // make sure there is enough space for the trailing '-' as well as the remains of the bitBuffer as the KIsInBase64Block flag is about to turned off, thus the trailing '-' may never get written
						{
						break;
						}
					// flush the leftover bits, left-aligned in a final base-64 character
					++pointerToPreviousUtf7Byte;
					*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7));
					}
				else
					{
					if (pointerToPreviousUtf7Byte==pointerToLastUtf7Byte)
						{
						break;
						}
					}
				++pointerToPreviousUtf7Byte;
				*pointerToPreviousUtf7Byte='-';
				bitBuffer=0;
				numberOfBitsInBuffer=0;
				}
			__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers2));
			if (pointerToPreviousUnicodeCharacter>=pointerToLastUnicodeCharacter)
				{
				break;
				}
			__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers4));
			// a literal escape character needs two bytes because it is written followed by '-'
			if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<((currentUnicodeCharacter==escapeCharacterForStartingBase64Block)? 2: 1))
				{
				break;
				}
			++pointerToPreviousUtf7Byte;
			*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, currentUnicodeCharacter);
			++pointerToPreviousUnicodeCharacter;
			if (currentUnicodeCharacter==escapeCharacterForStartingBase64Block)
				{
				++pointerToPreviousUtf7Byte;
				*pointerToPreviousUtf7Byte='-';
				}
			}
		else
			{
				{
				TInt numberOfUtf7BytesRequired=(numberOfBitsInBuffer+16)/6; // "(numberOfBitsInBuffer+16)/6" is the number of iterations that will happen in the while loop below
				if (~bitBuffer&KIsInBase64Block)
					{
					++numberOfUtf7BytesRequired; // for the initial escapeCharacterForStartingBase64Block
					}
				if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<numberOfUtf7BytesRequired)
					{
					break;
					}
				}
			if (~bitBuffer&KIsInBase64Block)
				{
				__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers5));
				++pointerToPreviousUtf7Byte;
				*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, escapeCharacterForStartingBase64Block);
				}
			// append the 16 bits of this character below whatever is left over from the previous one (the flag bit is shifted out here and restored below)
			bitBuffer<<=16;
			bitBuffer|=currentUnicodeCharacter;
			numberOfBitsInBuffer+=16;
			++pointerToPreviousUnicodeCharacter;
			__ASSERT_DEBUG(numberOfBitsInBuffer<=20, Panic(EPanicBadBitBufferState3));
			while (numberOfBitsInBuffer>=6)
				{
				numberOfBitsInBuffer-=6;
				__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers6));
				++pointerToPreviousUtf7Byte;
				*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer>>numberOfBitsInBuffer)&0x3f, aIsImapUtf7));
				}
			bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - not strictly necessary but it leaves the buffer in a cleaner state
			bitBuffer|=KIsInBase64Block;
			}
		}
	__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState4));
	__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState5));
	// the main loop was left with a base-64 block still open (the output ran out of room): close it before returning
	if (bitBuffer&KIsInBase64Block)
		{
#if defined(_DEBUG)
		TInt numberOfLoopIterations=1;
#endif
		FOREVER // there should never be more than 2 iterations of this loop - the first "if" should always succeed the second time if it doesn't succeed the first time
			{
			__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers7));
			__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState6));
			__ASSERT_DEBUG(numberOfLoopIterations<=2, Panic(EPanicUnexpectedNumberOfLoopIterations));
#if defined(_DEBUG)
			++numberOfLoopIterations;
#endif
			if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte>=((numberOfBitsInBuffer==0)? 1: 2)) // if there's room to finish off the base-64 sequence by (i) flushing the bit-buffer and (ii) appending the trailing '-'
				{
				if (numberOfBitsInBuffer!=0)
					{
					__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers8));
					++pointerToPreviousUtf7Byte;
					*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7));
					}
				__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers9));
				++pointerToPreviousUtf7Byte;
				*pointerToPreviousUtf7Byte='-';
				break;
				}
			// it is now necessary to move back pointerToPreviousUtf7Byte so that the base-64 sequence can be terminated - note it must be terminated on a Unicode character boundary hence the reason why pointerToPreviousUnicodeCharacter may be moved back too
			TUint8* pointerToEscapeCharacterStartingBase64Block=PointerToEscapeCharacterStartingBase64Block(pointerToPreviousUtf7Byte, aUtf7.Ptr(), aIsImapUtf7);
			const TInt oldNumberOfBase64Characters=pointerToPreviousUtf7Byte-pointerToEscapeCharacterStartingBase64Block;
			__ASSERT_DEBUG(oldNumberOfBase64Characters>0, Panic(EPanicInitialEscapeCharacterButNoBase64));
			__ASSERT_DEBUG(((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)%16==0, Panic(EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary));
			pointerToPreviousUnicodeCharacter-=((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)/16; // move back pointerToPreviousUnicodeCharacter to before the equivalent of the base-64 sequence
			pointerToPreviousUtf7Byte=pointerToEscapeCharacterStartingBase64Block;
			__ASSERT_DEBUG(*pointerToPreviousUtf7Byte==escapeCharacterForStartingBase64Block, Panic(EPanicBadUtf7Pointers10));
			if (oldNumberOfBase64Characters<4) // if the new base-64 sequence will be so short that it won't even be able to contain the UTF-7 encoding of a single Unicode character
				{
				--pointerToPreviousUtf7Byte; // move back pointerToPreviousUtf7Byte to before the escapeCharacterForStartingBase64Block
				break;
				}
			// work out how many whole Unicode characters fit when one base-64 character fewer is used, and re-advance both cursors by that much
			const TInt newNumberOfUnicodeCharacters=((oldNumberOfBase64Characters-1)*3)/8;
			pointerToPreviousUnicodeCharacter+=newNumberOfUnicodeCharacters;
			pointerToPreviousUtf7Byte+=((newNumberOfUnicodeCharacters*8)+2)/3;
			// the new final base-64 character may still carry bits of the following (now dropped) Unicode character: clear them
			const TInt numberOfBitsToBeZeroedInLastBase64Character=(newNumberOfUnicodeCharacters%3)*2;
			if (numberOfBitsToBeZeroedInLastBase64Character!=0)
				{
				*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding(Base64Decoding(*pointerToPreviousUtf7Byte, aIsImapUtf7)&0x3f&~((1<<numberOfBitsToBeZeroedInLastBase64Character)-1), aIsImapUtf7));
				}
			bitBuffer=KIsInBase64Block;
			numberOfBitsInBuffer=0;
			}
		}
	// both cursors point at the last element written/consumed, hence the +1 for the length
	aUtf7.SetLength((pointerToPreviousUtf7Byte-aUtf7.Ptr())+1);
	return pointerToLastUnicodeCharacter-pointerToPreviousUnicodeCharacter;
	}
|
415 |
|
416 |
|
417 |
|
/** Converts Unicode text into UTF-8 encoding.

This overload forwards to the three-argument overload with
aGenerateJavaConformantUtf8 set to EFalse.

@param aUtf8 On return, contains the UTF-8 encoded output string.
@param aUnicode The Unicode-encoded input string.
@return The number of unconverted characters left at the end of the input
descriptor, or one of the error values defined in TError. */
EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)
	{
	return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse);
	}
|
428 |
|
429 |
|
430 /** Converts Unicode text into UTF-8 encoding. |
|
431 |
|
432 The variant of UTF-8 used internally by Java differs slightly from |
|
433 standard UTF-8. The TBool argument controls the UTF-8 |
|
434 variant generated by this function. This function leaves with a |
|
435 KErrCorrupt if the input string is corrupt. |
|
436 |
|
437 @param aUnicode A UCS-2 encoded input string. |
|
438 @return A pointer to an HBufC8 containing the converted UTF8. */ |
|
439 EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf8L(const TDesC16& aUnicode) |
|
440 { |
|
441 // If aUnicode is Null string, return an empty HBufC |
|
442 if (aUnicode.Length() == 0) |
|
443 { |
|
444 HBufC8* hBuf8 = HBufC8::NewL(1); |
|
445 return hBuf8; |
|
446 } |
|
447 |
|
448 // Otherwise, convert and store result in a buffer, reallocating that buffer if needed. |
|
449 const TInt length = aUnicode.Length(); |
|
450 const TInt bufsize = 100; |
|
451 |
|
452 TPtrC16 unicode (aUnicode); |
|
453 TBuf8<bufsize> buf; |
|
454 HBufC8* hBuf8 = HBufC8::NewLC(length); |
|
455 TPtr8 utf8 = hBuf8->Des(); |
|
456 |
|
457 FOREVER |
|
458 { |
|
459 TInt unconverted = ConvertFromUnicodeToUtf8(buf, unicode); |
|
460 if( unconverted == EErrorIllFormedInput || unconverted < 0) |
|
461 User::Leave(KErrCorrupt); |
|
462 |
|
463 if (utf8.Length() + buf.Length() > utf8.MaxLength()) |
|
464 { |
|
465 // Reallocate the hBuf8 |
|
466 hBuf8 = hBuf8->ReAllocL(utf8.Length() + buf.Length()); |
|
467 CleanupStack::Pop(); |
|
468 CleanupStack::PushL(hBuf8); |
|
469 utf8.Set(hBuf8->Des()); |
|
470 } |
|
471 utf8.Append(buf); |
|
472 if (unconverted ==0) |
|
473 break; |
|
474 unicode.Set(unicode.Right(unconverted)); |
|
475 } |
|
476 CleanupStack::Pop(); |
|
477 return hBuf8; |
|
478 } |
|
479 |
|
/** Converts Unicode text into UTF-8 encoding.

Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value
(only when aGenerateJavaConformantUtf8 is EFalse; otherwise each surrogate
falls through to the 3 byte case).

The variant of UTF-8 used internally by Java differs slightly from standard
UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
With ETrue, the character 0x0000 is written as the two bytes 0xC0 0x80.

Conversion stops when the output descriptor has no room for the whole of the
next character.

@param aUtf8 On return, contains the UTF-8 encoded output string. It is left
untouched if its maximum length is zero and aUnicode is not empty, and on the
early EErrorIllFormedInput return for an unpaired high surrogate.
@param aUnicode A UCS-2 encoded input string.
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
UTF-8. The default is EFalse.
@return The number of unconverted characters left at the end of the input descriptor,
or one of the error values defined in TError. EErrorIllFormedInput is returned
when a high surrogate is followed by something other than a low surrogate, or
when nothing precedes a high surrogate that is the last character of the input. */
TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8,
	const TDesC16& aUnicode,
	TBool aGenerateJavaConformantUtf8)
	{
	if (aUnicode.Length() == 0)
		{
		aUtf8.SetLength(0);
		return 0;
		}
	if (aUtf8.MaxLength() == 0)
		{
		return aUnicode.Length();
		}

	// Pointer protocol: at the bottom of each loop iteration pUtf8 and pUnicode point at
	// the last byte written and the last character consumed. A branch that cannot fit its
	// character steps both pointers back by one before breaking, so that the same holds on exit.
	TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr());
	const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1);
	TBool inputIsTruncated = EFalse;
	const TUint16* pUnicode = aUnicode.Ptr();
	const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);

	FOREVER
		{
		__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));
		__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));

		if (pUnicode[0] < 0x80)
			{
			// ascii - 1 byte

			// internally java is different since the \x0000 character is
			// translated into \xC0 \x80.

			if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))
				{
				if (pUtf8 == pointerToLastUtf8Byte)
					{
					// only one byte of space left but two are needed
					pUtf8--;
					pUnicode--;
					break;
					}
				*pUtf8++ = STATIC_CAST(TUint8, 0xc0);
				*pUtf8 = STATIC_CAST(TUint8, 0x80);
				}
			else
				{
				*pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);
				}
			}
		else if (pUnicode[0] < 0x800)
			{
			// U+0080..U+07FF - 2 bytes

			if (pUtf8 == pointerToLastUtf8Byte)
				{
				pUtf8--;
				pUnicode--;
				break;
				}

			*pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));
			*pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));

			}

		// check to see if we have a surrogate in the stream, surrogates encode code points outside
		// the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars.

		else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)
			{
			// surrogate pair - 4 bytes in utf-8
			// U+10000..U+10FFFF

			__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));
			// is there enough space to hold the character
			if ((pointerToLastUtf8Byte - pUtf8) < 3)
				{
				pUtf8--;
				pUnicode--;
				break; // no go to the exit condition
				}

			__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));
			if (pUnicode >= pointerToLastUnicodeCharacter)
				{
				// the high surrogate is the last input character: leave it unconverted
				pUtf8--;
				pUnicode--;
				inputIsTruncated = ETrue;
				break; // middle of a surrogate pair. go to end condition
				}

			if ((pUnicode[1] & 0xfc00) != 0xdc00)
				{
				return EErrorIllFormedInput;
				}

			// convert utf-16 surrogate to utf-32
			TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;

			// convert utf-32 to utf-8
			*pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));
			*pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));
			*pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));
			*pUtf8 = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));

			// we consumed 2 utf-16 values, move this pointer
			pUnicode++;
			}
		else
			{
			// 3 byte - utf-8, U+800..U+FFFF rest of BMP.

			if (pointerToLastUtf8Byte - pUtf8 < 2)
				{
				pUtf8--;
				pUnicode--;
				break;
				}
			*pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));
			*pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));
			*pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
			}

		// stop once the last input character has been consumed or the last output byte has been written
		if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte))
			{
			break;
			}

		pUtf8++;
		pUnicode++;

		}

	// a trailing high surrogate with nothing converted before it is reported as ill-formed input
	if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated)
		{
		return EErrorIllFormedInput;
		}

	// pUtf8 points at the last byte written, hence the +1 for the length
	aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1);
	return pointerToLastUnicodeCharacter-pUnicode;
	}
|
633 |
|
634 |
|
635 |
|
636 /** Converts text encoded using the Unicode transformation format UTF-7 |
|
637 into the Unicode UCS-2 character set. |
|
638 |
|
639 @param aUtf7 The UTF-7 encoded input string. |
|
640 @return A pointer to an HBufC16 containing the converted Unicode string */ |
|
641 EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf7L(const TDesC8& aUtf7) |
|
642 { |
|
643 // If aUtf8 is an empty string return |
|
644 if (aUtf7.Length()==0) |
|
645 { |
|
646 HBufC16* hBuf = HBufC16::NewL(1); |
|
647 return hBuf; |
|
648 } |
|
649 |
|
650 // else convert aUtf8 to Unicode storing the result in a buffer, reallocating |
|
651 // it when needed. |
|
652 TInt length = aUtf7.Length(); |
|
653 const TInt bufsize = 100; |
|
654 TInt state = KStateDefault; |
|
655 |
|
656 TPtrC8 utf7 (aUtf7); |
|
657 TBuf<bufsize> buf; |
|
658 HBufC16* hBuf = HBufC16::NewLC(length); |
|
659 TPtr unicode = hBuf->Des(); |
|
660 |
|
661 FOREVER |
|
662 { |
|
663 TInt unconverted = ConvertToUnicodeFromUtf7(buf, utf7, state); |
|
664 if( unconverted == EErrorIllFormedInput || unconverted < 0) |
|
665 User::Leave(KErrCorrupt); |
|
666 |
|
667 if (unicode.Length() + buf.Length() > unicode.MaxLength()) |
|
668 { |
|
669 // Reallocate hBuf |
|
670 hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length()); |
|
671 CleanupStack::Pop(); |
|
672 CleanupStack::PushL(hBuf); |
|
673 unicode.Set(hBuf->Des()); |
|
674 } |
|
675 unicode.Append(buf); |
|
676 if (unconverted ==0) |
|
677 break; |
|
678 utf7.Set(utf7.Right(unconverted)); |
|
679 } |
|
680 CleanupStack::Pop(); |
|
681 return hBuf; |
|
682 } |
|
683 |
|
684 |
|
685 |
|
/** Converts text encoded using the Unicode transformation format UTF-7 into the
Unicode UCS-2 character set.

If the conversion is achieved using a series of calls to this function, where
each call starts off where the previous call reached in the input descriptor,
the state of the conversion is stored. The initial value of the state variable
should be set as KStateDefault when the conversion is started, and afterwards
simply passed unchanged into each function call.

This overload forwards to the four-argument overload with aIsImapUtf7 set to
EFalse.

@param aUnicode On return, contains the Unicode encoded output string.
@param aUtf7 The UTF-7 encoded input string.
@param aState For the first call of the function set to KStateDefault. For
subsequent calls, pass in the variable unchanged.
@return The number of unconverted bytes left at the end of the input descriptor,
or one of the error values defined in TError. */
EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode,
	const TDesC8& aUtf7,
	TInt& aState)
	{
	return ConvertToUnicodeFromUtf7(aUnicode, aUtf7, EFalse, aState);
	}
|
707 |
|
708 TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode, |
|
709 const TDesC8& aUtf7, |
|
710 TBool aIsImapUtf7, |
|
711 TInt& aState) |
|
712 { |
|
713 if (aUtf7.Length()==0) |
|
714 { |
|
715 aUnicode.SetLength(0); |
|
716 return 0; |
|
717 } |
|
718 if (aUnicode.MaxLength()==0) |
|
719 { |
|
720 return aUtf7.Length(); |
|
721 } |
|
722 const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7); |
|
723 TUint16* pointerToPreviousUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr()-1); |
|
724 const TUint16* pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.MaxLength(); |
|
725 const TUint8* pointerToCurrentUtf7Byte=aUtf7.Ptr(); |
|
726 const TUint8* pointerToLastUtf7Byte=pointerToCurrentUtf7Byte+(aUtf7.Length()-1); |
|
727 TUint currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
728 const TUint KIsInBase64Block=0x80000000u; |
|
729 TUint bitBuffer=STATIC_CAST(TUint, aState); |
|
730 TInt numberOfBitsInBuffer=((bitBuffer&0xf0)>>4); |
|
731 bitBuffer&=~0xf0; // turn off the bits that stored numberOfBitsInBuffer |
|
732 if (bitBuffer&KIsInBase64Block) |
|
733 { |
|
734 __ASSERT_ALWAYS((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4) || ((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)), Panic(EPanicBadBitBufferState7)); |
|
735 __ASSERT_ALWAYS((bitBuffer&~(KIsInBase64Block|0x0000000f))==0, Panic(EPanicBadBitBufferState8)); |
|
736 } |
|
737 else |
|
738 { |
|
739 __ASSERT_ALWAYS(bitBuffer==0, Panic(EPanicBadBitBufferState9)); |
|
740 __ASSERT_ALWAYS(numberOfBitsInBuffer==0, Panic(EPanicBadBitBufferState10)); |
|
741 } |
|
742 aState=KStateDefault; |
|
743 if (bitBuffer&KIsInBase64Block) |
|
744 { |
|
745 currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7); |
|
746 } |
|
747 TBool inputIsTruncated=EFalse; |
|
748 FOREVER |
|
749 { |
|
750 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers5)); |
|
751 __ASSERT_DEBUG(pointerToCurrentUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers11)); |
|
752 __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (currentUtf7Byte==*pointerToCurrentUtf7Byte), Panic(EPanicOutOfSyncUtf7Byte1)); |
|
753 __ASSERT_DEBUG((~bitBuffer&KIsInBase64Block) || (currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7)), Panic(EPanicOutOfSyncUtf7Byte2)); |
|
754 __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || ((bitBuffer==0) && (numberOfBitsInBuffer==0)), Panic(EPanicBadBitBufferState11)); |
|
755 if ((~bitBuffer&KIsInBase64Block) && (currentUtf7Byte==escapeCharacterForStartingBase64Block)) |
|
756 { |
|
757 if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte) |
|
758 { |
|
759 --pointerToCurrentUtf7Byte; |
|
760 inputIsTruncated=ETrue; |
|
761 goto end; |
|
762 } |
|
763 ++pointerToCurrentUtf7Byte; |
|
764 currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
765 if (currentUtf7Byte=='-') |
|
766 { |
|
767 currentUtf7Byte=escapeCharacterForStartingBase64Block; |
|
768 } |
|
769 else |
|
770 { |
|
771 currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7); |
|
772 if (currentUtf7Byte==KNotInBase64Alphabet) |
|
773 { |
|
774 return EErrorIllFormedInput; |
|
775 } |
|
776 bitBuffer=KIsInBase64Block; |
|
777 } |
|
778 } |
|
779 if (bitBuffer&KIsInBase64Block) |
|
780 { |
|
781 FOREVER |
|
782 { |
|
783 __ASSERT_DEBUG(currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7), Panic(EPanicOutOfSyncBase64Decoding)); |
|
784 __ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState12)); |
|
785 if (currentUtf7Byte==KNotInBase64Alphabet) |
|
786 { |
|
787 if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)) |
|
788 { |
|
789 return EErrorIllFormedInput; |
|
790 } |
|
791 bitBuffer=0; |
|
792 numberOfBitsInBuffer=0; |
|
793 currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
794 if (currentUtf7Byte=='-') |
|
795 { |
|
796 if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte) |
|
797 { |
|
798 goto end; |
|
799 } |
|
800 ++pointerToCurrentUtf7Byte; |
|
801 currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
802 } |
|
803 break; |
|
804 } |
|
805 bitBuffer<<=6; |
|
806 bitBuffer|=currentUtf7Byte; |
|
807 bitBuffer|=KIsInBase64Block; |
|
808 numberOfBitsInBuffer+=6; |
|
809 // only flush the buffer if it contains a whole Unicode character and the remainder is either all zero-bits (hence would be a legal point to end the base-64 sequence) or at least 6 bits long (therefore would leave at least one UTF-7 byte unconverted at the end of the input descriptor) |
|
810 if ((numberOfBitsInBuffer>=16+6) || ((numberOfBitsInBuffer>=16) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16))) |
|
811 { |
|
812 numberOfBitsInBuffer-=16; |
|
813 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers6)); |
|
814 ++pointerToPreviousUnicodeCharacter; |
|
815 *pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, bitBuffer>>numberOfBitsInBuffer); |
|
816 bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - must be done as bitBuffer is stored along with numberOfBitsInBuffer in aState if the output descriptor runs out of space or if the input descriptor was truncated |
|
817 bitBuffer|=KIsInBase64Block; // turn it back on as the line above turned it off |
|
818 if (pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) |
|
819 { |
|
820 goto end; |
|
821 } |
|
822 } |
|
823 if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte) |
|
824 { |
|
825 inputIsTruncated=ETrue; |
|
826 goto end; |
|
827 } |
|
828 ++pointerToCurrentUtf7Byte; |
|
829 currentUtf7Byte=Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7); |
|
830 } |
|
831 } |
|
832 else |
|
833 { |
|
834 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers7)); |
|
835 ++pointerToPreviousUnicodeCharacter; |
|
836 *pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, currentUtf7Byte); |
|
837 if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)) |
|
838 { |
|
839 goto end; |
|
840 } |
|
841 ++pointerToCurrentUtf7Byte; |
|
842 currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
843 } |
|
844 } |
|
845 end: |
|
846 if (bitBuffer&KIsInBase64Block) |
|
847 { |
|
848 __ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState13)); |
|
849 if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)) |
|
850 { |
|
851 // rewind how far we've got in the UTF-7 descriptor to indicate to the user (by returning a value greater than zero) that not all of the input could be converted as it ended with a truncated base-64 sequence |
|
852 __ASSERT_DEBUG(numberOfBitsInBuffer>=6, Panic(EPanicBadBitBufferState14)); |
|
853 pointerToCurrentUtf7Byte-=numberOfBitsInBuffer/6; |
|
854 const TInt newNumberOfBitsInBuffer=numberOfBitsInBuffer%6; |
|
855 bitBuffer&=~KIsInBase64Block; // temporarily turn off the KIsInBase64Block for the right-shift |
|
856 bitBuffer>>=(numberOfBitsInBuffer-newNumberOfBitsInBuffer); |
|
857 bitBuffer|=KIsInBase64Block; // must be turned back on again as the bit-buffer is packed into aState |
|
858 numberOfBitsInBuffer=newNumberOfBitsInBuffer; |
|
859 __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState15)); |
|
860 } |
|
861 __ASSERT_DEBUG((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0), Panic(EPanicBadBitBufferState16)); |
|
862 aState=STATIC_CAST(TInt, bitBuffer); |
|
863 aState|=(numberOfBitsInBuffer<<4); |
|
864 __ASSERT_DEBUG(aState&KIsInBase64Block, Panic(EPanicBadBitBufferState17)); |
|
865 bitBuffer=0; |
|
866 numberOfBitsInBuffer=0; |
|
867 } |
|
868 if ((pointerToCurrentUtf7Byte<aUtf7.Ptr()) && inputIsTruncated) |
|
869 { |
|
870 return EErrorIllFormedInput; |
|
871 } |
|
872 aUnicode.SetLength((pointerToPreviousUnicodeCharacter+1)-aUnicode.Ptr()); |
|
873 return pointerToLastUtf7Byte-pointerToCurrentUtf7Byte; |
|
874 } |
|
875 |
|
876 |
|
877 |
|
878 /** Converts text encoded using the Unicode transformation format UTF-8 |
|
879 into the Unicode UCS-2 character set. This function leaves with an |
|
880 error code of the input string is corrupted. |
|
881 |
|
882 @param aUtf8 The UTF-8 encoded input string |
|
883 @return A pointer to an HBufC16 with the converted Unicode string. */ |
|
884 EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf8L(const TDesC8& aUtf8) |
|
885 { |
|
886 // If aUtf8 is an empty string return |
|
887 if (aUtf8.Length()==0) |
|
888 { |
|
889 HBufC16* hBuf = HBufC16::NewL(1); |
|
890 return hBuf; |
|
891 } |
|
892 |
|
893 // else convert aUtf8 to Unicode storing the result in a buffer, reallocating |
|
894 // it when needed. |
|
895 TInt length = aUtf8.Length(); |
|
896 const TInt bufsize = 100; |
|
897 |
|
898 TPtrC8 utf8 (aUtf8); |
|
899 TBuf<bufsize> buf; |
|
900 HBufC16* hBuf = HBufC16::NewLC(length); |
|
901 TPtr unicode = hBuf->Des(); |
|
902 |
|
903 FOREVER |
|
904 { |
|
905 TInt unconverted = ConvertToUnicodeFromUtf8(buf, utf8); |
|
906 if( unconverted == EErrorIllFormedInput || unconverted < 0) |
|
907 User::Leave(KErrCorrupt); |
|
908 |
|
909 if (unicode.Length() + buf.Length() > unicode.MaxLength()) |
|
910 { |
|
911 // Reallocate hBuf |
|
912 hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length()); |
|
913 CleanupStack::Pop(); |
|
914 CleanupStack::PushL(hBuf); |
|
915 unicode.Set(hBuf->Des()); |
|
916 } |
|
917 unicode.Append(buf); |
|
918 if (unconverted ==0) |
|
919 break; |
|
920 utf8.Set(utf8.Right(unconverted)); |
|
921 } |
|
922 CleanupStack::Pop(); |
|
923 return hBuf; |
|
924 } |
|
925 |
|
926 /** Converts text encoded using the Unicode transformation format UTF-8 into the |
|
927 Unicode UCS-2 character set. |
|
928 |
|
929 @param aUnicode On return, contains the Unicode encoded output string. |
|
930 @param aUtf8 The UTF-8 encoded input string |
|
931 @return The number of unconverted bytes left at the end of the input descriptor, |
|
932 or one of the error values defined in TError. */ |
|
933 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8) |
|
934 { |
|
935 return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse); |
|
936 } |
|
937 |
|
938 static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters, |
|
939 TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex) |
|
940 { |
|
941 if (aNumberOfUnconvertibleCharacters<=0) |
|
942 { |
|
943 aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex; |
|
944 } |
|
945 ++aNumberOfUnconvertibleCharacters; |
|
946 } |
|
947 |
|
948 /** Converts text encoded using the Unicode transformation format UTF-8 into the |
|
949 Unicode UCS-2 character set. |
|
950 |
|
951 @param aUnicode On return, contains the Unicode encoded output string. |
|
952 @param aUtf8 The UTF-8 encoded input string |
|
953 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java |
|
954 @return The number of unconverted bytes left at the end of the input descriptor, |
|
955 or one of the error values defined in TError. */ |
|
956 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8) |
|
957 { |
|
958 TInt dummyUnconverted, dummyUnconvertedIndex; |
|
959 return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex); |
|
960 } |
|
961 |
|
962 /** Converts text encoded using the Unicode transformation format UTF-8 into the |
|
963 Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input. |
|
964 |
|
965 The variant of UTF-8 used internally by Java differs slightly from standard |
|
966 UTF-8. The TBool argument controls the UTF-8 variant generated by this function. |
|
967 |
|
968 @param aUnicode On return, contains the Unicode encoded output string. |
|
969 @param aUtf8 The UTF-8 encoded input string |
|
970 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java |
|
971 UTF-8. The default is EFalse. |
|
972 @param aNumberOfUnconvertibleCharacters On return, contains the number of bytes |
|
973 which were not converted. |
|
974 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index |
|
975 of the first byte of the first unconvertible character. For instance if the |
|
976 first character in the input descriptor (aForeign) could not be converted, |
|
977 then this parameter is set to the first byte of that character, i.e. zero. |
|
978 A negative value is returned if all the characters were converted. |
|
979 @return The number of unconverted bytes left at the end of the input descriptor, |
|
980 or one of the error values defined in TError. */ |
|
981 |
|
982 /* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7 |
|
983 * Well formed UTF-8 Byte Sequences, full table. |
|
984 * +----------------------------------------------------------------+ |
|
985 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | |
|
986 * +--------------------+----------+----------+----------+----------+ |
|
987 * | U+0000..U+007F | 00..7D | | | | 1 byte, ascii |
|
988 * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2 |
|
989 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0 |
|
990 * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal |
|
991 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F |
|
992 * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal |
|
993 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90 |
|
994 * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal |
|
995 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F |
|
996 * +--------------------+----------+----------+----------+----------+ |
|
997 * |
|
998 * As a consequence of the well-formedness conditions specified in table 3-7, |
|
999 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF. |
|
1000 */ |
|
1001 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8, |
|
1002 TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) |
|
1003 { |
|
1004 aUnicode.SetLength(0); |
|
1005 |
|
1006 if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0)) |
|
1007 { |
|
1008 return aUtf8.Length(); |
|
1009 } |
|
1010 |
|
1011 TUint16* pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr()); |
|
1012 const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1); |
|
1013 const TUint8* pUtf8 = aUtf8.Ptr(); |
|
1014 const TUint8* pLastUtf8 = pUtf8 + (aUtf8.Length() - 1); |
|
1015 const TUint16 replacementcharacter = 0xFFFD; |
|
1016 TUint currentUnicodeCharacter; |
|
1017 TInt sequenceLength; |
|
1018 |
|
1019 |
|
1020 FOREVER |
|
1021 { |
|
1022 TBool illFormed=EFalse; |
|
1023 |
|
1024 __ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8)); |
|
1025 __ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3)); |
|
1026 |
|
1027 sequenceLength = 1; |
|
1028 |
|
1029 // ascii - optimisation (i.e. it isn't a sequence) |
|
1030 if (pUtf8[0] < 0x80) |
|
1031 { |
|
1032 currentUnicodeCharacter = pUtf8[0]; |
|
1033 } |
|
1034 else |
|
1035 { |
|
1036 // see if well formed utf-8, use table above for reference |
|
1037 if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf)) |
|
1038 { |
|
1039 // 0xc1-0xc2 are not valid bytes |
|
1040 sequenceLength = 2; |
|
1041 } |
|
1042 else if ((pUtf8[0] & 0xf0) == 0xe0) |
|
1043 { |
|
1044 sequenceLength = 3; |
|
1045 } |
|
1046 else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5)) |
|
1047 { |
|
1048 // 0xf5-0xff, are not valid bytes |
|
1049 sequenceLength = 4; |
|
1050 } |
|
1051 else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8) |
|
1052 { |
|
1053 if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80)) |
|
1054 { |
|
1055 // either we've split the 0xc0 0x80 (i.e. 0xc0 is |
|
1056 // the last character in the string) or we've |
|
1057 // discovered a valid 0xc0 0x80 sequence. |
|
1058 sequenceLength = 2; |
|
1059 } |
|
1060 } |
|
1061 |
|
1062 /* checking to see if we got a valid sequence */ |
|
1063 if (sequenceLength == 1) |
|
1064 { |
|
1065 // bad value in the leading byte, 0xc0-0xc1,0x5f-0xff for example |
|
1066 currentUnicodeCharacter = replacementcharacter; |
|
1067 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1068 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
1069 } |
|
1070 else |
|
1071 { |
|
1072 // this is a check to see if the sequence goes beyond the input |
|
1073 // stream. if its not the first and only character in the input |
|
1074 // stream this isn't an error, otherwise it is. |
|
1075 if ((pUtf8 + sequenceLength - 1) > pLastUtf8) |
|
1076 { |
|
1077 // check to see if this sequence was the first character |
|
1078 if ((pUnicode - aUnicode.Ptr()) == 0) |
|
1079 { |
|
1080 return EErrorIllFormedInput; |
|
1081 } |
|
1082 break; |
|
1083 } |
|
1084 |
|
1085 currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength); |
|
1086 |
|
1087 /* check the trailing bytes, they should begin with 10 */ |
|
1088 TUint i = 1; |
|
1089 |
|
1090 do |
|
1091 { |
|
1092 if ((pUtf8[i] & 0xc0) == 0x80) |
|
1093 { |
|
1094 // add the trailing 6 bits to the current unicode char |
|
1095 currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F); |
|
1096 } |
|
1097 else |
|
1098 { |
|
1099 // ill formed character (doesn't have a lead 10) |
|
1100 currentUnicodeCharacter = replacementcharacter; |
|
1101 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1102 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
1103 illFormed=ETrue; |
|
1104 break; |
|
1105 } |
|
1106 i++; |
|
1107 } |
|
1108 while (i < sequenceLength); |
|
1109 } |
|
1110 |
|
1111 /* conformance check. bits of above table for reference. |
|
1112 * +----------------------------------------------------------------+ |
|
1113 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | |
|
1114 * +--------------------+----------+----------+----------+----------+ |
|
1115 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, 2nd < 0xA0 |
|
1116 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, 2nd > 0x9F |
|
1117 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, 2nd < 0x90 |
|
1118 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, 2nd > 0x8F |
|
1119 * +--------------------+----------+----------+----------+----------+ |
|
1120 */ |
|
1121 |
|
1122 if (currentUnicodeCharacter != replacementcharacter) |
|
1123 { |
|
1124 if (sequenceLength == 3) |
|
1125 { |
|
1126 if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0)) |
|
1127 { |
|
1128 currentUnicodeCharacter = replacementcharacter; |
|
1129 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1130 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
1131 illFormed=ETrue; |
|
1132 } |
|
1133 else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F)) |
|
1134 { |
|
1135 currentUnicodeCharacter = replacementcharacter; |
|
1136 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1137 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
1138 illFormed=ETrue; |
|
1139 } |
|
1140 } |
|
1141 else if (sequenceLength == 4) |
|
1142 { |
|
1143 if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90)) |
|
1144 { |
|
1145 currentUnicodeCharacter = replacementcharacter; |
|
1146 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1147 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
1148 illFormed=ETrue; |
|
1149 } |
|
1150 else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F)) |
|
1151 { |
|
1152 currentUnicodeCharacter = replacementcharacter; |
|
1153 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1154 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
1155 illFormed=ETrue; |
|
1156 } |
|
1157 } |
|
1158 |
|
1159 |
|
1160 /* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points |
|
1161 * are not Unicode scalar values, any UTF-8 byte sequence that would map to code |
|
1162 * points D800..DFFF is ill formed */ |
|
1163 |
|
1164 if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF)) |
|
1165 { |
|
1166 currentUnicodeCharacter = replacementcharacter; |
|
1167 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1168 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
1169 illFormed=ETrue; |
|
1170 } |
|
1171 } |
|
1172 // end conformance check |
|
1173 } |
|
1174 |
|
1175 // would this character generate a surrogate pair in UTF-16? |
|
1176 if (currentUnicodeCharacter > 0xFFFF) |
|
1177 { |
|
1178 // is there enough space to hold a surrogate pair in the output? |
|
1179 if (pUnicode >= pLastUnicode) |
|
1180 { |
|
1181 break; // no, end processing. |
|
1182 } |
|
1183 |
|
1184 TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0; |
|
1185 *pUnicode++ = STATIC_CAST(TUint16, surrogate); |
|
1186 |
|
1187 surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00; |
|
1188 *pUnicode++ = STATIC_CAST(TUint16, surrogate); |
|
1189 } |
|
1190 else |
|
1191 { |
|
1192 *pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter); |
|
1193 } |
|
1194 |
|
1195 // move the input pointer |
|
1196 if (currentUnicodeCharacter != replacementcharacter) |
|
1197 { |
|
1198 pUtf8 += sequenceLength; |
|
1199 } |
|
1200 else if(illFormed == EFalse) |
|
1201 { |
|
1202 pUtf8 += (sequenceLength); |
|
1203 } |
|
1204 else |
|
1205 { |
|
1206 // we had a character we didn't recognize (i.e. it was invalid) |
|
1207 // so move to the next character in the input |
|
1208 pUtf8++; |
|
1209 } |
|
1210 |
|
1211 if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode)) |
|
1212 { |
|
1213 break; // we've either reached the end of the input or the end of output |
|
1214 } |
|
1215 } |
|
1216 |
|
1217 aUnicode.SetLength(pUnicode - aUnicode.Ptr()); |
|
1218 return (pLastUtf8 - pUtf8 + 1); |
|
1219 } |
|
1220 |
|
1221 /** Given a sample text this function attempts to determine whether or not |
|
1222 * the same text is encoded using the UTF-8 standard encoding scheme. |
|
1223 |
|
1224 @param TInt a confidence level, given at certain value. if the given sample |
|
1225 is UTF-8 this value will not be changed (unless > 100) then its |
|
1226 set to 100. Otherwise if the same isn't UTF-8, its set to 0. |
|
1227 @param TDesC8 sample text. |
|
1228 UTF-8. The default is EFalse. |
|
1229 @return void |
|
1230 */ |
|
1231 |
|
1232 /* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7 |
|
1233 * Well formed UTF-8 Byte Sequences, full table. |
|
1234 * +----------------------------------------------------------------+ |
|
1235 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | |
|
1236 * +--------------------+----------+----------+----------+----------+ |
|
1237 * | U+0000..U+007F | 00..7D | | | | 1 byte, ascii |
|
1238 * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2 |
|
1239 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0 |
|
1240 * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal |
|
1241 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F |
|
1242 * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal |
|
1243 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90 |
|
1244 * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal |
|
1245 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F |
|
1246 * +--------------------+----------+----------+----------+----------+ |
|
1247 * |
|
1248 * As a consequence of the well-formedness conditions specified in table 3-7, |
|
1249 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF. |
|
1250 * |
|
1251 * Code Rules: |
|
1252 * R1: If the string contains any non-UTF-8 characters the returned confidence |
|
1253 * is 0. Valid UTF-8 combinations are listed in the above table. |
|
1254 * R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark) in |
|
1255 * the (see ) the returned confidence is 95. |
|
1256 * R3: Otherwise the confidence returned is based upon the sample string |
|
1257 * length. |
|
1258 * R4: If the sample string is under 75 characters, the confidence is set to |
|
1259 * 75. |
|
1260 */ |
|
1261 GLREF_C void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample) |
|
1262 { |
|
1263 |
|
1264 TInt sampleLength = aSample.Length(); |
|
1265 |
|
1266 if (sampleLength == 0) |
|
1267 { |
|
1268 aConfidenceLevel = 89; |
|
1269 return; |
|
1270 } |
|
1271 TInt bytesRemaining = 0; |
|
1272 TInt sequenceLength = 0; |
|
1273 |
|
1274 aConfidenceLevel = sampleLength; |
|
1275 |
|
1276 const TUint8* buffer = &aSample[0]; |
|
1277 |
|
1278 if (sampleLength < 95) |
|
1279 { |
|
1280 // check for the BOM |
|
1281 if ((sampleLength >= 3) && |
|
1282 ((buffer[0] == 0xEF) && |
|
1283 (buffer[1] == 0xBB) && |
|
1284 (buffer[2] == 0xBF)) |
|
1285 ) |
|
1286 { |
|
1287 aConfidenceLevel = 95; |
|
1288 } |
|
1289 else if (sampleLength < 75) |
|
1290 { |
|
1291 aConfidenceLevel = 75; |
|
1292 } |
|
1293 } |
|
1294 |
|
1295 for (TInt index = 0;index != sampleLength;index++) |
|
1296 { |
|
1297 |
|
1298 if (bytesRemaining > 0) |
|
1299 { |
|
1300 // bytesRemaining > 0, means that a byte representing the start of a |
|
1301 // multibyte sequence was encountered and the bytesRemaining is the |
|
1302 // number of bytes to follow. |
|
1303 |
|
1304 if ((buffer[index] & 0xc0) == 0x80) |
|
1305 { |
|
1306 // need to check for ill-formed sequences -- all are in the 2nd byte |
|
1307 |
|
1308 if ((sequenceLength == 3) && (bytesRemaining == 2)) |
|
1309 { |
|
1310 if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0)) |
|
1311 { |
|
1312 aConfidenceLevel = 0; |
|
1313 break; |
|
1314 } |
|
1315 else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f)) |
|
1316 { |
|
1317 aConfidenceLevel = 0; |
|
1318 break; |
|
1319 } |
|
1320 } |
|
1321 else if ((sequenceLength == 4) && (bytesRemaining == 3)) |
|
1322 { |
|
1323 if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90)) |
|
1324 { |
|
1325 aConfidenceLevel = 0; |
|
1326 break; |
|
1327 } |
|
1328 else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f)) |
|
1329 { |
|
1330 aConfidenceLevel = 0; |
|
1331 break; |
|
1332 } |
|
1333 } |
|
1334 |
|
1335 --bytesRemaining; |
|
1336 continue; |
|
1337 } |
|
1338 else |
|
1339 { |
|
1340 aConfidenceLevel = 0; |
|
1341 break; |
|
1342 } |
|
1343 } |
|
1344 |
|
1345 if (bytesRemaining == 0) |
|
1346 { |
|
1347 if (buffer[index] < 0x80) |
|
1348 { |
|
1349 // The value of aSample[index] is in the range 0x00-0x7f |
|
1350 //UTF8 maintains ASCII transparency. So it's a valid |
|
1351 //UTF8. Do nothing, check next value. |
|
1352 continue; |
|
1353 } |
|
1354 else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0)) |
|
1355 { |
|
1356 // valid start of a 2 byte sequence (see conformance note) |
|
1357 sequenceLength = 2; |
|
1358 bytesRemaining = 1; |
|
1359 } |
|
1360 else if ((buffer[index] & 0xf0) == 0xe0) |
|
1361 { |
|
1362 // valid start of a 3 byte sequence |
|
1363 sequenceLength = 3; |
|
1364 bytesRemaining = 2; |
|
1365 } |
|
1366 else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5)) |
|
1367 { |
|
1368 // valid start of a 4 byte sequence (see conformance note) |
|
1369 sequenceLength = 4; |
|
1370 bytesRemaining = 3; |
|
1371 } |
|
1372 else |
|
1373 { |
|
1374 // wasn't anything expected so must be an illegal/irregular UTF8 coded value |
|
1375 aConfidenceLevel = 0; |
|
1376 break; |
|
1377 } |
|
1378 } |
|
1379 } // for |
|
1380 |
|
1381 aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0; |
|
1382 } |
|
1383 |
|
1384 GLREF_C void IsCharacterSetUTF7(TInt& aConfidenceLevel, const TDesC8& aSample) |
|
1385 { |
|
1386 TInt sampleLength = aSample.Length(); |
|
1387 aConfidenceLevel = 70; |
|
1388 for (TInt i=0; i<sampleLength; ++i) |
|
1389 { |
|
1390 // UTF-7 value ranges only 7 bits |
|
1391 if((aSample[i]&0x80)!=0x00) |
|
1392 { |
|
1393 aConfidenceLevel= 0; |
|
1394 break; |
|
1395 } |
|
1396 |
|
1397 // there is no "~" in UTF-7 encoding. So if find either, it's not UTF-7 |
|
1398 else if (char(aSample[i])=='~') |
|
1399 { |
|
1400 aConfidenceLevel = 0; |
|
1401 break; |
|
1402 } |
|
1403 |
|
1404 // The SMS7Bit escape char value is 0x1b. Reduce confidence if it follows the following format |
|
1405 else if ( (aSample[i]==0x1b) && (i <sampleLength-1) ) |
|
1406 { |
|
1407 static const TInt smsExtensionTable[11] = |
|
1408 {0x0a, 0x14, 0x1b, 0x28, 0x29, 0x2f, 0x3c, 0x3d, 0x3e, 0x40, 0x65}; |
|
1409 TInt increment1 = i+1; |
|
1410 if (increment1>= sampleLength) |
|
1411 break; |
|
1412 for (TInt j=0; j < 11; ++j) |
|
1413 { |
|
1414 if (aSample[increment1] == smsExtensionTable[j]) |
|
1415 { |
|
1416 aConfidenceLevel-=10; |
|
1417 } |
|
1418 } |
|
1419 } |
|
1420 // The UTF-7 escape char is 0x2b. The values that follow the escape sequence |
|
1421 // the values following the escape char value must belong to the modified base64 |
|
1422 // or '-' else it is an ill-formed sequence, so probably not UTF-7 |
|
1423 else if ( (aSample[i]==0x2b) && (i <sampleLength-1) ) |
|
1424 { |
|
1425 TInt increment1 = i+1; |
|
1426 if ((aSample[increment1] == 0x2b) || (aSample[increment1] == 0x2d) || (aSample[increment1] == 0x2f) || |
|
1427 ((aSample[increment1] >= 0x41) && (aSample[increment1] <= 0x5a)) || |
|
1428 ((aSample[increment1] >= 0x61) && (aSample[increment1] <= 0x7a))) |
|
1429 { |
|
1430 aConfidenceLevel+=5; |
|
1431 } |
|
1432 else |
|
1433 { |
|
1434 aConfidenceLevel-=15; |
|
1435 } |
|
1436 i++; // should this be here or up in the if loop ?? |
|
1437 } |
|
1438 } //for |
|
1439 aConfidenceLevel =(aConfidenceLevel >0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0; |
|
1440 } |