|
1 // Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
2 // All rights reserved. |
|
3 // This component and the accompanying materials are made available |
|
4 // under the terms of the License "Symbian Foundation License v1.0" |
|
5 // which accompanies this distribution, and is available |
|
6 // at the URL "http://www.symbianfoundation.org/legal/sfl-v10.html". |
|
7 // |
|
8 // Initial Contributors: |
|
9 // Nokia Corporation - initial contribution. |
|
10 // |
|
11 // Contributors: |
|
12 // |
|
13 // Description: |
|
14 // |
|
15 |
|
16 #include <e32std.h> |
|
17 #include <e32base.h> |
|
18 #include <utf.h> |
|
19 |
|
20 const TUint KNotInBase64Alphabet=KMaxTUint; |
|
21 |
|
22 enum TPanic |
|
23 { |
|
24 EPanicBad6BitNumber=1, |
|
25 EPanicBadUtf7Pointers1, |
|
26 EPanicBadUtf7Pointers2, |
|
27 EPanicBadUtf7Pointers3, |
|
28 EPanicBadUtf7Pointers4, |
|
29 EPanicBadUtf7Pointers5, |
|
30 EPanicBadUtf7Pointers6, |
|
31 EPanicBadUtf7Pointers7, |
|
32 EPanicBadUtf7Pointers8, |
|
33 EPanicBadUtf7Pointers9, |
|
34 EPanicBadUtf7Pointers10, |
|
35 EPanicBadUtf7Pointers11, |
|
36 EPanicNotInBase64Block, |
|
37 EPanicBadUnicodePointers1, |
|
38 EPanicBadUnicodePointers2, |
|
39 EPanicBadUnicodePointers3, |
|
40 EPanicBadUnicodePointers4, |
|
41 EPanicBadUnicodePointers5, |
|
42 EPanicBadUnicodePointers6, |
|
43 EPanicBadUnicodePointers7, |
|
44 EPanicBadUnicodePointers8, |
|
45 EPanicBadUnicodePointers9, |
|
46 EPanicBadUnicodePointers10, |
|
47 EPanicBadBitBufferState1, |
|
48 EPanicBadBitBufferState2, |
|
49 EPanicBadBitBufferState3, |
|
50 EPanicBadBitBufferState4, |
|
51 EPanicBadBitBufferState5, |
|
52 EPanicBadBitBufferState6, |
|
53 EPanicBadBitBufferState7, |
|
54 EPanicBadBitBufferState8, |
|
55 EPanicBadBitBufferState9, |
|
56 EPanicBadBitBufferState10, |
|
57 EPanicBadBitBufferState11, |
|
58 EPanicBadBitBufferState12, |
|
59 EPanicBadBitBufferState13, |
|
60 EPanicBadBitBufferState14, |
|
61 EPanicBadBitBufferState15, |
|
62 EPanicBadBitBufferState16, |
|
63 EPanicBadBitBufferState17, |
|
64 EPanicUnexpectedNumberOfLoopIterations, |
|
65 EPanicInitialEscapeCharacterButNoBase64, |
|
66 EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary, |
|
67 EPanicBadUtf8Pointers1, |
|
68 EPanicBadUtf8Pointers2, |
|
69 EPanicBadUtf8Pointers3, |
|
70 EPanicBadUtf8Pointers4, |
|
71 EPanicBadUtf8Pointers5, |
|
72 EPanicBadUtf8Pointers6, |
|
73 EPanicBadUtf8Pointers7, |
|
74 EPanicOutOfSyncUtf7Byte1, |
|
75 EPanicOutOfSyncUtf7Byte2, |
|
76 EPanicOutOfSyncBase64Decoding |
|
77 }; |
|
78 |
|
79 _LIT(KLitPanicText, "CHARCONV-UTF"); |
|
80 |
|
81 LOCAL_C void Panic(TPanic aPanic) |
|
82 { |
|
83 User::Panic(KLitPanicText, aPanic); |
|
84 } |
|
85 |
|
86 inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';} |
|
87 |
|
88 LOCAL_C TUint Base64Decoding(TUint aMemberOfBase64Alphabet, TBool aIsImapUtf7) |
|
89 { |
|
90 if ((aMemberOfBase64Alphabet>='A') && (aMemberOfBase64Alphabet<='Z')) |
|
91 { |
|
92 return aMemberOfBase64Alphabet-'A'; |
|
93 } |
|
94 if ((aMemberOfBase64Alphabet>='a') && (aMemberOfBase64Alphabet<='z')) |
|
95 { |
|
96 return aMemberOfBase64Alphabet-('a'-26); |
|
97 } |
|
98 if ((aMemberOfBase64Alphabet>='0') && (aMemberOfBase64Alphabet<='9')) |
|
99 { |
|
100 return aMemberOfBase64Alphabet+((26*2)-'0'); |
|
101 } |
|
102 if (aMemberOfBase64Alphabet=='+') |
|
103 { |
|
104 return 62; |
|
105 } |
|
106 if (aMemberOfBase64Alphabet==STATIC_CAST(TUint, aIsImapUtf7? ',': '/')) |
|
107 { |
|
108 return 63; |
|
109 } |
|
110 return KNotInBase64Alphabet; |
|
111 } |
|
112 |
|
113 LOCAL_C TUint Base64Encoding(TUint a6BitNumber, TBool aIsImapUtf7) |
|
114 { |
|
115 __ASSERT_DEBUG(a6BitNumber<64, Panic(EPanicBad6BitNumber)); |
|
116 if ((a6BitNumber==63) && aIsImapUtf7) |
|
117 { |
|
118 return ','; |
|
119 } |
|
120 static const TUint8 base64Alphabet[64]={'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'}; |
|
121 return base64Alphabet[a6BitNumber]; |
|
122 } |
|
123 |
|
124 LOCAL_C TUint8* PointerToEscapeCharacterStartingBase64Block(TUint8* aPointerToUtf7Byte, const TUint8* aPointerToFirstUtf7Byte, TBool aIsImapUtf7) |
|
125 { |
|
126 __ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers1)); |
|
127 TUint8* pointerToCandidateEscapeCharacter=NULL; |
|
128 FOREVER |
|
129 { |
|
130 const TUint utf7Byte=*aPointerToUtf7Byte; |
|
131 if (utf7Byte==EscapeCharacterForStartingBase64Block(aIsImapUtf7)) |
|
132 { |
|
133 pointerToCandidateEscapeCharacter=aPointerToUtf7Byte; |
|
134 } |
|
135 else if (Base64Decoding(utf7Byte, aIsImapUtf7)==KNotInBase64Alphabet) |
|
136 { |
|
137 break; |
|
138 } |
|
139 __ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers2)); |
|
140 if (aPointerToUtf7Byte<=aPointerToFirstUtf7Byte) |
|
141 { |
|
142 break; |
|
143 } |
|
144 --aPointerToUtf7Byte; |
|
145 } |
|
146 __ASSERT_DEBUG(pointerToCandidateEscapeCharacter!=NULL, Panic(EPanicNotInBase64Block)); |
|
147 return pointerToCandidateEscapeCharacter; |
|
148 } |
|
149 |
|
150 LOCAL_C TBool EncodeInUtf7Directly(TUint aUnicodeCharacter, TBool aIsImapUtf7, TBool aEncodeOptionalDirectCharactersInBase64) |
|
151 { |
|
152 if (aIsImapUtf7) |
|
153 { |
|
154 return (aUnicodeCharacter>=0x0020) && (aUnicodeCharacter<=0x007e); |
|
155 } |
|
156 if ((aUnicodeCharacter>=0x0021) && (aUnicodeCharacter<=0x007d)) |
|
157 { |
|
158 if (aEncodeOptionalDirectCharactersInBase64) |
|
159 { |
|
160 return (((aUnicodeCharacter>=0x0041) && (aUnicodeCharacter<=0x005a)) || |
|
161 ((aUnicodeCharacter>=0x0061) && (aUnicodeCharacter<=0x007a)) || |
|
162 ((aUnicodeCharacter>=0x0027) && (aUnicodeCharacter<=0x0029)) || |
|
163 ((aUnicodeCharacter>=0x002b) && (aUnicodeCharacter<=0x003a)) || |
|
164 (aUnicodeCharacter==0x003f)); |
|
165 } |
|
166 return aUnicodeCharacter!=0x005c; |
|
167 } |
|
168 return (aUnicodeCharacter==0x0020) || (aUnicodeCharacter==0x0009) || (aUnicodeCharacter==0x000d) || (aUnicodeCharacter==0x000a); |
|
169 } |
|
170 |
|
171 inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer) |
|
172 { |
|
173 return (aBitBuffer&((1<<aNumberOfBitsInBuffer)-1))!=0; |
|
174 } |
|
175 |
|
176 |
|
177 |
|
178 /** Converts Unicode text into UTF-7 encoding. The fucntion leaves with |
|
179 KErrCorrupt if the input string is corrupt. |
|
180 |
|
181 @param aUnicode A UCS-2 encoded input string. |
|
182 @param aEncodeOptionalDirectCharactersInBase64 If ETrue then |
|
183 characters from UTF-7 set O (optional direct characters) are encoded in |
|
184 Modified Base64. If EFalse the characters are encoded directly, |
|
185 as their ASCII equivalents. |
|
186 @return A descriptor containing the UTF-7 encoded output string. */ |
|
187 EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf7L( |
|
188 const TDesC16& aUnicode, |
|
189 TBool aEncodeOptionalDirectCharactersInBase64) |
|
190 { |
|
191 // If aUnicode is Null string, return an empty HBufC |
|
192 if (aUnicode.Length() == 0) |
|
193 { |
|
194 HBufC8* hBuf8 = HBufC8::NewL(1); |
|
195 return hBuf8; |
|
196 } |
|
197 |
|
198 // Otherwise, convert and store result in a buffer, reallocating that buffer if needed. |
|
199 TInt length = aUnicode.Length(); |
|
200 const TInt bufsize = 100; |
|
201 |
|
202 TPtrC16 unicode (aUnicode); |
|
203 TBuf8<bufsize> buf; |
|
204 HBufC8* hBuf8 = HBufC8::NewLC(length); |
|
205 TPtr8 utf7 = hBuf8->Des(); |
|
206 |
|
207 FOREVER |
|
208 { |
|
209 TInt unconverted = ConvertFromUnicodeToUtf7(buf, unicode, aEncodeOptionalDirectCharactersInBase64); |
|
210 if( unconverted == EErrorIllFormedInput || unconverted < 0) |
|
211 User::Leave(KErrCorrupt); |
|
212 |
|
213 if (utf7.Length() + buf.Length() > utf7.MaxLength()) |
|
214 { |
|
215 // Reallocate the hBuf8 |
|
216 hBuf8 = hBuf8->ReAllocL(utf7.Length() + buf.Length()); |
|
217 CleanupStack::Pop(); |
|
218 CleanupStack::PushL(hBuf8); |
|
219 utf7.Set(hBuf8->Des()); |
|
220 } |
|
221 utf7.Append(buf); |
|
222 if (unconverted ==0) |
|
223 break; |
|
224 unicode.Set(unicode.Right(unconverted)); |
|
225 } |
|
226 CleanupStack::Pop(); |
|
227 return hBuf8; |
|
228 |
|
229 } |
|
230 |
|
231 /** Converts Unicode text into UTF-7 encoding. |
|
232 |
|
233 @param aUtf7 On return, contains the UTF-7 encoded output string. |
|
234 @param aUnicode A UCS-2 encoded input string. |
|
235 @param aEncodeOptionalDirectCharactersInBase64 If ETrue then characters from |
|
236 UTF-7 set O (optional direct characters) are encoded in Modified Base64. If |
|
237 EFalse the characters are encoded directly, as their ASCII equivalents. |
|
238 @return The number of unconverted characters left at the end of the input |
|
239 descriptor, or one of the error values defined in TError. */ |
|
240 EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf7( |
|
241 TDes8& aUtf7, |
|
242 const TDesC16& aUnicode, |
|
243 TBool aEncodeOptionalDirectCharactersInBase64) |
|
244 { |
|
245 return ConvertFromUnicodeToUtf7(aUtf7, aUnicode, EFalse, aEncodeOptionalDirectCharactersInBase64); |
|
246 } |
|
247 |
|
248 TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(TDes8& aUtf7, |
|
249 const TDesC16& aUnicode, |
|
250 TBool aIsImapUtf7, |
|
251 TBool aEncodeOptionalDirectCharactersInBase64) |
|
252 { |
|
253 if (aUnicode.Length()==0) |
|
254 { |
|
255 aUtf7.SetLength(0); |
|
256 return 0; |
|
257 } |
|
258 if (aUtf7.MaxLength()==0) |
|
259 { |
|
260 return aUnicode.Length(); |
|
261 } |
|
262 const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7); |
|
263 TUint8* pointerToPreviousUtf7Byte=CONST_CAST(TUint8*, aUtf7.Ptr()-1); |
|
264 const TUint8* const pointerToLastUtf7Byte=pointerToPreviousUtf7Byte+aUtf7.MaxLength(); |
|
265 const TUint16* pointerToPreviousUnicodeCharacter=aUnicode.Ptr()-1; |
|
266 const TUint16* const pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.Length(); |
|
267 const TUint KIsInBase64Block=0x80000000u; |
|
268 TUint bitBuffer=0; |
|
269 TInt numberOfBitsInBuffer=0; |
|
270 FOREVER |
|
271 { |
|
272 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers3)); |
|
273 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers1)); |
|
274 TUint currentUnicodeCharacter=(pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter)? 0: *(pointerToPreviousUnicodeCharacter+1); |
|
275 if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || EncodeInUtf7Directly(currentUnicodeCharacter, aIsImapUtf7, aEncodeOptionalDirectCharactersInBase64)) |
|
276 { |
|
277 __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState1)); |
|
278 __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState2)); |
|
279 if (bitBuffer&KIsInBase64Block) |
|
280 { |
|
281 if (numberOfBitsInBuffer!=0) |
|
282 { |
|
283 if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<2) // make sure there is enough space for the trailing '-' as well as the remains of the bitBuffer as the KIsInBase64Block flag is about to turned off, thus the trailing '-' may never get written |
|
284 { |
|
285 break; |
|
286 } |
|
287 ++pointerToPreviousUtf7Byte; |
|
288 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7)); |
|
289 } |
|
290 else |
|
291 { |
|
292 if (pointerToPreviousUtf7Byte==pointerToLastUtf7Byte) |
|
293 { |
|
294 break; |
|
295 } |
|
296 } |
|
297 ++pointerToPreviousUtf7Byte; |
|
298 *pointerToPreviousUtf7Byte='-'; |
|
299 bitBuffer=0; |
|
300 numberOfBitsInBuffer=0; |
|
301 } |
|
302 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers2)); |
|
303 if (pointerToPreviousUnicodeCharacter>=pointerToLastUnicodeCharacter) |
|
304 { |
|
305 break; |
|
306 } |
|
307 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers4)); |
|
308 if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<((currentUnicodeCharacter==escapeCharacterForStartingBase64Block)? 2: 1)) |
|
309 { |
|
310 break; |
|
311 } |
|
312 ++pointerToPreviousUtf7Byte; |
|
313 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, currentUnicodeCharacter); |
|
314 ++pointerToPreviousUnicodeCharacter; |
|
315 if (currentUnicodeCharacter==escapeCharacterForStartingBase64Block) |
|
316 { |
|
317 ++pointerToPreviousUtf7Byte; |
|
318 *pointerToPreviousUtf7Byte='-'; |
|
319 } |
|
320 } |
|
321 else |
|
322 { |
|
323 { |
|
324 TInt numberOfUtf7BytesRequired=(numberOfBitsInBuffer+16)/6; // "(numberOfBitsInBuffer+16)/6" is the number of iterations that will happen in the while loop below |
|
325 if (~bitBuffer&KIsInBase64Block) |
|
326 { |
|
327 ++numberOfUtf7BytesRequired; // for the initial escapeCharacterForStartingBase64Block |
|
328 } |
|
329 if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<numberOfUtf7BytesRequired) |
|
330 { |
|
331 break; |
|
332 } |
|
333 } |
|
334 if (~bitBuffer&KIsInBase64Block) |
|
335 { |
|
336 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers5)); |
|
337 ++pointerToPreviousUtf7Byte; |
|
338 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, escapeCharacterForStartingBase64Block); |
|
339 } |
|
340 bitBuffer<<=16; |
|
341 bitBuffer|=currentUnicodeCharacter; |
|
342 numberOfBitsInBuffer+=16; |
|
343 ++pointerToPreviousUnicodeCharacter; |
|
344 __ASSERT_DEBUG(numberOfBitsInBuffer<=20, Panic(EPanicBadBitBufferState3)); |
|
345 while (numberOfBitsInBuffer>=6) |
|
346 { |
|
347 numberOfBitsInBuffer-=6; |
|
348 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers6)); |
|
349 ++pointerToPreviousUtf7Byte; |
|
350 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer>>numberOfBitsInBuffer)&0x3f, aIsImapUtf7)); |
|
351 } |
|
352 bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - not strictly necessary but it leaves the buffer in a cleaner state |
|
353 bitBuffer|=KIsInBase64Block; |
|
354 } |
|
355 } |
|
356 __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState4)); |
|
357 __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState5)); |
|
358 if (bitBuffer&KIsInBase64Block) |
|
359 { |
|
360 #if defined(_DEBUG) |
|
361 TInt numberOfLoopIterations=1; |
|
362 #endif |
|
363 FOREVER // there should never be more than 2 iterations of this loop - the first "if" should always succeed the second time if it doesn't succeed the first time |
|
364 { |
|
365 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers7)); |
|
366 __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState6)); |
|
367 __ASSERT_DEBUG(numberOfLoopIterations<=2, Panic(EPanicUnexpectedNumberOfLoopIterations)); |
|
368 #if defined(_DEBUG) |
|
369 ++numberOfLoopIterations; |
|
370 #endif |
|
371 if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte>=((numberOfBitsInBuffer==0)? 1: 2)) // if there's room to finish off the base-64 sequence by (i) flushing the bit-buffer and (ii) appending the trailing '-' |
|
372 { |
|
373 if (numberOfBitsInBuffer!=0) |
|
374 { |
|
375 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers8)); |
|
376 ++pointerToPreviousUtf7Byte; |
|
377 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7)); |
|
378 } |
|
379 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers9)); |
|
380 ++pointerToPreviousUtf7Byte; |
|
381 *pointerToPreviousUtf7Byte='-'; |
|
382 break; |
|
383 } |
|
384 // it is now necessary to move back pointerToPreviousUtf7Byte so that the base-64 sequence can be terminated - note it must be terminated on a Unicode character boundary hence the reason why pointerToPreviousUnicodeCharacter may be moved back too |
|
385 TUint8* pointerToEscapeCharacterStartingBase64Block=PointerToEscapeCharacterStartingBase64Block(pointerToPreviousUtf7Byte, aUtf7.Ptr(), aIsImapUtf7); |
|
386 const TInt oldNumberOfBase64Characters=pointerToPreviousUtf7Byte-pointerToEscapeCharacterStartingBase64Block; |
|
387 __ASSERT_DEBUG(oldNumberOfBase64Characters>0, Panic(EPanicInitialEscapeCharacterButNoBase64)); |
|
388 __ASSERT_DEBUG(((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)%16==0, Panic(EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary)); |
|
389 pointerToPreviousUnicodeCharacter-=((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)/16; // move back pointerToPreviousUnicodeCharacter to before the equivalent of the base-64 sequence |
|
390 pointerToPreviousUtf7Byte=pointerToEscapeCharacterStartingBase64Block; |
|
391 __ASSERT_DEBUG(*pointerToPreviousUtf7Byte==escapeCharacterForStartingBase64Block, Panic(EPanicBadUtf7Pointers10)); |
|
392 if (oldNumberOfBase64Characters<4) // if the new base-64 sequence will be so short that it won't even be able to contain the UTF-7 encoding of a single Unicode character |
|
393 { |
|
394 --pointerToPreviousUtf7Byte; // move back pointerToPreviousUtf7Byte to before the escapeCharacterForStartingBase64Block |
|
395 break; |
|
396 } |
|
397 const TInt newNumberOfUnicodeCharacters=((oldNumberOfBase64Characters-1)*3)/8; |
|
398 pointerToPreviousUnicodeCharacter+=newNumberOfUnicodeCharacters; |
|
399 pointerToPreviousUtf7Byte+=((newNumberOfUnicodeCharacters*8)+2)/3; |
|
400 const TInt numberOfBitsToBeZeroedInLastBase64Character=(newNumberOfUnicodeCharacters%3)*2; |
|
401 if (numberOfBitsToBeZeroedInLastBase64Character!=0) |
|
402 { |
|
403 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding(Base64Decoding(*pointerToPreviousUtf7Byte, aIsImapUtf7)&0x3f&~((1<<numberOfBitsToBeZeroedInLastBase64Character)-1), aIsImapUtf7)); |
|
404 } |
|
405 bitBuffer=KIsInBase64Block; |
|
406 numberOfBitsInBuffer=0; |
|
407 } |
|
408 } |
|
409 aUtf7.SetLength((pointerToPreviousUtf7Byte-aUtf7.Ptr())+1); |
|
410 return pointerToLastUnicodeCharacter-pointerToPreviousUnicodeCharacter; |
|
411 } |
|
412 |
|
413 |
|
414 |
|
415 /** Converts Unicode text into UTF-8 encoding. |
|
416 |
|
417 @param aUtf8 On return, contains the UTF-8 encoded output string. |
|
418 @param aUnicode The Unicode-encoded input string. |
|
419 @return The number of unconverted characters left at the end of the input |
|
420 descriptor, or one of the error values defined in TError. */ |
|
421 EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode) |
|
422 { |
|
423 return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse); |
|
424 } |
|
425 |
|
426 |
|
427 /** Converts Unicode text into UTF-8 encoding. |
|
428 |
|
429 The variant of UTF-8 used internally by Java differs slightly from |
|
430 standard UTF-8. The TBool argument controls the UTF-8 |
|
431 variant generated by this function. This function leaves with a |
|
432 KErrCorrupt if the input string is corrupt. |
|
433 |
|
434 @param aUnicode A UCS-2 encoded input string. |
|
435 @return A pointer to an HBufC8 containing the converted UTF8. */ |
|
436 EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf8L(const TDesC16& aUnicode) |
|
437 { |
|
438 // If aUnicode is Null string, return an empty HBufC |
|
439 if (aUnicode.Length() == 0) |
|
440 { |
|
441 HBufC8* hBuf8 = HBufC8::NewL(1); |
|
442 return hBuf8; |
|
443 } |
|
444 |
|
445 // Otherwise, convert and store result in a buffer, reallocating that buffer if needed. |
|
446 const TInt length = aUnicode.Length(); |
|
447 const TInt bufsize = 100; |
|
448 |
|
449 TPtrC16 unicode (aUnicode); |
|
450 TBuf8<bufsize> buf; |
|
451 HBufC8* hBuf8 = HBufC8::NewLC(length); |
|
452 TPtr8 utf8 = hBuf8->Des(); |
|
453 |
|
454 FOREVER |
|
455 { |
|
456 TInt unconverted = ConvertFromUnicodeToUtf8(buf, unicode); |
|
457 if( unconverted == EErrorIllFormedInput || unconverted < 0) |
|
458 User::Leave(KErrCorrupt); |
|
459 |
|
460 if (utf8.Length() + buf.Length() > utf8.MaxLength()) |
|
461 { |
|
462 // Reallocate the hBuf8 |
|
463 hBuf8 = hBuf8->ReAllocL(utf8.Length() + buf.Length()); |
|
464 CleanupStack::Pop(); |
|
465 CleanupStack::PushL(hBuf8); |
|
466 utf8.Set(hBuf8->Des()); |
|
467 } |
|
468 utf8.Append(buf); |
|
469 if (unconverted ==0) |
|
470 break; |
|
471 unicode.Set(unicode.Right(unconverted)); |
|
472 } |
|
473 CleanupStack::Pop(); |
|
474 return hBuf8; |
|
475 } |
|
476 |
|
477 /** Converts Unicode text into UTF-8 encoding. |
|
478 |
|
479 Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value. |
|
480 |
|
481 The variant of UTF-8 used internally by Java differs slightly from standard |
|
482 UTF-8. The TBool argument controls the UTF-8 variant generated by this function. |
|
483 |
|
484 @param aUtf8 On return, contains the UTF-8 encoded output string. |
|
485 @param aUnicode A UCS-2 encoded input string. |
|
486 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java |
|
487 UTF-8. The default is EFalse. |
|
488 @return The number of unconverted characters left at the end of the input descriptor, |
|
489 or one of the error values defined in TError. */ |
|
490 TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, |
|
491 const TDesC16& aUnicode, |
|
492 TBool aGenerateJavaConformantUtf8) |
|
493 { |
|
494 if (aUnicode.Length() == 0) |
|
495 { |
|
496 aUtf8.SetLength(0); |
|
497 return 0; |
|
498 } |
|
499 if (aUtf8.MaxLength() == 0) |
|
500 { |
|
501 return aUnicode.Length(); |
|
502 } |
|
503 |
|
504 TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr()); |
|
505 const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1); |
|
506 TBool inputIsTruncated = EFalse; |
|
507 const TUint16* pUnicode = aUnicode.Ptr(); |
|
508 const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1); |
|
509 |
|
510 FOREVER |
|
511 { |
|
512 __ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1)); |
|
513 __ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3)); |
|
514 |
|
515 if (pUnicode[0] < 0x80) |
|
516 { |
|
517 // ascii - 1 byte |
|
518 |
|
519 // internally java is different since the \x0000 character is |
|
520 // translated into \xC0 \x80. |
|
521 |
|
522 if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000)) |
|
523 { |
|
524 if (pUtf8 == pointerToLastUtf8Byte) |
|
525 { |
|
526 pUtf8--; |
|
527 pUnicode--; |
|
528 break; |
|
529 } |
|
530 *pUtf8++ = STATIC_CAST(TUint8, 0xc0); |
|
531 *pUtf8 = STATIC_CAST(TUint8, 0x80); |
|
532 } |
|
533 else |
|
534 { |
|
535 *pUtf8 = STATIC_CAST(TUint8, pUnicode[0]); |
|
536 } |
|
537 } |
|
538 else if (pUnicode[0] < 0x800) |
|
539 { |
|
540 // U+0080..U+07FF - 2 bytes |
|
541 |
|
542 if (pUtf8 == pointerToLastUtf8Byte) |
|
543 { |
|
544 pUtf8--; |
|
545 pUnicode--; |
|
546 break; |
|
547 } |
|
548 |
|
549 *pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6)); |
|
550 *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f)); |
|
551 |
|
552 } |
|
553 |
|
554 // check to see if we have a surrogate in the stream, surrogates encode code points outside |
|
555 // the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars. |
|
556 |
|
557 else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8) |
|
558 { |
|
559 // surrogate pair - 4 bytes in utf-8 |
|
560 // U+10000..U+10FFFF |
|
561 |
|
562 __ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2)); |
|
563 // is there enough space to hold the character |
|
564 if ((pointerToLastUtf8Byte - pUtf8) < 3) |
|
565 { |
|
566 pUtf8--; |
|
567 pUnicode--; |
|
568 break; // no go to the exit condition |
|
569 } |
|
570 |
|
571 __ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4)); |
|
572 if (pUnicode >= pointerToLastUnicodeCharacter) |
|
573 { |
|
574 pUtf8--; |
|
575 pUnicode--; |
|
576 inputIsTruncated = ETrue; |
|
577 break; // middle of a surrogate pair. go to end condition |
|
578 } |
|
579 |
|
580 if ((pUnicode[1] & 0xfc00) != 0xdc00) |
|
581 { |
|
582 return EErrorIllFormedInput; |
|
583 } |
|
584 |
|
585 // convert utf-16 surrogate to utf-32 |
|
586 TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000; |
|
587 |
|
588 // convert utf-32 to utf-8 |
|
589 *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18)); |
|
590 *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f)); |
|
591 *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f)); |
|
592 *pUtf8 = STATIC_CAST(TUint8,0x80 | (ch & 0x3f)); |
|
593 |
|
594 // we consumed 2 utf-16 values, move this pointer |
|
595 pUnicode++; |
|
596 } |
|
597 else |
|
598 { |
|
599 // 3 byte - utf-8, U+800..U+FFFF rest of BMP. |
|
600 |
|
601 if (pointerToLastUtf8Byte - pUtf8 < 2) |
|
602 { |
|
603 pUtf8--; |
|
604 pUnicode--; |
|
605 break; |
|
606 } |
|
607 *pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12)); |
|
608 *pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f)); |
|
609 *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f)); |
|
610 } |
|
611 |
|
612 if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte)) |
|
613 { |
|
614 break; |
|
615 } |
|
616 |
|
617 pUtf8++; |
|
618 pUnicode++; |
|
619 |
|
620 } |
|
621 |
|
622 if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated) |
|
623 { |
|
624 return EErrorIllFormedInput; |
|
625 } |
|
626 |
|
627 aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1); |
|
628 return pointerToLastUnicodeCharacter-pUnicode; |
|
629 } |
|
630 |
|
631 |
|
632 |
|
633 /** Converts text encoded using the Unicode transformation format UTF-7 |
|
634 into the Unicode UCS-2 character set. |
|
635 |
|
636 @param aUtf7 The UTF-7 encoded input string. |
|
637 @return A pointer to an HBufC16 containing the converted Unicode string */ |
|
638 EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf7L(const TDesC8& aUtf7) |
|
639 { |
|
640 // If aUtf8 is an empty string return |
|
641 if (aUtf7.Length()==0) |
|
642 { |
|
643 HBufC16* hBuf = HBufC16::NewL(1); |
|
644 return hBuf; |
|
645 } |
|
646 |
|
647 // else convert aUtf8 to Unicode storing the result in a buffer, reallocating |
|
648 // it when needed. |
|
649 TInt length = aUtf7.Length(); |
|
650 const TInt bufsize = 100; |
|
651 TInt state = KStateDefault; |
|
652 |
|
653 TPtrC8 utf7 (aUtf7); |
|
654 TBuf<bufsize> buf; |
|
655 HBufC16* hBuf = HBufC16::NewLC(length); |
|
656 TPtr unicode = hBuf->Des(); |
|
657 |
|
658 FOREVER |
|
659 { |
|
660 TInt unconverted = ConvertToUnicodeFromUtf7(buf, utf7, state); |
|
661 if( unconverted == EErrorIllFormedInput || unconverted < 0) |
|
662 User::Leave(KErrCorrupt); |
|
663 |
|
664 if (unicode.Length() + buf.Length() > unicode.MaxLength()) |
|
665 { |
|
666 // Reallocate hBuf |
|
667 hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length()); |
|
668 CleanupStack::Pop(); |
|
669 CleanupStack::PushL(hBuf); |
|
670 unicode.Set(hBuf->Des()); |
|
671 } |
|
672 unicode.Append(buf); |
|
673 if (unconverted ==0) |
|
674 break; |
|
675 utf7.Set(utf7.Right(unconverted)); |
|
676 } |
|
677 CleanupStack::Pop(); |
|
678 return hBuf; |
|
679 } |
|
680 |
|
681 |
|
682 |
|
683 /** Converts text encoded using the Unicode transformation format UTF-7 into the |
|
684 Unicode UCS-2 character set. |
|
685 |
|
686 If the conversion is achieved using a series of calls to this function, where |
|
687 each call starts off where the previous call reached in the input descriptor, |
|
688 the state of the conversion is stored. The initial value of the state variable |
|
689 should be set as KStateDefault when the conversion is started, and afterwards |
|
690 simply passed unchanged into each function call. |
|
691 |
|
692 @param aUnicode On return, contains the Unicode encoded output string. |
|
693 @param aUtf7 The UTF-7 encoded input string. |
|
694 @param aState For the first call of the function set to KStateDefault. For |
|
695 subsequent calls, pass in the variable unchanged. |
|
696 @return The number of unconverted bytes left at the end of the input descriptor, |
|
697 or one of the error values defined in TError. */ |
|
698 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode, |
|
699 const TDesC8& aUtf7, |
|
700 TInt& aState) |
|
701 { |
|
702 return ConvertToUnicodeFromUtf7(aUnicode, aUtf7, EFalse, aState); |
|
703 } |
|
704 |
|
705 TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode, |
|
706 const TDesC8& aUtf7, |
|
707 TBool aIsImapUtf7, |
|
708 TInt& aState) |
|
709 { |
|
710 if (aUtf7.Length()==0) |
|
711 { |
|
712 aUnicode.SetLength(0); |
|
713 return 0; |
|
714 } |
|
715 if (aUnicode.MaxLength()==0) |
|
716 { |
|
717 return aUtf7.Length(); |
|
718 } |
|
719 const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7); |
|
720 TUint16* pointerToPreviousUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr()-1); |
|
721 const TUint16* pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.MaxLength(); |
|
722 const TUint8* pointerToCurrentUtf7Byte=aUtf7.Ptr(); |
|
723 const TUint8* pointerToLastUtf7Byte=pointerToCurrentUtf7Byte+(aUtf7.Length()-1); |
|
724 TUint currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
725 const TUint KIsInBase64Block=0x80000000u; |
|
726 TUint bitBuffer=STATIC_CAST(TUint, aState); |
|
727 TInt numberOfBitsInBuffer=((bitBuffer&0xf0)>>4); |
|
728 bitBuffer&=~0xf0; // turn off the bits that stored numberOfBitsInBuffer |
|
729 if (bitBuffer&KIsInBase64Block) |
|
730 { |
|
731 __ASSERT_ALWAYS((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4) || ((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)), Panic(EPanicBadBitBufferState7)); |
|
732 __ASSERT_ALWAYS((bitBuffer&~(KIsInBase64Block|0x0000000f))==0, Panic(EPanicBadBitBufferState8)); |
|
733 } |
|
734 else |
|
735 { |
|
736 __ASSERT_ALWAYS(bitBuffer==0, Panic(EPanicBadBitBufferState9)); |
|
737 __ASSERT_ALWAYS(numberOfBitsInBuffer==0, Panic(EPanicBadBitBufferState10)); |
|
738 } |
|
739 aState=KStateDefault; |
|
740 if (bitBuffer&KIsInBase64Block) |
|
741 { |
|
742 currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7); |
|
743 } |
|
744 TBool inputIsTruncated=EFalse; |
|
745 FOREVER |
|
746 { |
|
747 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers5)); |
|
748 __ASSERT_DEBUG(pointerToCurrentUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers11)); |
|
749 __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (currentUtf7Byte==*pointerToCurrentUtf7Byte), Panic(EPanicOutOfSyncUtf7Byte1)); |
|
750 __ASSERT_DEBUG((~bitBuffer&KIsInBase64Block) || (currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7)), Panic(EPanicOutOfSyncUtf7Byte2)); |
|
751 __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || ((bitBuffer==0) && (numberOfBitsInBuffer==0)), Panic(EPanicBadBitBufferState11)); |
|
752 if ((~bitBuffer&KIsInBase64Block) && (currentUtf7Byte==escapeCharacterForStartingBase64Block)) |
|
753 { |
|
754 if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte) |
|
755 { |
|
756 --pointerToCurrentUtf7Byte; |
|
757 inputIsTruncated=ETrue; |
|
758 goto end; |
|
759 } |
|
760 ++pointerToCurrentUtf7Byte; |
|
761 currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
762 if (currentUtf7Byte=='-') |
|
763 { |
|
764 currentUtf7Byte=escapeCharacterForStartingBase64Block; |
|
765 } |
|
766 else |
|
767 { |
|
768 currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7); |
|
769 if (currentUtf7Byte==KNotInBase64Alphabet) |
|
770 { |
|
771 return EErrorIllFormedInput; |
|
772 } |
|
773 bitBuffer=KIsInBase64Block; |
|
774 } |
|
775 } |
|
776 if (bitBuffer&KIsInBase64Block) |
|
777 { |
|
778 FOREVER |
|
779 { |
|
780 __ASSERT_DEBUG(currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7), Panic(EPanicOutOfSyncBase64Decoding)); |
|
781 __ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState12)); |
|
782 if (currentUtf7Byte==KNotInBase64Alphabet) |
|
783 { |
|
784 if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)) |
|
785 { |
|
786 return EErrorIllFormedInput; |
|
787 } |
|
788 bitBuffer=0; |
|
789 numberOfBitsInBuffer=0; |
|
790 currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
791 if (currentUtf7Byte=='-') |
|
792 { |
|
793 if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte) |
|
794 { |
|
795 goto end; |
|
796 } |
|
797 ++pointerToCurrentUtf7Byte; |
|
798 currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
799 } |
|
800 break; |
|
801 } |
|
802 bitBuffer<<=6; |
|
803 bitBuffer|=currentUtf7Byte; |
|
804 bitBuffer|=KIsInBase64Block; |
|
805 numberOfBitsInBuffer+=6; |
|
806 // only flush the buffer if it contains a whole Unicode character and the remainder is either all zero-bits (hence would be a legal point to end the base-64 sequence) or at least 6 bits long (therefore would leave at least one UTF-7 byte unconverted at the end of the input descriptor) |
|
807 if ((numberOfBitsInBuffer>=16+6) || ((numberOfBitsInBuffer>=16) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16))) |
|
808 { |
|
809 numberOfBitsInBuffer-=16; |
|
810 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers6)); |
|
811 ++pointerToPreviousUnicodeCharacter; |
|
812 *pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, bitBuffer>>numberOfBitsInBuffer); |
|
813 bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - must be done as bitBuffer is stored along with numberOfBitsInBuffer in aState if the output descriptor runs out of space or if the input descriptor was truncated |
|
814 bitBuffer|=KIsInBase64Block; // turn it back on as the line above turned it off |
|
815 if (pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) |
|
816 { |
|
817 goto end; |
|
818 } |
|
819 } |
|
820 if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte) |
|
821 { |
|
822 inputIsTruncated=ETrue; |
|
823 goto end; |
|
824 } |
|
825 ++pointerToCurrentUtf7Byte; |
|
826 currentUtf7Byte=Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7); |
|
827 } |
|
828 } |
|
829 else |
|
830 { |
|
831 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers7)); |
|
832 ++pointerToPreviousUnicodeCharacter; |
|
833 *pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, currentUtf7Byte); |
|
834 if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)) |
|
835 { |
|
836 goto end; |
|
837 } |
|
838 ++pointerToCurrentUtf7Byte; |
|
839 currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
840 } |
|
841 } |
|
842 end: |
|
843 if (bitBuffer&KIsInBase64Block) |
|
844 { |
|
845 __ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState13)); |
|
846 if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)) |
|
847 { |
|
848 // rewind how far we've got in the UTF-7 descriptor to indicate to the user (by returning a value greater than zero) that not all of the input could be converted as it ended with a truncated base-64 sequence |
|
849 __ASSERT_DEBUG(numberOfBitsInBuffer>=6, Panic(EPanicBadBitBufferState14)); |
|
850 pointerToCurrentUtf7Byte-=numberOfBitsInBuffer/6; |
|
851 const TInt newNumberOfBitsInBuffer=numberOfBitsInBuffer%6; |
|
852 bitBuffer&=~KIsInBase64Block; // temporarily turn off the KIsInBase64Block for the right-shift |
|
853 bitBuffer>>=(numberOfBitsInBuffer-newNumberOfBitsInBuffer); |
|
854 bitBuffer|=KIsInBase64Block; // must be turned back on again as the bit-buffer is packed into aState |
|
855 numberOfBitsInBuffer=newNumberOfBitsInBuffer; |
|
856 __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState15)); |
|
857 } |
|
858 __ASSERT_DEBUG((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0), Panic(EPanicBadBitBufferState16)); |
|
859 aState=STATIC_CAST(TInt, bitBuffer); |
|
860 aState|=(numberOfBitsInBuffer<<4); |
|
861 __ASSERT_DEBUG(aState&KIsInBase64Block, Panic(EPanicBadBitBufferState17)); |
|
862 bitBuffer=0; |
|
863 numberOfBitsInBuffer=0; |
|
864 } |
|
865 if ((pointerToCurrentUtf7Byte<aUtf7.Ptr()) && inputIsTruncated) |
|
866 { |
|
867 return EErrorIllFormedInput; |
|
868 } |
|
869 aUnicode.SetLength((pointerToPreviousUnicodeCharacter+1)-aUnicode.Ptr()); |
|
870 return pointerToLastUtf7Byte-pointerToCurrentUtf7Byte; |
|
871 } |
|
872 |
|
873 |
|
874 |
|
875 /** Converts text encoded using the Unicode transformation format UTF-8 |
|
876 into the Unicode UCS-2 character set. This function leaves with an |
|
877 error code of the input string is corrupted. |
|
878 |
|
879 @param aUtf8 The UTF-8 encoded input string |
|
880 @return A pointer to an HBufC16 with the converted Unicode string. */ |
|
881 EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf8L(const TDesC8& aUtf8) |
|
882 { |
|
883 // If aUtf8 is an empty string return |
|
884 if (aUtf8.Length()==0) |
|
885 { |
|
886 HBufC16* hBuf = HBufC16::NewL(1); |
|
887 return hBuf; |
|
888 } |
|
889 |
|
890 // else convert aUtf8 to Unicode storing the result in a buffer, reallocating |
|
891 // it when needed. |
|
892 TInt length = aUtf8.Length(); |
|
893 const TInt bufsize = 100; |
|
894 |
|
895 TPtrC8 utf8 (aUtf8); |
|
896 TBuf<bufsize> buf; |
|
897 HBufC16* hBuf = HBufC16::NewLC(length); |
|
898 TPtr unicode = hBuf->Des(); |
|
899 |
|
900 FOREVER |
|
901 { |
|
902 TInt unconverted = ConvertToUnicodeFromUtf8(buf, utf8); |
|
903 if( unconverted == EErrorIllFormedInput || unconverted < 0) |
|
904 User::Leave(KErrCorrupt); |
|
905 |
|
906 if (unicode.Length() + buf.Length() > unicode.MaxLength()) |
|
907 { |
|
908 // Reallocate hBuf |
|
909 hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length()); |
|
910 CleanupStack::Pop(); |
|
911 CleanupStack::PushL(hBuf); |
|
912 unicode.Set(hBuf->Des()); |
|
913 } |
|
914 unicode.Append(buf); |
|
915 if (unconverted ==0) |
|
916 break; |
|
917 utf8.Set(utf8.Right(unconverted)); |
|
918 } |
|
919 CleanupStack::Pop(); |
|
920 return hBuf; |
|
921 } |
|
922 |
|
923 /** Converts text encoded using the Unicode transformation format UTF-8 into the |
|
924 Unicode UCS-2 character set. |
|
925 |
|
926 @param aUnicode On return, contains the Unicode encoded output string. |
|
927 @param aUtf8 The UTF-8 encoded input string |
|
928 @return The number of unconverted bytes left at the end of the input descriptor, |
|
929 or one of the error values defined in TError. */ |
|
930 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8) |
|
931 { |
|
932 return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse); |
|
933 } |
|
934 |
|
935 static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters, |
|
936 TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex) |
|
937 { |
|
938 if (aNumberOfUnconvertibleCharacters<=0) |
|
939 { |
|
940 aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex; |
|
941 } |
|
942 ++aNumberOfUnconvertibleCharacters; |
|
943 } |
|
944 |
|
945 /** Converts text encoded using the Unicode transformation format UTF-8 into the |
|
946 Unicode UCS-2 character set. |
|
947 |
|
948 @param aUnicode On return, contains the Unicode encoded output string. |
|
949 @param aUtf8 The UTF-8 encoded input string |
|
950 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java |
|
951 @return The number of unconverted bytes left at the end of the input descriptor, |
|
952 or one of the error values defined in TError. */ |
|
953 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8) |
|
954 { |
|
955 TInt dummyUnconverted, dummyUnconvertedIndex; |
|
956 return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex); |
|
957 } |
|
958 |
|
959 /** Converts text encoded using the Unicode transformation format UTF-8 into the |
|
960 Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input. |
|
961 |
|
962 The variant of UTF-8 used internally by Java differs slightly from standard |
|
963 UTF-8. The TBool argument controls the UTF-8 variant generated by this function. |
|
964 |
|
965 @param aUnicode On return, contains the Unicode encoded output string. |
|
966 @param aUtf8 The UTF-8 encoded input string |
|
967 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java |
|
968 UTF-8. The default is EFalse. |
|
969 @param aNumberOfUnconvertibleCharacters On return, contains the number of bytes |
|
970 which were not converted. |
|
971 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index |
|
972 of the first byte of the first unconvertible character. For instance if the |
|
973 first character in the input descriptor (aForeign) could not be converted, |
|
974 then this parameter is set to the first byte of that character, i.e. zero. |
|
975 A negative value is returned if all the characters were converted. |
|
976 @return The number of unconverted bytes left at the end of the input descriptor, |
|
977 or one of the error values defined in TError. */ |
|
978 |
|
979 /* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7 |
|
980 * Well formed UTF-8 Byte Sequences, full table. |
|
981 * +----------------------------------------------------------------+ |
|
982 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | |
|
983 * +--------------------+----------+----------+----------+----------+ |
|
984 * | U+0000..U+007F | 00..7D | | | | 1 byte, ascii |
|
985 * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2 |
|
986 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0 |
|
987 * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal |
|
988 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F |
|
989 * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal |
|
990 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90 |
|
991 * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal |
|
992 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F |
|
993 * +--------------------+----------+----------+----------+----------+ |
|
994 * |
|
995 * As a consequence of the well-formedness conditions specified in table 3-7, |
|
996 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF. |
|
997 */ |
|
998 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8, |
|
999 TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) |
|
1000 { |
|
1001 aUnicode.SetLength(0); |
|
1002 |
|
1003 if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0)) |
|
1004 { |
|
1005 return aUtf8.Length(); |
|
1006 } |
|
1007 |
|
1008 TUint16* pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr()); |
|
1009 const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1); |
|
1010 const TUint8* pUtf8 = aUtf8.Ptr(); |
|
1011 const TUint8* pLastUtf8 = pUtf8 + (aUtf8.Length() - 1); |
|
1012 const TUint16 replacementcharacter = 0xFFFD; |
|
1013 TUint currentUnicodeCharacter; |
|
1014 TInt sequenceLength; |
|
1015 |
|
1016 |
|
1017 FOREVER |
|
1018 { |
|
1019 TBool illFormed=EFalse; |
|
1020 |
|
1021 __ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8)); |
|
1022 __ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3)); |
|
1023 |
|
1024 sequenceLength = 1; |
|
1025 |
|
1026 // ascii - optimisation (i.e. it isn't a sequence) |
|
1027 if (pUtf8[0] < 0x80) |
|
1028 { |
|
1029 currentUnicodeCharacter = pUtf8[0]; |
|
1030 } |
|
1031 else |
|
1032 { |
|
1033 // see if well formed utf-8, use table above for reference |
|
1034 if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf)) |
|
1035 { |
|
1036 // 0xc1-0xc2 are not valid bytes |
|
1037 sequenceLength = 2; |
|
1038 } |
|
1039 else if ((pUtf8[0] & 0xf0) == 0xe0) |
|
1040 { |
|
1041 sequenceLength = 3; |
|
1042 } |
|
1043 else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5)) |
|
1044 { |
|
1045 // 0xf5-0xff, are not valid bytes |
|
1046 sequenceLength = 4; |
|
1047 } |
|
1048 else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8) |
|
1049 { |
|
1050 if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80)) |
|
1051 { |
|
1052 // either we've split the 0xc0 0x80 (i.e. 0xc0 is |
|
1053 // the last character in the string) or we've |
|
1054 // discovered a valid 0xc0 0x80 sequence. |
|
1055 sequenceLength = 2; |
|
1056 } |
|
1057 } |
|
1058 |
|
1059 /* checking to see if we got a valid sequence */ |
|
1060 if (sequenceLength == 1) |
|
1061 { |
|
1062 // bad value in the leading byte, 0xc0-0xc1,0x5f-0xff for example |
|
1063 currentUnicodeCharacter = replacementcharacter; |
|
1064 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1065 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
1066 } |
|
1067 else |
|
1068 { |
|
1069 // this is a check to see if the sequence goes beyond the input |
|
1070 // stream. if its not the first and only character in the input |
|
1071 // stream this isn't an error, otherwise it is. |
|
1072 if ((pUtf8 + sequenceLength - 1) > pLastUtf8) |
|
1073 { |
|
1074 // check to see if this sequence was the first character |
|
1075 if ((pUnicode - aUnicode.Ptr()) == 0) |
|
1076 { |
|
1077 return EErrorIllFormedInput; |
|
1078 } |
|
1079 break; |
|
1080 } |
|
1081 |
|
1082 currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength); |
|
1083 |
|
1084 /* check the trailing bytes, they should begin with 10 */ |
|
1085 TUint i = 1; |
|
1086 |
|
1087 do |
|
1088 { |
|
1089 if ((pUtf8[i] & 0xc0) == 0x80) |
|
1090 { |
|
1091 // add the trailing 6 bits to the current unicode char |
|
1092 currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F); |
|
1093 } |
|
1094 else |
|
1095 { |
|
1096 // ill formed character (doesn't have a lead 10) |
|
1097 currentUnicodeCharacter = replacementcharacter; |
|
1098 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1099 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
1100 illFormed=ETrue; |
|
1101 break; |
|
1102 } |
|
1103 i++; |
|
1104 } |
|
1105 while (i < (unsigned)sequenceLength); |
|
1106 } |
|
1107 |
|
1108 /* conformance check. bits of above table for reference. |
|
1109 * +----------------------------------------------------------------+ |
|
1110 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | |
|
1111 * +--------------------+----------+----------+----------+----------+ |
|
1112 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, 2nd < 0xA0 |
|
1113 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, 2nd > 0x9F |
|
1114 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, 2nd < 0x90 |
|
1115 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, 2nd > 0x8F |
|
1116 * +--------------------+----------+----------+----------+----------+ |
|
1117 */ |
|
1118 |
|
1119 if (currentUnicodeCharacter != replacementcharacter) |
|
1120 { |
|
1121 if (sequenceLength == 3) |
|
1122 { |
|
1123 if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0)) |
|
1124 { |
|
1125 currentUnicodeCharacter = replacementcharacter; |
|
1126 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1127 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
1128 illFormed=ETrue; |
|
1129 } |
|
1130 else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F)) |
|
1131 { |
|
1132 currentUnicodeCharacter = replacementcharacter; |
|
1133 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1134 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
1135 illFormed=ETrue; |
|
1136 } |
|
1137 } |
|
1138 else if (sequenceLength == 4) |
|
1139 { |
|
1140 if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90)) |
|
1141 { |
|
1142 currentUnicodeCharacter = replacementcharacter; |
|
1143 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1144 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
1145 illFormed=ETrue; |
|
1146 } |
|
1147 else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F)) |
|
1148 { |
|
1149 currentUnicodeCharacter = replacementcharacter; |
|
1150 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1151 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
1152 illFormed=ETrue; |
|
1153 } |
|
1154 } |
|
1155 |
|
1156 |
|
1157 /* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points |
|
1158 * are not Unicode scalar values, any UTF-8 byte sequence that would map to code |
|
1159 * points D800..DFFF is ill formed */ |
|
1160 |
|
1161 if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF)) |
|
1162 { |
|
1163 currentUnicodeCharacter = replacementcharacter; |
|
1164 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1165 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
1166 illFormed=ETrue; |
|
1167 } |
|
1168 } |
|
1169 // end conformance check |
|
1170 } |
|
1171 |
|
1172 // would this character generate a surrogate pair in UTF-16? |
|
1173 if (currentUnicodeCharacter > 0xFFFF) |
|
1174 { |
|
1175 // is there enough space to hold a surrogate pair in the output? |
|
1176 if (pUnicode >= pLastUnicode) |
|
1177 { |
|
1178 break; // no, end processing. |
|
1179 } |
|
1180 |
|
1181 TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0; |
|
1182 *pUnicode++ = STATIC_CAST(TUint16, surrogate); |
|
1183 |
|
1184 surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00; |
|
1185 *pUnicode++ = STATIC_CAST(TUint16, surrogate); |
|
1186 } |
|
1187 else |
|
1188 { |
|
1189 *pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter); |
|
1190 } |
|
1191 |
|
1192 // move the input pointer |
|
1193 if (currentUnicodeCharacter != replacementcharacter) |
|
1194 { |
|
1195 pUtf8 += sequenceLength; |
|
1196 } |
|
1197 else if(illFormed == EFalse) |
|
1198 { |
|
1199 pUtf8 += (sequenceLength); |
|
1200 } |
|
1201 else |
|
1202 { |
|
1203 // we had a character we didn't recognize (i.e. it was invalid) |
|
1204 // so move to the next character in the input |
|
1205 pUtf8++; |
|
1206 } |
|
1207 |
|
1208 if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode)) |
|
1209 { |
|
1210 break; // we've either reached the end of the input or the end of output |
|
1211 } |
|
1212 } |
|
1213 |
|
1214 aUnicode.SetLength(pUnicode - aUnicode.Ptr()); |
|
1215 return (pLastUtf8 - pUtf8 + 1); |
|
1216 } |
|
1217 |
|
1218 /** Given a sample text this function attempts to determine whether or not |
|
1219 * the same text is encoded using the UTF-8 standard encoding scheme. |
|
1220 |
|
1221 @param TInt a confidence level, given at certain value. if the given sample |
|
1222 is UTF-8 this value will not be changed (unless > 100) then its |
|
1223 set to 100. Otherwise if the same isn't UTF-8, its set to 0. |
|
1224 @param TDesC8 sample text. |
|
1225 UTF-8. The default is EFalse. |
|
1226 @return void |
|
1227 */ |
|
1228 |
|
1229 /* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7 |
|
1230 * Well formed UTF-8 Byte Sequences, full table. |
|
1231 * +----------------------------------------------------------------+ |
|
1232 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | |
|
1233 * +--------------------+----------+----------+----------+----------+ |
|
1234 * | U+0000..U+007F | 00..7D | | | | 1 byte, ascii |
|
1235 * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2 |
|
1236 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0 |
|
1237 * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal |
|
1238 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F |
|
1239 * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal |
|
1240 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90 |
|
1241 * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal |
|
1242 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F |
|
1243 * +--------------------+----------+----------+----------+----------+ |
|
1244 * |
|
1245 * As a consequence of the well-formedness conditions specified in table 3-7, |
|
1246 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF. |
|
1247 * |
|
1248 * Code Rules: |
|
1249 * R1: If the string contains any non-UTF-8 characters the returned confidence |
|
1250 * is 0. Valid UTF-8 combinations are listed in the above table. |
|
1251 * R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark) in |
|
1252 * the (see ) the returned confidence is 95. |
|
1253 * R3: Otherwise the confidence returned is based upon the sample string |
|
1254 * length. |
|
1255 * R4: If the sample string is under 75 characters, the confidence is set to |
|
1256 * 75. |
|
1257 */ |
|
1258 GLREF_C void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample) |
|
1259 { |
|
1260 |
|
1261 TInt sampleLength = aSample.Length(); |
|
1262 |
|
1263 if (sampleLength == 0) |
|
1264 { |
|
1265 aConfidenceLevel = 89; |
|
1266 return; |
|
1267 } |
|
1268 TInt bytesRemaining = 0; |
|
1269 TInt sequenceLength = 0; |
|
1270 |
|
1271 aConfidenceLevel = sampleLength; |
|
1272 |
|
1273 const TUint8* buffer = &aSample[0]; |
|
1274 |
|
1275 if (sampleLength < 95) |
|
1276 { |
|
1277 // check for the BOM |
|
1278 if ((sampleLength >= 3) && |
|
1279 ((buffer[0] == 0xEF) && |
|
1280 (buffer[1] == 0xBB) && |
|
1281 (buffer[2] == 0xBF)) |
|
1282 ) |
|
1283 { |
|
1284 aConfidenceLevel = 95; |
|
1285 } |
|
1286 else if (sampleLength < 75) |
|
1287 { |
|
1288 aConfidenceLevel = 75; |
|
1289 } |
|
1290 } |
|
1291 |
|
1292 for (TInt index = 0;index != sampleLength;index++) |
|
1293 { |
|
1294 |
|
1295 if (bytesRemaining > 0) |
|
1296 { |
|
1297 // bytesRemaining > 0, means that a byte representing the start of a |
|
1298 // multibyte sequence was encountered and the bytesRemaining is the |
|
1299 // number of bytes to follow. |
|
1300 |
|
1301 if ((buffer[index] & 0xc0) == 0x80) |
|
1302 { |
|
1303 // need to check for ill-formed sequences -- all are in the 2nd byte |
|
1304 |
|
1305 if ((sequenceLength == 3) && (bytesRemaining == 2)) |
|
1306 { |
|
1307 if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0)) |
|
1308 { |
|
1309 aConfidenceLevel = 0; |
|
1310 break; |
|
1311 } |
|
1312 else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f)) |
|
1313 { |
|
1314 aConfidenceLevel = 0; |
|
1315 break; |
|
1316 } |
|
1317 } |
|
1318 else if ((sequenceLength == 4) && (bytesRemaining == 3)) |
|
1319 { |
|
1320 if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90)) |
|
1321 { |
|
1322 aConfidenceLevel = 0; |
|
1323 break; |
|
1324 } |
|
1325 else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f)) |
|
1326 { |
|
1327 aConfidenceLevel = 0; |
|
1328 break; |
|
1329 } |
|
1330 } |
|
1331 |
|
1332 --bytesRemaining; |
|
1333 continue; |
|
1334 } |
|
1335 else |
|
1336 { |
|
1337 aConfidenceLevel = 0; |
|
1338 break; |
|
1339 } |
|
1340 } |
|
1341 |
|
1342 if (bytesRemaining == 0) |
|
1343 { |
|
1344 if (buffer[index] < 0x80) |
|
1345 { |
|
1346 // The value of aSample[index] is in the range 0x00-0x7f |
|
1347 //UTF8 maintains ASCII transparency. So it's a valid |
|
1348 //UTF8. Do nothing, check next value. |
|
1349 continue; |
|
1350 } |
|
1351 else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0)) |
|
1352 { |
|
1353 // valid start of a 2 byte sequence (see conformance note) |
|
1354 sequenceLength = 2; |
|
1355 bytesRemaining = 1; |
|
1356 } |
|
1357 else if ((buffer[index] & 0xf0) == 0xe0) |
|
1358 { |
|
1359 // valid start of a 3 byte sequence |
|
1360 sequenceLength = 3; |
|
1361 bytesRemaining = 2; |
|
1362 } |
|
1363 else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5)) |
|
1364 { |
|
1365 // valid start of a 4 byte sequence (see conformance note) |
|
1366 sequenceLength = 4; |
|
1367 bytesRemaining = 3; |
|
1368 } |
|
1369 else |
|
1370 { |
|
1371 // wasn't anything expected so must be an illegal/irregular UTF8 coded value |
|
1372 aConfidenceLevel = 0; |
|
1373 break; |
|
1374 } |
|
1375 } |
|
1376 } // for |
|
1377 |
|
1378 aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0; |
|
1379 } |
|
1380 |
|
1381 GLREF_C void IsCharacterSetUTF7(TInt& aConfidenceLevel, const TDesC8& aSample) |
|
1382 { |
|
1383 TInt sampleLength = aSample.Length(); |
|
1384 aConfidenceLevel = 70; |
|
1385 for (TInt i=0; i<sampleLength; ++i) |
|
1386 { |
|
1387 // UTF-7 value ranges only 7 bits |
|
1388 if((aSample[i]&0x80)!=0x00) |
|
1389 { |
|
1390 aConfidenceLevel= 0; |
|
1391 break; |
|
1392 } |
|
1393 |
|
1394 // there is no "~" in UTF-7 encoding. So if find either, it's not UTF-7 |
|
1395 else if (char(aSample[i])=='~') |
|
1396 { |
|
1397 aConfidenceLevel = 0; |
|
1398 break; |
|
1399 } |
|
1400 |
|
1401 // The SMS7Bit escape char value is 0x1b. Reduce confidence if it follows the following format |
|
1402 else if ( (aSample[i]==0x1b) && (i <sampleLength-1) ) |
|
1403 { |
|
1404 static const TInt smsExtensionTable[11] = |
|
1405 {0x0a, 0x14, 0x1b, 0x28, 0x29, 0x2f, 0x3c, 0x3d, 0x3e, 0x40, 0x65}; |
|
1406 TInt increment1 = i+1; |
|
1407 if (increment1>= sampleLength) |
|
1408 break; |
|
1409 for (TInt j=0; j < 11; ++j) |
|
1410 { |
|
1411 if (aSample[increment1] == smsExtensionTable[j]) |
|
1412 { |
|
1413 aConfidenceLevel-=10; |
|
1414 } |
|
1415 } |
|
1416 } |
|
1417 // The UTF-7 escape char is 0x2b. The values that follow the escape sequence |
|
1418 // the values following the escape char value must belong to the modified base64 |
|
1419 // or '-' else it is an ill-formed sequence, so probably not UTF-7 |
|
1420 else if ( (aSample[i]==0x2b) && (i <sampleLength-1) ) |
|
1421 { |
|
1422 TInt increment1 = i+1; |
|
1423 if ((aSample[increment1] == 0x2b) || (aSample[increment1] == 0x2d) || (aSample[increment1] == 0x2f) || |
|
1424 ((aSample[increment1] >= 0x41) && (aSample[increment1] <= 0x5a)) || |
|
1425 ((aSample[increment1] >= 0x61) && (aSample[increment1] <= 0x7a))) |
|
1426 { |
|
1427 aConfidenceLevel+=5; |
|
1428 } |
|
1429 else |
|
1430 { |
|
1431 aConfidenceLevel-=15; |
|
1432 } |
|
1433 i++; // should this be here or up in the if loop ?? |
|
1434 } |
|
1435 } //for |
|
1436 aConfidenceLevel =(aConfidenceLevel >0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0; |
|
1437 } |