|
1 /* |
|
2 * Copyright (c) 2005 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of the License "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: Miscellaneous char-encoding related utilities. |
|
15 * |
|
16 */ |
|
17 |
|
18 |
|
19 #include <charconv.h> |
|
20 |
|
21 #include "XmlEncoding.h" |
|
22 |
|
23 |
|
24 // ----------------------------------------------------------------------------- |
|
25 // CXmlEncoding::NewL |
|
26 // |
|
27 // Two-phased constructor. |
|
28 // ----------------------------------------------------------------------------- |
|
29 // |
|
30 EXPORT_C CXmlEncoding* CXmlEncoding::NewL() |
|
31 { |
|
32 CXmlEncoding* self = new (ELeave) CXmlEncoding(); |
|
33 |
|
34 CleanupStack::PushL(self); |
|
35 self->ConstructL(); |
|
36 CleanupStack::Pop(); |
|
37 |
|
38 return self; |
|
39 } |
|
40 |
|
41 |
|
42 // ----------------------------------------------------------------------------- |
|
43 // CXmlEncoding::CXmlEncoding |
|
44 // |
|
45 // C++ default constructor can NOT contain any code, that |
|
46 // might leave. |
|
47 // ----------------------------------------------------------------------------- |
|
48 // |
|
49 CXmlEncoding::CXmlEncoding(): |
|
50 iCharEncodings(5) |
|
51 { |
|
52 } |
|
53 |
|
54 |
|
55 // ----------------------------------------------------------------------------- |
|
56 // CXmlEncoding::ConstructL |
|
57 // |
|
58 // Symbian 2nd phase constructor can leave. |
|
59 // ----------------------------------------------------------------------------- |
|
60 // |
|
61 void CXmlEncoding::ConstructL() |
|
62 { |
|
63 User::LeaveIfError(iRfs.Connect()); |
|
64 iConverter = CCnvCharacterSetConverter::NewL(); |
|
65 } |
|
66 |
|
67 |
|
68 // ----------------------------------------------------------------------------- |
|
69 // CXmlEncoding::~CXmlEncoding |
|
70 // |
|
71 // Deconstructor. |
|
72 // ----------------------------------------------------------------------------- |
|
73 // |
|
74 CXmlEncoding::~CXmlEncoding() |
|
75 { |
|
76 iRfs.Close(); |
|
77 delete iConverter; |
|
78 |
|
79 // Delete the cached encoding-map. |
|
80 for (TInt i = 0; i < iCharEncodings.Count(); i++) |
|
81 { |
|
82 delete iCharEncodings[i].charEncoding; |
|
83 } |
|
84 |
|
85 iCharEncodings.Close(); |
|
86 } |
|
87 |
|
88 |
|
89 // ----------------------------------------------------------------------------- |
|
90 // CXmlEncoding::ResolveCharEncodingL |
|
91 // |
|
92 // Resolves the given char-encoding into its uid. |
|
93 // ----------------------------------------------------------------------------- |
|
94 // |
|
95 TBool CXmlEncoding::ResolveCharEncodingL(const TDesC8& aCharEncoding, TUint& aUid) const |
|
96 { |
|
97 TBool found = EFalse; |
|
98 |
|
99 // First look it up in the cached encodings. |
|
100 for (TInt i = 0; i < iCharEncodings.Count(); i++) |
|
101 { |
|
102 if (iCharEncodings[i].charEncoding->CompareF(aCharEncoding) == 0) |
|
103 { |
|
104 aUid = iCharEncodings[i].uid; |
|
105 found = ETrue; |
|
106 |
|
107 break; |
|
108 } |
|
109 } |
|
110 |
|
111 // Otherwise use the CCnvCharacterSetConverter |
|
112 if (!found) |
|
113 { |
|
114 TUint tid; |
|
115 |
|
116 // Look it up. |
|
117 tid = iConverter->ConvertStandardNameOfCharacterSetToIdentifierL(aCharEncoding, |
|
118 const_cast<CXmlEncoding*>(this)->iRfs); |
|
119 |
|
120 // If found add it to the cached encodings. |
|
121 if (tid != 0) |
|
122 { |
|
123 SupportedEncodings encoding; |
|
124 TInt err; |
|
125 |
|
126 encoding.charEncoding = aCharEncoding.AllocL(); |
|
127 encoding.uid = tid; |
|
128 |
|
129 err = const_cast<CXmlEncoding*>(this)->iCharEncodings.Append(encoding); |
|
130 if (err != KErrNone) |
|
131 { |
|
132 delete encoding.charEncoding; |
|
133 User::Leave(err); |
|
134 } |
|
135 |
|
136 aUid = tid; |
|
137 found = ETrue; |
|
138 } |
|
139 } |
|
140 |
|
141 return found; |
|
142 } |
|
143 |
|
144 |
|
145 // ----------------------------------------------------------------------------- |
|
146 // XmlEncoding::DetermineCharEncoding |
|
147 // |
|
148 // Determine the char-encoding. |
|
149 // ----------------------------------------------------------------------------- |
|
150 // |
|
151 EXPORT_C TBool CXmlEncoding::DetermineCharEncodingL(const TDesC8& aBuffer, |
|
152 const TDesC& aCharSet, TUint& aEncoding) const |
|
153 { |
|
154 TBool foundEncoding = EFalse; |
|
155 |
|
156 // Try to determine the encoding via the BOM (byte order mask). |
|
157 foundEncoding = DetermineCharEncodingFromBom(aBuffer, aEncoding); |
|
158 |
|
159 // Try to determine the encoding via the xml-prolog. |
|
160 if (!foundEncoding) |
|
161 { |
|
162 foundEncoding = DetermineCharEncodingFromXmlProlog(aBuffer, aEncoding); |
|
163 } |
|
164 |
|
165 // Try to determine the encoding via the char-set provided by |
|
166 // the orignal source. |
|
167 if (!foundEncoding) |
|
168 { |
|
169 HBufC8* str = NULL; |
|
170 |
|
171 // Convert it to 8bit first. |
|
172 str = HBufC8::NewL(aCharSet.Length()); |
|
173 CleanupStack::PushL(str); |
|
174 str->Des().Append(aCharSet); |
|
175 |
|
176 foundEncoding = ResolveCharEncodingL(*str, aEncoding); |
|
177 CleanupStack::PopAndDestroy(str); |
|
178 } |
|
179 |
|
180 return foundEncoding; |
|
181 } |
|
182 |
|
183 |
|
184 // ----------------------------------------------------------------------------- |
|
185 // XmlEncoding::DetermineCharEncodingFromBom |
|
186 // |
|
187 // Determine the char-encoding from the BOM. |
|
188 // ----------------------------------------------------------------------------- |
|
189 // |
|
190 TBool CXmlEncoding::DetermineCharEncodingFromBom(const TDesC8& aBuffer, |
|
191 TUint& aEncoding) const |
|
192 { |
|
193 _LIT8(KUcs2Big, "UTF-16BE"); |
|
194 _LIT8(KUcs2Little, "UTF-16LE"); |
|
195 _LIT8(KUtf8, "UTF-8"); |
|
196 |
|
197 TBool foundEncoding = EFalse; |
|
198 |
|
199 if (aBuffer.Length() < 3) |
|
200 { |
|
201 return EFalse; |
|
202 } |
|
203 |
|
204 // Extract the first three bytes. |
|
205 TUint8 c1 = aBuffer[0]; |
|
206 TUint8 c2 = aBuffer[1]; |
|
207 TUint8 c3 = aBuffer[2]; |
|
208 |
|
209 // Check for the BOM. |
|
210 if ((c1 == 0xFE) && (c2 == 0xFF)) |
|
211 { |
|
212 TRAP_IGNORE(foundEncoding = ResolveCharEncodingL(KUcs2Big, aEncoding)); |
|
213 } |
|
214 else if ((c1 == 0xFF) && (c2 == 0xFE)) |
|
215 { |
|
216 TRAP_IGNORE(foundEncoding = ResolveCharEncodingL(KUcs2Little, aEncoding)); |
|
217 } |
|
218 else if ((c1 == 0xEF) && (c2 == 0xBB) && (c3 == 0xBF)) |
|
219 { |
|
220 TRAP_IGNORE(foundEncoding = ResolveCharEncodingL(KUtf8, aEncoding)); |
|
221 } |
|
222 |
|
223 return foundEncoding; |
|
224 } |
|
225 |
|
226 |
|
227 // ----------------------------------------------------------------------------- |
|
228 // XmlEncoding::DetermineCharEncodingFromXmlProlog |
|
229 // |
|
230 // Determine the char-encoding from the char-encoding in the xml-prolog. |
|
231 // ----------------------------------------------------------------------------- |
|
232 // |
|
233 TBool CXmlEncoding::DetermineCharEncodingFromXmlProlog(const TDesC8& aBuffer, |
|
234 TUint& aEncoding) const |
|
235 { |
|
236 TBool foundEncoding = EFalse; |
|
237 TInt begin; |
|
238 TInt end; |
|
239 TInt valueBegin; |
|
240 TInt valueEnd; |
|
241 |
|
242 // Try to determine the encoding via the xml-prolog. |
|
243 if (FindEncoding(aBuffer, begin, end, valueBegin, valueEnd)) |
|
244 { |
|
245 TPtrC8 str(aBuffer.Ptr() + valueBegin, valueEnd - valueBegin + 1); |
|
246 |
|
247 TRAP_IGNORE(foundEncoding = ResolveCharEncodingL(str, aEncoding)); |
|
248 } |
|
249 |
|
250 return foundEncoding; |
|
251 } |
|
252 |
|
253 |
|
254 // ----------------------------------------------------------------------------- |
|
255 // XmlEncoding::StripCharEncoding |
|
256 // |
|
257 // Erase the char-encoding attribute (if any) from the xml-prolog. |
|
258 // ----------------------------------------------------------------------------- |
|
259 // |
|
260 void CXmlEncoding::StripCharEncoding(HBufC8& aUtf8Buffer) const |
|
261 { |
|
262 TInt begin; |
|
263 TInt end; |
|
264 TInt valueBegin; |
|
265 TInt valueEnd; |
|
266 |
|
267 if (FindEncoding(aUtf8Buffer, begin, end, valueBegin, valueEnd)) |
|
268 { |
|
269 aUtf8Buffer.Des().Delete(begin, end - begin + 1); |
|
270 } |
|
271 } |
|
272 |
|
273 |
|
274 // ----------------------------------------------------------------------------- |
|
275 // XmlEncoding::StripCharEncoding |
|
276 // |
|
277 // Erase the char-encoding attribute (if any) from the xml-prolog. |
|
278 // ----------------------------------------------------------------------------- |
|
279 // |
|
280 void CXmlEncoding::StripCharEncoding(HBufC16& aUcs2Buffer) const |
|
281 { |
|
282 TInt begin; |
|
283 TInt end; |
|
284 TInt valueBegin; |
|
285 TInt valueEnd; |
|
286 |
|
287 if (FindEncoding(aUcs2Buffer, begin, end, valueBegin, valueEnd)) |
|
288 { |
|
289 aUcs2Buffer.Des().Delete(begin, end - begin + 1); |
|
290 } |
|
291 } |
|
292 |
|
293 |
|
294 // ----------------------------------------------------------------------------- |
|
295 // XmlEncoding::ConvertToUtf8L |
|
296 // |
|
297 // Converts the given native buffer into utf8. |
|
298 // ----------------------------------------------------------------------------- |
|
299 // |
|
300 EXPORT_C HBufC8* CXmlEncoding::ConvertToUtf8L(TUint aEncoding, const TDesC8& aBuffer) const |
|
301 { |
|
302 HBufC16* usc2Buffer = NULL; |
|
303 HBufC8* utf8Buffer = NULL; |
|
304 |
|
305 // First convert it to ucs2. |
|
306 usc2Buffer = ConvertToUcs2L(aEncoding, aBuffer); |
|
307 CleanupStack::PushL(usc2Buffer); |
|
308 |
|
309 // Then convert it to utf8. |
|
310 utf8Buffer = ConvertToUtf8L(*usc2Buffer); |
|
311 |
|
312 CleanupStack::PopAndDestroy(usc2Buffer); |
|
313 usc2Buffer = NULL; |
|
314 |
|
315 // Erase the char-encoding attribute (if any) from the xml-prolog. If forces |
|
316 // Libxml2 to use the default encoding, utf8. |
|
317 StripCharEncoding(*utf8Buffer); |
|
318 |
|
319 return utf8Buffer; |
|
320 } |
|
321 |
|
322 |
|
323 // ----------------------------------------------------------------------------- |
|
324 // XmlEncoding::ConvertToUcs2L |
|
325 // |
|
326 // Converts the given native buffer into ucs2. |
|
327 // ----------------------------------------------------------------------------- |
|
328 // |
|
329 EXPORT_C HBufC16* CXmlEncoding::ConvertToUcs2L(TUint aEncoding, const TDesC8& aBuffer) const |
|
330 { |
|
331 TBuf<100> temp16Buffer; |
|
332 HBufC* unicode = NULL; |
|
333 TPtrC8 source8Ptr(aBuffer); |
|
334 |
|
335 // Init the converter and ensure the encoding is supported. |
|
336 if (iConverter->PrepareToConvertToOrFromL(aEncoding, const_cast<CXmlEncoding*>(this)->iRfs) != |
|
337 CCnvCharacterSetConverter::EAvailable) |
|
338 { |
|
339 User::Leave(KErrNotSupported); |
|
340 } |
|
341 |
|
342 for(;;) // conversion loop |
|
343 { |
|
344 TInt returnValue; |
|
345 TInt state = CCnvCharacterSetConverter::KStateDefault; |
|
346 |
|
347 returnValue = iConverter->ConvertToUnicode(temp16Buffer, source8Ptr, state); |
|
348 if (returnValue == CCnvCharacterSetConverter::EErrorIllFormedInput) |
|
349 { |
|
350 User::Leave(KErrCorrupt); |
|
351 } |
|
352 else |
|
353 { |
|
354 if (returnValue < 0) // future-proof against "TError" expanding |
|
355 { |
|
356 User::Leave(KErrGeneral); |
|
357 } |
|
358 } |
|
359 |
|
360 if (!unicode) |
|
361 { |
|
362 unicode = temp16Buffer.AllocLC(); |
|
363 } |
|
364 else |
|
365 { |
|
366 HBufC* tmp = unicode->ReAllocL(unicode->Length() + temp16Buffer.Length()); |
|
367 CleanupStack::Pop(unicode); |
|
368 unicode = tmp; |
|
369 CleanupStack::PushL(unicode); |
|
370 unicode->Des().Append(temp16Buffer); |
|
371 } |
|
372 |
|
373 if (returnValue == 0) // All is converted without Errors |
|
374 { |
|
375 break; |
|
376 } |
|
377 |
|
378 // There is "returnValue" bytes not converted yet |
|
379 source8Ptr.Set(source8Ptr.Right(returnValue)); |
|
380 } |
|
381 |
|
382 // Erase the char-encoding attribute (if any) from the xml-prolog -- as the |
|
383 // encoding is no longer valid. |
|
384 StripCharEncoding(*unicode); |
|
385 |
|
386 CleanupStack::Pop(unicode); |
|
387 return unicode; |
|
388 } |
|
389 |
|
390 |
|
391 // ----------------------------------------------------------------------------- |
|
392 // XmlEncoding::ConvertToUtf8L |
|
393 // |
|
394 // Converts the given ucs2 buffer into utf8. |
|
395 // ----------------------------------------------------------------------------- |
|
396 // |
|
397 HBufC8* CXmlEncoding::ConvertToUtf8L(HBufC16& aUsc2Buffer) const |
|
398 { |
|
399 TBuf8<100> temp8Buffer; |
|
400 HBufC8* target = NULL; |
|
401 TPtrC source16Ptr(aUsc2Buffer); |
|
402 |
|
403 // Init the converter and ensure the encoding is supported. |
|
404 if (iConverter->PrepareToConvertToOrFromL(KCharacterSetIdentifierUtf8, |
|
405 const_cast<CXmlEncoding*>(this)->iRfs) != CCnvCharacterSetConverter::EAvailable) |
|
406 { |
|
407 User::Leave(KErrNotSupported); |
|
408 } |
|
409 |
|
410 for(;;) // conversion loop |
|
411 { |
|
412 TInt returnValue; |
|
413 TInt state = CCnvCharacterSetConverter::KStateDefault; |
|
414 |
|
415 returnValue = iConverter->ConvertFromUnicode(temp8Buffer, source16Ptr, state); |
|
416 if (returnValue == CCnvCharacterSetConverter::EErrorIllFormedInput) |
|
417 { |
|
418 User::Leave(KErrCorrupt); |
|
419 } |
|
420 else |
|
421 { |
|
422 if (returnValue < 0) // future-proof against "TError" expanding |
|
423 { |
|
424 User::Leave(KErrGeneral); |
|
425 } |
|
426 } |
|
427 |
|
428 if (!target) |
|
429 { |
|
430 target = temp8Buffer.AllocLC(); |
|
431 } |
|
432 else |
|
433 { |
|
434 HBufC8* tmp = target->ReAllocL(target->Length() + temp8Buffer.Length()); |
|
435 CleanupStack::Pop(target); |
|
436 target = tmp; |
|
437 CleanupStack::PushL(target); |
|
438 target->Des().Append(temp8Buffer); |
|
439 } |
|
440 |
|
441 if (returnValue == 0) // All is converted without Errors |
|
442 { |
|
443 break; |
|
444 } |
|
445 |
|
446 // There is "returnValue" bytes not converted yet |
|
447 source16Ptr.Set(source16Ptr.Right(returnValue)); |
|
448 } |
|
449 CleanupStack::Pop(target); |
|
450 |
|
451 return target; |
|
452 } |
|
453 |
|
454 |
|
455 // ----------------------------------------------------------------------------- |
|
456 // XmlEncoding::FindEncoding |
|
457 // |
|
458 // Finds the location of the encoding attribute in the xml-prolog. |
|
459 // Refer to: http://www.w3.org/TR/2000/REC-xml-20001006#sec-prolog-dtd. |
|
460 // ----------------------------------------------------------------------------- |
|
461 // |
|
462 TBool CXmlEncoding::FindEncoding(const TDesC8& aBuffer, TInt& aBegin, TInt& aEnd, |
|
463 TInt& aValueBegin, TInt& aValueEnd) const |
|
464 { |
|
465 _LIT8(KPiStart, "<?xml"); |
|
466 _LIT8(KPiEnd, "?>"); |
|
467 _LIT8(KEncoding, "encoding"); |
|
468 |
|
469 TInt piStartLoc = KErrNotFound; |
|
470 TInt piEndLoc = KErrNotFound; |
|
471 |
|
472 aBegin = KErrNotFound; |
|
473 aEnd = KErrNotFound; |
|
474 aValueBegin = KErrNotFound; |
|
475 aValueEnd = KErrNotFound; |
|
476 |
|
477 // Determine if the xml-prolog contains a char-encoding attribute. |
|
478 piStartLoc = aBuffer.FindF(KPiStart); |
|
479 piEndLoc = aBuffer.FindF(KPiEnd); |
|
480 aBegin = aBuffer.FindF(KEncoding); |
|
481 |
|
482 if ((piStartLoc != KErrNotFound) && (piEndLoc != KErrNotFound) && |
|
483 (aBegin != KErrNotFound) && (piStartLoc < piEndLoc) && (aBegin < piEndLoc)) |
|
484 { |
|
485 TInt i; |
|
486 TBool foundEquals = EFalse; |
|
487 TInt quoteCount = 0; |
|
488 |
|
489 // If so, find the end of the attribute. |
|
490 i = aBegin + KEncoding().Length(); |
|
491 while (i < piEndLoc) |
|
492 { |
|
493 TInt8 c; |
|
494 |
|
495 c = aBuffer[i]; |
|
496 |
|
497 // The '=' char must be the first non-whitespace after "encoding". |
|
498 if (!foundEquals) |
|
499 { |
|
500 if (c == '=') |
|
501 { |
|
502 foundEquals = ETrue; |
|
503 } |
|
504 else if ((c != 0x20) && (c != 0x09) && (c != 0x0D) && (c != 0x0A)) |
|
505 { |
|
506 // Give up. |
|
507 return EFalse; |
|
508 } |
|
509 } |
|
510 |
|
511 // Otherwise look for the two quotes |
|
512 else if ((c == '\"') || (c == '\'')) |
|
513 { |
|
514 quoteCount++; |
|
515 |
|
516 if (quoteCount == 1) |
|
517 { |
|
518 aValueBegin = i + 1; |
|
519 } |
|
520 else if (quoteCount == 2) |
|
521 { |
|
522 aEnd = i; |
|
523 aValueEnd = i - 1; |
|
524 break; |
|
525 } |
|
526 } |
|
527 |
|
528 i++; |
|
529 } |
|
530 } |
|
531 |
|
532 return ((aValueBegin != KErrNotFound) && (aValueEnd != KErrNotFound)); |
|
533 } |
|
534 |
|
535 |
|
536 // ----------------------------------------------------------------------------- |
|
537 // XmlEncoding::FindEncoding |
|
538 // |
|
539 // Finds the location of the encoding attribute in the xml-prolog. |
|
540 // Refer to: http://www.w3.org/TR/2000/REC-xml-20001006#sec-prolog-dtd. |
|
541 // ----------------------------------------------------------------------------- |
|
542 // |
|
543 TBool CXmlEncoding::FindEncoding(const TDesC& aBuffer, TInt& aBegin, |
|
544 TInt& aEnd, TInt& aValueBegin, TInt& aValueEnd) const |
|
545 { |
|
546 _LIT16(KPiStart, "<?xml"); |
|
547 _LIT16(KPiEnd, "?>"); |
|
548 _LIT16(KEncoding, "encoding"); |
|
549 |
|
550 TInt piStartLoc = KErrNotFound; |
|
551 TInt piEndLoc = KErrNotFound; |
|
552 |
|
553 aBegin = KErrNotFound; |
|
554 aEnd = KErrNotFound; |
|
555 aValueBegin = KErrNotFound; |
|
556 aValueEnd = KErrNotFound; |
|
557 |
|
558 // Determine if the xml-prolog contains a char-encoding attribute. |
|
559 piStartLoc = aBuffer.FindF(KPiStart); |
|
560 piEndLoc = aBuffer.FindF(KPiEnd); |
|
561 aBegin = aBuffer.FindF(KEncoding); |
|
562 |
|
563 if ((piStartLoc != KErrNotFound) && (piEndLoc != KErrNotFound) && |
|
564 (aBegin != KErrNotFound) && (piStartLoc < piEndLoc) && (aBegin < piEndLoc)) |
|
565 { |
|
566 TInt i; |
|
567 TBool foundEquals = EFalse; |
|
568 TInt quoteCount = 0; |
|
569 |
|
570 // If so, find the end of the attribute. |
|
571 i = aBegin + KEncoding().Length(); |
|
572 while (i < piEndLoc) |
|
573 { |
|
574 TInt16 c; |
|
575 |
|
576 c = aBuffer[i]; |
|
577 |
|
578 // The '=' char must be the first non-whitespace after "encoding". |
|
579 if (!foundEquals) |
|
580 { |
|
581 if (c == '=') |
|
582 { |
|
583 foundEquals = ETrue; |
|
584 } |
|
585 else if ((c != 0x20) && (c != 0x09) && (c != 0x0D) && (c != 0x0A)) |
|
586 { |
|
587 // Give up. |
|
588 return EFalse; |
|
589 } |
|
590 } |
|
591 |
|
592 // Otherwise look for the two quotes |
|
593 else if ((c == '\"') || (c == '\'')) |
|
594 { |
|
595 quoteCount++; |
|
596 |
|
597 if (quoteCount == 1) |
|
598 { |
|
599 aValueBegin = i + 1; |
|
600 } |
|
601 else if (quoteCount == 2) |
|
602 { |
|
603 aEnd = i; |
|
604 aValueEnd = i - 1; |
|
605 break; |
|
606 } |
|
607 } |
|
608 |
|
609 i++; |
|
610 } |
|
611 } |
|
612 |
|
613 return ((aValueBegin != KErrNotFound) && (aValueEnd != KErrNotFound)); |
|
614 } |