|
1 // Copyright (c) 2003-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
2 // All rights reserved. |
|
3 // This component and the accompanying materials are made available |
|
4 // under the terms of "Eclipse Public License v1.0" |
|
5 // which accompanies this distribution, and is available |
|
6 // at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
7 // |
|
8 // Initial Contributors: |
|
9 // Nokia Corporation - initial contribution. |
|
10 // |
|
11 // Contributors: |
|
12 // |
|
13 // Description: |
|
14 // |
|
15 |
|
16 #include <e32std.h> |
|
17 #include <utf.h> |
|
18 #include <charconv.h> |
|
19 |
|
20 #include <xml/plugins/charsetconverter.h> |
|
21 #include <xml/xmlframeworkerrors.h> |
|
22 |
|
23 using namespace Xml; |
|
24 |
|
25 /** |
|
26 The maximum number of bytes used for conversion at any time. |
|
27 This is also used to size the necessary buffers used in the conversions. |
|
28 |
|
29 @internalTechnology |
|
30 */ |
|
31 const TInt KMaxReadableBytes = 512; |
|
32 |
|
33 |
|
34 |
|
35 LOCAL_C void DestroyHBufC16(TAny* aHBufC) |
|
36 /** |
|
37 This method is used when pointer reallocation is needed and the pointer needs to be |
|
38 cleaned via the cleanup stack. |
|
39 |
|
40 @param aHBufC the wide buffer. |
|
41 @internalTechnology |
|
42 |
|
43 */ |
|
44 { |
|
45 delete *static_cast<HBufC**>(aHBufC); |
|
46 } |
|
47 |
|
48 |
|
49 |
|
50 LOCAL_C void DestroyHBufC8(TAny* aHBufC) |
|
51 /** |
|
52 This method is used when pointer reallocation is needed and the pointer needs to be |
|
53 cleaned via the cleanup stack. |
|
54 |
|
55 @param aHBufC the narrow buffer. |
|
56 @internalTechnology |
|
57 |
|
58 */ |
|
59 { |
|
60 delete *static_cast<HBufC8**>(aHBufC); |
|
61 } |
|
62 |
|
63 |
|
64 |
|
65 CCharSetConverter::CCharSetConverter() |
|
66 /** |
|
67 Default Constructor |
|
68 |
|
69 */ |
|
70 { |
|
71 // do nothing; |
|
72 } |
|
73 |
|
74 |
|
75 |
|
76 /** |
|
77 This method creates an instance of this class. |
|
78 The framework is responsible for creating this object. |
|
79 |
|
80 @leave ... One of the system wide error codes e.g. KErrNoMemory |
|
81 @return The new'ed object. |
|
82 @internalTechnology |
|
83 */ |
|
84 CCharSetConverter* CCharSetConverter::NewL() |
|
85 { |
|
86 CCharSetConverter* self = new(ELeave) CCharSetConverter(); |
|
87 CleanupStack::PushL(self); |
|
88 self->ConstructL(); |
|
89 CleanupStack::Pop(self); |
|
90 return(self); |
|
91 } |
|
92 |
|
93 |
|
94 |
|
95 void CCharSetConverter::ConstructL() |
|
96 /** |
|
97 This method provides some construction of this object. |
|
98 |
|
99 */ |
|
100 { |
|
101 iCnvCharacterSetConverter = CCnvCharacterSetConverter::NewL(); |
|
102 User::LeaveIfError(iFs.Connect()); |
|
103 iConversionBuffer = User::Heap().AllocL(KMaxReadableBytes); |
|
104 iConversionBufferSize = KMaxReadableBytes; |
|
105 } |
|
106 |
|
107 |
|
108 |
|
109 CCharSetConverter::~CCharSetConverter() |
|
110 /** |
|
111 Destructor. |
|
112 The framework is responsible for destroying this object. |
|
113 |
|
114 @post This object is properly destroyed. |
|
115 |
|
116 */ |
|
117 { |
|
118 iFs.Close(); |
|
119 delete iCnvCharacterSetConverter; |
|
120 delete iConversionBuffer; |
|
121 } |
|
122 |
|
123 |
|
124 |
|
125 EXPORT_C void CCharSetConverter::PrepareCharConvL(TUint& aCharSetUid, const TDesC8& aEncoding) |
|
126 /** |
|
127 This method prepares CharConv to encode from the standard name. |
|
128 |
|
129 @post CharConv has been prepared. |
|
130 |
|
131 @leave KErrXmlUnsupportedCharacterSet - Charset not supported. |
|
132 @leave KErrXmlUnavailableCharacterSet - Charset not available |
|
133 |
|
134 @param aCharSetUid On return, contains the character set identifier |
|
135 of the encoding. |
|
136 @param aEncoding the encoding to prepare for. |
|
137 */ |
|
138 { |
|
139 // Get the charset uid |
|
140 if ((aCharSetUid = |
|
141 iCnvCharacterSetConverter->ConvertStandardNameOfCharacterSetToIdentifierL(aEncoding, iFs)) == 0) |
|
142 { |
|
143 User::Leave(KErrXmlUnsupportedCharacterSet); |
|
144 } |
|
145 |
|
146 |
|
147 // Prepare charconv to use this charset |
|
148 if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aCharSetUid, iFs) == |
|
149 CCnvCharacterSetConverter::ENotAvailable ) |
|
150 { |
|
151 User::Leave(KErrXmlUnavailableCharacterSet); // Unavailable |
|
152 } |
|
153 } |
|
154 |
|
155 |
|
156 |
|
157 EXPORT_C void CCharSetConverter::PrepareCharConvL(TUint& aCharSetUid, TInt aMibEnum) |
|
158 /** |
|
159 This method prepares CharConv to encode from the mib enum. |
|
160 |
|
161 @post CharConv has been prepared. |
|
162 |
|
163 @leave KErrXmlUnsupportedCharacterSet - Charset not supported. |
|
164 @leave KErrXmlUnavailableCharacterSet - Charset not available |
|
165 |
|
166 @param aCharSetUid On return, contains the character set identifier |
|
167 of the encoding. |
|
168 @param aMibEnum The IANA specified mib enum for this encoding |
|
169 |
|
170 @see http://www.iana.org/assignments/character-sets |
|
171 */ |
|
172 { |
|
173 // Get the charset uid |
|
174 if ((aCharSetUid = |
|
175 iCnvCharacterSetConverter->ConvertMibEnumOfCharacterSetToIdentifierL(aMibEnum, iFs)) == 0) |
|
176 { |
|
177 User::Leave(KErrXmlUnsupportedCharacterSet); // May want to try something else? |
|
178 } |
|
179 |
|
180 |
|
181 // Prepare charconv to use this charset |
|
182 if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aCharSetUid, iFs) == |
|
183 CCnvCharacterSetConverter::ENotAvailable ) |
|
184 { |
|
185 User::Leave(KErrXmlUnavailableCharacterSet); // Unavailable |
|
186 } |
|
187 } |
|
188 |
|
189 |
|
190 |
|
191 EXPORT_C TInt CCharSetConverter::ConvertToUnicodeL(TUint32 aSrcCharset, const TDesC8& aInputBuffer, |
|
192 HBufC16*& aUnicodeConversion) |
|
193 /** |
|
194 This method converts the given bytes to unicode. |
|
195 If this function leaves, memory is cleaned up. |
|
196 This overload allocates memory for the output itself. |
|
197 |
|
198 @return KErrNone if the conversion was succesfull |
|
199 or one of the error values defined in TError. |
|
200 |
|
201 @leave KErrXmlUnavailableCharacterSet - CharSet not available. |
|
202 |
|
203 @param aSrcCharset The character set encoding to convert from. |
|
204 @param aInputBuffer The characters to be converted. |
|
205 @param aUnicodeConversion On return, contains the unicode conversion. |
|
206 */ |
|
207 { |
|
208 if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aSrcCharset, iFs) == |
|
209 CCnvCharacterSetConverter::ENotAvailable) |
|
210 { |
|
211 User::Leave(KErrXmlUnavailableCharacterSet); |
|
212 } |
|
213 |
|
214 TInt maxLength = KMaxReadableBytes; |
|
215 aUnicodeConversion = HBufC16::NewL(maxLength); |
|
216 CleanupStack::PushL(TCleanupItem(DestroyHBufC16, &aUnicodeConversion));//push buffer's address |
|
217 |
|
218 |
|
219 TInt state = CCnvCharacterSetConverter::KStateDefault; |
|
220 TPtr16 remainingOutput(aUnicodeConversion->Des()); |
|
221 TInt unconverted = iCnvCharacterSetConverter->ConvertToUnicode(remainingOutput, aInputBuffer, state); |
|
222 |
|
223 // While there is still more data to convert |
|
224 while (0 < unconverted) |
|
225 { |
|
226 // Resize the buffer to hold more data |
|
227 maxLength += KMaxReadableBytes; |
|
228 aUnicodeConversion = aUnicodeConversion->ReAllocL(maxLength); |
|
229 |
|
230 // Segment the writable area |
|
231 TInt outputLength = aUnicodeConversion->Length(); |
|
232 TPtr16 remainingOutput1(&(aUnicodeConversion->Des())[0] + outputLength, 0, maxLength - outputLength); |
|
233 remainingOutput.Set(remainingOutput1); |
|
234 |
|
235 // Convert the data |
|
236 unconverted = iCnvCharacterSetConverter->ConvertToUnicode(remainingOutput, aInputBuffer.Right(unconverted), state); |
|
237 aUnicodeConversion->Des().SetLength(outputLength + remainingOutput.Length()); |
|
238 } |
|
239 |
|
240 // Reallocate to a minimally-sized buffer |
|
241 if (unconverted == 0) |
|
242 { |
|
243 aUnicodeConversion = aUnicodeConversion->ReAllocL(aUnicodeConversion->Length()); |
|
244 } |
|
245 |
|
246 CleanupStack::Pop(&aUnicodeConversion);//destroy the object pointed by the buffer wherever it is since we have got hold of the pointer (buffer)'s address |
|
247 return unconverted; // return error value if there is one. |
|
248 } |
|
249 |
|
250 |
|
251 EXPORT_C TInt CCharSetConverter::ConvertToUnicodeL(TUint32 aSrcCharset, |
|
252 const TDesC8& aInput, |
|
253 TPtr16& aOutput) |
|
254 /** |
|
255 This method converts the given bytes to unicode. |
|
256 If this function leaves, memory is cleaned up. |
|
257 This overload stores the conversion output in memory already allocated, for the sole use |
|
258 of the TPtr versions of overloaded ConvertToUnicodeL and ConvertFromUnicodeL functions. You must make sure you |
|
259 have finished with the output from a previous call to either (TPtr overload of) ConvertToUnicodeL |
|
260 or ConvertFromUnicodeL before calling either again, as the previous output will be overwritten with |
|
261 the new output. |
|
262 This version is more efficient than the HBufC alternative and so should be used whenever possible. |
|
263 |
|
264 @return KErrNone if the conversion was succesfull |
|
265 or one of the error values defined in TError. |
|
266 |
|
267 @leave KErrXmlUnavailableCharacterSet - CharSet not available. |
|
268 |
|
269 @param aSrcCharset The character set encoding to convert from. |
|
270 @param aInput The characters to be converted. |
|
271 @param aOutput On return, contains the unicode conversion. |
|
272 */ |
|
273 { |
|
274 if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aSrcCharset, iFs) == |
|
275 CCnvCharacterSetConverter::ENotAvailable) |
|
276 { |
|
277 User::Leave(KErrXmlUnavailableCharacterSet); |
|
278 } |
|
279 |
|
280 TInt state = CCnvCharacterSetConverter::KStateDefault; |
|
281 |
|
282 // Set up output descriptor reference: "Payload" is iConversionBuffer (a TAny *), it's initial |
|
283 // length is zero (because it's empty) and it's initial maximum length is the maximum number of |
|
284 // unicode characters which will fit into the current size of iConversion buffer |
|
285 aOutput.Set((TUint16*)iConversionBuffer, 0, iConversionBufferSize/sizeof(TUint16)); |
|
286 |
|
287 // Convert the data, returning the amount of characters that are unconverted, due to the output buffer being full |
|
288 TInt unconverted = iCnvCharacterSetConverter->ConvertToUnicode(aOutput, aInput, state); |
|
289 |
|
290 // While there is still more data to convert |
|
291 while (0 < unconverted) |
|
292 { |
|
293 TInt outputLength = aOutput.Length(); |
|
294 |
|
295 // Resize the buffer to hold more data |
|
296 iConversionBufferSize += KMaxReadableBytes; |
|
297 |
|
298 iConversionBuffer = User::Heap().ReAllocL(iConversionBuffer,iConversionBufferSize); |
|
299 if (iConversionBuffer == NULL) |
|
300 User::Leave(KErrNoMemory); |
|
301 |
|
302 // Reconstruct the output descriptor to point to the new buffer, setting current |
|
303 // length (the number of characters we've converted so far) and maximum length |
|
304 // (the number of unicode characters which will fit into the newly extended |
|
305 // iConversionBuffer) appropriately. |
|
306 aOutput.Set((TUint16*)iConversionBuffer, outputLength, iConversionBufferSize/sizeof(TUint16)); |
|
307 |
|
308 // Construct a modifiable pointer descriptor pointing to the the writable area of |
|
309 // iConversionBuffer |
|
310 TPtr16 remainingOutput(((TUint16*)iConversionBuffer)+outputLength, 0, aOutput.MaxLength() - outputLength); |
|
311 |
|
312 // Try to convert another chunk of data |
|
313 unconverted = iCnvCharacterSetConverter->ConvertToUnicode(remainingOutput, aInput.Right(unconverted), state); |
|
314 |
|
315 // Update the length of the output buffer to include the data we just converted. |
|
316 aOutput.SetLength(remainingOutput.Length()+outputLength); |
|
317 } |
|
318 |
|
319 return unconverted; // return error value if there is one. |
|
320 } |
|
321 |
|
322 |
|
323 |
|
324 EXPORT_C TInt CCharSetConverter::ConvertFromUnicodeL(const TDesC16& aUnicodeConversion, |
|
325 TUint32 aDestCharset, HBufC8*& aOutputBuffer) |
|
326 /** |
|
327 This method converts the given unicode to the specified encoding. |
|
328 If this function leaves, memory is cleaned up. |
|
329 This overload allocates memory for the output itself. |
|
330 |
|
331 @return KErrNone if the conversion was succesfull |
|
332 or one of the error values defined in TError. |
|
333 |
|
334 @leave KErrXmlUnavailableCharacterSet - Charset not available. |
|
335 |
|
336 @param aUnicodeConversion The unicode to convert. |
|
337 @param aDestCharset The character set encoding to convert to. |
|
338 @param aOutputBuffer On return, contains the specified conversion. |
|
339 */ |
|
340 { |
|
341 if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aDestCharset, iFs) == |
|
342 CCnvCharacterSetConverter::ENotAvailable) |
|
343 { |
|
344 User::Leave(KErrXmlUnavailableCharacterSet); |
|
345 } |
|
346 |
|
347 TInt maxLength = KMaxReadableBytes; |
|
348 aOutputBuffer = HBufC8::NewL(maxLength); |
|
349 CleanupStack::PushL(TCleanupItem(DestroyHBufC8, &aOutputBuffer));//push buffer's address |
|
350 |
|
351 TPtr8 remainingOutput(aOutputBuffer->Des()); |
|
352 TInt unconverted = iCnvCharacterSetConverter->ConvertFromUnicode(remainingOutput, aUnicodeConversion); |
|
353 |
|
354 // While there is still more data to convert |
|
355 while (0 < unconverted) |
|
356 { |
|
357 // Resize the buffer to hold more data |
|
358 maxLength += KMaxReadableBytes; |
|
359 aOutputBuffer = aOutputBuffer->ReAllocL(maxLength); |
|
360 |
|
361 // Segment the writable area |
|
362 TInt outputLength = aOutputBuffer->Length(); |
|
363 TPtr8 remainingOutput1(&(aOutputBuffer->Des())[0] + outputLength, 0, maxLength - outputLength); |
|
364 remainingOutput.Set(remainingOutput1); |
|
365 |
|
366 // Convert the data |
|
367 unconverted = iCnvCharacterSetConverter->ConvertFromUnicode(remainingOutput, aUnicodeConversion.Right(unconverted)); |
|
368 aOutputBuffer->Des().SetLength(outputLength + remainingOutput.Length()); |
|
369 } |
|
370 |
|
371 // Reallocate to a minimally-sized buffer |
|
372 if (unconverted == 0) |
|
373 { |
|
374 aOutputBuffer = aOutputBuffer->ReAllocL(aOutputBuffer->Length()); |
|
375 } |
|
376 |
|
377 CleanupStack::Pop(&aOutputBuffer);//destroy the object pointed by the buffer wherever it is since we have got hold of the pointer (buffer)'s address |
|
378 |
|
379 return unconverted; // return error value if there is one. |
|
380 } |
|
381 |
|
382 |
|
383 EXPORT_C TInt CCharSetConverter::ConvertFromUnicodeL(const TDesC16& aInput, |
|
384 TUint32 aDestCharset, |
|
385 TPtr8& aOutput) |
|
386 /** |
|
387 This method converts the given unicode to the specified encoding. |
|
388 If this function leaves, memory is cleaned up. |
|
389 This overload stores the conversion output in memory already allocated, for the sole use |
|
390 of the TPtr versions of overloaded ConvertToUnicodeL and ConvertFromUnicodeL functions. You must make sure you |
|
391 have finished with the output from a previous call to either (TPtr overload of) ConvertToUnicodeL |
|
392 or ConvertFromUnicodeL before calling either again, as the previous output will be overwritten with |
|
393 the new output. |
|
394 This version is more efficient than the HBufC alternative and so should be used whenever possible. |
|
395 |
|
396 @return KErrNone if the conversion was succesfull |
|
397 or one of the error values defined in TError. |
|
398 |
|
399 @leave KErrXmlUnavailableCharacterSet - Charset not available. |
|
400 |
|
401 @param aInput The unicode to convert. |
|
402 @param aDestCharset The character set encoding to convert to. |
|
403 @param aOutput The characters after conversion. |
|
404 */ |
|
405 { |
|
406 if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aDestCharset, iFs) == |
|
407 CCnvCharacterSetConverter::ENotAvailable) |
|
408 { |
|
409 User::Leave(KErrXmlUnavailableCharacterSet); |
|
410 } |
|
411 // Set up output descriptor reference: "Payload" is iConversionBuffer (a TAny *), it's initial |
|
412 // length is zero (because it's empty). |
|
413 aOutput.Set((TUint8*)iConversionBuffer, 0, iConversionBufferSize); |
|
414 |
|
415 // Convert the data, returning the amount of characters that are unconverted, due to the output buffer being full |
|
416 TInt unconverted = iCnvCharacterSetConverter->ConvertFromUnicode(aOutput, aInput); |
|
417 |
|
418 // While there is still more data to convert |
|
419 while (0 < unconverted) |
|
420 { |
|
421 TInt outputLength = aOutput.Length(); |
|
422 |
|
423 // Resize the buffer to hold the remaining data |
|
424 iConversionBufferSize += KMaxReadableBytes; |
|
425 |
|
426 iConversionBuffer = User::Heap().ReAllocL(iConversionBuffer,iConversionBufferSize); |
|
427 if (iConversionBuffer == NULL) |
|
428 User::Leave(KErrNoMemory); |
|
429 aOutput.Set((TUint8*)iConversionBuffer,iConversionBufferSize,iConversionBufferSize); |
|
430 |
|
431 // Construct a modifiable pointer descriptor pointing to the the writable area of |
|
432 // iConversionBuffer |
|
433 TPtr8 remainingOutput(((TUint8*)iConversionBuffer) + outputLength, 0, iConversionBufferSize - outputLength); |
|
434 |
|
435 // Try to convert another chunk of data |
|
436 unconverted = iCnvCharacterSetConverter->ConvertFromUnicode(remainingOutput, aInput.Right(unconverted)); |
|
437 |
|
438 // Update the length of the output buffer to include the data we just converted. |
|
439 aOutput.SetLength(remainingOutput.Length()+outputLength); |
|
440 } |
|
441 |
|
442 return unconverted; // return error value if there is one. |
|
443 } |
|
444 |
|
445 |
|
446 EXPORT_C void CCharSetConverter::PrepareToConvertToOrFromL(TUint32 aCharSetUid) |
|
447 /** |
|
448 This method is a helper function that prepares CharConv for a conversion. |
|
449 |
|
450 @see CCnvCharacterSetConverter::PrepareToConvertToOrFromL |
|
451 @post CharConv is ready for the conversion or not. |
|
452 |
|
453 @leave KErrXmlUnavailableCharacterSet - Charset not available. |
|
454 |
|
455 @param aCharSetUid The character set encoding to convert to. |
|
456 */ |
|
457 { |
|
458 if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aCharSetUid, iFs) == |
|
459 CCnvCharacterSetConverter::ENotAvailable ) |
|
460 { |
|
461 User::Leave(KErrXmlUnavailableCharacterSet); |
|
462 } |
|
463 } |
|
464 |
|
465 |
|
466 |
|
467 EXPORT_C void CCharSetConverter::ConvertCharacterSetIdentifierToStandardNameL(TUint32 aCharSetUid, |
|
468 HBufC8*& aCharSet) |
|
469 /** |
|
470 This method is a helper function that obtains a standand character |
|
471 encoding name from a character set identifer. |
|
472 |
|
473 @see CCnvCharacterSetConverter::ConvertCharacterSetIdentifierToStandardNameL |
|
474 |
|
475 @leave KErrXmlUnsupportedCharacterSet If the character set is not known. |
|
476 |
|
477 @param aCharSetUid The character set to obtain the name for. |
|
478 @param aCharSet On return holds the Internet-standard name |
|
479 or MIME name of the character set. |
|
480 The name is encoded in 8 bit ASCII. |
|
481 */ |
|
482 { |
|
483 if ((aCharSet = |
|
484 iCnvCharacterSetConverter-> |
|
485 ConvertCharacterSetIdentifierToStandardNameL(aCharSetUid, iFs)) == NULL) |
|
486 { |
|
487 User::Leave(KErrXmlUnsupportedCharacterSet); |
|
488 } |
|
489 } |
|
490 |
|
491 |
|
492 |
|
493 EXPORT_C TInt CCharSetConverter::ConvertUcs4CharactersToEncodingL(TUint32* aUcs4Src, |
|
494 TInt aUcs4Count, |
|
495 TUint32 aDestCharset, |
|
496 HBufC8*& aConversion) |
|
497 /** |
|
498 This method converts ucs-4 characters to the desired non-modal encoding. |
|
499 aConversion should be NULL on calling of this function. |
|
500 If this function leaves, memory is cleaned up. |
|
501 There is no TPtr overload of this method, as currently it is only called a few times and so would not |
|
502 produce any noticable benefits. |
|
503 |
|
504 @return CCharSetConverter::ConvertFromUnicodeL. |
|
505 |
|
506 @leave KErrXmlBadCharacterConversion |
|
507 |
|
508 @param aUcs4Src list of ucs-4 characters. |
|
509 @param aUcs4Count number of ucs4 characters. |
|
510 @param aDestCharset the desired encoding. |
|
511 @param aConversion On return, points to the converted encoding. |
|
512 */ |
|
513 { |
|
514 // convert ucs-4 to ucs-2 |
|
515 |
|
516 // Find the length of the output |
|
517 TText16 buf[2]; |
|
518 TInt length = 0; |
|
519 TUint32* src = NULL; |
|
520 |
|
521 for (src = aUcs4Src; src != (aUcs4Src + aUcs4Count); ++src) |
|
522 { |
|
523 // Convert a single character into the buffer, discard the result |
|
524 // but increase the length by the number of UTF16 codes output. |
|
525 length += Utf32ToUtf16(buf, *src) - buf; |
|
526 } |
|
527 |
|
528 HBufC16* utf16Out = HBufC16::NewL(length); |
|
529 CleanupStack::PushL(utf16Out); |
|
530 |
|
531 utf16Out->Des().SetLength(length); |
|
532 |
|
533 TText16* p = &((utf16Out->Des())[0]); |
|
534 |
|
535 |
|
536 // go through characters converting to ucs2. |
|
537 for (src = aUcs4Src; src != aUcs4Src + aUcs4Count; ++src) |
|
538 { |
|
539 // convert each ucs4 character |
|
540 p = Utf32ToUtf16(p, *src); |
|
541 } |
|
542 |
|
543 // convert from ucs2 to desired encoding |
|
544 aConversion = NULL; |
|
545 TInt ret = 0; |
|
546 |
|
547 //HBufC overload of this method called, due to the need pass back the HBufC to the calling method |
|
548 ret = CCharSetConverter::ConvertFromUnicodeL(*utf16Out, aDestCharset, aConversion); |
|
549 CleanupStack::PushL(aConversion); |
|
550 |
|
551 if(ret > KErrNone) |
|
552 { |
|
553 // CharConv couldn't convert all the bytes. Character encoding may be truncated. |
|
554 User::Leave(KErrXmlBadCharacterConversion); |
|
555 } |
|
556 |
|
557 CleanupStack::Pop(aConversion); |
|
558 CleanupStack::PopAndDestroy(utf16Out); |
|
559 return(ret); |
|
560 } |
|
561 |
|
562 |
|
563 |
|
564 TText16* CCharSetConverter::Utf32ToUtf16(TText16* aUtf16Out, TUint32 aUtf32) |
|
565 /** |
|
566 This method converts a ucs-4 character to unicode. |
|
567 |
|
568 @return Pointer to the next free byte in the output buffer. |
|
569 |
|
570 @param aUtf16Out On return, contains the unicode character conversion. |
|
571 @param aUtf32 The ucs-4 character |
|
572 */ |
|
573 { |
|
574 if (aUtf32 <= 0xFFFF) |
|
575 { |
|
576 // UTF32 (or UCS4) should not have characters in the range |
|
577 // D800-DBFF (high surrogate) and DC00-DFFF (low surrogate) in it, |
|
578 // as these are the surrogates that make up the extension mechanism for |
|
579 // fitting Unicode into 16 bits. |
|
580 // In principle, surrogates in UCS-4 should be ignored. |
|
581 // They are considered a bad thing because they might be an aliasing |
|
582 // problem: one thing looking like another. |
|
583 // In practice I don't think it is a problem here. |
|
584 // If you like, you could reject any character between D800 to DFFF. |
|
585 |
|
586 // could weed out unpaired surrogates here, but... |
|
587 *aUtf16Out = static_cast<TText16>(aUtf32); |
|
588 return aUtf16Out + 1; |
|
589 } |
|
590 |
|
591 // A way to visualise the use of surrogate pairs is to imaging planes. |
|
592 // The surrogate is located on plane zero and identifies the actual plane |
|
593 // this character resides in. |
|
594 // This is why for supplementary characters we must insert the surrogates |
|
595 // so that charconv can convert correctly. |
|
596 // |
|
597 // 0 D800 |
|
598 // | | DFFF |
|
599 // | | | E000 10FFF |
|
600 // | | | | | |
|
601 // xxxxxxYxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx |
|
602 // ^ |
|
603 // For Utf32 this means nothing. |
|
604 // |
|
605 // For Utf16 if the following bit pattern is located then it corresponds to a |
|
606 // supplementary character. |
|
607 // |
|
608 // ^ |
|
609 // D800 DC00 DFFF |
|
610 // | | | |
|
611 // yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy |
|
612 // High Low |
|
613 // |
|
614 // 1101 10.. ........ 1101 11.. ........ |
|
615 // --10 bit-- --10 bit-- |
|
616 // |
|
617 // |
|
618 // Add 10000 to both 10-bit values and the offset to the correct character is obtained. |
|
619 // |
|
620 // |
|
621 // So a test of this function would be to check that the value returned |
|
622 // matches a utf-8 character encoding manually calculated from the original ucs4 |
|
623 // value. |
|
624 |
|
625 |
|
626 // We have a supplementary character consists of 5 nibbles (20 bits) |
|
627 // with no surrogates. |
|
628 // We have to insert the surrogate pair on the values minus 0x10000. |
|
629 // b0-b9 is the low order value, b10-b19 is the high order value. |
|
630 // b19....b10 b9....b0 |
|
631 // high low |
|
632 // |
|
633 // Character values. |
|
634 // Basic 0x0-0xFFFF |
|
635 // Supplimentary 0x10000-0x10FFFF |
|
636 // so 0x10000 >> 10 = 0x43FF |
|
637 |
|
638 // To add the surrogate to the high order: |
|
639 // |
|
640 // ((utf32-0x10000)>>10)+0xD800 |
|
641 // = (utf32>>10)-(0x10000>>10)+0xD800 |
|
642 // = (utf32>>10)+(0xD800-0x40) |
|
643 // = (utf32>>10)+0xD7C0 |
|
644 |
|
645 aUtf16Out[0] = static_cast<TText16>((aUtf32 >> 10) + 0xD7C0); |
|
646 |
|
647 // To add the surrogate to the low order: |
|
648 // |
|
649 // ((utf32-0x10000) & 0x3FF)+0xDC00 |
|
650 // = ((utf32 & 0x3FF) - (0x10000 & 0x3FF)) + 0xDC00 |
|
651 // = ((utf32 & 0x3FF) - (0)) + 0xDC00 |
|
652 // = (utf32 & 0x3FF) + 0xDC00 |
|
653 |
|
654 aUtf16Out[1] = static_cast<TText16>(0xDC00 | (aUtf32 & 0x3FF)); |
|
655 |
|
656 return aUtf16Out + 2; |
|
657 } |