diff -r 000000000000 -r e35f40988205 xml/xmlfw/src/xmlframework/charsetconverter.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xml/xmlfw/src/xmlframework/charsetconverter.cpp Thu Dec 17 09:29:21 2009 +0200 @@ -0,0 +1,657 @@ +// Copyright (c) 2003-2009 Nokia Corporation and/or its subsidiary(-ies). +// All rights reserved. +// This component and the accompanying materials are made available +// under the terms of "Eclipse Public License v1.0" +// which accompanies this distribution, and is available +// at the URL "http://www.eclipse.org/legal/epl-v10.html". +// +// Initial Contributors: +// Nokia Corporation - initial contribution. +// +// Contributors: +// +// Description: +// + +#include +#include +#include + +#include +#include + +using namespace Xml; + +/** +The maximum number of bytes used for conversion at any time. +This is also used to size the necessary buffers used in the conversions. + +@internalTechnology +*/ +const TInt KMaxReadableBytes = 512; + + + +LOCAL_C void DestroyHBufC16(TAny* aHBufC) +/** +This method is used when pointer reallocation is needed and the pointer needs to be +cleaned via the cleanup stack. + +@param aHBufC the wide buffer. +@internalTechnology + +*/ + { + delete *static_cast(aHBufC); + } + + + +LOCAL_C void DestroyHBufC8(TAny* aHBufC) +/** +This method is used when pointer reallocation is needed and the pointer needs to be +cleaned via the cleanup stack. + +@param aHBufC the narrow buffer. +@internalTechnology + +*/ + { + delete *static_cast(aHBufC); + } + + + +CCharSetConverter::CCharSetConverter() +/** +Default Constructor + +*/ + { + // do nothing; + } + + + +/** +This method creates an instance of this class. +The framework is responsible for creating this object. + +@leave ... One of the system wide error codes e.g. KErrNoMemory +@return The new'ed object. +@internalTechnology +*/ +CCharSetConverter* CCharSetConverter::NewL() + { + CCharSetConverter* self = new(ELeave) CCharSetConverter(); + CleanupStack::PushL(self); + self->ConstructL(); + CleanupStack::Pop(self); + return(self); + } + + + +void CCharSetConverter::ConstructL() +/** +This method provides some construction of this object. + +*/ + { + iCnvCharacterSetConverter = CCnvCharacterSetConverter::NewL(); + User::LeaveIfError(iFs.Connect()); + iConversionBuffer = User::Heap().AllocL(KMaxReadableBytes); + iConversionBufferSize = KMaxReadableBytes; + } + + + +CCharSetConverter::~CCharSetConverter() +/** +Destructor. +The framework is responsible for destroying this object. + +@post This object is properly destroyed. + +*/ + { + iFs.Close(); + delete iCnvCharacterSetConverter; + delete iConversionBuffer; + } + + + +EXPORT_C void CCharSetConverter::PrepareCharConvL(TUint& aCharSetUid, const TDesC8& aEncoding) +/** +This method prepares CharConv to encode from the standard name. + +@post CharConv has been prepared. + +@leave KErrXmlUnsupportedCharacterSet - Charset not supported. +@leave KErrXmlUnavailableCharacterSet - Charset not available + +@param aCharSetUid On return, contains the character set identifier + of the encoding. +@param aEncoding the encoding to prepare for. +*/ + { + // Get the charset uid + if ((aCharSetUid = + iCnvCharacterSetConverter->ConvertStandardNameOfCharacterSetToIdentifierL(aEncoding, iFs)) == 0) + { + User::Leave(KErrXmlUnsupportedCharacterSet); + } + + + // Prepare charconv to use this charset + if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aCharSetUid, iFs) == + CCnvCharacterSetConverter::ENotAvailable ) + { + User::Leave(KErrXmlUnavailableCharacterSet); // Unavailable + } + } + + + +EXPORT_C void CCharSetConverter::PrepareCharConvL(TUint& aCharSetUid, TInt aMibEnum) +/** +This method prepares CharConv to encode from the mib enum. + +@post CharConv has been prepared. + +@leave KErrXmlUnsupportedCharacterSet - Charset not supported. +@leave KErrXmlUnavailableCharacterSet - Charset not available + +@param aCharSetUid On return, contains the character set identifier + of the encoding. +@param aMibEnum The IANA specified mib enum for this encoding + +@see http://www.iana.org/assignments/character-sets +*/ + { + // Get the charset uid + if ((aCharSetUid = + iCnvCharacterSetConverter->ConvertMibEnumOfCharacterSetToIdentifierL(aMibEnum, iFs)) == 0) + { + User::Leave(KErrXmlUnsupportedCharacterSet); // May want to try something else? + } + + + // Prepare charconv to use this charset + if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aCharSetUid, iFs) == + CCnvCharacterSetConverter::ENotAvailable ) + { + User::Leave(KErrXmlUnavailableCharacterSet); // Unavailable + } + } + + + +EXPORT_C TInt CCharSetConverter::ConvertToUnicodeL(TUint32 aSrcCharset, const TDesC8& aInputBuffer, + HBufC16*& aUnicodeConversion) +/** +This method converts the given bytes to unicode. +If this function leaves, memory is cleaned up. +This overload allocates memory for the output itself. + +@return KErrNone if the conversion was succesfull + or one of the error values defined in TError. + +@leave KErrXmlUnavailableCharacterSet - CharSet not available. + +@param aSrcCharset The character set encoding to convert from. +@param aInputBuffer The characters to be converted. +@param aUnicodeConversion On return, contains the unicode conversion. +*/ + { + if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aSrcCharset, iFs) == + CCnvCharacterSetConverter::ENotAvailable) + { + User::Leave(KErrXmlUnavailableCharacterSet); + } + + TInt maxLength = KMaxReadableBytes; + aUnicodeConversion = HBufC16::NewL(maxLength); + CleanupStack::PushL(TCleanupItem(DestroyHBufC16, &aUnicodeConversion));//push buffer's address + + + TInt state = CCnvCharacterSetConverter::KStateDefault; + TPtr16 remainingOutput(aUnicodeConversion->Des()); + TInt unconverted = iCnvCharacterSetConverter->ConvertToUnicode(remainingOutput, aInputBuffer, state); + + // While there is still more data to convert + while (0 < unconverted) + { + // Resize the buffer to hold more data + maxLength += KMaxReadableBytes; + aUnicodeConversion = aUnicodeConversion->ReAllocL(maxLength); + + // Segment the writable area + TInt outputLength = aUnicodeConversion->Length(); + TPtr16 remainingOutput1(&(aUnicodeConversion->Des())[0] + outputLength, 0, maxLength - outputLength); + remainingOutput.Set(remainingOutput1); + + // Convert the data + unconverted = iCnvCharacterSetConverter->ConvertToUnicode(remainingOutput, aInputBuffer.Right(unconverted), state); + aUnicodeConversion->Des().SetLength(outputLength + remainingOutput.Length()); + } + + // Reallocate to a minimally-sized buffer + if (unconverted == 0) + { + aUnicodeConversion = aUnicodeConversion->ReAllocL(aUnicodeConversion->Length()); + } + + CleanupStack::Pop(&aUnicodeConversion);//destroy the object pointed by the buffer wherever it is since we have got hold of the pointer (buffer)'s address + return unconverted; // return error value if there is one. + } + + +EXPORT_C TInt CCharSetConverter::ConvertToUnicodeL(TUint32 aSrcCharset, + const TDesC8& aInput, + TPtr16& aOutput) +/** +This method converts the given bytes to unicode. +If this function leaves, memory is cleaned up. +This overload stores the conversion output in memory already allocated, for the sole use +of the TPtr versions of overloaded ConvertToUnicodeL and ConvertFromUnicodeL functions. You must make sure you +have finished with the output from a previous call to either (TPtr overload of) ConvertToUnicodeL +or ConvertFromUnicodeL before calling either again, as the previous output will be overwritten with +the new output. +This version is more efficient than the HBufC alternative and so should be used whenever possible. + +@return KErrNone if the conversion was succesfull + or one of the error values defined in TError. + +@leave KErrXmlUnavailableCharacterSet - CharSet not available. + +@param aSrcCharset The character set encoding to convert from. +@param aInput The characters to be converted. +@param aOutput On return, contains the unicode conversion. +*/ + { + if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aSrcCharset, iFs) == + CCnvCharacterSetConverter::ENotAvailable) + { + User::Leave(KErrXmlUnavailableCharacterSet); + } + + TInt state = CCnvCharacterSetConverter::KStateDefault; + + // Set up output descriptor reference: "Payload" is iConversionBuffer (a TAny *), it's initial + // length is zero (because it's empty) and it's initial maximum length is the maximum number of + // unicode characters which will fit into the current size of iConversion buffer + aOutput.Set((TUint16*)iConversionBuffer, 0, iConversionBufferSize/sizeof(TUint16)); + + // Convert the data, returning the amount of characters that are unconverted, due to the output buffer being full + TInt unconverted = iCnvCharacterSetConverter->ConvertToUnicode(aOutput, aInput, state); + + // While there is still more data to convert + while (0 < unconverted) + { + TInt outputLength = aOutput.Length(); + + // Resize the buffer to hold more data + iConversionBufferSize += KMaxReadableBytes; + + iConversionBuffer = User::Heap().ReAllocL(iConversionBuffer,iConversionBufferSize); + if (iConversionBuffer == NULL) + User::Leave(KErrNoMemory); + + // Reconstruct the output descriptor to point to the new buffer, setting current + // length (the number of characters we've converted so far) and maximum length + // (the number of unicode characters which will fit into the newly extended + // iConversionBuffer) appropriately. + aOutput.Set((TUint16*)iConversionBuffer, outputLength, iConversionBufferSize/sizeof(TUint16)); + + // Construct a modifiable pointer descriptor pointing to the the writable area of + // iConversionBuffer + TPtr16 remainingOutput(((TUint16*)iConversionBuffer)+outputLength, 0, aOutput.MaxLength() - outputLength); + + // Try to convert another chunk of data + unconverted = iCnvCharacterSetConverter->ConvertToUnicode(remainingOutput, aInput.Right(unconverted), state); + + // Update the length of the output buffer to include the data we just converted. + aOutput.SetLength(remainingOutput.Length()+outputLength); + } + + return unconverted; // return error value if there is one. + } + + + +EXPORT_C TInt CCharSetConverter::ConvertFromUnicodeL(const TDesC16& aUnicodeConversion, + TUint32 aDestCharset, HBufC8*& aOutputBuffer) +/** +This method converts the given unicode to the specified encoding. +If this function leaves, memory is cleaned up. +This overload allocates memory for the output itself. + +@return KErrNone if the conversion was succesfull + or one of the error values defined in TError. + +@leave KErrXmlUnavailableCharacterSet - Charset not available. + +@param aUnicodeConversion The unicode to convert. +@param aDestCharset The character set encoding to convert to. +@param aOutputBuffer On return, contains the specified conversion. +*/ + { + if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aDestCharset, iFs) == + CCnvCharacterSetConverter::ENotAvailable) + { + User::Leave(KErrXmlUnavailableCharacterSet); + } + + TInt maxLength = KMaxReadableBytes; + aOutputBuffer = HBufC8::NewL(maxLength); + CleanupStack::PushL(TCleanupItem(DestroyHBufC8, &aOutputBuffer));//push buffer's address + + TPtr8 remainingOutput(aOutputBuffer->Des()); + TInt unconverted = iCnvCharacterSetConverter->ConvertFromUnicode(remainingOutput, aUnicodeConversion); + + // While there is still more data to convert + while (0 < unconverted) + { + // Resize the buffer to hold more data + maxLength += KMaxReadableBytes; + aOutputBuffer = aOutputBuffer->ReAllocL(maxLength); + + // Segment the writable area + TInt outputLength = aOutputBuffer->Length(); + TPtr8 remainingOutput1(&(aOutputBuffer->Des())[0] + outputLength, 0, maxLength - outputLength); + remainingOutput.Set(remainingOutput1); + + // Convert the data + unconverted = iCnvCharacterSetConverter->ConvertFromUnicode(remainingOutput, aUnicodeConversion.Right(unconverted)); + aOutputBuffer->Des().SetLength(outputLength + remainingOutput.Length()); + } + + // Reallocate to a minimally-sized buffer + if (unconverted == 0) + { + aOutputBuffer = aOutputBuffer->ReAllocL(aOutputBuffer->Length()); + } + + CleanupStack::Pop(&aOutputBuffer);//destroy the object pointed by the buffer wherever it is since we have got hold of the pointer (buffer)'s address + + return unconverted; // return error value if there is one. + } + + +EXPORT_C TInt CCharSetConverter::ConvertFromUnicodeL(const TDesC16& aInput, + TUint32 aDestCharset, + TPtr8& aOutput) +/** +This method converts the given unicode to the specified encoding. +If this function leaves, memory is cleaned up. +This overload stores the conversion output in memory already allocated, for the sole use +of the TPtr versions of overloaded ConvertToUnicodeL and ConvertFromUnicodeL functions. You must make sure you +have finished with the output from a previous call to either (TPtr overload of) ConvertToUnicodeL +or ConvertFromUnicodeL before calling either again, as the previous output will be overwritten with +the new output. +This version is more efficient than the HBufC alternative and so should be used whenever possible. + +@return KErrNone if the conversion was succesfull + or one of the error values defined in TError. + +@leave KErrXmlUnavailableCharacterSet - Charset not available. + +@param aInput The unicode to convert. +@param aDestCharset The character set encoding to convert to. +@param aOutput The characters after conversion. +*/ + { + if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aDestCharset, iFs) == + CCnvCharacterSetConverter::ENotAvailable) + { + User::Leave(KErrXmlUnavailableCharacterSet); + } + // Set up output descriptor reference: "Payload" is iConversionBuffer (a TAny *), it's initial + // length is zero (because it's empty). + aOutput.Set((TUint8*)iConversionBuffer, 0, iConversionBufferSize); + + // Convert the data, returning the amount of characters that are unconverted, due to the output buffer being full + TInt unconverted = iCnvCharacterSetConverter->ConvertFromUnicode(aOutput, aInput); + + // While there is still more data to convert + while (0 < unconverted) + { + TInt outputLength = aOutput.Length(); + + // Resize the buffer to hold the remaining data + iConversionBufferSize += KMaxReadableBytes; + + iConversionBuffer = User::Heap().ReAllocL(iConversionBuffer,iConversionBufferSize); + if (iConversionBuffer == NULL) + User::Leave(KErrNoMemory); + aOutput.Set((TUint8*)iConversionBuffer,iConversionBufferSize,iConversionBufferSize); + + // Construct a modifiable pointer descriptor pointing to the the writable area of + // iConversionBuffer + TPtr8 remainingOutput(((TUint8*)iConversionBuffer) + outputLength, 0, iConversionBufferSize - outputLength); + + // Try to convert another chunk of data + unconverted = iCnvCharacterSetConverter->ConvertFromUnicode(remainingOutput, aInput.Right(unconverted)); + + // Update the length of the output buffer to include the data we just converted. + aOutput.SetLength(remainingOutput.Length()+outputLength); + } + + return unconverted; // return error value if there is one. + } + + +EXPORT_C void CCharSetConverter::PrepareToConvertToOrFromL(TUint32 aCharSetUid) +/** +This method is a helper function that prepares CharConv for a conversion. + +@see CCnvCharacterSetConverter::PrepareToConvertToOrFromL +@post CharConv is ready for the conversion or not. + +@leave KErrXmlUnavailableCharacterSet - Charset not available. + +@param aCharSetUid The character set encoding to convert to. +*/ + { + if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aCharSetUid, iFs) == + CCnvCharacterSetConverter::ENotAvailable ) + { + User::Leave(KErrXmlUnavailableCharacterSet); + } + } + + + +EXPORT_C void CCharSetConverter::ConvertCharacterSetIdentifierToStandardNameL(TUint32 aCharSetUid, + HBufC8*& aCharSet) +/** +This method is a helper function that obtains a standand character +encoding name from a character set identifer. + +@see CCnvCharacterSetConverter::ConvertCharacterSetIdentifierToStandardNameL + +@leave KErrXmlUnsupportedCharacterSet If the character set is not known. + +@param aCharSetUid The character set to obtain the name for. +@param aCharSet On return holds the Internet-standard name + or MIME name of the character set. + The name is encoded in 8 bit ASCII. +*/ + { + if ((aCharSet = + iCnvCharacterSetConverter-> + ConvertCharacterSetIdentifierToStandardNameL(aCharSetUid, iFs)) == NULL) + { + User::Leave(KErrXmlUnsupportedCharacterSet); + } + } + + + +EXPORT_C TInt CCharSetConverter::ConvertUcs4CharactersToEncodingL(TUint32* aUcs4Src, + TInt aUcs4Count, + TUint32 aDestCharset, + HBufC8*& aConversion) +/** +This method converts ucs-4 characters to the desired non-modal encoding. +aConversion should be NULL on calling of this function. +If this function leaves, memory is cleaned up. +There is no TPtr overload of this method, as currently it is only called a few times and so would not +produce any noticable benefits. + +@return CCharSetConverter::ConvertFromUnicodeL. + +@leave KErrXmlBadCharacterConversion + +@param aUcs4Src list of ucs-4 characters. +@param aUcs4Count number of ucs4 characters. +@param aDestCharset the desired encoding. +@param aConversion On return, points to the converted encoding. +*/ + { + // convert ucs-4 to ucs-2 + + // Find the length of the output + TText16 buf[2]; + TInt length = 0; + TUint32* src = NULL; + + for (src = aUcs4Src; src != (aUcs4Src + aUcs4Count); ++src) + { + // Convert a single character into the buffer, discard the result + // but increase the length by the number of UTF16 codes output. + length += Utf32ToUtf16(buf, *src) - buf; + } + + HBufC16* utf16Out = HBufC16::NewL(length); + CleanupStack::PushL(utf16Out); + + utf16Out->Des().SetLength(length); + + TText16* p = &((utf16Out->Des())[0]); + + + // go through characters converting to ucs2. + for (src = aUcs4Src; src != aUcs4Src + aUcs4Count; ++src) + { + // convert each ucs4 character + p = Utf32ToUtf16(p, *src); + } + + // convert from ucs2 to desired encoding + aConversion = NULL; + TInt ret = 0; + + //HBufC overload of this method called, due to the need pass back the HBufC to the calling method + ret = CCharSetConverter::ConvertFromUnicodeL(*utf16Out, aDestCharset, aConversion); + CleanupStack::PushL(aConversion); + + if(ret > KErrNone) + { + // CharConv couldn't convert all the bytes. Character encoding may be truncated. + User::Leave(KErrXmlBadCharacterConversion); + } + + CleanupStack::Pop(aConversion); + CleanupStack::PopAndDestroy(utf16Out); + return(ret); + } + + + +TText16* CCharSetConverter::Utf32ToUtf16(TText16* aUtf16Out, TUint32 aUtf32) +/** +This method converts a ucs-4 character to unicode. + +@return Pointer to the next free byte in the output buffer. + +@param aUtf16Out On return, contains the unicode character conversion. +@param aUtf32 The ucs-4 character +*/ + { + if (aUtf32 <= 0xFFFF) + { + // UTF32 (or UCS4) should not have characters in the range + // D800-DBFF (high surrogate) and DC00-DFFF (low surrogate) in it, + // as these are the surrogates that make up the extension mechanism for + // fitting Unicode into 16 bits. + // In principle, surrogates in UCS-4 should be ignored. + // They are considered a bad thing because they might be an aliasing + // problem: one thing looking like another. + // In practice I don't think it is a problem here. + // If you like, you could reject any character between D800 to DFFF. + + // could weed out unpaired surrogates here, but... + *aUtf16Out = static_cast(aUtf32); + return aUtf16Out + 1; + } + + // A way to visualise the use of surrogate pairs is to imaging planes. + // The surrogate is located on plane zero and identifies the actual plane + // this character resides in. + // This is why for supplementary characters we must insert the surrogates + // so that charconv can convert correctly. + // + // 0 D800 + // | | DFFF + // | | | E000 10FFF + // | | | | | + // xxxxxxYxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + // ^ + // For Utf32 this means nothing. + // + // For Utf16 if the following bit pattern is located then it corresponds to a + // supplementary character. + // + // ^ + // D800 DC00 DFFF + // | | | + // yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy + // High Low + // + // 1101 10.. ........ 1101 11.. ........ + // --10 bit-- --10 bit-- + // + // + // Add 10000 to both 10-bit values and the offset to the correct character is obtained. + // + // + // So a test of this function would be to check that the value returned + // matches a utf-8 character encoding manually calculated from the original ucs4 + // value. + + + // We have a supplementary character consists of 5 nibbles (20 bits) + // with no surrogates. + // We have to insert the surrogate pair on the values minus 0x10000. + // b0-b9 is the low order value, b10-b19 is the high order value. + // b19....b10 b9....b0 + // high low + // + // Character values. + // Basic 0x0-0xFFFF + // Supplimentary 0x10000-0x10FFFF + // so 0x10000 >> 10 = 0x43FF + + // To add the surrogate to the high order: + // + // ((utf32-0x10000)>>10)+0xD800 + // = (utf32>>10)-(0x10000>>10)+0xD800 + // = (utf32>>10)+(0xD800-0x40) + // = (utf32>>10)+0xD7C0 + + aUtf16Out[0] = static_cast((aUtf32 >> 10) + 0xD7C0); + + // To add the surrogate to the low order: + // + // ((utf32-0x10000) & 0x3FF)+0xDC00 + // = ((utf32 & 0x3FF) - (0x10000 & 0x3FF)) + 0xDC00 + // = ((utf32 & 0x3FF) - (0)) + 0xDC00 + // = (utf32 & 0x3FF) + 0xDC00 + + aUtf16Out[1] = static_cast(0xDC00 | (aUtf32 & 0x3FF)); + + return aUtf16Out + 2; + }