xml/xmlfw/src/xmlframework/charsetconverter.cpp
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Thu, 17 Dec 2009 09:29:21 +0200
changeset 0 e35f40988205
permissions -rw-r--r--
Revision: 200947 Kit: 200951

// Copyright (c) 2003-2009 Nokia Corporation and/or its subsidiary(-ies).
// All rights reserved.
// This component and the accompanying materials are made available
// under the terms of "Eclipse Public License v1.0"
// which accompanies this distribution, and is available
// at the URL "http://www.eclipse.org/legal/epl-v10.html".
//
// Initial Contributors:
// Nokia Corporation - initial contribution.
//
// Contributors:
//
// Description:
//

#include <e32std.h>
#include <utf.h>
#include <charconv.h>

#include <xml/plugins/charsetconverter.h>
#include <xml/xmlframeworkerrors.h>

using namespace Xml;

/**
The chunk size used by the conversion routines in this file.
It is the initial size of every conversion output buffer and the amount by which
such a buffer is grown each time a conversion pass leaves input unconverted.
For the raw member buffer and the 8-bit heap buffers the unit is bytes; the
HBufC16 overload of ConvertToUnicodeL uses the same number as a count of 16-bit characters.

@internalTechnology
*/
const TInt KMaxReadableBytes = 512;



LOCAL_C void DestroyHBufC16(TAny* aHBufC)
/**
Cleanup-stack callback for a wide heap descriptor that may be reallocated while it is
on the cleanup stack.
The cleanup item stores the address of the owning pointer rather than the buffer itself,
so whichever buffer the pointer refers to at the time of the leave is the one deleted.
The owning pointer is reset to NULL afterwards so that the owner is not left holding a
dangling pointer.

@param				aHBufC address of the pointer (HBufC**) that owns the wide buffer.
@internalTechnology

*/
	{
	HBufC** owner = static_cast<HBufC**>(aHBufC);
	delete *owner;
	*owner = NULL;
	}



LOCAL_C void DestroyHBufC8(TAny* aHBufC)
/**
Cleanup-stack callback for a narrow heap descriptor that may be reallocated while it is
on the cleanup stack.
The cleanup item stores the address of the owning pointer rather than the buffer itself,
so whichever buffer the pointer refers to at the time of the leave is the one deleted.
The owning pointer is reset to NULL afterwards so that the owner is not left holding a
dangling pointer.

@param				aHBufC address of the pointer (HBufC8**) that owns the narrow buffer.
@internalTechnology

*/
	{
	HBufC8** owner = static_cast<HBufC8**>(aHBufC);
	delete *owner;
	*owner = NULL;
	}



CCharSetConverter::CCharSetConverter()
/**
First-phase (C++) constructor.
Deliberately empty: the converter, the file server session and the conversion buffer
are all set up by ConstructL().

*/
	{
	}



/**
Two-phase factory function: allocates a CCharSetConverter and runs its ConstructL().
The new object is kept on the cleanup stack while ConstructL() runs, so it is destroyed
if second-phase construction leaves.
The framework is responsible for creating this object.

@leave ... One of the system wide error codes e.g. KErrNoMemory
@return	The fully constructed object; ownership passes to the caller.
@internalTechnology
*/
CCharSetConverter* CCharSetConverter::NewL()
	{
	CCharSetConverter* converter = new(ELeave) CCharSetConverter;

	CleanupStack::PushL(converter);
	converter->ConstructL();
	CleanupStack::Pop(converter);

	return converter;
	}



void CCharSetConverter::ConstructL()
/**
This method provides some construction of this object.

*/
	{
	iCnvCharacterSetConverter = CCnvCharacterSetConverter::NewL();
	User::LeaveIfError(iFs.Connect());
	iConversionBuffer = User::Heap().AllocL(KMaxReadableBytes);
	iConversionBufferSize = KMaxReadableBytes;
	}



CCharSetConverter::~CCharSetConverter()
/**
Destructor.
Closes the file server session, deletes the CharConv converter and frees the
conversion buffer.
The framework is responsible for destroying this object.

@post				This object is properly destroyed.

*/
	{
	iFs.Close();
	delete iCnvCharacterSetConverter;

	// iConversionBuffer is an untyped (TAny*) heap cell obtained with User::Heap().AllocL(),
	// not an object created with new. Applying delete to a void pointer is undefined in
	// standard C++, so hand the cell straight back to the heap instead.
	// User::Free() ignores a NULL pointer, which covers a partially constructed object.
	User::Free(iConversionBuffer);
	}



EXPORT_C void CCharSetConverter::PrepareCharConvL(TUint& aCharSetUid, const TDesC8& aEncoding)
/**
Looks up the CharConv identifier for a character set given its standard name and
prepares CharConv to convert to or from that character set.

@post				CharConv has been prepared.

@leave				KErrXmlUnsupportedCharacterSet - Charset not supported
					(the name maps to identifier 0).
@leave				KErrXmlUnavailableCharacterSet - Charset not available

@param				aCharSetUid On return, contains the character set identifier 
					of the encoding. It is written before any of the checks are made.
@param				aEncoding the standard name of the encoding to prepare for.
*/
	{
	// Step 1: standard name -> identifier. An identifier of zero means no match.
	aCharSetUid = 
		iCnvCharacterSetConverter->ConvertStandardNameOfCharacterSetToIdentifierL(aEncoding, iFs);
	if (aCharSetUid == 0)
		{
		User::Leave(KErrXmlUnsupportedCharacterSet);
		}

	// Step 2: select the character set in CharConv; done if it is available.
	if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aCharSetUid, iFs) != 
			CCnvCharacterSetConverter::ENotAvailable)
		{
		return;
		}

	User::Leave(KErrXmlUnavailableCharacterSet); // Unavailable
	}



EXPORT_C void CCharSetConverter::PrepareCharConvL(TUint& aCharSetUid, TInt aMibEnum)
/**
Looks up the CharConv identifier for a character set given its IANA MIB enum and
prepares CharConv to convert to or from that character set.

@post				CharConv has been prepared.

@leave				KErrXmlUnsupportedCharacterSet - Charset not supported
					(the MIB enum maps to identifier 0).
@leave				KErrXmlUnavailableCharacterSet - Charset not available

@param				aCharSetUid On return, contains the character set identifier 
					of the encoding. It is written before any of the checks are made.
@param				aMibEnum The IANA specified mib enum for this encoding

@see				http://www.iana.org/assignments/character-sets
*/
	{
	// Step 1: MIB enum -> identifier. An identifier of zero means no match.
	aCharSetUid = 
		iCnvCharacterSetConverter->ConvertMibEnumOfCharacterSetToIdentifierL(aMibEnum, iFs);
	if (aCharSetUid == 0)
		{
		User::Leave(KErrXmlUnsupportedCharacterSet);  // May want to try something else?
		}

	// Step 2: select the character set in CharConv; done if it is available.
	if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aCharSetUid, iFs) != 
			CCnvCharacterSetConverter::ENotAvailable)
		{
		return;
		}

	User::Leave(KErrXmlUnavailableCharacterSet); // Unavailable
	}



EXPORT_C TInt CCharSetConverter::ConvertToUnicodeL(TUint32 aSrcCharset, const TDesC8& aInputBuffer, 
														 HBufC16*& aUnicodeConversion)
/**
This method converts the given bytes to unicode.
This overload allocates the output buffer itself and grows it in steps of
KMaxReadableBytes characters until all the input has been consumed.
If this function leaves, the output buffer is deleted; aUnicodeConversion must not be
used after a leave.

@return				KErrNone if the conversion was successful,
					one of the (negative) error values defined in
					CCnvCharacterSetConverter::TError, or a positive count of trailing
					input bytes if a conversion pass consumed no input even though the
					output buffer had just been extended.
					In every non-leaving case the caller owns aUnicodeConversion; on
					success it has been shrunk to the length of the converted text.

@leave				KErrXmlUnavailableCharacterSet - CharSet not available.
@leave				KErrNoMemory - the output buffer could not be allocated or grown.

@param				aSrcCharset The character set encoding to convert from.
@param				aInputBuffer The characters to be converted.
@param				aUnicodeConversion On return, contains the unicode conversion.
*/
	{	
	if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aSrcCharset, iFs) == 
			CCnvCharacterSetConverter::ENotAvailable)
		{
		User::Leave(KErrXmlUnavailableCharacterSet);
		}

	TInt maxLength = KMaxReadableBytes;	
	aUnicodeConversion = HBufC16::NewL(maxLength);	
	// Push the ADDRESS of the owning pointer: ReAllocL() below may move the buffer, and the
	// cleanup callback must delete whichever buffer the pointer refers to at that moment.
	CleanupStack::PushL(TCleanupItem(DestroyHBufC16, &aUnicodeConversion));

	TInt state = CCnvCharacterSetConverter::KStateDefault;

	// First pass writes through the descriptor returned by Des(), which keeps the
	// heap descriptor's own length up to date.
	TPtr16 firstPassOutput(aUnicodeConversion->Des());
	TInt unconverted = iCnvCharacterSetConverter->ConvertToUnicode(firstPassOutput, aInputBuffer, state);	

	// While there is still more data to convert
	while (0 < unconverted)	
		{
		const TInt pending = unconverted;

		// Resize the buffer to hold more data
		maxLength += KMaxReadableBytes;
		aUnicodeConversion = aUnicodeConversion->ReAllocL(maxLength);

		// Segment the writable area. Ptr() is used instead of Des()[0] because indexing
		// a descriptor whose current length is zero panics.
		const TInt outputLength = aUnicodeConversion->Length();
		TUint16* writePosition = const_cast<TUint16*>(aUnicodeConversion->Ptr()) + outputLength;
		TPtr16 remainingOutput(writePosition, 0, maxLength - outputLength);

		// Convert the data; only the unconverted tail of the input is passed in.
		unconverted = iCnvCharacterSetConverter->ConvertToUnicode(remainingOutput, aInputBuffer.Right(pending), state);
		aUnicodeConversion->Des().SetLength(outputLength + remainingOutput.Length());

		// Defensive: the writable area had at least KMaxReadableBytes free characters, so
		// a pass that consumed no input will not be helped by yet more space. Stop here
		// rather than growing the buffer until the heap is exhausted.
		if (unconverted >= pending)
			{
			break;
			}
		}

	// Reallocate to a minimally-sized buffer	
	if (unconverted == 0)
		{
		aUnicodeConversion = aUnicodeConversion->ReAllocL(aUnicodeConversion->Length());
		}

	// Only the cleanup item is removed; the buffer itself now belongs to the caller.
	CleanupStack::Pop(&aUnicodeConversion);
	return unconverted;  // return error value if there is one.
	}


EXPORT_C TInt CCharSetConverter::ConvertToUnicodeL(TUint32 aSrcCharset, 
												   const TDesC8& aInput,
												   TPtr16& aOutput)
/**
This method converts the given bytes to unicode.
This overload stores the conversion output in memory already allocated, for the sole use
of the TPtr versions of overloaded ConvertToUnicodeL and ConvertFromUnicodeL functions. You must make sure you
have finished with the output from a previous call to either (TPtr overload of) ConvertToUnicodeL
or ConvertFromUnicodeL before calling either again, as the previous output will be overwritten with
the new output.
The member buffer is grown in steps of KMaxReadableBytes when it is too small and keeps
its enlarged size for later calls.
This version is more efficient than the HBufC alternative and so should be used whenever possible.

@return				KErrNone if the conversion was successful,
					one of the (negative) error values defined in
					CCnvCharacterSetConverter::TError, or a positive count of trailing
					input bytes if a conversion pass consumed no input even though the
					output buffer had just been extended.

@leave				KErrXmlUnavailableCharacterSet - CharSet not available.
@leave				KErrNoMemory - the member buffer could not be grown. The buffer and
					its recorded size are left unchanged in that case.

@param				aSrcCharset The character set encoding to convert from.
@param				aInput The characters to be converted.
@param				aOutput On return, refers to the unicode conversion held in the
					member buffer. The memory is owned by this object.
*/
	{	
	if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aSrcCharset, iFs) == 
			CCnvCharacterSetConverter::ENotAvailable)
		{
		User::Leave(KErrXmlUnavailableCharacterSet);
		}

	TInt state = CCnvCharacterSetConverter::KStateDefault;

	// Point the caller's descriptor at the member buffer: its initial length is zero
	// (it is empty) and its maximum length is the number of unicode characters that fit
	// into the current size of the buffer.
	aOutput.Set(static_cast<TUint16*>(iConversionBuffer), 0, iConversionBufferSize/sizeof(TUint16));

	// Convert the data, returning the number of bytes left unconverted because the
	// output buffer is full.
	TInt unconverted = iCnvCharacterSetConverter->ConvertToUnicode(aOutput, aInput, state);	

	// While there is still more data to convert
	while (0 < unconverted)	
		{
		const TInt pending = unconverted;
		const TInt outputLength = aOutput.Length();

		// Grow the buffer. The new size is committed to iConversionBufferSize only after
		// ReAllocL() has succeeded: if it leaves, the old cell is still in place and the
		// recorded size must keep describing it, otherwise the next call would write past
		// the end of the allocation. ReAllocL() leaves on failure, so no NULL check is needed.
		const TInt grownSize = iConversionBufferSize + KMaxReadableBytes;
		iConversionBuffer = User::Heap().ReAllocL(iConversionBuffer, grownSize);
		iConversionBufferSize = grownSize;

		// Re-point the output descriptor at the (possibly moved) buffer, keeping the
		// number of characters converted so far as its length.
		aOutput.Set(static_cast<TUint16*>(iConversionBuffer), outputLength, iConversionBufferSize/sizeof(TUint16));		

		// Descriptor over the still-unused tail of the buffer.
		TPtr16 remainingOutput(static_cast<TUint16*>(iConversionBuffer) + outputLength, 0, 
							   aOutput.MaxLength() - outputLength);

		// Try to convert another chunk of data; only the unconverted tail is passed in.
		unconverted = iCnvCharacterSetConverter->ConvertToUnicode(remainingOutput, aInput.Right(pending), state);

		// Update the length of the output to include the data just converted.
		aOutput.SetLength(remainingOutput.Length() + outputLength);

		// Defensive: the tail had at least KMaxReadableBytes free bytes, so a pass that
		// consumed no input will not be helped by yet more space. Stop here rather than
		// growing the buffer until the heap is exhausted.
		if (unconverted >= pending)
			{
			break;
			}
		}

	return unconverted;  // return error value if there is one.
	}


			
EXPORT_C TInt CCharSetConverter::ConvertFromUnicodeL(const TDesC16& aUnicodeConversion, 
														   TUint32 aDestCharset, HBufC8*& aOutputBuffer)
/**
This method converts the given unicode to the specified encoding.
This overload allocates the output buffer itself and grows it in steps of
KMaxReadableBytes bytes until all the input has been consumed.
If this function leaves, the output buffer is deleted; aOutputBuffer must not be
used after a leave.

@return				KErrNone if the conversion was successful,
					one of the (negative) error values defined in
					CCnvCharacterSetConverter::TError, or a positive count of trailing
					input characters if a conversion pass consumed no input even though
					the output buffer had just been extended.
					In every non-leaving case the caller owns aOutputBuffer; on
					success it has been shrunk to the length of the converted text.
					
@leave				KErrXmlUnavailableCharacterSet - Charset not available.
@leave				KErrNoMemory - the output buffer could not be allocated or grown.

@param				aUnicodeConversion The unicode to convert.
@param				aDestCharset The character set encoding to convert to.
@param				aOutputBuffer On return, contains the specified conversion.
*/
	{
	if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aDestCharset, iFs) == 
			CCnvCharacterSetConverter::ENotAvailable)
		{
		User::Leave(KErrXmlUnavailableCharacterSet);
		}

	TInt maxLength = KMaxReadableBytes;	
	aOutputBuffer = HBufC8::NewL(maxLength);	
	// Push the ADDRESS of the owning pointer: ReAllocL() below may move the buffer, and the
	// cleanup callback must delete whichever buffer the pointer refers to at that moment.
	CleanupStack::PushL(TCleanupItem(DestroyHBufC8, &aOutputBuffer));

	// First pass writes through the descriptor returned by Des(), which keeps the
	// heap descriptor's own length up to date.
	TPtr8 firstPassOutput(aOutputBuffer->Des());
	TInt unconverted = iCnvCharacterSetConverter->ConvertFromUnicode(firstPassOutput, aUnicodeConversion);	

	// While there is still more data to convert
	while (0 < unconverted)	
		{
		const TInt pending = unconverted;

		// Resize the buffer to hold more data
		maxLength += KMaxReadableBytes;
		aOutputBuffer = aOutputBuffer->ReAllocL(maxLength);

		// Segment the writable area. Ptr() is used instead of Des()[0] because indexing
		// a descriptor whose current length is zero panics.
		const TInt outputLength = aOutputBuffer->Length();
		TUint8* writePosition = const_cast<TUint8*>(aOutputBuffer->Ptr()) + outputLength;
		TPtr8 remainingOutput(writePosition, 0, maxLength - outputLength);

		// Convert the data; only the unconverted tail of the input is passed in.
		unconverted = iCnvCharacterSetConverter->ConvertFromUnicode(remainingOutput, aUnicodeConversion.Right(pending));
		aOutputBuffer->Des().SetLength(outputLength + remainingOutput.Length());

		// Defensive: the writable area had at least KMaxReadableBytes free bytes, so a
		// pass that consumed no input will not be helped by yet more space. Stop here
		// rather than growing the buffer until the heap is exhausted.
		if (unconverted >= pending)
			{
			break;
			}
		}

	// Reallocate to a minimally-sized buffer	
	if (unconverted == 0)
		{
		aOutputBuffer = aOutputBuffer->ReAllocL(aOutputBuffer->Length());
		}

	// Only the cleanup item is removed; the buffer itself now belongs to the caller.
	CleanupStack::Pop(&aOutputBuffer);

	return unconverted;  // return error value if there is one.
	}


EXPORT_C TInt CCharSetConverter::ConvertFromUnicodeL(const TDesC16& aInput,
													 TUint32 aDestCharset,
													 TPtr8& aOutput)
/**
This method converts the given unicode to the specified encoding. 
This overload stores the conversion output in memory already allocated, for the sole use
of the TPtr versions of overloaded ConvertToUnicodeL and ConvertFromUnicodeL functions. You must make sure you
have finished with the output from a previous call to either (TPtr overload of) ConvertToUnicodeL
or ConvertFromUnicodeL before calling either again, as the previous output will be overwritten with
the new output.
The member buffer is grown in steps of KMaxReadableBytes when it is too small and keeps
its enlarged size for later calls.
This version is more efficient than the HBufC alternative and so should be used whenever possible.

@return				KErrNone if the conversion was successful,
					one of the (negative) error values defined in
					CCnvCharacterSetConverter::TError, or a positive count of trailing
					input characters if a conversion pass consumed no input even though
					the output buffer had just been extended.
					
@leave				KErrXmlUnavailableCharacterSet - Charset not available.
@leave				KErrNoMemory - the member buffer could not be grown. The buffer and
					its recorded size are left unchanged in that case.

@param				aInput The unicode to convert.
@param				aDestCharset The character set encoding to convert to.
@param				aOutput On return, refers to the converted characters held in the
					member buffer. The memory is owned by this object.
*/
	{
	if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aDestCharset, iFs) == 
			CCnvCharacterSetConverter::ENotAvailable)
		{
		User::Leave(KErrXmlUnavailableCharacterSet);
		}

	// Point the caller's descriptor at the member buffer: its initial length is zero
	// (it is empty) and its maximum length is the current size of the buffer in bytes.
	aOutput.Set(static_cast<TUint8*>(iConversionBuffer), 0, iConversionBufferSize);	

	// Convert the data, returning the number of characters left unconverted because the
	// output buffer is full.
	TInt unconverted = iCnvCharacterSetConverter->ConvertFromUnicode(aOutput, aInput);	

	// While there is still more data to convert
	while (0 < unconverted)	
		{
		const TInt pending = unconverted;
		const TInt outputLength = aOutput.Length();
		
		// Grow the buffer. The new size is committed to iConversionBufferSize only after
		// ReAllocL() has succeeded: if it leaves, the old cell is still in place and the
		// recorded size must keep describing it, otherwise the next call would write past
		// the end of the allocation. ReAllocL() leaves on failure, so no NULL check is needed.
		const TInt grownSize = iConversionBufferSize + KMaxReadableBytes;
		iConversionBuffer = User::Heap().ReAllocL(iConversionBuffer, grownSize);
		iConversionBufferSize = grownSize;

		// Re-point the output descriptor at the (possibly moved) buffer, keeping the
		// number of bytes produced so far as its length.
		aOutput.Set(static_cast<TUint8*>(iConversionBuffer), outputLength, iConversionBufferSize);
		
		// Descriptor over the still-unused tail of the buffer.
		TPtr8 remainingOutput(static_cast<TUint8*>(iConversionBuffer) + outputLength, 0, 
							  iConversionBufferSize - outputLength);

		// Try to convert another chunk of data; only the unconverted tail is passed in.
		unconverted = iCnvCharacterSetConverter->ConvertFromUnicode(remainingOutput, aInput.Right(pending));
	
		// Update the length of the output to include the data just converted.
		aOutput.SetLength(remainingOutput.Length() + outputLength);

		// Defensive: the tail had at least KMaxReadableBytes free bytes, so a pass that
		// consumed no input will not be helped by yet more space. Stop here rather than
		// growing the buffer until the heap is exhausted.
		if (unconverted >= pending)
			{
			break;
			}
		}
		
	return unconverted;  // return error value if there is one.
	}


EXPORT_C void CCharSetConverter::PrepareToConvertToOrFromL(TUint32 aCharSetUid)
/**
Helper that selects a character set in CharConv ahead of a conversion, turning
CharConv's "not available" result into a leave.

@see				CCnvCharacterSetConverter::PrepareToConvertToOrFromL
@post				CharConv is ready for the conversion, or the function has left.

@leave				KErrXmlUnavailableCharacterSet - Charset not available.

@param				aCharSetUid The identifier of the character set to convert to or from.
*/
	{
	// Nothing more to do when CharConv reports the character set as available.
	if (iCnvCharacterSetConverter->PrepareToConvertToOrFromL(aCharSetUid, iFs) != 
			CCnvCharacterSetConverter::ENotAvailable)
		{
		return;
		}

	User::Leave(KErrXmlUnavailableCharacterSet);
	}



EXPORT_C void CCharSetConverter::ConvertCharacterSetIdentifierToStandardNameL(TUint32 aCharSetUid, 
																			  HBufC8*& aCharSet)
/**
Helper that obtains the standard character encoding name for a character set
identifier, turning a NULL result from CharConv into a leave.

@see				CCnvCharacterSetConverter::ConvertCharacterSetIdentifierToStandardNameL

@leave				KErrXmlUnsupportedCharacterSet If the character set is not known.

@param				aCharSetUid The character set to obtain the name for.
@param				aCharSet On return holds the Internet-standard name
					or MIME name of the character set.
					The name is encoded in 8 bit ASCII. 
*/
	{
	// Store CharConv's answer first, then validate it.
	aCharSet = iCnvCharacterSetConverter->
		ConvertCharacterSetIdentifierToStandardNameL(aCharSetUid, iFs);

	if (!aCharSet)
		{
		User::Leave(KErrXmlUnsupportedCharacterSet);
		}
	}



EXPORT_C TInt CCharSetConverter::ConvertUcs4CharactersToEncodingL(TUint32* aUcs4Src, 
																  TInt aUcs4Count, 
																  TUint32 aDestCharset,
																  HBufC8*& aConversion)
/**
This method converts ucs-4 characters to the desired non-modal encoding.
The characters are first expanded to UTF-16 (using surrogate pairs where needed) and the
result is then passed to the HBufC overload of ConvertFromUnicodeL.
aConversion should be NULL on calling of this function; any previous value is overwritten.
If this function leaves, memory is cleaned up.
There is no TPtr overload of this method, as currently it is only called a few times and so would not 
produce any noticeable benefits.

@return				The value returned by CCharSetConverter::ConvertFromUnicodeL when it
					is zero or negative. When a negative error value is returned,
					aConversion still refers to the buffer that was allocated and the
					caller owns it.

@leave				KErrXmlBadCharacterConversion - not all of the input could be converted;
					aConversion is deleted and reset to NULL.

@param				aUcs4Src list of ucs-4 characters.
@param				aUcs4Count number of ucs4 characters. Zero is allowed.
@param				aDestCharset the desired encoding.
@param				aConversion On return, points to the converted encoding.
*/
	{
	// convert ucs-4 to utf-16

	// Pass 1: find the length of the output. Each character is converted into a
	// two-unit scratch buffer purely to learn how many UTF-16 units it occupies.
	TText16 scratch[2];
	TInt length = 0;
	TUint32* src = NULL;

	for (src = aUcs4Src; src != (aUcs4Src + aUcs4Count); ++src)
		{
		length += Utf32ToUtf16(scratch, *src) - scratch;
		}

	HBufC16* utf16Out = HBufC16::NewL(length);
	CleanupStack::PushL(utf16Out);

	utf16Out->Des().SetLength(length);

	// Obtain the write position from Ptr() rather than Des()[0]: indexing panics when
	// the descriptor is empty, which is the case when aUcs4Count is zero.
	TText16* p = const_cast<TText16*>(utf16Out->Ptr());

	// Pass 2: go through the characters again, this time writing the UTF-16 units.
	for (src = aUcs4Src; src != (aUcs4Src + aUcs4Count); ++src)
		{
		p = Utf32ToUtf16(p, *src);
		}

	// convert from utf-16 to desired encoding
	aConversion = NULL;

	// HBufC overload of this method called, due to the need to pass back the HBufC to the calling method
	const TInt ret = CCharSetConverter::ConvertFromUnicodeL(*utf16Out, aDestCharset, aConversion);

	if (ret > KErrNone)
		{
		// CharConv couldn't convert all the input. Character encoding may be truncated.
		// Release the partial result and clear the caller's pointer before leaving, so
		// the caller is not left with a dangling pointer. utf16Out is destroyed by the
		// cleanup stack.
		delete aConversion;
		aConversion = NULL;
		User::Leave(KErrXmlBadCharacterConversion);
		}

	CleanupStack::PopAndDestroy(utf16Out);
	return ret;
	}



TText16* CCharSetConverter::Utf32ToUtf16(TText16* aUtf16Out, TUint32 aUtf32)
/**
This method converts a single ucs-4 (UTF-32) character to UTF-16.
Values up to 0xFFFF are written as one 16-bit unit; larger values are written as a
high/low surrogate pair, so the output must have room for two units.

@return				Pointer to the next free 16-bit unit in the output buffer, i.e.
					aUtf16Out advanced by one or two.

@param				aUtf16Out On return, contains the UTF-16 encoding of the character.
@param				aUtf32 The ucs-4 character
*/
	{
	if (aUtf32 > 0xFFFF)
		{
		// Supplementary character (0x10000-0x10FFFF). UTF-16 has no single unit for
		// these, so the value must be split into a surrogate pair for charconv to be
		// able to convert it correctly.
		//
		// Subtracting 0x10000 leaves a 20-bit number:
		//
		//   b19....b10 b9....b0
		//     high       low
		//
		// The top ten bits are carried by the high surrogate (0xD800-0xDBFF) and the
		// bottom ten bits by the low surrogate (0xDC00-0xDFFF):
		//
		//   1101 10.. ........         1101 11.. ........
		//          --10  bit--                --10  bit--
		//
		// High surrogate:
		//
		//   ((utf32 - 0x10000) >> 10) + 0xD800
		//   = (utf32 >> 10) - (0x10000 >> 10) + 0xD800
		//   = (utf32 >> 10) - 0x40 + 0xD800
		//   = (utf32 >> 10) + 0xD7C0
		*aUtf16Out++ = static_cast<TText16>(0xD7C0 + (aUtf32 >> 10));

		// Low surrogate. 0x10000 has no bits set in its low ten bits, so the
		// subtraction does not affect them:
		//
		//   ((utf32 - 0x10000) & 0x3FF) + 0xDC00
		//   = (utf32 & 0x3FF) + 0xDC00
		//
		// and because 0xDC00 has its low ten bits clear, the addition can be an OR.
		*aUtf16Out++ = static_cast<TText16>((aUtf32 & 0x3FF) | 0xDC00);

		// NOTE: values above 0x10FFFF are not checked for here.
		return aUtf16Out;
		}

	// Basic character (0x0-0xFFFF): copied through as a single unit.
	//
	// UTF-32 (or UCS-4) should not contain values in the ranges D800-DBFF (high
	// surrogate) and DC00-DFFF (low surrogate), as those are reserved for the UTF-16
	// extension mechanism. In principle they should be rejected, since one thing could
	// end up looking like another (aliasing). They are not weeded out here; reject any
	// value between D800 and DFFF at this point if that is ever required.
	*aUtf16Out++ = static_cast<TText16>(aUtf32);
	return aUtf16Out;
	}