browserutilities/feedsengine/FeedsServer/XmlUtils/src/XmlEncoding.cpp
author Simon Howkins <simonh@symbian.org>
Mon, 15 Nov 2010 14:53:34 +0000
branchRCL_3
changeset 105 871af676edac
parent 94 919f36ff910f
permissions -rw-r--r--
Adjusted to avoid exports, etc, from a top-level bld.inf

/*
* Copyright (c) 2005 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of the License "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description:  Miscellaneous char-encoding related utilities.
*
*/

#include <charconv.h>

#include <xmlencoding.h>

// -----------------------------------------------------------------------------
// CXmlEncoding::NewL
//
// Two-phased constructor.
// -----------------------------------------------------------------------------
//
EXPORT_C CXmlEncoding* CXmlEncoding::NewL()
    {
    // Phase one: plain allocation (leaves if the allocation fails).
    CXmlEncoding* encoding = new (ELeave) CXmlEncoding();

    // Phase two: run the leaving part of construction while the object is
    // protected by the cleanup stack, then hand ownership to the caller.
    CleanupStack::PushL(encoding);
    encoding->ConstructL();
    CleanupStack::Pop();

    return encoding;
    }

        
// -----------------------------------------------------------------------------
// CXmlEncoding::CXmlEncoding
//
// C++ default constructor can NOT contain any code, that
// might leave.
// -----------------------------------------------------------------------------
//
CXmlEncoding::CXmlEncoding()
    : iCharEncodings(5)
    {
    // Intentionally empty: anything that can leave is done in ConstructL().
    // NOTE(review): 5 is presumably the granularity of the encoding cache
    // array -- confirm against the declaration of iCharEncodings.
    }
        

// -----------------------------------------------------------------------------
// CXmlEncoding::ConstructL
//
// Symbian 2nd phase constructor can leave.
// -----------------------------------------------------------------------------
//
void CXmlEncoding::ConstructL()
    {
    User::LeaveIfError(iRfs.Connect());
    iConverter = CCnvCharacterSetConverter::NewL();
    }
        

// -----------------------------------------------------------------------------
// CXmlEncoding::~CXmlEncoding
//
// Destructor.
// -----------------------------------------------------------------------------
//
CXmlEncoding::~CXmlEncoding()
    {
    iRfs.Close();
    delete iConverter;

    // Each cached entry owns a heap copy of the encoding name; free those
    // before releasing the array storage itself.
    const TInt cachedCount = iCharEncodings.Count();
    for (TInt idx = 0; idx < cachedCount; ++idx)
        {
        delete iCharEncodings[idx].charEncoding;
        }

    iCharEncodings.Close();
    }


// -----------------------------------------------------------------------------
// CXmlEncoding::ResolveCharEncodingL
// 
// Resolves the given char-encoding into its uid.
// -----------------------------------------------------------------------------
//
TBool CXmlEncoding::ResolveCharEncodingL(const TDesC8& aCharEncoding, TUint& aUid) const
    {
    // Fast path: a case-insensitive match against the names resolved earlier.
    const TInt cachedCount = iCharEncodings.Count();
    for (TInt idx = 0; idx < cachedCount; ++idx)
        {
        if (iCharEncodings[idx].charEncoding->CompareF(aCharEncoding) == 0)
            {
            aUid = iCharEncodings[idx].uid;
            return ETrue;
            }
        }

    // Slow path: ask the converter.  The method is const, but both the
    // file-server session and the cache have to be used non-const.
    CXmlEncoding* mutableSelf = const_cast<CXmlEncoding*>(this);

    const TUint resolvedUid = iConverter->ConvertStandardNameOfCharacterSetToIdentifierL(
            aCharEncoding, mutableSelf->iRfs);

    // A zero identifier means the name is unknown; aUid is left untouched.
    if (resolvedUid == 0)
        {
        return EFalse;
        }

    // Remember the result so the next lookup of this name hits the cache.
    SupportedEncodings  entry;

    entry.charEncoding = aCharEncoding.AllocL();
    entry.uid = resolvedUid;

    const TInt appendErr = mutableSelf->iCharEncodings.Append(entry);
    if (appendErr != KErrNone)
        {
        // The array did not take ownership, so free the name copy here.
        delete entry.charEncoding;
        User::Leave(appendErr);
        }

    aUid = resolvedUid;
    return ETrue;
    }


// -----------------------------------------------------------------------------
// CXmlEncoding::DetermineCharEncodingL
//
// Determine the char-encoding.
// -----------------------------------------------------------------------------
//
EXPORT_C TBool CXmlEncoding::DetermineCharEncodingL(const TDesC8& aBuffer, 
        const TDesC& aCharSet, TUint& aEncoding) const
    {
    // 1. The BOM (byte order mark) at the start of the buffer wins.
    if (DetermineCharEncodingFromBom(aBuffer, aEncoding))
        {
        return ETrue;
        }

    // 2. Next, the encoding attribute of the xml-prolog.
    if (DetermineCharEncodingFromXmlProlog(aBuffer, aEncoding))
        {
        return ETrue;
        }

    // 3. Last resort: the char-set supplied by the original source.  The
    //    resolver takes an 8-bit name, so copy the name into an 8-bit buffer.
    HBufC8*  narrowCharSet = HBufC8::NewLC(aCharSet.Length());
    narrowCharSet->Des().Append(aCharSet);

    const TBool  resolved = ResolveCharEncodingL(*narrowCharSet, aEncoding);
    CleanupStack::PopAndDestroy(narrowCharSet);

    return resolved;
    }


// -----------------------------------------------------------------------------
// CXmlEncoding::DetermineCharEncodingFromBom
//
// Determine the char-encoding from the BOM.
// -----------------------------------------------------------------------------
//
TBool CXmlEncoding::DetermineCharEncodingFromBom(const TDesC8& aBuffer, 
        TUint& aEncoding) const
    {
    // Inspects the first bytes of aBuffer for a byte order mark (BOM) and,
    // if one is recognised, resolves the matching encoding into aEncoding.
    //
    // Returns ETrue when a BOM was recognised AND its encoding could be
    // resolved; EFalse otherwise (aEncoding is then left untouched by this
    // method).
    _LIT8(KUcs2Big, "UTF-16BE");
    _LIT8(KUcs2Little, "UTF-16LE");
    _LIT8(KUtf8, "UTF-8");

    TBool  foundEncoding = EFalse;

    // The UTF-16 BOMs are only two bytes long, so two bytes are enough to
    // start looking; the third byte is needed for the UTF-8 BOM only.
    const TInt  length = aBuffer.Length();
    if (length < 2)
        {
        return EFalse;
        }

    const TUint8  c1 = aBuffer[0];
    const TUint8  c2 = aBuffer[1];

    // This method is non-leaving, so a leave while resolving the name is
    // deliberately swallowed and treated as "encoding not found".
    if ((c1 == 0xFE) && (c2 == 0xFF))
        {
        TRAP_IGNORE(foundEncoding = ResolveCharEncodingL(KUcs2Big, aEncoding));
        }
    else if ((c1 == 0xFF) && (c2 == 0xFE))
        {
        TRAP_IGNORE(foundEncoding = ResolveCharEncodingL(KUcs2Little, aEncoding));
        }
    else if ((length >= 3) && (c1 == 0xEF) && (c2 == 0xBB) && (aBuffer[2] == 0xBF))
        {
        TRAP_IGNORE(foundEncoding = ResolveCharEncodingL(KUtf8, aEncoding));
        }

    return foundEncoding;
    }


// -----------------------------------------------------------------------------
// CXmlEncoding::DetermineCharEncodingFromXmlProlog
//
// Determine the char-encoding from the char-encoding in the xml-prolog.
// -----------------------------------------------------------------------------
//
TBool CXmlEncoding::DetermineCharEncodingFromXmlProlog(const TDesC8& aBuffer, 
        TUint& aEncoding) const
    {
    TInt  attrBegin;
    TInt  attrEnd;
    TInt  valueFirst;
    TInt  valueLast;

    // No encoding attribute in the prolog: nothing to resolve.
    if (!FindEncoding(aBuffer, attrBegin, attrEnd, valueFirst, valueLast))
        {
        return EFalse;
        }

    // valueFirst/valueLast are inclusive indices of the quoted value.
    const TInt  valueLength = valueLast - valueFirst + 1;
    TPtrC8      encodingName(aBuffer.Ptr() + valueFirst, valueLength);

    // Non-leaving method: a leave during the lookup counts as "not found".
    TBool  resolved = EFalse;
    TRAP_IGNORE(resolved = ResolveCharEncodingL(encodingName, aEncoding));

    return resolved;
    }


// -----------------------------------------------------------------------------
// CXmlEncoding::StripCharEncoding
//
// Erase the char-encoding attribute (if any) from the xml-prolog.
// -----------------------------------------------------------------------------
//
void CXmlEncoding::StripCharEncoding(HBufC8& aUtf8Buffer) const
    {
    TInt  attrBegin;
    TInt  attrEnd;
    TInt  valueFirst;
    TInt  valueLast;

    // Nothing to strip when the prolog carries no encoding attribute.
    if (!FindEncoding(aUtf8Buffer, attrBegin, attrEnd, valueFirst, valueLast))
        {
        return;
        }

    // Remove everything from the 'e' of "encoding" up to and including the
    // closing quote of its value.
    const TInt  attrLength = attrEnd - attrBegin + 1;
    TPtr8       writable(aUtf8Buffer.Des());

    writable.Delete(attrBegin, attrLength);
    }


// -----------------------------------------------------------------------------
// CXmlEncoding::StripCharEncoding
//
// Erase the char-encoding attribute (if any) from the xml-prolog.
// -----------------------------------------------------------------------------
//
void CXmlEncoding::StripCharEncoding(HBufC16& aUcs2Buffer) const
    {
    TInt  attrBegin;
    TInt  attrEnd;
    TInt  valueFirst;
    TInt  valueLast;

    // Nothing to strip when the prolog carries no encoding attribute.
    if (!FindEncoding(aUcs2Buffer, attrBegin, attrEnd, valueFirst, valueLast))
        {
        return;
        }

    // Remove everything from the 'e' of "encoding" up to and including the
    // closing quote of its value.
    const TInt  attrLength = attrEnd - attrBegin + 1;
    TPtr16      writable(aUcs2Buffer.Des());

    writable.Delete(attrBegin, attrLength);
    }


// -----------------------------------------------------------------------------
// CXmlEncoding::ConvertToUtf8L
//
// Converts the given native buffer into utf8.
// -----------------------------------------------------------------------------
//
EXPORT_C HBufC8* CXmlEncoding::ConvertToUtf8L(TUint aEncoding, const TDesC8& aBuffer) const
    {
    // Step 1: native encoding -> ucs2.  The intermediate buffer is only
    // needed until the utf8 copy exists, so keep it on the cleanup stack.
    HBufC16*  ucs2 = ConvertToUcs2L(aEncoding, aBuffer);
    CleanupStack::PushL(ucs2);

    // Step 2: ucs2 -> utf8.
    HBufC8*  utf8 = ConvertToUtf8L(*ucs2);

    CleanupStack::PopAndDestroy(ucs2);

    // Erase the char-encoding attribute (if any) from the xml-prolog.  This
    // forces Libxml2 to use the default encoding, utf8.
    StripCharEncoding(*utf8);

    // Ownership of the utf8 buffer passes to the caller.
    return utf8;
    }


// -----------------------------------------------------------------------------
// CXmlEncoding::ConvertToUcs2L
//
// Converts the given native buffer into ucs2.
// -----------------------------------------------------------------------------
//
EXPORT_C HBufC16* CXmlEncoding::ConvertToUcs2L(TUint aEncoding, const TDesC8& aBuffer) const
    {
    // Converts aBuffer, encoded as aEncoding, into a newly allocated ucs2
    // buffer and removes the (now stale) encoding attribute from its
    // xml-prolog.  Ownership of the result passes to the caller.
    //
    // Leaves with KErrNotSupported if the encoding is unavailable, with
    // KErrCorrupt if the input is ill-formed or cannot be consumed, with
    // KErrGeneral on any other converter error, or with whatever the
    // allocations leave with.
    TBuf<100>  chunk;
    HBufC*     unicode = NULL;
    TPtrC8     remaining(aBuffer);

    // The conversion state has to survive from one ConvertToUnicode() call
    // to the next, otherwise stateful (modal) encodings are decoded wrongly
    // once the input spans more than one chunk.  So it lives outside the loop.
    TInt       state = CCnvCharacterSetConverter::KStateDefault;

    // Init the converter and ensure the encoding is supported.
    if (iConverter->PrepareToConvertToOrFromL(aEncoding, const_cast<CXmlEncoding*>(this)->iRfs) != 
            CCnvCharacterSetConverter::EAvailable)
        {
        User::Leave(KErrNotSupported);
        }

    for (;;) // conversion loop, one chunk per iteration
        {
        const TInt  unconverted = iConverter->ConvertToUnicode(chunk, remaining, state);

        if (unconverted == CCnvCharacterSetConverter::EErrorIllFormedInput)
            {
            User::Leave(KErrCorrupt);
            }
        if (unconverted < 0) // future-proof against "TError" expanding
            {
            User::Leave(KErrGeneral);
            }

        // Append the chunk to the result, which stays on the cleanup stack
        // for the whole loop.
        if (!unicode)
            {
            unicode = chunk.AllocLC();
            }
        else
            {
            HBufC* grown = unicode->ReAllocL(unicode->Length() + chunk.Length());
            // The cell may have moved: swap the cleanup-stack entry.
            CleanupStack::Pop(unicode);
            unicode = grown;
            CleanupStack::PushL(unicode);
            unicode->Des().Append(chunk);
            }

        if (unconverted == 0) // everything is converted
            {
            break;
            }

        // If the converter consumed nothing, the next call would see exactly
        // the same input and this loop would never end.
        if (unconverted >= remaining.Length())
            {
            User::Leave(KErrCorrupt);
            }

        // "unconverted" bytes at the tail are still to be done.
        remaining.Set(remaining.Right(unconverted));
        }

    // Erase the char-encoding attribute (if any) from the xml-prolog -- as the
    // encoding is no longer valid.
    StripCharEncoding(*unicode);

    CleanupStack::Pop(unicode);
    return unicode;
    }


// -----------------------------------------------------------------------------
// CXmlEncoding::ConvertToUtf8L
//
// Converts the given ucs2 buffer into utf8.
// -----------------------------------------------------------------------------
//
HBufC8* CXmlEncoding::ConvertToUtf8L(HBufC16& aUsc2Buffer) const
    {
    // Converts the given ucs2 buffer into a newly allocated utf8 buffer.
    // Ownership of the result passes to the caller.
    //
    // Leaves with KErrNotSupported if the utf8 converter is unavailable,
    // with KErrCorrupt if the input is ill-formed or cannot be consumed,
    // with KErrGeneral on any other converter error, or with whatever the
    // allocations leave with.
    TBuf8<100>  chunk;
    HBufC8*     target = NULL;
    TPtrC       remaining(aUsc2Buffer);

    // The conversion state is carried across successive
    // ConvertFromUnicode() calls, so it must not be reset per chunk.
    TInt        state = CCnvCharacterSetConverter::KStateDefault;

    // Init the converter and ensure the encoding is supported.
    if (iConverter->PrepareToConvertToOrFromL(KCharacterSetIdentifierUtf8, 
            const_cast<CXmlEncoding*>(this)->iRfs) != CCnvCharacterSetConverter::EAvailable)
        {
        User::Leave(KErrNotSupported);
        }

    for (;;) // conversion loop, one chunk per iteration
        {
        const TInt  unconverted = iConverter->ConvertFromUnicode(chunk, remaining, state);

        if (unconverted == CCnvCharacterSetConverter::EErrorIllFormedInput)
            {
            User::Leave(KErrCorrupt);
            }
        if (unconverted < 0) // future-proof against "TError" expanding
            {
            User::Leave(KErrGeneral);
            }

        // Append the chunk to the result, which stays on the cleanup stack
        // for the whole loop.
        if (!target)
            {
            target = chunk.AllocLC();
            }
        else
            {
            HBufC8* grown = target->ReAllocL(target->Length() + chunk.Length());
            // The cell may have moved: swap the cleanup-stack entry.
            CleanupStack::Pop(target);
            target = grown;
            CleanupStack::PushL(target);
            target->Des().Append(chunk);
            }

        if (unconverted == 0) // everything is converted
            {
            break;
            }

        // If the converter consumed nothing, the next call would see exactly
        // the same input and this loop would never end.
        if (unconverted >= remaining.Length())
            {
            User::Leave(KErrCorrupt);
            }

        // "unconverted" characters at the tail are still to be done.
        remaining.Set(remaining.Right(unconverted));
        }

    CleanupStack::Pop(target);

    return target;
    }


// -----------------------------------------------------------------------------
// CXmlEncoding::FindEncoding
//
// Finds the location of the encoding attribute in the xml-prolog.
// Refer to: http://www.w3.org/TR/2000/REC-xml-20001006#sec-prolog-dtd.
// -----------------------------------------------------------------------------
//
TBool CXmlEncoding::FindEncoding(const TDesC8& aBuffer, TInt& aBegin, TInt& aEnd, 
        TInt& aValueBegin, TInt& aValueEnd) const
    {
    // Locates the encoding attribute of the xml-prolog in an 8-bit buffer.
    //
    // On success (ETrue) the out-parameters hold inclusive indices:
    //   aBegin      - the 'e' of "encoding"
    //   aEnd        - the closing quote of the value
    //   aValueBegin - first character after the opening quote
    //   aValueEnd   - last character before the closing quote (an empty
    //                 value therefore yields aValueEnd == aValueBegin - 1)
    // On failure (EFalse) the out-parameters must not be relied upon.
    _LIT8(KPiStart, "<?xml");
    _LIT8(KPiEnd, "?>");
    _LIT8(KEncoding, "encoding");

    aEnd = KErrNotFound;
    aValueBegin = KErrNotFound;
    aValueEnd = KErrNotFound;

    // Case-insensitive search for the prolog delimiters and the attribute.
    const TInt  piStartLoc = aBuffer.FindF(KPiStart);
    const TInt  piEndLoc = aBuffer.FindF(KPiEnd);
    aBegin = aBuffer.FindF(KEncoding);

    if ((piStartLoc == KErrNotFound) || (piEndLoc == KErrNotFound) || 
            (aBegin == KErrNotFound))
        {
        return EFalse;
        }

    // The attribute only counts when it sits inside the prolog, i.e. after
    // "<?xml" and before "?>".  Text that happens to contain "encoding"
    // ahead of the prolog must not be mistaken for the attribute.
    if ((piStartLoc >= piEndLoc) || (aBegin <= piStartLoc) || (aBegin >= piEndLoc))
        {
        return EFalse;
        }

    TBool  foundEquals = EFalse;
    TInt   quoteCount = 0;

    // Walk from just past "encoding" to the end of the prolog.
    for (TInt i = aBegin + KEncoding().Length(); i < piEndLoc; ++i)
        {
        const TUint8  c = aBuffer[i];

        if (!foundEquals)
            {
            // The '=' char must be the first non-whitespace after "encoding".
            if (c == '=')
                {
                foundEquals = ETrue;
                }
            else if ((c != 0x20) && (c != 0x09) && (c != 0x0D) && (c != 0x0A))
                {
                // Give up.
                return EFalse;
                }
            }
        else if ((c == '\"') || (c == '\''))
            {
            // After the '=': the first quote opens the value, the second
            // one closes it.
            quoteCount++;

            if (quoteCount == 1)
                {
                aValueBegin = i + 1;
                }
            else if (quoteCount == 2)
                {
                aEnd = i;
                aValueEnd = i - 1;
                break;
                }
            }
        }

    // Both quotes must have been seen before the end of the prolog.
    return ((aValueBegin != KErrNotFound) && (aValueEnd != KErrNotFound));
    }


// -----------------------------------------------------------------------------
// CXmlEncoding::FindEncoding
//
// Finds the location of the encoding attribute in the xml-prolog.
// Refer to: http://www.w3.org/TR/2000/REC-xml-20001006#sec-prolog-dtd.
// -----------------------------------------------------------------------------
//
TBool CXmlEncoding::FindEncoding(const TDesC& aBuffer, TInt& aBegin, 
        TInt& aEnd, TInt& aValueBegin, TInt& aValueEnd) const
    {
    // Locates the encoding attribute of the xml-prolog in a 16-bit buffer.
    //
    // On success (ETrue) the out-parameters hold inclusive indices:
    //   aBegin      - the 'e' of "encoding"
    //   aEnd        - the closing quote of the value
    //   aValueBegin - first character after the opening quote
    //   aValueEnd   - last character before the closing quote (an empty
    //                 value therefore yields aValueEnd == aValueBegin - 1)
    // On failure (EFalse) the out-parameters must not be relied upon.
    _LIT16(KPiStart, "<?xml");
    _LIT16(KPiEnd, "?>");
    _LIT16(KEncoding, "encoding");

    aEnd = KErrNotFound;
    aValueBegin = KErrNotFound;
    aValueEnd = KErrNotFound;

    // Case-insensitive search for the prolog delimiters and the attribute.
    const TInt  piStartLoc = aBuffer.FindF(KPiStart);
    const TInt  piEndLoc = aBuffer.FindF(KPiEnd);
    aBegin = aBuffer.FindF(KEncoding);

    if ((piStartLoc == KErrNotFound) || (piEndLoc == KErrNotFound) || 
            (aBegin == KErrNotFound))
        {
        return EFalse;
        }

    // The attribute only counts when it sits inside the prolog, i.e. after
    // "<?xml" and before "?>".  Text that happens to contain "encoding"
    // ahead of the prolog must not be mistaken for the attribute.
    if ((piStartLoc >= piEndLoc) || (aBegin <= piStartLoc) || (aBegin >= piEndLoc))
        {
        return EFalse;
        }

    TBool  foundEquals = EFalse;
    TInt   quoteCount = 0;

    // Walk from just past "encoding" to the end of the prolog.
    for (TInt i = aBegin + KEncoding().Length(); i < piEndLoc; ++i)
        {
        const TUint16  c = aBuffer[i];

        if (!foundEquals)
            {
            // The '=' char must be the first non-whitespace after "encoding".
            if (c == '=')
                {
                foundEquals = ETrue;
                }
            else if ((c != 0x20) && (c != 0x09) && (c != 0x0D) && (c != 0x0A))
                {
                // Give up.
                return EFalse;
                }
            }
        else if ((c == '\"') || (c == '\''))
            {
            // After the '=': the first quote opens the value, the second
            // one closes it.
            quoteCount++;

            if (quoteCount == 1)
                {
                aValueBegin = i + 1;
                }
            else if (quoteCount == 2)
                {
                aEnd = i;
                aValueEnd = i - 1;
                break;
                }
            }
        }

    // Both quotes must have been seen before the end of the prolog.
    return ((aValueBegin != KErrNotFound) && (aValueEnd != KErrNotFound));
    }