browserutilities/feedsengine/FeedsServer/XmlUtils/src/XmlEncoding.cpp
changeset 0 dd21522fd290
child 36 0ed94ceaa377
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/browserutilities/feedsengine/FeedsServer/XmlUtils/src/XmlEncoding.cpp	Mon Mar 30 12:54:55 2009 +0300
@@ -0,0 +1,614 @@
+/*
+* Copyright (c) 2005 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of the License "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description:  Miscellaneous character-encoding utilities.
+*
+*/
+
+
+#include <charconv.h>
+
+#include "XmlEncoding.h"
+
+
+// -----------------------------------------------------------------------------
+// CXmlEncoding::NewL
+//
+// Two-phased constructor.  Allocates the object, runs ConstructL() on it and
+// returns it.  Leaves if the allocation or ConstructL() fails.
+// -----------------------------------------------------------------------------
+//
+EXPORT_C CXmlEncoding* CXmlEncoding::NewL()
+    {
+    CXmlEncoding* self = new (ELeave) CXmlEncoding();
+
+    // Keep the instance on the cleanup stack while ConstructL() may leave.
+    CleanupStack::PushL(self);
+    self->ConstructL();
+
+    // Name the expected item, as the other Pop()/PopAndDestroy() calls in this
+    // file do, so that an unbalanced cleanup stack is detected here.
+    CleanupStack::Pop(self);
+
+    return self;
+    }
+
+        
+// -----------------------------------------------------------------------------
+// CXmlEncoding::CXmlEncoding
+//
+// C++ default constructor.  It must NOT contain any code that might leave;
+// the initialisation that can leave is done in ConstructL().
+// -----------------------------------------------------------------------------
+//
+CXmlEncoding::CXmlEncoding():
+        // NOTE(review): 5 is presumably the granularity of the encoding cache
+        // array -- confirm against the member declaration in XmlEncoding.h.
+        iCharEncodings(5)
+    {
+    }
+        
+
+// -----------------------------------------------------------------------------
+// CXmlEncoding::ConstructL
+//
+// Symbian 2nd phase constructor.  Connects the file-server session and creates
+// the character-set converter.  Leaves if either step fails.
+// -----------------------------------------------------------------------------
+//
+void CXmlEncoding::ConstructL()
+    {
+    // The session is handed to the converter by the methods further down.
+    const TInt connectErr = iRfs.Connect();
+    User::LeaveIfError(connectErr);
+
+    iConverter = CCnvCharacterSetConverter::NewL();
+    }
+        
+
+// -----------------------------------------------------------------------------
+// CXmlEncoding::~CXmlEncoding
+//
+// Destructor.  Closes the file-server session, deletes the converter and
+// frees the cached encoding map.
+// -----------------------------------------------------------------------------
+//
+CXmlEncoding::~CXmlEncoding()
+    {
+    iRfs.Close();
+    delete iConverter;
+
+    // Each cached entry owns the heap descriptor holding its encoding name;
+    // free those before closing the array itself.
+    const TInt entryCount = iCharEncodings.Count();
+    for (TInt index = 0; index < entryCount; ++index)
+        {
+        delete iCharEncodings[index].charEncoding;
+        }
+
+    iCharEncodings.Close();
+    }
+
+
+// -----------------------------------------------------------------------------
+// CXmlEncoding::ResolveCharEncodingL
+//
+// Resolves the given char-encoding name into its uid.
+//
+// aCharEncoding  The encoding name to look up (compared with CompareF against
+//                the cached names).
+// aUid           Set to the resolved uid; only written when ETrue is returned.
+//
+// Returns ETrue if the name was resolved, EFalse otherwise.  Leaves if the
+// converter lookup leaves or if the result can not be added to the cache.
+// -----------------------------------------------------------------------------
+//
+TBool CXmlEncoding::ResolveCharEncodingL(const TDesC8& aCharEncoding, TUint& aUid) const
+    {
+    // Serve the request from the cache when the name was resolved before.
+    const TInt cachedCount = iCharEncodings.Count();
+    for (TInt index = 0; index < cachedCount; ++index)
+        {
+        if (iCharEncodings[index].charEncoding->CompareF(aCharEncoding) == 0)
+            {
+            aUid = iCharEncodings[index].uid;
+            return ETrue;
+            }
+        }
+
+    // Cache miss: ask the CCnvCharacterSetConverter.  The method is const but
+    // needs the non-const session and has to update the cache.
+    CXmlEncoding*  mutableThis = const_cast<CXmlEncoding*>(this);
+    const TUint    resolvedUid = iConverter->ConvertStandardNameOfCharacterSetToIdentifierL(
+            aCharEncoding, mutableThis->iRfs);
+
+    // A uid of zero is treated as "not found".
+    if (resolvedUid == 0)
+        {
+        return EFalse;
+        }
+
+    // Remember the name/uid pair for the next lookup.
+    SupportedEncodings  entry;
+
+    entry.charEncoding = aCharEncoding.AllocL();
+    entry.uid = resolvedUid;
+
+    const TInt appendErr = mutableThis->iCharEncodings.Append(entry);
+    if (appendErr != KErrNone)
+        {
+        // The array did not take the entry, so the copied name is still ours.
+        delete entry.charEncoding;
+        User::Leave(appendErr);
+        }
+
+    aUid = resolvedUid;
+    return ETrue;
+    }
+
+
+// -----------------------------------------------------------------------------
+// CXmlEncoding::DetermineCharEncodingL
+//
+// Determine the char-encoding of aBuffer.  The sources are tried in this
+// order: the BOM (byte order mark), the xml-prolog, and finally aCharSet, the
+// char-set provided by the original source.
+//
+// Returns ETrue and sets aEncoding if one of the sources could be resolved.
+// -----------------------------------------------------------------------------
+//
+EXPORT_C TBool CXmlEncoding::DetermineCharEncodingL(const TDesC8& aBuffer, 
+        const TDesC& aCharSet, TUint& aEncoding) const
+    {
+    // 1) The BOM.
+    if (DetermineCharEncodingFromBom(aBuffer, aEncoding))
+        {
+        return ETrue;
+        }
+
+    // 2) The encoding attribute in the xml-prolog.
+    if (DetermineCharEncodingFromXmlProlog(aBuffer, aEncoding))
+        {
+        return ETrue;
+        }
+
+    // 3) The char-set supplied by the caller.  The resolver takes an 8bit
+    //    name, so make an 8bit copy first.
+    HBufC8*  narrowCharSet = HBufC8::NewLC(aCharSet.Length());
+    narrowCharSet->Des().Append(aCharSet);
+
+    const TBool resolved = ResolveCharEncodingL(*narrowCharSet, aEncoding);
+    CleanupStack::PopAndDestroy(narrowCharSet);
+
+    return resolved;
+    }
+
+
+// -----------------------------------------------------------------------------
+// CXmlEncoding::DetermineCharEncodingFromBom
+//
+// Determine the char-encoding from the BOM (byte order mark) at the start of
+// aBuffer.  Recognised marks: FE FF (UTF-16BE), FF FE (UTF-16LE) and
+// EF BB BF (UTF-8).
+//
+// Returns ETrue and sets aEncoding if a BOM was found and its encoding name
+// could be resolved.  A leave from the resolver is ignored and reported as
+// "not found".
+// -----------------------------------------------------------------------------
+//
+TBool CXmlEncoding::DetermineCharEncodingFromBom(const TDesC8& aBuffer, 
+        TUint& aEncoding) const
+    {
+    _LIT8(KUcs2Big, "UTF-16BE");
+    _LIT8(KUcs2Little, "UTF-16LE");
+    _LIT8(KUtf8, "UTF-8");
+
+    TBool       foundEncoding = EFalse;
+    const TInt  length = aBuffer.Length();
+
+    // The shortest BOM (UTF-16) is two bytes long.
+    if (length < 2)
+        {
+        return EFalse;
+        }
+
+    const TUint8 c1 = aBuffer[0];
+    const TUint8 c2 = aBuffer[1];
+
+    // Check for the BOM.
+    if ((c1 == 0xFE) && (c2 == 0xFF))
+        {
+        TRAP_IGNORE(foundEncoding = ResolveCharEncodingL(KUcs2Big, aEncoding));
+        }
+    else if ((c1 == 0xFF) && (c2 == 0xFE))
+        {
+        TRAP_IGNORE(foundEncoding = ResolveCharEncodingL(KUcs2Little, aEncoding));
+        }
+    // Only the UTF-8 mark needs a third byte, so only this branch reads it.
+    else if ((length >= 3) && (c1 == 0xEF) && (c2 == 0xBB) && (aBuffer[2] == 0xBF))
+        {
+        TRAP_IGNORE(foundEncoding = ResolveCharEncodingL(KUtf8, aEncoding));
+        }
+
+    return foundEncoding;
+    }
+
+
+// -----------------------------------------------------------------------------
+// CXmlEncoding::DetermineCharEncodingFromXmlProlog
+//
+// Determine the char-encoding from the encoding attribute in the xml-prolog.
+//
+// Returns ETrue and sets aEncoding if the attribute was found and its value
+// could be resolved.  A leave from the resolver is ignored and reported as
+// "not found".
+// -----------------------------------------------------------------------------
+//
+TBool CXmlEncoding::DetermineCharEncodingFromXmlProlog(const TDesC8& aBuffer, 
+        TUint& aEncoding) const
+    {
+    TInt  attrBegin;
+    TInt  attrEnd;
+    TInt  valueBegin;
+    TInt  valueEnd;
+
+    // Nothing to resolve without an encoding attribute.
+    if (!FindEncoding(aBuffer, attrBegin, attrEnd, valueBegin, valueEnd))
+        {
+        return EFalse;
+        }
+
+    // valueBegin/valueEnd are the inclusive bounds of the text between the quotes.
+    const TPtrC8  encodingName(aBuffer.Mid(valueBegin, valueEnd - valueBegin + 1));
+    TBool         resolved = EFalse;
+
+    TRAP_IGNORE(resolved = ResolveCharEncodingL(encodingName, aEncoding));
+
+    return resolved;
+    }
+
+
+// -----------------------------------------------------------------------------
+// CXmlEncoding::StripCharEncoding
+//
+// Erase the char-encoding attribute (if any) from the xml-prolog of the given
+// 8bit buffer.  The buffer is left untouched when no attribute is found.
+// -----------------------------------------------------------------------------
+//
+void CXmlEncoding::StripCharEncoding(HBufC8& aUtf8Buffer) const
+    {
+    TInt  attrBegin;
+    TInt  attrEnd;
+    TInt  valueBegin;
+    TInt  valueEnd;
+
+    if (!FindEncoding(aUtf8Buffer, attrBegin, attrEnd, valueBegin, valueEnd))
+        {
+        return;
+        }
+
+    // attrBegin..attrEnd is inclusive: from the 'e' of "encoding" through the
+    // closing quote.
+    const TInt attrLength = attrEnd - attrBegin + 1;
+    aUtf8Buffer.Des().Delete(attrBegin, attrLength);
+    }
+
+
+// -----------------------------------------------------------------------------
+// CXmlEncoding::StripCharEncoding
+//
+// Erase the char-encoding attribute (if any) from the xml-prolog of the given
+// 16bit buffer.  The buffer is left untouched when no attribute is found.
+// -----------------------------------------------------------------------------
+//
+void CXmlEncoding::StripCharEncoding(HBufC16& aUcs2Buffer) const
+    {
+    TInt  attrBegin;
+    TInt  attrEnd;
+    TInt  valueBegin;
+    TInt  valueEnd;
+
+    if (!FindEncoding(aUcs2Buffer, attrBegin, attrEnd, valueBegin, valueEnd))
+        {
+        return;
+        }
+
+    // attrBegin..attrEnd is inclusive: from the 'e' of "encoding" through the
+    // closing quote.
+    const TInt attrLength = attrEnd - attrBegin + 1;
+    aUcs2Buffer.Des().Delete(attrBegin, attrLength);
+    }
+
+
+// -----------------------------------------------------------------------------
+// CXmlEncoding::ConvertToUtf8L
+//
+// Converts the given native buffer (in the encoding aEncoding) into utf8, going
+// through ucs2.  Returns the newly allocated utf8 buffer.  Leaves if either
+// conversion step leaves.
+// -----------------------------------------------------------------------------
+//
+EXPORT_C HBufC8* CXmlEncoding::ConvertToUtf8L(TUint aEncoding, const TDesC8& aBuffer) const
+    {
+    // Step 1: native -> ucs2.
+    HBufC16*  ucs2Text = ConvertToUcs2L(aEncoding, aBuffer);
+    CleanupStack::PushL(ucs2Text);
+
+    // Step 2: ucs2 -> utf8.  The intermediate buffer is not needed afterwards.
+    HBufC8*  utf8Text = ConvertToUtf8L(*ucs2Text);
+    CleanupStack::PopAndDestroy(ucs2Text);
+
+    // Erase the char-encoding attribute (if any) from the xml-prolog.  It forces
+    // Libxml2 to use the default encoding, utf8.
+    StripCharEncoding(*utf8Text);
+
+    return utf8Text;
+    }
+
+
+// -----------------------------------------------------------------------------
+// CXmlEncoding::ConvertToUcs2L
+//
+// Converts the given native buffer into ucs2.
+//
+// aEncoding  The uid of the char-set aBuffer is encoded in.
+// aBuffer    The text to convert.
+//
+// Returns the newly allocated ucs2 buffer with the encoding attribute (if any)
+// removed from its xml-prolog.
+//
+// Leaves with KErrNotSupported if the converter is not available for
+// aEncoding, with KErrCorrupt if the input is ill-formed or can not be
+// consumed, with KErrGeneral on any other converter error, or with a system
+// error if an allocation fails.
+// -----------------------------------------------------------------------------
+//
+EXPORT_C HBufC16* CXmlEncoding::ConvertToUcs2L(TUint aEncoding, const TDesC8& aBuffer) const
+    {
+    TBuf<100>  temp16Buffer;
+    HBufC*     unicode = NULL;
+    TPtrC8     source8Ptr(aBuffer);
+
+    // The conversion state has to survive from one ConvertToUnicode() call to
+    // the next within the same buffer, so it is initialised once, outside the
+    // loop.  Resetting it per chunk would lose the shift state of modal
+    // encodings.
+    TInt       state = CCnvCharacterSetConverter::KStateDefault;
+
+    // Init the converter and ensure the encoding is supported.
+    if (iConverter->PrepareToConvertToOrFromL(aEncoding, const_cast<CXmlEncoding*>(this)->iRfs) != 
+            CCnvCharacterSetConverter::EAvailable)
+        {
+        User::Leave(KErrNotSupported);
+        }
+
+    for (;;) // conversion loop, one temp16Buffer-sized chunk per pass
+        {
+        const TInt returnValue = iConverter->ConvertToUnicode(temp16Buffer, source8Ptr, state);
+
+        if (returnValue == CCnvCharacterSetConverter::EErrorIllFormedInput)
+            {
+            User::Leave(KErrCorrupt);
+            }
+        else if (returnValue < 0) // future-proof against "TError" expanding
+            {
+            User::Leave(KErrGeneral);
+            }
+
+        // Append the converted chunk to the result.  From the first chunk
+        // onwards the result stays on the cleanup stack.
+        if (!unicode)
+            {
+            unicode = temp16Buffer.AllocLC();
+            }
+        else
+            {
+            HBufC* grown = unicode->ReAllocL(unicode->Length() + temp16Buffer.Length());
+
+            // ReAllocL() may have moved the cell; swap the cleanup-stack entry.
+            CleanupStack::Pop(unicode);
+            unicode = grown;
+            CleanupStack::PushL(unicode);
+            unicode->Des().Append(temp16Buffer);
+            }
+
+        if (returnValue == 0)   // All is converted without errors
+            {
+            break;
+            }
+
+        // "returnValue" bytes are still unconverted.  If that is everything that
+        // was passed in, the converter made no progress (e.g. a truncated
+        // sequence at the end of the input) and would never do so; give up
+        // instead of looping forever.
+        if (returnValue >= source8Ptr.Length())
+            {
+            User::Leave(KErrCorrupt);
+            }
+
+        source8Ptr.Set(source8Ptr.Right(returnValue));
+        }
+
+    // Erase the char-encoding attribute (if any) from the xml-prolog -- as the
+    // encoding is no longer valid.
+    StripCharEncoding(*unicode);
+
+    CleanupStack::Pop(unicode);
+    return unicode;
+    }
+
+
+// -----------------------------------------------------------------------------
+// CXmlEncoding::ConvertToUtf8L
+//
+// Converts the given ucs2 buffer into utf8.
+//
+// Returns the newly allocated utf8 buffer.
+//
+// Leaves with KErrNotSupported if the utf8 converter is not available, with
+// KErrCorrupt if the input is ill-formed or can not be consumed, with
+// KErrGeneral on any other converter error, or with a system error if an
+// allocation fails.
+// -----------------------------------------------------------------------------
+//
+HBufC8* CXmlEncoding::ConvertToUtf8L(HBufC16& aUsc2Buffer) const
+    {
+    TBuf8<100>  temp8Buffer;
+    HBufC8*     target = NULL;
+    TPtrC       source16Ptr(aUsc2Buffer);
+
+    // Init the converter and ensure the encoding is supported.
+    if (iConverter->PrepareToConvertToOrFromL(KCharacterSetIdentifierUtf8, 
+            const_cast<CXmlEncoding*>(this)->iRfs) != CCnvCharacterSetConverter::EAvailable)
+        {
+        User::Leave(KErrNotSupported);
+        }
+
+    for (;;) // conversion loop, one temp8Buffer-sized chunk per pass
+        {
+        // NOTE(review): per the CCnvCharacterSetConverter API the third argument
+        // of this ConvertFromUnicode() overload receives the number of
+        // unconvertible characters; it is not a conversion state.  The value is
+        // not used here.
+        TInt  unconvertibleCount = 0;
+
+        const TInt returnValue = iConverter->ConvertFromUnicode(temp8Buffer, source16Ptr, 
+                unconvertibleCount);
+
+        if (returnValue == CCnvCharacterSetConverter::EErrorIllFormedInput)
+            {
+            User::Leave(KErrCorrupt);
+            }
+        else if (returnValue < 0) // future-proof against "TError" expanding
+            {
+            User::Leave(KErrGeneral);
+            }
+
+        // Append the converted chunk to the result.  From the first chunk
+        // onwards the result stays on the cleanup stack.
+        if (!target)
+            {
+            target = temp8Buffer.AllocLC();
+            }
+        else
+            {
+            HBufC8* grown = target->ReAllocL(target->Length() + temp8Buffer.Length());
+
+            // ReAllocL() may have moved the cell; swap the cleanup-stack entry.
+            CleanupStack::Pop(target);
+            target = grown;
+            CleanupStack::PushL(target);
+            target->Des().Append(temp8Buffer);
+            }
+
+        if (returnValue == 0)   // All is converted without errors
+            {
+            break;
+            }
+
+        // "returnValue" ucs2 elements are still unconverted.  If that is
+        // everything that was passed in, the converter made no progress and
+        // would never do so; give up instead of looping forever.
+        if (returnValue >= source16Ptr.Length())
+            {
+            User::Leave(KErrCorrupt);
+            }
+
+        source16Ptr.Set(source16Ptr.Right(returnValue));
+        }
+
+    CleanupStack::Pop(target);
+
+    return target;
+    }
+
+
+// -----------------------------------------------------------------------------
+// LocateEncodingAttribute
+//
+// Shared implementation of the 8bit and 16bit CXmlEncoding::FindEncoding
+// overloads.  TDescriptor is TDesC8 or TDesC16; the three search strings are
+// passed in because their literals differ in width.
+//
+// The prolog is the text between the first aPiStart and the first aPiEnd that
+// follows it.  Only that range is searched for aEncoding, so text outside the
+// prolog can never be reported (and later stripped).  All searches are done
+// with FindF, as before.
+//
+// On success returns ETrue with:
+//   aBegin       index of the first char of the attribute name
+//   aEnd         index of the closing quote
+//   aValueBegin  index of the first char after the opening quote
+//   aValueEnd    index of the last char before the closing quote
+// On failure returns EFalse with all four set to KErrNotFound.
+//
+// The opening and the closing quote are not required to be the same char; any
+// two of ' and " delimit the value.
+// -----------------------------------------------------------------------------
+//
+template <class TDescriptor>
+static TBool LocateEncodingAttribute(const TDescriptor& aBuffer, const TDescriptor& aPiStart,
+        const TDescriptor& aPiEnd, const TDescriptor& aEncoding, TInt& aBegin, TInt& aEnd,
+        TInt& aValueBegin, TInt& aValueEnd)
+    {
+    aBegin = KErrNotFound;
+    aEnd = KErrNotFound;
+    aValueBegin = KErrNotFound;
+    aValueEnd = KErrNotFound;
+
+    // Find the start of the prolog.
+    const TInt piStartLoc = aBuffer.FindF(aPiStart);
+    if (piStartLoc == KErrNotFound)
+        {
+        return EFalse;
+        }
+
+    // Find the end of the prolog, looking only behind its start.
+    const TInt prologBodyLoc = piStartLoc + aPiStart.Length();
+    const TInt piEndOffset = aBuffer.Mid(prologBodyLoc).FindF(aPiEnd);
+    if (piEndOffset == KErrNotFound)
+        {
+        return EFalse;
+        }
+    const TInt piEndLoc = prologBodyLoc + piEndOffset;
+
+    // Find the attribute name, looking only inside the prolog.
+    const TInt attrOffset = aBuffer.Mid(prologBodyLoc, piEndOffset).FindF(aEncoding);
+    if (attrOffset == KErrNotFound)
+        {
+        return EFalse;
+        }
+    const TInt attrLoc = prologBodyLoc + attrOffset;
+
+    // Scan the rest of the prolog for '=' followed by the quoted value.
+    TBool  foundEquals = EFalse;
+    TInt   valueBegin = KErrNotFound;
+
+    for (TInt i = attrLoc + aEncoding.Length(); i < piEndLoc; i++)
+        {
+        const TUint c = aBuffer[i];
+
+        // The '=' char must be the first non-whitespace after "encoding".
+        if (!foundEquals)
+            {
+            if (c == '=')
+                {
+                foundEquals = ETrue;
+                }
+            else if ((c != 0x20) && (c != 0x09) && (c != 0x0D) && (c != 0x0A))
+                {
+                // Give up.
+                return EFalse;
+                }
+            }
+
+        // Otherwise look for the two quotes; anything else is skipped.
+        else if ((c == '\"') || (c == '\''))
+            {
+            if (valueBegin == KErrNotFound)
+                {
+                // Opening quote.
+                valueBegin = i + 1;
+                }
+            else
+                {
+                // Closing quote: the attribute is complete.
+                aBegin = attrLoc;
+                aEnd = i;
+                aValueBegin = valueBegin;
+                aValueEnd = i - 1;
+                return ETrue;
+                }
+            }
+        }
+
+    // The prolog ended before the value was closed.
+    return EFalse;
+    }
+
+
+// -----------------------------------------------------------------------------
+// CXmlEncoding::FindEncoding
+//
+// Finds the location of the encoding attribute in the xml-prolog of an 8bit
+// buffer.  aBegin/aEnd receive the inclusive bounds of the whole attribute
+// (name through closing quote), aValueBegin/aValueEnd those of its value.
+// Returns ETrue if the attribute was found.
+// Refer to: http://www.w3.org/TR/2000/REC-xml-20001006#sec-prolog-dtd.
+// -----------------------------------------------------------------------------
+//
+TBool CXmlEncoding::FindEncoding(const TDesC8& aBuffer, TInt& aBegin, TInt& aEnd, 
+        TInt& aValueBegin, TInt& aValueEnd) const
+    {
+    _LIT8(KPiStart, "<?xml");
+    _LIT8(KPiEnd, "?>");
+    _LIT8(KEncoding, "encoding");
+
+    return LocateEncodingAttribute(aBuffer, KPiStart(), KPiEnd(), KEncoding(), 
+            aBegin, aEnd, aValueBegin, aValueEnd);
+    }
+
+
+// -----------------------------------------------------------------------------
+// CXmlEncoding::FindEncoding
+//
+// Finds the location of the encoding attribute in the xml-prolog of a 16bit
+// buffer.  aBegin/aEnd receive the inclusive bounds of the whole attribute
+// (name through closing quote), aValueBegin/aValueEnd those of its value.
+// Returns ETrue if the attribute was found.
+// Refer to: http://www.w3.org/TR/2000/REC-xml-20001006#sec-prolog-dtd.
+// -----------------------------------------------------------------------------
+//
+TBool CXmlEncoding::FindEncoding(const TDesC& aBuffer, TInt& aBegin, 
+        TInt& aEnd, TInt& aValueBegin, TInt& aValueEnd) const
+    {
+    _LIT16(KPiStart, "<?xml");
+    _LIT16(KPiEnd, "?>");
+    _LIT16(KEncoding, "encoding");
+
+    return LocateEncodingAttribute(aBuffer, KPiStart(), KPiEnd(), KEncoding(), 
+            aBegin, aEnd, aValueBegin, aValueEnd);
+    }