filehandling/htmltorichtextconverter/src/CHtmlToCrtConvCharsetConvert.cpp
author andy simpson <andrews@symbian.org>
Thu, 02 Sep 2010 15:49:18 +0100
branchRCL_3
changeset 64 75184094ace1
parent 0 2e3d3ce01487
permissions -rw-r--r--
Merge after removal of incorrect RCL_3 drop

// Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).
// All rights reserved.
// This component and the accompanying materials are made available
// under the terms of "Eclipse Public License v1.0"
// which accompanies this distribution, and is available
// at the URL "http://www.eclipse.org/legal/epl-v10.html".
//
// Initial Contributors:
// Nokia Corporation - initial contribution.
//
// Contributors:
//
// Description:
//

#include <s32file.h>

#include "CHtmlToCrtConvCharsetConvert.h"
#include "CHtmlToCrtConvBuffer.h"
#include "CHtmlToCrtConverter.h"
#include "MHtmlToCrtConvResourceFile.h"
#include <chtmltocrtconverter.rsg>

const TInt KSampleBufferSize = 256;
const TInt KMetaTagCharsetValueBufferLength	= 40;
const TInt KMimimumConfidenceLevel = 50;

//==================================================================================
//NewL - two-phase construction
//Allocates the object with new(ELeave), keeps it on the cleanup stack while
//ConstructL runs, then pops it and hands it back to the caller
//==================================================================================
CHtmlToCrtConvCharsetConvert* CHtmlToCrtConvCharsetConvert::NewL(CHtmlToCrtConvBuffer& aBuffer, RFs& aFsSession, CCnvCharacterSetConverter& aCnvCharacterSetConverter, MHtmlToCrtConvResourceFile& aResourceFile, MConverterUiObserver* aObserver)
	{
	//note: the constructor takes its arguments in a different order from NewL
	CHtmlToCrtConvCharsetConvert* converter=
		new(ELeave) CHtmlToCrtConvCharsetConvert(aBuffer, aObserver, aFsSession, aCnvCharacterSetConverter, aResourceFile);

	//protect the half-built object in case second-phase construction leaves
	CleanupStack::PushL(converter);
	converter->ConstructL();
	CleanupStack::Pop(converter);

	return converter;
	}

//Second-phase constructor: builds the array of available character sets
//(via CCnvCharacterSetConverter::CreateArrayOfCharacterSetsAvailableL, using
//iFsSession) and stores it in iArrayOfCharacterSetsAvailable.
void CHtmlToCrtConvCharsetConvert::ConstructL()
	{
	iArrayOfCharacterSetsAvailable=CCnvCharacterSetConverter::CreateArrayOfCharacterSetsAvailableL(iFsSession);
	}

//First-phase constructor: only stores the supplied collaborators in members.
//No other work is done here; the character set array is created in ConstructL.
//aObserver is stored as a pointer and is null-checked before use in
//GetCharSetFromPasswordL.
CHtmlToCrtConvCharsetConvert::CHtmlToCrtConvCharsetConvert(CHtmlToCrtConvBuffer& aBuffer, MConverterUiObserver* aObserver, RFs& aFsSession, CCnvCharacterSetConverter& aCnvCharacterSetConverter, MHtmlToCrtConvResourceFile& aResourceFile)
:iCnvCharacterSetConverter(aCnvCharacterSetConverter)
,iBuffer(aBuffer)
,iObserver(aObserver)
,iResourceFile(aResourceFile)
,iFsSession(aFsSession)
	{
	}

//Destructor: frees the array of available character sets that ConstructL
//created. Nothing else is deleted here.
CHtmlToCrtConvCharsetConvert::~CHtmlToCrtConvCharsetConvert()
	{
	delete iArrayOfCharacterSetsAvailable;
	}

//==================================================================================
//GetCharSetFromPasswordL - asks the observer for a character set name
//Uses MConverterUiObserver::QueryPasswordL (with KHtmlToCrtRequestCharset as the
//clue) to obtain a unicode charset name, narrows it to 8 bit and maps it to a
//character set identifier, which is stored in iCharacterSetIdentifier.
//Returns iCharacterSetIdentifier; when there is no observer, or the observer
//supplies no name, that is whatever value the member already held.
//==================================================================================
TBool CHtmlToCrtConvCharsetConvert::GetCharSetFromPasswordL()
	{
	if(!iObserver)
		{
		//nobody to ask
		return iCharacterSetIdentifier;
		}

	//NOTE(review): the returned HBufC16 is never deleted in this function -
	//confirm whether QueryPasswordL transfers ownership to the caller
	HBufC16* requestedName=iObserver->QueryPasswordL(KHtmlToCrtRequestCharset);
	if(!requestedName)
		{
		//observer supplied no name
		return iCharacterSetIdentifier;
		}

	//narrow the unicode name into an 8 bit buffer of the same length
	HBufC8* narrowName=HBufC8::NewLC(requestedName->Length());
	ConvertFromUnicodeL(*requestedName, *narrowName);

	//map the standard name of the character set to its TUint identifier
	iCharacterSetIdentifier=iCnvCharacterSetConverter.ConvertStandardNameOfCharacterSetToIdentifierL(*narrowName, iFsSession);
	CleanupStack::PopAndDestroy(narrowName);

	return iCharacterSetIdentifier;
	}

//==================================================================================
//GetCharSetFromMetaTagL - searches a sample of the source for a META tag and, if
//the tag carries a charset value, maps that name to a character set identifier
//e.g. <META http-equiv="Content-Type" content="text/html; charset = us-ascii">
//The sample is KSampleBufferSize bytes requested at offset 0, so only a tag that
//lies within that sample can be seen.
//Returns iCharacterSetIdentifier, which is left untouched unless "<meta", ">",
//"charset" and "=" are all found in that order.
//==================================================================================
TBool CHtmlToCrtConvCharsetConvert::GetCharSetFromMetaTagL()
	{
	_LIT8(KMeta, "<meta");
	_LIT8(KClosingTagIndicator, ">");
	_LIT8(KCharset, "charset");
	_LIT8(KEquals, "=");

	HBufC8* sampleHeap=HBufC8::NewLC(KSampleBufferSize);
	TPtr8 sample(sampleHeap->Des());
	TInt startPosition=0;
	iBuffer.GetSampleOfTextFromFileL(sample, KSampleBufferSize, startPosition);

	//strip everything up to and including "<meta"
	if(FindAndDeleteString(sample, KMeta)>=0)
		{
		const TInt tagEnd=sample.FindF(KClosingTagIndicator);
		if(tagEnd>=0)
			{
			//cut off the closing > and whatever follows it, leaving only the
			//inside of the META tag - Delete() clamps the length to the buffer end
			sample.Delete(tagEnd, sample.Length());

			//"charset" must be present and followed (somewhere) by "=";
			//the second search only runs when the first one succeeded
			const TBool reachedValue=
				FindAndDeleteString(sample, KCharset)>=0 &&
				FindAndDeleteString(sample, KEquals)>=0;
			if(reachedValue)
				{
				//reduce the remainder to just the charset name
				ParseCharsetValue(sample);
				//ConvertStandardNameOfCharacterSetToIdentifierL returns the UID of a
				//character set for a given Internet-standard name
				//zero is returned if none found
				iCharacterSetIdentifier=iCnvCharacterSetConverter.ConvertStandardNameOfCharacterSetToIdentifierL(sample, iFsSession);
				}
			}
		}

	CleanupStack::PopAndDestroy(sampleHeap);
	return iCharacterSetIdentifier;
	}

//==================================================================================
//FindAndDeleteString - folded search for aString inside aBuffer
//If found, everything from the start of aBuffer up to and including the match
//is deleted, so aBuffer then begins just after the match.
//Returns the position at which aString was found (before deletion), or the
//negative value from FindF if it was not found, in which case aBuffer is unchanged.
//==================================================================================
TInt CHtmlToCrtConvCharsetConvert::FindAndDeleteString(TDes8& aBuffer, const TDesC8& aString)
	{
	const TInt matchPosition=aBuffer.FindF(aString);
	if(matchPosition<0)
		{
		//not present - nothing to remove
		return matchPosition;
		}

	//number of characters to drop: the prefix plus the matched string itself
	const TInt charsToRemove=matchPosition+aString.Length();
	ASSERT(charsToRemove <= aBuffer.Length());
	aBuffer.Delete(0, charsToRemove);

	return matchPosition;
	}
//==================================================================================
//ParseCharsetValue - reduces aBuffer to the charset name found at its start
//Leading white space is trimmed. One opening quote (" or ') is skipped, so that
//both of these forms yield "utf-8":
//    content="text/html; charset=utf-8"      (value ends at the closing ")
//    <meta charset="utf-8">                  (value is itself quoted)
//The value is then read until a quote or white space character is seen, the end
//of aBuffer is reached, or KMetaTagCharsetValueBufferLength characters have been
//collected (longer values are truncated).
//On return aBuffer holds only the charset name; it may be empty. An aBuffer that
//is empty after trimming is left empty.
//==================================================================================
void CHtmlToCrtConvCharsetConvert::ParseCharsetValue(TDes8& aBuffer)
	{
	aBuffer.TrimLeft();
	const TInt length=aBuffer.Length();
	if(!length)
		{
		return;
		}

	TInt ii=0;

	//a value that is itself quoted starts with the quote character; without this
	//the opening quote would be taken as the terminator and the name would be empty
	TChar character=aBuffer[0];
	if(character=='"' || character=='\'')
		{
		++ii;
		}

	TBuf8<KMetaTagCharsetValueBufferLength> charsetBuffer;
	//stop collecting once the local buffer is full so Append() can never overflow
	while(ii<length && charsetBuffer.Length()<KMetaTagCharsetValueBufferLength)
		{
		character=aBuffer[ii++];
		if(character.IsSpace() || character=='"' || character=='\'')
			{
			//end of value
			break;
			}
		charsetBuffer.Append(character);
		}

	aBuffer.Copy(charsetBuffer);
	}
//==================================================================================
//GetCharSetUsingAutoDetectL - lets the character set converter guess the
//character set from a sample of the source
//A detected ASCII result is widened to Code Page 1252.
//Returns ETrue if the reported confidence (range 0-100) is at least
//KMimimumConfidenceLevel; otherwise iCharacterSetIdentifier is reset to zero
//and EFalse is returned. An empty sample leaves the confidence at 0.
//==================================================================================
TBool CHtmlToCrtConvCharsetConvert::GetCharSetUsingAutoDetectL()
	{
	HBufC8* sampleHeap=HBufC8::NewLC(KSampleBufferSize);
	TPtr8 sample(sampleHeap->Des());

	//it's likely that the start of the file will contain the header, so the
	//sample is requested from a position further into the file
	TInt startPosition=256;
	iBuffer.GetSampleOfTextFromFileL(sample, KSampleBufferSize, startPosition);

	TInt confidence=0;
	if(sample.Length()>0)
		{
		iCnvCharacterSetConverter.AutoDetectCharacterSetL(confidence, iCharacterSetIdentifier, *iArrayOfCharacterSetsAvailable, sample);
		}
	CleanupStack::PopAndDestroy(sampleHeap);

	//In order to support Latin letters, the character set needs to
	//be widened from ASCII to Code Page 1252
	if(KCharacterSetIdentifierAscii == iCharacterSetIdentifier)
		{
		iCharacterSetIdentifier = KCharacterSetIdentifierCodePage1252;
		}

	if(confidence>=KMimimumConfidenceLevel)
		{
		return ETrue;
		}

	//low level of confidence in the chosen character set, so reset
	// - this method of determining character set not successful
	iCharacterSetIdentifier=0;
	return EFalse;
	}

//=============================================================
//PrepareForConvertToUnicodeL
//=============================================================
void CHtmlToCrtConvCharsetConvert::PrepareForConvertToUnicodeL()
	{
	if(!iCharacterSetIdentifier)
		{
		//get default character set for locale from resource file
		HBufC* unicodeDefaultCharacterSet=iResourceFile.ReadResourceHBufCLC(R_CNV_DEFAULT_CHARACTER_SET);

		//convert from unicode
		HBufC8* defaultCharacterSet=HBufC8::NewLC(unicodeDefaultCharacterSet->Length());
		ConvertFromUnicodeL(*unicodeDefaultCharacterSet, *defaultCharacterSet);
		
		//convert standard name of character set to TUint identifier
		iCharacterSetIdentifier=iCnvCharacterSetConverter.ConvertStandardNameOfCharacterSetToIdentifierL(*(defaultCharacterSet), iFsSession);
		CleanupStack::PopAndDestroy(2); //defaultCharacterSet, unicodeDefaultCharacterSet
		}

	iCnvCharacterSetConverter.PrepareToConvertToOrFromL(iCharacterSetIdentifier, *iArrayOfCharacterSetsAvailable, iFsSession);
	}
//=============================================================
//ConvertFromUnicodeL
//=============================================================
void CHtmlToCrtConvCharsetConvert::ConvertFromUnicodeL(const TDesC& aUnicode, HBufC8& aNarrow)
	{
	TPtr8 ptrNarrow(aNarrow.Des());	
	iCnvCharacterSetConverter.PrepareToConvertToOrFromL(KCharacterSetIdentifierAscii, *iArrayOfCharacterSetsAvailable, iFsSession);
	iCnvCharacterSetConverter.ConvertFromUnicode(ptrNarrow, aUnicode);
	}