diff -r 000000000000 -r 1fb32624e06b charconvfw/charconvplugins/src/plugins/hz.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/charconvfw/charconvplugins/src/plugins/hz.cpp Tue Feb 02 02:02:46 2010 +0200
@@ -0,0 +1,577 @@
+/*
+* Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description:
+* HZ is defined in RFC 1843
+*
+*/
+
+/*
+ * NOTE(review): four of the five #include directives below have no header name.
+ * The names look like they were written in angle brackets and stripped when this
+ * diff was extracted - restore them from the original changeset.
+ */
+#include
+#include
+#include "gb2312.h"
+#include
+#include
+/* Converter state meaning "inside a GB block": entered on "~{" and left on "~}" by the HZ-to-GB2312 scanner in this file. */
+const TInt KIsInGbBlock=CCnvCharacterSetConverter::KStateDefault+1;
+#if defined(_DEBUG)
+const TInt KLengthOfIntermediateBuffer=6; // debug builds: bounds how many HZ bytes are scanned per pass (presumably kept tiny to exercise the chunk-splitting code - confirm)
+#else
+const TInt KLengthOfIntermediateBuffer=150; // release builds: bounds how many HZ bytes are scanned per pass
+#endif
+
+#if defined(_DEBUG)
+/* Panic category text passed to User::Panic by Panic(). */
+_LIT(KLitPanicText, "HZ");
+/* Reason codes for the debug-only __ASSERT_DEBUG checks in this file; each enumerator names the invariant that failed. */
+enum TPanic
+    {
+    EPanicTooManyMatchingIndicesFound=1,
+    EPanicBadNumberOfBytesRequiredToBeAvailable,
+    EPanicBadNumberOfBytesAvailable,
+    EPanicBadNumberOfBytesThatCanBeMadeAvailable,
+    EPanicBadNumberOfBytesMadeAvailable1,
+    EPanicBadNumberOfBytesMadeAvailable2,
+    EPanicBadDescriptorSubDivision1,
+    EPanicBadDescriptorSubDivision2,
+    EPanicBadDescriptorSubDivision3,
+    EPanicBadDescriptorSubDivision4,
+    EPanicBadPointers1,
+    EPanicBadPointers2,
+    EPanicBadPointers3,
+    EPanicBadPointers4,
+    EPanicBadPointers5,
+    EPanicBadPointers6,
+    EPanicBadPointers7,
+    EPanicBadPointers8,
+    EPanicBadPointers9,
+    EPanicBadPointers10,
+    EPanicBadPointers11,
+    EPanicBadPointers12,
+    EPanicStillInGbBlock,
+    EPanicBadState,
+    EPanicSplitBoundaryIsNotAsLateAsPossible1,
+    EPanicSplitBoundaryIsNotAsLateAsPossible2,
+    EPanicBadGb2312Index,
+    EPanicBadHzIndex,
+    EPanicBadTildeSequence,
+    EPanicBadReturnValue1,
+    EPanicBadReturnValue2,
+    EPanicRemainderOfHzHasGotLonger
+    };
+
+LOCAL_C void Panic(TPanic aPanic) // debug-only helper: calls User::Panic with category text "HZ" and reason aPanic
+    {
+    User::Panic(KLitPanicText, aPanic);
+    }
+
+#endif
+/*
+ * Character-set converter plug-in for the HZ encoding (RFC 1843, per the file header).
+ * Derives from CCharacterSetConverterPluginInterface and declares the virtual
+ * conversion entry points; the constructor is private, so instances are obtained
+ * through the static NewL().
+ */
+class CHZConverterImpl : public CCharacterSetConverterPluginInterface
+    {
+
+public:
+    virtual const TDesC8& ReplacementForUnconvertibleUnicodeCharacters();
+
+    virtual TInt ConvertFromUnicode(
+        CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
+        const TDesC8& aReplacementForUnconvertibleUnicodeCharacters,
+        TDes8& aForeign,
+        const TDesC16& aUnicode,
+        CCnvCharacterSetConverter::TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters);
+
+    virtual TInt ConvertToUnicode(
+        CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
+        TDes16& aUnicode,
+        const TDesC8& aForeign,
+        TInt& aState,
+        TInt& aNumberOfUnconvertibleCharacters,
+        TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter);
+
+    virtual TBool IsInThisCharacterSetL(
+        TBool& aSetToTrue,
+        TInt& aConfidenceLevel,
+        const TDesC8& aSample);
+
+    static CHZConverterImpl* NewL();
+    virtual ~CHZConverterImpl();
+
+private:
+    CHZConverterImpl();
+
+    };
+
+
+/* Returns the replacement byte sequence that the GB2312 converter (CnvGb2312) supplies; HZ defines none of its own here. */
+const TDesC8& CHZConverterImpl::ReplacementForUnconvertibleUnicodeCharacters()
+    {
+    return CnvGb2312::ReplacementForUnconvertibleUnicodeCharacters();
+    }
+/*
+ * Records that one more character, counted back from the end of the Unicode input,
+ * is now to be treated as not converted: increments
+ * aNumberOfUnicodeCharactersNotConverted and removes that character's index
+ * (aLengthOfUnicode minus the new count) from aIndicesOfUnconvertibleCharacters
+ * if it is listed there. Debug builds assert that at most one entry matched.
+ */
+LOCAL_C void IncrementNumberOfUnicodeCharactersNotConverted(TInt aLengthOfUnicode, TInt& aNumberOfUnicodeCharactersNotConverted, CCnvCharacterSetConverter::TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters) // the seemingly haphazard order of these parameters is to match the position of the second and third parameters with the caller
+    {
+    ++aNumberOfUnicodeCharactersNotConverted;
+    const TInt indexOfUnicodeCharacterNowNotConverted=aLengthOfUnicode-aNumberOfUnicodeCharactersNotConverted;
+#if defined(_DEBUG)
+    TInt numberOfMatchingIndicesFound=0;
+#endif
+    for (TInt i=aIndicesOfUnconvertibleCharacters.NumberOfIndices()-1; i>=0; --i) // must iterate backwards as items from aIndicesOfUnconvertibleCharacters may be deleted
+        {
+        if (aIndicesOfUnconvertibleCharacters[i]==indexOfUnicodeCharacterNowNotConverted)
+            {
+            aIndicesOfUnconvertibleCharacters.Remove(i);
+#if defined(_DEBUG)
+            ++numberOfMatchingIndicesFound;
+#endif
+            }
+        }
+    __ASSERT_DEBUG(numberOfMatchingIndicesFound<=1, Panic(EPanicTooManyMatchingIndicesFound));
+    }
+/*
+ * Frees space at the end of a buffer by giving up already-written characters.
+ * While fewer than aNumberOfBytesRequiredToBeAvailable bytes are available and fewer
+ * than aNumberOfBytesThatCanBeMadeAvailable bytes have been reclaimed, it steps
+ * aPointerToLastUsedByte back over one character, adds that character's size to
+ * aNumberOfBytesAvailable and marks the corresponding Unicode character as not
+ * converted via IncrementNumberOfUnicodeCharactersNotConverted().
+ */
+LOCAL_C void MakeAvailable(TInt aNumberOfBytesRequiredToBeAvailable, TInt& aNumberOfUnicodeCharactersNotConverted, CCnvCharacterSetConverter::TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters, TInt aLengthOfUnicode, const TUint8*& aPointerToLastUsedByte, TInt& aNumberOfBytesAvailable, TInt aNumberOfBytesThatCanBeMadeAvailable) // the seemingly haphazard order of these parameters is to match the position of the second to fourth parameters (inclusive) with the caller
+// makes available as much of aNumberOfBytesRequiredToBeAvailable as it can, even if the final value (i.e. value on returning) of aNumberOfBytesAvailable<aNumberOfBytesRequiredToBeAvailable (i.e. if aNumberOfBytesRequiredToBeAvailable>aNumberOfBytesThatCanBeMadeAvailable+aNumberOfBytesAvailable) - NOTE(review): the comparison operators in this comment were lost in extraction and have been reconstructed from the loop below; confirm against the original
+    {
+    __ASSERT_DEBUG(aNumberOfBytesRequiredToBeAvailable>0, Panic(EPanicBadNumberOfBytesRequiredToBeAvailable));
+    __ASSERT_DEBUG(aNumberOfBytesAvailable>=0, Panic(EPanicBadNumberOfBytesAvailable));
+    __ASSERT_DEBUG(aNumberOfBytesThatCanBeMadeAvailable>=0, Panic(EPanicBadNumberOfBytesThatCanBeMadeAvailable));
+    TInt numberOfBytesMadeAvailable=0;
+    FOREVER
+        {
+        if (aNumberOfBytesAvailable>=aNumberOfBytesRequiredToBeAvailable)
+            {
+            break; // no more needs to be done
+            }
+        __ASSERT_DEBUG(numberOfBytesMadeAvailable<=aNumberOfBytesThatCanBeMadeAvailable, Panic(EPanicBadNumberOfBytesMadeAvailable1));
+        if (numberOfBytesMadeAvailable>=aNumberOfBytesThatCanBeMadeAvailable)
+            {
+            break; // give up - no more can be done
+            }
+        const TInt numberOfBytesInCharacter=(*aPointerToLastUsedByte&0x80)? 2: 1; // a last byte with the top bit set is treated as the end of a 2-byte character
+        aPointerToLastUsedByte-=numberOfBytesInCharacter;
+        aNumberOfBytesAvailable+=numberOfBytesInCharacter;
+        numberOfBytesMadeAvailable+=numberOfBytesInCharacter;
+        IncrementNumberOfUnicodeCharactersNotConverted(aLengthOfUnicode, aNumberOfUnicodeCharactersNotConverted, aIndicesOfUnconvertibleCharacters);
+        }
+    __ASSERT_DEBUG(numberOfBytesMadeAvailable<=aNumberOfBytesThatCanBeMadeAvailable, Panic(EPanicBadNumberOfBytesMadeAvailable2));
+    }
+/*
+ * Rewrites the contents of aDescriptor in place. Only part of this function's text
+ * survives in this copy (see the NOTE(review) markers below), so only the visible
+ * behaviour is described: where a '~' byte is found, room for one more byte is
+ * requested from MakeAvailable(), the bytes from the '~' onwards are shifted up by
+ * one and a second '~' is written, so that the '~' ends up doubled.
+ */
+LOCAL_C void ConvertFromGb2312ToHzInPlace(TDes8& aDescriptor, TInt& aNumberOfUnicodeCharactersNotConverted, CCnvCharacterSetConverter::TArrayOfAscendingIndices& aIndicesOfUnconvertibleCharacters, TInt aLengthOfUnicode)
+    {
+    // it is legal for aDescriptor to be of length 0
+    const TInt originalLengthOfDescriptor=aDescriptor.Length();
+    if (originalLengthOfDescriptor>0)
+        {
+        TInt numberOfBytesAvailable=aDescriptor.MaxLength()-originalLengthOfDescriptor;
+        TUint8* pointerToPreviousByte=CONST_CAST(TUint8*, aDescriptor.Ptr()-1); // points one byte before the byte about to be examined
+        const TUint8* pointerToLastUsedByte=pointerToPreviousByte+originalLengthOfDescriptor;
+        TBool isInGbBlock=EFalse;
+        FOREVER
+            {
+            __ASSERT_DEBUG((pointerToLastUsedByte-(aDescriptor.Ptr()-1))+numberOfBytesAvailable==aDescriptor.MaxLength(), Panic(EPanicBadDescriptorSubDivision1));
+            __ASSERT_DEBUG(pointerToPreviousByte=pointerToLastUsedByte) // NOTE(review): source text is missing inside this line (everything between a '<' and a later '>' appears to have been stripped during extraction); the declaration of currentByte and the GB-block handling are among what is lost
+                {
+                break;
+                }
+            }
+        if (currentByte=='~')
+            {
+            MakeAvailable(1, aNumberOfUnicodeCharactersNotConverted, aIndicesOfUnconvertibleCharacters, aLengthOfUnicode, pointerToLastUsedByte, numberOfBytesAvailable, (pointerToLastUsedByte-pointerToPreviousByte)-1); // what's passed into the last parameter is not a typo - we do not want the "~" currently pointed to by (pointerToPreviousByte+1) to be made available
+            if (numberOfBytesAvailable<1) // 1 byte is required for the extra "~" character
+                {
+                break;
+                }
+            Mem::Copy(pointerToPreviousByte+2, pointerToPreviousByte+1, pointerToLastUsedByte-pointerToPreviousByte); // shift the '~' and everything after it up by one byte
+            ++pointerToPreviousByte;
+            *pointerToPreviousByte='~';
+            numberOfBytesAvailable-=1;
+            pointerToLastUsedByte+=1;
+            }
+        ++pointerToPreviousByte;
+        }
+    __ASSERT_DEBUG(pointerToPreviousByte<=pointerToLastUsedByte, Panic(EPanicBadPointers4));
+    if (pointerToPreviousByte>=pointerToLastUsedByte)
+        {
+        if (isInGbBlock)
+            {
+            goto closeGbBlock; // this is to share the code for closing the GB-block
+            }
+        break;
+        }
+    }
+    __ASSERT_DEBUG(pointerToPreviousByte<=pointerToLastUsedByte, Panic(EPanicBadPointers5));
+    if (pointerToPreviousByte& aGb2312, TPtrC8& aHzBeingConsumed, TPtrC8& aRemainderOfHz, TInt& aState, TUint& aOutputConversionFlags) // NOTE(review): a large span is missing inside this line - the rest of ConvertFromGb2312ToHzInPlace (including the closeGbBlock label), the definition of CHZConverterImpl::ConvertFromUnicode declared in the class, and the start of the signature of ConvertFromHzToHomogeneousGb2312 (the name under which ConvertToUnicode calls the function whose body follows)
+    {
+    /* this function panics if aRemainderOfHz is of length 0
+     *
+     * Scans the front of aRemainderOfHz and writes the equivalent plain GB2312 bytes to aGb2312:
+     *  - "~{" switches aState to KIsInGbBlock and "~}" switches it back to KStateDefault
+     *    (either one when already in that state returns EErrorIllFormedInput);
+     *  - "~~" produces a single '~'; '~' followed by 0x0a produces nothing;
+     *    any other byte after '~' returns EErrorIllFormedInput;
+     *  - in KStateDefault other bytes are copied unchanged; in KIsInGbBlock they are taken
+     *    in pairs and both bytes are written with the top bit (0x80) set.
+     * Non-tilde bytes are only converted up to KLengthOfIntermediateBuffer bytes from the start;
+     * tilde sequences are still processed beyond that point.
+     * On return aHzBeingConsumed is the part of the input that was scanned, aRemainderOfHz
+     * is what follows it, and the result is 0; EOutputConversionFlagInputIsTruncated
+     * is set in aOutputConversionFlags when a tilde sequence or a 2-byte pair is cut short.
+     */
+    TUint8* pointerToPreviousGb2312Byte=CONST_CAST(TUint8*, aGb2312.Ptr()-1);
+    const TUint8* pointerToCurrentHzByte=aRemainderOfHz.Ptr();
+    const TUint8* const pointerToLastHzByte=pointerToCurrentHzByte+(aRemainderOfHz.Length()-1);
+    const TUint8* const pointerToLastHzByteToConvertThisTime=Min(pointerToLastHzByte, pointerToCurrentHzByte+(KLengthOfIntermediateBuffer-1));
+    FOREVER
+        {
+        const TUint currentHzByte=*pointerToCurrentHzByte;
+        if (currentHzByte=='~')
+            {
+            __ASSERT_DEBUG(pointerToCurrentHzByte<=pointerToLastHzByte, Panic(EPanicBadPointers7));
+            if (pointerToCurrentHzByte>=pointerToLastHzByte)
+                {
+                aOutputConversionFlags|=CCnvCharacterSetConverter::EOutputConversionFlagInputIsTruncated;
+                --pointerToCurrentHzByte; // leave the lone '~' unconsumed
+                break;
+                }
+            ++pointerToCurrentHzByte;
+            const TUint nextHzByte=*pointerToCurrentHzByte;
+            switch (nextHzByte)
+                {
+            case '{':
+                if (aState==KIsInGbBlock)
+                    {
+                    return CCnvCharacterSetConverter::EErrorIllFormedInput;
+                    }
+                aState=KIsInGbBlock;
+                break;
+            case '}':
+                if (aState==CCnvCharacterSetConverter::KStateDefault)
+                    {
+                    return CCnvCharacterSetConverter::EErrorIllFormedInput;
+                    }
+                aState=CCnvCharacterSetConverter::KStateDefault;
+                break;
+            case '~':
+                ++pointerToPreviousGb2312Byte;
+                *pointerToPreviousGb2312Byte=STATIC_CAST(TUint8, currentHzByte);
+                break;
+            case 0x0a: // '~' followed by a line feed: consumed without producing output
+                break;
+            default:
+                return CCnvCharacterSetConverter::EErrorIllFormedInput;
+                }
+            }
+        else
+            {
+            __ASSERT_DEBUG(pointerToCurrentHzByte<=pointerToLastHzByte, Panic(EPanicBadPointers8));
+            if (pointerToCurrentHzByte>pointerToLastHzByteToConvertThisTime)
+                {
+                --pointerToCurrentHzByte; // this byte belongs to the next pass
+                break;
+                }
+            if (aState==CCnvCharacterSetConverter::KStateDefault)
+                {
+                ++pointerToPreviousGb2312Byte;
+                *pointerToPreviousGb2312Byte=STATIC_CAST(TUint8, currentHzByte);
+                }
+            else
+                {
+                __ASSERT_DEBUG(aState==KIsInGbBlock, Panic(EPanicBadState));
+                __ASSERT_DEBUG(pointerToCurrentHzByte<=pointerToLastHzByteToConvertThisTime, Panic(EPanicBadPointers9));
+                if (pointerToCurrentHzByte>=pointerToLastHzByteToConvertThisTime)
+                    {
+                    aOutputConversionFlags|=CCnvCharacterSetConverter::EOutputConversionFlagInputIsTruncated;
+                    --pointerToCurrentHzByte; // the second byte of the pair is not available in this pass
+                    break;
+                    }
+                ++pointerToCurrentHzByte;
+                ++pointerToPreviousGb2312Byte;
+                *pointerToPreviousGb2312Byte=STATIC_CAST(TUint8, currentHzByte|0x80); // both bytes of the pair are written with the top bit set
+                ++pointerToPreviousGb2312Byte;
+                *pointerToPreviousGb2312Byte=STATIC_CAST(TUint8, *pointerToCurrentHzByte|0x80);
+                }
+            }
+        __ASSERT_DEBUG(pointerToCurrentHzByte<=pointerToLastHzByte, Panic(EPanicBadPointers10));
+        if (pointerToCurrentHzByte>=pointerToLastHzByte)
+            {
+            break;
+            }
+        ++pointerToCurrentHzByte;
+        }
+    aGb2312.SetLength((pointerToPreviousGb2312Byte+1)-aGb2312.Ptr());
+    const TInt numberOfHzBytesBeingConsumed=(pointerToCurrentHzByte+1)-aRemainderOfHz.Ptr();
+    aHzBeingConsumed.Set(aRemainderOfHz.Left(numberOfHzBytesBeingConsumed));
+    aRemainderOfHz.Set(aRemainderOfHz.Mid(numberOfHzBytesBeingConsumed));
+#if defined(_DEBUG)
+    // AAA: check that if the split occurs on a boundary between some one-byte and some two-byte text, then aState corresponds to the state *after* the split (the code marked "BBB" relies on this)
+    if (aRemainderOfHz.Length()>=2)
+        {
+        __ASSERT_DEBUG(aRemainderOfHz.Left(2)!=_L8("~{"), Panic(EPanicSplitBoundaryIsNotAsLateAsPossible1));
+        __ASSERT_DEBUG(aRemainderOfHz.Left(2)!=_L8("~}"), Panic(EPanicSplitBoundaryIsNotAsLateAsPossible2));
+        }
+#endif
+    return 0;
+    }
+/*
+ * Translates aGb2312Index, a byte offset into the GB2312 text produced from aHz,
+ * back into a byte offset into aHz. Walking aHz, each "~~" adds 1 to the running
+ * offset between the two indices and every other tilde sequence adds 2.
+ * With aReturnMaximalHzIndex false the walk stops as soon as the recorded HZ index
+ * reaches the candidate index; with it true the walk goes on until the next byte
+ * position exceeds the candidate, so tilde sequences at that point are stepped over.
+ */
+LOCAL_C TInt Gb2312IndexToHzIndex(const TDesC8& aHz, TInt aGb2312Index, TBool aReturnMaximalHzIndex)
+    {
+    // this function panics if aHz is of length 0
+    // aHz may start in either KIsInGbBlock or CCnvCharacterSetConverter::KStateDefault state, but it must *not* have any truncated sequences (i.e. "tilde <character>" sequence that is not complete, or part of a 2-byte character sequence) at either its start or its end - NOTE(review): "<character>" restores text that appears to have been stripped from this comment; confirm against the original
+    __ASSERT_DEBUG(aGb2312Index>=0, Panic(EPanicBadGb2312Index));
+    TInt hzIndex=0;
+    TInt offsetFromGb2312IndexToHzIndex=0;
+    const TUint8* const pointerToFirstHzByte=aHz.Ptr();
+    const TUint8* pointerToCurrentHzByte=pointerToFirstHzByte;
+    const TUint8* const pointerToLastHzByte=pointerToFirstHzByte+(aHz.Length()-1);
+    FOREVER
+        {
+        const TInt newHzIndex=pointerToCurrentHzByte-pointerToFirstHzByte;
+        const TInt candidateHzIndex=aGb2312Index+offsetFromGb2312IndexToHzIndex;
+        __ASSERT_DEBUG(hzIndex<=candidateHzIndex, Panic(EPanicBadHzIndex));
+        if (aReturnMaximalHzIndex? (newHzIndex>candidateHzIndex): (hzIndex>=candidateHzIndex))
+            {
+            break;
+            }
+        hzIndex=newHzIndex;
+        if (*pointerToCurrentHzByte=='~')
+            {
+            __ASSERT_DEBUG(pointerToCurrentHzByte<=pointerToLastHzByte, Panic(EPanicBadPointers11));
+            if (pointerToCurrentHzByte>=pointerToLastHzByte)
+                {
+                break;
+                }
+            ++pointerToCurrentHzByte;
+            const TUint currentHzByte=*pointerToCurrentHzByte;
+            if (currentHzByte=='~')
+                {
+                ++offsetFromGb2312IndexToHzIndex; // "~~" is two HZ bytes for one GB2312 byte
+                }
+            else
+                {
+                __ASSERT_DEBUG((currentHzByte=='{') || (currentHzByte=='}') || (currentHzByte==0x0a), Panic(EPanicBadTildeSequence));
+                offsetFromGb2312IndexToHzIndex+=2; // these sequences are two HZ bytes with no GB2312 counterpart
+                }
+            }
+        __ASSERT_DEBUG(pointerToCurrentHzByte<=pointerToLastHzByte, Panic(EPanicBadPointers12));
+        if (pointerToCurrentHzByte>=pointerToLastHzByte)
+            {
+            break;
+            }
+        ++pointerToCurrentHzByte;
+        }
+    return hzIndex;
+    }
+/*
+ * Converts the HZ-encoded bytes in aForeign to Unicode in aUnicode (which is emptied first).
+ * The input is handled in passes: each pass turns the next stretch of HZ into plain
+ * GB2312 with ConvertFromHzToHomogeneousGb2312() and appends its conversion to aUnicode
+ * through CCnvCharacterSetConverter::DoConvertToUnicode() using the GB2312 conversion data.
+ * aState carries the HZ shift state (KStateDefault or KIsInGbBlock) in and out.
+ * Returns a negative error code, or otherwise the number of bytes at the end of
+ * aForeign that were not consumed.
+ * NOTE(review): aNumberOfUnconvertibleCharacters is tested against 0 and added to but
+ * never reset here, so the caller presumably initialises it - confirm.
+ */
+TInt CHZConverterImpl::ConvertToUnicode(
+        CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
+        TDes16& aUnicode,
+        const TDesC8& aForeign,
+        TInt& aState,
+        TInt& aNumberOfUnconvertibleCharacters,
+        TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
+    {
+    aUnicode.SetLength(0);
+    TPtrC8 remainderOfHz(aForeign);
+    TInt numberOfHzBytesConsumed=0;
+    TUint outputConversionFlags=0;
+    TUint inputConversionFlags=CCnvCharacterSetConverter::EInputConversionFlagAppend;
+    const SCnvConversionData& gb2312ConversionData=CnvGb2312::ConversionData();
+    FOREVER
+        {
+        __ASSERT_DEBUG(numberOfHzBytesConsumed+remainderOfHz.Length()==aForeign.Length(), Panic(EPanicBadDescriptorSubDivision3));
+#if defined(_DEBUG)
+        const TInt oldLengthOfRemainderOfHz=remainderOfHz.Length();
+#endif
+        TBuf8 gb2312; // NOTE(review): the size argument of TBuf8 appears to have been stripped from this line (presumably KLengthOfIntermediateBuffer - confirm against the original)
+        TPtrC8 hzBeingConsumed;
+        const TInt returnValue1=ConvertFromHzToHomogeneousGb2312(gb2312, hzBeingConsumed, remainderOfHz, aState, outputConversionFlags);
+        if (returnValue1<0)
+            {
+            return returnValue1; // this is an error-code
+            }
+        __ASSERT_DEBUG(returnValue1==0, Panic(EPanicBadReturnValue1));
+        __ASSERT_DEBUG(hzBeingConsumed.Length()+remainderOfHz.Length()==oldLengthOfRemainderOfHz, Panic(EPanicRemainderOfHzHasGotLonger));
+        if (hzBeingConsumed.Length()==0)
+            {
+            break;
+            }
+        TInt numberOfUnconvertibleCharacters;
+        TInt indexOfFirstByteOfFirstUnconvertibleCharacter;
+        const TInt returnValue2=CCnvCharacterSetConverter::DoConvertToUnicode(gb2312ConversionData, aDefaultEndiannessOfForeignCharacters, aUnicode, gb2312, numberOfUnconvertibleCharacters, indexOfFirstByteOfFirstUnconvertibleCharacter, outputConversionFlags, inputConversionFlags);
+        if (returnValue2<0)
+            {
+            return returnValue2; // this is an error-code
+            }
+        if (numberOfUnconvertibleCharacters>0)
+            {
+            if (aNumberOfUnconvertibleCharacters==0)
+                {
+                aIndexOfFirstByteOfFirstUnconvertibleCharacter=numberOfHzBytesConsumed+Gb2312IndexToHzIndex(hzBeingConsumed, indexOfFirstByteOfFirstUnconvertibleCharacter, EFalse); // report the position as an offset into aForeign, not into the GB2312 buffer
+                }
+            aNumberOfUnconvertibleCharacters+=numberOfUnconvertibleCharacters;
+            }
+        if (returnValue2>0) // some GB2312 bytes of this pass were left unconverted: work out where that falls in the HZ input and stop
+            {
+            const TInt numberOfGb2312BytesConverted=gb2312.Length()-returnValue2;
+            __ASSERT_DEBUG(numberOfGb2312BytesConverted>=0, Panic(EPanicBadReturnValue2));
+            // don't call gb2312.SetLength(numberOfGb2312BytesConverted) as we want to access gb2312[numberOfGb2312BytesConverted] - in any case, gb2312's length is never going to be used again
+            // don't bother re-setting remainderOfHz as it won't be used again
+            numberOfHzBytesConsumed+=Gb2312IndexToHzIndex(hzBeingConsumed, numberOfGb2312BytesConverted, ETrue);
+            aState=(gb2312[numberOfGb2312BytesConverted]&0x80)? KIsInGbBlock: CCnvCharacterSetConverter::KStateDefault; // BBB: if the split (between the text that was converted and the text that wasn't converted) occurs on a boundary between some one-byte and some two-byte text, then aState corresponds to the state *after* the split (the code marked "AAA" checks this) - this means that we set aState according to gb2312[numberOfGb2312BytesConverted] rather than gb2312[numberOfGb2312BytesConverted-1]
+            break;
+            }
+        numberOfHzBytesConsumed+=hzBeingConsumed.Length();
+        remainderOfHz.Set(aForeign.Mid(numberOfHzBytesConsumed));
+        __ASSERT_DEBUG(numberOfHzBytesConsumed+remainderOfHz.Length()==aForeign.Length(), Panic(EPanicBadDescriptorSubDivision4));
+        if (remainderOfHz.Length()==0)
+            {
+            break;
+            }
+        if (numberOfHzBytesConsumed>0)
+            {
+            inputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable;
+            }
+        }
+    // N.B. remainderOfHz is in an undefined state by this point
+    if ((numberOfHzBytesConsumed==0) && (outputConversionFlags&CCnvCharacterSetConverter::EOutputConversionFlagInputIsTruncated))
+        {
+        return CCnvCharacterSetConverter::EErrorIllFormedInput;
+        }
+    return aForeign.Length()-numberOfHzBytesConsumed;
+    }
+/*
+ * Heuristic estimate of whether aSample is HZ text. Always sets aSetToTrue to ETrue
+ * and returns ETrue; the estimate itself is reported through aConfidenceLevel, which
+ * starts at 50, is raised by the percentage of sample bytes that belong to "~{", "~}"
+ * or "~~" pairs, lowered by up to 4 when that percentage is below 4, and lowered by
+ * the percentage of bytes above 0x7e. For an empty sample it stays at 50.
+ */
+TBool CHZConverterImpl::IsInThisCharacterSetL(
+        TBool& aSetToTrue,
+        TInt& aConfidenceLevel,
+        const TDesC8& aSample)
+    {
+    aSetToTrue=ETrue;
+    TInt sampleLength = aSample.Length();
+    TInt pairOfTilde=0;
+    TInt occrenceOfNonHz=0;
+    aConfidenceLevel = 50;
+    // Hz encoding uses escape sequences...
+    for (TInt i = 0; i < sampleLength; ++i)
+        {
+        if (aSample[i]>0x7e)
+            occrenceOfNonHz++;
+        if (aSample[i]==0x7e)
+            {
+            TInt increment1 = i+1;
+            if (increment1 >= sampleLength)
+                break;
+            if ((aSample[increment1] == 0x7b)||(aSample[increment1] == 0x7d)||(aSample[increment1] == 0x7e)) // '~' followed by '{', '}' or '~'
+                {
+                pairOfTilde++;
+                i++; // skip the second byte of the pair
+                }
+            }
+        }//for
+    if (sampleLength)
+        {
+        TInt occurrenceOftilde =2*pairOfTilde*100/sampleLength; // percentage of the sample's bytes that are part of a recognised tilde pair
+        aConfidenceLevel=aConfidenceLevel-Max(0,(4-occurrenceOftilde));
+        aConfidenceLevel += occurrenceOftilde;
+        aConfidenceLevel -= ((occrenceOfNonHz*100)/sampleLength);
+        }
+    return ETrue;
+    }
+/* Factory function: allocates a CHZConverterImpl with new(ELeave) and returns it. */
+CHZConverterImpl* CHZConverterImpl::NewL()
+    {
+    CHZConverterImpl* self = new(ELeave) CHZConverterImpl();
+    return self;
+    }
+/* Destructor: nothing to release. */
+CHZConverterImpl::~CHZConverterImpl()
+    {
+    }
+/* Constructor: no initialisation needed; private, so it is only reached through NewL(). */
+CHZConverterImpl::CHZConverterImpl()
+    {
+    }
+/* Table with a single entry pairing the UID 0x10006065 with CHZConverterImpl::NewL. */
+const TImplementationProxy ImplementationTable[] =
+    {
+    IMPLEMENTATION_PROXY_ENTRY(0x10006065, CHZConverterImpl::NewL)
+    };
+/* Exported accessor for the table above: stores the number of entries in aTableCount and returns the table. */
+EXPORT_C const TImplementationProxy* ImplementationGroupProxy(TInt& aTableCount)
+    {
+    aTableCount = sizeof(ImplementationTable) / sizeof(TImplementationProxy);
+
+    return ImplementationTable;
+    }