charconvfw/Charconv/ongoing/Source/foreign/builtin/CP1252.CPP
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Fri, 16 Apr 2010 16:55:07 +0300
changeset 16 56cd22a7a1cb
parent 0 1fb32624e06b
permissions -rw-r--r--
Revision: 201011 Kit: 201015

/*
* Copyright (c) 1252 Nokia Corporation and/or its subsidiary(-ies). 
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of the License "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description:      
*
*/








#include <e32std.h>
#include <convdata.h>

#define ARRAY_LENGTH(aArray) (sizeof(aArray)/sizeof((aArray)[0]))

LOCAL_D const TUint16 keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_1[]=
	{
	0x201a,
	0x0192,
	0x201e,
	0x2026,
	0x2020,
	0x2021,
	0x02c6,
	0x2030,
	0x0160,
	0x2039,
	0x0152
	};

LOCAL_D const TUint16 keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_2[]=
	{
	0x2018,
	0x2019,
	0x201c,
	0x201d,
	0x2022,
	0x2013,
	0x2014,
	0x02dc,
	0x2122,
	0x0161,
	0x203a,
	0x0153
	};

LOCAL_D const TUint16 keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_3[]=
	{
	0x017e,
	0x0178
	};

LOCAL_D const SCnvConversionData::SOneDirectionData::SRange::UData::SKeyedTable16OfIndexedTables16::SKeyedEntry keyedTables16OfIndexedTables16_keyedEntries_codePage1252ToUnicode_1[]=
	{
		{
		0x82,
		0x8c,
		keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_1
		},
		{
		0x91,
		0x9c,
		keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_2
		},
		{
		0x9e,
		0x9f,
		keyedTables16OfIndexedTables16_indexedEntries_codePage1252ToUnicode_3
		}
	};

LOCAL_D const SCnvConversionData::SOneDirectionData::SRange::UData::SKeyedTable1616::SEntry keyedTable1616_unicodeToCodePage1252_1[]=
	{
		{
		0x0152,
		0x8c
		},
		{
		0x0153,
		0x9c
		},
		{
		0x0160,
		0x8a
		},
		{
		0x0161,
		0x9a
		},
		{
		0x0178,
		0x9f
		},
		{
		0x017d,
		0x8e
		},
		{
		0x017e,
		0x9e
		},
		{
		0x0192,
		0x83
		},
		{
		0x02c6,
		0x88
		},
		{
		0x02dc,
		0x98
		},
		{
		0x2013,
		0x96
		},
		{
		0x2014,
		0x97
		},
		{
		0x2018,
		0x91
		},
		{
		0x2019,
		0x92
		},
		{
		0x201a,
		0x82
		},
		{
		0x201c,
		0x93
		},
		{
		0x201d,
		0x94
		},
		{
		0x201e,
		0x84
		},
		{
		0x2020,
		0x86
		},
		{
		0x2021,
		0x87
		},
		{
		0x2022,
		0x95
		},
		{
		0x2026,
		0x85
		},
		{
		0x2030,
		0x89
		},
		{
		0x2039,
		0x8b
		},
		{
		0x203a,
		0x9b
		},
		{
		0x20ac,
		0x80
		},
		{
		0x2122,
		0x99
		}
	};

LOCAL_D const SCnvConversionData::SVariableByteData::SRange codePage1252VariableByteDataRanges[]=
	{
		{
		0x00,
		0xff,
		0,
		0
		}
	};

LOCAL_D const SCnvConversionData::SOneDirectionData::SRange codePage1252ToUnicodeDataRanges[]=
	{
		{
		0x00,
		0x7f,
		SCnvConversionData::SOneDirectionData::SRange::EDirect,
		0,
		0,
			{
			0,
			0
			}
		},
		{
		0xa0,
		0xff,
		SCnvConversionData::SOneDirectionData::SRange::EDirect,
		0,
		0,
			{
			0,
			0
			}
		},
		{
		0x80,
		0x80,
		SCnvConversionData::SOneDirectionData::SRange::EOffset,
		0,
		0,
			{
			STATIC_CAST(TUint, 8236),
			0
			}
		},
		{
		0x8e,
		0x8e,
		SCnvConversionData::SOneDirectionData::SRange::EOffset,
		0,
		0,
			{
			STATIC_CAST(TUint, 239),
			0
			}
		},
		{
		0x82,
		0x9f,
		SCnvConversionData::SOneDirectionData::SRange::EKeyedTable16OfIndexedTables16,
		0,
		0,
			{
			UData_SKeyedTable16OfIndexedTables16(keyedTables16OfIndexedTables16_keyedEntries_codePage1252ToUnicode_1)
			}
		}
	};

LOCAL_D const SCnvConversionData::SOneDirectionData::SRange unicodeToCodePage1252DataRanges[]=
	{
		{
		0x0000,
		0x007f,
		SCnvConversionData::SOneDirectionData::SRange::EDirect,
		1,
		0,
			{
			0,
			0
			}
		},
		{
		0x00a0,
		0x00ff,
		SCnvConversionData::SOneDirectionData::SRange::EDirect,
		1,
		0,
			{
			0,
			0
			}
		},
		{
		0x0152,
		0x2122,
		SCnvConversionData::SOneDirectionData::SRange::EKeyedTable1616,
		1,
		0,
			{
			UData_SKeyedTable1616(keyedTable1616_unicodeToCodePage1252_1)
			}
		}
	};

GLREF_D const SCnvConversionData codePage1252ConversionData=
	{
	SCnvConversionData::EUnspecified,
		{
		ARRAY_LENGTH(codePage1252VariableByteDataRanges),
		codePage1252VariableByteDataRanges
		},
		{
		ARRAY_LENGTH(codePage1252ToUnicodeDataRanges),
		codePage1252ToUnicodeDataRanges
		},
		{
		ARRAY_LENGTH(unicodeToCodePage1252DataRanges),
		unicodeToCodePage1252DataRanges
		},
	NULL,
	NULL
	};

GLREF_C void IsCharacterSetCP1252(TInt& aConfidenceLevel, const TDesC8& aSample)
	{
	aConfidenceLevel = 60;
	TInt sampleLength = aSample.Length();

	for (TInt i=0; i<sampleLength; ++i)
		{
		// CP1252 includes ASCII as well
		// first check if the char is in the range 0x80 - 0x9f (controls codes in ISO88591)
		// If it is in that range then the likelihood that it's CP1252 is a bit higher
		if ((aSample[i] >= 0x80) && (aSample[i] <= 0x9f))
			{
			if((aSample[i]==0x81)||(aSample[i]==0x8D)||(aSample[i]==0x8f)||
				(aSample[i]==0x90)||(aSample[i]==0x9d))
				{
				// These code values are not supported by the Codepage CP1252
				aConfidenceLevel = 0;
				break;
				}
			else
				{
				// problem: UTF8 uses the values 0x80-0x9f in more than 50% of it's multibyte representation
				// so if the text was UTF8 .... the confidence here would hit the roof. Could check to make 
				// sure that this is not UTF8
				aConfidenceLevel+=1;
				}
			}
		TInt increment1 = i+1;
		TInt decrement1 = i-1;
		// 0xf7 is the division symbol in CP1252.
		// 0xd7 is the division symbol in CP1252.If char on either side of the division
		// symbol is a number then the confidence that it's ISO88591 increases
		if( decrement1>= 0 && ((aSample[i]==0xf7) || (aSample[i]==0xd7)) && increment1<sampleLength)
			{
			
			if (increment1 >= sampleLength)
				break;
			if ( (aSample[decrement1] >= 0x30) && (aSample[decrement1] <= 0x39) &&  // char before is a number
				 (aSample[increment1] >= 0x30) && (aSample[increment1] <= 0x39) )   // char after is a number
				{
				aConfidenceLevel+=5;
				}
			}
		// Can also use the currency symbol to increase confidence if the char after a 
		// currency symbol is numeric
		if((aSample[i]>=0xa2) && (aSample[i] <= 0xa5) && increment1<sampleLength)
			{
			if ((aSample[increment1] >= 0x30) && (aSample[increment1] <= 0x39))
				{
				aConfidenceLevel+=5; 
				}
			}
		} // for loop
	aConfidenceLevel =(aConfidenceLevel >0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
	}