symport/e32/euser/unicode/unicode.cpp
changeset 1 0a7b44b10206
child 2 806186ab5e14
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/symport/e32/euser/unicode/unicode.cpp	Thu Jun 25 15:59:54 2009 +0100
@@ -0,0 +1,369 @@
+// Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
+// All rights reserved.
+// This component and the accompanying materials are made available
+// under the terms of the License "Symbian Foundation License v1.0"
+// which accompanies this distribution, and is available
+// at the URL "http://www.symbianfoundation.org/legal/sfl-v10.html".
+//
+// Initial Contributors:
+// Nokia Corporation - initial contribution.
+//
+// Contributors:
+//
+// Description:
+// e32\euser\unicode\unicode.cpp
+// The implementation of the base-level Unicode character classification functions. These are members of
+// a class called TUnicode that contains a Unicode value.
+// 
+//
+
+#include <unicode.h>
+#include "compareimp.h"
+
+static const TUnicodeData TheDefaultUnicodeData =
+	{ TChar::ECnCategory, TChar::EOtherNeutral, 0, 0, 0, TUnicodeData::ENonNumeric };
+
+// Fill in a TChar::TCharInfo structure with category information about the character.
+void TUnicode::GetInfo(TChar::TCharInfo& aInfo,const TUnicodeDataSet *aOverridingDataSet) const
+	{
+	const TUnicodeData& data = GetData(aOverridingDataSet);
+	aInfo.iCategory = (TChar::TCategory)data.iCategory;
+	aInfo.iBdCategory = (TChar::TBdCategory)data.iBdCategory;
+	aInfo.iCombiningClass = data.iCombiningClass;
+	aInfo.iLowerCase = iCode;
+	aInfo.iUpperCase = iCode;
+	aInfo.iTitleCase = iCode;
+	if (data.iFlags & TUnicodeData::EHasLowerCase)
+		aInfo.iLowerCase = GetLowerCase(data);
+	if (data.iFlags & TUnicodeData::EHasUpperCase)
+		aInfo.iUpperCase = GetUpperCase(data);
+	if (data.iFlags & TUnicodeData::EHasTitleCase)
+		aInfo.iTitleCase = GetTitleCase(data);
+	aInfo.iMirrored = data.iFlags & TUnicodeData::EMirrored;
+	if (data.iFlags & TUnicodeData::ENumericFlags)
+		aInfo.iNumericValue = GetNumericValue(data);
+	else
+		aInfo.iNumericValue = -1;
+	}
+
+/*
+Get the data describing a character. If "aOverridingDataSet" is non-null, look in that
+data set before searching the standard data set.
+*/
+const TUnicodeData& TUnicode::GetData(const TUnicodeDataSet *aOverridingDataSet) const
+	{
+	const TUnicodeData *result = NULL;
+	if (aOverridingDataSet)
+		result = GetDataFromDataSet(*aOverridingDataSet);
+	if (result == NULL)
+		{
+		if (0xFFFF < iCode)
+			// for now we have no data for values above U+FFFF
+			return TheDefaultUnicodeData;
+		int index = TheStandardUnicodeDataSet.iIndex1[iCode >> 4];
+		if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index
+			index &= ~0x8000;
+		else
+			index = TheStandardUnicodeDataSet.iIndex2[index + (iCode & 0x000F)];
+		return TheStandardUnicodeDataSet.iData[index];
+		}
+
+	return *result;
+	}
+
+/*
+Given a character data set, get the data referring to this character.
+Return NULL if no data is available in this data set.
+*/
+const TUnicodeData *TUnicode::GetDataFromDataSet(const TUnicodeDataSet& aDataSet) const
+	{
+	// Perform a binary chop to find the range containing this character.
+	TInt n = aDataSet.iRanges;
+	const TUnicodeDataRange *base = aDataSet.iRange;
+	const TUnicodeDataRange *last = base + n - 1;
+	const TUnicodeDataRange *r = base;
+
+	while (n > 1)
+		{
+		TInt pivot = n / 2;
+		r += pivot;
+		if (iCode < r->iRangeStart)									// it's before this range
+			n = pivot;
+		else if (r < last && iCode >= r[1].iRangeStart)				// it's after this range
+			{
+			base = r + 1;
+			n -= pivot + 1;
+			}
+		else														// it's in this range
+			break;
+		r = base;
+		}
+
+	if (r->iIndex >= 0)
+		return &aDataSet.iData[r->iIndex];		// index >= 0: data available
+	else
+		return NULL;							// index < 0: no data available
+	}
+
+EXPORT_C TChar::TCategory TUnicode::GetCategory(const TUnicodeDataSet *aOverridingDataSet) const
+	{
+	return (TChar::TCategory)GetData(aOverridingDataSet).iCategory;
+	}
+
+TChar::TBdCategory TUnicode::GetBdCategory(const TUnicodeDataSet *aOverridingDataSet) const
+	{
+	return (TChar::TBdCategory)GetData(aOverridingDataSet).iBdCategory;
+	}
+
+TInt TUnicode::GetCombiningClass(const TUnicodeDataSet *aOverridingDataSet) const
+	{
+	return GetData(aOverridingDataSet).iCombiningClass;
+	}
+
+EXPORT_C TUint TUnicode::GetLowerCase(const TUnicodeDataSet *aOverridingDataSet) const
+	{
+	return GetLowerCase(GetData(aOverridingDataSet));
+	}
+
+EXPORT_C TUint TUnicode::GetUpperCase(const TUnicodeDataSet *aOverridingDataSet) const
+	{
+	return GetUpperCase(GetData(aOverridingDataSet));
+	}
+
+TUint TUnicode::GetLowerCase(const TUnicodeData& aData) const
+	{
+	if (aData.iFlags & TUnicodeData::EHasLowerCase)
+		return iCode + aData.iCaseOffset;
+	else
+		return iCode;
+	}
+
+TUint TUnicode::GetUpperCase(const TUnicodeData& aData) const
+	{
+	if (aData.iFlags & TUnicodeData::EHasUpperCase)
+		return iCode - aData.iCaseOffset;
+	else
+		return iCode;
+	}
+
+TUint TUnicode::GetTitleCase(const TUnicodeDataSet *aOverridingDataSet) const
+	{
+	return GetTitleCase(GetData(aOverridingDataSet));
+	}
+
+TUint TUnicode::GetTitleCase(const TUnicodeData& aData) const
+	{
+	// Handle the very few characters with distinct title case variants.
+	if (aData.iFlags & TUnicodeData::EHasTitleCase)
+		{
+		// If the character has no upper case variant add one to get the title case form.
+		if (!(aData.iFlags & TUnicodeData::EHasUpperCase))
+			return iCode + 1;
+		// If the character has no lower case variant subtract one to get the title case form.
+		if (!(aData.iFlags & TUnicodeData::EHasLowerCase))
+			return iCode - 1;
+		// Both upper and lower case forms exist so the character itself must be title case.
+		return iCode;
+		}
+
+	// All other characters have title case forms that are the same as their upper case forms.
+	return GetUpperCase(aData);
+	}
+
+TBool TUnicode::IsMirrored(const TUnicodeDataSet *aOverridingDataSet) const
+	{
+	return GetData(aOverridingDataSet).iFlags & TUnicodeData::EMirrored;
+	}
+
+TInt TUnicode::GetNumericValue(const TUnicodeDataSet *aOverridingDataSet) const
+	{
+	return GetNumericValue(GetData(aOverridingDataSet));
+	}
+
+/*
+Return the integer numeric value of this character.
+Return -1 if the character is not numeric, or -2 if it has a fractional value.
+*/
+TInt TUnicode::GetNumericValue(const TUnicodeData& aData) const
+	{
+	switch (aData.iFlags & TUnicodeData::ENumericFlags)
+		{
+		case TUnicodeData::ENonNumeric: return -1;
+		case TUnicodeData::ESmallNumeric: return (iCode + aData.iDigitOffset) & 0xFF;
+		case TUnicodeData::EFiveHundred: return 500;
+		case TUnicodeData::EOneThousand: return 1000;
+		case TUnicodeData::EFiveThousand: return 5000;
+		case TUnicodeData::ETenThousand: return 10000;
+		case TUnicodeData::EHundredThousand: return 100000;
+		case TUnicodeData::EFraction: return -2;
+		default: return -1; // we should never come here
+		}
+	}
+
+struct TWidthInfo
+	{
+	TUint iStart;
+	TUint iEnd;
+	TChar::TCjkWidth iWidth;
+	};
+
+static const TWidthInfo TheWidthInfoTable[] =
+	{
+	{ 0x0020, 0x007F, TChar::ENarrow },
+	{ 0x00A2, 0x00A4, TChar::ENarrow },
+	{ 0x00A5, 0x00A7, TChar::ENarrow },
+	{ 0x00AF, 0x00B0, TChar::ENarrow },
+	{ 0x00B1, 0x1100, TChar::ENeutralWidth },
+	{ 0x1100, 0x1160, TChar::EWide },
+	{ 0x1160, 0x2E80, TChar::ENeutralWidth },
+	{ 0x2E80, 0xD7A4, TChar::EWide },
+	{ 0xF900, 0xFA2E, TChar::EWide },
+	{ 0xFE30, 0xFE6C, TChar::EWide },
+	{ 0xFF01, 0xFF5F, TChar::EFullWidth },
+	{ 0xFF61, 0xFFDD, TChar::EHalfWidth },
+	{ 0xFFE0, 0xFFE7, TChar::EFullWidth },
+	{ 0xFFE8, 0xFFEF, TChar::EHalfWidth }
+	};
+
+const TInt TheWidthInfos = sizeof(TheWidthInfoTable) / sizeof(TheWidthInfoTable[0]);
+
+/*
+Get the notional width used by East Asian encoding systems. No check is made that the character is assigned.
+No separate 'ambiguous width' is returned; ambiguous characters are treated as neutral except for those
+in the CJK range, which are treated as wide. This is a big simplification, but the cost of an exhaustive table
+is too great to justify at the moment.
+*/
+TChar::TCjkWidth TUnicode::GetCjkWidth() const
+	{
+	const TWidthInfo* w = TheWidthInfoTable;
+	for (int i = 0; i < TheWidthInfos; i++, w++)
+		if (iCode >= w->iStart && iCode < w->iEnd)
+			return w->iWidth;
+	return TChar::ENeutralWidth;
+	}
+
+/*
+Convert a Unicode character into a form most likely to be equal to another character, while
+still preserving the essential meaning of the character. Possible folding operations include
+converting to lower case (TChar::EFoldCase), stripping accents (TChar::EFoldAccents) and others.
+The flag value has a default, TChar::EFoldStandard, which performs the folding operations done
+by calling Fold functions with no flags argument, and there is also TChar::EFoldAll,
+which performs all possible folding operations.
+
+Note that the difference between folding and collation is that folding is
+	*	character-based
+	*	biased towards yielding equality where possible
+while collation is
+	*	string-based
+	*	designed to yield a non-equal ordering
+
+Typically, folding will be used when searching for a match, while collation will be used when
+sorting a list.
+*/
+EXPORT_C TUint TUnicode::Fold(TInt aFlags,const TUnicodeDataSet *aOverridingDataSet) const
+	{
+	TUint result = iCode;
+
+	/*
+	Fold CJK width variants. This only applies to characters 0xFF00 and above so we can use
+	a built-in table.
+	*/
+	if (result >= 0xFF00 && (aFlags & TChar::EFoldWidth))
+		result = CjkWidthFoldTable[result & 0xFF];
+
+	/*
+	If the character is <= 0x00FF and the flags include folding case and stripping accents,
+	and there is no overriding character data, we can use the built-in fold table.
+	*/
+	const TUnicodeData* data = NULL;
+	if (aOverridingDataSet)
+		data = GetDataFromDataSet(*aOverridingDataSet);
+	if (data == NULL && result < 256 &&
+		(aFlags & (TChar::EFoldCase | TChar::EFoldAccents)) == (TChar::EFoldCase | TChar::EFoldAccents))
+		return FoldTable[result];
+
+	/*
+	Other characters have to be dealt with laboriously.
+	The first operations are those that, if successful, tell us that nothing more
+	need be done. If a value is folded to a space or a digit or converted to Katakana
+	it cannot have anything else done to it.
+	*/
+	if (aFlags & TChar::EFoldKana)
+		{
+		if ((result >= 0x3041 && result <= 0x3094) || result == 0x309D || result == 0x309E)
+			return result += 0x0060;
+		}
+	if (data == NULL)
+		data = &GetData(NULL);
+	if (aFlags & TChar::EFoldSpaces)
+		{
+		if (data->iCategory == TChar::EZsCategory)
+			return 0x0020;
+		}
+	if (aFlags & TChar::EFoldDigits)
+		{
+		TInt n = GetNumericValue(*data);
+		if (n >= 0 && n <= 9)
+			return 0x0030 + n;
+		}
+
+	/*
+	The final operations are the relatively rare and expensive ones (after the special
+	case dealt with above) of accent removal and case conversion.
+	*/
+	if ((aFlags & TChar::EFoldAccents) && (result < 0x2000))
+		{
+		/*
+		Throw away characters other than the first if all are accents. For the moment these
+		are defined as characters in the range 0x0300..0x0361. This definition may need
+		to be modified; or I may decide to store a flag in the decomposition table indicating
+		whether or not the decomposition consists of base + accent(s).
+		*/
+		TPtrC16 decomposition;
+		if (::DecomposeChar(iCode, decomposition))
+			{
+			TBool all_accents = TRUE;			
+			for (TInt i = 1; all_accents && i < decomposition.Length(); ++i)
+				{
+				if (decomposition[i] < 0x0300 || decomposition[i] > 0x0361)
+					all_accents = FALSE;
+				}
+			if (all_accents)
+				result = decomposition[0];
+			}
+		}
+
+	if (aFlags & TChar::EFoldCase)
+		{
+		if (aOverridingDataSet == NULL && result < 256)
+			result = FoldTable[result];
+		else
+			result = TUnicode(result).GetLowerCase(aOverridingDataSet);
+		}
+	
+	return result;
+	}
+
+/*
+Compare two Unicode strings naively by Unicode value. This is NOT the same as a comparison
+of null-terminated strings; the strings can contain null characters (Unicode 0x0000) and they
+compare greater than no character. This means that the string { 0x0001 0x0000 } always comes
+after the string { 0x0001 }.
+
+This function exists to make it easier to search tables of Unicode strings (like the composition
+buffer) using the binary chop method. It is also used by READTYPE when sorting the compose table.
+
+The return values are: 0 for equality, < 0 if aString1 < aString2, > 0 if aString1 > aString2.
+*/
+TInt TUnicode::Compare(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2)
+	{
+	for (TInt i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++)
+		{
+		TInt x = i < aLength1 ? *aString1 : -1;
+		TInt y = i < aLength2 ? *aString2 : -1;
+		if (x != y)
+			return x - y;
+		}
+	return 0;
+	}
+