symport/e32/euser/unicode/unicode.cpp
changeset 1 0a7b44b10206
child 2 806186ab5e14
equal deleted inserted replaced
0:c55016431358 1:0a7b44b10206
       
     1 // Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     2 // All rights reserved.
       
     3 // This component and the accompanying materials are made available
       
     4 // under the terms of the License "Symbian Foundation License v1.0"
       
     5 // which accompanies this distribution, and is available
       
     6 // at the URL "http://www.symbianfoundation.org/legal/sfl-v10.html".
       
     7 //
       
     8 // Initial Contributors:
       
     9 // Nokia Corporation - initial contribution.
       
    10 //
       
    11 // Contributors:
       
    12 //
       
    13 // Description:
       
    14 // e32\euser\unicode\unicode.cpp
       
    15 // The implementation of the base-level Unicode character classification functions. These are members of
       
    16 // a class called TUnicode that contains a Unicode value.
       
    17 // 
       
    18 //
       
    19 
       
    20 #include <unicode.h>
       
    21 #include "compareimp.h"
       
    22 
       
    23 static const TUnicodeData TheDefaultUnicodeData =
       
    24 	{ TChar::ECnCategory, TChar::EOtherNeutral, 0, 0, 0, TUnicodeData::ENonNumeric };
       
    25 
       
    26 // Fill in a TChar::TCharInfo structure with category information about the character.
       
    27 void TUnicode::GetInfo(TChar::TCharInfo& aInfo,const TUnicodeDataSet *aOverridingDataSet) const
       
    28 	{
       
    29 	const TUnicodeData& data = GetData(aOverridingDataSet);
       
    30 	aInfo.iCategory = (TChar::TCategory)data.iCategory;
       
    31 	aInfo.iBdCategory = (TChar::TBdCategory)data.iBdCategory;
       
    32 	aInfo.iCombiningClass = data.iCombiningClass;
       
    33 	aInfo.iLowerCase = iCode;
       
    34 	aInfo.iUpperCase = iCode;
       
    35 	aInfo.iTitleCase = iCode;
       
    36 	if (data.iFlags & TUnicodeData::EHasLowerCase)
       
    37 		aInfo.iLowerCase = GetLowerCase(data);
       
    38 	if (data.iFlags & TUnicodeData::EHasUpperCase)
       
    39 		aInfo.iUpperCase = GetUpperCase(data);
       
    40 	if (data.iFlags & TUnicodeData::EHasTitleCase)
       
    41 		aInfo.iTitleCase = GetTitleCase(data);
       
    42 	aInfo.iMirrored = data.iFlags & TUnicodeData::EMirrored;
       
    43 	if (data.iFlags & TUnicodeData::ENumericFlags)
       
    44 		aInfo.iNumericValue = GetNumericValue(data);
       
    45 	else
       
    46 		aInfo.iNumericValue = -1;
       
    47 	}
       
    48 
       
    49 /*
       
    50 Get the data describing a character. If "aOverridingDataSet" is non-null, look in that
       
    51 data set before searching the standard data set.
       
    52 */
       
    53 const TUnicodeData& TUnicode::GetData(const TUnicodeDataSet *aOverridingDataSet) const
       
    54 	{
       
    55 	const TUnicodeData *result = NULL;
       
    56 	if (aOverridingDataSet)
       
    57 		result = GetDataFromDataSet(*aOverridingDataSet);
       
    58 	if (result == NULL)
       
    59 		{
       
    60 		if (0xFFFF < iCode)
       
    61 			// for now we have no data for values above U+FFFF
       
    62 			return TheDefaultUnicodeData;
       
    63 		int index = TheStandardUnicodeDataSet.iIndex1[iCode >> 4];
       
    64 		if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index
       
    65 			index &= ~0x8000;
       
    66 		else
       
    67 			index = TheStandardUnicodeDataSet.iIndex2[index + (iCode & 0x000F)];
       
    68 		return TheStandardUnicodeDataSet.iData[index];
       
    69 		}
       
    70 
       
    71 	return *result;
       
    72 	}
       
    73 
       
    74 /*
       
    75 Given a character data set, get the data referring to this character.
       
    76 Return NULL if no data is available in this data set.
       
    77 */
       
    78 const TUnicodeData *TUnicode::GetDataFromDataSet(const TUnicodeDataSet& aDataSet) const
       
    79 	{
       
    80 	// Perform a binary chop to find the range containing this character.
       
    81 	TInt n = aDataSet.iRanges;
       
    82 	const TUnicodeDataRange *base = aDataSet.iRange;
       
    83 	const TUnicodeDataRange *last = base + n - 1;
       
    84 	const TUnicodeDataRange *r = base;
       
    85 
       
    86 	while (n > 1)
       
    87 		{
       
    88 		TInt pivot = n / 2;
       
    89 		r += pivot;
       
    90 		if (iCode < r->iRangeStart)									// it's before this range
       
    91 			n = pivot;
       
    92 		else if (r < last && iCode >= r[1].iRangeStart)				// it's after this range
       
    93 			{
       
    94 			base = r + 1;
       
    95 			n -= pivot + 1;
       
    96 			}
       
    97 		else														// it's in this range
       
    98 			break;
       
    99 		r = base;
       
   100 		}
       
   101 
       
   102 	if (r->iIndex >= 0)
       
   103 		return &aDataSet.iData[r->iIndex];		// index >= 0: data available
       
   104 	else
       
   105 		return NULL;							// index < 0: no data available
       
   106 	}
       
   107 
       
   108 EXPORT_C TChar::TCategory TUnicode::GetCategory(const TUnicodeDataSet *aOverridingDataSet) const
       
   109 	{
       
   110 	return (TChar::TCategory)GetData(aOverridingDataSet).iCategory;
       
   111 	}
       
   112 
       
   113 TChar::TBdCategory TUnicode::GetBdCategory(const TUnicodeDataSet *aOverridingDataSet) const
       
   114 	{
       
   115 	return (TChar::TBdCategory)GetData(aOverridingDataSet).iBdCategory;
       
   116 	}
       
   117 
       
   118 TInt TUnicode::GetCombiningClass(const TUnicodeDataSet *aOverridingDataSet) const
       
   119 	{
       
   120 	return GetData(aOverridingDataSet).iCombiningClass;
       
   121 	}
       
   122 
       
   123 EXPORT_C TUint TUnicode::GetLowerCase(const TUnicodeDataSet *aOverridingDataSet) const
       
   124 	{
       
   125 	return GetLowerCase(GetData(aOverridingDataSet));
       
   126 	}
       
   127 
       
   128 EXPORT_C TUint TUnicode::GetUpperCase(const TUnicodeDataSet *aOverridingDataSet) const
       
   129 	{
       
   130 	return GetUpperCase(GetData(aOverridingDataSet));
       
   131 	}
       
   132 
       
   133 TUint TUnicode::GetLowerCase(const TUnicodeData& aData) const
       
   134 	{
       
   135 	if (aData.iFlags & TUnicodeData::EHasLowerCase)
       
   136 		return iCode + aData.iCaseOffset;
       
   137 	else
       
   138 		return iCode;
       
   139 	}
       
   140 
       
   141 TUint TUnicode::GetUpperCase(const TUnicodeData& aData) const
       
   142 	{
       
   143 	if (aData.iFlags & TUnicodeData::EHasUpperCase)
       
   144 		return iCode - aData.iCaseOffset;
       
   145 	else
       
   146 		return iCode;
       
   147 	}
       
   148 
       
   149 TUint TUnicode::GetTitleCase(const TUnicodeDataSet *aOverridingDataSet) const
       
   150 	{
       
   151 	return GetTitleCase(GetData(aOverridingDataSet));
       
   152 	}
       
   153 
       
   154 TUint TUnicode::GetTitleCase(const TUnicodeData& aData) const
       
   155 	{
       
   156 	// Handle the very few characters with distinct title case variants.
       
   157 	if (aData.iFlags & TUnicodeData::EHasTitleCase)
       
   158 		{
       
   159 		// If the character has no upper case variant add one to get the title case form.
       
   160 		if (!(aData.iFlags & TUnicodeData::EHasUpperCase))
       
   161 			return iCode + 1;
       
   162 		// If the character has no lower case variant subtract one to get the title case form.
       
   163 		if (!(aData.iFlags & TUnicodeData::EHasLowerCase))
       
   164 			return iCode - 1;
       
   165 		// Both upper and lower case forms exist so the character itself must be title case.
       
   166 		return iCode;
       
   167 		}
       
   168 
       
   169 	// All other characters have title case forms that are the same as their upper case forms.
       
   170 	return GetUpperCase(aData);
       
   171 	}
       
   172 
       
   173 TBool TUnicode::IsMirrored(const TUnicodeDataSet *aOverridingDataSet) const
       
   174 	{
       
   175 	return GetData(aOverridingDataSet).iFlags & TUnicodeData::EMirrored;
       
   176 	}
       
   177 
       
   178 TInt TUnicode::GetNumericValue(const TUnicodeDataSet *aOverridingDataSet) const
       
   179 	{
       
   180 	return GetNumericValue(GetData(aOverridingDataSet));
       
   181 	}
       
   182 
       
   183 /*
       
   184 Return the integer numeric value of this character.
       
   185 Return -1 if the character is not numeric, or -2 if it has a fractional value.
       
   186 */
       
   187 TInt TUnicode::GetNumericValue(const TUnicodeData& aData) const
       
   188 	{
       
   189 	switch (aData.iFlags & TUnicodeData::ENumericFlags)
       
   190 		{
       
   191 		case TUnicodeData::ENonNumeric: return -1;
       
   192 		case TUnicodeData::ESmallNumeric: return (iCode + aData.iDigitOffset) & 0xFF;
       
   193 		case TUnicodeData::EFiveHundred: return 500;
       
   194 		case TUnicodeData::EOneThousand: return 1000;
       
   195 		case TUnicodeData::EFiveThousand: return 5000;
       
   196 		case TUnicodeData::ETenThousand: return 10000;
       
   197 		case TUnicodeData::EHundredThousand: return 100000;
       
   198 		case TUnicodeData::EFraction: return -2;
       
   199 		default: return -1; // we should never come here
       
   200 		}
       
   201 	}
       
   202 
       
   203 struct TWidthInfo
       
   204 	{
       
   205 	TUint iStart;
       
   206 	TUint iEnd;
       
   207 	TChar::TCjkWidth iWidth;
       
   208 	};
       
   209 
       
   210 static const TWidthInfo TheWidthInfoTable[] =
       
   211 	{
       
   212 	{ 0x0020, 0x007F, TChar::ENarrow },
       
   213 	{ 0x00A2, 0x00A4, TChar::ENarrow },
       
   214 	{ 0x00A5, 0x00A7, TChar::ENarrow },
       
   215 	{ 0x00AF, 0x00B0, TChar::ENarrow },
       
   216 	{ 0x00B1, 0x1100, TChar::ENeutralWidth },
       
   217 	{ 0x1100, 0x1160, TChar::EWide },
       
   218 	{ 0x1160, 0x2E80, TChar::ENeutralWidth },
       
   219 	{ 0x2E80, 0xD7A4, TChar::EWide },
       
   220 	{ 0xF900, 0xFA2E, TChar::EWide },
       
   221 	{ 0xFE30, 0xFE6C, TChar::EWide },
       
   222 	{ 0xFF01, 0xFF5F, TChar::EFullWidth },
       
   223 	{ 0xFF61, 0xFFDD, TChar::EHalfWidth },
       
   224 	{ 0xFFE0, 0xFFE7, TChar::EFullWidth },
       
   225 	{ 0xFFE8, 0xFFEF, TChar::EHalfWidth }
       
   226 	};
       
   227 
       
   228 const TInt TheWidthInfos = sizeof(TheWidthInfoTable) / sizeof(TheWidthInfoTable[0]);
       
   229 
       
   230 /*
       
   231 Get the notional width used by East Asian encoding systems. No check is made that the character is assigned.
       
   232 No separate 'ambiguous width' is returned; ambiguous characters are treated as neutral except for those
       
   233 in the CJK range, which are treated as wide. This is a big simplification, but the cost of an exhaustive table
       
   234 is too great to justify at the moment.
       
   235 */
       
   236 TChar::TCjkWidth TUnicode::GetCjkWidth() const
       
   237 	{
       
   238 	const TWidthInfo* w = TheWidthInfoTable;
       
   239 	for (int i = 0; i < TheWidthInfos; i++, w++)
       
   240 		if (iCode >= w->iStart && iCode < w->iEnd)
       
   241 			return w->iWidth;
       
   242 	return TChar::ENeutralWidth;
       
   243 	}
       
   244 
       
   245 /*
       
   246 Convert a Unicode character into a form most likely to be equal to another character, while
       
   247 still preserving the essential meaning of the character. Possible folding operations include
       
   248 converting to lower case (TChar::EFoldCase), stripping accents (TChar::EFoldAccents) and others.
       
   249 The flag value has a default, TChar::EFoldStandard, which performs the folding operations done
       
   250 by calling Fold functions with no flags argument, and there is also TChar::EFoldAll,
       
   251 which performs all possible folding operations.
       
   252 
       
   253 Note that the difference between folding and collation is that folding is
       
   254 	*	character-based
       
   255 	*	biased towards yielding equality where possible
       
   256 while collation is
       
   257 	*	string-based
       
   258 	*	designed to yield a non-equal ordering
       
   259 
       
   260 Typically, folding will be used when searching for a match, while collation will be used when
       
   261 sorting a list.
       
   262 */
       
   263 EXPORT_C TUint TUnicode::Fold(TInt aFlags,const TUnicodeDataSet *aOverridingDataSet) const
       
   264 	{
       
   265 	TUint result = iCode;
       
   266 
       
   267 	/*
       
   268 	Fold CJK width variants. This only applies to characters 0xFF00 and above so we can use
       
   269 	a built-in table.
       
   270 	*/
       
   271 	if (result >= 0xFF00 && (aFlags & TChar::EFoldWidth))
       
   272 		result = CjkWidthFoldTable[result & 0xFF];
       
   273 
       
   274 	/*
       
   275 	If the character is <= 0x00FF and the flags include folding case and stripping accents,
       
   276 	and there is no overriding character data, we can use the built-in fold table.
       
   277 	*/
       
   278 	const TUnicodeData* data = NULL;
       
   279 	if (aOverridingDataSet)
       
   280 		data = GetDataFromDataSet(*aOverridingDataSet);
       
   281 	if (data == NULL && result < 256 &&
       
   282 		(aFlags & (TChar::EFoldCase | TChar::EFoldAccents)) == (TChar::EFoldCase | TChar::EFoldAccents))
       
   283 		return FoldTable[result];
       
   284 
       
   285 	/*
       
   286 	Other characters have to be dealt with laboriously.
       
   287 	The first operations are those that, if successful, tell us that nothing more
       
   288 	need be done. If a value is folded to a space or a digit or converted to Katakana
       
   289 	it cannot have anything else done to it.
       
   290 	*/
       
   291 	if (aFlags & TChar::EFoldKana)
       
   292 		{
       
   293 		if ((result >= 0x3041 && result <= 0x3094) || result == 0x309D || result == 0x309E)
       
   294 			return result += 0x0060;
       
   295 		}
       
   296 	if (data == NULL)
       
   297 		data = &GetData(NULL);
       
   298 	if (aFlags & TChar::EFoldSpaces)
       
   299 		{
       
   300 		if (data->iCategory == TChar::EZsCategory)
       
   301 			return 0x0020;
       
   302 		}
       
   303 	if (aFlags & TChar::EFoldDigits)
       
   304 		{
       
   305 		TInt n = GetNumericValue(*data);
       
   306 		if (n >= 0 && n <= 9)
       
   307 			return 0x0030 + n;
       
   308 		}
       
   309 
       
   310 	/*
       
   311 	The final operations are the relatively rare and expensive ones (after the special
       
   312 	case dealt with above) of accent removal and case conversion.
       
   313 	*/
       
   314 	if ((aFlags & TChar::EFoldAccents) && (result < 0x2000))
       
   315 		{
       
   316 		/*
       
   317 		Throw away characters other than the first if all are accents. For the moment these
       
   318 		are defined as characters in the range 0x0300..0x0361. This definition may need
       
   319 		to be modified; or I may decide to store a flag in the decomposition table indicating
       
   320 		whether or not the decomposition consists of base + accent(s).
       
   321 		*/
       
   322 		TPtrC16 decomposition;
       
   323 		if (::DecomposeChar(iCode, decomposition))
       
   324 			{
       
   325 			TBool all_accents = TRUE;			
       
   326 			for (TInt i = 1; all_accents && i < decomposition.Length(); ++i)
       
   327 				{
       
   328 				if (decomposition[i] < 0x0300 || decomposition[i] > 0x0361)
       
   329 					all_accents = FALSE;
       
   330 				}
       
   331 			if (all_accents)
       
   332 				result = decomposition[0];
       
   333 			}
       
   334 		}
       
   335 
       
   336 	if (aFlags & TChar::EFoldCase)
       
   337 		{
       
   338 		if (aOverridingDataSet == NULL && result < 256)
       
   339 			result = FoldTable[result];
       
   340 		else
       
   341 			result = TUnicode(result).GetLowerCase(aOverridingDataSet);
       
   342 		}
       
   343 	
       
   344 	return result;
       
   345 	}
       
   346 
       
   347 /*
       
   348 Compare two Unicode strings naively by Unicode value. This is NOT the same as a comparison
       
   349 of null-terminated strings; the strings can contain null characters (Unicode 0x0000) and they
       
   350 compare greater than no character. This means that the string { 0x0001 0x0000 } always comes
       
   351 after the string { 0x0001 }.
       
   352 
       
   353 This function exists to make it easier to search tables of Unicode strings (like the composition
       
   354 buffer) using the binary chop method. It is also used by READTYPE when sorting the compose table.
       
   355 
       
   356 The return values are: 0 for equality, < 0 if aString1 < aString2, > 0 if aString1 > aString2.
       
   357 */
       
   358 TInt TUnicode::Compare(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2)
       
   359 	{
       
   360 	for (TInt i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++)
       
   361 		{
       
   362 		TInt x = i < aLength1 ? *aString1 : -1;
       
   363 		TInt y = i < aLength2 ? *aString2 : -1;
       
   364 		if (x != y)
       
   365 			return x - y;
       
   366 		}
       
   367 	return 0;
       
   368 	}
       
   369