kernel/eka/euser/unicode/unicode.cpp
changeset 0 a41df078684a
equal deleted inserted replaced
-1:000000000000 0:a41df078684a
       
     1 // Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     2 // All rights reserved.
       
     3 // This component and the accompanying materials are made available
       
     4 // under the terms of the License "Eclipse Public License v1.0"
       
     5 // which accompanies this distribution, and is available
       
     6 // at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     7 //
       
     8 // Initial Contributors:
       
     9 // Nokia Corporation - initial contribution.
       
    10 //
       
    11 // Contributors:
       
    12 //
       
    13 // Description:
       
    14 // e32\euser\unicode\unicode.cpp
       
    15 // The implementation of the base-level Unicode character classification functions. These are members of
       
    16 // a class called TUnicode that contains a Unicode value.
       
    17 // 
       
    18 //
       
    19 
       
    20 #include <unicode.h>
       
    21 #include "CompareImp.h"
       
    22 
       
    23 static const TUnicodeData TheDefaultUnicodeData =
       
    24 	{ TChar::ECnCategory, TChar::EOtherNeutral, 0, 0, 0, TUnicodeData::ENonNumeric };
       
    25 
       
    26 
       
    27 // Declarations for tables held in unitable.cpp and used by unicode.cpp.
       
    28 #ifndef __KERNEL_MODE__
       
    29 extern const TStandardUnicodeDataSet TheStandardUnicodeDataSet[];
       
    30 extern const TUnicodePlane ThePlanes[17];
       
    31 #endif
       
    32 
       
    33 
       
    34 // Fill in a TChar::TCharInfo structure with category information about the character.
       
    35 void TUnicode::GetInfo(TChar::TCharInfo& aInfo,const TUnicodeDataSet *aOverridingDataSet) const
       
    36 	{
       
    37 	const TUnicodeData& data = GetData(aOverridingDataSet);
       
    38 	aInfo.iCategory = (TChar::TCategory)data.iCategory;
       
    39 	aInfo.iBdCategory = (TChar::TBdCategory)data.iBdCategory;
       
    40 	aInfo.iCombiningClass = data.iCombiningClass;
       
    41 	aInfo.iLowerCase = iCode;
       
    42 	aInfo.iUpperCase = iCode;
       
    43 	aInfo.iTitleCase = iCode;
       
    44 	if (data.iFlags & TUnicodeData::EHasLowerCase)
       
    45 		aInfo.iLowerCase = GetLowerCase(data);
       
    46 	if (data.iFlags & TUnicodeData::EHasUpperCase)
       
    47 		aInfo.iUpperCase = GetUpperCase(data);
       
    48 	if (data.iFlags & TUnicodeData::EHasTitleCase)
       
    49 		aInfo.iTitleCase = GetTitleCase(data);
       
    50 	aInfo.iMirrored = data.iFlags & TUnicodeData::EMirrored;
       
    51 	if (data.iFlags & TUnicodeData::ENumericFlags)
       
    52 		aInfo.iNumericValue = GetNumericValue(data);
       
    53 	else
       
    54 		aInfo.iNumericValue = -1;
       
    55 	}
       
    56 
       
    57 /*
       
    58 Get the data describing a character. If "aOverridingDataSet" is non-null, look in that
       
    59 data set before searching the standard data set.
       
    60 */
       
    61 const TUnicodeData& TUnicode::GetData(const TUnicodeDataSet *aOverridingDataSet) const
       
    62 	{
       
    63 	const TUnicodeData *result = NULL;
       
    64 	if (aOverridingDataSet)
       
    65 		result = GetDataFromDataSet(*aOverridingDataSet);
       
    66 	if (result == NULL)
       
    67 		{
       
    68 		if (0xFFFF >= iCode)
       
    69 			{
       
    70 			// optimize for BMP characters (plane 0)
       
    71 			TInt index = TheStandardUnicodeDataSet[0].iIndex1[iCode >> 4];
       
    72 			if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index
       
    73 				index &= ~0x8000;
       
    74 			else
       
    75 				index = TheStandardUnicodeDataSet[0].iIndex2[index + (iCode & 0x000F)];
       
    76 			return TheStandardUnicodeDataSet[0].iData[index];
       
    77 			}
       
    78 		else
       
    79 			{
       
    80 			// for non-BMP characters (plane 1-16)
       
    81 			TInt plane = (iCode >> 16);
       
    82 			if (plane > 16)
       
    83 				{
       
    84 				// for now we have no data for values above U+10FFFF
       
    85 				return TheDefaultUnicodeData;
       
    86 				}
       
    87 			TInt codesPerBlock = ThePlanes[plane].iCodesPerBlock;
       
    88 			TInt maskForCodePoint = ThePlanes[plane].iMaskForCodePoint;
       
    89 			
       
    90 			TInt low16bit = (iCode & 0xFFFF);
       
    91 			TInt index = TheStandardUnicodeDataSet[plane].iIndex1[low16bit >> codesPerBlock];
       
    92 			if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index
       
    93 				index &= ~0x8000;
       
    94 			else
       
    95 				index = TheStandardUnicodeDataSet[plane].iIndex2[index + (low16bit & maskForCodePoint)];
       
    96 			return TheStandardUnicodeDataSet[plane].iData[index];
       
    97 			}
       
    98 		}
       
    99 
       
   100 	return *result;
       
   101 	}
       
   102 
       
   103 /*
       
   104 Given a character data set, get the data referring to this character.
       
   105 Return NULL if no data is available in this data set.
       
   106 */
       
   107 const TUnicodeData *TUnicode::GetDataFromDataSet(const TUnicodeDataSet& aDataSet) const
       
   108 	{
       
   109 	// Perform a binary chop to find the range containing this character.
       
   110 	TInt n = aDataSet.iRanges;
       
   111 	const TUnicodeDataRange *base = aDataSet.iRange;
       
   112 	const TUnicodeDataRange *last = base + n - 1;
       
   113 	const TUnicodeDataRange *r = base;
       
   114 
       
   115 	while (n > 1)
       
   116 		{
       
   117 		TInt pivot = n / 2;
       
   118 		r += pivot;
       
   119 		if (iCode < r->iRangeStart)									// it's before this range
       
   120 			n = pivot;
       
   121 		else if (r < last && iCode >= r[1].iRangeStart)				// it's after this range
       
   122 			{
       
   123 			base = r + 1;
       
   124 			n -= pivot + 1;
       
   125 			}
       
   126 		else														// it's in this range
       
   127 			break;
       
   128 		r = base;
       
   129 		}
       
   130 
       
   131 	if (r->iIndex >= 0)
       
   132 		return &aDataSet.iData[r->iIndex];		// index >= 0: data available
       
   133 	else
       
   134 		return NULL;							// index < 0: no data available
       
   135 	}
       
   136 
       
   137 EXPORT_C TChar::TCategory TUnicode::GetCategory(const TUnicodeDataSet *aOverridingDataSet) const
       
   138 	{
       
   139 	return (TChar::TCategory)GetData(aOverridingDataSet).iCategory;
       
   140 	}
       
   141 
       
   142 TChar::TBdCategory TUnicode::GetBdCategory(const TUnicodeDataSet *aOverridingDataSet) const
       
   143 	{
       
   144 	return (TChar::TBdCategory)GetData(aOverridingDataSet).iBdCategory;
       
   145 	}
       
   146 
       
   147 TInt TUnicode::GetCombiningClass(const TUnicodeDataSet *aOverridingDataSet) const
       
   148 	{
       
   149 	return GetData(aOverridingDataSet).iCombiningClass;
       
   150 	}
       
   151 
       
   152 EXPORT_C TUint TUnicode::GetLowerCase(const TUnicodeDataSet *aOverridingDataSet) const
       
   153 	{
       
   154 	return GetLowerCase(GetData(aOverridingDataSet));
       
   155 	}
       
   156 
       
   157 EXPORT_C TUint TUnicode::GetUpperCase(const TUnicodeDataSet *aOverridingDataSet) const
       
   158 	{
       
   159 	return GetUpperCase(GetData(aOverridingDataSet));
       
   160 	}
       
   161 
       
   162 TUint TUnicode::GetLowerCase(const TUnicodeData& aData) const
       
   163 	{
       
   164 	if (aData.iFlags & TUnicodeData::EHasLowerCase)
       
   165 		return iCode + aData.iCaseOffset;
       
   166 	else
       
   167 		return iCode;
       
   168 	}
       
   169 
       
   170 TUint TUnicode::GetUpperCase(const TUnicodeData& aData) const
       
   171 	{
       
   172 	if (aData.iFlags & TUnicodeData::EHasUpperCase)
       
   173 		return iCode - aData.iCaseOffset;
       
   174 	else
       
   175 		return iCode;
       
   176 	}
       
   177 
       
   178 TUint TUnicode::GetTitleCase(const TUnicodeDataSet *aOverridingDataSet) const
       
   179 	{
       
   180 	return GetTitleCase(GetData(aOverridingDataSet));
       
   181 	}
       
   182 
       
   183 TUint TUnicode::GetTitleCase(const TUnicodeData& aData) const
       
   184 	{
       
   185 	// Handle the very few characters with distinct title case variants.
       
   186 	if (aData.iFlags & TUnicodeData::EHasTitleCase)
       
   187 		{
       
   188 		// If the character has no upper case variant add one to get the title case form.
       
   189 		if (!(aData.iFlags & TUnicodeData::EHasUpperCase))
       
   190 			return iCode + 1;
       
   191 		// If the character has no lower case variant subtract one to get the title case form.
       
   192 		if (!(aData.iFlags & TUnicodeData::EHasLowerCase))
       
   193 			return iCode - 1;
       
   194 		// Both upper and lower case forms exist so the character itself must be title case.
       
   195 		return iCode;
       
   196 		}
       
   197 
       
   198 	// All other characters have title case forms that are the same as their upper case forms.
       
   199 	return GetUpperCase(aData);
       
   200 	}
       
   201 
       
   202 TBool TUnicode::IsMirrored(const TUnicodeDataSet *aOverridingDataSet) const
       
   203 	{
       
   204 	return GetData(aOverridingDataSet).iFlags & TUnicodeData::EMirrored;
       
   205 	}
       
   206 
       
   207 TInt TUnicode::GetNumericValue(const TUnicodeDataSet *aOverridingDataSet) const
       
   208 	{
       
   209 	return GetNumericValue(GetData(aOverridingDataSet));
       
   210 	}
       
   211 
       
   212 /*
       
   213 Return the integer numeric value of this character.
       
   214 Return -1 if the character is not numeric, or -2 if it has a fractional value.
       
   215 */
       
   216 TInt TUnicode::GetNumericValue(const TUnicodeData& aData) const
       
   217 	{
       
   218 	switch (aData.iFlags & TUnicodeData::ENumericFlags)
       
   219 		{
       
   220 		case TUnicodeData::ENonNumeric: return -1;
       
   221 		case TUnicodeData::ESmallNumeric: return (iCode + aData.iDigitOffset) & 0xFF;
       
   222 		case TUnicodeData::EFiveHundred: return 500;
       
   223 		case TUnicodeData::EOneThousand: return 1000;
       
   224 		case TUnicodeData::EFiveThousand: return 5000;
       
   225 		case TUnicodeData::ETenThousand: return 10000;
       
   226 		case TUnicodeData::EHundredThousand: return 100000;
       
   227 		case TUnicodeData::EFraction: return -2;
       
   228 		default: return -1; // we should never come here
       
   229 		}
       
   230 	}
       
   231 
       
   232 struct TWidthInfo
       
   233 	{
       
   234 	TUint iStart;
       
   235 	TUint iEnd;
       
   236 	TChar::TCjkWidth iWidth;
       
   237 	};
       
   238 
       
   239 static const TWidthInfo TheWidthInfoTable[] =
       
   240 	{
       
   241 	{ 0x0020, 0x007F, TChar::ENarrow },
       
   242 	{ 0x00A2, 0x00A4, TChar::ENarrow },
       
   243 	{ 0x00A5, 0x00A7, TChar::ENarrow },
       
   244 	{ 0x00AF, 0x00B0, TChar::ENarrow },
       
   245 	{ 0x00B1, 0x1100, TChar::ENeutralWidth },
       
   246 	{ 0x1100, 0x1160, TChar::EWide },
       
   247 	{ 0x1160, 0x2E80, TChar::ENeutralWidth },
       
   248 	{ 0x2E80, 0xD7A4, TChar::EWide },
       
   249 	{ 0xF900, 0xFA2E, TChar::EWide },
       
   250 	{ 0xFE30, 0xFE6C, TChar::EWide },
       
   251 	{ 0xFF01, 0xFF5F, TChar::EFullWidth },
       
   252 	{ 0xFF61, 0xFFDD, TChar::EHalfWidth },
       
   253 	{ 0xFFE0, 0xFFE7, TChar::EFullWidth },
       
   254 	{ 0xFFE8, 0xFFEF, TChar::EHalfWidth },
       
   255 	{ 0x20000, 0x2A6DF, TChar::EWide },		// CJK Unified Ideographs Extension B
       
   256 	{ 0x2F800, 0x2FA1F, TChar::EWide },		// CJK Unified Ideographs Supplement
       
   257 	};
       
   258 
       
   259 const TInt TheWidthInfos = sizeof(TheWidthInfoTable) / sizeof(TheWidthInfoTable[0]);
       
   260 
       
   261 /*
       
   262 Get the notional width used by East Asian encoding systems. No check is made that the character is assigned.
       
   263 No separate 'ambiguous width' is returned; ambiguous characters are treated as neutral except for those
       
   264 in the CJK range, which are treated as wide. This is a big simplification, but the cost of an exhaustive table
       
   265 is too great to justify at the moment.
       
   266 */
       
   267 TChar::TCjkWidth TUnicode::GetCjkWidth() const
       
   268 	{
       
   269 	const TWidthInfo* w = TheWidthInfoTable;
       
   270 	for (TInt i = 0; i < TheWidthInfos; i++, w++)
       
   271 		if (iCode >= w->iStart && iCode < w->iEnd)
       
   272 			return w->iWidth;
       
   273 	return TChar::ENeutralWidth;
       
   274 	}
       
   275 
       
   276 /*
       
   277 Convert a Unicode character into a form most likely to be equal to another character, while
       
   278 still preserving the essential meaning of the character. Possible folding operations include
       
   279 converting to lower case (TChar::EFoldCase), stripping accents (TChar::EFoldAccents) and others.
       
   280 The flag value has a default, TChar::EFoldStandard, which performs the folding operations done
       
   281 by calling Fold functions with no flags argument, and there is also TChar::EFoldAll,
       
   282 which performs all possible folding operations.
       
   283 
       
   284 Note that the difference between folding and collation is that folding is
       
   285 	*	character-based
       
   286 	*	biased towards yielding equality where possible
       
   287 while collation is
       
   288 	*	string-based
       
   289 	*	designed to yield a non-equal ordering
       
   290 
       
   291 Typically, folding will be used when searching for a match, while collation will be used when
       
   292 sorting a list.
       
   293 */
       
   294 EXPORT_C TUint TUnicode::Fold(TInt aFlags,const TUnicodeDataSet *aOverridingDataSet) const
       
   295 	{
       
   296 	TUint result = iCode;
       
   297 
       
   298 	/*
       
   299 	Fold CJK width variants. This only applies to characters 0xFF00 and above so we can use
       
   300 	a built-in table.
       
   301 	*/
       
   302 	if (result >= 0xFF00 && (aFlags & TChar::EFoldWidth))
       
   303 		result = CjkWidthFoldTable[result & 0xFF];
       
   304 
       
   305 	/*
       
   306 	If the character is <= 0x00FF and the flags include folding case and stripping accents,
       
   307 	and there is no overriding character data, we can use the built-in fold table.
       
   308 	*/
       
   309 	const TUnicodeData* data = NULL;
       
   310 	if (aOverridingDataSet)
       
   311 		data = GetDataFromDataSet(*aOverridingDataSet);
       
   312 	if (data == NULL && result < 256 &&
       
   313 		(aFlags & (TChar::EFoldCase | TChar::EFoldAccents)) == (TChar::EFoldCase | TChar::EFoldAccents))
       
   314 		return FoldTable[result];
       
   315 
       
   316 	/*
       
   317 	Other characters have to be dealt with laboriously.
       
   318 	The first operations are those that, if successful, tell us that nothing more
       
   319 	need be done. If a value is folded to a space or a digit or converted to Katakana
       
   320 	it cannot have anything else done to it.
       
   321 	*/
       
   322 	if (aFlags & TChar::EFoldKana)
       
   323 		{
       
   324 		if ((result >= 0x3041 && result <= 0x3094) || result == 0x309D || result == 0x309E)
       
   325 			return result += 0x0060;
       
   326 		}
       
   327 	if (data == NULL)
       
   328 		data = &GetData(NULL);
       
   329 	if (aFlags & TChar::EFoldSpaces)
       
   330 		{
       
   331 		if (data->iCategory == TChar::EZsCategory)
       
   332 			return 0x0020;
       
   333 		}
       
   334 	if (aFlags & TChar::EFoldDigits)
       
   335 		{
       
   336 		TInt n = GetNumericValue(*data);
       
   337 		if (n >= 0 && n <= 9)
       
   338 			return 0x0030 + n;
       
   339 		}
       
   340 
       
   341 	/*
       
   342 	The final operations are the relatively rare and expensive ones (after the special
       
   343 	case dealt with above) of accent removal and case conversion.
       
   344 	*/
       
   345 	if ((aFlags & TChar::EFoldAccents) && (result < 0x2000))
       
   346 		{
       
   347 		/*
       
   348 		Throw away characters other than the first if all are accents. For the moment these
       
   349 		are defined as characters in the range 0x0300..0x0361. This definition may need
       
   350 		to be modified; or I may decide to store a flag in the decomposition table indicating
       
   351 		whether or not the decomposition consists of base + accent(s).
       
   352 		*/
       
   353 		TPtrC16 decomposition;
       
   354 		if (::DecomposeChar(iCode, decomposition))
       
   355 			{
       
   356 			TBool all_accents = TRUE;			
       
   357 			for (TInt i = 1; all_accents && i < decomposition.Length(); ++i)
       
   358 				{
       
   359 				if (decomposition[i] < 0x0300 || decomposition[i] > 0x0361)
       
   360 					all_accents = FALSE;
       
   361 				}
       
   362 			if (all_accents)
       
   363 				result = decomposition[0];
       
   364 			}
       
   365 		}
       
   366 
       
   367 	if (aFlags & TChar::EFoldCase)
       
   368 		{
       
   369 		if (aOverridingDataSet == NULL && result < 256)
       
   370 			result = FoldTable[result];
       
   371 		else
       
   372 			result = TUnicode(result).GetLowerCase(aOverridingDataSet);
       
   373 		}
       
   374 	
       
   375 	return result;
       
   376 	}
       
   377 
       
   378 /*
       
   379 Compare two Unicode strings naively by Unicode value. This is NOT the same as a comparison
       
   380 of null-terminated strings; the strings can contain null characters (Unicode 0x0000) and they
       
   381 compare greater than no character. This means that the string { 0x0001 0x0000 } always comes
       
   382 after the string { 0x0001 }.
       
   383 
       
   384 This function exists to make it easier to search tables of Unicode strings (like the composition
       
   385 buffer) using the binary chop method. It is also used by READTYPE when sorting the compose table.
       
   386 
       
   387 The return values are: 0 for equality, < 0 if aString1 < aString2, > 0 if aString1 > aString2.
       
   388 */
       
   389 TInt TUnicode::Compare(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2)
       
   390 	{
       
   391 	for (TInt i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++)
       
   392 		{
       
   393 		TInt x = i < aLength1 ? *aString1 : -1;
       
   394 		TInt y = i < aLength2 ? *aString2 : -1;
       
   395 		if (x != y)
       
   396 			return x - y;
       
   397 		}
       
   398 	return 0;
       
   399 	}
       
   400