0
|
1 |
// Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
|
|
2 |
// All rights reserved.
|
|
3 |
// This component and the accompanying materials are made available
|
|
4 |
// under the terms of the License "Eclipse Public License v1.0"
|
|
5 |
// which accompanies this distribution, and is available
|
|
6 |
// at the URL "http://www.eclipse.org/legal/epl-v10.html".
|
|
7 |
//
|
|
8 |
// Initial Contributors:
|
|
9 |
// Nokia Corporation - initial contribution.
|
|
10 |
//
|
|
11 |
// Contributors:
|
|
12 |
//
|
|
13 |
// Description:
|
|
14 |
// e32\euser\unicode\unicode.cpp
|
|
15 |
// The implementation of the base-level Unicode character classification functions. These are members of
|
|
16 |
// a class called TUnicode that contains a Unicode value.
|
|
17 |
//
|
|
18 |
//
|
|
19 |
|
|
20 |
#include <unicode.h>
|
|
21 |
#include "CompareImp.h"
|
|
22 |
|
|
23 |
// Fallback character record: general category Cn, bidirectional category "other neutral",
// non-numeric. GetData returns it for code values that lie beyond plane 16.
static const TUnicodeData TheDefaultUnicodeData =
    {
    TChar::ECnCategory,
    TChar::EOtherNeutral,
    0,
    0,
    0,
    TUnicodeData::ENonNumeric
    };
|
|
25 |
|
|
26 |
|
|
27 |
// Declarations for tables held in unitable.cpp and used by unicode.cpp.
|
|
28 |
#ifndef __KERNEL_MODE__
|
|
29 |
extern const TStandardUnicodeDataSet TheStandardUnicodeDataSet[];
|
|
30 |
extern const TUnicodePlane ThePlanes[17];
|
|
31 |
#endif
|
|
32 |
|
|
33 |
|
|
34 |
// Fill in a TChar::TCharInfo structure with category information about the character.
|
|
35 |
void TUnicode::GetInfo(TChar::TCharInfo& aInfo,const TUnicodeDataSet *aOverridingDataSet) const
|
|
36 |
{
|
|
37 |
const TUnicodeData& data = GetData(aOverridingDataSet);
|
|
38 |
aInfo.iCategory = (TChar::TCategory)data.iCategory;
|
|
39 |
aInfo.iBdCategory = (TChar::TBdCategory)data.iBdCategory;
|
|
40 |
aInfo.iCombiningClass = data.iCombiningClass;
|
|
41 |
aInfo.iLowerCase = iCode;
|
|
42 |
aInfo.iUpperCase = iCode;
|
|
43 |
aInfo.iTitleCase = iCode;
|
|
44 |
if (data.iFlags & TUnicodeData::EHasLowerCase)
|
|
45 |
aInfo.iLowerCase = GetLowerCase(data);
|
|
46 |
if (data.iFlags & TUnicodeData::EHasUpperCase)
|
|
47 |
aInfo.iUpperCase = GetUpperCase(data);
|
|
48 |
if (data.iFlags & TUnicodeData::EHasTitleCase)
|
|
49 |
aInfo.iTitleCase = GetTitleCase(data);
|
|
50 |
aInfo.iMirrored = data.iFlags & TUnicodeData::EMirrored;
|
|
51 |
if (data.iFlags & TUnicodeData::ENumericFlags)
|
|
52 |
aInfo.iNumericValue = GetNumericValue(data);
|
|
53 |
else
|
|
54 |
aInfo.iNumericValue = -1;
|
|
55 |
}
|
|
56 |
|
|
57 |
/*
|
|
58 |
Get the data describing a character. If "aOverridingDataSet" is non-null, look in that
|
|
59 |
data set before searching the standard data set.
|
|
60 |
*/
|
|
61 |
const TUnicodeData& TUnicode::GetData(const TUnicodeDataSet *aOverridingDataSet) const
|
|
62 |
{
|
|
63 |
const TUnicodeData *result = NULL;
|
|
64 |
if (aOverridingDataSet)
|
|
65 |
result = GetDataFromDataSet(*aOverridingDataSet);
|
|
66 |
if (result == NULL)
|
|
67 |
{
|
|
68 |
if (0xFFFF >= iCode)
|
|
69 |
{
|
|
70 |
// optimize for BMP characters (plane 0)
|
|
71 |
TInt index = TheStandardUnicodeDataSet[0].iIndex1[iCode >> 4];
|
|
72 |
if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index
|
|
73 |
index &= ~0x8000;
|
|
74 |
else
|
|
75 |
index = TheStandardUnicodeDataSet[0].iIndex2[index + (iCode & 0x000F)];
|
|
76 |
return TheStandardUnicodeDataSet[0].iData[index];
|
|
77 |
}
|
|
78 |
else
|
|
79 |
{
|
|
80 |
// for non-BMP characters (plane 1-16)
|
|
81 |
TInt plane = (iCode >> 16);
|
|
82 |
if (plane > 16)
|
|
83 |
{
|
|
84 |
// for now we have no data for values above U+10FFFF
|
|
85 |
return TheDefaultUnicodeData;
|
|
86 |
}
|
|
87 |
TInt codesPerBlock = ThePlanes[plane].iCodesPerBlock;
|
|
88 |
TInt maskForCodePoint = ThePlanes[plane].iMaskForCodePoint;
|
|
89 |
|
|
90 |
TInt low16bit = (iCode & 0xFFFF);
|
|
91 |
TInt index = TheStandardUnicodeDataSet[plane].iIndex1[low16bit >> codesPerBlock];
|
|
92 |
if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index
|
|
93 |
index &= ~0x8000;
|
|
94 |
else
|
|
95 |
index = TheStandardUnicodeDataSet[plane].iIndex2[index + (low16bit & maskForCodePoint)];
|
|
96 |
return TheStandardUnicodeDataSet[plane].iData[index];
|
|
97 |
}
|
|
98 |
}
|
|
99 |
|
|
100 |
return *result;
|
|
101 |
}
|
|
102 |
|
|
103 |
/*
|
|
104 |
Given a character data set, get the data referring to this character.
|
|
105 |
Return NULL if no data is available in this data set.
|
|
106 |
*/
|
|
107 |
const TUnicodeData *TUnicode::GetDataFromDataSet(const TUnicodeDataSet& aDataSet) const
|
|
108 |
{
|
|
109 |
// Perform a binary chop to find the range containing this character.
|
|
110 |
TInt n = aDataSet.iRanges;
|
|
111 |
const TUnicodeDataRange *base = aDataSet.iRange;
|
|
112 |
const TUnicodeDataRange *last = base + n - 1;
|
|
113 |
const TUnicodeDataRange *r = base;
|
|
114 |
|
|
115 |
while (n > 1)
|
|
116 |
{
|
|
117 |
TInt pivot = n / 2;
|
|
118 |
r += pivot;
|
|
119 |
if (iCode < r->iRangeStart) // it's before this range
|
|
120 |
n = pivot;
|
|
121 |
else if (r < last && iCode >= r[1].iRangeStart) // it's after this range
|
|
122 |
{
|
|
123 |
base = r + 1;
|
|
124 |
n -= pivot + 1;
|
|
125 |
}
|
|
126 |
else // it's in this range
|
|
127 |
break;
|
|
128 |
r = base;
|
|
129 |
}
|
|
130 |
|
|
131 |
if (r->iIndex >= 0)
|
|
132 |
return &aDataSet.iData[r->iIndex]; // index >= 0: data available
|
|
133 |
else
|
|
134 |
return NULL; // index < 0: no data available
|
|
135 |
}
|
|
136 |
|
|
137 |
// Return the general category stored for this character; a non-null overriding data set
// is searched before the standard data.
EXPORT_C TChar::TCategory TUnicode::GetCategory(const TUnicodeDataSet *aOverridingDataSet) const
    {
    const TUnicodeData& charData = GetData(aOverridingDataSet);
    return static_cast<TChar::TCategory>(charData.iCategory);
    }
|
|
141 |
|
|
142 |
// Return the bidirectional category stored for this character; a non-null overriding data set
// is searched before the standard data.
TChar::TBdCategory TUnicode::GetBdCategory(const TUnicodeDataSet *aOverridingDataSet) const
    {
    const TUnicodeData& charData = GetData(aOverridingDataSet);
    return static_cast<TChar::TBdCategory>(charData.iBdCategory);
    }
|
|
146 |
|
|
147 |
// Return the combining class stored for this character; a non-null overriding data set
// is searched before the standard data.
TInt TUnicode::GetCombiningClass(const TUnicodeDataSet *aOverridingDataSet) const
    {
    const TUnicodeData& charData = GetData(aOverridingDataSet);
    return charData.iCombiningClass;
    }
|
|
151 |
|
|
152 |
// Return the lower case form of this character, or the character itself if it has none.
// A non-null overriding data set is searched before the standard data.
EXPORT_C TUint TUnicode::GetLowerCase(const TUnicodeDataSet *aOverridingDataSet) const
    {
    const TUnicodeData& charData = GetData(aOverridingDataSet);
    return GetLowerCase(charData);
    }
|
|
156 |
|
|
157 |
// Return the upper case form of this character, or the character itself if it has none.
// A non-null overriding data set is searched before the standard data.
EXPORT_C TUint TUnicode::GetUpperCase(const TUnicodeDataSet *aOverridingDataSet) const
    {
    const TUnicodeData& charData = GetData(aOverridingDataSet);
    return GetUpperCase(charData);
    }
|
|
161 |
|
|
162 |
// Work out the lower case form from an already fetched data record: the case offset is
// added to the code when the record is flagged as having a lower case variant.
TUint TUnicode::GetLowerCase(const TUnicodeData& aData) const
    {
    if (!(aData.iFlags & TUnicodeData::EHasLowerCase))
        return iCode;       // no lower case variant: unchanged

    return iCode + aData.iCaseOffset;
    }
|
|
169 |
|
|
170 |
// Work out the upper case form from an already fetched data record: the case offset is
// subtracted from the code when the record is flagged as having an upper case variant.
TUint TUnicode::GetUpperCase(const TUnicodeData& aData) const
    {
    if (!(aData.iFlags & TUnicodeData::EHasUpperCase))
        return iCode;       // no upper case variant: unchanged

    return iCode - aData.iCaseOffset;
    }
|
|
177 |
|
|
178 |
// Return the title case form of this character. A non-null overriding data set is searched
// before the standard data.
TUint TUnicode::GetTitleCase(const TUnicodeDataSet *aOverridingDataSet) const
    {
    const TUnicodeData& charData = GetData(aOverridingDataSet);
    return GetTitleCase(charData);
    }
|
|
182 |
|
|
183 |
/*
Work out the title case form from an already fetched data record.
Most characters have a title case form identical to their upper case form. The few flagged
with EHasTitleCase are resolved from their position relative to the title case character.
*/
TUint TUnicode::GetTitleCase(const TUnicodeData& aData) const
    {
    // The common case: title case is simply upper case.
    if (!(aData.iFlags & TUnicodeData::EHasTitleCase))
        return GetUpperCase(aData);

    const TBool hasUpperForm = aData.iFlags & TUnicodeData::EHasUpperCase;
    const TBool hasLowerForm = aData.iFlags & TUnicodeData::EHasLowerCase;

    // No upper case variant: the title case form is the next code.
    if (!hasUpperForm)
        return iCode + 1;

    // No lower case variant: the title case form is the previous code.
    if (!hasLowerForm)
        return iCode - 1;

    // Both variants exist, so this character is itself the title case form.
    return iCode;
    }
|
|
201 |
|
|
202 |
// Return non-zero if the EMirrored flag is set in this character's data. A non-null
// overriding data set is searched before the standard data.
TBool TUnicode::IsMirrored(const TUnicodeDataSet *aOverridingDataSet) const
    {
    const TUnicodeData& charData = GetData(aOverridingDataSet);
    return charData.iFlags & TUnicodeData::EMirrored;
    }
|
|
206 |
|
|
207 |
// Return the integer numeric value of this character, -1 if it is not numeric, or -2 if
// its value is fractional. A non-null overriding data set is searched before the standard data.
TInt TUnicode::GetNumericValue(const TUnicodeDataSet *aOverridingDataSet) const
    {
    const TUnicodeData& charData = GetData(aOverridingDataSet);
    return GetNumericValue(charData);
    }
|
|
211 |
|
|
212 |
/*
|
|
213 |
Return the integer numeric value of this character.
|
|
214 |
Return -1 if the character is not numeric, or -2 if it has a fractional value.
|
|
215 |
*/
|
|
216 |
TInt TUnicode::GetNumericValue(const TUnicodeData& aData) const
|
|
217 |
{
|
|
218 |
switch (aData.iFlags & TUnicodeData::ENumericFlags)
|
|
219 |
{
|
|
220 |
case TUnicodeData::ENonNumeric: return -1;
|
|
221 |
case TUnicodeData::ESmallNumeric: return (iCode + aData.iDigitOffset) & 0xFF;
|
|
222 |
case TUnicodeData::EFiveHundred: return 500;
|
|
223 |
case TUnicodeData::EOneThousand: return 1000;
|
|
224 |
case TUnicodeData::EFiveThousand: return 5000;
|
|
225 |
case TUnicodeData::ETenThousand: return 10000;
|
|
226 |
case TUnicodeData::EHundredThousand: return 100000;
|
|
227 |
case TUnicodeData::EFraction: return -2;
|
|
228 |
default: return -1; // we should never come here
|
|
229 |
}
|
|
230 |
}
|
|
231 |
|
|
232 |
struct TWidthInfo
|
|
233 |
{
|
|
234 |
TUint iStart;
|
|
235 |
TUint iEnd;
|
|
236 |
TChar::TCjkWidth iWidth;
|
|
237 |
};
|
|
238 |
|
|
239 |
static const TWidthInfo TheWidthInfoTable[] =
|
|
240 |
{
|
|
241 |
{ 0x0020, 0x007F, TChar::ENarrow },
|
|
242 |
{ 0x00A2, 0x00A4, TChar::ENarrow },
|
|
243 |
{ 0x00A5, 0x00A7, TChar::ENarrow },
|
|
244 |
{ 0x00AF, 0x00B0, TChar::ENarrow },
|
|
245 |
{ 0x00B1, 0x1100, TChar::ENeutralWidth },
|
|
246 |
{ 0x1100, 0x1160, TChar::EWide },
|
|
247 |
{ 0x1160, 0x2E80, TChar::ENeutralWidth },
|
|
248 |
{ 0x2E80, 0xD7A4, TChar::EWide },
|
|
249 |
{ 0xF900, 0xFA2E, TChar::EWide },
|
|
250 |
{ 0xFE30, 0xFE6C, TChar::EWide },
|
|
251 |
{ 0xFF01, 0xFF5F, TChar::EFullWidth },
|
|
252 |
{ 0xFF61, 0xFFDD, TChar::EHalfWidth },
|
|
253 |
{ 0xFFE0, 0xFFE7, TChar::EFullWidth },
|
|
254 |
{ 0xFFE8, 0xFFEF, TChar::EHalfWidth },
|
|
255 |
{ 0x20000, 0x2A6DF, TChar::EWide }, // CJK Unified Ideographs Extension B
|
|
256 |
{ 0x2F800, 0x2FA1F, TChar::EWide }, // CJK Unified Ideographs Supplement
|
|
257 |
};
|
|
258 |
|
|
259 |
const TInt TheWidthInfos = sizeof(TheWidthInfoTable) / sizeof(TheWidthInfoTable[0]);
|
|
260 |
|
|
261 |
/*
|
|
262 |
Get the notional width used by East Asian encoding systems. No check is made that the character is assigned.
|
|
263 |
No separate 'ambiguous width' is returned; ambiguous characters are treated as neutral except for those
|
|
264 |
in the CJK range, which are treated as wide. This is a big simplification, but the cost of an exhaustive table
|
|
265 |
is too great to justify at the moment.
|
|
266 |
*/
|
|
267 |
TChar::TCjkWidth TUnicode::GetCjkWidth() const
|
|
268 |
{
|
|
269 |
const TWidthInfo* w = TheWidthInfoTable;
|
|
270 |
for (TInt i = 0; i < TheWidthInfos; i++, w++)
|
|
271 |
if (iCode >= w->iStart && iCode < w->iEnd)
|
|
272 |
return w->iWidth;
|
|
273 |
return TChar::ENeutralWidth;
|
|
274 |
}
|
|
275 |
|
|
276 |
/*
|
|
277 |
Convert a Unicode character into a form most likely to be equal to another character, while
|
|
278 |
still preserving the essential meaning of the character. Possible folding operations include
|
|
279 |
converting to lower case (TChar::EFoldCase), stripping accents (TChar::EFoldAccents) and others.
|
|
280 |
The flag value has a default, TChar::EFoldStandard, which performs the folding operations done
|
|
281 |
by calling Fold functions with no flags argument, and there is also TChar::EFoldAll,
|
|
282 |
which performs all possible folding operations.
|
|
283 |
|
|
284 |
Note that the difference between folding and collation is that folding is
|
|
285 |
* character-based
|
|
286 |
* biased towards yielding equality where possible
|
|
287 |
while collation is
|
|
288 |
* string-based
|
|
289 |
* designed to yield a non-equal ordering
|
|
290 |
|
|
291 |
Typically, folding will be used when searching for a match, while collation will be used when
|
|
292 |
sorting a list.
|
|
293 |
*/
|
|
294 |
EXPORT_C TUint TUnicode::Fold(TInt aFlags,const TUnicodeDataSet *aOverridingDataSet) const
|
|
295 |
{
|
|
296 |
TUint result = iCode;
|
|
297 |
|
|
298 |
/*
|
|
299 |
Fold CJK width variants. This only applies to characters 0xFF00 and above so we can use
|
|
300 |
a built-in table.
|
|
301 |
*/
|
|
302 |
if (result >= 0xFF00 && (aFlags & TChar::EFoldWidth))
|
|
303 |
result = CjkWidthFoldTable[result & 0xFF];
|
|
304 |
|
|
305 |
/*
|
|
306 |
If the character is <= 0x00FF and the flags include folding case and stripping accents,
|
|
307 |
and there is no overriding character data, we can use the built-in fold table.
|
|
308 |
*/
|
|
309 |
const TUnicodeData* data = NULL;
|
|
310 |
if (aOverridingDataSet)
|
|
311 |
data = GetDataFromDataSet(*aOverridingDataSet);
|
|
312 |
if (data == NULL && result < 256 &&
|
|
313 |
(aFlags & (TChar::EFoldCase | TChar::EFoldAccents)) == (TChar::EFoldCase | TChar::EFoldAccents))
|
|
314 |
return FoldTable[result];
|
|
315 |
|
|
316 |
/*
|
|
317 |
Other characters have to be dealt with laboriously.
|
|
318 |
The first operations are those that, if successful, tell us that nothing more
|
|
319 |
need be done. If a value is folded to a space or a digit or converted to Katakana
|
|
320 |
it cannot have anything else done to it.
|
|
321 |
*/
|
|
322 |
if (aFlags & TChar::EFoldKana)
|
|
323 |
{
|
|
324 |
if ((result >= 0x3041 && result <= 0x3094) || result == 0x309D || result == 0x309E)
|
|
325 |
return result += 0x0060;
|
|
326 |
}
|
|
327 |
if (data == NULL)
|
|
328 |
data = &GetData(NULL);
|
|
329 |
if (aFlags & TChar::EFoldSpaces)
|
|
330 |
{
|
|
331 |
if (data->iCategory == TChar::EZsCategory)
|
|
332 |
return 0x0020;
|
|
333 |
}
|
|
334 |
if (aFlags & TChar::EFoldDigits)
|
|
335 |
{
|
|
336 |
TInt n = GetNumericValue(*data);
|
|
337 |
if (n >= 0 && n <= 9)
|
|
338 |
return 0x0030 + n;
|
|
339 |
}
|
|
340 |
|
|
341 |
/*
|
|
342 |
The final operations are the relatively rare and expensive ones (after the special
|
|
343 |
case dealt with above) of accent removal and case conversion.
|
|
344 |
*/
|
|
345 |
if ((aFlags & TChar::EFoldAccents) && (result < 0x2000))
|
|
346 |
{
|
|
347 |
/*
|
|
348 |
Throw away characters other than the first if all are accents. For the moment these
|
|
349 |
are defined as characters in the range 0x0300..0x0361. This definition may need
|
|
350 |
to be modified; or I may decide to store a flag in the decomposition table indicating
|
|
351 |
whether or not the decomposition consists of base + accent(s).
|
|
352 |
*/
|
|
353 |
TPtrC16 decomposition;
|
|
354 |
if (::DecomposeChar(iCode, decomposition))
|
|
355 |
{
|
|
356 |
TBool all_accents = TRUE;
|
|
357 |
for (TInt i = 1; all_accents && i < decomposition.Length(); ++i)
|
|
358 |
{
|
|
359 |
if (decomposition[i] < 0x0300 || decomposition[i] > 0x0361)
|
|
360 |
all_accents = FALSE;
|
|
361 |
}
|
|
362 |
if (all_accents)
|
|
363 |
result = decomposition[0];
|
|
364 |
}
|
|
365 |
}
|
|
366 |
|
|
367 |
if (aFlags & TChar::EFoldCase)
|
|
368 |
{
|
|
369 |
if (aOverridingDataSet == NULL && result < 256)
|
|
370 |
result = FoldTable[result];
|
|
371 |
else
|
|
372 |
result = TUnicode(result).GetLowerCase(aOverridingDataSet);
|
|
373 |
}
|
|
374 |
|
|
375 |
return result;
|
|
376 |
}
|
|
377 |
|
|
378 |
/*
|
|
379 |
Compare two Unicode strings naively by Unicode value. This is NOT the same as a comparison
|
|
380 |
of null-terminated strings; the strings can contain null characters (Unicode 0x0000) and they
|
|
381 |
compare greater than no character. This means that the string { 0x0001 0x0000 } always comes
|
|
382 |
after the string { 0x0001 }.
|
|
383 |
|
|
384 |
This function exists to make it easier to search tables of Unicode strings (like the composition
|
|
385 |
buffer) using the binary chop method. It is also used by READTYPE when sorting the compose table.
|
|
386 |
|
|
387 |
The return values are: 0 for equality, < 0 if aString1 < aString2, > 0 if aString1 > aString2.
|
|
388 |
*/
|
|
389 |
TInt TUnicode::Compare(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2)
|
|
390 |
{
|
|
391 |
for (TInt i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++)
|
|
392 |
{
|
|
393 |
TInt x = i < aLength1 ? *aString1 : -1;
|
|
394 |
TInt y = i < aLength2 ? *aString2 : -1;
|
|
395 |
if (x != y)
|
|
396 |
return x - y;
|
|
397 |
}
|
|
398 |
return 0;
|
|
399 |
}
|
|
400 |
|