|
1 // Copyright (c) 1996-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
2 // All rights reserved. |
|
3 // This component and the accompanying materials are made available |
|
4 // under the terms of the License "Symbian Foundation License v1.0" to Symbian Foundation members and "Symbian Foundation End User License Agreement v1.0" to non-members |
|
5 // which accompanies this distribution, and is available |
|
6 // at the URL "http://www.symbianfoundation.org/legal/licencesv10.html". |
|
7 // |
|
8 // Initial Contributors: |
|
9 // Nokia Corporation - initial contribution. |
|
10 // |
|
11 // Contributors: |
|
12 // |
|
13 // Description: |
|
14 // e32\include\collate.h |
|
15 // Definitions needed for Unicode collation. |
|
16 // Collation is the comparison of two Unicode strings to produce an ordering |
|
17 // that may be used in a dictionary or other list. |
|
18 // Collation is implemented using the Standard Unicode Collation algorithm. There |
|
19 // are four levels of comparison: |
|
20 // primary: basic character identity |
|
21 // secondary: accents and diacritics |
|
22 // tertiary: upper and lower case, and other minor attributes |
|
23 // quaternary: Unicode character value |
|
24 // Punctuation is normally ignored but can optionally be taken into account. |
|
25 // Strings are fully expanded using the standard Unicode canonical expansions before |
|
26 // they are compared. Thai and Lao vowels are swapped with the following character |
|
27 // if any. |
|
28 // EUSER contains the 'basic collation method'. This method assigns the standard Unicode collation key values |
|
29 // to the characters in the WGL4 repertoire, plus commonly used control characters and fixed-width spaces, plus |
|
30 // the CJK ideograms (for which the keys can be generated algorithmically). Other characters are collated after |
|
31 // all the characters for which keys are defined, and ordered by their Unicode values. |
|
32 // Locales can supply any number of other collation methods. They will usually supply a 'tailoring' of the standard |
|
33 // method. This is done by using the standard table as the main key table (signalled by placing NULL in |
|
34 // TCollationMethod::iMainTable) and specifying an override table (TCollationMethod::iOverrideTable). |
|
35 // Locale-specific collation data resides in ELOCL. |
|
36 // |
|
37 // |
|
38 |
|
39 |
|
40 |
|
41 #ifndef __COLLATE_H__ |
|
42 #define __COLLATE_H__ |
|
43 |
|
44 #ifdef __KERNEL_MODE__ |
|
45 #include <e32cmn.h> |
|
46 #else |
|
47 #include <e32std.h> |
|
48 #endif |
|
49 |
|
50 //This material is used in the Unicode build only. |
|
51 #ifdef _UNICODE |
|
52 |
|
53 /** |
|
54 Collation key table: the packed collation keys plus the index arrays used to locate the keys for a Unicode value or string. |
|
55 @publishedPartner |
|
56 */ |
|
57 struct TCollationKeyTable |
|
58 { |
|
59 public: |
|
60 /** |
|
61 Masks for the various parts of the elements of the iKey array. |
|
62 */ |
|
63 enum |
|
64 { |
|
65 ELevel0Mask = 0xFFFF0000, // bits 16-31: primary key - basic character identity |
|
66 ELevel1Mask = 0x0000FF00, // bits 8-15: secondary key - accents and diacritics |
|
67 ELevel2Mask = 0x000000FC, // bits 2-7: tertiary key - case, etc. |
|
68 EIgnoreFlag = 0x2, // bit 1: if set, this key is normally ignored |
|
69 EStopFlag = 0x1 // bit 0: if set, this key is the last in a sequence representing a Unicode value or values |
|
70 }; |
|
71 |
|
72 /** |
|
73 An array containing all of the keys and strings of keys concatenated |
|
74 together. Each key has EStopFlag set only if it is the last key in its |
|
75 string. Each key contains the keys for levels 0, 1 and 2, and a flag |
|
76 EIgnoreFlag if the key is usually ignored (for punctuation & spaces |
|
77 etc.). |
|
78 */ |
|
79 const TUint32* iKey; |
|
80 /** |
|
81 An array of indices into the iKey array. Each element has its high 16 |
|
82 bits indicating a Unicode value and its low 16 bits indicating an index |
|
83 into the iKey array at which its key starts. The elements are sorted by |
|
84 Unicode value. |
|
85 */ |
|
86 const TUint32* iIndex; |
|
87 /** |
|
88 The size of the iIndex array. |
|
89 */ |
|
90 TInt iIndices; |
|
91 /** |
|
92 Concatenated Unicode strings. Each is a string that is to be converted |
|
93 to keys differently from how it would be if each letter were converted |
|
94 independently. An example is "ch" in Spanish, which sorts as though it |
|
95 were a single letter. Each Unicode string is preceded by a 16-bit value |
|
96 indicating the string's length. The end of the string is not delimited. |
|
97 */ |
|
98 const TUint16* iString; |
|
99 /** |
|
100 An array of elements mapping elements of iString to elements of iIndex. |
|
101 Each element has its high 16 bits indicating the index of the start of |
|
102 an element of iString, and its low 16 bits indicating the corresponding |
|
103 element in iIndex. This array is sorted on the string index. |
|
104 */ |
|
105 const TUint32* iStringIndex; |
|
106 /** |
|
107 The size of the iStringIndex array. |
|
108 */ |
|
109 TInt iStringIndices; |
|
110 }; |
|
111 |
|
112 /** |
|
113 Defines a collation method. |
|
114 |
|
115 Collation means sorting pieces of text. It needs to take into account characters, |
|
116 accents and case; spaces and punctuation are usually ignored. It differs from |
|
117 ordinary methods of sorting in that it is locale-dependent - different |
|
118 languages use different ordering methods. Additionally, multiple collation |
|
119 methods may exist within the same locale. |
|
120 |
|
121 A collation method provides the collation keys and other data needed to customise |
|
122 collation; the Mem and TDesC16 collation functions (e.g. Mem::CompareC()) |
|
123 perform the collation. Note that these functions use the standard collation |
|
124 method for the current locale - you only need to specify an object of class |
|
125 TCollationMethod to customise this collation scheme. Collation methods can |
|
126 be retrieved using member functions of the Mem class. Each one has a unique |
|
127 identifier. |
|
128 |
|
129 A collation method specifies a main table of collation keys, and optionally |
|
130 an overriding table that contains keys for which the values in the main table |
|
131 are overridden. A collation key table (TCollationKeyTable) is the set of collation |
|
132 keys: primary (basic character identity), secondary (accents and diacritics) |
|
133 and tertiary (case). The quaternary key is the Unicode character values themselves. |
|
134 |
|
135 The simplest way to customise a collation method is to create a local copy |
|
136 of the standard collation method and change it. For example, you could use |
|
137 the standard method, but not ignore punctuation and spaces: |
|
138 |
|
139 @code |
|
140 TCollationMethod m = *Mem::CollationMethodByIndex(0); // get the standard method |
|
141 m.iFlags |= TCollationMethod::EIgnoreNone; // do not ignore punctuation and spaces |
|
142 @endcode |
|
143 |
|
144 @publishedPartner |
|
145 */ |
|
146 struct TCollationMethod |
|
147 { |
|
148 public: |
|
149 /** |
|
150 The UID of this collation method. |
|
151 */ |
|
152 TUint iId; |
|
153 |
|
154 /** |
|
155 The main collation key table; if NULL, use the standard table. |
|
156 */ |
|
157 const TCollationKeyTable* iMainTable; |
|
158 |
|
159 /** |
|
160 If non-NULL, tailoring for collation keys. |
|
161 */ |
|
162 const TCollationKeyTable* iOverrideTable; |
|
163 enum |
|
164 { |
|
165 /** |
|
166 Don't ignore any keys (punctuation, etc. is normally ignored). |
|
167 */ |
|
168 EIgnoreNone = 1, |
|
169 |
|
170 /** |
|
171 Reverse the normal order for characters differing only in case. |
|
172 */ |
|
173 ESwapCase = 2, |
|
174 |
|
175 /** |
|
176 Compare secondary keys which represent accents in reverse |
|
177 order (from right to left); this is needed for French when comparing |
|
178 words that differ only in accents. |
|
179 */ |
|
180 EAccentsBackwards = 4, |
|
181 |
|
182 /** |
|
183 Reverse the normal order for characters differing only in whether they |
|
184 are katakana or hiragana. |
|
185 */ |
|
186 ESwapKana = 8, |
|
187 |
|
188 /** |
|
189 Fold all characters to lower case before extracting keys; needed for |
|
190 comparison of filenames, for which case is ignored but other |
|
191 tertiary (level-2) distinctions are not. |
|
192 */ |
|
193 EFoldCase = 16, |
|
194 |
|
195 /** Flag to indicate a collation method for matching purposes. |
|
196 This flag is only needed if we wish to specify a particular collation method |
|
197 to be used for matching purposes. |
|
198 */ |
|
199 EMatchingTable = 32, |
|
200 |
|
201 /** Ignore the check for adjacent combining characters. A combining |
|
202 character effectively changes the character it combines with to something |
|
203 else and so a match doesn't occur. Setting this flag will allow character |
|
204 matching regardless of any combining characters. |
|
205 */ |
|
206 EIgnoreCombining = 64 |
|
207 }; |
|
208 |
|
209 /** |
|
210 Flags: a bitwise combination of the enumerators declared above (EIgnoreNone, ESwapCase, EAccentsBackwards, ESwapKana, EFoldCase, EMatchingTable, EIgnoreCombining); each enumerator is a distinct power of two. |
|
211 |
|
212 @see TCollationMethod::EIgnoreNone |
|
213 @see TCollationMethod::ESwapCase |
|
214 @see TCollationMethod::EAccentsBackwards |
|
215 @see TCollationMethod::ESwapKana |
|
216 @see TCollationMethod::EFoldCase, TCollationMethod::EMatchingTable, TCollationMethod::EIgnoreCombining |
|
217 */ |
|
218 TUint iFlags; |
|
219 }; |
|
220 |
|
221 /** |
|
222 A collation data set provides any collation methods needed by a locale. iMethod points to the collation method data; iMethods is presumably the number of methods reachable through iMethod (compare TCollationKeyTable::iIndices, the size of the iIndex array) - TODO confirm. |
|
223 @publishedPartner |
|
224 */ |
|
225 struct TCollationDataSet |
|
226 { |
|
227 public: |
|
228 const TCollationMethod* iMethod; |
|
229 TInt iMethods; |
|
230 }; |
|
231 |
|
232 // Collation method IDs |
|
233 |
|
234 /** |
|
235 Collation method ID (compare TCollationMethod::iId) of the basic collation method; presumably the method that the description at the top of this file says is contained in EUSER - TODO confirm. |
|
236 @internalTechnology |
|
237 @released |
|
238 */ |
|
239 const TUint KUidBasicCollationMethod = 0x10004F4E; |
|
240 |
|
241 /** |
|
242 Collation method ID (compare TCollationMethod::iId) of the standard Unicode collation method. |
|
243 @internalTechnology |
|
244 @released |
|
245 */ |
|
246 const TUint KUidStandardUnicodeCollationMethod = 0x10004E96; |
|
247 |
|
248 #ifndef __KERNEL_MODE__ |
|
249 |
|
250 //Forward declarations: the TCollate declaration below uses these types only through pointers and references. |
|
251 class TUTF32Iterator; |
|
252 struct LCharSet; |
|
253 |
|
254 /** |
|
255 Provides low-level collation functions. |
|
256 @internalComponent |
|
257 */ |
|
258 class TCollate |
|
259 { |
|
260 public: |
|
261 /** |
|
262 Construct a TCollate object based on the collation method specified |
|
263 within aCharSet, if any. If there is none, or aCharSet is null, the |
|
264 standard collation method will be used. aMask and aFlags provide a |
|
265 method for overriding the flags in the collation method: Each flag set |
|
266 to 1 in aMask is a flag that will be overridden and set to the |
|
267 corresponding flag value in aFlags. Ownership of aCharSet is not passed. |
|
268 */ |
|
269 TCollate(const LCharSet* aCharSet,TUint aMask = 0,TUint aFlags = 0xFFFFFFFF); |
|
270 /** |
|
271 Construct a TCollate object based on an already constructed |
|
272 TCollationMethod specified in aMethod. Ownership is not passed. |
|
273 */ |
|
274 TCollate(const TCollationMethod& aMethod); |
|
275 /** Outcome of a comparison. Negative values are the outcomes named for the left string, positive values are those named for the right string, and zero means the strings are identical. */ |
|
276 enum TComparisonResult |
|
277 { |
|
278 ELeftComparesLessAndIsNotPrefix = -2, |
|
279 ELeftIsPrefixOfRight = -1, |
|
280 EStringsIdentical = 0, |
|
281 ERightIsPrefixOfLeft = 1, |
|
282 ERightComparesLessAndIsNotPrefix = 2 |
|
283 }; |
|
284 |
|
285 /** |
|
286 Compare the string beginning at aString1 of length aLength1 against the |
|
287 string beginning at aString2 of length aLength2. |
|
288 aMaxLevel determines the tightness of the collation. At level 0, only |
|
289 character identities are distinguished. At level 1 accents are |
|
290 distinguished as well. At level 2 case is distinguished as well. At |
|
291 level 3 all valid different Unicode characters are considered different. |
|
292 */ |
|
293 TComparisonResult Compare(const TUint16* aString1,TInt aLength1, |
|
294 const TUint16* aString2,TInt aLength2, |
|
295 TInt aMaxLevel = 3) const; |
|
296 /** |
|
297 Find the string beginning at aString2 of length aLength2 in the string |
|
298 beginning at aString1 of length aLength1. aMaxLevel determines |
|
299 the tightness of the collation, see Compare for details. aString2WildChar is not described in this header; presumably it is a wild-card character accepted in aString2, with 0 meaning none - TODO confirm. |
|
300 */ |
|
301 TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2, |
|
302 TInt aMaxLevel,TUint aString2WildChar = 0) const; |
|
303 /** Overload of Find with one additional output argument, aLengthFound; presumably it receives the length of the match found in aString1 - TODO confirm. */ |
|
304 TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2, |
|
305 TInt &aLengthFound,TInt aMaxLevel,TUint aString2WildChar = 0) const; |
|
306 |
|
307 /** |
|
308 Test if the string beginning at aSearchTerm of length aSearchTermLength |
|
309 matches the string beginning at aCandidate of length aCandidateLength. |
|
310 aMaxLevel determines the tightness of the collation, see |
|
311 Compare for details. The search term may have wild card characters as |
|
312 specified by aWildChar (for matching a single grapheme- i.e. character |
|
313 and any characters that combine with it, such as accents) and |
|
314 aWildSequenceChar (for matching any sequence of whole graphemes). The |
|
315 return value is KErrNotFound iff the search term does not match the |
|
316 candidate string exactly. To find a match within the candidate string, |
|
317 the search term must begin and end with a wild sequence character. If |
|
318 the search term does match the candidate string, 0 will be returned, |
|
319 unless the first character of the search term is a wild sequence |
|
320 character in which case the value returned will be the index into |
|
321 aCandidate at which the first non-wild sequence character matched. |
|
322 aWildSequenceChar must be a valid (non-surrogate) Unicode character |
|
323 below FFFE. aEscapeChar is not described in this header; presumably it escapes the wild card characters in the search term, with 0 meaning no escape character - TODO confirm. |
|
324 */ |
|
325 TInt Match(const TUint16 *aCandidate, TInt aCandidateLength, |
|
326 const TUint16 *aSearchTerm,TInt aSearchTermLength, |
|
327 TInt aMaxLevel, TUint aWildChar = '?', TUint aWildSequenceChar = '*', TUint aEscapeChar = 0) const; |
|
328 |
|
329 private: |
|
330 /** |
|
331 Compare values output from the iterators. After the comparison, if |
|
332 ERightIsPrefixOfLeft or EStringsIdentical is returned, then aLeft and |
|
333 aRight will be pointing at the next key (at MaxLevel) after the match. |
|
334 If right is shown to be a prefix of left, this means that it has been |
|
335 checked at all requested levels. If it is reported that the right is a |
|
336 prefix of the left, then this will mean also that there are no unmatched |
|
337 combining characters on the left. |
|
338 */ |
|
339 TComparisonResult CompareKeySequences(TUTF32Iterator& aLeft, TUTF32Iterator& aRight, |
|
340 TInt aMaxLevel, TInt aRightStringWildChar, TInt aEscapeChar) const; |
|
341 /** |
|
342 Finds search term inside candidate string. Returns KErrNotFound if there |
|
343 is no match, returns the offset into the candidate string at which the |
|
344 search term was found (note that this is the offset from the start of |
|
345 the iteration, not from where the iteration was when the function was |
|
346 called). If a string was found, the search term iterator is left |
|
347 pointing at the end of the search term, and the candidate iterator is |
|
348 left pointing just after the matched keys. NOTE(review): this comment previously |
|
349 said that aMatchPos returns where in the candidate string the match was found, but the declaration has no aMatchPos; its only output argument is aLengthFound (presumably the length of the match - TODO confirm). |
|
350 */ |
|
351 TInt FindKeySequence(TUTF32Iterator& aCandidate, TUTF32Iterator& aSearchTerm, |
|
352 TInt aMaxLevel, TInt aWildChar, TInt aEscapeChar, TInt& aLengthFound) const; |
|
353 /** The collation method this object works with, held by value. */ |
|
354 private: |
|
355 TCollationMethod iMethod; |
|
356 }; |
|
357 |
|
358 #endif // __KERNEL_MODE__ |
|
359 |
|
360 #endif // _UNICODE |
|
361 |
|
362 #endif // __COLLATE_H__ |