diff -r 000000000000 -r a41df078684a kernel/eka/include/collate.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kernel/eka/include/collate.h Mon Oct 19 15:55:17 2009 +0100 @@ -0,0 +1,370 @@ +// Copyright (c) 1996-2009 Nokia Corporation and/or its subsidiary(-ies). +// All rights reserved. +// This component and the accompanying materials are made available +// under the terms of the License "Eclipse Public License v1.0" +// which accompanies this distribution, and is available +// at the URL "http://www.eclipse.org/legal/epl-v10.html". +// +// Initial Contributors: +// Nokia Corporation - initial contribution. +// +// Contributors: +// +// Description: +// e32\include\collate.h +// Definitions needed for Unicode collation. +// Collation is the comparison of two Unicode strings to produce an ordering +// that may be used in a dictionary or other list. +// Collation is implemented using the Standard Unicode Collation algorithm. There +// are four levels of comparison: +// primary: basic character identity +// secondary: accents and diacritics +// tertiary: upper and lower case, and other minor attributes +// quaternary: Unicode character value +// Punctuation is normally ignored but can optionally be taken into account. +// Strings are fully expanded using the standard Unicode canonical expansions before +// they are compared. Thai and Lao vowels are swapped with the following character +// if any. +// EUSER contains the 'basic collation method'. This method assigns the standard Unicode collation key values +// to the characters in the WGL4 repertoire, plus commonly used control characters and fixed-width spaces, plus +// the CJK ideograms (for which the keys can be generated algorithmically). Other characters are collated after +// all the characters for which keys are defined, and ordered by their Unicode values. +// Locales can supply any number of other collation methods. They will usually supply a 'tailoring' of the standard +// method. This is done by using the standard table as the main key table (signalled by placing NULL in +// TCollationMethod::iMainTable) and specifying an override table (TCollationMethod::iOverrideTable). +// Locale-specific collation data resides in ELOCL. +// +// WARNING: This file contains some APIs which are internal and are subject +// to change without notice. Such APIs should therefore not be used +// outside the Kernel and Hardware Services package. +// + +#ifndef __COLLATE_H__ +#define __COLLATE_H__ + +#ifdef __KERNEL_MODE__ +#include +#else +#include +#endif + +//This material is used in the Unicode build only. +#ifdef _UNICODE + +/** +Collation key table structure. +@publishedPartner +@released +*/ +struct TCollationKeyTable + { +public: + /** + Masks for the various parts of the elements of the iKey array. + */ + enum + { + ELevel0Mask = 0xFFFF0000, // primary key - basic character identity + ELevel1Mask = 0x0000FF00, // secondary key - accents and diacritics + ELevel2Mask = 0x000000FC, // tertiary key - case, etc. + EIgnoreFlag = 0x2, // if set, this key is normally ignored + EStopFlag = 0x1 // if set, this key is the last in a sequence representing a Unicode value or values + }; + + /** + An array containing all of the keys and strings of keys concatenated + together. Each key has EStopFlag set only if it is the last key in its + string. Eack key contains the keys for levels 0, 1 and 2, and a flag + EIgnoreFlag if the key is usually ignored (for punctuation & spaces + etc.). + */ + const TUint32* iKey; + /** + An array of indices into the iKey array. Each element has its high 16 + bits indicating a Unicode value and its low 16 bits indicating an index + into the iKey array at which its key starts. For surrogate pairs, high + surrogate code is in index[i]:16-31, and low surrogate code is in + index[i+1]:16-31. These two elements are combined to represent a surrogate + pair. The elements are sorted by Unicode value. + */ + const TUint32* iIndex; + /** + The size of the iIndex array. + */ + TInt iIndices; + /** + Concatenated Unicode strings. Each is a strings that is to be converted + to keys differently from how it would be if each letter were converted + independently. An example is "ch" in Spanish, which sorts as though it + were a single letter. Each Unicode string is preceeded by a 16-bit value + indicating the string's length (in 16-bit). The end of the string is not + delimited. A surrogate pair is represented by two ajacent 16-bit values. + */ + const TUint16* iString; + /** + An array of elements mapping elements of iString to elements of iIndex. + Each element has its high 16 bits indicating the index of the start of + an element of iString, and its low 16 bits indicating the corresponding + element in iIndex. This array is sorted on the string index. + */ + const TUint32* iStringIndex; + /** + The size of the iStringIndex array. + */ + TInt iStringIndices; + }; + +/** +Defines a collation method. + +Collation means sorting pieces of text. It needs to take into account characters, +accents and case; spaces and punctuation are usually ignored. It differs from +ordinary methods of sorting in that it is locale-dependent - different +languages use different ordering methods. Additionally, multiple collation +methods may exist within the same locale. + +A collation method provides the collation keys and other data needed to customise +collation; the Mem and TDesC16 collation functions (e.g. Mem::CompareC()) +perform the collation. Note that these functions use the standard collation +method for the current locale - you only need to specify an object of class +TCollationMethod to customise this collation scheme. Collation methods can +be retrieved using member functions of the Mem class. Each one has a unique +identifier. + +A collation method specifies a main table of collation keys, and optionally +an overriding table that contains keys for which the values in the main table +are overridden. A collation key table (TCollationKeyTable) is the set of collation +keys: primary (basic character identity), secondary (accents and diacritics) +and tertiary (case). The quaternary key is the Unicode character values themselves. + +The simplest way to customise a collation method is to create a local copy +of the standard collation method and change it. For example, you could use +the standard method, but not ignore punctuation and spaces: + +@code +TCollationMethod m = *Mem::CollationMethodByIndex(0); // get the standard method +m.iFlags |= TCollationMethod::EIgnoreNone; // dont ignore punctuation and spaces +@endcode + +@publishedPartner +@released +*/ +struct TCollationMethod + { + public: + /** + The UID of this collation method. + */ + TUint iId; + + /** + The main collation key table; if NULL, use the standard table. + */ + const TCollationKeyTable* iMainTable; + + /** + If non-NULL, tailoring for collation keys. + */ + const TCollationKeyTable* iOverrideTable; + enum + { + /** + Don't ignore any keys (punctuation, etc. is normally ignored). + */ + EIgnoreNone = 1, + + /** + Reverse the normal order for characters differing only in case + */ + ESwapCase = 2, + + /** + Compare secondary keys which represent accents in reverse + order (from right to left); this is needed for French when comparing + words that differ only in accents. + */ + EAccentsBackwards = 4, + + /** + Reverse the normal order for characters differing only in whether they + are katakana or hiragana. + */ + ESwapKana = 8, + + /** + Fold all characters to lower case before extracting keys; needed for + comparison of filenames, for which case is ignored but other + tertiary (level-2) distinctions are not. + */ + EFoldCase = 16, + + /** Flag to indicate a collation method for matching purpose + This flag is only needed if we wish to specify a particular collation method + to be used for matching purpose. + */ + EMatchingTable = 32, + + /** Ignore the check for adjacent combining characters. A combining + character effectively changes the character it combines with to something + else and so a match doesn't occur. Setting this flag will allow character + matching regardless of any combining characters. + */ + EIgnoreCombining = 64 + }; + + /** + Flags. + + @see TCollationMethod::EIgnoreNone + @see TCollationMethod::ESwapCase + @see TCollationMethod::EAccentsBackwards + @see TCollationMethod::ESwapKana + @see TCollationMethod::EFoldCase + */ + TUint iFlags; + }; + +/** +A collation data set provides any collation methods needed by a locale. +@publishedPartner +@released +*/ +struct TCollationDataSet + { + public: + const TCollationMethod* iMethod; + TInt iMethods; + }; + +// Collation method IDs + +/** +A collation data set provides any collation methods needed by a locale. +@internalTechnology +@released +*/ +const TUint KUidBasicCollationMethod = 0x10004F4E; + +/** +A collation data set provides any collation methods needed by a locale. +@internalTechnology +@released +*/ +const TUint KUidStandardUnicodeCollationMethod = 0x10004E96; + +#ifndef __KERNEL_MODE__ + +//Forward declarations +class TUTF32Iterator; +struct LCharSet; + +/** +Provides low-level collation functions. +@internalComponent +@released +*/ +class TCollate + { +public: + /** + Construct a TCollate object based on the collation method specified + within aCharSet, if any. If there is none, or aCharSet is null, the + standard collation method will be used. aMask and aFlags provide a + method for overriding the flags in the collation method: Each flag set + to 1 in aMask is a flag that will be overridden and set to the + corresponding flag value in aFlags. Ownership of aCharSet is not passed. + */ + TCollate(const LCharSet* aCharSet,TUint aMask = 0,TUint aFlags = 0xFFFFFFFF); + /** + Construct a TCollate object based on an already constructed + TCollationMethod specified in aMethod. Ownership is not passed. + */ + TCollate(const TCollationMethod& aMethod); + + enum TComparisonResult + { + ELeftComparesLessAndIsNotPrefix = -2, + ELeftIsPrefixOfRight = -1, + EStringsIdentical = 0, + ERightIsPrefixOfLeft = 1, + ERightComparesLessAndIsNotPrefix = 2 + }; + + /** + Compare the string beginning at aString1 of length aLength1 against the + string beginning at aString2 of length aLength2. + aMaxLevel determines the tightness of the collation. At level 0, only + character identities are distinguished. At level 1 accents are + distinguished as well. At level 2 case is distinguishes as well. At + level 3 all valid different Unicode characters are considered different. + */ + TComparisonResult Compare(const TUint16* aString1,TInt aLength1, + const TUint16* aString2,TInt aLength2, + TInt aMaxLevel = 3) const; + /** + Find the string beginning at aString2 of length aLength2 in the string + beginning at aString1 of length aLength1. aMaxLevel determines + the tightness of the collation, see Compare for details. + */ + TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2, + TInt aMaxLevel,TUint aString2WildChar = 0) const; + + TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2, + TInt &aLengthFound,TInt aMaxLevel,TUint aString2WildChar = 0) const; + + /** + Test if the string beginning at aSearchTerm of length aSearchTermLength + matches the string beginning at aCandidate of length aCandidateLength. + aMaxLevel determines the tightness of the collation, see + Compare for details. The search term may have wild card characters as + specified by aWildChar (for matching a single grapheme- i.e. character + and any characters that combine with it, such as accents) and + aWildSequenceChar (for matching any sequence of whole graphemes). The + return value is KErrNotFound iff the search term does not match the + candidate string exactly. To find a match within the candidate string, + the search term must begin and end with a wild sequence character. If + the search term does match the candidate string, 0 will be returned, + unless the first character of the search term is a wild sequence + character in which case the value returned will be the index into + aCandidate at which the first non-wild sequence character matched. + aWildSequenceChar must be a valid (non-surrogate) Unicode character + below FFFE. + */ + TInt Match(const TUint16 *aCandidate, TInt aCandidateLength, + const TUint16 *aSearchTerm,TInt aSearchTermLength, + TInt aMaxLevel, TUint aWildChar = '?', TUint aWildSequenceChar = '*', TUint aEscapeChar = 0) const; + +private: + /** + Compare values output from the iterators. After the comparison, if + ERightIsPrefixOfLeft or EStringsIdentical is returned, then aLeft and + aRight will be pointing at the next key (at MaxLevel) after the match. + If right is shown to be a prefix of left, this means that it has been + checked at all requested levels. If it is reported that the right is a + prefix of the left, then this will mean also that there are no unmatched + combining characters on the left. + */ + TComparisonResult CompareKeySequences(TUTF32Iterator& aLeft, TUTF32Iterator& aRight, + TInt aMaxLevel, TInt aRightStringWildChar, TInt aEscapeChar) const; + /** + Finds search term inside candidate string. Returns KErrNotFound if there + is no match, returns the offset into the candidate string at which the + search term was found (note that this is the offset from the start of + the iteration, not from where the iteration was when the function was + called). If a string was found, the search term iterator is left + pointing at the end of the search term, and the candidate iterator is + left pointing just after the matched keys. aMatchPos returns where in + the candidate string the match was found. + */ + TInt FindKeySequence(TUTF32Iterator& aCandidate, TUTF32Iterator& aSearchTerm, + TInt aMaxLevel, TInt aWildChar, TInt aEscapeChar, TInt& aLengthFound) const; + +private: + TCollationMethod iMethod; + }; + +#endif // __KERNEL_MODE__ + +#endif // _UNICODE + +#endif // __COLLATE_H__