fontservices/textshaperplugin/IcuSource/common/unicode/normlzr.h
changeset 0 1fb32624e06b
equal deleted inserted replaced
-1:000000000000 0:1fb32624e06b
       
     1 /*
       
     2  ********************************************************************
       
     3  * COPYRIGHT:
       
     4  * Copyright (c) 1996-2005, International Business Machines Corporation and
       
     5  * others. All Rights Reserved.
       
     6  ********************************************************************
       
     7  */
       
     8 
       
     9 #ifndef NORMLZR_H
       
    10 #define NORMLZR_H
       
    11 
       
    12 #include "unicode/utypes.h"
       
    13 
       
    14 /**
       
    15  * \file 
       
    16  * \brief C++ API: Unicode Normalization
       
    17  */
       
    18  
       
    19 #if !UCONFIG_NO_NORMALIZATION
       
    20 
       
    21 #include "unicode/uobject.h"
       
    22 #include "unicode/unistr.h"
       
    23 #include "unicode/chariter.h"
       
    24 #include "unicode/unorm.h"
       
    25 
       
    26 
       
    27 struct UCharIterator;
       
    28 typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
       
    29 
       
    30 U_NAMESPACE_BEGIN
       
    31 /**
       
    32  *
       
    33  * The Normalizer class consists of two parts:
       
    34  * - static functions that normalize strings or test if strings are normalized
       
    35  * - a Normalizer object is an iterator that takes any kind of text and
       
    36  *   provides iteration over its normalized form
       
    37  *
       
    38  * The Normalizer class is not suitable for subclassing.
       
    39  *
       
    40  * The static functions are basically wrappers around the C implementation,
       
    41  * using UnicodeString instead of UChar*.
       
    42  * For basic information about normalization forms and details about the C API
       
    43  * please see the documentation in unorm.h.
       
    44  *
       
    45  * The iterator API with the Normalizer constructors and the non-static functions
       
    46  * uses a CharacterIterator as input. It is possible to pass a string which
       
    47  * is then internally wrapped in a CharacterIterator.
       
    48  * The input text is not normalized all at once, but incrementally where needed
       
    49  * (providing efficient random access).
       
    50  * This allows passing in a large text while spending only a small amount of time
       
    51  * normalizing a small part of that text.
       
    52  * However, if the entire text is normalized, then the iterator will be
       
    53  * slower than normalizing the entire text at once and iterating over the result.
       
    54  * A possible use of the Normalizer iterator is also to report an index into the
       
    55  * original text that is close to where the normalized characters come from.
       
    56  *
       
    57  * <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0.
       
    58  * The earlier implementation reported the getIndex() inconsistently,
       
    59  * and previous() could not be used after setIndex(), next(), first(), and current().
       
    60  *
       
    61  * Normalizer allows normalization to start from anywhere in the input text by
       
    62  * calling setIndexOnly(), first(), or last().
       
    63  * Without calling any of these, the iterator will start at the beginning of the text.
       
    64  *
       
    65  * At any time, next() returns the next normalized code point (UChar32),
       
    66  * with post-increment semantics (like CharacterIterator::next32PostInc()).
       
    67  * previous() returns the previous normalized code point (UChar32),
       
    68  * with pre-decrement semantics (like CharacterIterator::previous32()).
       
    69  *
       
    70  * current() returns the current code point
       
    71  * (respectively the one at the newly set index) without moving
       
    72  * the getIndex(). Note that if the text at the current position
       
    73  * needs to be normalized, then these functions will do that.
       
    74  * (This is why current() is not const.)
       
    75  * It is more efficient to call setIndexOnly() instead, which does not
       
    76  * normalize.
       
    77  *
       
    78  * getIndex() always refers to the position in the input text where the normalized
       
    79  * code points are returned from. It does not always change with each returned
       
    80  * code point.
       
    81  * The code point that is returned from any of the functions
       
    82  * corresponds to text at or after getIndex(), according to the
       
    83  * function's iteration semantics (post-increment or pre-decrement).
       
    84  *
       
    85  * next() returns a code point from at or after the getIndex()
       
    86  * from before the next() call. After the next() call, the getIndex()
       
    87  * might have moved to where the next code point will be returned from
       
    88  * (from a next() or current() call).
       
    89  * This is semantically equivalent to array access with array[index++]
       
    90  * (post-increment semantics).
       
    91  *
       
    92  * previous() returns a code point from at or after the getIndex()
       
    93  * from after the previous() call.
       
    94  * This is semantically equivalent to array access with array[--index]
       
    95  * (pre-decrement semantics).
       
    96  *
       
    97  * Internally, the Normalizer iterator normalizes a small piece of text
       
    98  * starting at the getIndex() and ending at a following "safe" index.
       
    99  * The normalized result is stored in an internal string buffer, and
       
   100  * the code points are iterated from there.
       
   101  * With multiple iteration calls, this is repeated until the next piece
       
   102  * of text needs to be normalized, and the getIndex() needs to be moved.
       
   103  *
       
   104  * The following "safe" index, the internal buffer, and the secondary
       
   105  * iteration index into that buffer are not exposed on the API.
       
   106  * This also means that it is currently not practical to return to
       
   107  * a particular, arbitrary position in the text because one would need to
       
   108  * know, and be able to set, in addition to the getIndex(), at least also the
       
   109  * current index into the internal buffer.
       
   110  * It is currently only possible to observe when getIndex() changes
       
   111  * (with careful consideration of the iteration semantics),
       
   112  * at which time the internal index will be 0.
       
   113  * For example, if getIndex() is different after next() than before it,
       
   114  * then the internal index is 0 and one can return to this getIndex()
       
   115  * later with setIndexOnly().
       
   116  *
       
   117  * @author Laura Werner, Mark Davis, Markus Scherer
       
   118  * @stable ICU 2.0
       
   119  */
       
   120 class U_COMMON_API Normalizer : public UObject {
       
   121 public:
       
   122   /**
       
   123    * If DONE is returned from an iteration function that returns a code point,
       
   124    * then there are no more normalization results available.
       
   125    * @stable ICU 2.0
       
   126    */
       
   127   enum {
       
   128       DONE=0xffff
       
   129   };
       
   130 
       
   131   // Constructors
       
   132 
       
   133   /**
       
   134    * Creates a new <code>Normalizer</code> object for iterating over the
       
   135    * normalized form of a given string.
       
   136    * <p>
       
   137    * @param str   The string to be normalized.  The normalization
       
   138    *              will start at the beginning of the string.
       
   139    *
       
   140    * @param mode  The normalization mode.
       
   141    * @stable ICU 2.0
       
   142    */
       
   143   Normalizer(const UnicodeString& str, UNormalizationMode mode);
       
   144 
       
   145   /**
       
   146    * Creates a new <code>Normalizer</code> object for iterating over the
       
   147    * normalized form of a given string.
       
   148    * <p>
       
   149    * @param str   The string to be normalized.  The normalization
       
   150    *              will start at the beginning of the string.
       
   151    *
       
   152    * @param length Length of the string, or -1 if NUL-terminated.
       
   153    * @param mode  The normalization mode.
       
   154    * @stable ICU 2.0
       
   155    */
       
   156   Normalizer(const UChar* str, int32_t length, UNormalizationMode mode);
       
   157 
       
   158   /**
       
   159    * Creates a new <code>Normalizer</code> object for iterating over the
       
   160    * normalized form of the given text.
       
   161    * <p>
       
   162    * @param iter  The input text to be normalized.  The normalization
       
   163    *              will start at the beginning of the string.
       
   164    *
       
   165    * @param mode  The normalization mode.
       
   166    * @stable ICU 2.0
       
   167    */
       
   168   Normalizer(const CharacterIterator& iter, UNormalizationMode mode);
       
   169 
       
   170   /**
       
   171    * Copy constructor.
       
   172    * @param copy The object to be copied.
       
   173    * @stable ICU 2.0
       
   174    */
       
   175   Normalizer(const Normalizer& copy);
       
   176 
       
   177   /**
       
   178    * Destructor
       
   179    * @stable ICU 2.0
       
   180    */
       
   181   virtual ~Normalizer();
       
   182 
       
   183 
       
   184   //-------------------------------------------------------------------------
       
   185   // Static utility methods
       
   186   //-------------------------------------------------------------------------
       
   187 
       
   188   /**
       
   189    * Normalizes a <code>UnicodeString</code> according to the specified normalization mode.
       
   190    * This is a wrapper for unorm_normalize(), using UnicodeString's.
       
   191    *
       
   192    * The <code>options</code> parameter specifies which optional
       
   193    * <code>Normalizer</code> features are to be enabled for this operation.
       
   194    *
       
   195    * @param source    the input string to be normalized.
       
   196    * @param mode      the normalization mode
       
   197    * @param options   the optional features to be enabled (0 for no options)
       
   198    * @param result    The normalized string (on output).
       
   199    * @param status    The error code.
       
   200    * @stable ICU 2.0
       
   201    */
       
   202   static void U_EXPORT2 normalize(const UnicodeString& source,
       
   203                         UNormalizationMode mode, int32_t options,
       
   204                         UnicodeString& result,
       
   205                         UErrorCode &status);
       
   206 
       
   207   /**
       
   208    * Compose a <code>UnicodeString</code>.
       
   209    * This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC.
       
   210    * This is a wrapper for unorm_normalize(), using UnicodeString's.
       
   211    *
       
   212    * The <code>options</code> parameter specifies which optional
       
   213    * <code>Normalizer</code> features are to be enabled for this operation.
       
   214    *
       
   215    * @param source    the string to be composed.
       
   216    * @param compat    Perform compatibility decomposition before composition.
       
   217    *                  If this argument is <code>FALSE</code>, only canonical
       
   218    *                  decomposition will be performed.
       
   219    * @param options   the optional features to be enabled (0 for no options)
       
   220    * @param result    The composed string (on output).
       
   221    * @param status    The error code.
       
   222    * @stable ICU 2.0
       
   223    */
       
   224   static void U_EXPORT2 compose(const UnicodeString& source,
       
   225                       UBool compat, int32_t options,
       
   226                       UnicodeString& result,
       
   227                       UErrorCode &status);
       
   228 
       
   229   /**
       
   230    * Static method to decompose a <code>UnicodeString</code>.
       
   231    * This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD.
       
   232    * This is a wrapper for unorm_normalize(), using UnicodeString's.
       
   233    *
       
   234    * The <code>options</code> parameter specifies which optional
       
   235    * <code>Normalizer</code> features are to be enabled for this operation.
       
   236    *
       
   237    * @param source    the string to be decomposed.
       
   238    * @param compat    Perform compatibility decomposition.
       
   239    *                  If this argument is <code>FALSE</code>, only canonical
       
   240    *                  decomposition will be performed.
       
   241    * @param options   the optional features to be enabled (0 for no options)
       
   242    * @param result    The decomposed string (on output).
       
   243    * @param status    The error code.
       
   244    * @stable ICU 2.0
       
   245    */
       
   246   static void U_EXPORT2 decompose(const UnicodeString& source,
       
   247                         UBool compat, int32_t options,
       
   248                         UnicodeString& result,
       
   249                         UErrorCode &status);
       
   250 
       
   251   /**
       
   252    * Performing quick check on a string, to quickly determine if the string is
       
   253    * in a particular normalization format.
       
   254    * This is a wrapper for unorm_quickCheck(), using a UnicodeString.
       
   255    *
       
   256    * Three types of result can be returned UNORM_YES, UNORM_NO or
       
   257    * UNORM_MAYBE. Result UNORM_YES indicates that the argument
       
   258    * string is in the desired normalized format, UNORM_NO determines that
       
   259    * argument string is not in the desired normalized format. A
       
   260    * UNORM_MAYBE result indicates that a more thorough check is required,
       
   261    * the user may have to put the string in its normalized form and compare the
       
   262    * results.
       
   263    * @param source       string for determining if it is in a normalized format
       
   264    * @param mode         normalization format
       
   265    * @param status A reference to a UErrorCode to receive any errors
       
   266    * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
       
   267    *
       
   268    * @see isNormalized
       
   269    * @stable ICU 2.0
       
   270    */
       
   271   static inline UNormalizationCheckResult
       
   272   quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);
       
   273 
       
   274   /**
       
   275    * Performing quick check on a string; same as the other version of quickCheck
       
   276    * but takes an extra options parameter like most normalization functions.
       
   277    *
       
   278    * @param source       string for determining if it is in a normalized format
       
   279    * @param mode         normalization format
       
   280    * @param options      the optional features to be enabled (0 for no options)
       
   281    * @param status A reference to a UErrorCode to receive any errors
       
   282    * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
       
   283    *
       
   284    * @see isNormalized
       
   285    * @stable ICU 2.6
       
   286    */
       
   287   static inline UNormalizationCheckResult
       
   288   quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status);
       
   289 
       
   290   /**
       
   291    * Test if a string is in a given normalization form.
       
   292    * This is semantically equivalent to source.equals(normalize(source, mode)) .
       
   293    *
       
   294    * Unlike unorm_quickCheck(), this function returns a definitive result,
       
   295    * never a "maybe".
       
   296    * For NFD, NFKD, and FCD, both functions work exactly the same.
       
   297    * For NFC and NFKC where quickCheck may return "maybe", this function will
       
   298    * perform further tests to arrive at a TRUE/FALSE result.
       
   299    *
       
   300    * @param src        String that is to be tested if it is in a normalization format.
       
   301    * @param mode       Which normalization form to test for.
       
   302    * @param errorCode  ICU error code in/out parameter.
       
   303    *                   Must fulfill U_SUCCESS before the function call.
       
   304    * @return Boolean value indicating whether the source string is in the
       
   305    *         "mode" normalization form.
       
   306    *
       
   307    * @see quickCheck
       
   308    * @stable ICU 2.2
       
   309    */
       
   310   static inline UBool
       
   311   isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode);
       
   312 
       
   313   /**
       
   314    * Test if a string is in a given normalization form; same as the other version of isNormalized
       
   315    * but takes an extra options parameter like most normalization functions.
       
   316    *
       
   317    * @param src        String that is to be tested if it is in a normalization format.
       
   318    * @param mode       Which normalization form to test for.
       
   319    * @param options      the optional features to be enabled (0 for no options)
       
   320    * @param errorCode  ICU error code in/out parameter.
       
   321    *                   Must fulfill U_SUCCESS before the function call.
       
   322    * @return Boolean value indicating whether the source string is in the
       
   323    *         "mode" normalization form.
       
   324    *
       
   325    * @see quickCheck
       
   326    * @stable ICU 2.6
       
   327    */
       
   328   static inline UBool
       
   329   isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode);
       
   330 
       
   331   /**
       
   332    * Concatenate normalized strings, making sure that the result is normalized as well.
       
   333    *
       
   334    * If both the left and the right strings are in
       
   335    * the normalization form according to "mode/options",
       
   336    * then the result will be
       
   337    *
       
   338    * \code
       
   339    *     dest=normalize(left+right, mode, options)
       
   340    * \endcode
       
   341    *
       
   342    * For details see unorm_concatenate in unorm.h.
       
   343    *
       
   344    * @param left Left source string.
       
   345    * @param right Right source string.
       
   346    * @param result The output string.
       
   347    * @param mode The normalization mode.
       
   348    * @param options A bit set of normalization options.
       
   349    * @param errorCode ICU error code in/out parameter.
       
   350    *                   Must fulfill U_SUCCESS before the function call.
       
   351    * @return result
       
   352    *
       
   353    * @see unorm_concatenate
       
   354    * @see normalize
       
   355    * @see unorm_next
       
   356    * @see unorm_previous
       
   357    *
       
   358    * @stable ICU 2.1
       
   359    */
       
   360   static UnicodeString &
       
   361   U_EXPORT2 concatenate(UnicodeString &left, UnicodeString &right,
       
   362               UnicodeString &result,
       
   363               UNormalizationMode mode, int32_t options,
       
   364               UErrorCode &errorCode);
       
   365 
       
   366   /**
       
   367    * Compare two strings for canonical equivalence.
       
   368    * Further options include case-insensitive comparison and
       
   369    * code point order (as opposed to code unit order).
       
   370    *
       
   371    * Canonical equivalence between two strings is defined as their normalized
       
   372    * forms (NFD or NFC) being identical.
       
   373    * This function compares strings incrementally instead of normalizing
       
   374    * (and optionally case-folding) both strings entirely,
       
   375    * improving performance significantly.
       
   376    *
       
   377    * Bulk normalization is only necessary if the strings do not fulfill the FCD
       
   378    * conditions. Only in this case, and only if the strings are relatively long,
       
   379    * is memory allocated temporarily.
       
   380    * For FCD strings and short non-FCD strings there is no memory allocation.
       
   381    *
       
   382    * Semantically, this is equivalent to
       
   383    *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
       
   384    * where code point order and foldCase are all optional.
       
   385    *
       
   386    * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match
       
   387    * the case folding must be performed first, then the normalization.
       
   388    *
       
   389    * @param s1 First source string.
       
   390    * @param s2 Second source string.
       
   391    *
       
   392    * @param options A bit set of options:
       
   393    *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:
       
   394    *     Case-sensitive comparison in code unit order, and the input strings
       
   395    *     are quick-checked for FCD.
       
   396    *
       
   397    *   - UNORM_INPUT_IS_FCD
       
   398    *     Set if the caller knows that both s1 and s2 fulfill the FCD conditions.
       
   399    *     If not set, the function will quickCheck for FCD
       
   400    *     and normalize if necessary.
       
   401    *
       
   402    *   - U_COMPARE_CODE_POINT_ORDER
       
   403    *     Set to choose code point order instead of code unit order
       
   404    *     (see u_strCompare for details).
       
   405    *
       
   406    *   - U_COMPARE_IGNORE_CASE
       
   407    *     Set to compare strings case-insensitively using case folding,
       
   408    *     instead of case-sensitively.
       
   409    *     If set, then the following case folding options are used.
       
   410    *
       
   411    *   - Options as used with case-insensitive comparisons, currently:
       
   412    *
       
   413    *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I
       
   414    *    (see u_strCaseCompare for details)
       
   415    *
       
   416    *   - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT
       
   417    *
       
   418    * @param errorCode ICU error code in/out parameter.
       
   419    *                  Must fulfill U_SUCCESS before the function call.
       
   420    * @return <0 or 0 or >0 as usual for string comparisons
       
   421    *
       
   422    * @see unorm_compare
       
   423    * @see normalize
       
   424    * @see UNORM_FCD
       
   425    * @see u_strCompare
       
   426    * @see u_strCaseCompare
       
   427    *
       
   428    * @stable ICU 2.2
       
   429    */
       
   430   static inline int32_t
       
   431   compare(const UnicodeString &s1, const UnicodeString &s2,
       
   432           uint32_t options,
       
   433           UErrorCode &errorCode);
       
   434 
       
   435   //-------------------------------------------------------------------------
       
   436   // Iteration API
       
   437   //-------------------------------------------------------------------------
       
   438 
       
   439   /**
       
   440    * Return the current character in the normalized text.
       
   441    * current() may need to normalize some text at getIndex().
       
   442    * The getIndex() is not changed.
       
   443    *
       
   444    * @return the current normalized code point
       
   445    * @stable ICU 2.0
       
   446    */
       
   447   UChar32              current(void);
       
   448 
       
   449   /**
       
   450    * Return the first character in the normalized text.
       
   451    * This is equivalent to setIndexOnly(startIndex()) followed by next().
       
   452    * (Post-increment semantics.)
       
   453    *
       
   454    * @return the first normalized code point
       
   455    * @stable ICU 2.0
       
   456    */
       
   457   UChar32              first(void);
       
   458 
       
   459   /**
       
   460    * Return the last character in the normalized text.
       
   461    * This is equivalent to setIndexOnly(endIndex()) followed by previous().
       
   462    * (Pre-decrement semantics.)
       
   463    *
       
   464    * @return the last normalized code point
       
   465    * @stable ICU 2.0
       
   466    */
       
   467   UChar32              last(void);
       
   468 
       
   469   /**
       
   470    * Return the next character in the normalized text.
       
   471    * (Post-increment semantics.)
       
   472    * If the end of the text has already been reached, DONE is returned.
       
   473    * The DONE value could be confused with a U+FFFF non-character code point
       
   474    * in the text. If this is possible, you can test getIndex()<endIndex()
       
   475    * before calling next(), or (getIndex()<endIndex() || last()!=DONE)
       
   476    * after calling next(). (Calling last() will change the iterator state!)
       
   477    *
       
   478    * The C API unorm_next() is more efficient and does not have this ambiguity.
       
   479    *
       
   480    * @return the next normalized code point
       
   481    * @stable ICU 2.0
       
   482    */
       
   483   UChar32              next(void);
       
   484 
       
   485   /**
       
   486    * Return the previous character in the normalized text and decrement.
       
   487    * (Pre-decrement semantics.)
       
   488    * If the beginning of the text has already been reached, DONE is returned.
       
   489    * The DONE value could be confused with a U+FFFF non-character code point
       
   490    * in the text. If this is possible, you can test
       
   491    * (getIndex()>startIndex() || first()!=DONE). (Calling first() will change
       
   492    * the iterator state!)
       
   493    *
       
   494    * The C API unorm_previous() is more efficient and does not have this ambiguity.
       
   495    *
       
   496    * @return the previous normalized code point
       
   497    * @stable ICU 2.0
       
   498    */
       
   499   UChar32              previous(void);
       
   500 
       
   501   /**
       
   502    * Set the iteration position in the input text that is being normalized,
       
   503    * without any immediate normalization.
       
   504    * After setIndexOnly(), getIndex() will return the same index that is
       
   505    * specified here.
       
   506    *
       
   507    * @param index the desired index in the input text.
       
   508    * @stable ICU 2.0
       
   509    */
       
   510   void                 setIndexOnly(int32_t index);
       
   511 
       
   512   /**
       
   513    * Reset the index to the beginning of the text.
       
   514    * This is equivalent to setIndexOnly(startIndex()).
       
   515    * @stable ICU 2.0
       
   516    */
       
   517   void                reset(void);
       
   518 
       
   519   /**
       
   520    * Retrieve the current iteration position in the input text that is
       
   521    * being normalized.
       
   522    *
       
   523    * A following call to next() will return a normalized code point from
       
   524    * the input text at or after this index.
       
   525    *
       
   526    * After a call to previous(), getIndex() will point at or before the
       
   527    * position in the input text where the normalized code point
       
   528    * was returned from with previous().
       
   529    *
       
   530    * @return the current index in the input text
       
   531    * @stable ICU 2.0
       
   532    */
       
   533   int32_t            getIndex(void) const;
       
   534 
       
   535   /**
       
   536    * Retrieve the index of the start of the input text. This is the begin index
       
   537    * of the <code>CharacterIterator</code> or the start (i.e. index 0) of the string
       
   538    * over which this <code>Normalizer</code> is iterating.
       
   539    *
       
   540    * @return the smallest index in the input text where the Normalizer operates
       
   541    * @stable ICU 2.0
       
   542    */
       
   543   int32_t            startIndex(void) const;
       
   544 
       
   545   /**
       
   546    * Retrieve the index of the end of the input text. This is the end index
       
   547    * of the <code>CharacterIterator</code> or the length of the string
       
   548    * over which this <code>Normalizer</code> is iterating.
       
   549    * This end index is exclusive, i.e., the Normalizer operates only on characters
       
   550    * before this index.
       
   551    *
       
   552    * @return the first index in the input text where the Normalizer does not operate
       
   553    * @stable ICU 2.0
       
   554    */
       
   555   int32_t            endIndex(void) const;
       
   556 
       
   557   /**
       
   558    * Returns TRUE when both iterators refer to the same character in the same
       
   559    * input text.
       
   560    *
       
   561    * @param that a Normalizer object to compare this one to
       
   562    * @return comparison result
       
   563    * @stable ICU 2.0
       
   564    */
       
   565   UBool        operator==(const Normalizer& that) const;
       
   566 
       
   567   /**
       
   568    * Returns FALSE when both iterators refer to the same character in the same
       
   569    * input text.
       
   570    *
       
   571    * @param that a Normalizer object to compare this one to
       
   572    * @return comparison result
       
   573    * @stable ICU 2.0
       
   574    */
       
   575   inline UBool        operator!=(const Normalizer& that) const;
       
   576 
       
   577   /**
       
   578    * Returns a pointer to a new Normalizer that is a clone of this one.
       
   579    * The caller is responsible for deleting the new clone.
       
   580    * @return a pointer to a new Normalizer
       
   581    * @stable ICU 2.0
       
   582    */
       
   583   Normalizer*        clone(void) const;
       
   584 
       
   585   /**
       
   586    * Generates a hash code for this iterator.
       
   587    *
       
   588    * @return the hash code
       
   589    * @stable ICU 2.0
       
   590    */
       
   591   int32_t                hashCode(void) const;
       
   592 
       
   593   //-------------------------------------------------------------------------
       
   594   // Property access methods
       
   595   //-------------------------------------------------------------------------
       
   596 
       
   597   /**
       
   598    * Set the normalization mode for this object.
       
   599    * <p>
       
   600    * <b>Note:</b> If the normalization mode is changed while iterating
       
   601    * over a string, calls to {@link #next() } and {@link #previous() } may
       
   602    * return previously buffered characters in the old normalization mode
       
   603    * until the iteration is able to re-sync at the next base character.
       
   604    * It is safest to call {@link #setIndexOnly }, {@link #reset() },
       
   605    * {@link #setText }, {@link #first() },
       
   606    * {@link #last() }, etc. after calling <code>setMode</code>.
       
   607    * <p>
       
   608    * @param newMode the new mode for this <code>Normalizer</code>.
       
   609    * @see #getUMode
       
   610    * @stable ICU 2.0
       
   611    */
       
   612   void setMode(UNormalizationMode newMode);
       
   613 
       
   614   /**
       
   615    * Return the normalization mode for this object.
       
   616    *
       
   617    * This is an unusual name because there used to be a getMode() that
       
   618    * returned a different type.
       
   619    *
       
   620    * @return the mode for this <code>Normalizer</code>
       
   621    * @see #setMode
       
   622    * @stable ICU 2.0
       
   623    */
       
   624   UNormalizationMode getUMode(void) const;
       
   625 
       
   626   /**
       
   627    * Set options that affect this <code>Normalizer</code>'s operation.
       
   628    * Options do not change the basic composition or decomposition operation
       
   629    * that is being performed, but they control whether
       
   630    * certain optional portions of the operation are done.
       
   631    * Currently the only available option is obsolete.
       
   632    *
       
   633    * It is possible to specify multiple options that are all turned on or off.
       
   634    *
       
   635    * @param   option  the option(s) whose value is/are to be set.
       
   636    * @param   value   the new setting for the option.  Use <code>TRUE</code> to
       
   637    *                  turn the option(s) on and <code>FALSE</code> to turn it/them off.
       
   638    *
       
   639    * @see #getOption
       
   640    * @stable ICU 2.0
       
   641    */
       
   642   void setOption(int32_t option,
       
   643          UBool value);
       
   644 
       
   645   /**
       
   646    * Determine whether an option is turned on or off.
       
   647    * If multiple options are specified, then the result is TRUE if any
       
   648    * of them are set.
       
   649    * <p>
       
   650    * @param option the option(s) that are to be checked
       
   651    * @return TRUE if any of the option(s) are set
       
   652    * @see #setOption
       
   653    * @stable ICU 2.0
       
   654    */
       
   655   UBool getOption(int32_t option) const;
       
   656 
       
   657   /**
       
   658    * Set the input text over which this <code>Normalizer</code> will iterate.
       
   659    * The iteration position is set to the beginning.
       
   660    *
       
   661    * @param newText a string that replaces the current input text
       
   662    * @param status a UErrorCode
       
   663    * @stable ICU 2.0
       
   664    */
       
   665   void setText(const UnicodeString& newText,
       
   666            UErrorCode &status);
       
   667 
       
   668   /**
       
   669    * Set the input text over which this <code>Normalizer</code> will iterate.
       
   670    * The iteration position is set to the beginning.
       
   671    *
       
   672    * @param newText a CharacterIterator object that replaces the current input text
       
   673    * @param status a UErrorCode
       
   674    * @stable ICU 2.0
       
   675    */
       
   676   void setText(const CharacterIterator& newText,
       
   677            UErrorCode &status);
       
   678 
       
   679   /**
       
   680    * Set the input text over which this <code>Normalizer</code> will iterate.
       
   681    * The iteration position is set to the beginning.
       
   682    *
       
   683    * @param newText a string that replaces the current input text
       
   684    * @param length the length of the string, or -1 if NUL-terminated
       
   685    * @param status a UErrorCode
       
   686    * @stable ICU 2.0
       
   687    */
       
   688   void setText(const UChar* newText,
       
   689                     int32_t length,
       
   690             UErrorCode &status);
       
   691   /**
       
   692    * Copies the input text into the UnicodeString argument.
       
   693    *
       
   694    * @param result Receives a copy of the text under iteration.
       
   695    * @stable ICU 2.0
       
   696    */
       
   697   void            getText(UnicodeString&  result);
       
   698 
       
   699   /**
       
   700    * ICU "poor man's RTTI", returns a UClassID for this class.
       
   701    * @return a UClassID for this class.
       
   702    * @stable ICU 2.2
       
   703    */
       
   704   static UClassID U_EXPORT2 getStaticClassID();
       
   705 
       
   706   /**
       
   707    * ICU "poor man's RTTI", returns a UClassID for the actual class.
       
   708    * @return a UClassID for the actual class.
       
   709    * @stable ICU 2.2
       
   710    */
       
   711   virtual UClassID getDynamicClassID() const;
       
   712 
       
   713 private:
       
   714   //-------------------------------------------------------------------------
       
   715   // Private functions
       
   716   //-------------------------------------------------------------------------
       
   717 
       
   718   Normalizer(); // default constructor not implemented
       
   719   Normalizer &operator=(const Normalizer &that); // assignment operator not implemented
       
   720 
       
   721   // Private utility methods for iteration
       
   722   // For documentation, see the source code
       
   723   UBool nextNormalize();
       
   724   UBool previousNormalize();
       
   725 
       
   726   void    init(CharacterIterator *iter);
       
   727   void    clearBuffer(void);
       
   728 
       
   729   //-------------------------------------------------------------------------
       
   730   // Private data
       
   731   //-------------------------------------------------------------------------
       
   732 
       
   733   UNormalizationMode  fUMode;   // NOTE(review): presumably the mode set by setMode() and reported by getUMode() -- confirm in the implementation
       
   734   int32_t             fOptions; // NOTE(review): presumably the option bits managed by setOption()/getOption() -- confirm in the implementation
       
   735 
       
   736   // The input text and our position in it
       
   737   UCharIterator       *text;
       
   738 
       
   739   // The normalization buffer is the result of normalization
       
   740   // of the source in [currentIndex..nextIndex[ .
       
   741   int32_t         currentIndex, nextIndex;
       
   742 
       
   743   // A buffer for holding intermediate results
       
   744   UnicodeString       buffer;
       
   745   int32_t         bufferPos;    // NOTE(review): presumably the current read position within buffer -- confirm in the implementation
       
   746 
       
   747 };
       
   748 
       
   749 //-------------------------------------------------------------------------
       
   750 // Inline implementations
       
   751 //-------------------------------------------------------------------------
       
   752 
       
   753 inline UBool  // inequality is simply the negation of operator==
       
   754 Normalizer::operator!=(const Normalizer& rhs) const
       
   755 { return !(this->operator==(rhs)); }
       
   756 
       
   757 inline UNormalizationCheckResult
       
   758 Normalizer::quickCheck(const UnicodeString& source,
       
   759                        UNormalizationMode mode,
       
   760                        UErrorCode &status) {
       
   761     // Forward to the C API only when status holds no earlier failure.
       
   762     if(!U_FAILURE(status)) {
       
   763         return unorm_quickCheck(source.getBuffer(), source.length(),
       
   764                                 mode, &status);
       
   765     }
       
   766     return UNORM_MAYBE;
       
   767 }
       
   768 
       
   769 inline UNormalizationCheckResult
       
   770 Normalizer::quickCheck(const UnicodeString& source,
       
   771                        UNormalizationMode mode, int32_t options,
       
   772                        UErrorCode &status) {
       
   773     // Forward to the C API only when status holds no earlier failure.
       
   774     if(!U_FAILURE(status)) {
       
   775         return unorm_quickCheckWithOptions(source.getBuffer(), source.length(),
       
   776                                            mode, options, &status);
       
   777     }
       
   778     return UNORM_MAYBE;
       
   779 }
       
   780 
       
   781 inline UBool
       
   782 Normalizer::isNormalized(const UnicodeString& source,
       
   783                          UNormalizationMode mode,
       
   784                          UErrorCode &status) {
       
   785     // Forward to the C API only when status holds no earlier failure.
       
   786     if(!U_FAILURE(status)) {
       
   787         return unorm_isNormalized(source.getBuffer(), source.length(),
       
   788                                   mode, &status);
       
   789     }
       
   790     return FALSE;
       
   791 }
       
   792 
       
   793 inline UBool
       
   794 Normalizer::isNormalized(const UnicodeString& source,
       
   795                          UNormalizationMode mode, int32_t options,
       
   796                          UErrorCode &status) {
       
   797     // Forward to the C API only when status holds no earlier failure.
       
   798     if(!U_FAILURE(status)) {
       
   799         return unorm_isNormalizedWithOptions(source.getBuffer(), source.length(),
       
   800                                              mode, options, &status);
       
   801     }
       
   802     return FALSE;
       
   803 }
       
   804 
       
   805 inline int32_t
       
   806 Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2,
       
   807                     uint32_t options,
       
   808                     UErrorCode &errorCode) {
       
   809   // Thin wrapper: unorm_compare performs all argument checking itself.
       
   810   const UChar *left = s1.getBuffer();
       
   811   const UChar *right = s2.getBuffer();
       
   812   return unorm_compare(left, s1.length(), right, s2.length(),
       
   813                        options, &errorCode);
       
   814 }
       
   815 
       
   816 U_NAMESPACE_END
       
   817 
       
   818 #endif /* #if !UCONFIG_NO_NORMALIZATION */
       
   819 
       
   820 #endif // NORMLZR_H