WebKit/mac/icu/unicode/uidna.h
changeset 0 4f2f89ce4247
equal deleted inserted replaced
-1:000000000000 0:4f2f89ce4247
       
     1 /*
       
     2  *******************************************************************************
       
     3  *
       
     4  *   Copyright (C) 2003-2004, International Business Machines
       
     5  *   Corporation and others.  All Rights Reserved.
       
     6  *
       
     7  *******************************************************************************
       
     8  *   file name:  uidna.h
       
     9  *   encoding:   US-ASCII
       
    10  *   tab size:   8 (not used)
       
    11  *   indentation:4
       
    12  *
       
    13  *   created on: 2003feb1
       
    14  *   created by: Ram Viswanadha
       
    15  */
       
    16 
       
    17 #ifndef __UIDNA_H__
       
    18 #define __UIDNA_H__
       
    19 
       
    20 #include "unicode/utypes.h"
       
    21 
       
    22 #if !UCONFIG_NO_IDNA
       
    23 
       
    24 #include "unicode/parseerr.h"
       
    25   
       
    26 /**
       
    27  *\file
       
    28  * UIDNA API implements the IDNA protocol as defined in the IDNA RFC 
       
    29  * (http://www.ietf.org/rfc/rfc3490.txt).
       
    30  * The RFC defines 2 operations: ToASCII and ToUnicode. Domain labels 
       
    31  * containing non-ASCII code points are required to be processed by
       
    32  * ToASCII operation before passing it to resolver libraries. Domain names
       
    33  * that are obtained from resolver libraries are required to be processed by
       
    34  * ToUnicode operation before displaying the domain name to the user.
       
    35  * IDNA requires that implementations process input strings with Nameprep
       
    36  * (http://www.ietf.org/rfc/rfc3491.txt), 
       
    37  * which is a profile of Stringprep (http://www.ietf.org/rfc/rfc3454.txt), 
       
    38  * and then with Punycode (http://www.ietf.org/rfc/rfc3492.txt). 
       
    39  * Implementations of IDNA MUST fully implement Nameprep and Punycode; 
       
    40  * neither Nameprep nor Punycode are optional.
       
    41  * The input and output of ToASCII and ToUnicode operations are Unicode 
       
    42  * and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
       
    43  * multiple times to an input string will yield the same result as applying the operation
       
    44  * once.
       
    45  * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string) 
       
    46  * ToASCII(ToASCII(ToASCII...(ToASCII(string)))) == ToASCII(string).
       
    47  *
       
    48  */
       
    49 
       
    50 #ifndef U_HIDE_DRAFT_API
       
    51 
       
    52 /** 
       
    53  * Option to prohibit processing of unassigned codepoints in the input and
       
    54  * to skip checking whether the input conforms to STD-3 ASCII rules.
       
    55  * 
       
    56  * @see  uidna_toASCII uidna_toUnicode
       
    57  * @stable ICU 2.6
       
    58  */
       
    59 #define UIDNA_DEFAULT          0x0000
       
    60 /** 
       
    61  * Option to allow processing of unassigned codepoints in the input
       
    62  * 
       
    63  * @see  uidna_toASCII uidna_toUnicode
       
    64  * @stable ICU 2.6
       
    65  */
       
    66 #define UIDNA_ALLOW_UNASSIGNED 0x0001
       
    67 /** 
       
    68  * Option to check if input conforms to STD-3 ASCII rules
       
    69  * 
       
    70  * @see  uidna_toASCII uidna_toUnicode
       
    71  * @stable ICU 2.6
       
    72  */
       
    73 #define UIDNA_USE_STD3_RULES   0x0002
       
    74 
       
    75 #endif /*U_HIDE_DRAFT_API*/
       
    76     
       
    77 /**
       
    78  * This function implements the ToASCII operation as defined in the IDNA RFC.
       
    79  * This operation is done on <b>single labels</b> before sending it to something that expects
       
    80  * ASCII names. A label is an individual part of a domain name. Labels are usually
       
    81  * separated by dots; e.g., "www.example.com" is composed of 3 labels
       
    82  * "www","example", and "com".
       
    83  *
       
    84  *
       
    85  * @param src               Input UChar array containing label in Unicode.
       
    86  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
       
    87  * @param dest              Output UChar array with ASCII (ACE encoded) label.
       
    88  * @param destCapacity      Size of dest.
       
    89  * @param options           A bit set of options:
       
    90  *
       
    91  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
       
    92  *                              and do not use STD3 ASCII rules
       
    93  *                              If unassigned code points are found the operation fails with 
       
    94  *                              U_UNASSIGNED_ERROR error code.
       
    95  *
       
    96  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations
       
    97  *                              If this option is set, the unassigned code points in the input
       
    98  *                              are treated as normal Unicode code points.
       
    99  *                          
       
   100  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions
       
   101  *                              If this option is set and the input does not satisfy STD3 rules,  
       
   102  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
       
   103  *
       
   104  * @param parseError        Pointer to UParseError struct to receive information on position 
       
   105  *                          of error if an error is encountered. Can be NULL.
       
   106  * @param status            ICU in/out error code parameter.
       
   107  *                          U_INVALID_CHAR_FOUND if src contains
       
   108  *                          unmatched single surrogates.
       
   109  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
       
   110  *                          too many code points.
       
   111  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
       
   112  * @return                  Number of ASCII characters converted.
       
   113  * @stable ICU 2.6
       
   114  */
       
   115 U_STABLE int32_t U_EXPORT2
       
   116 uidna_toASCII(const UChar* src, int32_t srcLength, 
       
   117               UChar* dest, int32_t destCapacity,
       
   118               int32_t options,
       
   119               UParseError* parseError,
       
   120               UErrorCode* status);
       
   121 
       
   122 
       
   123 /**
       
   124  * This function implements the ToUnicode operation as defined in the IDNA RFC.
       
   125  * This operation is done on <b>single labels</b> before sending it to something that expects
       
   126  * Unicode names. A label is an individual part of a domain name. Labels are usually
       
   127  * separated by dots; e.g., "www.example.com" is composed of 3 labels
       
   128  * "www","example", and "com".
       
   129  *
       
   130  * @param src               Input UChar array containing ASCII (ACE encoded) label.
       
   131  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
       
   132  * @param dest              Output UChar array containing the Unicode equivalent of the label.
       
   133  * @param destCapacity      Size of dest.
       
   134  * @param options           A bit set of options:
       
   135  *  
       
   136  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
       
   137  *                              and do not use STD3 ASCII rules
       
   138  *                              If unassigned code points are found the operation fails with 
       
   139  *                              U_UNASSIGNED_ERROR error code.
       
   140  *
       
   141  *  - UIDNA_ALLOW_UNASSIGNED      Unassigned values can be converted to ASCII for query operations
       
   142  *                              If this option is set, the unassigned code points in the input
       
   143  *                              are treated as normal Unicode code points. <b> Note: </b> This option is 
       
   144  *                              required on toUnicode operation because the RFC mandates 
       
   145  *                              verification of decoded ACE input by applying toASCII and comparing
       
   146  *                              its output with source
       
   147  *
       
   148  *                          
       
   149  *                          
       
   150  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions
       
   151  *                              If this option is set and the input does not satisfy STD3 rules,  
       
   152  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
       
   153  *
       
   154  * @param parseError        Pointer to UParseError struct to receive information on position 
       
   155  *                          of error if an error is encountered. Can be NULL.
       
   156  * @param status            ICU in/out error code parameter.
       
   157  *                          U_INVALID_CHAR_FOUND if src contains
       
   158  *                          unmatched single surrogates.
       
   159  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
       
   160  *                          too many code points.
       
   161  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
       
   162  * @return                  Number of Unicode characters converted.
       
   163  * @stable ICU 2.6
       
   164  */
       
   165 U_STABLE int32_t U_EXPORT2
       
   166 uidna_toUnicode(const UChar* src, int32_t srcLength,
       
   167                 UChar* dest, int32_t destCapacity,
       
   168                 int32_t options,
       
   169                 UParseError* parseError,
       
   170                 UErrorCode* status);
       
   171 
       
   172 
       
   173 /**
       
   174  * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
       
   175  * This operation is done on complete domain names, e.g., "www.example.com".
       
   176  * It is important to note that this operation can fail. If it fails, then the input 
       
   177  * domain name cannot be used as an Internationalized Domain Name and the application
       
   178  * should have methods defined to deal with the failure.
       
   179  * 
       
   180  * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
       
   181  * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, 
       
   182  * and then convert. This function does not offer that level of granularity. The options once  
       
   183  * set will apply to all labels in the domain name
       
   184  *
       
   185  * @param src               Input UChar array containing IDN in Unicode.
       
   186  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
       
   187  * @param dest              Output UChar array with ASCII (ACE encoded) IDN.
       
   188  * @param destCapacity      Size of dest.
       
   189  * @param options           A bit set of options:
       
   190  *  
       
   191  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
       
   192  *                              and do not use STD3 ASCII rules
       
   193  *                              If unassigned code points are found the operation fails with 
       
   194  *                              U_UNASSIGNED_CODE_POINT_FOUND error code.
       
   195  *
       
   196  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations
       
   197  *                              If this option is set, the unassigned code points in the input
       
   198  *                              are treated as normal Unicode code points.
       
   199  *                          
       
   200  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions
       
   201  *                              If this option is set and the input does not satisfy STD3 rules,  
       
   202  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
       
   203  * 
       
   204  * @param parseError        Pointer to UParseError struct to receive information on position 
       
   205  *                          of error if an error is encountered. Can be NULL.
       
   206  * @param status            ICU in/out error code parameter.
       
   207  *                          U_INVALID_CHAR_FOUND if src contains
       
   208  *                          unmatched single surrogates.
       
   209  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
       
   210  *                          too many code points.
       
   211  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
       
   212  * @return                  Number of ASCII characters converted.
       
   213  * @stable ICU 2.6
       
   214  */
       
   215 U_STABLE int32_t U_EXPORT2
       
   216 uidna_IDNToASCII(  const UChar* src, int32_t srcLength,
       
   217                    UChar* dest, int32_t destCapacity,
       
   218                    int32_t options,
       
   219                    UParseError* parseError,
       
   220                    UErrorCode* status);
       
   221 
       
   222 /**
       
   223  * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
       
   224  * This operation is done on complete domain names, e.g., "www.example.com".
       
   225  *
       
   226  * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
       
   227  * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, 
       
   228  * and then convert. This function does not offer that level of granularity. The options once  
       
   229  * set will apply to all labels in the domain name
       
   230  *
       
   231  * @param src               Input UChar array containing IDN in ASCII (ACE encoded) form.
       
   232  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
       
   233  * @param dest              Output UChar array containing Unicode equivalent of source IDN.
       
   234  * @param destCapacity      Size of dest.
       
   235  * @param options           A bit set of options:
       
   236  *  
       
   237  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
       
   238  *                              and do not use STD3 ASCII rules
       
   239  *                              If unassigned code points are found the operation fails with 
       
   240  *                              U_UNASSIGNED_CODE_POINT_FOUND error code.
       
   241  *
       
   242  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations
       
   243  *                              If this option is set, the unassigned code points in the input
       
   244  *                              are treated as normal Unicode code points.
       
   245  *                          
       
   246  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions
       
   247  *                              If this option is set and the input does not satisfy STD3 rules,  
       
   248  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
       
   249  *
       
   250  * @param parseError        Pointer to UParseError struct to receive information on position 
       
   251  *                          of error if an error is encountered. Can be NULL.
       
   252  * @param status            ICU in/out error code parameter.
       
   253  *                          U_INVALID_CHAR_FOUND if src contains
       
   254  *                          unmatched single surrogates.
       
   255  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
       
   256  *                          too many code points.
       
   257  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
       
   258  * @return                  Number of Unicode characters converted.
       
   259  * @stable ICU 2.6
       
   260  */
       
   261 U_STABLE int32_t U_EXPORT2
       
   262 uidna_IDNToUnicode(  const UChar* src, int32_t srcLength,
       
   263                      UChar* dest, int32_t destCapacity,
       
   264                      int32_t options,
       
   265                      UParseError* parseError,
       
   266                      UErrorCode* status);
       
   267 
       
   268 /**
       
   269  * Compare two IDN strings for equivalence.
       
   270  * This function splits the domain names into labels and compares them.
       
   271  * According to IDN RFC, whenever two labels are compared, they are 
       
   272  * considered equal if and only if their ASCII forms (obtained by 
       
   273  * applying toASCII) match using a case-insensitive ASCII comparison.
       
   274  * Two domain names are considered a match if and only if all labels 
       
   275  * match regardless of whether label separators match.
       
   276  *
       
   277  * @param s1                First source string.
       
   278  * @param length1           Length of first source string, or -1 if NUL-terminated.
       
   279  *
       
   280  * @param s2                Second source string.
       
   281  * @param length2           Length of second source string, or -1 if NUL-terminated.
       
   282  * @param options           A bit set of options:
       
   283  *  
       
   284  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
       
   285  *                              and do not use STD3 ASCII rules
       
   286  *                              If unassigned code points are found the operation fails with 
       
   287  *                              U_UNASSIGNED_CODE_POINT_FOUND error code.
       
   288  *
       
   289  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations
       
   290  *                              If this option is set, the unassigned code points in the input
       
   291  *                              are treated as normal Unicode code points.
       
   292  *                          
       
   293  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions
       
   294  *                              If this option is set and the input does not satisfy STD3 rules,  
       
   295  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
       
   296  *
       
   297  * @param status            ICU error code in/out parameter.
       
   298  *                          Must fulfill U_SUCCESS before the function call.
       
   299  * @return <0 or 0 or >0 as usual for string comparisons
       
   300  * @stable ICU 2.6
       
   301  */
       
   302 U_STABLE int32_t U_EXPORT2
       
   303 uidna_compare(  const UChar *s1, int32_t length1,
       
   304                 const UChar *s2, int32_t length2,
       
   305                 int32_t options,
       
   306                 UErrorCode* status);
       
   307 
       
   308 #endif /* #if !UCONFIG_NO_IDNA */
       
   309 
       
   310 #endif