glib/tests/utf8.txt
changeset 18 47c74d1534e1
equal deleted inserted replaced
0:e4d67989cc36 18:47c74d1534e1
       
     1 # This file is derived from 
       
     2 #
       
     3 #    http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
       
     4 #    
       
     5 # Which was created by   Markus Kuhn <mkuhn@acm.org> - 2000-09-02 
       
     6 #
       
      7 # lines beginning with # and blank lines are ignored
       
     8 #
       
     9 # Beyond that, this file consists of a series of test cases. Each test case consists of
       
    10 # 2 or 3 lines:
       
    11 #
       
    12 #  1. A UTF-8 string
       
    13 #  2. A status
       
    14 #      VALID      : The string is a valid UTF-8 representation of valid Unicode
       
    15 #      INCOMPLETE : The string has a partial character at the end
       
    16 #      NOTUNICODE : The string is valid UTF-8, but the characters represented
       
     17 #                   are not valid Unicode
       
    18 #      OVERLONG   : The string includes overlong sequences
       
    19 #      MALFORMED  : The string is not valid UTF-8
       
    20 # 3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string,
       
    21 #    as a series of hex numbers.
       
    22 
       
    23 # 1  Some correct UTF-8 text
       
    24 κόσμε
       
    25 VALID
       
    26 03ba 1f79 03c3 03bc 03b5
       
    27 
       
    28 # 2.1  First possible sequence of a certain length
       
    29 #
       
    30 # FIXME - handle NULLS?
       
    31 #
       
    32 # [ NULL BYTE ]
       
    33 #VALID
       
    34 #0000
       
    35 
       
    36 €
       
    37 VALID
       
    38 0080
       
    39 
       
    40 
       
    41 NOTUNICODE
       
    42 00200000
       
    43 
       
    44 
       
    45 NOTUNICODE
       
    46 04000000
       
    47 
       
    48 
       
    49 VALID
       
    50 0000007f
       
    51 
       
    52 ߿
       
    53 VALID
       
    54 000007ff
       
    55 
       
    56 ￿
       
    57 NOTUNICODE
       
    58 0000ffff
       
    59 
       
    60 
       
    61 NOTUNICODE
       
    62 001fffff
       
    63 
       
    64 
       
    65 NOTUNICODE
       
    66 03ffffff
       
    67 
       
    68 
       
    69 NOTUNICODE
       
    70 7fffffff
       
    71 
       
    72 # 2.3  Other boundary conditions
       
    73 
       
    74
       
    75 VALID
       
    76 d7ff
       
    77 
       
    78
       
    79 VALID
       
    80 fffd
       
    81 
       
    82 􏿿
       
    83 NOTUNICODE
       
    84 0010ffff
       
    85 
       
    86 
       
    87 NOTUNICODE
       
    88 00110000
       
    89 
       
    90 # 3.1  Unexpected continuation bytes
       
    91 
       
    92 
       
    93 MALFORMED
       
    94 
       
    95 MALFORMED
       
    96 
       
    97 MALFORMED
       
    98 
       
    99 MALFORMED
       
   100 
       
   101 MALFORMED
       
   102 
       
   103 MALFORMED
       
   104 
       
   105 MALFORMED
       
   106 
       
   107 MALFORMED
       
   108 
       
   109 MALFORMED
       
   110 
       
   111 # 3.2  Lonely start characters
       
   112 
       
   113                                 
       
   114 MALFORMED
       
   115                 
       
   116 MALFORMED
       
   117         
       
   118 MALFORMED
       
   119     
       
   120 MALFORMED
       
   121   
       
   122 MALFORMED
       
   123 
       
   124 # 3.3  Sequences with last continuation byte missing
       
   125 
       
   126 
       
   127 INCOMPLETE
       
   128 
       
   129 INCOMPLETE
       
   130 
       
   131 INCOMPLETE
       
   132 
       
   133 INCOMPLETE
       
   134 
       
   135 INCOMPLETE
       
   136 
       
   137 INCOMPLETE
       
   138 
       
   139 INCOMPLETE
       
   140 
       
   141 INCOMPLETE
       
   142 
       
   143 INCOMPLETE
       
   144 
       
   145 INCOMPLETE
       
   146 
       
   147 # 3.4  Concatenation of incomplete sequences
       
   148 
       
   149 
       
   150 MALFORMED
       
   151 
       
   152 # 3.5  Impossible bytes
       
   153 
       
   154 
       
   155 MALFORMED
       
   156 
       
   157 MALFORMED
       
   158 
       
   159 MALFORMED
       
   160 
       
   161 #  Examples of an overlong ASCII character
       
   162 
       
   163 
       
   164 OVERLONG
       
   165 
       
   166 OVERLONG
       
   167 
       
   168 OVERLONG
       
   169 
       
   170 OVERLONG
       
   171 
       
   172 OVERLONG
       
   173 
       
   174 #  Maximum overlong sequences
       
   175 
       
   176 
       
   177 OVERLONG
       
   178 
       
   179 OVERLONG
       
   180 
       
   181 OVERLONG
       
   182 
       
   183 OVERLONG
       
   184 
       
   185 OVERLONG
       
   186 
       
   187 # Overlong representation of the NUL character
       
   188 
       
   189 
       
   190 OVERLONG
       
   191 
       
   192 OVERLONG
       
   193 
       
   194 OVERLONG
       
   195 
       
   196 OVERLONG
       
   197 
       
   198 OVERLONG
       
   199 
       
   200 # Illegal code positions
       
   201 
       
   202 # Single UTF-16 surrogates
       
   203 
       
   204 
       
   205 NOTUNICODE
       
   206 d800
       
   207 
       
   208 
       
   209 NOTUNICODE
       
   210 db7f
       
   211 
       
   212 
       
   213 NOTUNICODE
       
   214 db80
       
   215 
       
   216 
       
   217 NOTUNICODE
       
   218 dbff
       
   219 
       
   220 
       
   221 NOTUNICODE
       
   222 dc00
       
   223 
       
   224 
       
   225 NOTUNICODE
       
   226 df80
       
   227 
       
   228 
       
   229 NOTUNICODE
       
   230 dfff
       
   231 
       
   232 # Paired UTF-16 surrogates
       
   233 
       
   234 
       
   235 NOTUNICODE
       
   236 d800 dc00
       
   237 
       
   238 
       
   239 NOTUNICODE
       
   240 d800 dfff
       
   241 
       
   242 
       
   243 NOTUNICODE
       
   244 db7f dc00
       
   245 
       
   246 
       
   247 NOTUNICODE
       
   248 db7f dfff
       
   249 
       
   250 
       
   251 NOTUNICODE
       
   252 db80 dc00
       
   253 
       
   254 
       
   255 NOTUNICODE
       
   256 db80 dfff
       
   257 
       
   258 
       
   259 NOTUNICODE
       
   260 dbff dc00
       
   261 
       
   262 
       
   263 NOTUNICODE
       
   264 dbff dfff
       
   265 
       
   266 # Other illegal code positions
       
   267 
       
   268
       
   269 NOTUNICODE
       
   270 fffe
       
   271 
       
   272 ￿
       
   273 NOTUNICODE
       
   274 ffff
       
   275 
       
   276 ################
       
   277 #
       
   278 # Some more tests, not from Markus Kuhn's file
       
   279 #
       
   280 
       
   281 # Mixed plane 0 and higher planes
       
   282