|
1 # This file is derived from |
|
2 # |
|
3 # http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt |
|
4 # |
|
5 # Which was created by Markus Kuhn <mkuhn@acm.org> - 2000-09-02 |
|
6 # |
|
7 # lines beginning with # and blank lines are ignored |
|
8 # |
|
9 # Beyond that, this file consists of a series of test cases. Each test case consists of |
|
10 # 2 or 3 lines: |
|
11 # |
|
12 # 1. A UTF-8 string |
|
13 # 2. A status |
|
14 # VALID : The string is a valid UTF-8 representation of valid Unicode |
|
15 # INCOMPLETE : The string has a partial character at the end |
|
16 # NOTUNICODE : The string is valid UTF-8, but the characters represented |
|
17 # are not valid Unicode |
|
18 # OVERLONG : The string includes overlong sequences |
|
19 # MALFORMED : The string is not valid UTF-8 |
|
20 # 3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string, |
|
21 # as a series of hex numbers. |
|
22 |
|
23 # 1 Some correct UTF-8 text |
|
24 κόσμε |
|
25 VALID |
|
26 03ba 1f79 03c3 03bc 03b5 |
|
27 |
|
28 # 2.1 First possible sequence of a certain length |
|
29 # |
|
30 # FIXME - handle NULLS? |
|
31 # |
|
32 # [ NULL BYTE ] |
|
33 #VALID |
|
34 #0000 |
|
35 |
|
36 |
|
37 VALID |
|
38 0080 |
|
39 |
|
40 |
|
41 NOTUNICODE |
|
42 00200000 |
|
43 |
|
44 |
|
45 NOTUNICODE |
|
46 04000000 |
|
47 |
|
48 |
|
49 VALID |
|
50 0000007f |
|
51 |
|
52 ߿ |
|
53 VALID |
|
54 000007ff |
|
55 |
|
56 |
|
57 NOTUNICODE |
|
58 0000ffff |
|
59 |
|
60 |
|
61 NOTUNICODE |
|
62 001fffff |
|
63 |
|
64 |
|
65 NOTUNICODE |
|
66 03ffffff |
|
67 |
|
68 |
|
69 NOTUNICODE |
|
70 7fffffff |
|
71 |
|
72 # 2.3 Other boundary conditions |
|
73 |
|
74 |
|
75 VALID |
|
76 d7ff |
|
77 |
|
78 � |
|
79 VALID |
|
80 fffd |
|
81 |
|
82 |
|
83 NOTUNICODE |
|
84 0010ffff |
|
85 |
|
86 |
|
87 NOTUNICODE |
|
88 00110000 |
|
89 |
|
90 # 3.1 Unexpected continuation bytes |
|
91 |
|
92 |
|
93 MALFORMED |
|
94 |
|
95 MALFORMED |
|
96 |
|
97 MALFORMED |
|
98 |
|
99 MALFORMED |
|
100 |
|
101 MALFORMED |
|
102 |
|
103 MALFORMED |
|
104 |
|
105 MALFORMED |
|
106 |
|
107 MALFORMED |
|
108 |
|
109 MALFORMED |
|
110 |
|
111 # 3.2 Lonely start characters |
|
112 |
|
113 |
|
114 MALFORMED |
|
115 |
|
116 MALFORMED |
|
117 |
|
118 MALFORMED |
|
119 |
|
120 MALFORMED |
|
121 |
|
122 MALFORMED |
|
123 |
|
124 # 3.3 Sequences with last continuation byte missing |
|
125 |
|
126 |
|
127 INCOMPLETE |
|
128 |
|
129 INCOMPLETE |
|
130 |
|
131 INCOMPLETE |
|
132 |
|
133 INCOMPLETE |
|
134 |
|
135 INCOMPLETE |
|
136 |
|
137 INCOMPLETE |
|
138 |
|
139 INCOMPLETE |
|
140 |
|
141 INCOMPLETE |
|
142 |
|
143 INCOMPLETE |
|
144 |
|
145 INCOMPLETE |
|
146 |
|
147 # 3.4 Concatenation of incomplete sequences |
|
148 |
|
149 |
|
150 MALFORMED |
|
151 |
|
152 # 3.5 Impossible bytes |
|
153 |
|
154 |
|
155 MALFORMED |
|
156 |
|
157 MALFORMED |
|
158 |
|
159 MALFORMED |
|
160 |
|
161 # Examples of an overlong ASCII character |
|
162 |
|
163 |
|
164 OVERLONG |
|
165 |
|
166 OVERLONG |
|
167 |
|
168 OVERLONG |
|
169 |
|
170 OVERLONG |
|
171 |
|
172 OVERLONG |
|
173 |
|
174 # Maximum overlong sequences |
|
175 |
|
176 |
|
177 OVERLONG |
|
178 |
|
179 OVERLONG |
|
180 |
|
181 OVERLONG |
|
182 |
|
183 OVERLONG |
|
184 |
|
185 OVERLONG |
|
186 |
|
187 # Overlong representation of the NUL character |
|
188 |
|
189 |
|
190 OVERLONG |
|
191 |
|
192 OVERLONG |
|
193 |
|
194 OVERLONG |
|
195 |
|
196 OVERLONG |
|
197 |
|
198 OVERLONG |
|
199 |
|
200 # Illegal code positions |
|
201 |
|
202 # Single UTF-16 surrogates |
|
203 |
|
204 |
|
205 NOTUNICODE |
|
206 d800 |
|
207 |
|
208 |
|
209 NOTUNICODE |
|
210 db7f |
|
211 |
|
212 |
|
213 NOTUNICODE |
|
214 db80 |
|
215 |
|
216 |
|
217 NOTUNICODE |
|
218 dbff |
|
219 |
|
220 |
|
221 NOTUNICODE |
|
222 dc00 |
|
223 |
|
224 |
|
225 NOTUNICODE |
|
226 df80 |
|
227 |
|
228 |
|
229 NOTUNICODE |
|
230 dfff |
|
231 |
|
232 # Paired UTF-16 surrogates |
|
233 |
|
234 |
|
235 NOTUNICODE |
|
236 d800 dc00 |
|
237 |
|
238 |
|
239 NOTUNICODE |
|
240 d800 dfff |
|
241 |
|
242 |
|
243 NOTUNICODE |
|
244 db7f dc00 |
|
245 |
|
246 |
|
247 NOTUNICODE |
|
248 db7f dfff |
|
249 |
|
250 |
|
251 NOTUNICODE |
|
252 db80 dc00 |
|
253 |
|
254 |
|
255 NOTUNICODE |
|
256 db80 dfff |
|
257 |
|
258 |
|
259 NOTUNICODE |
|
260 dbff dc00 |
|
261 |
|
262 |
|
263 NOTUNICODE |
|
264 dbff dfff |
|
265 |
|
266 # Other illegal code positions |
|
267 |
|
268 |
|
269 NOTUNICODE |
|
270 fffe |
|
271 |
|
272 |
|
273 NOTUNICODE |
|
274 ffff |
|
275 |
|
276 ################ |
|
277 # |
|
278 # Some more tests, not from Markus Kuhn's file |
|
279 # |
|
280 |
|
281 # Mixed plane 0 and higher planes |
|
282 |