diff -r e4d67989cc36 -r 47c74d1534e1 glib/tests/utf8.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glib/tests/utf8.txt Fri Apr 16 16:46:38 2010 +0300 @@ -0,0 +1,282 @@ +# This file is derived from +# +# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt +# +# Which was created by Markus Kuhn - 2000-09-02 +# +# Lines beginning with # and blank lines are ignored +# +# Beyond that, this file consists of a series of test cases. Each test case consists of +# 2 or 3 lines: +# +# 1. A UTF-8 string +# 2. A status +# VALID : The string is a valid UTF-8 representation of valid Unicode +# INCOMPLETE : The string has a partial character at the end +# NOTUNICODE : The string is valid UTF-8, but the characters represented +# are not valid Unicode (e.g. UTF-16 surrogates or code points above 10ffff) +# OVERLONG : The string includes overlong sequences +# MALFORMED : The string is not valid UTF-8 +# 3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string, +# as a series of hex numbers. + +# 1 Some correct UTF-8 text +κόσμε +VALID +03ba 1f79 03c3 03bc 03b5 + +# 2.1 First possible sequence of a certain length +# +# FIXME - handle NULLS? 
+# +# [ NULL BYTE ] +#VALID +#0000 + +€ +VALID +0080 + + +NOTUNICODE +00200000 + + +NOTUNICODE +04000000 + + +VALID +0000007f + +߿ +VALID +000007ff + +￿ +NOTUNICODE +0000ffff + + +NOTUNICODE +001fffff + + +NOTUNICODE +03ffffff + + +NOTUNICODE +7fffffff + +# 2.3 Other boundary conditions + +퟿ +VALID +d7ff + +� +VALID +fffd + +􏿿 +NOTUNICODE +0010ffff + + +NOTUNICODE +00110000 + +# 3.1 Unexpected continuation bytes + + +MALFORMED + +MALFORMED + +MALFORMED + +MALFORMED + +MALFORMED + +MALFORMED + +MALFORMED + +MALFORMED + +MALFORMED + +# 3.2 Lonely start characters + + +MALFORMED + +MALFORMED + +MALFORMED + +MALFORMED + +MALFORMED + +# 3.3 Sequences with last continuation byte missing + + +INCOMPLETE + +INCOMPLETE + +INCOMPLETE + +INCOMPLETE + +INCOMPLETE + +INCOMPLETE + +INCOMPLETE + +INCOMPLETE + +INCOMPLETE + +INCOMPLETE + +# 3.4 Concatenation of incomplete sequences + + +MALFORMED + +# 3.5 Impossible bytes + + +MALFORMED + +MALFORMED + +MALFORMED + +# Examples of an overlong ASCII character + + +OVERLONG + +OVERLONG + +OVERLONG + +OVERLONG + +OVERLONG + +# Maximum overlong sequences + + +OVERLONG + +OVERLONG + +OVERLONG + +OVERLONG + +OVERLONG + +# Overlong representation of the NUL character + + +OVERLONG + +OVERLONG + +OVERLONG + +OVERLONG + +OVERLONG + +# Illegal code positions + +# Single UTF-16 surrogates + + +NOTUNICODE +d800 + + +NOTUNICODE +db7f + + +NOTUNICODE +db80 + + +NOTUNICODE +dbff + + +NOTUNICODE +dc00 + + +NOTUNICODE +df80 + + +NOTUNICODE +dfff + +# Paired UTF-16 surrogates + + +NOTUNICODE +d800 dc00 + + +NOTUNICODE +d800 dfff + + +NOTUNICODE +db7f dc00 + + +NOTUNICODE +db7f dfff + + +NOTUNICODE +db80 dc00 + + +NOTUNICODE +db80 dfff + + +NOTUNICODE +dbff dc00 + + +NOTUNICODE +dbff dfff + +# Other illegal code positions + +￾ +NOTUNICODE +fffe + +￿ +NOTUNICODE +ffff + +################ +# +# Some more tests, not from Markus Kuhn's file +# + +# Mixed plane 0 and higher planes +