glib/tests/utf8.txt
changeset 18 47c74d1534e1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glib/tests/utf8.txt	Fri Apr 16 16:46:38 2010 +0300
@@ -0,0 +1,282 @@
+# This file is derived from 
+#
+#    http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+#    
+# which was created by Markus Kuhn <mkuhn@acm.org> - 2000-09-02
+#
+# Lines beginning with # and blank lines are ignored.
+#
+# Beyond that, this file consists of a series of test cases. Each test case consists of
+# 2 or 3 lines:
+#
+#  1. A UTF-8 string
+#  2. A status
+#      VALID      : The string is a valid UTF-8 representation of valid Unicode
+#      INCOMPLETE : The string has a partial character at the end
+#      NOTUNICODE : The string is valid UTF-8, but the characters represented
+#                   are not valid Unicode (e.g. surrogates, noncharacters, or values above U+10FFFF)
+#      OVERLONG   : The string includes overlong sequences
+#      MALFORMED  : The string is not valid UTF-8
+#  3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string,
+#     as a series of hex numbers.
+
+# 1  Some correct UTF-8 text
+κόσμε
+VALID
+03ba 1f79 03c3 03bc 03b5
+
+# 2.1  First possible sequence of a certain length
+#
+# FIXME - handle NULLS?
+#
+# [ NULL BYTE ]
+#VALID
+#0000
+

+VALID
+0080
+
+
+NOTUNICODE
+00200000
+
+
+NOTUNICODE
+04000000
+
+
+VALID
+0000007f
+
+߿
+VALID
+000007ff
+
+￿
+NOTUNICODE
+0000ffff
+
+
+NOTUNICODE
+001fffff
+
+
+NOTUNICODE
+03ffffff
+
+
+NOTUNICODE
+7fffffff
+
+# 2.3  Other boundary conditions
+
+퟿
+VALID
+d7ff
+
+�
+VALID
+fffd
+
+􏿿
+NOTUNICODE
+0010ffff
+
+
+NOTUNICODE
+00110000
+
+# 3.1  Unexpected continuation bytes
+
+
+MALFORMED
+
+MALFORMED
+
+MALFORMED
+
+MALFORMED
+
+MALFORMED
+
+MALFORMED
+
+MALFORMED
+
+MALFORMED
+
+MALFORMED
+
+# 3.2  Lonely start characters
+
+                                
+MALFORMED
+                
+MALFORMED
+        
+MALFORMED
+    
+MALFORMED
+  
+MALFORMED
+
+# 3.3  Sequences with last continuation byte missing
+
+
+INCOMPLETE
+
+INCOMPLETE
+
+INCOMPLETE
+
+INCOMPLETE
+
+INCOMPLETE
+
+INCOMPLETE
+
+INCOMPLETE
+
+INCOMPLETE
+
+INCOMPLETE
+
+INCOMPLETE
+
+# 3.4  Concatenation of incomplete sequences
+
+
+MALFORMED
+
+# 3.5  Impossible bytes
+
+
+MALFORMED
+
+MALFORMED
+
+MALFORMED
+
+#  Examples of an overlong ASCII character
+
+
+OVERLONG
+
+OVERLONG
+
+OVERLONG
+
+OVERLONG
+
+OVERLONG
+
+#  Maximum overlong sequences
+
+
+OVERLONG
+
+OVERLONG
+
+OVERLONG
+
+OVERLONG
+
+OVERLONG
+
+# Overlong representation of the NUL character
+
+
+OVERLONG
+
+OVERLONG
+
+OVERLONG
+
+OVERLONG
+
+OVERLONG
+
+# Illegal code positions
+
+# Single UTF-16 surrogates
+
+
+NOTUNICODE
+d800
+
+
+NOTUNICODE
+db7f
+
+
+NOTUNICODE
+db80
+
+
+NOTUNICODE
+dbff
+
+
+NOTUNICODE
+dc00
+
+
+NOTUNICODE
+df80
+
+
+NOTUNICODE
+dfff
+
+# Paired UTF-16 surrogates
+
+
+NOTUNICODE
+d800 dc00
+
+
+NOTUNICODE
+d800 dfff
+
+
+NOTUNICODE
+db7f dc00
+
+
+NOTUNICODE
+db7f dfff
+
+
+NOTUNICODE
+db80 dc00
+
+
+NOTUNICODE
+db80 dfff
+
+
+NOTUNICODE
+dbff dc00
+
+
+NOTUNICODE
+dbff dfff
+
+# Other illegal code positions
+
+￾
+NOTUNICODE
+fffe
+
+￿
+NOTUNICODE
+ffff
+
+################
+#
+# Some more tests, not from Markus Kuhn's file
+#
+
+# Mixed plane 0 and higher planes
+