python-2.5.2/win32/Lib/test/test_unicode.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 # -*- coding: iso-8859-1 -*-
       
     2 """ Test script for the Unicode implementation.
       
     3 
       
     4 Written by Marc-Andre Lemburg (mal@lemburg.com).
       
     5 
       
     6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
       
     7 
       
     8 """#"
       
     9 import unittest, sys, struct, codecs, new
       
    10 from test import test_support, string_tests
       
    11 
       
    12 # Error handling (bad decoder return)
       
    def search_function(encoding):
        """Codec search hook serving two deliberately broken test codecs.

        "test.unicode1" yields an encoder/decoder pair that return the bare
        integer 42 instead of a tuple.  "test.unicode2" yields a pair that
        return the tuple (42, 42), whose first item is not a string.  Every
        other encoding name yields None.
        """
        def encode1(input, errors="strict"):
            return 42  # not a tuple

        def decode1(input, errors="strict"):
            return 42  # not a tuple

        def encode2(input, errors="strict"):
            return (42, 42)  # no unicode

        def decode2(input, errors="strict"):
            return (42, 42)  # no unicode

        if encoding == "test.unicode1":
            return (encode1, decode1, None, None)
        if encoding == "test.unicode2":
            return (encode2, decode2, None, None)
        return None

    # Install the hook so the broken codecs can be looked up by name.
    codecs.register(search_function)
       
    29 
       
    30 class UnicodeTest(
       
    31     string_tests.CommonTest,
       
    32     string_tests.MixinStrUnicodeUserStringTest,
       
    33     string_tests.MixinStrUnicodeTest,
       
    34     ):
       
    35     type2test = unicode
       
    36 
       
    def checkequalnofix(self, result, object, methodname, *args):
        """Call object.methodname(*args) and compare it against result.

        The returned value must equal result and have exactly the same type.
        When the call hands back the receiver itself, the call is repeated on
        a unicode subclass instance, which must not be returned as-is.
        """
        actual = getattr(object, methodname)(*args)
        self.assertEqual(actual, result)
        self.assert_(type(actual) is type(result))

        # Only the "method returned its own receiver" case needs the
        # additional subclass check below.
        if actual is not object:
            return

        class usub(unicode):
            def __repr__(self):
                return 'usub(%r)' % unicode.__repr__(self)

        wrapped = usub(object)
        actual = getattr(wrapped, methodname)(*args)
        self.assertEqual(actual, result)
        self.assert_(wrapped is not actual)
       
    54 
       
    def test_literals(self):
        """Check equivalent escape spellings and rejection of bad escapes."""
        self.assertEqual(u'\xff', u'\u00ff')
        self.assertEqual(u'\uffff', u'\U0000ffff')

        # Each of these source snippets must fail to evaluate.
        invalid_sources = (
            'u\'\\Ufffffffe\'',
            'u\'\\Uffffffff\'',
            'u\'\\U%08x\'' % 0x110000,
        )
        for source in invalid_sources:
            self.assertRaises(UnicodeError, eval, source)
       
    61 
       
    def test_repr(self):
        """Check repr() of unicode objects; skipped on 'java' platforms."""
        if sys.platform.startswith('java'):
            return

        # Test basic sanity of repr()
        sanity_cases = (
            (u'abc', "u'abc'"),
            (u'ab\\c', "u'ab\\\\c'"),
            (u'ab\\', "u'ab\\\\'"),
            (u'\\c', "u'\\\\c'"),
            (u'\\', "u'\\\\'"),
            (u'\n', "u'\\n'"),
            (u'\r', "u'\\r'"),
            (u'\t', "u'\\t'"),
            (u'\b', "u'\\x08'"),
            (u"'\"", """u'\\'"'"""),
            (u"'\"", """u'\\'"'"""),
            (u"'", '''u"'"'''),
            (u'"', """u'"'"""),
        )
        for text, expected in sanity_cases:
            self.assertEqual(repr(text), expected)

        # Expected repr of the 256 code points 0..255 joined into one string.
        latin1repr = (
            "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
            "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
            "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
            "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
            "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
            "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
            "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
            "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
            "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
            "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
            "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
            "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
            "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
            "\\xfe\\xff'")
        all_latin1 = u''.join([unichr(code) for code in xrange(256)])
        self.assertEqual(repr(all_latin1), latin1repr)

        # Test repr works on wide unicode escapes without overflow.
        wide = u"\U00010000" * 39 + u"\uffff" * 4096
        self.assertEqual(repr(wide), repr(wide))
       
    98 
       
    99 
       
   def test_count(self):
       """Run the shared count tests, then mixed str/unicode count calls."""
       string_tests.CommonTest.test_count(self)

       # check mixed argument types
       # Each row: (expected count, receiver, arguments passed to count()).
       mixed_cases = (
           (3, 'aaa', (u'a',)),
           (0, 'aaa', (u'b',)),
           (3, u'aaa', ('a',)),
           (0, u'aaa', ('b',)),
           (0, u'aaa', ('b',)),
           (1, u'aaa', ('a', -1)),
           (3, u'aaa', ('a', -10)),
           (2, u'aaa', ('a', 0, -1)),
           (0, u'aaa', ('a', 0, -10)),
       )
       for expected, receiver, call_args in mixed_cases:
           self.checkequalnofix(expected, receiver, 'count', *call_args)
       
   112 
       
   def test_find(self):
       """Check unicode.find results and its argument type errors."""
       haystack = u'abcdefghiabc'
       # Each row: (expected index, arguments passed to find()).
       for expected, call_args in (
               (0, (u'abc',)),
               (9, (u'abc', 1)),
               (-1, (u'def', 4))):
           self.checkequalnofix(expected, haystack, 'find', *call_args)

       # A missing needle and a non-string needle both raise TypeError.
       for bad_args in ((), (42,)):
           self.assertRaises(TypeError, u'hello'.find, *bad_args)
       
   120 
       
   def test_rfind(self):
       """Run the shared rfind tests, then mixed str/unicode rfind calls."""
       string_tests.CommonTest.test_rfind(self)

       # check mixed argument types
       for expected, receiver, needle in (
               (9, 'abcdefghiabc', u'abc'),
               (12, 'abcdefghiabc', u''),
               (12, u'abcdefghiabc', '')):
           self.checkequalnofix(expected, receiver, 'rfind', needle)
       
   127 
       
   def test_index(self):
       """Run the shared index tests, then mixed str/unicode index calls."""
       string_tests.CommonTest.test_index(self)

       # Rows: (expected index, needle, extra positional arguments).
       found = (
           (0, '', ()),
           (3, 'def', ()),
           (0, 'abc', ()),
           (9, 'abc', (1,)),
       )
       # Rows: (receiver text, needle, extra positional arguments); each
       # of these lookups must raise ValueError.
       missing = (
           ('abcdefghiabc', 'hib', ()),
           ('abcdefghiab', 'abc', (1,)),
           ('abcdefghi', 'ghi', (8,)),
           ('abcdefghi', 'ghi', (-1,)),
       )

       # check mixed argument types
       for (t1, t2) in ((str, unicode), (unicode, str)):
           for expected, needle, extra in found:
               self.checkequalnofix(expected, t1('abcdefghiabc'), 'index',
                                    t2(needle), *extra)
           for text, needle, extra in missing:
               self.assertRaises(ValueError, t1(text).index,
                                 t2(needle), *extra)
       
   140 
       
   def test_rindex(self):
       """Run the shared rindex tests, then mixed str/unicode rindex calls."""
       string_tests.CommonTest.test_rindex(self)

       # Rows: (expected index, needle, extra positional arguments).
       found = (
           (12, '', ()),
           (3, 'def', ()),
           (9, 'abc', ()),
           (0, 'abc', (0, -1)),
       )
       # Rows: (receiver text, needle, extra positional arguments); each
       # of these lookups must raise ValueError.
       missing = (
           ('abcdefghiabc', 'hib', ()),
           ('defghiabc', 'def', (1,)),
           ('defghiabc', 'abc', (0, -1)),
           ('abcdefghi', 'ghi', (0, 8)),
           ('abcdefghi', 'ghi', (0, -1)),
       )

       # check mixed argument types
       for (t1, t2) in ((str, unicode), (unicode, str)):
           for expected, needle, extra in found:
               self.checkequalnofix(expected, t1('abcdefghiabc'), 'rindex',
                                    t2(needle), *extra)
           for text, needle, extra in missing:
               self.assertRaises(ValueError, t1(text).rindex,
                                 t2(needle), *extra)
       
   155 
       
   def test_translate(self):
       """Check unicode.translate with None, ordinal and string targets."""
       # Each row: (expected result, receiver, translation table).
       cases = (
           (u'bbbc', u'abababc', {ord('a'):None}),
           (u'iiic', u'abababc', {ord('a'):None, ord('b'):ord('i')}),
           (u'iiix', u'abababc',
            {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'}),
           (u'<i><i><i>c', u'abababc', {ord('a'):None, ord('b'):u'<i>'}),
           (u'c', u'abababc', {ord('a'):None, ord('b'):u''}),
           (u'xyyx', u'xzx', {ord('z'):u'yy'}),
       )
       for expected, receiver, table in cases:
           self.checkequalnofix(expected, receiver, 'translate', table)

       # No table at all, and a table mapping to a byte string, both
       # raise TypeError.
       self.assertRaises(TypeError, u'hello'.translate)
       self.assertRaises(TypeError, u'abababc'.translate, {ord('a'):''})
       
   166 
       
   def test_split(self):
       """Run the shared split tests, then mixed str/unicode split calls."""
       string_tests.CommonTest.test_split(self)

       # Mixed arguments
       # Each row: (expected list, receiver, separator).
       for expected, receiver, separator in (
               ([u'a', u'b', u'c', u'd'], u'a//b//c//d', '//'),
               ([u'a', u'b', u'c', u'd'], 'a//b//c//d', u'//'),
               ([u'endcase ', u''], u'endcase test', 'test')):
           self.checkequalnofix(expected, receiver, 'split', separator)
       
   174 
       
   def test_join(self):
       """Run the shared join tests, then mixed str/unicode join calls."""
       string_tests.MixinStrUnicodeUserStringTest.test_join(self)

       # mixed arguments
       # Each row: (expected result, separator, items to join).
       cases = (
           (u'a b c d', u' ', ['a', 'b', u'c', u'd']),
           (u'abcd', u'', (u'a', u'b', u'c', u'd')),
           (u'w x y z', u' ', string_tests.Sequence('wxyz')),
           (u'a b c d', ' ', [u'a', u'b', u'c', u'd']),
           (u'a b c d', ' ', ['a', 'b', u'c', u'd']),
           (u'abcd', '', (u'a', u'b', u'c', u'd')),
           (u'w x y z', ' ', string_tests.Sequence(u'wxyz')),
       )
       for expected, separator, items in cases:
           self.checkequalnofix(expected, separator, 'join', items)
       
   186 
       
   def test_strip(self):
       """Run the shared strip tests, then a non-ASCII byte-string argument."""
       string_tests.CommonTest.test_strip(self)

       # Stripping with the byte string "\xff" must raise UnicodeError.
       strip = u"hello".strip
       self.assertRaises(UnicodeError, strip, "\xff")
       
   190 
       
   def test_replace(self):
       """Run the shared replace tests, then str.replace with unicode args."""
       string_tests.CommonTest.test_replace(self)

       # method call forwarded from str implementation because of unicode argument
       replace_args = (u'!', u'@', 1)
       self.checkequalnofix(u'one@two!three!', 'one!two!three!', 'replace',
                            *replace_args)

       # A non-string replacement value must raise TypeError.
       self.assertRaises(TypeError, 'replace'.replace, u"r", 42)
       
   197 
       
   def test_comparison(self):
       """Check equality and ordering between str and unicode operands.

       The UTF-16 code point order checks are kept but disabled behind
       ``if 0:``, as in the original; only the plain comparisons and the
       final surrogate-pair comparison actually run.
       """
       # Comparisons:
       self.assertEqual(u'abc', 'abc')
       self.assertEqual('abc', u'abc')
       self.assertEqual(u'abc', u'abc')
       self.assert_(u'abcd' > 'abc')
       self.assert_('abcd' > u'abc')
       self.assert_(u'abcd' > u'abc')
       self.assert_(u'abc' < 'abcd')
       self.assert_('abc' < u'abcd')
       self.assert_(u'abc' < u'abcd')

       if 0:
           # Move these tests to a Unicode collation module test...
           # Testing UTF-16 code point order comparisons...

           # No surrogates, no fixup required.
           self.assert_(u'\u0061' < u'\u20ac')
           # Non surrogate below surrogate value, no fixup required
           self.assert_(u'\u0061' < u'\ud800\udc02')

           # Non surrogate above surrogate value, fixup required
           def test_lecmp(s, s2):
               self.assert_(s < s2)

           def test_fixup(s):
               # Compare s against each surrogate pair, in the same order
               # the individual assignments used to be written.
               for s2 in (
                       u'\ud800\udc01', u'\ud900\udc01',
                       u'\uda00\udc01', u'\udb00\udc01',
                       u'\ud800\udd01', u'\ud900\udd01',
                       u'\uda00\udd01', u'\udb00\udd01',
                       u'\ud800\ude01', u'\ud900\ude01',
                       u'\uda00\ude01', u'\udb00\ude01',
                       u'\ud800\udfff', u'\ud900\udfff',
                       u'\uda00\udfff', u'\udb00\udfff'):
                   test_lecmp(s, s2)

           # These driver calls used to sit inside test_fixup itself, where
           # they were never reached from outside and would have recursed
           # without bound; they belong at this level.
           test_fixup(u'\ue000')
           test_fixup(u'\uff61')

       # Surrogates on both sides, no fixup required
       self.assert_(u'\ud800\udc02' < u'\ud84d\udc56')
       
   262 
       
   def test_islower(self):
       """Run the shared islower tests, then check the character U+1FFC."""
       shared_tests = string_tests.MixinStrUnicodeUserStringTest
       shared_tests.test_islower(self)
       self.checkequalnofix(False, u'\u1FFc', 'islower')
       
   266 
       
   def test_isupper(self):
       """Run the shared isupper tests, then check the character U+1FFC.

       The U+1FFC check is skipped on 'java' platforms.
       """
       string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
       if sys.platform.startswith('java'):
           return
       self.checkequalnofix(False, u'\u1FFc', 'isupper')
       
   271 
       
   def test_istitle(self):
       """Run the shared istitle tests, then unicode-only istitle checks."""
       # This override shadows the inherited test_istitle, so it must
       # delegate to that test (not test_title) or the shared istitle
       # checks never run for unicode.  The sibling overrides all delegate
       # to their same-named base test.
       # NOTE(review): assumes the mixin defines test_istitle - confirm
       # against string_tests.
       string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
       self.checkequalnofix(True, u'\u1FFc', 'istitle')
       self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
       
   276 
       
   def test_isspace(self):
       """Run the shared isspace tests, then three non-ASCII characters."""
       string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
       # Each row: (expected isspace() result, receiver).
       for expected, text in (
               (True, u'\u2000'),
               (True, u'\u200a'),
               (False, u'\u2014')):
           self.checkequalnofix(expected, text, 'isspace')
       
   282 
       
   def test_isalpha(self):
       """Run the shared isalpha tests, then check the character U+1FFC."""
       shared_tests = string_tests.MixinStrUnicodeUserStringTest
       shared_tests.test_isalpha(self)
       self.checkequalnofix(True, u'\u1FFc', 'isalpha')
       
   286 
       
   def test_isdecimal(self):
       """Check unicode.isdecimal results and its argument count check."""
       # Each row: (expected isdecimal() result, receiver).
       cases = (
           (False, u''),
           (False, u'a'),
           (True, u'0'),
           (False, u'\u2460'),  # CIRCLED DIGIT ONE
           (False, u'\xbc'),  # VULGAR FRACTION ONE QUARTER
           (True, u'\u0660'),  # ARABIC-INDIC DIGIT ZERO
           (True, u'0123456789'),
           (False, u'0123456789a'),
       )
       for expected, text in cases:
           self.checkequalnofix(expected, text, 'isdecimal')

       # Passing an argument to isdecimal must raise TypeError.
       self.checkraises(TypeError, 'abc', 'isdecimal', 42)
       
   298 
       
   def test_isdigit(self):
       """Run the shared isdigit tests, then three non-ASCII characters."""
       string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
       # Each row: (expected isdigit() result, receiver).
       for expected, text in (
               (True, u'\u2460'),
               (False, u'\xbc'),
               (True, u'\u0660')):
           self.checkequalnofix(expected, text, 'isdigit')
       
   304 
       
   def test_isnumeric(self):
       """Check unicode.isnumeric results and its argument count check."""
       # Each row: (expected isnumeric() result, receiver).
       cases = (
           (False, u''),
           (False, u'a'),
           (True, u'0'),
           (True, u'\u2460'),
           (True, u'\xbc'),
           (True, u'\u0660'),
           (True, u'0123456789'),
           (False, u'0123456789a'),
       )
       for expected, text in cases:
           self.checkequalnofix(expected, text, 'isnumeric')

       # Passing an argument to isnumeric must raise TypeError.
       self.assertRaises(TypeError, u"abc".isnumeric, 42)
       
   316 
       
   def test_contains(self):
       """Check the ``in`` operator across str, unicode and tuple operands."""
       # Testing Unicode contains method
       # Each row: (needle, container, whether needle is expected inside).
       first_cases = (
           ('a', u'abdb', True),
           ('a', u'bdab', True),
           ('a', u'bdaba', True),
           ('a', u'bdba', True),
           ('a', u'bdba', True),
           (u'a', u'bdba', True),
           (u'a', u'bdb', False),
           (u'a', 'bdb', False),
           (u'a', 'bdba', True),
           (u'a', ('a',1,None), True),
           (u'a', (1,None,'a'), True),
           (u'a', (1,None,u'a'), True),
           ('a', ('a',1,None), True),
           ('a', (1,None,'a'), True),
           ('a', (1,None,u'a'), True),
           ('a', ('x',1,u'y'), False),
           ('a', ('x',1,None), False),
           (u'abcd', u'abcxxxx', False),
           (u'ab', u'abcd', True),
           ('ab', u'abc', True),
           (u'ab', 'abc', True),
           (u'ab', (1,None,u'ab'), True),
           (u'', u'abc', True),
           ('', u'abc', True),
       )
       for needle, container, present in first_cases:
           self.assert_((needle in container) == present)

       # If the following fails either
       # the contains operator does not propagate UnicodeErrors or
       # someone has changed the default encoding
       self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')

       second_cases = (
           (u'', '', True),
           ('', u'', True),
           (u'', u'', True),
           (u'', 'abc', True),
           ('', u'abc', True),
           (u'', u'abc', True),
           (u'\0', 'abc', False),
           ('\0', u'abc', False),
           (u'\0', u'abc', False),
           (u'\0', '\0abc', True),
           ('\0', u'\0abc', True),
           (u'\0', u'\0abc', True),
           (u'\0', 'abc\0', True),
           ('\0', u'abc\0', True),
           (u'\0', u'abc\0', True),
           (u'a', '\0abc', True),
           ('a', u'\0abc', True),
           (u'a', u'\0abc', True),
           (u'asdf', 'asdf', True),
           ('asdf', u'asdf', True),
           (u'asdf', u'asdf', True),
           (u'asdf', 'asd', False),
           ('asdf', u'asd', False),
           (u'asdf', u'asd', False),
           (u'asdf', '', False),
           ('asdf', u'', False),
           (u'asdf', u'', False),
       )
       for needle, container, present in second_cases:
           self.assert_((needle in container) == present)

       # Calling __contains__ without an argument must raise TypeError.
       self.assertRaises(TypeError, u"abc".__contains__)
       
   378 
       
   def test_formatting(self):
       """Run the shared formatting tests, then %-formatting with unicode."""
       string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)

       # Testing Unicode formatting strings...
       self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')

       # Each row: (values, expected) for the shared numeric template.
       numeric_template = u"%s, %s, %i, %f, %5.2f"
       numeric_cases = (
           ((u"abc", "abc", 1, 2, 3),
            u'abc, abc, 1, 2.000000,  3.00'),
           ((u"abc", "abc", 1, -2, 3),
            u'abc, abc, 1, -2.000000,  3.00'),
           ((u"abc", "abc", -1, -2, 3.5),
            u'abc, abc, -1, -2.000000,  3.50'),
           ((u"abc", "abc", -1, -2, 3.57),
            u'abc, abc, -1, -2.000000,  3.57'),
           ((u"abc", "abc", -1, -2, 1003.57),
            u'abc, abc, -1, -2.000000, 1003.57'),
       )
       for values, expected in numeric_cases:
           self.assertEqual(numeric_template % values, expected)

       # The %r check is skipped on 'java' platforms.
       if not sys.platform.startswith('java'):
           self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
       self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
       self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')

       self.assertEqual(u'%c' % 0x1234, u'\u1234')
       self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))

       # formatting jobs delegated from the string implementation:
       # Each row: (mapping, expected) for the shared named template.
       named_template = '...%(foo)s...'
       named_cases = (
           ({'foo':u"abc"}, u'...abc...'),
           ({'foo':"abc"}, '...abc...'),
           ({u'foo':"abc"}, '...abc...'),
           ({u'foo':u"abc"}, u'...abc...'),
           ({u'foo':u"abc",'def':123}, u'...abc...'),
           ({u'foo':u"abc",u'def':123}, u'...abc...'),
       )
       for mapping, expected in named_cases:
           self.assertEqual(named_template % mapping, expected)

       # Each row: (byte-string template, right-hand operand, expected).
       positional_cases = (
           ('...%s...%s...%s...%s...', (1,2,3,u"abc"),
            u'...1...2...3...abc...'),
           ('...%%...%%s...%s...%s...%s...%s...', (1,2,3,u"abc"),
            u'...%...%s...1...2...3...abc...'),
           ('...%s...', u"abc", u'...abc...'),
           ('%*s', (5,u'abc',), u'  abc'),
           ('%*s', (-5,u'abc',), u'abc  '),
           ('%*.*s', (5,2,u'abc',), u'   ab'),
           ('%*.*s', (5,3,u'abc',), u'  abc'),
           ('%i %*.*s', (10, 5,3,u'abc',), u'10   abc'),
           ('%i%s %*.*s', (10, 3, 5, 3, u'abc',), u'103   abc'),
           ('%c', u'a', u'a'),
       )
       for template, operand, expected in positional_cases:
           self.assertEqual(template % operand, expected)

       # An object whose __str__ returns a unicode string.
       class Wrapper:
           def __str__(self):
               return u'\u1234'
       self.assertEqual('%s' % Wrapper(), u'\u1234')
       
   417 
       
   @test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
   def test_format_float(self):
       """Check that u'%.1f' formatting of 1.0 produces u'1.0'."""
       # should not format with a comma, but always with C locale
       formatted = u'%.1f' % 1.0
       self.assertEqual(u'1.0', formatted)
       
   422 
       
   423     def test_constructor(self):
       
   424         # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
       
   425 
       
   426         self.assertEqual(
       
   427             unicode(u'unicode remains unicode'),
       
   428             u'unicode remains unicode'
       
   429         )
       
   430 
       
   431         class UnicodeSubclass(unicode):
       
   432             pass
       
   433 
       
   434         self.assertEqual(
       
   435             unicode(UnicodeSubclass('unicode subclass becomes unicode')),
       
   436             u'unicode subclass becomes unicode'
       
   437         )
       
   438 
       
   439         self.assertEqual(
       
   440             unicode('strings are converted to unicode'),
       
   441             u'strings are converted to unicode'
       
   442         )
       
   443 
       
   444         class UnicodeCompat:
       
   445             def __init__(self, x):
       
   446                 self.x = x
       
   447             def __unicode__(self):
       
   448                 return self.x
       
   449 
       
   450         self.assertEqual(
       
   451             unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
       
   452             u'__unicode__ compatible objects are recognized')
       
   453 
       
   454         class StringCompat:
       
   455             def __init__(self, x):
       
   456                 self.x = x
       
   457             def __str__(self):
       
   458                 return self.x
       
   459 
       
   460         self.assertEqual(
       
   461             unicode(StringCompat('__str__ compatible objects are recognized')),
       
   462             u'__str__ compatible objects are recognized'
       
   463         )
       
   464 
       
   465         # unicode(obj) is compatible to str():
       
   466 
       
   467         o = StringCompat('unicode(obj) is compatible to str()')
       
   468         self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
       
   469         self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
       
   470 
       
   471         # %-formatting and .__unicode__()
       
   472         self.assertEqual(u'%s' %
       
   473                          UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
       
   474                          u"u'%s' % obj uses obj.__unicode__()")
       
   475         self.assertEqual(u'%s' %
       
   476                          UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
       
   477                          u"u'%s' % obj falls back to obj.__str__()")
       
   478 
       
   479         for obj in (123, 123.45, 123L):
       
   480             self.assertEqual(unicode(obj), unicode(str(obj)))
       
   481 
       
   482         # unicode(obj, encoding, error) tests (this maps to
       
   483         # PyUnicode_FromEncodedObject() at C level)
       
   484 
       
   485         if not sys.platform.startswith('java'):
       
   486             self.assertRaises(
       
   487                 TypeError,
       
   488                 unicode,
       
   489                 u'decoding unicode is not supported',
       
   490                 'utf-8',
       
   491                 'strict'
       
   492             )
       
   493 
       
   494         self.assertEqual(
       
   495             unicode('strings are decoded to unicode', 'utf-8', 'strict'),
       
   496             u'strings are decoded to unicode'
       
   497         )
       
   498 
       
   499         if not sys.platform.startswith('java'):
       
   500             self.assertEqual(
       
   501                 unicode(
       
   502                     buffer('character buffers are decoded to unicode'),
       
   503                     'utf-8',
       
   504                     'strict'
       
   505                 ),
       
   506                 u'character buffers are decoded to unicode'
       
   507             )
       
   508 
       
   509         self.assertRaises(TypeError, unicode, 42, 42, 42)
       
   510 
       
   511     def test_codecs_utf7(self):
       
   512         utfTests = [
       
   513             (u'A\u2262\u0391.', 'A+ImIDkQ.'),             # RFC2152 example
       
   514             (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'),     # RFC2152 example
       
   515             (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'),        # RFC2152 example
       
   516             (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
       
   517             (u'+', '+-'),
       
   518             (u'+-', '+--'),
       
   519             (u'+?', '+-?'),
       
   520             (u'\?', '+AFw?'),
       
   521             (u'+?', '+-?'),
       
   522             (ur'\\?', '+AFwAXA?'),
       
   523             (ur'\\\?', '+AFwAXABc?'),
       
   524             (ur'++--', '+-+---')
       
   525         ]
       
   526 
       
   527         for (x, y) in utfTests:
       
   528             self.assertEqual(x.encode('utf-7'), y)
       
   529 
       
   530         # surrogates not supported
       
   531         self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
       
   532 
       
   533         self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
       
   534 
       
   535     def test_codecs_utf8(self):
       
   536         self.assertEqual(u''.encode('utf-8'), '')
       
   537         self.assertEqual(u'\u20ac'.encode('utf-8'), '\xe2\x82\xac')
       
   538         self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82')
       
   539         self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96')
       
   540         self.assertEqual(u'\ud800'.encode('utf-8'), '\xed\xa0\x80')
       
   541         self.assertEqual(u'\udc00'.encode('utf-8'), '\xed\xb0\x80')
       
   542         self.assertEqual(
       
   543             (u'\ud800\udc02'*1000).encode('utf-8'),
       
   544             '\xf0\x90\x80\x82'*1000
       
   545         )
       
   546         self.assertEqual(
       
   547             u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
       
   548             u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
       
   549             u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
       
   550             u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
       
   551             u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
       
   552             u' Nunstuck git und'.encode('utf-8'),
       
   553             '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
       
   554             '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
       
   555             '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
       
   556             '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
       
   557             '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
       
   558             '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
       
   559             '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
       
   560             '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
       
   561             '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
       
   562             '\xe3\x80\x8cWenn ist das Nunstuck git und'
       
   563         )
       
   564 
       
   565         # UTF-8 specific decoding tests
       
   566         self.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u'\U00023456' )
       
   567         self.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u'\U00010002' )
       
   568         self.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u'\u20ac' )
       
   569 
       
   570         # Other possible utf-8 test cases:
       
   571         # * strict decoding testing for all of the
       
   572         #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
       
   573 
       
   574     def test_codecs_idna(self):
       
   575         # Test whether trailing dot is preserved
       
   576         self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")
       
   577 
       
   578     def test_codecs_errors(self):
       
   579         # Error handling (encoding)
       
   580         self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
       
   581         self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict')
       
   582         self.assertEqual(u'Andr\202 x'.encode('ascii','ignore'), "Andr x")
       
   583         self.assertEqual(u'Andr\202 x'.encode('ascii','replace'), "Andr? x")
       
   584 
       
   585         # Error handling (decoding)
       
   586         self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii')
       
   587         self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
       
   588         self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
       
   589         self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')
       
   590 
       
   591         # Error handling (unknown character names)
       
   592         self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")
       
   593 
       
   594         # Error handling (truncated escape sequence)
       
   595         self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")
       
   596 
       
   597         self.assertRaises(TypeError, "hello".decode, "test.unicode1")
       
   598         self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
       
   599         self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
       
   600         self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
       
   601         # executes PyUnicode_Encode()
       
   602         import imp
       
   603         self.assertRaises(
       
   604             ImportError,
       
   605             imp.find_module,
       
   606             "non-existing module",
       
   607             [u"non-existing dir"]
       
   608         )
       
   609 
       
   610         # Error handling (wrong arguments)
       
   611         self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)
       
   612 
       
   613         # Error handling (PyUnicode_EncodeDecimal())
       
   614         self.assertRaises(UnicodeError, int, u"\u0200")
       
   615 
       
   616     def test_codecs(self):
       
   617         # Encoding
       
   618         self.assertEqual(u'hello'.encode('ascii'), 'hello')
       
   619         self.assertEqual(u'hello'.encode('utf-7'), 'hello')
       
   620         self.assertEqual(u'hello'.encode('utf-8'), 'hello')
       
   621         self.assertEqual(u'hello'.encode('utf8'), 'hello')
       
   622         self.assertEqual(u'hello'.encode('utf-16-le'), 'h\000e\000l\000l\000o\000')
       
   623         self.assertEqual(u'hello'.encode('utf-16-be'), '\000h\000e\000l\000l\000o')
       
   624         self.assertEqual(u'hello'.encode('latin-1'), 'hello')
       
   625 
       
   626         # Roundtrip safety for BMP (just the first 1024 chars)
       
   627         for c in xrange(1024):
       
   628             u = unichr(c)
       
   629             for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
       
   630                              'utf-16-be', 'raw_unicode_escape',
       
   631                              'unicode_escape', 'unicode_internal'):
       
   632                 self.assertEqual(unicode(u.encode(encoding),encoding), u)
       
   633 
       
   634         # Roundtrip safety for BMP (just the first 256 chars)
       
   635         for c in xrange(256):
       
   636             u = unichr(c)
       
   637             for encoding in ('latin-1',):
       
   638                 self.assertEqual(unicode(u.encode(encoding),encoding), u)
       
   639 
       
   640         # Roundtrip safety for BMP (just the first 128 chars)
       
   641         for c in xrange(128):
       
   642             u = unichr(c)
       
   643             for encoding in ('ascii',):
       
   644                 self.assertEqual(unicode(u.encode(encoding),encoding), u)
       
   645 
       
   646         # Roundtrip safety for non-BMP (just a few chars)
       
   647         u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
       
   648         for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
       
   649                          #'raw_unicode_escape',
       
   650                          'unicode_escape', 'unicode_internal'):
       
   651             self.assertEqual(unicode(u.encode(encoding),encoding), u)
       
   652 
       
   653         # UTF-8 must be roundtrip safe for all UCS-2 code points
       
   654         # This excludes surrogates: in the full range, there would be
       
   655         # a surrogate pair (\udbff\udc00), which gets converted back
       
   656         # to a non-BMP character (\U0010fc00)
       
   657         u = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
       
   658         for encoding in ('utf-8',):
       
   659             self.assertEqual(unicode(u.encode(encoding),encoding), u)
       
   660 
       
   661     def test_codecs_charmap(self):
       
   662         # 0-127
       
   663         s = ''.join(map(chr, xrange(128)))
       
   664         for encoding in (
       
   665             'cp037', 'cp1026',
       
   666             'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
       
   667             'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
       
   668             'cp863', 'cp865', 'cp866',
       
   669             'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
       
   670             'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
       
   671             'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
       
   672             'mac_cyrillic', 'mac_latin2',
       
   673 
       
   674             'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
       
   675             'cp1256', 'cp1257', 'cp1258',
       
   676             'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
       
   677 
       
   678             'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
       
   679             'cp1006', 'iso8859_8',
       
   680 
       
   681             ### These have undefined mappings:
       
   682             #'cp424',
       
   683 
       
   684             ### These fail the round-trip:
       
   685             #'cp875'
       
   686 
       
   687             ):
       
   688             self.assertEqual(unicode(s, encoding).encode(encoding), s)
       
   689 
       
   690         # 128-255
       
   691         s = ''.join(map(chr, xrange(128, 256)))
       
   692         for encoding in (
       
   693             'cp037', 'cp1026',
       
   694             'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
       
   695             'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
       
   696             'cp863', 'cp865', 'cp866',
       
   697             'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
       
   698             'iso8859_2', 'iso8859_4', 'iso8859_5',
       
   699             'iso8859_9', 'koi8_r', 'latin_1',
       
   700             'mac_cyrillic', 'mac_latin2',
       
   701 
       
   702             ### These have undefined mappings:
       
   703             #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
       
   704             #'cp1256', 'cp1257', 'cp1258',
       
   705             #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
       
   706             #'iso8859_3', 'iso8859_6', 'iso8859_7',
       
   707             #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
       
   708 
       
   709             ### These fail the round-trip:
       
   710             #'cp1006', 'cp875', 'iso8859_8',
       
   711 
       
   712             ):
       
   713             self.assertEqual(unicode(s, encoding).encode(encoding), s)
       
   714 
       
   715     def test_concatenation(self):
       
   716         self.assertEqual((u"abc" u"def"), u"abcdef")
       
   717         self.assertEqual(("abc" u"def"), u"abcdef")
       
   718         self.assertEqual((u"abc" "def"), u"abcdef")
       
   719         self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
       
   720         self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
       
   721 
       
    def test_printing(self):
        # Smoke test: "print >>target" must accept unicode operands, alone
        # or mixed with str operands.  Nothing is asserted; the test passes
        # as long as none of the print statements raises.
        class BitBucket:
            # Minimal write target: takes the text and throws it away.
            def write(self, text):
                pass

        out = BitBucket()
        # Single and multiple operands, unicode-only and str/unicode mixes.
        print >>out, u'abc'
        print >>out, u'abc', u'def'
        print >>out, u'abc', 'def'
        print >>out, 'abc', u'def'
        # Operands that already end in a newline, first without and then
        # with a trailing comma on the print statement.
        # NOTE(review): the repeated statements presumably exercise print's
        # state carried between consecutive calls -- confirm before
        # deduplicating.
        print >>out, u'abc\n'
        print >>out, u'abc\n',
        print >>out, u'abc\n',
        print >>out, u'def\n'
        print >>out, u'def\n'
       
   737 
       
   738     def test_ucs4(self):
       
   739         if sys.maxunicode == 0xFFFF:
       
   740             return
       
   741         x = u'\U00100000'
       
   742         y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
       
   743         self.assertEqual(x, y)
       
   744 
       
    def test_conversion(self):
        # Make sure __unicode__() works properly
        #
        # Each FooN class below is a different combination of base type
        # (no base, object, str, unicode) and of __str__/__unicode__
        # overrides; the assertions at the end state what unicode() (and,
        # for Foo9, str()) must return for an instance of each.

        # No base class, only __str__.
        class Foo0:
            def __str__(self):
                return "foo"

        # No base class, __unicode__ returning unicode.
        class Foo1:
            def __unicode__(self):
                return u"foo"

        # Derived from object, __unicode__ returning unicode.
        class Foo2(object):
            def __unicode__(self):
                return u"foo"

        # Derived from object, __unicode__ returning a plain str.
        class Foo3(object):
            def __unicode__(self):
                return "foo"

        # str subclass whose __unicode__ returns a plain str.
        class Foo4(str):
            def __unicode__(self):
                return "foo"

        # unicode subclass whose __unicode__ returns a plain str.
        class Foo5(unicode):
            def __unicode__(self):
                return "foo"

        # str subclass with distinct __str__ and __unicode__ results.
        class Foo6(str):
            def __str__(self):
                return "foos"

            def __unicode__(self):
                return u"foou"

        # unicode subclass with distinct __str__ and __unicode__ results.
        class Foo7(unicode):
            def __str__(self):
                return "foos"
            def __unicode__(self):
                return u"foou"

        # unicode subclass whose constructor doubles its content and whose
        # __unicode__ returns the instance itself.
        class Foo8(unicode):
            def __new__(cls, content=""):
                return unicode.__new__(cls, 2*content)
            def __unicode__(self):
                return self

        # unicode subclass where both hooks return plain str values.
        class Foo9(unicode):
            def __str__(self):
                return "string"
            def __unicode__(self):
                return "not unicode"

        # For Foo4..Foo7 the constructor argument "bar" must not show up:
        # the __unicode__ result is expected instead.
        self.assertEqual(unicode(Foo0()), u"foo")
        self.assertEqual(unicode(Foo1()), u"foo")
        self.assertEqual(unicode(Foo2()), u"foo")
        self.assertEqual(unicode(Foo3()), u"foo")
        self.assertEqual(unicode(Foo4("bar")), u"foo")
        self.assertEqual(unicode(Foo5("bar")), u"foo")
        self.assertEqual(unicode(Foo6("bar")), u"foou")
        self.assertEqual(unicode(Foo7("bar")), u"foou")
        # Foo8("foo") holds "foofoo" because __new__ doubles the content.
        self.assertEqual(unicode(Foo8("foo")), u"foofoo")
        self.assertEqual(str(Foo9("foo")), "string")
        self.assertEqual(unicode(Foo9("foo")), u"not unicode")
       
   807 
       
   808     def test_unicode_repr(self):
       
   809         class s1:
       
   810             def __repr__(self):
       
   811                 return '\\n'
       
   812 
       
   813         class s2:
       
   814             def __repr__(self):
       
   815                 return u'\\n'
       
   816 
       
   817         self.assertEqual(repr(s1()), '\\n')
       
   818         self.assertEqual(repr(s2()), '\\n')
       
   819 
       
   820     def test_expandtabs_overflows_gracefully(self):
       
   821         # This test only affects 32-bit platforms because expandtabs can only take
       
   822         # an int as the max value, not a 64-bit C long.  If expandtabs is changed
       
   823         # to take a 64-bit long, this test should apply to all platforms.
       
   824         if sys.maxint > (1 << 32) or struct.calcsize('P') != 4:
       
   825             return
       
   826         self.assertRaises(OverflowError, u't\tt\t'.expandtabs, sys.maxint)
       
   827 
       
   828 
       
def test_main():
    # Entry point: hand the UnicodeTest case to test_support.run_unittest.
    test_support.run_unittest(UnicodeTest)
       
   831 
       
# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()