python-2.5.2/win32/Lib/test/test_multibytecodec.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 #!/usr/bin/env python
       
     2 #
       
     3 # test_multibytecodec.py
       
     4 #   Unit test for multibytecodec itself
       
     5 #
       
     6 
       
     7 from test import test_support
       
     8 from test import test_multibytecodec_support
       
     9 from test.test_support import TESTFN
       
    10 import unittest, StringIO, codecs, sys, os
       
    11 
       
    12 ALL_CJKENCODINGS = [
       
    13 # _codecs_cn
       
    14     'gb2312', 'gbk', 'gb18030', 'hz',
       
    15 # _codecs_hk
       
    16     'big5hkscs',
       
    17 # _codecs_jp
       
    18     'cp932', 'shift_jis', 'euc_jp', 'euc_jisx0213', 'shift_jisx0213',
       
    19     'euc_jis_2004', 'shift_jis_2004',
       
    20 # _codecs_kr
       
    21     'cp949', 'euc_kr', 'johab',
       
    22 # _codecs_tw
       
    23     'big5', 'cp950',
       
    24 # _codecs_iso2022
       
    25     'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
       
    26     'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr',
       
    27 ]
       
    28 
       
    29 class Test_MultibyteCodec(unittest.TestCase):
       
    30 
       
    31     def test_nullcoding(self):
       
    32         for enc in ALL_CJKENCODINGS:
       
    33             self.assertEqual(''.decode(enc), u'')
       
    34             self.assertEqual(unicode('', enc), u'')
       
    35             self.assertEqual(u''.encode(enc), '')
       
    36 
       
    37     def test_str_decode(self):
       
    38         for enc in ALL_CJKENCODINGS:
       
    39             self.assertEqual('abcd'.encode(enc), 'abcd')
       
    40 
       
    41     def test_errorcallback_longindex(self):
       
    42         dec = codecs.getdecoder('euc-kr')
       
    43         myreplace  = lambda exc: (u'', sys.maxint+1)
       
    44         codecs.register_error('test.cjktest', myreplace)
       
    45         self.assertRaises(IndexError, dec,
       
    46                           'apple\x92ham\x93spam', 'test.cjktest')
       
    47 
       
    48     def test_codingspec(self):
       
    49         try:
       
    50             for enc in ALL_CJKENCODINGS:
       
    51                 print >> open(TESTFN, 'w'), '# coding:', enc
       
    52                 exec open(TESTFN)
       
    53         finally:
       
    54             os.unlink(TESTFN)
       
    55 
       
    56 class Test_IncrementalEncoder(unittest.TestCase):
       
    57 
       
    58     def test_stateless(self):
       
    59         # cp949 encoder isn't stateful at all.
       
    60         encoder = codecs.getincrementalencoder('cp949')()
       
    61         self.assertEqual(encoder.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744'),
       
    62                          '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
       
    63         self.assertEqual(encoder.reset(), None)
       
    64         self.assertEqual(encoder.encode(u'\u2606\u223c\u2606', True),
       
    65                          '\xa1\xd9\xa1\xad\xa1\xd9')
       
    66         self.assertEqual(encoder.reset(), None)
       
    67         self.assertEqual(encoder.encode(u'', True), '')
       
    68         self.assertEqual(encoder.encode(u'', False), '')
       
    69         self.assertEqual(encoder.reset(), None)
       
    70 
       
    71     def test_stateful(self):
       
    72         # jisx0213 encoder is stateful for a few codepoints. eg)
       
    73         #   U+00E6 => A9DC
       
    74         #   U+00E6 U+0300 => ABC4
       
    75         #   U+0300 => ABDC
       
    76 
       
    77         encoder = codecs.getincrementalencoder('jisx0213')()
       
    78         self.assertEqual(encoder.encode(u'\u00e6\u0300'), '\xab\xc4')
       
    79         self.assertEqual(encoder.encode(u'\u00e6'), '')
       
    80         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xc4')
       
    81         self.assertEqual(encoder.encode(u'\u00e6', True), '\xa9\xdc')
       
    82 
       
    83         self.assertEqual(encoder.reset(), None)
       
    84         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
       
    85 
       
    86         self.assertEqual(encoder.encode(u'\u00e6'), '')
       
    87         self.assertEqual(encoder.encode('', True), '\xa9\xdc')
       
    88         self.assertEqual(encoder.encode('', True), '')
       
    89 
       
    90     def test_stateful_keep_buffer(self):
       
    91         encoder = codecs.getincrementalencoder('jisx0213')()
       
    92         self.assertEqual(encoder.encode(u'\u00e6'), '')
       
    93         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
       
    94         self.assertEqual(encoder.encode(u'\u0300\u00e6'), '\xab\xc4')
       
    95         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
       
    96         self.assertEqual(encoder.reset(), None)
       
    97         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
       
    98         self.assertEqual(encoder.encode(u'\u00e6'), '')
       
    99         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
       
   100         self.assertEqual(encoder.encode(u'', True), '\xa9\xdc')
       
   101 
       
   102 
       
   103 class Test_IncrementalDecoder(unittest.TestCase):
       
   104 
       
   105     def test_dbcs(self):
       
   106         # cp949 decoder is simple with only 1 or 2 bytes sequences.
       
   107         decoder = codecs.getincrementaldecoder('cp949')()
       
   108         self.assertEqual(decoder.decode('\xc6\xc4\xc0\xcc\xbd'),
       
   109                          u'\ud30c\uc774')
       
   110         self.assertEqual(decoder.decode('\xe3 \xb8\xb6\xc0\xbb'),
       
   111                          u'\uc36c \ub9c8\uc744')
       
   112         self.assertEqual(decoder.decode(''), u'')
       
   113 
       
   114     def test_dbcs_keep_buffer(self):
       
   115         decoder = codecs.getincrementaldecoder('cp949')()
       
   116         self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
       
   117         self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
       
   118         self.assertEqual(decoder.decode('\xcc'), u'\uc774')
       
   119 
       
   120         self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
       
   121         self.assertRaises(UnicodeDecodeError, decoder.decode, '\xcc\xbd', True)
       
   122         self.assertEqual(decoder.decode('\xcc'), u'\uc774')
       
   123 
       
   124     def test_iso2022(self):
       
   125         decoder = codecs.getincrementaldecoder('iso2022-jp')()
       
   126         ESC = '\x1b'
       
   127         self.assertEqual(decoder.decode(ESC + '('), u'')
       
   128         self.assertEqual(decoder.decode('B', True), u'')
       
   129         self.assertEqual(decoder.decode(ESC + '$'), u'')
       
   130         self.assertEqual(decoder.decode('B@$'), u'\u4e16')
       
   131         self.assertEqual(decoder.decode('@$@'), u'\u4e16')
       
   132         self.assertEqual(decoder.decode('$', True), u'\u4e16')
       
   133         self.assertEqual(decoder.reset(), None)
       
   134         self.assertEqual(decoder.decode('@$'), u'@$')
       
   135         self.assertEqual(decoder.decode(ESC + '$'), u'')
       
   136         self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
       
   137         self.assertEqual(decoder.decode('B@$'), u'\u4e16')
       
   138 
       
   139 class Test_StreamReader(unittest.TestCase):
       
   140     def test_bug1728403(self):
       
   141         try:
       
   142             open(TESTFN, 'w').write('\xa1')
       
   143             f = codecs.open(TESTFN, encoding='cp949')
       
   144             self.assertRaises(UnicodeDecodeError, f.read, 2)
       
   145         finally:
       
   146             try: f.close()
       
   147             except: pass
       
   148             os.unlink(TESTFN)
       
   149 
       
   150 class Test_StreamWriter(unittest.TestCase):
       
   151     if len(u'\U00012345') == 2: # UCS2
       
   152         def test_gb18030(self):
       
   153             s= StringIO.StringIO()
       
   154             c = codecs.getwriter('gb18030')(s)
       
   155             c.write(u'123')
       
   156             self.assertEqual(s.getvalue(), '123')
       
   157             c.write(u'\U00012345')
       
   158             self.assertEqual(s.getvalue(), '123\x907\x959')
       
   159             c.write(u'\U00012345'[0])
       
   160             self.assertEqual(s.getvalue(), '123\x907\x959')
       
   161             c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
       
   162             self.assertEqual(s.getvalue(),
       
   163                     '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
       
   164             c.write(u'\U00012345'[0])
       
   165             self.assertEqual(s.getvalue(),
       
   166                     '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
       
   167             self.assertRaises(UnicodeError, c.reset)
       
   168             self.assertEqual(s.getvalue(),
       
   169                     '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
       
   170 
       
   171         def test_utf_8(self):
       
   172             s= StringIO.StringIO()
       
   173             c = codecs.getwriter('utf-8')(s)
       
   174             c.write(u'123')
       
   175             self.assertEqual(s.getvalue(), '123')
       
   176             c.write(u'\U00012345')
       
   177             self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
       
   178 
       
   179             # Python utf-8 codec can't buffer surrogate pairs yet.
       
   180             if 0:
       
   181                 c.write(u'\U00012345'[0])
       
   182                 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
       
   183                 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
       
   184                 self.assertEqual(s.getvalue(),
       
   185                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
       
   186                     '\xea\xb0\x80\xc2\xac')
       
   187                 c.write(u'\U00012345'[0])
       
   188                 self.assertEqual(s.getvalue(),
       
   189                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
       
   190                     '\xea\xb0\x80\xc2\xac')
       
   191                 c.reset()
       
   192                 self.assertEqual(s.getvalue(),
       
   193                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
       
   194                     '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
       
   195                 c.write(u'\U00012345'[1])
       
   196                 self.assertEqual(s.getvalue(),
       
   197                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
       
   198                     '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')
       
   199 
       
   200     else: # UCS4
       
   201         pass
       
   202 
       
   203     def test_streamwriter_strwrite(self):
       
   204         s = StringIO.StringIO()
       
   205         wr = codecs.getwriter('gb18030')(s)
       
   206         wr.write('abcd')
       
   207         self.assertEqual(s.getvalue(), 'abcd')
       
   208 
       
   209 class Test_ISO2022(unittest.TestCase):
       
   210     def test_g2(self):
       
   211         iso2022jp2 = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
       
   212         uni = u':hu4:unit\xe9 de famille'
       
   213         self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
       
   214 
       
   215     def test_iso2022_jp_g0(self):
       
   216         self.failIf('\x0e' in u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
       
   217         for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
       
   218             e = u'\u3406'.encode(encoding)
       
   219             self.failIf(filter(lambda x: x >= '\x80', e))
       
   220 
       
   221     def test_bug1572832(self):
       
   222         if sys.maxunicode >= 0x10000:
       
   223             myunichr = unichr
       
   224         else:
       
   225             myunichr = lambda x: unichr(0xD7C0+(x>>10)) + unichr(0xDC00+(x&0x3FF))
       
   226 
       
   227         for x in xrange(0x10000, 0x110000):
       
   228             # Any ISO 2022 codec will cause the segfault
       
   229             myunichr(x).encode('iso_2022_jp', 'ignore')
       
   230 
       
   231 def test_main():
       
   232     suite = unittest.TestSuite()
       
   233     suite.addTest(unittest.makeSuite(Test_MultibyteCodec))
       
   234     suite.addTest(unittest.makeSuite(Test_IncrementalEncoder))
       
   235     suite.addTest(unittest.makeSuite(Test_IncrementalDecoder))
       
   236     suite.addTest(unittest.makeSuite(Test_StreamReader))
       
   237     suite.addTest(unittest.makeSuite(Test_StreamWriter))
       
   238     suite.addTest(unittest.makeSuite(Test_ISO2022))
       
   239     test_support.run_suite(suite)
       
   240 
       
   241 if __name__ == "__main__":
       
   242     test_main()