symbian-qemu-0.9.1-12/python-2.6.1/Lib/test/test_multibytecodec.py
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 #!/usr/bin/env python
       
     2 #
       
     3 # test_multibytecodec.py
       
     4 #   Unit test for multibytecodec itself
       
     5 #
       
     6 
       
     7 from test import test_support
       
     8 from test import test_multibytecodec_support
       
     9 from test.test_support import TESTFN
       
    10 import unittest, StringIO, codecs, sys, os
       
    11 import _multibytecodec
       
    12 
       
    13 ALL_CJKENCODINGS = [
       
    14 # _codecs_cn
       
    15     'gb2312', 'gbk', 'gb18030', 'hz',
       
    16 # _codecs_hk
       
    17     'big5hkscs',
       
    18 # _codecs_jp
       
    19     'cp932', 'shift_jis', 'euc_jp', 'euc_jisx0213', 'shift_jisx0213',
       
    20     'euc_jis_2004', 'shift_jis_2004',
       
    21 # _codecs_kr
       
    22     'cp949', 'euc_kr', 'johab',
       
    23 # _codecs_tw
       
    24     'big5', 'cp950',
       
    25 # _codecs_iso2022
       
    26     'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
       
    27     'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr',
       
    28 ]
       
    29 
       
    30 class Test_MultibyteCodec(unittest.TestCase):
       
    31 
       
    32     def test_nullcoding(self):
       
    33         for enc in ALL_CJKENCODINGS:
       
    34             self.assertEqual(''.decode(enc), u'')
       
    35             self.assertEqual(unicode('', enc), u'')
       
    36             self.assertEqual(u''.encode(enc), '')
       
    37 
       
    38     def test_str_decode(self):
       
    39         for enc in ALL_CJKENCODINGS:
       
    40             self.assertEqual('abcd'.encode(enc), 'abcd')
       
    41 
       
    42     def test_errorcallback_longindex(self):
       
    43         dec = codecs.getdecoder('euc-kr')
       
    44         myreplace  = lambda exc: (u'', sys.maxint+1)
       
    45         codecs.register_error('test.cjktest', myreplace)
       
    46         self.assertRaises(IndexError, dec,
       
    47                           'apple\x92ham\x93spam', 'test.cjktest')
       
    48 
       
    49     def test_codingspec(self):
       
    50         try:
       
    51             for enc in ALL_CJKENCODINGS:
       
    52                 print >> open(TESTFN, 'w'), '# coding:', enc
       
    53                 exec open(TESTFN)
       
    54         finally:
       
    55             os.unlink(TESTFN)
       
    56 
       
    57     def test_init_segfault(self):
       
    58         # bug #3305: this used to segfault
       
    59         self.assertRaises(AttributeError,
       
    60                           _multibytecodec.MultibyteStreamReader, None)
       
    61         self.assertRaises(AttributeError,
       
    62                           _multibytecodec.MultibyteStreamWriter, None)
       
    63 
       
    64 
       
    65 class Test_IncrementalEncoder(unittest.TestCase):
       
    66 
       
    67     def test_stateless(self):
       
    68         # cp949 encoder isn't stateful at all.
       
    69         encoder = codecs.getincrementalencoder('cp949')()
       
    70         self.assertEqual(encoder.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744'),
       
    71                          '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
       
    72         self.assertEqual(encoder.reset(), None)
       
    73         self.assertEqual(encoder.encode(u'\u2606\u223c\u2606', True),
       
    74                          '\xa1\xd9\xa1\xad\xa1\xd9')
       
    75         self.assertEqual(encoder.reset(), None)
       
    76         self.assertEqual(encoder.encode(u'', True), '')
       
    77         self.assertEqual(encoder.encode(u'', False), '')
       
    78         self.assertEqual(encoder.reset(), None)
       
    79 
       
    80     def test_stateful(self):
       
    81         # jisx0213 encoder is stateful for a few codepoints. eg)
       
    82         #   U+00E6 => A9DC
       
    83         #   U+00E6 U+0300 => ABC4
       
    84         #   U+0300 => ABDC
       
    85 
       
    86         encoder = codecs.getincrementalencoder('jisx0213')()
       
    87         self.assertEqual(encoder.encode(u'\u00e6\u0300'), '\xab\xc4')
       
    88         self.assertEqual(encoder.encode(u'\u00e6'), '')
       
    89         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xc4')
       
    90         self.assertEqual(encoder.encode(u'\u00e6', True), '\xa9\xdc')
       
    91 
       
    92         self.assertEqual(encoder.reset(), None)
       
    93         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
       
    94 
       
    95         self.assertEqual(encoder.encode(u'\u00e6'), '')
       
    96         self.assertEqual(encoder.encode('', True), '\xa9\xdc')
       
    97         self.assertEqual(encoder.encode('', True), '')
       
    98 
       
    99     def test_stateful_keep_buffer(self):
       
   100         encoder = codecs.getincrementalencoder('jisx0213')()
       
   101         self.assertEqual(encoder.encode(u'\u00e6'), '')
       
   102         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
       
   103         self.assertEqual(encoder.encode(u'\u0300\u00e6'), '\xab\xc4')
       
   104         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
       
   105         self.assertEqual(encoder.reset(), None)
       
   106         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
       
   107         self.assertEqual(encoder.encode(u'\u00e6'), '')
       
   108         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
       
   109         self.assertEqual(encoder.encode(u'', True), '\xa9\xdc')
       
   110 
       
   111 
       
   112 class Test_IncrementalDecoder(unittest.TestCase):
       
   113 
       
   114     def test_dbcs(self):
       
   115         # cp949 decoder is simple with only 1 or 2 bytes sequences.
       
   116         decoder = codecs.getincrementaldecoder('cp949')()
       
   117         self.assertEqual(decoder.decode('\xc6\xc4\xc0\xcc\xbd'),
       
   118                          u'\ud30c\uc774')
       
   119         self.assertEqual(decoder.decode('\xe3 \xb8\xb6\xc0\xbb'),
       
   120                          u'\uc36c \ub9c8\uc744')
       
   121         self.assertEqual(decoder.decode(''), u'')
       
   122 
       
   123     def test_dbcs_keep_buffer(self):
       
   124         decoder = codecs.getincrementaldecoder('cp949')()
       
   125         self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
       
   126         self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
       
   127         self.assertEqual(decoder.decode('\xcc'), u'\uc774')
       
   128 
       
   129         self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
       
   130         self.assertRaises(UnicodeDecodeError, decoder.decode, '\xcc\xbd', True)
       
   131         self.assertEqual(decoder.decode('\xcc'), u'\uc774')
       
   132 
       
   133     def test_iso2022(self):
       
   134         decoder = codecs.getincrementaldecoder('iso2022-jp')()
       
   135         ESC = '\x1b'
       
   136         self.assertEqual(decoder.decode(ESC + '('), u'')
       
   137         self.assertEqual(decoder.decode('B', True), u'')
       
   138         self.assertEqual(decoder.decode(ESC + '$'), u'')
       
   139         self.assertEqual(decoder.decode('B@$'), u'\u4e16')
       
   140         self.assertEqual(decoder.decode('@$@'), u'\u4e16')
       
   141         self.assertEqual(decoder.decode('$', True), u'\u4e16')
       
   142         self.assertEqual(decoder.reset(), None)
       
   143         self.assertEqual(decoder.decode('@$'), u'@$')
       
   144         self.assertEqual(decoder.decode(ESC + '$'), u'')
       
   145         self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
       
   146         self.assertEqual(decoder.decode('B@$'), u'\u4e16')
       
   147 
       
   148 class Test_StreamReader(unittest.TestCase):
       
   149     def test_bug1728403(self):
       
   150         try:
       
   151             open(TESTFN, 'w').write('\xa1')
       
   152             f = codecs.open(TESTFN, encoding='cp949')
       
   153             self.assertRaises(UnicodeDecodeError, f.read, 2)
       
   154         finally:
       
   155             try: f.close()
       
   156             except: pass
       
   157             os.unlink(TESTFN)
       
   158 
       
   159 class Test_StreamWriter(unittest.TestCase):
       
   160     if len(u'\U00012345') == 2: # UCS2
       
   161         def test_gb18030(self):
       
   162             s = StringIO.StringIO()
       
   163             c = codecs.getwriter('gb18030')(s)
       
   164             c.write(u'123')
       
   165             self.assertEqual(s.getvalue(), '123')
       
   166             c.write(u'\U00012345')
       
   167             self.assertEqual(s.getvalue(), '123\x907\x959')
       
   168             c.write(u'\U00012345'[0])
       
   169             self.assertEqual(s.getvalue(), '123\x907\x959')
       
   170             c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
       
   171             self.assertEqual(s.getvalue(),
       
   172                     '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
       
   173             c.write(u'\U00012345'[0])
       
   174             self.assertEqual(s.getvalue(),
       
   175                     '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
       
   176             self.assertRaises(UnicodeError, c.reset)
       
   177             self.assertEqual(s.getvalue(),
       
   178                     '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
       
   179 
       
   180         def test_utf_8(self):
       
   181             s= StringIO.StringIO()
       
   182             c = codecs.getwriter('utf-8')(s)
       
   183             c.write(u'123')
       
   184             self.assertEqual(s.getvalue(), '123')
       
   185             c.write(u'\U00012345')
       
   186             self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
       
   187 
       
   188             # Python utf-8 codec can't buffer surrogate pairs yet.
       
   189             if 0:
       
   190                 c.write(u'\U00012345'[0])
       
   191                 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
       
   192                 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
       
   193                 self.assertEqual(s.getvalue(),
       
   194                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
       
   195                     '\xea\xb0\x80\xc2\xac')
       
   196                 c.write(u'\U00012345'[0])
       
   197                 self.assertEqual(s.getvalue(),
       
   198                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
       
   199                     '\xea\xb0\x80\xc2\xac')
       
   200                 c.reset()
       
   201                 self.assertEqual(s.getvalue(),
       
   202                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
       
   203                     '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
       
   204                 c.write(u'\U00012345'[1])
       
   205                 self.assertEqual(s.getvalue(),
       
   206                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
       
   207                     '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')
       
   208 
       
   209     else: # UCS4
       
   210         pass
       
   211 
       
   212     def test_streamwriter_strwrite(self):
       
   213         s = StringIO.StringIO()
       
   214         wr = codecs.getwriter('gb18030')(s)
       
   215         wr.write('abcd')
       
   216         self.assertEqual(s.getvalue(), 'abcd')
       
   217 
       
   218 class Test_ISO2022(unittest.TestCase):
       
   219     def test_g2(self):
       
   220         iso2022jp2 = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
       
   221         uni = u':hu4:unit\xe9 de famille'
       
   222         self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
       
   223 
       
   224     def test_iso2022_jp_g0(self):
       
   225         self.failIf('\x0e' in u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
       
   226         for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
       
   227             e = u'\u3406'.encode(encoding)
       
   228             self.failIf(filter(lambda x: x >= '\x80', e))
       
   229 
       
   230     def test_bug1572832(self):
       
   231         if sys.maxunicode >= 0x10000:
       
   232             myunichr = unichr
       
   233         else:
       
   234             myunichr = lambda x: unichr(0xD7C0+(x>>10)) + unichr(0xDC00+(x&0x3FF))
       
   235 
       
   236         for x in xrange(0x10000, 0x110000):
       
   237             # Any ISO 2022 codec will cause the segfault
       
   238             myunichr(x).encode('iso_2022_jp', 'ignore')
       
   239 
       
   240 def test_main():
       
   241     test_support.run_unittest(__name__)
       
   242 
       
   243 if __name__ == "__main__":
       
   244     test_main()