symbian-qemu-0.9.1-12/python-2.6.1/Lib/test/test_codecs.py
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 from test import test_support
       
     2 import unittest
       
     3 import codecs
       
     4 import sys, StringIO, _testcapi
       
     5 
       
     6 class Queue(object):
       
     7     """
       
     8     queue: write bytes at one end, read bytes from the other end
       
     9     """
       
    10     def __init__(self):
       
    11         self._buffer = ""
       
    12 
       
    13     def write(self, chars):
       
    14         self._buffer += chars
       
    15 
       
    16     def read(self, size=-1):
       
    17         if size<0:
       
    18             s = self._buffer
       
    19             self._buffer = ""
       
    20             return s
       
    21         else:
       
    22             s = self._buffer[:size]
       
    23             self._buffer = self._buffer[size:]
       
    24             return s
       
    25 
       
class ReadTest(unittest.TestCase):
    # Mixin-style base class: concrete subclasses set an ``encoding``
    # attribute and inherit these StreamReader / IncrementalDecoder checks.
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        # Exercise StreamReader.readline() across all recognized line ends
        # (\n, \r\n, \r, and U+2028 LINE SEPARATOR), with and without
        # keepends, and with a size hint forcing multiple internal reads.
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            # NOTE(review): u"\3042" is octal escape \304 followed by the
            # character "2", probably intended as u"\u3042" (HIRAGANA A).
            # The test is self-consistent either way — confirm upstream.
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        # Regression test: iterating a StreamReader line by line must
        # reproduce exactly the lines that were joined and encoded below.
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # readline() on a reader whose underlying stream grows between
        # calls: a trailing \r must be returned immediately, and the
        # following \n must then come back as its own (empty) line.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        # Regression test: long lines must survive readline()'s internal
        # buffering without dropping or duplicating data.
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        # Companion regression test with several short lines around the
        # buffer boundary.
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
       
   246 
       
   247 class UTF32Test(ReadTest):
       
   248     encoding = "utf-32"
       
   249 
       
   250     spamle = ('\xff\xfe\x00\x00'
       
   251               's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
       
   252               's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
       
   253     spambe = ('\x00\x00\xfe\xff'
       
   254               '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
       
   255               '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
       
   256 
       
   257     def test_only_one_bom(self):
       
   258         _,_,reader,writer = codecs.lookup(self.encoding)
       
   259         # encode some stream
       
   260         s = StringIO.StringIO()
       
   261         f = writer(s)
       
   262         f.write(u"spam")
       
   263         f.write(u"spam")
       
   264         d = s.getvalue()
       
   265         # check whether there is exactly one BOM in it
       
   266         self.assert_(d == self.spamle or d == self.spambe)
       
   267         # try to read it back
       
   268         s = StringIO.StringIO(d)
       
   269         f = reader(s)
       
   270         self.assertEquals(f.read(), u"spamspam")
       
   271 
       
   272     def test_badbom(self):
       
   273         s = StringIO.StringIO(4*"\xff")
       
   274         f = codecs.getreader(self.encoding)(s)
       
   275         self.assertRaises(UnicodeError, f.read)
       
   276 
       
   277         s = StringIO.StringIO(8*"\xff")
       
   278         f = codecs.getreader(self.encoding)(s)
       
   279         self.assertRaises(UnicodeError, f.read)
       
   280 
       
   281     def test_partial(self):
       
   282         self.check_partial(
       
   283             u"\x00\xff\u0100\uffff",
       
   284             [
       
   285                 u"", # first byte of BOM read
       
   286                 u"", # second byte of BOM read
       
   287                 u"", # third byte of BOM read
       
   288                 u"", # fourth byte of BOM read => byteorder known
       
   289                 u"",
       
   290                 u"",
       
   291                 u"",
       
   292                 u"\x00",
       
   293                 u"\x00",
       
   294                 u"\x00",
       
   295                 u"\x00",
       
   296                 u"\x00\xff",
       
   297                 u"\x00\xff",
       
   298                 u"\x00\xff",
       
   299                 u"\x00\xff",
       
   300                 u"\x00\xff\u0100",
       
   301                 u"\x00\xff\u0100",
       
   302                 u"\x00\xff\u0100",
       
   303                 u"\x00\xff\u0100",
       
   304                 u"\x00\xff\u0100\uffff",
       
   305             ]
       
   306         )
       
   307 
       
   308     def test_errors(self):
       
   309         self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
       
   310                           "\xff", "strict", True)
       
   311 
       
class UTF32LETest(ReadTest):
    # UTF-32-LE has a fixed byte order: no BOM is consumed or produced.
    encoding = "utf-32-le"

    def test_partial(self):
        # Output appears only once each complete 4-byte unit arrives.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        # A non-BMP code point encodes to four little-endian bytes.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        # A truncated final chunk must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)
       
   344 
       
class UTF32BETest(ReadTest):
    # UTF-32-BE has a fixed byte order: no BOM is consumed or produced.
    encoding = "utf-32-be"

    def test_partial(self):
        # Output appears only once each complete 4-byte unit arrives.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        # A non-BMP code point encodes to four big-endian bytes.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        # A truncated final chunk must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)
       
   377 
       
   378 class UTF16Test(ReadTest):
       
   379     encoding = "utf-16"
       
   380 
       
   381     spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
       
   382     spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
       
   383 
       
   384     def test_only_one_bom(self):
       
   385         _,_,reader,writer = codecs.lookup(self.encoding)
       
   386         # encode some stream
       
   387         s = StringIO.StringIO()
       
   388         f = writer(s)
       
   389         f.write(u"spam")
       
   390         f.write(u"spam")
       
   391         d = s.getvalue()
       
   392         # check whether there is exactly one BOM in it
       
   393         self.assert_(d == self.spamle or d == self.spambe)
       
   394         # try to read it back
       
   395         s = StringIO.StringIO(d)
       
   396         f = reader(s)
       
   397         self.assertEquals(f.read(), u"spamspam")
       
   398 
       
   399     def test_badbom(self):
       
   400         s = StringIO.StringIO("\xff\xff")
       
   401         f = codecs.getreader(self.encoding)(s)
       
   402         self.assertRaises(UnicodeError, f.read)
       
   403 
       
   404         s = StringIO.StringIO("\xff\xff\xff\xff")
       
   405         f = codecs.getreader(self.encoding)(s)
       
   406         self.assertRaises(UnicodeError, f.read)
       
   407 
       
   408     def test_partial(self):
       
   409         self.check_partial(
       
   410             u"\x00\xff\u0100\uffff",
       
   411             [
       
   412                 u"", # first byte of BOM read
       
   413                 u"", # second byte of BOM read => byteorder known
       
   414                 u"",
       
   415                 u"\x00",
       
   416                 u"\x00",
       
   417                 u"\x00\xff",
       
   418                 u"\x00\xff",
       
   419                 u"\x00\xff\u0100",
       
   420                 u"\x00\xff\u0100",
       
   421                 u"\x00\xff\u0100\uffff",
       
   422             ]
       
   423         )
       
   424 
       
   425     def test_errors(self):
       
   426         self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
       
   427 
       
class UTF16LETest(ReadTest):
    # UTF-16-LE has a fixed byte order: no BOM is consumed or produced.
    encoding = "utf-16-le"

    def test_partial(self):
        # Output appears only once each complete 2-byte unit arrives.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A truncated final chunk must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)
       
   448 
       
class UTF16BETest(ReadTest):
    # UTF-16-BE has a fixed byte order: no BOM is consumed or produced.
    encoding = "utf-16-be"

    def test_partial(self):
        # Output appears only once each complete 2-byte unit arrives.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A truncated final chunk must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)
       
   469 
       
class UTF8Test(ReadTest):
    # Variable-length encoding: 1 to 3 bytes per BMP code point here.
    encoding = "utf-8"

    def test_partial(self):
        # Each character appears only once its final byte arrives.
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )
       
   490 
       
class UTF7Test(ReadTest):
    # UTF-7 encodes u"a+-b" as "a+--b": "+" is escaped as "+-".
    encoding = "utf-7"

    def test_partial(self):
        # Expected prefixes after each of the five encoded bytes.
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )
       
   505 
       
   506 class UTF16ExTest(unittest.TestCase):
       
   507 
       
   508     def test_errors(self):
       
   509         self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
       
   510 
       
   511     def test_bad_args(self):
       
   512         self.assertRaises(TypeError, codecs.utf_16_ex_decode)
       
   513 
       
   514 class ReadBufferTest(unittest.TestCase):
       
   515 
       
   516     def test_array(self):
       
   517         import array
       
   518         self.assertEqual(
       
   519             codecs.readbuffer_encode(array.array("c", "spam")),
       
   520             ("spam", 4)
       
   521         )
       
   522 
       
   523     def test_empty(self):
       
   524         self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
       
   525 
       
   526     def test_bad_args(self):
       
   527         self.assertRaises(TypeError, codecs.readbuffer_encode)
       
   528         self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
       
   529 
       
   530 class CharBufferTest(unittest.TestCase):
       
   531 
       
   532     def test_string(self):
       
   533         self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
       
   534 
       
   535     def test_empty(self):
       
   536         self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
       
   537 
       
   538     def test_bad_args(self):
       
   539         self.assertRaises(TypeError, codecs.charbuffer_encode)
       
   540         self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
       
   541 
       
class UTF8SigTest(ReadTest):
    # "utf-8-sig" skips a leading UTF-8 BOM on decode and writes one on
    # encode; only the FIRST BOM in a stream is treated as a signature.
    encoding = "utf-8-sig"

    def test_partial(self):
        # Input deliberately starts with TWO BOMs: the first is skipped,
        # the second must come through as a real U+FEFF character.
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        # The incremental decoder must strip the BOM produced by encode.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        # Reading a BOM-prefixed stream in chunks of various sizes must
        # always yield the string without the BOM.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        # Same as test_stream_bom but with no BOM present: the data must
        # pass through unchanged for every chunk size.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)
       
   621 
       
   622 class EscapeDecodeTest(unittest.TestCase):
       
   623     def test_empty(self):
       
   624         self.assertEquals(codecs.escape_decode(""), ("", 0))
       
   625 
       
   626 class RecodingTest(unittest.TestCase):
       
   627     def test_recoding(self):
       
   628         f = StringIO.StringIO()
       
   629         f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
       
   630         f2.write(u"a")
       
   631         f2.close()
       
   632         # Python used to crash on this at exit because of a refcount
       
   633         # bug in _codecsmodule.c
       
   634 
       
# From RFC 3492, section 7.1: (unicode, punycode) sample pairs.
# Each entry is (decoded unicode string, expected punycode byte string);
# PunycodeTest below round-trips every pair in both directions.
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
    u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
    u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
    u"\u0939\u0948\u0902",
    "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
    u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

     # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]
       
   738 
       
   739 for i in punycode_testcases:
       
   740     if len(i)!=2:
       
   741         print repr(i)
       
   742 
       
   743 class PunycodeTest(unittest.TestCase):
       
   744     def test_encode(self):
       
   745         for uni, puny in punycode_testcases:
       
   746             # Need to convert both strings to lower case, since
       
   747             # some of the extended encodings use upper case, but our
       
   748             # code produces only lower case. Converting just puny to
       
   749             # lower is also insufficient, since some of the input characters
       
   750             # are upper case.
       
   751             self.assertEquals(uni.encode("punycode").lower(), puny.lower())
       
   752 
       
   753     def test_decode(self):
       
   754         for uni, puny in punycode_testcases:
       
   755             self.assertEquals(uni, puny.decode("punycode"))
       
   756 
       
   757 class UnicodeInternalTest(unittest.TestCase):
       
   758     def test_bug1251300(self):
       
   759         # Decoding with unicode_internal used to not correctly handle "code
       
   760         # points" above 0x10ffff on UCS-4 builds.
       
   761         if sys.maxunicode > 0xffff:
       
   762             ok = [
       
   763                 ("\x00\x10\xff\xff", u"\U0010ffff"),
       
   764                 ("\x00\x00\x01\x01", u"\U00000101"),
       
   765                 ("", u""),
       
   766             ]
       
   767             not_ok = [
       
   768                 "\x7f\xff\xff\xff",
       
   769                 "\x80\x00\x00\x00",
       
   770                 "\x81\x00\x00\x00",
       
   771                 "\x00",
       
   772                 "\x00\x00\x00\x00\x00",
       
   773             ]
       
   774             for internal, uni in ok:
       
   775                 if sys.byteorder == "little":
       
   776                     internal = "".join(reversed(internal))
       
   777                 self.assertEquals(uni, internal.decode("unicode_internal"))
       
   778             for internal in not_ok:
       
   779                 if sys.byteorder == "little":
       
   780                     internal = "".join(reversed(internal))
       
   781                 self.assertRaises(UnicodeDecodeError, internal.decode,
       
   782                     "unicode_internal")
       
   783 
       
   784     def test_decode_error_attributes(self):
       
   785         if sys.maxunicode > 0xffff:
       
   786             try:
       
   787                 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
       
   788             except UnicodeDecodeError, ex:
       
   789                 self.assertEquals("unicode_internal", ex.encoding)
       
   790                 self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
       
   791                 self.assertEquals(4, ex.start)
       
   792                 self.assertEquals(8, ex.end)
       
   793             else:
       
   794                 self.fail()
       
   795 
       
   796     def test_decode_callback(self):
       
   797         if sys.maxunicode > 0xffff:
       
   798             codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
       
   799             decoder = codecs.getdecoder("unicode_internal")
       
   800             ab = u"ab".encode("unicode_internal")
       
   801             ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
       
   802                 "UnicodeInternalTest")
       
   803             self.assertEquals((u"ab", 12), ignored)
       
   804 
       
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected) as UTF-8 byte strings.  An input of None
# marks a vector skipped here; an expected value of None means the input
# contains prohibited characters and nameprep must raise (see NameprepTest).
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
       
   957 
       
   958 
       
   959 class NameprepTest(unittest.TestCase):
       
   960     def test_nameprep(self):
       
   961         from encodings.idna import nameprep
       
   962         for pos, (orig, prepped) in enumerate(nameprep_tests):
       
   963             if orig is None:
       
   964                 # Skipped
       
   965                 continue
       
   966             # The Unicode strings are given in UTF-8
       
   967             orig = unicode(orig, "utf-8")
       
   968             if prepped is None:
       
   969                 # Input contains prohibited characters
       
   970                 self.assertRaises(UnicodeError, nameprep, orig)
       
   971             else:
       
   972                 prepped = unicode(prepped, "utf-8")
       
   973                 try:
       
   974                     self.assertEquals(nameprep(orig), prepped)
       
   975                 except Exception,e:
       
   976                     raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
       
   977 
       
   978 class IDNACodecTest(unittest.TestCase):
       
   979     def test_builtin_decode(self):
       
   980         self.assertEquals(unicode("python.org", "idna"), u"python.org")
       
   981         self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
       
   982         self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
       
   983         self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
       
   984 
       
   985     def test_builtin_encode(self):
       
   986         self.assertEquals(u"python.org".encode("idna"), "python.org")
       
   987         self.assertEquals("python.org.".encode("idna"), "python.org.")
       
   988         self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
       
   989         self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
       
   990 
       
   991     def test_stream(self):
       
   992         import StringIO
       
   993         r = codecs.getreader("idna")(StringIO.StringIO("abc"))
       
   994         r.read(3)
       
   995         self.assertEquals(r.read(), u"")
       
   996 
       
   997     def test_incremental_decode(self):
       
   998         self.assertEquals(
       
   999             "".join(codecs.iterdecode("python.org", "idna")),
       
  1000             u"python.org"
       
  1001         )
       
  1002         self.assertEquals(
       
  1003             "".join(codecs.iterdecode("python.org.", "idna")),
       
  1004             u"python.org."
       
  1005         )
       
  1006         self.assertEquals(
       
  1007             "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
       
  1008             u"pyth\xf6n.org."
       
  1009         )
       
  1010         self.assertEquals(
       
  1011             "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
       
  1012             u"pyth\xf6n.org."
       
  1013         )
       
  1014 
       
  1015         decoder = codecs.getincrementaldecoder("idna")()
       
  1016         self.assertEquals(decoder.decode("xn--xam", ), u"")
       
  1017         self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
       
  1018         self.assertEquals(decoder.decode(u"rg"), u"")
       
  1019         self.assertEquals(decoder.decode(u"", True), u"org")
       
  1020 
       
  1021         decoder.reset()
       
  1022         self.assertEquals(decoder.decode("xn--xam", ), u"")
       
  1023         self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
       
  1024         self.assertEquals(decoder.decode("rg."), u"org.")
       
  1025         self.assertEquals(decoder.decode("", True), u"")
       
  1026 
       
  1027     def test_incremental_encode(self):
       
  1028         self.assertEquals(
       
  1029             "".join(codecs.iterencode(u"python.org", "idna")),
       
  1030             "python.org"
       
  1031         )
       
  1032         self.assertEquals(
       
  1033             "".join(codecs.iterencode(u"python.org.", "idna")),
       
  1034             "python.org."
       
  1035         )
       
  1036         self.assertEquals(
       
  1037             "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
       
  1038             "xn--pythn-mua.org."
       
  1039         )
       
  1040         self.assertEquals(
       
  1041             "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
       
  1042             "xn--pythn-mua.org."
       
  1043         )
       
  1044 
       
  1045         encoder = codecs.getincrementalencoder("idna")()
       
  1046         self.assertEquals(encoder.encode(u"\xe4x"), "")
       
  1047         self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
       
  1048         self.assertEquals(encoder.encode(u"", True), "org")
       
  1049 
       
  1050         encoder.reset()
       
  1051         self.assertEquals(encoder.encode(u"\xe4x"), "")
       
  1052         self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
       
  1053         self.assertEquals(encoder.encode(u"", True), "")
       
  1054 
       
  1055 class CodecsModuleTest(unittest.TestCase):
       
  1056 
       
  1057     def test_decode(self):
       
  1058         self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
       
  1059                           u'\xe4\xf6\xfc')
       
  1060         self.assertRaises(TypeError, codecs.decode)
       
  1061         self.assertEquals(codecs.decode('abc'), u'abc')
       
  1062         self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
       
  1063 
       
  1064     def test_encode(self):
       
  1065         self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
       
  1066                           '\xe4\xf6\xfc')
       
  1067         self.assertRaises(TypeError, codecs.encode)
       
  1068         self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
       
  1069         self.assertEquals(codecs.encode(u'abc'), 'abc')
       
  1070         self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
       
  1071 
       
  1072     def test_register(self):
       
  1073         self.assertRaises(TypeError, codecs.register)
       
  1074         self.assertRaises(TypeError, codecs.register, 42)
       
  1075 
       
  1076     def test_lookup(self):
       
  1077         self.assertRaises(TypeError, codecs.lookup)
       
  1078         self.assertRaises(LookupError, codecs.lookup, "__spam__")
       
  1079         self.assertRaises(LookupError, codecs.lookup, " ")
       
  1080 
       
  1081     def test_getencoder(self):
       
  1082         self.assertRaises(TypeError, codecs.getencoder)
       
  1083         self.assertRaises(LookupError, codecs.getencoder, "__spam__")
       
  1084 
       
  1085     def test_getdecoder(self):
       
  1086         self.assertRaises(TypeError, codecs.getdecoder)
       
  1087         self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
       
  1088 
       
  1089     def test_getreader(self):
       
  1090         self.assertRaises(TypeError, codecs.getreader)
       
  1091         self.assertRaises(LookupError, codecs.getreader, "__spam__")
       
  1092 
       
  1093     def test_getwriter(self):
       
  1094         self.assertRaises(TypeError, codecs.getwriter)
       
  1095         self.assertRaises(LookupError, codecs.getwriter, "__spam__")
       
  1096 
       
  1097 class StreamReaderTest(unittest.TestCase):
       
  1098 
       
  1099     def setUp(self):
       
  1100         self.reader = codecs.getreader('utf-8')
       
  1101         self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
       
  1102 
       
  1103     def test_readlines(self):
       
  1104         f = self.reader(self.stream)
       
  1105         self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])
       
  1106 
       
  1107 class EncodedFileTest(unittest.TestCase):
       
  1108 
       
  1109     def test_basic(self):
       
  1110         f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
       
  1111         ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
       
  1112         self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')
       
  1113 
       
  1114         f = StringIO.StringIO()
       
  1115         ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
       
  1116         ef.write('\xc3\xbc')
       
  1117         self.assertEquals(f.getvalue(), '\xfc')
       
  1118 
       
  1119 class Str2StrTest(unittest.TestCase):
       
  1120 
       
  1121     def test_read(self):
       
  1122         sin = "\x80".encode("base64_codec")
       
  1123         reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
       
  1124         sout = reader.read()
       
  1125         self.assertEqual(sout, "\x80")
       
  1126         self.assert_(isinstance(sout, str))
       
  1127 
       
  1128     def test_readline(self):
       
  1129         sin = "\x80".encode("base64_codec")
       
  1130         reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
       
  1131         sout = reader.readline()
       
  1132         self.assertEqual(sout, "\x80")
       
  1133         self.assert_(isinstance(sout, str))
       
  1134 
       
# All unicode-capable codecs to be round-tripped by BasicUnicodeTest.
# Entries whose name ends in "_codec" (and "latin_1") are special-cased
# when the lookup name is compared in test_basics.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]
       
  1237 
       
# mbcs is not always compiled in; include it only when the running
# interpreter actually provides the codec.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# Copy (not alias) so the two lists can be extended independently below.
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# bz2/zlib are optional build-time features; add their codecs to the test
# set only when the underlying modules import successfully.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
       
  1283 
       
class BasicUnicodeTest(unittest.TestCase):
    """Generic checks run against every codec in all_unicode_encodings."""

    def test_basics(self):
        """Round-trip a small ASCII sample through every unicode codec via
        the stateless, stream, and incremental interfaces."""
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            # lookup() returns the codec's canonical name; the two special
            # cases below undo its canonicalization (presumably lookup()
            # strips "_codec" and renames latin_1 -- that is why they need
            # adjusting before the comparison).
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            # Hyphens and underscores are treated as interchangeable.
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # Stateless round trip; the second result of an encoder/decoder
            # call is the number of input items consumed.
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                # unicode_internal reports a size that differs from len(s),
                # so it is exempted from the size check.
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # Feed the writer one character at a time, collecting the
                # bytes produced on the queue...
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                # ...then feed those bytes back one at a time through a
                # stream reader and check the original text reappears.
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes any state buffered by the encoder.
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    # Same round trip, but through the incremental coders
                    # obtained via _testcapi (exercises the C-level API).
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        # With errors="ignore" the ASCII sample should
                        # still round-trip unchanged.
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        """seek(0) on a StreamReader must reset codec state and buffers."""
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        """Decoders must raise TypeError for missing or non-string input."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                # idna and punycode are exempt from the bad-type check
                # (presumably they accept non-string input -- TODO confirm).
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders must raise TypeError when called without arguments."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        """Importing a charmap codec must not crash on its encoding table."""
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
       
  1407 
       
  1408 class BasicStrTest(unittest.TestCase):
       
  1409     def test_basics(self):
       
  1410         s = "abc123"
       
  1411         for encoding in all_string_encodings:
       
  1412             (bytes, size) = codecs.getencoder(encoding)(s)
       
  1413             self.assertEqual(size, len(s))
       
  1414             (chars, size) = codecs.getdecoder(encoding)(bytes)
       
  1415             self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
       
  1416 
       
  1417 class CharmapTest(unittest.TestCase):
       
  1418     def test_decode_with_string_map(self):
       
  1419         self.assertEquals(
       
  1420             codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
       
  1421             (u"abc", 3)
       
  1422         )
       
  1423 
       
  1424         self.assertEquals(
       
  1425             codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
       
  1426             (u"ab\ufffd", 3)
       
  1427         )
       
  1428 
       
  1429         self.assertEquals(
       
  1430             codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
       
  1431             (u"ab\ufffd", 3)
       
  1432         )
       
  1433 
       
  1434         self.assertEquals(
       
  1435             codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
       
  1436             (u"ab", 3)
       
  1437         )
       
  1438 
       
  1439         self.assertEquals(
       
  1440             codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
       
  1441             (u"ab", 3)
       
  1442         )
       
  1443 
       
  1444         allbytes = "".join(chr(i) for i in xrange(256))
       
  1445         self.assertEquals(
       
  1446             codecs.charmap_decode(allbytes, "ignore", u""),
       
  1447             (u"", len(allbytes))
       
  1448         )
       
  1449 
       
  1450 class WithStmtTest(unittest.TestCase):
       
  1451     def test_encodedfile(self):
       
  1452         f = StringIO.StringIO("\xc3\xbc")
       
  1453         with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
       
  1454             self.assertEquals(ef.read(), "\xfc")
       
  1455 
       
  1456     def test_streamreaderwriter(self):
       
  1457         f = StringIO.StringIO("\xc3\xbc")
       
  1458         info = codecs.lookup("utf-8")
       
  1459         with codecs.StreamReaderWriter(f, info.streamreader,
       
  1460                                        info.streamwriter, 'strict') as srw:
       
  1461             self.assertEquals(srw.read(), u"\xfc")
       
  1462 
       
  1463 
       
  1464 def test_main():
       
  1465     test_support.run_unittest(
       
  1466         UTF32Test,
       
  1467         UTF32LETest,
       
  1468         UTF32BETest,
       
  1469         UTF16Test,
       
  1470         UTF16LETest,
       
  1471         UTF16BETest,
       
  1472         UTF8Test,
       
  1473         UTF8SigTest,
       
  1474         UTF7Test,
       
  1475         UTF16ExTest,
       
  1476         ReadBufferTest,
       
  1477         CharBufferTest,
       
  1478         EscapeDecodeTest,
       
  1479         RecodingTest,
       
  1480         PunycodeTest,
       
  1481         UnicodeInternalTest,
       
  1482         NameprepTest,
       
  1483         IDNACodecTest,
       
  1484         CodecsModuleTest,
       
  1485         StreamReaderTest,
       
  1486         EncodedFileTest,
       
  1487         Str2StrTest,
       
  1488         BasicUnicodeTest,
       
  1489         BasicStrTest,
       
  1490         CharmapTest,
       
  1491         WithStmtTest,
       
  1492     )
       
  1493 
       
  1494 
       
# Run the full suite when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()