python-2.5.2/win32/Lib/test/test_codecs.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 from __future__ import with_statement
       
     2 from test import test_support
       
     3 import unittest
       
     4 import codecs
       
     5 import sys, StringIO, _testcapi
       
     6 
       
     7 class Queue(object):
       
     8     """
       
     9     queue: write bytes at one end, read bytes from the other end
       
    10     """
       
    11     def __init__(self):
       
    12         self._buffer = ""
       
    13 
       
    14     def write(self, chars):
       
    15         self._buffer += chars
       
    16 
       
    17     def read(self, size=-1):
       
    18         if size<0:
       
    19             s = self._buffer
       
    20             self._buffer = ""
       
    21             return s
       
    22         else:
       
    23             s = self._buffer[:size]
       
    24             self._buffer = self._buffer[size:]
       
    25             return s
       
    26 
       
    27 class ReadTest(unittest.TestCase):
       
    28     def check_partial(self, input, partialresults):
       
    29         # get a StreamReader for the encoding and feed the bytestring version
       
    30         # of input to the reader byte by byte. Read everything available from
       
    31         # the StreamReader and check that the results equal the appropriate
       
    32         # entries from partialresults.
       
    33         q = Queue()
       
    34         r = codecs.getreader(self.encoding)(q)
       
    35         result = u""
       
    36         for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
       
    37             q.write(c)
       
    38             result += r.read()
       
    39             self.assertEqual(result, partialresult)
       
    40         # check that there's nothing left in the buffers
       
    41         self.assertEqual(r.read(), u"")
       
    42         self.assertEqual(r.bytebuffer, "")
       
    43         self.assertEqual(r.charbuffer, u"")
       
    44 
       
    45         # do the check again, this time using a incremental decoder
       
    46         d = codecs.getincrementaldecoder(self.encoding)()
       
    47         result = u""
       
    48         for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
       
    49             result += d.decode(c)
       
    50             self.assertEqual(result, partialresult)
       
    51         # check that there's nothing left in the buffers
       
    52         self.assertEqual(d.decode("", True), u"")
       
    53         self.assertEqual(d.buffer, "")
       
    54 
       
    55         # Check whether the rest method works properly
       
    56         d.reset()
       
    57         result = u""
       
    58         for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
       
    59             result += d.decode(c)
       
    60             self.assertEqual(result, partialresult)
       
    61         # check that there's nothing left in the buffers
       
    62         self.assertEqual(d.decode("", True), u"")
       
    63         self.assertEqual(d.buffer, "")
       
    64 
       
    65         # check iterdecode()
       
    66         encoded = input.encode(self.encoding)
       
    67         self.assertEqual(
       
    68             input,
       
    69             u"".join(codecs.iterdecode(encoded, self.encoding))
       
    70         )
       
    71 
       
    72     def test_readline(self):
       
    73         def getreader(input):
       
    74             stream = StringIO.StringIO(input.encode(self.encoding))
       
    75             return codecs.getreader(self.encoding)(stream)
       
    76 
       
    77         def readalllines(input, keepends=True, size=None):
       
    78             reader = getreader(input)
       
    79             lines = []
       
    80             while True:
       
    81                 line = reader.readline(size=size, keepends=keepends)
       
    82                 if not line:
       
    83                     break
       
    84                 lines.append(line)
       
    85             return "|".join(lines)
       
    86 
       
    87         s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
       
    88         sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
       
    89         sexpectednoends = u"foo|bar|baz|spam|eggs"
       
    90         self.assertEqual(readalllines(s, True), sexpected)
       
    91         self.assertEqual(readalllines(s, False), sexpectednoends)
       
    92         self.assertEqual(readalllines(s, True, 10), sexpected)
       
    93         self.assertEqual(readalllines(s, False, 10), sexpectednoends)
       
    94 
       
    95         # Test long lines (multiple calls to read() in readline())
       
    96         vw = []
       
    97         vwo = []
       
    98         for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
       
    99             vw.append((i*200)*u"\3042" + lineend)
       
   100             vwo.append((i*200)*u"\3042")
       
   101         self.assertEqual(readalllines("".join(vw), True), "".join(vw))
       
   102         self.assertEqual(readalllines("".join(vw), False),"".join(vwo))
       
   103 
       
   104         # Test lines where the first read might end with \r, so the
       
   105         # reader has to look ahead whether this is a lone \r or a \r\n
       
   106         for size in xrange(80):
       
   107             for lineend in u"\n \r\n \r \u2028".split():
       
   108                 s = 10*(size*u"a" + lineend + u"xxx\n")
       
   109                 reader = getreader(s)
       
   110                 for i in xrange(10):
       
   111                     self.assertEqual(
       
   112                         reader.readline(keepends=True),
       
   113                         size*u"a" + lineend,
       
   114                     )
       
   115                 reader = getreader(s)
       
   116                 for i in xrange(10):
       
   117                     self.assertEqual(
       
   118                         reader.readline(keepends=False),
       
   119                         size*u"a",
       
   120                     )
       
   121 
       
   122     def test_bug1175396(self):
       
   123         s = [
       
   124             '<%!--===================================================\r\n',
       
   125             '    BLOG index page: show recent articles,\r\n',
       
   126             '    today\'s articles, or articles of a specific date.\r\n',
       
   127             '========================================================--%>\r\n',
       
   128             '<%@inputencoding="ISO-8859-1"%>\r\n',
       
   129             '<%@pagetemplate=TEMPLATE.y%>\r\n',
       
   130             '<%@import=import frog.util, frog%>\r\n',
       
   131             '<%@import=import frog.objects%>\r\n',
       
   132             '<%@import=from frog.storageerrors import StorageError%>\r\n',
       
   133             '<%\r\n',
       
   134             '\r\n',
       
   135             'import logging\r\n',
       
   136             'log=logging.getLogger("Snakelets.logger")\r\n',
       
   137             '\r\n',
       
   138             '\r\n',
       
   139             'user=self.SessionCtx.user\r\n',
       
   140             'storageEngine=self.SessionCtx.storageEngine\r\n',
       
   141             '\r\n',
       
   142             '\r\n',
       
   143             'def readArticlesFromDate(date, count=None):\r\n',
       
   144             '    entryids=storageEngine.listBlogEntries(date)\r\n',
       
   145             '    entryids.reverse() # descending\r\n',
       
   146             '    if count:\r\n',
       
   147             '        entryids=entryids[:count]\r\n',
       
   148             '    try:\r\n',
       
   149             '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
       
   150             '    except StorageError,x:\r\n',
       
   151             '        log.error("Error loading articles: "+str(x))\r\n',
       
   152             '        self.abort("cannot load articles")\r\n',
       
   153             '\r\n',
       
   154             'showdate=None\r\n',
       
   155             '\r\n',
       
   156             'arg=self.Request.getArg()\r\n',
       
   157             'if arg=="today":\r\n',
       
   158             '    #-------------------- TODAY\'S ARTICLES\r\n',
       
   159             '    self.write("<h2>Today\'s articles</h2>")\r\n',
       
   160             '    showdate = frog.util.isodatestr() \r\n',
       
   161             '    entries = readArticlesFromDate(showdate)\r\n',
       
   162             'elif arg=="active":\r\n',
       
   163             '    #-------------------- ACTIVE ARTICLES redirect\r\n',
       
   164             '    self.Yredirect("active.y")\r\n',
       
   165             'elif arg=="login":\r\n',
       
   166             '    #-------------------- LOGIN PAGE redirect\r\n',
       
   167             '    self.Yredirect("login.y")\r\n',
       
   168             'elif arg=="date":\r\n',
       
   169             '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
       
   170             '    showdate = self.Request.getParameter("date")\r\n',
       
   171             '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
       
   172             '    entries = readArticlesFromDate(showdate)\r\n',
       
   173             'else:\r\n',
       
   174             '    #-------------------- RECENT ARTICLES\r\n',
       
   175             '    self.write("<h2>Recent articles</h2>")\r\n',
       
   176             '    dates=storageEngine.listBlogEntryDates()\r\n',
       
   177             '    if dates:\r\n',
       
   178             '        entries=[]\r\n',
       
   179             '        SHOWAMOUNT=10\r\n',
       
   180             '        for showdate in dates:\r\n',
       
   181             '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
       
   182             '            if len(entries)>=SHOWAMOUNT:\r\n',
       
   183             '                break\r\n',
       
   184             '                \r\n',
       
   185         ]
       
   186         stream = StringIO.StringIO("".join(s).encode(self.encoding))
       
   187         reader = codecs.getreader(self.encoding)(stream)
       
   188         for (i, line) in enumerate(reader):
       
   189             self.assertEqual(line, s[i])
       
   190 
       
   191     def test_readlinequeue(self):
       
   192         q = Queue()
       
   193         writer = codecs.getwriter(self.encoding)(q)
       
   194         reader = codecs.getreader(self.encoding)(q)
       
   195 
       
   196         # No lineends
       
   197         writer.write(u"foo\r")
       
   198         self.assertEqual(reader.readline(keepends=False), u"foo")
       
   199         writer.write(u"\nbar\r")
       
   200         self.assertEqual(reader.readline(keepends=False), u"")
       
   201         self.assertEqual(reader.readline(keepends=False), u"bar")
       
   202         writer.write(u"baz")
       
   203         self.assertEqual(reader.readline(keepends=False), u"baz")
       
   204         self.assertEqual(reader.readline(keepends=False), u"")
       
   205 
       
   206         # Lineends
       
   207         writer.write(u"foo\r")
       
   208         self.assertEqual(reader.readline(keepends=True), u"foo\r")
       
   209         writer.write(u"\nbar\r")
       
   210         self.assertEqual(reader.readline(keepends=True), u"\n")
       
   211         self.assertEqual(reader.readline(keepends=True), u"bar\r")
       
   212         writer.write(u"baz")
       
   213         self.assertEqual(reader.readline(keepends=True), u"baz")
       
   214         self.assertEqual(reader.readline(keepends=True), u"")
       
   215         writer.write(u"foo\r\n")
       
   216         self.assertEqual(reader.readline(keepends=True), u"foo\r\n")
       
   217 
       
   218     def test_bug1098990_a(self):
       
   219         s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
       
   220         s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
       
   221         s3 = u"next line.\r\n"
       
   222 
       
   223         s = (s1+s2+s3).encode(self.encoding)
       
   224         stream = StringIO.StringIO(s)
       
   225         reader = codecs.getreader(self.encoding)(stream)
       
   226         self.assertEqual(reader.readline(), s1)
       
   227         self.assertEqual(reader.readline(), s2)
       
   228         self.assertEqual(reader.readline(), s3)
       
   229         self.assertEqual(reader.readline(), u"")
       
   230 
       
   231     def test_bug1098990_b(self):
       
   232         s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
       
   233         s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
       
   234         s3 = u"stillokay:bbbbxx\r\n"
       
   235         s4 = u"broken!!!!badbad\r\n"
       
   236         s5 = u"againokay.\r\n"
       
   237 
       
   238         s = (s1+s2+s3+s4+s5).encode(self.encoding)
       
   239         stream = StringIO.StringIO(s)
       
   240         reader = codecs.getreader(self.encoding)(stream)
       
   241         self.assertEqual(reader.readline(), s1)
       
   242         self.assertEqual(reader.readline(), s2)
       
   243         self.assertEqual(reader.readline(), s3)
       
   244         self.assertEqual(reader.readline(), s4)
       
   245         self.assertEqual(reader.readline(), s5)
       
   246         self.assertEqual(reader.readline(), u"")
       
   247 
       
class UTF16Test(ReadTest):
    """Tests for the endianness-detecting "utf-16" codec (BOM handling)."""
    encoding = "utf-16"

    # u"spamspam" encoded with a single little-/big-endian BOM prefix.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """The StreamWriter must emit the BOM on the first write only."""
        # codecs.lookup() returns (encoder, decoder, reader, writer) here.
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        """Input starting with an invalid BOM must raise UnicodeError."""
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte decoding: the two BOM bytes produce no output."""
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        """A lone trailing byte must raise UnicodeDecodeError in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
       
   297 
       
class UTF16LETest(ReadTest):
    """Tests for the fixed-endianness "utf-16-le" codec (no BOM)."""
    encoding = "utf-16-le"

    def test_partial(self):
        """Byte-by-byte decoding: each BMP code point needs two bytes."""
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        """A lone trailing byte must raise UnicodeDecodeError in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)
       
   318 
       
class UTF16BETest(ReadTest):
    """Tests for the fixed-endianness "utf-16-be" codec (no BOM)."""
    encoding = "utf-16-be"

    def test_partial(self):
        """Byte-by-byte decoding: each BMP code point needs two bytes."""
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        """A lone trailing byte must raise UnicodeDecodeError in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)
       
   339 
       
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        """Byte-by-byte decoding across 1-, 2- and 3-byte UTF-8 sequences."""
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )
       
   360 
       
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec (inherited ReadTest cases only)."""
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.
       
   365 
       
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() helper."""

    def test_errors(self):
        """A lone byte must raise UnicodeDecodeError in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        """Calling with no arguments raises TypeError."""
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
       
   373 
       
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode() (buffer object -> str)."""

    def test_array(self):
        """Objects supporting the read-buffer protocol are accepted."""
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        """An empty buffer encodes to ("", 0)."""
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        """Missing or non-buffer arguments raise TypeError."""
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
       
   389 
       
class CharBufferTest(unittest.TestCase):
    """Tests for codecs.charbuffer_encode() (char buffer -> str)."""

    def test_string(self):
        """A plain string encodes to itself with its length."""
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        """An empty string encodes to ("", 0)."""
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        """Missing or non-buffer arguments raise TypeError."""
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
       
   401 
       
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec (UTF-8 with a BOM signature)."""
    encoding = "utf-8-sig"

    def test_partial(self):
        """Byte-by-byte decoding: only the first BOM is skipped."""
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        """A bare BOM must decode without error."""
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        """The incremental decoder must round-trip BOM-prefixed data."""
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        """Reading BOM-prefixed data in chunks of any size strips the BOM."""
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        # sizehint None means "read everything at once".
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        """Data without a BOM must decode identically for any chunk size."""
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        # sizehint None means "read everything at once".
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)
       
   481 
       
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode() (string-escape decoding)."""
    def test_empty(self):
        """Decoding the empty string yields ("", 0)."""
        self.assertEquals(codecs.escape_decode(""), ("", 0))
       
   485 
       
class RecodingTest(unittest.TestCase):
    """Regression test for recoding through codecs.EncodedFile()."""
    def test_recoding(self):
        """Write through an EncodedFile; must not crash at interpreter exit."""
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
       
   494 
       
# From RFC 3492
# (unicode, punycode) sample pairs; the lettered comments (A)-(S) match the
# labels used in the RFC's sample-strings section.
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
    u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
    u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
    u"\u0939\u0948\u0902",
    "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
    u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

     # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Sanity check at import time: every entry must be a 2-tuple;
# anything else is printed so the bad entry is easy to spot.
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
       
   602 
       
class PunycodeTest(unittest.TestCase):
    """Round-trip the RFC 3492 sample strings through the punycode codec."""
    def test_encode(self):
        """Encoding each sample must reproduce the RFC's punycode form."""
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        """Decoding each punycode sample must reproduce the unicode form."""
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))
       
   616 
       
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the unicode_internal codec (raw internal representation).

    Every test is guarded by sys.maxunicode > 0xffff, i.e. it only runs
    on UCS-4 builds where each character is a 4-byte unit.
    """

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            # (bytes, expected result); byte patterns written big-endian
            # and swapped below for little-endian builds.
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            # Must raise: values above 0x10ffff, and inputs whose length
            # is not a multiple of four.
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    # unicode_internal is native-endian: reverse the
                    # big-endian test pattern on little-endian machines.
                    internal = "".join(reversed(internal))
                self.assertEquals(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                    "unicode_internal")

    def test_decode_error_attributes(self):
        # The UnicodeDecodeError must identify the exact 4-byte slice
        # that failed (bytes 4..8 of the input here).
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEquals("unicode_internal", ex.encoding)
                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEquals(4, ex.start)
                self.assertEquals(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        # A custom registered error handler (here: ignore_errors) must be
        # honoured by the unicode_internal decoder.
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            # Splice an invalid 4-byte unit between "a" and "b": it is
            # ignored, but the reported consumed length covers all 12 bytes.
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                "UnicodeInternalTest")
            self.assertEquals((u"ab", 12), ignored)
   664 
       
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected-output) pair of UTF-8 encoded byte
# strings; expected None means nameprep() must reject the input, and an
# (None, None) entry marks a vector this implementation skips.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
       
   817 
       
   818 
       
   819 class NameprepTest(unittest.TestCase):
       
   820     def test_nameprep(self):
       
   821         from encodings.idna import nameprep
       
   822         for pos, (orig, prepped) in enumerate(nameprep_tests):
       
   823             if orig is None:
       
   824                 # Skipped
       
   825                 continue
       
   826             # The Unicode strings are given in UTF-8
       
   827             orig = unicode(orig, "utf-8")
       
   828             if prepped is None:
       
   829                 # Input contains prohibited characters
       
   830                 self.assertRaises(UnicodeError, nameprep, orig)
       
   831             else:
       
   832                 prepped = unicode(prepped, "utf-8")
       
   833                 try:
       
   834                     self.assertEquals(nameprep(orig), prepped)
       
   835                 except Exception,e:
       
   836                     raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
       
   837 
       
   838 class IDNACodecTest(unittest.TestCase):
       
   839     def test_builtin_decode(self):
       
   840         self.assertEquals(unicode("python.org", "idna"), u"python.org")
       
   841         self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
       
   842         self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
       
   843         self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
       
   844 
       
   845     def test_builtin_encode(self):
       
   846         self.assertEquals(u"python.org".encode("idna"), "python.org")
       
   847         self.assertEquals("python.org.".encode("idna"), "python.org.")
       
   848         self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
       
   849         self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
       
   850 
       
   851     def test_stream(self):
       
   852         import StringIO
       
   853         r = codecs.getreader("idna")(StringIO.StringIO("abc"))
       
   854         r.read(3)
       
   855         self.assertEquals(r.read(), u"")
       
   856 
       
   857     def test_incremental_decode(self):
       
   858         self.assertEquals(
       
   859             "".join(codecs.iterdecode("python.org", "idna")),
       
   860             u"python.org"
       
   861         )
       
   862         self.assertEquals(
       
   863             "".join(codecs.iterdecode("python.org.", "idna")),
       
   864             u"python.org."
       
   865         )
       
   866         self.assertEquals(
       
   867             "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
       
   868             u"pyth\xf6n.org."
       
   869         )
       
   870         self.assertEquals(
       
   871             "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
       
   872             u"pyth\xf6n.org."
       
   873         )
       
   874 
       
   875         decoder = codecs.getincrementaldecoder("idna")()
       
   876         self.assertEquals(decoder.decode("xn--xam", ), u"")
       
   877         self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
       
   878         self.assertEquals(decoder.decode(u"rg"), u"")
       
   879         self.assertEquals(decoder.decode(u"", True), u"org")
       
   880 
       
   881         decoder.reset()
       
   882         self.assertEquals(decoder.decode("xn--xam", ), u"")
       
   883         self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
       
   884         self.assertEquals(decoder.decode("rg."), u"org.")
       
   885         self.assertEquals(decoder.decode("", True), u"")
       
   886 
       
   887     def test_incremental_encode(self):
       
   888         self.assertEquals(
       
   889             "".join(codecs.iterencode(u"python.org", "idna")),
       
   890             "python.org"
       
   891         )
       
   892         self.assertEquals(
       
   893             "".join(codecs.iterencode(u"python.org.", "idna")),
       
   894             "python.org."
       
   895         )
       
   896         self.assertEquals(
       
   897             "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
       
   898             "xn--pythn-mua.org."
       
   899         )
       
   900         self.assertEquals(
       
   901             "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
       
   902             "xn--pythn-mua.org."
       
   903         )
       
   904 
       
   905         encoder = codecs.getincrementalencoder("idna")()
       
   906         self.assertEquals(encoder.encode(u"\xe4x"), "")
       
   907         self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
       
   908         self.assertEquals(encoder.encode(u"", True), "org")
       
   909 
       
   910         encoder.reset()
       
   911         self.assertEquals(encoder.encode(u"\xe4x"), "")
       
   912         self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
       
   913         self.assertEquals(encoder.encode(u"", True), "")
       
   914 
       
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and basic behaviour of the top-level codecs API."""

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # With no encoding argument the default encoding is used.
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        # NOTE: u'\xffff' is U+00FF followed by the two literal chars "ff"
        # (\x consumes exactly two hex digits) -- still unencodable in ASCII.
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        # register() requires a callable search function.
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
   956 
       
   957 class StreamReaderTest(unittest.TestCase):
       
   958 
       
   959     def setUp(self):
       
   960         self.reader = codecs.getreader('utf-8')
       
   961         self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
       
   962 
       
   963     def test_readlines(self):
       
   964         f = self.reader(self.stream)
       
   965         self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])
       
   966 
       
   967 class EncodedFileTest(unittest.TestCase):
       
   968 
       
   969     def test_basic(self):
       
   970         f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
       
   971         ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
       
   972         self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')
       
   973 
       
   974         f = StringIO.StringIO()
       
   975         ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
       
   976         ef.write('\xc3\xbc')
       
   977         self.assertEquals(f.getvalue(), '\xfc')
       
   978 
       
   979 class Str2StrTest(unittest.TestCase):
       
   980 
       
   981     def test_read(self):
       
   982         sin = "\x80".encode("base64_codec")
       
   983         reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
       
   984         sout = reader.read()
       
   985         self.assertEqual(sout, "\x80")
       
   986         self.assert_(isinstance(sout, str))
       
   987 
       
   988     def test_readline(self):
       
   989         sin = "\x80".encode("base64_codec")
       
   990         reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
       
   991         sout = reader.readline()
       
   992         self.assertEqual(sout, "\x80")
       
   993         self.assert_(isinstance(sout, str))
       
   994 
       
# Every codec that accepts unicode input; each entry is exercised by the
# round-trip and API checks in BasicUnicodeTest below.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]
       
  1097 
       
# mbcs is only compiled into codecs on some platforms (presumably
# Windows builds), so probe for it instead of assuming it exists.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
  1100 
       
# The following encodings work only with str, not unicode
# (they are exercised by BasicStrTest below).
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]
       
  1107 
       
# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# The same codecs also lack usable incremental coders.  Copy the list so
# that the bz2/zlib additions below affect only the stream list.
broken_incremental_coders = broken_unicode_with_streams[:]
       
  1120 
       
# bz2 and zlib are optional build-time features: test their codecs only
# when the underlying module can be imported.  Both are stream-broken.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
  1136 
       
class BasicUnicodeTest(unittest.TestCase):
    """Generic checks run across every codec in all_unicode_encodings:
    round-trips through the stateless, stream, incremental (Python and
    C API) and iterator interfaces, plus argument validation."""

    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # codecs.lookup() returns a normalized name; undo the two
            # known divergences ("*_codec" suffix, latin_1) so the list
            # entry and the canonical name can be compared.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer: feed the text through a
                # writer one character at a time, then back through a
                # reader one byte at a time.
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are excluded: presumably their decoders do
            # not reject a non-string argument with TypeError -- confirm.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
  1242 
       
  1243 class BasicStrTest(unittest.TestCase):
       
  1244     def test_basics(self):
       
  1245         s = "abc123"
       
  1246         for encoding in all_string_encodings:
       
  1247             (bytes, size) = codecs.getencoder(encoding)(s)
       
  1248             self.assertEqual(size, len(s))
       
  1249             (chars, size) = codecs.getdecoder(encoding)(bytes)
       
  1250             self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
       
  1251 
       
  1252 class CharmapTest(unittest.TestCase):
       
  1253     def test_decode_with_string_map(self):
       
  1254         self.assertEquals(
       
  1255             codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
       
  1256             (u"abc", 3)
       
  1257         )
       
  1258 
       
  1259         self.assertEquals(
       
  1260             codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
       
  1261             (u"ab\ufffd", 3)
       
  1262         )
       
  1263 
       
  1264         self.assertEquals(
       
  1265             codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
       
  1266             (u"ab\ufffd", 3)
       
  1267         )
       
  1268 
       
  1269         self.assertEquals(
       
  1270             codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
       
  1271             (u"ab", 3)
       
  1272         )
       
  1273 
       
  1274         self.assertEquals(
       
  1275             codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
       
  1276             (u"ab", 3)
       
  1277         )
       
  1278 
       
  1279         allbytes = "".join(chr(i) for i in xrange(256))
       
  1280         self.assertEquals(
       
  1281             codecs.charmap_decode(allbytes, "ignore", u""),
       
  1282             (u"", len(allbytes))
       
  1283         )
       
  1284 
       
  1285 class WithStmtTest(unittest.TestCase):
       
  1286     def test_encodedfile(self):
       
  1287         f = StringIO.StringIO("\xc3\xbc")
       
  1288         with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
       
  1289             self.assertEquals(ef.read(), "\xfc")
       
  1290 
       
  1291     def test_streamreaderwriter(self):
       
  1292         f = StringIO.StringIO("\xc3\xbc")
       
  1293         info = codecs.lookup("utf-8")
       
  1294         with codecs.StreamReaderWriter(f, info.streamreader,
       
  1295                                        info.streamwriter, 'strict') as srw:
       
  1296             self.assertEquals(srw.read(), u"\xfc")
       
  1297 
       
  1298 
       
  1299 def test_main():
       
  1300     test_support.run_unittest(
       
  1301         UTF16Test,
       
  1302         UTF16LETest,
       
  1303         UTF16BETest,
       
  1304         UTF8Test,
       
  1305         UTF8SigTest,
       
  1306         UTF7Test,
       
  1307         UTF16ExTest,
       
  1308         ReadBufferTest,
       
  1309         CharBufferTest,
       
  1310         EscapeDecodeTest,
       
  1311         RecodingTest,
       
  1312         PunycodeTest,
       
  1313         UnicodeInternalTest,
       
  1314         NameprepTest,
       
  1315         IDNACodecTest,
       
  1316         CodecsModuleTest,
       
  1317         StreamReaderTest,
       
  1318         EncodedFileTest,
       
  1319         Str2StrTest,
       
  1320         BasicUnicodeTest,
       
  1321         BasicStrTest,
       
  1322         CharmapTest,
       
  1323         WithStmtTest,
       
  1324     )
       
  1325 
       
  1326 
       
# Allow the suite to be run directly as a script.
if __name__ == "__main__":
    test_main()