|
1 from __future__ import with_statement |
|
2 from test import test_support |
|
3 import unittest |
|
4 import codecs |
|
5 import sys, StringIO, _testcapi |
|
6 |
|
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        # FIFO storage: writes append at the tail, reads consume the head.
        self._buffer = ""

    def write(self, chars):
        # Append incoming bytes to the tail of the queue.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains everything buffered; otherwise return up
        # to `size` bytes from the head and keep the remainder queued.
        if size < 0:
            size = len(self._buffer)
        data, self._buffer = self._buffer[:size], self._buffer[size:]
        return data
|
26 |
|
class ReadTest(unittest.TestCase):
    """Base class for stream-reading codec tests.

    Subclasses set the class attribute ``encoding`` (e.g. "utf-16"); the
    tests here exercise codecs.getreader()/getincrementaldecoder() for it.
    """

    def check_partial(self, input, partialresults):
        """Feed ``input`` to the decoder byte by byte.

        ``partialresults`` lists, for each fed byte, the unicode string that
        must have been decoded so far (empty while the decoder is waiting
        for the remainder of a multi-byte sequence or a BOM).
        """
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        """Exercise StreamReader.readline() over all supported line endings."""
        def getreader(input):
            # Wrap the encoded input string in a fresh StreamReader.
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Join all lines with "|" so missing or misplaced line breaks
            # show up clearly in the equality checks below.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            # NOTE(review): u"\3042" is octal escape \304 followed by the
            # literal "2" (i.e. u"\xc42"), probably intended as u"\u3042";
            # the test stays self-consistent either way -- confirm upstream.
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        """Iterating a StreamReader must yield the original lines (SF #1175396)."""
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() behaviour across a writer/reader pair sharing one Queue."""
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        """Lines longer than the reader's internal read chunk (SF #1098990)."""
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        """Mix of short and long lines around the chunk boundary (SF #1098990)."""
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
|
247 |
|
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # Expected encodings of u"spamspam" with exactly one leading BOM,
    # little-endian and big-endian respectively.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """A StreamWriter must emit the BOM once, not once per write() call."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        """Input starting with an invalid BOM must raise UnicodeError."""
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte decoding: nothing is emitted until the BOM completes."""
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A lone byte is an incomplete code unit and must fail in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
|
297 |
|
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        """Each BMP character becomes available after its second byte arrives."""
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A lone byte is an incomplete code unit and must fail in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)
|
318 |
|
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        """Each BMP character becomes available after its second byte arrives."""
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A lone byte is an incomplete code unit and must fail in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)
|
339 |
|
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        """Characters appear only once their final UTF-8 byte has arrived
        (1 byte for ASCII, 2 for U+00FF/U+07FF, 3 for U+0800/U+FFFF)."""
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )
|
360 |
|
class UTF7Test(ReadTest):
    # Inherits the readline/stream tests from ReadTest.
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.
|
365 |
|
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() helper."""

    def test_errors(self):
        # A lone byte is an incomplete code unit and must fail in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without arguments must raise TypeError, not crash.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
|
373 |
|
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode() (buffer-API objects -> bytes)."""

    def test_array(self):
        # Any object supporting the read-buffer interface is accepted.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        # Empty input yields empty output and zero consumed bytes.
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # Missing or non-buffer arguments must raise TypeError.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
|
389 |
|
class CharBufferTest(unittest.TestCase):
    """Tests for codecs.charbuffer_encode() (char-buffer objects -> bytes)."""

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        # Empty input yields empty output and zero consumed bytes.
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # Missing or non-buffer arguments must raise TypeError.
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
|
401 |
|
class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        """Byte-by-byte decoding: the leading BOM is skipped, a second one kept."""
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        """The incremental decoder must strip the signature BOM."""
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        """Reading BOM-prefixed data in assorted chunk sizes drops the BOM once."""
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        """Same as test_stream_bom, but the input carries no BOM at all."""
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)
|
481 |
|
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        # Empty input yields empty output and zero consumed bytes.
        self.assertEquals(codecs.escape_decode(""), ("", 0))
|
485 |
|
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        """Writing through EncodedFile must not corrupt interpreter state."""
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
|
494 |
|
# From RFC 3492
# Each entry is a (unicode, punycode) pair taken verbatim from the RFC's
# sample strings (section 7.1); used by PunycodeTest below.
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Sanity check at import time: every entry must be a 2-tuple; malformed
# entries (usually a missing comma joining adjacent strings) are printed.
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
|
602 |
|
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        """Encode each RFC 3492 sample and compare case-insensitively."""
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        """Decoding each RFC 3492 sample must reproduce the unicode original."""
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))
|
616 |
|
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the "unicode_internal" codec (UCS-4 builds only)."""

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    # unicode_internal is native-endian; the vectors above are
                    # big-endian, so flip them on little-endian builds.
                    internal = "".join(reversed(internal))
                self.assertEquals(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        """A UnicodeDecodeError from unicode_internal carries full metadata."""
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEquals("unicode_internal", ex.encoding)
                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEquals(4, ex.start)
                self.assertEquals(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        """A registered error callback is honoured by the decoder."""
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            # Inject four bogus bytes between "a" and "b"; they must be ignored.
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEquals((u"ab", 12), ignored)
|
664 |
|
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (orig, prepped) as UTF-8 byte strings. prepped is None when
# the input contains prohibited characters (nameprep must raise); a
# (None, None) entry marks a skipped vector (keeps the 3.%d numbering).
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
|
817 |
|
818 |
|
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        """Run the libidn nameprep test vectors against encodings.idna.nameprep."""
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception,e:
                    # Re-raise with the RFC test-vector number for easier triage.
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
|
837 |
|
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec (whole-string and incremental forms)."""

    def test_builtin_decode(self):
        # ACE ("xn--") labels decode to unicode; a trailing dot is kept.
        self.assertEquals(unicode("python.org", "idna"), u"python.org")
        self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
        self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        # Non-ASCII labels encode to ACE form; pure-ASCII names pass through.
        self.assertEquals(u"python.org".encode("idna"), "python.org")
        self.assertEquals("python.org.".encode("idna"), "python.org.")
        self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        # Reading past the end of the stream must return an empty string.
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEquals(r.read(), u"")

    def test_incremental_decode(self):
        """iterdecode and the incremental decoder buffer up to label boundaries."""
        self.assertEquals(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEquals(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        # A label is only emitted once its terminating "." has been seen,
        # or at end of input when final=True.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode(u"rg"), u"")
        self.assertEquals(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode("rg."), u"org.")
        self.assertEquals(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        """iterencode and the incremental encoder buffer up to label boundaries."""
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEquals(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEquals(encoder.encode(u"", True), "")
|
914 |
|
915 class CodecsModuleTest(unittest.TestCase): |
|
916 |
|
917 def test_decode(self): |
|
918 self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'), |
|
919 u'\xe4\xf6\xfc') |
|
920 self.assertRaises(TypeError, codecs.decode) |
|
921 self.assertEquals(codecs.decode('abc'), u'abc') |
|
922 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii') |
|
923 |
|
924 def test_encode(self): |
|
925 self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'), |
|
926 '\xe4\xf6\xfc') |
|
927 self.assertRaises(TypeError, codecs.encode) |
|
928 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__") |
|
929 self.assertEquals(codecs.encode(u'abc'), 'abc') |
|
930 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii') |
|
931 |
|
932 def test_register(self): |
|
933 self.assertRaises(TypeError, codecs.register) |
|
934 self.assertRaises(TypeError, codecs.register, 42) |
|
935 |
|
936 def test_lookup(self): |
|
937 self.assertRaises(TypeError, codecs.lookup) |
|
938 self.assertRaises(LookupError, codecs.lookup, "__spam__") |
|
939 self.assertRaises(LookupError, codecs.lookup, " ") |
|
940 |
|
941 def test_getencoder(self): |
|
942 self.assertRaises(TypeError, codecs.getencoder) |
|
943 self.assertRaises(LookupError, codecs.getencoder, "__spam__") |
|
944 |
|
945 def test_getdecoder(self): |
|
946 self.assertRaises(TypeError, codecs.getdecoder) |
|
947 self.assertRaises(LookupError, codecs.getdecoder, "__spam__") |
|
948 |
|
949 def test_getreader(self): |
|
950 self.assertRaises(TypeError, codecs.getreader) |
|
951 self.assertRaises(LookupError, codecs.getreader, "__spam__") |
|
952 |
|
953 def test_getwriter(self): |
|
954 self.assertRaises(TypeError, codecs.getwriter) |
|
955 self.assertRaises(LookupError, codecs.getwriter, "__spam__") |
|
956 |
|
class StreamReaderTest(unittest.TestCase):
    """StreamReader.readlines() must split decoded text at newlines."""

    def setUp(self):
        # UTF-8 bytes for u'\ud55c\n\uae00' ("han", newline, "geul")
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        decoded_file = self.reader(self.stream)
        self.assertEquals(decoded_file.readlines(), [u'\ud55c\n', u'\uae00'])
|
966 |
|
class EncodedFileTest(unittest.TestCase):
    """codecs.EncodedFile recodes between a data encoding and a file encoding."""

    def test_basic(self):
        # Reading: UTF-8 bytes in the underlying file come back as UTF-16-LE.
        underlying = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(underlying, 'utf-16-le', 'utf-8')
        self.assertEquals(recoder.read(), '\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes written through the wrapper land as latin-1.
        underlying = StringIO.StringIO()
        recoder = codecs.EncodedFile(underlying, 'utf-8', 'latin1')
        recoder.write('\xc3\xbc')
        self.assertEquals(underlying.getvalue(), '\xfc')
|
978 |
|
class Str2StrTest(unittest.TestCase):
    """str-to-str codecs (here base64) must yield str, not unicode, when read
    through a StreamReader."""

    def _make_reader(self):
        # Wrap the base64 encoding of "\x80" in a base64 StreamReader.
        encoded = "\x80".encode("base64_codec")
        return codecs.getreader("base64_codec")(StringIO.StringIO(encoded))

    def test_read(self):
        decoded = self._make_reader().read()
        self.assertEqual(decoded, "\x80")
        self.assert_(isinstance(decoded, str))

    def test_readline(self):
        decoded = self._make_reader().readline()
        self.assertEqual(decoded, "\x80")
        self.assert_(isinstance(decoded, str))
|
994 |
|
# All codecs expected to round-trip unicode text; consumed by
# BasicUnicodeTest below.  Optional codecs (mbcs, bz2, zlib) are appended
# conditionally further down.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# mbcs is only available on builds that expose it (Windows).
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode (stream reader/writer
# or incremental coder usage)
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# Copy, so optional additions below affect only the stream list.
broken_incremental_coders = broken_unicode_with_streams[:]

# bz2 and zlib are optional modules; test their codecs only when present.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
|
1136 |
|
class BasicUnicodeTest(unittest.TestCase):
    """Generic round-trip checks applied to every codec listed in
    all_unicode_encodings."""

    def test_basics(self):
        """Round-trip a small ASCII sample through each codec via the
        stateless, stream, incremental (Python and C API) and
        iterencode()/iterdecode() interfaces."""
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            # lookup() normalizes codec names; undo the two known
            # divergences so the comparison below holds.
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            # unicode_internal reports a byte count, not a character count
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer: feed one character/byte at a
                # time through a Queue and accumulate what comes out
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes any buffered state
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

    def test_seek(self):
        """seek(0) on a StreamReader must reset codec state and buffers."""
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        """Decoders must raise TypeError on missing or non-string input."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode accept (and str()) arbitrary objects
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders must raise TypeError when called without input."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
|
1242 |
|
class BasicStrTest(unittest.TestCase):
    """Round-trip sanity check for the str-only codecs."""

    def test_basics(self):
        sample = "abc123"
        for encoding in all_string_encodings:
            encoded, consumed = codecs.getencoder(encoding)(sample)
            self.assertEqual(consumed, len(sample))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, sample, "%r != %r (encoding=%r)" % (decoded, sample, encoding))
|
1251 |
|
class CharmapTest(unittest.TestCase):
    """charmap_decode() with a unicode string as the mapping table."""

    def test_decode_with_string_map(self):
        # (errors, mapping, expected result) triples for input "\x00\x01\x02".
        # U+FFFE in a mapping marks a byte as unmapped.
        cases = [
            ("strict", u"abc", u"abc"),
            ("replace", u"ab", u"ab\ufffd"),
            ("replace", u"ab\ufffe", u"ab\ufffd"),
            ("ignore", u"ab", u"ab"),
            ("ignore", u"ab\ufffe", u"ab"),
        ]
        for errors, mapping, expected in cases:
            self.assertEquals(
                codecs.charmap_decode("\x00\x01\x02", errors, mapping),
                (expected, 3)
            )

        # With an empty map every byte is unmapped: "ignore" produces no
        # output but still consumes the whole input.
        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEquals(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )
|
1284 |
|
class WithStmtTest(unittest.TestCase):
    """Codec stream wrappers must be usable as context managers."""

    def test_encodedfile(self):
        raw = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as encoded_file:
            self.assertEquals(encoded_file.read(), "\xfc")

    def test_streamreaderwriter(self):
        raw = StringIO.StringIO("\xc3\xbc")
        codec_info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(raw, codec_info.streamreader,
                                       codec_info.streamwriter, 'strict') as srw:
            self.assertEquals(srw.read(), u"\xfc")
|
1297 |
|
1298 |
|
def test_main():
    """Run every test class in this module through test_support."""
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )
|
1325 |
|
1326 |
|
# Allow running this test file directly.
if __name__ == "__main__":
    test_main()