import codecs
import htmlentitydefs
import sys
import test.test_support
import unicodedata
import unittest

class PosReturn:
    # this can be used for configurable callbacks

    def __init__(self):
        # Position that handle() reports back to the codec; tests overwrite
        # this attribute directly before each encode/decode call.
        self.pos = 0

    def handle(self, exc):
        # Error-handler callback: always returns the replacement u"<?>"
        # together with the position stored in self.pos at call time.
        oldpos = self.pos
        realpos = oldpos
        if realpos<0:
            # negative positions count from the end of the input
            realpos = len(exc.object) + realpos
        # if we don't advance this time, terminate on the next call
        # otherwise we'd get an endless loop
        if realpos <= exc.start:
            self.pos = len(exc.object)
        return (u"<?>", oldpos)

# A UnicodeEncodeError object with a bad start attribute
class BadStartUnicodeEncodeError(UnicodeEncodeError):
    def __init__(self):
        UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad")
        # overwrite the valid start position with a non-integer
        self.start = []

# A UnicodeEncodeError object with a bad object attribute
class BadObjectUnicodeEncodeError(UnicodeEncodeError):
    def __init__(self):
        UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad")
        # overwrite the valid input object with a list
        self.object = []

# A UnicodeDecodeError object without an end attribute
class NoEndUnicodeDecodeError(UnicodeDecodeError):
    def __init__(self):
        UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad")
        # remove the attribute that the base constructor just set
        del self.end

# A UnicodeDecodeError object with a bad object attribute
class BadObjectUnicodeDecodeError(UnicodeDecodeError):
    def __init__(self):
        UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad")
        # overwrite the valid input object with a list
        self.object = []

# A UnicodeTranslateError object without a start attribute
class NoStartUnicodeTranslateError(UnicodeTranslateError):
    def __init__(self):
        UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
        # remove the attribute that the base constructor just set
        del self.start

# A UnicodeTranslateError object without an end attribute
class NoEndUnicodeTranslateError(UnicodeTranslateError):
    def __init__(self):
        UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
        # remove the attribute that the base constructor just set
        del self.end

# A UnicodeTranslateError object without an object attribute
class NoObjectUnicodeTranslateError(UnicodeTranslateError):
    def __init__(self):
        UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
        # remove the attribute that the base constructor just set
        del self.object

class CodecCallbackTest(unittest.TestCase):
    # Tests for codec error callbacks: the built-in handlers exposed by the
    # codecs module, custom handlers registered with codecs.register_error(),
    # and the UnicodeError exception objects that are passed to them.
    # (Comments are used instead of docstrings so that unittest's verbose
    # output keeps showing the method names.)

    def test_xmlcharrefreplace(self):
        # replace unencodable characters with numeric character entities.
        # For ascii, latin-1 and charmaps this is completely implemented
        # in C and should be reasonably fast.
        s = u"\u30b9\u30d1\u30e2 \xe4nd eggs"
        self.assertEqual(
            s.encode("ascii", "xmlcharrefreplace"),
            "&#12473;&#12497;&#12514; &#228;nd eggs"
        )
        self.assertEqual(
            s.encode("latin-1", "xmlcharrefreplace"),
            "&#12473;&#12497;&#12514; \xe4nd eggs"
        )

    def test_xmlcharnamereplace(self):
        # This time use a named character entity for unencodable
        # characters, if one is available.

        def xmlcharnamereplace(exc):
            if not isinstance(exc, UnicodeEncodeError):
                raise TypeError("don't know how to handle %r" % exc)
            l = []
            for c in exc.object[exc.start:exc.end]:
                try:
                    l.append(u"&%s;" % htmlentitydefs.codepoint2name[ord(c)])
                except KeyError:
                    # no named entity: fall back to a numeric reference
                    l.append(u"&#%d;" % ord(c))
            return (u"".join(l), exc.end)

        codecs.register_error(
            "test.xmlcharnamereplace", xmlcharnamereplace)

        sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
        sout = "&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
        self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
        sout = "\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
        self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
        sout = "\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
        self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)

    def test_uninamereplace(self):
        # We're using the names from the unicode database this time,
        # and we're doing "syntax highlighting" here, i.e. we include
        # the replaced text in ANSI escape sequences. For this it is
        # useful that the error handler is not called for every single
        # unencodable character, but for a complete sequence of
        # unencodable characters, otherwise we would output many
        # unnecessary escape sequences.

        def uninamereplace(exc):
            if not isinstance(exc, UnicodeEncodeError):
                raise TypeError("don't know how to handle %r" % exc)
            l = []
            for c in exc.object[exc.start:exc.end]:
                l.append(unicodedata.name(c, u"0x%x" % ord(c)))
            return (u"\033[1m%s\033[0m" % u", ".join(l), exc.end)

        codecs.register_error(
            "test.uninamereplace", uninamereplace)

        sin = u"\xac\u1234\u20ac\u8000"
        sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
        self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)

        sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
        self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)

        sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
        self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)

    def test_backslashescape(self):
        # Does the same as the "unicode-escape" encoding, but with different
        # base encodings.
        sin = u"a\xac\u1234\u20ac\u8000"
        if sys.maxunicode > 0xffff:
            sin += unichr(sys.maxunicode)
        sout = "a\\xac\\u1234\\u20ac\\u8000"
        if sys.maxunicode > 0xffff:
            sout += "\\U%08x" % sys.maxunicode
        self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)

        sout = "a\xac\\u1234\\u20ac\\u8000"
        if sys.maxunicode > 0xffff:
            sout += "\\U%08x" % sys.maxunicode
        self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)

        sout = "a\xac\\u1234\xa4\\u8000"
        if sys.maxunicode > 0xffff:
            sout += "\\U%08x" % sys.maxunicode
        self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)

    def test_decoderelaxedutf8(self):
        # This is the test for a decoding callback handler,
        # that relaxes the UTF-8 minimal encoding restriction.
        # A null byte that is encoded as "\xc0\x80" will be
        # decoded as a null byte. All other illegal sequences
        # will be handled strictly.
        def relaxedutf8(exc):
            if not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            if exc.object[exc.start:exc.end].startswith("\xc0\x80"):
                return (u"\x00", exc.start+2) # retry after two bytes
            else:
                raise exc

        codecs.register_error(
            "test.relaxedutf8", relaxedutf8)

        sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
        sout = u"a\x00b\x00c\xfc\x00\x00"
        self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
        sin = "\xc0\x80\xc0\x81"
        self.assertRaises(UnicodeError, sin.decode, "utf-8", "test.relaxedutf8")

    def test_charmapencode(self):
        # For charmap encodings the replacement string will be
        # mapped through the encoding again. This means, that
        # to be able to use e.g. the "replace" handler, the
        # charmap has to have a mapping for "?".
        charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"])
        sin = u"abc"
        sout = "AABBCC"
        self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)

        sin = u"abcA"
        self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)

        charmap[ord("?")] = "XYZ"
        sin = u"abcDEF"
        sout = "AABBCCXYZXYZXYZ"
        self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)

        # a unicode mapping target for "?" must be rejected
        charmap[ord("?")] = u"XYZ"
        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)

    def test_decodeunicodeinternal(self):
        # five bytes are not a whole number of code units, so strict
        # decoding has to fail
        self.assertRaises(
            UnicodeDecodeError,
            "\x00\x00\x00\x00\x00".decode,
            "unicode-internal",
        )
        if sys.maxunicode > 0xffff:
            def handler_unicodeinternal(exc):
                if not isinstance(exc, UnicodeDecodeError):
                    raise TypeError("don't know how to handle %r" % exc)
                return (u"\x01", 1)

            self.assertEqual(
                "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
                u"\u0000"
            )

            self.assertEqual(
                "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
                u"\u0000\ufffd"
            )

            codecs.register_error("test.hui", handler_unicodeinternal)

            self.assertEqual(
                "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
                u"\u0000\u0001\u0000"
            )

    def test_callbacks(self):
        # handler1 accepts both encode and decode errors and resumes
        # directly after the offending range
        def handler1(exc):
            if not isinstance(exc, UnicodeEncodeError) \
               and not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
            return (u"[%s]" % u"".join(l), exc.end)

        codecs.register_error("test.handler1", handler1)

        # handler2 is decode-only and resumes one position further on
        def handler2(exc):
            if not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
            return (u"[%s]" % u"".join(l), exc.end+1) # skip one character

        codecs.register_error("test.handler2", handler2)

        s = "\x00\x81\x7f\x80\xff"

        self.assertEqual(
            s.decode("ascii", "test.handler1"),
            u"\x00[<129>]\x7f[<128>][<255>]"
        )
        self.assertEqual(
            s.decode("ascii", "test.handler2"),
            u"\x00[<129>][<128>]"
        )

        # The second, truncated escape is spelled "\\u3xxx" so that the
        # literal does not depend on "\u" being an unrecognised escape in
        # byte strings; the resulting bytes are the same.
        self.assertEqual(
            "\\u3042\\u3xxx".decode("unicode-escape", "test.handler1"),
            u"\u3042[<92><117><51><120>]xx"
        )

        self.assertEqual(
            "\\u3042\\u3xx".decode("unicode-escape", "test.handler1"),
            u"\u3042[<92><117><51><120><120>]"
        )

        self.assertEqual(
            codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0],
            u"z[<98>][<99>]"
        )

        self.assertEqual(
            u"g\xfc\xdfrk".encode("ascii", "test.handler1"),
            u"g[<252><223>]rk"
        )

        self.assertEqual(
            u"g\xfc\xdf".encode("ascii", "test.handler1"),
            u"g[<252><223>]"
        )

    def test_longstrings(self):
        # test long strings to check for memory overflow problems
        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
                   "backslashreplace"]
        # register the handlers under different names,
        # to prevent the codec from recognizing the name
        for err in errors:
            codecs.register_error("test." + err, codecs.lookup_error(err))
        l = 1000
        errors += [ "test." + err for err in errors ]
        for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
                        "utf-8", "utf-7", "utf-16", "utf-32"):
                for err in errors:
                    try:
                        uni.encode(enc, err)
                    except UnicodeError:
                        # only crashes matter here, not the outcome
                        pass

    def check_exceptionobjectargs(self, exctype, args, msg):
        # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
        #
        # exctype -- the exception class under test
        # args    -- list of valid constructor arguments
        # msg     -- expected str() of the exception built from args

        # check with one missing argument
        self.assertRaises(TypeError, exctype, *args[:-1])
        # check with one argument too much
        self.assertRaises(TypeError, exctype, *(args + ["too much"]))
        # check with one argument of the wrong type
        wrongargs = [ "spam", u"eggs", 42, 1.0, None ]
        for i, goodarg in enumerate(args):
            for wrongarg in wrongargs:
                if type(wrongarg) is type(goodarg):
                    continue
                # build argument array: only slot i is replaced, every
                # other slot keeps its own valid value
                callargs = list(args)
                callargs[i] = wrongarg
                self.assertRaises(TypeError, exctype, *callargs)

        # check with the correct number and type of arguments
        exc = exctype(*args)
        self.assertEqual(str(exc), msg)

    def test_unicodeencodeerror(self):
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"g\xfcrk", 1, 2, "ouch"],
            "'ascii' codec can't encode character u'\\xfc' in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"g\xfcrk", 1, 4, "ouch"],
            "'ascii' codec can't encode characters in position 1-3: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"\xfcx", 0, 1, "ouch"],
            "'ascii' codec can't encode character u'\\xfc' in position 0: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"\u0100x", 0, 1, "ouch"],
            "'ascii' codec can't encode character u'\\u0100' in position 0: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"\uffffx", 0, 1, "ouch"],
            "'ascii' codec can't encode character u'\\uffff' in position 0: ouch"
        )
        if sys.maxunicode > 0xffff:
            self.check_exceptionobjectargs(
                UnicodeEncodeError,
                ["ascii", u"\U00010000x", 0, 1, "ouch"],
                "'ascii' codec can't encode character u'\\U00010000' in position 0: ouch"
            )

    def test_unicodedecodeerror(self):
        self.check_exceptionobjectargs(
            UnicodeDecodeError,
            ["ascii", "g\xfcrk", 1, 2, "ouch"],
            "'ascii' codec can't decode byte 0xfc in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeDecodeError,
            ["ascii", "g\xfcrk", 1, 3, "ouch"],
            "'ascii' codec can't decode bytes in position 1-2: ouch"
        )

    def test_unicodetranslateerror(self):
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            [u"g\xfcrk", 1, 2, "ouch"],
            "can't translate character u'\\xfc' in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            [u"g\u0100rk", 1, 2, "ouch"],
            "can't translate character u'\\u0100' in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            [u"g\uffffrk", 1, 2, "ouch"],
            "can't translate character u'\\uffff' in position 1: ouch"
        )
        if sys.maxunicode > 0xffff:
            self.check_exceptionobjectargs(
                UnicodeTranslateError,
                [u"g\U00010000rk", 1, 2, "ouch"],
                "can't translate character u'\\U00010000' in position 1: ouch"
            )
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            [u"g\xfcrk", 1, 3, "ouch"],
            "can't translate characters in position 1-2: ouch"
        )

    def test_badandgoodstrictexceptions(self):
        # "strict" complains about a non-exception passed in
        self.assertRaises(
            TypeError,
            codecs.strict_errors,
            42
        )
        # "strict" complains about the wrong exception type
        self.assertRaises(
            Exception,
            codecs.strict_errors,
            Exception("ouch")
        )

        # If the correct exception is passed in, "strict" raises it
        self.assertRaises(
            UnicodeEncodeError,
            codecs.strict_errors,
            UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")
        )

    def test_badandgoodignoreexceptions(self):
        # "ignore" complains about a non-exception passed in
        self.assertRaises(
           TypeError,
           codecs.ignore_errors,
           42
        )
        # "ignore" complains about the wrong exception type
        self.assertRaises(
           TypeError,
           codecs.ignore_errors,
           UnicodeError("ouch")
        )
        # If the correct exception is passed in, "ignore" returns an empty replacement
        self.assertEqual(
            codecs.ignore_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
            (u"", 1)
        )
        self.assertEqual(
            codecs.ignore_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
            (u"", 1)
        )
        self.assertEqual(
            codecs.ignore_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
            (u"", 1)
        )

    def test_badandgoodreplaceexceptions(self):
        # "replace" complains about a non-exception passed in
        self.assertRaises(
           TypeError,
           codecs.replace_errors,
           42
        )
        # "replace" complains about the wrong exception type
        self.assertRaises(
           TypeError,
           codecs.replace_errors,
           UnicodeError("ouch")
        )
        self.assertRaises(
            TypeError,
            codecs.replace_errors,
            BadObjectUnicodeEncodeError()
        )
        self.assertRaises(
            TypeError,
            codecs.replace_errors,
            BadObjectUnicodeDecodeError()
        )
        # With the correct exception, "replace" returns an "?" or u"\ufffd" replacement
        self.assertEqual(
            codecs.replace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
            (u"?", 1)
        )
        self.assertEqual(
            codecs.replace_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
            (u"\ufffd", 1)
        )
        self.assertEqual(
            codecs.replace_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
            (u"\ufffd", 1)
        )

    def test_badandgoodxmlcharrefreplaceexceptions(self):
        # "xmlcharrefreplace" complains about a non-exception passed in
        self.assertRaises(
           TypeError,
           codecs.xmlcharrefreplace_errors,
           42
        )
        # "xmlcharrefreplace" complains about the wrong exception types
        self.assertRaises(
           TypeError,
           codecs.xmlcharrefreplace_errors,
           UnicodeError("ouch")
        )
        # "xmlcharrefreplace" can only be used for encoding
        self.assertRaises(
            TypeError,
            codecs.xmlcharrefreplace_errors,
            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
        )
        self.assertRaises(
            TypeError,
            codecs.xmlcharrefreplace_errors,
            UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
        )
        # Use the correct exception
        cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 0x3042)
        s = "".join(unichr(c) for c in cs)
        self.assertEqual(
            codecs.xmlcharrefreplace_errors(
                UnicodeEncodeError("ascii", s, 0, len(s), "ouch")
            ),
            (u"".join(u"&#%d;" % ord(c) for c in s), len(s))
        )

    def test_badandgoodbackslashreplaceexceptions(self):
        # "backslashreplace" complains about a non-exception passed in
        self.assertRaises(
           TypeError,
           codecs.backslashreplace_errors,
           42
        )
        # "backslashreplace" complains about the wrong exception types
        self.assertRaises(
           TypeError,
           codecs.backslashreplace_errors,
           UnicodeError("ouch")
        )
        # "backslashreplace" can only be used for encoding
        self.assertRaises(
            TypeError,
            codecs.backslashreplace_errors,
            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
        )
        self.assertRaises(
            TypeError,
            codecs.backslashreplace_errors,
            UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
        )
        # Use the correct exception
        self.assertEqual(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
            (u"\\u3042", 1)
        )
        self.assertEqual(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\x00", 0, 1, "ouch")),
            (u"\\x00", 1)
        )
        self.assertEqual(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\xff", 0, 1, "ouch")),
            (u"\\xff", 1)
        )
        self.assertEqual(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u0100", 0, 1, "ouch")),
            (u"\\u0100", 1)
        )
        self.assertEqual(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\uffff", 0, 1, "ouch")),
            (u"\\uffff", 1)
        )
        if sys.maxunicode>0xffff:
            self.assertEqual(
                codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U00010000", 0, 1, "ouch")),
                (u"\\U00010000", 1)
            )
            self.assertEqual(
                codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U0010ffff", 0, 1, "ouch")),
                (u"\\U0010ffff", 1)
            )

    def test_badhandlerresults(self):
        # Every value in results is an invalid handler return value; the
        # codecs must reject each one with a TypeError.
        results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
        encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")

        for res in results:
            # The handler has to accept the exception argument; otherwise
            # the TypeError would come from calling the handler itself and
            # the bad return value would never reach the codec.
            codecs.register_error("test.badhandler", lambda x: res)
            for enc in encs:
                self.assertRaises(
                    TypeError,
                    u"\u3042".encode,
                    enc,
                    "test.badhandler"
                )
            for (enc, bytes) in (
                ("ascii", "\xff"),
                ("utf-8", "\xff"),
                ("utf-7", "+x-"),
                ("unicode-internal", "\x00"),
            ):
                self.assertRaises(
                    TypeError,
                    bytes.decode,
                    enc,
                    "test.badhandler"
                )

    def test_lookup(self):
        # each built-in handler must be reachable under its registered name
        self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
        self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
        self.assertEqual(codecs.replace_errors, codecs.lookup_error("replace"))
        self.assertEqual(
            codecs.xmlcharrefreplace_errors,
            codecs.lookup_error("xmlcharrefreplace")
        )
        self.assertEqual(
            codecs.backslashreplace_errors,
            codecs.lookup_error("backslashreplace")
        )

    def test_unencodablereplacement(self):
        # the handler's replacement is itself unencodable in these encodings
        def unencrepl(exc):
            if isinstance(exc, UnicodeEncodeError):
                return (u"\u4242", exc.end)
            else:
                raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error("test.unencreplhandler", unencrepl)
        for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
            self.assertRaises(
                UnicodeEncodeError,
                u"\u4242".encode,
                enc,
                "test.unencreplhandler"
            )

    def test_badregistercall(self):
        # enhance coverage of:
        # Modules/_codecsmodule.c::register_error()
        # Python/codecs.c::PyCodec_RegisterError()
        self.assertRaises(TypeError, codecs.register_error, 42)
        self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)

    def test_badlookupcall(self):
        # enhance coverage of:
        # Modules/_codecsmodule.c::lookup_error()
        self.assertRaises(TypeError, codecs.lookup_error)

    def test_unknownhandler(self):
        # enhance coverage of:
        # Modules/_codecsmodule.c::lookup_error()
        self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")

    def test_xmlcharrefvalues(self):
        # enhance coverage of:
        # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
        # and inline implementations
        v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000)
        if sys.maxunicode>=100000:
            v += (100000, 500000, 1000000)
        s = u"".join([unichr(x) for x in v])
        codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
        for enc in ("ascii", "iso-8859-15"):
            for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
                s.encode(enc, err)

    def test_decodehelper(self):
        # enhance coverage of:
        # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
        # and callers
        self.assertRaises(LookupError, "\xff".decode, "ascii", "test.unknown")

        def baddecodereturn1(exc):
            return 42
        codecs.register_error("test.baddecodereturn1", baddecodereturn1)
        self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn1")
        self.assertRaises(TypeError, "\\".decode, "unicode-escape", "test.baddecodereturn1")
        self.assertRaises(TypeError, "\\x0".decode, "unicode-escape", "test.baddecodereturn1")
        self.assertRaises(TypeError, "\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
        self.assertRaises(TypeError, "\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
        self.assertRaises(TypeError, "\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")

        def baddecodereturn2(exc):
            return (u"?", None)
        codecs.register_error("test.baddecodereturn2", baddecodereturn2)
        self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn2")

        handler = PosReturn()
        codecs.register_error("test.posreturn", handler.handle)

        # Valid negative position
        handler.pos = -1
        self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>0")

        # Valid negative position
        handler.pos = -2
        self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?><?>")

        # Negative position out of bounds
        handler.pos = -3
        self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn")

        # Valid positive position
        handler.pos = 1
        self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>0")

        # Largest valid positive position (one beyond end of input)
        handler.pos = 2
        self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>")

        # Invalid positive position
        handler.pos = 3
        self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn")

        # Restart at the "0"
        handler.pos = 6
        self.assertEqual("\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), u"<?>0")

        # a mapping whose lookup raises something other than LookupError
        class D(dict):
            def __getitem__(self, key):
                raise ValueError
        self.assertRaises(UnicodeError, codecs.charmap_decode, "\xff", "strict", {0xff: None})
        self.assertRaises(ValueError, codecs.charmap_decode, "\xff", "strict", D())
        self.assertRaises(TypeError, codecs.charmap_decode, "\xff", "strict", {0xff: sys.maxunicode+1})

    def test_encodehelper(self):
        # enhance coverage of:
        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
        # and callers
        self.assertRaises(LookupError, u"\xff".encode, "ascii", "test.unknown")

        def badencodereturn1(exc):
            return 42
        codecs.register_error("test.badencodereturn1", badencodereturn1)
        self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn1")

        def badencodereturn2(exc):
            return (u"?", None)
        codecs.register_error("test.badencodereturn2", badencodereturn2)
        self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn2")

        handler = PosReturn()
        codecs.register_error("test.posreturn", handler.handle)

        # Valid negative position
        handler.pos = -1
        self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")

        # Valid negative position
        handler.pos = -2
        self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?><?>")

        # Negative position out of bounds
        handler.pos = -3
        self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")

        # Valid positive position
        handler.pos = 1
        self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")

        # Largest valid positive position (one beyond end of input)
        handler.pos = 2
        self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>")

        # Invalid positive position
        handler.pos = 3
        self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")

        handler.pos = 0

        # a mapping whose lookup raises something other than LookupError
        class D(dict):
            def __getitem__(self, key):
                raise ValueError
        for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
            self.assertRaises(UnicodeError, codecs.charmap_encode, u"\xff", err, {0xff: None})
            self.assertRaises(ValueError, codecs.charmap_encode, u"\xff", err, D())
            self.assertRaises(TypeError, codecs.charmap_encode, u"\xff", err, {0xff: 300})

    def test_translatehelper(self):
        # enhance coverage of:
        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
        # and callers
        # (Unfortunately the errors argument is not directly accessible
        # from Python, so we can't test that much)
        class D(dict):
            def __getitem__(self, key):
                raise ValueError
        self.assertRaises(ValueError, u"\xff".translate, D())
        self.assertRaises(TypeError, u"\xff".translate, {0xff: sys.maxunicode+1})
        self.assertRaises(TypeError, u"\xff".translate, {0xff: ()})

    def test_bug828737(self):
        # translate() with replacements longer than one character, on
        # inputs of growing size; only has to complete without error
        charmap = {
            ord("&"): u"&amp;",
            ord("<"): u"&lt;",
            ord(">"): u"&gt;",
            ord('"'): u"&quot;",
        }

        for n in (1, 10, 100, 1000):
            text = u'abc<def>ghi'*n
            text.translate(charmap)

def test_main():
    # Run every test of CodecCallbackTest through the test-support runner.
    test.test_support.run_unittest(CodecCallbackTest)

# Allow running this file directly as a script.
if __name__ == "__main__":
    test_main()