""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import __builtin__, sys

### Registry and builtin stateless codec functions

try:
    from _codecs import *
except ImportError, why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)

__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE

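# Illustrative sketch (not part of the original module): one common use of
# the BOM constants above is detecting and stripping a byte order mark from
# raw data before decoding. The file name below is hypothetical.
#
#   raw = open('data.txt', 'rb').read()
#   if raw.startswith(BOM_UTF8):
#       text = raw[len(BOM_UTF8):].decode('utf-8')
#   elif raw.startswith(BOM_UTF16_LE) or raw.startswith(BOM_UTF16_BE):
#       text = raw.decode('utf-16')   # the utf-16 codec consumes the BOM itself
#   else:
#       text = raw.decode('utf-8')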
|
### Codec base classes (defining the API)

class CodecInfo(tuple):

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
            (self.__class__.__module__, self.__class__.__name__,
             self.name, id(self))

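# Illustrative sketch (not part of the original module): a codec search
# function passed to register() is called with an encoding name and returns a
# CodecInfo, or None to let other search functions try. The alias name
# 'latin-x' used here is made up for the example.
#
#   import codecs
#
#   def _search(name):
#       if name == 'latin-x':
#           # delegate to an existing codec, but under our own name
#           real = codecs.lookup('iso8859-15')
#           return codecs.CodecInfo(real.encode, real.decode,
#                                   streamreader=real.streamreader,
#                                   streamwriter=real.streamwriter,
#                                   incrementalencoder=real.incrementalencoder,
#                                   incrementaldecoder=real.incrementaldecoder,
#                                   name='latin-x')
#       return None
#
#   codecs.register(_search)
#   u'\u20ac'.encode('latin-x')   # -> '\xa4' (the iso8859-15 euro sign)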
|
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs on
                     decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace' - Replace with backslashed escape sequences
                              (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

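# Illustrative sketch (not part of the original module): the stateless
# encode()/decode() entry points return an (output, length consumed) pair,
# which is what lookup() hands back for the builtin codecs.
#
#   >>> import codecs
#   >>> codecs.lookup('utf-8').encode(u'caf\xe9')
#   ('caf\xc3\xa9', 4)
#   >>> codecs.lookup('utf-8').decode('caf\xc3\xa9')
#   (u'caf\xe9', 5)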
|
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        self.buffer = ""  # unencoded input that is kept between calls to encode()

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input. In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete byte
    sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        self.buffer = ""  # undecoded input that is kept between calls to decode()

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]

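# Illustrative sketch (not part of the original module): an incremental
# decoder keeps incomplete byte sequences in its buffer between calls, which
# is exactly the contract BufferedIncrementalDecoder implements for its
# subclasses.
#
#   >>> import codecs
#   >>> dec = codecs.getincrementaldecoder('utf-8')()
#   >>> dec.decode('caf\xc3')        # trailing byte starts a 2-byte sequence
#   u'caf'
#   >>> dec.decode('\xa9', final=True)
#   u'\xe9'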
|
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace' - Replace with backslashed escape
                                  sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

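# Illustrative sketch (not part of the original module): a concrete
# StreamWriter is normally obtained through getwriter() (defined further
# below) and wrapped around a byte-oriented stream. The file name is
# hypothetical.
#
#   import codecs
#
#   f = open('out.txt', 'wb')
#   writer = codecs.getwriter('utf-8')(f, errors='strict')
#   writer.write(u'caf\xe9\n')      # unicode in, UTF-8 bytes out
#   writer.close()                  # attribute access falls through to f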
|
###

class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input, only the first
            line will be returned; the rest of the input will be kept
            until the next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend:  # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to find the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

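# Illustrative sketch (not part of the original module): the usual way to get
# a concrete StreamReader is via getreader() (defined further below); read(),
# readline() and iteration then yield decoded unicode objects. The file name
# and process() are stand-ins for application code.
#
#   import codecs
#
#   f = open('in.txt', 'rb')
#   reader = codecs.getreader('utf-8')(f)
#   for line in reader:             # uses the readline()/next() defined above
#       process(line)               # each line is a unicode object
#   f.close()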
|
###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface; Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def next(self):

        """ Return the next decoded line from the input stream."""
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

|
### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    info = lookup(encoding)
    srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

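# Illustrative sketch (not part of the original module): typical use of
# open(), reading and writing unicode transparently. The file name is
# hypothetical.
#
#   import codecs
#
#   f = codecs.open('notes.txt', 'w', encoding='utf-8')
#   f.write(u'caf\xe9\n')
#   f.close()
#
#   f = codecs.open('notes.txt', 'r', encoding='utf-8')
#   print repr(f.read())            # u'caf\xe9\n'
#   f.close()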
|
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

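# Illustrative sketch (not part of the original module): EncodedFile() can
# recode between two byte encodings on the fly, here Latin-1 on the caller
# side and UTF-8 in the underlying file-like object.
#
#   >>> import codecs, StringIO
#   >>> backing = StringIO.StringIO()
#   >>> wrapped = codecs.EncodedFile(backing, 'latin-1', 'utf-8')
#   >>> wrapped.write('caf\xe9')      # Latin-1 bytes from the caller...
#   >>> backing.getvalue()            # ...stored as UTF-8
#   'caf\xc3\xa9'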
|
### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    output = decoder.decode("", True)
    if output:
        yield output

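# Illustrative sketch (not part of the original module): iterdecode() runs
# the chunks of an iterable through an incremental decoder, so multi-byte
# sequences split across chunk boundaries are handled correctly.
#
#   >>> import codecs
#   >>> chunks = ['caf\xc3', '\xa9 ok']
#   >>> u''.join(codecs.iterdecode(chunks, 'utf-8'))
#   u'caf\xe9 ok'
#   >>> ''.join(codecs.iterencode([u'caf\xe9'], 'utf-8'))
#   'caf\xc3\xa9'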
|
### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if not v in m:
            m[v] = k
        else:
            m[v] = None
    return m

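# Illustrative sketch (not part of the original module): the two helpers
# above are intended for charmap-style codec modules, which pair a decoding
# map (byte value -> unicode ordinal) with the derived encoding map. The
# single override shown here (0x80 -> U+20AC) is just an example mapping.
#
#   decoding_map = make_identity_dict(range(256))
#   decoding_map.update({0x80: 0x20ac})
#   encoding_map = make_encoding_map(decoding_map)
#   # charmap_decode/charmap_encode (imported from _codecs above) consume
#   # these maps:
#   #   charmap_decode('\x80', 'strict', decoding_map) -> (u'\u20ac', 1)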
|
### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

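# Illustrative sketch (not part of the original module): register_error()
# (imported from _codecs above) installs additional error handlers next to
# the predefined ones looked up here. The handler name 'underscorereplace'
# is made up for this example.
#
#   import codecs
#
#   def underscore_replace(exc):
#       if isinstance(exc, UnicodeEncodeError):
#           # replacement text plus the position at which to resume encoding
#           return (u'_' * (exc.end - exc.start), exc.end)
#       raise exc
#
#   codecs.register_error('underscorereplace', underscore_replace)
#   u'caf\xe9'.encode('ascii', 'underscorereplace')   # -> 'caf_'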
|
# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')