symbian-qemu-0.9.1-12/python-win32-2.6.1/lib/codecs.py
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 """ codecs -- Python Codec Registry, API and helpers.
       
     2 
       
     3 
       
     4 Written by Marc-Andre Lemburg (mal@lemburg.com).
       
     5 
       
     6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
       
     7 
       
     8 """#"
       
     9 
       
    10 import __builtin__, sys
       
    11 
       
    12 ### Registry and builtin stateless codec functions
       
    13 
       
    14 try:
       
    15     from _codecs import *
       
    16 except ImportError, why:
       
    17     raise SystemError('Failed to load the builtin codecs: %s' % why)
       
    18 
       
    19 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
       
    20            "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
       
    21            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
       
    22            "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
       
    23            "strict_errors", "ignore_errors", "replace_errors",
       
    24            "xmlcharrefreplace_errors",
       
    25            "register_error", "lookup_error"]
       
    26 
       
    27 ### Constants
       
    28 
       
    29 #
       
    30 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
       
    31 # and its possible byte string values
       
    32 # for UTF8/UTF16/UTF32 output and little/big endian machines
       
    33 #
       
    34 
       
    35 # UTF-8
       
    36 BOM_UTF8 = '\xef\xbb\xbf'
       
    37 
       
    38 # UTF-16, little endian
       
    39 BOM_LE = BOM_UTF16_LE = '\xff\xfe'
       
    40 
       
    41 # UTF-16, big endian
       
    42 BOM_BE = BOM_UTF16_BE = '\xfe\xff'
       
    43 
       
    44 # UTF-32, little endian
       
    45 BOM_UTF32_LE = '\xff\xfe\x00\x00'
       
    46 
       
    47 # UTF-32, big endian
       
    48 BOM_UTF32_BE = '\x00\x00\xfe\xff'
       
    49 
       
    50 if sys.byteorder == 'little':
       
    51 
       
    52     # UTF-16, native endianness
       
    53     BOM = BOM_UTF16 = BOM_UTF16_LE
       
    54 
       
    55     # UTF-32, native endianness
       
    56     BOM_UTF32 = BOM_UTF32_LE
       
    57 
       
    58 else:
       
    59 
       
    60     # UTF-16, native endianness
       
    61     BOM = BOM_UTF16 = BOM_UTF16_BE
       
    62 
       
    63     # UTF-32, native endianness
       
    64     BOM_UTF32 = BOM_UTF32_BE
       
    65 
       
    66 # Old broken names (don't use in new code)
       
    67 BOM32_LE = BOM_UTF16_LE
       
    68 BOM32_BE = BOM_UTF16_BE
       
    69 BOM64_LE = BOM_UTF32_LE
       
    70 BOM64_BE = BOM_UTF32_BE
       
    71 
       
    72 
       
    73 ### Codec base classes (defining the API)
       
    74 
       
    75 class CodecInfo(tuple):
       
    76 
       
    77     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
       
    78         incrementalencoder=None, incrementaldecoder=None, name=None):
       
    79         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
       
    80         self.name = name
       
    81         self.encode = encode
       
    82         self.decode = decode
       
    83         self.incrementalencoder = incrementalencoder
       
    84         self.incrementaldecoder = incrementaldecoder
       
    85         self.streamwriter = streamwriter
       
    86         self.streamreader = streamreader
       
    87         return self
       
    88 
       
    89     def __repr__(self):
       
    90         return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
       
    91 
       
    92 class Codec:
       
    93 
       
    94     """ Defines the interface for stateless encoders/decoders.
       
    95 
       
    96         The .encode()/.decode() methods may use different error
       
    97         handling schemes by providing the errors argument. These
       
    98         string values are predefined:
       
    99 
       
   100          'strict' - raise a ValueError error (or a subclass)
       
   101          'ignore' - ignore the character and continue with the next
       
   102          'replace' - replace with a suitable replacement character;
       
   103                     Python will use the official U+FFFD REPLACEMENT
       
   104                     CHARACTER for the builtin Unicode codecs on
       
   105                     decoding and '?' on encoding.
       
   106          'xmlcharrefreplace' - Replace with the appropriate XML
       
   107                                character reference (only for encoding).
       
   108          'backslashreplace'  - Replace with backslashed escape sequences
       
   109                                (only for encoding).
       
   110 
       
   111         The set of allowed values can be extended via register_error.
       
   112 
       
   113     """
       
   114     def encode(self, input, errors='strict'):
       
   115 
       
   116         """ Encodes the object input and returns a tuple (output
       
   117             object, length consumed).
       
   118 
       
   119             errors defines the error handling to apply. It defaults to
       
   120             'strict' handling.
       
   121 
       
   122             The method may not store state in the Codec instance. Use
       
   123             StreamCodec for codecs which have to keep state in order to
       
   124             make encoding/decoding efficient.
       
   125 
       
   126             The encoder must be able to handle zero length input and
       
   127             return an empty object of the output object type in this
       
   128             situation.
       
   129 
       
   130         """
       
   131         raise NotImplementedError
       
   132 
       
   133     def decode(self, input, errors='strict'):
       
   134 
       
   135         """ Decodes the object input and returns a tuple (output
       
   136             object, length consumed).
       
   137 
       
   138             input must be an object which provides the bf_getreadbuf
       
   139             buffer slot. Python strings, buffer objects and memory
       
   140             mapped files are examples of objects providing this slot.
       
   141 
       
   142             errors defines the error handling to apply. It defaults to
       
   143             'strict' handling.
       
   144 
       
   145             The method may not store state in the Codec instance. Use
       
   146             StreamCodec for codecs which have to keep state in order to
       
   147             make encoding/decoding efficient.
       
   148 
       
   149             The decoder must be able to handle zero length input and
       
   150             return an empty object of the output object type in this
       
   151             situation.
       
   152 
       
   153         """
       
   154         raise NotImplementedError
       
   155 
       
   156 class IncrementalEncoder(object):
       
   157     """
       
   158     An IncrementalEncoder encodes an input in multiple steps. The input can be
       
   159     passed piece by piece to the encode() method. The IncrementalEncoder remembers
       
   160     the state of the Encoding process between calls to encode().
       
   161     """
       
   162     def __init__(self, errors='strict'):
       
   163         """
       
   164         Creates an IncrementalEncoder instance.
       
   165 
       
   166         The IncrementalEncoder may use different error handling schemes by
       
   167         providing the errors keyword argument. See the module docstring
       
   168         for a list of possible values.
       
   169         """
       
   170         self.errors = errors
       
   171         self.buffer = ""
       
   172 
       
   173     def encode(self, input, final=False):
       
   174         """
       
   175         Encodes input and returns the resulting object.
       
   176         """
       
   177         raise NotImplementedError
       
   178 
       
   179     def reset(self):
       
   180         """
       
   181         Resets the encoder to the initial state.
       
   182         """
       
   183 
       
   184     def getstate(self):
       
   185         """
       
   186         Return the current state of the encoder.
       
   187         """
       
   188         return 0
       
   189 
       
   190     def setstate(self, state):
       
   191         """
       
   192         Set the current state of the encoder. state must have been
       
   193         returned by getstate().
       
   194         """
       
   195 
       
   196 class BufferedIncrementalEncoder(IncrementalEncoder):
       
   197     """
       
   198     This subclass of IncrementalEncoder can be used as the baseclass for an
       
   199     incremental encoder if the encoder must keep some of the output in a
       
   200     buffer between calls to encode().
       
   201     """
       
   202     def __init__(self, errors='strict'):
       
   203         IncrementalEncoder.__init__(self, errors)
       
   204         self.buffer = "" # unencoded input that is kept between calls to encode()
       
   205 
       
   206     def _buffer_encode(self, input, errors, final):
       
   207         # Overwrite this method in subclasses: It must encode input
       
   208         # and return an (output, length consumed) tuple
       
   209         raise NotImplementedError
       
   210 
       
   211     def encode(self, input, final=False):
       
   212         # encode input (taking the buffer into account)
       
   213         data = self.buffer + input
       
   214         (result, consumed) = self._buffer_encode(data, self.errors, final)
       
   215         # keep unencoded input until the next call
       
   216         self.buffer = data[consumed:]
       
   217         return result
       
   218 
       
   219     def reset(self):
       
   220         IncrementalEncoder.reset(self)
       
   221         self.buffer = ""
       
   222 
       
   223     def getstate(self):
       
   224         return self.buffer or 0
       
   225 
       
   226     def setstate(self, state):
       
   227         self.buffer = state or ""
       
   228 
       
   229 class IncrementalDecoder(object):
       
   230     """
       
   231     An IncrementalDecoder decodes an input in multiple steps. The input can be
       
   232     passed piece by piece to the decode() method. The IncrementalDecoder
       
   233     remembers the state of the decoding process between calls to decode().
       
   234     """
       
   235     def __init__(self, errors='strict'):
       
   236         """
       
   237         Creates a IncrementalDecoder instance.
       
   238 
       
   239         The IncrementalDecoder may use different error handling schemes by
       
   240         providing the errors keyword argument. See the module docstring
       
   241         for a list of possible values.
       
   242         """
       
   243         self.errors = errors
       
   244 
       
   245     def decode(self, input, final=False):
       
   246         """
       
   247         Decodes input and returns the resulting object.
       
   248         """
       
   249         raise NotImplementedError
       
   250 
       
   251     def reset(self):
       
   252         """
       
   253         Resets the decoder to the initial state.
       
   254         """
       
   255 
       
   256     def getstate(self):
       
   257         """
       
   258         Return the current state of the decoder.
       
   259 
       
   260         This must be a (buffered_input, additional_state_info) tuple.
       
   261         buffered_input must be a bytes object containing bytes that
       
   262         were passed to decode() that have not yet been converted.
       
   263         additional_state_info must be a non-negative integer
       
   264         representing the state of the decoder WITHOUT yet having
       
   265         processed the contents of buffered_input.  In the initial state
       
   266         and after reset(), getstate() must return (b"", 0).
       
   267         """
       
   268         return (b"", 0)
       
   269 
       
   270     def setstate(self, state):
       
   271         """
       
   272         Set the current state of the decoder.
       
   273 
       
   274         state must have been returned by getstate().  The effect of
       
   275         setstate((b"", 0)) must be equivalent to reset().
       
   276         """
       
   277 
       
   278 class BufferedIncrementalDecoder(IncrementalDecoder):
       
   279     """
       
   280     This subclass of IncrementalDecoder can be used as the baseclass for an
       
   281     incremental decoder if the decoder must be able to handle incomplete byte
       
   282     sequences.
       
   283     """
       
   284     def __init__(self, errors='strict'):
       
   285         IncrementalDecoder.__init__(self, errors)
       
   286         self.buffer = "" # undecoded input that is kept between calls to decode()
       
   287 
       
   288     def _buffer_decode(self, input, errors, final):
       
   289         # Overwrite this method in subclasses: It must decode input
       
   290         # and return an (output, length consumed) tuple
       
   291         raise NotImplementedError
       
   292 
       
   293     def decode(self, input, final=False):
       
   294         # decode input (taking the buffer into account)
       
   295         data = self.buffer + input
       
   296         (result, consumed) = self._buffer_decode(data, self.errors, final)
       
   297         # keep undecoded input until the next call
       
   298         self.buffer = data[consumed:]
       
   299         return result
       
   300 
       
   301     def reset(self):
       
   302         IncrementalDecoder.reset(self)
       
   303         self.buffer = ""
       
   304 
       
   305     def getstate(self):
       
   306         # additional state info is always 0
       
   307         return (self.buffer, 0)
       
   308 
       
   309     def setstate(self, state):
       
   310         # ignore additional state info
       
   311         self.buffer = state[0]
       
   312 
       
   313 #
       
   314 # The StreamWriter and StreamReader class provide generic working
       
   315 # interfaces which can be used to implement new encoding submodules
       
   316 # very easily. See encodings/utf_8.py for an example on how this is
       
   317 # done.
       
   318 #
       
   319 
       
   320 class StreamWriter(Codec):
       
   321 
       
   322     def __init__(self, stream, errors='strict'):
       
   323 
       
   324         """ Creates a StreamWriter instance.
       
   325 
       
   326             stream must be a file-like object open for writing
       
   327             (binary) data.
       
   328 
       
   329             The StreamWriter may use different error handling
       
   330             schemes by providing the errors keyword argument. These
       
   331             parameters are predefined:
       
   332 
       
   333              'strict' - raise a ValueError (or a subclass)
       
   334              'ignore' - ignore the character and continue with the next
       
   335              'replace'- replace with a suitable replacement character
       
   336              'xmlcharrefreplace' - Replace with the appropriate XML
       
   337                                    character reference.
       
   338              'backslashreplace'  - Replace with backslashed escape
       
   339                                    sequences (only for encoding).
       
   340 
       
   341             The set of allowed parameter values can be extended via
       
   342             register_error.
       
   343         """
       
   344         self.stream = stream
       
   345         self.errors = errors
       
   346 
       
   347     def write(self, object):
       
   348 
       
   349         """ Writes the object's contents encoded to self.stream.
       
   350         """
       
   351         data, consumed = self.encode(object, self.errors)
       
   352         self.stream.write(data)
       
   353 
       
   354     def writelines(self, list):
       
   355 
       
   356         """ Writes the concatenated list of strings to the stream
       
   357             using .write().
       
   358         """
       
   359         self.write(''.join(list))
       
   360 
       
   361     def reset(self):
       
   362 
       
   363         """ Flushes and resets the codec buffers used for keeping state.
       
   364 
       
   365             Calling this method should ensure that the data on the
       
   366             output is put into a clean state, that allows appending
       
   367             of new fresh data without having to rescan the whole
       
   368             stream to recover state.
       
   369 
       
   370         """
       
   371         pass
       
   372 
       
   373     def __getattr__(self, name,
       
   374                     getattr=getattr):
       
   375 
       
   376         """ Inherit all other methods from the underlying stream.
       
   377         """
       
   378         return getattr(self.stream, name)
       
   379 
       
   380     def __enter__(self):
       
   381         return self
       
   382 
       
   383     def __exit__(self, type, value, tb):
       
   384         self.stream.close()
       
   385 
       
   386 ###
       
   387 
       
   388 class StreamReader(Codec):
       
   389 
       
   390     def __init__(self, stream, errors='strict'):
       
   391 
       
   392         """ Creates a StreamReader instance.
       
   393 
       
   394             stream must be a file-like object open for reading
       
   395             (binary) data.
       
   396 
       
   397             The StreamReader may use different error handling
       
   398             schemes by providing the errors keyword argument. These
       
   399             parameters are predefined:
       
   400 
       
   401              'strict' - raise a ValueError (or a subclass)
       
   402              'ignore' - ignore the character and continue with the next
       
   403              'replace'- replace with a suitable replacement character;
       
   404 
       
   405             The set of allowed parameter values can be extended via
       
   406             register_error.
       
   407         """
       
   408         self.stream = stream
       
   409         self.errors = errors
       
   410         self.bytebuffer = ""
       
   411         # For str->str decoding this will stay a str
       
   412         # For str->unicode decoding the first read will promote it to unicode
       
   413         self.charbuffer = ""
       
   414         self.linebuffer = None
       
   415 
       
   416     def decode(self, input, errors='strict'):
       
   417         raise NotImplementedError
       
   418 
       
   419     def read(self, size=-1, chars=-1, firstline=False):
       
   420 
       
   421         """ Decodes data from the stream self.stream and returns the
       
   422             resulting object.
       
   423 
       
   424             chars indicates the number of characters to read from the
       
   425             stream. read() will never return more than chars
       
   426             characters, but it might return less, if there are not enough
       
   427             characters available.
       
   428 
       
   429             size indicates the approximate maximum number of bytes to
       
   430             read from the stream for decoding purposes. The decoder
       
   431             can modify this setting as appropriate. The default value
       
   432             -1 indicates to read and decode as much as possible.  size
       
   433             is intended to prevent having to decode huge files in one
       
   434             step.
       
   435 
       
   436             If firstline is true, and a UnicodeDecodeError happens
       
   437             after the first line terminator in the input only the first line
       
   438             will be returned, the rest of the input will be kept until the
       
   439             next call to read().
       
   440 
       
   441             The method should use a greedy read strategy meaning that
       
   442             it should read as much data as is allowed within the
       
   443             definition of the encoding and the given size, e.g.  if
       
   444             optional encoding endings or state markers are available
       
   445             on the stream, these should be read too.
       
   446         """
       
   447         # If we have lines cached, first merge them back into characters
       
   448         if self.linebuffer:
       
   449             self.charbuffer = "".join(self.linebuffer)
       
   450             self.linebuffer = None
       
   451 
       
   452         # read until we get the required number of characters (if available)
       
   453         while True:
       
   454             # can the request can be satisfied from the character buffer?
       
   455             if chars < 0:
       
   456                 if size < 0:
       
   457                     if self.charbuffer:
       
   458                         break
       
   459                 elif len(self.charbuffer) >= size:
       
   460                     break
       
   461             else:
       
   462                 if len(self.charbuffer) >= chars:
       
   463                     break
       
   464             # we need more data
       
   465             if size < 0:
       
   466                 newdata = self.stream.read()
       
   467             else:
       
   468                 newdata = self.stream.read(size)
       
   469             # decode bytes (those remaining from the last call included)
       
   470             data = self.bytebuffer + newdata
       
   471             try:
       
   472                 newchars, decodedbytes = self.decode(data, self.errors)
       
   473             except UnicodeDecodeError, exc:
       
   474                 if firstline:
       
   475                     newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
       
   476                     lines = newchars.splitlines(True)
       
   477                     if len(lines)<=1:
       
   478                         raise
       
   479                 else:
       
   480                     raise
       
   481             # keep undecoded bytes until the next call
       
   482             self.bytebuffer = data[decodedbytes:]
       
   483             # put new characters in the character buffer
       
   484             self.charbuffer += newchars
       
   485             # there was no data available
       
   486             if not newdata:
       
   487                 break
       
   488         if chars < 0:
       
   489             # Return everything we've got
       
   490             result = self.charbuffer
       
   491             self.charbuffer = ""
       
   492         else:
       
   493             # Return the first chars characters
       
   494             result = self.charbuffer[:chars]
       
   495             self.charbuffer = self.charbuffer[chars:]
       
   496         return result
       
   497 
       
   498     def readline(self, size=None, keepends=True):
       
   499 
       
   500         """ Read one line from the input stream and return the
       
   501             decoded data.
       
   502 
       
   503             size, if given, is passed as size argument to the
       
   504             read() method.
       
   505 
       
   506         """
       
   507         # If we have lines cached from an earlier read, return
       
   508         # them unconditionally
       
   509         if self.linebuffer:
       
   510             line = self.linebuffer[0]
       
   511             del self.linebuffer[0]
       
   512             if len(self.linebuffer) == 1:
       
   513                 # revert to charbuffer mode; we might need more data
       
   514                 # next time
       
   515                 self.charbuffer = self.linebuffer[0]
       
   516                 self.linebuffer = None
       
   517             if not keepends:
       
   518                 line = line.splitlines(False)[0]
       
   519             return line
       
   520 
       
   521         readsize = size or 72
       
   522         line = ""
       
   523         # If size is given, we call read() only once
       
   524         while True:
       
   525             data = self.read(readsize, firstline=True)
       
   526             if data:
       
   527                 # If we're at a "\r" read one extra character (which might
       
   528                 # be a "\n") to get a proper line ending. If the stream is
       
   529                 # temporarily exhausted we return the wrong line ending.
       
   530                 if data.endswith("\r"):
       
   531                     data += self.read(size=1, chars=1)
       
   532 
       
   533             line += data
       
   534             lines = line.splitlines(True)
       
   535             if lines:
       
   536                 if len(lines) > 1:
       
   537                     # More than one line result; the first line is a full line
       
   538                     # to return
       
   539                     line = lines[0]
       
   540                     del lines[0]
       
   541                     if len(lines) > 1:
       
   542                         # cache the remaining lines
       
   543                         lines[-1] += self.charbuffer
       
   544                         self.linebuffer = lines
       
   545                         self.charbuffer = None
       
   546                     else:
       
   547                         # only one remaining line, put it back into charbuffer
       
   548                         self.charbuffer = lines[0] + self.charbuffer
       
   549                     if not keepends:
       
   550                         line = line.splitlines(False)[0]
       
   551                     break
       
   552                 line0withend = lines[0]
       
   553                 line0withoutend = lines[0].splitlines(False)[0]
       
   554                 if line0withend != line0withoutend: # We really have a line end
       
   555                     # Put the rest back together and keep it until the next call
       
   556                     self.charbuffer = "".join(lines[1:]) + self.charbuffer
       
   557                     if keepends:
       
   558                         line = line0withend
       
   559                     else:
       
   560                         line = line0withoutend
       
   561                     break
       
   562             # we didn't get anything or this was our only try
       
   563             if not data or size is not None:
       
   564                 if line and not keepends:
       
   565                     line = line.splitlines(False)[0]
       
   566                 break
       
   567             if readsize<8000:
       
   568                 readsize *= 2
       
   569         return line
       
   570 
       
   571     def readlines(self, sizehint=None, keepends=True):
       
   572 
       
   573         """ Read all lines available on the input stream
       
   574             and return them as list of lines.
       
   575 
       
   576             Line breaks are implemented using the codec's decoder
       
   577             method and are included in the list entries.
       
   578 
       
   579             sizehint, if given, is ignored since there is no efficient
       
   580             way to finding the true end-of-line.
       
   581 
       
   582         """
       
   583         data = self.read()
       
   584         return data.splitlines(keepends)
       
   585 
       
   586     def reset(self):
       
   587 
       
   588         """ Resets the codec buffers used for keeping state.
       
   589 
       
   590             Note that no stream repositioning should take place.
       
   591             This method is primarily intended to be able to recover
       
   592             from decoding errors.
       
   593 
       
   594         """
       
   595         self.bytebuffer = ""
       
   596         self.charbuffer = u""
       
   597         self.linebuffer = None
       
   598 
       
   599     def seek(self, offset, whence=0):
       
   600         """ Set the input stream's current position.
       
   601 
       
   602             Resets the codec buffers used for keeping state.
       
   603         """
       
   604         self.reset()
       
   605         self.stream.seek(offset, whence)
       
   606 
       
   607     def next(self):
       
   608 
       
   609         """ Return the next decoded line from the input stream."""
       
   610         line = self.readline()
       
   611         if line:
       
   612             return line
       
   613         raise StopIteration
       
   614 
       
   615     def __iter__(self):
       
   616         return self
       
   617 
       
   618     def __getattr__(self, name,
       
   619                     getattr=getattr):
       
   620 
       
   621         """ Inherit all other methods from the underlying stream.
       
   622         """
       
   623         return getattr(self.stream, name)
       
   624 
       
   625     def __enter__(self):
       
   626         return self
       
   627 
       
   628     def __exit__(self, type, value, tb):
       
   629         self.stream.close()
       
   630 
       
   631 ###
       
   632 
       
   633 class StreamReaderWriter:
       
   634 
       
   635     """ StreamReaderWriter instances allow wrapping streams which
       
   636         work in both read and write modes.
       
   637 
       
   638         The design is such that one can use the factory functions
       
   639         returned by the codec.lookup() function to construct the
       
   640         instance.
       
   641 
       
   642     """
       
   643     # Optional attributes set by the file wrappers below
       
   644     encoding = 'unknown'
       
   645 
       
   646     def __init__(self, stream, Reader, Writer, errors='strict'):
       
   647 
       
   648         """ Creates a StreamReaderWriter instance.
       
   649 
       
   650             stream must be a Stream-like object.
       
   651 
       
   652             Reader, Writer must be factory functions or classes
       
   653             providing the StreamReader, StreamWriter interface resp.
       
   654 
       
   655             Error handling is done in the same way as defined for the
       
   656             StreamWriter/Readers.
       
   657 
       
   658         """
       
   659         self.stream = stream
       
   660         self.reader = Reader(stream, errors)
       
   661         self.writer = Writer(stream, errors)
       
   662         self.errors = errors
       
   663 
       
   664     def read(self, size=-1):
       
   665 
       
   666         return self.reader.read(size)
       
   667 
       
   668     def readline(self, size=None):
       
   669 
       
   670         return self.reader.readline(size)
       
   671 
       
   672     def readlines(self, sizehint=None):
       
   673 
       
   674         return self.reader.readlines(sizehint)
       
   675 
       
   676     def next(self):
       
   677 
       
   678         """ Return the next decoded line from the input stream."""
       
   679         return self.reader.next()
       
   680 
       
   681     def __iter__(self):
       
   682         return self
       
   683 
       
   684     def write(self, data):
       
   685 
       
   686         return self.writer.write(data)
       
   687 
       
   688     def writelines(self, list):
       
   689 
       
   690         return self.writer.writelines(list)
       
   691 
       
   692     def reset(self):
       
   693 
       
   694         self.reader.reset()
       
   695         self.writer.reset()
       
   696 
       
   697     def __getattr__(self, name,
       
   698                     getattr=getattr):
       
   699 
       
   700         """ Inherit all other methods from the underlying stream.
       
   701         """
       
   702         return getattr(self.stream, name)
       
   703 
       
   704     # these are needed to make "with codecs.open(...)" work properly
       
   705 
       
   706     def __enter__(self):
       
   707         return self
       
   708 
       
   709     def __exit__(self, type, value, tb):
       
   710         self.stream.close()
       
   711 
       
   712 ###
       
   713 
       
   714 class StreamRecoder:
       
   715 
       
   716     """ StreamRecoder instances provide a frontend - backend
       
   717         view of encoding data.
       
   718 
       
   719         They use the complete set of APIs returned by the
       
   720         codecs.lookup() function to implement their task.
       
   721 
       
   722         Data written to the stream is first decoded into an
       
   723         intermediate format (which is dependent on the given codec
       
   724         combination) and then written to the stream using an instance
       
   725         of the provided Writer class.
       
   726 
       
   727         In the other direction, data is read from the stream using a
       
   728         Reader instance and then return encoded data to the caller.
       
   729 
       
   730     """
       
   731     # Optional attributes set by the file wrappers below
       
   732     data_encoding = 'unknown'
       
   733     file_encoding = 'unknown'
       
   734 
       
   735     def __init__(self, stream, encode, decode, Reader, Writer,
       
   736                  errors='strict'):
       
   737 
       
   738         """ Creates a StreamRecoder instance which implements a two-way
       
   739             conversion: encode and decode work on the frontend (the
       
   740             input to .read() and output of .write()) while
       
   741             Reader and Writer work on the backend (reading and
       
   742             writing to the stream).
       
   743 
       
   744             You can use these objects to do transparent direct
       
   745             recodings from e.g. latin-1 to utf-8 and back.
       
   746 
       
   747             stream must be a file-like object.
       
   748 
       
   749             encode, decode must adhere to the Codec interface, Reader,
       
   750             Writer must be factory functions or classes providing the
       
   751             StreamReader, StreamWriter interface resp.
       
   752 
       
   753             encode and decode are needed for the frontend translation,
       
   754             Reader and Writer for the backend translation. Unicode is
       
   755             used as intermediate encoding.
       
   756 
       
   757             Error handling is done in the same way as defined for the
       
   758             StreamWriter/Readers.
       
   759 
       
   760         """
       
   761         self.stream = stream
       
   762         self.encode = encode
       
   763         self.decode = decode
       
   764         self.reader = Reader(stream, errors)
       
   765         self.writer = Writer(stream, errors)
       
   766         self.errors = errors
       
   767 
       
   768     def read(self, size=-1):
       
   769 
       
   770         data = self.reader.read(size)
       
   771         data, bytesencoded = self.encode(data, self.errors)
       
   772         return data
       
   773 
       
   774     def readline(self, size=None):
       
   775 
       
   776         if size is None:
       
   777             data = self.reader.readline()
       
   778         else:
       
   779             data = self.reader.readline(size)
       
   780         data, bytesencoded = self.encode(data, self.errors)
       
   781         return data
       
   782 
       
   783     def readlines(self, sizehint=None):
       
   784 
       
   785         data = self.reader.read()
       
   786         data, bytesencoded = self.encode(data, self.errors)
       
   787         return data.splitlines(1)
       
   788 
       
   789     def next(self):
       
   790 
       
   791         """ Return the next decoded line from the input stream."""
       
   792         data = self.reader.next()
       
   793         data, bytesencoded = self.encode(data, self.errors)
       
   794         return data
       
   795 
       
   796     def __iter__(self):
       
   797         return self
       
   798 
       
   799     def write(self, data):
       
   800 
       
   801         data, bytesdecoded = self.decode(data, self.errors)
       
   802         return self.writer.write(data)
       
   803 
       
   804     def writelines(self, list):
       
   805 
       
   806         data = ''.join(list)
       
   807         data, bytesdecoded = self.decode(data, self.errors)
       
   808         return self.writer.write(data)
       
   809 
       
   810     def reset(self):
       
   811 
       
   812         self.reader.reset()
       
   813         self.writer.reset()
       
   814 
       
   815     def __getattr__(self, name,
       
   816                     getattr=getattr):
       
   817 
       
   818         """ Inherit all other methods from the underlying stream.
       
   819         """
       
   820         return getattr(self.stream, name)
       
   821 
       
   822     def __enter__(self):
       
   823         return self
       
   824 
       
   825     def __exit__(self, type, value, tb):
       
   826         self.stream.close()
       
   827 
       
   828 ### Shortcuts
       
   829 
       
   830 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
       
   831 
       
   832     """ Open an encoded file using the given mode and return
       
   833         a wrapped version providing transparent encoding/decoding.
       
   834 
       
   835         Note: The wrapped version will only accept the object format
       
   836         defined by the codecs, i.e. Unicode objects for most builtin
       
   837         codecs. Output is also codec dependent and will usually be
       
   838         Unicode as well.
       
   839 
       
   840         Files are always opened in binary mode, even if no binary mode
       
   841         was specified. This is done to avoid data loss due to encodings
       
   842         using 8-bit values. The default file mode is 'rb' meaning to
       
   843         open the file in binary read mode.
       
   844 
       
   845         encoding specifies the encoding which is to be used for the
       
   846         file.
       
   847 
       
   848         errors may be given to define the error handling. It defaults
       
   849         to 'strict' which causes ValueErrors to be raised in case an
       
   850         encoding error occurs.
       
   851 
       
   852         buffering has the same meaning as for the builtin open() API.
       
   853         It defaults to line buffered.
       
   854 
       
   855         The returned wrapped file object provides an extra attribute
       
   856         .encoding which allows querying the used encoding. This
       
   857         attribute is only available if an encoding was specified as
       
   858         parameter.
       
   859 
       
   860     """
       
   861     if encoding is not None and \
       
   862        'b' not in mode:
       
   863         # Force opening of the file in binary mode
       
   864         mode = mode + 'b'
       
   865     file = __builtin__.open(filename, mode, buffering)
       
   866     if encoding is None:
       
   867         return file
       
   868     info = lookup(encoding)
       
   869     srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
       
   870     # Add attributes to simplify introspection
       
   871     srw.encoding = encoding
       
   872     return srw
       
   873 
       
   874 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
       
   875 
       
   876     """ Return a wrapped version of file which provides transparent
       
   877         encoding translation.
       
   878 
       
   879         Strings written to the wrapped file are interpreted according
       
   880         to the given data_encoding and then written to the original
       
   881         file as string using file_encoding. The intermediate encoding
       
   882         will usually be Unicode but depends on the specified codecs.
       
   883 
       
   884         Strings are read from the file using file_encoding and then
       
   885         passed back to the caller as string using data_encoding.
       
   886 
       
   887         If file_encoding is not given, it defaults to data_encoding.
       
   888 
       
   889         errors may be given to define the error handling. It defaults
       
   890         to 'strict' which causes ValueErrors to be raised in case an
       
   891         encoding error occurs.
       
   892 
       
   893         The returned wrapped file object provides two extra attributes
       
   894         .data_encoding and .file_encoding which reflect the given
       
   895         parameters of the same name. The attributes can be used for
       
   896         introspection by Python programs.
       
   897 
       
   898     """
       
   899     if file_encoding is None:
       
   900         file_encoding = data_encoding
       
   901     data_info = lookup(data_encoding)
       
   902     file_info = lookup(file_encoding)
       
   903     sr = StreamRecoder(file, data_info.encode, data_info.decode,
       
   904                        file_info.streamreader, file_info.streamwriter, errors)
       
   905     # Add attributes to simplify introspection
       
   906     sr.data_encoding = data_encoding
       
   907     sr.file_encoding = file_encoding
       
   908     return sr
       
   909 
       
   910 ### Helpers for codec lookup
       
   911 
       
   912 def getencoder(encoding):
       
   913 
       
   914     """ Lookup up the codec for the given encoding and return
       
   915         its encoder function.
       
   916 
       
   917         Raises a LookupError in case the encoding cannot be found.
       
   918 
       
   919     """
       
   920     return lookup(encoding).encode
       
   921 
       
   922 def getdecoder(encoding):
       
   923 
       
   924     """ Lookup up the codec for the given encoding and return
       
   925         its decoder function.
       
   926 
       
   927         Raises a LookupError in case the encoding cannot be found.
       
   928 
       
   929     """
       
   930     return lookup(encoding).decode
       
   931 
       
   932 def getincrementalencoder(encoding):
       
   933 
       
   934     """ Lookup up the codec for the given encoding and return
       
   935         its IncrementalEncoder class or factory function.
       
   936 
       
   937         Raises a LookupError in case the encoding cannot be found
       
   938         or the codecs doesn't provide an incremental encoder.
       
   939 
       
   940     """
       
   941     encoder = lookup(encoding).incrementalencoder
       
   942     if encoder is None:
       
   943         raise LookupError(encoding)
       
   944     return encoder
       
   945 
       
   946 def getincrementaldecoder(encoding):
       
   947 
       
   948     """ Lookup up the codec for the given encoding and return
       
   949         its IncrementalDecoder class or factory function.
       
   950 
       
   951         Raises a LookupError in case the encoding cannot be found
       
   952         or the codecs doesn't provide an incremental decoder.
       
   953 
       
   954     """
       
   955     decoder = lookup(encoding).incrementaldecoder
       
   956     if decoder is None:
       
   957         raise LookupError(encoding)
       
   958     return decoder
       
   959 
       
   960 def getreader(encoding):
       
   961 
       
   962     """ Lookup up the codec for the given encoding and return
       
   963         its StreamReader class or factory function.
       
   964 
       
   965         Raises a LookupError in case the encoding cannot be found.
       
   966 
       
   967     """
       
   968     return lookup(encoding).streamreader
       
   969 
       
   970 def getwriter(encoding):
       
   971 
       
   972     """ Lookup up the codec for the given encoding and return
       
   973         its StreamWriter class or factory function.
       
   974 
       
   975         Raises a LookupError in case the encoding cannot be found.
       
   976 
       
   977     """
       
   978     return lookup(encoding).streamwriter
       
   979 
       
   980 def iterencode(iterator, encoding, errors='strict', **kwargs):
       
   981     """
       
   982     Encoding iterator.
       
   983 
       
   984     Encodes the input strings from the iterator using a IncrementalEncoder.
       
   985 
       
   986     errors and kwargs are passed through to the IncrementalEncoder
       
   987     constructor.
       
   988     """
       
   989     encoder = getincrementalencoder(encoding)(errors, **kwargs)
       
   990     for input in iterator:
       
   991         output = encoder.encode(input)
       
   992         if output:
       
   993             yield output
       
   994     output = encoder.encode("", True)
       
   995     if output:
       
   996         yield output
       
   997 
       
   998 def iterdecode(iterator, encoding, errors='strict', **kwargs):
       
   999     """
       
  1000     Decoding iterator.
       
  1001 
       
  1002     Decodes the input strings from the iterator using a IncrementalDecoder.
       
  1003 
       
  1004     errors and kwargs are passed through to the IncrementalDecoder
       
  1005     constructor.
       
  1006     """
       
  1007     decoder = getincrementaldecoder(encoding)(errors, **kwargs)
       
  1008     for input in iterator:
       
  1009         output = decoder.decode(input)
       
  1010         if output:
       
  1011             yield output
       
  1012     output = decoder.decode("", True)
       
  1013     if output:
       
  1014         yield output
       
  1015 
       
  1016 ### Helpers for charmap-based codecs
       
  1017 
       
  1018 def make_identity_dict(rng):
       
  1019 
       
  1020     """ make_identity_dict(rng) -> dict
       
  1021 
       
  1022         Return a dictionary where elements of the rng sequence are
       
  1023         mapped to themselves.
       
  1024 
       
  1025     """
       
  1026     res = {}
       
  1027     for i in rng:
       
  1028         res[i]=i
       
  1029     return res
       
  1030 
       
  1031 def make_encoding_map(decoding_map):
       
  1032 
       
  1033     """ Creates an encoding map from a decoding map.
       
  1034 
       
  1035         If a target mapping in the decoding map occurs multiple
       
  1036         times, then that target is mapped to None (undefined mapping),
       
  1037         causing an exception when encountered by the charmap codec
       
  1038         during translation.
       
  1039 
       
  1040         One example where this happens is cp875.py which decodes
       
  1041         multiple character to \u001a.
       
  1042 
       
  1043     """
       
  1044     m = {}
       
  1045     for k,v in decoding_map.items():
       
  1046         if not v in m:
       
  1047             m[v] = k
       
  1048         else:
       
  1049             m[v] = None
       
  1050     return m
       
  1051 
       
  1052 ### error handlers
       
  1053 
       
  1054 try:
       
  1055     strict_errors = lookup_error("strict")
       
  1056     ignore_errors = lookup_error("ignore")
       
  1057     replace_errors = lookup_error("replace")
       
  1058     xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
       
  1059     backslashreplace_errors = lookup_error("backslashreplace")
       
  1060 except LookupError:
       
  1061     # In --disable-unicode builds, these error handler are missing
       
  1062     strict_errors = None
       
  1063     ignore_errors = None
       
  1064     replace_errors = None
       
  1065     xmlcharrefreplace_errors = None
       
  1066     backslashreplace_errors = None
       
  1067 
       
  1068 # Tell modulefinder that using codecs probably needs the encodings
       
  1069 # package
       
  1070 _false = 0
       
  1071 if _false:
       
  1072     import encodings
       
  1073 
       
  1074 ### Tests
       
  1075 
       
  1076 if __name__ == '__main__':
       
  1077 
       
  1078     # Make stdout translate Latin-1 output into UTF-8 output
       
  1079     sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
       
  1080 
       
  1081     # Have stdin translate Latin-1 input into UTF-8 input
       
  1082     sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')