python-2.5.2/win32/Lib/gzip.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 """Functions that read and write gzipped files.
       
     2 
       
     3 The user of the file doesn't have to worry about the compression,
       
     4 but random access is not allowed."""
       
     5 
       
     6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
       
     7 
       
     8 import struct, sys, time
       
     9 import zlib
       
    10 import __builtin__
       
    11 
       
    12 __all__ = ["GzipFile","open"]
       
    13 
       
    14 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
       
    15 
       
    16 READ, WRITE = 1, 2
       
    17 
       
    18 def U32(i):
       
    19     """Return i as an unsigned integer, assuming it fits in 32 bits.
       
    20 
       
    21     If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
       
    22     """
       
    23     if i < 0:
       
    24         i += 1L << 32
       
    25     return i
       
    26 
       
    27 def LOWU32(i):
       
    28     """Return the low-order 32 bits of an int, as a non-negative int."""
       
    29     return i & 0xFFFFFFFFL
       
    30 
       
    31 def write32(output, value):
       
    32     output.write(struct.pack("<l", value))
       
    33 
       
    34 def write32u(output, value):
       
    35     # The L format writes the bit pattern correctly whether signed
       
    36     # or unsigned.
       
    37     output.write(struct.pack("<L", value))
       
    38 
       
    39 def read32(input):
       
    40     return struct.unpack("<l", input.read(4))[0]
       
    41 
       
    42 def open(filename, mode="rb", compresslevel=9):
       
    43     """Shorthand for GzipFile(filename, mode, compresslevel).
       
    44 
       
    45     The filename argument is required; mode defaults to 'rb'
       
    46     and compresslevel defaults to 9.
       
    47 
       
    48     """
       
    49     return GzipFile(filename, mode, compresslevel)
       
    50 
       
    51 class GzipFile:
       
    52     """The GzipFile class simulates most of the methods of a file object with
       
    53     the exception of the readinto() and truncate() methods.
       
    54 
       
    55     """
       
    56 
       
    57     myfileobj = None
       
    58     max_read_chunk = 10 * 1024 * 1024   # 10Mb
       
    59 
       
    60     def __init__(self, filename=None, mode=None,
       
    61                  compresslevel=9, fileobj=None):
       
    62         """Constructor for the GzipFile class.
       
    63 
       
    64         At least one of fileobj and filename must be given a
       
    65         non-trivial value.
       
    66 
       
    67         The new class instance is based on fileobj, which can be a regular
       
    68         file, a StringIO object, or any other object which simulates a file.
       
    69         It defaults to None, in which case filename is opened to provide
       
    70         a file object.
       
    71 
       
    72         When fileobj is not None, the filename argument is only used to be
       
    73         included in the gzip file header, which may includes the original
       
    74         filename of the uncompressed file.  It defaults to the filename of
       
    75         fileobj, if discernible; otherwise, it defaults to the empty string,
       
    76         and in this case the original filename is not included in the header.
       
    77 
       
    78         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
       
    79         depending on whether the file will be read or written.  The default
       
    80         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
       
    81         Be aware that only the 'rb', 'ab', and 'wb' values should be used
       
    82         for cross-platform portability.
       
    83 
       
    84         The compresslevel argument is an integer from 1 to 9 controlling the
       
    85         level of compression; 1 is fastest and produces the least compression,
       
    86         and 9 is slowest and produces the most compression.  The default is 9.
       
    87 
       
    88         """
       
    89 
       
    90         # guarantee the file is opened in binary mode on platforms
       
    91         # that care about that sort of thing
       
    92         if mode and 'b' not in mode:
       
    93             mode += 'b'
       
    94         if fileobj is None:
       
    95             fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
       
    96         if filename is None:
       
    97             if hasattr(fileobj, 'name'): filename = fileobj.name
       
    98             else: filename = ''
       
    99         if mode is None:
       
   100             if hasattr(fileobj, 'mode'): mode = fileobj.mode
       
   101             else: mode = 'rb'
       
   102 
       
   103         if mode[0:1] == 'r':
       
   104             self.mode = READ
       
   105             # Set flag indicating start of a new member
       
   106             self._new_member = True
       
   107             self.extrabuf = ""
       
   108             self.extrasize = 0
       
   109             self.filename = filename
       
   110             # Starts small, scales exponentially
       
   111             self.min_readsize = 100
       
   112 
       
   113         elif mode[0:1] == 'w' or mode[0:1] == 'a':
       
   114             self.mode = WRITE
       
   115             self._init_write(filename)
       
   116             self.compress = zlib.compressobj(compresslevel,
       
   117                                              zlib.DEFLATED,
       
   118                                              -zlib.MAX_WBITS,
       
   119                                              zlib.DEF_MEM_LEVEL,
       
   120                                              0)
       
   121         else:
       
   122             raise IOError, "Mode " + mode + " not supported"
       
   123 
       
   124         self.fileobj = fileobj
       
   125         self.offset = 0
       
   126 
       
   127         if self.mode == WRITE:
       
   128             self._write_gzip_header()
       
   129 
       
   130     def __repr__(self):
       
   131         s = repr(self.fileobj)
       
   132         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
       
   133 
       
   134     def _init_write(self, filename):
       
   135         if filename[-3:] != '.gz':
       
   136             filename = filename + '.gz'
       
   137         self.filename = filename
       
   138         self.crc = zlib.crc32("")
       
   139         self.size = 0
       
   140         self.writebuf = []
       
   141         self.bufsize = 0
       
   142 
       
   143     def _write_gzip_header(self):
       
   144         self.fileobj.write('\037\213')             # magic header
       
   145         self.fileobj.write('\010')                 # compression method
       
   146         fname = self.filename[:-3]
       
   147         flags = 0
       
   148         if fname:
       
   149             flags = FNAME
       
   150         self.fileobj.write(chr(flags))
       
   151         write32u(self.fileobj, long(time.time()))
       
   152         self.fileobj.write('\002')
       
   153         self.fileobj.write('\377')
       
   154         if fname:
       
   155             self.fileobj.write(fname + '\000')
       
   156 
       
   157     def _init_read(self):
       
   158         self.crc = zlib.crc32("")
       
   159         self.size = 0
       
   160 
       
   161     def _read_gzip_header(self):
       
   162         magic = self.fileobj.read(2)
       
   163         if magic != '\037\213':
       
   164             raise IOError, 'Not a gzipped file'
       
   165         method = ord( self.fileobj.read(1) )
       
   166         if method != 8:
       
   167             raise IOError, 'Unknown compression method'
       
   168         flag = ord( self.fileobj.read(1) )
       
   169         # modtime = self.fileobj.read(4)
       
   170         # extraflag = self.fileobj.read(1)
       
   171         # os = self.fileobj.read(1)
       
   172         self.fileobj.read(6)
       
   173 
       
   174         if flag & FEXTRA:
       
   175             # Read & discard the extra field, if present
       
   176             xlen = ord(self.fileobj.read(1))
       
   177             xlen = xlen + 256*ord(self.fileobj.read(1))
       
   178             self.fileobj.read(xlen)
       
   179         if flag & FNAME:
       
   180             # Read and discard a null-terminated string containing the filename
       
   181             while True:
       
   182                 s = self.fileobj.read(1)
       
   183                 if not s or s=='\000':
       
   184                     break
       
   185         if flag & FCOMMENT:
       
   186             # Read and discard a null-terminated string containing a comment
       
   187             while True:
       
   188                 s = self.fileobj.read(1)
       
   189                 if not s or s=='\000':
       
   190                     break
       
   191         if flag & FHCRC:
       
   192             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
       
   193 
       
   194 
       
   195     def write(self,data):
       
   196         if self.mode != WRITE:
       
   197             import errno
       
   198             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
       
   199 
       
   200         if self.fileobj is None:
       
   201             raise ValueError, "write() on closed GzipFile object"
       
   202         if len(data) > 0:
       
   203             self.size = self.size + len(data)
       
   204             self.crc = zlib.crc32(data, self.crc)
       
   205             self.fileobj.write( self.compress.compress(data) )
       
   206             self.offset += len(data)
       
   207 
       
   208     def read(self, size=-1):
       
   209         if self.mode != READ:
       
   210             import errno
       
   211             raise IOError(errno.EBADF, "read() on write-only GzipFile object")
       
   212 
       
   213         if self.extrasize <= 0 and self.fileobj is None:
       
   214             return ''
       
   215 
       
   216         readsize = 1024
       
   217         if size < 0:        # get the whole thing
       
   218             try:
       
   219                 while True:
       
   220                     self._read(readsize)
       
   221                     readsize = min(self.max_read_chunk, readsize * 2)
       
   222             except EOFError:
       
   223                 size = self.extrasize
       
   224         else:               # just get some more of it
       
   225             try:
       
   226                 while size > self.extrasize:
       
   227                     self._read(readsize)
       
   228                     readsize = min(self.max_read_chunk, readsize * 2)
       
   229             except EOFError:
       
   230                 if size > self.extrasize:
       
   231                     size = self.extrasize
       
   232 
       
   233         chunk = self.extrabuf[:size]
       
   234         self.extrabuf = self.extrabuf[size:]
       
   235         self.extrasize = self.extrasize - size
       
   236 
       
   237         self.offset += size
       
   238         return chunk
       
   239 
       
   240     def _unread(self, buf):
       
   241         self.extrabuf = buf + self.extrabuf
       
   242         self.extrasize = len(buf) + self.extrasize
       
   243         self.offset -= len(buf)
       
   244 
       
   245     def _read(self, size=1024):
       
   246         if self.fileobj is None:
       
   247             raise EOFError, "Reached EOF"
       
   248 
       
   249         if self._new_member:
       
   250             # If the _new_member flag is set, we have to
       
   251             # jump to the next member, if there is one.
       
   252             #
       
   253             # First, check if we're at the end of the file;
       
   254             # if so, it's time to stop; no more members to read.
       
   255             pos = self.fileobj.tell()   # Save current position
       
   256             self.fileobj.seek(0, 2)     # Seek to end of file
       
   257             if pos == self.fileobj.tell():
       
   258                 raise EOFError, "Reached EOF"
       
   259             else:
       
   260                 self.fileobj.seek( pos ) # Return to original position
       
   261 
       
   262             self._init_read()
       
   263             self._read_gzip_header()
       
   264             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
       
   265             self._new_member = False
       
   266 
       
   267         # Read a chunk of data from the file
       
   268         buf = self.fileobj.read(size)
       
   269 
       
   270         # If the EOF has been reached, flush the decompression object
       
   271         # and mark this object as finished.
       
   272 
       
   273         if buf == "":
       
   274             uncompress = self.decompress.flush()
       
   275             self._read_eof()
       
   276             self._add_read_data( uncompress )
       
   277             raise EOFError, 'Reached EOF'
       
   278 
       
   279         uncompress = self.decompress.decompress(buf)
       
   280         self._add_read_data( uncompress )
       
   281 
       
   282         if self.decompress.unused_data != "":
       
   283             # Ending case: we've come to the end of a member in the file,
       
   284             # so seek back to the start of the unused data, finish up
       
   285             # this member, and read a new gzip header.
       
   286             # (The number of bytes to seek back is the length of the unused
       
   287             # data, minus 8 because _read_eof() will rewind a further 8 bytes)
       
   288             self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
       
   289 
       
   290             # Check the CRC and file size, and set the flag so we read
       
   291             # a new member on the next call
       
   292             self._read_eof()
       
   293             self._new_member = True
       
   294 
       
   295     def _add_read_data(self, data):
       
   296         self.crc = zlib.crc32(data, self.crc)
       
   297         self.extrabuf = self.extrabuf + data
       
   298         self.extrasize = self.extrasize + len(data)
       
   299         self.size = self.size + len(data)
       
   300 
       
   301     def _read_eof(self):
       
   302         # We've read to the end of the file, so we have to rewind in order
       
   303         # to reread the 8 bytes containing the CRC and the file size.
       
   304         # We check the that the computed CRC and size of the
       
   305         # uncompressed data matches the stored values.  Note that the size
       
   306         # stored is the true file size mod 2**32.
       
   307         self.fileobj.seek(-8, 1)
       
   308         crc32 = read32(self.fileobj)
       
   309         isize = U32(read32(self.fileobj))   # may exceed 2GB
       
   310         if U32(crc32) != U32(self.crc):
       
   311             raise IOError, "CRC check failed"
       
   312         elif isize != LOWU32(self.size):
       
   313             raise IOError, "Incorrect length of data produced"
       
   314 
       
   315     def close(self):
       
   316         if self.mode == WRITE:
       
   317             self.fileobj.write(self.compress.flush())
       
   318             # The native zlib crc is an unsigned 32-bit integer, but
       
   319             # the Python wrapper implicitly casts that to a signed C
       
   320             # long.  So, on a 32-bit box self.crc may "look negative",
       
   321             # while the same crc on a 64-bit box may "look positive".
       
   322             # To avoid irksome warnings from the `struct` module, force
       
   323             # it to look positive on all boxes.
       
   324             write32u(self.fileobj, LOWU32(self.crc))
       
   325             # self.size may exceed 2GB, or even 4GB
       
   326             write32u(self.fileobj, LOWU32(self.size))
       
   327             self.fileobj = None
       
   328         elif self.mode == READ:
       
   329             self.fileobj = None
       
   330         if self.myfileobj:
       
   331             self.myfileobj.close()
       
   332             self.myfileobj = None
       
   333 
       
   334     def __del__(self):
       
   335         try:
       
   336             if (self.myfileobj is None and
       
   337                 self.fileobj is None):
       
   338                 return
       
   339         except AttributeError:
       
   340             return
       
   341         self.close()
       
   342 
       
   343     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
       
   344         if self.mode == WRITE:
       
   345             # Ensure the compressor's buffer is flushed
       
   346             self.fileobj.write(self.compress.flush(zlib_mode))
       
   347         self.fileobj.flush()
       
   348 
       
   349     def fileno(self):
       
   350         """Invoke the underlying file object's fileno() method.
       
   351 
       
   352         This will raise AttributeError if the underlying file object
       
   353         doesn't support fileno().
       
   354         """
       
   355         return self.fileobj.fileno()
       
   356 
       
   357     def isatty(self):
       
   358         return False
       
   359 
       
   360     def tell(self):
       
   361         return self.offset
       
   362 
       
   363     def rewind(self):
       
   364         '''Return the uncompressed stream file position indicator to the
       
   365         beginning of the file'''
       
   366         if self.mode != READ:
       
   367             raise IOError("Can't rewind in write mode")
       
   368         self.fileobj.seek(0)
       
   369         self._new_member = True
       
   370         self.extrabuf = ""
       
   371         self.extrasize = 0
       
   372         self.offset = 0
       
   373 
       
   374     def seek(self, offset):
       
   375         if self.mode == WRITE:
       
   376             if offset < self.offset:
       
   377                 raise IOError('Negative seek in write mode')
       
   378             count = offset - self.offset
       
   379             for i in range(count // 1024):
       
   380                 self.write(1024 * '\0')
       
   381             self.write((count % 1024) * '\0')
       
   382         elif self.mode == READ:
       
   383             if offset < self.offset:
       
   384                 # for negative seek, rewind and do positive seek
       
   385                 self.rewind()
       
   386             count = offset - self.offset
       
   387             for i in range(count // 1024):
       
   388                 self.read(1024)
       
   389             self.read(count % 1024)
       
   390 
       
   391     def readline(self, size=-1):
       
   392         if size < 0:
       
   393             size = sys.maxint
       
   394             readsize = self.min_readsize
       
   395         else:
       
   396             readsize = size
       
   397         bufs = []
       
   398         while size != 0:
       
   399             c = self.read(readsize)
       
   400             i = c.find('\n')
       
   401 
       
   402             # We set i=size to break out of the loop under two
       
   403             # conditions: 1) there's no newline, and the chunk is
       
   404             # larger than size, or 2) there is a newline, but the
       
   405             # resulting line would be longer than 'size'.
       
   406             if (size <= i) or (i == -1 and len(c) > size):
       
   407                 i = size - 1
       
   408 
       
   409             if i >= 0 or c == '':
       
   410                 bufs.append(c[:i + 1])    # Add portion of last chunk
       
   411                 self._unread(c[i + 1:])   # Push back rest of chunk
       
   412                 break
       
   413 
       
   414             # Append chunk to list, decrease 'size',
       
   415             bufs.append(c)
       
   416             size = size - len(c)
       
   417             readsize = min(size, readsize * 2)
       
   418         if readsize > self.min_readsize:
       
   419             self.min_readsize = min(readsize, self.min_readsize * 2, 512)
       
   420         return ''.join(bufs) # Return resulting line
       
   421 
       
   422     def readlines(self, sizehint=0):
       
   423         # Negative numbers result in reading all the lines
       
   424         if sizehint <= 0:
       
   425             sizehint = sys.maxint
       
   426         L = []
       
   427         while sizehint > 0:
       
   428             line = self.readline()
       
   429             if line == "":
       
   430                 break
       
   431             L.append(line)
       
   432             sizehint = sizehint - len(line)
       
   433 
       
   434         return L
       
   435 
       
   436     def writelines(self, L):
       
   437         for line in L:
       
   438             self.write(line)
       
   439 
       
   440     def __iter__(self):
       
   441         return self
       
   442 
       
   443     def next(self):
       
   444         line = self.readline()
       
   445         if line:
       
   446             return line
       
   447         else:
       
   448             raise StopIteration
       
   449 
       
   450 
       
   451 def _test():
       
   452     # Act like gzip; with -d, act like gunzip.
       
   453     # The input file is not deleted, however, nor are any other gzip
       
   454     # options or features supported.
       
   455     args = sys.argv[1:]
       
   456     decompress = args and args[0] == "-d"
       
   457     if decompress:
       
   458         args = args[1:]
       
   459     if not args:
       
   460         args = ["-"]
       
   461     for arg in args:
       
   462         if decompress:
       
   463             if arg == "-":
       
   464                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
       
   465                 g = sys.stdout
       
   466             else:
       
   467                 if arg[-3:] != ".gz":
       
   468                     print "filename doesn't end in .gz:", repr(arg)
       
   469                     continue
       
   470                 f = open(arg, "rb")
       
   471                 g = __builtin__.open(arg[:-3], "wb")
       
   472         else:
       
   473             if arg == "-":
       
   474                 f = sys.stdin
       
   475                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
       
   476             else:
       
   477                 f = __builtin__.open(arg, "rb")
       
   478                 g = open(arg + ".gz", "wb")
       
   479         while True:
       
   480             chunk = f.read(1024)
       
   481             if not chunk:
       
   482                 break
       
   483             g.write(chunk)
       
   484         if g is not sys.stdout:
       
   485             g.close()
       
   486         if f is not sys.stdin:
       
   487             f.close()
       
   488 
       
   489 if __name__ == '__main__':
       
   490     _test()