|
1 """Functions that read and write gzipped files. |
|
2 |
|
3 The user of the file doesn't have to worry about the compression, |
|
4 but random access is not allowed.""" |
|
5 |
|
6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module |
|
7 |
|
8 import struct, sys, time |
|
9 import zlib |
|
10 import __builtin__ |
|
11 |
|
12 __all__ = ["GzipFile","open"] |
|
13 |
|
# Bit masks for the flag byte of a gzip member header; _read_gzip_header()
# tests them to decide which optional header fields follow.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record whether the object was opened
# for reading or for writing.
READ = 1
WRITE = 2
|
17 |
|
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer.

    output -- object with a write() method accepting a byte string.
    value  -- integer; only its low 32 bits are written.

    The value is masked to 32 bits before packing, so a signed (negative)
    CRC or a size larger than 4GB is written as its 32-bit bit pattern
    instead of making struct.pack() fail with an out-of-range error.
    """
    output.write(struct.pack("<L", value & 0xffffffff))
|
22 |
|
def read32(input):
    """Read 4 bytes from *input* and return them as an unsigned integer.

    The bytes are interpreted in little-endian order.  struct.unpack()
    raises struct.error when fewer than 4 bytes are available.
    """
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
|
25 |
|
def open(filename, mode="rb", compresslevel=9):
    """Return a GzipFile for *filename*.

    This is shorthand for GzipFile(filename, mode, compresslevel).
    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
|
34 |
|
class GzipFile:
    """File-like object that transparently compresses or decompresses gzip data.

    The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.  An instance can
    also be used in a ``with`` statement; leaving the block closes it.

    """

    # File object opened by the constructor itself (None when the caller
    # supplied fileobj); close() closes it.
    myfileobj = None
    # Upper bound for the chunk size read() asks of the underlying file.
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        Raises IOError when the mode does not start with 'r', 'w' or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Decompressed data not yet handed to the caller, and its length.
            self.extrabuf = ""
            self.extrasize = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip header and
            # trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Position in the uncompressed stream, as reported by tell().
        self.offset = 0

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for the name attribute.

        In write mode '.gz' is appended when the name does not already end
        with it.  Accessing the property emits a DeprecationWarning.
        """
        import warnings
        # stacklevel=2 makes the warning point at the code reading the
        # attribute rather than at this property.
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        """Return '<gzip ...>' built from the underlying file's repr and id(self)."""
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def __enter__(self):
        """Enter a ``with`` block; raises ValueError if already closed."""
        if self.fileobj is None:
            raise ValueError("I/O operation on closed GzipFile object")
        return self

    def __exit__(self, *args):
        """Leave a ``with`` block by closing the object."""
        self.close()

    def _init_write(self, filename):
        """Reset the per-file write state (name, running CRC and size)."""
        self.name = filename
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0
        # NOTE(review): writebuf and bufsize are not read anywhere in this
        # class; kept in case external code inspects them.
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the 10-byte member header, plus the file name if there is one."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = self.name
        if fname.endswith(".gz"):
            fname = fname[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, int(time.time()))   # modification time
        self.fileobj.write('\002')                 # extra flags byte
        self.fileobj.write('\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Consume one member header from the underlying file.

        Raises IOError when the magic number or the compression method is
        not the expected one.  All optional fields are read and discarded.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord(self.fileobj.read(1))
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(self.fileobj.read(1))
        # Skip modtime (4 bytes), extraflag (1 byte) and os (1 byte).
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256 * ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress *data* and write it to the underlying file.

        Raises IOError(EBADF) on an object opened for reading and
        ValueError on a closed object.
        """
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write(self.compress.compress(data))
            self.offset += len(data)

    def read(self, size=-1):
        """Return up to *size* bytes of uncompressed data.

        A negative size (the default) reads until EOF.  Fewer bytes than
        requested are returned only when EOF is reached.  Raises
        IOError(EBADF) on an object opened for writing.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        # Chunk size doubles on every pass, up to max_read_chunk.
        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push *buf* back in front of the buffered data and move offset back."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read *size* compressed bytes and buffer their decompressed form.

        Raises EOFError when the object is closed or no more data is
        available.  The underlying file must support tell() and seek().
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek(-len(self.decompress.unused_data) + 8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed *data* to the buffer and update CRC and size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the 8-byte member trailer; raise IOError on a mismatch."""
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)   # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  A file opened by the constructor is
        closed; a caller-supplied fileobj is left open.  Closing an already
        closed object does nothing.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark the object closed before writing, so that a failing write
        # cannot cause a second trailer to be written by __del__().
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            # Close the file we opened ourselves even if the trailer
            # could not be written.
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def __del__(self):
        """Close the object on garbage collection if it is still open."""
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            # __init__ failed before the attributes were set.
            return
        self.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data to the underlying file.

        Has no effect in read mode.  Raises ValueError on a closed object.
        """
        if self.fileobj is None:
            raise ValueError("flush() on closed GzipFile object")
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def isatty(self):
        """Return False: a GzipFile is never treated as a terminal."""
        return False

    def tell(self):
        """Return the current position in the uncompressed stream."""
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset, whence=0):
        """Move to *offset* in the uncompressed stream.

        whence may be 0 (absolute, the default) or 1 (relative to the
        current position); any other value raises ValueError.  In write
        mode only forward seeks are possible and the gap is filled with
        zero bytes; a backward seek raises IOError.  In read mode a
        backward seek rewinds and re-reads from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            # Pad in 1024-byte blocks; a plain loop avoids building a list
            # whose length grows with the seek distance.
            block = 1024 * '\0'
            while count >= 1024:
                self.write(block)
                count -= 1024
            self.write(count * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            while count > 0:
                chunk = self.read(min(1024, count))
                if not chunk:
                    # EOF reached before the target offset; stop here.
                    break
                count -= len(chunk)

    def readline(self, size=-1):
        """Return the next line, including its trailing newline if present.

        A non-negative *size* limits the number of bytes returned.  An
        empty string is returned at EOF.
        """
        if size < 0:
            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size-1 to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        # Remember a larger starting chunk size for later calls (at most 512).
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs)    # Return resulting line

    def readlines(self, sizehint=0):
        """Return a list of lines read until EOF.

        With a positive *sizehint*, reading stops once the lines read so
        far total at least that many bytes.
        """
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxint
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == "":
                break
            L.append(line)
            sizehint = sizehint - len(line)

        return L

    def writelines(self, L):
        """Write every string of the iterable *L* with write()."""
        for line in L:
            self.write(line)

    def __iter__(self):
        """Return self; iteration yields lines via next()."""
        return self

    def next(self):
        """Return the next line, or raise StopIteration at EOF."""
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration
|
443 |
|
444 |
|
def _test():
    """Minimal command-line gzip/gunzip driven by sys.argv.

    Act like gzip; with -d, act like gunzip.
    The input file is not deleted, however, nor are any other gzip
    options or features supported.  The argument "-" (also the default
    when no file is named) means stdin to stdout.
    """
    args = sys.argv[1:]
    decompress = bool(args) and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    sys.stdout.write("filename doesn't end in .gz: %s\n"
                                     % repr(arg))
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Release both files even when the copy fails part-way; the
            # process-wide standard streams are never closed here.
            try:
                if g is not sys.stdout:
                    g.close()
            finally:
                if f is not sys.stdin:
                    f.close()


if __name__ == '__main__':
    _test()