|
/* gzjoin -- command to join gzip files into one gzip file

  Copyright (C) 2004 Mark Adler, all rights reserved
  version 1.0, 11 Dec 2004

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the author be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  Mark Adler madler@alumni.caltech.edu
 */
|
24 |
|
/*
 * Change history:
 *
 * 1.0 11 Dec 2004 - First version
 * 1.1 12 Jun 2005 - Changed ssize_t to long for portability
 */
|
31 |
|
/*
   gzjoin takes one or more gzip files on the command line and writes out a
   single gzip file that will uncompress to the concatenation of the
   uncompressed data from the individual gzip files. gzjoin does this without
   having to recompress any of the data and without having to calculate a new
   crc32 for the concatenated uncompressed data. gzjoin does however have to
   decompress all of the input data in order to find the bits in the compressed
   data that need to be modified to concatenate the streams.

   gzjoin does not do an integrity check on the input gzip files other than
   checking the gzip header and decompressing the compressed data. They are
   otherwise assumed to be complete and correct.

   Each joint between gzip files removes at least 18 bytes of previous trailer
   and subsequent header, and inserts an average of about three bytes to the
   compressed data in order to connect the streams. The output gzip file
   has a minimal ten-byte gzip header with no file name or modification time.

   This program was written to illustrate the use of the Z_BLOCK option of
   inflate() and the crc32_combine() function. gzjoin will not compile with
   versions of zlib earlier than 1.2.3.
 */
|
54 |
|
#include <sys/types.h>  /* off_t */
#include <stdio.h>      /* fputs(), fprintf(), fwrite(), putc() */
#include <stdlib.h>     /* exit(), malloc(), free() */
#include <fcntl.h>      /* open() */
#include <unistd.h>     /* close(), read(), lseek() */
#include "zlib.h"
    /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
|
61 |
|
/* "local" marks a definition as private to this file (internal linkage) */
#define local static
|
63 |
|
64 /* exit with an error (return a value to allow use in an expression) */ |
|
65 local int bail(char *why1, char *why2) |
|
66 { |
|
67 fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2); |
|
68 exit(1); |
|
69 return 0; |
|
70 } |
|
71 |
|
/* -- simple buffered file input with access to the buffer -- */
|
73 |
|
/* size in bytes of each buffer; must be a power of two (it is used as a mask
   via CHUNK - 1) and must fit in an unsigned */
#define CHUNK 32768
|
75 |
|
/* State of one buffered input file.  The unread data is the "left" bytes
   starting at "next", which points into "buf". */
typedef struct {
    char *name;             /* file name, kept for error messages */
    int fd;                 /* descriptor from open(), -1 if not open */
    unsigned left;          /* count of unread bytes available at next */
    unsigned char *next;    /* position of the next unread byte */
    unsigned char *buf;     /* malloc'ed buffer of CHUNK bytes */
} bin;
|
84 |
|
85 /* close a buffered file and free allocated memory */ |
|
86 local void bclose(bin *in) |
|
87 { |
|
88 if (in != NULL) { |
|
89 if (in->fd != -1) |
|
90 close(in->fd); |
|
91 if (in->buf != NULL) |
|
92 free(in->buf); |
|
93 free(in); |
|
94 } |
|
95 } |
|
96 |
|
97 /* open a buffered file for input, return a pointer to type bin, or NULL on |
|
98 failure */ |
|
99 local bin *bopen(char *name) |
|
100 { |
|
101 bin *in; |
|
102 |
|
103 in = malloc(sizeof(bin)); |
|
104 if (in == NULL) |
|
105 return NULL; |
|
106 in->buf = malloc(CHUNK); |
|
107 in->fd = open(name, O_RDONLY, 0); |
|
108 if (in->buf == NULL || in->fd == -1) { |
|
109 bclose(in); |
|
110 return NULL; |
|
111 } |
|
112 in->left = 0; |
|
113 in->next = in->buf; |
|
114 in->name = name; |
|
115 return in; |
|
116 } |
|
117 |
|
118 /* load buffer from file, return -1 on read error, 0 or 1 on success, with |
|
119 1 indicating that end-of-file was reached */ |
|
120 local int bload(bin *in) |
|
121 { |
|
122 long len; |
|
123 |
|
124 if (in == NULL) |
|
125 return -1; |
|
126 if (in->left != 0) |
|
127 return 0; |
|
128 in->next = in->buf; |
|
129 do { |
|
130 len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left); |
|
131 if (len < 0) |
|
132 return -1; |
|
133 in->left += (unsigned)len; |
|
134 } while (len != 0 && in->left < CHUNK); |
|
135 return len == 0 ? 1 : 0; |
|
136 } |
|
137 |
|
/* Get the next byte (0..255) from the file, loading the buffer first if it is
   empty; bail if no byte is available.  The argument is parenthesized in the
   expansion so any pointer expression can be passed, but it is evaluated more
   than once and therefore must not have side effects. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                      bail("unexpected end of file on ", (in)->name))
|
142 |
|
143 /* get a four-byte little-endian unsigned integer from file */ |
|
144 local unsigned long bget4(bin *in) |
|
145 { |
|
146 unsigned long val; |
|
147 |
|
148 val = bget(in); |
|
149 val += (unsigned long)(bget(in)) << 8; |
|
150 val += (unsigned long)(bget(in)) << 16; |
|
151 val += (unsigned long)(bget(in)) << 24; |
|
152 return val; |
|
153 } |
|
154 |
|
155 /* skip bytes in file */ |
|
156 local void bskip(bin *in, unsigned skip) |
|
157 { |
|
158 /* check pointer */ |
|
159 if (in == NULL) |
|
160 return; |
|
161 |
|
162 /* easy case -- skip bytes in buffer */ |
|
163 if (skip <= in->left) { |
|
164 in->left -= skip; |
|
165 in->next += skip; |
|
166 return; |
|
167 } |
|
168 |
|
169 /* skip what's in buffer, discard buffer contents */ |
|
170 skip -= in->left; |
|
171 in->left = 0; |
|
172 |
|
173 /* seek past multiples of CHUNK bytes */ |
|
174 if (skip > CHUNK) { |
|
175 unsigned left; |
|
176 |
|
177 left = skip & (CHUNK - 1); |
|
178 if (left == 0) { |
|
179 /* exact number of chunks: seek all the way minus one byte to check |
|
180 for end-of-file with a read */ |
|
181 lseek(in->fd, skip - 1, SEEK_CUR); |
|
182 if (read(in->fd, in->buf, 1) != 1) |
|
183 bail("unexpected end of file on ", in->name); |
|
184 return; |
|
185 } |
|
186 |
|
187 /* skip the integral chunks, update skip with remainder */ |
|
188 lseek(in->fd, skip - left, SEEK_CUR); |
|
189 skip = left; |
|
190 } |
|
191 |
|
192 /* read more input and skip remainder */ |
|
193 bload(in); |
|
194 if (skip > in->left) |
|
195 bail("unexpected end of file on ", in->name); |
|
196 in->left -= skip; |
|
197 in->next += skip; |
|
198 } |
|
199 |
|
/* -- end of buffered input functions -- */
|
201 |
|
202 /* skip the gzip header from file in */ |
|
203 local void gzhead(bin *in) |
|
204 { |
|
205 int flags; |
|
206 |
|
207 /* verify gzip magic header and compression method */ |
|
208 if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8) |
|
209 bail(in->name, " is not a valid gzip file"); |
|
210 |
|
211 /* get and verify flags */ |
|
212 flags = bget(in); |
|
213 if ((flags & 0xe0) != 0) |
|
214 bail("unknown reserved bits set in ", in->name); |
|
215 |
|
216 /* skip modification time, extra flags, and os */ |
|
217 bskip(in, 6); |
|
218 |
|
219 /* skip extra field if present */ |
|
220 if (flags & 4) { |
|
221 unsigned len; |
|
222 |
|
223 len = bget(in); |
|
224 len += (unsigned)(bget(in)) << 8; |
|
225 bskip(in, len); |
|
226 } |
|
227 |
|
228 /* skip file name if present */ |
|
229 if (flags & 8) |
|
230 while (bget(in) != 0) |
|
231 ; |
|
232 |
|
233 /* skip comment if present */ |
|
234 if (flags & 16) |
|
235 while (bget(in) != 0) |
|
236 ; |
|
237 |
|
238 /* skip header crc if present */ |
|
239 if (flags & 2) |
|
240 bskip(in, 2); |
|
241 } |
|
242 |
|
/* Write the low 32 bits of val to out as four bytes, least significant byte
   first. */
static void put4(unsigned long val, FILE *out)
{
    int shift;

    for (shift = 0; shift < 32; shift += 8)
        putc((int)((val >> shift) & 0xff), out);
}
|
251 |
|
252 /* Load up zlib stream from buffered input, bail if end of file */ |
|
253 local void zpull(z_streamp strm, bin *in) |
|
254 { |
|
255 if (in->left == 0) |
|
256 bload(in); |
|
257 if (in->left == 0) |
|
258 bail("unexpected end of file on ", in->name); |
|
259 strm->avail_in = in->left; |
|
260 strm->next_in = in->next; |
|
261 } |
|
262 |
|
263 /* Write header for gzip file to out and initialize trailer. */ |
|
264 local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out) |
|
265 { |
|
266 fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out); |
|
267 *crc = crc32(0L, Z_NULL, 0); |
|
268 *tot = 0; |
|
269 } |
|
270 |
|
271 /* Copy the compressed data from name, zeroing the last block bit of the last |
|
272 block if clr is true, and adding empty blocks as needed to get to a byte |
|
273 boundary. If clr is false, then the last block becomes the last block of |
|
274 the output, and the gzip trailer is written. crc and tot maintains the |
|
275 crc and length (modulo 2^32) of the output for the trailer. The resulting |
|
276 gzip file is written to out. gzinit() must be called before the first call |
|
277 of gzcopy() to write the gzip header and to initialize crc and tot. */ |
|
278 local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot, |
|
279 FILE *out) |
|
280 { |
|
281 int ret; /* return value from zlib functions */ |
|
282 int pos; /* where the "last block" bit is in byte */ |
|
283 int last; /* true if processing the last block */ |
|
284 bin *in; /* buffered input file */ |
|
285 unsigned char *start; /* start of compressed data in buffer */ |
|
286 unsigned char *junk; /* buffer for uncompressed data -- discarded */ |
|
287 z_off_t len; /* length of uncompressed data (support > 4 GB) */ |
|
288 z_stream strm; /* zlib inflate stream */ |
|
289 |
|
290 /* open gzip file and skip header */ |
|
291 in = bopen(name); |
|
292 if (in == NULL) |
|
293 bail("could not open ", name); |
|
294 gzhead(in); |
|
295 |
|
296 /* allocate buffer for uncompressed data and initialize raw inflate |
|
297 stream */ |
|
298 junk = malloc(CHUNK); |
|
299 strm.zalloc = Z_NULL; |
|
300 strm.zfree = Z_NULL; |
|
301 strm.opaque = Z_NULL; |
|
302 strm.avail_in = 0; |
|
303 strm.next_in = Z_NULL; |
|
304 ret = inflateInit2(&strm, -15); |
|
305 if (junk == NULL || ret != Z_OK) |
|
306 bail("out of memory", ""); |
|
307 |
|
308 /* inflate and copy compressed data, clear last-block bit if requested */ |
|
309 len = 0; |
|
310 zpull(&strm, in); |
|
311 start = strm.next_in; |
|
312 last = start[0] & 1; |
|
313 if (last && clr) |
|
314 start[0] &= ~1; |
|
315 strm.avail_out = 0; |
|
316 for (;;) { |
|
317 /* if input used and output done, write used input and get more */ |
|
318 if (strm.avail_in == 0 && strm.avail_out != 0) { |
|
319 fwrite(start, 1, strm.next_in - start, out); |
|
320 start = in->buf; |
|
321 in->left = 0; |
|
322 zpull(&strm, in); |
|
323 } |
|
324 |
|
325 /* decompress -- return early when end-of-block reached */ |
|
326 strm.avail_out = CHUNK; |
|
327 strm.next_out = junk; |
|
328 ret = inflate(&strm, Z_BLOCK); |
|
329 switch (ret) { |
|
330 case Z_MEM_ERROR: |
|
331 bail("out of memory", ""); |
|
332 case Z_DATA_ERROR: |
|
333 bail("invalid compressed data in ", in->name); |
|
334 } |
|
335 |
|
336 /* update length of uncompressed data */ |
|
337 len += CHUNK - strm.avail_out; |
|
338 |
|
339 /* check for block boundary (only get this when block copied out) */ |
|
340 if (strm.data_type & 128) { |
|
341 /* if that was the last block, then done */ |
|
342 if (last) |
|
343 break; |
|
344 |
|
345 /* number of unused bits in last byte */ |
|
346 pos = strm.data_type & 7; |
|
347 |
|
348 /* find the next last-block bit */ |
|
349 if (pos != 0) { |
|
350 /* next last-block bit is in last used byte */ |
|
351 pos = 0x100 >> pos; |
|
352 last = strm.next_in[-1] & pos; |
|
353 if (last && clr) |
|
354 strm.next_in[-1] &= ~pos; |
|
355 } |
|
356 else { |
|
357 /* next last-block bit is in next unused byte */ |
|
358 if (strm.avail_in == 0) { |
|
359 /* don't have that byte yet -- get it */ |
|
360 fwrite(start, 1, strm.next_in - start, out); |
|
361 start = in->buf; |
|
362 in->left = 0; |
|
363 zpull(&strm, in); |
|
364 } |
|
365 last = strm.next_in[0] & 1; |
|
366 if (last && clr) |
|
367 strm.next_in[0] &= ~1; |
|
368 } |
|
369 } |
|
370 } |
|
371 |
|
372 /* update buffer with unused input */ |
|
373 in->left = strm.avail_in; |
|
374 in->next = strm.next_in; |
|
375 |
|
376 /* copy used input, write empty blocks to get to byte boundary */ |
|
377 pos = strm.data_type & 7; |
|
378 fwrite(start, 1, in->next - start - 1, out); |
|
379 last = in->next[-1]; |
|
380 if (pos == 0 || !clr) |
|
381 /* already at byte boundary, or last file: write last byte */ |
|
382 putc(last, out); |
|
383 else { |
|
384 /* append empty blocks to last byte */ |
|
385 last &= ((0x100 >> pos) - 1); /* assure unused bits are zero */ |
|
386 if (pos & 1) { |
|
387 /* odd -- append an empty stored block */ |
|
388 putc(last, out); |
|
389 if (pos == 1) |
|
390 putc(0, out); /* two more bits in block header */ |
|
391 fwrite("\0\0\xff\xff", 1, 4, out); |
|
392 } |
|
393 else { |
|
394 /* even -- append 1, 2, or 3 empty fixed blocks */ |
|
395 switch (pos) { |
|
396 case 6: |
|
397 putc(last | 8, out); |
|
398 last = 0; |
|
399 case 4: |
|
400 putc(last | 0x20, out); |
|
401 last = 0; |
|
402 case 2: |
|
403 putc(last | 0x80, out); |
|
404 putc(0, out); |
|
405 } |
|
406 } |
|
407 } |
|
408 |
|
409 /* update crc and tot */ |
|
410 *crc = crc32_combine(*crc, bget4(in), len); |
|
411 *tot += (unsigned long)len; |
|
412 |
|
413 /* clean up */ |
|
414 inflateEnd(&strm); |
|
415 free(junk); |
|
416 bclose(in); |
|
417 |
|
418 /* write trailer if this is the last gzip file */ |
|
419 if (!clr) { |
|
420 put4(*crc, out); |
|
421 put4(*tot, out); |
|
422 } |
|
423 } |
|
424 |
|
/* Join the gzip files named on the command line and write the result to
   stdout.  With no arguments, a usage line is printed on stderr and the
   program returns 0.  Errors in the input files end the program from within
   the helpers via bail().  After the last file, stdout is flushed and checked
   so that a failed write (full disk, closed pipe) is reported as an error
   instead of leaving a truncated output with a zero exit status. */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */

    /* skip command name */
    argc--;
    argv++;

    /* show usage if no arguments */
    if (argc == 0) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* join gzip files on command line and write to stdout; argc is nonzero
       (clr true) for every file except the last one */
    gzinit(&crc, &tot, stdout);
    while (argc--)
        gzcopy(*argv++, argc, &crc, &tot, stdout);

    /* make sure everything written actually reached the output */
    if (fflush(stdout) == EOF || ferror(stdout))
        bail("write error on ", "standard output");

    /* done */
    return 0;
}