diff --git a/deflate.c b/deflate.c index 29878558..f33d4fc6 100644 --- a/deflate.c +++ b/deflate.c @@ -2150,3 +2150,43 @@ local block_state deflate_huff(deflate_state *s, int flush) { FLUSH_BLOCK(s, 0); return block_done; } + +/* =========================================================================== + * Given histograms lhistg (literal/length) and dhistg (distance), builds the + * huffman tables and emits them as the DHT front matter of a Type 2 block. + * The DHT bytes are written to the buffer at strm->next_out. + * Number of valid bits in the last valid byte of the buffer is stored in + * *bits_valid. Returns Z_OK, Z_STREAM_ERROR for a null stream, state, output + * or histogram pointer, or Z_BUF_ERROR when there is no output space, when + * bits_valid is null, or when the output buffer was too small for the DHT. + */ +extern int make_trees( deflate_state *s, int *l_hist, int *d_hist ); +int ZEXPORT deflate_make_dht(strm, lhistg, dhistg, bits_valid) + z_streamp strm; + int *lhistg; + int *dhistg; + int *bits_valid; +{ + deflate_state *s; + + if (strm == Z_NULL || strm->state == Z_NULL ) { + return Z_STREAM_ERROR; + } + if (strm->next_out == Z_NULL || lhistg == Z_NULL || dhistg == Z_NULL ) { + ERR_RETURN(strm, Z_STREAM_ERROR); + } + if (strm->avail_out == 0 || bits_valid == NULL) { + ERR_RETURN(strm, Z_BUF_ERROR); + } + + s = strm->state; + *bits_valid = make_trees( s, lhistg, dhistg ); + + flush_pending( strm ); + if (s->pending != 0) { + /* bytes are still queued: the caller received only part of the DHT */ + ERR_RETURN(strm, Z_BUF_ERROR); + } + + return Z_OK; +} diff --git a/examples/README.examples b/examples/README.examples index e3a4b88b..6d07f3ff 100644 --- a/examples/README.examples +++ b/examples/README.examples @@ -52,3 +52,7 @@ zran.h index a zlib or gzip stream and randomly access it - illustrates the use of Z_BLOCK, inflatePrime(), and inflateSetDictionary() to provide random access + +makedht.c + makes a dynamic huffman table given lit/len and distance histograms + -- illustrates the proper use of deflate_make_dht() \ No newline at end of file diff --git a/examples/jabber1.lzcount b/examples/jabber1.lzcount new file mode 100644 index 00000000..0f52399d --- /dev/null +++ b/examples/jabber1.lzcount @@ -0,0 +1,39 @@ + 32 : 2 + 39 : 1 + 84 : 1 + 97 : 1 + 98 : 1 +114 : 1 +115 : 1 +119 : 1 +256 : 1 + 0 : 0 + 1 : 0 + 2 : 0 + 3 : 0 + 4 : 0 + 5 : 0 + 6 : 0 + 7 : 0 + 8 : 0 + 9 : 0 + 10 : 
0 + 11 : 0 + 12 : 0 + 13 : 0 + 14 : 0 + 15 : 0 + 16 : 0 + 17 : 0 + 18 : 0 + 19 : 0 + 20 : 0 + 21 : 0 + 22 : 0 + 23 : 0 + 24 : 0 + 25 : 0 + 26 : 0 + 27 : 0 + 28 : 0 + 29 : 0 diff --git a/examples/makedht.c b/examples/makedht.c new file mode 100644 index 00000000..f78e17e1 --- /dev/null +++ b/examples/makedht.c @@ -0,0 +1,314 @@ +/* + Makes a dynamic huffman table given the symbol counts. + Based on zlib/examples/zpipe.c in zlib 1.2.8 + + cd to zlib root directory + ./configure + make + cd examples + cc -O -I.. -o makedht makedht.c ../libz.a + + deflate_make_dht( (z_stream *)strm, (int *)lhist, (int *)dhist, (int *)valid_bits ); + Caller provides lhist and dhist int arrays. A dynamic huffman + table (DHT) formatted in the manner of Deflate Type 2 block is + returned in strm. Number of valid bits in the last byte is + returned in valid_bits. + + format_cpb(char *cpbtxt, char *zbuf, int have, int valid_bits ) + Pretty formats the DHT. +*/ + +/* + From command line, supply the Literal/Length/Distance symbols and + their counts in the *lzcount file. makedht then calls zlib to make + the dynamic huffman table (DHT). Makedht then writes human + readable DHT to stdout and binary DHT to the named output file. + + [abali@hahn examples]$ ./makedht jabber1.lzcount jabber1.dht + bytes: 19 invalid bits: 4 + -------------------------------- + 00000000000000000000000000000094 + 203826000000220058c5a6900244f0c3 + d7770700000000000000000000000000 + -------------------------------- + + Hex dump of the same: + [abali@hahn examples]$ xxd jabber1.dht + 0000000: 2038 2600 0000 2200 58c5 a690 0244 f0c3 8&...".X....D.. + 0000010: d777 07 .w. + + Notes: + Invalid bit count is the number of unused **left-most** bits in the + last byte. Bit endianness is due to the Deflate specification. + + When ZLIB_DEBUG is enabled in zlib, Huffman codes assigned to each + symbol are also printed to stderr. + + The -f flag asks zlib to produce a Huffman code for all the Lit/Len + (0-285) and Dist (0-29) symbols. 
The -f flag overrides the symbol + counts of 0 to 1, and therefore forces the code to be generated for + all the symbols. In the example below, you can see that the + result is larger when compared to the previous example. + + [abali@hahn examples]$ ./makedht -f jabber1.lzcount jabber1.dht + bytes: 56 invalid bits: 4 + -------------------------------- + 000000000000000000000000000001bc + bde300040208da443232b3f7cedeca48 + 56943d92ec952dbbec19d9ab4284ca4e + 43c8deca56b2f7cc2a65454564af9292 + f0fbff7e8ffbfd0f0000000000000000 + -------------------------------- + + Sample *.lzcount file + [abali@hahn examples]$ cat jabber1.lzcount + 32 : 2 + 39 : 1 + 84 : 1 + 97 : 1 + 98 : 1 +114 : 1 +115 : 1 +119 : 1 +256 : 1 + 0 : 0 + 1 : 0 + 29 : 0 +*/ + + +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include "zlib.h" + +#define CHUNK 16384 + +/* + Cpb stands for compression parameter block. format_cpb converts the + zlib produced DHT in zbuf to ASCII text and writes to cpbtxt. Have + is number of bytes in zbuf. Valid_bits is the value returned from + deflate_make_dht(); 0 means the last byte is completely used. */ + +void format_cpb(char *cpbtxt, char *zbuf, int have, int valid_bits ) +{ + int i, b; + char *ptr; + char tmp[CHUNK]; + char hex[] = { '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f' }; + int invalid_bits; + + /* last byte */ + invalid_bits = ( valid_bits ) ? 
8 - valid_bits : 0 ; + fprintf(stderr,"bytes: %d invalid bits: %d\n", have, invalid_bits ); + /* format-clear the cpbparm file 1st line */ + memset( cpbtxt, '0', 32 ); + /* format the cpb bit count */ + sprintf( tmp, "%x", 8 * have - invalid_bits ); + b = strlen( tmp ); + /* write the bit count right-aligned into the 1st line; memcpy because no terminator is wanted */ + memcpy( cpbtxt+(32-b), tmp, b ); + /* continue from the next line */ + ptr = cpbtxt+32; + for(i=0; i<have; i++) { /* NOTE(review): loop head rebuilt from the sample output in the file header -- confirm */ + unsigned char byte = (unsigned char) zbuf[i]; + if( (i % 16) == 0 ) /* 16 bytes, i.e. 32 hex digits, per text row */ + *(ptr++) = '\n'; + *(ptr++) = hex[ (byte>>4)&0xf ]; /* convert hex to ASCII */ + *(ptr++) = hex[ (byte )&0xf ]; + } + /* padding for the last line */ + for(i=have; i< (16*((have+15)/16)); i++) { + *(ptr++) = '0'; *(ptr++) = '0'; + } + *(ptr++) = '\n'; *(ptr++) = 0; +} + + +/* Builds the DHT for lhist/dhist with zlib, writes the raw bytes to fname and a hex rendering to stdout. Returns Z_OK on success, 1 if fname cannot be opened, otherwise a zlib error code. */ +int makedht(char *fname, int *lhist, int *dhist) +{ + int ret; + unsigned have; + z_stream strm; + char zbuf[CHUNK]; + char cpbtxt[CHUNK]; + FILE *cpbbin; + int valid_bits = 0; + + /* file for the CPB binary output; "wb" keeps the bytes untranslated */ + if( NULL == ( cpbbin = fopen( fname, "wb" )) ) { + fprintf( stderr, "error: cannot open %s\n", fname ); + return 1; + } + + /* allocate deflate state */ + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.opaque = Z_NULL; + + ret = deflateInit(&strm, Z_DEFAULT_COMPRESSION); + if (ret != Z_OK) { + fclose( cpbbin ); + return ret; + } + + strm.avail_out = CHUNK; + strm.next_out = (Bytef *)zbuf; + + ret = deflate_make_dht( &strm, lhist, dhist, &valid_bits ); + assert(ret != Z_STREAM_ERROR); + + have = CHUNK - strm.avail_out; + if (fwrite(zbuf, 1, have, cpbbin) != have || ferror(cpbbin)) { + fclose( cpbbin ); + (void)deflateEnd(&strm); + return Z_ERRNO; + } + fclose( cpbbin ); + + format_cpb( cpbtxt, zbuf, have, valid_bits ); + + fflush(stdout); + fputs("--------------------------------\n", stdout ); + fputs( cpbtxt, stdout ); + fputs("--------------------------------\n", stdout ); + fflush(stdout); + + (void)deflateEnd(&strm); + + return Z_OK; +} + +/* report a zlib or i/o error */ +void zerr(int ret) +{ + fputs("makedht: ", stderr); + switch (ret) { + case Z_ERRNO: + if (ferror(stdin)) + fputs("error reading stdin\n", 
stderr); + if (ferror(stdout)) + fputs("error writing stdout\n", stderr); + break; + case Z_STREAM_ERROR: + fputs("invalid compression level\n", stderr); + break; + case Z_DATA_ERROR: + fputs("invalid or incomplete deflate data\n", stderr); + break; + case Z_MEM_ERROR: + fputs("out of memory\n", stderr); + break; + case Z_VERSION_ERROR: + fputs("zlib version mismatch!\n", stderr); + } +} + +/* Initialize zero lzcounts to a val. If the same DHT will be used + repeatedly by different input data, the DHT must contain a symbol + for all possible input symbols. Changing zero counts to a nonzero + count ensures that in the DHT there is a code for every symbol. Of + course this comes at the expense of DHTs being larger */ + +void fill_zero_lzcounts(int *llhist, int *dhist, int val) +{ + int i; + for(i=0; i<286; i++) + if( ! llhist[i] ) + llhist[i] = val; + for(i=0; i<30; i++) + if( ! dhist[i] ) + dhist[i] = val; +} + +/* read lzcounts from file fname and write them to the int arrays + llhist and dhist for Lit/Len and Distance respectively; returns 0 on success, 1 on an unreadable file or out-of-range symbol */ + +int get_lzcounts(char *fname, int *llhist, int *dhist) +{ + int i, lz, prev_lz, count, doll; + FILE *lzf; + char buf[1024]; + if( NULL == ( lzf = fopen( fname, "r" )) ) { + fprintf( stderr, "error: cannot open %s\n", fname ); + return 1; + } + for(i=0; i<286; i++) + llhist[i] = 0; + for(i=0; i<30; i++) + dhist[i] = 0; + prev_lz=0; + doll=1; + while( NULL != fgets( buf, sizeof buf, lzf ) ) { + if( sscanf( buf, "%d : %d", &lz, &count ) != 2 ) continue; /* skip blank or malformed lines */ + if( prev_lz > lz ) /* detect LL to D transition */ + doll = 0; + if( lz < 0 || lz > ( doll ? 285 : 29 ) ) { /* checked at run time: the index comes from the input file */ + fprintf( stderr, "error: symbol %d out of range in %s\n", lz, fname ); + fclose( lzf ); + return 1; + } + prev_lz = lz; + ( doll ? llhist : dhist )[ lz ] = count; + } + llhist[256] = 1; /* The EOB symbol is always present */ + fclose( lzf ); + return 0; +} + +int main(int argc, char **argv) +{ + int ret; + int lhist[286]; + int dhist[30]; + + /* when -f argument is present */ + if (argc == 4 && strcmp(argv[1], "-f") == 0) { + + /* read LZ 
counts from file */ + if( get_lzcounts( argv[2], lhist, dhist ) ) + return 1; + + /* change zero counts to one */ + fill_zero_lzcounts( lhist, dhist, 1 ); + + /* make the dht */ + ret = makedht( argv[3], lhist, dhist ); + if (ret != Z_OK) + zerr(ret); + fflush(stderr); + + return ret; + } + /* no -f argument */ + else if( argc == 3 ) { + + /* read LZ counts from file */ + if( get_lzcounts( argv[1], lhist, dhist ) ) + return 1; + + /* make the dht */ + ret = makedht( argv[2], lhist, dhist ); + if (ret != Z_OK) + zerr(ret); + fflush(stderr); + + return ret; + } + /* when argument count is wrong, report usage */ + else { + fprintf( stderr, "usage:\n"); + fprintf( stderr, "%s [-f] <lzcount_file> <dht_file>\n", argv[0]); + fprintf( stderr, " <lzcount_file> contains a symbol : count pair per line of input.\n"); + fprintf( stderr, " Lit/Len symbols 0..285 must be followed by Distance symbols 0..29.\n"); + fprintf( stderr, " Missing symbols have a count of 0 by default.\n"); + fprintf( stderr, " The optional -f changes 0 counts to 1.\n"); + fprintf( stderr, " Human readable output is printed to stdout.\n"); + fprintf( stderr, " Number of bits in the DHT is printed in the first 16 bytes.\n"); + fprintf( stderr, " Number of unused bits in the DHT tail byte is also printed.\n"); + fprintf( stderr, " Binary output is dumped to <dht_file>.\n"); + return 1; + } + +} diff --git a/trees.c b/trees.c index e052a470..939e90da 100644 --- a/trees.c +++ b/trees.c @@ -1117,3 +1117,66 @@ int ZLIB_INTERNAL _tr_tally(deflate_state *s, unsigned dist, unsigned lc) { } return (s->sym_next == s->sym_end); } + +/* =========================================================================== + * Make deflate trees for a given literal/length and distance histograms. + * l_hist supplies L_CODES counts and d_hist supplies D_CODES counts; the + * encoded trees are sent to the pending buffer of s. Returns the number of + * valid bits in the last output byte (0 when the output ends byte aligned). + */ +int make_trees(s, l_hist, d_hist) + deflate_state *s; + int *l_hist; + int *d_hist; +{ + int max_blindex; + int n; + int bits_valid; + + init_block( s ); + + /* copy in histograms */ + for (n = 0; n < D_CODES; n++) + s->dyn_dtree[n].Freq = d_hist[n]; + for (n = 0; n < 
L_CODES; n++) + s->dyn_ltree[n].Freq = l_hist[n]; + + /* EOB symbol always present in a dynamic block, whatever count the caller supplied */ + s->dyn_ltree[END_BLOCK].Freq = 1; + + build_tree(s, (tree_desc *) (&(s->l_desc))); /* Lit/Len tree */ + build_tree(s, (tree_desc *) (&(s->d_desc))); /* Distance tree */ + max_blindex = build_bl_tree(s); /* Code length codes */ + + /* send_bits(s, (DYN_TREES << 1), 3); no space for the 3 bit block header */ + + /* write trees to the pending buffer */ + send_all_trees(s, s->l_desc.max_code + 1, s->d_desc.max_code + 1, max_blindex + 1); + + /* record how many bits of the tail byte are in use (0 = byte aligned), then flush the bit buffer and align output tail to the byte boundary */ + bits_valid = s->bi_valid % 8; + bi_windup(s); + +#ifdef ZLIB_DEBUG + do { + int n; + fprintf(stderr, "BL_CODES:\n"); + for (n = 0; n < BL_CODES; n++) + if( s->bl_tree[n].Len != 0 && n <= s->bl_desc.max_code ) + fprintf(stderr, "bl: %3d l: %2d c: 0x%X\n", n, s->bl_tree[n].Len, s->bl_tree[n].Code ); + fprintf(stderr, "L_CODES:\n"); + for (n = 0; n < L_CODES; n++) + if( s->dyn_ltree[n].Len != 0 && n <= s->l_desc.max_code ) + fprintf(stderr, "ll: %3d l: %2d c: 0x%X\n", n, s->dyn_ltree[n].Len, s->dyn_ltree[n].Code ); + fprintf(stderr, "D_CODES:\n"); + for (n = 0; n < D_CODES; n++) + if( s->dyn_dtree[n].Len != 0 && n <= s->d_desc.max_code ) + fprintf(stderr, "di: %3d l: %2d c: 0x%X\n", n, s->dyn_dtree[n].Len, s->dyn_dtree[n].Code ); + fprintf(stderr, "\n"); + } while(0); + + fprintf(stderr, "valid bits in the last byte: %d\n", bits_valid ); +#endif + + return bits_valid; +} diff --git a/zlib.h b/zlib.h index ad33815f..4f988eaf 100644 --- a/zlib.h +++ b/zlib.h @@ -1946,6 +1946,16 @@ ZEXTERN int ZEXPORTVA gzvprintf(gzFile file, # endif #endif + +ZEXTERN int ZEXPORT deflate_make_dht OF(( z_streamp strm, int *lhistg, int *dhistg, int *bits_valid )); +/* + * Given histograms lhistg and dhistg, the function builds a huffman table that + * can be used as the front matter of Type 2 blocks. + * DHT is returned in the buffer strm->next_out. 
+ * Number of valid bits in the last valid byte of the buffer is also returned. + */ + + #ifdef __cplusplus } #endif