NCBI C++ ToolKit
ct_nlmzip_trees.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: ct_nlmzip_trees.cpp 94680 2021-08-30 13:02:50Z vakatov $ */
2 /*****************************************************************************
3 
4  Name: trees.c ur/compr/trees.c
5 
6  Description: Utility functions for compress/uncompress data
7 
8  Author: Grisha Starchenko
9  InforMax, Inc.
10  Gaithersburg, USA.
11 
12  ***************************************************************************
13 
14  PUBLIC DOMAIN NOTICE
15  National Center for Biotechnology Information
16 
17  This software/database is a "United States Government Work" under the
18  terms of the United States Copyright Act. It was written as part of
19  the author's official duties as a United States Government employee
20  and thus cannot be copyrighted. This software/database is freely
21  available to the public for use. The National Library of Medicine and
22  the U.S. Government have not placed any restriction on its use or
23  reproduction.
24 
25  Although all reasonable efforts have been taken to ensure the accuracy
26  and reliability of the software and data, the NLM and the U.S.
27  Government do not and cannot warrant the performance or results that
28  may be obtained by using this software or data. The NLM and the U.S.
29  Government disclaim all warranties, express or implied, including
30  warranties of performance, merchantability or fitness for any
31  particular purpose.
32 
33  Please cite the author in any work or product based on this material.
34 
35  ***************************************************************************
36 
37  Entry Points:
38 
39  void
40  Nlmzip_ct_init (attr,methodp)
41  ush *attr; [ pointer to internal file attribute (I/O) ]
42  int *methodp; [ pointer to compression Nlmzip_method (I/O) ]
43 
44  ulg
45  Nlmzip_flush_block (buf,stored_len,eof)
46  char *buf; [ input block, or NULL if too old (I) ]
47  ulg stored_len; [ length of input block (I) ]
48  int eof; [ true if this is the last block (I) ]
49 
50  int
51  Nlmzip_ct_tally (dist,lc)
52  int dist; [ distance of matched string (I) ]
53  int lc; [ match length-MIN_MATCH or unmatched char (I) ]
54 
55  Modification History:
56  11 Aug 1995 - grisha - original written
57 
58  Bugs and restriction on use:
59 
60  Notes:
61 
62 *****************************************************************************/
63 
64 #include <ncbi_pch.hpp>
65 #include "ct_nlmzip_i.h"
66 
68 
69 
70 /****************************************************************************/
71 /* TYPEDEFS */
72 /****************************************************************************/
73 /* Data structure describing a single value and its code string. */
74 typedef struct ct_data {
75  union {
76  ush freq; /* frequency count */
77  ush code; /* bit string */
78  } fc;
79  union {
80  ush dad; /* father node in Huffman tree */
81  ush len; /* length of bit string */
82  } dl;
84 
85 typedef struct tree_desc {
86  ct_data *dyn_tree; /* the dynamic tree */
87  ct_data *static_tree; /* corresponding static tree or NULL */
88  int *extra_bits; /* extra bits for each code or NULL */
89  int extra_base; /* base index for extra_bits */
90  int elems; /* max number of elements in the tree */
91  int max_length; /* max bit length for the codes */
92  int max_code; /* largest code with non zero frequency */
94 
95 /****************************************************************************/
96 /* DEFINES */
97 /****************************************************************************/
98 /* All codes must not exceed MAX_BITS bits */
99 #define MAX_BITS 15
100 
101 /* Bit length codes must not exceed MAX_BL_BITS bits */
102 #define MAX_BL_BITS 7
103 
104 /* number of length codes, not counting the special END_BLOCK code */
105 #define LENGTH_CODES 29
106 
107 /* number of literal bytes 0..255 */
108 #define LITERALS 256
109 
110 #define END_BLOCK 256
111 /* end of block literal code */
112 
113 /* number of Literal or Length codes, including the END_BLOCK code */
114 #define L_CODES (LITERALS+1+LENGTH_CODES)
115 
116 /* number of distance codes */
117 #define D_CODES 30
118 
119 /* number of codes used to transfer the bit lengths */
120 #define BL_CODES 19
121 
122 /* The three kinds of block type */
123 #define STORED_BLOCK 0
124 #define STATIC_TREES 1
125 #define DYN_TREES 2
126 
127 /* Sizes of match buffers for literals/lengths and distances. There are
128  4 reasons for limiting LIT_BUFSIZE to 64K:
129  - frequencies can be kept in 16 bit counters
130  - if compression is not successful for the first block, all input data is
131  still in the Nlmzip_window so we can still emit a stored block even when input
132  comes from standard input. (This can also be done for all blocks if
133  LIT_BUFSIZE is not greater than 32K.)
134  - if compression is not successful for a file smaller than 64K, we can
135  even emit a stored file instead of a stored block (saving 5 bytes).
136  - creating new Huffman trees less frequently may not provide fast
137  adaptation to changes in the input data statistics. (Take for
138  example a binary file with poorly compressible code followed by
139  a highly compressible string table.) Smaller buffer sizes give
140  fast adaptation but have of course the overhead of transmitting trees
141  more frequently.
142  The current code is general and allows DIST_BUFSIZE < LIT_BUFSIZE (to save
143  memory at the expense of compression). Some optimizations would be possible
144  if we rely on DIST_BUFSIZE == LIT_BUFSIZE.
145 */
146 #define LIT_BUFSIZE 0x8000
147 #if LIT_BUFSIZE > INBUFSIZ
148  error cannot overlay l_buf and Nlmzip_inbuf
149 #endif
150 
151 /* repeat previous bit length 3-6 times (2 bits of repeat count) */
152 #define REP_3_6 16
153 
154 /* repeat a zero length 3-10 times (3 bits of repeat count) */
155 #define REPZ_3_10 17
156 
157 /* repeat a zero length 11-138 times (7 bits of repeat count) */
158 #define REPZ_11_138 18
159 
160 #define Freq fc.freq
161 #define Code fc.code
162 #define Dad dl.dad
163 #define Len dl.len
164 
165 /* maximum heap size */
166 #define HEAP_SIZE (2*L_CODES+1)
167 
168 /* Send a code of the given tree. c and tree must not have
169  side effects
170 */
171 #define send_code(c, tree) Nlmzip_send_bits(tree[c].Code, tree[c].Len)
172 
173 /* Mapping from a distance to a distance code. dist is the
174  distance - 1 and must not have side effects. dist_code[256]
175  and dist_code[257] are never used.
176 */
177 #define d_code(dist) \
178  ((dist) < 256 \
179  ? dist_code[dist] \
180  : dist_code[256+((dist)>>7)])
181 
182 /* the arguments must not have side effects */
183 #define URMAX(a,b) (a >= b ? a : b)
184 
185 /* Index within the heap array of least frequent node in the Huffman tree */
186 #define SMALLEST 1
187 
188 /* Remove the smallest element from the heap and recreate the
189  heap with one less element. Updates heap and heap_len.
190 */
191 #define pqremove(tree, top) \
192 {\
193  top = heap[SMALLEST]; \
194  heap[SMALLEST] = heap[heap_len--]; \
195  pqdownheap(tree, SMALLEST); \
196 }
197 
198 /* Compares to subtrees, using the tree depth as tie breaker when
199  the subtrees have equal frequency. This minimizes the worst case
200  length.
201 */
202 #define smaller(tree, n, m) \
203  (tree[n].Freq < tree[m].Freq || \
204  (tree[n].Freq == tree[m].Freq && depth[n] <= depth[m]))
205 
206 /****************************************************************************/
207 /* LOCAL FUNCTIONS PROTOTYPES */
208 /****************************************************************************/
209 
210 static void init_block _((void));
211 static void pqdownheap _((ct_data*,int));
212 static void gen_bitlen _((tree_desc*));
213 static void gen_codes _((ct_data*,int));
214 static void build_tree _((tree_desc*));
215 static void scan_tree _((ct_data*,int));
216 static void send_tree _((ct_data*,int));
217 static int build_bl_tree _((void));
218 static void send_all_trees _((int,int,int));
219 static void compress_block _((ct_data*,ct_data*));
220 static void set_file_type _((void));
221 
222 /****************************************************************************/
223 /* LOCAL VARIABLES */
224 /****************************************************************************/
225 /* extra bits for each length code */
226 static int extra_lbits[LENGTH_CODES] = {
227  0,0,0,0,0,0,0,0,1,1,1,1,
228  2,2,2,2,3,3,3,3,4,4,4,4,
229  5,5,5,5,0
230  };
231 
232 /* extra bits for each distance code */
233 static int extra_dbits[D_CODES] = {
234  0,0,0,0,1,1,2,2,3,3,4,4,
235  5,5,6,6,7,7,8,8,9,9,10,10,
236  11,11,12,12,13,13
237  };
238 
239 /* extra bits for each bit length code */
240 static int extra_blbits[BL_CODES] = {
241  0,0,0,0,0,0,0,0,0,0,
242  0,0,0,0,0,0,2,3,7
243  };
244 
245 static ct_data dyn_ltree[HEAP_SIZE]; /* literal and length tree */
246 static ct_data dyn_dtree[2*D_CODES+1]; /* distance tree */
247 
248 /* The static literal tree. Since the bit lengths are imposed, there is no
249  need for the L_CODES extra codes used during heap construction. However
250  The codes 286 and 287 are needed to build a canonical tree (see Nlmzip_ct_init
251  below).
252 */
254 
255 /* The static distance tree. (Actually a trivial tree since all codes use
256  5 bits.)
257 */
259 
260 /* Huffman tree for the bit lengths */
262 
263 static tree_desc l_desc = {
264  dyn_ltree,
265  static_ltree,
266  extra_lbits,
267  LITERALS+1,
268  L_CODES,
269  MAX_BITS,
270  0
271  };
272 
273 static tree_desc d_desc = {
274  dyn_dtree,
275  static_dtree,
276  extra_dbits,
277  0,
278  D_CODES,
279  MAX_BITS,
280  0
281  };
282 
283 static tree_desc bl_desc = {
284  bl_tree,
285  (ct_data*)0,
286  extra_blbits,
287  0,
288  BL_CODES,
289  MAX_BL_BITS,
290  0
291  };
292 
293 /* number of codes at each bit length for an optimal tree */
294 static unsigned short bl_count[MAX_BITS+1];
295 
296 /* The lengths of the bit length codes are sent in order of decreasing
297  probability, to avoid transmitting the lengths for unused bit length codes.
298 */
299 static unsigned char bl_order[BL_CODES] = {
300  16,17,18,0,8,7,9,6,10,
301  5,11,4,12,3,13,2,14,1,15
302  };
303 
304 /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
305  The same heap array is used to build all trees.
306 */
307 static int heap[2*L_CODES+1]; /* heap used to build the Huffman trees */
308 static int heap_len; /* number of elements in the heap */
309 static int heap_max; /* element of largest frequency */
310 
311 /* Depth of each subtree used as tie breaker for trees of equal frequency */
312 static unsigned char depth[2*L_CODES+1];
313 
314 /* length code for each normalized match length (0 == MIN_MATCH) */
315 static unsigned char length_code[MAX_MATCH-MIN_MATCH+1];
316 
317 /* distance codes. The first 256 values correspond to the distances
318  3 .. 258, the last 256 values correspond to the top 8 bits of
319  the 15 bit distances.
320 */
321 static unsigned char dist_code[512];
322 
323 /* First normalized length for each code (0 = MIN_MATCH) */
325 
326 /* First normalized distance for each code (0 = distance of 1) */
327 static int base_dist[D_CODES];
328 
329 /* flag_buf is a bit array distinguishing literals from lengths in
330  l_buf, thus indicating the presence or absence of a distance.
331 */
332 #define l_buf Nlmzip_inbuf /* overlay l_buf and Nlmzip_inbuf */
333 static unsigned char flag_buf[(LIT_BUFSIZE/8)];
334 
335 /* bits are filled in flags starting at bit 0 (least significant).
336  Note: these flags are overkill in the current code since we don't
337  take advantage of DIST_BUFSIZE == LIT_BUFSIZE.
338 */
339 static Uint4 last_lit; /* running index in l_buf */
340 static Uint4 last_dist; /* running index in Nlmzip_d_buf */
341 static Uint4 last_flags; /* running index in flag_buf */
342 static uch flags; /* current flags not yet saved in flag_buf */
343 static uch flag_bit; /* current bit used in flags */
344 
345 static ulg opt_len; /* bit length of current block with optimal trees */
346 static ulg static_len; /* bit length of current block with static trees */
347 
348 static ulg compressed_len; /* total bit length of compressed file */
349 
350 static ush *file_type; /* pointer to UNKNOWN, BINARY or ASCII */
351 static int *file_method; /* pointer to DEFLATE or STORE */
352 
353 /****************************************************************************/
354 /* LOCAL FUNCTIONS */
355 /****************************************************************************/
356 
357 /****************************************************************************/
358 /*.doc init_block (internal) */
359 /*+
360  Initialize a new block.
361 -*/
362 /****************************************************************************/
363 static void
364 init_block (void) /*FCN*/
365 {
366  int n; /* iterates over tree elements */
367 
368  /* Initialize the trees. */
369  for ( n = 0; n < L_CODES; n++ ) {
370  dyn_ltree[n].Freq = 0;
371  }
372  for ( n = 0; n < D_CODES; n++ ) {
373  dyn_dtree[n].Freq = 0;
374  }
375  for ( n = 0; n < BL_CODES; n++ ) {
376  bl_tree[n].Freq = 0;
377  }
378 
379  dyn_ltree[END_BLOCK].Freq = 1;
380  opt_len = static_len = 0L;
381  last_lit = last_dist = last_flags = 0;
382  flags = 0;
383  flag_bit = 1;
384 }
385 
386 /****************************************************************************/
387 /*.doc pqdownheap (internal) */
388 /*+
389  Restore the heap property by moving down the tree starting at node k,
390  exchanging a node with the smallest of its two sons if necessary, stopping
391  when the heap property is re-established (each father smaller than its
392  two sons).
393 -*/
394 /****************************************************************************/
395 static void
396 pqdownheap ( /*FCN*/
397  ct_data *tree, /* the tree to restore */
398  int k /* node to move down */
399 ){
400  int v = heap[k];
401  int j = k << 1; /* left son of k */
402 
403  while ( j <= heap_len ) {
404  /* Set j to the smallest of the two sons: */
405  if ( j < heap_len
406  && smaller(tree, heap[j+1], heap[j]) ) {
407  j++;
408  }
409 
410  /* Exit if v is smaller than both sons */
411  if ( smaller(tree, v, heap[j]) ) {
412  break;
413  }
414 
415  /* Exchange v with the smallest son */
416  heap[k] = heap[j];
417  k = j;
418 
419  /* And continue down the tree, setting j to the left son of k */
420  j <<= 1;
421  }
422  heap[k] = v;
423 }
424 
425 /****************************************************************************/
426 /*.doc gen_bitlen (internal) */
427 /*+
428  Compute the optimal bit lengths for a tree and update the total bit
429  length for the current block.
430  IN assertion: the fields freq and dad are set, heap[heap_max] and
431  above are the tree nodes sorted by increasing frequency.
432  OUT assertions: the field len is set to the optimal bit length, the
433  array bl_count contains the frequencies for each bit length.
434  The length opt_len is updated; static_len is also updated if stree
435  is not null.
436 -*/
437 /****************************************************************************/
438 static void
439 gen_bitlen ( /*FCN*/
440  tree_desc *desc /* the tree descriptor */
441 ){
442  ct_data *tree = desc->dyn_tree;
443  int *extra = desc->extra_bits;
444  int base = desc->extra_base;
445  int max_code = desc->max_code;
446  int max_length = desc->max_length;
447  ct_data *stree = desc->static_tree;
448  int h; /* heap index */
449  int n, m; /* iterate over the tree elements */
450  int bits; /* bit length */
451  int xbits; /* extra bits */
452  ush f; /* frequency */
453  int overflow = 0; /* number of elements with bit length too large */
454 
455  for ( bits = 0; bits <= MAX_BITS; bits++ ) {
456  bl_count[bits] = 0;
457  }
458 
459  /* In a first pass, compute the optimal bit lengths (which may
460  overflow in the case of the bit length tree).
461  */
462  tree[heap[heap_max]].Len = 0; /* root of the heap */
463 
464  for ( h = heap_max+1; h < HEAP_SIZE; h++ ) {
465  n = heap[h];
466  bits = tree[tree[n].Dad].Len + 1;
467  if ( bits > max_length ) {
468  bits = max_length;
469  overflow++;
470  }
471  /* We overwrite tree[n].Dad which is no longer needed */
472  tree[n].Len = (ush)bits;
473 
474  if ( n > max_code ) { /* not a leaf node */
475  continue;
476  }
477 
478  bl_count[bits]++;
479  xbits = 0;
480  if ( n >= base ) {
481  xbits = extra[n-base];
482  }
483  f = tree[n].Freq;
484  opt_len += (ulg)f * (bits + xbits);
485  if ( stree ) {
486  static_len += (ulg)f * (stree[n].Len + xbits);
487  }
488  }
489  if ( overflow == 0 ) {
490  return;
491  }
492 
493  /* Find the first bit length which could increase: */
494  do {
495  bits = max_length-1;
496  while ( bl_count[bits] == 0 ) {
497  bits--;
498  }
499  bl_count[bits]--; /* move one leaf down the tree */
500  bl_count[bits+1] += 2; /* move one overflow item as its brother */
501  bl_count[max_length]--;
502  /* The brother of the overflow item also moves one step up,
503  but this does not affect bl_count[max_length]
504  */
505  overflow -= 2;
506  } while ( overflow > 0 );
507 
508  /* Now recompute all bit lengths, scanning in increasing frequency.
509  h is still equal to HEAP_SIZE. (It is simpler to reconstruct all
510  lengths instead of fixing only the wrong ones. This idea is taken
511  from 'ar' written by Haruhiko Okumura.)
512  */
513  for ( bits = max_length; bits != 0; bits-- ) {
514  n = bl_count[bits];
515  while ( n != 0 ) {
516  m = heap[--h];
517  if ( m > max_code ) {
518  continue;
519  }
520  if ( tree[m].Len != (Uint4) bits ) {
521  opt_len += ((Int4)bits-(Int4)tree[m].Len)
522  *(Int4)tree[m].Freq;
523  tree[m].Len = (ush)bits;
524  }
525  n--;
526  }
527  }
528 }
529 
530 /****************************************************************************/
531 /*.doc gen_codes (internal) */
532 /*+
533  Generate the codes for a given tree and bit counts (which need not
534  be optimal).
535  IN assertion: the array bl_count contains the bit length statistics
536  for the given tree and the field len is set for all tree elements.
537  OUT assertion: the field code is set for all tree elements of non
538  zero code length.
539 -*/
540 /****************************************************************************/
541 static void
542 gen_codes ( /*FCN*/
543  ct_data *tree, /* the tree to decorate */
544  int max_code /* largest code with non zero frequency */
545 ){
546  ush next_code[MAX_BITS+1]; /* next code value for each bit length */
547  ush code = 0; /* running code value */
548  int bits; /* bit index */
549  int n; /* code index */
550  int len;
551 
552  /* The distribution counts are first used to generate the
553  code values without bit reversal.
554  */
555  for ( bits = 1; bits <= MAX_BITS; bits++ ) {
556  next_code[bits] = code = (code + bl_count[bits-1]) << 1;
557  }
558 
559  /* Check that the bit counts in bl_count are
560  consistent. The last code must be all ones.
561  */
562  for ( n = 0; n <= max_code; n++ ) {
563  if ( (len=tree[n].Len) == 0 ) {
564  continue;
565  }
566 
567  /* Now reverse the bits */
568  tree[n].Code = Nlmzip_bi_reverse (
569  next_code[len]++,
570  len
571  );
572  }
573 }
574 
575 /****************************************************************************/
576 /*.doc build_tree (internal) */
577 /*+
578  Construct one Huffman tree and assigns the code bit strings and
579  lengths. Update the total bit length for the current block.
580  IN assertion: the field freq is set for all tree elements.
581  OUT assertions: the fields len and code are set to the optimal bit
582  length and corresponding code. The length opt_len is updated;
583  static_len is also updated if stree is not null. The field
584  max_code is set.
585 -*/
586 /****************************************************************************/
587 static void
588 build_tree ( /*FCN*/
589  tree_desc *desc /* the tree descriptor */
590 ){
591  ct_data *tree = desc->dyn_tree;
592  ct_data *stree = desc->static_tree;
593  int elems = desc->elems;
594  int n, m; /* iterate over heap elements */
595  int max_code = -1; /* largest code with non zero frequency */
596  int node = elems; /* next internal node of the tree */
597 
598  /* Construct the initial heap, with least frequent element
599  in heap[SMALLEST]. The sons of heap[n] are heap[2*n] and
600  heap[2*n+1]. heap[0] is not used.
601  */
602  heap_len = 0;
604 
605  for ( n = 0; n < elems; n++ ) {
606  if (tree[n].Freq != 0) {
607  heap[++heap_len] = max_code = n;
608  depth[n] = 0;
609  } else {
610  tree[n].Len = 0;
611  }
612  }
613 
614  /* The pkzip format requires that at least one distance code
615  exists, and that at least one bit should be sent even if
616  there is only one possible code. So to avoid special checks
617  later on we force at least two codes of non zero frequency.
618  */
619  while ( heap_len < 2 ) {
620  int new_off = heap[++heap_len] = (max_code < 2 ? ++max_code : 0);
621  tree[new_off].Freq = 1;
622  depth[new_off] = 0;
623  opt_len--;
624  if ( stree ) {
625  static_len -= stree[new_off].Len;
626  }
627  /* new_off is 0 or 1 so it does not have extra bits */
628  }
629  desc->max_code = max_code;
630 
631  /* The elements heap[heap_len/2+1 .. heap_len] are leaves of
632  the tree, establish sub-heaps of increasing lengths:
633  */
634  for ( n = heap_len/2; n >= 1; n-- ) {
635  pqdownheap(tree, n);
636  }
637 
638  /* Construct the Huffman tree by repeatedly combining the
639  least two frequent nodes.
640  */
641  do {
642  pqremove(tree, n); /* n = node of least frequency */
643  m = heap[SMALLEST]; /* m = node of next least frequency */
644 
645  heap[--heap_max] = n; /* keep the nodes sorted by frequency */
646  heap[--heap_max] = m;
647 
648  /* Create a new node father of n and m */
649  tree[node].Freq = tree[n].Freq + tree[m].Freq;
650  depth[node] = (uch) (URMAX(depth[n], depth[m]) + 1);
651  tree[n].Dad = tree[m].Dad = (ush)node;
652 
653  /* and insert the new node in the heap */
654  heap[SMALLEST] = node++;
656 
657  } while ( heap_len >= 2 );
658 
659  heap[--heap_max] = heap[SMALLEST];
660 
661  /* At this point, the fields freq and dad are set.
662  We can now generate the bit lengths.
663  */
664  gen_bitlen((tree_desc *)desc);
665 
666  /* The field len is now set, we can generate the bit codes */
667  gen_codes ((ct_data*)tree, max_code);
668 }
669 
670 /****************************************************************************/
671 /*.doc scan_tree (internal) */
672 /*+
673  Scan a literal or distance tree to determine the frequencies of
674  the codes in the bit length tree. Updates opt_len to take into
675  account the repeat counts. (The contribution of the bit length
676  codes will be added later during the construction of bl_tree.)
677 -*/
678 /****************************************************************************/
680  ct_data *tree, /* the tree to be scanned */
681  int max_code /* and its largest code of non zero frequency */
682 ){
683  int n; /* iterates over all tree elements */
684  int prevlen = -1; /* last emitted length */
685  int curlen; /* length of current code */
686  int nextlen = tree[0].Len; /* length of next code */
687  int count = 0; /* repeat count of the current code */
688  int max_count = 7; /* max repeat count */
689  int min_count = 4; /* min repeat count */
690 
691  if ( nextlen == 0 ) {
692  max_count = 138; min_count = 3;
693  }
694  tree[max_code+1].Len = (ush)0xffff; /* guard */
695 
696  for ( n = 0; n <= max_code; n++ ) {
697  curlen = nextlen;
698  nextlen = tree[n+1].Len;
699  if ( ++count < max_count && curlen == nextlen ) {
700  continue;
701  } else if ( count < min_count ) {
702  bl_tree[curlen].Freq += count;
703  } else if ( curlen != 0 ) {
704  if ( curlen != prevlen ) {
705  bl_tree[curlen].Freq++;
706  }
707  bl_tree[REP_3_6].Freq++;
708  } else if ( count <= 10 ) {
709  bl_tree[REPZ_3_10].Freq++;
710  } else {
711  bl_tree[REPZ_11_138].Freq++;
712  }
713  count = 0;
714  prevlen = curlen;
715  if ( nextlen == 0 ) {
716  max_count = 138;
717  min_count = 3;
718  } else if ( curlen == nextlen ) {
719  max_count = 6;
720  min_count = 3;
721  } else {
722  max_count = 7;
723  min_count = 4;
724  }
725  }
726 }
727 
728 /****************************************************************************/
729 /*.doc send_tree (internal) */
730 /*+
731  Send a literal or distance tree in compressed form, using the
732  codes in bl_tree.
733 -*/
734 /****************************************************************************/
735 static void
736 send_tree ( /*FCN*/
737  ct_data *tree, /* the tree to be scanned */
738  int max_code /* and its largest code of non zero frequency */
739 ){
740  int n; /* iterates over all tree elements */
741  int prevlen = -1; /* last emitted length */
742  int curlen; /* length of current code */
743  int nextlen = tree[0].Len; /* length of next code */
744  int count = 0; /* repeat count of the current code */
745  int max_count = 7; /* max repeat count */
746  int min_count = 4; /* min repeat count */
747 
748  /* tree[max_code+1].Len = -1; guard already set */
749  if ( nextlen == 0 ) {
750  max_count = 138;
751  min_count = 3;
752  }
753 
754  for ( n = 0; n <= max_code; n++ ) {
755  curlen = nextlen;
756  nextlen = tree[n+1].Len;
757  if ( ++count < max_count && curlen == nextlen ) {
758  continue;
759  } else if ( count < min_count ) {
760  do {
761  send_code(curlen, bl_tree);
762  } while ( --count != 0 );
763  } else if ( curlen != 0 ) {
764  if ( curlen != prevlen ) {
765  send_code(curlen, bl_tree);
766  count--;
767  }
769  Nlmzip_send_bits(count-3, 2);
770  } else if ( count <= 10 ) {
772  Nlmzip_send_bits(count-3, 3);
773  } else {
775  Nlmzip_send_bits(count-11, 7);
776  }
777  count = 0;
778  prevlen = curlen;
779  if ( nextlen == 0 ) {
780  max_count = 138;
781  min_count = 3;
782  } else if ( curlen == nextlen ) {
783  max_count = 6;
784  min_count = 3;
785  } else {
786  max_count = 7;
787  min_count = 4;
788  }
789  }
790 }
791 
792 /****************************************************************************/
793 /*.doc build_bl_tree (internal) */
794 /*+
795  Construct the Huffman tree for the bit lengths and return the index in
796  bl_order of the last bit length code to send.
797 -*/
798 /****************************************************************************/
799 static int
800 build_bl_tree (void) /*FCN*/
801 {
802  int max_blindex; /* index of last bit length code of non zero freq */
803 
804  /* Determine the bit length frequencies for literal and distance trees */
807 
808  /* Build the bit length tree: */
810 
811  /* opt_len now includes the length of the tree representations,
812  except the lengths of the bit lengths codes and the 5+5+4 bits
813  for the counts.
814  */
815 
816  /* Determine the number of bit length codes to send. The pkzip
817  format requires that at least 4 bit length codes be sent.
818  (appnote.txt says 3 but the actual value used is 4.)
819  */
820  for ( max_blindex = BL_CODES-1; max_blindex >= 3; max_blindex-- ) {
821  if ( bl_tree[bl_order[max_blindex]].Len != 0 ) {
822  break;
823  }
824  }
825 
826  /* Update opt_len to include the bit length tree and counts */
827  opt_len += 3*(max_blindex+1) + (5+5+4);
828 
829  return max_blindex;
830 }
831 
832 /****************************************************************************/
833 /*.doc send_all_trees (internal) */
834 /*+
835  Send the header for a block using dynamic Huffman trees: the counts,
836  the lengths of the bit length codes, the literal tree and the distance
837  tree.
838  IN assertion: lcodes >= 257, dcodes >= 1, blcodes >= 4.
839 -*/
840 /****************************************************************************/
841 static void
842 send_all_trees ( /*FCN*/
843  int lcodes, /* number of codes for each tree */
844  int dcodes,
845  int blcodes
846 ){
847  int rank; /* index in bl_order */
848 
849  Nlmzip_send_bits(lcodes-257, 5); /* not +255 as stated in appnote.txt */
850  Nlmzip_send_bits(dcodes-1, 5);
851  Nlmzip_send_bits(blcodes-4, 4); /* not -3 as stated in appnote.txt */
852  for ( rank = 0; rank < blcodes; rank++ ) {
854  }
855  send_tree((ct_data*)dyn_ltree, lcodes-1); /* send the literal tree */
856  send_tree((ct_data*)dyn_dtree, dcodes-1); /* send the distance tree */
857 }
858 
859 /****************************************************************************/
860 /*.doc compress_block (internal) */
861 /*+
862  Send the block data compressed using the given Huffman trees
863 -*/
864 /****************************************************************************/
865 static void
866 compress_block ( /*FCN*/
867  ct_data *ltree, /* literal tree */
868  ct_data *dtree /* distance tree */
869 ){
870  Uint4 dist; /* distance of matched string */
871  int lc; /* match length or unmatched char (if dist == 0) */
872  Uint4 lx = 0; /* running index in l_buf */
873  Uint4 dx = 0; /* running index in Nlmzip_d_buf */
874  Uint4 fx = 0; /* running index in flag_buf */
875  uch flag = 0; /* current flags */
876  Uint4 code; /* the code to send */
877  int extra; /* number of extra bits to send */
878 
879  if ( last_lit != 0 ) do {
880  if ( (lx & 7) == 0 ) {
881  flag = flag_buf[fx++];
882  }
883  lc = l_buf[lx++];
884  if ( (flag & 1) == 0 ) {
885  send_code(lc, ltree); /* send a literal byte */
886  } else {
887  /* Here, lc is the match length - MIN_MATCH */
888  code = length_code[lc];
889  send_code(code+LITERALS+1, ltree); /* send the length code */
890  extra = extra_lbits[code];
891  if ( extra != 0 ) {
892  lc -= base_length[code];
893  Nlmzip_send_bits(lc, extra); /* send the extra length bits */
894  }
895  dist = Nlmzip_d_buf[dx++];
896 
897  /* Here, dist is the match distance - 1 */
898  code = d_code(dist);
899 
900  send_code(code, dtree); /* send the distance code */
901  extra = extra_dbits[code];
902  if ( extra != 0 ) {
903  dist -= base_dist[code];
904  Nlmzip_send_bits(dist, extra); /* send the extra distance bits */
905  }
906  } /* literal or match pair ? */
907  flag >>= 1;
908  } while (lx < last_lit);
909 
910  send_code(END_BLOCK, ltree);
911 }
912 
913 /****************************************************************************/
914 /*.doc set_file_type (internal) */
915 /*+
916  Set file type
917 -*/
918 /****************************************************************************/
919 static void
920 set_file_type (void) /*FCN*/
921 {
922  *file_type = BINARY;
923 }
924 
925 /****************************************************************************/
926 /* GLOBAL FUNCTIONS */
927 /****************************************************************************/
928 
929 /****************************************************************************/
930 /*.doc Nlmzip_ct_init (external) */
931 /*+
932  Allocate the match buffer, initialize the various tables and save the
933  location of the internal file attribute (ascii/binary) and Nlmzip_method
934  (DEFLATE/STORE).
935 -*/
936 /****************************************************************************/
937 void
938 Nlmzip_ct_init ( /*FCN*/
939  ush *attr, /* pointer to internal file attribute (I/O) */
940  int *methodp /* pointer to compression Nlmzip_method (I/O) */
941 ){
942  int n; /* iterates over tree elements */
943  int bits; /* bit counter */
944  int length; /* length value */
945  int code; /* code value */
946  int dist; /* distance index */
947 
948  file_type = attr;
949  file_method = methodp;
950  compressed_len = 0L;
951 
952  if ( static_dtree[0].Len != 0 ) { /* Nlmzip_ct_init already called */
953  return;
954  }
955 
956  /* Initialize the mapping length (0..255) -> length code (0..28) */
957  length = 0;
958  for ( code = 0; code < LENGTH_CODES-1; code++ ) {
959  base_length[code] = length;
960  for ( n = 0; n < (1<<extra_lbits[code]); n++ ) {
961  length_code[length++] = (uch)code;
962  }
963  }
964 
965  /* Note that the length 255 (match length 258) can be represented
966  in two different ways: code 284 + 5 bits or code 285, so we
967  overwrite length_code[255] to use the best encoding:
968  */
969  length_code[length-1] = (uch)code;
970 
971  /* Initialize the mapping dist (0..32K) -> dist code (0..29) */
972  dist = 0;
973  for ( code = 0 ; code < 16; code++ ) {
974  base_dist[code] = dist;
975  for ( n = 0; n < (1<<extra_dbits[code] ); n++) {
976  dist_code[dist++] = (uch)code;
977  }
978  }
979  dist >>= 7; /* from now on, all distances are divided by 128 */
980  for ( ; code < D_CODES; code++ ) {
981  base_dist[code] = dist << 7;
982  for (n = 0; n < (1<<(extra_dbits[code]-7)); n++) {
983  dist_code[256 + dist++] = (uch)code;
984  }
985  }
986 
987  /* Construct the codes of the static literal tree */
988  for ( bits = 0; bits <= MAX_BITS; bits++ ) {
989  bl_count[bits] = 0;
990  }
991  n = 0;
992  while ( n <= 143 ) {
993  static_ltree[n++].Len = 8;
994  bl_count[8]++;
995  }
996  while ( n <= 255 ) {
997  static_ltree[n++].Len = 9;
998  bl_count[9]++;
999  }
1000  while ( n <= 279 ) {
1001  static_ltree[n++].Len = 7;
1002  bl_count[7]++;
1003  }
1004  /* Codes 286 and 287 do not exist, but we must include them in the
1005  tree construction to get a canonical Huffman tree (longest code
1006  all ones)
1007  */
1008  while ( n <= 287 ) {
1009  static_ltree[n++].Len = 8;
1010  bl_count[8]++;
1011  }
1013 
1014  /* The static distance tree is trivial: */
1015  for ( n = 0; n < D_CODES; n++ ) {
1016  static_dtree[n].Len = 5;
1017  static_dtree[n].Code = Nlmzip_bi_reverse(n, 5);
1018  }
1019 
1020  /* Initialize the first block of the first file: */
1021  init_block();
1022 }
1023 
1024 /****************************************************************************/
1025 /*.doc Nlmzip_flush_block (external) */
1026 /*+
1027  Determine the best encoding for the current block: dynamic trees,
1028  static trees or store, and output the encoded block to the zip file.
1029  This function returns the total compressed length for the file so far.
1030 -*/
1031 /****************************************************************************/
1032 ulg
1034  char *buf, /* input block, or NULL if too old */
1035  ulg stored_len, /* length of input block */
1036  int eof /* true if this is the last block */
1037 ){
1038  ulg opt_lenb, static_lenb; /* opt_len and static_len in bytes */
1039  int max_blindex; /* index of last bit length code of non zero freq */
1040 
1041  flag_buf[last_flags] = flags; /* Save the flags for the last 8 items */
1042 
1043  /* Check if the file is ascii or binary */
1044  if ( *file_type == (ush)UNKNOWN ) {
1045  set_file_type();
1046  }
1047 
1048  /* Construct the literal and distance trees */
1049  build_tree((tree_desc*)(&l_desc));
1050 
1051  build_tree((tree_desc*)(&d_desc));
1052 
1053  /* At this point, opt_len and static_len are the total bit
1054  lengths of the compressed block data, excluding the tree
1055  representations.
1056  */
1057 
1058  /* Build the bit length tree for the above two trees, and get
1059  the index in bl_order of the last bit length code to send.
1060  */
1061  max_blindex = build_bl_tree();
1062 
1063  /* Determine the best encoding. Compute first the block
1064  length in bytes
1065  */
1066  opt_lenb = (opt_len+3+7)>>3;
1067  static_lenb = (static_len+3+7)>>3;
1068 
1069  if ( static_lenb <= opt_lenb ) {
1070  opt_lenb = static_lenb;
1071  }
1072 
1073  /* If compression failed and this is the first and last block,
1074  and if the zip file can be seeked (to rewrite the local header),
1075  the whole file is transformed into a stored file:
1076  */
1077  if ( stored_len+4 <= opt_lenb && buf != (char*)0 ) {
1078  /* The test buf != NULL is only necessary if LIT_BUFSIZE > WSIZE.
1079  Otherwise we can't have processed more than WSIZE input bytes
1080  since the last block flush, because compression would have
1081  been successful. If LIT_BUFSIZE <= WSIZE, it is never too late
1082  to transform a block into a stored block.
1083  */
1084  Nlmzip_send_bits((STORED_BLOCK<<1)+eof, 3); /* send block type */
1085  compressed_len = (compressed_len + 3 + 7) & ~7L;
1086  compressed_len += (stored_len + 4) << 3;
1087  Nlmzip_copy_block(buf, (Uint4)stored_len, 1); /* with header */
1088  } else if ( static_lenb == opt_lenb ) {
1089  Nlmzip_send_bits((STATIC_TREES<<1)+eof, 3);
1090  compress_block (
1093  );
1094  compressed_len += 3 + static_len;
1095  } else {
1096  Nlmzip_send_bits((DYN_TREES<<1)+eof, 3);
1097  send_all_trees(l_desc.max_code+1, d_desc.max_code+1, max_blindex+1);
1099  compressed_len += 3 + opt_len;
1100  }
1101 
1102  init_block();
1103 
1104  if ( eof ) {
1105  Nlmzip_bi_windup();
1106  compressed_len += 7; /* align on byte boundary */
1107  }
1108 
1109  return compressed_len >> 3;
1110 }
1111 
1112 /****************************************************************************/
1113 /*.doc Nlmzip_ct_tally (external) */
1114 /*+
1115  Save the match info and tally the frequency counts. Return
1116  true if the current block must be flushed.
1117 -*/
1118 /****************************************************************************/
1119 int
1121  int dist, /* distance of matched string */
1122  int lc /* match length-MIN_MATCH or unmatched char (if dist==0) */
1123 ){
1124  l_buf[last_lit++] = (uch)lc;
1125  if ( dist == 0 ) {
1126  /* lc is the unmatched char */
1127  dyn_ltree[lc].Freq++;
1128  } else {
1129  /* Here, lc is the match length - MIN_MATCH */
1130  dist--; /* dist = match distance - 1 */
1131 
1132  dyn_ltree[length_code[lc]+LITERALS+1].Freq++;
1133  dyn_dtree[d_code(dist)].Freq++;
1134 
1135  Nlmzip_d_buf[last_dist++] = (ush)dist;
1136  flags |= flag_bit;
1137  }
1138  flag_bit <<= 1;
1139 
1140  /* Output the flags if they fill a byte: */
1141  if ( (last_lit & 7) == 0 ) {
1142  flag_buf[last_flags++] = flags;
1143  flags = 0, flag_bit = 1;
1144  }
1145  /* Try to guess if it is profitable to stop the current block here */
1146  if ( Nlmzip_level > 2 && (last_lit & 0xfff) == 0 ) {
1147  /* Compute an upper bound for the compressed length */
1148  ulg out_length = (ulg)last_lit*8L;
1150  int dcode;
1151  for ( dcode = 0; dcode < D_CODES; dcode++ ) {
1152  out_length += (ulg)dyn_dtree[dcode].Freq*(5L+extra_dbits[dcode]);
1153  }
1154  out_length >>= 3;
1155  if ( last_dist < last_lit/2 && out_length < in_length/2 ) {
1156  return 1;
1157  }
1158  }
1159 
1160  /* We avoid equality with LIT_BUFSIZE because of wraparound at 64K
1161  on 16 bit machines and because stored blocks are restricted to
1162  64K-1 bytes.
1163  */
1164  return (last_lit == LIT_BUFSIZE-1 || last_dist == DIST_BUFSIZE);
1165 }
1166 
1167 
@ UNKNOWN
Definition: aln_formats.hpp:38
void Nlmzip_send_bits(int value, int length)
Uint4 Nlmzip_bi_reverse(Uint4 code, int len)
void Nlmzip_copy_block(char *buf, Uint4 len, int header)
void Nlmzip_bi_windup(void)
Uint4 Nlmzip_strstart
unsigned char Nlmzip_inbuf[0x8000]
BEGIN_CTRANSITION_SCOPE int Nlmzip_level
Int4 Nlmzip_block_start
unsigned short Nlmzip_d_buf[0x8000]
#define DIST_BUFSIZE
Definition: ct_nlmzip_i.h:131
#define _(proto)
Definition: ct_nlmzip_i.h:78
static ct_data dyn_ltree[(2 *(256+1+29)+1)]
static uch flags
static unsigned char length_code[258 - 3+1]
static void scan_tree(ct_data *, int)
#define STATIC_TREES
static Uint4 last_dist
static ct_data static_dtree[30]
#define HEAP_SIZE
#define END_BLOCK
#define pqremove(tree, top)
static int heap_max
static ct_data dyn_dtree[2 *30+1]
#define LIT_BUFSIZE
static tree_desc bl_desc
#define L_CODES
static void gen_codes(ct_data *, int)
static int heap_len
static int base_dist[30]
#define REPZ_11_138
void Nlmzip_ct_init(ush *attr, int *methodp)
static unsigned char dist_code[512]
static int heap[2 *(256+1+29)+1]
struct tree_desc tree_desc
#define REPZ_3_10
static int extra_blbits[19]
#define LITERALS
#define DYN_TREES
static unsigned char depth[2 *(256+1+29)+1]
#define Len
static int build_bl_tree(void)
#define MAX_BITS
static void build_tree(tree_desc *)
#define d_code(dist)
static ct_data bl_tree[2 *19+1]
#define REP_3_6
static ulg static_len
#define smaller(tree, n, m)
static int * file_method
static void pqdownheap(ct_data *, int)
ulg Nlmzip_flush_block(char *buf, ulg stored_len, int eof)
static ct_data static_ltree[(256+1+29)+2]
int Nlmzip_ct_tally(int dist, int lc)
#define send_code(c, tree)
static void set_file_type(void)
#define D_CODES
#define l_buf
static ush * file_type
#define Freq
static ulg compressed_len
#define LENGTH_CODES
#define MAX_BL_BITS
#define BL_CODES
static void compress_block(ct_data *, ct_data *)
#define STORED_BLOCK
static Uint4 last_flags
static void send_all_trees(int, int, int)
static void init_block(void)
static int extra_dbits[30]
static int extra_lbits[29]
static unsigned char flag_buf[(0x8000/8)]
static void send_tree(ct_data *, int)
static unsigned short bl_count[15+1]
#define SMALLEST
static unsigned char bl_order[19]
static tree_desc d_desc
static uch flag_bit
static void gen_bitlen(tree_desc *)
BEGIN_CTRANSITION_SCOPE struct ct_data ct_data
static Uint4 last_lit
#define URMAX(a, b)
static ulg opt_len
static tree_desc l_desc
static int base_length[29]
#define BINARY(opd)
Definition: expr.cpp:42
static int lc
Definition: getdata.c:30
static FILE * f
Definition: readconf.c:23
#define BEGIN_CTRANSITION_SCOPE
Definition: ncbilcl.hpp:49
#define END_CTRANSITION_SCOPE
Definition: ncbilcl.hpp:50
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
char * buf
yy_size_t n
int len
#define count
Definition: inftrees.h:24
union ct_data::@1015 fc
union ct_data::@1016 dl
ct_data * dyn_tree
ct_data * static_tree
#define uch
#define local
Definition: zutil.h:33
unsigned short ush
Definition: zutil.h:41
#define MIN_MATCH
Definition: zutil.h:84
#define MAX_MATCH
Definition: zutil.h:85
unsigned long ulg
Definition: zutil.h:43
unsigned char uch
Definition: zutil.h:39
Modified on Fri Sep 20 14:57:54 2024 by modify_doxy.py rev. 669887