/* Huffman coding, especially for digitized alphabets. * * Contents: * 1. The ESL_HUFFMAN object * 2. Huffman encoding * 3. Huffman decoding * 4. Debugging, development * 5. Internal function, components of creating huffman codes * 6. Example driver * * Useful emacs gdb tricks for displaying bit field v: * p /t v (no leading zeros, beware!) * x &v */ #include "esl_config.h" #include #include "easel.h" #include "esl_quicksort.h" #include "esl_huffman.h" /* Declarations of stuff in internal functions/structures section */ struct hufftree_s { float val; // Sum of frequencies of all leaves under this node int depth; // Depth of node int left; // index of left child in array of tree nodes (0..N-2; 0 is the root) int right; // "" for right child }; static int sort_floats_decreasing(const void *data, int e1, int e2); static int sort_canonical (const void *data, int e1, int e2); static int huffman_tree (ESL_HUFFMAN *hc, struct hufftree_s *htree, const float *fq); static int huffman_codelengths (ESL_HUFFMAN *hc, struct hufftree_s *htree, const float *fq); static int huffman_canonize (ESL_HUFFMAN *hc); static int huffman_decoding_table(ESL_HUFFMAN *hc); static void dump_uint32(FILE *fp, uint32_t v, int L); static void huffman_pack(uint32_t *X, int *ip, int *ap, uint32_t code, int L); static void huffman_unpack(const ESL_HUFFMAN *hc, uint32_t *vp, const uint32_t *X, int n, int *ip, int *ap, char *ret_x, int *ret_L); /***************************************************************** * 1. The ESL_HUFFMAN object *****************************************************************/ /* Function: esl_huffman_Build() * Synopsis: Build a new Huffman code. * Incept: SRE, Thu Nov 12 11:08:09 2015 * * Purpose: Build a canonical Huffman code for observed symbol * frequencies for possible symbols. * Frequencies can be counts, or normalized probabilities; * all that matters is their relative magnitude (and that * they're $\geq 0$). * * If you're encoding an Easel digital alphabet, you want * Kp>, inclusive of ambiguity codes, gaps, * missing data, and rare digital codes. * * If you're encoding 7-bit ASCII text, you want K=128, and * the symbols codes are ASCII codes. * * If you're encoding MTF-encoded ASCII text, again you * want K=128 and the "symbol" codes are 0..127 offsets in * the move-to-front encoding. * * If you're encoding an arbitrary symbol table -- a table * of gap lengths, perhaps? -- can be anything. * * Unobserved symbols (with ) will not be encoded; * they get a code length of 0, and a code of 0. * * Args: fq - symbol frequencies 0..K-1; sum to 1 * K - size of fq (encoded alphabet size) * ret_hc - RETURN: new huffman code object * * Returns: on success, and <*ret_hc> points to the new * object. * * Throws: on allocation error. * * if the encoding requires a code length * that exceeds , and won't fit in * a . */ int esl_huffman_Build(const float *fq, int K, ESL_HUFFMAN **ret_hc) { ESL_HUFFMAN *hc = NULL; struct hufftree_s *htree = NULL; // only need tree temporarily, during code construction. int i,r; int status; ESL_DASSERT1(( fq )); ESL_DASSERT1(( K > 0 )); ESL_ALLOC(hc, sizeof(ESL_HUFFMAN)); hc->len = NULL; hc->code = NULL; hc->sorted_at = NULL; hc->dt_len = NULL; hc->dt_lcode = NULL; hc->dt_rank = NULL; hc->K = K; hc->Ku = 0; hc->D = 0; hc->Lmax = 0; ESL_ALLOC(hc->len, sizeof(int) * hc->K); ESL_ALLOC(hc->code, sizeof(uint32_t) * hc->K); ESL_ALLOC(hc->sorted_at, sizeof(int) * hc->K); for (i = 0; i < hc->K; i++) hc->len[i] = 0; for (i = 0; i < hc->K; i++) hc->code[i] = 0; /* Sort the symbol frequencies, largest to smallest */ esl_quicksort(fq, hc->K, sort_floats_decreasing, hc->sorted_at); /* Figure out how many are nonzero: that's hc->Ku */ for (r = hc->K-1; r >= 0; r--) if (fq[hc->sorted_at[r]] > 0.) break; hc->Ku = r+1; ESL_ALLOC(htree, sizeof(struct hufftree_s) * (ESL_MAX(1, hc->Ku-1))); // Ku=1 is ok; avoid zero malloc. if ( (status = huffman_tree (hc, htree, fq)) != eslOK) goto ERROR; if ( (status = huffman_codelengths(hc, htree, fq)) != eslOK) goto ERROR; // can fail eslERANGE on maxlen > 32 if ( (status = huffman_canonize (hc)) != eslOK) goto ERROR; ESL_ALLOC(hc->dt_len, sizeof(int) * hc->D); ESL_ALLOC(hc->dt_lcode, sizeof(uint32_t) * hc->D); ESL_ALLOC(hc->dt_rank, sizeof(int) * hc->D); if ( (status = huffman_decoding_table(hc)) != eslOK) goto ERROR; free(htree); *ret_hc = hc; return eslOK; ERROR: free(htree); esl_huffman_Destroy(hc); *ret_hc = NULL; return status; } /* Function: esl_huffman_Destroy() * Synopsis: Free an code. * Incept: SRE, Thu Nov 12 11:07:39 2015 */ void esl_huffman_Destroy(ESL_HUFFMAN *hc) { if (hc) { free(hc->len); free(hc->code); free(hc->sorted_at); free(hc->dt_len); free(hc->dt_lcode); free(hc->dt_rank); free(hc); } } /***************************************************************** * 2. Encoding *****************************************************************/ /* Function: esl_huffman_Encode() * Synopsis: Encode a string. * Incept: SRE, Thu Jun 2 09:27:43 2016 [Hamilton] * * Purpose: Use Huffman code to encode the plaintext input of * length . The encoded result consists of bits, * stored in an array of 's; this result is * returned through the pointers <*ret_X>, <*ret_nX>, and * <*ret_nb>. * * The encoded array is allocated here, and must be * free'd by the caller. * * Args: hc - Huffman code to use for encoding * T - plaintext input to encode, [0..n-1]; does not need to be NUL-terminated. * n - length of T * ret_X - RETURN: encoded bit array * ret_nb - RETURN: length of X in bits (nX = nb / 32, rounded up) * * Returns: on success. * * Throws: on allocation failure. Now <*ret_X = NULL> and <*ret_nb = 0>. */ int esl_huffman_Encode(const ESL_HUFFMAN *hc, const char *T, int n, uint32_t **ret_X, int *ret_nb) { uint32_t *X = NULL; int xalloc = ESL_MAX(16, (n+15)/16); // current allocation for X, in uint32_t's int pos = 0; // current position in X's uint32_t array int nb; int i; int status; ESL_DASSERT1(( hc != NULL )); ESL_DASSERT1(( T != NULL )); ESL_DASSERT1(( n > 0 )); ESL_ALLOC(X, sizeof(uint32_t) * xalloc); X[0] = 0; nb = 0; for (i = 0; i < n; i++) { huffman_pack(X, &pos, &nb, hc->code[(int) T[i]], hc->len[(int) T[i]]); if (pos+1 == xalloc) { xalloc *= 2; ESL_REALLOC(X, sizeof(uint32_t) * xalloc); } } *ret_X = X; // X consists of uint32_t's *ret_nb = 32*pos + nb; // ... we return exact # of bits. return eslOK; ERROR: *ret_X = NULL; *ret_nb = 0; return status; } /***************************************************************** * 3. Decoding *****************************************************************/ /* Function: esl_huffman_Decode() * Synopsis: Decode a bit stream. * Incept: SRE, Thu Jun 2 09:52:46 2016 [Hamilton, Act I] * * Purpose: Use Huffman code to decode a bit stream of length * integers and bits. The result is a plaintext * string of length characters. Return this result * through <*ret_T> and <*ret_nT>. * * The decoded plaintext is allocated here, and must be * free'd by the caller. * * is NUL-terminated, just in case that's useful -- * though the caller isn't necessarily going to treat * as a string. (It could be using "symbols" 0..127, which * would include <\0> as a valid symbol.) * * Args: hc - Huffman code to use to decode * X - bit stream to decode * nb - length of in BITS (nX = nb/32, rounded up) * ret_T - RETURN: decoded plaintext string, \0-terminated * ret_n - RETURN: length of in chars * * Returns: on success; <*ret_T> and <*ret_nT> hold the result. * * Throws: on allocation failure. Now <*ret_T> is and * <*ret_nT> is 0. * * Xref: */ int esl_huffman_Decode(const ESL_HUFFMAN *hc, const uint32_t *X, int nb, char **ret_T, int *ret_n) { char *T = NULL; int allocT; // current allocation for T uint32_t v = X[0]; // current (full) 32 bits we're going to decode in this step int i = 1; // index of X[i] we will first pull *new* bits from, after decoding v int nX = (nb+31)/32; // length of X in uint32_t's: nb/32 rounded up. int a = (nX > 1 ? 32 : 0); int pos = 0; int L; // length of code we just decoded, in bits int status; allocT = nX * 4; // an initial guess: 4 bytes per X, maybe 4x compression ESL_ALLOC(T, sizeof(char) * allocT); while (nb > 0) { huffman_unpack(hc, &v, X, nX, &i, &a, &(T[pos]), &L); nb -= L; if (++pos == allocT) { allocT *= 2; ESL_REALLOC(T, sizeof(char) * allocT); } } /* We know we have space for the \0, from how we reallocated. */ T[pos] = '\0'; *ret_T = T; *ret_n = pos; return eslOK; ERROR: *ret_T = NULL; *ret_n = 0; return status; } /***************************************************************** * 4. Debugging, development *****************************************************************/ /* Function: esl_huffman_Dump() * Synopsis: Dump info on a huffman code structure. * Incept: SRE, Sat Jun 4 07:38:15 2016 * * Purpose: Dump the internals of object to output stream . */ int esl_huffman_Dump(FILE *fp, ESL_HUFFMAN *hc) { int r,x; int d,L; /* Encoding table:

  */
  fprintf(fp, "Encoding table:\n");
  for (r = 0; r < hc->Ku; r++)
    {
      x = hc->sorted_at[r];
      fprintf(fp, "%3d %2d ", x, hc->len[x]);
      dump_uint32(fp, hc->code[x], hc->len[x]);
      fprintf(fp, "\n");
    }
  fputc('\n', fp);


  /* Decoding table (if set) */
  if (hc->dt_len)
    {
      fprintf(fp, "Decoding table:\n");
      for (d = 0; d < hc->D; d++)
        {
          L = hc->dt_len[d];
          fprintf(fp, "L=%2d  r=%3d (%3d) ", L, hc->dt_rank[d], hc->sorted_at[hc->dt_rank[d]]);
          dump_uint32(fp, hc->dt_lcode[d], eslHUFFMAN_MAXCODE);
          fputc('\n', fp);
        }
    }

  return eslOK;
}



/*****************************************************************
 * 5. Internal functions and structures
 *****************************************************************/

/* sort_floats_decreasing()
 * Sorting function for esl_quicksort(), putting 
 * symbol frequencies in decreasing order.
 */
static int
sort_floats_decreasing(const void *data, int e1, int e2)
{
  float *fq = (float *) data;
  if (fq[e1] > fq[e2]) return -1;
  if (fq[e1] < fq[e2]) return 1;
  return 0;
}

/* sort_canonical()
 * Sorting function for esl_quicksort(), putting symbols into
 * canonical Huffman order: primarily by ascending code length,
 * secondarily by ascending symbol code.
 */
static int
sort_canonical(const void *data, int e1, int e2)
{
  ESL_HUFFMAN *hc = (ESL_HUFFMAN *) data;
  int          L1 = hc->len[e1];
  int          L2 = hc->len[e2];
  
  if      (L2 == 0) return -1;   // len=0 means symbol isn't encoded at all, doesn't occur
  else if (L1 == 0) return 1;
  else if (L1 < L2) return -1;
  else if (L1 > L2) return 1;
  else if (e1 < e2) return -1;
  else if (e1 > e2) return 1;
  else              return 0;
}

/* Build the Huffman tree, joining nodes/leaves of smallest frequency.
 * This takes advantage of having the fq[] array sorted, and the fact
 * that the internal node values also come out sorted... i.e. we don't
 * have to re-sort, we can always find the smallest leaves/nodes by 
 * looking at the last ones.
 *
 * For the Ku=1 edge case, there's no tree, and this no-ops. 
 * 
 * Input: 
 *   hc->sorted_at[] lists symbol indices from largest to smallest freq.
 *   hc->Ku          is the number of syms w/ nonzero freq; tree has Ku-1 nodes
 *   htree           blank, allocated for at least Ku-1 nodes
 *   
 * Output:
 *   htree's left, right, val fields are filled.
 *   
 * Returns:
 *    on success.  
 */
static int
huffman_tree(ESL_HUFFMAN *hc, struct hufftree_s *htree, const float *fq)
{
  int r = hc->Ku-1;   // r = smallest leaf symbol that hasn't been included in tree yet; r+1 = # of leaves left
  int k = hc->Ku-2;   // k = smallest internal node not used as a child yet; k-j = # nodes not used as child yet
  int j;

  for (j = hc->Ku-2; j >= 0; j--)  // j = index of next node we add; we add one per iteration
    {       
      /* Should we join two leaves?
       *   If we have no internal nodes yet (because we're just starting),
       *   or the two smallest frequencies are <= the smallest unjoined node's value
       */
      if ( (j == hc->Ku-2) ||  (r >= 1 && fq[hc->sorted_at[r]] <= htree[k].val))
	{
	  htree[j].right = -hc->sorted_at[r];    // leaves are signified by negative indices in tree
	  htree[j].left  = -hc->sorted_at[r-1];
	  htree[j].val   = fq[hc->sorted_at[r]] + fq[hc->sorted_at[r-1]];
	  r -= 2;
	}

      /* Or should we join two nodes?
       *  If we have no leaves left, 
       *  or (we do have two nodes) and both are smaller than smallest unjoined leaf's value
       */
       else if (r == -1  || (k-j >= 2 && htree[k-1].val < fq[hc->sorted_at[r]]))
	 {
	   htree[j].right = k;
	   htree[j].left  = k-1;
	   htree[j].val   = htree[k].val + htree[k-1].val;
	   k -= 2;
	 }
      
      /* Otherwise, we join smallest node and smallest leaf. */
       else 
	 {
	   htree[j].right = -hc->sorted_at[r];
	   htree[j].left  = k;
	   htree[j].val   = fq[hc->sorted_at[r]] + htree[k].val;
	   r--;
	   k--;
	 }
    }
  return eslOK;
}


/* Calculate code lengths, equal to the depth of each node. 
 * Traverse the tree, calculating depth of each node, starting with
 * depth 0 for root 0. We don't need a stack for this traversal,
 * tree is already indexed in traversal order.
 * 
 * For the Ku=1 edge case, there's no tree; for a single encoded
 * symbol we set hc->len[0] = 1, hc->Lmax = 1
 * 
 * Input:
 *   hc->Ku          is the number of syms w/ nonzero freqs; tree has Ku-1 nodes.
 *   htree[0..Ku-2]  is the constructed Huffman tree, with right/left/val set.
 *   htree[].len     has been initialized to 0 for all symbols 0..K
 *
 * Output:
 *   htree's depth field is set.
 *   hc->len is set for all encoded symbols (left at 0 for unused symbols)
 *   hc->Lmax is set
 *   
 * Return: 
 *    on success
 *    if max code length > eslHUFFMAN_MAXCODE and won't fit in uint32_t   
 */
static int
huffman_codelengths(ESL_HUFFMAN *hc, struct hufftree_s *htree, const float *fq)
{
  int i;

  if (hc->Ku == 1)
    {
      hc->len[ hc->sorted_at[0] ] = 1;
      hc->Lmax   = 1;
      return eslOK;
    }

  htree[0].depth = 0;
  for (i = 0; i < hc->Ku-1; i++)
    {
      if (htree[i].right <= 0) hc->len[-htree[i].right]    = htree[i].depth + 1;
      else                     htree[htree[i].right].depth = htree[i].depth + 1;
      
      if (htree[i].left <= 0)  hc->len[-htree[i].left]     = htree[i].depth + 1;
      else                     htree[htree[i].left].depth  = htree[i].depth + 1;
    }

  hc->Lmax = 0;
  for (i = 0; i < hc->K; i++)
    hc->Lmax = ESL_MAX(hc->len[i], hc->Lmax);

  return (hc->Lmax > eslHUFFMAN_MAXCODE ? eslERANGE : eslOK);
}


/* huffman_canonize()
 * Given code lengths, now we calculate the canonical Huffman encoding.
 * 
 * Input:
 *   hc->len[]  code lengths are set for all K (0 for unused symbols)
 *   hc->code[] have been initialized to 0 for all K
 *   
 * Output:  
 *   hc->code[] have been set for all used symbols.
 *   hc->D      number of different code lengths is set
 *   
 * Returns:
 *   on success.  
 */
static int
huffman_canonize(ESL_HUFFMAN *hc)
{
  int r;

  /* Sort symbols according to 1) code length; 2) order in digital alphabet (i.e. symbol code itself)
   * Reuse/reset .
   * You can't just sort the encoded Ku; you have to sort all K, because
   * quicksort expects indices to be contiguous (0..K-1).
   */
  esl_quicksort(hc, hc->K, sort_canonical, hc->sorted_at);

  /* Assign codes. (All K have been initialized to zero already.) */
  for (r = 1; r < hc->Ku; r++)
    hc->code[hc->sorted_at[r]] =
      (hc->code[hc->sorted_at[r-1]] + 1) << (hc->len[hc->sorted_at[r]] - hc->len[hc->sorted_at[r-1]]);

  /* Set D, the number of different code lengths */
  hc->D = 1;
  for (r = 1; r < hc->Ku; r++)
    if (hc->len[hc->sorted_at[r]] > hc->len[hc->sorted_at[r-1]]) hc->D++;

  return eslOK;
}


/* huffman_decoding_table()
 * Given a canonical Huffman code; build the table that lets us
 * efficiently decode it.
 * 
 * Input:
 *   hc->K         is set: total # of symbols (inclusive of unused ones)
 *   hc->Ku        is set: total # of encoded/used symbols
 *   hc->code      is set: canonical Huffman codes for symbols 0..K-1
 *   hc->len       is set: code lengths for symbols 0..K-1
 *   hc->sorted_at is set: canonical Huffman sort order 
 *   hc->Lmax      is set: maximum code length
 *   hc->D         is set: # of different code lengths
 *
 *   hc->dt_len    is allocated for hc->D, but otherwise uninitialized
 *   hc->dt_lcode  is allocated for hc->D, but otherwise uninitialized
 *   hc->dt_rank   is allocated for hc->D, but otherwise uninitialized
 *   
 * Output:  
 *   hc->dt_len    is set: lengths of each used code length 0..D-1
 *   hc->dt_lcode  is set: left-flushed first code for each code length [d]
 *   hc->dt_rank   is set: rank r for 1st code for each used code length [d]
 */
static int 
huffman_decoding_table(ESL_HUFFMAN *hc)
{
  int r;
  int D = 0;

  hc->dt_len[0]   = hc->len[hc->sorted_at[0]];
  hc->dt_lcode[0] = hc->code[hc->sorted_at[0]] << (eslHUFFMAN_MAXCODE - hc->len[hc->sorted_at[0]]);
  hc->dt_rank[0]  = 0;
  for (r = 1; r < hc->Ku; r++)
    if (hc->len[hc->sorted_at[r]] > hc->len[hc->sorted_at[r-1]]) 
      {
	D++;
	hc->dt_len[D]   = hc->len[hc->sorted_at[r]];
	hc->dt_lcode[D] = hc->code[hc->sorted_at[r]] << (eslHUFFMAN_MAXCODE - hc->len[hc->sorted_at[r]]);
	hc->dt_rank[D]  = r;
      }
  ESL_DASSERT1(( hc->D == D+1 ));
  return eslOK;
}


static void
dump_uint32(FILE *fp, uint32_t v, int L)
{
  uint32_t mask;
  int      i;
  
  for (mask = 1 << (L-1), i = L; i >= 1; i--, mask = mask >> 1)
    putc( ((v & mask) ? '1' : '0'), fp);
}



/* huffman_pack()
 *
 *  is the current uint32_t unit in the encoded buffer . It
 * has  bits in it so far, maximally left-shifted; therefore (32-a)
 * bits are available.
 *
 *  is the next Huffman code to pack into the buffer, of length
 * , and it's right flush.
 *
 *     a=10 used      (32-a)=20 free
 *    |xxxxxxxxxx|......................| X[i]
 *    |........................|yyyyyyyy| code, L=8
 *               |----- w -----|
 *                w = 32-(a+L)
 * 
 * If L < 32-a, then we just shift by w and pack it into X[i]. Else,
 * we shift the other way (by -w), pack what we can into X[i], and
 * leave the remainder in X[i+1].
 * 
 * We update  and  for  accordingly... so we pass them by
 * reference in  and .
 */
static void
huffman_pack(uint32_t *X, int *ip, int *ap, uint32_t code, int L)
{
  int w = 32 - (*ap+L);
  
  if (w > 0)      // code can pack into X[i]'s available space.
    {
      X[*ip] = X[*ip] | (code << w);
      *ap += L;
    }
  else if (w < 0) // code packs partly in X[i], remainder in X[i+1].
    {
      X[*ip] = X[*ip] | (code >> (-w));
      (*ip)++;
      X[*ip] = code << (32+w);
      (*ap) = -w;
    }
  else           // code packs exactly; w=0, no leftshift needed, OR it as is.
    {
      X[*ip] = X[*ip] | code;
      *ip += 1; 
      *ap =  0;  
      X[*ip] = 0;  // don't forget to initialize X[i+1]!
    }
}



/* huffman_unpack()
 * *vp  : ptr to v; v = next 32 bits
 * *X   : encoded input
 *  n   : length of input (in uint32_t)
 * *ip  : current position in 
 * *ap  : number of bits left in X[*ip]
 * 
 * If we have to buffer X (say, if we're reading it from
 * a long input) we'll have to redesign. Right now we assume
 * it's just an array.
 */
static void
huffman_unpack(const ESL_HUFFMAN *hc, uint32_t *vp, const uint32_t *X, int n, int *ip, int *ap, char *ret_x, int *ret_L)
{
  int      L,D;
  int      idx;
  uint32_t w;

  for (D = 0; D < hc->D-1; D++)
    if ((*vp) < hc->dt_lcode[D+1]) break;
  L = hc->dt_len[D];
  /* L is now the next code's length (prefix of v) */

  /* Decode, by taking advantage of lexicographic sort/numerical order of canonical code, within each L */
  idx     = hc->dt_rank[D] +  ( ((*vp) - hc->dt_lcode[D]) >> (eslHUFFMAN_MAXCODE-L) );
  
  /* Now refill v, as much as we can, from bits in X[i] and X[i+1], and update i, a */
  *vp  = ( (*vp) << L);                // Remove L bits from *vp by leftshifting it.

  if (*ip < n) {                      // Take either L or all *ap bits from X[i], if it exists.
    w    = X[*ip] << (32-(*ap));      // Shift off the bits we already used in X[i]. w is now X[i], left-flushed.
    *vp |= (w >> (32-L));             // Right-shift w into position, leaving it with leading 0's where *vp already has bits.
    *ap -= L;                         // We used up to L bits from X[i]

    // if *ap is still >0, we have bits left to use in X[i]. Otherwise:
    if (*ap == 0)                       // If we exactly finished off X[i]:
      {
	(*ip)++;                        // then advance in X[].
	*ap = 32;
      }
    else if (*ap < 0)                   // If we finished off X[i] but still need some bits
      { 
	(*ip)++;                        //   then go on to X[i+1] and 32 fresh bits.
	if (*ip < n)                    //   If it exists...
	  {                             //       (...no, I don't like all these branches either...)
	    *ap += 32;                  //     then we're going to leave it w/ <*ap> bits
	    *vp |= (X[*ip] >> *ap);     //     after taking the bits we need to fill v
	  }
	else 
	  {
	    *ap = 0;                      //   If X[i+1] doesn't exist, leave *ip = n and *ap = 0; out of data in X (though not necessarily in v)
	  }
      }
  }

  *ret_x  = (char) hc->sorted_at[idx];
  *ret_L  = L;
}


/*****************************************************************
 * 6. Unit tests
 *****************************************************************/
#ifdef eslHUFFMAN_TESTDRIVE
#include "esl_random.h"
#include "esl_randomseq.h"
#include "esl_vectorops.h"

#include 


static void
utest_kryptos(ESL_RANDOMNESS *rng)
{
  char         msg[]   = "kryptos utest failed";
  ESL_HUFFMAN *hc      = NULL;
  char         T[]     = "BETWEEN SUBTLE SHADING AND THE ABSENCE OF LIGHT LIES THE NUANCE OF IQLUSION";
  int          n       = strlen(T);
  uint32_t    *X       = NULL;
  int          nb;
  char        *T2      = NULL;
  int          n2;
  float        fq[128];
  int          K       = 128;
  int          i;
  int          status;

  /* Any half-assed frequency distribution will do for this, over [ A-Z] */
  for (i = 0;   i <  128; i++) fq[i] = 0.;
  for (i = 'A'; i <= 'Z'; i++) fq[i] = esl_random(rng);
  fq[' '] = esl_random(rng);
  esl_vec_FNorm(fq, 128);

  if (( status = esl_huffman_Build (fq, K, &hc) )         != eslOK) esl_fatal(msg);
  if (( status = esl_huffman_Encode(hc, T, n, &X, &nb))   != eslOK) esl_fatal(msg);
  if (( status = esl_huffman_Decode(hc, X, nb, &T2, &n2)) != eslOK) esl_fatal(msg);

  //esl_huffman_Dump(stdout, hc);
  //printf("%s\n", T);
  //printf("%s\n", T2);

  if (n2 != n)            esl_fatal(msg);
  if (strcmp(T, T2) != 0) esl_fatal(msg);

  free(X);
  free(T2);
  esl_huffman_Destroy(hc);
}
  
/* utest_uniletter()
 * Tests an edge case of a text consisting of a single letter, Ku=1.
 * (Ku=1 cases get tested occasionally by utest_backandforth() too.)
 */
static void
utest_uniletter(void)
{
  char   msg[]    = "uniletter utest failed";
  char   T[]      = "AAAAAAAAAA";
  int    n        = strlen(T);
  int    K        = 128;
  float  fq[128];
  ESL_HUFFMAN *hc = NULL;
  uint32_t    *X  = NULL;
  int          nb;
  char        *T2 = NULL;
  int          n2;
  int          i;
  int          status;

  for (i = 0; i < 128; i++) fq[i] = 0.;
  fq['A'] = (float) n;

  if (( status = esl_huffman_Build (fq, K, &hc) )         != eslOK) esl_fatal(msg);
  if (( status = esl_huffman_Encode(hc, T, n, &X, &nb))   != eslOK) esl_fatal(msg);
  if (( status = esl_huffman_Decode(hc, X, nb, &T2, &n2)) != eslOK) esl_fatal(msg);
  
  if (n2 != n)            esl_fatal(msg);
  if (strcmp(T, T2) != 0) esl_fatal(msg);

  free(X);
  free(T2);
  esl_huffman_Destroy(hc);
}


/* utest_backandforth()
 * Encode and decode a random text string, and test 
 * that it decodes to the original.
 */
static void
utest_backandforth(ESL_RANDOMNESS *rng)
{
  char         msg[] = "back and forth utest failed";
  ESL_HUFFMAN *hc    = NULL;
  double      *fq0   = NULL;
  float       *fq    = NULL;
  int          K;             // alphabet size: randomly chosen from 1..128
  char        *T     = NULL;  // random plaintext
  int          n;             // randomly chosen length of plaintext T
  uint32_t    *X     = NULL;  // Huffman-coded bit stream
  int          nb;	      // length of X in bits
  char        *T2    = NULL;  // decoded plaintext
  int          n2;            // length of T2 in chars
  int          i;          
  int          status;

  /* Sample a zero-peppered frequency distribution  for a randomly
   * selected alphabet size .
   */
  K  = 1 + esl_rnd_Roll(rng, 128);                  // Choose a random alphabet size from 1 to 128
  if (( fq0 = malloc(sizeof(double) * K)) == NULL) esl_fatal(msg);  // esl_random works in doubles
  if (( fq  = malloc(sizeof(float)  * K)) == NULL) esl_fatal(msg);  // esl_huffman works in floats
  esl_rnd_Dirichlet(rng, NULL, K, fq0);             // Sample a uniform random probability vector
  for (i = 0; i < K; i++)                           // Pepper it with exact 0's while converting to float
    fq[i] =  ( esl_rnd_Roll(rng, 4) == 0 ? 0. : (float) fq0[i] );
  esl_vec_FNorm(fq, K);                             // and renormalize. (edge case: if fq was all 0, now it's uniform.)

  /* Sample a random plaintext array , of randomly selected length .
   * We're using codes 0..K-1 -- T is not a string, it's an array -- don't \0 it.
   */
  n = 1 + esl_rnd_Roll(rng, 10);
  if (( T = malloc(sizeof(char) * (n+1))) == NULL) esl_fatal(msg);
  for (i = 0; i < n; i++) T[i] = esl_rnd_FChoose(rng,fq,K);

  if (( status = esl_huffman_Build (fq, K, &hc) )         != eslOK) esl_fatal(msg);
  if (( status = esl_huffman_Encode(hc, T, n, &X, &nb))   != eslOK) esl_fatal(msg);
  if (( status = esl_huffman_Decode(hc, X, nb, &T2, &n2)) != eslOK) esl_fatal(msg);

  //esl_huffman_Dump(stdout, hc);
  
  if ( n2 != n)               esl_fatal(msg);
  if ( memcmp(T, T2, n) != 0) esl_fatal(msg);

  free(T2);
  free(X);
  free(fq0);
  free(fq);
  free(T);
  esl_huffman_Destroy(hc);
}




#endif /*eslHUFFMAN_TESTDRIVE*/




/*****************************************************************
 * 7. Test driver
 *****************************************************************/
#ifdef eslHUFFMAN_TESTDRIVE
#include "esl_config.h"

#include 

#include "easel.h"
#include "esl_getopts.h"
#include "esl_huffman.h"
#include "esl_random.h"

static ESL_OPTIONS options[] = {
   /* name  type         default  env   range togs  reqs  incomp  help                docgrp */
  {"-h",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage",             0},
  {"-s",  eslARG_INT,       "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to ",   0},
  { 0,0,0,0,0,0,0,0,0,0},
};
static char usage[]  = "[-options]";
static char banner[] = "test driver for huffman module";

int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go   = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
  ESL_RANDOMNESS *rng  = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));

  fprintf(stderr, "## %s\n", argv[0]);
  fprintf(stderr, "#  rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));

  utest_kryptos     (rng);
  utest_uniletter   (   );
  utest_backandforth(rng);
  
  fprintf(stderr, "#  status = ok\n");
  
  esl_getopts_Destroy(go);
  esl_randomness_Destroy(rng);
  return eslOK;
}
#endif /*eslHUFFMAN_TESTDRIVE*/




/*****************************************************************
 * 8. Examples
 *****************************************************************/
#ifdef eslHUFFMAN_EXAMPLE2

/* esl_huffman_example2 
 *
 * The input  consists of N lines with 
 * two whitespace-delimited fields:
 *      
 * 
 * Huffman code the frequency distribution, and output the resulting
 * encoding.
 */


#include "easel.h"
#include "esl_buffer.h"
#include "esl_huffman.h"
#include "esl_mem.h"
#include "esl_vectorops.h"

int 
main(int argc, char **argv)
{
  ESL_HUFFMAN *hc     = NULL;
  ESL_BUFFER  *bf     = NULL;
  esl_pos_t    n;
  char        *p;
  char        *tok;
  esl_pos_t    toklen;
  int          kalloc = 16;
  char       **label  = malloc(sizeof(char *) * kalloc);
  float       *fq     = malloc(sizeof(float)  * kalloc);
  int          K      = 0;
  float        meanL  = 0.;
  int          i;
  int          status;

  status = esl_buffer_Open(argv[1], NULL, &bf);
  if      (status == eslENOTFOUND) esl_fatal("open failed: %s",   bf->errmsg);
  else if (status == eslFAIL)      esl_fatal("gzip -dc failed: %s", bf->errmsg);
  else if (status != eslOK)        esl_fatal("open failed with error code %d", status);
  
  while (( status = esl_buffer_GetLine(bf, &p, &n)) == eslOK) 
    {
      if ( esl_memtok(&p, &n, " \t\n", &tok, &toklen) != eslOK) continue;
      if ( esl_memstrdup(tok, toklen, &(label[K]))    != eslOK) continue;
      if ( esl_mem_strtof(p, n, NULL, &(fq[K]))       != eslOK) continue;

      if (++K == kalloc) {
	kalloc *= 2; 
	label = realloc(label, sizeof(char *) * kalloc);
	fq    = realloc(fq,    sizeof(float)  * kalloc); 
      }
    }
  esl_vec_FNorm(fq, K);
  
  if (( status = esl_huffman_Build(fq, K, &hc)) != eslOK) esl_fatal("failed to build huffman code");
  
  for (i = 0; i < K; i++)
    {
      printf("%-10s %2d ", label[i], hc->len[i]);
      dump_uint32(stdout, hc->code[i], hc->len[i]);
      printf("\n");
    }

  for (i = 0; i < K; i++)
    meanL += (float) hc->len[i] * fq[i];
  printf("\nMean code length = %.2f bits\n", meanL);

  for (i = 0; i < K; i++) free(label[i]);
  free(label);
  free(fq);
  esl_huffman_Destroy(hc);
  esl_buffer_Close(bf);
  return 0;
}
#endif /*eslHUFFMAN_EXAMPLE2*/




#ifdef eslHUFFMAN_EXAMPLE

#include "easel.h"
#include "esl_huffman.h"

#include 
#include 

/* Given an open  for reading;
 * input text from it and return it as a single string.
 * Optionally return the number of characters in .
 * Convert all \n to spaces.
 */
static char *
read_text(FILE *fp, int *opt_n)
{
  int   maxlinelen = 4096;
  char *text       = malloc(sizeof(char) * maxlinelen);
  int   n          = 0;
  char *p;

  while (fgets(text+n, maxlinelen-1, fp) != NULL)
    {
      for (p = text+n; *p != '\0'; p++) 
	if (*p == '\n') *p = ' ';
      n   += strlen(text+n);
      text = realloc(text, sizeof(char) * (n+maxlinelen));
    }

  if (opt_n) *opt_n = n;
  return text;
}

int
main(int argc, char **argv)
{
  FILE     *fp = fopen(argv[1], "r");
  int       n;
  char     *T  = read_text(fp, &n);
  uint32_t *X  = NULL;
  float     fq[128];
  int       c,i;
  int       nb;
  ESL_HUFFMAN *hc   = NULL;
  char        *newT = NULL;
  int          nT;

  /* You provide a frequency table for your digital alphabet 0..K-1.
   * It's fine for there to be 0-frequency characters, even many of them;
   * they will not be encoded, and cost nothing. 
   * Here, our digital alphabet is 7-bit ASCII text, 0..127, K=128.
   */
  for (c = 0; c < 128; c++) fq[c]           = 0.;
  for (i = 0; i < n;   i++) fq[(int) T[i]] += 1.;

  /* There does have to be at least one character to encode, of course. */
  ESL_DASSERT1(( n > 0 ));
  
  esl_huffman_Build(fq, 128, &hc);  

  esl_huffman_Dump(stdout, hc);

  esl_huffman_Encode(hc, T, n, &X, &nb);

  printf("\nOriginal:   %d bytes\n", n);
  printf("Compressed: %d bytes (%d bits)\n", 4*(nb+31)/32, nb);

  /* Dump the compresstext, up to 30 words of it */
  printf("\nCompressed text:\n");
  for (i = 0; i < ESL_MIN(30, (nb+31)/32); i++) {
    dump_uint32(stdout, X[i], 32);
    fputc('\n', stdout);
  }

  esl_huffman_Decode(hc, X, nb, &newT, &nT);

  /* Show the decoded plaintext, up to 100 chars of it */
  printf("\nDecoded text:\n");
  for (i = 0; i < ESL_MIN(100, nT); i++)
    fputc(newT[i], stdout);
  fputc('\n', stdout);

  esl_huffman_Destroy(hc);
  free(T);
  fclose(fp);
  return 0;
}
#endif /*eslHUFFMAN_EXAMPLE*/