/************************************************************************
*       Copyright                                                       *
*       (1999-2015) The Regents of the University of California.        *
*       All Rights Reserved.                                            *
*       Author: Timothy L. Bailey
*                                                                       *
*       Permission to use, copy, modify, and distribute any part of     *
*       this software for educational, research and non-profit purposes,*
*       without fee, and without a written agreement is hereby granted, *
*       provided that the above copyright notice, this paragraph and    *
*       the following three paragraphs appear in all copies.            *
*                                                                       *
*       Those desiring to incorporate this software into commercial     *
*       products or use for commercial purposes should contact the      *
*       Technology Transfer Office, University of California, San Diego,*
*       9500 Gilman Drive, La Jolla, California, 92093-0910,            *
*       Ph: (619) 534 5815.                                             *
*                                                                       *
*       IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO     *
*       ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR         *
*       CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF   *
*       THE USE OF THIS SOFTWARE, EVEN IF THE UNIVERSITY OF CALIFORNIA  *
*       HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             *
*                                                                       *
*       THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE *
*       UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE          *
*       MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.  *
*       THE UNIVERSITY OF CALIFORNIA MAKES NO REPRESENTATIONS AND       *
*       EXTENDS NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED, *
*       INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        *
*       MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT    *
*       THE USE OF THE MATERIAL WILL NOT INFRINGE ANY PATENT,           *
*       TRADEMARK OR OTHER RIGHTS.                                      *
************************************************************************/

/***********************************************************************/
/*
        Information Content Statistics Routines
*/
/***********************************************************************/

#ifdef MAIN
#define DEFINE_GLOBALS
#endif
#include "llr.h"
#include "macros.h"
#include "general.h"
#include "io.h"
#include <stdio.h>
#include "logs.h"
#include "message.h"

/* probability distribution */ 
typedef struct distr {
  int w;                        /* maximum width of alignment */
  double alpha;                 /* scale factor for distribution */
  int *range;                   /* range of distributions w = [1..w] */
  int *offset;                  /* offset of distributions w = [1..w] */
  double **d;                   /* array of (log) distributions w = [1..w] */
  double **cdf;                 /* array of llr (log) 1 - CDFs w = [1..w] */
  double mean;                  /* mean of llr for w = 1 */
} DISTR;
static int ndistrs = -1;        /* largest N for which distr known */
static DISTR *distrs = NULL;    /* array of distributions for N = [1..ndistrs]*/
static ARRAY_T *dd = NULL;      // alphabet distribution

/************************************************************************/
/*
        init_llr_pv_tables

        Initialize the single-column log likelihood ratio p-value tables.

        Note: Palindromes have effectively double the apparent sites but half 
        the width.

*/
/************************************************************************/
void init_llr_pv_tables(
  int min,                              /* minimum number of sites */
  int max,                              /* maximum number of sites */
  int alength,                          /* alphabet length */
  ARRAY_T *back,                        /* background frequencies */
  BOOLEAN pal                           /* sites are palindromes */
)
{
  int nsites;                           /* number of sites */

  /* set effective number of sites to double if pal */
  if (pal) { min *= 2; max *= 2; }

  if (!NO_STATUS)
    fprintf(stderr,
      "Initializing the motif probability tables for %d to %d sites...\n",
        min, max);

  /* make sure the distr table gets initialized on all nodes */
  (void) get_llr_pv(0, 1, 1, LLR_RANGE, 1.0, alength, back);

  for (nsites=min; nsites<=max; nsites += pal ? 2 : 1) {   /* nsites */

    /* allocate space for table */
    (void) get_llr_pv(0, nsites, 0, LLR_RANGE, 1.0, alength, back);

    if (!load_balance_llr(nsites, pal)) {
      continue; /* for parallel */
    } 

    /* create table */
    if (!NO_STATUS) { fprintf(stderr, "nsites = %d\r", nsites); }
    (void) get_llr_pv(0, nsites, 1, LLR_RANGE, 1.0, alength, back);

  } /* nsites */
  broadcast_llr(min, max, pal);         /* for parallel; collect the tables */

#ifdef DEBUG
  /* print results */
  int n;
  for (n=min; n<=max; n++) {
    int I;
    int w = 1;
    printf("# N    I    llr         1-cdf\n");
    for (I=0; I<=distrs[n].range[w]; I++) {             /* LLR */
      double m2, e2;
      if (distrs[n].cdf[w][I] == LOGZERO) {
        m2 = e2 = 0;
      } else {
        exp10_logx(distrs[n].cdf[w][I]/log(10.0), m2, e2, 1);
      }
      printf("%3d %3d %5.1f %3.1fe%+05.0f\n",
        n, I, (distrs[n].offset[w]+I)/distrs[n].alpha, m2, e2);
    } /* LLR */
  }
#endif

  if (!NO_STATUS)fprintf(stderr, "\nDone initializing.\n");

} /* init_llr_pv_tables */

/******************************************************************************/
/*
        llr_distr

        Compute the probability distribution of the log-likelihood
        ratio (llr) statistic for a discrete distribution.
        Because of roundoff, LLR may be as small as -1 (instead of 0).

        Returns an array p where
                p[i] = log(Pr(LLR=i))

*/
/******************************************************************************/
double *llr_distr(
  int A,                                /* dimension of discrete distribution */
  ARRAY_T *dd,                          /* discrete distribution */
  int N,                                /* number of samples */
  int desired_range,                    /* desired range for scaled LLR */
  double frac,                          /* fraction of scores to use */
  double *alpha,                        /* scale factor for scaled LLR */
  int *offset,                          /* prob[0] = prob(offset) */
  int *range                            /* range for scaled LLR */
)
{
  int i;                                /* index over alphabet */
  int n;                                /* index over samples */
  int k;                                /* other index */
  int I;                                /* LLR */
  int **IP;                             /* I'_i[n] */
  int *minI=NULL;                       /* minimum intermediate value of I */
  int *maxI=NULL;                       /* maximum intermediate value of I */
  int Irange;                           /* maxI-minI+1 */
  double logNfact;                      /* log N! */
  double **logP;                        /* log P_i[n] */
  double **logSP;                       /* log script_P[# samples][LLR] */
  double *prob=NULL;                    /* final probability distribution */
  double min, max, min_dd;

  /* create space for IP, P, minI and maxI */
  create_2array(IP, int, A, N+1);
  create_2array(logP, double, A, N+1);
  Resize(minI, N+1, int);
  Resize(maxI, N+1, int);

  /* compute N! */
  logNfact = 0;
  for (i=2; i<=N; i++) logNfact += log(i); 

  /* get estimates of minimum and maximum values of llr */
  for (i=0, min_dd=1; i<A; i++) min_dd = MIN(min_dd, get_array_item(i, dd));
  //for (i=min=0; i<A; i++) min += dd[i]*N*(log(dd[i]) - log(dd[i]));
  // min = NINT(min);
  // Not sure what was intended with above line-- should always yield 0 though
  min = 0;
  max = NINT(-N * log(min_dd));
  //printf("min = %f max = %f\n", min, max);

  /* set alpha to achieve the desired range */
  *alpha = desired_range/((max-min));
  /* *alpha = NINT(((int)desired_range)/((max-min)));
  if (*alpha < 1) *alpha = 1;*/
  /*fprintf(stderr, "range %d max %f min %f alpha = %f\n",desired_range, max, min, *alpha);*/

  /* compute I', P, minI and maxI */ 
  for (n=0; n<=N; n++) minI[n] = maxI[n] = 0;
  for (i=0; i<A; i++) {                         /* index over alphabet */
    double logdd = LOG(get_array_item(i, dd));                  /* log(dd[i]) */
    IP[i][0] = 0;                               /* I'_i(0) */
    logP[i][0] = 0;                             /* log P_i(0) */
    for (n=1; n<=N; n++) {                      /* index over samples */
      IP[i][n] = NINT(*alpha*n*log(n/(N*get_array_item(i, dd))));       /* I'_i(n) */
      logP[i][n] = logP[i][n-1] + logdd - log(n);       /* log P_i(n) */
      for (k=1; k<=n; k++) {                    /* index over samples of new */
        minI[n] = MIN(minI[n], minI[n-k] + IP[i][k]);
        maxI[n] = MAX(maxI[n], maxI[n-k] + IP[i][k]);
      }
    }
  }

  /* get overall minI and maxI */
  for (n=1; n<=N; n++) {
    /*printf("minI[%d] %d maxI[%d] %d\n", n, minI[n], n, maxI[n]);*/
    minI[0] = MIN(minI[0], minI[n]);            /* min for intermediates */
    maxI[0] = MAX(maxI[0], maxI[n]);            /* max for intermediates */
    minI[n] = LOGZEROI;
    maxI[n] = 0;
  }
  Irange = maxI[0] - minI[0] + 2;
  *offset = minI[0] - 1;                        /* I offset: I=-1 is array 0 */
  /*printf("minI %d maxI %d Irange %d\n", minI[0], maxI[0], Irange);*/
  minI[0] = LOGZEROI;
  maxI[0] = 0;

  /* create script_P arrays with enough space for intermediate calculations */
  create_2array(logSP, double, N+1, Irange+1);
  
  /* clear intermediate probability array */
  for (n=0; n<=N; n++) for(I=0; I<Irange; I++) logSP[n][I] = LOGZERO;

  /* init probability array for first letter in alphabet */
  for (n=0; n<=N; n++) {
    I = IP[0][n] - *offset;                     /* offset I */
    logSP[n][I] = logNfact + logP[0][n];        /* init */
    minI[n] = maxI[n] = I;
  }

  /* compute probabilities recursively */
  for (i=1; i<A; i++) {                 /* index over (rest of) alphabet */
    for (n=N; n>=0; n--) {              /* index over samples */
      for (k=1; k<=n; k++) {            /* index over samples of new letter */
        int min = minI[n-k];
        int max = MAX(min, maxI[n-k] - (1-frac)*(maxI[n-k]-minI[n-k]+1));
        /*printf("min %d maxI %d max %d\n", min, maxI[n-k], max);*/
        for (I=min; I<=max; I++) {      /* index over I */
          if (logSP[n-k][I] > LOGZERO) {
            /*printf("i %d old: %d %d new: %d %d\n", i, n-k, I, n,I+IP[i][k]);*/
            logSP[n][I+IP[i][k]] = 
              LOGL_SUM(logSP[n][I+IP[i][k]], logP[i][k] + logSP[n-k][I]);
          }
        }
        /* get current minimum and maximum I in intermediate arrays */
        minI[n] = MIN(minI[n], minI[n-k]+IP[i][k]);
        maxI[n] = MAX(maxI[n], maxI[n-k]+IP[i][k]);
      }
      if (n==N && i==A-1) break;        /* all done */
    }
  }

  /* compute range */
  /*printf("minI[N] %d maxI[N] %d\n", minI[N], maxI[N]);*/
  *range = maxI[N] - minI[N]; 

  /* move to probability array with prob(offset) in position 0 */
  *offset += minI[N];                   /* prob[0] = prob(offset) */
  Resize(prob, *range+2, double);
  for (I=minI[N]; I<=maxI[N]; I++) prob[I-minI[N]] = logSP[N][I];
  /*fprintf(stderr, "N= %d range= %d offset= %d alpha= %f\n", N, *range, 
    *offset, *alpha);*/
      
  /* free up space */
  free_2array(IP, A);
  free_2array(logP, A);
  free_2array(logSP, N+1);
  myfree(minI);
  myfree(maxI);

  return prob;
} /* llr_distr */

/******************************************************************************/
/*
        sum_distr

        Compute the distribution of the sum of two integer-valued
        random variables whose ranges are [0..r1] and [0..r2], 
        respectively, given the log of their distributions.
*/
/******************************************************************************/
double *sum_distr(
  double *d1,                           /* (log) distribution of RV1 */
  int r1,                               /* range of RV1 */
  double *d2,                           /* (log) distribution of RV2 */
  int r2,                               /* range of RV2 */
  int *r_sum                            /* range of sum of RV1 and RV2 */
)
{
  int i, j, k;
  int range = r1 + r2;                  /* potential range of sum */
  double *d_sum = NULL;                 /* distribution of sum */

  Resize(d_sum, range+1, double);       /* space for distribution */

  for (i=0; i<=range; i++) {            /* value of sum */
    d_sum[i] = LOGZERO;
  }

  for (i=0; i<=r1; i++) {               /* range of RV1 */
    if (d1[i]==LOGZERO) continue;
    for (j=0, k=i; j<=r2; j++, k++) {   /* range of RV2 */
      if (d2[j]==LOGZERO) continue;
      d_sum[k] = LOGL_SUM(d_sum[k], d1[i]+d2[j]);
    } /* RV2 */
  } /* RV1 */

  for (i=range; i>=0; i--) {            /* value of sum */
    if (d_sum[i] > LOGZERO) break;
  }
  *r_sum = i;                           /* non-zero range */

  return d_sum;
} /* sum_distr */

/******************************************************************************/
/*
        cdf

        Compute (log) 1-CDF of an integer-valued (log) distribution.
        Smooths the CDF by linear interpolation so that adjacent positions
        in the table will have different values if possible.
*/
/******************************************************************************/
static double *cdf(
  double *d,                            /* integer valued distribution */
  int r                                 /* range [0..r] */
)
{
  double *cdf=NULL, slope=0;
  int I, i, j, k;

  Resize(cdf, r+1, double);
  cdf[r] = d[r];
  for (I=r-1; I>=0; I--) {
    cdf[I] = LOGL_SUM(cdf[I+1], d[I]);
  }

  /* smooth cdf by linear interpolation in logs */
  for (i=r; i>0; i=j) {
    for (j=i-1; j>0 && d[j]==LOGZERO; j--) ;    /* find next non-zero p */ 
    if (i!=j) slope = (cdf[i]-cdf[j])/(i-j);    /* slope */
    for (k=j+1; k<i; k++) cdf[k] = cdf[j] + (k-j)*slope;
  }

  return cdf;
} /* cdf */

/******************************************************************************/
/*
        get_scaled_llr

        Given a set of observed counts (o_i) and expected counts (e_i)
        and a scale factor alpha, return the scaled information content: 
                Sum_i=1..A NINT(alpha * o_i log(o_i/e_i))
*/
/******************************************************************************/
int get_scaled_llr(
  double alpha,                 /* scale factor */
  int A,                        /* size of alphabet */
  double *o,                    /* observed counts */
  double *e                     /* expected counts */
) {
  int i;
  int llr = 0;

  for (i=0; i<A; i++) {
    if (o[i]) {
      llr += NINT(alpha * o[i] * log(o[i]/e[i]));
    }
  }
  return llr >= 0 ? llr : 0;
} /* get_scaled_llr */

/******************************************************************************/
/*
        brute

        Calculate the distribution of scaled LLR using a brute force
        algorithm (for small values of n only).  Used for checking
        get_llr.
*/
/******************************************************************************/
static void brute1(
  int A,                                /* dimension of discrete distribution */
  ARRAY_T *dd,                           /* discrete distribution */
  int N,                                /* number of samples of distribution */
  double alpha,                         /* scale factor for llr */
  double *o,
  double *e,
  double *p,
  double prob
){
  int i;
  int llr;

  if (N==0) {
    llr = get_scaled_llr(alpha, A, o, e);
    p[llr] += prob;
  } else {
    for (i=0; i<A; i++) {
      o[i]++;
      brute1(A, dd, N-1, alpha, o, e, p, prob * get_array_item(i, dd));
      o[i]--;
    }
  } 
} /* brute1 */

static double *brute(
  int A,                                /* dimension of discrete distribution */
  ARRAY_T *dd,                           /* discrete distribution */
  int N,                                /* number of samples of distribution */
  int range,                            /* desired range for scaled LLR */
  double alpha                          /* scale factor for llr */
){
  int i;
  double *o=NULL;
  double *e=NULL;
  double *p=NULL;
  Resize(o, A, double);
  Resize(e, A, double);
  Resize(p, range+2, double);

  for (i=0; i<A; i++) { 
    o[i] = 0;
    e[i] = get_array_item(i, dd) * N;
  }
  for (i=0; i<range+2; i++) { 
    p[i] = 0;
  }

  brute1(A, dd, N, alpha, o, e, p, 1.0);
 
  return p;
} /* brute */

/******************************************************************************/
/*
        get_llr_pv

        Get the log p-value of a weighted log-likelihood ratio given:
                llr     log-likelihood ratio
                n       (weighted) number of sequences in the alignment
                w       width of the alignment
                range   desired number of values in range
                frac    speedup factor
                alength number of letters in alphabet
                dd[]    alphabet frequency distribution

        Computes the distributions on the fly only as needed.
        Range should be the same each time get_llr_pv is called.
        Call with integral n and w=0 to allocate table in parallel mode.

        Returns the p-value.
*/
/******************************************************************************/
double get_llr_pv(
  double llr,                           /* log likelihood ratio */
  double n,                             /* wgtd number sequences in alignment */
  int w,                                /* width of alignment */
  int range,                            /* desired range for resolution */
  double frac,                          /* speedup factor */
  int alength,                          /* length of alphabet */
  ARRAY_T *back                            /* alphabet frequency distribution */
)
{
  int i, N;
  double I;                             /* weighted log likelihood ratio */
  int I0, I1;                           /* position of llr in table */
  double logpv;                         /* log pvalue */
  double n0, n1;                        /* floor and ceil of n */
  double alpha;                         /* scale factor used */

  if (n<=1) return 0.0;                 /* only one site p-value = 1.0 */

  // Adjust the background distribution to make sure it
  // sums to 1 and has no 0's if first time called.
  if (ndistrs == -1) {
    double dd_sum;			/* sum of dd */
    dd = allocate_array(alength);	// adjusted distribution
    for (i=dd_sum=0; i<alength; i++) dd_sum += get_array_item(i, back) + EPSILON;
    for (i=0; i<alength; i++) set_array_item(i, (get_array_item(i, back) + EPSILON)/dd_sum, dd);
  }

  /* return geometric mean if N is not integral */
  if ( (n0=floor(n)) != (n1=ceil(n)) )
    return ( 
      (n1-n)*get_llr_pv(llr, n0, w, range, frac, alength, dd) +
      (n-n0)*get_llr_pv(llr, n1, w, range, frac, alength, dd) 
    );

  N = (int) n;                          /* make n an integer */

  /* N larger than any previous N? */
  if (ndistrs < N) {                    /* first call */
    Resize(distrs, N+1, DISTR);         /* create array of distributions */
    for (i=ndistrs+1; i<=N; i++) {
      distrs[i].w = 0;
      distrs[i].offset = NULL;
      distrs[i].range = NULL;
      distrs[i].d = NULL;
      distrs[i].cdf = NULL;
      distrs[i].mean = 0;
    }
    ndistrs = N;                                /* set maximum N */
  }

  /* done if w == 0 */
  if (w == 0) return 0.0;

  /* w larger than any previous w for this N? */
  if (distrs[N].w < w) {                        /* larger w */
    Resize(distrs[N].d, w+1, double *);
    Resize(distrs[N].cdf, w+1, double *);
    Resize(distrs[N].offset, w+1, int);
    Resize(distrs[N].range, w+1, int);

    /* first time? */ 
    if (distrs[N].w == 0) {                     /* get the w=1 distribution */
      distrs[N].d[1] = llr_distr(alength, dd, N, range, frac,
        &distrs[N].alpha, &distrs[N].offset[1], &distrs[N].range[1]);
      /* get mean of LLR for w = 1 */
      for (i=0; i<=distrs[N].range[1]; i++) {
        double llr = (i + distrs[N].offset[1]) / distrs[N].alpha;
        distrs[N].mean += exp(distrs[N].d[1][i])*llr;
      }
      distrs[N].cdf[1] = cdf(distrs[N].d[1], distrs[N].range[1]);
      distrs[N].w = 1;
    } /* first time */

    /* get the distributions for widths oldw .. maxw */
    /*fprintf(stderr, "enter cdf N= %d w= %d oldw= %d\n", N, w, distrs[N].w);*/
    for (i=distrs[N].w+1; i<=w; i++) {          /* width */
      distrs[N].d[i] = sum_distr(
        distrs[N].d[i-1], 
        distrs[N].range[i-1], 
        distrs[N].d[1], 
        distrs[N].range[1], 
        &distrs[N].range[i]
      );
      distrs[N].offset[i] = distrs[N].offset[i-1] + distrs[N].offset[1];
      distrs[N].cdf[i] = cdf(distrs[N].d[i], distrs[N].range[i]);
    } /* width */
    /*fprintf(stderr, "leave cdf\n");*/
    distrs[N].w = w;                            /* set maximum w */
  } /* new w */
  
  /* get position in table */
  alpha = distrs[N].alpha;
  I = (alpha * llr) - distrs[N].offset[w];              /* position in table */
  I0 = (int) I;                                         /* floor of position */
  I1 = I0 + 1;                                          /* ceil. of position */
  if (I < 0) {                                          /* lower bound I */
    logpv = distrs[N].cdf[w][0];
  } else if (I0 >= distrs[N].range[w]) {                /* upper bound I */
    logpv = distrs[N].cdf[w][distrs[N].range[w]];
  } else {                                              /* lin. interpolate */
    logpv =
      distrs[N].cdf[w][I0] + (I-I0)*(distrs[N].cdf[w][I1]-distrs[N].cdf[w][I0]);
  }

  /* return log p-value */
  return logpv;
} /* get_llr_pv */

/******************************************************************************/
/*
        get_llr_mean

        Returns the mean of the LLR distribution for w=1.
        Assumes get_llr_pv has already been called.
*/
/******************************************************************************/
double get_llr_mean(
  double n                              /* wgt. number sequences in alignment */
)
{
  return(distrs[NINT(n)].mean);
} /* get_llr_mean */


#ifdef MAIN
/************************************************************************/
/*
        llr <alength> <freq_file> <N>

        Compute the probability distribution for the scaled information
        content (LLR) of N letters.  Only the most probable <frac> LLR values
        are used to speed the calculation.
*/
/************************************************************************/
int main(
  int argc,
  char** argv
)
{
  int i, j, I, n;
  int A=0;                              /* length of alphabet */
  char *ffreq=NULL;                     /* name of frequency file */
  FILE *ffile;                          /* frequency file */
  ARRAY_T *back = NULL;                   /* discrete distribution */
  char *alphabet = NULL;                /* alphabet */
  int N=2, minN=0, maxN=0;              /* number of observations */
  int w=1;                              /* maximum width of alignment */
  double frac = 1;                      /* fraction of scores to use */
  int range = 100;                      /* desired range per N */
  double *p2=NULL;                      /* LLR statistic distribution */
  double *pv2=NULL;
  int brute_maxN = 0;                   /* maximum N for checking llr_distr */
  char *line;                           /* input buffer line */
  double alpha;                         /* scale factor used */

  i = 1;
  argv[0] = "llr";
  DO_STANDARD_COMMAND_LINE(2,
    USAGE(<alength> <freq_file> [options]);
    USAGE(\n\t<alength>\tnumber of letters in alphabet); 
    USAGE(\t<freq_file>\tfile containing alphabet and letter frequencies);
    USAGE(\t\t\twhere each line is: <letter> <freq>);
    NON_SWITCH(1,\r,
      switch (i++) {
        case 1: A = atoi(_OPTION_); break;
        case 2: ffreq = _OPTION_; break;
        default: COMMAND_LINE_ERROR;
      }
    );
    DATA_OPTN(1, N, <N>, \tnumber of observations; default=2,
      N = atof(_OPTION_));
    DATA_OPTN(1, minN, <minN>, \tminimum number of observations; default=2,
      minN = atoi(_OPTION_));
    DATA_OPTN(1, maxN, <maxN>, \tmaximum number of observations; default=2,
      maxN = atoi(_OPTION_));
    DATA_OPTN(1, w, <w>, \tmaximum width of alignment; default=1,
      w = atoi(_OPTION_));
    DATA_OPTN(1, frac, <frac>, \tfraction of possible scores to use; default=1,
      frac = atoi(_OPTION_));
    DATA_OPTN(1, range, <range>, scale llr to have <range> values; default=100,
      range = atoi(_OPTION_));
    USAGE(\n\tCompute the probability distribution for the log-likelihood);
    USAGE(\tratio (LLR) of N letters.  If a range for N is given);
    USAGE(\tthe distributions for each value in the range are computed.);
    USAGE(\tIf <frac> is given only the most probable <frac> fraction of);
    USAGE(\tvalues are used to speed the calculation.);
    USAGE(\n\tCopyright);
    USAGE(\t(2000-2006) The Regents of the University of California);
    USAGE(\tAll Rights Reserved.);
    USAGE(\tAuthor: Timothy L. Bailey);
  );

  init_log();
  init_exp();

  /* set up range for N */
  if (minN==0) minN = N;
  if (maxN==0) maxN = N;
  if (minN > maxN) {
    fprintf(stderr, "You must specify -maxN larger than -minN.\n");
    exit(1);
  }

  /* get alphabet and frequencies */
  ffile = fopen(ffreq, "r"); 
  back = allocate_array(A);
  Resize(alphabet, A, char);
  i = 0;
  while ((line = getline2(ffile))) {
    char c;
    double f;
    if (line[0] == '#') continue;               /* skip comments */
    if (sscanf(line, "%c %lf", &c, &f) != 2) break;
    alphabet[i] = c;
    set_array_item(i, f, back);
    myfree(line);
    i++;
  }
  if (i != A) {
    fprintf(stderr, "%d frequencies were found; %d were expected.\n", i,A);
    for (j=0; j<i; j++) printf("%c %f\n", alphabet[j], get_array_item(j, back));
    exit(1);
  }
  printf("# Alphabet frequency distribution:\n");
  for (j=0; j<A; j++) printf("# %c %f\n", alphabet[j], get_array_item(j, back));

  /* get distribution and print it */
  for (n=minN; n<=maxN; n++) {                  /* number of observations */

    printf("# desired range %d\n", range);

    /* get the distribution set */
    get_llr_pv(0, (double) n, w, range, frac, A, back);
    double alpha = distrs[n].alpha;
    printf("# alpha %f\n", alpha);

    /* get the check distribution */
    if (n <= brute_maxN) { 
      p2 = brute(A, back, n, range, alpha); 
      pv2 = cdf(p2, range);
    }

    /* print results */
    printf("# N    I    llr         p     1-cdf      log(1-cdf)\n");
    for (I=0; I<=distrs[n].range[w]; I++) {             /* LLR */
      double m1, e1, m2, e2;
      if (distrs[n].d[w][I] == LOGZERO) {
        m1 = e1 = 0;
      } else {
        exp10_logx(distrs[n].d[w][I]/log(10.0), m1, e1, 1);
      }
      if (distrs[n].cdf[w][I] == LOGZERO) {
        m2 = e2 = 0;
      } else {
        exp10_logx(distrs[n].cdf[w][I]/log(10.0), m2, e2, 1);
      }
      printf("%3d %3d %5.1f %3.1fe%+05.0f %3.1fe%+05.0f %16.2f\n",
        n, I, (distrs[n].offset[w]+I)/distrs[n].alpha, m1, e1, m2, e2, distrs[n].cdf[w][I]);
    } /* LLR */
  } /* n */

  return 0;
}
#endif /* MAIN */