/********************************************************************
 * FILE: beadstring.c
 * AUTHOR: William Stafford Noble Charles E. Grant
 * CREATE DATE: 06-23-2007
 * PROJECT: MHMM
 * COPYRIGHT: 2007, WNG
 * DESCRIPTION: Given MEME output, create a motif-based linear HMM and
 *              use it to search a sequence database for motif clusters.
 ********************************************************************/
#include <assert.h>
#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "matrix.h"         // Matrix functions
#include "alphabet.h"       // Amino acid and nucleotide alphabets
#include "beadstring.h"
#include "beadstring-xml.h" // Functions for printing Beadstring XML output
#include "build-hmm.h"      // Construct an HMM in memory.
#include "cisml.h"          // Structure for recording results
#include "dir.h"            // PATHS to needed directories
#include "dp.h"             // Dynamic programming algorithms
#include "fasta-io.h"       // Reading and writing FASTA files
#include "fitevd.h"         // Functions for fitting an extreme value dist
#include "io.h"             // File and directory utilites
#include "log-hmm.h"        // Convert an HMM to or from log form.
#include "match.h"          // Find sequence to model match from traceback
#include "metameme.h"       // Global Meta-MEME functions.
#include "mhmm.h"           // Scan a sequence db with HMM.
#include "mhmm-state.h"     // Data structure for HMMs.
#include "mhmms.h"          // Scan a sequence db with HMM.
#include "motif.h"          // Data structure for holding motifs.
#include "order.h"          // Linear motif order and spacing.
#include "projrel.h"
#include "pssm.h"           // Tools for position specific scoring matrix
#include "read-mhmm.h"      // Functions for reading HMM from file.
#include "simple-getopt.h"  // Command line processing
#include "string-list.h"    // List of strings (for motif IDs).
#include "utils.h"          // Generic utilities.
#include "write-mhmm.h"     // Functions for producing MHMM output.
#include "motif-in.h"

#define SLOP 1E-5

// Marker for end transition.
extern char* END_TRANSITION;

VERBOSE_T verbosity = NORMAL_VERBOSE;

enum path_values { SINGLE, ALL };

/************************************************************************
 * Fix list of transitions in
 ************************************************************************/
void fix_trans_in(MHMM_T* the_hmm) {

  int i_row, i_col;
  int n = 0;
  MATRIX_T *trans = the_hmm->trans;
  n = trans->num_rows;

  //
  // Visit the transition matrix cells just once each
  // to update trans_in arrays.
  // This is quadratic in n.
  //
  for (i_col = 0; i_col < n; i_col++) {
    for (i_row = 0; i_row < n; i_row++) {
      double p = get_matrix_cell(i_row, i_col, trans);	// The transition probability.
      int m = 0;
      if (!is_zero(p, false)) {
        MHMM_STATE_T * in_state = &(the_hmm->states[i_col]);
        assert(m < in_state->ntrans_in);
        set_array_item(m, p, in_state->trans_in);
        m++;
      }
    } // col
  } // row

} // compute_ins_and_outs

/*************************************************************************
 *  Build a linear HMM model for the motifs
 *************************************************************************/
MHMM_T* build_linear_model(OPTIONS_T *options) {

  int num_motifs = 0;              // The number of motifs in the model.
  RBTREE_T  *motifs; // The motifs.
  bool has_reverse_strand = false; // MEME file contained both strands
  ARRAY_T* background = NULL;      // Background probs for alphabet.
  ORDER_T* order_spacing = NULL;   // Linear HMM order and spacing.
  MATRIX_T* transp_freq = NULL;    // Matrix of inter-motif transitions freqs.
  MATRIX_T* spacer_ave = NULL;     // Matrix of average spacer lengths.
  MHMM_T* the_hmm = NULL;          // The HMM being constructed.

  // Gotta have positive number of spacer states.
  if (options->spacer_states <= 0) {
    die("Negative spacer length (%d).\n", options->spacer_states);
  }

  /**********************************************
   * READING THE MOTIFS
   **********************************************/
  load_filter_process_motifs_for_hmm(
      0, // no top n motifs
      options->request_nums,
      options->request_ids,
      options->motif_e_threshold,
      0.0, // no complexity threshold
      options->order_string,
      options->motif_p_threshold, // for filtering occurrence list
      options->meme_filename,
      options->bg_filename,
      options->motif_pseudo,
      options->trans_pseudo,
      options->spacer_pseudo,
      false, // keep unused motifs
      &(options->alphabet),
      &motifs,
      &background,
      &order_spacing,
      &transp_freq,
      &spacer_ave
      );

  /**********************************************
   * BUILDING THE HMM
   **********************************************/

  if (order_spacing == NULL) {
    die("No order specified for the motifs.\n"
        "For the linear model the motif file must contain motif occurence\n"
        "data or the motif order must be specified using the --order option.");
  }

  build_linear_hmm(
    background,
    order_spacing,
    options->spacer_states,
    motifs,
    options->fim,
    &the_hmm
  );
  fix_trans_in(the_hmm);

  free_array(background);
  free_order(order_spacing);
  free_matrix(transp_freq);
  free_matrix(spacer_ave);
  rbtree_destroy(motifs);

  // Add some global information.
  copy_string(&(the_hmm->motif_file), options->meme_filename);

  return the_hmm;

}


void search_with_model(
 MHMM_T* the_mhmm,
 bool* got_evd,
 OPTIONS_T *options,
 PATTERN_T *pattern
) {

  bool viterbi = false;
  bool use_pvalues = false;
  bool both_strands = false;
  EVD_SET evd_set;      // EVD data structure
  evd_set.evds = NULL;  // Initialize pointer to evds.

  /***********************************************
   * Set up the model.
   ***********************************************/

  // Figure out what kind of scoring to do.
  if (options->paths == SINGLE) {
    viterbi = true;
  } else {
    viterbi = false;
  }

  // Force p-value scoring if p-value threshold given.
  if (options->p_threshold != 0) {
    use_pvalues = true;
  }

  // Check p-threshold is in range [0<p<=1].
  if (use_pvalues && (options->p_threshold <= 0 || options->p_threshold > 1)) {
    die("You must specify a p-value threshold in the range [0<p<=1]\n");
  }

  // Set the PAM distance.
  if (options->pam_distance == -1) {
    options->pam_distance = 
       (alph_is_builtin_protein(the_mhmm->alph) ? DEFAULT_PROTEIN_PAM : DEFAULT_DNA_PAM);
  }
  //int beta = (options->alphabet == PROTEIN_ALPH) ? 10 : 1;
  int beta = alph_is_builtin_protein(the_mhmm->alph) ? 10 : 1;

  // Convert the model to log space.
  MHMM_T* the_log_hmm = allocate_mhmm(the_mhmm->alph, the_mhmm->num_states);
  convert_to_from_log_hmm(
    true, // Convert to logs.
    options->zselo,
    options->gap_open,
    options->gap_extend,
    the_mhmm->background,
    options->score_filename,
    options->pam_distance,
    beta,
    the_mhmm,
    the_log_hmm
  );

  // Set up PSSM matrices if doing motif_scoring
  // and pre-compute motif p-values if using p-values.
  // Set up the hot_states list.
  set_up_pssms_and_pvalues(
    options->motif_scoring,
    options->p_threshold,
    both_strands,
    options->allow_weak_motifs,
    the_log_hmm,
    NULL,	// PSP
    1		// PSP alpha
  );

  //
  // Set up for computing score distribution.
  //
  SCORE_SET *score_set = set_up_score_set(
    options->p_threshold,
    -1,    // dp_threshold,
    -1,    // max_gap
    false, // negatives_only
    the_log_hmm
  );

  /***********************************************
   * Search the database one sequence at a time.
  ***********************************************/

  // Open the file for reading.
  FILE* seq_file = NULL;
  if (open_file(
        options->seq_filename,
        "r",
        true,
        "sequence",
        "sequences",
        &seq_file
      ) == 0) {
    exit(1);
  }

  double start_time = mytime(0);
  double old_loop_time = start_time;
  double seconds_since_last_report = 0.0;

  SEQ_T* sequence = NULL;               // Sequence to search against.
  MATRIX_T* motif_score_matrix = NULL;  // Number of motifs x sequence length.
  MATRIX_T* dp_matrix = NULL;           // Dynamic programming matrix.
  MATRIX_T* trace_matrix = NULL;        // Traceback for Viterbi.
  MATCH_T*  this_match = NULL;          // This sequence-to-model match.
  int dp_rows = 0;                      // Size of the DP matrix.
  int dp_cols = 0;
  int num_seqs = 0;

  this_match = allocate_match(); // This sequence-to-model match.
  while (read_one_fasta(options->alphabet, seq_file, MAX_SEQ, &sequence)) {

    num_seqs++;

    // Create a scanned_sequence record and record it in pattern.
    char* seq_name = get_seq_name(sequence);
    int seq_length = get_seq_length(sequence);
    SCANNED_SEQUENCE_T *scanned_seq =
      allocate_scanned_sequence(seq_name, seq_name, pattern);
    set_scanned_sequence_length(scanned_seq, seq_length);
    // Keep track of total database size for E-value calculation.
    score_set->total_length += seq_length;

    // Let the user know what's going on.
    if (verbosity > NORMAL_VERBOSE) {
      fprintf(
        stderr,
        "Scoring %s (length=%d).\n",
        seq_name,
        seq_length
      );
    }

    // Convert the sequence to alphabet-specific indices.
    prepare_sequence(sequence, options->alphabet);
    // Sequeence length may have changed
    seq_length = get_seq_length(sequence);
    assert(get_seq_char(seq_length, sequence) == '\0');

    /* Allocate the dynamic programming matrix. Rows correspond to
       states in the model, columns to positions in the sequence. */
    if ((dp_rows < the_log_hmm->num_states)
      || (dp_cols < seq_length)) {
      free_matrix(dp_matrix);
      free_matrix(trace_matrix);
      if (dp_rows < the_log_hmm->num_states) {
        dp_rows = the_log_hmm->num_states;
      }
      if (dp_cols < seq_length) {
        dp_cols = seq_length;
      }
      // (Add one column for repeated match algorithm.)
      dp_matrix = allocate_matrix(dp_rows, dp_cols + 1);
      trace_matrix = allocate_matrix(dp_rows, dp_cols + 1);
    }

    // Compute the motif scoring matrix.
    if (options->motif_scoring) {
      motif_score_matrix = allocate_matrix(
        the_log_hmm->num_motifs,
        seq_length
      );
      compute_motif_score_matrix(
        use_pvalues,
        options->p_threshold,
        get_int_sequence(sequence),
        seq_length,
        NULL,		// PSP
        0,		// num_priors
	0,		// PSP ALPHA
        the_log_hmm,
        &motif_score_matrix
      );
    }

    // Perform the appropriate type of dynamic programming.
    if (viterbi) {
      viterbi_algorithm(
        options->local_scoring,
        get_int_sequence(sequence),
        seq_length,
        the_log_hmm,
        true,  // Save Viterbi path?
        motif_score_matrix,
        dp_matrix,
        trace_matrix,
        this_match
      );
    } else {
      forward_algorithm(
        options->local_scoring,
        get_int_sequence(sequence),
        seq_length,
        the_log_hmm,
        //motif_score_matrix, // FIXME
        dp_matrix,
        this_match
      );
    }

    // Store the score, ID, length and comment for later printing.
    //assert(get_match_seq_length(this_match) == get_seq_length(sequence));
    store_sequence(
      options->alphabet,
      options->motif_scoring,
      false, // Don't print in mhmmscan format.
      DEFAULT_OUTPUT_WIDTH,
      sequence,
      this_match,
      options->e_threshold,
      0,    // no dp_threshold
      options->p_threshold,
      score_set,
      *got_evd,
      evd_set,
      //false, // store the Viterbi trace
      true, // store the Viterbi trace
      the_log_hmm,
      motif_score_matrix,
      false, // Store GFF?,
      scanned_seq
    );

    /* Calculate the initial E-value distribution if the required
     * number of sequences has been saved.  This will allow the
     * descriptions of low-scoring sequences to not be stored.  The
     * distribution will be recomputed using all saved scores when all
     * sequences have been read.  */
    if (score_set->n == EVD_NUM_SEQS && *got_evd == false) {
      evd_set = calc_distr(
        *score_set,			// Set of scores.
        use_pvalues ? D_EXP : D_EVD,	// Use exponential distribution?
        false				// Use sequence E-values.
      );
      if (evd_set.n > 0) *got_evd = true;
    }

    // Free the memory used by this sequence.
    free_matrix(motif_score_matrix);
    free_seq(sequence);

    // If requested, periodically report progress
    if (options->progress > 0.0) {
      double new_loop_time = mytime(0);
      seconds_since_last_report = (new_loop_time - old_loop_time) / 1e6;
      if (seconds_since_last_report > options->progress) {
        double elapsed_seconds = (new_loop_time - start_time) / 1e6;
        old_loop_time = new_loop_time;
        fprintf(
          stderr,
          "%d sequences processed in %d seconds.\n",
          num_seqs,
          (int) elapsed_seconds
        );
      }
    }
  }

  /***********************************************
   * Calculate the E-values and store them as the keys.
   ***********************************************/
  // Recalculate the EVD using all scores.
  // If successful, calculate the E-values and
  // store them as keys.
  evd_set = calc_distr(
    *score_set,				// Set of scores.
    use_pvalues ? D_EXP : D_EVD,	// Use exponential distribution?
    false         			// Use sequence E-values.
  );
  if (evd_set.n > 0) {
    int q, t, N;
    q = 1;        // Ignore query "length".
    t = 0;        // Use stored target lengths.
    N = evd_set.non_outliers;    // Use number of non-outliers.
    evd_set.N = N;
    calc_evalues(&evd_set, N, q, t);
    *got_evd = true;
  }
  evd_set.negatives_only = false;  // Used real sequences.

  /* Tie up loose ends. */
  myfree(evd_set.evds);
  free_mhmm(the_log_hmm);
  free_matrix(dp_matrix);
  free_matrix(trace_matrix);
  free_match(this_match);
  free(score_set->scores);
  myfree(score_set);
  fclose(seq_file);
}

/***********************************************************************
 * Print beadstring specific information to an XML file
 ***********************************************************************/
static  void print_settings_xml(
  FILE *out,
  OPTIONS_T *options
) {
  fputs("<settings>\n",  out);
  fprintf(out, "<setting name=\"%s\">%s</setting>\n", "output directory", options->output_dirname);
  fprintf(out, "<setting name=\"%s\">%s</setting>\n", "MEME file name", options->meme_filename);
  fprintf(out, "<setting name=\"%s\">%s</setting>\n", "background file name", options->bg_filename);
  fprintf(out, "<setting name=\"%s\">%s</setting>\n", "score file name", options->score_filename);
  fprintf(out, "<setting name=\"%s\">%s</setting>\n", "sequence file name", options->seq_filename);
  fprintf(out, "<setting name=\"%s\">%s</setting>\n", "model file name", options->model_filename);
  fprintf(out, "<setting name=\"%s\">%s</setting>\n", "order string", options->order_string);
  fprintf(
    out,
    "<setting name=\"%s\">%s</setting>\n",
    "allow weak motifs",
    boolean_to_string(options->allow_weak_motifs)
  );
  fprintf(
    out,
    "<setting name=\"%s\">%s</setting>\n",
    "allow clobber",
    boolean_to_string(options->allow_clobber)
  );
  fprintf(
    out,
    "<setting name=\"%s\">%s</setting>\n",
    "fim",
    boolean_to_string(options->fim)
  );
  fprintf(
    out,
    "<setting name=\"%s\">%s</setting>\n",
    "local scoring",
    boolean_to_string(options->local_scoring)
  );
  fprintf(
    out,
    "<setting name=\"%s\">%s</setting>\n",
    "motif scoring",
    boolean_to_string(options->motif_scoring)
  );
  fprintf(
    out,
    "<setting name=\"%s\">%s</setting>\n",
    "synth",
    boolean_to_string(options->synth)
  );
  fprintf(
    out,
    "<setting name=\"%s\">%s</setting>\n",
    "zselo",
    boolean_to_string(options->zselo)
  );
  fprintf(out, "<setting name=\"%s\">%d</setting>\n", "max_seq", options->max_seq);
  fprintf(out, "<setting name=\"%s\">%d</setting>\n", "pam_distance", options->pam_distance);
  fprintf(out, "<setting name=\"%s\">%d</setting>\n", "paths", options->paths);
  fprintf(out, "<setting name=\"%s\">%d</setting>\n", "spacer_states", options->spacer_states);
  fprintf(out, "<setting name=\"%s\">%3.1g</setting>\n", "gap extend", options->gap_extend);
  fprintf(out, "<setting name=\"%s\">%3.1g</setting>\n", "gap open", options->gap_open);
  fprintf(out, "<setting name=\"%s\">%3.2g</setting>\n", "e-threshold", options->e_threshold);
  fprintf(out, "<setting name=\"%s\">%3.2g</setting>\n", "p-threshold", options->p_threshold);
  fprintf(
    out,
    "<setting name=\"%s\">%s</setting>\n",
    "perform search",
    boolean_to_string(options->perform_search)
  );
  fprintf(out, "<setting name=\"%s\">%3.2g</setting>\n", "progress", options->progress);
  fprintf(out, "<setting name=\"%s\">%3.2g</setting>\n", "motif e-threshold", options->motif_e_threshold);
  fprintf(out, "<setting name=\"%s\">%3.2g</setting>\n", "motif p-threshold", options->motif_p_threshold);
  fprintf(out, "<setting name=\"%s\">%3.2g</setting>\n", "motif pseudocount", options->motif_pseudo);
  fprintf(out, "<setting name=\"%s\">%3.2g</setting>\n", "spacer pseudocount", options->spacer_pseudo);
  fprintf(out, "<setting name=\"%s\">%3.2g</setting>\n", "transition pseudocount", options->trans_pseudo);
  fprintf(out, "<setting name=\"%s\">%d</setting>\n", "verbosity", verbosity);
  RBNODE_T *node;
  for(node = rbtree_first(options->request_nums); node != NULL; node = rbtree_next(node)) {
    int motif_num;
    int strands;
    motif_num = *((int*)rbtree_key(node));
    strands = *((int*)rbtree_value(node));
    if (strands & MOTIF_PLUS) {
      fprintf(out, "<setting name=\"%s\">+%d</setting>\n", "requested motif", motif_num);
    }
    if (strands & MOTIF_MINUS) {
      fprintf(out, "<setting name=\"%s\">-%d</setting>\n", "requested motif", motif_num);
    }
  }

  fputs("</settings>\n",  out);

}

/***********************************************************************
 * Print beadstring specific information to an XML file
 ***********************************************************************/
static void print_beadstring_xml_file(
  FILE *out,
  OPTIONS_T *options,
  ARRAY_T *background,
  char  *stylesheet
) {
  int i;

  fputs("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n", out);
  if (stylesheet != NULL) {
    fprintf(out, "<?xml-stylesheet type=\"text/xsl\" href=\"%s\"?>\n", stylesheet);
  }
  fputs("<!-- Begin document body -->\n", out);

  fputs("<beadstring version=\"" VERSION "\" release=\"" ARCHIVE_DATE "\">\n", out);
  fputs("  xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"", out);
  fputs("\n", out);
  fputs("  xsi:schemaLocation=", out);
  fputs("\"http://noble.gs.washsington.edu/schema/beadstring beadstring.xsd\"\n", out);
  fputs("  xmlns=\"http://zlab.bu.edu/schema/cisml\"\n", out);
  fputs("  xmlns:bstr=\"http://noble.gs.washington.edu/schema/beadstring\"\n>\n", out);
  fprintf(out, "<command-line>%s</command-line>\n", options->command_line);
  print_settings_xml(out, options);
  //TODO FIXME make work with other alphabets
  fprintf(
    out,
    "<alphabet>%s</alphabet>\n",
    alph_is_builtin_dna(options->alphabet) ? "nucleotide" : "protein"
  );
  fprintf(
    out,
    "<background source=\"%s\">\n",
    options->bg_filename ? options->bg_filename : "non-redundant database"
  );
  //int asize = alph_size(options->alphabet, ALPH_SIZE);
  int asize = alph_size_core(options->alphabet);
  for (i = 0; i < asize; i++) {
    fprintf(
      out,
      "<value letter=\"%c\">%1.3f</value>\n",
      alph_char(options->alphabet, i),
      get_array_item(i, background)
    );
  }
  fputs("</background>\n", out);
  fputs("<hmm-file>mhmm.xml</hmm-file>\n", out);
  fputs("<hmm-ps>mhmm.ps</hmm-ps>\n", out);
  fputs("<hmm-png>mhmm.png</hmm-png>\n", out);
  fputs("<cisml-file>cisml.xml</cisml-file>\n", out);
  fputs("</beadstring>\n", out);
}

/**********************************************************************
 * This function saves the sequences containing a hit to a motif
 * which is greater than the output p/q-value threshold. If the sequence
 * is less then 10k in length, a copy of the complete sequence will be
 * saved.
 *********************************************************************/
void print_sequences(FILE *out, OPTIONS_T *options, CISML_T *cisml) {

  SEQ_T* sequence = NULL;
  const int max_size_recorded = 10000;

  fputs("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n", out);
  fputs("<matched-sequences>\n", out);

  // Open the FASTA file for reading.
  FILE* fasta_file = NULL;
  if (open_file(options->seq_filename, "r", true, "FASTA", "sequences", &fasta_file) == 0) {
    die("Couldn't open the file %s.\n", options->seq_filename);
  }

  // Read the FASTA file one sequence at a time.
  while (read_one_fasta(options->alphabet, fasta_file, MAX_SEQ, &sequence)) {
    char *seq_name = get_seq_name(sequence);
    int seq_length = get_seq_length(sequence);
    // FIXME Only print those sequences that contain a matched element
    if (seq_length <= max_size_recorded) {
      // If sequence is less then limit record raw sequence
      char *raw_sequence = get_raw_sequence(sequence);
      fprintf(
        out,
        "<sequence name=\"%s\" length=\"%d\">\n%s\n",
        seq_name,
        seq_length,
        raw_sequence
      );
      fprintf(out, "</sequence>\n");
    }
    else {
      // Just record name and length
      fprintf(
        out,
        "<sequence name=\"%s\" length=\"%d\"\\>\n",
        seq_name,
        seq_length
      );
    }
  }

  fputs("</matched-sequences>\n", out);
  fclose(fasta_file);
}

/***********************************************************************
  Create diagrams of HMM in Postscript and PNG format using
  'dot'  and  'convert' programs.
 ***********************************************************************/
static void create_hmm_diagram_files(char *mhmm_xml_path, char *mhmm_ps_path,  char *mhmm_png_path) {

  int cmd_buff_size = strlen(mhmm_ps_path) + strlen(mhmm_png_path) + 80;
  char *cmd = mm_malloc(cmd_buff_size * sizeof(char));

  // Create diagram of HMM in Postscript format
  int num_char_printed = snprintf(
    cmd,
    cmd_buff_size,
    "draw-mhmm -consensus %s |dot -Tps > %s",
    mhmm_xml_path,
    mhmm_ps_path
  );
  if (num_char_printed > cmd_buff_size) {
    die("Command buffer too small: %s\n", cmd);
  }
  int result = system(cmd);
  if (result < 0) {
    die("Command %s failed with status %d.\n", cmd, result);
  }

  // Convert Postscript to PNG
  num_char_printed = snprintf(cmd, cmd_buff_size, "convert %s %s", mhmm_ps_path, mhmm_png_path);
  if (num_char_printed > cmd_buff_size) {
    die("Command buffer too small: %s\n", cmd);
  }
  result = system(cmd);
  if (result < 0) {
    die("Command %s failed with status %d.\n", cmd, result);
  }

  myfree(cmd);

}

/***********************************************************************
  Write results to files
 ***********************************************************************/
void print_beadstring_results(
  MHMM_T* the_hmm,
  bool got_evd,
  OPTIONS_T* options,
  CISML_T *cisml
) {

  // Setup output directory
  if (create_output_directory(
       options->output_dirname,
       options->allow_clobber,
       false /* Don't print warning messages */
      )
    ) {
    // Failed to create output directory.
    die("Couldn't create output directory %s.\n", options->output_dirname);
  }

  // Create model file if one isn't specified.
  char *mhmm_xml_filename = "mhmm.xml";
  char *mhmm_xml_path = NULL;
  if (!options->model_filename) {
    mhmm_xml_path = make_path_to_file(options->output_dirname, mhmm_xml_filename);
    write_mhmm_xml_to_file(the_hmm, mhmm_xml_path);
  }

  // Generate diagram of HMM
  char*  mhmm_ps_path = make_path_to_file(options->output_dirname, "mhmm.ps");
  char*  mhmm_png_path = make_path_to_file(options->output_dirname, "mhmm.png");
  create_hmm_diagram_files(mhmm_xml_path, mhmm_ps_path,  mhmm_png_path);

  // If search peformed output results of search
  if (options->perform_search) {

    // Create the paths to the output files
    const char *HTML_STYLESHEET = "beadstring.xsl";
    const char *CSS_STYLESHEET = "cisml.css";
    //const char *GFF_STYLESHEET = "cisml-to-gff.xsl";
    const char *GFF_STYLESHEET = "cisml-to-gff3.xsl";
    const char *TEXT_STYLESHEET = "cisml-to-text.xsl";
    const char *SEQUENCE_FILENAME = "matched_sequences.xml";
    const char *CISML_FILENAME = "cisml.xml";
    const char *HTML_FILENAME = "beadstring.html";
    const char *TEXT_FILENAME = "beadstring.txt";
    const char *GFF_FILENAME = "beadstring.gff";
    char *sequence_path = make_path_to_file(options->output_dirname, SEQUENCE_FILENAME);
    char *cisml_path = make_path_to_file(options->output_dirname, CISML_FILENAME);
    char *html_stylesheet_path = make_path_to_file(get_meme_data_dir(), HTML_STYLESHEET);
    char *css_stylesheet_path = make_path_to_file(get_meme_data_dir(), CSS_STYLESHEET);
    char *text_stylesheet_path = make_path_to_file(get_meme_data_dir(), TEXT_STYLESHEET);
    char *gff_stylesheet_path = make_path_to_file(get_meme_data_dir(), GFF_STYLESHEET);
    char *html_path = make_path_to_file(options->output_dirname, HTML_FILENAME);
    char *text_path = make_path_to_file(options->output_dirname, TEXT_FILENAME);
    char *gff_path = make_path_to_file(options->output_dirname, GFF_FILENAME);
    char *html_stylesheet_copy_path = make_path_to_file(options->output_dirname, HTML_STYLESHEET);
    char *css_stylesheet_copy_path = make_path_to_file(options->output_dirname, CSS_STYLESHEET);

    // Copy XML to HTML and CSS stylesheets to output directory
    copy_file(html_stylesheet_path, html_stylesheet_copy_path);
    copy_file(css_stylesheet_path, css_stylesheet_copy_path);

    // Open the beadstring XML file.
    const char *BEADSTRING_FILENAME = "beadstring.xml";
    char *beadstring_path = make_path_to_file(options->output_dirname, BEADSTRING_FILENAME);
    FILE *beadstring_file = fopen(beadstring_path, "w");
    if (!beadstring_file) {
      die("Couldn't open file %s for output.\n", beadstring_path);
    }
    print_beadstring_xml_file(
      beadstring_file,
      options,
      the_hmm->background,
      "beadstring.xsl"
    );
    fclose(beadstring_file);

    // Output matched sequence XML
    FILE *sequence_file = fopen(sequence_path, "w");
    if (!sequence_file) {
      die("Couldn't open file %s for output.\n", cisml_path);
    }
    print_sequences(sequence_file, options, cisml);
    fclose(sequence_file);

    // Open the cisml XML file.
    FILE *cisml_file = fopen(cisml_path, "w");
    if (!cisml_file) {
      die("Couldn't open file %s for output.\n", cisml_path);
    }
    print_cisml(cisml_file, cisml, true, NULL, true);
    fclose(cisml_file);

    // Output HTML
    print_xml_filename_to_filename_using_stylesheet(
      beadstring_path, 
      html_stylesheet_copy_path, 
      html_path
    );

    // Output text
    print_xml_filename_to_filename_using_stylesheet(cisml_path, text_stylesheet_path, text_path);

    // Output GFF
    print_xml_filename_to_filename_using_stylesheet(cisml_path, gff_stylesheet_path, gff_path);

    /***********************************************
     * Print header information.
     ***********************************************/
    /*
    write_header(
      "beadstring",
      "Database search results",
      the_hmm->description,
      the_hmm->motif_file,
      NULL, // HMM
      options->seq_filename,
      text_file
    );
    */

    /***********************************************
     * Sort and print the results.
     ***********************************************/
    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr, "\nSorting the E-values.\n");
    }

    /*
    sort_and_print_scores(
      false, // print fancy
      true,  // print_header,
      got_evd,
      options->motif_scoring,
      false, // Don't print in mhmmscan format.
      options->max_seq, // Maximum number of sequences to print
      DEFAULT_OUTPUT_WIDTH,
      options->e_threshold,
      true, // Sort output
      NULL, // FIXME: GFF output?
      text_file
    );
    */

    myfree(html_stylesheet_path);
    myfree(html_stylesheet_copy_path);
    myfree(css_stylesheet_path);
    myfree(css_stylesheet_copy_path);
    myfree(text_stylesheet_path);
    myfree(gff_stylesheet_path);
    myfree(cisml_path);
    myfree(html_path);
    myfree(text_path);
    myfree(gff_path);
    myfree(beadstring_path);
    myfree(mhmm_xml_path);
    myfree(mhmm_ps_path);
    myfree(mhmm_png_path);

  }

}

/***********************************************************************
  Process command line options
 ***********************************************************************/
static void process_command_line(
  int argc,
  char* argv[],
  OPTIONS_T *options
) {

  // Define command line options.
  cmdoption const option_list[] = {
    {"allow-weak-motifs", NO_VALUE},
    {"bgfile", REQUIRED_VALUE},
    {"e-thresh", REQUIRED_VALUE},
    {"fim", NO_VALUE},
    {"gap-extend", REQUIRED_VALUE},
    {"gap-open", REQUIRED_VALUE},
    {"global", NO_VALUE},
    {"maxseqs", REQUIRED_VALUE},
    {"model-file", REQUIRED_VALUE},
    {"motif", REQUIRED_VALUE},
    {"motif-e-thresh", REQUIRED_VALUE},
    {"motif-p-thresh", REQUIRED_VALUE},
    {"motif-pseudo", REQUIRED_VALUE},
    {"no-search", NO_VALUE},
    {"nspacer", REQUIRED_VALUE},
    {"o", REQUIRED_VALUE},
    {"oc", REQUIRED_VALUE},
    {"order", REQUIRED_VALUE},
    {"pam", REQUIRED_VALUE},
    {"paths", REQUIRED_VALUE},
    {"p-score", REQUIRED_VALUE},
    {"progress", REQUIRED_VALUE},
    {"score-file", REQUIRED_VALUE},
    {"spacer-pseudo", REQUIRED_VALUE},
    {"synth", NO_VALUE},
    {"trans-pseudo", REQUIRED_VALUE},
    {"verbosity", REQUIRED_VALUE},
    {"zselo", NO_VALUE},
  };
  int option_count = 28;
  int option_index = 0;

  // Define the usage message.
  char usage[] =
    "Usage: beadstring [options] <motif file> <sequence file>\n"
     "\n"
     "   Options related to input and output:\n"
     "     --bgfile <background file>\n"
     "     --e-thresh <float>\n"
     "     --maxseqs <int>\n"
     "     --model-file <model file>\n"
     "     --no-search\n"
     "     --oc <output dir> (default=fimo_out)\n"
     "     --output-pthresh <float> (default 1e-4)\n"
     "     --progress <float>\n"
     "     --score-file <score file>\n"
     "     --verbosity 1|2|3|4|5 (default=2)\n"
     "\n"
     "   Options related to selecting motifs for the model:\n"
     "     --motif <string>\n"
     "     --motif-e-thresh <float>\n"
     "     --motif-p-thresh <float>\n"
     "     --order <string>\n"
     "\n"
     "   Options related to building the model:\n"
     "     --fim\n"
     "     --gap-extend <float>\n"
     "     --gap-open <float>\n"
     "     --motif-pseudo <float>\n"
     "     --nspacer <int>\n"
     "     --spacer-pseudo <float>\n"
     "     --trans-pseudo <float>\n"
     "     --zselo\n"
     "\n"
     "   Options related to scoring:\n"
     "     --allow-weak-motifs\n"
     "     --global\n"
     "     --pam <int>\n"
     "     --paths single|all\n"
     "     --p-score <float>\n"
     "\n";

  /* Make sure various options are set to NULL or defaults. */
  options->output_dirname = "beadstring_out";
  options->meme_filename = NULL;
  options->bg_filename = NULL;
  options->score_filename = NULL;
  options->seq_filename = NULL;
  options->model_filename = NULL;
  options->order_string = NULL;
  options->allow_weak_motifs = false;
  options->allow_clobber = true;
  options->fim = false;
  options->local_scoring = true;
  //options->motif_scoring = true; // Always do this (not really an option).
  options->motif_scoring = false;
  options->synth = false;
  options->zselo = false;
  options->max_seq = -1;
  options->pam_distance = -1;
  options->paths = SINGLE;
  options->spacer_states = DEFAULT_SPACER_STATES,
  options->gap_extend = -1.0;
  options->gap_open = -1.0;
  options->e_threshold = 0.01;
  options->p_threshold = 0.0;
  options->perform_search = true;
  options->progress = 0.0;
  options->request_nums = rbtree_create(rbtree_intcmp, rbtree_intcpy, free, rbtree_intcpy, free);
  options->request_ids = rbtree_create(rbtree_strcmp, rbtree_strcpy, free, rbtree_intcpy, free);
  options->motif_e_threshold = 0.0;
  options->motif_p_threshold = 0.0;
  options->motif_pseudo = DEFAULT_MOTIF_PSEUDO;
  options->spacer_pseudo = DEFAULT_SPACER_PSEUDO;
  options->trans_pseudo = DEFAULT_TRANS_PSEUDO;
  verbosity = 2;

  simple_setopt(argc, argv, option_count, option_list);

  // Parse the command line.
  while (1) {
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;


    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
      simple_getopterror(&message);
      die("Error processing command line options (%s)\n", message);
    }

    if (strcmp(option_name, "bgfile") == 0) {
       options->bg_filename = option_value;
    } else if (strcmp(option_name, "e-thresh") == 0) {
      options->e_threshold = atof(option_value);
    } else if (strcmp(option_name, "maxseqs") == 0) {
      options->max_seq = atoi(option_value);
    } else if (strcmp(option_name, "model-file") == 0) {
      options->model_filename = option_value;
    } else if (strcmp(option_name, "no-search") == 0) {
      options->perform_search = false;
    } else if (strcmp(option_name, "o") == 0) {
      options->output_dirname = option_value;
      options->allow_clobber = false;
    } else if (strcmp(option_name, "oc") == 0) {
      options->output_dirname = option_value;
      options->allow_clobber = true;
    } else if (strcmp(option_name, "progress") == 0) {
      options->progress = atof(option_value);
    } else if (strcmp(option_name, "score-file") == 0) {
      options->score_filename = option_value;
    } else if (strcmp(option_name, "verbosity") == 0) {
      verbosity = (VERBOSE_T)atoi(option_value);

    } else if (strcmp(option_name, "motif") == 0) {
      if (!store_requested_num(options->request_nums, option_value)) {
        die("Error processing \"-motif %s\". "
            "Option value should be a (non-zero) motif position. "
            "For example the first motif in the file has the "
            "position 1 and it's reverse complement has the position -1.\n", 
            option_value);
      }
    } else if (strcmp(option_name, "motif-id") == 0) {
      if (!store_requested_id(options->request_ids, option_value)) {
        die("Error processing \"-motif-id %s\"."
            "Option value should be a motif identifer and hence "
            "not just be a + or a -.\n", option_value);
      }
    } else if (strcmp(option_name, "moitf-e-thresh") == 0) {
      options->motif_e_threshold = atof(option_value);
    } else if (strcmp(option_name, "motif-p-thresh") == 0) {
      options->motif_p_threshold = atof(option_value);
    } else if (strcmp(option_name, "order") == 0) {
      options->order_string = option_value;

    } else if (strcmp(option_name, "fim") == 0) {
      options->fim = true;
    } else if (strcmp(option_name, "gap-extend") == 0) {
      options->gap_extend = atof(option_value);
    } else if (strcmp(option_name, "gap-open") == 0) {
      options->gap_open = atof(option_value);
    } else if (strcmp(option_name, "motif-pseudo") == 0) {
      options->motif_pseudo = atof(option_value);
    } else if (strcmp(option_name, "nspacer") == 0) {
      options->spacer_states = atoi(option_value);
    } else if (strcmp(option_name, "spacer-pseudo") == 0) {
      options->spacer_pseudo = atof(option_value);
    } else if (strcmp(option_name, "trans-pseudo") == 0) {
      options->trans_pseudo = atof(option_value);
    } else if (strcmp(option_name, "zselo") == 0) {
      options->zselo = true;

    } else if (strcmp(option_name, "allow-weak-motifs") == 0) {
      options->allow_weak_motifs =  true;
    } else if (strcmp(option_name, "global") == 0) {
      options->local_scoring = false;
    } else if (strcmp(option_name, "motif-scoring") == 0) {
      options->motif_scoring = true;
    } else if (strcmp(option_name, "pam") == 0) {
      options->pam_distance = atoi(option_value);
    } else if (strcmp(option_name, "paths") == 0) {
      if (strcmp(option_value, "single") == 0) {
        options->paths = SINGLE;
      }
      else if (strcmp(option_value, "all") == 0) {
        options->paths = ALL;
      }
      else {
        fprintf(stderr, "Invalid path option: %s.\n", option_value);
        fprintf(stderr, "%s", usage);
        exit(1);
      }
    } else if (strcmp(option_name, "p-score") == 0) {
      options->p_threshold = atof(option_value);
    }
  }

  // Read the two required arguments.
  if (option_index + 2 > argc) {
    fprintf(stderr, "Missing required arguments.\n");
    fprintf(stderr, "%s", usage);
    exit(1);
  }
  options->meme_filename = argv[option_index];
  options->seq_filename = argv[option_index+1];

  //  Record full command line
  int i;
  int arg_length = strlen(argv[0]);
  int total_arg_length = arg_length;
  int buffer_length = 2 * arg_length + 1;
  options->command_line = mm_malloc(buffer_length * sizeof(char));
  strcpy(options->command_line, argv[0]);
  for (i = 1; i < argc; i++) {
    arg_length = strlen(argv[i]) + 2; // +1 for leading space, +1 for trailing null
    total_arg_length += arg_length;
    if (total_arg_length > buffer_length) {
      buffer_length = 2 * total_arg_length + 1;
      options->command_line =
        mm_realloc(options->command_line, buffer_length * sizeof(char));
    }
    strcat(options->command_line, " ");
    strcat(options->command_line, argv[i]);
  }
}

/*************************************************************************
 *  Entry point for program.
 *************************************************************************/
int main(int argc, char *argv[]) {

  OPTIONS_T options;
  RBNODE_T *node;
  MHMM_T *the_hmm = NULL;

  process_command_line(argc, argv, &options);

  if (options.model_filename != NULL ) {

    if (!file_exists(options.model_filename)) {
      die("Couldn't find the model file %s.\n", options.model_filename);
    }

    // Read the HMM.
    read_mhmm(options.model_filename, &the_hmm);

    // Check that the HMM has linear topology
    if (the_hmm->type != LINEAR_HMM) {
      die(
        "beadstring requires a linear topology HMM.\n"
        "The HMM in %s has a %s topology.\n",
        options.model_filename,
        HMM_STRS[the_hmm->type]
      );
    }
  }
  else {
    // If a model file was not specifed, we'll
    // build the HMM from motif occurrences.
    the_hmm = build_linear_model(&options);
  }

  // Create cisml data structure for recording results
  CISML_T *cisml = allocate_cisml(
    "beadstring",
    options.meme_filename,
    options.seq_filename
  );
  PATTERN_T *pattern = allocate_pattern("Beadstring linear HMM","Beadstring linear HMM" );
  add_cisml_pattern(cisml, pattern);

  // Perform the actual search.
  bool got_evd = false;
  if (options.perform_search) {
    search_with_model(the_hmm, &got_evd, &options, pattern);
  }

  print_beadstring_results(the_hmm, got_evd, &options, cisml);

  // Print the program parameters.
  printf("Program parameters for beadstring\n");
  printf("  MEME file: %s\n", options.meme_filename);
  printf("  Motif nums:");
  for (node = rbtree_first(options.request_nums); node != NULL; node = rbtree_next(node)) {
    int motif_num = *((int*)rbtree_key(node));
    int strands = *((int*)rbtree_value(node));
    if (strands & MOTIF_PLUS) {
      printf(" +%d", motif_num);
    }
    if (strands & MOTIF_MINUS) {
      printf(" -%d", motif_num);
    }
  }
  printf("\n");
  printf("  Motif IDs:");
  for (node = rbtree_first(options.request_ids); node != NULL; node = rbtree_next(node)) {
    char *motif_id = (char*)rbtree_key(node);
    int strands = *((int*)rbtree_value(node));
    if (strands & MOTIF_PLUS) {
      printf(" +%s", motif_id);
    }
    if (strands & MOTIF_MINUS) {
      printf(" -%s", motif_id);
    }
  }
  printf("\n");
  printf("  Model topology: linear\n");
  printf("  States per spacer: %d\n", options.spacer_states);
  printf(
    "  Spacers are free-insertion modules: %s\n",
    options.fim ? "true" : "false"
  );
  printf("\n");

  free_mhmm(the_hmm);
  myfree(options.command_line);
  rbtree_destroy(options.request_nums);
  rbtree_destroy(options.request_ids);

  return(0);
}

