/**************************************************************************
 * FILE: mhmms.h
 * AUTHOR: William Stafford Noble and Timothy L. Bailey
 * CREATE DATE: 8-13-97
 * PROJECT: MHMM
 * VERSION: $Revision 1.10$
 * DESCRIPTION: Search a database of sequences using a motif-based HMM.
 **************************************************************************/
#ifndef MHMMS_H
#define MHMMS_H

#include "cisml.h"
#include "heap.h"
#include "match.h"
#include "seq.h"

#define DEFAULT_OUTPUT_WIDTH 132  // Number of chars in output lines.
#define DEFAULT_E_THRESHOLD 10.0  // Default E-value threshold.
#define DEFAULT_DP_THRESHOLD 10.0 // Threshold for repeated matches.

#define EVD_NUM_SEQS 100000        // Number of sequences required for EVD.
#define NO_MAX_SEQS -1     // Value indicating maxseqs not set

// Print progress report every N iterations.
#define DEFAULT_PROGRESS_EVERY 1000

/*************************************************************************
 * Return the value of num_stored.
 *************************************************************************/
int get_num_stored();

/*************************************************************************
 * Print a program's parameters and CPU time.
 *************************************************************************/
void print_parameters(
  char* const argv[],                   // command line
  int argc,                             // number of fields in command line
  char* name,                           // program name
  ALPH_T* alph,                          // alphabet
  char* hmm_filename,                   // hmm input file name
  char* seq_filename,                   // sequence file name
  BOOLEAN_T viterbi,                    // Viterbi search?
  double dp_threshold,                  // Repeated match threshold
  BOOLEAN_T motif_scoring,              // No partial motifs?
  BOOLEAN_T use_pvalues,                // Convert matches to p-values?
  double p_threshold,                   // P-value threshold for motif hits.
  double e_threshold,                   // E-value threshold for scores.
  BOOLEAN_T both_strands,               // Score both DNA strands?
  char* bg_filename,                    // Background file name.
  char* sc_filename,                    // Score file name.
  int pam_distance,                     // PAM distance,
  double beta,                          // Pseudo-weight.
  BOOLEAN_T zero_spacer_emit_lo,        // Zero emission for spacers?
  int max_gap,				// Maximum gap.
  double egcost,			// Cost factor for gaps.
  double gap_open,                      // Gap open penalty.
  double gap_extend,                    // Gap extend penalty.
  BOOLEAN_T print_fancy,                // Print alignments?
  BOOLEAN_T print_time,                 // Print CPU time?
  double start_time,                    // Starting time
  double end_time,                      // Ending time
  int num_seqs,				// Number of sequences processed.
  EVD_SET evd_set,			// EVD data structure.
  SCORE_SET score_set,			// Scores.
  FILE* out_stream                      // Output stream.
);

/*************************************************************************
 * Given the desired output width, calculate the width of one line of
 * an alignment.
 *************************************************************************/
int compute_align_width
  (int output_width);

/*************************************************************************
 * Figure out which positions to start and end the alignment.
 *
 * We are given the length of this sequence segment, its offset
 * relative to the entire sequence, the width of the desired
 * alignment, and the start and end positions of the match within this
 * segment.  The function converts the start and end positions from
 * segment-relative to sequence-relative coordinates.  Using these new
 * positions, the function computes the start position of the row of
 * the alignment that would contain the start of this match, and the
 * end position of the row of the alignment that would contain the end
 * of this match.  These coordinates are then converted back into
 * segment-relative coordinates.
 *************************************************************************/
void compute_alignment_start_end(
  int  align_width,
  int  seq_length,
  int  seq_offset,
  int  start_match,
  int  end_match,
  int* align_start,
  int* align_end
);

/*************************************************************************
 * Given a sequence, model and traceback, generate a sequence-to-model
 * match.
 *
 * Side effect: replaces the flanking Xs in the sequence with spaces.
 *************************************************************************/
void generate_match_sequence
  (MHMM_T*  the_log_hmm,    // Log-odds version of the HMM.
   int      trace_start,    // Position in sequence to start trace at.
   int      trace_end,	    // Position in sequence to end trace at.
   MATCH_T* traceback,      // The path through the model.
   MATRIX_T* motif_score_matrix, // Motif score matrix.
   PROB_T   p_threshold,    // P-value threshold for motif hits.
   SEQ_T*   sequence,       // The sequence as characters.
   char*    score_sequence, // Motif scores printed above the motif IDs.
   char*    id_sequence,    // Motif IDs printed above the sequence.
   char*    model_sequence, // Consensus chars along path.
   char*    match_sequence);// Indications of matches between sequence & model.

/*************************************************************************
 * Store a sequence's scores, ID and description for later printing.
 *************************************************************************/
void store_sequence
  (ALPH_T*   alph,           // Alphabet
   BOOLEAN_T motif_scoring,  // Motif-scoring being used.
   BOOLEAN_T mhmmscan,       // Print mhmmscan output (else mhmms output)?
   int       output_width,   // Width in chars of the output.
   SEQ_T*    sequence,       // Info about the sequence. 
   MATCH_T*  this_match,     // Info about this match.
   PROB_T    e_threshold,    // E-value reporting threshold. 
   double    dp_threshold,   // Subtract from viterbi score.
   PROB_T    p_threshold,    // P-value threshold for motif hits.
   SCORE_SET *score_set,     // Scores and lengths for computing distribution.
   BOOLEAN_T got_evd,	     // EVD is available 
   EVD_SET   evd_set,	     // EVD
   BOOLEAN_T store_trace,    // Store the Viterbi trace?
   MHMM_T*   the_log_hmm,    // The HMM.
   MATRIX_T* motif_score_matrix, // Motif score matrix.
   BOOLEAN_T store_gff,     // Store a GFF entry?
   SCANNED_SEQUENCE_T *scanned_seq // CISML scanned sequence structure
);

/*************************************************************************
 * Given a sequence, a model, and a match, print the sequence-to-model
 * alignment in BLAST-style format.
 *
 * The formatted output is returned as a single string, which must
 * be freed by the caller.
 *************************************************************************/
void print_alignment
  (int      align_width,   // Number of characters in one alignment line.
   MHMM_T*  the_log_hmm,   // Log-odds version of the HMM.
   MATCH_T* this_match,	   // The match.
   SEQ_T*   sequence,      // The sequence as characters.
   MATRIX_T* motif_score_matrix, // Motif score matrix.
   PROB_T    p_threshold,  // P-value threshold for motif hits.
   char**   outstring);    // Output string.

/*************************************************************************
 * Sort the stored list of sequences, scores and IDs, and print them
 * out.
 *************************************************************************/
void sort_and_print_scores
  (BOOLEAN_T print_fancy,  // Include Viterbi alignment? 
   BOOLEAN_T print_header, // Print header? 
   BOOLEAN_T got_evd,      // EVD found? 
   BOOLEAN_T motif_scoring,// Motif-scoring being used.
   BOOLEAN_T mhmmscan,     // Print in mhmmscan format (else mhmms)?
   int       maxseqs,     // Maximum number of sequences to print
   int       output_width, // Width of output, in chars.
   PROB_T    threshold,    // Print sequences scoring below this. 
   BOOLEAN_T sort_output,  // Sort scores? 
   FILE*     gff_file,     // Auxiliary GFF file.
   FILE*     outfile);     // Print to this file. 

/*************************************************************************
 * Calculate the score distribution.
 *************************************************************************/
EVD_SET calc_distr(
  SCORE_SET score_set,    // Set of scores and lengths.
  DISTR_T dtype,          // Type of score distribution
  BOOLEAN_T match_evalues // Use match E-values?
);

/*************************************************************************
 * Calculate the E-values of sequence scores and store them as the keys.
 *************************************************************************/
void calc_evalues (
  EVD_SET *evd_set,	// EVD parameters.
  int n,         	// Number of sequences or matches.
  int q,                // Length of query.
  int t                 // Use as length of targets if non-zero.
);

#endif
