/* Unaligned sequence file i/o.
 * 
 * Contents:
 *    1. An <ESL_SQFILE> object, text mode.
 *    2. An <ESL_SQFILE> object, digital mode. [with <alphabet>]
 *    3. Sequence reading.
 *    4. Sequence writing.
 *    5. Miscellaneous routines.
 *    6. Sequence/subsequence fetching, random access [with <ssi>]
 *    7. Sequence database caching.
 *    8. Internal functions.
 *    9. Benchmark driver.
 *   10. Unit tests.
 *   11. Test driver.
 *   12. Examples.
 * 
 * This module shares remote evolutionary homology with Don Gilbert's
 * seminal, public domain ReadSeq package, though the last common
 * ancestor was circa 1991 and no recognizable vestiges are likely to
 * remain. Thanks Don!
 *
 */
#include "esl_config.h"

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#ifdef HAVE_STRINGS_H
#include <strings.h>		/* POSIX strcasecmp() */
#endif

#include "easel.h"
#include "esl_alphabet.h"
#include "esl_msa.h"
#include "esl_msafile.h"
#include "esl_sqio.h"
#include "esl_sq.h"
#include "esl_sqio_ascii.h"
#include "esl_sqio_ncbi.h"

static int convert_sq_to_msa(ESL_SQ *sq, ESL_MSA **ret_msa);


/*****************************************************************
 *# 1. An <ESL_SQFILE> object, in text mode.
 *****************************************************************/ 

static int  sqfile_open(const char *filename, int format, const char *env, ESL_SQFILE **ret_sqfp);

/* Function:  esl_sqfile_Open()
 * Synopsis:  Open a sequence file <filename> for reading. 
 *
 * Purpose:   Open a sequence file <filename> for reading. 
 *            The opened <ESL_SQFILE> is returned through <ret_sqfp>.
 * 
 *            The format of the file is asserted to be <format> (for
 *            example, <eslSQFILE_FASTA>). If <format> is
 *            <eslSQFILE_UNKNOWN> then the routine attempts to
 *            autodetect the file format.
 *            
 *            If <env> is non-NULL, it is the name of an environment
 *            variable that contains a colon-delimited list of
 *            directories in which we may find this <filename>.
 *            For example, if we had 
 *            <setenv BLASTDB /nfs/db/blast-db:/nfs/db/genomes/>
 *            in the environment, a database search application
 *            could pass "BLASTDB" as <env>.
 *            
 * Returns:   <eslOK> on success, and <*ret_sqfp> points to a new
 *            open <ESL_SQFILE>. Caller deallocates this object with
 *            <esl_sqfile_Close()>. 
 *            
 *            Returns <eslENOTFOUND> if <filename> can't be opened.
 *            
 *            Returns <eslEFORMAT> if the file is empty, or
 *            if autodetection is attempted and the format can't be
 *            determined.  
 *
 *            On any error condition, <*ret_sqfp> is returned NULL.
 *             
 * Throws:    <eslEMEM> on allocation failure.
 */
int
esl_sqfile_Open(const char *filename, int format, const char *env, ESL_SQFILE **ret_sqfp)
{
  return sqfile_open(filename, format, env, ret_sqfp);
}


/* Function:  esl_sqfile_Close()
 * Synopsis:  Close a sequence file.
 *
 * Purpose:   Closes an open <sqfp>.
 *
 * Returns:   (void).
 */
void
esl_sqfile_Close(ESL_SQFILE *sqfp)
{
  if (sqfp == NULL) return;

  if (sqfp->close != NULL)    sqfp->close(sqfp);
  if (sqfp->filename != NULL) free(sqfp->filename);
  free(sqfp);

  return;
}


/* sqfile_open():
 * This is the routine that actually opens an ESL_SQFILE.
 * esl_sqfile_Open() and esl_sqfile_OpenDigital() are
 * small wrappers around it.
 */
static int
sqfile_open(const char *filename, int format, const char *env, ESL_SQFILE **ret_sqfp)
{
  ESL_SQFILE *sqfp    = NULL;
  int         status;		/* return status from an ESL call */
  int         n;

  char       *s1;
  char       *s2;
  char       *list  = NULL;
  char       *path  = NULL;

  ESL_ALLOC(sqfp, sizeof(ESL_SQFILE));
  *ret_sqfp          = NULL;

  sqfp->filename     = NULL;

  sqfp->do_digital   = FALSE;
  sqfp->abc          = NULL;

  sqfp->format       = format;

  /* initialize the function pointers to NULL */
  sqfp->position          = NULL;
  sqfp->close             = NULL;

  sqfp->set_digital       = NULL;
  sqfp->guess_alphabet    = NULL;

  sqfp->is_rewindable     = NULL;

  sqfp->read              = NULL;
  sqfp->read_info         = NULL;
  sqfp->read_seq          = NULL;
  sqfp->read_window       = NULL;
  sqfp->echo              = NULL;

  sqfp->read_block        = NULL;

  sqfp->open_ssi          = NULL;
  sqfp->pos_by_key        = NULL;
  sqfp->pos_by_number     = NULL;

  sqfp->fetch             = NULL;
  sqfp->fetch_info        = NULL;
  sqfp->fetch_subseq      = NULL;

  sqfp->get_error         = NULL;

  /* save the user supplied file name */
  ESL_ALLOC(sqfp->filename, sizeof(char) * (strlen(filename) + 1));
  strcpy(sqfp->filename, filename);

  /* we need to process the list of directories starting with the local
   * directory followed by the list in env one directory at a time 
   * passing the path to the different sequence parsers until we get a hit.
   */
  if (strcmp(filename, "-") == 0) { /* stdin special case */
    if ((status = esl_strdup(filename, -1, &path)) != eslOK) goto ERROR;
    if ((status = esl_sqascii_Open(path, sqfp->format, sqfp)) != eslOK) goto ERROR;
  } else {

    /* check the local directory first */
    status = eslENOTFOUND;

    if (format == eslSQFILE_NCBI && status == eslENOTFOUND)
      status = esl_sqncbi_Open(sqfp->filename, sqfp->format, sqfp);

    if (status == eslENOTFOUND)
      status = esl_sqascii_Open(sqfp->filename, sqfp->format, sqfp);

    /* if it's not there, then check in directory list provided by <env>. */
    if (status == eslENOTFOUND && env != NULL) {
      if ((s1 = getenv(env)) == NULL) { status = eslENOTFOUND; goto ERROR; }
      ESL_ALLOC(list, sizeof(char) * (strlen(s1) + 1));
      strcpy(list + 2, s1);

      ESL_ALLOC(path, sizeof(char) * (strlen(filename) + strlen(list) + 3));

      s1 = list;
      while (s1 != NULL && status == eslENOTFOUND) {
	if ((s2 = strchr(s1, ':')) != NULL) { *s2 = '\0'; s2++;}
	n = strlen(s1);
	strcpy(path, s1);
	path[n] = eslDIRSLASH;
	strcpy(path+n+1, filename);
	s1 = s2;


	if (format == eslSQFILE_NCBI && status == eslENOTFOUND)
	  status = esl_sqncbi_Open(path, sqfp->format, sqfp);

	if (status == eslENOTFOUND)
	  status = esl_sqascii_Open(path, sqfp->format, sqfp);
      }
    }
  }

  if (status != eslOK) goto ERROR;

  if (list != NULL) free(list);
  if (path != NULL) free(path);

  *ret_sqfp = sqfp;
  return eslOK;

 ERROR:
  esl_sqfile_Close(sqfp); 
  if (list != NULL) free(list);
  if (path != NULL) free(path);
  *ret_sqfp = NULL;
  return status;
}
/*------------------- ESL_SQFILE open/close -----------------------*/



/*****************************************************************
 *# 2. An <ESL_SQFILE> object, in digital mode [with <alphabet>]
 *****************************************************************/

/* Function:  esl_sqfile_OpenDigital()
 * Synopsis:  Open an <ESL_SQFILE> for digital input.
 *
 * Purpose:   Same as <esl_sqfile_Open()>, but we will expect all
 *            sequence input to conform to the digital alphabet <abc>.
 *            
 *            Normally, after opening the sequence file in digital
 *            mode, you'd read sequence into a digital <ESL_SQ>.
 *            However, you don't actually have to. The state of the
 *            <ESL_SQ> controls how the input is stored; the state of
 *            the <ESL_SQFILE> controls how the input is validated.
 *            
 * Returns:   <eslOK> on success, and <*ret_sqfp> points to a new
 *            open <ESL_SQFILE>.
 *            
 *            Returns <eslENOTFOUND> if <filename> can't be opened.
 *            Returns <eslEFORMAT> if the file is empty, or if
 *            autodetection is attempted and the format can't be
 *            determined.  On any error conditions, <*ret_sqfp> is
 *            returned NULL.
 *             
 * Throws:    <eslEMEM> on allocation failure.
 */
int
esl_sqfile_OpenDigital(const ESL_ALPHABET *abc, const char *filename, int format, const char *env, ESL_SQFILE **ret_sqfp)
{
  int status;

  if ((status = sqfile_open(filename, format, env, ret_sqfp)) != eslOK) return status;
  return esl_sqfile_SetDigital(*ret_sqfp, abc);
}

/* Function:  esl_sqfile_SetDigital()
 * Synopsis:  Set an open <ESL_SQFILE> to read in digital mode.
 *
 * Purpose:   Given an <ESL_SQFILE> that's already been opened,
 *            configure it to expect subsequent input to conform
 *            to the digital alphabet <abc>.
 *            
 *            Calling <esl_sqfile_Open(); esl_sqfile_SetDigital()> is
 *            equivalent to <esl_sqfile_OpenDigital()>. The two-step
 *            version is useful when you need a
 *            <esl_sqfile_GuessAlphabet()> call in between, guessing
 *            the file's alphabet in text mode before you set it to
 *            digital mode.
 *
 * Returns:   <eslOK> on success.
 */
int
esl_sqfile_SetDigital(ESL_SQFILE *sqfp, const ESL_ALPHABET *abc)
{
  sqfp->set_digital(sqfp, abc);

  sqfp->do_digital = TRUE;
  sqfp->abc        = abc;
  return eslOK;
}


/* Function:  esl_sqfile_GuessAlphabet()
 * Synopsis:  Guess the alphabet of an open <ESL_SQFILE>.
 *
 * Purpose:   After opening <sqfp>, attempt to guess what alphabet
 *            its sequences are in, by inspecting the first sequence
 *            in the file, and return this alphabet type in <*ret_type>.
 *
 * Returns:   <eslOK> on success, and <*ret_type> is set to <eslDNA>,
 *            <eslRNA>, or <eslAMINO>.
 *            
 *            Returns <eslENOALPHABET> if the alphabet can't be
 *            reliably guessed.
 *            
 *            Returns <eslEFORMAT> if a parse error is encountered.
 *            Call <esl_sqfile_GetErrorBuf()> to get a ptr to a
 *            user-directed error message describing the problem,
 *            including the line number on which it was found.
 *            
 *            Returns <eslENODATA> if the file appears to be empty.
 *
 *            On any error, <*ret_type> is <eslSQFILE_UNKNOWN>.
 *
 * Throws:    <eslEMEM> on allocation error;
 *            <eslEINCONCEIVABLE> on unimaginable internal errors.
 */
int
esl_sqfile_GuessAlphabet(ESL_SQFILE *sqfp, int *ret_type)
{
  return sqfp->guess_alphabet(sqfp, ret_type);
}

/*-------------- end, digital mode ESL_SQFILE -------------------*/



/*****************************************************************
 *# 3. Sequence reading (sequential)
 *****************************************************************/ 

/* Function:  esl_sqio_Read()
 * Synopsis:  Read the next sequence from a file.
 *
 * Purpose:   Reads the next sequence from open sequence file <sqfp> into 
 *            <sq>. Caller provides an allocated and initialized <sq>, which
 *            will be internally reallocated if its space is insufficient.
 *
 * Returns:   <eslOK> on success; the new sequence is stored in <sq>.
 * 
 *            Returns <eslEOF> when there is no sequence left in the
 *            file (including first attempt to read an empty file).
 * 
 *            Returns <eslEFORMAT> if a parse error is encountered.
 *            Call <esl_sqfile_GetErrorBuf()> to get a ptr to a
 *            user-directed error message describing the problem,
 *            including the line number on which it was found.
 *
 * Throws:    <eslEMEM> on allocation failure;
 *            <eslEINCONCEIVABLE> on internal error.
 */
int
esl_sqio_Read(ESL_SQFILE *sqfp, ESL_SQ *sq)
{
  return sqfp->read(sqfp, sq);
}


/* Function:  esl_sqio_ReadInfo()
 * Synopsis:  Read sequence info, but not the sequence itself.
 *
 * Purpose:   Read the next sequence from open sequence file <sqfp>,
 *            but don't store the sequence (or secondary structure).
 *            Upon successful return, <s> holds all the available 
 *            information about the sequence -- its name, accession,
 *            description, and overall length <sq->L>. 
 *            
 *            This is useful for indexing sequence files, where
 *            individual sequences might be ginormous, and we'd rather
 *            avoid reading complete seqs into memory.
 *
 * Returns:   <eslOK> on success.
 */
int
esl_sqio_ReadInfo(ESL_SQFILE *sqfp, ESL_SQ *sq)
{
  return sqfp->read_info(sqfp, sq);
}


/* Function:  esl_sqio_ReadSequence()
 * Synopsis:  Read sequence, but not the header itself.
 *
 * Purpose:   Read the next sequence from open sequence file <sqfp>,
 *            skipping over the header data.  Upon successful return, 
 *            <s> holds just the sequece data.  File offsets will be
 *            filled in.
 *            
 *            This is useful fast reads of binary formats where the
 *            header information and sequences are stored in different
 *            files.
 *
 * Returns:   <eslOK> on success.
 */
int
esl_sqio_ReadSequence(ESL_SQFILE *sqfp, ESL_SQ *sq)
{
  return sqfp->read_seq(sqfp, sq);
}


/* Function:  esl_sqio_ReadWindow()
 * Synopsis:  Read next window of sequence.
 *
 * Purpose:   Read a next window of <W> residues from open file <sqfp>,
 *            keeping <C> residues from the previous window as
 *            context, and keeping previous annotation in the <sq>
 *            as before. 
 *            
 *            If this is the first window of a new sequence record,
 *            <C> is ignored (there's no previous context yet), and
 *            the annotation fields of the <sq> (name, accession, and
 *            description) are initialized by reading the sequence
 *            record's header. This is the only time the annotation
 *            fields are initialized.
 *            
 *            On return, <sq->dsq[]> contains the window and its
 *            context; residues <1..sq->C> are the previous context,
 *            and residues <sq->C+1..sq->n> are the new window.  The
 *            start and end coordinates of the whole <dsq[1..n]>
 *            (including context) in the original source sequence are
 *            <sq->start..sq->end>. (Or, for text mode sequences,
 *            <sq->seq[0..sq->C-1,sq->C..sq->n-1]>, while <start> and
 *            <end> coords are still <1..L>.)
 *
 *            When a sequence record is completed and no more data
 *            remain, <eslEOD> is returned, with an ``info'' <sq>
 *            structure (containing the annotation and the total
 *            sequence length <L>, but no sequence). (The total
 *            sequence length <L> is unknown in <sq> until this
 *            <eslEOD> return.)
 *            
 *            The caller may then do one of two things before calling
 *            <esl_sq_ReadWindow()> again; it can reset the sequence
 *            with <esl_sq_Reuse()> to continue reading the next
 *            sequence in the file, or it can set a negative <W> as a
 *            signal to read windows from the reverse complement
 *            (Crick) strand. Reverse complement reading only works
 *            for nucleic acid sequence. 
 *            
 *            If you read the reverse complement strand, you must read
 *            the whole thing, calling <esl_sqio_ReadWindow()> with
 *            negative <W> windows until <eslEOD> is returned again
 *            with an empty (info-only) <sq> structure. When that
 *            <EOD> is reached, the <sqfp> is repositioned at the
 *            start of the next sequence record; the caller should now
 *            <Reuse()> the <sq>, and the next <esl_sqio_ReadWindow()>
 *            call must have a positive <W>, corresponding to starting
 *            to read the Watson strand of the next sequence.
 *
 *            Note that the <ReadWindow()> interface is designed for
 *            an idiom of sequential reading of complete sequences in
 *            overlapping windows, possibly on both strands; if you
 *            want more freedom to move around in the sequence
 *            grabbing windows in another order, you can use the
 *            <FetchSubseq()> interface.
 *
 *            Reading the reverse complement strand requires file
 *            repositioning, so it will not work on non-repositionable
 *            streams like gzipped files or a stdin pipe. Moreover,
 *            for reverse complement input to be efficient, the
 *            sequence file should have consistent line lengths, 
 *            suitable for SSI's fast subsequence indexing.
 *            
 * Returns:   <eslOK> on success; <sq> now contains next window of
 *            sequence, with at least 1 new residue. The number
 *            of new residues is <sq->W>; <sq->C> residues are 
 *            saved from the previous window. Caller may now
 *            process residues <sq->dsq[sq->C+1]..sq->dsq[sq->n]>.
 *            
 *            <eslEOD> if no new residues were read for this sequence
 *            and strand, and <sq> now contains an empty info-only
 *            structure (annotation and <L> are valid). Before calling
 *            <esl_sqio_ReadWindow()> again, caller will either want
 *            to make <W> negative (to start reading the Crick strand
 *            of the current sequence), or it will want to reset the
 *            <sq> (with <esl_sq_Reuse()>) to go on the next sequence.
 *            
 *            <eslEOF> if we've already returned <eslEOD> before to
 *            signal the end of the previous seq record, and moreover,
 *            there's no more sequence records in the file.
 *            
 *            <eslEINVAL> if an invalid residue is found in the
 *            sequence, or if you attempt to take the reverse
 *            complement of a sequence that can't be reverse
 *            complemented.
 *
 * Throws:    <eslESYNTAX> if you try to read a reverse window before
 *            you've read forward strand.
 *            
 *            <eslECORRUPT> if something goes awry internally in the
 *            coordinate system.
 *            
 *            <eslEMEM> on allocation error.
 */
int
esl_sqio_ReadWindow(ESL_SQFILE *sqfp, int C, int W, ESL_SQ *sq)
{
  return sqfp->read_window(sqfp, C, W, sq);
}

/* Function:  esl_sqio_ReadBlock()
 * Synopsis:  Read the next block of sequences from a file.
 *
 * Purpose:   Reads a block of sequences from open sequence file <sqfp> into 
 *            <sqBlock>.
 *
 * Returns:   <eslOK> on success; the new sequence is stored in <sqBlock>.
 * 
 *            Returns <eslEOF> when there is no sequence left in the
 *            file (including first attempt to read an empty file).
 * 
 *            Returns <eslEFORMAT> if a parse error is encountered.
 *            Call <esl_sqfile_GetErrorBuf()> to get a ptr to a
 *            user-directed error message describing the problem,
 *            including the line number on which it was found.
 *
 * Throws:    <eslEMEM> on allocation failure;
 *            <eslEINCONCEIVABLE> on internal error.
 */
int
esl_sqio_ReadBlock(ESL_SQFILE *sqfp, ESL_SQ_BLOCK *sqBlock, int max_residues, int max_sequences, int max_init_window, int long_target)
{
  return sqfp->read_block(sqfp, sqBlock, max_residues, max_sequences, max_init_window, long_target);
}

/* Function:  esl_sqio_Parse()
 * Synopsis:  Parse a sequence already read into a buffer.
 *
 * Purpose:   Parse the buffer <buf> for a sequence <s> of type
 *            <format>.  The buffer must contain the entire sequence.
 *            
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM>  on allocation error.
 *            <eslEFORMAT>  on parsing error.
 *            <eslEINVAL> on unsupported format.
 */
int
esl_sqio_Parse(char *buf, int size, ESL_SQ *s, int format)
{
  int status;

  switch (format) {
  case eslSQFILE_EMBL:     
  case eslSQFILE_UNIPROT:  
  case eslSQFILE_GENBANK:  
  case eslSQFILE_DDBJ:     
  case eslSQFILE_FASTA:    
  case eslSQFILE_DAEMON:   
    status = esl_sqascii_Parse(buf, size, s, format);

    break;
  default: 
    ESL_EXCEPTION(eslEINVAL, "can't parse that format");
  }
  return status;
}
/*------------------ end, sequential sequence input -------------*/



/*****************************************************************
 *# 4. Writing sequences.
 *****************************************************************/

/* Function:  esl_sqio_Write()
 * Synopsis:  Write a sequence to a file.
 *
 * Purpose:   Write sequence <s> to an open FILE <fp> in file format
 *            <format>.
 * 
 *            If <update> is <TRUE>, set the offsets for sequence <s>.
 *            
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation error.
 *            <eslEWRITE> on system write error, such as filled disk.
 */
int
esl_sqio_Write(FILE *fp, ESL_SQ *s, int format, int update)
{
  ESL_MSA *msa;
  int status;

  if (esl_sqio_IsAlignment(format))
    {
      if ((status = convert_sq_to_msa(s, &msa)) != eslOK) return status;
      status = esl_msafile_Write(fp, msa, format);
      esl_msa_Destroy(msa);
      return status;
    }

  switch (format) {
  case eslSQFILE_FASTA:   
  case eslSQFILE_HMMPGMD:
    status = esl_sqascii_WriteFasta(fp, s, update); 
    break;
  default: 
    ESL_EXCEPTION(eslEINCONCEIVABLE, "can't write that format");
  }
  return status;
}

/* Function:  esl_sqio_Echo()
 * Synopsis:  Echo a sequence's record onto output stream.
 *
 * Purpose:   Given a complete <sq> that we have read by some means
 *            from an open <sqfp>; echo that sequence's record
 *            onto the output stream <ofp>. 
 *
 *            This allows records to be regurgitated exactly as they
 *            appear, rather than writing the subset of information
 *            stored in an <ESL_SQ>. <esl-sfetch> in the miniapps uses
 *            this, for example.
 *            
 *            Because this relies on repositioning the <sqfp>, it
 *            cannot be called on non-positionable streams (stdin or
 *            gzipped files). Because it relies on the sequence lying
 *            in a contiguous sequence of bytes in the file, it cannot
 *            be called on a sequence in a multiple alignment file.
 *            Trying to do so throws an <eslEINVAL> exception.
 *            
 * Returns:   <eslOK> on success.
 * 
 * Throws:    <eslEINVAL>   if <sqfp> isn't a repositionable sequence file.
 *            <eslECORRUPT> if we run out of data, probably from bad offsets
 *            <eslEMEM>     on allocation failure.
 *            <eslESYS>     on system call failures.
 *            
 *            
 */
int
esl_sqio_Echo(ESL_SQFILE *sqfp, const ESL_SQ *sq, FILE *ofp)
{
  return sqfp->echo(sqfp, sq, ofp);
}
/*----------------- end, writing sequences  ---------------------*/



/*****************************************************************
 *# 5. Miscellaneous routines 
 *****************************************************************/ 

/* Function:  esl_sqfile_GetErrorBuf()
 * Synopsis:  Return the error buffer
 *
 * Purpose:   Returns the pointer to the error buffer.
 *            Each parser is responsible for formatting
 *            a zero terminated string describing the
 *            error condition.
 *
 * Returns:   A pointer the error message.
 */
const char *
esl_sqfile_GetErrorBuf(const ESL_SQFILE *sqfp)
{
  return sqfp->get_error(sqfp);
}


/* Function:  esl_sqfile_IsRewindable()
 * Synopsis:  Return <TRUE> if <sqfp> can be rewound.
 *
 * Purpose:   Returns <TRUE> if <sqfp> can be rewound (positioned 
 *            to an offset of zero), in order to read it a second
 *            time.
 */
int
esl_sqfile_IsRewindable(const ESL_SQFILE *sqfp)
{
  return sqfp->is_rewindable(sqfp);
}

/* Function:  esl_sqio_IsAlignment()
 * Synopsis:  Return TRUE for alignment file format codes.
 *
 * Purpose:   Returns TRUE if <fmt> is an alignment file
 *            format code; else returns FALSE.
 *            
 *            This function only checks the convention
 *            that <fmt> codes $<$100 are unaligned formats,
 *            and $\geq$100 are aligned formats. It does
 *            not check that <fmt> is a recognized format
 *            code.
 */
int
esl_sqio_IsAlignment(int fmt)
{
  return (fmt >= 100 ? TRUE : FALSE);
}


/* Function:  esl_sqio_EncodeFormat()
 * Synopsis:  Convert a string to an internal format code.
 *
 * Purpose:   Given <fmtstring>, return format code.  For example, if
 *            <fmtstring> is "fasta", returns <eslSQFILE_FASTA>. Returns 
 *            <eslSQFILE_UNKNOWN> if <fmtstring> doesn't exactly match a 
 *            known format.
 *            
 *            Matching is case insensitive; fasta, FASTA, and FastA
 *            all return <eslSQFILE_FASTA>, for example.
 */
int
esl_sqio_EncodeFormat(char *fmtstring)
{
  if (strcasecmp(fmtstring, "fasta")     == 0) return eslSQFILE_FASTA;
  if (strcasecmp(fmtstring, "embl")      == 0) return eslSQFILE_EMBL;
  if (strcasecmp(fmtstring, "genbank")   == 0) return eslSQFILE_GENBANK;
  if (strcasecmp(fmtstring, "ddbj")      == 0) return eslSQFILE_DDBJ;
  if (strcasecmp(fmtstring, "uniprot")   == 0) return eslSQFILE_UNIPROT;
  if (strcasecmp(fmtstring, "ncbi")      == 0) return eslSQFILE_NCBI;
  if (strcasecmp(fmtstring, "daemon")    == 0) return eslSQFILE_DAEMON;
  if (strcasecmp(fmtstring, "hmmpgmd")   == 0) return eslSQFILE_HMMPGMD;
  if (strcasecmp(fmtstring, "fmindex")   == 0) return eslSQFILE_FMINDEX;
  return esl_msafile_EncodeFormat(fmtstring);
}

/* Function:  esl_sqio_DecodeFormat()
 * Synopsis:  Returns descriptive string for file format code.
 *
 * Purpose:   Given a format code <fmt>, returns a string label for
 *            that format. For example, if <fmt> is <eslSQFILE_FASTA>,
 *            returns "FASTA". 
 */
char *
esl_sqio_DecodeFormat(int fmt)
{
  if (esl_sqio_IsAlignment(fmt)) return esl_msafile_DecodeFormat(fmt);

  switch (fmt) {
  case eslSQFILE_UNKNOWN:    return "unknown";
  case eslSQFILE_FASTA:      return "FASTA";
  case eslSQFILE_EMBL:       return "EMBL";
  case eslSQFILE_GENBANK:    return "GenBank";
  case eslSQFILE_DDBJ:       return "DDBJ";
  case eslSQFILE_UNIPROT:    return "UniProt";
  case eslSQFILE_NCBI:       return "NCBI";
  case eslSQFILE_DAEMON:     return "daemon";
  case eslSQFILE_HMMPGMD:    return "hmmpgmd";
  case eslSQFILE_FMINDEX:    return "fmindex";
  default:                   break;
  }
  esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__,  "no such sqio format code %d", fmt);
  return NULL;
}


/* Function:  esl_sqfile_Position()
 * Synopsis:  Reposition an open sequence file to an offset.
 *
 * Purpose:   Reposition an open <sqfp> to offset <offset>.
 *            <offset> would usually be the first byte of a
 *            desired sequence record.
 *            
 *            Only normal sequence files can be positioned to a
 *            nonzero offset. If <sqfp> corresponds to a standard
 *            input stream or gzip -dc stream, it may not be
 *            repositioned. If <sqfp> corresponds to a multiple
 *            sequence alignment file, the only legal <offset>
 *            is 0, to rewind the file to the beginning and 
 *            be able to read the entire thing again.
 *            
 *            After <esl_sqfile_Position()> is called on a nonzero
 *            <offset>, and other bookkeeping information is unknown.
 *            If caller knows it, it should set it explicitly.
 *            
 *            See the SSI module for manipulating offsets and indices.
 *
 * Returns:   <eslOK>     on success;
 *            <eslEOF>    if no data can be read from this position.
 *
 * Throws:    <eslESYS> if the fseeko() or fread() call fails.
 *            <eslEMEM> on (re-)allocation failure.
 *            <eslEINVAL> if the <sqfp> is not positionable.
 *            <eslENOTFOUND> if in trying to rewind an alignment file  
 *              by closing and reopening it, the open fails.
 *            On errors, the state of <sqfp> is indeterminate, and
 *            it should not be used again.
 */
int
esl_sqfile_Position(ESL_SQFILE *sqfp, off_t offset)
{
  return sqfp->position(sqfp, offset);
}

/* Function:  esl_sqio_Ignore()
 * Synopsis:  Sets the input map to ignore one or more input characters.
 *
 * Purpose:   Set the input map of the open <sqfp> to allow
 *            the characters in the string <ignoredchars> to appear
 *            in input sequences. These characters will be ignored.
 *
 *            For example, an application might want to ignore '*'
 *            characters in its sequence input, because some translated
 *            peptide files use '*' to indicate stop codons.
 *            
 * Returns:   <eslOK> on success.
 */
int
esl_sqio_Ignore(ESL_SQFILE *sqfp, const char *ignoredchars)
{
  int i;
  for (i = 0; ignoredchars[i] != '\0'; i++)
    sqfp->inmap[(int) ignoredchars[i]] = eslDSQ_IGNORED;
  return eslOK;
}

/* Function:  esl_sqio_AcceptAs()
 * Synopsis:  Map a list of additional characters.
 *
 * Purpose:   Set the input map of the open <sqfp> to allow the 
 *            characters in the string <xchars> to appear in 
 *            input sequences. These characters will all be 
 *            mapped to the character <readas> (or, for digital
 *            sequence input, to the digitized representation 
 *            of the text character <readas> in the <sqfp>'s
 *            digital alphabet).
 *            
 *            For example, an application might want to read
 *            '*' as 'X' when reading translated peptide files
 *            that use '*' to indicate a stop codon.
 *
 * Returns:   <eslOK> on success.
 */
int
esl_sqio_AcceptAs(ESL_SQFILE *sqfp, char *xchars, char readas)
{
  int i;
  
  if (sqfp->do_digital)
    {
      for (i = 0; xchars[i] != '\0'; i++)
	sqfp->inmap[(int) xchars[i]] = esl_abc_DigitizeSymbol(sqfp->abc, readas);
    }

  if (! sqfp->do_digital)
    {
      for (i = 0; xchars[i] != '\0'; i++)
	sqfp->inmap[(int) xchars[i]] = readas;
    }
  return eslOK;

}
/*--------------- end, miscellaneous routines -------------------*/



/*****************************************************************
 *# 6. Sequence/subsequence fetching, random access [with <ssi>]
 *****************************************************************/


/* Function:  esl_sqfile_OpenSSI()
 * Synopsis:  Opens an SSI index associated with a sequence file.
 *
 * Purpose:   Opens an SSI index file associated with the already open
 *            sequence file <sqfp>. If successful, the necessary
 *            information about the open SSI file is stored internally
 *            in <sqfp>.
 *            
 *            The SSI index file name is determined in one of two
 *            ways, depending on whether a non-<NULL> <ssifile_hint>
 *            is provided.
 *            
 *            If <ssifile_hint> is <NULL>, the default for
 *            constructing the SSI filename from the sequence
 *            filename, by using exactly the same path (if any) for
 *            the sequence filename, and appending the suffix <.ssi>.
 *            For example, the SSI index for <foo> is <foo.ssi>, for
 *            <./foo.fa> is <./foo.fa.ssi>, and for
 *            </my/path/to/foo.1.fa> is </my/path/to/foo.1.fa.ssi>.
 *            
 *            If <ssifile_hint> is <non-NULL>, this exact fully
 *            qualified path is used as the SSI file name.
 *
 * Returns:   <eslOK> on success, and <sqfp->ssi> is now internally
 *            valid.
 *            
 *            <eslENOTFOUND> if no SSI index file is found;
 *            <eslEFORMAT> if it's found, but appears to be in incorrect format;
 *            <eslERANGE> if the SSI file uses 64-bit offsets but we're on
 *            a system that doesn't support 64-bit file offsets.
 *
 * Throws:    <eslEINVAL> if the open sequence file <sqfp> doesn't
 *            correspond to a normal sequence flatfile -- we can't
 *            random access in .gz compressed files, standard input,
 *            or multiple alignment files that we're reading
 *            sequentially.
 *            
 *            Throws <eslEMEM> on allocation error.
 */
int
esl_sqfile_OpenSSI(ESL_SQFILE *sqfp, const char *ssifile_hint)
{
  return sqfp->open_ssi(sqfp, ssifile_hint);
}



/* Function:  esl_sqfile_PositionByKey()
 * Synopsis:  Use SSI to reposition seq file to a particular sequence.
 *
 * Purpose:   Reposition <sqfp> so that the next sequence we read will
 *            be the one named (or accessioned) <key>.
 *            
 *            <sqfp->linenumber> is reset to be relative to the start
 *            of the record named <key>, rather than the start of the
 *            file.
 *
 * Returns:   <eslOK> on success, and the file <sqfp> is repositioned
 *            so that the next <esl_sqio_Read()> call will read the
 *            sequence named <key>.
 *            
 *            Returns <eslENOTFOUND> if <key> isn't found in the
 *            index; in this case, the position of <sqfp> in the file
 *            is unchanged.
 *            
 *            Returns <eslEFORMAT> if something goes wrong trying to
 *            read the index, almost certainly indicating a format
 *            problem in the SSI file.
 *            
 *            Returns <eslEOF> if, after repositioning, we fail to
 *            load the next line or buffer from the sequence file;
 *            this probably also indicates a format problem in the SSI
 *            file.
 * 
 * Throws:    <eslEMEM>   on allocation error;
 *            <eslEINVAL> if there's no open SSI index in <sqfp>;
 *            <eslESYS>   if the <fseek()> fails.
 *            
 *            In all these cases, the state of <sqfp> becomes
 *            undefined, and the caller should not use it again.
 */
int
esl_sqfile_PositionByKey(ESL_SQFILE *sqfp, const char *key)
{
  return sqfp->pos_by_key(sqfp, key);
}


/* Function:  esl_sqfile_PositionByNumber()
 * Synopsis:  Use SSI to reposition by sequence number
 *
 * Purpose:   Reposition <sqfp> so that the next sequence we 
 *            read will be the <which>'th sequence, where <which>
 *            is <0..sqfp->ssi->nprimary-1>. 
 *            
 *            <sqfp->linenumber> is reset to be relative to the start
 *            of the record named <key>, rather than the start of the
 *            file.
 *
 * Returns:   <eslOK> on success, and the file <sqfp> is repositioned.
 *            
 *            Returns <eslENOTFOUND> if there is no sequence number
 *            <which> in the index; in this case, the position of
 *            <sqfp> in the file is unchanged.
 *            
 *            Returns <eslEFORMAT> if something goes wrong trying to
 *            read the index, almost certainly indicating a format
 *            problem in the SSI file.
 *            
 *            Returns <eslEOF> if, after repositioning, we fail to
 *            load the next line or buffer from the sequence file;
 *            this probably also indicates a format problem in the SSI
 *            file.
 * 
 * Throws:    <eslEMEM>   on allocation error;
 *            <eslEINVAL> if there's no open SSI index in <sqfp>;
 *            <eslESYS>   if the <fseek()> fails.
 *            
 *            In all these cases, the state of <sqfp> becomes
 *            undefined, and the caller should not use it again.
 */
int
esl_sqfile_PositionByNumber(ESL_SQFILE *sqfp, int which)
{
  return sqfp->pos_by_number(sqfp, which);
}


/* Function:  esl_sqio_Fetch()
 * Synopsis:  Fetch a complete sequence, using SSI indexing.
 *
 * Purpose:   Fetch a sequence named (or accessioned) <key> from
 *            the repositionable, open sequence file <sqfp>.
 *            The open <sqfp> must have an open SSI index.
 *            The sequence is returned in <sq>.
 *
 * Returns:   <eslOK> on soccess.
 *            <eslEINVAL> if no SSI index is present, or if <sqfp> can't
 *            be repositioned.
 *            <eslENOTFOUND> if <source> isn't found in the file.
 *            <eslEFORMAT> if either the index file or the sequence file
 *            can't be parsed, because of unexpected format issues.
 *       
 * Throws:    <eslEMEM> on allocation error.
 */
int
esl_sqio_Fetch(ESL_SQFILE *sqfp, const char *key, ESL_SQ *sq)
{
  return sqfp->fetch(sqfp, key, sq);
}
  
/* Function:  esl_sqio_FetchInfo()
 * Synopsis:  Fetch a sequence's info, using SSI indexing.
 *
 * Purpose:   Fetch a sequence named (or accessioned) <key> from
 *            the repositionable, open sequence file <sqfp>, reading
 *            all info except the sequence (and secondary structure).
 *            The open <sqfp> must have an open SSI index.
 *            The sequence info is returned in <sq>.
 *
 * Returns:   <eslOK> on soccess.
 *            <eslEINVAL> if no SSI index is present, or if <sqfp> can't
 *            be repositioned.
 *            <eslENOTFOUND> if <source> isn't found in the file.
 *            <eslEFORMAT> if either the index file or the sequence file
 *            can't be parsed, because of unexpected format issues.
 *       
 * Throws:    <eslEMEM> on allocation error.
 */
int
esl_sqio_FetchInfo(ESL_SQFILE *sqfp, const char *key, ESL_SQ *sq)
{
  return sqfp->fetch_info(sqfp, key, sq);
}
  

/* Function:  esl_sqio_FetchSubseq()
 * Synopsis:  Fetch a subsequence, using SSI indexing.
 *
 * Purpose:   Fetch subsequence <start..end> from a sequence named (or
 *            accessioned) <source>, in the repositionable, open sequence file <sqfp>.
 *            The open <sqfp> must have an SSI index. Put the
 *            subsequence in <sq>. 
 *            
 *            As a special case, if <end> is 0, the subsequence is
 *            fetched all the way to the end, so you don't need to
 *            look up the sequence length <L> to fetch a suffix.
 *            
 *            The caller may want to rename/reaccession/reannotate the
 *            subsequence.  Upon successful return, <sq->name> is set
 *            to <source/start-end>, and <sq->source> is set to
 *            <source> The accession and description <sq->acc> and
 *            <sq->desc> are set to the accession and description of
 *            the source sequence.
 *            
 * Returns:   <eslOK> on success.
 *            <eslEINVAL> if no SSI index is present, or if <sqfp> can't
 *            be repositioned.
 *            <eslENOTFOUND> if <source> isn't found in the file.
 *            <eslEFORMAT> if either the index file or the sequence file
 *            can't be parsed, because of unexpected format issues.
 *            <eslERANGE> if the <start..end> coords don't lie entirely
 *            within the <source> sequence.
 *
 * Throws:    <eslEMEM> on allocation errors.
 */
int
esl_sqio_FetchSubseq(ESL_SQFILE *sqfp, const char *source, int64_t start, int64_t end, ESL_SQ *sq)
{
  return sqfp->fetch_subseq(sqfp, source, start, end, sq);
}  
/*------------- end, random sequence access with SSI -------------------*/


/*****************************************************************
 *# 7. Sequence database caching.
 *****************************************************************/ 

/* Function:  esl_sqfile_Cache()
 * Synopsis:  Read a database into memory.
 *
 * Purpose:   Read an entire database into memory building a cached
 *            structure <ESL_SQCACHE>.  The cache structure has basic
 *            information about the database, ie number of sequences
 *            number of residues, etc.
 *
 *            All sequences <ESL_SQ> are in a memory array <sq_list>.
 *            The number of elements in the list is <seq_count>.  The
 *            header pointers, ie name, acc and desc are pointers into
 *            the <header_mem> buffer.  All digitized sequences are pointers
 *            into the <residue_mem> buffer.
 *
 * Returns:   <eslOK> on success.
 *            
 *            Returns <eslEFORMAT> if a parse error is encountered in
 *            trying to read the sequence file.
 *            
 *            Returns <eslENODATA> if the file appears to be empty.
 *
 * Throws:    <eslEMEM> on allocation error;
 */
int  
esl_sqfile_Cache(const ESL_ALPHABET *abc, const char *seqfile, int fmt, const char *env, ESL_SQCACHE **ret_sqcache)
{
  int          status;

  int          n;

  uint32_t     len;
  uint32_t     max;
  uint32_t     count;

  uint64_t     res_size = 1;
  uint64_t     hdr_size = 1;

  ESL_SQFILE  *sqfp    = NULL;

  ESL_SQ      *c       = NULL;
  ESL_SQ      *sq      = NULL;
  ESL_SQCACHE *cache   = NULL;

  ESL_DSQ     *res_ptr = NULL;
  char        *hdr_ptr = NULL;

  /* open the database */
  status = esl_sqfile_OpenDigital(abc, seqfile, fmt, env, &sqfp);
  if (status != eslOK) return status;

  /* if we can't rewind the database, stop now.  */
  if (!esl_sqfile_IsRewindable(sqfp)) return eslFAIL;

  /* loop through the database reading all the sequnces */
  max = 0;
  count = 0;
  sq  = esl_sq_CreateDigital(abc);
  while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) {
    ++count;

    res_size += sq->n + 1;
    if (sq->n > max) max = sq->n;

    len = strlen(sq->name);
    if (len > 0) ++len;
    hdr_size += len;

    len = strlen(sq->acc);
    if (len > 0) ++len;
    hdr_size += len;

    len = strlen(sq->desc);
    if (len > 0) ++len;
    hdr_size += len;

    esl_sq_Reuse(sq);
  }
  if (status != eslEOF) goto ERROR;
  
  /* now that the database information is know, allocate the memory to
   * hold the data.  different memory blocks will be used to hold the
   * redisues and header info.  the idea is that since the header info,
   * ie, name, acc, etc is used infrenquently (only when there is a hit)
   * if some pages need to be swapped out, hopefully it will be the
   * header pages first leaving the sequences in memory.
   */
  ESL_ALLOC(cache, sizeof(ESL_SQCACHE));

  cache->filename    = NULL;
  cache->sq_list     = NULL;
  cache->residue_mem = NULL;
  cache->header_mem  = NULL;

  cache->abc         = abc;
  cache->format      = fmt;
  cache->seq_count   = count;
  cache->res_count   = res_size;
  cache->max_seq     = max;

  cache->res_size    = res_size + 2;
  cache->hdr_size    = hdr_size;

  ESL_ALLOC(cache->filename, strlen(seqfile) + 1);
  strcpy(cache->filename, seqfile);

  ESL_ALLOC(cache->sq_list, sizeof(ESL_SQ) * (count + 1));

  /* different memory blocks will be used to hold the residues and header
   * info.  the idea is that since the header info, ie, name, acc, etc.
   * is used infrenquently (only when there is a hit) if some pages need
   * to be swapped out, hopefully it will be the header pages first
   * leaving the sequences in memory.
   */
  ESL_ALLOC(cache->residue_mem, res_size + 2);
  ESL_ALLOC(cache->header_mem, hdr_size);

  hdr_ptr  = cache->header_mem;
  *hdr_ptr = 0;

  res_ptr  = cache->residue_mem;
  *res_ptr = eslDSQ_SENTINEL;

  /* loop through the database filling in the cache */
  n = 0;
  esl_sqfile_Position(sqfp, 0);
  while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) {
    c = cache->sq_list + n;

    /* if header fields have been defined, copy them to the cache */
    c->name = hdr_ptr;
    if (sq->name[0] != 0) {
      c->name = hdr_ptr + 1;
      strcpy(c->name, sq->name);
      hdr_ptr += strlen(sq->name) + 1;
    }

    c->acc = hdr_ptr;
    if (sq->acc[0] != 0) {
      c->acc = hdr_ptr + 1;
      strcpy(c->acc, sq->acc);
      hdr_ptr += strlen(sq->acc) + 1;
    }

    c->desc = hdr_ptr;
    if (sq->desc[0] != 0) {
      c->desc = hdr_ptr + 1;
      strcpy(c->desc, sq->desc);
      hdr_ptr += strlen(sq->desc) + 1;
    }

    c->tax_id = sq->tax_id;
    c->seq    = NULL;
    c->ss     = NULL;
    c->nxr    = 0;
    c->xr_tag = NULL;
    c->xr     = NULL;

    /* copy the digitized sequence */
    memcpy(res_ptr + 1, sq->dsq + 1, sq->n + 1);
    c->dsq   = res_ptr;
    c->n     = sq->n;
    res_ptr += sq->n + 1;

    /* Coordinate info */
    c->start = sq->start;
    c->end = sq->end;
    c->C = sq->C;
    c->W = sq->W;
    c->L = sq->L;

    c->source = NULL;

    /* allocated lengths */
    c->nalloc   = -1;
    c->aalloc   = -1;
    c->dalloc   = -1;
    c->salloc   = -1;
    c->srcalloc = -1;

    /* Disk offset bookkeeping */
    c->idx  = n;
    c->roff = sq->roff;
    c->hoff = sq->hoff;
    c->doff = sq->doff;
    c->eoff = sq->eoff;

    c->abc = abc;

    esl_sq_Reuse(sq);
    ++n;
  }
  if (status != eslEOF) goto ERROR;

  /* add on last empty sequence */
  c = cache->sq_list + count;
  *(res_ptr + 1) = eslDSQ_SENTINEL;

  c->name     = hdr_ptr;
  c->acc      = hdr_ptr;
  c->desc     = hdr_ptr;

  c->tax_id   = -1;
  c->seq      = NULL;
  c->ss       = NULL;
  c->nxr      = 0;
  c->xr_tag   = NULL;
  c->xr       = NULL;

  c->dsq      = res_ptr;
  c->n        = 0;

  c->start    = 0;
  c->end      = 0;
  c->C        = 0;
  c->W        = 0;
  c->L        = -1;

  c->source   = NULL;

  c->nalloc   = -1;
  c->aalloc   = -1;
  c->dalloc   = -1;
  c->salloc   = -1;
  c->srcalloc = -1;

  c->idx      = count;
  c->roff     = -1;
  c->hoff     = -1;
  c->doff     = -1;
  c->eoff     = -1;

  c->abc      = NULL;
 
  if (sq != NULL) esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);

  *ret_sqcache = cache;

  return eslOK;

ERROR:
  if (sq != NULL) esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);

  esl_sqfile_Free(cache);

  return status;
}

/* Function:  esl_sqfile_Free()
 * Synopsis:  Free a cached database <ESL_SQCACHE>.
 *
 * Purpose:   Frees all the memory used to cache the sequence database.
 *
 * Returns:   none.
 */
void  
esl_sqfile_Free(ESL_SQCACHE *sqcache)
{
  if (sqcache == NULL) return;

  if (sqcache->filename    != NULL) free(sqcache->filename);
  if (sqcache->sq_list     != NULL) free(sqcache->sq_list);
  if (sqcache->residue_mem != NULL) free(sqcache->residue_mem);
  if (sqcache->header_mem  != NULL) free(sqcache->header_mem);

  sqcache->abc         = NULL;
  sqcache->filename    = NULL;
  sqcache->sq_list     = NULL;
  sqcache->residue_mem = NULL;
  sqcache->header_mem  = NULL;

  free(sqcache);
}
/*---------------- end, sequence database caching ---------------*/





/*****************************************************************
 *  8. Functions specific to sqio <-> msa interoperation [with <msa>] 
 *****************************************************************/

/* convert_sq_to_msa()
 * 
 * Given a <sq>, create and return an "MSA" through <ret_msa>, which
 * contains only the single unaligned sequence. <sq> is 
 * not affected in any way. This is only to convert from the SQ
 * object to an MSA object for the purpose of writing SQ in an MSA format
 * file format.
 * 
 * Returns <eslOK> on success, and <*ret_msa> points to
 * a new "alignment".
 * 
 * Throws <eslEMEM> on allocation error, and <*ret_msa> is NULL.
 */
static int
convert_sq_to_msa(ESL_SQ *sq, ESL_MSA **ret_msa)
{
  ESL_MSA *msa;
  int      x;        /* counter for extra-residue markups */
  int      status;

  if (sq->dsq != NULL) 
    { 
      if ((msa = esl_msa_CreateDigital(sq->abc, 1, sq->n)) == NULL) { status = eslEMEM; goto ERROR; }
    } 
  else if ((msa = esl_msa_Create(1, sq->n)) == NULL) { status = eslEMEM; goto ERROR; }

  if ((status = esl_strdup(sq->name, -1, &(msa->sqname[0]))) != eslOK) goto ERROR;
  
  if (*sq->acc != '\0')
    {
      ESL_ALLOC(msa->sqacc, sizeof(char *) * 1);
      if ((status = esl_strdup(sq->acc, -1, &(msa->sqacc[0]))) != eslOK) goto ERROR;
    }
  if (*sq->desc != '\0')
    {
      ESL_ALLOC(msa->sqdesc, sizeof(char *) * 1);
      if ((status = esl_strdup(sq->desc, -1, &(msa->sqdesc[0]))) != eslOK) goto ERROR;
    }

  if (sq->dsq != NULL) esl_abc_dsqcpy(sq->dsq, sq->n, msa->ax[0]);
  else                 strcpy(msa->aseq[0], sq->seq);
  
  if (sq->ss != NULL)
    {
      ESL_ALLOC(msa->ss, sizeof(char *) * 1);

      if (sq->dsq != NULL) {	/* sq->ss is 1..L in digital mode; but msa->ss is always 0..L-1 */
	if      ((status = esl_strdup(sq->ss+1, -1, &(msa->ss[0]))) != eslOK) goto ERROR;
      } else if ((status = esl_strdup(sq->ss,   -1, &(msa->ss[0]))) != eslOK) goto ERROR;     	
    }

  if (sq->nxr > 0) {
    msa->ngr = sq->nxr;
    ESL_ALLOC(msa->gr,     sizeof(char **) * msa->ngr);    
    ESL_ALLOC(msa->gr_tag, sizeof(char  *) * msa->ngr);

    for (x = 0; x < msa->ngr; x ++) {
      ESL_ALLOC(msa->gr[x],     sizeof(char *));  
      ESL_ALLOC(msa->gr_tag[x], sizeof(char));
   
      if (sq->dsq != NULL) {	/* sq->xr is 1..L in digital mode; but msa->gr is always 0..L-1 */
        if      ((status = esl_strdup(sq->xr[x]+1, -1, &(msa->gr[x][0]))) != eslOK) goto ERROR;
      } else if ((status = esl_strdup(sq->xr[x],   -1, &(msa->gr[x][0]))) != eslOK) goto ERROR;     	

      if ((status = esl_strdup(sq->xr_tag[x], -1, &(msa->gr_tag[x]))) != eslOK) goto ERROR;     	  
    }
  }

  msa->alen = sq->n;
  msa->nseq = 1;
  *ret_msa = msa;
  return eslOK;

 ERROR:
  esl_msa_Destroy(msa);
  *ret_msa = NULL;
  return status;
}
/*---------- end of msa <-> sqio module interop -----------------*/



/*****************************************************************
 *#  9. Benchmark driver
 *****************************************************************/ 
/* Some current results:
 *
 * ./benchmark /misc/data0/genomes/c.elegans/genome/allWS120
 * CPU Time: 0.90u 0.06s 00:00:00.96 Elapsed: 00:00:01
 *
 * /benchmark -i /misc/data0/genomes/c.elegans/genome/allWS120
 * CPU Time: 0.41u 0.04s 00:00:00.44 Elapsed: 00:00:00
 * 
 * ./benchmark -w /misc/data0/genomes/c.elegans/genome/allWS120
 * CPU Time: 0.83u 0.05s 00:00:00.88 Elapsed: 00:00:01
 *
 * ./benchmark -2w /misc/data0/genomes/c.elegans/genome/allWS120
 * CPU Time: 3.55u 0.26s 00:00:03.80 Elapsed: 00:00:04
 *
 * Digital times are comparable (maybe a titch faster), except
 * for -d2w, which runs much faster, because rev complementation is
 * more efficient:
 *
 * ./benchmark -d2w /misc/data0/genomes/c.elegans/genome/allWS120
 * CPU Time: 2.16u 0.31s 00:00:02.47 Elapsed: 00:00:03
 */
#ifdef eslSQIO_BENCHMARK
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>

#include "easel.h"
#include "esl_getopts.h"
#include "esl_sq.h"
#include "esl_sqio.h"
#include "esl_stopwatch.h"

static ESL_OPTIONS options[] = {
  /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
  { "-h",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",             0 },
  { "-d",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use digital sequence input mode",                  0 },
  { "-i",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "benchmark ReadInfo() input",                       0 },
  { "-s",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "benchmark ReadSequence() input",                   0 },
  { "-w",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "benchmark ReadWindow() input",                     0 },
  { "-B",        eslARG_INT,   "4096",  NULL, NULL,  NULL,  NULL, NULL, "buffer size for read, fread tests",                0 },
  { "-C",        eslARG_INT,    "100",  NULL, NULL,  NULL,  NULL, NULL, "context size for ReadWindow()",                    0 },
  { "-W",        eslARG_INT,   "1000",  NULL, NULL,  NULL,  NULL, NULL, "window size for ReadWindow()",                     0 },
  { "-2",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  "-w", NULL, "with ReadWindow(), do both strands",               0 },
  { "--format",  eslARG_STRING,  NULL,  NULL, NULL,  NULL,  NULL, NULL, "assert <seqfile> is in format <s>",                0 },
  { "--amino",   eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use protein alphabet, not DNA",                    0 },
  {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[]  = "[-options] <DNA FASTA file>";
static char banner[] = "benchmark driver for sqio module";

static int benchmark_read (char *filename, int bufsize, int64_t *ret_magic);
static int benchmark_fread(char *filename, int bufsize, int64_t *ret_magic);
static int benchmark_fgets(char *filename, int bufsize, int64_t *ret_magic);
/*static int benchmark_mmap (char *filename, int bufsize, int64_t *ret_magic);*/

int
main(int argc, char **argv)
{
  ESL_GETOPTS   *go       = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
  ESL_STOPWATCH *w        = esl_stopwatch_Create();
  ESL_ALPHABET  *abc      = NULL;
  ESL_SQ        *sq       = NULL;
  ESL_SQFILE    *sqfp     = NULL;
  char          *filename = esl_opt_GetArg(go, 1);
  int            format   = eslSQFILE_UNKNOWN;
  int            bufsize  = esl_opt_GetInteger(go, "-B");
  int            C        = esl_opt_GetInteger(go, "-C");
  int            W        = esl_opt_GetInteger(go, "-W");
  int            do_crick = esl_opt_GetBoolean(go, "-2");
  int            n        = 0;
  int64_t        magic    = 0;
  int64_t        nr       = 0;

  if (esl_opt_IsOn(go, "--format")) {
    format = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--format"));
    if (format == eslSQFILE_UNKNOWN) esl_fatal("unrecognized database format %s\n", esl_opt_GetString(go, "--format"));
  }

  if (esl_opt_GetBoolean(go, "-d"))
    {
      if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
      else                                   abc = esl_alphabet_Create(eslDNA);
      sq = esl_sq_CreateDigital(abc);
      if (esl_sqfile_OpenDigital(abc, filename, format, NULL, &sqfp) != eslOK) esl_fatal("failed to open %s", filename);
    } 
  else 
    {
      sq = esl_sq_Create();
      if (esl_sqfile_Open(filename, format, NULL, &sqfp) != eslOK) esl_fatal("failed to open %s", filename);
    }


  /* It's useful to have some baselines of just reading the file without parsing;
   * POSIX read(); C fread(); C fgets(); and POSIX mmap(). 
   * Counting a's (<na>) is just to keep the optimizer from optimizing the 
   * benchmark away.
   */
  esl_stopwatch_Start(w);   benchmark_read (filename, bufsize, &magic);   esl_stopwatch_Stop(w);  printf("magic=%" PRId64 "; ", magic); esl_stopwatch_Display(stdout, w, "read():  "); 
  esl_stopwatch_Start(w);   benchmark_fread(filename, bufsize, &magic);   esl_stopwatch_Stop(w);  printf("magic=%" PRId64 "; ", magic); esl_stopwatch_Display(stdout, w, "fread(): ");
  esl_stopwatch_Start(w);   benchmark_fgets(filename, bufsize, &magic);   esl_stopwatch_Stop(w);  printf("magic=%" PRId64 "; ", magic); esl_stopwatch_Display(stdout, w, "fgets(): ");
  /*  esl_stopwatch_Start(w);   benchmark_mmap (filename, bufsize, &magic);   esl_stopwatch_Stop(w);  printf("magic=%" PRId64 "; ", magic); esl_stopwatch_Display(stdout, w, "mmap():  "); */

  esl_stopwatch_Start(w);
  if (esl_opt_GetBoolean(go, "-i"))
    {
      while (esl_sqio_ReadInfo(sqfp, sq) == eslOK) { n++; nr += sq->L; esl_sq_Reuse(sq); }
    }
  else if (esl_opt_GetBoolean(go, "-s"))
    {
      while (esl_sqio_ReadSequence(sqfp, sq) == eslOK) { n++; nr += sq->L; esl_sq_Reuse(sq); }
    }
  else if (esl_opt_GetBoolean(go, "-w"))
    {
      int wstatus;
      while ((wstatus = esl_sqio_ReadWindow(sqfp, C, W, sq)) != eslEOF)
	{ 
	  if        (wstatus == eslEOD) {
	    if (!do_crick || W < 0) { n++; esl_sq_Reuse(sq); }
	    if (do_crick)           { W = -W; }
	    continue;
	  } else if (wstatus != eslOK) esl_fatal("Error: %s", esl_sqfile_GetErrorBuf(sqfp));
	  nr += sq->W;
	}
    }
  else 
    {
      while (esl_sqio_Read(sqfp, sq) == eslOK)  { n++; nr += sq->L; esl_sq_Reuse(sq); }
    }
  esl_stopwatch_Stop(w);
  esl_stopwatch_Display(stdout, w, "sqio:  ");
  printf("Read %d sequences; %lld residues.\n", n, (long long int) nr);

  if (sqfp->format == eslSQFILE_NCBI)
    printf("  DB %d sequences; %lld residues.\n", sqfp->data.ncbi.num_seq, (long long int) sqfp->data.ncbi.total_res);

  esl_sqfile_Close(sqfp);
  esl_sq_Destroy(sq);
  esl_stopwatch_Destroy(w);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}

static int
benchmark_read(char *filename, int bufsize, int64_t *ret_magic)
{
  char *buf  = malloc(sizeof(char) * bufsize);
  int   fd   = open(filename, O_RDONLY);
  int   n;
  int64_t magic = 0;
  int   pos;

  if (fd != -1) 
    {
      while ( (n = read(fd, buf, bufsize)) > 0)
	{
	  for (pos = 0; pos < n; pos++) magic += buf[pos];
	}
      close(fd);
    }

  free(buf);
  *ret_magic = magic;
  return eslOK;
}
  
static int
benchmark_fread(char *filename, int bufsize, int64_t *ret_magic)
{
  FILE *fp   = fopen(filename, "r");
  char *buf  = malloc(sizeof(char) * bufsize);
  int64_t magic = 0;
  int   pos;
  int   n;
  
  if (fp != NULL)
    {
      while ( (n = fread(buf, sizeof(char), bufsize, fp)) > 0)
	{
	  for (pos = 0; pos < n; pos++) magic += buf[pos];
	} 
      fclose(fp);
    }

  free(buf);
  *ret_magic = magic;
  return eslOK;
}


static int
benchmark_fgets(char *filename, int bufsize, int64_t *ret_magic)
{
  FILE *fp   = fopen(filename, "r");
  char *buf  = malloc(sizeof(char) * bufsize);
  int64_t magic = 0;
  char *s;
  
  if (fp != NULL)
    {
      while ( fgets(buf, bufsize, fp) != NULL)
	{
	  for (s = buf; *s; s++) magic += *s;
	} 
      fclose(fp);
    }

  free(buf);
  *ret_magic = magic;
  return eslOK;
}


#if 0
/* we can't use mmap anyway; we have to be able to read from streams
 * and pipes.
 */
static int
benchmark_mmap(char *filename, int bufsize, int64_t *ret_magic)
{
  int   fd   = open(filename, O_RDONLY);
  struct stat statbuf;
  char *p;
  uint64_t pos;
  uint64_t magic = 0;
 
  fstat(fd, &statbuf);
  p = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fd, 0);
  for (pos = 0; pos < statbuf.st_size; pos++)
    magic += p[pos];

  close(fd);
  *ret_magic = magic;
  return eslOK;
}
#endif  


#endif /*eslSQIO_BENCHMARK*/
/*------------------ end of benchmark ---------------------------*/



/*****************************************************************
 *#  10. Unit tests
 *****************************************************************/ 
#ifdef eslSQIO_TESTDRIVE
#include "esl_keyhash.h"
#include "esl_random.h"
#include "esl_randomseq.h"
#include "esl_vectorops.h"

static void
synthesize_testseqs(ESL_RANDOMNESS *r, ESL_ALPHABET *abc, int maxL, int N, ESL_SQ ***ret_sqarr)
{
  ESL_SQ **sqarr  = malloc(sizeof(ESL_SQ *) * N);
  ESL_KEYHASH *kh = esl_keyhash_Create();
  float   *fq     = malloc(sizeof(float)   * abc->Kp);
  char    *buf    = NULL;
  int      maxn   = eslSQ_NAMECHUNK*2;
  int      maxa   = eslSQ_ACCCHUNK*2;
  int      maxd   = eslSQ_DESCCHUNK*2;
  char     ascii[128];
  float    af[128];
  int      i, pos;
  int      n;
  int      x;

  n = ESL_MAX( maxn, ESL_MAX(maxa, maxd));
  buf = malloc(sizeof(char) * (n+1));

  /* Set a residue frequency vector that's going to sample degenerate residues too */
  esl_vec_FSet(fq, abc->Kp, 0.0);
  esl_vec_FSet(fq, abc->K,  0.9 / (float) abc->K);
  esl_vec_FSet(fq + abc->K + 1, abc->Kp - abc->K - 2, 0.1 / (float) (abc->Kp - abc->K - 2));

  /* Set an ASCII frequency vector that'll sample all nonspace chars */
  for (x = 0; x < 128; x++) {
    ascii[x] = (char) x;
    if      (isalpha(x))             af[x] = 3.0;
    else if (isdigit(x))             af[x] = 2.0;
    else if (ispunct(x) && x != '%') af[x] = 1.0; /* disallow %; it'll screw up printf()-like Set calls */
    else                             af[x] = 0.0;
  }
  esl_vec_FNorm(af, 128);

  for (i = 0; i < N; i++)
    {
      if ((sqarr[i] = esl_sq_CreateDigital(abc)) == NULL) esl_fatal("failed to allocate seq %d", i);

      do {
	n = esl_rnd_Roll(r, maxn) + 1; /* 1..maxn */
	esl_rsq_fIID(r, ascii, af, 128, n, buf);
	buf[n] = '\0';
      }	while (ispunct(buf[0]) ||                                // #, // are bad things for names to start with, in Stockholm format 
               esl_keyhash_Store(kh, buf, n, NULL) == eslEDUP);  // Make sure names are unique.
      esl_sq_SetName(sqarr[i], buf);

      if (esl_rnd_Roll(r, 2) == 0) { /* 50% chance of an accession */
	n = esl_rnd_Roll(r, maxa) + 1; 
	esl_rsq_fIID(r, ascii, af, 128, n, buf);
	buf[n] = '\0';
	esl_sq_SetAccession(sqarr[i], buf);
      }

      if (esl_rnd_Roll(r, 2) == 0) { /* 50% chance of a description */
	n = esl_rnd_Roll(r, maxd) + 1;
	esl_rsq_fIID(r, ascii, af, 128, n, buf);
	buf[n] = '\0';
	for (pos = 1; pos < n-1; pos++) {                 /* avoid first, last char, and... */
	  if (esl_rnd_Roll(r, 10)  == 0) buf[pos] = ' ';  /* ...sprinkle with spaces... */
	  if (esl_rnd_Roll(r, 100) == 0) buf[pos] = '\t'; /* ...and tabs. */
	}
	esl_sq_SetDesc(sqarr[i], buf);
      }

      n = esl_rnd_Roll(r, (maxL+1)); /* choose seqlen =  0..maxL; 0 length seqs occur in real dbs */
      esl_sq_GrowTo(sqarr[i], n);
      esl_rsq_xfIID(r, fq, abc->Kp, n, sqarr[i]->dsq);

      esl_sq_SetCoordComplete(sqarr[i], n);
    }

  *ret_sqarr = sqarr;
  free(buf);
  free(fq);
  esl_keyhash_Destroy(kh);
  return;
}

/* Write an uglified FASTA file to a stream.
 * Also, remember where the start of the descline and first
 * seq line are, in sq->{roff,doff}. We'll compare against
 * what the input function thinks these locations are.
 */
static void
write_ugly_fasta(ESL_RANDOMNESS *r, FILE *fp, ESL_SQ *sq)
{
  char buf[61];
  int  pos;
  
  sq->roff = ftello(fp);
  fputc('>', fp);
  while (esl_rnd_Roll(r, 10) == 0) fputc(' ', fp);
  fprintf(fp, "%s", sq->name);
  while (esl_rnd_Roll(r, 10) == 0) fputc(' ', fp);
  if (sq->desc[0] != 0) fprintf(fp, " %s", sq->desc);
  fputc('\n', fp);

  sq->doff = ftello(fp);             
  buf[60] = '\0';
  for (pos = 1; pos <= sq->n; pos+=60)
    {
      while (esl_rnd_Roll(r, 10) == 0) fputc(' ', fp);
      esl_abc_TextizeN(sq->abc, sq->dsq+pos, 60, buf);
      fputs(buf, fp);
      fputc('\n', fp);
    }
  while (esl_rnd_Roll(r, 10) == 0) fputc('\n', fp);

  sq->eoff = ftello(fp) - 1;
  if (sq->n == 0) sq->doff = sq->eoff+1;  // Deals with an edge case, an L=0 seq with multiple newlines.
}

static void
write_spaced_fasta(FILE *fp, ESL_SQ *sq)
{
  char buf[64];
  int  pos;

  sq->roff = ftello(fp);
  fprintf(fp, ">%s", sq->name);
  if (sq->desc[0] != 0) fprintf(fp, " %s", sq->desc);
  fputc('\n', fp);

  sq->doff = ftello(fp);
  buf[10]  = '\0';
  for (pos = 1; pos <= sq->n; pos += 10)
    {
      esl_abc_TextizeN(sq->abc, sq->dsq+pos, 10, buf);
      fputs(buf, fp);
      if (pos+9 >= sq->n || (pos+9) % 60 == 0) fputc('\n',  fp);
      else                                     fputc(' ', fp);
    }
  sq->eoff = ftello(fp) - 1;
}


static void
make_ssi_index(ESL_ALPHABET *abc, const char *tmpfile, int format, char *ssifile, int mode)
{ 
  char       *msg  = "sqio unit testing: failed to make SSI index";
  ESL_NEWSSI *ns   = NULL;
  ESL_SQFILE *sqfp = NULL;
  ESL_SQ     *sq   = esl_sq_CreateDigital(abc);
  uint16_t    fh   = 0;
  int         nseq = 0;
  int         status;

  int         bpl, rpl;
 
  sprintf(ssifile, "%s.ssi", tmpfile);
  if (esl_newssi_Open(ssifile, TRUE, &ns)                       != eslOK) esl_fatal(msg);
  if (esl_newssi_AddFile(ns, tmpfile, format, &fh)              != eslOK) esl_fatal(msg);
  if (esl_sqfile_OpenDigital(abc, tmpfile, format, NULL, &sqfp) != eslOK) esl_fatal(msg);
  while ((status = esl_sqio_ReadInfo(sqfp, sq)) == eslOK)
    {
      nseq++;
      if (esl_newssi_AddKey(ns, sq->name, fh, sq->roff, sq->doff, sq->L)   != eslOK) esl_fatal(msg);
      if (sq->acc[0] != '\0' && esl_newssi_AddAlias(ns, sq->acc, sq->name) != eslOK) esl_fatal(msg);
      esl_sq_Reuse(sq);
    }
  if (status != eslEOF) esl_fatal(msg);
  
  bpl = sqfp->data.ascii.bpl;
  rpl = sqfp->data.ascii.rpl;
  if (bpl > 0 && rpl > 0) 
    if ((status = esl_newssi_SetSubseq(ns, fh, bpl, rpl)) != eslOK) esl_fatal(msg);
  
  if (esl_newssi_Write(ns)        != eslOK)  esl_fatal(msg);

  bpl = sqfp->data.ascii.bpl;
  rpl = sqfp->data.ascii.rpl;

  switch (mode) {
  case 0:  if (bpl != 0)               esl_fatal(msg); break; /* uglified: bpl should be invalid (rpl might not be) */
  case 1:  if (rpl != 60 || bpl == 0)  esl_fatal(msg); break; /* spaced: bpl, rpl should be valid */
  case 2:  if (rpl != 60 || bpl != 61) esl_fatal(msg); break; /* normal: bpl, rpl should be valid, w/ bpl=rpl+1 */
  }

  esl_sqfile_Close(sqfp);
  esl_newssi_Close(ns);
  esl_sq_Destroy(sq);
}

static void
utest_read(ESL_ALPHABET *abc, ESL_SQ **sqarr, int N, char *seqfile, int format, int mode)
{
  char       *msg         = "sqio complete read unit test failed";
  ESL_SQ     *sq          = esl_sq_CreateDigital(abc);
  ESL_SQFILE *sqfp        = NULL;
  int         nseq        = 0;
  int         status;
  
  int         bpl, rpl;
 
  if (esl_sqfile_OpenDigital(abc, seqfile, format, NULL, &sqfp) != eslOK) esl_fatal(msg);
  while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
    {
      /* FASTA doesn't preserve accessions. Copy it, as a hack, so Compare test succeeds*/
      if (sq->acc[0] == '\0' && esl_sq_SetAccession(sq, sqarr[nseq]->acc) != eslOK) esl_fatal(msg);
      if (esl_sq_Compare(sq, sqarr[nseq])                                 != eslOK) esl_fatal(msg);      
      nseq++;
      esl_sq_Reuse(sq);
    }
  if (status != eslEOF) esl_fatal(msg);
  if (nseq   != N)      esl_fatal(msg);

  bpl = sqfp->data.ascii.bpl;
  rpl = sqfp->data.ascii.rpl;

  switch (mode) {
  case 0:  if (bpl != 0)                     esl_fatal(msg); break; /* uglified: bpl should be invalid (rpl might not be) */
  case 1:  if (rpl != 60 || bpl == 0)  esl_fatal(msg); break; /* spaced: bpl, rpl should be valid */
  case 2:  if (rpl != 60 || bpl != 61) esl_fatal(msg); break; /* normal: bpl, rpl should be valid, w/ bpl=rpl+1 */
  }

  esl_sqfile_Close(sqfp);
  esl_sq_Destroy(sq);
}

static void
utest_read_info(ESL_ALPHABET *abc, ESL_SQ **sqarr, int N, char *seqfile, int format, int mode)
{
  char       *msg         = "sqio info read unit test failed";
  ESL_SQ     *sq          = esl_sq_CreateDigital(abc);
  ESL_SQFILE *sqfp        = NULL;
  int         nseq        = 0;
  int         status;
  
  int         bpl, rpl;
 
  if (esl_sqfile_OpenDigital(abc, seqfile, format, NULL, &sqfp) != eslOK) esl_fatal(msg);
  while ((status = esl_sqio_ReadInfo(sqfp, sq)) == eslOK)
    {
      if (strcmp(sq->name,   sqarr[nseq]->name)   != 0) esl_fatal(msg);
      if (format != eslSQFILE_FASTA && 
	  strcmp(sq->acc,    sqarr[nseq]->acc)    != 0) esl_fatal(msg);
      if (strcmp(sq->desc,   sqarr[nseq]->desc)   != 0) esl_fatal(msg);
      if (strcmp(sq->source, sqarr[nseq]->source) != 0) esl_fatal(msg);
      if (sq->n     != 0)  esl_fatal(msg);
      if (sq->start != 0)  esl_fatal(msg);
      if (sq->end   != 0)  esl_fatal(msg);
      if (sq->C     != 0)  esl_fatal(msg);
      if (sq->W     != 0)  esl_fatal(msg);
      if (sq->L     != sqarr[nseq]->L)                  esl_fatal(msg);
      if (sq->roff != -1 && sqarr[nseq]->roff != -1 && sq->roff != sqarr[nseq]->roff) esl_fatal(msg);
      if (sq->doff != -1 && sqarr[nseq]->doff != -1 && sq->doff != sqarr[nseq]->doff) esl_fatal(msg);
  
      nseq++;
      esl_sq_Reuse(sq);
    }
  if (status != eslEOF) esl_fatal(msg);
  if (nseq   != N)      esl_fatal(msg);

  bpl = sqfp->data.ascii.bpl;
  rpl = sqfp->data.ascii.rpl;

  switch (mode) {
  case 0:  if (bpl != 0)                     esl_fatal(msg); break; /* uglified: bpl should be invalid (rpl might not be) */
  case 1:  if (rpl != 60 || bpl == 0)  esl_fatal(msg); break; /* spaced: bpl, rpl should be valid */
  case 2:  if (rpl != 60 || bpl != 61) esl_fatal(msg); break; /* normal: bpl, rpl should be valid, w/ bpl=rpl+1 */
  }

  esl_sqfile_Close(sqfp);
  esl_sq_Destroy(sq);
}

static void
utest_read_window(ESL_ALPHABET *abc, ESL_SQ **sqarr, int N, char *seqfile, int format, int mode)
{
  char       *msg         = "sqio window read unit test failed";
  ESL_SQ     *sq          = esl_sq_CreateDigital(abc);
  ESL_SQ     *rev         = esl_sq_CreateDigital(abc);
  ESL_SQFILE *sqfp        = NULL;
  int         nseq        = 0;
  int         C           = 10;
  int         W           = 50;
  int         nres        = 0;
  int         wstatus;

  int         bpl, rpl, L;
 
  if (esl_sqfile_OpenDigital(abc, seqfile, format, NULL, &sqfp) != eslOK) esl_fatal(msg);
  while ((wstatus = esl_sqio_ReadWindow(sqfp, C, W, sq)) == eslOK || wstatus == eslEOD)
    {
      if (wstatus == eslEOD) {
	if (W < 0) {
	  nseq++; 
	  nres = 0;
	  W    = -W;
	  esl_sq_Reuse(sq); 
	  esl_sq_Reuse(rev);
	} else       {
	  /* reverse complement */
	  nres = 0;
	  W    = -W; 	
	  esl_sq_Copy(sqarr[nseq], rev);
	  esl_sq_ReverseComplement(rev);
	}
	continue;
      }

      nres += sq->W;
      if (strcmp(sq->name,   sqarr[nseq]->name)   != 0) esl_fatal(msg);
      if (format != eslSQFILE_FASTA && 
	  strcmp(sq->acc,    sqarr[nseq]->acc)    != 0) esl_fatal(msg);
      if (strcmp(sq->desc,   sqarr[nseq]->desc)   != 0) esl_fatal(msg);

      L = sqfp->data.ascii.L;

      if (W > 0) {
	/* Forward strand coord checks */
	if (L   != nres)                                esl_fatal(msg);
	if (sq->start != nres - sq->n + 1)              esl_fatal(msg);
	if (sq->end   != nres)                          esl_fatal(msg);
	if (sq->C != 0 && sq->C != C)                   esl_fatal(msg);
	if (sq->n != sq->C+sq->W)                       esl_fatal(msg);
	if (sq->start+sq->n-1 > sqarr[nseq]->L)         esl_fatal(msg);
	if (wstatus == eslEOD && sq->L != L)            esl_fatal(msg);
	if (memcmp(sq->dsq + 1, sqarr[nseq]->dsq + sq->start, sq->C+sq->W) != 0) esl_fatal(msg);
      } else {
	/* Reverse strand coord checks */
	if (L    != -1)                                 esl_fatal(msg);
	if (sq->start  != sq->L - nres + sq->W + sq->C) esl_fatal(msg);
	if (sq->end    != sq->L - nres + 1)             esl_fatal(msg);
	if (sq->C != 0 && sq->C != C)                   esl_fatal(msg);
	if (sq->start-sq->n+1 < 1)                      esl_fatal(msg);
	if (wstatus == eslEOD && sq->end != 1)          esl_fatal(msg);
	if (memcmp(sq->dsq + 1, rev->dsq + (sq->L - sq->start + 1), sq->C+sq->W) != 0) esl_fatal(msg);
      }
    }

  bpl = sqfp->data.ascii.bpl;
  rpl = sqfp->data.ascii.rpl;

  switch (mode) {
  case 0:  if (bpl != 0)                     esl_fatal(msg); break; /* uglified: bpl should be invalid (rpl might not be) */
  case 1:  if (rpl != 60 || bpl == 0)  esl_fatal(msg); break; /* spaced: bpl, rpl should be valid */
  case 2:  if (rpl != 60 || bpl != 61) esl_fatal(msg); break; /* normal: bpl, rpl should be valid, w/ bpl=rpl+1 */
  }

  if (wstatus != eslEOF) esl_fatal(msg);
  if (nseq    != N)      esl_fatal(msg);
  esl_sqfile_Close(sqfp);
  esl_sq_Destroy(rev);
  esl_sq_Destroy(sq);
}

static void
utest_fetch_subseq(ESL_RANDOMNESS *r, ESL_ALPHABET *abc, ESL_SQ **sqarr, int N, char *seqfile, char *ssifile, int format)
{
  char       *msg         = "sqio subseq read unit test failure";
  ESL_SQ     *sq          = esl_sq_CreateDigital(abc);
  ESL_SQFILE *sqfp        = NULL;
  int         i;
  int         ntest       = 32;
  char       *source;
  int         start;
  int         end;

  if (esl_sqfile_OpenDigital(abc, seqfile, format, NULL, &sqfp) != eslOK) esl_fatal(msg);
  if (esl_sqfile_OpenSSI(sqfp, ssifile)                         != eslOK) esl_fatal(msg);
  while (ntest--) 
    {
      i      = esl_rnd_Roll(r, N);
      source = sqarr[i]->name; 
      if (sqarr[i]->L == 0) continue;   // Don't try to fetch from empty sequences.
      
      do {
	start = esl_rnd_Roll(r, sqarr[i]->n) + 1;
	end   = esl_rnd_Roll(r, sqarr[i]->n) + 1;
      } while (start > end);

      if (esl_sqio_FetchSubseq(sqfp, source, start, end, sq)        != eslOK) esl_fatal(msg);
      if (memcmp(&(sqarr[i]->dsq[start]), &sq->dsq[1], end-start+1) != 0)     esl_fatal(msg);
      
      esl_sq_Reuse(sq);
    }

  esl_sqfile_Close(sqfp);
  esl_sq_Destroy(sq);
}


/* Write the sequences out to a tmpfile in chosen <format>;
 * read them back and make sure they're the same.
 * reposition to beginning, read and check again.
 *
 * The sequences in <sqarr> are in digital mode.
 */
static void
utest_write(ESL_ALPHABET *abc, ESL_SQ **sqarr, int N, int format)
{
  char       *msg         = "sqio write unit test failure";
  char        tmpfile[32] = "esltmpXXXXXX";
  ESL_SQFILE *sqfp        = NULL;
  ESL_SQ     *sq          = esl_sq_CreateDigital(abc);
  FILE       *fp          = NULL;
  int         iterations  = 2;	/* 2: reposition and read again */
  int         i;
  int         require_nonzero_length = FALSE;

  if (esl_sqio_IsAlignment(format)) require_nonzero_length = TRUE;

  if (esl_tmpfile_named(tmpfile, &fp) != eslOK) esl_fatal(msg);
  for (i = 0; i < N; i++)
    if (! require_nonzero_length || sqarr[i]->L > 0)
      esl_sqio_Write(fp, sqarr[i], format, FALSE);
  fclose(fp);

  if (esl_sqfile_OpenDigital(abc, tmpfile, format, NULL, &sqfp)           != eslOK)  esl_fatal(msg);
  while (iterations--)
    {
      for (i = 0; i < N; i++)
	{
          if (require_nonzero_length && sqarr[i]->L == 0) continue;
	  if (esl_sqio_Read(sqfp, sq)                                     != eslOK)  esl_fatal(msg);
	  if (strcmp(sqarr[i]->name,   sq->name)                          != 0)      esl_fatal(msg);
	  if (sqarr[i]->L                                                 !=  sq->L) esl_fatal(msg);
	  if (memcmp(sqarr[i]->dsq, sq->dsq, sizeof(ESL_DSQ) * (sq->L+2)) != 0)      esl_fatal(msg);
	  esl_sq_Reuse(sq);
	}
      esl_sqfile_Position(sqfp, 0); /* rewind and make sure we get same reads again */
    }
  esl_sqfile_Close(sqfp);
  esl_sq_Destroy(sq);
  remove(tmpfile);
}


/* utest_guess_mechanics()
 * SRE H3/70, 8 Apr 17
 *
 * Related to EPN bugfix a61ee23: esl_sqfile_GuessAlphabet() segfaults
 * when file contains 1 seq, file is >4096 bytes, seq is <4000
 * residues, because of a fault in the mechanics of sqio with
 * is_recording TRUE and is_linebased FALSE.
 *
 * This unit test exercises those mechanics, the original bug and
 * more. It is *not* testing GuessAlphabet() itself. The DNA sequences
 * in <sqarr> are so dirty, their alphabet cannot be reliably
 * detected. This unit test is hunting segfaults, not even looking at
 * the return status of _GuessAlphabet().
 */
static void
utest_guess_mechanics(ESL_ALPHABET *abc, ESL_SQ **sqarr, int N)
{
  char       *msg         = "sqio guess_mechanics unit test failure";
  char        tmpfile[32];
  FILE       *fp;          
  ESL_SQFILE *sqfp;
  int         i;
  int         alphatype;

  for (i = 0; i < N; i++)  // for each individual sequence in <sqarr>, one at a time:
    {
      strcpy(tmpfile, "esltmpXXXXXX");
      if (esl_tmpfile_named(tmpfile, &fp)                      != eslOK) esl_fatal(msg);
      if (esl_sqio_Write(fp, sqarr[i], eslSQFILE_FASTA, FALSE) != eslOK) esl_fatal(msg);
      fclose(fp);

      if (esl_sqfile_Open(tmpfile, eslSQFILE_FASTA, NULL, &sqfp) != eslOK) esl_fatal(msg);

      esl_sqfile_GuessAlphabet(sqfp, &alphatype);
      // generally, the sequences are so dirty, GuessAlphabet() won't make a guess.
      // we're hunting segfaults in this utest.

      esl_sqfile_Close(sqfp);
      remove(tmpfile);
    }
}

#endif /*eslSQIO_TESTDRIVE*/
/*------------------ end, unit tests ----------------------------*/



/*****************************************************************
 *# 11. Test driver.
 *****************************************************************/

/* gcc -g -Wall -I. -L. -o sqio_utest -DeslSQIO_TESTDRIVE esl_sqio.c -leasel -lm
 * ./sqio_utest
 */
#ifdef eslSQIO_TESTDRIVE
#include "esl_config.h"

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "easel.h"
#include "esl_getopts.h"
#include "esl_random.h"
#include "esl_randomseq.h"
#include "esl_sq.h"
#include "esl_sqio.h"

static ESL_OPTIONS options[] = {
  /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
  { "-h",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",             0 },
  { "-L",        eslARG_INT,   "8000",  NULL, NULL,  NULL,  NULL, NULL, "max length of test sequences",                     0 },
  { "-N",        eslARG_INT,    "100",  NULL, NULL,  NULL,  NULL, NULL, "number of test sequences",                         0 },
  { "-s",        eslARG_INT,      "0",  NULL, NULL,  NULL,  NULL, NULL, "set random number seed to <n>",                    0 },
  {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[]  = "[-options]";
static char banner[] = "test driver for sqio module";

int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go       = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
  ESL_ALPHABET   *abc      = esl_alphabet_Create(eslDNA); /* DNA because some chars aren't legal in IUPAC DNA */
  ESL_RANDOMNESS *r        = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
  ESL_SQ        **sqarr    = NULL;
  int             maxL     = esl_opt_GetInteger(go, "-L");
  int             N        = esl_opt_GetInteger(go, "-N");
  int             i;
  int             mode;
  char            tmpfile[32];
  char            ssifile[32];
  FILE           *fp       = NULL;
  char            c;

  fprintf(stderr, "## %s\n", argv[0]);
  fprintf(stderr, "#  rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(r));

  /* Create an array of sequences we'll use for all the tests */
  synthesize_testseqs(r, abc, maxL, N, &sqarr);

  for (mode = 0; mode < 3; mode++) /* 0=ugly 1=spaced 2=normal*/
    {
      /* Write FASTA file to disk, and SSI index it */
      strcpy(tmpfile, "esltmpXXXXXX");
      if (esl_tmpfile_named(tmpfile, &fp) != eslOK) esl_fatal("failed to make tmpfile");
      switch (mode) {
      case 0: for (i = 0; i < N; i++) write_ugly_fasta(r, fp, sqarr[i]); break;
      case 1: for (i = 0; i < N; i++) write_spaced_fasta(fp, sqarr[i]);  break;
      case 2:
	for (i = 0; i < N; i++) {
	  c = sqarr[i]->acc[0];	/* hack: hide the accession, so digital writer doesn't write it. */
	  sqarr[i]->acc[0] = '\0';
	  esl_sqio_Write(fp, sqarr[i], eslSQFILE_FASTA, TRUE); 
	  sqarr[i]->acc[0] = c;
	}
	break;
      }
      fclose(fp);
      make_ssi_index(abc, tmpfile, eslSQFILE_FASTA, ssifile, mode);

      utest_read        (abc, sqarr, N, tmpfile, eslSQFILE_FASTA, mode);
      utest_read_info   (abc, sqarr, N, tmpfile, eslSQFILE_FASTA, mode);
      utest_read_window (abc, sqarr, N, tmpfile, eslSQFILE_FASTA, mode);
      utest_fetch_subseq(r, abc, sqarr, N, tmpfile, ssifile, eslSQFILE_FASTA);

      remove(tmpfile);
      remove(ssifile);
    }  

  utest_guess_mechanics(abc, sqarr, N);
  utest_write          (abc, sqarr, N, eslMSAFILE_STOCKHOLM);

  for (i = 0; i < N; i++) esl_sq_Destroy(sqarr[i]);
  free(sqarr);
  esl_randomness_Destroy(r);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);

  fprintf(stderr, "#  status = ok\n");
  return 0;
}
#endif /*eslSQIO_TESTDRIVE*/
/*------------------ end, test driver ---------------------------*/



/*****************************************************************
 *# 12. Examples
 *****************************************************************/
/* The last example in a file is always the most useful example;
 * so you can M-> to it immediately.
 *    example3 = using esl_sqio_Parse()
 *    example2 = simplest, text mode
 *    example  = standard idiom for digital seqfile reading
 */


#ifdef eslSQIO_EXAMPLE3
/*::cexcerpt::sqio_example_parse::begin::*/
/* Example of using esl_sqio_Parse() to parse a buffer
 *  cc -g -Wall -I. -L. -o esl_sqio_example3 -DeslSQIO_EXAMPLE3 esl_sqio.c -leasel -lm
 *  ./esl_sqio_example3
 */
#include "easel.h"
#include "esl_alphabet.h"
#include "esl_sq.h"
#include "esl_sqio.h"

int
main(void)
{
  ESL_ALPHABET *abc       = NULL;
  ESL_SQ       *sq        = NULL;
  int           format    = eslSQFILE_FASTA;
  int           alphatype = eslAMINO;
  int           status;

  char *test = ">12345 TEST Test fasta buffer\n"
               "ARVAPVALPSACAPAGTQCLISGWGNTLSNGVNNPDLLQCVDAPVLSQADCEAAYPGEIT\n"
               "SSMICVGFLEGGKDSCQGDSGGPVVCNGQLQGIVSWGYGCALPDNPGVYTKVCNFVGWIQ\n"
               "DTIAAN";

  abc = esl_alphabet_Create(alphatype);
  sq  = esl_sq_CreateDigital(abc);

  status = esl_sqio_Parse(test, strlen(test), sq, format);
  if      (status == eslEFORMAT) esl_fatal("Parse failed, invalid format");
  else if (status != eslOK)      esl_fatal("Unexpected error %d", status);

  esl_sq_Destroy(sq);
  esl_alphabet_Destroy(abc);
  return 0;
}
/*::cexcerpt::sqio_example_parse::end::*/
#endif /*eslSQIO_EXAMPLE3*/


#ifdef eslSQIO_EXAMPLE2
/*::cexcerpt::sqio_example_text::begin::*/
/* cc -g -Wall -I. -L. -o esl_sqio_example2 -DeslSQIO_EXAMPLE2 esl_sqio.c -leasel -lm
 * ./esl_sqio_example2 <FASTA file>
 */
#include "easel.h"
#include "esl_sq.h"
#include "esl_sqio.h"

int
main(int argc, char **argv)
{
  ESL_SQ     *sq      = esl_sq_Create();
  ESL_SQFILE *sqfp;
  int         format  = eslSQFILE_FASTA;
  char       *seqfile = argv[1];
  int         status;

  status = esl_sqfile_Open(seqfile, format, NULL, &sqfp);
  if      (status == eslENOTFOUND) esl_fatal("No such file.");
  else if (status == eslEFORMAT)   esl_fatal("Format unrecognized.");
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
  {     /* use each sequence for whatever you want */
    printf("%-40s length: %8ld   desclen: %lu\n", sq->name, (long) sq->L, (unsigned long) strlen(sq->desc));
    esl_sq_Reuse(sq);
  }
  if      (status == eslEFORMAT) esl_fatal("Parse failed\n  %s", esl_sqfile_GetErrorBuf(sqfp));
  else if (status != eslEOF)     esl_fatal("Unexpected read error %d", status);
  
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  return 0;
}
/*::cexcerpt::sqio_example_text::end::*/
#endif /*eslSQIO_EXAMPLE2*/




#ifdef eslSQIO_EXAMPLE
/*::cexcerpt::sqio_example_digital::begin::*/
/* Example showing standard idiom for opening sequence file, digital mode.
 *  cc -g -Wall -I. -L. -o esl_sqio_example -DeslSQIO_EXAMPLE esl_sqio.c -leasel -lm
 *  ./esl_sqio_example <sequence file>
 */
#include "easel.h"
#include "esl_alphabet.h"
#include "esl_getopts.h"
#include "esl_sq.h"
#include "esl_sqio.h"

static ESL_OPTIONS options[] = {
  /* name          type       default  env  range toggles reqs incomp  help         docgroup */
 { "-h",          eslARG_NONE,  FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help",    0 },
 { "--dna",       eslARG_NONE,  FALSE, NULL, NULL, NULL, NULL, NULL, "use DNA alphabet",   0 },
 { "--rna",       eslARG_NONE,  FALSE, NULL, NULL, NULL, NULL, NULL, "use RNA alphabet",   0 },
 { "--amino",     eslARG_NONE,  FALSE, NULL, NULL, NULL, NULL, NULL, "use amino alphabet", 0 },
 { "--informat",  eslARG_STRING, NULL, NULL, NULL, NULL, NULL, NULL, "set input format",   0 },
 {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[]  = "[-options] <seqfile>";
static char banner[] = "example of reading in standard sqio idiom, digital mode";

int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go        = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
  char         *seqfile   = esl_opt_GetArg(go, 1);
  ESL_ALPHABET *abc       = NULL;
  ESL_SQ       *sq        = NULL;
  ESL_SQFILE   *sqfp      = NULL;
  int           infmt     = eslSQFILE_UNKNOWN;
  int           alphatype = eslUNKNOWN;
  int           status;

  if (esl_opt_IsOn(go, "--informat")) {
    if ((infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")))==eslSQFILE_UNKNOWN)
      esl_fatal("%s is not a valid input sequence file format for --informat"); 
  }

  status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) esl_fatal("No such file.");
  else if (status == eslEFORMAT)   esl_fatal("Format couldn't be determined.");
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  if      (esl_opt_GetBoolean(go, "--rna"))   alphatype = eslRNA;
  else if (esl_opt_GetBoolean(go, "--dna"))   alphatype = eslDNA;
  else if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO;
  else {
    status = esl_sqfile_GuessAlphabet(sqfp, &alphatype);
    if      (status == eslENOALPHABET)  esl_fatal("Couldn't guess alphabet");
    else if (status == eslEFORMAT)      esl_fatal("Parse failed\n  %s",
						  esl_sqfile_GetErrorBuf(sqfp));     
    else if (status == eslENODATA)      esl_fatal("Sequence file empty?");
    else if (status != eslOK)           esl_fatal("Unexpected error guessing alphabet");
  }
  abc = esl_alphabet_Create(alphatype);
  sq  = esl_sq_CreateDigital(abc);
  esl_sqfile_SetDigital(sqfp, abc);

  while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
    {  
      esl_sqio_Write(stdout, sq, eslSQFILE_FASTA, /*update=*/FALSE);
      esl_sq_Reuse(sq);
    }
  if      (status == eslEFORMAT) esl_fatal("Parse failed\n  %s", esl_sqfile_GetErrorBuf(sqfp));
  else if (status != eslEOF)     esl_fatal("Unexpected error %d in reading", status);
  
  esl_sqfile_Close(sqfp);
  esl_sq_Destroy(sq);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
/*::cexcerpt::sqio_example_digital::end::*/
#endif /*eslSQIO_EXAMPLE*/


