/* i/o of multiple sequence alignment files in Clustal-like formats
 *
 * Contents:
 *   1. API for reading/writing Clustal and Clustal-like formats
 *   2. Internal routines for Clustal formats.
 *   3. Unit tests.
 *   4. Test driver.
 *   5. Example.
 *   
 * This module is responsible for i/o of both eslMSAFILE_CLUSTAL and
 * eslMSAFILE_CLUSTALLIKE alignment formats.
 */
#include "esl_config.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "easel.h"
#include "esl_alphabet.h"
#include "esl_mem.h"
#include "esl_msa.h"
#include "esl_msafile.h"

#include "esl_msafile_clustal.h"

static int make_text_consensus_line(const ESL_MSA *msa, char **ret_consline);
static int make_digital_consensus_line(const ESL_MSA *msa, char **ret_consline);

/*****************************************************************
 *# 1. API for reading/writing Clustal and Clustal-like formats
 *****************************************************************/

/* Function:  esl_msafile_clustal_SetInmap()
 * Synopsis:  Configure input map for CLUSTAL, CLUSTALLIKE formats.
 *
 * Purpose:   Set the <afp->inmap> for Clustal-like formats. 
 *
 *            Text mode accepts any <isgraph()> character. 
 *            Digital mode enforces the usual Easel alphabets.
 */
int
esl_msafile_clustal_SetInmap(ESL_MSAFILE *afp)
{
  int sym;

  if (afp->abc)
    {
      for (sym = 0; sym < 128; sym++) 
	afp->inmap[sym] = afp->abc->inmap[sym];
      afp->inmap[0] = esl_abc_XGetUnknown(afp->abc);
    }

  if (! afp->abc)
    {
      for (sym = 1; sym < 128; sym++) 
	afp->inmap[sym] = (isgraph(sym) ? sym : eslDSQ_ILLEGAL);
      afp->inmap[0] = '?';
    }
  return eslOK;
}


/* Function:  esl_msafile_clustal_GuessAlphabet()
 * Synopsis:  Guess the alphabet of an open Clustal MSA input.
 *
 * Purpose:   Guess the alpbabet of the sequences in open
 *            Clustal format MSA file <afp>.
 *            
 *            On a normal return, <*ret_type> is set to <eslDNA>,
 *            <eslRNA>, or <eslAMINO>, and <afp> is reset to its
 *            original position.
 *
 * Args:      afp      - open Clustal format MSA file
 *            ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>       
 *
 * Returns:   <eslOK> on success.
 *            <eslENOALPHABET> if alphabet type can't be determined.
 *            In either case, <afp> is rewound to the position it
 *            started at.
 *
 * Throws:    <eslEMEM> on allocation error.
 *            <eslESYS> on failures of fread() or other system calls
 */
int 
esl_msafile_clustal_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
{
  int       alphatype     = eslUNKNOWN;
  esl_pos_t anchor        = -1;
  int       threshold[3]  = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
  int       nsteps        = 3;
  int       step          = 0;
  int       nres          = 0;
  int       x;
  int64_t   ct[26];
  char     *p, *tok;
  esl_pos_t n,  toklen, pos;
  int       status;

  for (x = 0; x < 26; x++) ct[x] = 0;

  anchor = esl_buffer_GetOffset(afp->bf);
  if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */

  /* Ignore the first nonblank line, which says "CLUSTAL W (1.83) multiple sequence alignment" or some such */
  while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK  && esl_memspn(p, n, " \t") == n) ;
  if      (status == eslEOF) ESL_XFAIL(eslENOALPHABET, afp->errmsg, "can't determine alphabet: no alignment data found");
  else if (status != eslOK)  goto ERROR;
  
  while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
    {
      if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) continue; /* ignore blank lines */
      /* p now points to the rest of the sequence line, after a name */
      
      /* count characters into ct[] array */
      for (pos = 0; pos < n; pos++)
	if (isalpha(p[pos])) {
	  x = toupper(p[pos]) - 'A';
	  ct[x]++;
	  nres++; 	  
	} 
      /* note that GuessAlphabet() is robust against the optional coord lines
       * and the annotation lines -- it only counts ascii characters.
       */

      /* try to stop early, checking after 500, 5000, and 50000 residues: */
      if (step < nsteps && nres > threshold[step]) {
	if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
	step++;
      }
    }
  if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
  status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */

 DONE:
  esl_buffer_SetOffset(afp->bf, anchor);   /* Rewind to where we were. */
  esl_buffer_RaiseAnchor(afp->bf, anchor);
  *ret_type = alphatype;
  return status;

 ERROR:
  if (anchor != -1) {
    esl_buffer_SetOffset(afp->bf, anchor);
    esl_buffer_RaiseAnchor(afp->bf, anchor);
  }
  *ret_type = eslUNKNOWN;
  return status;
}


/* Function:  esl_msafile_clustal_Read()
 * Synopsis:  Read in a CLUSTAL or CLUSTALLIKE alignment.
 *
 * Purpose:   Read an MSA from an open <ESL_MSAFILE> <afp>, parsing
 *            for Clustal or Clustal-like format, starting from the 
 *            current point. (<afp->format> is expected to be
 *            <eslMSAFILE_CLUSTAL> or <eslMSAFILE_CLUSTALLIKE>.) Create a
 *            new multiple alignment, and return a ptr to that
 *            alignment in <*ret_msa>.  Caller is responsible for
 *            free'ing this <ESL_MSA>.
 *
 * Args:      afp     - open <ESL_MSAFILE>
 *            ret_msa - RETURN: newly parsed <ESL_MSA>
 *
 * Returns:   <eslOK> on success.
 * 
 *            <eslEOF> if no (more) alignment data are found in
 *            <afp>, and <afp> is returned at EOF. 
 *
 *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
 *            <NULL>. <afp> contains information sufficient for
 *            constructing useful diagnostic output: 
 *            | <afp->errmsg>       | user-directed error message     |
 *            | <afp->linenumber>   | line # where error was detected |
 *            | <afp->line>         | offending line (not NUL-term)   |
 *            | <afp->n>            | length of offending line        |
 *            | <afp->bf->filename> | name of the file                |
 *            and <afp> is poised at the start of the following line,
 *            so (in principle) the caller could try to resume
 *            parsing.
 *
 * Throws:    <eslEMEM> - an allocation failed.
 *            <eslESYS> - a system call such as fread() failed
 *            <eslEINCONCEIVABLE> - "impossible" corruption 
 */
int
esl_msafile_clustal_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
{
  ESL_MSA  *msa     = NULL;
  char     *p       = NULL;
  esl_pos_t n       = 0;
  char     *tok     = NULL;
  esl_pos_t ntok    = 0;
  int       nblocks = 0;
  int       idx     = 0;
  int       nseq    = 0;
  int64_t   alen    = 0;
  int64_t   cur_alen;
  esl_pos_t pos;
  esl_pos_t name_start, name_len;
  esl_pos_t seq_start, seq_len;
  esl_pos_t block_seq_start, block_seq_len;
  int       status;

  ESL_DASSERT1( (afp->format == eslMSAFILE_CLUSTAL || afp->format == eslMSAFILE_CLUSTALLIKE) );

  afp->errmsg[0] = '\0';
  
  if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
  if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }

  /* skip leading blank lines in file */
  while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK  && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
  if      (status != eslOK)  goto ERROR; /* includes normal EOF */
    
  /* That first line says something like: "CLUSTAL W (1.83) multiple sequence alignment" */
  if (esl_memtok(&p, &n, " \t", &tok, &ntok) != eslOK)                             ESL_XFAIL(eslEFORMAT, afp->errmsg, "missing CLUSTAL header");
  if (afp->format == eslMSAFILE_CLUSTAL && ! esl_memstrpfx(tok, ntok, "CLUSTAL"))  ESL_XFAIL(eslEFORMAT, afp->errmsg, "missing CLUSTAL header"); 
  if (! esl_memstrcontains(p, n, "multiple sequence alignment"))                   ESL_XFAIL(eslEFORMAT, afp->errmsg, "missing CLUSTAL header");

  /* skip blank lines again */
  do {
    status = esl_msafile_GetLine(afp, &p, &n);
    if      (status == eslEOF) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no alignment data following header");
    else if (status != eslOK) goto ERROR;
  } while (esl_memspn(afp->line, afp->n, " \t") == afp->n); /* idiom for "blank line" */

  /* Read the file a line at a time. */
  do { 		/* afp->line, afp->n is now the first line of a block... */
    idx = 0;
    do {
      for (pos = 0;     pos < n; pos++) if (! isspace(p[pos])) break;  
      name_start = pos; 
      for (pos = pos+1; pos < n; pos++) if (  isspace(p[pos])) break;  
      name_len   = pos - name_start;
      for (pos = pos+1; pos < n; pos++) if (! isspace(p[pos])) break;  
      seq_start  = pos;      
      if (pos >= n) ESL_XFAIL(eslEFORMAT, afp->errmsg, "invalid alignment line");
      for (pos = pos+1; pos < n; pos++) if (  isspace(p[pos])) break;  
      seq_len    = pos - seq_start; /* expect one block; ignore trailing stuff, inc. optional coords */

      if (idx == 0) {
	block_seq_start = seq_start;
	block_seq_len   = seq_len;
      } else {
	if (seq_start != block_seq_start) ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence start is misaligned");
	if (seq_len   != block_seq_len)   ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence end is misaligned");
      }

      /* Store the sequence name. */
      if (nblocks == 0)	{
	/* make sure we have room for another sequence */
	if (idx >= msa->sqalloc &&  (status = esl_msa_Expand(msa))           != eslOK) goto ERROR;
	if ( (status = esl_msa_SetSeqName(msa, idx, p+name_start, name_len)) != eslOK) goto ERROR;
	nseq++;
      } else {
	if (! esl_memstrcmp(p+name_start, name_len, msa->sqname[idx]))
	  ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected sequence %s on this line, but saw %.*s", msa->sqname[idx], (int) name_len, p+name_start);
      }

      /* Append the sequence. */
      cur_alen = alen;
      if (msa->abc)    { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[idx]),   &(cur_alen), p+seq_start, seq_len); }
      if (! msa->abc)  { status = esl_strmapcat (afp->inmap, &(msa->aseq[idx]), &(cur_alen), p+seq_start, seq_len); }
      if      (status == eslEINVAL)    ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
      else if (status != eslOK)        goto ERROR;
      if (cur_alen - alen != seq_len) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected number of seq characters");

      /* get next line. if it's a consensus line, we're done with the block */
      status = esl_msafile_GetLine(afp, &p, &n);
      if      (status == eslEOF) ESL_XFAIL(eslEFORMAT, afp->errmsg, "alignment block did not end with consensus line");
      else if (status != eslOK)  goto ERROR;

      idx++;
    } while (esl_memspn(afp->line, afp->n, " .:*") < afp->n); /* end loop over a block */
    
    if (idx != nseq) ESL_XFAIL(eslEFORMAT, afp->errmsg, "last block didn't contain same # of seqs as earlier blocks");

    /* skip blank lines until we find start of next block, or EOF */
    do {
      status = esl_msafile_GetLine(afp, &p, &n);
      if      (status == eslEOF) break;
      else if (status != eslOK)  goto ERROR;
    } while (esl_memspn(p, n, " \t") == n); 
    
    alen += block_seq_len;
    nblocks++;
  } while (status == eslOK);	/* normal end has status == EOF after last block. */

  msa->nseq = nseq;
  msa->alen = alen;
  if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
  *ret_msa = msa;
  return eslOK;

 ERROR:
  if (msa) esl_msa_Destroy(msa);
  *ret_msa = NULL;
  return status;
}  


/* Function:  esl_msafile_clustal_Write()
 * Synopsis:  Write a CLUSTAL format alignment file to a stream.
 *
 * Purpose:   Write alignment <msa> to output stream <fp>, in
 *            format <fmt>. If <fmt> is <eslMSAFILE_CLUSTAL>,
 *            write strict CLUSTAL 2.1 format. If <fmt>
 *            is <eslMSAFILE_CLUSTALLIKE>, put "EASEL (VERSION)"
 *            in the header.
 *            
 *            The alignment is written in blocks of 60 aligned
 *            residues at a time.
 *            
 *            Constructing the CLUSTAL consensus line properly
 *            requires knowing the alphabet. If the <msa> is in text
 *            mode, we don't know the alphabet, so then we use a
 *            simplified consensus line, with '*' marking completely
 *            conserved columns, ' ' on everything else. If the <msa>
 *            is in digital mode and of type <eslAMINO>, then we also
 *            use Clustal's "strong" and "weak" residue group
 *            annotations, ':' and '.'.  Strong groups are STA, NEQK,
 *            NHQK, NDEQ, QHRK, MILV, MILF, HY, and FYW. Weak groups
 *            are CSA, ATV, SAG, STNK, STPA, SGND, SNDEQK, NDEQHK,
 *            NEQHRK, FVLIM, and HFY.
 *            
 * Args:      fp  - open output stream, writable
 *            msa - alignment to write      
 *            fmt - eslMSAFILE_CLUSTAL or eslMSAFILE_CLUSTALLIKE      
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation error.
 *            <eslEWRITE> on any system write error, such as filled disk.
 */
int
esl_msafile_clustal_Write(FILE *fp, const ESL_MSA *msa, int fmt)
{
  int       cpl        = 60;
  int       maxnamelen = 0;
  int       namelen;
  char     *consline   = NULL;
  char     *buf        = NULL;
  int64_t   apos;
  int       i;
  int       status;

  ESL_ALLOC(buf, sizeof(char) * (cpl+1));
  buf[cpl] = '\0';
  for (i = 0; i < msa->nseq; i++)
    {
      namelen = strlen(msa->sqname[i]);
      maxnamelen = ESL_MAX(namelen, maxnamelen);
    }

  /* Make a CLUSTAL-like consensus line */
  if (  msa->abc && (status = make_digital_consensus_line(msa, &consline)) != eslOK) goto ERROR;
  if (! msa->abc && (status = make_text_consensus_line   (msa, &consline)) != eslOK) goto ERROR;

  /* The magic header */
  if      (fmt == eslMSAFILE_CLUSTAL)     { if (fprintf(fp, "CLUSTAL 2.1 multiple sequence alignment\n")               < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed");  }
  else if (fmt == eslMSAFILE_CLUSTALLIKE) { if (fprintf(fp, "EASEL (%s) multiple sequence alignment\n", EASEL_VERSION) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed");  }

  /* The alignment */
  for (apos = 0; apos < msa->alen; apos += cpl)
    {
      if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed"); 
      for (i = 0; i < msa->nseq; i++)
	{
	  if (msa->abc)   esl_abc_TextizeN(msa->abc, msa->ax[i]+apos+1, cpl, buf);
	  if (! msa->abc) strncpy(buf, msa->aseq[i]+apos, cpl);
	  if (fprintf(fp, "%-*s %s\n", maxnamelen, msa->sqname[i], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed"); 
	}
      strncpy(buf, consline+apos, cpl);
      if (fprintf(fp, "%-*s %s\n", maxnamelen, "", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "clustal msa write failed"); 
    }

  free(buf);
  free(consline);
  return eslOK;

 ERROR:
  if (buf)      free(buf);
  if (consline) free(consline);
  return status;
}
/*---------------- end, Clustal API -----------------------------*/



/*****************************************************************
 * 2. Internal routines for Clustal formats
 *****************************************************************/

/* Clustal consensus lines. 
 *    '*' :  100% conserved positions
 *    ':' :  all residues in column belong to a "strong group" 
 *    '.' :  all residues in column belong to a "weak group"
 *    ' ' :  otherwise
 *    
 * Gap characters count, and ambiguity codes are interpreted verbatim,
 * so even a single gap or ambiguity code makes the column a ' '.
 *
 * From examining the source code for ClustalW (as it writes its
 * "self explanatory format", ahem!):
 *   strong groups = STA, NEQK, NHQK, NDEQ, QHRK, MILV, MILF,
 *                   HY,  FYW
 *   weak groups =   CSA, ATV,  SAG,  STNK, STPA, SGND, SNDEQK,
 *	             NDEQHK, NEQHRK, FVLIM, HFY
 *    
 * These groups only apply to protein data, and therefore only to
 * digital alignments using an <eslAMINO> alphabet.  

 * Calculating the consensus line can be compute-intensive, for large
 * alignments. A naive implementation (for each column, collect
 * residue counts, compare to each conservation group) was judged too
 * slow: 16.2s to write the Pkinase full alignment, compared to 1.5s
 * to write Stockholm format [SRE:J8/22]. Here we use a slightly less
 * naive implementation, which collects a bit vector (one bit per
 * residue) for each column, and traverses the alignment in stride
 * (sequences, then columns). Writing Clustal format Pkinase now takes
 * 2.3s, and most of the difference w.r.t. Stockholm is now assignable
 * to the smaller width (thus greater number of blocks) written for
 * Clustal (60 cpl vs 200) rather than to consensus construction.
 * 
 * An oversophisticated approach could use a finite
 * automaton to store all groups in one machine, then to use the FA to
 * process each residue seen in a column; for most columns, we would
 * quickly reach a rejection state (most columns don't belong to 
 * a conservation group, especially in large alignments). For a sketch 
 * of how to construct and use such an automaton, xref SRE:J8/22.
 * I decided this was probably overkill, and didn't implement it.
 */


/* make_text_consensus_line()
 * 
 * Given a text mode <msa>, allocate and create a CLUSTAL-style
 * consensus line; return it in <*ret_consline>. Caller is responsible
 * for free'ing this string.
 * 
 * In text mode, we don't know the alphabet; in particular, we can't
 * know if the data are amino acids, so we don't know if it's
 * appropriate to use the amino acid group codes. So we don't;
 * in text mode, only '*' and ' ' appear in consensus lines.
 * 
 * The consensus line is numbered 0..alen-1, and is NUL-terminated.
 * 
 * Returns <eslOK> on success.
 * No normal failure codes.
 * Throws <eslEMEM> on allocation error.
 */
static int
make_text_consensus_line(const ESL_MSA *msa, char **ret_consline)
{
  char     *consline = NULL;
  uint32_t *v        = NULL;
  uint32_t  tmpv, maxv;
  int       n;
  int       idx, apos, x;
  int       status;

  ESL_ALLOC(consline, sizeof(char)     * (msa->alen+1));
  ESL_ALLOC(v,        sizeof(uint32_t) * (msa->alen));
  for (apos = 0; apos < msa->alen; apos++)
    v[apos] = 0;

  for (idx = 0; idx < msa->nseq; idx++)
    for (apos = 0; apos < msa->alen; apos++)
      {
	x = toupper(msa->aseq[idx][apos]) - 'A';
	if (x >= 0 && x < 26) v[apos] |= (1 <<  x);
	else                  v[apos] |= (1 << 26);
      }	
  maxv = (1 << 26) - 1;

  for (apos = 0; apos < msa->alen; apos++)
    {
      for (n = 0, tmpv = v[apos]; tmpv; n++) tmpv &= tmpv-1; /* Kernighan magic: count # of bits set in tmpv */
      consline[apos] = ((n == 1 && v[apos] < maxv) ? '*' : ' ');
    }
  consline[msa->alen] = '\0';

  *ret_consline = consline;
  free(v);
  return eslOK;

 ERROR:
  if (v)        free(v);
  if (consline) free(consline);
  *ret_consline = NULL;
  return status;
}


/* make_digital_consensus_line()
 * 
 * Exactly the same as make_text_consensus_line(), except for
 * digital mode <msa>.
 */
static int
matches_group_digital(ESL_ALPHABET *abc, uint32_t v, char *group)
{
  uint32_t gv  = 0;
  ESL_DSQ  sym;
  char    *c;

  for (c = group; *c; c++) {
    sym = esl_abc_DigitizeSymbol(abc, *c);
    gv |= (1 << sym);
  }
  return ( ((v & gv) == v) ? TRUE : FALSE);
}
  
static int
make_digital_consensus_line(const ESL_MSA *msa, char **ret_consline)
{
  char     *consline = NULL;
  uint32_t *v        = NULL;
  uint32_t  tmpv, maxv;
  int       n;
  int       idx, apos;
  int       status;

  /* if this ever becomes a problem, easy enough to make v a uint64_t to get up to Kp<=64 */
  if (msa->abc->Kp > 32) ESL_EXCEPTION(eslEINVAL, "Clustal format writer cannot handle digital alphabets of Kp>32 residues");

  ESL_ALLOC(v,        sizeof(uint32_t) * (msa->alen+1));
  ESL_ALLOC(consline, sizeof(char)     * (msa->alen+1));
  for (apos = 0; apos <= msa->alen; apos++)
    v[apos] = 0;

  for (idx = 0; idx < msa->nseq; idx++)
    for (apos = 1; apos <= msa->alen; apos++)
      v[apos] |= (1 << msa->ax[idx][apos]);

  maxv = (1 << msa->abc->K) - 1; /* maxv: has all canonical residue bits set */

  for (apos = 1; apos <= msa->alen; apos++)
    {
      consline[apos-1] = ' ';

      for (n = 0, tmpv = v[apos]; tmpv; n++) tmpv &= tmpv-1; /* Kernighan magic: count # of bits set in tmpv */

      if      (n == 0 || n > 6)  continue;               /* n==0 shouldn't happen; n > 6 means too many different residues seen */
      else if (v[apos] > maxv)   continue;	         /* gap or ambiguity chars seen; column must be left unannotated */
      else if (n == 1)           consline[apos-1] = '*'; /* complete conservation of a canonical residue */
      else if (msa->abc->type == eslAMINO) 
	{
	  if      (matches_group_digital(msa->abc, v[apos], "STA"))  consline[apos-1] = ':';
	  else if (matches_group_digital(msa->abc, v[apos], "NEQK")) consline[apos-1] = ':';
	  else if (matches_group_digital(msa->abc, v[apos], "NHQK")) consline[apos-1] = ':';
	  else if (matches_group_digital(msa->abc, v[apos], "NDEQ")) consline[apos-1] = ':';
	  else if (matches_group_digital(msa->abc, v[apos], "QHRK")) consline[apos-1] = ':';
	  else if (matches_group_digital(msa->abc, v[apos], "MILV")) consline[apos-1] = ':';
	  else if (matches_group_digital(msa->abc, v[apos], "MILF")) consline[apos-1] = ':';
	  else if (matches_group_digital(msa->abc, v[apos], "HY"))   consline[apos-1] = ':';
	  else if (matches_group_digital(msa->abc, v[apos], "FYW"))  consline[apos-1] = ':';

	  else if (matches_group_digital(msa->abc, v[apos], "CSA"))    consline[apos-1] = '.'; 
	  else if (matches_group_digital(msa->abc, v[apos], "ATV"))    consline[apos-1] = '.';
	  else if (matches_group_digital(msa->abc, v[apos], "SAG"))    consline[apos-1] = '.';
	  else if (matches_group_digital(msa->abc, v[apos], "STNK"))   consline[apos-1] = '.';
	  else if (matches_group_digital(msa->abc, v[apos], "STPA"))   consline[apos-1] = '.';
	  else if (matches_group_digital(msa->abc, v[apos], "SGND"))   consline[apos-1] = '.';
	  else if (matches_group_digital(msa->abc, v[apos], "SNDEQK")) consline[apos-1] = '.';
	  else if (matches_group_digital(msa->abc, v[apos], "NDEQHK")) consline[apos-1] = '.';
	  else if (matches_group_digital(msa->abc, v[apos], "NEQHRK")) consline[apos-1] = '.';
	  else if (matches_group_digital(msa->abc, v[apos], "FVLIM"))  consline[apos-1] = '.';
	  else if (matches_group_digital(msa->abc, v[apos], "HFY"))    consline[apos-1] = '.';
	}
    }
  consline[apos-1] = '\0';

  *ret_consline = consline;
  free(v);
  return eslOK;

 ERROR:
  if (v)        free(v);
  if (consline) free(consline);
  *ret_consline = NULL;
  return eslOK;
}
/*-------------- end, internal clustal routines -----------------*/


/*****************************************************************
 * 3. Unit tests.
 *****************************************************************/
#ifdef eslMSAFILE_CLUSTAL_TESTDRIVE

static void
utest_write_good1(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
{
  fputs("MUSCLE (3.7) multiple sequence alignment\n", ofp);
  fputs("\n", ofp);
  fputs("\n", ofp);
  fputs("MYG_PHYCA       --------V-LSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKT\n", ofp);
  fputs("GLB5_PETMA      PIVDTGSVAPLSAAEKTKIRSAWAPVYSTYETSGVDILVKFFTSTPAAQEFFPKFKGLTT\n", ofp);
  fputs("HBB_HUMAN       --------VHLTPEEKSAVTALWGKV--NVDEVGGEALGRLLVVYPWTQRFFESFGDLST\n", ofp);
  fputs("HBA_HUMAN       --------V-LSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF-----\n", ofp);
  fputs("                        . *:  :.  :   *. *       * : * .::   * :   *  *     \n", ofp);
  fputs("\n", ofp);
  fputs("MYG_PHYCA       EAEMKASEDLKKHGVTVLTALGAILKKKGH---HEAELKPLAQSHATKHKIPIKYLEFIS\n", ofp);
  fputs("GLB5_PETMA      ADQLKKSADVRWHAERIINAVNDAVASMDDTEKMSMKLRDLSGKHAKSFQVDPQYFKVLA\n", ofp);
  fputs("HBB_HUMAN       PDAVMGNPKVKAHGKKVLGAFSDGLAHLDN---LKGTFATLSELHCDKLHVDPENFRLLG\n", ofp);
  fputs("HBA_HUMAN       -DLSHGSAQVKGHGKKVADALTNAVAHVDD---MPNALSALSDLHAHKLRVDPVNFKLLS\n", ofp);
  fputs("                                                                            \n", ofp); /* deliberately made blank */
  fputs("\n", ofp);
  fputs("MYG_PHYCA       EAIIHVLHSRHPGDFGADAQGAMNKALELFRKDIAAKYKELGYQG\n", ofp);
  fputs("GLB5_PETMA      AVI---------ADTVAAGDAGFEKLMSMICILLRSAY-------\n", ofp);
  fputs("HBB_HUMAN       NVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH------\n", ofp);
  fputs("HBA_HUMAN       HCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR------\n", ofp);
  fputs("                  :          :  .   .. :* :  .   :   *       \n", ofp);
  fputs("\n", ofp);

  *ret_format    = eslMSAFILE_CLUSTALLIKE;
  *ret_alphatype = eslAMINO;
  *ret_nseq      = 4;
  *ret_alen      = 165;
}

static void
utest_write_good2(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
{
  fputs("CLUSTAL W (1.81) multiple sequence alignment\n", ofp);
  fputs("\n", ofp);
  fputs("tRNA2           UCCGAUAUAGUGUAACGGCUAUCACAUCACGCUUUCACCGUGG-AGACCGGGGUUCGACU\n", ofp);
  fputs("tRNA3           UCCGUGAUAGUUUAAUGGUCAGAAUGG-GCGCUUGUCGCGUGCCAGAUCGGGGUUCAAUU\n", ofp);
  fputs("tRNA5           GGGCACAUGGCGCAGUUGGUAGCGCGCUUCCCUUGCAAGGAAGAGGUCAUCGGUUCGAUU\n", ofp);
  fputs("tRNA1           GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUC\n", ofp);
  fputs("tRNA4           GCUCGUAUGGCGCAGUGG-UAGCGCAGCAGAUUGCAAAUCUGUUGGUCCUUAGUUCGAUC\n", ofp);
  fputs("                       * *   *   *  *           *            *      **** *  \n", ofp);
  fputs("\n", ofp);
  fputs("tRNA2           CCCCGUAUCGGAG\n", ofp);
  fputs("tRNA3           CCCCGUCGCGGAG\n", ofp);
  fputs("tRNA5           CCGGUUGCGUCCA\n", ofp);
  fputs("tRNA1           CACAGAAUUCGCA\n", ofp);
  fputs("tRNA4           CUGAGUGCGAGCU\n", ofp);
  fputs("                *            \n", ofp);

  *ret_format    = eslMSAFILE_CLUSTAL;
  *ret_alphatype = eslRNA;
  *ret_nseq      = 5;
  *ret_alen      = 73;
}

/* An example of clustal format with optional sequence coords;
 * a quickly-taken subset of a larger alignment reported as a bug.
 */
static void
utest_write_good3(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
{
  fputs("CLUSTAL 2.1 multiple sequence alignment\n", ofp);
  fputs("\n", ofp);
  fputs("gi|85091828|ref|XP_959093.1|        MSDFTSKVKVLRDGQKPEFPSN----ANTLEYAQSLDAQDELRHFRNEFI 46\n", ofp);
  fputs("gi|70993990|ref|XP_751842.1|        ----MSTNGTLS---KPEFPAN----AASKEYAASLDAADPFAGFREKFI 39\n", ofp);
  fputs("gi|71001376|ref|XP_755369.1|        ---MGSRLHVQVIHGGPPLPYKDDIRAFGKEYAEQLDAQDPLRRFRDEFI 47\n", ofp);
  fputs("gi|71744026|ref|XP_803513.1|        -----------------------------------MDRNDPLQVHRDAFN 15\n", ofp);
  fputs("                                                                                 :  : \n",    ofp);
  fputs("\n", ofp);
  fputs("gi|85091828|ref|XP_959093.1|        IPTRASLKKKALDGI--------------IPGTQANGTTTSTDADTPCIY 82\n", ofp);
  fputs("gi|70993990|ref|XP_751842.1|        IPSKANIASTKLA----------------KPGLSSE----------PCIY 63\n", ofp);
  fputs("gi|71001376|ref|XP_755369.1|        IPSKKDLKRKTLFPNDGMYSCGHPICFANTSCACVHAAETEETSDEKCIY 97\n", ofp);
  fputs("gi|71744026|ref|XP_803513.1|        IPKRRDGS--------------------------------------DHVY 27\n", ofp);
  fputs("                                                                                     *\n",    ofp);

  *ret_format    = eslMSAFILE_CLUSTAL;
  *ret_alphatype = eslAMINO;
  *ret_nseq      = 4;
  *ret_alen      = 100;
}


static void
utest_goodfile(char *filename, int testnumber, int expected_format, int expected_alphatype, int expected_nseq, int expected_alen)
{
  ESL_ALPHABET        *abc          = NULL;
  ESL_MSAFILE         *afp          = NULL;
  ESL_MSA             *msa1         = NULL;
  ESL_MSA             *msa2         = NULL;
  char                 tmpfile1[32] = "esltmpXXXXXX";
  char                 tmpfile2[32] = "esltmpXXXXXX";
  FILE                *ofp          = NULL;
  int                  status;

  /* guessing both the format and the alphabet should work: this is a digital open */
  if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("clustal good file test %d failed: digital open",           testnumber);  
  if (afp->format != expected_format)                                                              esl_fatal("clustal good file test %d failed: format autodetection",   testnumber);
  if (abc->type   != expected_alphatype)                                                           esl_fatal("clustal good file test %d failed: alphabet autodetection", testnumber);

  /* This is a digital read, using <abc>. */
  if ( (status = esl_msafile_clustal_Read(afp, &msa1))   != eslOK) esl_fatal("clustal good file test %d failed: msa read, digital", testnumber);  
  if (msa1->nseq != expected_nseq || msa1->alen != expected_alen)  esl_fatal("clustal good file test %d failed: nseq/alen",         testnumber);
  if (esl_msa_Validate(msa1, NULL) != eslOK)                       esl_fatal("clustal good file test %d failed: msa1 invalid",      testnumber);
  esl_msafile_Close(afp);  

  /* write it back out to a new tmpfile (digital write) */
  if ( (status = esl_tmpfile_named(tmpfile1, &ofp))                     != eslOK) esl_fatal("clustal good file test %d failed: tmpfile creation",   testnumber);
  if ( (status = esl_msafile_clustal_Write(ofp, msa1, expected_format)) != eslOK) esl_fatal("clustal good file test %d failed: msa write, digital", testnumber);
  fclose(ofp);

  /* now open and read it as text mode, in known format. (We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */
  if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, expected_format, NULL, &afp)) != eslOK) esl_fatal("clustal good file test %d failed: text mode open", testnumber);  
  if ( (status = esl_msafile_clustal_Read(afp, &msa2))                                != eslOK) esl_fatal("clustal good file test %d failed: msa read, text", testnumber);  
  if (msa2->nseq != expected_nseq || msa2->alen != expected_alen)                               esl_fatal("clustal good file test %d failed: nseq/alen",      testnumber);
  if (esl_msa_Validate(msa2, NULL) != eslOK)                                                    esl_fatal("clustal good file test %d failed: msa2 invalid",   testnumber);
  esl_msafile_Close(afp);
  
  /* write it back out to a new tmpfile (text write) */
  if ( (status = esl_tmpfile_named(tmpfile2, &ofp))                     != eslOK) esl_fatal("clustal good file test %d failed: tmpfile creation", testnumber);
  if ( (status = esl_msafile_clustal_Write(ofp, msa2, expected_format)) != eslOK) esl_fatal("clustal good file test %d failed: msa write, text",  testnumber);
  fclose(ofp);
  esl_msa_Destroy(msa2);

  /* open and read it in digital mode */
  if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, expected_format, NULL, &afp)) != eslOK) esl_fatal("clustal good file test %d failed: 2nd digital mode open", testnumber);  
  if ( (status = esl_msafile_clustal_Read(afp, &msa2))                                != eslOK) esl_fatal("clustal good file test %d failed: 2nd digital msa read",  testnumber);  
  if (esl_msa_Validate(msa2, NULL) != eslOK)                                                    esl_fatal("clustal good file test %d failed: msa2 invalid",          testnumber);
  esl_msafile_Close(afp);

  /* this msa <msa2> should be identical to <msa1> */
  if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("clustal good file test %d failed: msa compare", testnumber);  

  remove(tmpfile1);
  remove(tmpfile2);
  esl_msa_Destroy(msa1);
  esl_msa_Destroy(msa2);
  esl_alphabet_Destroy(abc);
}

static void
write_test_msas(FILE *ofp1, FILE *ofp2)
{
  fprintf(ofp1, "EASEL (X.x) multiple sequence alignment\n");
  fprintf(ofp1, "\n");
  fprintf(ofp1, "seq1 ..acdefghiklmnpqrstvwy\n");
  fprintf(ofp1, "seq2 ..acdefghiklmnpqrstv--\n");
  fprintf(ofp1, "seq3 aaacdefghiklmnpqrstv--\n");
  fprintf(ofp1, "seq4 ..acdefghiklmnpqrstvwy\n");
  fprintf(ofp1, "       ******************  \n");
  fprintf(ofp1, "\n");
  fprintf(ofp1, "seq1 ACDEFGHIKLMNPQRSTVWY\n");
  fprintf(ofp1, "seq2 ACDEFGHIKLMNPQRSTVWY\n");
  fprintf(ofp1, "seq3 ACDEFGHIKLMNPQRSTVWY\n");
  fprintf(ofp1, "seq4 ACDEFGHIKLMNPQRSTVWY\n");
  fprintf(ofp1, "     ********************\n");
  fprintf(ofp1, "\n");
  fprintf(ofp1, "seq1 ..\n");
  fprintf(ofp1, "seq2 YY\n");
  fprintf(ofp1, "seq3 ..\n");
  fprintf(ofp1, "seq4 ..\n");
  fprintf(ofp1, "\n");

  fprintf(ofp2, "# STOCKHOLM 1.0\n");
  fprintf(ofp2, "\n");
  fprintf(ofp2, "seq1    ..acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY..\n");
  fprintf(ofp2, "seq2    ..acdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWYYY\n");
  fprintf(ofp2, "seq3    aaacdefghiklmnpqrstv--ACDEFGHIKLMNPQRSTVWY..\n");
  fprintf(ofp2, "seq4    ..acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY..\n");
  fprintf(ofp2, "//\n");
}

static void
read_test_msas_digital(char *alnfile, char *stkfile)
{
  char msg[]         = "CLUSTAL msa digital read unit test failed";
  ESL_ALPHABET *abc  = NULL;
  ESL_MSAFILE  *afp1 = NULL;
  ESL_MSAFILE  *afp2 = NULL;
  ESL_MSA      *msa1, *msa2, *msa3, *msa4;
  FILE         *alnfp, *stkfp;
  char          alnfile2[32] = "esltmpaln2XXXXXX";
  char          stkfile2[32] = "esltmpstk2XXXXXX";

  if ( esl_msafile_Open(&abc, alnfile, NULL, eslMSAFILE_CLUSTALLIKE, NULL, &afp1) != eslOK)  esl_fatal(msg);
  if ( !abc || abc->type != eslAMINO)                                                        esl_fatal(msg);
  if ( esl_msafile_Open(&abc, stkfile, NULL, eslMSAFILE_STOCKHOLM,   NULL, &afp2) != eslOK)  esl_fatal(msg);
  if ( esl_msafile_clustal_Read  (afp1, &msa1)                                    != eslOK)  esl_fatal(msg);
  if ( esl_msafile_stockholm_Read(afp2, &msa2)                                    != eslOK)  esl_fatal(msg);
  if ( esl_msa_Compare(msa1, msa2)                                                != eslOK)  esl_fatal(msg);
  
  if ( esl_msafile_clustal_Read  (afp1, &msa3) != eslEOF) esl_fatal(msg);
  if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);

  esl_msafile_Close(afp2);
  esl_msafile_Close(afp1);

  /* Now write stk to clustal file, and vice versa; then retest */
  if ( esl_tmpfile_named(alnfile2, &alnfp)                                  != eslOK) esl_fatal(msg);
  if ( esl_tmpfile_named(stkfile2, &stkfp)                                  != eslOK) esl_fatal(msg);
  if ( esl_msafile_clustal_Write  (alnfp, msa2, eslMSAFILE_CLUSTAL)         != eslOK) esl_fatal(msg);
  if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM)       != eslOK) esl_fatal(msg);
  fclose(alnfp);
  fclose(stkfp);
  if ( esl_msafile_Open(&abc, alnfile2, NULL, eslMSAFILE_CLUSTAL,   NULL, &afp1) != eslOK) esl_fatal(msg);
  if ( esl_msafile_Open(&abc, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
  if ( esl_msafile_clustal_Read  (afp1, &msa3)                                   != eslOK) esl_fatal(msg);
  if ( esl_msafile_stockholm_Read(afp2, &msa4)                                   != eslOK) esl_fatal(msg);
  if ( esl_msa_Compare(msa3, msa4)                                               != eslOK) esl_fatal(msg);

  remove(alnfile2);
  remove(stkfile2);
  esl_msafile_Close(afp2);
  esl_msafile_Close(afp1);

  esl_msa_Destroy(msa1);
  esl_msa_Destroy(msa2);
  esl_msa_Destroy(msa3);  
  esl_msa_Destroy(msa4);
  esl_alphabet_Destroy(abc);
}

static void
read_test_msas_text(char *alnfile, char *stkfile)
{
  char msg[]         = "CLUSTAL msa text-mode read unit test failed";
  ESL_MSAFILE  *afp1 = NULL;
  ESL_MSAFILE  *afp2 = NULL;
  ESL_MSA      *msa1, *msa2, *msa3, *msa4;
  FILE         *alnfp, *stkfp;
  char          alnfile2[32] = "esltmpaln2XXXXXX";
  char          stkfile2[32] = "esltmpstk2XXXXXX";

  /*                    vvvv-- everything's the same as the digital utest except these NULLs  */
  if ( esl_msafile_Open(NULL, alnfile, NULL, eslMSAFILE_CLUSTALLIKE, NULL, &afp1) != eslOK)  esl_fatal(msg);
  if ( esl_msafile_Open(NULL, stkfile, NULL, eslMSAFILE_STOCKHOLM,   NULL, &afp2) != eslOK)  esl_fatal(msg);
  if ( esl_msafile_clustal_Read  (afp1, &msa1)                                    != eslOK)  esl_fatal(msg);
  if ( esl_msafile_stockholm_Read(afp2, &msa2)                                    != eslOK)  esl_fatal(msg);
  if ( esl_msa_Compare(msa1, msa2)                                                != eslOK)  esl_fatal(msg);
  if ( esl_msafile_clustal_Read  (afp1, &msa3)                                    != eslEOF) esl_fatal(msg);
  if ( esl_msafile_stockholm_Read(afp2, &msa3)                                    != eslEOF) esl_fatal(msg);
  esl_msafile_Close(afp2);
  esl_msafile_Close(afp1);

  if ( esl_tmpfile_named(alnfile2, &alnfp)                                   != eslOK) esl_fatal(msg);
  if ( esl_tmpfile_named(stkfile2, &stkfp)                                   != eslOK) esl_fatal(msg);
  if ( esl_msafile_clustal_Write  (alnfp, msa2, eslMSAFILE_CLUSTAL)          != eslOK) esl_fatal(msg);
  if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM)        != eslOK) esl_fatal(msg);
  fclose(alnfp);
  fclose(stkfp);
  if ( esl_msafile_Open(NULL, alnfile2, NULL, eslMSAFILE_CLUSTAL,   NULL, &afp1)  != eslOK) esl_fatal(msg);
  if ( esl_msafile_Open(NULL, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2)  != eslOK) esl_fatal(msg);
  if ( esl_msafile_clustal_Read  (afp1, &msa3)                                    != eslOK) esl_fatal(msg);
  if ( esl_msafile_stockholm_Read(afp2, &msa4)                                    != eslOK) esl_fatal(msg);
  if ( esl_msa_Compare(msa3, msa4)                                                != eslOK) esl_fatal(msg);

  remove(alnfile2);
  remove(stkfile2);
  esl_msafile_Close(afp2);
  esl_msafile_Close(afp1);

  esl_msa_Destroy(msa1);
  esl_msa_Destroy(msa2);
  esl_msa_Destroy(msa3);  
  esl_msa_Destroy(msa4);
}
#endif /*eslMSAFILE_CLUSTAL_TESTDRIVE*/
/*---------------------- end, unit tests ------------------------*/


/*****************************************************************
 * 4. Test driver.
 *****************************************************************/
#ifdef eslMSAFILE_CLUSTAL_TESTDRIVE
/* compile: gcc -g -Wall -I. -L. -o esl_msafile_clustal_utest -DeslMSAFILE_CLUSTAL_TESTDRIVE esl_msafile_clustal.c -leasel -lm
 *  (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_clustal_utest -DeslMSAFILE_CLUSTAL_TESTDRIVE esl_msafile_clustal.c -leasel -lm
 * run:     ./esl_msafile_clustal_utest
 */
#include "esl_config.h"

#include <stdio.h>

#include "easel.h"
#include "esl_getopts.h"
#include "esl_random.h"
#include "esl_msafile.h"
#include "esl_msafile_clustal.h"

static ESL_OPTIONS options[] = {
   /* name  type         default  env   range togs  reqs  incomp  help                docgrp */
  {"-h",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage",                            0},
  { 0,0,0,0,0,0,0,0,0,0},
};
static char usage[]  = "[-options]";
static char banner[] = "test driver for CLUSTAL MSA format module";

int
main(int argc, char **argv)
{
  char            msg[]        = "CLUSTAL MSA i/o module test driver failed";
  ESL_GETOPTS    *go           = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
  char            alnfile[32] = "esltmpalnXXXXXX";
  char            stkfile[32] = "esltmpstkXXXXXX";
  FILE           *alnfp, *stkfp;
  int             testnumber;
  int             ngoodtests = 3;
  char            tmpfile[32];
  FILE           *ofp;
  int             expected_format;
  int             expected_alphatype;
  int             expected_nseq;
  int             expected_alen;

  if ( esl_tmpfile_named(alnfile, &alnfp) != eslOK) esl_fatal(msg);
  if ( esl_tmpfile_named(stkfile, &stkfp) != eslOK) esl_fatal(msg);
  write_test_msas(alnfp, stkfp);
  fclose(alnfp);
  fclose(stkfp);

  read_test_msas_digital(alnfile, stkfile);
  read_test_msas_text   (alnfile, stkfile);

  /* Various "good" files that should be parsed correctly */
  for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
    {
      strcpy(tmpfile, "esltmpXXXXXX"); 
      if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
      switch (testnumber) {
      case  1:  utest_write_good1 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
      case  2:  utest_write_good2 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
      case  3:  utest_write_good3 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
      }
      fclose(ofp);
      utest_goodfile(tmpfile, testnumber, expected_format, expected_alphatype, expected_nseq, expected_alen);
      remove(tmpfile);
    }

  remove(alnfile);
  remove(stkfile);
  esl_getopts_Destroy(go);
  return 0;
}
#endif /*eslMSAFILE_CLUSTAL_TESTDRIVE*/
/*--------------------- end, test driver ------------------------*/


/*****************************************************************
 * 5. Examples.
 *****************************************************************/


#ifdef eslMSAFILE_CLUSTAL_EXAMPLE
/* A full-featured example of reading/writing an MSA in Clustal format(s).
   gcc -g -Wall -o esl_msafile_clustal_example -I. -L. -DeslMSAFILE_CLUSTAL_EXAMPLE esl_msafile_clustal.c -leasel -lm
   ./esl_msafile_clustal_example <msafile>
 */
/*::cexcerpt::msafile_clustal_example::begin::*/
#include <stdio.h>

#include "easel.h"
#include "esl_alphabet.h"
#include "esl_getopts.h"
#include "esl_msa.h"
#include "esl_msafile.h"
#include "esl_msafile_clustal.h"

static ESL_OPTIONS options[] = {
  /* name             type          default  env  range toggles reqs incomp  help                                       docgroup*/
  { "-h",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",        0 },
  { "-1",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "no autodetection; use CLUSTAL format",        0 },
  { "-2",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "no autodetection; use CLUSTALLIKE format",    0 },
  { "-q",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "quieter: don't write msa back, just summary", 0 },
  { "-t",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use text mode: no digital alphabet",          0 },
  { "--dna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is DNA",                0 },
  { "--rna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is RNA",                0 },
  { "--amino",     eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, "-t", "specify that alphabet is protein",            0 },
  {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[]  = "[-options] <msafile>";
static char banner[] = "example of guessing, reading, writing Clustal formats";

int 
main(int argc, char **argv)
{
  ESL_GETOPTS        *go          = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
  char               *filename    = esl_opt_GetArg(go, 1);
  int                 infmt       = eslMSAFILE_UNKNOWN;
  ESL_ALPHABET       *abc         = NULL;
  ESL_MSAFILE        *afp         = NULL;
  ESL_MSA            *msa         = NULL;
  int                 status;

  if      (esl_opt_GetBoolean(go, "-1"))      infmt = eslMSAFILE_CLUSTAL;
  else if (esl_opt_GetBoolean(go, "-2"))      infmt = eslMSAFILE_CLUSTALLIKE;

  if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
  else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); 

  /* Text mode: pass NULL for alphabet.
   * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed 
   */
  if   (esl_opt_GetBoolean(go, "-t"))  status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
  else                                 status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
  if (status != eslOK) esl_msafile_OpenFailure(afp, status);

  if ( (status = esl_msafile_clustal_Read(afp, &msa)) != eslOK)
    esl_msafile_ReadFailure(afp, status);

  printf("format variant: %s\n", esl_msafile_DecodeFormat(afp->format));
  printf("alphabet:       %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
  printf("# of seqs:      %d\n", msa->nseq);
  printf("# of cols:      %d\n", (int) msa->alen);
  printf("\n");

  if (! esl_opt_GetBoolean(go, "-q"))
    esl_msafile_clustal_Write(stdout, msa, eslMSAFILE_CLUSTAL);

  esl_msa_Destroy(msa);
  esl_msafile_Close(afp);
  if (abc) esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  exit(0);
}
/*::cexcerpt::msafile_clustal_example::end::*/
#endif /*eslMSAFILE_CLUSTAL_EXAMPLE*/

#ifdef eslMSAFILE_CLUSTAL_EXAMPLE2
/* A minimal example. Read Clustal MSA, in text mode.
   gcc -g -Wall -o esl_msafile_clustal_example2 -I. -L. -DeslMSAFILE_CLUSTAL_EXAMPLE2 esl_msafile_clustal.c -leasel -lm
   ./esl_msafile_clustal_example2 <msafile>
 */

/*::cexcerpt::msafile_clustal_example2::begin::*/
#include <stdio.h>

#include "easel.h"
#include "esl_msa.h"
#include "esl_msafile.h"
#include "esl_msafile_clustal.h"

int 
main(int argc, char **argv)
{
  char         *filename = argv[1];
  int           fmt      = eslMSAFILE_CLUSTAL; /* or eslMSAFILE_CLUSTALLIKE */
  ESL_MSAFILE  *afp      = NULL;
  ESL_MSA      *msa      = NULL;
  int          status;

  if ( (status = esl_msafile_Open(NULL, filename, NULL, fmt, NULL, &afp)) != eslOK)  esl_msafile_OpenFailure(afp, status);
  if ( (status = esl_msafile_clustal_Read(afp, &msa))                     != eslOK)  esl_msafile_ReadFailure(afp, status);

  printf("%6d seqs, %5d columns\n", msa->nseq, (int) msa->alen);

  esl_msafile_clustal_Write(stdout, msa, eslMSAFILE_CLUSTAL);

  esl_msa_Destroy(msa);
  esl_msafile_Close(afp);
  exit(0);
}
/*::cexcerpt::msafile_clustal_example2::end::*/
#endif /*eslMSAFILE_CLUSTAL_EXAMPLE2*/
/*--------------------- end of example --------------------------*/

