/* I/O of multiple sequence alignment files in PHYLIP format(s) * * Contents: * 1. API for reading/writing PHYLIP format alignment files * 2. I/O of the interleaved variant of the format * 3. I/O of the sequential variant of the format * 4. Autodetection of format and its variants * 5. Rectifying valid input/output symbols in names, seqs * 6. Unit tests * 7. Test driver * 8. Example * * See: http://evolution.genetics.washington.edu/phylip/doc/sequence.html */ #include "esl_config.h" #include #include #include #include #include "easel.h" #include "esl_alphabet.h" #include "esl_mem.h" #include "esl_msa.h" #include "esl_msafile.h" #include "esl_msafile_phylip.h" #define eslMSAFILE_PHYLIP_LEGALSYMS "-ABCDEFGHIJKLMNOPQRSTUVWZYX*?." static int phylip_interleaved_Read(ESL_MSAFILE *afp, ESL_MSA *msa, int nseq, int32_t alen_stated); static int phylip_interleaved_Write(FILE *fp, const ESL_MSA *msa, ESL_MSAFILE_FMTDATA *opt_fmtd); static int phylip_sequential_Read(ESL_MSAFILE *afp, ESL_MSA *msa, int nseq, int32_t alen_stated); static int phylip_sequential_Write(FILE *fp, const ESL_MSA *msa, ESL_MSAFILE_FMTDATA *opt_fmtd); static int phylip_check_interleaved (ESL_BUFFER *bf, int *ret_nblocks, int *ret_namewidth); static int phylip_check_sequential_known (ESL_BUFFER *bf, int namewidth); static int phylip_check_sequential_unknown(ESL_BUFFER *bf, int *ret_namewidth); static int phylip_parse_header(ESL_BUFFER *bf, int32_t *ret_nseq, int32_t *ret_alen, char **ret_p, esl_pos_t *ret_n); static int phylip_collate_colcodes(char *p, esl_pos_t n, char *colcodes, int ncols); static int phylip_deduce_namewidth(char *colcodes0, int ncols0, int alen, int nres2, int *ret_namewidth); static int phylip_rectify_input_name(char *namebuf, char *p, int n); static int phylip_rectify_output_seq_digital(char *buf); static int phylip_rectify_output_seq_text(char *buf); /***************************************************************** * 1. API for reading/writing PHYLIP format alignment files *****************************************************************/ /* Function: esl_msafile_phylip_SetInmap() * Synopsis: Configure input map for PHYLIP formats. * * Purpose: Set the inmap> for PHYLIP formats. * * Phylip documentation states that DNA programs accept * 'ABCDGHKMNORSTUVWXY?-', that 'a period was previously * allowed' and that O means a deletion. Protein programs * accept 'ABCDEFGHIJKLMNOPQRSTUVWXYZ*?-', and while JOU * are accepted, they are unused. * * So: in text mode, we accept any alphabetic character * plus '-*?.', verbatim. '~_', which Easel would normally * accept, are illegal. Whitespace and numbers are ignored. * * In digital mode, we modify the digital alphabet by * demapping '~_' and making them illegal; '?' is mapped to * missing data; whitespace and numbers are ignored; * and ONLY in or alphabets, 'O' is * mapped to a gap. * * The inconsistent mapping of 'O' poses potential * problems. In text mode (where we don't know the * alphabet, and thus don't know what to do with O), we * input the O verbatim. In digital mode, in a DNA or RNA * alphabet, we map O to a gap; in other digital alphabets, * we use the default digital alphabet mapping of O. * * Xref: http://evolution.genetics.washington.edu/phylip/doc/sequence.html */ int esl_msafile_phylip_SetInmap(ESL_MSAFILE *afp) { int sym; if (afp->abc) { for (sym = 1; sym < 128; sym++) afp->inmap[sym] = afp->abc->inmap[sym]; for (sym = '0'; sym < '9'; sym++) afp->inmap[sym] = eslDSQ_IGNORED; afp->inmap['?'] = esl_abc_XGetMissing(afp->abc); afp->inmap['~'] = eslDSQ_ILLEGAL; afp->inmap['_'] = eslDSQ_ILLEGAL; afp->inmap[' '] = eslDSQ_IGNORED; afp->inmap['\t'] = eslDSQ_IGNORED; afp->inmap[0] = esl_abc_XGetUnknown(afp->abc); if (afp->abc->type == eslDNA || afp->abc->type == eslRNA) afp->inmap['O'] = esl_abc_XGetGap(afp->abc); } if (! afp->abc) { for (sym = 1; sym < 128; sym++) afp->inmap[sym] = eslDSQ_ILLEGAL; for (sym = 'a'; sym <= 'z'; sym++) afp->inmap[sym] = sym; for (sym = 'A'; sym <= 'Z'; sym++) afp->inmap[sym] = sym; for (sym = '0'; sym <= '9'; sym++) afp->inmap[sym] = eslDSQ_IGNORED; afp->inmap['-'] = '-'; afp->inmap['*'] = '*'; afp->inmap['?'] = '?'; afp->inmap['.'] = '.'; afp->inmap[' '] = eslDSQ_IGNORED; afp->inmap['\t'] = eslDSQ_IGNORED; afp->inmap[0] = '?'; } return eslOK; } /* Function: esl_msafile_phylip_GuessAlphabet() * Synopsis: Guess the alphabet of an open PHYLIP MSA input. * * Purpose: Guess the alphabet of the sequences in open * PHYLIP format MSA file . * * On normal return, <*ret_type> is set to , * , or , and is reset to its * original point. * * Args: afp - open PHYLIP format MSA file * *ret_type - RETURN: , , ; or . * * Returns: on success. * if autodetection fails. * * Throws: on allocation error. * on failures of fread() or other system calls */ int esl_msafile_phylip_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type) { int namewidth = (afp->fmtd.namewidth ? afp->fmtd.namewidth : 10); /* default: strict PHYLIP, namewidth 10 */ int alphatype = eslUNKNOWN; int threshold[3] = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */ int nsteps = 3; /* ...there's 3 threshold[]'s we check */ int step = 0; /* ...starting on threshold[0] */ int64_t nres = 0; char *p; esl_pos_t n, pos; int x; esl_pos_t anchor; int64_t ct[26]; int status; for (x = 0; x < 26; x++) ct[x] = 0; anchor = esl_buffer_GetOffset(afp->bf); if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */ /* Find the first nonblank line, which says " " and may also have options. we ignore this header */ while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ; if (status == eslEOF) ESL_XFAIL(eslENOALPHABET, afp->errmsg, "can't determine alphabet: no alignment data found"); else if (status != eslOK) goto ERROR; /* Read line by line, just looking for residues, not worrying about nseq/alen or sequential/interleaved * Always skip the name field, even in continuation lines/blocks * This may miss some residues, but it means we work on both sequential and interleaved formats */ while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK) { if (esl_memspn(p, n, " \t") == n) continue; if (n < namewidth) continue; p += namewidth; n -= namewidth; /* count characters into ct[] array */ for (pos = 0; pos < n; pos++) if (isalpha(p[pos])) { x = toupper(p[pos]) - 'A'; ct[x]++; nres++; } /* try to stop early, checking after 500, 5000, and 50000 residues: */ if (step < nsteps && nres > threshold[step]) { if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */ step++; } } if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */ status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */ DONE: esl_buffer_SetOffset(afp->bf, anchor); /* Rewind to where we were. */ esl_buffer_RaiseAnchor(afp->bf, anchor); *ret_type = alphatype; return status; ERROR: if (anchor != -1) { esl_buffer_SetOffset(afp->bf, anchor); esl_buffer_RaiseAnchor(afp->bf, anchor); } *ret_type = eslUNKNOWN; return status; } /* Function: esl_msafile_phylip_Read() * Synopsis: Read in a PHYLIP format alignment. * * Purpose: Read an MSA from an open , parsing for * Phylip format, starting from the current point. The * format may be either the interleaved or sequential * variant, according to the format set in format>: * means interleaved, and * means sequential. Create a new * multiple alignment, and return a ptr to that alignment * in <*ret_msa>. Caller is responsible for free'ing this * . * * Args: afp - open * ret_msa - RETURN: newly parsed * * Returns: on success. * * if no (more) alignment data were found in * , and is returned at EOF. * * on a parse error. <*ret_msa> is set to * . contains information sufficient for * constructing useful diagnostic output: * | errmsg> | user-directed error message | * | linenumber> | line # where error was detected | * | line> | offending line (not NUL-term) | * | n> | length of offending line | * | bf->filename> | name of the file | * and is poised at the start of the following line, * so (in principle) the caller could try to resume * parsing. * * Throws: - an allocation failed. * - a system call such as fread() failed * - "impossible" corruption */ int esl_msafile_phylip_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa) { ESL_MSA *msa = NULL; int32_t alen_stated; /* int32_t because we're using strtoi32() to parse it from the file */ int nseq; char *p, *tok; esl_pos_t n, toklen; int status; ESL_DASSERT1( (afp->format == eslMSAFILE_PHYLIP || afp->format == eslMSAFILE_PHYLIPS) ); afp->errmsg[0] = '\0'; /* skip leading blank lines (though there shouldn't be any) */ while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ; if (status != eslOK) goto ERROR; /* includes normal EOF */ /* the first line: */ esl_memtok(&p, &n, " \t", &tok, &toklen); if (esl_mem_strtoi32(tok, toklen, 0, NULL, &nseq) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "first PHYLIP line should be : first field isn't an integer"); if (esl_memtok(&p, &n, " \t", &tok, &toklen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "first PHYLIP line should be : only one field found"); if (esl_mem_strtoi32(tok, toklen, 0, NULL, &alen_stated) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "first PHYLIP line should be : second field isn't an integer"); /* believe and allocate accordingly */ /* don't believe ; use it for validation, after we've parsed everything. */ if (afp->abc && (msa = esl_msa_CreateDigital(afp->abc, nseq, -1)) == NULL) { status = eslEMEM; goto ERROR; } if (! afp->abc && (msa = esl_msa_Create( nseq, -1)) == NULL) { status = eslEMEM; goto ERROR; } /* load next line, skipping any blank ones (though there shouldn't be any) */ do { status = esl_msafile_GetLine(afp, &p, &n); if (status == eslEOF) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no alignment data following PHYLIP header"); else if (status != eslOK) goto ERROR; } while (esl_memspn(p, n, " \t") == n); /* idiom for "blank line" */ /* hand off to interleaved vs. sequential parser for the rest */ if (afp->format == eslMSAFILE_PHYLIP) status = phylip_interleaved_Read(afp, msa, nseq, alen_stated); else if (afp->format == eslMSAFILE_PHYLIPS) status = phylip_sequential_Read (afp, msa, nseq, alen_stated); if (status != eslOK) goto ERROR; if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR; *ret_msa = msa; return eslOK; ERROR: if (msa) esl_msa_Destroy(msa); *ret_msa = NULL; return status; } /* Function: esl_msafile_phylip_Write() * Synopsis: Write an MSA to a stream in PHYLIP format. * * Purpose: Write alignment in PHYLIP format to stream . * If is , write interleaved format; * if is , write sequential format. * * Optionally, caller may pass additional formatting * information by passing a ptr to a valid * structure. namewidth> sets the width of the * name field. For strict PHYLIP format, this must be 10. * rpl> sets the number of residues per line. * The default is 60. For either value, if it is unset or * if is , the default is used. * * Args: fp - open output stream * msa - alignment to write * format - for interleaved; for sequential. * fmtd - optional: , or additional format information * * Returns: on success. * * Throws: if isn't or . * on allocation failure. * on any system write error, such as a filled disk. */ int esl_msafile_phylip_Write(FILE *fp, const ESL_MSA *msa, int format, ESL_MSAFILE_FMTDATA *opt_fmtd) { if (format == eslMSAFILE_PHYLIP) return phylip_interleaved_Write(fp, msa, opt_fmtd); else if (format == eslMSAFILE_PHYLIPS) return phylip_sequential_Write (fp, msa, opt_fmtd); else ESL_EXCEPTION(eslEINVAL, "format %s is not a PHYLIP format", esl_msafile_DecodeFormat(format)); } /* Function: esl_msafile_phylip_CheckFileFormat() * Synopsis: Check whether an input seems to be in PHYLIP format. * * Purpose: Check whether input buffer appears to be * in a PHYLIP format, starting from the current point. * Return if it is, and if it isn't. * * There are two main variants of the format, interleaved * and sequential. Upon successful return, <*ret_format> is * set to for interleaved, or * for sequential. * * Strict PHYLIP format has a name/identifier field width * of exactly 10 characters, but variants of the format are * in common use with different name widths. The * name width is determined and returned in <*ret_namewidth>. * * If the input doesn't appear to be in PHYLIP format, * return , with <*ret_format> as * and <*ret_namewidth> as 0. * * The PHYLIP format definition is ambiguous; it is * possible to construct pathological inputs that could be * validly parsed to yield different data when read as * interleaved versus sequential. (This requires names that * use a character set that looks like sequence, among * other unusual edge conditions.) If the format variant * cannot be unambiguously determined, we do not attempt to * make a guess that might result in corrupted input; * rather, return . * * Args: bf - input buffer * *ret_format - RETURN: format variant, , <_PHYLIPS>, <_UNKNOWN> * *ret_namewidth - RETURN: width of name field, in characters * * Returns: if input is in PHYLIP format; <*ret_format> is set to * or ; <*ret_namewidth> * is the name width, either the usual 10, or something nonstandard. * * if the input appears to be in a PHYLIP format but * we can't tell which one. <*ret_format> is , * <*ret_namewidth> is 0. * * if the input is not in either PHYLIP format; * <*ret_format> is , <*ret_namewidth> is 0. * * Throws: (no abnormal error conditions) */ int esl_msafile_phylip_CheckFileFormat(ESL_BUFFER *bf, int *ret_format, int *ret_namewidth) { int maybe_interleaved = FALSE; // Must worry about pathological files consistent with both formats. int maybe_sequential = FALSE; int nblocks; // number of interleaved blocks. if only 1, interleaved == sequential int w1, w2; // name widths if interleaved, sequential if ( phylip_check_interleaved(bf, &nblocks, &w1) == eslOK) { maybe_interleaved = TRUE; } if (maybe_interleaved && nblocks == 1) { *ret_namewidth = w1; *ret_format = eslMSAFILE_PHYLIP; return eslOK; } if ( phylip_check_sequential_known (bf, 10) == eslOK) { maybe_sequential = TRUE; w2 = 10; } else if ( phylip_check_sequential_unknown(bf, &w2) == eslOK) { maybe_sequential = TRUE; } if (maybe_interleaved && maybe_sequential) { *ret_namewidth = 0; *ret_format = eslMSAFILE_UNKNOWN; return eslEAMBIGUOUS; } else if (maybe_interleaved) { *ret_namewidth = w1; *ret_format = eslMSAFILE_PHYLIP; return eslOK; } else if (maybe_sequential) { *ret_namewidth = w2; *ret_format = eslMSAFILE_PHYLIPS; return eslOK; } else { *ret_namewidth = 0; *ret_format = eslMSAFILE_UNKNOWN; return eslFAIL; } } /*-------------- end, API for PHYLIP format i/o -----------------*/ /***************************************************************** * 2. i/o of the interleaved variant of the format *****************************************************************/ /* Read the interleaved variant. * header told us to expect , . * is already allocated , <-1>. * In , we've loaded the first line of the alignment data. */ static int phylip_interleaved_Read(ESL_MSAFILE *afp, ESL_MSA *msa, int nseq, int32_t alen_stated) { int namewidth = (afp->fmtd.namewidth ? afp->fmtd.namewidth : 10); /* default: strict PHYLIP, namewidth 10 */ char *namebuf = NULL; int nblocks = 0; int64_t alen = 0; /* alignment length observed so far */ char *p = afp->line; esl_pos_t n = afp->n; int64_t block_alen; /* # of residues being added by current block */ int64_t cur_alen; int idx; int status; ESL_ALLOC(namebuf, sizeof(char) * (namewidth+1)); /* p, n is now the first line of a block */ /* read the alignment data */ do { idx = 0; do { /* First block? store the sequence names */ if (nblocks == 0) { if (n < namewidth) ESL_XFAIL(eslEFORMAT, afp->errmsg, "PHYLIP line too short to find sequence name"); if (phylip_rectify_input_name(namebuf, p, namewidth) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "invalid character(s) in sequence name"); if ( (status = esl_msa_SetSeqName(msa, idx, namebuf, -1)) != eslOK) goto ERROR; p += namewidth; n -= namewidth; } /* Append the sequence. */ cur_alen = alen; if (msa->abc) { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[idx]), &(cur_alen), p, n); } if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[idx]), &(cur_alen), p, n); } if (status == eslEINVAL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters"); else if (status != eslOK) goto ERROR; /* validate # of residues added */ if (idx == 0) block_alen = cur_alen - alen; else if (cur_alen - alen != block_alen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "number of residues on line differs from previous seqs in alignment block"); /* get next line. */ idx++; status = esl_msafile_GetLine(afp, &p, &n); } while (status == eslOK && idx < nseq && esl_memspn(p, n, " \t") < n); /* stop block on: read error, EOF; idx == nseq; or blank line */ if (idx != nseq) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected number of sequences in block (saw %d, expected %d)", idx, nseq); nblocks += 1; alen += block_alen; /* tolerate blank lines only at the end of a block */ while (status == eslOK && esl_memspn(p, n, " \t") == n) status = esl_msafile_GetLine(afp, &p, &n); /* [eslEMEM,eslESYS] */ /* now status can be: read error, EOF, or OK. */ } while (status == eslOK && alen < alen_stated); /* End of all blocks. We swallowed blank lines following last block, * so we should be EOF; unless we're in a seqboot file, in which * case we just read the first line of the next MSA. */ if (status == eslOK) esl_msafile_PutLine(afp); /* put line back in the stream, so we will read it properly w/ next msa read */ else if (status != eslEOF) goto ERROR; else if (alen != alen_stated) ESL_XFAIL(eslEFORMAT, afp->errmsg, "alignment length disagrees with header: header said %d, parsed %" PRId64, alen_stated, alen); msa->nseq = nseq; msa->alen = alen; free(namebuf); return eslOK; ERROR: msa->nseq = nseq; /* we're allocated and initialized for : this makes sure we free everything we need to in */ if (namebuf) free(namebuf); return status; } /* Write an interleaved PHYLIP file. * Returns on success. * Throws on any system write error. */ static int phylip_interleaved_Write(FILE *fp, const ESL_MSA *msa, ESL_MSAFILE_FMTDATA *opt_fmtd) { int rpl = ( (opt_fmtd && opt_fmtd->rpl) ? opt_fmtd->rpl : 60); int namewidth = ( (opt_fmtd && opt_fmtd->namewidth) ? opt_fmtd->namewidth : 10); char *buf = NULL; int idx; int64_t apos; int status; ESL_ALLOC(buf, sizeof(char) * (rpl+1)); buf[rpl] = '\0'; if (fprintf(fp, " %d %" PRId64, msa->nseq, msa->alen) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "interleaved phylip write failed"); for (apos = 0; apos < msa->alen; apos += rpl) { if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "interleaved phylip write failed"); for (idx = 0; idx < msa->nseq; idx++) { if (msa->abc) { esl_abc_TextizeN(msa->abc, msa->ax[idx]+apos+1, rpl, buf); phylip_rectify_output_seq_digital(buf); } if (! msa->abc) { strncpy(buf, msa->aseq[idx]+apos, rpl); phylip_rectify_output_seq_text(buf); } if (apos == 0) { if (fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, msa->sqname[idx], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "interleaved phylip write failed"); } else { if (fprintf(fp, "%s\n", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "interleaved phylip write failed"); } } } free(buf); return eslOK; ERROR: if (buf) free(buf); return status; } /*----------------- end, interleaved variant -------------------*/ /***************************************************************** * 3. i/o of sequential variant of the format *****************************************************************/ /* Read the sequential variant. * header told us to expect , . * is already allocated , <-1>. * In , we've loaded the first line of the alignment data. */ static int phylip_sequential_Read(ESL_MSAFILE *afp, ESL_MSA *msa, int nseq, int32_t alen_stated) { int namewidth = (afp->fmtd.namewidth ? afp->fmtd.namewidth : 10); /* default: strict PHYLIP, namewidth 10 */ char *namebuf = NULL; char *p = afp->line; esl_pos_t n = afp->n; int idx; int64_t alen = 0; int status = eslOK; ESL_ALLOC(namebuf, sizeof(char) * (namewidth+1)); for (idx = 0; idx < nseq; idx++) { alen = 0; status = eslOK; while (status == eslOK && alen < alen_stated) { if (alen == 0) /* First line? Store the sequence name */ { if (n < namewidth) ESL_XFAIL(eslEFORMAT, afp->errmsg, "PHYLIP line too short to find sequence name"); if (phylip_rectify_input_name(namebuf, p, namewidth) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "invalid character(s) in sequence name"); if ( (status = esl_msa_SetSeqName(msa, idx, namebuf, -1)) != eslOK) goto ERROR; p += namewidth; n -= namewidth; } if (msa->abc) { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[idx]), &alen, p, n); } if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[idx]), &alen, p, n); } if (status == eslEINVAL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters"); else if (status != eslOK) goto ERROR; /* get next line */ status = esl_msafile_GetLine(afp, &p, &n); } /* end looping over a seq */ /* tolerate blank lines after sequences. */ while (status == eslOK && esl_memspn(p, n, " \t") == n) status = esl_msafile_GetLine(afp, &p, &n); if (status == eslEOF) { if (idx < nseq-1) ESL_XFAIL(eslEFORMAT, afp->errmsg, "premature end of file: header said to expect %d sequences", nseq); } else if (status != eslOK) goto ERROR; else if (alen != alen_stated) ESL_XFAIL(eslEFORMAT, afp->errmsg, "aligned length of sequence disagrees with header: header says %d, parsed %" PRId64, alen_stated, alen); } /* end looping over all seqs */ /* we should be EOF; we've swallowed all trailing blank lines. * Exception: if we're in a seqboot file, we just read the line of the next msa record; push it back on stream */ if (status == eslOK) esl_msafile_PutLine(afp); msa->nseq = nseq; msa->alen = alen; free(namebuf); return eslOK; ERROR: msa->nseq = nseq; /* we're allocated and initialized for : this makes sure we free everything we need to in */ if (namebuf) free(namebuf); return status; } static int phylip_sequential_Write(FILE *fp, const ESL_MSA *msa, ESL_MSAFILE_FMTDATA *opt_fmtd) { int rpl = ( (opt_fmtd && opt_fmtd->rpl) ? opt_fmtd->rpl : 60); int namewidth = ( (opt_fmtd && opt_fmtd->namewidth) ? opt_fmtd->namewidth : 10); char *buf = NULL; int idx; int64_t apos; int status; ESL_ALLOC(buf, sizeof(char) * (rpl+1)); buf[rpl] = '\0'; fprintf(fp, " %d %" PRId64 "\n", msa->nseq, msa->alen); for (idx = 0; idx < msa->nseq; idx++) { for (apos = 0; apos < msa->alen; apos += rpl) { if (msa->abc) { esl_abc_TextizeN(msa->abc, msa->ax[idx]+apos+1, rpl, buf); phylip_rectify_output_seq_digital(buf); } if (! msa->abc) { strncpy(buf, msa->aseq[idx]+apos, rpl); phylip_rectify_output_seq_text(buf); } if (apos == 0) fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, msa->sqname[idx], buf); else fprintf(fp, "%s\n", buf); } } free(buf); return eslOK; ERROR: if (buf) free(buf); return status; } /*------------------ end, sequential variant --------------------*/ /***************************************************************** * 4. Autodetection of the format and its variants *****************************************************************/ /* return if input is consistent with interleaved format; * set <*ret_namewidth> to the apparent name width; * set <*ret_nblocks> to the number of interleaved blocks seen. * * If , it doesn't matter whether we read the file as * interleaved or sequential; it will also appear to be consistent * with sequential format, but we will parse it as interleaved. * * can be uniquely determined because we know and * (from the Phylip header), so we know how many characters * should be accounted for by the alignment; the rest have to be * the names. * * If the input is not consistent with interleaved Phylip format, * return , and <*ret_namewidth> and <*ret_nblocks> are 0. * * upon return, restore the to its original position. */ static int phylip_check_interleaved(ESL_BUFFER *bf, int *ret_nblocks, int *ret_namewidth) { esl_pos_t anchor = -1; char *p; esl_pos_t n; int32_t nseq, alen; char *colcodes = NULL; char *colcodes0 = NULL; int ncols, ncols0; int c, idx; int nblocks = 0; int nres2 = 0; int nres1; int status; anchor = esl_buffer_GetOffset(bf); if (esl_buffer_SetAnchor(bf, anchor) != eslOK) { status = eslFAIL; goto ERROR; } if ((status = phylip_parse_header(bf, &nseq, &alen, &p, &n)) != eslOK) goto ERROR; /* read the whole file, one block at a time */ while (status == eslOK) { ncols = n; ESL_REALLOC(colcodes, sizeof(char) * ncols); for (c = 0; c < ncols; c++) colcodes[c] = '?'; /* read a block: for each line, update the colcodes[] array. */ for (idx = 0; idx < nseq; idx++) { if (status == eslEOF) goto ERROR; if ((status = phylip_collate_colcodes(p, n, colcodes, ncols)) != eslOK) goto ERROR; status = esl_buffer_GetLine(bf, &p, &n); if (status != eslOK && status != eslEOF) goto ERROR; /* EOF is ok on last seq of single-block data */ } /* finished a block. status may be EOF or OK. * save the first block's colcodes[] to defer its analysis * for all other blocks: count x columns, reject n columns, ignore . and o */ if (nblocks == 0) { colcodes0 = colcodes; ncols0 = ncols; colcodes = NULL; /* let's speculate that it's strictly conforming PHYLIP with a namewidth of 10 * in that case, we'll be able to stop parsing blocks when we reach the full * alignment length */ for (nres1 = 0, c = 10; c < ncols0; c++) if (colcodes0[c] != '.') nres1++; /* for this test, consider even invalid symbols to be "residues". */ } else { for (c = 0; c < ncols; c++) { if (colcodes[c] == 'x') nres2++; if (colcodes[c] == 'n') { status = eslFAIL; goto ERROR; } /* subsequent blocks can't contain name-like columns */ } } nblocks++; /* if it's strictly conforming w/ namewidth=10, we know when we're done, * and in that case we can tolerate trailing data (tree, whatever) in the * input. status will be eslOK if we break this way. */ if (nres1+nres2 == alen) break; /* skip blank lines until we load start of next block in

, */ while (status == eslOK && esl_memspn(p, n, "\t") == n) status = esl_buffer_GetLine(bf, &p, &n); } if (nres1+nres2 == alen) *ret_namewidth = 10; else if ((status = phylip_deduce_namewidth(colcodes0, ncols0, alen, nres2, ret_namewidth)) != eslOK) goto ERROR; free(colcodes); free(colcodes0); esl_buffer_SetOffset(bf, anchor); esl_buffer_RaiseAnchor(bf, anchor); *ret_nblocks = nblocks; return eslOK; ERROR: if (anchor != -1) { esl_buffer_SetOffset(bf, anchor); esl_buffer_RaiseAnchor(bf, anchor); } if (colcodes) free(colcodes); if (colcodes0) free(colcodes0); *ret_namewidth = 0; *ret_nblocks = 0; return status; } /* check for a sequential format file, given a known namewidth (10, for strict PHYLIP) */ static int phylip_check_sequential_known(ESL_BUFFER *bf, int namewidth) { esl_pos_t anchor = -1; char *p; esl_pos_t n; int32_t nseq, alen; int idx, nres, line, c; int status; anchor = esl_buffer_GetOffset(bf); if (esl_buffer_SetAnchor(bf, anchor) != eslOK) { status = eslFAIL; goto ERROR; } if ((status = phylip_parse_header(bf, &nseq, &alen, &p, &n)) != eslOK) goto ERROR; for (idx = 0; idx < nseq; idx++) { nres = 0; line = 0; while (nres < alen) { if (status == eslEOF) goto ERROR; c = (line == 0 ? namewidth : 0); for ( ; c < n; c++) if (strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[c]) != NULL) nres++; status = esl_buffer_GetLine(bf, &p, &n); if (status != eslOK && status != eslEOF) goto ERROR; } if (nres != alen) { status = eslFAIL; goto ERROR; } /* tolerate blank spaces after individual sequences */ while (status == eslOK && esl_memspn(p, n, "\t") == n) status = esl_buffer_GetLine(bf, &p, &n); if (status != eslOK && status != eslEOF) goto ERROR; } esl_buffer_SetOffset(bf, anchor); esl_buffer_RaiseAnchor(bf, anchor); return eslOK; ERROR: if (anchor != -1) { esl_buffer_SetOffset(bf, anchor); esl_buffer_RaiseAnchor(bf, anchor); } return status; } /* phylip_check_sequential_unknown() * Check for sequential format when namewidth may deviate from standard 10. * * Returns: if file is consistent with sequential Phylip format, * and <*ret_namewidth> is the name width. * * if file is inconsistent with sequential format. * * Throws: on allocation failure */ static int phylip_check_sequential_unknown(ESL_BUFFER *bf, int *ret_namewidth) { esl_pos_t anchor = -1; int nlines = 0; int nblocks = 0; int32_t nseq, alen; int *rth = NULL; // rth[i=0..L1-1] = # of legal chars, inclusive, in i..L1-1, or 0 int r; // total # of legal seq chars on line 1 of a seq int L1; // total length of line 1, including all chars & whitespace int b; // # of legal seq chars on remainder of lines of a seq int a; // # of seq chars we need on line 1: alen - b char *p; // pointer to current line in input buffer esl_pos_t n; // length of current line in input buffer int i, j, k; int w; // name width we determine int len2; // alen of subsequent seqs in file int status; /* Set an anchor where we are in the input, so we can rewind */ anchor = esl_buffer_GetOffset(bf); if ( esl_buffer_SetStableAnchor(bf, anchor) != eslOK) { status = eslFAIL; goto ERROR; } /* Pass 1: number of data lines in file / nseq = integral # of lines per sequence */ while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK) if (esl_memspn(p, n, " \t") != n) nlines++; if (status != eslEOF) { status = eslFAIL; goto ERROR; } nlines--; /* not counting the header */ esl_buffer_SetOffset(bf, anchor); /* rewind */ /* pass 2: first, parse the line */ if ((status = phylip_parse_header(bf, &nseq, &alen, &p, &n)) != eslOK) { status = eslFAIL; goto ERROR; } if (nlines % nseq != 0) { status = eslFAIL; goto ERROR; } nblocks = nlines / nseq; /* ... evaluate the first line, already at point in p,n */ ESL_ALLOC(rth, sizeof(int) * n); for (i = n-1, r = 0; i >= 0; i--) rth[i] = ( strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[i]) != NULL ? ++r : 0 ); L1 = n; /* rth[i] = 0 if line1[i] is not a legal seq char * else # of legal chars in line1[i..L1-1] * r = total # of legal chars on line 1 * L1 = total length of line 1, all chars (including whitespace) * Once we know how many seq residues we need to get on the first line, * we can use rth[] to find the position of the first one. */ /* Now count b: # of legal chars on subsequent lines for 1st seq */ for (k = 1, b = 0; k < nblocks; k++) { while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ; if (status != eslOK) { status = eslFAIL; goto ERROR; } for (i = 0; i < n; i++) if (strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[i]) != NULL) b++; } /* We need to find (alen - b) = a residues on the first line. */ a = alen - b; if (a > r) { status = eslFAIL; goto ERROR; } for (w = 0; w < L1; w++) // find the leftmost seq residue if (rth[w] == a) break; if (w == L1) { status = eslFAIL; goto ERROR; } for (i = 0; i < w; i++) // make sure there's a name of some sort if (! isspace(p[i])) break; if ( i == w) { status = eslFAIL; goto ERROR; } /* Now we "know" the name width is w. */ /* Check that the rest of the sequences (up to 100 of them) are consistent w/ that. */ for (j = 1; j < nseq && j < 100; j++) { while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ; if (status != eslOK) { status = eslFAIL; goto ERROR; } if ( strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[w]) == NULL) { status = eslFAIL; goto ERROR; } for (i = 0; i < w; i++) if (! isspace(p[i])) break; if ( i == w) { status = eslFAIL; goto ERROR; } for (i = w, len2 = 0; i < n; i++) if ( strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[i]) != NULL) len2++; for (k = 1; k < nblocks; k++) { while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ; if (status != eslOK) { status = eslFAIL; goto ERROR; } for (i = 0; i < n; i++) if ( strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[i]) != NULL) len2++; } if (len2 != alen) { status = eslFAIL; goto ERROR; } } esl_buffer_SetOffset(bf, anchor); esl_buffer_RaiseAnchor(bf, anchor); free(rth); *ret_namewidth = w; return eslOK; ERROR: if (anchor != -1) { esl_buffer_SetOffset(bf, anchor); esl_buffer_RaiseAnchor(bf, anchor); } free(rth); *ret_namewidth = 0; return status; } /* parse the header from a buffer (note, NOT an open MSAFILE). * used by the format checkers (before an MSAFILE is open) * return on success, on parse failure. * * Upon return

, contains the first alignment data line, * and the 's point is on the line following that. * */ static int phylip_parse_header(ESL_BUFFER *bf, int32_t *ret_nseq, int32_t *ret_alen, char **ret_p, esl_pos_t *ret_n) { char *p, *tok; esl_pos_t n, toklen; int32_t nseq, alen; int status; /* Find the first nonblank line, which says " " and may also have options (which we'll ignore) */ while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ; if (status == eslEOF) status = eslFAIL; if (status != eslOK) goto ERROR; esl_memtok(&p, &n, " \t", &tok, &toklen); if (esl_mem_strtoi32(tok, toklen, 0, NULL, &nseq) != eslOK) { status = eslFAIL; goto ERROR; } if (esl_memtok(&p, &n, " \t", &tok, &toklen) != eslOK) { status = eslFAIL; goto ERROR; } if (esl_mem_strtoi32(tok, toklen, 0, NULL, &alen) != eslOK) { status = eslFAIL; goto ERROR; } /* skip any blank lines, load p,n as first line of putative block */ while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ; if (status == eslEOF) status = eslFAIL; if (status != eslOK) goto ERROR; *ret_nseq = nseq; *ret_alen = alen; *ret_p = p; *ret_n = n; return eslOK; ERROR: *ret_nseq = 0; *ret_alen = 0; *ret_p = NULL; *ret_n = 0; return status; } /* * '?' : unset * 'x' : column consists only of valid data symbols * '.' : column consists only of spaces * 'o' : column consists only of spaces or other (invalid) symbols such as numbers * 'n' : column mixes x and (. or o): must be a name column * * A name can contain spaces, valid symbols, other symbols ('x', 'n', '.', 'o') * Any whitespace between name, data must contain only '.' * Spacer columns elsewhere can contain spaces or other symbols (such as numbers): 'o' * Alignment columns must contain only valid data symbols: 'x'. */ static int phylip_collate_colcodes(char *p, esl_pos_t n, char *colcodes, int ncols) { int c; for (c = 0; c < ncols && c < n; c++) { if (strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[c]) != NULL) { switch (colcodes[c]) { case '?': colcodes[c] = 'x'; break; case '.': colcodes[c] = 'n'; break; } } else if (p[c] == ' ') { switch (colcodes[c]) { case '?': colcodes[c] = '.'; break; case 'x': colcodes[c] = 'n'; break; } } else if (isgraph(p[c])) { switch (colcodes[c]) { case '?': colcodes[c] = 'o'; break; case 'x': colcodes[c] = 'n'; break; case '.': colcodes[c] = 'o'; break; } } else return eslFAIL; } for ( ; c < n; c++) if (strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[c]) != NULL) return eslFAIL; return eslOK; } static int phylip_deduce_namewidth(char *colcodes0, int ncols0, int alen, int nres2, int *ret_namewidth) { int nres1; int c; int nwA, nwB, namewidth; /* nres1 = alen - nres2 : # of columns that must be provided in first block */ nres1 = alen - nres2; if (nres1 <= 0) return eslFAIL; /* search leftward in first block until we've accounted for enough cols */ /* c becomes the position of the rightmost col before ali data, and hence = maximal namewidth-1 */ for (c = ncols0-1; nres1 && c >= 0; c--) if (colcodes0[c] == 'x') nres1--; if (nres1 > 0) return eslFAIL; nwB = c+1; /* search leftward past whitespace columns that might be trailing the name */ /* c becomes the position of the rightmost non-whitespace column, and hence = minimal namewidth-1 */ for ( ; c >= 0; c--) if (colcodes0[c] != '.') break; nwA = c+1; /* if nwA <= 10 <= nwB, the namewidth is consistent with strict PHYLIP namewidth of 10 */ if (nwA <= 10 && nwB >= 10) namewidth = 10; else namewidth = nwA; *ret_namewidth = namewidth; return eslOK; } /*-------------- end, format autodetection ----------------------*/ /***************************************************************** * 5. Rectifying valid input/output symbols in names and seqs *****************************************************************/ /* We allow any isprint() char in a name, except tabs. * Phylip allows spaces in names, but Easel doesn't. * Internal spaces are converted to underscore _ * Leading and trailing spaces are ignored * namebuf must be allocated at least n+1 chars * returns eslEINVAL if it sees an invalid character */ static int phylip_rectify_input_name(char *namebuf, char *p, int n) { int pos, endpos; int npos = 0; for (endpos = n-1; endpos > 0; endpos--) if (p[endpos] != ' ') break; for (pos = 0; pos <= endpos; pos++) if (p[pos] != ' ') break; for ( ; pos <= endpos; pos++) { if (! isgraph(p[pos]) && p[pos] != ' ') return eslEINVAL; namebuf[npos++] = (p[pos] == ' ' ? '_' : p[pos]); } namebuf[npos] = '\0'; return eslOK; } /* Easel's internal alphabet (and output syms) are compatible with PHYLIP * (upper case, '-' for gaps, '*' for stop codon; except that PHYLIP uses * '?' for missing data */ static int phylip_rectify_output_seq_digital(char *buf) { int i; for (i = 0; buf[i]; i++) { if (buf[i] == '~') buf[i] = '?'; } return eslOK; } static int phylip_rectify_output_seq_text(char *buf) { int i; for (i = 0; buf[i]; i++) { if (islower(buf[i])) buf[i] = toupper(buf[i]); if (strchr("._ ", buf[i]) != NULL) buf[i] = '-'; if (buf[i] == '~') buf[i] = '?'; } return eslOK; } /*---------- end, rectification of name, seq symbols ------------*/ /***************************************************************** * 6. Unit tests *****************************************************************/ #ifdef eslMSAFILE_PHYLIP_TESTDRIVE static void utest_write_good1(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen) { fputs(" 5 42\n", ofp); fputs("Turkey AAGCTNGGGC ATTTCAGGGT\n", ofp); fputs("Salmo gairAAGCCTTGGC AGTGCAGGGT\n", ofp); fputs("H. SapiensACCGGTTGGC CGTTCAGGGT\n", ofp); fputs("Chimp AAACCCTTGC CGTTACGCTT\n", ofp); fputs("Gorilla AAACCCTTGC CGGTACGCTT\n", ofp); fputs("\n", ofp); fputs("GAGCCCGGGC AATACAGGGT AT\n", ofp); fputs("GAGCCGTGGC CGGGCACGGT AT\n", ofp); fputs("ACAGGTTGGC CGTTCAGGGT AA\n", ofp); fputs("AAACCGAGGC CGGGACACTC AT\n", ofp); fputs("AAACCATTGC CGGTACGCTT AA\n", ofp); *ret_format = eslMSAFILE_PHYLIP; *ret_alphatype = eslDNA; *ret_nseq = 5; *ret_alen = 42; } static void utest_write_good2(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen) { fputs("7 50 \n", ofp); fputs("thermotogaATGGCGAAGGAAAAATTTGTGAGAACAAAACCGCATGTTAACGTTGGAAC\n", ofp); fputs("TthermophiATGGCGAAGGGCGAGTTTGTTCGGACGAAGCCTCACGTGAACGTGGGGAC \n", ofp); fputs("TaquaticusATGGCGAAGGGCGAGTTTATCCGGACGAAGCCCCACGTGAACGTGGGGAC \n", ofp); fputs("deinonema-ATGGCTAAGGGAACGTTTGAACGCACCAAACCCCACGTGAACGTGGGCAC \n", ofp); fputs("ChlamydiaBATGTCAAAAGAAACTTTTCAACGTAATAAGCCTCATATCAACATAGGGGC \n", ofp); fputs("flexistipsATGTCCAAGCAAAAGTACGAAAGGAAGAAACCTCACGTAAACGTAGGCAC \n", ofp); fputs("borrelia-bATGGCAAAAGAAGTTTTTCAAAGAACAAAGCCGCACATGAATGTTGGAAC \n", ofp); *ret_format = eslMSAFILE_PHYLIP; *ret_alphatype = eslDNA; *ret_nseq = 7; *ret_alen = 50; } static void utest_write_good3(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen) { fputs(" 3 384\n", ofp); fputs("CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- --------SQ \n", ofp); fputs("ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG ALGRTRHALR \n", ofp); fputs("CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- --------FH \n", ofp); fputs("\n", ofp); fputs(" FLEFQDKFNK KY-SHEEYLE RFEIFKSNLG KIEELNLIAI NHKADTKFGV NKFADLSSDE \n", ofp); fputs(" FARFAVRYGK SYESAAEVRR RFRIFSESLE EVRSTN---- RKGLPYRLGI NRFSDMSWEE \n", ofp); fputs(" FKSWMSKHRK TY-STEEYHH RLQTFASNWR KINAHN---- NGNHTFKMAL NQFSDMSFAE \n", ofp); fputs("\n", ofp); fputs(" FKNYYLNNKE AIFTDDLPVA DYLDDEFINS IPTAFDWRTR G-AVTPVKNQ GQCGSCWSFS \n", ofp); fputs(" FQATRL-GAA QTCSATLAGN HLMRDA--AA LPETKDWRED G-IVSPVKNQ AHCGSCWTFS \n", ofp); fputs(" IKHKYLWSEP QNCSAT--KS NYLRGT--GP YPPSVDWRKK GNFVSPVKNQ GACGSCWTFS \n", ofp); fputs("\n", ofp); fputs(" TTGNVEGQHF ISQNKLVSLS EQNLVDCDHE CMEYEGEEAC DEGCNGGLQP NAYNYIIKNG \n", ofp); fputs(" TTGALEAAYT QATGKNISLS EQQLVDCAGG FNNF------ --GCNGGLPS QAFEYIKYNG \n", ofp); fputs(" TTGALESAIA IATGKMLSLA EQQLVDCAQD FNNY------ --GCQGGLPS QAFEYILYNK \n", ofp); fputs("\n", ofp); fputs(" GIQTESSYPY TAETGTQCNF NSANIGAKIS NFTMIP-KNE TVMAGYIVST GPLAIAADAV \n", ofp); fputs(" GIDTEESYPY KGVNGV-CHY KAENAAVQVL DSVNITLNAE DELKNAVGLV RPVSVAFQVI \n", ofp); fputs(" GIMGEDTYPY QGKDGY-CKF QPGKAIGFVK DVANITIYDE EAMVEAVALY NPVSFAFEVT \n", ofp); fputs("\n", ofp); fputs(" E-WQFYIGGV F-DIPCN--P NSLDHGILIV GYSAKNTIFR KNMPYWIVKN SWGADWGEQG \n", ofp); fputs(" DGFRQYKSGV YTSDHCGTTP DDVNHAVLAV GYGVENGV-- ---PYWLIKN SWGADWGDNG \n", ofp); fputs(" QDFMMYRTGI YSSTSCHKTP DKVNHAVLAV GYGEKNGI-- ---PYWIVKN SWGPQWGMNG \n", ofp); fputs("\n", ofp); fputs(" YIYLRRGKNT CGVSNFVSTS II-- \n", ofp); fputs(" YFKMEMGKNM CAIATCASYP VVAA \n", ofp); fputs(" YFLIERGKNM CGLAACASYP IPLV\n", ofp); *ret_format = eslMSAFILE_PHYLIP; *ret_alphatype = eslAMINO; *ret_nseq = 3; *ret_alen = 384; } static void utest_write_good4(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen) { fputs(" 5 42\n", ofp); fputs("Turkey AAGCTNGGGC ATTTCAGGGT\n", ofp); fputs("GAGCCCGGGC AATACAGGGT AT\n", ofp); fputs("Salmo gairAAGCCTTGGC AGTGCAGGGT\n", ofp); fputs("GAGCCGTGGC CGGGCACGGT AT\n", ofp); fputs("H. SapiensACCGGTTGGC CGTTCAGGGT\n", ofp); fputs("ACAGGTTGGC CGTTCAGGGT AA\n", ofp); fputs("Chimp AAACCCTTGC CGTTACGCTT\n", ofp); fputs("AAACCGAGGC CGGGACACTC AT\n", ofp); fputs("Gorilla AAACCCTTGC CGGTACGCTT\n", ofp); fputs("AAACCATTGC CGGTACGCTT AA\n", ofp); *ret_format = eslMSAFILE_PHYLIPS; *ret_alphatype = eslDNA; *ret_nseq = 5; *ret_alen = 42; } static void utest_write_good5(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen) { fputs(" 3 384\n", ofp); fputs("CYS1_DICDI-----MKVILLFVLAVFTVFVSS---------------RGIPPEEQ------------SQFLEFQDKFNKKY-SHEEY\n", ofp); fputs("LERFEIFKSNLGKIEELNLIAINHKADTKFGVNKFADLSSDEFKNYYLNNKEAIFTDDLPVADYLDDEFINSIPTAFDWRTRG-AVTP\n", ofp); fputs("VKNQGQCGSCWSFSTTGNVEGQHFISQNKLVSLSEQNLVDCDHECMEYEGEEACDEGCNGGLQPNAYNYIIKNGGIQTESSYPYTAET\n", ofp); fputs("GTQCNFNSANIGAKISNFTMIP-KNETVMAGYIVSTGPLAIAADAVE-WQFYIGGVF-DIPCN--PNSLDHGILIVGYSAKNTIFRKN\n", ofp); fputs("MPYWIVKNSWGADWGEQGYIYLRRGKNTCGVSNFVSTSII--\n", ofp); fputs("\n", ofp); fputs("ALEU_HORVUMAHARVLLLALAVLATAAVAVASSSSFADSNPIRPVTDRAASTLESAVLGALGRTRHALRFARFAVRYGKSYESAAEVR\n", ofp); fputs("RRFRIFSESLEEVRSTN----RKGLPYRLGINRFSDMSWEEFQATRL-GAAQTCSATLAGNHLMRDA--AALPETKDWREDG-IVSPVK\n", ofp); fputs("NQAHCGSCWTFSTTGALEAAYTQATGKNISLSEQQLVDCAGGFNNF--------GCNGGLPSQAFEYIKYNGGIDTEESYPYKGVNGV-\n", ofp); fputs("CHYKAENAAVQVLDSVNITLNAEDELKNAVGLVRPVSVAFQVIDGFRQYKSGVYTSDHCGTTPDDVNHAVLAVGYGVENGV-----PYW\n", ofp); fputs("LIKNSWGADWGDNGYFKMEMGKNMCAIATCASYPVVAA\n", ofp); fputs("\n", ofp); fputs("CATH_HUMAN------MWATLPLLCAGAWLLGV--------PVCGAAELSVNSLEK------------FHFKSWMSKHRKTY-STEEYH\n", ofp); fputs("HRLQTFASNWRKINAHN----NGNHTFKMALNQFSDMSFAEIKHKYLWSEPQNCSAT--KSNYLRGT--GPYPPSVDWRKKGNFVSPVK\n", ofp); fputs("NQGACGSCWTFSTTGALESAIAIATGKMLSLAEQQLVDCAQDFNNY--------GCQGGLPSQAFEYILYNKGIMGEDTYPYQGKDGY-\n", ofp); fputs("CKFQPGKAIGFVKDVANITIYDEEAMVEAVALYNPVSFAFEVTQDFMMYRTGIYSSTSCHKTPDKVNHAVLAVGYGEKNGI-----PYW\n", ofp); fputs("IVKNSWGPQWGMNGYFLIERGKNMCGLAACASYPIPLV\n", ofp); fputs("\n", ofp); *ret_format = eslMSAFILE_PHYLIPS; *ret_alphatype = eslAMINO; *ret_nseq = 3; *ret_alen = 384; } /* a tricky one, with a nonstandard name width of 12 (all names end in xx) */ static void utest_write_good6(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen) { fputs(" 5 42\n", ofp); fputs("Turkey xxAAGCTNGGGC ATTTCAGGGT\n", ofp); fputs("Salmo gairxxAAGCCTTGGC AGTGCAGGGT\n", ofp); fputs("H. SapiensxxACCGGTTGGC CGTTCAGGGT\n", ofp); fputs("Chimp xxAAACCCTTGC CGTTACGCTT\n", ofp); fputs("Gorilla xxAAACCCTTGC CGGTACGCTT\n", ofp); fputs("\n", ofp); fputs("GAGCCCGGGC AATACAGGGT AT\n", ofp); fputs("GAGCCGTGGC CGGGCACGGT AT\n", ofp); fputs("ACAGGTTGGC CGTTCAGGGT AA\n", ofp); fputs("AAACCGAGGC CGGGACACTC AT\n", ofp); fputs("AAACCATTGC CGGTACGCTT AA\n", ofp); *ret_format = eslMSAFILE_PHYLIP; *ret_alphatype = eslDNA; *ret_nseq = 5; *ret_alen = 42; } /* nonstandard name field width in a sequential file. */ static void utest_write_good7(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen) { fputs(" 5 42\n", ofp); fputs("Turkey xxAAGCTNGGGC ATTTCAGGGT\n", ofp); fputs("GAGCCCGGGC AATACAGGGT AT\n", ofp); fputs("\n", ofp); fputs("Salmo gairxxAAGCCTTGGC AGTGCAGGGT\n", ofp); fputs("GAGCCGTGGC CGGGCACGGT AT\n", ofp); fputs("\n", ofp); fputs("H. SapiensxxACCGGTTGGC CGTTCAGGGT\n", ofp); fputs("ACAGGTTGGC CGTTCAGGGT AA\n", ofp); fputs("\n", ofp); fputs("Chimp xxAAACCCTTGC CGTTACGCTT\n", ofp); fputs("AAACCGAGGC CGGGACACTC AT\n", ofp); fputs("\n", ofp); fputs("Gorilla xxAAACCCTTGC CGGTACGCTT\n", ofp); fputs("AAACCATTGC CGGTACGCTT AA\n", ofp); *ret_format = eslMSAFILE_PHYLIPS; *ret_alphatype = eslDNA; *ret_nseq = 5; *ret_alen = 42; } /* if using strict PHYLIP, 10 char namewidth results in reading a M=21 alignment, * which disagrees with alen=20 */ static void utest_write_bad1(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg) { fputs(" 2 20\n", ofp); fputs("Turkey xAAGCTNGGGC ATTTCAGGGT\n", ofp); fputs("Salmo gairxAAGCCTTGGC AGTGCAGGGT\n", ofp); *ret_alphatype = eslDNA; *ret_errstatus = eslEFORMAT; *ret_linenumber = 3; strcpy(errmsg, "alignment length disagrees"); } static void utest_write_bad2(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg) { fputs(" x 2 20\n", ofp); fputs("Turkey AAGCTNGGGC ATTTCAGGGT\n", ofp); fputs("Salmo gairAAGCCTTGGC AGTGCAGGGT\n", ofp); *ret_alphatype = eslDNA; *ret_errstatus = eslEFORMAT; *ret_linenumber = 1; strcpy(errmsg, "first field isn't an integer"); } static void utest_write_bad3(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg) { fputs(" 2 x\n", ofp); fputs("Turkey AAGCTNGGGC ATTTCAGGGT\n", ofp); fputs("Salmo gairAAGCCTTGGC AGTGCAGGGT\n", ofp); *ret_alphatype = eslDNA; *ret_errstatus = eslEFORMAT; *ret_linenumber = 1; strcpy(errmsg, "second field isn't an integer"); } static void utest_write_bad4(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg) { fputs(" 2\n", ofp); fputs("Turkey AAGCTNGGGC ATTTCAGGGT\n", ofp); fputs("Salmo gairAAGCCTTGGC AGTGCAGGGT\n", ofp); *ret_alphatype = eslDNA; *ret_errstatus = eslEFORMAT; *ret_linenumber = 1; strcpy(errmsg, "only one field found"); } static void utest_write_bad5(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg) { fputs(" 2 20\n", ofp); fputs("\n", ofp); *ret_alphatype = eslDNA; *ret_errstatus = eslEFORMAT; *ret_linenumber = 2; strcpy(errmsg, "no alignment data"); } static void utest_write_bad6(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg) { fputs(" 2 20\n", ofp); fputs("seq1 \n", ofp); fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp); *ret_alphatype = eslAMINO; *ret_errstatus = eslEFORMAT; *ret_linenumber = 2; strcpy(errmsg, "line too short"); } static void utest_write_bad7(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg) { fputs(" 2 20\n", ofp); fputs("\tseq1_name ACDEFGHIKLMNPQRSTVWY\n", ofp); fputs("seq2_name ACDEFGHIKLMNPQRSTVWY\n", ofp); *ret_alphatype = eslAMINO; *ret_errstatus = eslEFORMAT; *ret_linenumber = 2; strcpy(errmsg, "invalid character(s) in sequence name"); } static void utest_write_bad8(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg) { fputs(" 2 20\n", ofp); fputs("seq1_name ACDEFGHIKLMNPQRSTVWY\n", ofp); fputs("seq2_name ACDEFGHI~LMNPQRSTVWY\n", ofp); *ret_alphatype = eslAMINO; *ret_errstatus = eslEFORMAT; *ret_linenumber = 3; strcpy(errmsg, "one or more invalid sequence characters"); } static void utest_write_bad9(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg) { fputs(" 2 20\n", ofp); fputs("seq1_name ACDEFGHIKLMNPQRSTVWY\n", ofp); fputs("seq2_name ACDEFGHIKLMNPQRSTVW\n", ofp); *ret_alphatype = eslAMINO; *ret_errstatus = eslEFORMAT; *ret_linenumber = 3; strcpy(errmsg, "number of residues on line differs from previous seqs"); } static void utest_write_bad10(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg) { fputs(" 3 40\n", ofp); fputs("seq1_name ACDEFGHIKLMNPQRSTVWY\n", ofp); fputs("seq2_name ACDEFGHIKLMNPQRSTVWY\n", ofp); fputs("\n", ofp); fputs("ACDEFGHIKLMNPQRSTVWY\n", ofp); fputs("ACDEFGHIKLMNPQRSTVWY\n", ofp); *ret_alphatype = eslAMINO; *ret_errstatus = eslEFORMAT; *ret_linenumber = 4; strcpy(errmsg, "unexpected number of sequences in block"); } static void utest_write_bad11(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg) { fputs(" 2 30\n", ofp); fputs("seq1_name ACDEFGHIKLMNPQRSTVWY\n", ofp); fputs("seq2_name ACDEFGHIKLMNPQRSTVWY\n", ofp); fputs("\n", ofp); fputs(" ACDEFGHIKLMNPQRSTVWY\n", ofp); fputs(" ACDEFGHIKLMNPQRSTVWY\n", ofp); fputs("\n", ofp); *ret_alphatype = eslAMINO; *ret_errstatus = eslEFORMAT; *ret_linenumber = 7; strcpy(errmsg, "alignment length disagrees with header"); } /* example of a pathological file where interleaved, sequential * can't be distinguished. If we read this as interleaved: * seq1 AAAAAAAAAA CCCCCCCCCC YYYYYYYYY FFFFFFFFFF GGGGGGGGGG * YYYYYYYYY DDDDDDDDDD EEEEEEEEEE HHHHHHHHH IIIIIIIIII KKKKKKKKKK * whereas if we read it as sequential: * seq1 AAAAAAAAAA CCCCCCCCCC YYYYYYYYY DDDDDDDDDD EEEEEEEEEE * YYYYYYYYY FFFFFFFFFF GGGGGGGGGG HHHHHHHHH IIIIIIIIII KKKKKKKKKK * * To fall into this, sequence names have to look like a chunk of * aligned sequence, with exactly the right length to give a correct * whichever way we read the file. */ static void utest_write_ambig1(FILE *ofp) { fputs(" 2 49\n", ofp); fputs("seq1 AAAAAAAAAA CCCCCCCCCC\n", ofp); fputs("YYYYYYYYY DDDDDDDDDD EEEEEEEEEE\n", ofp); fputs("YYYYYYYYY FFFFFFFFFF GGGGGGGGGG\n", ofp); fputs("HHHHHHHHH IIIIIIIIII KKKKKKKKKK\n", ofp); } static void utest_goodfile(char *filename, int testnumber, int expected_format, int expected_alphatype, int expected_nseq, int expected_alen) { ESL_ALPHABET *abc = NULL; ESL_MSAFILE *afp = NULL; ESL_MSAFILE_FMTDATA fmtd; /* for writing formats with nonstandard name field widths */ ESL_MSA *msa1 = NULL; ESL_MSA *msa2 = NULL; char tmpfile1[32] = "esltmpXXXXXX"; char tmpfile2[32] = "esltmpXXXXXX"; FILE *ofp = NULL; int status; esl_msafile_fmtdata_Init(&fmtd); /* guessing both the format and the alphabet should work: this is a digital open */ if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("phylip good file unit test %d failed: digital open", testnumber); if (afp->format != expected_format) esl_fatal("phylip good file unit test %d failed: format autodetection", testnumber); if (abc->type != expected_alphatype) esl_fatal("phylip good file unit test %d failed: alphabet autodetection", testnumber); /* This is a digital read, using . */ if ( (status = esl_msafile_phylip_Read(afp, &msa1)) != eslOK) esl_fatal("phylip good file unit test %d failed: msa read, digital", testnumber); if (msa1->nseq != expected_nseq || msa1->alen != expected_alen) esl_fatal("phylip good file unit test %d failed: nseq/alen", testnumber); if (esl_msa_Validate(msa1, NULL) != eslOK) esl_fatal("phylip good file test %d failed: msa1 invalid", testnumber); fmtd.namewidth = afp->fmtd.namewidth; esl_msafile_Close(afp); /* write it back out to a new tmpfile (digital write) */ if ( (status = esl_tmpfile_named(tmpfile1, &ofp)) != eslOK) esl_fatal("phylip good file unit test %d failed: tmpfile creation", testnumber); if ( (status = esl_msafile_phylip_Write(ofp, msa1, expected_format, &fmtd)) != eslOK) esl_fatal("phylip good file unit test %d failed: msa write, digital", testnumber); fclose(ofp); /* now open and read it as text mode, in known format. (We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */ if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, expected_format, &fmtd, &afp)) != eslOK) esl_fatal("phylip good file unit test %d failed: text mode open", testnumber); if ( (status = esl_msafile_phylip_Read(afp, &msa2)) != eslOK) esl_fatal("phylip good file unit test %d failed: msa read, text", testnumber); if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("phylip good file test %d failed: msa2 invalid", testnumber); if (msa2->nseq != expected_nseq || msa2->alen != expected_alen) esl_fatal("phylip good file unit test %d failed: nseq/alen", testnumber); fmtd.namewidth = afp->fmtd.namewidth; esl_msafile_Close(afp); /* write it back out to a new tmpfile (text write) */ if ( (status = esl_tmpfile_named(tmpfile2, &ofp)) != eslOK) esl_fatal("phylip good file unit test %d failed: tmpfile creation", testnumber); if ( (status = esl_msafile_phylip_Write(ofp, msa2, expected_format, &fmtd)) != eslOK) esl_fatal("phylip good file unit test %d failed: msa write, text", testnumber); fclose(ofp); esl_msa_Destroy(msa2); /* open and read it in digital mode */ if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, expected_format, &fmtd, &afp)) != eslOK) esl_fatal("phylip good file unit test %d failed: 2nd digital mode open", testnumber); if ( (status = esl_msafile_phylip_Read(afp, &msa2)) != eslOK) esl_fatal("phylip good file unit test %d failed: 2nd digital msa read", testnumber); if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("phylip good file test %d failed: msa2 invalid", testnumber); esl_msafile_Close(afp); /* this msa should be identical to */ if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("phylip good file unit test %d failed: msa compare", testnumber); remove(tmpfile1); remove(tmpfile2); esl_msa_Destroy(msa1); esl_msa_Destroy(msa2); esl_alphabet_Destroy(abc); } /* utest_badfile: * Test the strict PHYLIP parser's ability to detect bad input; * regression test its user-directed error messages. * * note: this test uses strict phylip, with 10 char name width. * it's not using the format guesser, which could determine a nonstandard name width * some "bad" utests can be parsed differently by esl_msafile_phylip_example, * which does use the format guesser * * TODO: we should also have "bad" tests for sequential PHYLIP, and for * nonstandard name widths. * */ static void utest_badfile(char *filename, int testnumber, int expected_alphatype, int expected_status, int expected_linenumber, char *expected_errmsg) { ESL_ALPHABET *abc = esl_alphabet_Create(expected_alphatype); ESL_MSAFILE *afp = NULL; ESL_MSA *msa = NULL; int status; if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_PHYLIP, NULL, &afp)) != eslOK) esl_fatal("phylip bad file unit test %d failed: unexpected open failure", testnumber); if ( (status = esl_msafile_phylip_Read(afp, &msa)) != expected_status) esl_fatal("phylip bad file unit test %d failed: unexpected error code", testnumber); if (strstr(afp->errmsg, expected_errmsg) == NULL) esl_fatal("phylip bad file unit test %d failed: unexpected errmsg", testnumber); if (afp->linenumber != expected_linenumber) esl_fatal("phylip bad file unit test %d failed: unexpected linenumber", testnumber); esl_msafile_Close(afp); esl_alphabet_Destroy(abc); esl_msa_Destroy(msa); } static void utest_ambigfile(char *filename, int testnumber) { ESL_BUFFER *bf = NULL; int fmt; int namewidth; if ( esl_buffer_Open(filename, NULL, &bf) != eslOK) esl_fatal("phylip ambig file unit test %d failed: buffer open", testnumber); if ( esl_msafile_phylip_CheckFileFormat(bf, &fmt, &namewidth) != eslEAMBIGUOUS) esl_fatal("phylip ambig file unit test %d failed: ambiguity detection", testnumber); if ( fmt != eslMSAFILE_UNKNOWN ) esl_fatal("phylip ambig file unit test %d failed: format code", testnumber); if ( namewidth != 0 ) esl_fatal("phylip ambig file unit test %d failed: namewidth not 0", testnumber); esl_buffer_Close(bf); } /* PHYLIP's seqboot program can output many MSAs to the same phylip file. * For this reason (only), we allow PHYLIP format to have multiple MSAs per file. * PHYLIP format 'officially' does not document this! */ static void utest_seqboot(void) { char msg[] = "seqboot unit test failed"; char tmpfile[32]; FILE *ofp; int expected_fmt; int expected_alphatype; int expected_nseq; int expected_alen; int expected_nali; int i; ESL_ALPHABET *abc = NULL; ESL_MSAFILE *afp = NULL; ESL_MSA *msa = NULL; /* Write a tmp testfile with good1 concatenated 3 times. */ expected_nali = 3; strcpy(tmpfile, "esltmpXXXXXX"); if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg); for (i = 0; i < expected_nali; i++) utest_write_good1(ofp, &expected_fmt, &expected_alphatype, &expected_nseq, &expected_alen); fclose(ofp); /* open it and loop over it, reading MSAs. there should be 3, of the expected size. */ if (esl_msafile_Open(&abc, tmpfile, /*env=*/NULL, eslMSAFILE_UNKNOWN, /*fmtd=*/NULL, &afp) != eslOK) esl_fatal(msg); if (abc->type != expected_alphatype) esl_fatal(msg); if (afp->format != expected_fmt) esl_fatal(msg); i = 0; while ( esl_msafile_Read(afp, &msa) == eslOK) { i++; if (msa->nseq != expected_nseq) esl_fatal(msg); if (msa->alen != expected_alen) esl_fatal(msg); esl_msa_Destroy(msa); } if (i != expected_nali) esl_fatal(msg); remove(tmpfile); esl_msafile_Close(afp); esl_alphabet_Destroy(abc); } #endif /*eslMSAFILE_PHYLIP_TESTDRIVE*/ /*---------------------- end, unit tests ------------------------*/ /***************************************************************** * 7. Test driver *****************************************************************/ #ifdef eslMSAFILE_PHYLIP_TESTDRIVE /* compile: gcc -g -Wall -I. -L. -o esl_msafile_phylip_utest -DeslMSAFILE_PHYLIP_TESTDRIVE esl_msafile_phylip.c -leasel -lm * (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_phylip_utest -DeslMSAFILE_PHYLIP_TESTDRIVE esl_msafile_phylip.c -leasel -lm * run: ./esl_msafile_phylip_utest */ #include "esl_config.h" #include #include "easel.h" #include "esl_getopts.h" #include "esl_msafile.h" #include "esl_msafile_phylip.h" static ESL_OPTIONS options[] = { /* name type default env range togs reqs incomp help docgrp */ {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0}, { 0,0,0,0,0,0,0,0,0,0}, }; static char usage[] = "[-options]"; static char banner[] = "test driver for PHYLIP MSA format module"; int main(int argc, char **argv) { char msg[] = "PHYLIP MSA i/o module test driver failed"; ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage); FILE *ofp = NULL; int ngoodtests = 7; int nbadtests = 11; int nambigtests = 1; int testnumber = 0; char tmpfile[32]; int expected_format; int expected_alphatype; int expected_errstatus; int expected_linenumber; int expected_nseq; int expected_alen; char expected_errmsg[eslERRBUFSIZE]; /* Test various correct versions of the format */ for (testnumber = 1; testnumber <= ngoodtests; testnumber++) { strcpy(tmpfile, "esltmpXXXXXX"); if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg); switch (testnumber) { case 1: utest_write_good1 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break; case 2: utest_write_good2 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break; case 3: utest_write_good3 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break; case 4: utest_write_good4 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break; case 5: utest_write_good5 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break; case 6: utest_write_good6 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break; case 7: utest_write_good7 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break; } fclose(ofp); utest_goodfile(tmpfile, testnumber, expected_format, expected_alphatype, expected_nseq, expected_alen); remove(tmpfile); } /* Test for all the possible EFORMAT errors (using strict Phylip interleaved parser) */ for (testnumber = 1; testnumber <= nbadtests; testnumber++) { strcpy(tmpfile, "esltmpXXXXXX"); if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg); switch (testnumber) { case 1: utest_write_bad1 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break; case 2: utest_write_bad2 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break; case 3: utest_write_bad3 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break; case 4: utest_write_bad4 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break; case 5: utest_write_bad5 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break; case 6: utest_write_bad6 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break; case 7: utest_write_bad7 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break; case 8: utest_write_bad8 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break; case 9: utest_write_bad9 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break; case 10: utest_write_bad10(ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break; case 11: utest_write_bad11(ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break; } fclose(ofp); utest_badfile(tmpfile, testnumber, expected_alphatype, expected_errstatus, expected_linenumber, expected_errmsg); remove(tmpfile); } /* Test that we correctly detect pathological files that look both interleaved and sequential */ for (testnumber = 1; testnumber <= nambigtests; testnumber++) { strcpy(tmpfile, "esltmpXXXXXX"); if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg); switch (testnumber) { case 1: utest_write_ambig1 (ofp); } fclose(ofp); utest_ambigfile(tmpfile, testnumber); remove(tmpfile); } /* Other tests */ utest_seqboot(); esl_getopts_Destroy(go); return 0; } #endif /*eslMSAFILE_PHYLIP_TESTDRIVE*/ /*--------------------- end, test driver ------------------------*/ /***************************************************************** * 8. Example. *****************************************************************/ #ifdef eslMSAFILE_PHYLIP_EXAMPLE /* A full-featured example of reading/writing an MSA in Phylip format(s). gcc -g -Wall -o esl_msafile_phylip_example -I. -L. -DeslMSAFILE_PHYLIP_EXAMPLE esl_msafile_phylip.c -leasel -lm ./esl_msafile_phylip_example */ /*::cexcerpt::msafile_phylip_example::begin::*/ #include #include "easel.h" #include "esl_alphabet.h" #include "esl_getopts.h" #include "esl_msa.h" #include "esl_msafile.h" #include "esl_msafile_phylip.h" static ESL_OPTIONS options[] = { /* name type default env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 }, { "-1", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-1", "no autodetection; use interleaved PHYLIP", 0 }, { "-2", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-2", "no autodetection; use sequential PHYLIPS", 0 }, { "-q", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "quieter: don't write msa back, just summary", 0 }, { "-t", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use text mode: no digital alphabet", 0 }, { "-w", eslARG_INT, "10", NULL, NULL, NULL, NULL, NULL, "specify that format's name width is ", 0 }, { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is DNA", 0 }, { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is RNA", 0 }, { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is protein", 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; static char usage[] = "[-options] "; static char banner[] = "example of guessing, reading, writing PHYLIP formats"; int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *filename = esl_opt_GetArg(go, 1); int infmt = eslMSAFILE_UNKNOWN; ESL_ALPHABET *abc = NULL; ESL_MSAFILE *afp = NULL; ESL_MSA *msa = NULL; ESL_MSAFILE_FMTDATA fmtd; int status; if (esl_opt_GetBoolean(go, "-1")) infmt = eslMSAFILE_PHYLIP; /* interleaved format */ else if (esl_opt_GetBoolean(go, "-2")) infmt = eslMSAFILE_PHYLIPS; /* sequential format */ if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); /* Variant PHYLIP formats allow nonstandard name field width. * Usually we can successfully guess this, when guessing format. * But if PHYLIP or PHYLIPS format is set (no guessing), caller may also * want to allow a nonstandard name field width to be set. */ esl_msafile_fmtdata_Init(&fmtd); fmtd.namewidth = esl_opt_GetInteger(go, "-w"); /* Text mode: pass NULL for alphabet. * Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed */ if (esl_opt_GetBoolean(go, "-t")) status = esl_msafile_Open(NULL, filename, NULL, infmt, &fmtd, &afp); else status = esl_msafile_Open(&abc, filename, NULL, infmt, &fmtd, &afp); if (status != eslOK) esl_msafile_OpenFailure(afp, status); if ( (status = esl_msafile_phylip_Read(afp, &msa)) != eslOK) esl_msafile_ReadFailure(afp, status); printf("format variant: %s\n", esl_msafile_DecodeFormat(afp->format)); printf("name width: %d\n", afp->fmtd.namewidth); printf("alphabet: %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)")); printf("# of seqs: %d\n", msa->nseq); printf("# of cols: %d\n", (int) msa->alen); printf("\n"); if (afp->fmtd.namewidth != 10) fmtd.namewidth = afp->fmtd.namewidth; if (! esl_opt_GetBoolean(go, "-q")) esl_msafile_phylip_Write(stdout, msa, eslMSAFILE_PHYLIP, &fmtd); esl_msa_Destroy(msa); esl_msafile_Close(afp); if (abc) esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); exit(0); } /*::cexcerpt::msafile_phylip_example::end::*/ #endif /*eslMSAFILE_PHYLIP_EXAMPLE*/ #ifdef eslMSAFILE_PHYLIP_EXAMPLE2 /* A minimal example. Reading a strict interleaved PHYLIP MSA in text mode. gcc -g -Wall -o esl_msafile_phylip_example2 -I. -L. -DeslMSAFILE_PHYLIP_EXAMPLE2 esl_msafile_phylip.c -leasel -lm ./esl_msafile_phylip_example2 */ /*::cexcerpt::msafile_phylip_example2::begin::*/ #include #include "easel.h" #include "esl_msa.h" #include "esl_msafile.h" #include "esl_msafile_phylip.h" int main(int argc, char **argv) { char *filename = argv[1]; int infmt = eslMSAFILE_PHYLIP; /* or eslMSAFILE_PHYLIPS, for sequential format */ ESL_MSAFILE *afp = NULL; ESL_MSA *msa = NULL; int status; if ( (status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status); if ( (status = esl_msafile_phylip_Read(afp, &msa)) != eslOK) esl_msafile_ReadFailure(afp, status); printf("# of seqs: %d\n", msa->nseq); printf("# of cols: %d\n", (int) msa->alen); printf("\n"); esl_msafile_phylip_Write(stdout, msa, eslMSAFILE_PHYLIP, NULL); esl_msa_Destroy(msa); esl_msafile_Close(afp); exit(0); } /*::cexcerpt::msafile_phylip_example::end::*/ #endif /*eslMSAFILE_PHYLIP_EXAMPLE*/ /*--------------------- end of examples -------------------------*/