/* psl.h was originally generated by the autoSql program, which also 
 * generated psl.c and psl.sql.  This header links the database and 
 * the RAM representation of objects.   Additional functions were
 * added later. 
 *
 * This file is copyright 2002 Jim Kent, but license is hereby
 * granted for all use - public, private or commercial. */

#ifndef PSL_H
#define PSL_H

#ifndef LOCALMEM_H
#include "localmem.h"
#endif 

#ifndef LINEFILE_H
#include "linefile.h"
#endif

#ifndef FUZZYFIND_H
#include "fuzzyFind.h"
#endif

#ifndef DNASEQ_H
#include "dnaseq.h"
#endif

/* Some forward declarations of structures used but not defined here. */
struct rbTree;

#define PSL_NUM_COLS  21  /* number of columns in a PSL */
#define PSLX_NUM_COLS 23  /* number of columns in a PSLX (extended psl) */

#define PSL_XA_FORMAT 0x04  /* option for pslNew: add XA format columns */

/* options for pslFromAlign */
#define PSL_IS_SOFTMASK 0x01 /* lower case bases indicate repeat masking */

/* options for pslCheck2 */
#define PSL_CHECK_IGNORE_INSERT_CNTS 0x01 /* Don't check insert counts in psl */

struct psl
/* Summary info about a patSpace alignment.  The per-block arrays
 * (blockSizes, qStarts, tStarts) are indexed by block number; see the
 * pslQStart/pslQEnd/pslTStart/pslTEnd accessors below for how block
 * coordinates are derived from them. */
    {
    struct psl *next;  /* Next in singly linked list. */
    unsigned match;	/* Number of bases that match that aren't repeats */
    unsigned misMatch;	/* Number of bases that don't match */
    unsigned repMatch;	/* Number of bases that match but are part of repeats */
    unsigned nCount;	/* Number of 'N' bases */
    unsigned qNumInsert;	/* Number of inserts in query */
    int qBaseInsert;	/* Number of bases inserted in query */
    unsigned tNumInsert;	/* Number of inserts in target */
    int tBaseInsert;	/* Number of bases inserted in target */
    char strand[3];	/* + or - for strand.  strand[0] is the query strand;
                         * strand[1] is '-' for an explicit minus target strand,
                         * anything else is treated as an implied '+'
                         * (see pslQStrand/pslTStrand). */
    char *qName;	/* Query sequence name */
    unsigned qSize;	/* Query sequence size */
    int qStart;	/* Alignment start position in query */
    int qEnd;	/* Alignment end position in query */
    char *tName;	/* Target sequence name */
    unsigned tSize;	/* Target sequence size */
    int tStart;	/* Alignment start position in target */
    int tEnd;	/* Alignment end position in target */
    unsigned blockCount;	/* Number of blocks in alignment */
    unsigned *blockSizes;	/* Size of each block.  For a protein psl
                                 * (see pslIsProtein) pslTEnd scales this by 3
                                 * on the target side. */
    unsigned *qStarts;	/* Start of each block in query. */
    unsigned *tStarts;	/* Start of each block in target. */

    /* NOTE(review): the two fields below look like the extra pslx columns
     * (PSLX_NUM_COLS - PSL_NUM_COLS == 2) - confirm against psl.c. */
    char **qSequence;  /* query sequence for each block */
    char **tSequence;  /* target sequence for each block */
    };

struct psl *pslxLoad(char **row);
/* Load a pslx from row fetched with select * from psl
 * from database.  Dispose of this with pslFree(). */

struct psl *pslLoad(char **row);
/* Load a psl from row fetched with select * from psl
 * from database.  Dispose of this with pslFree(). */

struct psl *pslCommaIn(char **pS, struct psl *ret);
/* Create a psl out of a comma separated string. 
 * This will fill in ret if non-null, otherwise will
 * return a new psl */

void pslFree(struct psl **pEl);
/* Free a single dynamically allocated psl such as created
 * with pslLoad(). */

void pslFreeList(struct psl **pList);
/* Free a list of dynamically allocated psl's */

void pslOutput(struct psl *el, FILE *f, char sep, char lastSep);
/* Print out psl.  Separate fields with sep. Follow last field with lastSep. */

#define pslTabOut(el,f) pslOutput(el,f,'\t','\n')
/* Print out psl as a line in a tab-separated file: fields are separated
 * by tabs and the last field is followed by a newline. */

#define pslCommaOut(el,f) pslOutput(el,f,',',',')
/* Print out psl as a comma separated list including final comma. */

/* ----- end autoSql generated part --------------- */

void pslOutFormat(struct psl *el, FILE *f, char sep, char lastSep);
/* Print out selected psl values.  Separate fields with sep. Follow last field with lastSep.
 * Prints out a better format with bold field headings followed by value.
 * Requires further upstream work to ensure that only the field headers
 * declared here are printed if replacing an existing psl print function. */

struct psl *pslLoadAll(char *fileName);
/* Load all psl's in file. */

struct psl *pslNext(struct lineFile *lf);
/* Read next line from file and convert it to psl.  Return
 * NULL at eof. */

struct psl *pslxLoadLm(char **row, struct lm *lm);
/* Load row into local memory pslx. */

struct psl *pslLoadLm(char **row, struct lm *lm);
/* Load row into local memory psl. */

void pslWriteHead(FILE *f);
/* Write head of psl. */

void pslxWriteHead(FILE *f, enum gfType qType, enum gfType tType);
/* Write head of pslx (extended psl). */

void pslWriteAll(struct psl *pslList, char *fileName, boolean writeHeader);
/* Write a psl file from list. */

void pslWriteAllJson(struct psl *pslList, FILE *f, char *db, boolean writeHeader);
/* Write a psl file from list as a JSON array. */

struct lineFile *pslFileOpen(char *fileName);
/* Read header part of psl and make sure it's right. 
 * Return line file handle to it. */

struct lineFile *pslFileOpenWithMeta(char *fileName, FILE *f);
/* Read header part of psl and make sure it's right. 
 * Return line file handle to it and send meta data to output file f */

struct lineFile *pslFileOpenWithUniqueMeta(char *fileName, FILE *f);
/* Read header part of psl and make sure it's right.
 * Set flag to suppress duplicate header comments.
 * Return line file handle to it. */

void pslxFileOpen(char *fileName, enum gfType *retQueryType, 
	enum gfType *retTargetType, struct lineFile **retLf);
/* Read header part of psl and make sure it's right.  Return
 * sequence types and file handle. */

void pslxFileOpenWithMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f);
/* Read header part of psl and make sure it's right.  Return
 * sequence types and file handle and send meta data to output file f */

void pslxFileOpenWithUniqueMeta(char *fileName, enum gfType *retQueryType, enum gfType *retTargetType, struct lineFile **retLf, FILE *f);
/* Read header part of psl and make sure it's right.  Return
 * sequence types and file handle and send only unique meta data to output f */

int pslCmpQuery(const void *va, const void *vb);
/* Compare to sort based on query. */

int pslCmpTarget(const void *va, const void *vb);
/* Compare to sort based on target. */

int pslCmpTargetStart(const void *va, const void *vb);
/* Compare to sort based on target start. */

int pslCmpTargetScore(const void *va, const void *vb);
/* Compare to sort based on target then score. */

int pslCmpTargetAndStrand(const void *va, const void *vb);
/* Compare to sort based on target, strand,  tStart. */

int pslCmpScore(const void *va, const void *vb);
/* Compare to sort based on score (descending). */

int pslCmpQueryScore(const void *va, const void *vb);
/* Compare to sort based on query then score (descending). */

int pslCalcMilliBad(struct psl *psl, boolean isMrna);
/* Calculate badness in parts per thousand. */

int pslCmpScoreDesc(const void *va, const void *vb);
/* Compare to sort based on score descending. */

int pslCmpMatch(const void *va, const void *vb);
/* Compare to sort based on match. */

int pslScore(const struct psl *psl);
/* Return score for psl. */

struct ffAli *pslToFfAli(struct psl *psl, struct dnaSeq *query, struct dnaSeq *target,
	int targetOffset);
/* Convert from psl to ffAli format. */

struct ffAli *pslToFakeFfAli(struct psl *psl, DNA *needle, DNA *haystack);
/* Convert from psl to ffAli format.  In some cases you can pass NULL
 * for needle and haystack - depending what the post-processing is going
 * to be. */

struct psl *pslFromFakeFfAli(struct ffAli *ff, 
	DNA *needle, DNA *haystack, char strand,
	char *qName, int qSize, char *tName, int tSize);
/* This will create a basic psl structure from a sorted series of ffAli
 * blocks.  The fields that would need actual sequence to be filled in
 * are left zero however - fields including match, repMatch, mismatch. */

int pslOrientation(struct psl *psl);
/* Translate psl strand + or - to orientation +1 or -1 */

INLINE char pslQStrand(struct psl *psl)
/* Return the query strand, which is the first character of the
 * psl strand field. */
{
const char *strand = psl->strand;
return *strand;
}

INLINE char pslTStrand(struct psl *psl)
/* Return the target strand.  Only an explicit '-' in the second strand
 * character gives '-'; anything else is reported as an implied '+'. */
{
if (psl->strand[1] == '-')
    return '-';
return '+';
}

int pslWeightedIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset);
/* Return >0 if introns make it look like alignment is on + strand,
 *        <0 if introns make it look like alignment is on - strand,
 *        0 if can't tell.  The absolute value of the return indicates
 * how many splice sites we've seen supporting the orientation.
 * Sequence should NOT be reverse complemented.  */

int pslIntronOrientation(struct psl *psl, struct dnaSeq *genoSeq, int offset);
/* Return 1 if introns make it look like alignment is on + strand,
 *       -1 if introns make it look like alignment is on - strand,
 *        0 if can't tell.
 * Sequence should NOT be reverse complemented.  */

boolean pslHasIntron(struct psl *psl, struct dnaSeq *seq, int seqOffset);
/* Return TRUE if there's a probable intron. Sequence should NOT be
 * reverse complemented. */

void pslTailSizes(struct psl *psl, int *retStartTail, int *retEndTail);
/* Find the length of "tails" (rather than extensions) implied by psl. */

void pslRc(struct psl *psl);
/* Reverse-complement a PSL alignment.  This makes the target strand explicit. */

void pslSwap(struct psl *psl, boolean noRc);
/* Swap query and target in psl.  If noRc is TRUE, don't reverse-complement
 * PSL if needed, instead make target strand explicit. */

void pslTargetOffset(struct psl *psl, int offset);
/* Add offset to target positions in psl. */

void pslDump(struct psl *psl, FILE *f);
/* Dump most of PSL to file - for debugging. */

struct psl *pslTrimToTargetRange(struct psl *oldPsl, int tMin, int tMax);
/* Return psl trimmed to fit inside tMin/tMax.  Note this does not
 * update the match/misMatch and related fields. */

struct psl *pslTrimToQueryRange(struct psl *oldPsl, int qMin, int qMax);
/* Return psl trimmed to fit inside qMin/qMax.  Note this does not
 * update the match/misMatch and related fields. */

void pslRecalcBounds(struct psl *psl);
/* Calculate qStart/qEnd tStart/tEnd at top level to be consistent
 * with blocks. */

void pslRecalcMatchCounts(struct psl *psl);
/* Update the match/mismatch counts in PSL, assuming everything is a match. */

int pslCheck(char *pslDesc, FILE* out, struct psl* psl);
/* Validate a PSL for consistency.  pslDesc is printed with the error messages,
 * which go to file out (open /dev/null to discard). Return count of errors. */

int pslCheck2(unsigned opts, char *pslDesc, FILE* out, struct psl* psl);
/* Validate a PSL for consistency.  pslDesc is printed with the error messages,
 * which go to file out (open /dev/null to discard). Return count of errors.
 * Option PSL_CHECK_IGNORE_INSERT_CNTS skips validation of the insert count
 * fields in each PSL.  Useful because protein PSL doesn't seem to compute
 * these in a consistent way.
 */

int pslCountBlocks(struct psl *target, struct psl *query, int maxBlockGap);
/* Count the number of blocks in the query that overlap the target,
 * merging blocks that are closer than maxBlockGap. */

struct hash *readPslToBinKeeper(char *sizeFileName, char *pslFileName);
/* read a list of psls and return results in hash of binKeeper structure for fast query*/

boolean pslIsProtein(const struct psl *psl);
/* Is psl a protein psl (are its blockSizes and scores in protein space)? */

struct psl* pslFromAlign(char *qName, int qSize, int qStart, int qEnd, char *qString,
                         char *tName, int tSize, int tStart, int tEnd, char *tString,
                         char* strand, unsigned options);
/* Create a PSL from an alignment.  Use option PSL_IS_SOFTMASK if lower case
 * bases indicate repeat masking.  Returns NULL if alignment is empty after
 * trimming leading and trailing indels. */

int pslShowAlignment(struct psl *psl, boolean isProt,
	char *qName, bioSeq *qSeq, int qStart, int qEnd,
	char *tName, bioSeq *tSeq, int tStart, int tEnd, FILE *f);
/* Show protein/DNA alignment or translated DNA alignment in HTML format. */

int pslGenoShowAlignment(struct psl *psl, boolean isProt,
		      char *qName, bioSeq *qSeq, int qStart, int qEnd,
		      char *tName, bioSeq *tSeq, int tStart, int tEnd, int exnStarts[], int exnEnds[], int exnCnt, FILE *f);
/* Show protein/DNA alignment or translated DNA alignment in HTML format. */

struct psl* pslNew(char *qName, unsigned qSize, int qStart, int qEnd,
                   char *tName, unsigned tSize, int tStart, int tEnd,
                   char *strand, unsigned blockSpace, unsigned opts);
/* Create a new psl with space for the specified number of blocks allocated.
 * pslGrow may be used to expand this space if needed.  Valid options are
 * PSL_XA_FORMAT. */

void pslGrow(struct psl *psl, int *blockSpacePtr);
/* Increase memory allocated to a psl to hold more blocks.  blockSpacePtr
 * should point to the current maximum number of blocks and will be
 * updated with the new amount of space. */

void pslComputeInsertCounts(struct psl *psl);
/* compute numInsert and baseInsert fields from the blocks */

struct psl* pslFromGff3Cigar(char *qName, int qSize, int qStart, int qEnd,
                             char *tName, int tSize, int tStart, int tEnd,
                             char* strand, char *cigar);
/* create a PSL from a GFF3-style cigar formatted alignment */

int pslRangeTreeOverlap(struct psl *psl, struct rbTree *rangeTree);
/* Return amount that psl overlaps (on target side) with rangeTree. */

float pslIdent(struct psl *psl);
/* Compute fraction identity. */

float pslQueryAligned(struct psl *psl);
/* compute fraction of query that was aligned */

INLINE unsigned pslQStart(struct psl *psl, int blkIdx)
/* Return the start of block blkIdx in the query, as stored in qStarts. */
{
unsigned *blockStarts = psl->qStarts;
return blockStarts[blkIdx];
}

INLINE unsigned pslTStart(struct psl *psl, int blkIdx)
/* Return the start of block blkIdx in the target, as stored in tStarts. */
{
unsigned *blockStarts = psl->tStarts;
return blockStarts[blkIdx];
}

INLINE unsigned pslQEnd(struct psl *psl, int blkIdx)
/* Return the end of block blkIdx in the query: the block's query start
 * plus its size. */
{
unsigned blockStart = psl->qStarts[blkIdx];
unsigned blockSize = psl->blockSizes[blkIdx];
return blockStart + blockSize;
}

INLINE unsigned pslTEnd(struct psl *psl, int blkIdx)
/* Return the end of block blkIdx in the target: the block's target start
 * plus its size.  For a protein psl (per pslIsProtein) the block size is
 * multiplied by 3 before being added. */
{
/* Decide the scale first, then read the block arrays. */
unsigned scale = pslIsProtein(psl) ? 3 : 1;
return psl->tStarts[blkIdx] + scale * psl->blockSizes[blkIdx];
}

INLINE unsigned pslQStartForStrand(struct psl *psl, int blkIdx, char strand)
/* Return the query start of block blkIdx expressed on the requested strand.
 * When strand differs from the psl's query strand the coordinate is
 * flipped: qSize minus the block's query end. */
{
if (pslQStrand(psl) != strand)
    return psl->qSize - pslQEnd(psl, blkIdx);
return psl->qStarts[blkIdx];
}

INLINE unsigned pslQEndForStrand(struct psl *psl, int blkIdx, char strand)
/* Return the query end of block blkIdx expressed on the requested strand.
 * When strand differs from the psl's query strand the coordinate is
 * flipped: qSize minus the block's query start. */
{
if (pslQStrand(psl) != strand)
    return psl->qSize - pslQStart(psl, blkIdx);
return pslQEnd(psl, blkIdx);
}

INLINE unsigned pslTStartForStrand(struct psl *psl, int blkIdx, char strand)
/* Return the target start of block blkIdx expressed on the requested strand.
 * When strand differs from the psl's target strand (as reported by
 * pslTStrand) the coordinate is flipped: tSize minus the block's target end. */
{
if (pslTStrand(psl) != strand)
    return psl->tSize - pslTEnd(psl, blkIdx);
return psl->tStarts[blkIdx];
}

INLINE unsigned pslTEndForStrand(struct psl *psl, int blkIdx, char strand)
/* Return the target end of block blkIdx expressed on the requested strand.
 * When strand differs from the psl's target strand (as reported by
 * pslTStrand) the coordinate is flipped: tSize minus the block's target
 * start. */
{
if (pslTStrand(psl) != strand)
    return psl->tSize - pslTStart(psl, blkIdx);
return pslTEnd(psl, blkIdx);
}

struct psl* pslClone(struct psl *psl);
/* clone a psl */

extern char *pslSortList[5];

void pslSortListByVar(struct psl **pslList, char *sort);
/* Sort a list of psls using the method defined in the sort string. */

void pslRemoveFrameShifts(struct psl *psl);
/* Remove any frameshifts if present. Changes in place, doesn't update statistics in first nine fields. */
#endif /* PSL_H */

