/* object using in building a PSL from a blast record */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "pslBuild.h"
#include "psl.h"

unsigned pslBuildGetBlastAlgo(char *program)
/* determine blast algorithm flags */
{
if (sameWord(program, "BLASTN"))
    return blastn;
else if (sameWord(program, "BLASTP"))
    return blastp;
else if (sameWord(program, "BLASTX"))
    return blastx;
else if (sameWord(program, "TBLASTN"))
    return tblastn;
else if (sameWord(program, "TBLASTX"))
    return tblastx;
else if (sameWord(program, "PSIBLAST"))
    return psiblast;
else
    errAbort("unknown BLAST program: \"%s\"", program);
return 0;
}

struct block
/* coordinates of a block use during build */
{
    char *qAln;          /* alignment strings (not owned) */
    char *tAln;
    int qStart;          /* Query start/end positions, in blast units */
    int qEnd;
    int tStart;          /* Target start/end position, in blast units */
    int tEnd;
    int qLetMult;        /* each letter counts as this many units */
    int tLetMult;
    int qCoordMult;      /* used to convert coordinates */
    int tCoordMult;
    int q2tBlkSizeMult;  /* convert query block size to target block size */
    int countMult;       /* column count for match/mismatch */
    int alnStart;        /* start/end in alignment */
    int alnEnd;
};

static void blkInit(struct block *blk, int qStart, char *qAln, int tStart, char *tAln, unsigned flags)
/* initialize a block object */
{
ZeroVar(blk);
blk->qAln = qAln;
blk->tAln = tAln;

// initialize ends to starts for first call to nextUngappedBlk
blk->qEnd = qStart;
blk->tEnd = tStart;

if (flags & cnvNucCoords)
    {
    // DNA/DNA PSL from protein/DNA align
    assert(flags & tblastn);
    blk->qLetMult = 1;
    blk->qCoordMult = 3;
    blk->tLetMult = 3;
    blk->tCoordMult = 1;
    blk->q2tBlkSizeMult = 3;
    blk->countMult = 3;
    }
else if (flags & tblastn)
    {
    // protein/DNA PSL
    blk->qLetMult = 1;
    blk->qCoordMult = 1;
    blk->tLetMult = 3;
    blk->tCoordMult = 1;
    blk->q2tBlkSizeMult = 3;
    blk->countMult = 1;
    }
else if (flags & tblastx)
    {
    blk->qLetMult = 3;
    blk->qCoordMult = 1;
    blk->tLetMult = 3;
    blk->tCoordMult = 1;
    blk->q2tBlkSizeMult = 1;
    blk->countMult = 3;
    }
else
    {
    blk->qLetMult = 1;
    blk->qCoordMult = 1;
    blk->tLetMult = 1;
    blk->tCoordMult = 1;
    blk->q2tBlkSizeMult = 1;
    blk->countMult = 1;
    }
}

static boolean isProteinSeqs(unsigned flags)
/* do the flags indicate a protein sequences in alignments */
{
return (flags & (blastp|blastx|tblastn|tblastx)) != 0;
}

static boolean nextUngappedBlk(struct block* blk)
/* Find the next ungapped block in a blast alignment, in [0..n) coords in mrna
 * space.  blk qStartNext and tStartNext should be initialize to 
 */
{
// initBlk set this up for first call, subsequent start where previous left off
blk->qStart = blk->qEnd;
blk->tStart = blk->tEnd;
blk->alnStart = blk->alnEnd;

/* find start of next aligned block */
char *qPtr = blk->qAln + blk->alnStart;
char *tPtr = blk->tAln + blk->alnStart;

while ((*qPtr != '\0') && (*tPtr != '\0') && ((*qPtr == '-') || (*tPtr == '-')))
    {
    if (*qPtr != '-')
        blk->qStart += blk->qLetMult;
    if (*tPtr != '-')
        blk->tStart += blk->tLetMult;
    qPtr++;
    tPtr++;
    blk->alnStart++;
    }
blk->qEnd = blk->qStart;
blk->tEnd = blk->tStart;
blk->alnEnd = blk->alnStart;

if ((*qPtr == '\0') || (*tPtr == '\0'))
    {
    assert((*qPtr == '\0') && (*tPtr == '\0'));
    return FALSE;  /* no more */
    }

/* find end of aligned block */
while ((*qPtr != '\0') && (*tPtr != '\0')
       && (*qPtr != '-') && (*tPtr != '-'))
    {
    blk->qEnd += blk->qLetMult;
    blk->tEnd += blk->tLetMult;
    qPtr++;
    tPtr++;
    blk->alnEnd++;
    }

// sanity test for blast blocks being the same length after conversion to the same units
assert((blk->tLetMult * (blk->qEnd - blk->qStart)) == (blk->qLetMult * (blk->tEnd - blk->tStart)));
return TRUE;
}

static void addUngappedBlock(struct psl* psl, int* pslSpace, struct block* blk, unsigned flags)
/* add the next  ungapped block to a psl */
{
unsigned newIBlk = psl->blockCount;
unsigned blkSize = blk->qEnd - blk->qStart;  // uses query size so protein psl is right
if (newIBlk >= *pslSpace)
    pslGrow(psl, pslSpace);
psl->qStarts[newIBlk] = blk->qCoordMult * blk->qStart;
psl->tStarts[newIBlk] = blk->tCoordMult * blk->tStart;
psl->blockSizes[newIBlk] = blk->qCoordMult * blkSize;

/* keep bounds current */
psl->qStart = psl->qStarts[0];
psl->qEnd = psl->qStarts[newIBlk] + (blk->qCoordMult * blkSize);
if (psl->strand[0] == '-')
    reverseIntRange(&psl->qStart, &psl->qEnd, psl->qSize);
psl->tStart = psl->tStarts[0];
psl->tEnd = psl->tStarts[newIBlk] + (blk->q2tBlkSizeMult * blkSize);
if (psl->strand[1] == '-')
    reverseIntRange(&psl->tStart, &psl->tEnd, psl->tSize);

if (flags & bldPslx)
    {
    psl->qSequence[newIBlk] = cloneStringZ(blk->qAln + blk->alnStart, blkSize);
    psl->tSequence[newIBlk] = cloneStringZ(blk->tAln + blk->alnStart, blkSize);
    }
psl->blockCount++;
}

static void countMatches(struct psl* psl, struct block* blk, unsigned flags)
/* update the PSL match/mismatch after adding a block. */
{
int i, alnLen = (blk->alnEnd - blk->alnStart);
char *qPtr = blk->qAln + blk->alnStart;
char *tPtr = blk->tAln + blk->alnStart;
boolean isProt = isProteinSeqs(flags);
for (i = 0; i < alnLen; i++, qPtr++, tPtr++)
    {
    if ((!isProt && ((*qPtr == 'N') || (*tPtr == 'N')))
        || (isProt && ((*qPtr == 'X') || (*tPtr == 'X'))))
        psl->repMatch += blk->countMult;
    else if (*qPtr == *tPtr)
        psl->match += blk->countMult;
    else
        psl->misMatch += blk->countMult;
    }
}

static void hspToBlocks(struct psl *psl, int *pslSpace, struct block *blk, unsigned flags)
/* build PSl blocks from an HSP */
{
/* fill in ungapped blocks */
while (nextUngappedBlk(blk))
    {
    addUngappedBlock(psl, pslSpace, blk, flags);
    pslComputeInsertCounts(psl);
    countMatches(psl, blk, flags);
    }
pslComputeInsertCounts(psl);
assert(blk->qStart == blk->qEnd);
assert(blk->tStart == blk->tEnd);
}

static void makeUntranslated(struct psl* psl)
/* convert a PSL so it is in the untranslated form produced by blat */
{
if (psl->strand[1] == '-')
    {
    /* swap around blocks so it's query that is reversed */
    int i;
    for (i = 0; i < psl->blockCount; i++)
        {
        psl->tStarts[i] = psl->tSize - (psl->tStarts[i] + psl->blockSizes[i]);
        psl->qStarts[i] = psl->qSize - (psl->qStarts[i] + psl->blockSizes[i]);
        }
    reverseUnsigned(psl->tStarts, psl->blockCount);
    reverseUnsigned(psl->qStarts, psl->blockCount);
    reverseUnsigned(psl->blockSizes, psl->blockCount);

    /* fix strand, +- now -, -- now + */
    psl->strand[0] = (psl->strand[0] == '+') ? '-' : '+';
    }
psl->strand[1] = '\0';
}

static void finishPsl(struct psl* psl, unsigned flags)
/* put finishing touches on a psl */
{
if ((flags & tblastn) == 0)
    makeUntranslated(psl);
}

struct psl *pslBuildFromHsp(char *qName, int qSize, int qStart, int qEnd, char qStrand, char *qAln,
                            char *tName, int tSize, int tStart, int tEnd, char tStrand, char *tAln,
                            unsigned flags)
/* construct a new psl from an HSP.  Chaining is left to other programs. */
{
if ((flags & tblastx) && (flags & bldPslx))
    errAbort("can't convert TBLASTX to PSLX");

struct block blk;
blkInit(&blk, qStart, qAln, tStart, tAln, flags);

// construct psl
int pslSpace = 8;
char strand[3];
safef(strand, sizeof(strand), "%c%c", qStrand, tStrand);
struct psl *psl = pslNew(qName, blk.qCoordMult * qSize, blk.qCoordMult * qStart, blk.qCoordMult * qEnd,
                         tName, blk.tCoordMult * tSize, blk.tCoordMult * tStart, blk.tCoordMult * tEnd,
                         strand, pslSpace, ((flags & bldPslx) ? PSL_XA_FORMAT : 0));

// fill in psl
hspToBlocks(psl, &pslSpace, &blk, flags);
finishPsl(psl, flags);
return psl;
}

FILE *pslBuildScoresOpen(char *scoreFile, boolean inclDefs)
/* open score file and write headers */
{
FILE *fh = mustOpen(scoreFile, "w");
fputs("#strand\tqName\tqStart\tqEnd\ttName\ttStart\ttEnd\tbitScore\teVal", fh);
if (inclDefs)
    fputs("\tqDef\ttDef", fh);
fputc('\n', fh);
return fh;
}

static void writeBasicScores(FILE* scoreFh, struct psl *psl, double bitScore, double eValue)
/* write first part of row */
{
fprintf(scoreFh, "%s\t%s\t%d\t%d\t%s\t%d\t%d\t%g\t%g", psl->strand,
        psl->qName, psl->qStart, psl->qEnd, psl->tName, psl->tStart, psl->tEnd,
        bitScore, eValue);
}

void pslBuildScoresWrite(FILE* scoreFh, struct psl *psl, double bitScore, double eValue)
/* write scores for a PSL */
{
writeBasicScores(scoreFh, psl, bitScore, eValue);
fputc('\n', scoreFh);
}

void pslBuildScoresWriteWithDefs(FILE* scoreFh, struct psl *psl, double bitScore, double eValue, char *qDef, char *tDef)
/* write scores and definitions for a PSL */
{
writeBasicScores(scoreFh, psl, bitScore, eValue);
fprintf(scoreFh, "\t%s\t%s\n", qDef, tDef);
}
