/* gfOut - stuff to manage output for genoFind system -
 * currently supports psl, axt, blast, and wu-blast. 
 *
 * Copyright 2001-2003 Jim Kent.  All rights reserved. */

#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "dystring.h"
#include "dnautil.h"
#include "axt.h"
#include "maf.h"
#include "trans3.h"
#include "psl.h"
#include "genoFind.h"


struct pslxData
/* This is the data structure put in gfOutput.data for psl/pslx output. */
    {
    FILE *f;			/* Output file. */
    boolean saveSeq;		/* Save sequence too? */
    };

struct axtData
/* This is the data structure put in gfOutput.data for axt/blast output. */
    {
    struct axtBundle *bundleList;	/* List of bundles. */
    char *databaseName;		/* Just used for blast. */
    int databaseSeqCount;	/* Just used for blast. */
    double databaseLetters; /* Just used for blast. */
    char *blastType;	/* 'blast' or 'wublast' or 'xml' or 'blast8' or 'blast9' */
    double minIdentity; /* Just used for blast. */
    };

static void savePslx(char *chromName, int chromSize, int chromOffset,
	struct ffAli *ali, struct dnaSeq *tSeq, struct dnaSeq *qSeq, 
	boolean isRc, enum ffStringency stringency, int minMatch, FILE *f,
	struct hash *t3Hash, boolean reportTargetStrand, boolean targetIsRc,
	struct hash *maskHash, int minIdentity, 
	boolean qIsProt, boolean tIsProt, boolean saveSeq)
/* Analyse one alignment and if it looks good enough write it out to file in
 * psl format (or pslX format - if saveSeq is TRUE).  */
{
/* This function was stolen from psLayout and slightly extensively to cope
 * with protein as well as DNA aligments. */
struct ffAli *ff, *nextFf;
struct ffAli *right = ffRightmost(ali);
DNA *needle = qSeq->dna;
DNA *hay = tSeq->dna;
int nStart = ali->nStart - needle;
int nEnd = right->nEnd - needle;
int hStart, hEnd; 
int nInsertBaseCount = 0;
int nInsertCount = 0;
int hInsertBaseCount = 0;
int hInsertCount = 0;
int matchCount = 0;
int mismatchCount = 0;
int repMatch = 0;
int countNs = 0;
DNA *np, *hp, n, h;
int blockSize;
int i;
struct trans3 *t3List = NULL;
Bits *maskBits = NULL;

if (maskHash != NULL)
    maskBits = hashMustFindVal(maskHash, tSeq->name);
if (t3Hash != NULL)
    t3List = hashMustFindVal(t3Hash, tSeq->name);
hStart = trans3GenoPos(ali->hStart, tSeq, t3List, FALSE) + chromOffset;
hEnd = trans3GenoPos(right->hEnd, tSeq, t3List, TRUE) + chromOffset;

/* Count up matches, mismatches, inserts, etc. */
for (ff = ali; ff != NULL; ff = nextFf)
    {
    nextFf = ff->right;
    blockSize = ff->nEnd - ff->nStart;
    np = ff->nStart;
    hp = ff->hStart;
    for (i=0; i<blockSize; ++i)
	{
	n = np[i];
	h = hp[i];
	if (n == 'n' || h == 'n')
	    ++countNs;
	else
	    {
	    if (n == h)
		{
		if (maskBits != NULL)
		    {
		    int seqOff = hp + i - hay;
		    if (bitReadOne(maskBits, seqOff))
		        ++repMatch;
		    else
		        ++matchCount;
		    }
		else
		    ++matchCount;
		}
	    else
		++mismatchCount;
	    }
	}
    if (nextFf != NULL)
	{
	int nhStart = trans3GenoPos(nextFf->hStart, tSeq, t3List, FALSE) + chromOffset;
	int ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE) + chromOffset;
	int hGap = nhStart - ohEnd;
	int nGap = nextFf->nStart - ff->nEnd;

	if (nGap != 0)
	    {
	    ++nInsertCount;
	    nInsertBaseCount += nGap;
	    }
	if (hGap != 0)
	    {
	    ++hInsertCount;
	    hInsertBaseCount += hGap;
	    }
	}
    }


/* See if it looks good enough to output, and output. */
/* if (score >= minMatch) Moved to higher level */
    {
    int gaps = nInsertCount + (stringency == ffCdna ? 0: hInsertCount);
    if ((matchCount + repMatch + mismatchCount) > 0)
	{
	int id = roundingScale(1000, matchCount + repMatch - 2*gaps, matchCount + repMatch + mismatchCount);
	if (id >= minIdentity)
	    {
	    if (isRc)
		{
		int temp;
		int oSize = qSeq->size;
		temp = nStart;
		nStart = oSize - nEnd;
		nEnd = oSize - temp;
		}
	    if (targetIsRc)
		{
		int temp;
		temp = hStart;
		hStart = chromSize - hEnd;
		hEnd = chromSize - temp;
		}
	    fprintf(f, "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c",
		matchCount, mismatchCount, repMatch, countNs, nInsertCount, nInsertBaseCount, hInsertCount, hInsertBaseCount,
		(isRc ? '-' : '+'));
	    if (reportTargetStrand)
		fprintf(f, "%c", (targetIsRc ? '-' : '+') );
	    fprintf(f, "\t%s\t%d\t%d\t%d\t"
			 "%s\t%d\t%d\t%d\t%d\t",
		qSeq->name, qSeq->size, nStart, nEnd,
		chromName, chromSize, hStart, hEnd,
		ffAliCount(ali));
	    for (ff = ali; ff != NULL; ff = ff->right)
		fprintf(f, "%ld,", (long)(ff->nEnd - ff->nStart));
	    fprintf(f, "\t");
	    for (ff = ali; ff != NULL; ff = ff->right)
		fprintf(f, "%ld,", (long)(ff->nStart - needle));
	    fprintf(f, "\t");
	    for (ff = ali; ff != NULL; ff = ff->right)
		fprintf(f, "%d,", trans3GenoPos(ff->hStart, tSeq, t3List, FALSE) + chromOffset);
	    if (saveSeq)
		{
		fputc('\t', f);
		for (ff = ali; ff != NULL; ff = ff->right)
		    {
		    mustWrite(f, ff->nStart, ff->nEnd - ff->nStart);
		    fputc(',', f);
		    }
		fputc('\t', f);
		for (ff = ali; ff != NULL; ff = ff->right)
		    {
		    mustWrite(f, ff->hStart, ff->hEnd - ff->hStart);
		    fputc(',', f);
		    }
		}
	    fprintf(f, "\n");
	    if (ferror(f))
		{
		perror("");
		errAbort("Write error to .psl");
		}
	    }
	}
    }
}

static void pslOut(char *chromName, int chromSize, int chromOffset, struct ffAli *ali, 
	struct dnaSeq *tSeq, struct hash *t3Hash, struct dnaSeq *qSeq, 
	boolean qIsRc, boolean tIsRc, 
	enum ffStringency stringency, int minMatch, struct gfOutput *out)
/* Save psl for more complex alignments. */
{
struct pslxData *outForm = out->data;

savePslx(chromName, chromSize, chromOffset, ali, tSeq, qSeq,
    qIsRc, stringency, minMatch, outForm->f, t3Hash, 
    out->reportTargetStrand, tIsRc,
    out->maskHash, out->minGood, 
    out->qIsProt, out->tIsProt, outForm->saveSeq);
}

static struct ffAli *ffNextBreak(struct ffAli *ff, int maxInsert, 
	bioSeq *tSeq, struct trans3 *t3List)
/* Return ffAli after first gap in either sequence longer than maxInsert,
 * or after first gap in both sequences.  Return may legitimately
 * be NULL. */
{
struct ffAli *rt = ff->right;
int hGap, nGap;
int nhStart, ohEnd;
for (;;)
    {
    if (rt == NULL)
        break;
    nhStart = trans3GenoPos(rt->hStart, tSeq, t3List, FALSE);
    ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE);
    hGap = nhStart - ohEnd;
    nGap = rt->nStart - ff->nEnd;
    if (hGap != 0 && nGap != 0)
        break;
    if (hGap < 0 || nGap < 0)
        break;
    if (hGap > maxInsert || nGap > maxInsert)
        break;
    ff = rt;
    rt = ff->right;
    }
return rt;
}


static void saveAxtBundle(char *chromName, int chromSize, int chromOffset,
	struct ffAli *ali, 
	struct dnaSeq *tSeq, struct hash *t3Hash, struct dnaSeq *qSeq, 
	boolean qIsRc, boolean tIsRc, 
	enum ffStringency stringency, int minMatch, struct gfOutput *out)
/* Save alignment to axtBundle. */
{
struct axtData *ad = out->data;
struct ffAli *sAli, *eAli, *ff, *rt, *eFf = NULL;
struct axt *axt;
struct dyString *q = dyStringNew(1024), *t = dyStringNew(1024);
struct axtBundle *gab;
struct trans3 *t3List = NULL;

if (t3Hash != NULL)
    t3List = hashMustFindVal(t3Hash, tSeq->name);
AllocVar(gab);
gab->tSize = chromSize;
gab->qSize = qSeq->size;
for (sAli = ali; sAli != NULL; sAli = eAli)
    {
    eAli = ffNextBreak(sAli, 8, tSeq, t3List);
    dyStringClear(q);
    dyStringClear(t);
    for (ff = sAli; ff != eAli; ff = ff->right)
        {
	dyStringAppendN(q, ff->nStart, ff->nEnd - ff->nStart);
	dyStringAppendN(t, ff->hStart, ff->hEnd - ff->hStart);
	rt = ff->right;
	if (rt != eAli)
	    {
	    int nGap = rt->nStart - ff->nEnd;
	    int nhStart = trans3GenoPos(rt->hStart, tSeq, t3List, FALSE) 
	    	+ chromOffset;
	    int ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE) 
	    	+ chromOffset;
	    int hGap = nhStart - ohEnd;
	    int gap = max(nGap, hGap);
	    if (nGap < 0 || hGap < 0)
		{
	        errAbort("Negative gap size in %s vs %s", tSeq->name, qSeq->name);
		}
	    if (nGap == gap)
	        {
		dyStringAppendN(q, ff->nEnd, gap);
		dyStringAppendMultiC(t, '-', gap);
		}
	    else
	        {
		dyStringAppendN(t, ff->hEnd, gap);
		dyStringAppendMultiC(q, '-', gap);
		}
	    }
	eFf = ff;	/* Keep track of last block in bunch */
	}
    assert(t->stringSize == q->stringSize);
    AllocVar(axt);
    axt->qName = cloneString(qSeq->name);
    axt->qStart = sAli->nStart - qSeq->dna;
    axt->qEnd = eFf->nEnd - qSeq->dna;
    axt->qStrand = (qIsRc ? '-' : '+');
    axt->tName = cloneString(chromName);
    axt->tStart = trans3GenoPos(sAli->hStart, tSeq, t3List, FALSE) + chromOffset;
    axt->tEnd = trans3GenoPos(eFf->hEnd, tSeq, t3List, TRUE) + chromOffset;
    axt->tStrand = (tIsRc ? '-' : '+');
    axt->symCount = t->stringSize;
    axt->qSym = cloneString(q->string);
    axt->tSym = cloneString(t->string);
    axt->frame = trans3Frame(sAli->hStart, t3List);
    if (out->qIsProt)
	axt->score = axtScoreProteinDefault(axt);
    else 
	axt->score = axtScoreDnaDefault(axt);
    slAddHead(&gab->axtList, axt);
    }
slReverse(&gab->axtList);
dyStringFree(&q);
dyStringFree(&t);
slAddHead(&ad->bundleList, gab);
}

static void axtQueryOut(struct gfOutput *out, FILE *f)
/* Do axt oriented output - at end of processing query. */
{
struct axtData *aod = out->data;
struct axtBundle *gab;
for (gab = aod->bundleList; gab != NULL; gab = gab->next)
    {
    struct axt *axt;
    for (axt = gab->axtList; axt != NULL; axt = axt->next)
	axtWrite(axt, f);
    }
axtBundleFreeList(&aod->bundleList);
}

static void mafHead(struct gfOutput *out, FILE *f)
/* Write maf header. */
{
mafWriteStart(f, "blastz");
}

static void mafQueryOut(struct gfOutput *out, FILE *f)
/* Do axt oriented output - at end of processing query. */
{
struct axtData *aod = out->data;
struct axtBundle *gab;
for (gab = aod->bundleList; gab != NULL; gab = gab->next)
    {
    struct axt *axt;
    for (axt = gab->axtList; axt != NULL; axt = axt->next)
	{
	struct mafAli temp;
	mafFromAxtTemp(axt, gab->tSize, gab->qSize, &temp);
	mafWrite(f, &temp);
	}
    }
axtBundleFreeList(&aod->bundleList);
}

static int axtMatchCount(struct axt *axt)
/* Return matches. */
{
int matchCount = 0;
int i;
for (i=0; i<axt->symCount; ++i)
    {
    if (axt->qSym[i] == axt->tSym[i])
        ++matchCount;
    }
return matchCount;
}

static double axtIdRatio(struct axt *axt)
/* Return matches/total. */
{
int matchCount = axtMatchCount(axt);
return (double)matchCount/(double)axt->symCount;
}

static double axtListRatio(struct axt *axt)
/* Return matches/total. */
{
int matchCount = 0;
int symCount = 0;
for (; axt != NULL; axt = axt->next)
    {
    matchCount += axtMatchCount(axt);
    symCount += axt->symCount;
    }
return (double)matchCount/(double)symCount;
}

static void sim4QueryOut(struct gfOutput *out, FILE *f)
/* Do sim4-like output - at end of processing query. */
{
struct axtData *aod = out->data;
struct axtBundle *gab;
slReverse(&aod->bundleList);

for (gab = aod->bundleList; gab != NULL; gab = gab->next)
    {
    struct axt *axt = gab->axtList;
    // check minIdentity of the entire alignment
    int goodPpt = 1000 * axtListRatio(axt);
    if (!(goodPpt >= out->minGood))
	continue; 
    fprintf(f, "\n");
    fprintf(f, "seq1 = %s, %d bp\n", axt->qName, gab->qSize);
    fprintf(f, "seq2 = %s, %d bp\n", axt->tName, gab->tSize);
    fprintf(f, "\n");
    if (axt->qStrand == '-')
	fprintf(f, "(complement)\n");
    for (; axt != NULL; axt = axt->next)
	{
	fprintf(f, "%d-%d  ", axt->qStart+1, axt->qEnd);
	fprintf(f, "(%d-%d)   ", axt->tStart+1, axt->tEnd);
	fprintf(f, "%1.0f%% ", 100.0 * axtIdRatio(axt));
	if (axt->qStrand == '-')
	     fprintf(f, "<-\n");
	else
	     fprintf(f, "->\n");
	}
    }
axtBundleFreeList(&aod->bundleList);
}

static void blastQueryOut(struct gfOutput *out, FILE *f)
/* Output blast on query. */
{
struct axtData *aod = out->data;
axtBlastOut(aod->bundleList, out->queryIx, out->qIsProt, f,
	aod->databaseName, aod->databaseSeqCount, aod->databaseLetters,
	aod->blastType, "blat", aod->minIdentity);
axtBundleFreeList(&aod->bundleList);
}

static struct gfOutput *gfOutputInit(int goodPpt, boolean qIsProt, boolean tIsProt)
/* Allocate and initialize gfOutput.   You'll need to fill in 
 * gfOutput.out at a minimum, and likely gfOutput.data before
 * using though. */
{
struct gfOutput *out;
AllocVar(out);
out->minGood = goodPpt;
out->qIsProt = qIsProt;
out->tIsProt = tIsProt;
return out;
}

static void pslHead(struct gfOutput *out, FILE *f)
/* Write out psl head */
{
pslWriteHead(f);
}

struct gfOutput *gfOutputPsl(int goodPpt, 
	boolean qIsProt, boolean tIsProt, FILE *f, 
	boolean saveSeq, boolean noHead)
/* Set up psl/pslx output */
{
struct gfOutput *out = gfOutputInit(goodPpt, qIsProt, tIsProt);
struct pslxData *pslData;

AllocVar(pslData);
pslData->saveSeq = saveSeq;
pslData->f = f;
out->out = pslOut;
out->data = pslData;
if (!noHead)
    out->fileHead = pslHead;
return out;
}

struct gfOutput *gfOutputAxtMem(int goodPpt, boolean qIsProt, 
	boolean tIsProt)
/* Setup output for in memory axt output. */
{
struct gfOutput *out = gfOutputInit(goodPpt, qIsProt, tIsProt);
struct axtData *ad;
AllocVar(ad);
out->out = saveAxtBundle;
out->data = ad;
return out;
}

struct gfOutput *gfOutputAxt(int goodPpt, boolean qIsProt, 
	boolean tIsProt, FILE *f)
/* Setup output for axt format. */
{
struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt);
out->queryOut = axtQueryOut;
return out;
}

struct gfOutput *gfOutputMaf(int goodPpt, boolean qIsProt, 
	boolean tIsProt, FILE *f)
/* Setup output for maf format. */
{
struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt);
out->queryOut = mafQueryOut;
out->fileHead = mafHead;
return out;
}

struct gfOutput *gfOutputSim4(int goodPpt, boolean qIsProt, boolean tIsProt, 
	char *databaseName)
/* Set up to output in sim4 format. */
{
struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt);
struct axtData *ad = out->data;
if (qIsProt || tIsProt)
    errAbort("sim4 output is not available for protein query sequences.");
ad->databaseName = databaseName;
out->queryOut = sim4QueryOut;
return out;
}

struct gfOutput *gfOutputBlast(int goodPpt, 
	boolean qIsProt, boolean tIsProt, 
	char *databaseName, int databaseSeqCount, double databaseLetters,
	char *blastType, double minIdentity, FILE *f)
/* Setup output for blast format. */
{
struct gfOutput *out = gfOutputAxtMem(goodPpt, qIsProt, tIsProt);
struct axtData *ad = out->data;
ad->databaseName = databaseName;
ad->databaseSeqCount = databaseSeqCount;
ad->databaseLetters = databaseLetters;
ad->blastType = blastType;
ad->minIdentity = minIdentity;
out->queryOut = blastQueryOut;
return out;
}

struct gfOutput *gfOutputAny(char *format, 
	int goodPpt, boolean qIsProt, boolean tIsProt, 
	boolean noHead, char *databaseName,
	int databaseSeqCount, double databaseLetters,
	double minIdentity,
	FILE *f)
/* Initialize output in a variety of formats in file or memory. 
 * Parameters:
 *    format - either 'psl', 'pslx', 'sim4', 'blast', 'wublast', 'axt', 'xml'
 *    goodPpt - minimum identity of alignments to output in parts per thousand
 *    qIsProt - true if query side is a protein.
 *    tIsProt - true if target (database) side is a protein.
 *    noHead - if true suppress header in psl/pslx output.
 *    databaseName - name of database.  Only used for blast output
 *    databaseSeq - number of sequences in database - only for blast
 *    databaseLetters - number of bases/aas in database - only blast
 *    minIdentity - minimum identity - only blast
 *    FILE *f - file.  
 */
{
struct gfOutput *out = NULL;
static char *blastTypes[] = {"blast", "wublast", "blast8", "blast9", "xml"};

if (format == NULL)
    format = "psl";
if (sameWord(format, "psl"))
    out = gfOutputPsl(goodPpt, qIsProt, tIsProt, f, FALSE, noHead);
else if (sameWord(format, "pslx"))
    out = gfOutputPsl(goodPpt, qIsProt, tIsProt, f, TRUE, noHead);
else if (sameWord(format, "sim4"))
    out = gfOutputSim4(goodPpt, qIsProt, tIsProt, databaseName);
else if (stringArrayIx(format, blastTypes, ArraySize(blastTypes)) >= 0)
    out = gfOutputBlast(goodPpt, qIsProt, tIsProt, 
	    databaseName, databaseSeqCount, databaseLetters, format, 
	    minIdentity, f);
else if (sameWord(format, "axt"))
    out = gfOutputAxt(goodPpt, qIsProt, tIsProt, f);
else if (sameWord(format, "maf"))
    out = gfOutputMaf(goodPpt, qIsProt, tIsProt, f);
else
    errAbort("Unrecognized output format '%s'", format);
return out;
}

void gfOutputQuery(struct gfOutput *out, FILE *f)
/* Finish writing out results for a query to file. */
{
++out->queryIx;
if (out->queryOut != NULL)
    out->queryOut(out, f);
}

void gfOutputHead(struct gfOutput *out, FILE *f)
/* Write out header if any. */
{
if (out->fileHead != NULL)
    out->fileHead(out, f);
}

void gfOutputFree(struct gfOutput **pOut)
/* Free up output */
{
struct gfOutput *out = *pOut;
if (out != NULL)
    {
    freeMem(out->data);
    freez(pOut);
    }
}
