/* chain - pairwise alignments that can include gaps in both
 * sequences at once.  This is similar in many ways to psl,
 * but more suitable to cross species genomic comparisons. */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "dnaseq.h"
#include "dnautil.h"
#include "chain.h"


void chainFree(struct chain **pChain)
/* Free up a chain. */
{
struct chain *chain = *pChain;
if (chain != NULL)
    {
    slFreeList(&chain->blockList);
    freeMem(chain->qName);
    freeMem(chain->tName);
    freez(pChain);
    }
}

void chainFreeList(struct chain **pList)
/* Free a list of dynamically allocated chain's */
{
struct chain *el, *next;

for (el = *pList; el != NULL; el = next)
    {
    next = el->next;
    chainFree(&el);
    }
*pList = NULL;
}

int cBlockCmpQuery(const void *va, const void *vb)
/* Compare to sort based on query start. */
{
const struct cBlock *a = *((struct cBlock **)va);
const struct cBlock *b = *((struct cBlock **)vb);
return a->qStart - b->qStart;
}

int cBlockCmpTarget(const void *va, const void *vb)
/* Compare to sort based on target start. */
{
const struct cBlock *a = *((struct cBlock **)va);
const struct cBlock *b = *((struct cBlock **)vb);
return a->tStart - b->tStart;
}

int cBlockCmpBoth(const void *va, const void *vb)
/* Compare to sort based on query, then target. */
{
const struct cBlock *a = *((struct cBlock **)va);
const struct cBlock *b = *((struct cBlock **)vb);
int dif;
dif = a->qStart - b->qStart;
if (dif == 0)
    dif = a->tStart - b->tStart;
return dif;
}

int cBlockCmpDiagQuery(const void *va, const void *vb)
/* Compare to sort based on diagonal, then query. */
{
const struct cBlock *a = *((struct cBlock **)va);
const struct cBlock *b = *((struct cBlock **)vb);
int dif;
dif = (a->tStart - a->qStart) - (b->tStart - b->qStart);
if (dif == 0)
    dif = a->qStart - b->qStart;
return dif;
}

void cBlocksAddOffset(struct cBlock *blockList, int qOff, int tOff)
/* Add offsets to block list. */
{
struct cBlock *block;
for (block = blockList; block != NULL; block = block->next)
    {
    block->tStart += tOff;
    block->tEnd += tOff;
    block->qStart += qOff;
    block->qEnd += qOff;
    }
}

struct cBlock *cBlocksFromAliSym(int symCount, char *qSym, char *tSym, 
        int qPos, int tPos)
/* Convert alignment from alignment symbol (bases and dashes) format 
 * to a list of chain blocks.  The qPos and tPos correspond to the start
 * in the query and target sequences of the first letter in  qSym and tSym. */
{
struct cBlock *blockList = NULL, *block = NULL;
int i;
for (i=0; i<symCount; ++i)
    {
    if (qSym[i] == '-')
        {
        block = NULL;
        ++tPos;
        }
    else if (tSym[i] == '-')
        {
        block = NULL;
        ++qPos;
        }
    else
        {
        if (block == NULL)
            {
            AllocVar(block);
            slAddHead(&blockList, block);
            block->qStart = qPos;
            block->tStart = tPos;
            }
        block->qEnd = ++qPos;
        block->tEnd = ++tPos;
        }
    }
slReverse(&blockList);
return blockList;
}
        

int chainCmpScore(const void *va, const void *vb)
/* Compare to sort based on total score. */
{
const struct chain *a = *((struct chain **)va);
const struct chain *b = *((struct chain **)vb);
double diff = b->score - a->score;
if (diff < 0.0) return -1;
else if (diff > 0.0) return 1;
else return 0;
}

int chainCmpScoreDesc(const void *va, const void *vb)
/* Compare to sort based on total score descending. */
{
const struct chain *a = *((struct chain **)va);
const struct chain *b = *((struct chain **)vb);
double diff = a->score - b->score;
if (diff < 0.0) return -1;
else if (diff > 0.0) return 1;
else return 0;
}

int chainCmpTarget(const void *va, const void *vb)
/* Compare to sort based on target position. */
{
const struct chain *a = *((struct chain **)va);
const struct chain *b = *((struct chain **)vb);
int dif = strcmp(a->tName, b->tName);
if (dif == 0)
    dif = a->tStart - b->tStart;
return dif;
}

#define FACTOR 300000000

int chainCmpQuery(const void *va, const void *vb)
/* Compare to sort based on query chrom and target position. */
{
const struct chain *a = *((struct chain **)va);
const struct chain *b = *((struct chain **)vb);
int dif;                                                                        

dif = strcmp(a->qName, b->qName);                                               
if (dif == 0)                                                                   
    dif = a->qStart - b->qStart;                                                
return dif;                       
}

static int nextId = 1;

void chainIdSet(int id)
/* Set next chain id. */
{
nextId = id;
}

void chainIdReset()
/* Reset chain id. */
{
chainIdSet(1);
}

void chainIdNext(struct chain *chain)
/* Add an id to a chain if it doesn't have one already */
{
chain->id = nextId++;
}

void chainWriteHead(struct chain *chain, FILE *f)
/* Write chain before block/insert list. */
{
if (chain->id == 0)
    chainIdNext(chain);
fprintf(f, "chain %1.0f %s %d + %d %d %s %d %c %d %d %d\n", chain->score,
    chain->tName, chain->tSize, chain->tStart, chain->tEnd,
    chain->qName, chain->qSize, chain->qStrand, chain->qStart, chain->qEnd,
    chain->id);
}

void chainWrite(struct chain *chain, FILE *f)
/* Write out chain to file in usual format*/
{
struct cBlock *b, *nextB;

chainWriteHead(chain, f);
for (b = chain->blockList; b != NULL; b = nextB)
    {
    nextB = b->next;
    fprintf(f, "%d", b->qEnd - b->qStart);
    if (nextB != NULL)
	fprintf(f, "\t%d\t%d", 
		nextB->tStart - b->tEnd, nextB->qStart - b->qEnd);
    fputc('\n', f);
    }
fputc('\n', f);
}

void chainWriteAll(struct chain *chainList, FILE *f)
/* Write all chains to file. */
{
struct chain *chain;
for (chain = chainList; chain != NULL; chain = chain->next)
    chainWrite(chain, f);
}

void chainWriteLong(struct chain *chain, FILE *f)
/* Write out chain to file in longer format*/
{
struct cBlock *b, *nextB;

chainWriteHead(chain, f);
for (b = chain->blockList; b != NULL; b = nextB)
    {
    nextB = b->next;
    fprintf(f, "%d\t%d\t", b->tStart, b->qStart);
    fprintf(f, "%d", b->qEnd - b->qStart);
    if (nextB != NULL)
	fprintf(f, "\t%d\t%d", 
		nextB->tStart - b->tEnd, nextB->qStart - b->qEnd);
    fputc('\n', f);
    }
fputc('\n', f);
}

struct chain *chainReadChainLine(struct lineFile *lf)
/* Read line that starts with chain.  Allocate memory
 * and fill in values.  However don't read link lines. */
{
char *row[13];
int wordCount;
struct chain *chain;

wordCount = lineFileChop(lf, row);
if (wordCount == 0)
    return NULL;
if (wordCount < 12)
    errAbort("Expecting at least 12 words line %d of %s", 
    	lf->lineIx, lf->fileName);
if (!sameString(row[0], "chain"))
    errAbort("Expecting 'chain' line %d of %s", lf->lineIx, lf->fileName);
AllocVar(chain);
chain->score = atof(row[1]);
chain->tName = cloneString(row[2]);
chain->tSize = lineFileNeedNum(lf, row, 3);
if (wordCount >= 13)
    chain->id = lineFileNeedNum(lf, row, 12);
else
    chainIdNext(chain);

/* skip tStrand for now, always implicitly + */
chain->tStart = lineFileNeedNum(lf, row, 5);
chain->tEnd = lineFileNeedNum(lf, row, 6);
chain->qName = cloneString(row[7]);
chain->qSize = lineFileNeedNum(lf, row, 8);
chain->qStrand = row[9][0];
chain->qStart = lineFileNeedNum(lf, row, 10);
chain->qEnd = lineFileNeedNum(lf, row, 11);
if (chain->qStart >= chain->qEnd || chain->tStart >= chain->tEnd)
    errAbort("End before start line %d of %s", lf->lineIx, lf->fileName);
if (chain->qStart < 0 || chain->tStart < 0)
    errAbort("Start before zero line %d of %s", lf->lineIx, lf->fileName);
if (chain->qEnd > chain->qSize || chain->tEnd > chain->tSize)
    errAbort("Past end of sequence line %d of %s", lf->lineIx, lf->fileName);
return chain;
}

void chainReadBlocks(struct lineFile *lf, struct chain *chain)
/* Read in chain blocks from file. */
{
char *row[3];
int q,t;

/* Now read in block list. */
q = chain->qStart;
t = chain->tStart;
for (;;)
    {
    int wordCount = lineFileChop(lf, row);
    int size = lineFileNeedNum(lf, row, 0);
    struct cBlock *b;
    AllocVar(b);
    slAddHead(&chain->blockList, b);
    b->qStart = q;
    b->tStart = t;
    q += size;
    t += size;
    b->qEnd = q;
    b->tEnd = t;
    if (wordCount == 1)
        break;
    else if (wordCount < 3)
        errAbort("Expecting 1 or 3 words line %d of %s\n", 
		lf->lineIx, lf->fileName);
    t += lineFileNeedNum(lf, row, 1);
    q += lineFileNeedNum(lf, row, 2);
    }
if (q != chain->qEnd)
    errAbort("q end mismatch %d vs %d line %d of %s\n", 
    	q, chain->qEnd, lf->lineIx, lf->fileName);
if (t != chain->tEnd)
    errAbort("t end mismatch %d vs %d line %d of %s\n", 
    	t, chain->tEnd, lf->lineIx, lf->fileName);
slReverse(&chain->blockList);
}

struct chain *chainRead(struct lineFile *lf)
/* Read next chain from file.  Return NULL at EOF. 
 * Note that chain block scores are not filled in by
 * this. */
{
struct chain *chain = chainReadChainLine(lf);
if (chain != NULL)
    chainReadBlocks(lf, chain);
return chain;
}

void chainSwap(struct chain *chain)
/* Swap target and query side of chain. */
{
struct chain old = *chain;
struct cBlock *b;

/* Copy basic stuff swapping t and q. */
chain->qName = old.tName;
chain->tName = old.qName;
chain->qStart = old.tStart;
chain->qEnd = old.tEnd;
chain->tStart = old.qStart;
chain->tEnd = old.qEnd;
chain->qSize = old.tSize;
chain->tSize = old.qSize;

/* Swap t and q in blocks. */
for (b = chain->blockList; b != NULL; b = b->next)
    {
    struct cBlock old = *b;
    b->qStart = old.tStart;
    b->qEnd = old.tEnd;
    b->tStart = old.qStart;
    b->tEnd = old.qEnd;
    }

/* Cope with the minus strand. */
if (chain->qStrand == '-')
    {
    /* chain's are really set up so that the target is on the
     * + strand and the query is on the minus strand.
     * Therefore we need to reverse complement both 
     * strands while swapping to preserve this. */
    for (b = chain->blockList; b != NULL; b = b->next)
        {
	reverseIntRange(&b->tStart, &b->tEnd, chain->tSize);
	reverseIntRange(&b->qStart, &b->qEnd, chain->qSize);
	}
    reverseIntRange(&chain->tStart, &chain->tEnd, chain->tSize);
    reverseIntRange(&chain->qStart, &chain->qEnd, chain->qSize);
    slReverse(&chain->blockList);
    }
}

struct hash *chainReadUsedSwapLf(char *fileName, boolean swapQ, Bits *bits, struct lineFile *lf)
/* Read chains that are marked as used in the 
 * bits array (which may be NULL) into a hash keyed by id. */
{
char nameBuf[16];
struct hash *hash = hashNew(18);
struct chain *chain;
int usedCount = 0, count = 0;

while ((chain = chainRead(lf)) != NULL)
    {
    ++count;
    if (bits != NULL && !bitReadOne(bits, chain->id))
	{
	chainFree(&chain);
        continue;
	}
    safef(nameBuf, sizeof(nameBuf), "%x", chain->id);
    if (hashLookup(hash, nameBuf))
        errAbort("Duplicate chain %d ending line %d of %s", 
		chain->id, lf->lineIx, lf->fileName);
    if (swapQ)
        chainSwap(chain);
    hashAdd(hash, nameBuf, chain);
    ++usedCount;
    }
return hash;
}

struct hash *chainReadUsedSwap(char *fileName, boolean swapQ, Bits *bits)
/* Read chains that are marked as used in the 
 * bits array (which may be NULL) into a hash keyed by id. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct hash *hash = chainReadUsedSwapLf(fileName, swapQ, NULL, lf);
lineFileClose(&lf);
return hash;
}

struct hash *chainReadAllSwap(char *fileName, boolean swapQ)
/* Read chains into a hash keyed by id. */
{
return chainReadUsedSwap(fileName, swapQ, NULL);
}

struct hash *chainReadAll(char *fileName)
/* Read chains into a hash keyed by id. */
{
return chainReadAllSwap(fileName, FALSE);
}

struct hash *chainReadAllWithMeta(char *fileName, FILE *f)
/* Read chains into a hash keyed by id. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct hash *hash = NULL;
lineFileSetMetaDataOutput(lf, f);
hash = chainReadUsedSwapLf(fileName, FALSE, NULL, lf);
lineFileClose(&lf);
return hash;
}


struct chain *chainFind(struct hash *hash, int id)
/* Find chain in hash, return NULL if not found */
{
char nameBuf[16];
safef(nameBuf, sizeof(nameBuf), "%x", id);
return hashFindVal(hash, nameBuf);
}

struct chain *chainLookup(struct hash *hash, int id)
/* Find chain in hash. */
{
char nameBuf[16];
safef(nameBuf, sizeof(nameBuf), "%x", id);
return hashMustFindVal(hash, nameBuf);
}

void chainSubsetOnT(struct chain *chain, int subStart, int subEnd, 
    struct chain **retSubChain,  struct chain **retChainToFree)
/* Get subchain of chain bounded by subStart-subEnd on 
 * target side.  Return result in *retSubChain.  In some
 * cases this may be the original chain, in which case
 * *retChainToFree is NULL.  When done call chainFree on
 * *retChainToFree.  The score and id fields are not really
 * properly filled in. */
{
/* Find first relevant block. */
struct cBlock *firstBlock;
for (firstBlock = chain->blockList; firstBlock != NULL; firstBlock = firstBlock->next)
    {
    if (firstBlock->tEnd > subStart)
	break;
    }
chainFastSubsetOnT(chain, firstBlock, subStart, subEnd, retSubChain, retChainToFree);
}

void chainFastSubsetOnT(struct chain *chain, struct cBlock *firstBlock,
	int subStart, int subEnd, struct chain **retSubChain,  struct chain **retChainToFree)
/* Get subchain as in chainSubsetOnT. Pass in initial block that may
 * be known from some index to speed things up. */
{
struct chain *sub = NULL;
struct cBlock *oldB, *b, *bList = NULL;
int qStart = BIGNUM, qEnd = -BIGNUM;
int tStart = BIGNUM, tEnd = -BIGNUM;

/* Check for easy case. */
if (subStart <= chain->tStart && subEnd >= chain->tEnd)
    {
    *retSubChain = chain;
    *retChainToFree = NULL;
    return;
    }
/* Build new block list and calculate bounds. */
for (oldB = firstBlock; oldB != NULL; oldB = oldB->next)
    {
    if (oldB->tStart >= subEnd)
        break;
    b = CloneVar(oldB);
    if (b->tStart < subStart)
        {
	b->qStart += subStart - b->tStart;
	b->tStart = subStart;
	}
    if (b->tEnd > subEnd)
        {
	b->qEnd -= b->tEnd - subEnd;
	b->tEnd = subEnd;
	}
    slAddHead(&bList, b);
    if (qStart > b->qStart)
        qStart = b->qStart;
    if (qEnd < b->qEnd)
        qEnd = b->qEnd;
    if (tStart > b->tStart)
        tStart = b->tStart;
    if (tEnd < b->tEnd)
        tEnd = b->tEnd;
    }
slReverse(&bList);

/* Make new chain based on old. */
if (bList != NULL)
    {
    double sizeRatio;
    AllocVar(sub);
    sub->blockList = bList;
    sub->qName = cloneString(chain->qName);
    sub->qSize = chain->qSize;
    sub->qStrand = chain->qStrand;
    sub->qStart = qStart;
    sub->qEnd = qEnd;
    sub->tName = cloneString(chain->tName);
    sub->tSize = chain->tSize;
    sub->tStart = tStart;
    sub->tEnd = tEnd;
    sub->id = chain->id;

    /* Fake new score. */
    sizeRatio = (sub->tEnd - sub->tStart);
    sizeRatio /= (chain->tEnd - chain->tStart);
    sub->score = sizeRatio * chain->score;
    }
*retSubChain = *retChainToFree = sub;
}

void chainSubsetOnQ(struct chain *chain, int subStart, int subEnd, 
    struct chain **retSubChain,  struct chain **retChainToFree)
/* Get subchain of chain bounded by subStart-subEnd on 
 * query side.  Return result in *retSubChain.  In some
 * cases this may be the original chain, in which case
 * *retChainToFree is NULL.  When done call chainFree on
 * *retChainToFree.  The score and id fields are not really
 * properly filled in. */
{
struct chain *sub = NULL;
struct cBlock *oldB, *b, *bList = NULL;
int qStart = BIGNUM, qEnd = -BIGNUM;
int tStart = BIGNUM, tEnd = -BIGNUM;

/* Check for easy case. */
if (subStart <= chain->qStart && subEnd >= chain->qEnd)
    {
    *retSubChain = chain;
    *retChainToFree = NULL;
    return;
    }
/* Build new block list and calculate bounds. */
for (oldB = chain->blockList; oldB != NULL; oldB = oldB->next)
    {
    if (oldB->qEnd <= subStart)
        continue;
    if (oldB->qStart >= subEnd)
        break;
    b = CloneVar(oldB);
    if (b->qStart < subStart)
        {
	b->tStart += subStart - b->qStart;
	b->qStart = subStart;
	}
    if (b->qEnd > subEnd)
        {
	b->tEnd -= b->qEnd - subEnd;
	b->qEnd = subEnd;
	}
    slAddHead(&bList, b);
    if (tStart > b->tStart)
        tStart = b->tStart;
    if (tEnd < b->tEnd)
        tEnd = b->tEnd;
    if (qStart > b->qStart)
        qStart = b->qStart;
    if (qEnd < b->qEnd)
        qEnd = b->qEnd;
    }
slReverse(&bList);

/* Make new chain based on old. */
if (bList != NULL)
    {
    AllocVar(sub);
    sub->blockList = bList;
    sub->qName = cloneString(chain->qName);
    sub->qSize = chain->qSize;
    sub->qStrand = chain->qStrand;
    sub->qStart = qStart;
    sub->qEnd = qEnd;
    sub->tName = cloneString(chain->tName);
    sub->tSize = chain->tSize;
    sub->tStart = tStart;
    sub->tEnd = tEnd;
    sub->id = chain->id;
    }
*retSubChain = *retChainToFree = sub;
}

void chainRangeQPlusStrand(struct chain *chain, int *retQs, int *retQe)
/* Return range of bases covered by chain on q side on the plus
 * strand. */
{
if (chain == NULL)
    errAbort("chain::chainRangeQPlusStrand() - Can't find range in null query chain.");
if (chain->qStrand == '-')
    {
    *retQs = chain->qSize - chain->qEnd;
    *retQe = chain->qSize - chain->qStart;
    }
else
    {
    *retQs = chain->qStart;
    *retQe = chain->qEnd;
    }
}

