/* clusterPsl - Make clusters of mRNA and ESTs.  Optionally turn clusters into
 * graphs. */
#include "common.h"
#include "memalloc.h"
#include "linefile.h"
#include "hash.h"
#include "binRange.h"
#include "options.h"
#include "psl.h"
#include "rangeTree.h"
#include "geneGraph.h"
#include "altGraph.h"
#include "nib.h"
#include "twoBit.h"
#include "nibTwo.h"


int maxMergeGap = 5;
char *prefix = "c";

void usage()
/* Explain usage and exit. */
{
errAbort(
  "clusterPsl - Make clusters of mRNA aligments\n"
  "usage:\n"
  "   clusterPsl input output.bed\n"
  "where input is a psl file (not necessarily sorted)\n"
  "      output is a text file full of cluster info\n"
  "options:\n"
  "   -verbose=2 Make output more verbose.\n"
  "   -agx=output.agx - Create splicing graphs. Note this is usually done\n"
  "                     with txBedToGraph these days.\n"
  "   -dna=db.2bit - DNA - two bit file or nib dir.\n"
  "   -maxMergeGap=N Merge blocks separated by no more than this. Default %d\n"
  "   -prefix=name - Prefix to add before number in bed name. Default %s\n"
      , maxMergeGap, prefix
  );
}

static struct optionSpec options[] = {
   {"agx", OPTION_STRING},
   {"dna", OPTION_STRING},
   {"maxMergeGap", OPTION_INT},
   {"prefix", OPTION_STRING},
   {NULL, 0},
};


struct pslCluster
/* A cluster of overlapping (at the block level on the same strand)
 * alignments. */
    {
    struct pslCluster *next;
    int tStart,tEnd;
    struct psl *pslList;
    struct rbTree *exonTree;
    };

void pslClusterFree(struct pslCluster **pCluster)
/* Free up memory associated with cluster. */
{
struct pslCluster *cluster = *pCluster;
if (cluster != NULL)
    {
    rbTreeFree(&cluster->exonTree);
    pslFreeList(&cluster->pslList);
    freez(pCluster);
    }
}

void pslClusterFreeList(struct pslCluster **pList)
/* Free a list of dynamically allocated pslCluster's */
{
struct pslCluster *el, *next;

for (el = *pList; el != NULL; el = next)
    {
    next = el->next;
    pslClusterFree(&el);
    }
*pList = NULL;
}

struct pslCluster *pslClusterNew()
/* Create cluster around a single alignment. */
{
struct pslCluster *cluster;
AllocVar(cluster);
cluster->exonTree = rangeTreeNew();
cluster->tStart = BIGNUM;
cluster->tEnd = -BIGNUM;
return cluster;
}


void pslClusterAdd(struct pslCluster *cluster, struct psl *psl)
/* Add new psl to cluster.  Note, uses psl->next to put psl on list
 * inside of cluster. */
{
cluster->tStart = min(psl->tStart, cluster->tStart);
cluster->tEnd = max(psl->tEnd, cluster->tEnd);
int block, blockCount = psl->blockCount;
for (block = 0; block < blockCount; ++block)
    {
    int blockSize = psl->blockSizes[block];
    int tStart = psl->tStarts[block];
    int tEnd = tStart + blockSize;
    rangeTreeAdd(cluster->exonTree, tStart, tEnd);
    }
slAddHead(&cluster->pslList, psl);
}

void pslClusterMerge(struct pslCluster *a, struct pslCluster **pB)
/* Merge b into a.  Destroys b. */
{
struct pslCluster *b = *pB;
a->pslList = slCat(a->pslList, b->pslList);
b->pslList = NULL;
a->tStart = min(a->tStart, b->tStart);
a->tEnd = max(a->tEnd, b->tEnd);
struct range *range;
for (range = rangeTreeList(b->exonTree); range != NULL; range = range->next)
    rangeTreeAdd(a->exonTree, range->start, range->end);
pslClusterFree(pB);
}

boolean pslIntersectsCluster(struct pslCluster *cluster, struct psl *psl)
/* Return TRUE if any block in psl intersects with cluster. */
{
int block, blockCount = psl->blockCount;
for (block = 0; block < blockCount; ++block)
    {
    struct range tempR;
    int start = psl->tStarts[block];
    int end = start + psl->blockSizes[block];
    tempR.start = start;
    tempR.end = end;
    if (rbTreeFind(cluster->exonTree, &tempR))
        return TRUE;
    }
return FALSE;
}

void pslIntoBinsOfClusters(struct psl *psl, struct binKeeper *bins)
/* Add projection onto target into a binKeeper full of pslClusters.
 * This will create and merge clusters as need be. */
{
/* Create new cluster around psl. */
struct pslCluster *newCluster = pslClusterNew();
pslClusterAdd(newCluster, psl);

/* Merge in any existing overlapping clusters.. */
struct binElement *bel, *belList = binKeeperFind(bins, psl->tStart, psl->tEnd);
for (bel = belList; bel != NULL; bel = bel->next)
    {
    struct pslCluster *oldCluster = bel->val;
    if (pslIntersectsCluster(oldCluster, psl))
	{
	binKeeperRemove(bins, oldCluster->tStart, oldCluster->tEnd, oldCluster);
	pslClusterMerge(oldCluster, &newCluster);
	newCluster = oldCluster;
	}
    }
slFreeList(&belList);

/* Add to binKeeper. */
binKeeperAdd(bins, newCluster->tStart, newCluster->tEnd, newCluster);
}

struct pslCluster *clusterPslsOnChrom(struct psl *pslList)
/* Make clusters of overlapping alignments on a single chromosome. 
 * Return a list of such clusters. */
{
if (pslList == NULL)
    return NULL;

/* Creat binKeeper full of clusters. */
struct psl *psl, *next = NULL;
struct binKeeper *bins = binKeeperNew(0, pslList->tSize);
for (psl = pslList; psl != NULL; psl = next)
    {
    next = psl->next;
    pslIntoBinsOfClusters(psl, bins);
    }

/* Convert from binKeeper of clusters to simple list of clusters. */
struct pslCluster *clusterList = NULL;
struct binElement *binList, *bin;
binList = binKeeperFindAll(bins);
for (bin = binList; bin != NULL; bin = bin->next)
    {
    struct pslCluster *cluster = bin->val;
    slAddHead(&clusterList, cluster);
    }
slReverse(&clusterList);

/* Clean up and go home. */
binKeeperFree(&bins);
return clusterList;
}



void writeClusterGraph(struct pslCluster *cluster, struct dnaSeq *chrom, 
	char *chromName, FILE *f)
/* Create a geneGraph out of cluster, and write it to file. */
{
struct ggMrnaAli *maList = pslListToGgMrnaAliList(cluster->pslList, 
	chromName, 0, chrom->size, chrom, maxMergeGap);
struct ggMrnaInput *ci = ggMrnaInputFromAlignments(maList, chrom);
struct ggMrnaCluster *mcList = ggClusterMrna(ci);
struct ggMrnaCluster *mc;
for (mc = mcList; mc != NULL; mc = mc->next)
    {
    struct geneGraph *gg = ggGraphConsensusCluster(NULL, mc, ci, NULL, FALSE);
    struct altGraphX *ag = ggToAltGraphX(gg);
    if (ag != NULL)
	{
	static int id=0;
	char name[16];
	safef(name, sizeof(name), "a%d", ++id);
	freez(&ag->name);
	ag->name = name;
	altGraphXTabOut(ag, f);
	ag->name = NULL;
	}
    altGraphXFree(&ag);
    freeGeneGraph(&gg);
    }
ggFreeMrnaClusterList(&mcList);
ggMrnaAliFreeList(&maList);
freez(&ci);	/* Note - DON'T call freeGgMrnaInput, it'll free chrom! */
}

void pslClusterWrite(struct pslCluster *cluster, FILE *f)
/* Write out info on cluster to file as a bed12 + 2 */
{
static int id=0;
int pslCount = slCount(cluster->pslList);
struct rbTree *exonTree = cluster->exonTree;
struct range *range, *rangeList = rangeTreeList(exonTree);
int chromStart = cluster->tStart;
int chromEnd = cluster->tEnd;

assert(cluster->tStart == rangeList->start);
fprintf(f, "%s\t%d\t%d\t", cluster->pslList->tName, chromStart, chromEnd);
fprintf(f, "%s%d\t", prefix, ++id);
fprintf(f, "0\t");	/* score field */
fprintf(f, "%s\t", cluster->pslList->strand);
fprintf(f, "%d\t", chromStart);	/* thick start */
fprintf(f, "%d\t", chromEnd);	/* thick end */
fprintf(f, "0\t");	/* itemRgb*/
fprintf(f, "%d\t", exonTree->n);	/* Block count */
for (range = rangeList; range != NULL; range = range->next)
    fprintf(f, "%d,", range->end - range->start);
fprintf(f, "\t");
for (range = rangeList; range != NULL; range = range->next)
    fprintf(f, "%d,", range->start - chromStart);
fprintf(f, "\t");
fprintf(f, "%d\t", pslCount);
struct psl *psl;
for (psl = cluster->pslList; psl != NULL; psl = psl->next)
    {
    fprintf(f, "%s,", psl->qName);
    };
fprintf(f, "\n");
}

void clusterPsl(char *pslName, char *clusterOut, char *dnaSource, char *agxOut)
/* Transform a file full of psl's into a file full of rnaClusters */
{
FILE *fCluster = mustOpen(clusterOut, "w");

/* Set up DNA seqence accesser */
struct nibTwoCache *nibTwo = NULL;
if (dnaSource != NULL)
    nibTwo = nibTwoCacheNew(dnaSource);

FILE *fAgx = NULL;
if (agxOut != NULL)
    fAgx = mustOpen(agxOut, "w");
  
struct dnaSeq *chrom = NULL;

/* Load in list and sort by chromosomes. */
struct psl *pslList = pslLoadAll(pslName);
slSort(&pslList, pslCmpTargetAndStrand);

/* Go through input one chromosome at a time. */
struct psl *chromStart, *chromEnd = NULL;
char *oldChromName = cloneString("");
for (chromStart = pslList; chromStart != NULL; chromStart = chromEnd)
    {
    /* Find chromosome end. */
    char *chromName = chromStart->tName;
    char strand = chromStart->strand[0];
    struct psl *psl, *dummy, **endAddress = &dummy;
    for (psl = chromStart; psl != NULL; psl = psl->next)
        {
	if (!sameString(psl->tName, chromName))
	    break;
	if (psl->strand[0] != strand)
	    break;
	endAddress = &psl->next;
	}
    chromEnd = psl;

    /* Terminate list before next chromosome */
    *endAddress = NULL;

    /* Get chromosome sequence */
    verbose(1, "chrom %s %s\n", chromStart->tName, chromStart->strand); 
    if (nibTwo != NULL && !sameString(chromName, oldChromName))
        {
	dnaSeqFree(&chrom);
	chrom = nibTwoCacheSeq(nibTwo, chromName);
	toLowerN(chrom->dna, chrom->size);
	verbose(2, "Loaded %d bases in %s\n", chrom->size, chromName);
	}

    /* Create clusters. */
    struct pslCluster *clusterList = clusterPslsOnChrom(chromStart);

    /* Write clusters to file. */
    struct pslCluster *cluster;
    for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
	 {
         pslClusterWrite(cluster, fCluster);
	 if (fAgx != NULL)
	     writeClusterGraph(cluster, chrom, chromName, fAgx);
	 }

    freez(&oldChromName);
    oldChromName = cloneString(chromName);
    pslClusterFreeList(&clusterList);	/* Note free's psls as well! */
    }
nibTwoCacheFree(&nibTwo);
carefulClose(&fCluster);
carefulClose(&fAgx);
}


int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 3)
    usage();
if (optionExists("agx") && !optionExists("dna"))
   errAbort("Need to use dna option with agx option.");
maxMergeGap = optionInt("maxMergeGap", maxMergeGap);
prefix = optionVal("prefix", prefix);
clusterPsl(argv[1], argv[2], optionVal("dna", NULL), optionVal("agx", NULL));
return 0;
}
