/* peakCluster - cluster peak calls from different sources. Used by regCluster
 * and encodeMergeReplicates programs. */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "common.h"
#include "hash.h"
#include "linefile.h"
#include "localmem.h"
#include "sqlNum.h"
#include "obscure.h"
#include "peakCluster.h"
#include "rangeTree.h"

struct peakSource *peakSourceLoadAll(char *fileName, int dimCount)
/* Read file, parse it line by line and return list of peakSources. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
int rowSize = dimCount + 6;
char *row[rowSize];
struct peakSource *sourceList = NULL, *source;
while (lineFileNextRow(lf, row, rowSize))
    {
    /* Allocate struct and read in fixed fields. */
    AllocVar(source);
    source->dataSource = cloneString(row[0]);
    source->chromColIx = sqlUnsigned(row[1]);
    source->startColIx = sqlUnsigned(row[2]);
    source->endColIx = sqlUnsigned(row[3]);
    source->scoreColIx = sqlUnsigned(row[4]);
    source->normFactor = sqlDouble(row[5]);

    /* Read in dimension labels. */
    AllocArray(source->labels, dimCount);
    int i;
    for (i=0; i<dimCount; ++i)
        source->labels[i] = cloneString(row[i+6]);

    /* Calculate required columns. */
    int minColCount = max(source->chromColIx, source->startColIx);
    minColCount = max(minColCount, source->endColIx);
    minColCount = max(minColCount, source->scoreColIx);
    source->minColCount = minColCount + 1;
    slAddHead(&sourceList, source);
    }
lineFileClose(&lf);
slReverse(&sourceList);
return sourceList;
}

void peakClusterMakerAddFromSource(struct peakClusterMaker *maker, struct peakSource *source)
/* Read through data source and add items to it to rangeTrees in maker */
{
struct hash *chromHash = maker->chromHash;
struct lineFile *lf = lineFileOpen(source->dataSource, TRUE);
struct lm *lm = chromHash->lm;	/* Local memory pool - share with hash */
char *row[source->minColCount];
struct peakItem *item;
char *line;
while (lineFileNextReal(lf, &line))
    {
    char *asciiLine = lmCloneString(lm, line);
    int wordCount = chopByWhite(line, row, source->minColCount);
    lineFileExpectAtLeast(lf, source->minColCount, wordCount);
    char *chrom = row[source->chromColIx];
    struct hashEl *hel = hashLookup(chromHash, chrom);
    if (hel == NULL)
        {
	struct rbTree *tree = rangeTreeNewDetailed(lm, maker->stack);
	hel = hashAdd(chromHash, chrom, tree);
	}
    struct rbTree *tree = hel->val;
    lmAllocVar(lm, item);
    item->chrom = hel->name;
    item->chromStart = sqlUnsigned(row[source->startColIx]);
    item->chromEnd = sqlUnsigned(row[source->endColIx]);
    item->score = sqlDouble(row[source->scoreColIx]) * source->normFactor;
    if (item->score > 1000) item->score = 1000;
    item->source = source;
    item->asciiLine = asciiLine;
    rangeTreeAddValList(tree, item->chromStart, item->chromEnd, item);
    }
lineFileClose(&lf);
}

static void addCluster(struct lm *lm, struct peakItem *itemList, int start, int end,
	struct peakCluster **pList)
/* Make cluster of all items that overlap start/end, and put it on list. */
{
struct peakCluster *cluster;
lmAllocVar(lm, cluster);
double score = 0.0;
double maxSubScore = 0.0;
struct slRef  *refList = NULL, *ref;
struct peakItem *item;
for (item = itemList; item != NULL; item = item->next)
    {
    if (rangeIntersection(start, end, item->chromStart, item->chromEnd) > 0)
	{
	lmAllocVar(lm, ref);
	ref->val = item;
	slAddHead(&refList, ref);
	score += item->score;
	if (item->score > maxSubScore) maxSubScore = item->score;
	}
    }
slReverse(&refList);
cluster->chrom = itemList->chrom;
cluster->chromStart = start;
cluster->chromEnd = end;
cluster->itemRefList = refList;
cluster->score = score;
cluster->maxSubScore = maxSubScore;
slAddHead(pList, cluster);
}

struct peakCluster *peakClusterItems(struct lm *lm, struct peakItem *itemList, 
	double forceJoinScore, double weakLevel)
/* Convert a list of items to a list of clusters of items.  This may break up clusters that
 * have weakly linked parts. 
      [                ]
      AAAAAAAAAAAAAAAAAA 
       BBBBBB   DDDDDD
        CCCC     EEEE
   gets tranformed into
       [    ]   [    ]
      AAAAAAAAAAAAAAAAAA 
       BBBBBB   DDDDDD
        CCCC     EEEE
   The strategy is to build a rangeTree of coverage, which might look something like so:
      123333211123333211 
   then define cluster ends that exceed the minimum limit, which is either weakLevel
   (usually 10%) of the highest or forceJoinScore if weakLevel times the highest is 
   more than forceJoinScore.  This will go to something like so:
        [---]   [----]   
   Finally the items that are overlapping a cluster are assigned to it.  Note that this
   may mean that an item may be in multiple clusters.
        [ABC]   [ ADE]
 */
{
int easyMax = round(1.0/weakLevel);
int itemCount = slCount(itemList);
struct peakCluster *clusterList = NULL;
if (itemCount < easyMax)
    {
    struct peakItem *item = itemList;
    int chromStart = item->chromStart;
    int chromEnd = item->chromEnd;
    for (item = item->next; item != NULL; item = item->next)
        {
	if (item->chromStart < chromStart) chromStart = item->chromStart;
	if (item->chromEnd > chromEnd) chromEnd = item->chromEnd;
	}
    addCluster(lm, itemList, chromStart, chromEnd, &clusterList);
    }
else
    {
    /* Make up coverage tree. */
    struct rbTree *covTree = rangeTreeNew();
    struct peakItem *item;
    for (item = itemList; item != NULL; item = item->next)
	rangeTreeAddToCoverageDepth(covTree, item->chromStart, item->chromEnd);
    struct range *range, *rangeList = rangeTreeList(covTree);

    /* Figure out maximum coverage. */
    int maxCov = 0;
    for (range = rangeList; range != NULL; range = range->next)
        {
	int cov = ptToInt(range->val);
	if (cov > maxCov) maxCov = cov;
	}

    /* Figure coverage threshold. */
    int threshold = round(maxCov * weakLevel);
    if (threshold > forceJoinScore-1) threshold = forceJoinScore-1;

    /* Loop through emitting sections over threshold as clusters */
    boolean inRange = FALSE;
    boolean start = 0, end = 0;
    for (range = rangeList; range != NULL; range = range->next)
        {
	int cov = ptToInt(range->val);
	if (cov > threshold)
	    {
	    if (inRange)
	       end = range->end;
	    else
	       {
	       inRange = TRUE;
	       start = range->start;
	       end = range->end;
	       }
	    }
	else
	    {
	    if (inRange)
		{
		addCluster(lm, itemList, start, end, &clusterList);
		inRange = FALSE;
		}
	    }
	}
    if (inRange)
        addCluster(lm, itemList, start, end, &clusterList);
    }
slReverse(&clusterList);
return clusterList;
}

struct peakClusterMaker *peakClusterMakerNew()
/* Return a new peakClusterMaker. */
{
struct peakClusterMaker *maker;
AllocVar(maker);
maker->chromHash = hashNew(0);
return maker;
}

void peakClusterMakerFree(struct peakClusterMaker **pMaker)
/* Free up a peakClusterMaker. */
{
struct peakClusterMaker *maker = *pMaker;
if (maker != NULL)
    {
    hashFree(&maker->chromHash);
    freez(pMaker);
    }
}

static int cmpChromEls(const void *va, const void *vb)
/* Compare to sort based on query start. */
{
const struct hashEl *a = *((struct hashEl **)va);
const struct hashEl *b = *((struct hashEl **)vb);
return cmpWordsWithEmbeddedNumbers(a->name, b->name);
}

struct hashEl *peakClusterMakerChromList(struct peakClusterMaker *maker)
/* Return list of chromosomes.  In hashEl format where the hashEl val is
 * a rangeTree filled with items. */
{
struct hashEl *chromList =  hashElListHash(maker->chromHash);
slSort(&chromList, cmpChromEls);
return chromList;
}

