/* regChromiaMergeWindows - Merge adjacent identically labeled windows in BED file generated by 
 * Chromia.. */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"


int scoreColumn = 4;
int labelColumn = 3;
int maxGap = 1200;
double minSumScore = 10.0;
double scoreNormFactor = 10000.0;
double inNoiseThreshold = 0.0;

void usage()
/* Explain usage and exit. */
{
errAbort(
  "regChromiaMergeWindows - Merge adjacent identically labeled windows in BED file \n"
  "generated by Chromia.\n"
  "usage:\n"
  "   regChromiaMergeWindows chromiaInput output.bed\n"
  "where chromiaInput is tab-separated with the following columns:\n"
  "   <chrom> <start> <end> <label> <score>\n"
  "options:\n"
  "   -maxGap=N - maximum number of bases between merged windows.  Default %d\n"
  "   -minSumScore=N.N - minimum sum of scores in region to output. Default %g\n"
  "   -scoreNormFactor=N.N - normalization factor to get scores in 0-1000. Default %g\n"
  "   -inNoiseThreshold=N.N - input lines scoring below this threshold are ignored\n"
  , maxGap, minSumScore, scoreNormFactor
  );
}

static struct optionSpec options[] = {
   {"maxGap", OPTION_INT},
   {"minSumScore", OPTION_DOUBLE},
   {"scoreNormFactor", OPTION_DOUBLE},
   {"inNoiseThreshold", OPTION_INT},
   {NULL, 0},
};


void outputRegion(FILE *f, char *chrom, int start, int end, char *label, double sumOfScores)
/* Output region to file. */
{
if (sumOfScores > minSumScore)
    fprintf(f, "%s\t%d\t%d\t%s\t%d\n", chrom, start, end, label, 
    	round(scoreNormFactor*sumOfScores/(end-start)));
}

void regChromiaMergeWindows(char *input, char *output)
/* regChromiaMergeWindows - Merge adjacent identically labeled windows in BED file generated 
 * by Chromia.. */
{
struct lineFile *lf = lineFileOpen(input, TRUE);
char *row[32];
int rowSize = 0;
FILE *f = mustOpen(output, "w");
char lastLabel[128];
lastLabel[0] = 0;
char lastChrom[128];
lastChrom[0] = 0;
int lastChromStart = 0, lastChromEnd = 0;
int regionStart = 0, regionEnd = 0;
double sumOfScores = 0.0;

for (;;)
    {
    /* Get next line chopped into words.  Break at end of file. Check to make sure
     * all lines have same number of words. */
    int thisRowSize = lineFileChop(lf, row);
    if (thisRowSize == 0)
        break;
    if (rowSize == 0)
        rowSize = thisRowSize;
    else if (rowSize != thisRowSize)
        {
	errAbort("First line of %s has %d words, but there are %d words in line %d",
		lf->fileName, rowSize, thisRowSize, lf->lineIx);
	}

    /* Convert row into local variables. */
    char *chrom = row[0];
    int chromStart = lineFileNeedNum(lf, row, 1);
    int chromEnd = lineFileNeedNum(lf, row, 2);
    char *label = row[labelColumn];
    double score = lineFileNeedDouble(lf, row, scoreColumn);
    
    /* Make sure file is sorted with no overlap.*/
    if (sameString(chrom, lastChrom))
        {
	int gapSize = chromStart - lastChromEnd;
	if (gapSize < 0)
	    {
	    if (chromStart < lastChromStart)
		errAbort("%s is not sorted. %s %d %d followed by %s %d %d line %d",
		    lf->fileName, lastChrom, lastChromStart, lastChromEnd,
		    chrom, chromStart, chromEnd, lf->lineIx);
	    else  
		errAbort("%s has overlaps. %s %d %d followed by %s %d %d line %d",
		    lf->fileName, lastChrom, lastChromStart, lastChromEnd,
		    chrom, chromStart, chromEnd, lf->lineIx);
	    }
	}

    /* Subtract noise threshold from score, and if not still positive just ignore line. */
    score -= inNoiseThreshold;
    if (score > 0)
	{
	/* See if we have entered a new region. */
	boolean newRegion = FALSE;
	if (sameString(chrom, lastChrom))
	    {
	    int gapSize = chromStart - lastChromEnd;
	    if (gapSize > maxGap)
		 newRegion = TRUE;
	    }
	else
	    newRegion = TRUE;
	if (!sameString(label, lastLabel))
	    newRegion = TRUE;
	    
	/* Got new region.  Output old region if any, and initialize new region. */
	if (newRegion)
	    {
	    if (regionStart != regionEnd)
		outputRegion(f, lastChrom, regionStart, regionEnd, lastLabel, sumOfScores);
	    regionStart = chromStart;
	    sumOfScores = 0;
	    }

	/* Update region. */
	regionEnd = chromEnd;
	sumOfScores += score;

	/* Keep track of this row so can compare it to next row. */
	safecpy(lastChrom, sizeof(lastChrom), chrom);
	safecpy(lastLabel, sizeof(lastLabel), label);
	lastChromStart = chromStart;
	lastChromEnd = chromEnd;
	}

    }
outputRegion(f, lastChrom, regionStart, regionEnd, lastLabel, sumOfScores);
carefulClose(&f);
lineFileClose(&lf);
}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
maxGap = optionInt("maxGap", maxGap);
minSumScore = optionDouble("minSumScore", minSumScore);
scoreNormFactor = optionDouble("scoreNormFactor", scoreNormFactor);
inNoiseThreshold = optionDouble("inNoiseThreshold", inNoiseThreshold);
if (argc != 3)
    usage();
regChromiaMergeWindows(argv[1], argv[2]);
return 0;
}
