/* regCompanionChia - Analyse chia pet data against promoters and enhancers.. */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "basicBed.h"
#include "bigWig.h"
#include "obscure.h"
#include "localmem.h"
#include "verbose.h"
#include "bits.h"
#include "genomeRangeTree.h"


void usage()
/* Explain usage and exit. */
{
errAbort(
  "regCompanionChia - Analyse chia pet data against promoters and enhancers.\n"
  "usage:\n"
  "   regCompanionChia input.bed output.tab\n"
  "where input.bed has to be all two-block items.  There's alas 7 other input files\n"
  "which you'll have to recompile the C source to change.  The output is also complex\n"
  "a table with 23 fields:\n"
  "  1 - chromosome\n"
  "  2 - chromosome start - 0 based\n"
  "  3 - chromosome end - 1 based\n"
  "  4 - name - unique id for this pair\n"
  "  5 - score - 0-1000 browser style score\n"
  "  6 - always the word \"block1\"\n"
  "  7 - size of block 1\n"
  "  8 - average DNAse level in block 1\n"
  "  9 - average H3K4Me3 level in block 1\n"
  " 10 - average H3K27Ac level in block 1\n"
  " 11 - average H3K4Me1 level in block 1\n"
  " 12 - average CTCF level in block 1\n"
  " 13 - average coverage by +-500 base from txStart promoters in block 1\n"
  " 14 - average coverage by enhancer in block 1\n"
  " 15 - always the word \"block2\"\n"
  " 16 - size of block 2\n"
  " 17 - average DNAse level in block 2\n"
  " 18 - average H3K4Me3 level in block 2\n"
  " 19 - average H3K27Ac level in block 2\n"
  " 20 - average H3K4Me1 level in block 2\n"
  " 21 - average CTCF level in block 2\n"
  " 22 - average coverage by +-500 base from txStart promoters in block 2\n"
  " 23 - average coverage by enhancer in block 2\n"
  );
}

static struct optionSpec options[] = {
   {NULL, 0},
};

enum inType {itBlockedBed, itBigWig, itPromoterBed, itUnstrandedBed};

struct inInfo
/* Help classify input types. */
    {
    char *name;			/* Name of input */
    enum inType type;		/* Type of input */
    char *fileName;			/* Path of file with input */
    struct lineFile *lf;	/* Open lineFile if a bed */
    struct bbiFile *bbi;	/* Open bbiFile if a bigWig */
    double *out[2];	/* Average of signal for each of two blocks. */
    };

struct inInfo inArray[] =
    {
    {"pairs", itBlockedBed, NULL},
    // {"CHIApet", itBlockedBed, "/hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeGisChiaPet/wgEncodeGisChiaPetGm12878D000005628.bed.gz"},
    {"DNAse", itBigWig, "/hive/users/kent/regulate/companion/dnase/normalized/Gm12878.bw"},
    {"H3K4Me3", itBigWig, "/hive/users/kent/regulate/companion/histones/normalized/wgEncodeBroadHistoneGm12878H3k4me3NormSig.bw"},
    {"H3K27Ac", itBigWig, "/hive/users/kent/regulate/companion/histones/normalized/wgEncodeBroadHistoneGm12878H3k27acNormSig.bw"},
    {"H3K4Me1", itBigWig, "/hive/users/kent/regulate/companion/histones/normalized/wgEncodeBroadHistoneGm12878H3k4me1NormSig.bw"},
    {"CTCF", itBigWig, "/hive/users/kent/regulate/companion/histones/normalized/wgEncodeBroadHistoneGm12878CtcfNormSig.bw"},
    {"UCSCgenes", itPromoterBed, "/hive/data/genomes/hg19/bed/ucsc.12/ucscGenes.bed"},
    {"UCSCenh", itUnstrandedBed, "/hive/users/kent/regulate/companion/enhPicks/stringent/enh7.bed"},
    };

#ifdef SOON
#endif /* SOON */

void checkInputOpenFiles(struct inInfo *array, int count)
/* Make sure all of the input is there and of right format before going forward. Since
 * this is going to take a while we want to fail fast. */
{
int i;
for (i=0; i<count; ++i)
    {
    struct inInfo *in = &array[i];
    switch (in->type)
        {
	case itBigWig:
	    {
	    /* Just open and close, it will abort if any problem. */
	    in->bbi = bigWigFileOpen(in->fileName);
	    break;
	    }
	case itPromoterBed:
	case itUnstrandedBed:
	case itBlockedBed:
	    {
	    struct lineFile *lf = in->lf = lineFileOpen(in->fileName, TRUE);
	    char *line;
	    lineFileNeedNext(lf, &line, NULL);
	    char *dupe = cloneString(line);
	    char *row[256];
	    int wordCount = chopLine(dupe, row);
	    struct bed *bed = NULL;
	    switch (in->type)
	        {
		case itPromoterBed:
		    lineFileExpectAtLeast(lf, 6, wordCount);
		    bed = bedLoadN(row, 6);
		    char strand = bed->strand[0];
		    if (strand != '+' && strand != '-')
		        errAbort("%s must be stranded, got %s in that field", lf->fileName, row[6]);
		    break;
		case itUnstrandedBed:
		    lineFileExpectAtLeast(lf, 4, wordCount);
		    bed = bedLoadN(row, 4);
		    break;
		case itBlockedBed:
		    lineFileExpectAtLeast(lf, 4, wordCount);
		    bed = bedLoadN(row, 12);
		    break;
		default:
		    internalErr();
		    break;
		}
	    bedFree(&bed);
	    freez(&dupe);
	    lineFileReuse(lf);
	    break;
	    }
	default:
	    internalErr();
	    break;
	}
    }
}

struct bed *bedLoadTwoBlocks(struct lineFile *lf)
/* Load a bed12, and make sure each item is 1 or 2 blocks.  Return list of
 * the two block ones. */
{
struct bed *bedList = NULL, *bed;
char *row[12];
int totalCount = 0, pairCount = 0;
while (lineFileRow(lf, row))
    {
    bed = bedLoad12(row);
    if (bed->blockCount != 1 && bed->blockCount != 2)
       errAbort("Line %d of %s got blockCount of %d,  expecting 1 or 2", 
       	lf->lineIx, lf->fileName, bed->blockCount);
    ++totalCount;
    if (bed->blockCount == 2)
        {
	slAddHead(&bedList, bed);
	++pairCount;
	}
    }
slReverse(&bedList);
verbose(1, "Got %d items including %d pairs in %s\n", totalCount, pairCount, lf->fileName);
return bedList;
}

struct genomeRangeTree *grtFromOpenBed(struct lineFile *lf, int size, boolean doPromoter)
/* Read an open bed file into a genomeRangeTree and return it. */
{
struct genomeRangeTree *grt = genomeRangeTreeNew();
char *row[size];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoadN(row, size);
    if (doPromoter)
        {
	if (bed->strand[0] == '+')
	    genomeRangeTreeAdd(grt, bed->chrom, bed->chromStart - 500, bed->chromEnd + 500);
	else
	    genomeRangeTreeAdd(grt, bed->chrom, bed->chromEnd - 500, bed->chromEnd + 500);
	}
    else
	genomeRangeTreeAdd(grt,  bed->chrom, bed->chromStart, bed->chromEnd);
    }
return grt;
}

void outputOverlappingGrt(struct bed *chiaList, struct genomeRangeTree *grt,
	double *out1, double *out2)
{
int chiaIx = 0;
struct bed *chia;
for (chia = chiaList; chia != NULL; chia = chia->next)
    {
    int blockStart = chia->chromStart;
    int blockSize = chia->blockSizes[0];
    int blockEnd = blockStart + blockSize;
    int overlap = genomeRangeTreeOverlapSize(grt, chia->chrom, blockStart, blockEnd);
    out1[chiaIx] = (double)overlap/blockSize;

    blockStart = chia->chromStart + chia->chromStarts[1];
    blockSize = chia->blockSizes[1];
    blockEnd = blockStart + blockSize;
    overlap = genomeRangeTreeOverlapSize(grt, chia->chrom, blockStart, blockEnd);
    out2[chiaIx] = (double)overlap/blockSize;

    ++chiaIx;
    }
}

void doItPromoterBed(struct inInfo *in, struct bed *chiaList, double *out1, double *out2)
/* Process overlaps with promoters defined by a gene bed file. */
{
struct genomeRangeTree *grt = grtFromOpenBed(in->lf, 12, TRUE);
outputOverlappingGrt(chiaList, grt, out1, out2);
uglyf("done %s\n", in->fileName);
}

void doItUnstrandedBed(struct inInfo *in, struct bed *chiaList, double *out1, double *out2)
/* Process overlaps with unstranded bed into output */
{
struct genomeRangeTree *grt = grtFromOpenBed(in->lf, 3, FALSE);
outputOverlappingGrt(chiaList, grt, out1, out2);
}

double averageInRegion(struct bigWigValsOnChrom *chromVals, int start, int size)
/* Return average value in region where there is data. */
{
int n = bitCountRange(chromVals->covBuf, start, size);
if (n == 0)
    return 0.0;
else
    {
    double *val = chromVals->valBuf + start;
    double sum = 0;
    int i = size;
    while (--i >= 0)
        sum += *val++;
    return sum/n;
    }
}

void doItBigWig(struct inInfo *in, struct bed *chiaList, struct bigWigValsOnChrom *chromVals,
	double *out1, double *out2)
{
struct bed *chromStart, *chromEnd, *chia;
int chiaIx = 0;
for (chromStart = chiaList; chromStart != NULL; chromStart = chromEnd)
    {
    chromEnd = bedListNextDifferentChrom(chromStart);
    if (bigWigValsOnChromFetchData(chromVals, chromStart->chrom, in->bbi))
        {
	for (chia = chromStart; chia != chromEnd; chia = chia->next)
	    {
	    int blockStart = chia->chromStart;
	    int blockSize = chia->blockSizes[0];
	    out1[chiaIx] = averageInRegion(chromVals, blockStart, blockSize);

	    blockStart = chia->chromStart + chia->chromStarts[1];
	    blockSize = chia->blockSizes[1];
	    out2[chiaIx] = averageInRegion(chromVals, blockStart, blockSize);

	    ++chiaIx;
	    }
	}
    else
	{
	/* No data on this chrom, just output zero everywhere. */
	for (chia = chromStart; chia != chromEnd; chia = chia->next)
	    {
	    out1[chiaIx] = out2[chiaIx] = 0;
	    ++chiaIx;
	    }
	}
    verboseDot();
    }
}


void regCompanionChia(char *inBedPairs, char *output)
/* regCompanionChia - Analyse chia pet data against promoters and enhancers.. */
{
inArray[0].fileName = inBedPairs;
checkInputOpenFiles(inArray, ArraySize(inArray));
verbose(1, "Opened all %d inputs successfully\n", (int)ArraySize(inArray));
struct bed *chiaList = bedLoadTwoBlocks(inArray[0].lf);
slSort(&chiaList, bedCmp);
int chiaCount = slCount(chiaList);
struct bigWigValsOnChrom *chromVals = bigWigValsOnChromNew();

int inIx;
for (inIx=1; inIx < ArraySize(inArray); ++inIx)
    {
    /* Allocate output arrays. */
    struct inInfo *in = &inArray[inIx];
    double *out1 = AllocArray(in->out[0], chiaCount);
    double *out2 = AllocArray(in->out[1], chiaCount);

    /* Process input depending on type. */
    verbose(1, "Processing %s", in->fileName);
    switch(in->type)
        {
	case itPromoterBed:
	    doItPromoterBed(in, chiaList, out1, out2);
	    break;
	case itUnstrandedBed:
	    doItUnstrandedBed(in, chiaList, out1, out2);
	    break;
	case itBigWig:
	    doItBigWig(in, chiaList, chromVals, out1, out2);
	    break;
	default:
	    internalErr();
	    break;
	}
    verbose(1, "\n");
    }

/* do output */
FILE *f = mustOpen(output, "w");
struct bed *chia;
int chiaIx = 0;
for (chia = chiaList; chia != NULL; chia = chia->next, ++chiaIx)
    {
    // fprintf(f, "%s\t%d\t%d\tchia%d\t%d", chia->chrom, chia->chromStart, chia->chromEnd, chiaIx+1, chia->score);
    fprintf(f, "%s\t%d\t%d\t%s\t%d", chia->chrom, chia->chromStart, chia->chromEnd, chia->name, chia->score);
    int blockIx;
    for (blockIx=0; blockIx < 2; ++blockIx)
	{
	fprintf(f, "\tblock%d\t%d", blockIx+1, chia->blockSizes[blockIx]);
	for (inIx=1; inIx < ArraySize(inArray); ++inIx)
	    {
	    struct inInfo *in = &inArray[inIx];
	    double *out = in->out[blockIx];
	    fprintf(f, "\t%g", out[chiaIx]);
	    }
	}
    fprintf(f, "\n");
    }

carefulClose(&f);
}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 3)
    usage();
regCompanionChia(argv[1], argv[2]);
return 0;
}
