/* mafAddIRows - Filter out maf files. */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "errAbort.h"
#include "linefile.h"
#include "hash.h"
#include "chain.h"
#include "options.h"
#include "maf.h"
#include "bed.h"
#include "twoBit.h"
#include "binRange.h"


char *masterSpecies;
char *masterChrom;
struct hash *speciesHash;
struct subSpecies *speciesList;
struct strandHead *strandHeads;

boolean addN = FALSE;
boolean addDash = FALSE;

void usage()
/* Explain usage and exit. */
{
errAbort(
  "mafAddIRows - add 'i' rows to a maf\n"
  "usage:\n"
  "   mafAddIRows mafIn twoBitFile mafOut\n"
  "WARNING:  requires a maf with only a single target sequence\n"
  "options:\n"
  "   -nBeds=listOfBedFiles\n"
  "       reads in list of bed files, one per species, with N locations\n"
  "   -addN\n"
  "       adds rows of N's into maf blocks (rather than just annotating them)\n"
  "   -addDash\n"
  "       adds rows of -'s into maf blocks (rather than just annotating them)\n"
  );
}

static struct optionSpec options[] = {
   {"nBeds", OPTION_STRING},
   {"addN", OPTION_BOOLEAN},
   {"addDash", OPTION_BOOLEAN},
   {NULL, 0},
};

struct bedHead
{
    struct bed *list;
};

struct blockStatus
{
    char *chrom;
    int start, end;
    char strand;
    struct mafComp *mc;
    int masterStart, masterEnd;
};

struct subSpecies
{
    struct subSpecies *next;
    char *name;
    struct hash *hash;
    struct blockStatus blockStatus;
};

struct linkBlock
{
    struct linkBlock *next;
    struct cBlock cb;
    struct mafComp *mc;
};

struct strandHead
{
    struct strandHead *next;
    char strand;
    char *name;
    char *qName;
    int qSize;
    char *species;
    struct linkBlock *links;
};

struct mafAli *readMafs(struct mafFile *mf)
{
struct mafAli *maf;
char buffer[2048];
char buffer2[2048];
struct strandHead *strandHead;
struct mafAli *mafList = NULL;
char *ourChrom = NULL;

while((maf = mafNext(mf)) != NULL)
    {
    struct mafComp *mc, *masterMc = maf->components;
    char *species = buffer;
    char *chrom;

    if (ourChrom == NULL)
	ourChrom = masterMc->src;
    else 
	{
	if (differentString(masterMc->src, ourChrom))
	    errAbort("ERROR: mafAddIrows requires maf have only one target sequence.\n"
		"Use mafSplit -byTarget -useFullSequenceName to split maf");
	}

    strcpy(species, masterMc->src);
    chrom = strchr(species,'.');
    if (chrom)
	*chrom++ = 0;
    else
	errAbort("reference species has no chrom name\n");

    if (masterSpecies == NULL)
	{
	masterSpecies = cloneString(species);
	masterChrom = cloneString(chrom);
	//printf("master %s %s\n",masterSpecies,masterChrom);
	}
    else
	{
	if (!sameString(masterSpecies, species))
	    errAbort("first species (%s) not master species (%s)\n",species,masterSpecies);
	}

    for(mc= masterMc->next; mc; mc = mc->next)
	{
	struct linkBlock *linkBlock;
	struct subSpecies *subSpecies = NULL;

	strcpy(species, mc->src);
	chrom = strchr(species,'.');
	*chrom++ = 0;

	if ((subSpecies = hashFindVal(speciesHash, species)) == NULL)
	    {
	    //printf("new species %s\n",species);
	    AllocVar(subSpecies);
	    subSpecies->name = cloneString(species);
	    subSpecies->hash = newHash(6);
	    subSpecies->blockStatus.strand = '+';
	    subSpecies->blockStatus.masterStart = masterMc->start;
	    slAddHead(&speciesList, subSpecies);
	    hashAdd(speciesHash, species, subSpecies);
	    }
	subSpecies->blockStatus.masterEnd = masterMc->start + masterMc->size ;
	sprintf(buffer2, "%s%c%s", masterChrom,mc->strand,chrom);
	if ((strandHead = hashFindVal(subSpecies->hash, buffer2)) == NULL)
	    {
	    //printf("new strand %s for species %s\n",buffer2, species);
	    AllocVar(strandHead);
	    hashAdd(subSpecies->hash, buffer2, strandHead);
	    strandHead->name = cloneString(buffer2);
	    strandHead->species = cloneString(species);
	    strandHead->qName = cloneString(chrom);
	    strandHead->qSize = mc->srcSize;
	    strandHead->strand = mc->strand;
	    slAddHead(&strandHeads, strandHead);
	    }
	AllocVar(linkBlock);
	linkBlock->mc = mc;
	linkBlock->cb.qStart = mc->start;
	linkBlock->cb.qEnd = mc->start + mc->size;
	linkBlock->cb.tStart = masterMc->start;
	linkBlock->cb.tEnd = masterMc->start + masterMc->size;


	slAddHead(&strandHead->links, linkBlock);
	}
    slAddHead(&mafList, maf);
    }
slReverse(&mafList);

return mafList;
}

void chainStrands(struct strandHead *strandHead, struct hash *bedHash)
{
for(; strandHead ; strandHead = strandHead->next)
    {
    struct linkBlock *link, *prevLink;
    struct hashEl *hel = hashLookup(bedHash, strandHead->species);
    struct hash *chromHash = (hel != NULL) ? hel->val : NULL;
    struct bedHead *bedHead = (chromHash != NULL) ? 
	hashFindVal(chromHash, strandHead->qName): NULL;

    slReverse(&strandHead->links);

    prevLink = strandHead->links;
    prevLink->mc->leftStatus = MAF_NEW_STATUS;
    for(link = prevLink->next; link; prevLink = link, link = link->next)
	{
	int tDiff = link->cb.tStart - prevLink->cb.tEnd;
	int qDiff = link->cb.qStart - prevLink->cb.qEnd;
	int nCount = 0;
	int nStart, nEnd;
	struct bed *bed;

	if (strandHead->strand == '+')
	    {
	    nStart = prevLink->cb.qEnd;
	    nEnd = link->cb.qStart;
	    }
	else
	    {
	    nEnd = strandHead->qSize - prevLink->cb.qEnd;
	    nStart = strandHead->qSize - link->cb.qStart;
	    }

	/* a very inefficient search for an N bed */
	if ((nEnd - nStart > 0) && (bedHead))
	    {
	    for(bed = bedHead->list; bed; bed = bed->next)
		{
		if (bed->chromStart >= nEnd)
		    break;

		if ( bed->chromEnd > nStart)
		    {
		    nCount += positiveRangeIntersection(nStart, nEnd, 
			bed->chromStart, bed->chromEnd);
		    }
		}
	    }


	if ((qDiff && (100 * nCount / qDiff > 95))
		&& (tDiff && (100 * nCount / tDiff > 10)))
	    {
	    prevLink->mc->rightStatus = link->mc->leftStatus = MAF_MISSING_STATUS;
	    prevLink->mc->rightLen = link->mc->leftLen = nCount;
	    }
	else if  ((tDiff > 100000) ||  
		  (qDiff > 100000) || (qDiff < -100000))
	    {
	    prevLink->mc->rightStatus = link->mc->leftStatus = MAF_NEW_STATUS;
	    prevLink->mc->rightLen = link->mc->leftLen = 0;
	    }
	else if  (qDiff < 0)
	    {
	    prevLink->mc->rightStatus = link->mc->leftStatus = MAF_TANDEM_STATUS;
	    prevLink->mc->rightLen = link->mc->leftLen = -qDiff;
	    }
	else if (qDiff == 0)
	    {
	    prevLink->mc->rightStatus = link->mc->leftStatus = MAF_CONTIG_STATUS;
	    prevLink->mc->rightLen = link->mc->leftLen = 0;
	    }
	else
	    {
	    prevLink->mc->rightStatus = link->mc->leftStatus = MAF_INSERT_STATUS;
	    prevLink->mc->rightLen = link->mc->leftLen = qDiff;
	    }
	}
    prevLink->mc->rightStatus = MAF_NEW_STATUS;
    }
}

void bridgeSpecies(struct mafAli *mafList, struct subSpecies *subSpecies)
{
struct mafAli *maf;
int pushState, leftLen;
struct mafComp *mc = NULL;
// struct mafComp *masterMc = NULL, *prevMc = NULL: unused

for(; subSpecies; subSpecies = subSpecies->next)
    {
    //printf("bridging %s\n",subSpecies->name);
    pushState = 0;
    leftLen = 0;
    for(maf = mafList; maf ;  maf = maf->next)
	{
	// masterMc = maf->components;
	// prevMc = mc;  unused
	if ((mc = mafMayFindCompPrefix(maf, subSpecies->name,NULL)) == NULL)
	    {
	    continue;
	    }
	if (mc->leftStatus == 0) 
	    errAbort("zero left status %s:%d\n",mc->src, mc->start);
#ifdef NOTNOW
	if (pushState && (mc->leftStatus == MAF_INSERT_STATUS))
	    {
	    if (prevMc && 
		!((prevMc->rightStatus == mc->leftStatus) &&
		(prevMc->rightLen == mc->leftLen)))
		{
		printf("boo\n");
		mc->leftStatus = MAF_NEW_NESTED_STATUS;
		mc->leftLen = 0;
		}
	    }
#endif
	if (mc->leftStatus == MAF_NEW_STATUS)
	    {
	    if (pushState)
		{
		//printf("bridged on %s: %s:%d-%d\n",mc->src,"chr22",masterMc->start, masterMc->start + masterMc->size);
		mc->leftStatus = MAF_NEW_NESTED_STATUS;
		mc->leftLen = leftLen;
		}
	    pushState++;
	    }

	if (mc->rightStatus == 0)
	    {
	    errAbort("zero right status\n");
	    }
	else if (isContigOrTandem(mc->rightStatus) || (mc->rightStatus == MAF_INSERT_STATUS))
	    leftLen = mc->rightLen;
	else if (mc->rightStatus == MAF_NEW_STATUS)
	    {
	    pushState--;
	    if (pushState)
		{
		//printf("arched on %s: %s:%d-%d\n",mc->src,"chr22",masterMc->start, masterMc->start + masterMc->size);
		mc->rightStatus = MAF_NEW_NESTED_STATUS;
		mc->rightLen = leftLen;
		}
	    }
	}
    //printf("pushState %d\n",pushState);
    }
}


void fillHoles(struct mafAli *mafList, struct subSpecies *speciesList, struct twoBitFile *twoBit)
{
int lastEnd = 100000000;
struct mafAli *prevMaf = NULL, *maf, *nextMaf;
struct subSpecies *species;

/*
for(species = speciesList; species; species = species->next)
    {
    blockStatus = &species->blockStatus;
    blockStatus->mc->rightStatus = MAF_NEW_STATUS;
    blockStatus->mc->rightLen = 0;
    }
    */

for(maf = mafList; maf ; prevMaf = maf, maf = nextMaf)
    {
    struct mafComp *mc = NULL, *masterMc, *lastMc = NULL;
    struct mafAli *newMaf = NULL;
    struct blockStatus *blockStatus;

    nextMaf = maf->next;

    masterMc=maf->components;
    if (masterMc->start > lastEnd)
	{
	struct subSpecies *species;

	for(species = speciesList; species; species = species->next)
	    {
	    mc = NULL;
//	    printf("looking at %s\n",species->name);
	    blockStatus = &species->blockStatus;
	    if (blockStatus->mc)
		{
//	    printf("should match at %s\n",blockStatus->mc->src);
		switch (blockStatus->mc->rightStatus)
		    {
		    case MAF_MISSING_STATUS:
		    //printf("missing right\n");
		    case MAF_NEW_NESTED_STATUS:
		    case MAF_MAYBE_NEW_NESTED_STATUS:
		    case MAF_CONTIG_STATUS:
		    case MAF_TANDEM_STATUS:
		    case MAF_INSERT_STATUS:
			AllocVar(mc);
			mc->rightStatus = mc->leftStatus = blockStatus->mc->rightStatus;
			mc->rightLen = mc->leftLen = blockStatus->mc->rightLen;
			mc->src = blockStatus->mc->src;
			mc->srcSize = blockStatus->mc->srcSize;
			mc->strand = blockStatus->mc->strand;
			mc->start = blockStatus->mc->start + blockStatus->mc->size;
			if (lastMc == NULL)
			    {
			    struct mafComp *miniMasterMc = NULL;
			    char *seqName;
			    struct dnaSeq *seq;

			    AllocVar(miniMasterMc);
			    miniMasterMc->next = mc;
			    miniMasterMc->strand = '+';
			    miniMasterMc->srcSize = masterMc->srcSize;
			    miniMasterMc->src = masterMc->src;
			    miniMasterMc->start = lastEnd;
			    miniMasterMc->size =  masterMc->start - lastEnd;

			    if ((seqName = strchr(miniMasterMc->src, '.')) != NULL)
				seqName++;
			    else 
			    	seqName = miniMasterMc->src;

//			    printf("hole filled from %d to %d\n",lastEnd, masterMc->start);
			    seq = twoBitReadSeqFrag(twoBit, seqName, lastEnd, masterMc->start);
			    miniMasterMc->text = seq->dna;

			    AllocVar(newMaf);
			    newMaf->textSize = maf->textSize;
			    newMaf->components = miniMasterMc;
			    newMaf->next = maf;
			    if (prevMaf)
				prevMaf->next = newMaf;
			    else
				mafList = newMaf;
			    //masterMc = miniMasterMc;
			    }
			else
			    {
			    lastMc->next = mc;
			    }
			lastMc = mc;
			if  (blockStatus->mc->rightStatus ==  MAF_MISSING_STATUS)
			    {
			    if (addN)
				{
				char buffer[256];

				safef(buffer, sizeof(buffer), "%s.N",species->name);
				mc->src = cloneString(buffer);
				mc->start = 0;
				mc->srcSize = 200000;
				mc->size =  masterMc->start - lastEnd;
				mc->text = needMem(mc->size + 1);
				memset(mc->text, 'N', mc->size);
				}
			    }
			else
			    {
			    if (addDash)
				{
				mc->size = masterMc->size;
				mc->srcSize = blockStatus->mc->srcSize;
				mc->text = needMem(mc->size + 1);
				memset(mc->text, '-', mc->size);
				mc->text[mc->size] = 0;
				if (mc->size == 0)
				    errAbort("bad dash add");
				mc->size = 0;
				}
			    }
			break;
		    }
		}
	    }
	}
    lastEnd = masterMc->start + masterMc->size;
    for(lastMc = masterMc; lastMc->next; lastMc = lastMc->next)
	;

    for(species = speciesList; species; species = species->next)
	{
	blockStatus = &species->blockStatus;
	
	mc = NULL;
	if ((blockStatus->masterStart <= masterMc->start) && 
	    (blockStatus->masterEnd > masterMc->start) && 
	 ((mc = mafMayFindCompPrefix(maf, species->name,NULL)) == NULL))
	    {
	    if (blockStatus->mc != NULL)
		{
		switch (blockStatus->mc->rightStatus)
		    {
		    case MAF_MISSING_STATUS:
		    case MAF_CONTIG_STATUS:
		    case MAF_TANDEM_STATUS:
		    case MAF_INSERT_STATUS:
		    case MAF_NEW_NESTED_STATUS:
		    case MAF_MAYBE_NEW_NESTED_STATUS:
			AllocVar(mc);
			mc->rightStatus = mc->leftStatus = blockStatus->mc->rightStatus;
			if (mc->rightStatus == MAF_NEW_NESTED_STATUS)
			    mc->rightStatus = MAF_INSERT_STATUS;
			if (mc->leftStatus == MAF_NEW_NESTED_STATUS)
			    mc->leftStatus = MAF_INSERT_STATUS;
			mc->rightLen = mc->leftLen = blockStatus->mc->rightLen;
			mc->src = blockStatus->mc->src;
			mc->strand = blockStatus->mc->strand;
			mc->srcSize = blockStatus->mc->srcSize;
			mc->start = blockStatus->mc->start + blockStatus->mc->size ;
			lastMc->next = mc;
			lastMc = mc;
			if  (addN && (blockStatus->mc->rightStatus ==  MAF_MISSING_STATUS))
			    {
			    char buffer[256];

			    safef(buffer, sizeof(buffer), "%s.N",species->name);
			    mc->src = cloneString(buffer);
			    mc->start = 0;
			    mc->srcSize = 200000;
			    mc->size = maf->textSize;
			    mc->text = needMem(mc->size + 1);
			    memset(mc->text, 'N', mc->size);
			    }
			else if (addDash)
			    {
				mc->size = masterMc->size;
			    mc->text = needMem(mc->size + 1);
			    if (mc->size == 0)
				errAbort("bad dash add");
			    memset(mc->text, '-', mc->size);
			    mc->text[mc->size] = 0;
			    mc->size = 0;
			    }
			    
			break;
		    default:
			break;
		    }
		}
	    }
	if (mc)
	    {
	    blockStatus->mc = mc;
	    }
	}
    }
}

struct hash *readBed(char *fileName)
/* Read bed and return it as a hash keyed by chromName */
{
char *row[3];
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct bedHead *bedHead = NULL;
struct hash *hash = newHash(6);
struct hashEl *hel, *lastHel = NULL;
struct bed3 *bed;

while (lineFileRow(lf, row))
    {
    hel = hashLookup(hash, row[0]);
    if ((lastHel) && (hel != lastHel))
	{
	assert(bedHead != NULL);
	slReverse(&bedHead->list);
	}

    if (hel == NULL)
       {
	char *ptr;

	AllocVar(bedHead);
	if ((ptr = strchr(row[0], '.')) != NULL)
	    ptr++;
	else 
	    ptr = row[0];
	hel = hashAdd(hash, ptr, bedHead);
	}
    bedHead = hel->val;
    AllocVar(bed);
    bed->chrom = hel->name;
    bed->chromStart = lineFileNeedNum(lf, row, 1);
    bed->chromEnd = lineFileNeedNum(lf, row, 2);
    if (bed->chromStart > bed->chromEnd)
        errAbort("start after end line %d of %s", lf->lineIx, lf->fileName);
    slAddHead(&bedHead->list, (struct bed *)bed);
    lastHel = hel;
    }

if (bedHead != NULL)
    slReverse(&bedHead->list);
lineFileClose(&lf);
return hash;
}

void addBed(char *file, struct hash *fileHash)
{
char name[128];

if (!endsWith(file, ".bed"))
    errAbort("filenames in bed list must end in '.bed'");

splitPath(file, NULL, name, NULL);
hashAdd(fileHash, name, readBed(file));
}

void mafAddIRows(char *mafIn, char *twoBitIn,  char *mafOut, char *nBedFile)
/* mafAddIRows - Filter out maf files. */
{
FILE *f = mustOpen(mafOut, "w");
struct twoBitFile *twoBit = twoBitOpen(twoBitIn);
struct mafAli *mafList, *maf;
struct mafFile *mf = mafOpen(mafIn);
struct hash *bedHash = newHash(6); 

if (nBedFile != NULL)
    {
    struct lineFile *lf = lineFileOpen(nBedFile, TRUE);
    char *row[1];
    while (lineFileRow(lf, row))
	{
	addBed(row[0], bedHash);
	}
    lineFileClose(&lf);
    }

speciesHash = newHash(6);
mafList = readMafs(mf);
mafWriteStart(f, mf->scoring);
mafFileFree(&mf);

chainStrands(strandHeads, bedHash);
bridgeSpecies(mafList, speciesList);
fillHoles(mafList, speciesList, twoBit);

for(maf = mafList; maf ; maf = maf->next)
    mafWrite(f, maf);
}

int main(int argc, char *argv[])
/* Process command line. */
{
char *nBedFile;

optionInit(&argc, argv, options);
if (argc != 4)
    usage();

nBedFile = optionVal("nBeds", NULL);
addN = optionExists("addN");
addDash = optionExists("addDash");

mafAddIRows(argv[1], argv[2], argv[3], nBedFile);
return 0;
}
