/* liftOverMerge - merge multiple regions in BED5's generated by liftOver -multiple */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "common.h"
#include "bed.h"
#include "linefile.h"
#include "options.h"


struct bedList 
    {
    struct bedList *next;
    char *name;
    struct bed *bed;
    };

struct optionSpec optionSpecs[] =
/* command line option specification */
    {
        { "mergeGap", OPTION_INT },
        { NULL, 0 }
    };

int mergeGap = 0;

void usage()
/* Explain usage and exit. */
{
errAbort(
    "liftOverMerge - Merge multiple regions in BED 5 files\n"
    "                   generated by liftOver -multiple\n"
    "usage:\n"
    "   liftOverMerge oldFile newFile\n"
    "options:\n"
    "   -mergeGap=N    Max size of gap to merge regions (default 0)\n"
    );
}

void liftOverMerge(char *oldFile, char *newFile)
/* liftOverMerge - Merge regions in BED5  generated by liftOver -multiple */
{
struct bed *bedList = NULL, *bed = NULL, *otherBed = NULL, *nextBed = NULL;
struct bedList *bedListHeaders = NULL, *bedListHeader = NULL;
FILE *f = mustOpen(newFile, "w");

bedList = bedLoadNAll(oldFile, 5);

/* break down bed list into a list of lists, one per "region", where region
 * is the name field in the bed */
for (bed = bedList; bed != NULL; bed = nextBed)
    {
    verbose(3, "%s:%d-%d %s %d\n", bed->chrom, bed->chromStart, bed->chromEnd,
                                        bed->name, bed->score);
    if (bedListHeader == NULL || 
            differentString(bed->name, bedListHeader->name))
        {
        verbose(2, "region %s\n", bed->name);
        AllocVar(bedListHeader);
        bedListHeader->name = cloneString(bed->name);
        slAddHead(&bedListHeaders, bedListHeader);
        }
    nextBed = bed->next;
    slAddHead(&bedListHeader->bed, bed);
    }
slReverse(&bedListHeaders);

for (bedListHeader = bedListHeaders; bedListHeader != NULL; 
        bedListHeader = bedListHeader->next)
    {
    int ix = 1;
    verbose(3, "region %s\n", bedListHeader->name);
    slReverse(&bedListHeader->bed);

    /* traverse list of bed lists, merging overlapping entries 
     * for each region */
    for (bed = bedListHeader->bed; bed != NULL; bed = bed->next)
        {
        for (otherBed = bed->next; otherBed != NULL; otherBed = nextBed)
            {
            nextBed = otherBed->next;
            if (sameString(bed->chrom, otherBed->chrom) && 
                (max(bed->chromStart, otherBed->chromStart) <= 
                 min(bed->chromEnd, otherBed->chromEnd) + mergeGap))
                {
                /* these regions overlap (or are within the merge gap),
                 * so create one that is a merge, and drop the other */
                verbose(2,"merging %s:%d-%d, %s:%d-%d (overlap=%d)",
                    otherBed->chrom, otherBed->chromStart, otherBed->chromEnd,
                    bed->chrom, bed->chromStart, bed->chromEnd,
                    min(bed->chromEnd, otherBed->chromEnd) -
                        max(bed->chromStart, otherBed->chromStart)); 
                bed->chromStart = min(otherBed->chromStart, bed->chromStart);
                bed->chromEnd = max(otherBed->chromEnd, bed->chromEnd);
                verbose(2," to %s:%d-%d\n",
                        bed->chrom, bed->chromStart, bed->chromEnd);
                slRemoveEl(&bedListHeader->bed, otherBed);
                }
            }
        }
    for (otherBed = bedListHeader->bed; otherBed != NULL; 
            otherBed = otherBed->next)
        {
        otherBed->score = ix++;
        bedOutputN(otherBed, 5, f, '\t', '\n');
        }
    }
}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, optionSpecs);
if (argc != 3)
    usage();
mergeGap = optionInt("mergeGap", mergeGap);
liftOverMerge(argv[1], argv[2]);
return 0;
}


