/* overlapSelect - select records based on overlap of chromosome ranges */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "common.h"
#include "selectTable.h"
#include "coordCols.h"
#include "chromAnn.h"
#include "dystring.h"
#include "options.h"


/* FIXME:
 * - would be nice to be able to specify ranges in the same manner
 *   as featureBits
 * - should keep header lines in files
 * - don't need to save if infile records if stats output
 */

static struct optionSpec optionSpecs[] = {
    {"selectFmt", OPTION_STRING},
    {"selectCoordCols", OPTION_STRING},
    {"selectCds", OPTION_BOOLEAN},
    {"selectRange", OPTION_BOOLEAN},
    {"inFmt", OPTION_STRING},
    {"inCoordCols", OPTION_STRING},
    {"inCds", OPTION_BOOLEAN},
    {"inRange", OPTION_BOOLEAN},
    {"nonOverlapping", OPTION_BOOLEAN},
    {"strand", OPTION_BOOLEAN},
    {"oppositeStrand", OPTION_BOOLEAN},
    {"excludeSelf", OPTION_BOOLEAN},
    {"idMatch", OPTION_BOOLEAN},
    {"dropped", OPTION_STRING},
    {"overlapThreshold", OPTION_FLOAT},
    {"overlapThresholdCeil", OPTION_FLOAT},
    {"overlapSimilarity", OPTION_FLOAT},
    {"overlapSimilarityCeil", OPTION_FLOAT},
    {"overlapBases", OPTION_INT},
    {"merge", OPTION_BOOLEAN},
    {"mergeOutput", OPTION_BOOLEAN},
    {"statsOutput", OPTION_BOOLEAN},
    {"statsOutputAll", OPTION_BOOLEAN},
    {"statsOutputBoth", OPTION_BOOLEAN},
    {"idOutput", OPTION_BOOLEAN},
    {"aggregate", OPTION_BOOLEAN},
    {NULL, 0}
};

/* incompatible with aggregate */
static char *aggIncompatible[] =
{
    "overlapSimilarity", "overlapSimilarityCeil", "overlapThresholdCeil", "overlapBases", "merge", "mergeOutput", "idMatch", NULL
};

/* file format constants */
enum recordFmt {
    UNKNOWN_FMT,
    PSL_FMT,
    PSLQ_FMT,
    CHAIN_FMT,
    CHAINQ_FMT,
    GENEPRED_FMT,
    BED_FMT,
    BED3P_FMT,
    BED6P_FMT,
    COORD_COLS_FMT
};

/* Options parsed from the command line */
enum recordFmt selectFmt = UNKNOWN_FMT;
struct coordCols selectCoordCols;
unsigned selectCaOpts = 0;

unsigned inFmt = UNKNOWN_FMT;
struct coordCols inCoordCols;
unsigned inCaOpts = 0;

unsigned selectOpts = 0;
boolean useAggregate = FALSE;
boolean nonOverlapping = FALSE;
boolean mergeOutput = FALSE;
boolean idOutput = FALSE;
boolean statsOutput = FALSE;
boolean outputAll = FALSE;
boolean outputBoth = FALSE;
struct overlapCriteria criteria = {0.0, 1.1, 0.0, 1.1, -1};

enum recordFmt parseFormatSpec(char *fmt)
/* parse a format specification */
{
if (sameString(fmt, "psl"))
    return PSL_FMT;
if (sameString(fmt, "pslq"))
    return PSLQ_FMT;
if (sameString(fmt, "chain"))
    return CHAIN_FMT;
if (sameString(fmt, "chainq"))
    return CHAINQ_FMT;
if (sameString(fmt, "genePred"))
    return GENEPRED_FMT;
if (sameString(fmt, "bed"))
    return BED_FMT;
if (sameString(fmt, "bed3+"))
    return BED3P_FMT;
if (sameString(fmt, "bed6+"))
    return BED6P_FMT;
errAbort("invalid file format: %s", fmt);
return UNKNOWN_FMT;
}

enum recordFmt getFileFormat(char *path)
/* determine the file format from the specified file extension */
{
char *filePath = path;
char filePathBuf[PATH_LEN];

if (endsWith(filePath, ".gz") || endsWith(filePath, ".bz2") || endsWith(filePath, ".Z"))
    {
    /* strip .gz/.bz2/.Z extension */
    splitPath(path, NULL, filePathBuf, NULL);
    filePath = filePathBuf;
    }
if (endsWith(filePath, ".psl"))
    return PSL_FMT;
if (endsWith(filePath, ".chain"))
    return CHAIN_FMT;
if (endsWith(filePath, ".genePred") || endsWith(filePath, ".gp"))
    return GENEPRED_FMT;
if (endsWith(filePath, ".bed"))
    return BED_FMT;
errAbort("can't determine file format of %s", filePath);
return UNKNOWN_FMT;
}

static struct  chromAnnReader *createChromAnnReader(char *fileName,
                                                    enum recordFmt fmt,
                                                    unsigned caOpts,
                                                    struct coordCols *cols)
/* construct a reader.  The coordCols spec is only used for tab files */
{
switch (fmt)
    {
    case PSL_FMT:
    case PSLQ_FMT:
        return chromAnnPslReaderNew(fileName, caOpts);
    case CHAIN_FMT:
    case CHAINQ_FMT:
        return chromAnnChainReaderNew(fileName, caOpts);
    case GENEPRED_FMT:
        return chromAnnGenePredReaderNew(fileName, caOpts);
    case BED_FMT:
        return chromAnnBedReaderNew(fileName, caOpts, 12);
    case BED3P_FMT:
        return chromAnnBedReaderNew(fileName, caOpts, 3);
    case BED6P_FMT:
        return chromAnnBedReaderNew(fileName, caOpts, 6);
    case COORD_COLS_FMT:
        return chromAnnTabReaderNew(fileName, cols, caOpts);
    case UNKNOWN_FMT:
        break; 
    }
assert(FALSE);
return NULL;
}

static char *getPrintId(struct chromAnn* ca)
/* get id for output, or <unknown> if not known */
{
return (ca->name == NULL) ? "<unknown>" : ca->name;
}

static void outputMerge(struct chromAnn* inCa, FILE *outFh,
                        struct chromAnnRef *overlappingRecs)
/* output for the -mergeOutput option; pairs of inRec and overlap */
{
struct chromAnnRef *selectCaRef;
for (selectCaRef = overlappingRecs; selectCaRef != NULL; selectCaRef = selectCaRef->next)
    {
    struct chromAnn *selectCa = selectCaRef->ref;
    inCa->recWrite(inCa, outFh, '\t');
    selectCa->recWrite(selectCa, outFh, '\n');
    }
}

static void outputIds(struct chromAnn* inCa, FILE *outFh,
                      struct chromAnnRef *overlappingRecs)
/* output for the -idOutput option; pairs of inRec and overlap ids */
{
struct chromAnnRef *selectCaRef;
for (selectCaRef = overlappingRecs; selectCaRef != NULL; selectCaRef = selectCaRef->next)
    {
    struct chromAnn *selectCa = selectCaRef->ref;
    fprintf(outFh, "%s\t%s\n", getPrintId(inCa), getPrintId(selectCa));
    }
}

/* format string for stats output */
static char *statsFmt = "%s\t%s\t%0.3g\t%0.3g\t%d\t%0.3g\t%d\t%d\n";

static void outputStats(struct chromAnn* inCa, FILE *outFh,
                        struct chromAnnRef *overlappingRecs)
/* output for the -statOutput option; pairs of inRec and overlap ids */
{
if (overlappingRecs == NULL)
    {
    // -statsOutputAll and nothing overlapping
    fprintf(outFh, statsFmt, getPrintId(inCa), "", 0.0, 0.0, 0, 0.0, inCa->totalSize, 0);
    }
struct chromAnnRef *selectCaRef;
for (selectCaRef = overlappingRecs; selectCaRef != NULL; selectCaRef = selectCaRef->next)
    {
    struct chromAnn *selectCa = selectCaRef->ref;
    unsigned overBases = selectOverlapBases(inCa, selectCa);
    fprintf(outFh, statsFmt, getPrintId(inCa), getPrintId(selectCa),
            selectFracOverlap(inCa, overBases), selectFracOverlap(selectCa, overBases), overBases,
            selectFracSimilarity(inCa, selectCa, overBases),
            inCa->totalSize,  selectCa->totalSize);
    }
}

static void outputStatsSelNotUsed(FILE *outFh)
/* output stats for select chromAnns that were not used */
{
struct chromAnnMapIter iter = selectTableFirst();
struct chromAnn *selCa;
while ((selCa = chromAnnMapIterNext(&iter)) != NULL)
    {
    if (!selCa->used)
        fprintf(outFh, statsFmt, "", getPrintId(selCa), 0.0, 0.0, 0, 0.0, 0, selCa->totalSize);
    }
}

static void doItemOverlap(struct chromAnn* inCa, FILE *outFh, FILE *dropFh)
/* Do individual item overlap process of chromAnn object given the criteria,
 * and if so output */
{
struct chromAnnRef *overlappingRecs = NULL;
struct chromAnnRef **overlappingRecsPtr = NULL;  /* used to indicate if recs should be collected */
if (mergeOutput || idOutput || statsOutput)
    overlappingRecsPtr = &overlappingRecs;

boolean overlaps = selectIsOverlapped(selectOpts, inCa, &criteria, overlappingRecsPtr);
if (overlappingRecsPtr != NULL)
    slSort(overlappingRecsPtr, chromAnnRefLocCmp);
if (((nonOverlapping) ? !overlaps : overlaps) || outputAll)
    {
    if (mergeOutput)
        outputMerge(inCa, outFh, overlappingRecs);
    else if (idOutput)
        outputIds(inCa, outFh, overlappingRecs);
    else if (statsOutput)
        outputStats(inCa, outFh, overlappingRecs);
    else
        inCa->recWrite(inCa, outFh, '\n');
    }
else if (dropFh != NULL)
    {
    if (idOutput)
        fprintf(dropFh, "%s\n", getPrintId(inCa));
    else
        inCa->recWrite(inCa, dropFh, '\n');
    }

slFreeList(&overlappingRecs);
}

static void doItemOverlaps(struct chromAnnReader* inCar, FILE *outFh, FILE *dropFh)
/* Do individual item overlap processings */
{
struct chromAnn *inCa;
while ((inCa = inCar->caRead(inCar)) != NULL)
    {
    doItemOverlap(inCa, outFh, dropFh);
    chromAnnFree(&inCa);
    }
}


static void doAggregateOverlap(struct chromAnn* inCa, FILE *outFh, FILE *dropFh)
/* Do aggreate overlap process of chromAnn object given the criteria,
 * and if so output */
{
struct overlapAggStats stats = selectAggregateOverlap(selectOpts, inCa);
boolean overlaps;
if (criteria.threshold <= 0.0)
    overlaps = (stats.inOverlap > 0.0); /* any overlap */
else
    overlaps = (stats.inOverlap >= criteria.threshold);
if (((nonOverlapping) ? !overlaps : overlaps) || outputAll)
    {
    if (idOutput)
        fprintf(outFh, "%s\n", getPrintId(inCa));
    else if (statsOutput)
        fprintf(outFh, "%s\t%0.3g\t%d\t%d\n", getPrintId(inCa),
                stats.inOverlap, stats.inOverBases, stats.inBases);
    else
        inCa->recWrite(inCa, outFh, '\n');
    }
else if (dropFh != NULL)
    {
    if (idOutput)
        fprintf(dropFh, "%s\n", getPrintId(inCa));
    else
        inCa->recWrite(inCa, dropFh, '\n');
    }
}

static void doAggregateOverlaps(struct chromAnnReader* inCar, FILE *outFh, FILE *dropFh)
/* Do aggreate overlap processing */
{
struct chromAnn *inCa;
while ((inCa = inCar->caRead(inCar)) != NULL)
    {
    doAggregateOverlap(inCa, outFh, dropFh);
    chromAnnFree(&inCa);
    }
}

void loadSelectTable(char *selectFile)
/* load the select table from a file */
{
struct chromAnnReader *selCar = createChromAnnReader(selectFile, selectFmt, selectCaOpts, &selectCoordCols);
selectTableAddRecords(selCar);
selCar->carFree(&selCar);
}

void overlapSelect(char *selectFile, char *inFile, char *outFile, char *dropFile)
/* select records based on overlap of chromosome ranges */
{
struct chromAnnReader *inCar
    = createChromAnnReader(inFile, inFmt, inCaOpts, &inCoordCols);
loadSelectTable(selectFile);
FILE *outFh = mustOpen(outFile, "w");
FILE *dropFh = NULL;
if (dropFile != NULL)
    dropFh = mustOpen(dropFile, "w");
if (idOutput)
    {
    if (useAggregate)
        fputs("#inId\n", outFh);
    else
        fputs("#inId\t" "selectId\n", outFh);
    }
if (statsOutput)
    {
    if (useAggregate)
        fputs("#inId\t" "inOverlap\t" "inOverBases\t" "inBases\n", outFh);
    else
        fputs("#inId\t" "selectId\t" "inOverlap\t" "selectOverlap\t" "overBases\t" "similarity\t" "inBases\t" "selectBases\n", outFh);
    }

if (useAggregate)
    doAggregateOverlaps(inCar, outFh, dropFh);
else
    doItemOverlaps(inCar, outFh, dropFh);

inCar->carFree(&inCar);
if (statsOutput && outputBoth)
    outputStatsSelNotUsed(outFh);

carefulClose(&outFh);
carefulClose(&dropFh);
/* enable for memory analysis */
#if 0
selectTableFree();
#endif
}

void usage()
/* usage message and abort */
{
static char *usageMsg =
#include "usage.msg"
    ;
errAbort("%s", usageMsg);
}

static boolean hasStrand(unsigned fmt, struct coordCols* coordCols)
/* does the format include strand? */
{
return !((selectFmt == BED3P_FMT) ||
         ((selectFmt == COORD_COLS_FMT) && (coordCols->strandCol == -1)));
}

int main(int argc, char** argv)
/* entry */
{
char *selectFile, *inFile, *outFile, *dropFile;
optionInit(&argc, argv, optionSpecs);
if (argc != 4)
    usage();
selectFile = argv[1];
inFile = argv[2];
outFile = argv[3];

/* select file options */
if (optionExists("selectFmt") && optionExists("selectCoordCols"))
    errAbort("can't specify both -selectFmt and -selectCoordCols");

if (optionExists("selectFmt"))
    selectFmt = parseFormatSpec(optionVal("selectFmt", NULL));
else if (optionExists("selectCoordCols"))
    {
    selectCoordCols = coordColsParseSpec("selectCoordCols",
                                         optionVal("selectCoordCols", NULL));
    selectFmt = COORD_COLS_FMT;
    }
else
    selectFmt = getFileFormat(selectFile);
if ((selectFmt == BED3P_FMT) || (selectFmt == BED6P_FMT))
    selectCaOpts |= chromAnnRange; // no blocks

if (optionExists("selectCds"))
    selectCaOpts |= chromAnnCds;
if (optionExists("selectRange"))
    selectCaOpts |= chromAnnRange;
if ((selectFmt == PSLQ_FMT) || (selectFmt == CHAINQ_FMT))
    selectCaOpts |= chromAnnUseQSide;

/* in file options */
if (optionExists("inFmt") && optionExists("inCoordCols"))
    errAbort("can't specify both -inFmt and -inCoordCols");
if (optionExists("inFmt"))
    inFmt = parseFormatSpec(optionVal("inFmt", NULL));
else if (optionExists("inCoordCols"))
    {
    inCoordCols = coordColsParseSpec("inCoordCols",
                                     optionVal("inCoordCols", NULL));
    inFmt = COORD_COLS_FMT;
    }
else
    inFmt = getFileFormat(inFile);
if ((inFmt == BED3P_FMT) || (inFmt == BED6P_FMT))
    inCaOpts |= chromAnnRange; // no blocks

inCaOpts = chromAnnSaveLines; // need lines for output
if (optionExists("inCds"))
    inCaOpts |= chromAnnCds;
if (optionExists("inRange"))
    inCaOpts |= chromAnnRange;
if ((inFmt == PSLQ_FMT) || (inFmt == CHAINQ_FMT))
    inCaOpts |= chromAnnUseQSide;

/* select options */
useAggregate = optionExists("aggregate");
nonOverlapping = optionExists("nonOverlapping");
if (optionExists("strand") && optionExists("oppositeStrand"))
    errAbort("can only specify one of -strand and -oppositeStrand");
if (optionExists("strand"))
    selectOpts |= selStrand;
if (optionExists("oppositeStrand"))
    selectOpts |= selOppositeStrand;
if (optionExists("excludeSelf") && (optionExists("idMatch")))
    errAbort("can't specify both -excludeSelf and -idMatch");
if (optionExists("excludeSelf"))
    selectOpts |= selExcludeSelf;
if (optionExists("idMatch"))
    selectOpts |= selIdMatch;

/* catch conflicts */
boolean needStrand = (selectOpts & (selStrand | selOppositeStrand)) != 0;
    if (!hasStrand(selectFmt, &selectCoordCols) && needStrand)
    errAbort("request selecting by strand and selectFile doesn't have strand");
if (!hasStrand(inFmt, &inCoordCols) && needStrand)
    errAbort("request selecting by strand and inFile doesn't have strand");


criteria.threshold = optionFloat("overlapThreshold", 0.0);
criteria.thresholdCeil = optionFloat("overlapThresholdCeil", 1.1);
criteria.similarity = optionFloat("overlapSimilarity", 0.0);
criteria.similarityCeil = optionFloat("overlapSimilarityCeil", 1.1);
criteria.bases = optionInt("overlapBases", -1);

/* output options */
mergeOutput = optionExists("mergeOutput");
idOutput = optionExists("idOutput");
statsOutput = optionExists("statsOutput") || optionExists("statsOutputAll") || optionExists("statsOutputBoth");
if ((mergeOutput + idOutput + statsOutput) > 1)
    errAbort("can only specify one of -mergeOutput, -idOutput, -statsOutput, -statsOutputAll, or -statsOutputBoth");
outputAll = optionExists("statsOutputAll");
outputBoth = optionExists("statsOutputBoth");
if (outputBoth)
    outputAll = TRUE;
if (mergeOutput)
    {
    if (nonOverlapping)
        errAbort("can't use -mergeOutput with -nonOverlapping");
    if (useAggregate)
        errAbort("can't use -mergeOutput with -aggregate");
    if ((selectFmt == CHAIN_FMT) || (selectFmt == CHAINQ_FMT)
        || (inFmt == CHAIN_FMT) || (inFmt == CHAINQ_FMT))
    if (useAggregate)
        errAbort("can't use -mergeOutput with chains");
    selectCaOpts |= chromAnnSaveLines;
    }
dropFile = optionVal("dropped", NULL);

/* check for options incompatible with aggregate mode */
if (useAggregate)
    {
    int i;
    for (i = 0; aggIncompatible[i] != NULL; i++)
        {
        if (optionExists(aggIncompatible[i]))
            errAbort("-%s is not allowed -aggregate", aggIncompatible[i]);
        }
    }

overlapSelect(selectFile, inFile, outFile, dropFile);
return 0;
}
