/* bedClip - Remove lines from bed file that refer to off-chromosome places. */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "bbiFile.h"
#include "sqlNum.h"
#include "obscure.h"

void usage()
/* Emit the command line help text through errAbort. */
{
/* The help text contains no '%' characters, so handing it to errAbort as a
 * "%s" argument prints exactly the same bytes as using it as the format. */
static const char helpText[] =
    "bedClip - Remove lines from bed file that refer to off-chromosome locations.\n"
    "usage:\n"
    "   bedClip [options] input.bed chrom.sizes output.bed\n"
    "chrom.sizes is a two-column file/URL: <chromosome name> <size in bases>\n"
    "If the assembly <db> is hosted by UCSC, chrom.sizes can be a URL like\n"
    "  http://hgdownload.soe.ucsc.edu/goldenPath/<db>/bigZips/<db>.chrom.sizes\n"
    "or you may use the script fetchChromSizes to download the chrom.sizes file.\n"
    "If not hosted by UCSC, a chrom.sizes file can be generated by running\n"
    "twoBitInfo on the assembly .2bit file.\n"
    "options:\n"
    "   -truncate  - truncate items that span ends of chrom instead of the\n"
    "                default of dropping the items\n"
    "   -verbose=2 - set to get list of lines clipped and why";
errAbort("%s", helpText);
}

/* Command line option table passed to optionInit() in main.  The only
 * option declared here is the boolean -truncate; the table ends with a
 * {NULL, 0} entry. */
static struct optionSpec options[] = {
   {"truncate", OPTION_BOOLEAN},
   {NULL, 0},
};

/* Set in main from the presence of the -truncate option.  When TRUE, bedClip
 * truncates out-of-range items instead of dropping them. */
static boolean trim = FALSE;	// the name truncate is already taken

void bedClip(char *inFile, char *chromSizes, char *outFile)
/* bedClip - Remove lines from bed file that refer to off-chromosome places.
 *
 * Reads inFile line by line and writes surviving lines to outFile.  The
 * first three words of each line are taken as chrom, start and end; the
 * remainder of the line is copied through unchanged after a tab.
 *
 * Parameters:
 *   inFile     - input bed file name.
 *   chromSizes - file/URL given to bbiChromSizesFromFile() to build the
 *                chrom -> size hash.
 *   outFile    - output bed file name, opened for writing.
 *
 * Per-line handling:
 *   - fewer than three fields, a non-numeric start/end, or a chrom missing
 *     from chromSizes aborts via errAbort;
 *   - negative start: dropped, or set to 0 when the global trim is set;
 *   - start >= end: dropped;
 *   - end > chromSize: dropped, or end set to chromSize when trim is set.
 *     If truncation leaves start >= end (the item began at or past the end
 *     of the chromosome) the line is dropped rather than written out as an
 *     inverted interval.
 *
 * With verbose level 2 each dropped or truncated line is reported.
 * NOTE: the input lineFile and the chrom size hash are not released here. */
{
struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes);
struct lineFile *lf = lineFileOpen(inFile, TRUE);
FILE *f = mustOpen(outFile, "w");
char *line;
while (lineFileNextReal(lf, &line))
    {
    /* nextWord advances line, so afterwards line points at whatever
     * follows the third field. */
    char *chrom = nextWord(&line);
    char *startString = nextWord(&line);
    char *endString = nextWord(&line);
    if (endString == NULL)
        errAbort("Need at least three fields line %d of %s", lf->lineIx, lf->fileName);
    if (startString[0] == '-')
        {
        if (trim)
            {
            verbose(2, "Truncating negative start line %d of %s: %s:%s-%s\n", lf->lineIx, lf->fileName, chrom, startString, endString);
            startString = "0";
            }
        else
            {
            verbose(2, "Clipping negative line %d of %s: %s:%s-%s\n", lf->lineIx, lf->fileName, chrom, startString, endString);
            continue;		// Clip off negatives
            }
        }
    /* Cast to unsigned char: handing isdigit() a negative char value
     * (a high-bit byte in malformed input) is undefined behavior. */
    if (!isdigit((unsigned char)startString[0]))
        errAbort("Expecting number got %s line %d of %s: %s:%s-%s", startString, lf->lineIx, lf->fileName, chrom, startString, endString);
    if (!isdigit((unsigned char)endString[0]))
        errAbort("Expecting number got %s line %d of %s: %s:%s-%s", endString, lf->lineIx, lf->fileName, chrom, startString, endString);
    int start = sqlUnsigned(startString);
    int end = sqlUnsigned(endString);
    if (start >= end)
        {
        verbose(2, "Clipping end <= start line %d of %s: %s:%s-%s\n", lf->lineIx, lf->fileName, chrom, startString, endString);
        continue;
        }
    struct hashEl *hel = hashLookup(chromSizesHash, chrom);
    if (hel == NULL)
        errAbort("Chromosome %s isn't in %s line %d of %s: %s:%s-%s\n", chrom, chromSizes, lf->lineIx, lf->fileName, chrom, startString, endString);
    /* The hash value carries the chromosome size as an int packed in a pointer. */
    int chromSize = ptToInt(hel->val);
    if (end > chromSize)
        {
        if (trim)
            {
            end = chromSize;
            verbose(2, "Truncating end > chromSize(%d) line %d of %s: %s:%s-%s\n", chromSize, lf->lineIx, lf->fileName, chrom, startString, endString);
            /* An item lying entirely beyond the chromosome end has nothing
             * left after truncation; writing it would produce start >= end. */
            if (start >= end)
                {
                verbose(2, "Clipping start >= chromSize(%d) line %d of %s: %s:%s-%s\n", chromSize, lf->lineIx, lf->fileName, chrom, startString, endString);
                continue;
                }
            }
        else
            {
            verbose(2, "Clipping end > chromSize(%d) line %d of %s: %s:%s-%s\n", chromSize, lf->lineIx, lf->fileName, chrom, startString, endString);
            continue;
            }
        }
    fprintf(f, "%s\t%d\t%d", chrom, start, end);
    /* Copy any fields beyond the first three through verbatim. */
    line = skipLeadingSpaces(line);
    if (line == NULL || line[0] == 0)
        fputc('\n', f);
    else
        fprintf(f, "\t%s\n", line);
    }
carefulClose(&f);
}

int main(int argc, char *argv[])
/* Parse the command line, record the -truncate setting, and run bedClip. */
{
optionInit(&argc, argv, options);

/* Exactly three positional arguments are expected after the program name. */
if (argc != 4)
    usage();

trim = optionExists("truncate");

char *inputBed = argv[1];
char *chromSizesFile = argv[2];
char *outputBed = argv[3];
bedClip(inputBed, chromSizesFile, outputBed);
return 0;
}
