/* fixEnsGtf - fix space vs. tab and start/stop codon problems in Ensembl .gtf file. */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"


void usage()
/* Print the command-line synopsis through errAbort, which ends the program. */
{
static char *synopsis =
    "fixEnsGtf - fix space vs. tab and start/stop codon problems in Ensemble .gtf file\n"
    "usage:\n"
    "   fixEnsGtf out.gtf in.gtf(s)\n";

/* Pass the text as an argument so it is emitted verbatim. */
errAbort("%s", synopsis);
}

void fixLine(struct lineFile *lf, char *line, FILE *f)
/* Fix up a single line and write the result to f.
 *   lf   - input file the line came from; used only for error reporting.
 *   line - the line text; it is modified in place while being parsed.
 *   f    - output stream.
 * Lines beginning with '#' are written through unchanged.  Any other line
 * must have eight whitespace-separated fields followed by a non-empty
 * "group" field, otherwise the program aborts.  start_codon and stop_codon
 * lines are dropped.  Remaining lines are written with the first eight
 * fields tab-separated, followed by the group field exactly as it was read. */
{
char *group;            /* Last (ninth) field, kept verbatim. */
char *words[8];		/* First eight fields. */
int i;
int wordCount;          /* Number of fields chopLine actually found. */
char *type;
#ifdef FLAKY
char fixStart[16], fixEnd[16];
char *strand;
#endif /* FLAKY */

/* Pass through comments. */
if (line[0] == '#')
    {
    fprintf(f, "%s\n", line);
    return;
    }

/* Find the start of the "group" field. */
group = line;
for (i=0; i<8; ++i)
    {
    group = skipToSpaces(group);
    if (group == NULL)
       errAbort("Expecting at least 9 fields line %d of %s\n", lf->lineIx, lf->fileName);
    group = skipLeadingSpaces(group);
    }

/* Eight fields followed only by trailing white space leave group pointing
 * at the end of the string; that is still a line without a ninth field. */
if (group[0] == 0)
    errAbort("Expecting at least 9 fields line %d of %s\n", lf->lineIx, lf->fileName);

/* Truncate initial string before group field and chop it up.  group[-1] is
 * the last white space character skipped by the loop above. */
group[-1] = 0;
wordCount = chopLine(line, words);

/* If the line began with white space the loop above counted one separator
 * too many, so fewer than eight words come back and the tail of words[]
 * would be uninitialized.  Refuse such lines instead of printing garbage. */
if (wordCount != 8)
    errAbort("Expecting at least 9 fields line %d of %s\n", lf->lineIx, lf->fileName);

#ifdef FLAKY    /* This doesn't fix all problems, we'll just ignore start/stop_codons. */
/* Fix up start and stop codons. */
type = words[2];
strand = words[6];
if (sameString(type, "start_codon") && sameString(strand, "-"))
    {
    sprintf(fixStart, "%d", atoi(words[3])-3);
    sprintf(fixEnd, "%d", atoi(words[4])-3);
    words[3] = fixStart;
    words[4] = fixEnd;
    }
else if (sameString(type, "stop_codon"))
    {
    /* Start and end reversed on both strands. */
    int start = atoi(words[4]);
    int end = atoi(words[3]);
    if (sameString(strand, "-"))
        {
	start += 3;
	end += 3;
	}
    sprintf(fixStart, "%d", start);
    sprintf(fixEnd, "%d", end);
    words[3] = fixStart;
    words[4] = fixEnd;
    }
#endif /* FLAKY */

/* Skip start/stop codons.  Code will then assume all exons are CDS. */
type = words[2];
if (sameString(type, "start_codon") || sameString(type, "stop_codon"))
    return;

/* Write fixed output. */
for (i=0; i<8; ++i)
    fprintf(f, "%s\t", words[i]);
fprintf(f, "%s\n", group);
}

void fixEnsGtf(char *outName, int inCount, char *inFiles[])
/* fixEnsGtf - fix space vs. tab and start/stop codon problems in Ensembl .gtf file.
 *   outName - path of the output file, opened for writing.
 *   inCount - number of entries in inFiles.
 *   inFiles - input .gtf paths, processed in order.
 * Every line of every input is passed through fixLine, and all results are
 * written to the single output file. */
{
FILE *f = mustOpen(outName, "w");
struct lineFile *lf;
int fileIx;
int lineSize;
char *line;

for (fileIx = 0; fileIx < inCount; ++fileIx)
    {
    lf = lineFileOpen(inFiles[fileIx], TRUE);
    printf("Processing %s\n", lf->fileName);
    while (lineFileNext(lf, &line, &lineSize))
        {
	fixLine(lf, line, f);
	}
    lineFileClose(&lf);
    }

/* Close the output explicitly so buffered data is flushed and the handle is
 * not leaked.  carefulClose is the kent library counterpart of mustOpen and
 * is expected to report a failed close rather than ignore it. */
carefulClose(&f);
}

int main(int argc, char *argv[])
/* Entry point: argv[1] is the output file, every later argument is an input. */
{
int inCount = argc - 2;         /* Arguments remaining after program name and output. */
char **inFiles = &argv[2];

/* Need an output name plus at least one input; usage() does not return. */
if (inCount < 1)
    usage();

fixEnsGtf(argv[1], inCount, inFiles);
return 0;
}
