/* geniegff - make up a genigene.gdf file from genie/N.gff */
#include "common.h"
#include "hash.h"
#include "sig.h"

/* Input GFF file names, one per chromosome.  main() passes entry i to
 * procOne() together with i as the chromosome index, so the order here
 * must stay in step with chromNames. */
static char *inNames[] = {
    "I.gff", "II.gff", "III.gff",
    "IV.gff", "V.gff", "X.gff",
};

/* Chromosome names as printed in the c2g output, indexed by a gene's
 * chromIx.  Entry i corresponds to input file inNames[i]. */
static char *chromNames[] = {
    "i",
    "ii",
    "iii",
    "iv",
    "v",
    "x",
};

struct exon
/* One CDS interval of a gene, kept on a singly-linked list. */
    {
    struct exon *next;   /* Next exon in list, NULL at end. */
    int start;           /* Start as parsed from the 4th word of the GFF line. */
    int end;             /* End as parsed from the 5th word of the GFF line. */
    };

struct gene
/* One predicted gene: a name, a strand, and a list of exons. */
    {
    struct gene *next;      /* Next gene in list, NULL at end. */
    char *name;             /* Gene name, a copy of the 9th word of the GFF line. */
    struct exon *exons;     /* Exons in the order they appeared in the input. */
    int start;              /* Smallest exon start; filled in by procOne(). */
    int end;                /* Largest exon end; filled in by procOne(). */
    UBYTE chromIx;          /* Index into chromNames. */
    char strand;            /* First character of the 7th word of the GFF line. */
    };

int cmpGenes(const void *va, const void *vb)
/* Comparison callback handed to slSort(): orders genes by start, and by
 * end when the starts are equal.  va and vb each point at a
 * 'struct gene *'.  Returns negative, zero or positive in the usual
 * comparator convention. */
{
const struct gene *a = *((struct gene *const *)va);
const struct gene *b = *((struct gene *const *)vb);
int dif = a->start - b->start;
/* Break ties on end.  The test must be on dif: testing the pointer 'a'
 * against 0 is never true for a real list element, which left genes
 * with equal starts in unspecified relative order. */
if (dif == 0)
    dif = a->end - b->end;
return dif;
}

void writeShortString(FILE *f, char *s)
/* Write s to f as a UBYTE character count followed by that many
 * characters of s (no terminating zero is written).
 *
 * The count is a UBYTE, so at most 255 characters can be represented.
 * Longer strings are truncated to their first 255 characters and a
 * warning is printed on stderr; previously the length silently wrapped
 * modulo 256, so e.g. a 256-character name was written as empty. */
{
size_t len = strlen(s);
UBYTE count;

if (len > 255)
    {
    fprintf(stderr, "writeShortString: truncating %lu character string to 255: %s\n",
            (unsigned long)len, s);
    len = 255;
    }
/* NOTE(review): writeOne presumably writes sizeof(count) bytes, so count
 * must stay a UBYTE to keep the file layout - confirm against common.h. */
count = (UBYTE)len;
writeOne(f, count);
mustWrite(f, s, count);
}

void writeGene(struct gene *gene, FILE *c2g, FILE *gl)
/* Emit one gene to both outputs.
 *   c2g - text file; gets one line "chrom:start-end strand name".
 *   gl  - binary file; gets the name as a short string, the chromosome
 *         index, the strand, the number of exon end points (two per
 *         exon), then start and end of every exon in list order.
 * Starts are written as start-1 in both files; ends are written as is.
 *
 * NOTE(review): writeOne presumably writes sizeof(var) bytes of its
 * argument, so the declared types of pointCount (short) and start (int)
 * determine the binary layout - do not change them casually. */
{
struct exon *ex = gene->exons;
short pointCount;

fprintf(c2g, "%s:%d-%d %c %s\n", chromNames[gene->chromIx],
        gene->start-1, gene->end, gene->strand, gene->name);

writeShortString(gl, gene->name);
writeOne(gl, gene->chromIx);
writeOne(gl, gene->strand);

pointCount = slCount(gene->exons) * 2;
writeOne(gl, pointCount);

while (ex != NULL)
    {
    int start = ex->start - 1;
    writeOne(gl, start);
    writeOne(gl, ex->end);
    ex = ex->next;
    }
}

void procOne(char *inName, UBYTE chromIx, FILE *c2g, FILE *gl)
{
FILE *in = mustOpen(inName, "r");
struct gene *geneList = NULL, *g = NULL;
struct exon *exon;
char line[1024];
int lineCount = 0;
char *words[256];
int wordCount;
char *type;
char *geneName;
char *lastName = "";
struct hash *hash = newHash(12);

printf("Processing %s\n", inName);
while (fgets(line, sizeof(line), in))
    {
    ++lineCount;
    wordCount = chopLine(line, words);
    if (wordCount > 0)
        {
        if (wordCount < 9)
            errAbort("Short line %d of %s\n", lineCount, inName);
        type = words[2];
        if (differentString(type, "CDS"))
            {
            errAbort("Expecting CDS got %s in type field line %d of %s\n",
                type, lineCount, inName);
            }
        geneName = words[8];
        if (differentString(lastName, geneName) )
            {
            if (hashLookup(hash, geneName))
                errAbort("Repeating %s\n", geneName);
            hashAdd(hash, geneName, NULL);
            AllocVar(g);
            g->name = lastName = cloneString(geneName);
            g->strand = words[6][0];
            g->chromIx = chromIx;
            g->exons = NULL;
            slAddHead(&geneList, g);
            }
        AllocVar(exon);
        exon->start = atoi(words[3]);
        exon->end = atoi(words[4]);
        slAddTail(&g->exons, exon);
        }
    }
slReverse(&geneList);
slSort(&geneList, cmpGenes);
for (g=geneList; g != NULL; g=g->next)
    {
    int min = 0x7fffffff;
    int max = -min;
    for (exon = g->exons; exon != NULL; exon = exon->next)
        {
        if (min > exon->start)
            min = exon->start;
        if (max < exon->end)
            max = exon->end;
        }
    g->start = min;
    g->end = max;
    }
for (g = geneList; g != NULL; g=g->next)
    writeGene(g, c2g, gl);
fclose(in);
}


int main(int argc, char *argv[])
/* Entry point.  Expects exactly two arguments: the binary gdf output
 * file and the text c2g output file.  Writes the glSig signature at the
 * start of the gdf file, then processes every file in inNames in order,
 * using the array index as the chromosome index. */
{
FILE *gdfFile, *c2gFile;
bits32 sig = glSig;
int chromCount = ArraySize(inNames);
int i = 0;

if (argc != 3)
    {
    errAbort("geniegff - makes up a gdf file from Genie gene predictions\n"
             "usage:\n"
             "     geniegff genigene.gdf c2gFile\n"
             "This must be run in the same directory as I.gff, II.gff, etc.\n"
             "generated by Genie\n");
    }

/* Open the outputs in the same order as the arguments: gdf, then c2g. */
gdfFile = mustOpen(argv[1], "wb");
c2gFile = mustOpen(argv[2], "w");

writeOne(gdfFile, sig);
while (i < chromCount)
    {
    procOne(inNames[i], (UBYTE)i, c2gFile, gdfFile);
    ++i;
    }
return 0;
}

