/* gff - routines to read many types of gff and gtf files
 * and turn them into a relatively easy to deal with form
 * in memory.
 *
 * This file is copyright 2002 Jim Kent, but license is hereby
 * granted for all use - public, private or commercial. */

#include "common.h"
#include "hash.h"
#include "linefile.h"
#include "gff.h"
#include "obscure.h"
#include "dystring.h"


void gffGroupFree(struct gffGroup **pGroup)
/* Release a single gffGroup along with every gffLine on its lineList.
 * pGroup is the address of the caller's pointer; nothing happens when
 * that pointer is already NULL. */
{
struct gffGroup *doomed = *pGroup;
if (doomed == NULL)
    return;
/* The lines go first, then the group itself is handed to freez() through
 * the caller's pointer. */
slFreeList(&doomed->lineList);
freez(pGroup);
}

void gffGroupFreeList(struct gffGroup **pList)
/* Free every gffGroup on the list (and each group's lines via
 * gffGroupFree), then set *pList to NULL. */
{
struct gffGroup *cur = *pList;
while (cur != NULL)
    {
    /* Remember the successor before cur is released. */
    struct gffGroup *following = cur->next;
    gffGroupFree(&cur);
    cur = following;
    }
*pList = NULL;
}


void gffFileFree(struct gffFile **pGff)
/* Release a gffFile and everything hanging off it: the file name, the
 * lookup hashes, the string pool, and the line/seq/source/feature/geneId/
 * group lists.  pGff is the address of the caller's pointer; a NULL
 * *pGff is ignored. */
{
struct gffFile *doomed = *pGff;
if (doomed == NULL)
    return;

freeMem(doomed->fileName);

/* Lookup tables and the shared string pool. */
freeHash(&doomed->seqHash);
freeHash(&doomed->sourceHash);
freeHash(&doomed->featureHash);
freeHash(&doomed->groupHash);
freeHash(&doomed->geneIdHash);
freeHash(&doomed->strPool);

/* Plain singly-linked lists of records. */
slFreeList(&doomed->lineList);
slFreeList(&doomed->seqList);
slFreeList(&doomed->sourceList);
slFreeList(&doomed->featureList);
slFreeList(&doomed->geneIdList);

/* Groups own a list of lines each, so they need their own free routine. */
gffGroupFreeList(&doomed->groupList);

freez(pGff);
}

static char *gffFileGetStr(struct gffFile *gff, char *str)
/* get a string from the string pool */
{
return hashStore(gff->strPool,  str)->name;
}

int gffLineCmp(const void *va, const void *vb)
/* Compare two gffLines for sorting.  va and vb each point at a
 * (struct gffLine *), as handed out by slSort/qsort style callers.
 * Order is by seq name (strcmp), then start, then end.
 * Returns <0, 0 or >0. */
{
const struct gffLine *a = *((struct gffLine *const *)va);
const struct gffLine *b = *((struct gffLine *const *)vb);
int diff = strcmp(a->seq, b->seq);

if (diff != 0)
    return diff;

/* For overlapping starts, sort by end; genePredFromGroupedGtf() depends on
 * this.  Coordinates are compared directly rather than subtracted so the
 * result can neither overflow nor be truncated when squeezed into an int. */
if (a->start != b->start)
    return (a->start < b->start) ? -1 : 1;
if (a->end != b->end)
    return (a->end < b->end) ? -1 : 1;
return 0;
}


static void gffSyntaxError(char *fileName, int line, char *msg)
/* Report a malformed GFF line through errAbort.  The message is msg
 * followed by the line number and the file name. */
{
errAbort("%s Bad line %d of %s:\n",
	 msg, line, fileName);
}

static char *gffTnName(char *seqName, char *groupName)
/* Make the name used to key a group.  A leading "gene-" and then a
 * leading "cc_" are stripped from groupName, in that order.  seqName is
 * accepted but not used by the current implementation.
 * The result lives in a function-static dyString that is reused on every
 * call, so copy it (or hash it) before calling again. */
{
static struct dyString *nameBuf = NULL;
char *stripped = groupName;

if (nameBuf == NULL)
    nameBuf = dyStringNew(0);
dyStringClear(nameBuf);

if (startsWith("gene-", stripped))
    stripped += 5;	/* strlen("gene-") */
if (startsWith("cc_", stripped))
    stripped += 3;	/* strlen("cc_") */

dyStringAppend(nameBuf, stripped);
return nameBuf->string;
}

static boolean isGtfGroup(char *group)
/* Return TRUE if the group field looks like GTF attributes: it must
 * mention gene_id, and additionally either contain at least two double
 * quotes or mention transcript_id. */
{
boolean hasGeneId = (strstr(group, "gene_id") != NULL) ? TRUE : FALSE;
if (!hasGeneId)
    return FALSE;
return (countChars(group, '"') >= 2 || strstr(group, "transcript_id") != NULL)
	? TRUE : FALSE;
}

boolean gffHasGtfGroup(char *line)
/* Return TRUE if the tab-separated line has a ninth column that looks
 * like a GTF group field.  The line is chopped on a private copy, so the
 * caller's buffer is left untouched. */
{
char *fields[10];
char *copy = cloneString(line);
int fieldCount = chopTabs(copy, fields);
boolean looksGtf = (fieldCount >= 9 && isGtfGroup(fields[8])) ? TRUE : FALSE;
freeMem(copy);
return looksGtf;
}

static void readQuotedString(char *fileName, int lineIx, char *in, char *out, char **retNext)
/* Wrapper around parseQuotedString(in, out, retNext) that aborts, naming
 * the line number and file, when the parse reports failure. */
{
if (parseQuotedString(in, out, retNext))
    return;
errAbort("Line %d of %s\n", lineIx, fileName);
}

static void parseGtfEnd(char *s, struct gffFile *gff, struct gffLine *gl, 
    char *fileName, int lineIx)
/* Read the semi-colon separated end bits of a GTF line into gl and
 * hashes.
 *   s        - attribute column, "type value; type value; ...".  It is
 *              modified in place (e.g. a trailing ';' on a bare value is
 *              overwritten with NUL, quoted values are decoded over
 *              themselves).
 *   gff      - file being built; new gene ids and transcript groups are
 *              added to its geneIdHash/geneIdList and groupHash/groupList,
 *              other recognised values go into its string pool.
 *   gl       - line being filled in.  gl->seq and gl->source must already
 *              be set since they are copied into a newly created group.
 *   fileName, lineIx - used only in error messages.
 * Only the first gene_id and the first transcript_id on a line are used.
 * Attribute types not listed below are skipped.  Aborts (errAbort) on a
 * type without a value, an empty value, a bad quoted string, or an
 * exon_number that does not start with a digit. */
{
char *type, *val;
struct hashEl *hel;
bool gotSemi;

for (;;)
   {
   gotSemi = FALSE;
   /* Attribute name; running out of words ends the loop. */
   if ((type = nextWord(&s)) == NULL)
       break;
   s = skipLeadingSpaces(s);
   if (NULL == s || s[0] == 0)
       errAbort("Unpaired type(%s)/val on end of gtf line %d of %s", type, lineIx, fileName);
   if (s[0] == '"' || s[0] == '\'')
       {
       /* Quoted value: decoded in place, s is advanced by the parser. */
       val = s;
       readQuotedString(fileName, lineIx, s, val, &s);
       }
   else
       {
       /* Bare value: take the next word and peel off a ';' glued to its
	* end.  s is known to be non-empty here, so val is presumably never
	* NULL. */
       int len;
       val = nextWord(&s);
       len = strlen(val) - 1;
       if (val[len] == ';')
	   {
	   val[len] = 0;
	   len -= 1;
           gotSemi = TRUE;
	   }
       if (len < 0)
           errAbort("Empty value for %s line %d of %s", type, lineIx, fileName);
       }
   /* Unless the terminating ';' was already consumed with the value, skip
    * past the next one.  s becomes NULL when there is none, which ends the
    * loop on the next nextWord() call. */
   if (s != NULL && !gotSemi)
      {
      s = strchr(s, ';');
      if (s != NULL)
         ++s;
      }
   /* only use the first occurrence of gene_id and transcript_id */
   if (sameString("gene_id", type) && (gl->geneId == NULL))
       {
       struct gffGeneId *gg;
       if ((hel = hashLookup(gff->geneIdHash, val)) == NULL)
	   {
	   AllocVar(gg);
           hel = hashAdd(gff->geneIdHash, val, gg);
	   gg->name = hel->name;
	   slAddHead(&gff->geneIdList, gg);
	   }
	else
	   {
	   gg = hel->val;
	   }
       gl->geneId = gg->name;
       }
   else if (sameString("transcript_id", type) && (gl->group == NULL))
       {
       struct gffGroup *gg;
       if ((hel = hashLookup(gff->groupHash, val)) == NULL)
	   {
	   /* First line seen for this transcript: create its group. */
	   AllocVar(gg);
           hel = hashAdd(gff->groupHash, val, gg);
	   gg->name = hel->name;
	   gg->seq = gl->seq;
	   gg->source = gl->source;
	   slAddHead(&gff->groupList, gg);
	   }
	else
	   {
	   gg = hel->val;
	   }
       gl->group = gg->name;
       }
   else if (sameString("exon_id", type))
       gl->exonId = gffFileGetStr(gff, val);
   else if (sameString("exon_number", type))
       {
       /* Cast: passing a negative plain char to isdigit() is undefined. */
       if (!isdigit((unsigned char)val[0]))
           errAbort("Expecting number after exon_number, got %s line %d of %s", val, lineIx, fileName);
       gl->exonNumber = atoi(val);
       }
   else if (sameString("intron_id", type))
       gl->intronId = gffFileGetStr(gff, val);
   else if (sameString("intron_status", type))
       gl->intronStatus = gffFileGetStr(gff, val);
   else if (sameString("protein_id", type))
       gl->proteinId = gffFileGetStr(gff, val);
   else if (sameString("gene_name", type))
       gl->geneName = gffFileGetStr(gff, val);
   else if (sameString("transcript_name", type))
       gl->transcriptName = gffFileGetStr(gff, val);
   else if (sameString("gene_version", type))
       gl->geneVersion = gffFileGetStr(gff, val);
   else if (sameString("transcript_version", type))
       gl->transcriptVersion = gffFileGetStr(gff, val);
   else if (sameString("protein_version", type))
       gl->proteinVersion = gffFileGetStr(gff, val);
   else if (sameString("gene_type", type) || sameString("gene_biotype", type))
       gl->geneType = gffFileGetStr(gff, val);
   else if (sameString("transcript_type", type)|| sameString("transcript_biotype", type))
       gl->transcriptType = gffFileGetStr(gff, val);
   }
}

void gffFileAddRow(struct gffFile *gff, int baseOffset, char *words[], int wordCount, 
    char *fileName, int lineIx)
/* Process one row of GFF file (a non-comment line parsed by tabs normally).
 *   gff        - file being built; the new line is pushed on gff->lineList
 *                and any new seq/source/feature/group names are recorded
 *                in the matching hashes and lists.
 *   baseOffset - added to both start and end.
 *   words      - columns: seq, source, feature, start, end, score, strand,
 *                frame and, optionally, the group/attribute column.
 *   wordCount  - number of entries in words; fewer than 8 is an error.
 *   fileName, lineIx - used only in error messages.
 * The file's start column is decremented by one before storing; end is
 * stored as given.  Whether the file is GTF is decided once, from the
 * first row that has a ninth column.  Aborts via gffSyntaxError on a
 * short row or when start/end do not begin with a digit. */
{
struct hashEl *hel;
struct gffLine *gl;

if (wordCount < 8)
    gffSyntaxError(fileName, lineIx, "Word count less than 8 ");
AllocVar(gl);

/* Sequence name: look up, adding on first sight. */
if ((hel = hashLookup(gff->seqHash, words[0])) == NULL)
    {
    struct gffSeqName *el;
    AllocVar(el);
    hel = hashAdd(gff->seqHash, words[0], el);
    el->name = hel->name;
    slAddHead(&gff->seqList, el);
    }
gl->seq = hel->name;

/* Source. */
if ((hel = hashLookup(gff->sourceHash, words[1])) == NULL)
    {
    struct gffSource *el;
    AllocVar(el);
    hel = hashAdd(gff->sourceHash, words[1], el);
    el->name = hel->name;
    slAddHead(&gff->sourceList, el);
    }
gl->source = hel->name;

/* Feature, with a running count of rows per feature. */
if ((hel = hashLookup(gff->featureHash, words[2])) == NULL)
    {
    struct gffFeature *el;
    AllocVar(el);
    hel = hashAdd(gff->featureHash, words[2], el);
    el->name = hel->name;
    slAddHead(&gff->featureList, el);
    }
struct gffFeature *feature = hel->val;
feature->count += 1;
gl->feature = hel->name;

/* Casts: passing a negative plain char to isdigit() is undefined. */
if (!isdigit((unsigned char)words[3][0]) || !isdigit((unsigned char)words[4][0]))
   gffSyntaxError(fileName, lineIx, "col 3 or 4 not a number ");	
gl->start = atol(words[3])-1 + baseOffset;
gl->end = atol(words[4]) + baseOffset;
gl->score = atof(words[5]);
gl->strand = words[6][0];
gl->frame = words[7][0];

if (wordCount >= 9)
    {
    /* Sniff GTF vs. plain GFF once, from the first row with a group column. */
    if (!gff->typeKnown)
	{
	gff->typeKnown = TRUE;
	gff->isGtf = isGtfGroup(words[8]);
	}
    if (gff->isGtf)
	{
	parseGtfEnd(words[8], gff, gl, fileName, lineIx);
	}
    else
	{
	/* Plain GFF: the whole (trimmed) column names the group.  tnName
	 * points into a buffer reused by gffTnName; the hash element's name
	 * is what gets stored. */
	char *tnName = gffTnName(gl->seq, trimSpaces(words[8]));
	if ((hel = hashLookup(gff->groupHash, tnName)) == NULL)
	    {
	    struct gffGroup *group;
	    AllocVar(group);
	    hel = hashAdd(gff->groupHash, tnName, group);
	    group->name = hel->name;
	    group->seq = gl->seq;
	    group->source = gl->source;
	    slAddHead(&gff->groupList, group);
	    }
	gl->group = hel->name;
	}
    }
slAddHead(&gff->lineList, gl);
}


void gffFileAdd(struct gffFile *gff, char *fileName, int baseOffset)
/* Read the GFF file fileName and add each of its rows to gff through
 * gffFileAddRow, passing baseOffset along.  Lines beginning with '#' and
 * lines that yield no tab-separated words are skipped.  Once the file is
 * read, each of gff's lists is reversed (rows are pushed on the head as
 * they are read). */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line;
char *fields[9];
int lineSize;

while (lineFileNext(lf, &line, &lineSize))
    {
    int fieldCount;
    if (line[0] == '#')
	continue;
    fieldCount = chopTabs(line, fields);
    if (fieldCount <= 0)
	continue;
    gffFileAddRow(gff, baseOffset, fields, fieldCount, lf->fileName, lf->lineIx);
    }

slReverse(&gff->lineList);
slReverse(&gff->seqList);
slReverse(&gff->sourceList);
slReverse(&gff->featureList);
slReverse(&gff->groupList);
slReverse(&gff->geneIdList);

lineFileClose(&lf);
}

struct gffFile *gffFileNew(char *fileName)
/* Allocate an empty gffFile.  fileName is copied into the structure, and
 * the seq, source, feature, group and geneId hashes plus the string pool
 * are created.  No file is read here. */
{
struct gffFile *fresh;
AllocVar(fresh);

fresh->fileName = cloneString(fileName);

/* The argument to newHash sizes each table. */
fresh->seqHash     = newHash(18);
fresh->sourceHash  = newHash(6);
fresh->featureHash = newHash(6);
fresh->groupHash   = newHash(16);
fresh->geneIdHash  = newHash(16);
fresh->strPool     = newHash(20);

return fresh;
}

struct gffFile *gffRead(char *fileName)
/* Convenience wrapper: make a new gffFile for fileName and load the
 * file's rows into it with a base offset of zero. */
{
struct gffFile *loaded = gffFileNew(fileName);
gffFileAdd(loaded, fileName, 0);
return loaded;
}

static void getGroupBoundaries(struct gffGroup *group)
/* Fill in start, end, strand of group from lines.
 * strand is taken from the first line on group->lineList; start is the
 * smallest line start and end the largest line end.  A group with no
 * lines is left untouched. */
{
struct gffLine *line = group->lineList;
long start, end;

if (line == NULL)
    return;

group->strand = line->strand;

/* Seed the extremes from the first line rather than from a -1 sentinel:
 * a line can legitimately carry a negative coordinate (a start of 0 in
 * the file is stored as -1), which a sentinel test would mistake for
 * "not yet set". */
start = line->start;
end = line->end;
for (line = line->next; line != NULL; line = line->next)
    {
    if (line->start < start)
	start = line->start;
    if (line->end > end)
	end = line->end;
    }
group->start = start;
group->end = end;
}

void gffGroupLines(struct gffFile *gff)
/* Distribute the lines on gff->lineList to their groups.  Every line with
 * a group name is moved onto the lineList of the gffGroup found under that
 * name in gff->groupHash; lines with no group stay on gff->lineList, in
 * their original order.  Afterwards each group's lines are sorted with
 * gffLineCmp and the group's start, end and strand are filled in. */
{
struct hash *groupHash = gff->groupHash;
struct gffLine *leftovers = NULL;
struct gffLine *cur = gff->lineList;
struct gffGroup *grp;

while (cur != NULL)
    {
    struct gffLine *following = cur->next;	/* cur->next is about to be rewritten */
    if (cur->group == NULL)
	{
	slAddHead(&leftovers, cur);
	}
    else
	{
	struct hashEl *hel = hashLookup(groupHash, cur->group);
	grp = hel->val;
	slAddHead(&grp->lineList, cur);
	}
    cur = following;
    }

/* Head insertion reversed the ungrouped lines; put them back in order. */
slReverse(&leftovers);
gff->lineList = leftovers;

/* Order each group's lines and work out the group's extent. */
for (grp = gff->groupList; grp != NULL; grp = grp->next)
    {
    slSort(&grp->lineList, gffLineCmp);
    getGroupBoundaries(grp);
    }
}

static void gffOutputQuote(FILE *f, char sep)
/* Write the double quote that wraps text columns in comma-separated
 * output; write nothing for any other separator. */
{
if (sep == ',')
    fputc('"', f);
}

static void gffOutputQuotedStr(FILE *f, char sep, char *text)
/* Write a text column, quoted when the separator is a comma. */
{
gffOutputQuote(f, sep);
fprintf(f, "%s", text);
gffOutputQuote(f, sep);
}

static void gffOutputQuotedChar(FILE *f, char sep, char c)
/* Write a single-character column, quoted when the separator is a comma. */
{
gffOutputQuote(f, sep);
fprintf(f, "%c", c);
gffOutputQuote(f, sep);
}

static void gffOutputAttr(FILE *f, char sep, char *tag, char *val)
/* Write one 'tag "val"; ' attribute.  In comma-separated output the inner
 * quotes are preceded by a backslash because the whole attribute column is
 * itself wrapped in quotes.  A NULL val writes nothing. */
{
char *esc = (sep == ',') ? "\\" : "";
if (val != NULL)
    fprintf(f, "%s %s\"%s%s\"; ", tag, esc, val, esc);
}

void gffOutput(struct gffLine *el, FILE *f, char sep, char lastSep) 
/* Print out GTF.  Separate fields with sep. Follow last field with lastSep.
 * When sep is ',' the text columns (seq, source, feature, strand, frame
 * and the attribute column) are wrapped in double quotes.  start is
 * printed plus one; end is printed as stored.  The attribute column holds
 * gene_id, transcript_id and exon_id, each written only when the
 * corresponding field of el is non-NULL (a line may have no group, e.g.
 * when it had no attribute column or no transcript_id). */
{
gffOutputQuotedStr(f, sep, el->seq);
fputc(sep, f);
gffOutputQuotedStr(f, sep, el->source);
fputc(sep, f);
gffOutputQuotedStr(f, sep, el->feature);
fputc(sep, f);

/* NOTE(review): "%lu" presumes start/end are unsigned long in gff.h;
 * confirm the field types match this conversion. */
fprintf(f, "%lu", el->start+1);
fputc(sep, f);
fprintf(f, "%lu", el->end);
fputc(sep, f);
fprintf(f, "%f", el->score);
fputc(sep, f);

gffOutputQuotedChar(f, sep, el->strand);
fputc(sep, f);
gffOutputQuotedChar(f, sep, el->frame);
fputc(sep, f);

/* Attribute column. */
gffOutputQuote(f, sep);
gffOutputAttr(f, sep, "gene_id", el->geneId);
gffOutputAttr(f, sep, "transcript_id", el->group);
gffOutputAttr(f, sep, "exon_id", el->exonId);
gffOutputQuote(f, sep);

fputc(lastSep, f);
}

