/*
 * Object for accessing GFF3 files
 * See GFF3 specification for details of file format:
 *   http://www.sequenceontology.org/gff3.shtml
 */

/* Copyright (C) 2014 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "gff3.h"
#include <limits.h>
#include "errAbort.h"
#include "localmem.h"
#include "hash.h"
#include "linefile.h"
#include "dystring.h"
#include "fa.h"

// FIXME: spec unclear if attributes can be specified multiple times
// FIXME: spec unclear on attribute of discontinuous features.
// FIXME: should spaces be striped from attributes?

static const int gffNumCols = 9;

/* standard attribute names */
char *gff3AttrID = "ID";
char *gff3AttrName = "Name";
char *gff3AttrAlias = "Alias";
char *gff3AttrParent = "Parent";
char *gff3AttrTarget = "Target";
char *gff3AttrGap = "Gap";
char *gff3AttrDerivesFrom = "Derives_from";
char *gff3AttrNote = "Note";
char *gff3AttrDbxref = "Dbxref";
char *gff3AttrOntologyTerm = "Ontology_term";
char *gff3AttrIsCircular = "Is_circular";

/* commonly used features names */
char *gff3FeatGene = "gene";
char *gff3FeatPseudogene = "pseudogene";
char *gff3FeatNCRnaGene ="ncRNA_gene";
char *gff3FeatNCRna ="ncRNA";
char *gff3FeatRRna = "rRNA";
char *gff3FeatTRna = "tRNA";
char *gff3FeatMRna = "mRNA";
char *gff3FeatLncRna = "lnc_RNA";
char *gff3FeatPseudogenicTranscript = "pseudogenic_transcript";
char *gff3FeatScRna = "scRNA";
char *gff3FeatSnRna = "snRNA";
char *gff3FeatSnoRna = "snoRNA";
char *gff3FeatUnconfirmedTranscript = "unconfirmed_transcript";
char *gff3FeatExon = "exon";
char *gff3FeatCDS = "CDS";
char *gff3FeatUTR = "UTR";
char *gff3FeatThreePrimeUTR = "three_prime_UTR";
char *gff3FeatFivePrimeUTR = "five_prime_UTR";
char *gff3FeatStartCodon = "start_codon";
char *gff3FeatStopCodon = "stop_codon";
char *gff3FeatTranscript = "transcript";
char *gff3FeatPrimaryTranscript = "primary_transcript";
char *gff3FeatCGeneSegment = "C_gene_segment";
char *gff3FeatDGeneSegment = "D_gene_segment";
char *gff3FeatJGeneSegment = "J_gene_segment";
char *gff3FeatVGeneSegment = "V_gene_segment";

static bool gff3FileStopDueToErrors(struct gff3File *g3f)
/* determine if we should stop due to the number of errors */
{
return g3f->errCnt > g3f->maxErr;
}

static void gff3FileErr(struct gff3File *g3f, bool canWarn, char *format, ...)
#if defined(__GNUC__)
__attribute__((format(printf, 3, 4)))
#endif
;

static void gff3AnnErr(struct gff3Ann *g3a, bool canWarn, char *format, ...)
#if defined(__GNUC__)
__attribute__((format(printf, 3, 4)))
#endif
;

static void vaGff3FileErr(struct gff3File *g3f, bool canWarn, char *format, va_list args)
/* Print error message to error file, abort if max errors have been reached */
{
bool isWarning = canWarn && (g3f->flags & GFF3_WARN_WHEN_POSSIBLE);
if (g3f->lf != NULL)
    fprintf(g3f->errFh, "%s:%d: ", g3f->lf->fileName, g3f->lf->lineIx);
if (isWarning)
    fprintf(g3f->errFh, "WARNING: ");
vfprintf(g3f->errFh, format, args);
fprintf(g3f->errFh, "\n");
if (!isWarning)
    {
    g3f->errCnt++;
    if (gff3FileStopDueToErrors(g3f))
        errAbort("GFF3: %d parser errors", g3f->errCnt);
    }
}

static void gff3FileErr(struct gff3File *g3f, bool canWarn, char *format, ...)
/* Print error message and abort */
{
va_list args;
va_start(args, format);
vaGff3FileErr(g3f, canWarn, format, args);
va_end(args);
}

static void gff3AnnErr(struct gff3Ann *g3a, bool canWarn, char *format, ...)
/* Print error message abort */
{
va_list args;
va_start(args, format);
vaGff3FileErr(g3a->file, canWarn, format, args);
va_end(args);
}

static int gff3FileStrToInt(struct gff3File *g3f, char *str)
/* convert a string to an integer */
{
char *end;
long val = strtol(str, &end, 0);
if ((end == str) || (*end != '\0'))
    gff3FileErr(g3f, FALSE, "invalid integer: %s", str);
return (int)val;
}

static float gff3FileStrToFloat(struct gff3File *g3f, char *str)
/* convert a string to a float  */
{
char *end;
double val = strtod(str, &end);
if ((end == str) || (*end != '\0'))
    gff3FileErr(g3f, FALSE, "invalid float: %s", str);
return (float)val;
}

static void *gff3FileAlloc(struct gff3File *g3f, size_t size)
/* allocate memory from the memory pool */
{
return lmAlloc(g3f->pool->lm, size);
}

static char *gff3FileCloneStr(struct gff3File *g3f, char *str)
/* allocate memory for a string and copy if */
{
return lmCloneString(g3f->pool->lm, str);
}

static char *gff3FilePooledStr(struct gff3File *g3f, char *str)
/* allocate memory for a string from the shared string pool */
{
return hashStore(g3f->pool, str)->name;
}

static struct slName *gff3FileSlNameNew(struct gff3File *g3f, char *name)
/* allocate slName from the memory pool */
{
return lmSlName(g3f->pool->lm, name);
}


static char **dynChopStringWhite(struct gff3File *g3f, char *str, int minWords, int maxWords, int *numWordsRet, char *desc)
/* dynamic chop string without corrupting it, generate error if expected
 * number of words not found. Free return when done. Returns NULL on
 * error. */
{
int numWords = chopByWhite(str, NULL, 0);
if ((numWords < minWords) || (numWords > maxWords))
    {
    gff3FileErr(g3f, FALSE, "expected %s, got \"%s\"", desc, str);
    return NULL;
    }
// allocate buffer for both array and string
int wordsBytes = ((numWords+1)*sizeof(char**));
char **words = needMem(wordsBytes+strlen(str)+1);
char *strcp = ((char*)words)+wordsBytes;
strcpy(strcp, str);
chopByWhite(strcp, words, numWords);
words[numWords] = NULL;
if (numWordsRet != NULL)
    *numWordsRet = numWords;
return words;
}

struct gff3Attr *gff3AnnFindAttr(struct gff3Ann *g3a, char *tag)
/* find a user attribute, or NULL */
{
struct gff3Attr *attr;
for (attr = g3a->attrs; attr != NULL; attr = attr->next)
    {
    if (sameString(attr->tag, tag))
        return attr;
    }
return NULL;
}

static struct gff3AnnRef *gff3AnnRefAlloc(struct gff3Ann *g3a)
/* construct an annotation reference, allocated in the memory pool*/
{
struct gff3AnnRef *ref = gff3FileAlloc(g3a->file, sizeof(struct gff3AnnRef));
ref->ann = g3a;
return ref;
}

struct gff3AnnRef *gff3RefFind(struct gff3AnnRef *refList, struct gff3Ann *ann)
/* Return ref if ann is already on list, otherwise NULL. */
{
struct gff3AnnRef *ref;
for (ref = refList; ref != NULL; ref = ref->next)
    if (ref->ann == ann)
        return ref;
return NULL;
}

static void raiseInvalidEscape(struct gff3Ann *g3a, char *str)
/* raise an error about an invalid escape in a string */
{
gff3AnnErr(g3a, TRUE, "invalid GFF escape sequence in string: %s", str);
}

static char convertEscape(struct gff3Ann *g3a, char *esc, char *src)
/* convert character at esc, which should start with a `%' and be a string
 * in the form `%09' */
{
if (!(isxdigit(esc[1]) && isxdigit(esc[2])))
    {
    raiseInvalidEscape(g3a, src);
    return '_';  // something
    }
char num[3], *end;
strncpy(num, esc+1, 2);
num[2] = '\0';
long val = strtol(num, &end, 16);
if ((end == num) || (*end != '\0'))
    {
    raiseInvalidEscape(g3a, src);
    return '_';
    }
return (char)val;
}

static void unescapeStr(struct gff3Ann *g3a, char *dest, char *src)
/* remove URL-style escapes from a string. dest need only have enough
 * memory to hold src, as unescaping will not grow the string */
{
char *s = src, *d = dest;
while (*s != '\0')
    {
    if (*s == '%')
        {
        *d++ = convertEscape(g3a, s, src);
        s += 3;
        }
    else
        *d++ = *s++;
    }
*d = '\0';
}

static struct slName *unescapeSlName(struct gff3Ann *g3a, char *src)
/* unescape the string and put in an slName object, storing it in memory
 * allocated from localmem  */
{
struct slName *dest = gff3FileAlloc(g3a->file, sizeof(struct slName)+strlen(src));
unescapeStr(g3a, dest->name, src);
return dest;
}

static char *unescapeStrTmp(struct gff3Ann *g3a, char *src)
/* unescape the string into a tmp buffer. WARNING: return is a static and
 * maybe invalidated by calling another unescape function */
{
static struct dyString *buf = NULL;  // buffer for tmp copy of unescaped string
if (buf == NULL)
    buf = dyStringNew(256);
dyStringBumpBufSize(buf, strlen(src)+1);
unescapeStr(g3a, buf->string, src);
return buf->string;
}

static char *unescapeStrPooled(struct gff3Ann *g3a, char *src)
/* unescape the string and obtain it from localmem  */
{
return gff3FilePooledStr(g3a->file, unescapeStrTmp(g3a, src));
}

static char *escapeChar(char c)
/* escape a character.  Warning: static return */
{
static char ec[4];
safef(ec, sizeof(ec), "%%%02X", c);
return ec;
}

static boolean isMetaChar(char c)
/* determine if a character is a GFF3 meta character */
{
return ((c == '\t') || (c == '\n') || (c == '\r') || !isprint(c) || (c == ';') || (c == '=') || (c == '&') || (c == ','));
}

static void writeEscaped(char *str, FILE *fh)
/* write a data string to a file, escaping as needed */
{
char *c;
for (c = str; *c != '\0'; c++)
    {
    if (isMetaChar(*c))
        fputs(escapeChar(*c), fh);
    else
        fputc(*c, fh);
    }
}

static char *parseStrand(struct gff3Ann *g3a, char *strand)
/* parse strand into static string, validating it */
{
if (sameString(strand, "."))
    return NULL;
else if (sameString(strand, "+"))
    return "+";
else if (sameString(strand, "-"))
    return "-";
else if (sameString(strand, "?"))
    return "?";
else
    {
    gff3AnnErr(g3a, TRUE, "invalid strand: '%s'", strand);
    return NULL;
    }
return NULL;
}

static int parsePhase(struct gff3Ann *g3a, char *str)
/* parse phase into a number, validating it */
{
if (sameString(str, "."))
    return -1;
int phase = gff3FileStrToInt(g3a->file, str);
if ((phase < 0) || (phase  > 2))
    {
    gff3AnnErr(g3a, TRUE, "invalid phase: %d", phase);
    return -1;
    }
return phase;
}

static void parseFields(struct gff3Ann *g3a, char **words)
/* parse the field in an annotation record */
{
g3a->seqid = unescapeStrPooled(g3a, words[0]);
g3a->source = unescapeStrPooled(g3a, words[1]);
g3a->type = unescapeStrPooled(g3a, words[2]);
g3a->start = gff3FileStrToInt(g3a->file, words[3])-1;
g3a->end = gff3FileStrToInt(g3a->file, words[4]);
if (!sameString(words[5], "."))
    {
    g3a->score = gff3FileStrToFloat(g3a->file, words[5]);
    g3a->haveScore = TRUE;
    }
g3a->strand = parseStrand(g3a, words[6]);
g3a->phase = parsePhase(g3a, words[7]);
if (sameString(g3a->type, "CDS"))
    {
    if (g3a->phase < 0)
	gff3AnnErr(g3a, TRUE, "CDS feature must have phase");
    }
else
    {
#if 0 // spec unclear; bug report filed
    // spec currently doesn't restrict phase, unclear if it's allowed on start/stop codon features
    if (g3a->phase >= 0)
        gff3AnnErr(g3a, TRUE, "phase only allowed on CDS features");
#endif
    }
}

/* check that an attribute tag name is valid. */
static boolean checkAttrTag(struct gff3Ann *g3a, char *tag)
{
// FIXME: spec is not clear on what is a valid tag.
char *tc = tag;
boolean isOk = (isalnum(*tc) || (*tc == '-') || (*tc == '_'));
for (tc++; isOk && (*tc != '\0'); tc++)
    {
    if (!((*tc == '-') || (*tc == '_') || isalnum(*tc)))
        isOk = FALSE;
    }
if (!isOk)
    {
    gff3AnnErr(g3a, TRUE, "invalid attribute tag, must be only alphanumeric, dash or underscore: %s", tag);
    if (g3a->file->flags & GFF3_WARN_WHEN_POSSIBLE)
        return TRUE;
    }
return isOk;
}

static struct slName *parseAttrVals(struct gff3Ann *g3a, char *tag, char *valsStr)
/* parse an attribute into its values */
{
int i, numVals = chopString(valsStr, ",", NULL, 0);
char **vals = needMem((numVals+1)*sizeof(char**)); // +1 allows for no values
chopString(valsStr, ",", vals, numVals);
struct slName *unescVals = NULL;
for (i = 0; i < numVals; i++)
    slAddHead(&unescVals, unescapeSlName(g3a, vals[i]));
if (unescVals == NULL)
    slAddHead(&unescVals, slNameNew(""));  // empty value
freeMem(vals);
slReverse(&unescVals);
return unescVals;
}

static void addAttr(struct gff3Ann *g3a, char *tag, char *valStr)
/* Add an attribute to the list of attributes.  If attribute has already been
 * specified, values are merged.  Attribute name must already be unescaped,
 * attribute values will be split and then unescaped. */
{
struct gff3Attr *attr = gff3AnnFindAttr(g3a, tag);
if (attr == NULL)
    {
    attr = gff3FileAlloc(g3a->file, sizeof(struct gff3Attr));
    attr->tag = gff3FileCloneStr(g3a->file, tag);
    slAddHead(&g3a->attrs, attr);
    }
attr->vals = slCat(attr->vals, parseAttrVals(g3a, tag, valStr));
}

static void parseAttr(struct gff3Ann *g3a, char *attrStr)
/* parse one attribute from an annotation record */
{
char *eq = strchr(attrStr, '=');
if ((eq == NULL) || (eq == attrStr))
    gff3AnnErr(g3a, FALSE, "expected name=value: %s", attrStr);
else
    {
    char *tag = attrStr;
    char *vals = eq+1;
    *eq = '\0';
    unescapeStr(g3a, tag, tag);
    if (checkAttrTag(g3a, tag))
        addAttr(g3a, tag, vals);
    }
}

static void parseAttrs(struct gff3Ann *g3a, char *attrsCol)
/* parse the attribute column in an annotation record */
{
int i, numAttrs = chopString(attrsCol, ";", NULL, 0);
char **attrStrs = needMem(numAttrs*sizeof(char**));
chopString(attrsCol, ";", attrStrs, numAttrs);
for (i = 0; i < numAttrs; i++)
    {
    char *attrStr = trimSpaces(attrStrs[i]);
    if (strlen(attrStr) > 0)
        parseAttr(g3a, attrStr);
    }
freeMem(attrStrs);
slReverse(&g3a->attrs);
}

static void checkSingleValAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
/* validate that an attribute has only one value */
{
if (attr->vals->next != NULL)
    gff3AnnErr(g3a, FALSE, "attribute %s must have a single value, found multiple comma-separated values", attr->tag);
}

static void parseIDAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
/* parse the ID attribute */
{
checkSingleValAttr(g3a, attr);
g3a->id = attr->vals->name;
// link into other parts of feature if discontinuous
struct hashEl *hel = hashStore(g3a->file->byId, g3a->id);
struct gff3Ann *head = hel->val;
if (head != NULL)
    head->prevPart = g3a;
g3a->nextPart = head;
hel->val = g3a;
}

static void parseNameAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
/* parse the Name attribute */
{
checkSingleValAttr(g3a, attr);
g3a->name = attr->vals->name;
}

static void parseAliasAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
/* parse the Alias attribute */
{
g3a->aliases = attr->vals;
}

static void parseParentAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
/* parse the Parent attribute */
{
g3a->parentIds = attr->vals;
}

static void parseTargetAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
/* parse the Target attribute */
{
checkSingleValAttr(g3a, attr);

// target_id start end [strand]
int numWords;
char **words = dynChopStringWhite(g3a->file, attr->vals->name, 3, 4, &numWords,
                                  "Target attribute in the form \"target_id start end [strand]\"");
if (words == NULL)
    return;  // got an error
g3a->targetId = gff3FileCloneStr(g3a->file, words[0]);
g3a->targetStart = gff3FileStrToInt(g3a->file, words[1])-1;
g3a->targetEnd = gff3FileStrToInt(g3a->file, words[2]);
if (numWords > 3)
    g3a->targetStrand = parseStrand(g3a, words[3]);
freeMem(words);
}

static void parseGapAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
/* parse the Gap attribute */
{
checkSingleValAttr(g3a, attr);
g3a->gap = attr->vals->name;
}

static void parseDerivesFromAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
/* parse the Derives_from attribute */
{
checkSingleValAttr(g3a, attr);
g3a->derivesFromId = attr->vals->name;
}

static void parseIsCircular(struct gff3Ann *g3a, struct gff3Attr *attr)
/* parse the Note attribute */
{
g3a->isCircular = TRUE;
}

static void parseNoteAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
/* parse the Note attribute */
{
g3a->notes = attr->vals;
}

static void parseDbxrefAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
/* parse the Dbxref attribute */
{
g3a->dbxrefs = attr->vals;
}

static void parseOntologyTermAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
/* parse the Ontology_term attribute */
{
g3a->ontologyTerms = attr->vals;
}

static void parseStdAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
/* Parse one of the standard specified attributes (those starting with upper
 * case) into fields. Multiple specifications of an attribute should have been
 * merged before calling this function. */
{
if (sameString(attr->tag, gff3AttrID))
    parseIDAttr(g3a, attr);
else if (sameString(attr->tag, gff3AttrName))
    parseNameAttr(g3a, attr);
else if (sameString(attr->tag, gff3AttrAlias))
    parseAliasAttr(g3a, attr);
else if (sameString(attr->tag, gff3AttrParent))
    parseParentAttr(g3a, attr);
else if (sameString(attr->tag, gff3AttrTarget))
    parseTargetAttr(g3a, attr);
else if (sameString(attr->tag, gff3AttrGap))
    parseGapAttr(g3a, attr);
else if (sameString(attr->tag, gff3AttrDerivesFrom))
    parseDerivesFromAttr(g3a, attr);
else if (sameString(attr->tag, gff3AttrIsCircular))
    parseIsCircular(g3a, attr);
else if (sameString(attr->tag, gff3AttrNote))
    parseNoteAttr(g3a, attr);
else if (sameString(attr->tag, gff3AttrDbxref))
    parseDbxrefAttr(g3a, attr);
else if (sameString(attr->tag, gff3AttrOntologyTerm))
    parseOntologyTermAttr(g3a, attr);
else
    gff3AnnErr(g3a, TRUE, "unknown standard attribute, user defined attributes must start with a lower-case letter: %s", attr->tag);
}

static void parseStdAttrs(struct gff3Ann *g3a)
/* parse standard attributes (starting with upper case) into attributes
 * have been parsed into attribute list, which would have  merged multiply
 * specified attributes. */
{
struct gff3Attr *attr;
for (attr = g3a->attrs; attr != NULL; attr = attr->next)
    {
    if (isupper(attr->tag[0]))
        parseStdAttr(g3a, attr);
    }
}

static void parseAnn(struct gff3File *g3f, char *line)
/* parse an annotation line */
{
// extra column to check for too many
char *words[gffNumCols+1];
int numWords = chopString(line, "\t", words, gffNumCols+1);
if (numWords != gffNumCols)
    gff3FileErr(g3f, FALSE, "expected %d tab-separated columns: %s", gffNumCols, line);
else
    {
    struct gff3Ann *g3a = gff3FileAlloc(g3f, sizeof(struct gff3Ann));
    g3a->file = g3f;
    g3a->lineNum = g3f->lf->lineIx;
    parseFields(g3a, words);
    parseAttrs(g3a, words[8]);
    parseStdAttrs(g3a);
    slAddHead(&g3f->anns, gff3AnnRefNew(g3a));
    }
}

static void writeAttr(struct gff3Attr *attr, FILE *fh)
/* write one attribute and it's values */
{
writeEscaped(attr->tag, fh);
fputc('=', fh);
struct slName *val;
for (val = attr->vals; val != NULL; val = val->next)
    {
    if (val != attr->vals)
        fputc(',', fh);
    writeEscaped(val->name, fh);
    }
}

static void writeAttrs(struct gff3Ann *g3a, FILE *fh)
/* write annotation record attributes */
{
struct gff3Attr *attr;
for (attr = g3a->attrs; attr != NULL; attr = attr->next)
    {
    if (attr != g3a->attrs)
        fputc(';', fh);
    writeAttr(attr, fh);
    }
}

static void writeFields(struct gff3Ann *g3a, FILE *fh)
/* write an annotation record fields */
{
writeEscaped(g3a->seqid, fh);
fputc('\t', fh);
writeEscaped(g3a->source, fh);
fputc('\t', fh);
writeEscaped(g3a->type, fh);
fprintf(fh, "\t%d\t%d", g3a->start+1, g3a->end);
fputc('\t', fh);
if (g3a->haveScore)
    fprintf(fh, "%g", g3a->score);
else
    fputc('.', fh);
fprintf(fh, "\t%c", (g3a->strand != NULL) ? g3a->strand[0] : '.');
fputc('\t', fh);
if (g3a->phase < 0)
    fputc('.', fh);
else
    fprintf(fh, "%d", g3a->phase);
}

void gff3AnnWrite(struct gff3Ann *g3a, FILE *fh)
/* Write an annotation record to the specified file.
 * This only writes a single record, it is not recursive.*/
{
writeFields(g3a, fh);
fputc('\t', fh);
writeAttrs(g3a, fh);
fputc('\n', fh);
}

static void addFasta(struct gff3File *g3f, char *dna, int size, char *name)
/* add one fasta record */
{
struct dnaSeq *dnaSeq = gff3FileAlloc(g3f, sizeof(struct dnaSeq));
slAddHead(&g3f->seqs, dnaSeq);
struct hashEl *hel = hashAdd(g3f->seqMap, name, dnaSeq);
dnaSeq->name = hel->name;
dnaSeq->dna = gff3FileCloneStr(g3f, dna);
dnaSeq->size = size;
}

static void parseFasta(struct gff3File *g3f)
/* parse fasta records in the file, consumes remainder of file */
{
char *dna, *name;
int size;
g3f->seqMap = hashNew(0);
while (faMixedSpeedReadNext(g3f->lf, &dna, &size, &name))
    addFasta(g3f, dna, size, name);
}

static void writeFastas(struct gff3File *g3f, FILE *fh)
/* write fasta records fo the file */
{
if (g3f->seqs != NULL)
    {
    fputs("##FASTA\n", fh);
    struct dnaSeq *seq;
    for (seq = g3f->seqs; seq != NULL; seq = seq->next)
        faWriteNext(fh, seq->name, seq->dna, seq->size);
    }
}

static void parseSequenceRegion(struct gff3File *g3f, char *line)
/* parse ##sequence-region seqid start end */
{
char **words = dynChopStringWhite(g3f, line, 4, 4, NULL,
                                  "\"##sequence-region seqid start end\"");
if (words == NULL)
    return;  // got an error
struct gff3SeqRegion *sr = gff3FileAlloc(g3f, sizeof(struct gff3SeqRegion));
sr->seqid = gff3FileCloneStr(g3f, words[1]);
sr->start = gff3FileStrToInt(g3f, words[2])-1;
sr->end = gff3FileStrToInt(g3f, words[3]);
if (g3f->seqRegionMap == NULL)
    g3f->seqRegionMap = hashNew(0);
struct hashEl *hel = hashStore(g3f->seqRegionMap, sr->seqid);
if (hel->val != NULL)
    gff3FileErr(g3f, TRUE, "duplicate ##sequence-region for %s", sr->seqid);
else
    {
    hel->val = sr;
    slAddHead(&g3f->seqRegions, sr);
    }
freeMem(words);
}

static void writeSequenceRegions(struct gff3File *g3f, FILE *fh)
/* parse ##sequence-region metadata */
{
struct gff3SeqRegion *sr;
for (sr = g3f->seqRegions; sr != NULL; sr = sr->next)
    fprintf(fh, "##sequence-region %s %d %d\n", sr->seqid, sr->start, sr->end);
}

static void writeSlNameMetas(char *metaName, struct slName *metas, FILE *fh)
/* write meta records stores as slNames */
{
struct slName  *m;
for (m = metas; m != NULL; m = m->next)
    fprintf(fh, "%s %s\n", metaName, m->name);
}

static void parseFeatureOntology(struct gff3File *g3f, char *line)
/* parse ##feature-ontology URI  */
{
char **words = dynChopStringWhite(g3f, line, 2, 2, NULL,
                                  "\"##feature-ontology URI\"");
if (words == NULL)
    return;  // got an error
slSafeAddHead(&g3f->featureOntologies, gff3FileSlNameNew(g3f, words[1]));
freeMem(words);
}

static void writeFeatureOntologies(struct gff3File *g3f, FILE *fh)
/* parse ##feature-ontology metas */
{
writeSlNameMetas("##feature-ontology", g3f->featureOntologies, fh);
}

static void parseAttributeOntology(struct gff3File *g3f, char *line)
/* parse ##attribute-ontology URI */
{
char **words = dynChopStringWhite(g3f, line, 2, 2, NULL,
                                  "\"##attribute-ontology URI\"");
if (words == NULL)
    return;  // got an error
slSafeAddHead(&g3f->attributeOntologies, gff3FileSlNameNew(g3f, words[1]));
freeMem(words);
}

static void writeAttributeOntologies(struct gff3File *g3f, FILE *fh)
/* write ##attribute-ontology metas */
{
writeSlNameMetas("##attribute-ontology", g3f->attributeOntologies, fh);
}

static void parseSourceOntology(struct gff3File *g3f, char *line)
/* parse ##source-ontology URI */
{
char **words = dynChopStringWhite(g3f, line, 2, 2, NULL,
                                  "\"##source-ontology URI\"");
if (words == NULL)
    return;  // got an error
slSafeAddHead(&g3f->sourceOntologies, gff3FileSlNameNew(g3f, words[1]));
freeMem(words);
}

static void writeSourceOntologies(struct gff3File *g3f, FILE *fh)
/* write ##source-ontology metas */
{
writeSlNameMetas("##source-ontology", g3f->sourceOntologies, fh);
}

static void parseSpecies(struct gff3File *g3f, char *line)
/* parse ##species NCBI_Taxonomy_URI */
{
char **words = dynChopStringWhite(g3f, line, 2, 2, NULL,
                                  "\"##species NCBI_Taxonomy_URI\"");
if (words == NULL)
    return;  // got an error
slSafeAddHead(&g3f->species, gff3FileSlNameNew(g3f, words[1]));
freeMem(words);
}

static void writeSpecies(struct gff3File *g3f, FILE *fh)
/* write ##species NCBI_Taxonomy_URI */
{
writeSlNameMetas("##species", g3f->species, fh);
}

static void parseGenomeBuild(struct gff3File *g3f, char *line)
/* parse ##genome-build source buildName */
{
if (g3f->genomeBuildSource != NULL)
    {
    gff3FileErr(g3f, TRUE, "multiple ##genome-build records");
    return;  // skip
    }
char **words = dynChopStringWhite(g3f, line, 3, 3, NULL,
                                  "\"##genome-build source buildName\"");
if (words == NULL)
    return;  // got an error
g3f->genomeBuildSource = gff3FileCloneStr(g3f, words[1]);
g3f->genomeBuildName = gff3FileCloneStr(g3f, words[2]);
freeMem(words);
}

static void writeGenomeBuild(struct gff3File *g3f, FILE *fh)
/* parse ##genome-build source buildName */
{
if (g3f->genomeBuildSource != NULL)
    fprintf(fh, "##genome-build %s %s\n", g3f->genomeBuildSource, g3f->genomeBuildName);
}

static void parseMeta(struct gff3File *g3f, char *line)
/* parse a meta line of a gff3 file */
{
eraseTrailingSpaces(line);
if (sameString("###", line))
    ; // ignore
else if (sameString("##FASTA", line))
    parseFasta(g3f);
else if (startsWithWord("##sequence-region", line))
    parseSequenceRegion(g3f, line);
else if (startsWithWord("##feature-ontology", line))
    parseFeatureOntology(g3f, line);
else if (startsWithWord("##attribute-ontology", line))
    parseAttributeOntology(g3f, line);
else if (startsWithWord("##source-ontology", line))
    parseSourceOntology(g3f, line);
else if (startsWithWord("##species", line))
    parseSpecies(g3f, line);
else if (startsWithWord("##genome-build", line))
    parseGenomeBuild(g3f, line);
else if (startsWithWord("##gff-spec-version", line) ||
         startsWithWord("##source-version", line) ||
         startsWithWord("##source", line) ||
         startsWithWord("##date", line) ||
         startsWithWord("##Type", line))
    ;  /* FIXME: silently ignore these.  Mark says. */
else
    gff3FileErr(g3f, TRUE, "invalid meta line: %s", line);
}

static void parseLine(struct gff3File *g3f, char *line)
/* parse one line of a gff3 file */
{
if (startsWith("##", line))
    parseMeta(g3f, line);
else if (!startsWith("#", line) && (strlen(line) > 0))
    parseAnn(g3f, line);
}

static void parseHeader(struct gff3File *g3f)
/* parse and validate a GFF3 header */
{
char *line;
if (!lineFileNext(g3f->lf, &line, NULL))
    gff3FileErr(g3f, FALSE, "empty GFF file, must have header");
char *ver = skipToSpaces(line);
if ((ver != NULL) && (*ver != '\0'))
    {
    *ver++ = '\0';
    ver = trimSpaces(ver);
    }
if (!(sameString(line, "##gff-version") && ( sameString(ver, "3") || (startsWith("3.", ver)))))
    {
    gff3FileErr(g3f, FALSE, "invalid GFF3 header");
    errAbort("may not be a GFF3 file");
    }
}

static void parseFile(struct gff3File *g3f)
/* do parsing phase of reading a GFF3 file */
{
g3f->lf = lineFileOpen(g3f->fileName, TRUE);
parseHeader(g3f);
char *line;
while (lineFileNext(g3f->lf, &line, NULL))
    {
    parseLine(g3f, line);
    if (gff3FileStopDueToErrors(g3f))
        break;
    }
lineFileClose(&g3f->lf);
slReverse(&g3f->anns);
}

static int gff3AnnCount(struct gff3Ann *g3a)
/* count the number of gff3Ann objects linked together in a feature */
{
int cnt = 0;
for (; g3a != NULL; g3a = g3a->nextPart)
    cnt++;
return cnt;
}

static void discontinFeatureCheck(struct gff3Ann *g3a)
/* sanity check linked gff3Ann discontinuous features */
{
// Add non-spec restriction on only one parent id to make parent id checks easier
if (slCount(g3a->parentIds) > 1)
    gff3AnnErr(g3a, FALSE, "Annotation records for discontinuous features with ID=\"%s\" must have one and only one parent", g3a->id);
for (struct gff3Ann *g3a2 = g3a->nextPart; (g3a2 != NULL) && !gff3FileStopDueToErrors(g3a->file); g3a2 = g3a2->nextPart)
    {
    if (slCount(g3a2->parentIds) > 1)
        gff3AnnErr(g3a2, FALSE, "Annotation records for discontinuous features with ID=\"%s\" must have one and only one parent", g3a2->id);
    else if (!sameString(g3a->type, g3a2->type))
        gff3AnnErr(g3a2, FALSE, "Annotation records for discontinuous features with ID=\"%s\" do not have the same type, found \"%s\" and \"%s\"", g3a->id, g3a->type, g3a2->type);
    else if ((slCount(g3a2->parentIds) > 0) &&
             !sameString(g3a2->parentIds->name, g3a->parentIds->name))
        gff3AnnErr(g3a2, FALSE, "Annotation records for discontinuous features with ID=\"%s\" must have same parent, found: \"%s\" and \"%s\"", g3a->id,
                   g3a2->parentIds->name, g3a->parentIds->name);
    }
}

static void discontinFeatureFillArray(struct gff3Ann *g3a, int numAnns, struct gff3Ann *featAnns[])
/* convert list to array for sorting */
{
int i = 0;
for (; g3a != NULL; g3a = g3a->nextPart)
    featAnns[i++] = g3a;
}

static struct gff3Ann *discontinFeatureArrayLink(int numAnns, struct gff3Ann *featAnns[])
/* convert sorted array to a list */
{
struct gff3Ann *g3aHead = NULL, *g3aPrev = NULL;
int i;
for (i = 0; i < numAnns; i++)
    {
    if (g3aHead == NULL)
        g3aHead = featAnns[i];
    if (g3aPrev != NULL)
        g3aPrev->nextPart = featAnns[i];
    featAnns[i]->prevPart = g3aPrev;
    }
return g3aHead;
}

static int discontigFeatureSortCmp(const void *p1, const void *p2)
/* compare function for discontigFeatureSort */
{
struct gff3Ann *g3a1 = *((struct gff3Ann **)p1);
struct gff3Ann *g3a2 = *((struct gff3Ann **)p2);
int diff = g3a1->start - g3a2->start;
if (diff == 0)
    diff = g3a1->end - g3a2->end;
return diff;
}

static struct gff3Ann *discontigFeatureSort(struct gff3Ann *g3a)
/* sort a list of gff3Ann object representing discontinuous */
{
int numAnns = gff3AnnCount(g3a);
struct gff3Ann *featAnns[numAnns];
discontinFeatureFillArray(g3a, numAnns, featAnns);
qsort(featAnns, numAnns, sizeof(struct gff3Ann*), discontigFeatureSortCmp);
return discontinFeatureArrayLink(numAnns, featAnns);
}

static void discontigFeatureFinish(struct gff3File *g3f)
/* finish up discontinuous features, sorting them into ascending order */
{
// only both sorting if more than one annotation
struct hashCookie cookie = hashFirst(g3f->byId);
struct hashEl *hel;
while (((hel = hashNext(&cookie)) != NULL) && !gff3FileStopDueToErrors(g3f))
    {
    struct gff3Ann *g3a = hel->val;
    if (g3a->nextPart != NULL)
        {
        discontinFeatureCheck(g3a);
        hel->val = discontigFeatureSort(g3a);
        }
    }
}

static struct gff3Ann *resolveRef(struct gff3Ann *g3a, char *id, char *attr)
/* resolve a link for an attribute */
{
struct gff3Ann *ann = gff3FileFindAnn(g3a->file, id);
if (ann == NULL)
    gff3AnnErr(g3a, TRUE, "Can't find annotation record \"%s\" referenced by \"%s\" %s attribute", id, g3a->id, attr);
return ann;
}

static struct gff3AnnRef *resolveRefs(struct gff3Ann *g3a, struct slName *ids, char *attr)
/* resolve links for an attribute */
{
struct gff3AnnRef *refs = NULL;
struct slName *id;
for (id = ids; id != NULL; id = id->next)
    {
    struct gff3Ann *ann = resolveRef(g3a, id->name, attr);
    if (ann != NULL)
        slSafeAddHead(&refs, gff3AnnRefAlloc(ann));
    }
return refs;
}

static void resolveAnn(struct gff3Ann *g3a)
/* resolve links for an gff3Ann */
{
g3a->parents = resolveRefs(g3a, g3a->parentIds, gff3AttrParent);
if (g3a->parents == NULL)
    {
    // is droppped if there here ignored errors resolving reference
    if (g3a->parentIds == NULL)
        slSafeAddHead(&g3a->file->roots, gff3AnnRefAlloc(g3a));
    }
else
    {
    struct gff3AnnRef *par;
    for (par = g3a->parents; par != NULL; par = par->next)
        slSafeAddHead(&par->ann->children, gff3AnnRefAlloc(g3a));
    }
if (g3a->derivesFromId != NULL)
    g3a->derivesFrom = resolveRef(g3a, g3a->derivesFromId, gff3AttrDerivesFrom);
}

static void resolveAnns(struct gff3File *g3f)
/* resolve links */
{
struct gff3AnnRef *g3aRef;
for (g3aRef = g3f->anns; (g3aRef != NULL) && !gff3FileStopDueToErrors(g3f); g3aRef = g3aRef->next)
    resolveAnn(g3aRef->ann);
}

static void resolveFile(struct gff3File *g3f)
/* do resolution phase of reading a GFF3 file */
{
// must sort first, as links point to the first feature
discontigFeatureFinish(g3f);
resolveAnns(g3f);
// reorder just for test reproducibility
slReverse(&g3f->seqRegions);
slReverse(&g3f->featureOntologies);
slReverse(&g3f->attributeOntologies);
slReverse(&g3f->sourceOntologies);
slReverse(&g3f->species);
slReverse(&g3f->seqs);
}

static struct gff3File *gff3FileNew()
/* construct a new, empty gff3File object */
{
struct gff3File *g3f;
AllocVar(g3f);
g3f->byId = hashNew(0);
g3f->pool = hashNew(0);
return g3f;
}

struct gff3File *gff3FileOpen(char *fileName, int maxErr, unsigned flags, FILE *errFh)
/* Parse a GFF3 file into a gff3File object.  If maxErr not zero, then
 * continue to parse until this number of error have been reached.  A maxErr
 * less than zero does not stop reports all errors. Write errors to errFh,
 * if NULL, use stderr. */
{
struct gff3File *g3f = gff3FileNew();
g3f->fileName = gff3FileCloneStr(g3f, fileName);
g3f->flags = flags;
g3f->errFh = (errFh != NULL) ? errFh : stderr;
g3f->maxErr = (maxErr < 0) ? INT_MAX : maxErr;
parseFile(g3f);
if (!gff3FileStopDueToErrors(g3f))
    resolveFile(g3f);
if (g3f->errCnt > 0)
    errAbort("GFF3: %d parser errors", g3f->errCnt);
return g3f;
}

void gff3FileFree(struct gff3File **g3fPtr)
/* Free a gff3File object */
{
struct gff3File *g3f = *g3fPtr;
if (g3f != NULL)
    {
    hashFree(&g3f->byId);
    hashFree(&g3f->pool);
    hashFree(&g3f->seqRegionMap);
    freeMem(g3f);
    *g3fPtr = NULL;
    }
}

struct gff3Ann *gff3FileFindAnn(struct gff3File *g3f, char *id)
/* find an annotation record by id, or NULL if not found. */
{
return hashFindVal(g3f->byId, id);
}

static void writeMeta(struct gff3File *g3f, FILE *fh)
/* write meta data */
{
fputs("##gff-version 3\n", fh);
writeSequenceRegions(g3f, fh);
writeFeatureOntologies(g3f, fh);
writeAttributeOntologies(g3f, fh);
writeSourceOntologies(g3f, fh);
writeSpecies(g3f, fh);
writeGenomeBuild(g3f, fh);
}

void gff3FileWrite(struct gff3File *g3f, char *fileName)
/* write contents of an GFF3File object to a file */
{
FILE *fh = mustOpen(fileName, "w");
writeMeta(g3f, fh);
struct gff3AnnRef *g3aRef;
for (g3aRef = g3f->anns; g3aRef != NULL; g3aRef = g3aRef->next)
    gff3AnnWrite(g3aRef->ann, fh);
writeFastas(g3f, fh);
carefulClose(&fh);
}

int gff3AnnRefLocCmp(const void *va, const void *vb)
/* sort compare function for two gff3AnnRef objects */
{
const struct gff3Ann *a = (*((struct gff3AnnRef **)va))->ann;
const struct gff3Ann *b = (*((struct gff3AnnRef **)vb))->ann;
int diff = strcmp(a->seqid, b->seqid);
if ((diff == 0) && (a->strand != b->strand))
    {
    // allow for various types of strand fields. above tests handles both null
    if (a->strand == NULL)
        diff = 1;
    else if (b->strand == NULL)
        diff = -1;
    else
        diff = strcmp(a->strand, b->strand);
    }
if (diff == 0)
    diff = a->start - b->start;
if (diff == 0)
    diff = a->end - b->end;
return diff;
}

void gff3UnlinkChild(struct gff3Ann *g3a,
                     struct gff3Ann *child)
/* unlink the child from it's parent (do not free) */
{
struct gff3AnnRef *childRef = gff3RefFind(g3a->children, child);
if (childRef == NULL)
    errAbort("gff3UnlinkChild: not a child of specified parent");
slRemoveEl(&g3a->children, childRef);

// assume correct linked now that relationship is verfied
struct gff3AnnRef *parentRef = gff3RefFind(child->parents, g3a);
slRemoveEl(&child->parents, parentRef);

struct slName *parentId = slNameFind(child->parentIds, g3a->id);
slRemoveEl(&child->parentIds, parentId);
// don't free, links are in localmem
}

void gff3LinkChild(struct gff3Ann *g3a,
                   struct gff3Ann *child)
/* Add a child to new parent */
{
struct gff3AnnRef *childRef = gff3AnnRefAlloc(child);
slSafeAddHead(&g3a->children, childRef);
slSort(&g3a->children, gff3AnnRefLocCmp);
slAddHead(&child->parentIds, gff3FileSlNameNew(g3a->file, g3a->id));
slAddHead(&child->parents, gff3AnnRefAlloc(g3a));
}

