/* genePred.c was originally generated by the autoSql program, which also 
 * generated genePred.h and genePred.sql.  This module links the database and the RAM 
 * representation of objects. */

/* Copyright (C) 2014 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "common.h"
#include "gff.h"
#include "jksql.h"
#include "psl.h"
#include "linefile.h"
#include "genePred.h"
#include "genbank.h"
#include "rangeTree.h"
#include "hdb.h"
#include "chromInfo.h"


struct genePred *genePredLoad(char **row)
/* Load a genePred from row fetched with select * from genePred
 * from database.  Dispose of this with genePredFree(). 
 * NOTE: cannabalizes the row argument */
{
struct genePred *ret;
int sizeOne;

AllocVar(ret);
ret->exonCount = sqlUnsigned(row[7]);
ret->name = cloneString(row[0]);
ret->chrom = cloneString(row[1]);
strcpy(ret->strand, row[2]);
ret->txStart = sqlUnsigned(row[3]);
ret->txEnd = sqlUnsigned(row[4]);
ret->cdsStart = sqlUnsigned(row[5]);
ret->cdsEnd = sqlUnsigned(row[6]);
sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne);
if (sizeOne != ret->exonCount)
    errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)",
             ret->name, sizeOne, ret->exonCount);
sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne);
if (sizeOne != ret->exonCount)
    errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)",
             ret->name, sizeOne, ret->exonCount);
return ret;
}

struct genePred *genePredLoadAll(char *fileName) 
/* Load all genePred from a whitespace-separated file.
 * Dispose of this with genePredFreeList(). */
{
struct genePred *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[10];

while (lineFileRow(lf, row))
    {
    el = genePredLoad(row);
    slAddHead(&list, el);
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}

struct genePred *genePredLoadAllByChar(char *fileName, char chopper) 
/* Load all genePred from a chopper separated file.
 * Dispose of this with genePredFreeList(). */
{
struct genePred *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[10];

while (lineFileNextCharRow(lf, chopper, row, ArraySize(row)))
    {
    el = genePredLoad(row);
    slAddHead(&list, el);
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}

struct genePred *genePredCommaIn(char **pS, struct genePred *ret)
/* Create a genePred out of a comma separated string. 
 * This will fill in ret if non-null, otherwise will
 * return a new genePred */
{
char *s = *pS;
int i;

if (ret == NULL)
    AllocVar(ret);
ret->name = sqlStringComma(&s);
ret->chrom = sqlStringComma(&s);
sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand));
ret->txStart = sqlUnsignedComma(&s);
ret->txEnd = sqlUnsignedComma(&s);
ret->cdsStart = sqlUnsignedComma(&s);
ret->cdsEnd = sqlUnsignedComma(&s);
ret->exonCount = sqlUnsignedComma(&s);
s = sqlEatChar(s, '{');
AllocArray(ret->exonStarts, ret->exonCount);
for (i=0; i<ret->exonCount; ++i)
    {
    ret->exonStarts[i] = sqlUnsignedComma(&s);
    }
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
s = sqlEatChar(s, '{');
AllocArray(ret->exonEnds, ret->exonCount);
for (i=0; i<ret->exonCount; ++i)
    {
    ret->exonEnds[i] = sqlUnsignedComma(&s);
    }
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
*pS = s;
return ret;
}

void genePredFree(struct genePred **pEl)
/* Free a single dynamically allocated genePred such as created
 * with genePredLoad(). */
{
struct genePred *el;

if ((el = *pEl) == NULL) return;
freeMem(el->name);
freeMem(el->chrom);
freeMem(el->exonStarts);
freeMem(el->exonEnds);
freeMem(el->name2);
freeMem(el->exonFrames);
freez(pEl);
}

void genePredFreeList(struct genePred **pList)
/* Free a list of dynamically allocated genePred's */
{
struct genePred *el, *next;

for (el = *pList; el != NULL; el = next)
    {
    next = el->next;
    genePredFree(&el);
    }
*pList = NULL;
}

void genePredOutput(struct genePred *el, FILE *f, char sep, char lastSep) 
/* Print out genePred.  Separate fields with sep. Follow last field with lastSep. */
{
int i;
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->name);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->chrom);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->strand);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%u", el->txStart);
fputc(sep,f);
fprintf(f, "%u", el->txEnd);
fputc(sep,f);
fprintf(f, "%u", el->cdsStart);
fputc(sep,f);
fprintf(f, "%u", el->cdsEnd);
fputc(sep,f);
fprintf(f, "%u", el->exonCount);
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->exonCount; ++i)
    {
    fprintf(f, "%u", el->exonStarts[i]);
    fputc(',', f);
    }
if (sep == ',') fputc('}',f);
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->exonCount; ++i)
    {
    fprintf(f, "%u", el->exonEnds[i]);
    fputc(',', f);
    }
if (sep == ',') fputc('}',f);

/* optional fields, >= test is used so unspecified coumns can be filled in */
if (el->optFields >= genePredScoreFld)
    {
    fputc(sep,f);
    fprintf(f, "%d", el->score);
    }
if (el->optFields >= genePredName2Fld)
    {
    fputc(sep,f);
    if (sep == ',') fputc('"',f);
    fprintf(f, "%s", ((el->name2 != NULL) ? el->name2 : ""));
    if (sep == ',') fputc('"',f);
    }
if (el->optFields >= genePredCdsStatFld)
    {
    fputc(sep,f);
    if (sep == ',') fputc('"',f);
    fprintf(f, "%s", genePredCdsStatStr(el->cdsStartStat));
    if (sep == ',') fputc('"',f);
    fputc(sep,f);
    if (sep == ',') fputc('"',f);
    fprintf(f, "%s", genePredCdsStatStr(el->cdsEndStat));
    if (sep == ',') fputc('"',f);
    }
if (el->optFields >= genePredExonFramesFld)
    {
    fputc(sep,f);
    if (sep == ',') fputc('{',f);
    for (i=0; i<el->exonCount; ++i)
        {
        fprintf(f, "%d", el->exonFrames[i]);
        fputc(',', f);
        }
    if (sep == ',') fputc('}',f);
    }
fputc(lastSep,f);
}

/* ---------  Start of hand generated code. ---------------------------- */

char *genePredCdsStatStr(enum cdsStatus stat)
/* get string value of a cdsStatus */
{
switch (stat) {
case cdsNone:
    return "none";
case cdsUnknown:
    return "unk";
case cdsIncomplete:
    return "incmpl";
case cdsComplete:
    return "cmpl";
default:
    errAbort("invalid cdsStatus enum value: %d", stat);
    return NULL;  /* make compiler happy */
}
}

enum cdsStatus parseCdsStat(char *statStr)
/* parse a cdsStatus string */
{
if ((statStr == NULL) || sameString(statStr, "none"))
    return cdsNone;
if (sameString(statStr, "unk"))
    return cdsUnknown;
if (sameString(statStr, "incmpl"))
    return cdsIncomplete;
if (sameString(statStr, "cmpl"))
    return cdsComplete;

errAbort("invalid genePred cdsStatus: \"%s\"", statStr);
return cdsNone;  /* make compiler happy */
}

struct genePred *genePredKnownLoad(char **row, int numCols)
/* Load a genePred in knownGene format from row. */
{
struct genePred *ret;
int sizeOne;

AllocVar(ret);
ret->exonCount = sqlUnsigned(row[7]);
ret->name = cloneString(row[0]);
ret->chrom = cloneString(row[1]);
strcpy(ret->strand, row[2]);
ret->txStart = sqlUnsigned(row[3]);
ret->txEnd = sqlUnsigned(row[4]);
ret->cdsStart = sqlUnsigned(row[5]);
ret->cdsEnd = sqlUnsigned(row[6]);
sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne);
if (sizeOne != ret->exonCount)
    errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)",
             ret->name, sizeOne, ret->exonCount);
sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne);
if (sizeOne != ret->exonCount)
    errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)",
             ret->name, sizeOne, ret->exonCount);

ret->name2 = cloneString(row[11]);
return ret;
}
struct genePred *genePredExtLoad(char **row, int numCols)
/* Load a genePred with from a row, with optional fields.  The row must
 * contain columns in the order in the struct, and they must be present up to
 * the last specfied optional field.  Missing intermediate fields must have
 * zero or empty columns, they may not be omitted.  Fields at the end can be
 * omitted. Dispose of this with genePredFree(). */
{
struct genePred *ret;
int sizeOne, iCol;

AllocVar(ret);
ret->exonCount = sqlUnsigned(row[7]);
ret->name = cloneString(row[0]);
ret->chrom = cloneString(row[1]);
strcpy(ret->strand, row[2]);
ret->txStart = sqlUnsigned(row[3]);
ret->txEnd = sqlUnsigned(row[4]);
ret->cdsStart = sqlUnsigned(row[5]);
ret->cdsEnd = sqlUnsigned(row[6]);
sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne);
if (sizeOne != ret->exonCount)
    errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)",
             ret->name, sizeOne, ret->exonCount);
sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne);
if (sizeOne != ret->exonCount)
    errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)",
             ret->name, sizeOne, ret->exonCount);

iCol=GENEPRED_NUM_COLS;
if (iCol < numCols)
    {
    ret->score = sqlSigned(row[iCol++]);
    ret->optFields |= genePredScoreFld;
    }
if (iCol < numCols)
    {
    ret->name2 = cloneString(row[iCol++]);
    ret->optFields |= genePredName2Fld;
    }
if (iCol < numCols)
    {
    if (isEmpty(row[iCol])) // if the cdsStartStat field is empty
	return ret;         // ignore the rest of the fields
    ret->cdsStartStat = parseCdsStat(row[iCol++]);
    ret->optFields |= genePredCdsStatFld;
    }
if (iCol < numCols)
    {
    ret->cdsEndStat = parseCdsStat(row[iCol++]);
    ret->optFields |= genePredCdsStatFld;
    }
if (iCol < numCols)
    {
    sqlSignedDynamicArray(row[iCol++], &ret->exonFrames, &sizeOne);
    if (sizeOne != ret->exonCount)
        errAbort("genePred: %s number of exonFrames (%d) != number of exons (%d)",
                 ret->name, sizeOne, ret->exonCount);
    ret->optFields |= genePredExonFramesFld;
    }
return ret;
}

struct genePred *genePredKnownLoadAll(char *fileName)
/* Load all genePreds with from tab-separated file in knownGene format */
{
struct genePred *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[GENEPREDX_NUM_COLS];
int numCols;

while ((numCols = lineFileChopNextTab(lf, row, ArraySize(row))) > 0)
    {
    lineFileExpectAtLeast(lf, GENEPRED_NUM_COLS, numCols);
    el = genePredKnownLoad(row, numCols);
    slAddHead(&list, el);
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}

struct genePred *genePredExtLoadAll(char *fileName)
/* Load all genePreds with from tab-separated file, possibly with optional
 * fields. Dispose of this with genePredFreeList(). */
{
struct genePred *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[GENEPREDX_NUM_COLS];
int numCols;

while ((numCols = lineFileChopNextTab(lf, row, ArraySize(row))) > 0)
    {
    lineFileExpectAtLeast(lf, GENEPRED_NUM_COLS, numCols);
    el = genePredExtLoad(row, numCols);
    slAddHead(&list, el);
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}

int genePredCmp(const void *va, const void *vb)
/* Compare to sort based on chromosome, txStart. */
{
const struct genePred *a = *((struct genePred **)va);
const struct genePred *b = *((struct genePred **)vb);
int dif = strcmp(a->chrom, b->chrom);
if (dif == 0)
    dif = a->txStart - b->txStart;
return dif;
}

int genePredNameCmp(const void *va, const void *vb)
/* Compare to sort based on name, then chromosome, txStart. */
{
const struct genePred *a = *((struct genePred **)va);
const struct genePred *b = *((struct genePred **)vb);
int dif = strcmp(a->name, b->name);
if (dif == 0)
    dif = strcmp(a->chrom, b->chrom);
if (dif == 0)
    dif = a->txStart - b->txStart;
return dif;
}

static boolean haveStartStopCodons(struct gffFile *gff)
/* For GFFs, determine if any of the annotations use start_codon or                                                                                                                 
 * stop_codon */
{
struct gffFeature *feature;
for(feature = gff->featureList; feature != NULL; feature = feature->next)
    {
    if(sameWord(feature->name, "start_codon") ||
       sameWord(feature->name, "start_codon"))
        return TRUE;
    }
return FALSE;
}

static boolean isExon(char *feat, boolean isGtf, char *exonSelectWord)
/* determine if a feature is an exon; different criteria for GFF ane GTF */
{
if (isGtf)
    {
    /* must check for stop_codon here, because GTF put it outside of the
     * CDS */
    return (sameWord(feat, "CDS") || sameWord(feat, "stop_codon")
            || sameWord(feat, "start_codon") || sameWord(feat, "exon")
            || sameWord(feat, "utr") || sameWord(feat, "5utr")
            || sameWord(feat, "3utr"));
    }
else
    {
    return ((exonSelectWord == NULL) || sameWord(feat, exonSelectWord));
    }
}

static boolean isStartStopCodon(char *feat)
/* determine if a feature is GTF style start/stop codon */
{
return sameWord(feat, "stop_codon") || sameWord(feat, "start_codon");
}

static boolean isCds(char *feat)
/* determine if a feature is CDS */
{
return sameWord(feat, "CDS") || isStartStopCodon(feat);
}

static void chkGroupLine(struct gffGroup *group, struct gffLine *gl, struct genePred *gp)
/* check that a gffLine is consistent with the genePred being built.  this
 * helps detect some problems that lead to corrupt genePreds */
{
if (!(sameString(gl->seq, gp->chrom) && (gl->strand == gp->strand[0])))
    {
    fprintf(stderr, "invalid gffGroup detected on line: ");
    gffTabOut(gl, stderr);
    errAbort("GFF/GTF group %s on %s%c, this line is on %s%c, all group members must be on same seq and strand",
             group->name, gp->chrom, gp->strand[0],
                     gl->seq, gl->strand);
    }
}

static int phaseToFrame(char phase)
/* convert GTF/GFF phase to frame */
{
switch (phase)
    {
    case '0':
        return 0;
    case '1':
        return 2;
    case '2':
        return 1;
    }
return -1;
}

#ifdef NOT_CURRENTLY_USED
static char frameToPhase(int frame)
/* convert GTF/GFF frame to phase */
{
switch (frame)
    {
    case 0:
        return '0';
    case 1:
        return '2';
    case 2:
        return '1';
    }
return -1;
}
#endif

static int incrFrame(int frame, int amt)
/* increment frame, no-op if frame is -1 */
{
if (frame < 0)
    return frame;
else if (amt >= 0)
    return (frame+amt) % 3;
else
    {
    int amt3 = (-amt) % 3;
    return (frame - (amt - amt3)) % 3;
    }
}

static int findPosExon(struct genePred *gp, int pos)
/* find exon containing the specified position */
{
int i;
for (i = 0; i < gp->exonCount; i++)
    {
    if ((gp->exonStarts[i] <= pos) && (pos < gp->exonEnds[i]))
        return i;
    }
errAbort("bug: position %d not found in exon of %s genePred", pos, gp->name);
return 0;
}

static boolean adjImpliedStopPos(struct genePred *gp)
/* implied stop codon adjustment for positive strand gene */
{
int iExon = findPosExon(gp, gp->cdsEnd-1);
unsigned space = (gp->exonEnds[iExon]-gp->cdsEnd);
if (space >= 3)
    {
    // room in current exon
    gp->cdsEnd += 3;
    return TRUE;
    }
else if ((iExon < gp->exonCount)
         && ((gp->exonEnds[iExon+1]-gp->exonStarts[iExon+1])+space) >= 3)
    {
    // room including next exon
    gp->cdsEnd = gp->exonStarts[iExon+1] + (3-space);
    return TRUE;
    }
return FALSE;
}

static boolean adjImpliedStopNeg(struct genePred *gp)
/* implied stop codon adjustment for negative strand gene */
{
int iExon = findPosExon(gp, gp->cdsStart);
unsigned space = (gp->cdsStart-gp->exonStarts[iExon]);
if (space >= 3)
    {
    // room in current exon
    gp->cdsStart -= 3;
    return TRUE;
    }
else if ((iExon > 0)
         && ((gp->exonEnds[iExon-1]-gp->exonStarts[iExon-1])+space) >= 3)
    {
    // room including previous exon
    gp->cdsStart = gp->exonEnds[iExon-1] - (3-space);
    return TRUE;
    }
return FALSE;
}

static void adjImpliedStopAfterCds(struct genePred *gp)
/* adjust CDS bounds to include implied stop codon outside of CDS. */
{
if (gp->strand[0] == '+')
    {
    if (adjImpliedStopPos(gp))
        {
        if (gp->optFields & genePredCdsStatFld)
            gp->cdsEndStat = cdsComplete;
        }
    }
else
    {
    if (adjImpliedStopNeg(gp))
        {
        if (gp->optFields & genePredCdsStatFld)
            gp->cdsStartStat = cdsComplete;
        }
    }
}

static boolean isGtfFeature(char *feat)
/* check if a feature is one of the recognized features.  All otherse
 * should be ignored. http://mblab.wustl.edu/GTF22.html */
{
static char *gtfFeats[] = {"CDS", "start_codon", "stop_codon", "5UTR", "3UTR",
                           "inter", "inter_CNS", "intron_CNS", "exon", NULL};
int i;
for (i = 0; gtfFeats[i] != NULL; i++)
    {
    if (sameString(feat, gtfFeats[i]))
        return TRUE;
    }
return FALSE;
}

static boolean ignoreGxfLine(struct gffLine *gl, boolean isGtf)
/* should the specified line be skipped? */
{
if (isGtf)
    return !isGtfFeature(gl->feature);
else
    return FALSE;
}

static boolean allowedFrameFeature(struct gffLine *gl, boolean isGtf)
/* should this feature be used to assign frame? */
{
if (isStartStopCodon(gl->feature) || ignoreGxfLine(gl, isGtf))
    return FALSE;

// for GFF, any features with frame can be used to set frame,
// with GTF, there are some bogus files that set frame on UTR features,
// so only use CDS
if (isGtf)
    return sameString(gl->feature, "CDS");
else
    return (gl->frame != '.');
}

static struct gffLine *assignFrameForCdsExon(int start, int end, int *framePtr,
                                             struct gffLine *gl, boolean isGtf)
/* set frame from any overlapping records with frame. Don't include
 * start/stop_codon, as it isn't a full exon. */
{
/* skip lines preceeding this exon */
while (gl != NULL)
    {
    if (allowedFrameFeature(gl, isGtf) && (gl->end >= start))
        break;
    gl = gl->next;
}
/* set frame from any overlapping records with frame. */
while ((gl != NULL) && (rangeIntersection(gl->start, gl->end, start, end) > 0))
    {
    if (allowedFrameFeature(gl, isGtf))
        {
        int frame = phaseToFrame(gl->frame);
        if (frame >= 0)
            *framePtr = frame;
        }
    gl = gl->next;
    }
return gl;
}

static void assignFrame(boolean isGtf, struct gffGroup *group, struct genePred *gp, unsigned options)
/* Assign frame from GFF/GTF after genePred has been built.*/
{

struct gffLine *gl = group->lineList;
int start, end, i;
int numCdsExons = 0;
int numFrameExons = 0;
for (i = 0; i < gp->exonCount; i++)
    {
    if (genePredCdsExon(gp, i, &start, &end))
        {
        numCdsExons += 1;
        gl = assignFrameForCdsExon(start, end, &(gp->exonFrames[i]), gl, isGtf);
        if (gp->exonFrames[i] >= 0)
            numFrameExons += 1;
        }
    }

/* Assign frames for exons without frames, either due to it incorrectly not
 * being on cases.  This also Handle the nasty corner caseof GTF with the all
 * or part of the stop codon as the only codon in an exon, we must set the
 * frame on these exons.  Some ENSEMBL GTFs have single base `exons' (due to
 * low quality assemblies) with just a base of the stop codon, so must check
 * every base.  While less than ideal, we just extend the frame past the last
 * exon with frame, so a conflicting frame on the stop codon will be missed.
 * Just switching to GFF3 fixes this mess.  Return the count of exons that
 * actually had frame set. */
genePredFixExonFrames(gp);
}

static struct genePred *mkFromGroupedGxf(struct gffFile *gff, struct gffGroup *group, char *name,
                                         boolean isGtf, char *exonSelectWord, unsigned optFields,
                                         unsigned options)
/* common function to create genePreds from GFFs or GTFs.  This is a little
 * ugly with to many check of isGtf, however the was way to much identical
 * code the other way. Options are from genePredFromGxfOpts */
{
struct genePred *gp;
long stopCodonStart = -1, stopCodonEnd = -1;
long cdsStart = -1, cdsEnd = -1;
int exonCount = 0;
boolean haveStartCodon = FALSE, haveStopCodon = FALSE;
struct gffLine *gl;
unsigned *eStarts, *eEnds;
int i;

/* should we count on start/stop codon annotation in GFFs? */
boolean useStartStops = isGtf || haveStartStopCodons(gff);

int geneStart = 0, geneEnd = 0;

/* Count up exons and figure out cdsStart and cdsEnd. */
for (gl = group->lineList; gl != NULL; gl = gl->next)
    {
    boolean exonishLine = FALSE;
    if (ignoreGxfLine(gl, isGtf))
        continue;
    if (isExon(gl->feature, isGtf, exonSelectWord))
	{
	exonishLine = TRUE;
	++exonCount;
	}
    if (isCds(gl->feature))
        {
	exonishLine = TRUE;
	if ((cdsStart < 0) || (gl->start < cdsStart))
            cdsStart = gl->start;
	if ((cdsEnd < 0) || (gl->end > cdsEnd))
            cdsEnd = gl->end;
	}
    if (exonishLine)
        {
	if (geneStart == geneEnd)  // Not initialized yet
	     {
	     geneStart = gl->start;
	     geneEnd = gl->end;
	     }
	else
	     {
	     geneStart = min(gl->start, geneStart);
	     geneEnd = max(gl->end, geneEnd);
	     }
	}
    if (sameWord(gl->feature, "start_codon"))
        haveStartCodon = TRUE;
    if (sameWord(gl->feature, "stop_codon"))
        {
        /* stop_codon can be split, need bounds for adjusting CDS below */
        if ((stopCodonStart < 0) || (gl->start < stopCodonStart))
            stopCodonStart = gl->start;
        if ((stopCodonEnd < 0) || (gl->end > stopCodonEnd))
            stopCodonEnd = gl->end;
        haveStopCodon = TRUE;
        }
    }
if (exonCount == 0)
    return NULL;
if (cdsStart > cdsEnd)
    {
    /* no cds annotated */
    cdsStart = 0;
    cdsEnd = 0;
    }
else if (stopCodonStart >= 0)
    {
    /* adjust CDS to include stop codon as in GTF */
    if (group->strand == '+')
        {
        if (stopCodonEnd > cdsEnd)
            cdsEnd = stopCodonEnd;
        }
    else
        {
        if (stopCodonStart < cdsStart)
            cdsStart = stopCodonStart;
        }
    }

/* add in version numbers if requested and available */
char geneIdToUse[1024], transcriptIdToUse[1024];
geneIdToUse[0]= '\0';
if (options & genePredGxfGeneNameAsName2)
    {
    if (group->lineList->geneName != NULL)
        safecpy(geneIdToUse, sizeof(geneIdToUse), group->lineList->geneName);
    }
else if (group->lineList->geneId != NULL)
    {
    if ((options & genePredGxfIncludeVersion) && (group->lineList->geneVersion != NULL))
        safef(geneIdToUse, sizeof(geneIdToUse), "%s.%s", group->lineList->geneId, group->lineList->geneVersion);
    else
        safecpy(geneIdToUse, sizeof(geneIdToUse), group->lineList->geneId);
    }
if ((options & genePredGxfIncludeVersion) && (group->lineList->transcriptVersion != NULL))
    safef(transcriptIdToUse, sizeof(transcriptIdToUse), "%s.%s", name, group->lineList->transcriptVersion);
else
    safecpy(transcriptIdToUse, sizeof(transcriptIdToUse), name);


/* Allocate genePred and fill in values. */
AllocVar(gp);
gp->name = cloneString(transcriptIdToUse);
gp->chrom = cloneString(group->seq);
gp->strand[0] = group->strand;
gp->txStart = geneStart;
gp->txEnd = geneEnd;
if (cdsStart < cdsEnd)
    {
    gp->cdsStart = cdsStart;
    gp->cdsEnd = cdsEnd;
    }
else
    {
    // no CDS, set to txEnd
    gp->cdsStart = gp->txEnd;
    gp->cdsEnd = gp->txEnd;
    }
gp->exonStarts = AllocArray(eStarts, exonCount);
gp->exonEnds = AllocArray(eEnds, exonCount);
gp->optFields = optFields;

if (optFields & genePredName2Fld)
    gp->name2 = cloneString(geneIdToUse);
if (optFields & genePredCdsStatFld)
    {
    if (cdsStart < cdsEnd)
        {
        if (!useStartStops)
            {
            /* GFF doesn't require start or stop, if not used, assume complete */
            gp->cdsStartStat = cdsComplete;
            gp->cdsEndStat = cdsComplete;
            }
        else if (group->strand == '+')
            {
            gp->cdsStartStat = (haveStartCodon ? cdsComplete : cdsIncomplete);
            gp->cdsEndStat = (haveStopCodon ? cdsComplete : cdsIncomplete);;
            }
        else
            {
            gp->cdsEndStat = (haveStartCodon ? cdsComplete : cdsIncomplete);
            gp->cdsStartStat = (haveStopCodon ? cdsComplete : cdsIncomplete);;
            }
        }
    else
        {
        gp->cdsStartStat = cdsNone;
        gp->cdsEndStat = cdsNone;
        }
    }
if (optFields & genePredExonFramesFld)
    {
    AllocArray(gp->exonFrames, exonCount);
    for (i = 0; i < exonCount; i++)
        gp->exonFrames[i] = -1;
    }


/* adjust tx range to include stop codon */
if ((group->strand == '+') && (gp->txEnd == stopCodonStart))
     gp->txEnd = stopCodonEnd;
else if ((group->strand == '-') && (gp->txStart == stopCodonEnd))
    gp->txStart = stopCodonStart;

i = -1; /* before first exon */
/* fill in exons, merging overlaping and adjacent exons */
for (gl = group->lineList; gl != NULL; gl = gl->next)
    {
    if (ignoreGxfLine(gl, isGtf))
        continue;
    if (isExon(gl->feature, isGtf, exonSelectWord) || (isGtf && isCds(gl->feature)))
        {
        chkGroupLine(group, gl, gp);
        if ((i < 0) || (gl->start > eEnds[i]))
            {
            /* start a new exon */
            ++i;
            assert(i < exonCount);
            eStarts[i] = gl->start;
            eEnds[i] = gl->end;
            }
        else
            {
            /* overlap, extend exon, picking the largest of ends */
            assert(i < exonCount);
            assert(gl->start >= eStarts[i]);
            if (gl->end > eEnds[i])
                eEnds[i] = gl->end;
            }
        }
    }
gp->exonCount = i+1;

/* adjust for flybase type of GFFs, with stop codon implied and outside of
 * CDS. */
if ((options & genePredGxfImpliedStopAfterCds) && !haveStopCodon)
    adjImpliedStopAfterCds(gp);

if (optFields & genePredExonFramesFld)
    assignFrame(isGtf, group, gp, options);

return gp;
}

struct genePred *genePredFromGroupedGff(struct gffFile *gff, struct gffGroup *group, char *name,
                                        char *exonSelectWord, unsigned optFields, unsigned options)
/* Convert gff->groupList to genePred list. See genePred.h for details. */
{
struct gffLine *gl;
boolean anyExon = FALSE;

/* Look to see if any exons.  If not allow CDS to be used instead. */
if (exonSelectWord)
    {
    for (gl = group->lineList; gl != NULL; gl = gl->next)
	{
	if (sameWord(gl->feature, exonSelectWord))
	    {
	    anyExon = TRUE;
	    break;
	    }
	}
    }
else
    anyExon = TRUE;
if (!anyExon)
    exonSelectWord = "CDS";

return mkFromGroupedGxf(gff, group, name, FALSE, exonSelectWord, optFields,
                        options);
}

struct genePred *genePredFromGroupedGtf(struct gffFile *gff, struct gffGroup *group, char *name,
                                        unsigned optFields, unsigned options)
/* Convert gff->groupList to genePred list, using GTF feature conventions.
 * See genePred.h for details. */
{
return mkFromGroupedGxf(gff, group, name, TRUE, NULL, optFields,
                        options);
}

static void mapCdsToGenome(struct psl *psl, struct genbankCds* cds,
                           struct genePred* gene)
/* Convert set cdsStart/end from mrna to genomic coordinates. */
{
struct genbankCds genomeCds = genbankCdsToGenome(cds, psl);
if (genomeCds.start < genomeCds.end)
    {
    gene->cdsStart = genomeCds.start;
    gene->cdsEnd = genomeCds.end;
    if (gene->optFields & genePredCdsStatFld)
        {
        if (gene->strand[0] == '+')
            {
            gene->cdsStartStat = (genomeCds.startComplete) ? cdsComplete : cdsIncomplete;
            gene->cdsEndStat = (genomeCds.endComplete) ? cdsComplete : cdsIncomplete;;
            }
        else
            {
            gene->cdsStartStat = (genomeCds.endComplete) ? cdsComplete : cdsIncomplete;;
            gene->cdsEndStat = (genomeCds.startComplete) ? cdsComplete : cdsIncomplete;
            }
        }
    }
else
    {
    /* CDS not aligned */
    gene->cdsStart = gene->txEnd;
    gene->cdsEnd = gene->txEnd;
    if (gene->optFields & genePredCdsStatFld)
        {
        gene->cdsStartStat = cdsUnknown;
        gene->cdsEndStat = cdsUnknown;
        }
    }
}

void genePredAddGenbankCds(struct psl *psl, struct genbankCds* cds, 
	struct genePred *gene)
/* Convert cdsStart/End from mrna to genomic coordinates. 
 * Note that the genePred blocks need not be filled in before
 * this call. */
{
if (cds == NULL)
    {
    /* no CDS, set to zero-length at end */
    gene->cdsStart = psl->tEnd;
    gene->cdsEnd = psl->tEnd;
    if (gene->optFields & genePredCdsStatFld)
        {
        gene->cdsStartStat = cdsNone;
        gene->cdsEndStat = cdsNone;
        }
    }
else if ((cds->start < 0) || (cds->end <= 0))
    {
    /* unknown CDS, set to end */
    gene->cdsStart = psl->tEnd;
    gene->cdsEnd = psl->tEnd;
    if (gene->optFields & genePredCdsStatFld)
        {
        gene->cdsStartStat = cdsUnknown;
        gene->cdsEndStat = cdsUnknown;
        }
    }
else 
    {
    /* have CDS annotation, make sure it's in bounds */
    struct genbankCds adjCds = *cds;
    if (adjCds.end > psl->qSize)
        {
        adjCds.end = psl->qSize;
        adjCds.endComplete = FALSE;
        }
    mapCdsToGenome(psl, &adjCds, gene);
    }
}

static int getFrame(struct psl *psl, int start, int end,
                    struct genbankCds* cds)
/* get the starting frame for an exon of a mRNA.  start and end are
 * the bounds of the exon within the mRNA.  This may cover multiple psl
 * blocks due to merging of small gaps. */
{
/* use the 3' end is used if it's complete, as it is more often accurate when
 * genes are defined from mRNAs sequenced with reverse-transcriptase. */
int frame = -1;
/* map to mRNA coords in CDS since frame for an exon is in direction of
 * transcription. */
if (psl->strand[0] == '-')
    reverseIntRange(&start, &end, psl->qSize);
if (start < cds->start)
    start = cds->start;
if (end > cds->end)
    end = cds->end;

if (start < end)
    {
    /* Compute frame from end of RNA if CDS end is marked complete and start
     * is not complete.  This is done as the end of an RNA is more likely
     * completely due to reverse transcriptase not replicating the entire RNA.
     * However, code that create CDS from genePreds doesn't always create a
     * CDS specification that indicates incompleteness. So don't use CDS end
     * unless we know start is incomplete, mean code tried to set it.  This is
     * not a perfect solution, as handling of CDS specification is naive and
     * doesn't account for truncated start or stop.  Incomplete codons can
     * result in frame shift even is CDS completeness is set correctly. */
    if (cds->endComplete && !cds->startComplete)
        {
        int fr = (cds->end-start) % 3;
        frame = (fr == 2) ? 1 : ((fr == 1) ? 2 : 0);
        }
    else
        frame = (start-cds->start) % 3;
    }
return frame;
}

static boolean shouldMergeBlocks(struct genePred *gene, 
                                 unsigned tStart, unsigned prevTEnd,
                                 unsigned qStart, unsigned prevQEnd,
                                 unsigned options,
                                 int cdsMergeSize, int utrMergeSize)
/* determine if a new block starting at tStart whould be merged with
 * the preceeding exon.
 */
{
boolean inCds = (gene->cdsStart <= tStart) && (tStart < gene->cdsEnd);
int tGapSize = (tStart - prevTEnd);
int qGapSize = (qStart - prevQEnd);
if (inCds)
    {
    if ((options & genePredPslCdsMod3)
        && (((tGapSize % 3) != 0) || ((qGapSize % 3) != 0)))
        return FALSE;  /* not a multiple of three */
    if ((cdsMergeSize >= 0) && ((tGapSize <= cdsMergeSize) && (qGapSize <= cdsMergeSize)))
        return TRUE;
    }
else 
    {
    // qGapSize only matters for CDS where we are trying to keep frame sane
    if ((utrMergeSize >= 0) && (tGapSize <= utrMergeSize))
        return TRUE;
    }
return FALSE; /* don't merge */
}

static void pslToExons(struct psl *psl, struct genePred *gene,
                       struct genbankCds* cds, unsigned options,
                       int cdsMergeSize, int utrMergeSize)
/* Convert psl alignment blocks to genePred exons, merging together blocks by
 * the specified paraemeters.  Optionally add frame information. */
{
int iBlk, iExon;
int startIdx, stopIdx, idxIncr;

/* this is bigger than needed if blocks merge */
gene->exonStarts = needMem(psl->blockCount*sizeof(unsigned));
gene->exonEnds = needMem(psl->blockCount*sizeof(unsigned));
if (gene->optFields & genePredExonFramesFld)
    {
    /* default frames to indicate no frame */
    gene->exonFrames = needMem(psl->blockCount*sizeof(unsigned));
    for (iExon = 0; iExon < psl->blockCount; iExon++)
        gene->exonFrames[iExon] = -1;
    }

/* traverse psl in postive target order */
if (psl->strand[1] == '-')
    {
    startIdx = psl->blockCount-1;
    stopIdx = -1;
    idxIncr = -1;
    }
else
    {
    startIdx = 0;
    stopIdx = psl->blockCount;
    idxIncr = 1;
    }

iExon = -1;  /* indicate none have been added */
unsigned prevQEnd = 0;
unsigned prevTEnd = 0;
for (iBlk = startIdx; iBlk != stopIdx; iBlk += idxIncr)
    {
    int qStart = psl->qStarts[iBlk];
    int qEnd = qStart + psl->blockSizes[iBlk];
    if (psl->strand[0] == '-')
        reverseIntRange(&qStart, &qEnd, psl->qSize);
    int tStart = psl->tStarts[iBlk];
    int tEnd = tStart + psl->blockSizes[iBlk];
    if (psl->strand[1] == '-')
        reverseIntRange(&tStart, &tEnd, psl->tSize);
    if ((iExon < 0) || !shouldMergeBlocks(gene, tStart, prevTEnd, qStart, prevQEnd, options,
                                          cdsMergeSize, utrMergeSize))
        {
        /* new exon */
        iExon++;
        gene->exonStarts[iExon] = tStart;
	}
    gene->exonEnds[iExon] = tEnd;
    /* Set the frame. We have to deal with multiple blocks being merged.  For
     * positive strand, the first block frame is used, for negative strand,
     * the last is used. */
    if ((gene->optFields & genePredExonFramesFld) && (cds != NULL)
        && (((psl->strand[0] == '+') && (gene->exonFrames[iExon] < 0))
            || (psl->strand[0] == '-')))
	{
        int fr = getFrame(psl, psl->qStarts[iBlk], psl->qStarts[iBlk]+psl->blockSizes[iBlk], cds);
        if (fr >= 0)
	    gene->exonFrames[iExon] = fr;
	}
    prevTEnd = tEnd;
    prevQEnd = qEnd;
    }
gene->exonCount = iExon+1;
assert(gene->exonCount <= psl->blockCount);
}

struct genePred *genePredFromPsl3(struct psl *psl,  struct genbankCds* cds, 
                                  unsigned optFields, unsigned options,
                                  int cdsMergeSize, int utrMergeSize)
/* Convert a PSL of an mRNA alignment to a genePred, converting a genbank CDS
 * specification string to genomic coordinates. Small genomic inserts are
 * merged based on the mergeSize parameters.  Gaps no larger than the
 * specified merge sizes result in the adjacent blocks being merged into a
 * single exon.  Gaps in CDS use cdsMergeSize, in UTR use utrMergeSize.  If
 * the genePredPslCdsMod3 option is specified, then CDS gaps are only merged
 * if a multiple of three.  A negative merge sizes disables merging of blocks.
 * This differs from specifying zero in that adjacent blocks will not be
 * merged. The optfields field is a set from genePredFields, indicated what
 * fields to create.  Zero-length CDS, or null cds, creates without CDS
 * annotation.  If cds is null, it will set status fields to cdsNone.  */
{
struct genePred *gene;
#if 0
/* FIXME; the browser calls this on a protein psl when rendering the amino
 * acids of a protein alignment track.  Some of this code doesn't really make
 * sense for proteins and I really don't see how this produces the right
 * results, however the check is disabled until this can be investigated
 * because it *seems* to work.  markd 2006-05-22 */
if (pslIsProtein(psl))
    errAbort("can't convert protein psls to genePreds");
#endif

AllocVar(gene);
gene->name = cloneString(psl->qName);
gene->chrom = cloneString(psl->tName);
gene->txStart = psl->tStart;
gene->txEnd = psl->tEnd;
gene->optFields = optFields;

/* get strand in genome that the positive version mRNA aligns to */
if (psl->strand[1] == '\0')
    {
    /* assumed pos target strand, so neg query would be pos target */
    gene->strand[0] = psl->strand[0];
    }
else 
    {
    /* query and target strand are different; will be neg when query pos */
    gene->strand[0] = ((psl->strand[0] != psl->strand[1]) ? '-' : '+');
    }

genePredAddGenbankCds(psl, cds, gene);
pslToExons(psl, gene, cds, options, cdsMergeSize, utrMergeSize);
return gene;
}

struct genePred *genePredFromPsl2(struct psl *psl, unsigned optFields,
                                  struct genbankCds* cds, int insertMergeSize)
/* Compatibility function, genePredFromPsl3 is prefered.  See that function's
 * documentation for details. This calls genePredFromPsl3 with no options
 * and insertMergeSize set for CDS and UTR.
 */
{
return genePredFromPsl3(psl, cds, optFields, genePredPslDefaults, insertMergeSize, insertMergeSize);
}

struct genePred *genePredFromPsl(struct psl *psl, int cdsStart, int cdsEnd,
                                 int insertMergeSize)
/* Compatibility function, genePredFromPsl3 is prefered.  See that function's
 * documentation for details. This calls genePredFromPsl3 with no options.
 */
{
struct genbankCds cds;
ZeroVar(&cds);
cds.start = cdsStart;
cds.end = cdsEnd;
return genePredFromPsl3(psl, &cds, 0, genePredPslDefaults, insertMergeSize, insertMergeSize);
}

char* genePredGetCreateSql(char* table, unsigned optFields, unsigned options,
                           int chromIndexLen)
/* Get SQL required to create a genePred table. optFields is a bit set
 * consisting of the genePredFields values. Options are a bit set of
 * genePredCreateOpts. Returned string should be freed.  This will create all
 * optional fields that preceed the highest optFields column.  chromIndexLen
 * is now ignored.. */
{
/* the >= is used so that we create preceeding fields. */
struct dyString *dy = sqlDyStringCreate(
"CREATE TABLE %s (", table);

/* bin column goes here */
if (options & genePredWithBin)
    sqlDyStringPrintf(dy,
"    bin smallint unsigned not null,"
"    INDEX(chrom,bin),");
else
    sqlDyStringPrintf(dy,
"   INDEX(chrom,txStart),");

sqlDyStringPrintf(dy,
"   name varchar(255) not null,"	/* mrna accession of gene */
"   chrom varchar(255) not null,"	/* Chromosome name */
"   strand char(1) not null,"		/* + or - for strand */
"   txStart int unsigned not null,"	/* Transcription start position */
"   txEnd int unsigned not null,"	/* Transcription end position */
"   cdsStart int unsigned not null,"	/* Coding region start */
"   cdsEnd int unsigned not null,"	/* Coding region end */
"   exonCount int unsigned not null,"	/* Number of exons */
"   exonStarts longblob not null,"	/* Exon start positions */
"   exonEnds longblob not null,"	/* Exon end positions */
"   INDEX(name)"
);

if (optFields >= genePredScoreFld)
    sqlDyStringPrintf(dy,
"   ,score int");

if (optFields >= genePredName2Fld)
    sqlDyStringPrintf(dy,
"  ,name2 varchar(255) not null,"    /* Secondary name. (e.g. name of gene) or NULL if not available */
"   INDEX(name2)");

if (optFields >= genePredCdsStatFld)
    sqlDyStringPrintf(dy,
"  ,cdsStartStat enum('none', 'unk', 'incmpl', 'cmpl') not null,"    /* Status of cdsStart annotation */
"   cdsEndStat enum('none', 'unk', 'incmpl', 'cmpl') not null");     /* Status of cdsEnd annotation */

if (optFields >= genePredExonFramesFld)
    sqlDyStringPrintf(dy,
"   ,exonFrames longblob not null");    /* List of frame for each exon, or -1 if no frame or not known. NULL if not available. */

sqlDyStringPrintf(dy, ")");

return cloneString(dyStringCannibalize(&dy));
}

// FIXME: this really doesn't belong in this module
struct genePred *getOverlappingGene(char *db, struct genePred **list, char *table, char *chrom, int cStart, int cEnd, char *name, int *retOverlap)
{
/* read all genes from a table find the gene with the biggest overlap. 
   Cache the list of genes to so we only read it once */

char query[256];
struct genePred *gene;
struct sqlConnection *conn;
struct sqlResult *sr;
char **row;
struct genePred *el = NULL, *bestMatch = NULL, *gp = NULL;
int overlap = 0 , bestOverlap = 0, i;
struct psl *psl;
int *eFrames;


if (*list == NULL)
    {
    printf("Loading Predictions from %s\n",table);
    AllocVar(*list);
    conn = hAllocConn(db);
    AllocVar(gene);
    sqlSafef(query, sizeof(query), "select * from %s", table);
    sr = sqlGetResult(conn, query);
    while ((row = sqlNextRow(sr)) != NULL)
        {
        if (!sameString(table,"all_mrna"))
            {
            el = genePredLoad(row);
            }
        else
            {
            psl = pslLoad(row);
            el = genePredFromPsl2(psl, 0, NULL, genePredStdInsertMergeSize);
            }
        slAddHead(list, el);
        }
    slReverse(list);
    sqlFreeResult(&sr);
    hFreeConn(&conn);
    }
for (el = *list; el != NULL; el = el->next)
    {
    if (chrom != NULL && el->chrom != NULL)
        {
        overlap = 0;
        if ( sameString(chrom, el->chrom))
            {
            for (i = 0 ; i<(el->exonCount); i++)
                {
                overlap += positiveRangeIntersection(cStart,cEnd, el->exonStarts[i], el->exonEnds[i]) ;
                }
            if (overlap > 20 && sameString(name, el->name))
                {
                bestMatch = el;
                bestOverlap = overlap;
                *retOverlap = bestOverlap;
                }
            if (overlap > bestOverlap)
                {
                bestMatch = el;
                bestOverlap = overlap;
                *retOverlap = bestOverlap;
                }
            }
        }
    }
if (bestMatch != NULL)
    {
    /* Allocate genePred and fill in values. */
    AllocVar(gp);
    gp->name = cloneString(bestMatch->name);
    gp->chrom = cloneString(bestMatch->chrom);
    gp->strand[1] = bestMatch->strand[1];
    gp->strand[0] = bestMatch->strand[0];
    gp->txStart = bestMatch->txStart;
    gp->txEnd = bestMatch->txEnd;
    gp->cdsStart = bestMatch->cdsStart;
    gp->cdsEnd = bestMatch->cdsEnd;
    gp->exonCount = bestMatch->exonCount;
    AllocArray(gp->exonStarts, bestMatch->exonCount);
    AllocArray(gp->exonEnds, bestMatch->exonCount);
    for (i=0; i<bestMatch->exonCount; ++i)
        {
        gp->exonStarts[i] = bestMatch->exonStarts[i] ;
        gp->exonEnds[i] = bestMatch->exonEnds[i] ;
        }
    gp->optFields = bestMatch->optFields;
    gp->score = bestMatch->score;

    if (bestMatch->optFields & genePredName2Fld)
        gp->name2 = cloneString(bestMatch->name2);
    else
        gp->name2 = NULL;
    if (bestMatch->optFields & genePredCdsStatFld)
        {
        gp->cdsStartStat = bestMatch->cdsStartStat;
        gp->cdsEndStat = bestMatch->cdsEndStat;
        }
    if (bestMatch->optFields & genePredExonFramesFld)
        {
        gp->exonFrames = AllocArray(eFrames, bestMatch->exonCount);
        for (i = 0; i < bestMatch->exonCount; i++)
            gp->exonFrames[i] = bestMatch->exonFrames[i];
        }
    eFrames = gp->exonFrames;
    }

return gp;
}
int genePredBases(struct genePred *gp)
/* count coding and utr bases in a gene prediction */
{
int count = 0, i;

if (gp == NULL) return 0;

for (i=0; i<gp->exonCount; i++)
    {
    count += gp->exonEnds[i] - gp->exonStarts[i] ;
    }
return count;
}

int genePredCodingBases(struct genePred *gp)
/* Count up the number of coding bases in gene prediction. */
{
int i, exonCount = gp->exonCount;
int cdsStart = gp->cdsStart, cdsEnd = gp->cdsEnd;
int baseCount = 0;
for (i=0; i<exonCount; ++i)
    {
    baseCount += positiveRangeIntersection(cdsStart,cdsEnd,
    	gp->exonStarts[i], gp->exonEnds[i]);
    }
return baseCount;
}

boolean genePredCdsExon(struct genePred *gp, int iExon, int *startPtr, int *endPtr)
/* Get the CDS range in an exon.  If there is no CDS, return FALSE and then
 * set start == end */
{
assert((iExon >= 0) && (iExon < gp->exonCount));
int start = gp->exonStarts[iExon];
int end = gp->exonEnds[iExon];

if (start < gp->cdsStart)
    start = gp->cdsStart;
if (end > gp->cdsEnd)
    end = gp->cdsEnd;
if (startPtr != NULL)
    *startPtr = start;
if (endPtr != NULL)
    *endPtr = end;
return (start < end);
}

static void gpError(FILE* errFh, int* errorCnt, char *format, ...)
/* print and count an error */
{
va_list args;
fprintf(errFh, "Error: ");
va_start(args, format);
vfprintf(errFh, format, args);
va_end(args);
fputc('\n', errFh);
(*errorCnt)++;
}

static void checkExon(FILE* errFh, int* errorCnt, char *desc, struct genePred* gp, int iExon)
/* check one exon of a genePred */
{
unsigned exonStart = gp->exonStarts[iExon];
unsigned exonEnd = gp->exonEnds[iExon];
if (exonStart >= exonEnd)
    gpError(errFh, errorCnt, "%s: %s exon %u start %u >= end %u", desc, gp->name, iExon, exonStart, exonEnd);
if (exonStart < gp->txStart)
    gpError(errFh, errorCnt, "%s: %s exon %u start %u < txStart %u", desc, gp->name, iExon, exonStart, gp->txStart);
if (exonEnd > gp->txEnd)
    gpError(errFh, errorCnt, "%s: %s exon %u end %u > txEnd %u", desc, gp->name, iExon, exonEnd, gp->txEnd);
if (iExon > 0)
    {
    /* other than first exon */
    if (exonStart < gp->exonEnds[iExon-1])
        gpError(errFh, errorCnt, "%s: %s exon %u (%s:%d-%d) overlaps previous exon (%s:%d-%d)",
                desc, gp->name, iExon, gp->chrom, exonStart, exonEnd, gp->chrom, gp->exonStarts[iExon-1], gp->exonEnds[iExon-1]);
    }

if (gp->optFields & genePredExonFramesFld)
    {
    int frame = gp->exonFrames[iExon];
    if ((frame < -1) || (frame > 2))
        gpError(errFh, errorCnt, "%s: %s invalid exonFrame: %d", desc, gp->name, frame);
    if ((exonEnd > gp->cdsStart) && (exonStart < gp->cdsEnd))
        {
        if (frame == -1)
            gpError(errFh, errorCnt, "%s: %s no exonFrame on CDS exon %d", desc, gp->name, iExon);
        }
    else
        {
        if (frame != -1)
            gpError(errFh, errorCnt, "%s: %s exonFrame on non-CDS exon %d", desc, gp->name, iExon);
        }
    }
}

static void checkCdsBounds(FILE* errFh, int* errorCnt, char *desc, struct genePred* gp)
/* check cdsStart/cdsEnd */
{
int iExon;
boolean foundCdsStart = FALSE, foundCdsEnd = FALSE;
if (gp->cdsStart > gp->cdsEnd)
    gpError(errFh, errorCnt, "%s: %s cdsStart %u > cdsEnd %u", desc, gp->name, gp->cdsStart, gp->cdsEnd);
if ((gp->cdsStart < gp->txStart) || (gp->cdsStart > gp->txEnd))
    gpError(errFh, errorCnt, "%s: %s cdsStart %u not in tx bounds %u-%u", desc, gp->name, gp->cdsStart, gp->txStart, gp->txEnd);
if ((gp->cdsEnd < gp->txStart) || (gp->cdsEnd > gp->txEnd))
    gpError(errFh, errorCnt, "%s: %s cdsEnd %u not in tx bounds %u-%u", desc, gp->name, gp->cdsEnd, gp->txStart, gp->txEnd);
/*  is cdsStart/cdsEnd in an exon?  */
for (iExon = 0; iExon < gp->exonCount; iExon++)
    {
    if ((gp->cdsStart >= gp->exonStarts[iExon])
        && (gp->cdsStart < gp->exonEnds[iExon]))
        foundCdsStart = TRUE;
    if ((gp->cdsEnd > gp->exonStarts[iExon])
        && (gp->cdsEnd <= gp->exonEnds[iExon]))
        foundCdsEnd = TRUE;
    }
if (!foundCdsStart)
    gpError(errFh, errorCnt, "%s: %s cdsStart %u not in an exon", desc, gp->name, gp->cdsStart);
if (!foundCdsEnd)
    gpError(errFh, errorCnt, "%s: %s cdsEnd %u not in an exon", desc, gp->name, gp->cdsEnd);
}

int genePredCheck(char *desc, FILE* errFh, int chromSize, struct genePred* gp)
/* Validate a genePred for consistency.  desc is printed the error messages
 * to file errFh (open /dev/null to discard).  chromSize should contain
 * size of chromosome, or 0 if chrom is not valid, or -1 to not check
 * chromosome bounds. Returns count of errors. */
{
int iExon;
int errorCnt = 0;

if (gp->name == NULL)
    gpError(errFh, &errorCnt, "%s: name is null", desc);

if (!(sameString(gp->strand, "+") || sameString(gp->strand, "-")))
    gpError(errFh, &errorCnt, "%s: %s invalid strand: \"%s\"", desc, gp->name, gp->strand);

/* check chrom */
if (chromSize == 0)
    gpError(errFh, &errorCnt, "%s: %s chrom not a valid chromosome: \"%s\"", desc, gp->name, gp->chrom);
else if (chromSize > 0)
    {
    if (gp->txEnd > chromSize)
        gpError(errFh, &errorCnt, "%s: %s txEnd %u >= chromSize %u", desc, gp->name, gp->txEnd, chromSize);
    }

/* check internal consistency */
if (gp->txStart >= gp->txEnd)
    gpError(errFh, &errorCnt, "%s: %s txStart %u >= txEnd %u", desc, gp->name, gp->txStart, gp->txEnd);

/* no CDS is indicated by cdsStart == cdsEnd */
if (gp->cdsStart != gp->cdsEnd)
    checkCdsBounds(errFh, &errorCnt, desc, gp);

/* must be at least one exon */
if (gp->exonCount == 0)
    gpError(errFh, &errorCnt, "%s: %s contains no exons", desc, gp->name);
else
    {
/* make sure first/last exons match tx range */
    if (gp->txStart != gp->exonStarts[0])
        gpError(errFh, &errorCnt, "%s: %s first exon start %u doesn't match txStart %u", desc, gp->name, gp->exonStarts[0], gp->txStart);
    if (gp->txEnd != gp->exonEnds[gp->exonCount-1])
        gpError(errFh, &errorCnt, "%s: %s last exon end %u doesn't match txEnd %u", desc, gp->name, gp->exonEnds[gp->exonCount-1], gp->txEnd);
    }

/* check each exon */
for (iExon = 0; iExon < gp->exonCount; iExon++)
    checkExon(errFh, &errorCnt, desc, gp, iExon);

return errorCnt;
}

static int lookupChromSize(char *desc, FILE* errFh, char* db, struct genePred* gp, int *errorCnt)
/* lookup the chromosome size in the database, -1 if invalid */
{
// hGetChromInfo is case independent
struct chromInfo *ci = hGetChromInfo(db, gp->chrom);
if (ci == NULL)
    {
    fprintf(errFh, "Error: %s: %s has invalid chrom for %s: %s\n", desc, gp->name, db, gp->chrom);
    (*errorCnt)++;
    return -1;  // don't validate
    }
else if (differentString(gp->chrom, ci->chrom)) // verify case dependent equal
    {
    fprintf(errFh, "Error: %s: %s has invalid chrom for %s: %s\n", desc, gp->name, db, gp->chrom);
    (*errorCnt)++;
    return -1;  // don't validate
    }
else
    return ci->size;
}

int genePredCheckDb(char *desc, FILE* errFh, char* db, struct genePred* gp)
/* Validate a genePred for consistency.  desc is printed the error messages
 * to file errFh (open /dev/null to discard).  Lookup chromosome size in database if
 * db is not NULL. Returns count of errors. */
{
int errorCnt = 0;
int chromSize = -1;  /* default to not checking */
if (db != NULL)
    chromSize = lookupChromSize(desc, errFh, db, gp, &errorCnt);
errorCnt += genePredCheck(desc, errFh, chromSize, gp);
return errorCnt;
}

int genePredCheckChromSizes(char *desc, FILE* errFh, struct genePred* gp,
                            struct hash* chromSizes)
/* Validate a genePred for consistency.  desc is printed the error messages
 * to file errFh (open /dev/null to discard).  Lookup chromosome size in hash.
 */
{
int errorCnt = 0;
int chromSize = hashIntValDefault(chromSizes, gp->chrom, -1);
if (chromSize < 0)
    {
    fprintf(errFh, "Error: %s: %s has invalid chrom for: %s\n", desc, gp->name, gp->chrom);
    errorCnt++;
    }
else
    errorCnt += genePredCheck(desc, errFh, chromSize, gp);
return errorCnt;
}

boolean genePredNmdTarget(struct genePred *gp) 
/* Return TRUE if cds end is more than 50bp upstream of
   last intron. */
{
int gpSize = 0;
int i = 0;
int startDist = 0, endDist = 0;
int blockSize = 0;
if(gp->exonCount < 2 || gp->cdsStart == gp->cdsEnd)
    return FALSE;
for(i = 0; i < gp->exonCount; i++)
    {
    int blockStart = gp->exonStarts[i];
    int blockEnd = gp->exonEnds[i];
    blockSize = blockEnd - blockStart;
    if( blockStart <= gp->cdsStart && gp->cdsStart <= blockEnd)
	startDist += blockSize - (blockEnd - gp->cdsStart);
    else if(blockEnd <= gp->cdsStart)
	startDist += blockSize;
    if( blockStart <= gp->cdsEnd && gp->cdsEnd <= blockEnd)
	endDist += blockSize - (blockEnd - gp->cdsEnd);
    else if(blockEnd <= gp->cdsEnd)
	endDist += blockSize;
    gpSize += blockSize;
    }
/*  xxxxxXXX----XXXXXXXXXXXXX--------XXXXXXX----XXXxxxxxxxx-------xxxxxxxxx  */                                           
if(sameString(gp->strand, "+"))
    {
    blockSize = gp->exonEnds[gp->exonCount-1] - gp->exonStarts[gp->exonCount-1];
    return endDist < (gpSize - blockSize - 50); 
    }
else if(sameString(gp->strand, "-"))
    {
    blockSize = gp->exonEnds[0] - gp->exonStarts[0];
    return startDist > (blockSize + 50);
    }
else 
    errAbort("genePredNmdsTarget() - Don't recognize strand: %s\n", gp->strand);
return FALSE;
}

void genePredAddExonFrames(struct genePred *gp)
/* Add exonFrames array to a genePred that doesn't have it. Frame is assumed
 * to be contiguous.  NOTE: suggest using genePredFixExonFrames for new code. */
{
int iExon, start, end, iBase = 0;
int iStart, iEnd, iDir;

/* initial array to all -1, for no frame */
AllocArray(gp->exonFrames, gp->exonCount);
gp->optFields |= genePredExonFramesFld;
for (iExon = 0; iExon < gp->exonCount; iExon++)
    gp->exonFrames[iExon] = -1;

if (sameString(gp->strand, "+"))
    {
    iStart = 0;
    iEnd = gp->exonCount;
    iDir = 1;
    }
else
    {
    iStart = gp->exonCount-1;
    iEnd = -1;
    iDir = -1;
    }
for (iExon = iStart; iExon != iEnd; iExon += iDir)
    {
    if (genePredCdsExon(gp, iExon, &start, &end))
        {
        gp->exonFrames[iExon] = iBase % 3;
        iBase += (end - start);
        }
    }
}

void genePredFixExonFrames(struct genePred *gp)
/* Add exonFrames array to a genePred that has frame on only some or no
 * features. Frame is assumed to be contiguous when an existing frame is not
 * present. */
{
int iStart, iEnd, iDir;

if (gp->exonFrames == NULL)
    {
    /* doesn't currently have frames */
    AllocArray(gp->exonFrames, gp->exonCount);
    gp->optFields |= genePredExonFramesFld;
    for (int iExon = 0; iExon < gp->exonCount; iExon++)
        gp->exonFrames[iExon] = -1;
    }

if (sameString(gp->strand, "+"))
    {
    iStart = 0;
    iEnd = gp->exonCount;
    iDir = 1;
    }
else
    {
    iStart = gp->exonCount - 1;
    iEnd = -1;
    iDir = -1;
    }
int nextFrame = -1;
for (int iExon = iStart; iExon != iEnd; iExon += iDir)
    {
    int start, end;
    if (genePredCdsExon(gp, iExon, &start, &end))
        {
        if (gp->exonFrames[iExon] < 0)
            {
            // no existing frame
            if (nextFrame < 0)
                nextFrame = 0;  // first frame
            gp->exonFrames[iExon] = nextFrame;
            }
        nextFrame = incrFrame(gp->exonFrames[iExon], end - start);
        }
    else
        {
        gp->exonFrames[iExon] = -1;
        }
    }
}

void genePredRc(struct genePred *gp, int chromSize)
/* Reverse complement a genePred (project it to the opposite strand).  Useful
 * when doing analysis that is simplified by having things on the same strand.
 */
{
int iExon;
gp->strand[0] = (gp->strand[0] == '+') ? '-' : '+';
reverseUnsignedRange(&gp->txStart, &gp->txEnd, chromSize);
reverseUnsignedRange(&gp->cdsStart, &gp->cdsEnd, chromSize);
for (iExon = 0; iExon < gp->exonCount; iExon++)
    reverseUnsignedRange(&gp->exonStarts[iExon], &gp->exonEnds[iExon], chromSize);
reverseUnsigned(gp->exonStarts, gp->exonCount);
reverseUnsigned(gp->exonEnds, gp->exonCount);
if (gp->optFields & genePredCdsStatFld)
    {
    enum cdsStatus hold =  gp->cdsStartStat;
    gp->cdsStartStat = gp->cdsEndStat;
    gp->cdsEndStat = hold;
    }
if (gp->optFields & genePredExonFramesFld)
    reverseInts(gp->exonFrames, gp->exonCount);
}

struct genePred *genePredNew(char *name, char *chrom, char strand,
                             unsigned txStart, unsigned txEnd,
                             unsigned cdsStart, unsigned cdsEnd,
                             unsigned optFields, unsigned exonSpace)
/* create a new gene with space for the specified number of exons allocated.
 * genePredGrow maybe used to expand this space if needed. */
{
struct genePred *gp;
AllocVar(gp);
assert(exonSpace > 0);

gp->name = cloneString(name);
gp->chrom  = cloneString(chrom);
gp->strand[0] = strand;
gp->txStart = txStart;
gp->txEnd = txEnd;
gp->cdsStart  = cdsStart;
gp->cdsEnd = cdsEnd;
gp->optFields = optFields;
AllocArray(gp->exonStarts, exonSpace);
AllocArray(gp->exonEnds, exonSpace);
if (gp->optFields & genePredExonFramesFld)
    AllocArray(gp->exonFrames, exonSpace);
return gp;
}

void genePredGrow(struct genePred *gp, unsigned *exonSpacePtr)
/* Increase memory allocated to a psl to hold more exons.  exonSpacePtr
 * should point the the current maximum number of exons and will be
 * updated to with the new amount of space. */
{
int exonSpace = *exonSpacePtr;
int newSpace = 2 * exonSpace;
ExpandArray(gp->exonStarts, exonSpace, newSpace);
ExpandArray(gp->exonEnds, exonSpace, newSpace);
if (gp->optFields & genePredExonFramesFld)
    ExpandArray(gp->exonFrames, exonSpace, newSpace);
*exonSpacePtr = newSpace;
}

struct rbTree *genePredToRangeTree(struct genePred *gp, boolean cdsOnly)
/* Convert genePred into a range tree. */
{
struct rbTree *rangeTree = rangeTreeNew();
int i;
for (i=0; i<gp->exonCount; ++i)
    {
    int start = gp->exonStarts[i];
    int end = gp->exonEnds[i];
    if (cdsOnly)
        {
	start = max(start, gp->cdsStart);
	end = min(end, gp->cdsEnd);
	if (start >= end)
	    continue;
	}
    rangeTreeAdd(rangeTree, start, end);
    }
return rangeTree;
}

void gpPartOutAsBed(struct genePred *gp, int start, int end, FILE *f, 
	char *type, int id, int minSize)
/* Write out part of gp as bed12. */
{
/* Figure out # of blocks and min/max of area inside start/end */
int blockCount = 0;
int newStart = gp->txEnd, newEnd = gp->txStart;
int size = 0;
int i;
for (i=0; i<gp->exonCount; ++i)
    {
    int exonStart = gp->exonStarts[i];
    int exonEnd = gp->exonEnds[i];
    exonStart = max(start, exonStart);
    exonEnd = min(end, exonEnd);
    if (exonStart < exonEnd)
        {
	++blockCount;
	newStart = min(exonStart, newStart);
	newEnd = max(exonEnd, newEnd);
	size += exonEnd - exonStart;
	}
    }

/* Output first 10 fields of bed12. */
if (size > minSize)
    {
    fprintf(f, "%s\t%d\t%d\t", gp->chrom, newStart, newEnd);
    fprintf(f, "%s_%d_%s\t", type, id, gp->name);
    fprintf(f, "0\t%s\t0\t0\t0\t%d\t", gp->strand, blockCount);

    /* Output blockSizes field */
    for (i=0; i<gp->exonCount; ++i)
	{
	int exonStart = gp->exonStarts[i];
	int exonEnd = gp->exonEnds[i];
	exonStart = max(start, exonStart);
	exonEnd = min(end, exonEnd);
	if (exonStart < exonEnd)
	    fprintf(f, "%d,", exonEnd - exonStart);
	}
    fprintf(f, "\t");

    /* Output chromStarts field */
    for (i=0; i<gp->exonCount; ++i)
	{
	int exonStart = gp->exonStarts[i];
	int exonEnd = gp->exonEnds[i];
	exonStart = max(start, exonStart);
	exonEnd = min(end, exonEnd);
	if (exonStart < exonEnd)
	    fprintf(f, "%d,", exonStart - newStart);
	}
    fprintf(f, "\n");
    }
}

boolean codonToPos(struct genePred *gp, unsigned num, int *chromStart, int *chromEnd)
{
// map 1-based codon to genomic coordinates. If the codon crosses an exon junction, we return just the beginning (LHS) of the codon.
// Returns true if we find the codon in given gene predition; chromStart and end are set to appropriate three base region.

int pos = -1;
int i;
int offset = -1;  // current 1-based offset in bases (not codons)
if(gp->strand[0] == '+')
    {
    for(i = 0; i < gp->exonCount; i++)
        {
        if(gp->exonEnds[i] > gp->cdsStart && gp->exonStarts[i] < gp->cdsEnd)
            {
            int start, end;
            if(offset == -1 && gp->cdsStart <= gp->exonEnds[i])
                {
                // start counting
                start = gp->cdsStart;
                offset = 1;
                }
            else
                start = gp->exonStarts[i];
            if(gp->cdsEnd < gp->exonEnds[i])
                end = gp->cdsEnd;
            else
                end = gp->exonEnds[i];
            int next = offset + end - start;
            if(next > (num * 3 - 2))
                {
                pos = start + (((num - 1) * 3 + 1) - offset);
                break;
                }
            else
                offset = next;
            }
        }
    }
else
    {
    for(i = gp->exonCount - 1; i >= 0; i--)
        {
        if(gp->exonStarts[i] < gp->cdsEnd && gp->exonEnds[i] > gp->cdsStart)
            {
            int start, end; // start here is really the RHS, and end is the LHS
            if(offset == -1 && gp->cdsEnd >= gp->exonStarts[i])
                {
                // start counting
                start = gp->cdsEnd;
                offset = 1;
                }
            else
                start = gp->exonEnds[i];
            if(gp->cdsStart > gp->exonStarts[i])
                end = gp->cdsStart;
            else
                end = gp->exonStarts[i];
            int next = offset + start - end;
            if(next > num * 3)
                {
                pos = start - (num*3 - offset) - 1;
                break;
                }
            else
                offset = next;
            }
        }
    }
if(pos == -1)
    return FALSE;
else
    {
    *chromStart = pos;
    *chromEnd = pos + 3;
    return TRUE;
    }
}

boolean exonToPos(struct genePred *gp, unsigned num, int *chromStart, int *chromEnd)
{
// map 1-based exon number to genomic coordinates.
// Returns true if we find the exon in given gene predition; start and end are set to appropriate region.

if(num == 0 || num > gp->exonCount)
    return FALSE;
else if(gp->strand[0] == '+')
    {
    *chromStart = gp->exonStarts[num - 1];
    *chromEnd = gp->exonEnds[num - 1];
    }
else
    {
    *chromStart = gp->exonStarts[gp->exonCount - num];
    *chromEnd = gp->exonEnds[gp->exonCount - num];
    }
return TRUE;
}

static char *genePredAutoSqlString =
    "table genePred\n"
    "\"A gene prediction.\"\n"
    "    (\n"
    "    string name;	\"Name of gene\"\n"
    "    string chrom;	\"Reference sequence chromosome or scaffold\"\n"
    "    char[1] strand;     \"+ or - for strand\"\n"
    "    uint txStart;	\"Transcription start position\"\n"
    "    uint txEnd;         \"Transcription end position\"\n"
    "    uint cdsStart;	\"Coding region start\"\n"
    "    uint cdsEnd;        \"Coding region end\"\n"
    "    uint exonCount;     \"Number of exons\"\n"
    "    uint[exonCount] exonStarts; \"Exon start positions\"\n"
    "    uint[exonCount] exonEnds;   \"Exon end positions\"\n"
    "    )\n";

struct asObject *genePredAsObj()
// Return asObject describing fields of genePred
{
return asParseText(genePredAutoSqlString);
}

struct dnaSeq *genePredGetDna(char *database, struct genePred *gp,
                              boolean coding, enum dnaCase dnaCase)
// Returns the DNA sequence associated with gene prediction.
// Negative strand genes will return the sequence as read from the negative strand.
// Optionally restrict to coding sequence only
{
struct dnaSeq *dnaSeq = hDnaFromSeq(database, gp->chrom, gp->txStart, gp->txEnd,  dnaCase);
if (dnaSeq == NULL)
    return NULL;
DNA *seq = dnaSeq->dna;
int len = dnaSeq->size;
if (coding)
    {
    int cdnaOffset = 0, exonIx;
    for (exonIx = 0; exonIx < gp->exonCount; exonIx++)
        {
        int exonStart = max(gp->exonStarts[exonIx],gp->cdsStart);
        int exonEnd   = min(gp->exonEnds[  exonIx],gp->cdsEnd  );
        if (exonStart >= exonEnd) // non-coding exon
            continue;
        exonStart -= gp->txStart;
        exonEnd   -= gp->txStart;
        int exonSize  = exonEnd - exonStart;

        assert(cdnaOffset <= exonStart && exonEnd <= len);
        if (cdnaOffset < exonStart)
            memmove(seq + cdnaOffset, seq + exonStart, exonSize);
        cdnaOffset += exonSize;
        }
    seq[cdnaOffset] = '\0';
    dnaSeq->size = cdnaOffset;
    }

if (gp->strand[0] == '-')
    reverseComplement(seq,dnaSeq->size);

return dnaSeq;
}

int genePredBaseToCodingPos(struct genePred *gp, int basePos,
                            boolean stranded, boolean *isCoding)
// Given a genePred model and a single (0 based) base position, predict the 0-based
// DNA (stranded) coding sequence pos.  Dividing this number by 3 should give the AA position!
// Returns -1 when outside of coding exons unless OPTIONAL isCoding pointer to boolean is
// provided. In that case, returns last valid position and sets isCoding to FALSE.
{
if (isCoding == NULL && (basePos < gp->cdsStart || basePos >= gp->cdsEnd))
    return -1;

boolean reverse = (stranded && gp->strand[0] == '-');
assert(gp->exonStarts != NULL && gp->exonEnds != NULL);

// Walk through exons, advancing depending upon strand
int codingBasesSoFar = 0;
int exonIx = (reverse ? (gp->exonCount - 1) : 0);
while (0 <= exonIx && exonIx < gp->exonCount)
    {
    int exonStart = max(gp->exonStarts[exonIx],gp->cdsStart);
    int exonEnd   = min(gp->exonEnds[  exonIx],gp->cdsEnd  );
    if (exonStart < exonEnd) // coding exon
        {
        // Within this exon?
        if (exonStart <= basePos && basePos < exonEnd)
            {
            if (isCoding != NULL)
                *isCoding = TRUE;
            if (reverse)
                return (codingBasesSoFar + (exonEnd - basePos));
            else
                return (codingBasesSoFar + (basePos - exonStart));
            }

        // Past the target base already?
        if (( reverse && basePos >= exonEnd  )
        ||  (!reverse && basePos <  exonStart))
            break;

        // Yet to reach the target base... accumulate exon worth
        codingBasesSoFar += (exonEnd - exonStart);
        }

    exonIx += (reverse ? -1 : 1);
    }

if (isCoding != NULL && codingBasesSoFar > 0)
    {
    *isCoding = FALSE;
    return codingBasesSoFar;
    }
return -1;  // introns not okay
}

struct genePredExt  *genePredFromBedBigGenePred( char *chrom, struct bed *bed, struct bigBedInterval *bb)
/* build a genePred from a bigGenePred and a bed file */
{
char *extra = cloneString(bb->rest);
int numCols = 12 + 8 - 3;
char *row[numCols];
int wordCount = chopByChar(extra, '\t', row, numCols);
if (wordCount < numCols)
    errAbort("expected at least %d columns in bigGenePred, got %d; is this actually a bigGenePred?", numCols, wordCount);

struct genePredExt *gp = bedToGenePredExt(bed);

gp->name2 = cloneString(row[ 9]);

int numBlocks;
sqlSignedDynamicArray(row[ 12],  &gp->exonFrames, &numBlocks);
gp->optFields |= genePredExonFramesFld;
//assert (numBlocks == gp->exonCount);

gp->type = cloneString(row[13]);
gp->geneName = cloneString(row[14]);
gp->geneName2 = cloneString(row[15]);

return gp;
}

struct genePredExt  *genePredFromBigGenePred( char *chrom, struct bigBedInterval *bb)
/* build a genePred from a bigGenePred */
{
char *extra = cloneString(bb->rest);
int numCols = 12 + 8 - 3;
char *row[numCols];
int wordCount = chopByChar(extra, '\t', row, numCols);
assert(wordCount == numCols);

struct genePredExt *gp;
AllocVar(gp);

gp->chrom = chrom;
gp->txStart = bb->start;
gp->txEnd = bb->end;
gp->name =  cloneString(row[ 0]);
gp->strand[0] =  row[ 2][0];
gp->strand[1] =  row[ 2][1];
gp->cdsStart =  atoi(row[ 3]);
gp->cdsEnd =  atoi(row[ 4]);
gp->exonCount =  atoi(row[ 6]);
int numBlocks;
sqlUnsignedDynamicArray(row[ 8],  &gp->exonStarts, &numBlocks);
assert (numBlocks == gp->exonCount);
sqlUnsignedDynamicArray(row[ 7],  &gp->exonEnds, &numBlocks);
assert (numBlocks == gp->exonCount);

int ii;
for(ii=0; ii < numBlocks; ii++)
    {
    gp->exonStarts[ii] += bb->start;
    gp->exonEnds[ii] += gp->exonStarts[ii];
    }

gp->name2 = cloneString(row[ 9]);

gp->cdsStartStat = parseCdsStat(row[ 10]);
gp->cdsEndStat = parseCdsStat(row[ 11]);
sqlSignedDynamicArray(row[ 12],  &gp->exonFrames, &numBlocks);
gp->optFields |= genePredExonFramesFld;
assert (numBlocks == gp->exonCount);

gp->type = cloneString(row[13]);
gp->geneName = cloneString(row[14]);
gp->geneName2 = cloneString(row[15]);

return gp;
}

static void sqlUnsignedDynamicArrayNoClobber(char *s, unsigned **retArray, int *retSize)
/* Make a copy of s on stack and chop that up so we don't mangle s. */
{
char copy[strlen(s)+1];
safecpy(copy, sizeof(copy), s);
sqlUnsignedDynamicArray(copy, retArray, retSize);
}

struct genePredExt  *genePredFromBigGenePredRow(char **row)
/* build a genePred from a bigGenePred row */
{
struct genePredExt *gp;
AllocVar(gp);
gp->chrom = cloneString(row[0]);
gp->txStart = sqlUnsigned(row[1]);
gp->txEnd = sqlUnsigned(row[2]);
gp->name = cloneString(row[3]);
gp->strand[0] = row[5][0];
gp->strand[1] = row[5][1];
gp->cdsStart = sqlUnsigned(row[6]);
gp->cdsEnd = sqlUnsigned(row[7]);
gp->exonCount = sqlUnsigned(row[9]);
int numBlocks;
sqlUnsignedDynamicArrayNoClobber(row[11], &gp->exonStarts, &numBlocks);
assert (numBlocks == gp->exonCount);
// First put blockSizes in exonEnds:
sqlUnsignedDynamicArrayNoClobber(row[10], &gp->exonEnds, &numBlocks);
assert (numBlocks == gp->exonCount);
// Then add in txStart to relative starts, and add starts to block sizes to get ends:
int ii;
for(ii=0; ii < numBlocks; ii++)
    {
    gp->exonStarts[ii] += gp->txStart;
    gp->exonEnds[ii] += gp->exonStarts[ii];
    }
gp->name2 = cloneString(row[12]);
gp->cdsStartStat = parseCdsStat(row[13]);
gp->cdsEndStat = parseCdsStat(row[14]);
gp->optFields |= genePredCdsStatFld;
sqlSignedDynamicArray(row[15],  &gp->exonFrames, &numBlocks);
assert (numBlocks == gp->exonCount);
gp->optFields |= genePredExonFramesFld;
gp->type = cloneString(row[16]);
gp->geneName = cloneString(row[17]);
gp->geneName2 = cloneString(row[18]);
return gp;
}

struct cds
/* CDS sequence being assembled */
{
    char *bases;    // CDS string being built
    int iCds;       // Index into CDS
    int nextFrame;  // next required frame
};

static struct dnaSeq* getGeneRegion(struct nibTwoCache* genomeSeqs,  struct genePred *gp, int start, int end,
                                    int *chromSize)
/* Get the genome sequence covering the a region of a gene, in transcription order */
{
struct dnaSeq *dna = nibTwoCacheSeqPart(genomeSeqs, gp->chrom, start, end-start, chromSize);
if (gp->strand[0] == '-')
    reverseComplement(dna->dna, dna->size);
return dna;
}

static void removePartialCodon(struct cds* cds)
/* remove partial codon that has already been copied to CDS */
{
int iCdsNew = cds->iCds - cds->nextFrame;
if (iCdsNew < 0)
    iCdsNew = 0;
zeroBytes(cds->bases+iCdsNew, (cds->iCds - iCdsNew));
cds->iCds = iCdsNew;
cds->nextFrame = 0;
}

static void addCdsExonBases(struct nibTwoCache* genomeSeqs, struct genePred *genePred,
                            int exonCdsStart, int exonCdsEnd, struct cds* cds, int *chromSize)
{
struct dnaSeq *dna = getGeneRegion(genomeSeqs, genePred, exonCdsStart, exonCdsEnd, chromSize);
int iDna = 0;
int iCdsStart = cds->iCds;
while (iDna < dna->size)
    cds->bases[cds->iCds++] = dna->dna[iDna++];
cds->nextFrame = ((cds->nextFrame) + (cds->iCds - iCdsStart)) % 3;
dnaSeqFree(&dna);
}
    
static void addCdsExon(struct nibTwoCache* genomeSeqs, struct genePred *gp,
                       int exonCdsStart, int exonCdsEnd, int frame,
                       struct cds* cds)
/* get CDS from one exon */
{
// adjust for frame shift, dropping partial codon
if (frame != cds->nextFrame)
    {
    removePartialCodon(cds);
    if (gp->strand[0] == '+')
        exonCdsStart += frame;
    else
        exonCdsEnd -= frame;
    }
if (exonCdsStart < exonCdsEnd)
    {
    int chromSize = 0;
    addCdsExonBases(genomeSeqs, gp, exonCdsStart, exonCdsEnd, cds, &chromSize);
    }
}

static char* getCdsCodons(struct genePred *gp, struct nibTwoCache* genomeSeqs)
/* get the CDS sequence, dropping partial codons */
{
struct cds cds;
cds.bases = needMem(genePredCdsSize(gp) + 1);
cds.iCds = 0;
cds.nextFrame = 0;
    
// walk in direction of transcription
int dir = (gp->strand[0] == '+') ? 1 : -1;
int iExon = (dir > 0) ? 0 : gp->exonCount-1;
int iStop = (dir > 0) ? gp->exonCount : -1;
int exonCdsStart, exonCdsEnd;
for (; iExon != iStop; iExon += dir)
    {
    if (genePredCdsExon(gp, iExon, &exonCdsStart, &exonCdsEnd))
        addCdsExon(genomeSeqs, gp, exonCdsStart, exonCdsEnd, gp->exonFrames[iExon], &cds);
    }
if (cds.nextFrame != 0)
    removePartialCodon(&cds);
assert((strlen(cds.bases) % 3) == 0);  // ;-)
return cds.bases;
}

static char translateCodon(boolean isChrM, char* codon, bool lastCodon, unsigned options)
/* translate the first three bases starting at codon, handling weird
 * biology as requested giving */
{
char aa = isChrM ? lookupMitoCodon(codon) : lookupCodon(codon);
if (aa == '\0')
    {
    // stop, contains `N' or selenocysteine
    boolean isStopOrSelno = isStopCodon(codon);
    boolean isRealStop = isReallyStopCodon(codon, !lastCodon); // internal could be selenocysteine
    if (lastCodon)
        {
        if ((options & GENEPRED_TRANSLATE_INCLUDE_STOP) != 0)
            aa = '*';
        else if (!isRealStop)
            aa = 'X';
        // others, \0' will terminate
        }
    else if (((options & GENEPRED_TRANSLATE_SELENO) != 0)
             && isStopOrSelno && !isRealStop)
        aa = 'U';
    else if (isRealStop && ((options & GENEPRED_TRANSLATE_STAR_INFRAME_STOPS) != 0))
        aa = '*';
    else
        aa = 'X';
    }
return aa;
}

static char* translateCds(char* chrom, char* cds, unsigned options)
/* translate the CDS */
{
int cdsLen = strlen(cds);
char *prot = needMem((cdsLen/3)+1);
boolean isChrM = isMito(chrom);
int iCds, iProt;
for (iCds = 0, iProt = 0; iCds < cdsLen; iCds+=3, iProt++)
    prot[iProt] = translateCodon(isChrM, cds+iCds, (iCds == cdsLen-3), options);
return prot;
}

void genePredTranslate(struct genePred *gp, struct nibTwoCache* genomeSeqs, unsigned options,
                       char **protRet, char **cdsRet)
/* Translate a genePred into a protein.  It can also return the CDS part of the
 * mRNA sequence. If the chrom is chrM, the mitochondrial translation tables are
 * used. If protRet or cdsRet is NULL, those sequences are not returned.
 */
{
// note: code tests by genePredToProt
bool haveFrames = (gp->exonFrames != NULL);
if (!haveFrames)
    genePredAddExonFrames(gp);  // assume correct frame if not included

char* cds = getCdsCodons(gp, genomeSeqs);
char *prot = translateCds(gp->chrom, cds, options);

if (protRet != NULL)
    *protRet = prot;
else
    freeMem(prot);
if (cdsRet != NULL)
    *cdsRet = cds;
else
    freeMem(cds);

if (!haveFrames)
    freez(&gp->exonFrames);
}

void genePredToCds(struct genePred *gp, struct genbankCds *cds)
/* Fill in cds with transcript offsets computed from genePred. */
{
/*
 * Warning: Genbank CDS does't have the ability to represent
 * partial codons.  If we have genePreds created from GFF/GTF, they can have
 * partial codons, which is indicated in frame.  This code does not correctly handle
 * this case, or frame shifting indels.
 */
cds->start = cds->end = -1;
cds->startComplete = cds->endComplete = cds->complement = FALSE;
if (gp->cdsEnd > gp->cdsStart)
    {
    int e, off = 0;
    int qCdsStart = -1, qCdsEnd = -1;
    for (e = 0; e < gp->exonCount; ++e)
        {
        int eCdsStart, eCdsEnd;
        if (genePredCdsExon(gp, e, &eCdsStart, &eCdsEnd))
            {
            if (qCdsStart < 0)
                qCdsStart = off + (eCdsStart - gp->exonStarts[e]);
            qCdsEnd = off + (eCdsEnd - gp->exonStarts[e]);
            }
        off += gp->exonEnds[e] - gp->exonStarts[e];
        }
    int qSize = off;
    if (gp->strand[0] == '-')
        reverseIntRange(&qCdsStart, &qCdsEnd, qSize);
    cds->start = qCdsStart;
    cds->end = qCdsEnd;
    cds->startComplete = (gp->cdsStartStat != cdsIncomplete);
    cds->endComplete = (gp->cdsEndStat != cdsIncomplete);
    }
}

struct psl *genePredToPsl(struct genePred *gp, int chromSize, int qSize)
/* Convert a genePred to psl, assuming perfect concordance between target & query.
 * If qSize is 0 then the number of aligned bases will be used as qSize. */
{
int e = 0, aliSize = 0;
for (e = 0; e < gp->exonCount; ++e)
    aliSize += (gp->exonEnds[e] - gp->exonStarts[e]);
struct psl *psl = pslNew(gp->name, qSize ? qSize : aliSize, 0, aliSize,
                         gp->chrom, chromSize, gp->txStart, gp->txEnd,
                         gp->strand, gp->exonCount, 0);
// If qSize is greater than aliSize then we assume the extra bases are at the end of
// the transcript (poly-A tail).  If the alignment is on the '-' strand, then we need
// to offset the reversed qStarts by the number of extra bases.
int sizeAdjust = (gp->strand[0] == '-' && qSize > aliSize) ? (qSize - aliSize) : 0;
int i = -1;
for (e = 0; e < gp->exonCount; ++e)
    {
    if (e == 0 || gp->exonStarts[e] != gp->exonEnds[e-1])
        {
        i++;
        psl->blockSizes[i] = (gp->exonEnds[e] - gp->exonStarts[e]);
        psl->qStarts[i] = i==0 ? 0 + sizeAdjust : psl->qStarts[i-1] + psl->blockSizes[i-1];
        psl->tStarts[i] = gp->exonStarts[e];
        }
    else
        {
        // Merge "exons" that have a 0-length gap between them to avoid pslCheck failure
        psl->blockSizes[i] += (gp->exonEnds[e] - gp->exonStarts[e]);
        }
    }
psl->blockCount = i+1;
psl->match = aliSize;
psl->tNumInsert = psl->blockCount-1;
psl->tBaseInsert = (gp->txEnd - gp->txStart) - aliSize;
return psl;
}
