/* genePred.h was originally generated by the autoSql program, which also 
 * generated genePred.c and genePred.sql.  This header links the database and the RAM 
 * representation of objects. */

/* Copyright (C) 2013 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#ifndef GENEPRED_H
#define GENEPRED_H

#include "dnaseq.h"
#include "basicBed.h"
#include "bigBed.h"
#include "nibTwo.h"

struct gff;
struct gffFile;
struct gffGroup;
struct psl;
struct genbankCds;
struct rbTree;

enum cdsStatus
/* Annotation state of the CDS at one end (start or end) of a gene. */
{
    cdsNone       = 0,  /* "none"   - non-coding, there is no CDS */
    cdsUnknown    = 1,  /* "unk"    - coding, but the CDS is not known */
    cdsIncomplete = 2,  /* "incmpl" - CDS is not complete at this end */
    cdsComplete   = 3   /* "cmpl"   - CDS is complete at this end */
};

enum genePredCreateOpts
/* Option bits for genePredGetCreateSql; combine with bitwise OR. */
{
    genePredBasicSql = 0,       /* no options requested */
    genePredWithBin  = 1 << 0   /* create a bin column */
};


enum genePredFromPslOpts
/* Option bits for genePredFromPsl3; combine with bitwise OR. */
{
    genePredPslDefaults = 0,       /* no options requested */
    genePredPslCdsMod3  = 1 << 0   /* merge a gap in the CDS only if mod 3 */
};


enum genePredFromGxfOpts
/* Option bits for genePredFromGroupedGff and genePredFromGroupedGtf;
 * combine with bitwise OR. */
{
    /* no options requested */
    genePredGxfDefaults            = 0,
    /* the stop codon is implied to lie outside of the annotated CDS bounds */
    genePredGxfImpliedStopAfterCds = 1 << 0,
    /* take name2 from gene_name rather than gene_id */
    genePredGxfGeneNameAsName2     = 1 << 1,
    /* include gene_version and transcript_version with the ids, if available */
    genePredGxfIncludeVersion      = 1 << 2
};

enum genePredFields
/* Bit set naming the optional genePred fields that are in use.
 * N.B. the bit order must follow the field order in genePred. */
{
    genePredNoOptFld      = 0,       /* no optional fields */
    genePredScoreFld      = 1 << 0,  /* score */
    genePredName2Fld      = 1 << 1,  /* name2 */
    genePredCdsStatFld    = 1 << 2,  /* cdsStartStat and cdsEndStat */
    genePredExonFramesFld = 1 << 3,  /* exonFrames */
    genePredAllFlds       = 0xFF     /* every extended field */
};

struct genePredExt
/* A gene prediction, with extended fields.  The members from next through
 * exonFrames are declared in the same order and with the same types as those
 * of struct genePred; type, geneName, geneName2 and geneType are extra.
 * This is the type returned by the genePredFromBigGenePred*() functions. */
{
    struct genePredExt *next;  /* Next in singly linked list. */
    char *name;	/* Name of loci, transcript, mRNA, etc */
    char *chrom;	/* Chromosome name */
    char strand[2];	/* + or - for strand */
    unsigned txStart;	/* Transcription start position */
    unsigned txEnd;	/* Transcription end position */
    unsigned cdsStart;	/* Coding region start */
    unsigned cdsEnd;	/* Coding region end */
    unsigned exonCount;	/* Number of exons */
    unsigned *exonStarts;	/* Exon start positions */
    unsigned *exonEnds;	/* Exon end positions */

    /* optional fields */
    unsigned optFields;           /* which optional fields are used, a bit set
                                   * of genePredFields (not in database) */
    int score;                    /* score */
    char *name2;                  /* Secondary name. (e.g. name of gene), or
                                   * empty if none, NULL if field not
                                   * requested */
    enum cdsStatus cdsStartStat;  /* Status of cdsStart annotation */
    enum cdsStatus cdsEndStat;    /* Status of cdsEnd annotation */
    int *exonFrames;              /* Reading frame of the start of the CDS region
                                   * of the exon, in the direction of transcription
                                   * (0,1,2), or -1 if there is no CDS region.
                                   * NULL if not available */

    /* Members beyond those of struct genePred.
     * NOTE(review): these are not documented in this header; presumably they
     * carry the additional bigGenePred columns -- confirm against genePred.c. */
    char *type;
    char *geneName;  
    char *geneName2;
    char *geneType;
};

struct genePred
/* A gene prediction, with optional fields.  The fields name through exonEnds
 * are always present; which of the optional fields are in use is recorded
 * in optFields. */
{
    struct genePred *next;  /* Next in singly linked list. */
    char *name;	/* Name of loci, transcript, mRNA, etc */
    char *chrom;	/* Chromosome name */
    char strand[2];	/* + or - for strand */
    unsigned txStart;	/* Transcription start position */
    unsigned txEnd;	/* Transcription end position */
    unsigned cdsStart;	/* Coding region start */
    unsigned cdsEnd;	/* Coding region end */
    unsigned exonCount;	/* Number of exons */
    unsigned *exonStarts;	/* Exon start positions */
    unsigned *exonEnds;	/* Exon end positions */

    /* optional fields */
    unsigned optFields;           /* which optional fields are used, a bit set
                                   * of genePredFields (not in database) */
    int score;                    /* score */
    char *name2;                  /* Secondary name. (e.g. name of gene), or
                                   * empty if none, NULL if field not
                                   * requested */
    enum cdsStatus cdsStartStat;  /* Status of cdsStart annotation */
    enum cdsStatus cdsEndStat;    /* Status of cdsEnd annotation */
    int *exonFrames;              /* List of frame for each exon, or -1
                                   * if no frame or not known. NULL if not
                                   * available. */
};

/* Standard value to use for insertMergeSize when creating genePred.
 * Set to 8 due to microdeletions.
 */
#define genePredStdInsertMergeSize 8

/* Column counts for genePred rows.  The basic count matches the fields name
 * through exonEnds of struct genePred; the extended count additionally
 * covers score, name2, cdsStartStat, cdsEndStat and exonFrames. */
#define GENEPRED_NUM_COLS 10  /* number of columns in a genePred */
#define GENEPREDX_NUM_COLS 15  /* max number of columns in extended genePred */

struct genePred *genePredLoad(char **row);
/* Load a genePred from row fetched with select * from genePred
 * from database.  Dispose of this with genePredFree().
 * NOTE: cannibalizes the row argument */

struct genePred *genePredLoadAll(char *fileName);
/* Load all genePred from whitespace-separated file.
 * Dispose of this with genePredFreeList(). */

struct genePred *genePredLoadAllByChar(char *fileName, char chopper);
/* Load all genePred from chopper separated file.
 * Dispose of this with genePredFreeList(). */

#define genePredLoadAllByTab(a) genePredLoadAllByChar((a), '\t')
/* Load all genePred from tab separated file.
 * Dispose of this with genePredFreeList().
 * The expansion deliberately carries no trailing semicolon, so the macro can
 * be used anywhere a function call can: inside an expression, as an
 * argument, or as the sole statement of an if/else branch. */

struct genePred *genePredCommaIn(char **pS, struct genePred *ret);
/* Create a genePred out of a comma separated string. 
 * This will fill in ret if non-null, otherwise will
 * return a new genePred */

void genePredFree(struct genePred **pEl);
/* Free a single dynamically allocated genePred such as created
 * with genePredLoad(). */

void genePredFreeList(struct genePred **pList);
/* Free a list of dynamically allocated genePred's */

void genePredOutput(struct genePred *el, FILE *f, char sep, char lastSep);
/* Print out genePred.  Separate fields with sep. Follow last field with lastSep. */

#define genePredTabOut(el, f) genePredOutput((el), (f), '\t', '\n')
/* Write genePred el to f as one line of a tab-separated file. */

#define genePredCommaOut(el, f) genePredOutput((el), (f), ',', ',')
/* Write genePred el to f as a comma separated list, final comma included. */

/* ---------  Start of hand generated code. ---------------------------- */

struct genePred *genePredKnownLoad(char **row, int numCols);
/* Load a genePred in knownGene format from a row with numCols columns.
 * NOTE(review): presumably disposed of with genePredFree(), like the other
 * row loaders -- confirm. */

struct genePred *genePredExtLoad(char **row, int numCols);
/* Load a genePred from a row, with optional fields.  The row must
 * contain columns in the order in the struct, and they must be present up to
 * the last specified optional field.  Missing intermediate fields must have
 * zero or empty columns; they may not be omitted.  Fields at the end can be
 * omitted. Dispose of this with genePredFree(). */

struct genePred *genePredKnownLoadAll(char *fileName);
/* Load all genePreds from tab-separated file in knownGene format */

struct genePred *genePredExtLoadAll(char *fileName);
/* Load all genePreds from tab-separated file, possibly with optional
 * fields. Dispose of this with genePredFreeList(). */

char *genePredCdsStatStr(enum cdsStatus stat);
/* get string value of a cdsStatus */

enum cdsStatus parseCdsStat(char *statStr);
/* parse a cdsStatus string */

void genePredAddGenbankCds(struct psl *psl, struct genbankCds* cds, 
	struct genePred *gene);
/* Convert cdsStart/End from mrna to genomic coordinates. 
 * Note that the genePred blocks need not be filled in before
 * this call. */

int genePredCmp(const void *va, const void *vb);
/* Compare to sort based on chromosome, txStart. */

int genePredNameCmp(const void *va, const void *vb);
/* Compare to sort based on name, then chromosome, txStart. */

struct genePred *genePredFromGroupedGff(struct gffFile *gff, struct gffGroup *group, 
                                        char *name, char *exonSelectWord, unsigned optFields,
                                        unsigned options);
/* Convert gff->groupList to genePred list.  Only put lines where feature type matches
 * exonSelectWord into the gene.  (If exonSelectWord is NULL, all go in.)
 * optFields contains the bit set of optional fields to add to the genePred.
 * If genePredCdsStatFld is set, then the CDS status information is
 * set based on the presence of start_codon, stop_codon, and CDS features.
 * If genePredExonFramesFld is set, then frame is set as specified in the GTF.
 * Options are from genePredFromGxfOpts.  If genePredGxfImpliedStopAfterCds
 * is specified, it is treated as if a stop_codon annotation was found,
 * if there isn't one.  If genePredGxfGeneNameAsName2 is specified, use
 * gene_name for the name2 field, otherwise gene_id.
 */

struct genePred *genePredFromGroupedGtf(struct gffFile *gff, struct gffGroup *group, char *name,
                                        unsigned optFields, unsigned options);
/* Convert gff->groupList to genePred list, using GTF feature conventions;
 * including the stop codon in the 3' UTR, not the CDS (grr).  Assumes
 * gffGroup is sorted in ascending coords, with overlapping starts sorted by
 * end coords, which is true if it was created by gffGroupLines().
 * optFields contains the bit set of optional fields to add to the genePred.
 * If genePredName2Fld is specified, then the gene_id is used for the name2
 * field.  If genePredCdsStatFld is set, then the CDS status information is
 * set based on the presence of start_codon, stop_codon, and CDS features.
 * If genePredExonFramesFld is set, then frame is set as specified in the GTF.
 * Options are from genePredFromGxfOpts.  If genePredGxfImpliedStopAfterCds
 * is specified, it is treated as if a stop_codon annotation was found,
 * if there isn't one.
 */

struct genePred *genePredFromPsl3(struct psl *psl,  struct genbankCds* cds, 
                                  unsigned optFields, unsigned options,
                                  int cdsMergeSize, int utrMergeSize);
/* Convert a PSL of an mRNA alignment to a genePred, converting a genbank CDS
 * specification string to genomic coordinates. Small genomic inserts are
 * merged based on the mergeSize parameters.  Gaps no larger than the
 * specified merge sizes result in the adjacent blocks being merged into a
 * single exon.  Gaps in CDS use cdsMergeSize, in UTR use utrMergeSize.  If
 * the genePredPslCdsMod3 option is specified, then CDS gaps are only merged
 * if a multiple of three.  A negative merge size disables merging of blocks.
 * This differs from specifying zero in that adjacent blocks will not be
 * merged. The optFields argument is a set from genePredFields, indicating
 * which fields to create.  Zero-length CDS, or null cds, creates without CDS
 * annotation.  If cds is null, it will set status fields to cdsNone.  */

struct genePred *genePredFromPsl2(struct psl *psl, unsigned optFields,
                                  struct genbankCds* cds, int insertMergeSize);
/* Compatibility function, genePredFromPsl3 is preferred.  See that function's
 * documentation for details. This calls genePredFromPsl3 with no options
 * and insertMergeSize set for CDS and UTR.
 */

struct genePred *genePredFromPsl(struct psl *psl, int cdsStart, int cdsEnd,
                                 int insertMergeSize);
/* Compatibility function, genePredFromPsl3 is preferred.  See that function's
 * documentation for details. This calls genePredFromPsl3 with no options.
 */

char* genePredGetCreateSql(char* table, unsigned optFields, unsigned options,
                           int chromIndexLen);
/* Get SQL required to create a genePred table. optFields is a bit set
 * consisting of the genePredFields values. Options are a bit set of
 * genePredCreateOpts. Returned string should be freed.  This will create all
 * optional fields that precede the highest optFields column.  chromIndexLen
 * is now ignored. */

struct genePred *getOverlappingGene(char *db, struct genePred **list,  char *table, char *chrom, int cStart, int cEnd, char *name, int *retOverlap);
/* Read all genes from a table and find the gene with the biggest overlap.
 * The list of genes is cached so that it is only read once
 * (NOTE(review): presumably cached in *list -- confirm).
 * If there are multiple hits and one has a name that matches exactly,
 * this overrides the biggest overlap */

int genePredBases(struct genePred *gp);
/* count coding and utr bases in a gene prediction */

int genePredCodingBases(struct genePred *gp);
/* Count up the number of coding bases in gene prediction. */

INLINE int genePredCdsSize(struct genePred *gp)
/* Return the number of coding bases in the gene prediction.  This is a
 * redundant alias that just forwards to genePredCodingBases(). */
{
int codingBaseCount = genePredCodingBases(gp);
return codingBaseCount;
}

boolean genePredCdsExon(struct genePred *gp, int iExon, int *startPtr, int *endPtr);
/* Get the CDS range in an exon.  If there is no CDS, return FALSE and then
 * set start == end */

int genePredCheck(char *desc, FILE* errFh, int chromSize, 
                  struct genePred* gp);
/* Validate a genePred for consistency.  desc is printed in the error messages
 * written to file errFh (open /dev/null to discard).  chromSize should contain
 * size of chromosome, or 0 if chrom is not valid, or -1 to not check
 * chromosome bounds. Returns count of errors. */

int genePredCheckDb(char *desc, FILE* errFh, char* db, struct genePred* gp);
/* Validate a genePred for consistency.  desc is printed in the error messages
 * written to file errFh (open /dev/null to discard).  Lookup chromosome size
 * in database if db is not NULL. Returns count of errors. */

int genePredCheckChromSizes(char *desc, FILE* errFh, struct genePred* gp,
                            struct hash* chromSizes);
/* Validate a genePred for consistency.  desc is printed in the error messages
 * written to file errFh (open /dev/null to discard).  Lookup chromosome size
 * in hash.
 */

boolean genePredNmdTarget(struct genePred *gp);
/* Return TRUE if cds end is more than 50bp upstream of
   last intron. */

void genePredAddExonFrames(struct genePred *gp);
/* Add exonFrames array to a genePred that doesn't have it. Frame is assumed
 * to be contiguous.  NOTE: suggest using genePredFixExonFrames for new code. */

void genePredFixExonFrames(struct genePred *gp);
/* Add exonFrames array to a genePred that has frame on only some or no
 * features. Frame is assumed to be contiguous when an existing frame is not
 * present. */

void genePredRc(struct genePred *gp, int chromSize);
/* Reverse complement a genePred (project it to the opposite strand).  Useful
 * when doing analysis that is simplified by having things on the same strand.
 */

struct genePred *genePredNew(char *name, char *chrom, char strand,
                             unsigned txStart, unsigned txEnd,
                             unsigned cdsStart, unsigned cdsEnd,
                             unsigned optFields, unsigned exonSpace);
/* Create a new gene with space for the specified number of exons allocated.
 * genePredGrow may be used to expand this space if needed. */

void genePredGrow(struct genePred *gp, unsigned *exonSpacePtr);
/* Increase memory allocated to a genePred to hold more exons.  exonSpacePtr
 * should point to the current maximum number of exons and will be
 * updated with the new amount of space. */

struct rbTree *genePredToRangeTree(struct genePred *gp, boolean cdsOnly);
/* Convert genePred into a range tree. */

void gpPartOutAsBed(struct genePred *gp, int start, int end, FILE *f, 
	char *type, int id, int minSize);
/* Write out part of gp as bed12. */

boolean codonToPos(struct genePred *gp, unsigned num, int *chromStart, int *chromEnd);
// Map 1-based codon to genomic coordinates. If the codon crosses an exon junction, we return just the beginning (LHS) of the codon.
// Returns true if we find the codon in given gene prediction; chromStart and chromEnd are set to appropriate three base region.

boolean exonToPos(struct genePred *gp, unsigned num, int *chromStart, int *chromEnd);
// Map 1-based exon number to genomic coordinates.
// Returns true if we find the exon in given gene prediction; chromStart and chromEnd are set to appropriate region.

struct asObject *genePredAsObj(void);
// Return asObject describing fields of genePred.
// Takes no arguments; the explicit (void) makes this a true prototype, so the
// compiler rejects calls that pass arguments.

struct dnaSeq *genePredGetDna(char *database, struct genePred *gp,
                              boolean coding, enum dnaCase dnaCase);
// Returns the DNA sequence associated with gene prediction.
// Negative strand genes will return the sequence as read from the negative strand.
// Optionally restrict to coding sequence only

int genePredBaseToCodingPos(struct genePred *gp, int basePos,
                            boolean stranded, boolean *isCoding);
// Given a genePred model and a single (0 based) base position, predict the 0-based
// DNA (stranded) coding sequence pos.  Dividing this number by 3 should give the AA position!
// Returns -1 when outside of coding exons unless OPTIONAL isCoding pointer to boolean is
// provided. In that case, returns last valid position and sets isCoding to FALSE.

struct genePredExt  *genePredFromBigGenePred( char *chrom, struct bigBedInterval *bb);
/* build a genePred from a bigGenePred interval */

struct genePredExt  *genePredFromBigGenePredRow(char **row);
/* build a genePred from a bigGenePred row */

/* options to genePredTranslate */
#define GENEPRED_TRANSLATE_SELENO              0x01   /* Assume internal TGA code for selenocysteine and translate to `U' */
#define GENEPRED_TRANSLATE_INCLUDE_STOP        0x02   /* If the CDS ends with a stop codon, represent it as a `*' */
#define GENEPRED_TRANSLATE_STAR_INFRAME_STOPS  0x04   /* Use `*' instead of `X' for in-frame stop codons.
                                                       * This will result in selenocysteine's being `*', with only codons
                                                       * containing `N' being translated to `X'.  This doesn't include terminal
                                                       * stop */

void genePredTranslate(struct genePred *gp, struct nibTwoCache* genomeSeqs, unsigned options,
                       char **protRet, char **cdsRet);
/* Translate a genePred into a protein.  It can also return the CDS part of the
 * mRNA sequence. If the chrom is chrM, the mitochondrial translation tables are
 * used. If protRet or cdsRet is NULL, those sequences are not returned.
 */

void genePredToCds(struct genePred *gp, struct genbankCds *cds);
/* Fill in cds with transcript offsets computed from genePred. */

struct psl *genePredToPsl(struct genePred *gp, int chromSize, int qSize);
/* Convert a genePred to psl, assuming perfect concordance between target & query.
 * If qSize is 0 then the number of aligned bases will be used as qSize. */

struct genePredExt  *genePredFromBedBigGenePred( char *chrom, struct bed *bed, struct bigBedInterval *bb);
/* build a genePred from a bigGenePred and a bed file */
#endif /* GENEPRED_H */
