/* genePred.c was originally generated by the autoSql program, which also * generated genePred.h and genePred.sql. This module links the database and the RAM * representation of objects. */ /* Copyright (C) 2014 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "gff.h" #include "jksql.h" #include "psl.h" #include "linefile.h" #include "genePred.h" #include "genbank.h" #include "rangeTree.h" #include "hdb.h" #include "chromInfo.h" struct genePred *genePredLoad(char **row) /* Load a genePred from row fetched with select * from genePred * from database. Dispose of this with genePredFree(). * NOTE: cannabalizes the row argument */ { struct genePred *ret; int sizeOne; AllocVar(ret); ret->exonCount = sqlUnsigned(row[7]); ret->name = cloneString(row[0]); ret->chrom = cloneString(row[1]); strcpy(ret->strand, row[2]); ret->txStart = sqlUnsigned(row[3]); ret->txEnd = sqlUnsigned(row[4]); ret->cdsStart = sqlUnsigned(row[5]); ret->cdsEnd = sqlUnsigned(row[6]); sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne); if (sizeOne != ret->exonCount) errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)", ret->name, sizeOne, ret->exonCount); sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne); if (sizeOne != ret->exonCount) errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)", ret->name, sizeOne, ret->exonCount); return ret; } struct genePred *genePredLoadAll(char *fileName) /* Load all genePred from a whitespace-separated file. * Dispose of this with genePredFreeList(). */ { struct genePred *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[10]; while (lineFileRow(lf, row)) { el = genePredLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct genePred *genePredLoadAllByChar(char *fileName, char chopper) /* Load all genePred from a chopper separated file. * Dispose of this with genePredFreeList(). */ { struct genePred *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[10]; while (lineFileNextCharRow(lf, chopper, row, ArraySize(row))) { el = genePredLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct genePred *genePredCommaIn(char **pS, struct genePred *ret) /* Create a genePred out of a comma separated string. * This will fill in ret if non-null, otherwise will * return a new genePred */ { char *s = *pS; int i; if (ret == NULL) AllocVar(ret); ret->name = sqlStringComma(&s); ret->chrom = sqlStringComma(&s); sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand)); ret->txStart = sqlUnsignedComma(&s); ret->txEnd = sqlUnsignedComma(&s); ret->cdsStart = sqlUnsignedComma(&s); ret->cdsEnd = sqlUnsignedComma(&s); ret->exonCount = sqlUnsignedComma(&s); s = sqlEatChar(s, '{'); AllocArray(ret->exonStarts, ret->exonCount); for (i=0; iexonCount; ++i) { ret->exonStarts[i] = sqlUnsignedComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); s = sqlEatChar(s, '{'); AllocArray(ret->exonEnds, ret->exonCount); for (i=0; iexonCount; ++i) { ret->exonEnds[i] = sqlUnsignedComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); *pS = s; return ret; } void genePredFree(struct genePred **pEl) /* Free a single dynamically allocated genePred such as created * with genePredLoad(). */ { struct genePred *el; if ((el = *pEl) == NULL) return; freeMem(el->name); freeMem(el->chrom); freeMem(el->exonStarts); freeMem(el->exonEnds); freeMem(el->name2); freeMem(el->exonFrames); freez(pEl); } void genePredFreeList(struct genePred **pList) /* Free a list of dynamically allocated genePred's */ { struct genePred *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; genePredFree(&el); } *pList = NULL; } void genePredOutput(struct genePred *el, FILE *f, char sep, char lastSep) /* Print out genePred. Separate fields with sep. Follow last field with lastSep. */ { int i; if (sep == ',') fputc('"',f); fprintf(f, "%s", el->name); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->chrom); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->strand); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%u", el->txStart); fputc(sep,f); fprintf(f, "%u", el->txEnd); fputc(sep,f); fprintf(f, "%u", el->cdsStart); fputc(sep,f); fprintf(f, "%u", el->cdsEnd); fputc(sep,f); fprintf(f, "%u", el->exonCount); fputc(sep,f); if (sep == ',') fputc('{',f); for (i=0; iexonCount; ++i) { fprintf(f, "%u", el->exonStarts[i]); fputc(',', f); } if (sep == ',') fputc('}',f); fputc(sep,f); if (sep == ',') fputc('{',f); for (i=0; iexonCount; ++i) { fprintf(f, "%u", el->exonEnds[i]); fputc(',', f); } if (sep == ',') fputc('}',f); /* optional fields, >= test is used so unspecified coumns can be filled in */ if (el->optFields >= genePredScoreFld) { fputc(sep,f); fprintf(f, "%d", el->score); } if (el->optFields >= genePredName2Fld) { fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", ((el->name2 != NULL) ? el->name2 : "")); if (sep == ',') fputc('"',f); } if (el->optFields >= genePredCdsStatFld) { fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", genePredCdsStatStr(el->cdsStartStat)); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", genePredCdsStatStr(el->cdsEndStat)); if (sep == ',') fputc('"',f); } if (el->optFields >= genePredExonFramesFld) { fputc(sep,f); if (sep == ',') fputc('{',f); for (i=0; iexonCount; ++i) { fprintf(f, "%d", el->exonFrames[i]); fputc(',', f); } if (sep == ',') fputc('}',f); } fputc(lastSep,f); } /* --------- Start of hand generated code. ---------------------------- */ char *genePredCdsStatStr(enum cdsStatus stat) /* get string value of a cdsStatus */ { switch (stat) { case cdsNone: return "none"; case cdsUnknown: return "unk"; case cdsIncomplete: return "incmpl"; case cdsComplete: return "cmpl"; default: errAbort("invalid cdsStatus enum value: %d", stat); return NULL; /* make compiler happy */ } } enum cdsStatus parseCdsStat(char *statStr) /* parse a cdsStatus string */ { if ((statStr == NULL) || sameString(statStr, "none")) return cdsNone; if (sameString(statStr, "unk")) return cdsUnknown; if (sameString(statStr, "incmpl")) return cdsIncomplete; if (sameString(statStr, "cmpl")) return cdsComplete; errAbort("invalid genePred cdsStatus: \"%s\"", statStr); return cdsNone; /* make compiler happy */ } struct genePred *genePredKnownLoad(char **row, int numCols) /* Load a genePred in knownGene format from row. */ { struct genePred *ret; int sizeOne; AllocVar(ret); ret->exonCount = sqlUnsigned(row[7]); ret->name = cloneString(row[0]); ret->chrom = cloneString(row[1]); strcpy(ret->strand, row[2]); ret->txStart = sqlUnsigned(row[3]); ret->txEnd = sqlUnsigned(row[4]); ret->cdsStart = sqlUnsigned(row[5]); ret->cdsEnd = sqlUnsigned(row[6]); sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne); if (sizeOne != ret->exonCount) errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)", ret->name, sizeOne, ret->exonCount); sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne); if (sizeOne != ret->exonCount) errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)", ret->name, sizeOne, ret->exonCount); ret->name2 = cloneString(row[11]); return ret; } struct genePred *genePredExtLoad(char **row, int numCols) /* Load a genePred with from a row, with optional fields. The row must * contain columns in the order in the struct, and they must be present up to * the last specfied optional field. Missing intermediate fields must have * zero or empty columns, they may not be omitted. Fields at the end can be * omitted. Dispose of this with genePredFree(). */ { struct genePred *ret; int sizeOne, iCol; AllocVar(ret); ret->exonCount = sqlUnsigned(row[7]); ret->name = cloneString(row[0]); ret->chrom = cloneString(row[1]); strcpy(ret->strand, row[2]); ret->txStart = sqlUnsigned(row[3]); ret->txEnd = sqlUnsigned(row[4]); ret->cdsStart = sqlUnsigned(row[5]); ret->cdsEnd = sqlUnsigned(row[6]); sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne); if (sizeOne != ret->exonCount) errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)", ret->name, sizeOne, ret->exonCount); sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne); if (sizeOne != ret->exonCount) errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)", ret->name, sizeOne, ret->exonCount); iCol=GENEPRED_NUM_COLS; if (iCol < numCols) { ret->score = sqlSigned(row[iCol++]); ret->optFields |= genePredScoreFld; } if (iCol < numCols) { ret->name2 = cloneString(row[iCol++]); ret->optFields |= genePredName2Fld; } if (iCol < numCols) { if (isEmpty(row[iCol])) // if the cdsStartStat field is empty return ret; // ignore the rest of the fields ret->cdsStartStat = parseCdsStat(row[iCol++]); ret->optFields |= genePredCdsStatFld; } if (iCol < numCols) { ret->cdsEndStat = parseCdsStat(row[iCol++]); ret->optFields |= genePredCdsStatFld; } if (iCol < numCols) { sqlSignedDynamicArray(row[iCol++], &ret->exonFrames, &sizeOne); if (sizeOne != ret->exonCount) errAbort("genePred: %s number of exonFrames (%d) != number of exons (%d)", ret->name, sizeOne, ret->exonCount); ret->optFields |= genePredExonFramesFld; } return ret; } struct genePred *genePredKnownLoadAll(char *fileName) /* Load all genePreds with from tab-separated file in knownGene format */ { struct genePred *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[GENEPREDX_NUM_COLS]; int numCols; while ((numCols = lineFileChopNextTab(lf, row, ArraySize(row))) > 0) { lineFileExpectAtLeast(lf, GENEPRED_NUM_COLS, numCols); el = genePredKnownLoad(row, numCols); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct genePred *genePredExtLoadAll(char *fileName) /* Load all genePreds with from tab-separated file, possibly with optional * fields. Dispose of this with genePredFreeList(). */ { struct genePred *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[GENEPREDX_NUM_COLS]; int numCols; while ((numCols = lineFileChopNextTab(lf, row, ArraySize(row))) > 0) { lineFileExpectAtLeast(lf, GENEPRED_NUM_COLS, numCols); el = genePredExtLoad(row, numCols); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } int genePredCmp(const void *va, const void *vb) /* Compare to sort based on chromosome, txStart. */ { const struct genePred *a = *((struct genePred **)va); const struct genePred *b = *((struct genePred **)vb); int dif = strcmp(a->chrom, b->chrom); if (dif == 0) dif = a->txStart - b->txStart; return dif; } int genePredNameCmp(const void *va, const void *vb) /* Compare to sort based on name, then chromosome, txStart. */ { const struct genePred *a = *((struct genePred **)va); const struct genePred *b = *((struct genePred **)vb); int dif = strcmp(a->name, b->name); if (dif == 0) dif = strcmp(a->chrom, b->chrom); if (dif == 0) dif = a->txStart - b->txStart; return dif; } static boolean haveStartStopCodons(struct gffFile *gff) /* For GFFs, determine if any of the annotations use start_codon or * stop_codon */ { struct gffFeature *feature; for(feature = gff->featureList; feature != NULL; feature = feature->next) { if(sameWord(feature->name, "start_codon") || sameWord(feature->name, "start_codon")) return TRUE; } return FALSE; } static boolean isExon(char *feat, boolean isGtf, char *exonSelectWord) /* determine if a feature is an exon; different criteria for GFF ane GTF */ { if (isGtf) { /* must check for stop_codon here, because GTF put it outside of the * CDS */ return (sameWord(feat, "CDS") || sameWord(feat, "stop_codon") || sameWord(feat, "start_codon") || sameWord(feat, "exon") || sameWord(feat, "utr") || sameWord(feat, "5utr") || sameWord(feat, "3utr")); } else { return ((exonSelectWord == NULL) || sameWord(feat, exonSelectWord)); } } static boolean isStartStopCodon(char *feat) /* determine if a feature is GTF style start/stop codon */ { return sameWord(feat, "stop_codon") || sameWord(feat, "start_codon"); } static boolean isCds(char *feat) /* determine if a feature is CDS */ { return sameWord(feat, "CDS") || isStartStopCodon(feat); } static void chkGroupLine(struct gffGroup *group, struct gffLine *gl, struct genePred *gp) /* check that a gffLine is consistent with the genePred being built. this * helps detect some problems that lead to corrupt genePreds */ { if (!(sameString(gl->seq, gp->chrom) && (gl->strand == gp->strand[0]))) { fprintf(stderr, "invalid gffGroup detected on line: "); gffTabOut(gl, stderr); errAbort("GFF/GTF group %s on %s%c, this line is on %s%c, all group members must be on same seq and strand", group->name, gp->chrom, gp->strand[0], gl->seq, gl->strand); } } static int phaseToFrame(char phase) /* convert GTF/GFF phase to frame */ { switch (phase) { case '0': return 0; case '1': return 2; case '2': return 1; } return -1; } #ifdef NOT_CURRENTLY_USED static char frameToPhase(int frame) /* convert GTF/GFF frame to phase */ { switch (frame) { case 0: return '0'; case 1: return '2'; case 2: return '1'; } return -1; } #endif static int incrFrame(int frame, int amt) /* increment frame, no-op if frame is -1 */ { if (frame < 0) return frame; else if (amt >= 0) return (frame+amt) % 3; else { int amt3 = (-amt) % 3; return (frame - (amt - amt3)) % 3; } } static int findPosExon(struct genePred *gp, int pos) /* find exon containing the specified position */ { int i; for (i = 0; i < gp->exonCount; i++) { if ((gp->exonStarts[i] <= pos) && (pos < gp->exonEnds[i])) return i; } errAbort("bug: position %d not found in exon of %s genePred", pos, gp->name); return 0; } static boolean adjImpliedStopPos(struct genePred *gp) /* implied stop codon adjustment for positive strand gene */ { int iExon = findPosExon(gp, gp->cdsEnd-1); unsigned space = (gp->exonEnds[iExon]-gp->cdsEnd); if (space >= 3) { // room in current exon gp->cdsEnd += 3; return TRUE; } else if ((iExon < gp->exonCount) && ((gp->exonEnds[iExon+1]-gp->exonStarts[iExon+1])+space) >= 3) { // room including next exon gp->cdsEnd = gp->exonStarts[iExon+1] + (3-space); return TRUE; } return FALSE; } static boolean adjImpliedStopNeg(struct genePred *gp) /* implied stop codon adjustment for negative strand gene */ { int iExon = findPosExon(gp, gp->cdsStart); unsigned space = (gp->cdsStart-gp->exonStarts[iExon]); if (space >= 3) { // room in current exon gp->cdsStart -= 3; return TRUE; } else if ((iExon > 0) && ((gp->exonEnds[iExon-1]-gp->exonStarts[iExon-1])+space) >= 3) { // room including previous exon gp->cdsStart = gp->exonEnds[iExon-1] - (3-space); return TRUE; } return FALSE; } static void adjImpliedStopAfterCds(struct genePred *gp) /* adjust CDS bounds to include implied stop codon outside of CDS. */ { if (gp->strand[0] == '+') { if (adjImpliedStopPos(gp)) { if (gp->optFields & genePredCdsStatFld) gp->cdsEndStat = cdsComplete; } } else { if (adjImpliedStopNeg(gp)) { if (gp->optFields & genePredCdsStatFld) gp->cdsStartStat = cdsComplete; } } } static boolean isGtfFeature(char *feat) /* check if a feature is one of the recognized features. All otherse * should be ignored. http://mblab.wustl.edu/GTF22.html */ { static char *gtfFeats[] = {"CDS", "start_codon", "stop_codon", "5UTR", "3UTR", "inter", "inter_CNS", "intron_CNS", "exon", NULL}; int i; for (i = 0; gtfFeats[i] != NULL; i++) { if (sameString(feat, gtfFeats[i])) return TRUE; } return FALSE; } static boolean ignoreGxfLine(struct gffLine *gl, boolean isGtf) /* should the specified line be skipped? */ { if (isGtf) return !isGtfFeature(gl->feature); else return FALSE; } static boolean allowedFrameFeature(struct gffLine *gl, boolean isGtf) /* should this feature be used to assign frame? */ { if (isStartStopCodon(gl->feature) || ignoreGxfLine(gl, isGtf)) return FALSE; // for GFF, any features with frame can be used to set frame, // with GTF, there are some bogus files that set frame on UTR features, // so only use CDS if (isGtf) return sameString(gl->feature, "CDS"); else return (gl->frame != '.'); } static struct gffLine *assignFrameForCdsExon(int start, int end, int *framePtr, struct gffLine *gl, boolean isGtf) /* set frame from any overlapping records with frame. Don't include * start/stop_codon, as it isn't a full exon. */ { /* skip lines preceeding this exon */ while (gl != NULL) { if (allowedFrameFeature(gl, isGtf) && (gl->end >= start)) break; gl = gl->next; } /* set frame from any overlapping records with frame. */ while ((gl != NULL) && (rangeIntersection(gl->start, gl->end, start, end) > 0)) { if (allowedFrameFeature(gl, isGtf)) { int frame = phaseToFrame(gl->frame); if (frame >= 0) *framePtr = frame; } gl = gl->next; } return gl; } static void assignFrame(boolean isGtf, struct gffGroup *group, struct genePred *gp, unsigned options) /* Assign frame from GFF/GTF after genePred has been built.*/ { struct gffLine *gl = group->lineList; int start, end, i; int numCdsExons = 0; int numFrameExons = 0; for (i = 0; i < gp->exonCount; i++) { if (genePredCdsExon(gp, i, &start, &end)) { numCdsExons += 1; gl = assignFrameForCdsExon(start, end, &(gp->exonFrames[i]), gl, isGtf); if (gp->exonFrames[i] >= 0) numFrameExons += 1; } } /* Assign frames for exons without frames, either due to it incorrectly not * being on cases. This also Handle the nasty corner caseof GTF with the all * or part of the stop codon as the only codon in an exon, we must set the * frame on these exons. Some ENSEMBL GTFs have single base `exons' (due to * low quality assemblies) with just a base of the stop codon, so must check * every base. While less than ideal, we just extend the frame past the last * exon with frame, so a conflicting frame on the stop codon will be missed. * Just switching to GFF3 fixes this mess. Return the count of exons that * actually had frame set. */ genePredFixExonFrames(gp); } static struct genePred *mkFromGroupedGxf(struct gffFile *gff, struct gffGroup *group, char *name, boolean isGtf, char *exonSelectWord, unsigned optFields, unsigned options) /* common function to create genePreds from GFFs or GTFs. This is a little * ugly with to many check of isGtf, however the was way to much identical * code the other way. Options are from genePredFromGxfOpts */ { struct genePred *gp; long stopCodonStart = -1, stopCodonEnd = -1; long cdsStart = -1, cdsEnd = -1; int exonCount = 0; boolean haveStartCodon = FALSE, haveStopCodon = FALSE; struct gffLine *gl; unsigned *eStarts, *eEnds; int i; /* should we count on start/stop codon annotation in GFFs? */ boolean useStartStops = isGtf || haveStartStopCodons(gff); int geneStart = 0, geneEnd = 0; /* Count up exons and figure out cdsStart and cdsEnd. */ for (gl = group->lineList; gl != NULL; gl = gl->next) { boolean exonishLine = FALSE; if (ignoreGxfLine(gl, isGtf)) continue; if (isExon(gl->feature, isGtf, exonSelectWord)) { exonishLine = TRUE; ++exonCount; } if (isCds(gl->feature)) { exonishLine = TRUE; if ((cdsStart < 0) || (gl->start < cdsStart)) cdsStart = gl->start; if ((cdsEnd < 0) || (gl->end > cdsEnd)) cdsEnd = gl->end; } if (exonishLine) { if (geneStart == geneEnd) // Not initialized yet { geneStart = gl->start; geneEnd = gl->end; } else { geneStart = min(gl->start, geneStart); geneEnd = max(gl->end, geneEnd); } } if (sameWord(gl->feature, "start_codon")) haveStartCodon = TRUE; if (sameWord(gl->feature, "stop_codon")) { /* stop_codon can be split, need bounds for adjusting CDS below */ if ((stopCodonStart < 0) || (gl->start < stopCodonStart)) stopCodonStart = gl->start; if ((stopCodonEnd < 0) || (gl->end > stopCodonEnd)) stopCodonEnd = gl->end; haveStopCodon = TRUE; } } if (exonCount == 0) return NULL; if (cdsStart > cdsEnd) { /* no cds annotated */ cdsStart = 0; cdsEnd = 0; } else if (stopCodonStart >= 0) { /* adjust CDS to include stop codon as in GTF */ if (group->strand == '+') { if (stopCodonEnd > cdsEnd) cdsEnd = stopCodonEnd; } else { if (stopCodonStart < cdsStart) cdsStart = stopCodonStart; } } /* add in version numbers if requested and available */ char geneIdToUse[1024], transcriptIdToUse[1024]; geneIdToUse[0]= '\0'; if (options & genePredGxfGeneNameAsName2) { if (group->lineList->geneName != NULL) safecpy(geneIdToUse, sizeof(geneIdToUse), group->lineList->geneName); } else if (group->lineList->geneId != NULL) { if ((options & genePredGxfIncludeVersion) && (group->lineList->geneVersion != NULL)) safef(geneIdToUse, sizeof(geneIdToUse), "%s.%s", group->lineList->geneId, group->lineList->geneVersion); else safecpy(geneIdToUse, sizeof(geneIdToUse), group->lineList->geneId); } if ((options & genePredGxfIncludeVersion) && (group->lineList->transcriptVersion != NULL)) safef(transcriptIdToUse, sizeof(transcriptIdToUse), "%s.%s", name, group->lineList->transcriptVersion); else safecpy(transcriptIdToUse, sizeof(transcriptIdToUse), name); /* Allocate genePred and fill in values. */ AllocVar(gp); gp->name = cloneString(transcriptIdToUse); gp->chrom = cloneString(group->seq); gp->strand[0] = group->strand; gp->txStart = geneStart; gp->txEnd = geneEnd; if (cdsStart < cdsEnd) { gp->cdsStart = cdsStart; gp->cdsEnd = cdsEnd; } else { // no CDS, set to txEnd gp->cdsStart = gp->txEnd; gp->cdsEnd = gp->txEnd; } gp->exonStarts = AllocArray(eStarts, exonCount); gp->exonEnds = AllocArray(eEnds, exonCount); gp->optFields = optFields; if (optFields & genePredName2Fld) gp->name2 = cloneString(geneIdToUse); if (optFields & genePredCdsStatFld) { if (cdsStart < cdsEnd) { if (!useStartStops) { /* GFF doesn't require start or stop, if not used, assume complete */ gp->cdsStartStat = cdsComplete; gp->cdsEndStat = cdsComplete; } else if (group->strand == '+') { gp->cdsStartStat = (haveStartCodon ? cdsComplete : cdsIncomplete); gp->cdsEndStat = (haveStopCodon ? cdsComplete : cdsIncomplete);; } else { gp->cdsEndStat = (haveStartCodon ? cdsComplete : cdsIncomplete); gp->cdsStartStat = (haveStopCodon ? cdsComplete : cdsIncomplete);; } } else { gp->cdsStartStat = cdsNone; gp->cdsEndStat = cdsNone; } } if (optFields & genePredExonFramesFld) { AllocArray(gp->exonFrames, exonCount); for (i = 0; i < exonCount; i++) gp->exonFrames[i] = -1; } /* adjust tx range to include stop codon */ if ((group->strand == '+') && (gp->txEnd == stopCodonStart)) gp->txEnd = stopCodonEnd; else if ((group->strand == '-') && (gp->txStart == stopCodonEnd)) gp->txStart = stopCodonStart; i = -1; /* before first exon */ /* fill in exons, merging overlaping and adjacent exons */ for (gl = group->lineList; gl != NULL; gl = gl->next) { if (ignoreGxfLine(gl, isGtf)) continue; if (isExon(gl->feature, isGtf, exonSelectWord) || (isGtf && isCds(gl->feature))) { chkGroupLine(group, gl, gp); if ((i < 0) || (gl->start > eEnds[i])) { /* start a new exon */ ++i; assert(i < exonCount); eStarts[i] = gl->start; eEnds[i] = gl->end; } else { /* overlap, extend exon, picking the largest of ends */ assert(i < exonCount); assert(gl->start >= eStarts[i]); if (gl->end > eEnds[i]) eEnds[i] = gl->end; } } } gp->exonCount = i+1; /* adjust for flybase type of GFFs, with stop codon implied and outside of * CDS. */ if ((options & genePredGxfImpliedStopAfterCds) && !haveStopCodon) adjImpliedStopAfterCds(gp); if (optFields & genePredExonFramesFld) assignFrame(isGtf, group, gp, options); return gp; } struct genePred *genePredFromGroupedGff(struct gffFile *gff, struct gffGroup *group, char *name, char *exonSelectWord, unsigned optFields, unsigned options) /* Convert gff->groupList to genePred list. See genePred.h for details. */ { struct gffLine *gl; boolean anyExon = FALSE; /* Look to see if any exons. If not allow CDS to be used instead. */ if (exonSelectWord) { for (gl = group->lineList; gl != NULL; gl = gl->next) { if (sameWord(gl->feature, exonSelectWord)) { anyExon = TRUE; break; } } } else anyExon = TRUE; if (!anyExon) exonSelectWord = "CDS"; return mkFromGroupedGxf(gff, group, name, FALSE, exonSelectWord, optFields, options); } struct genePred *genePredFromGroupedGtf(struct gffFile *gff, struct gffGroup *group, char *name, unsigned optFields, unsigned options) /* Convert gff->groupList to genePred list, using GTF feature conventions. * See genePred.h for details. */ { return mkFromGroupedGxf(gff, group, name, TRUE, NULL, optFields, options); } static void mapCdsToGenome(struct psl *psl, struct genbankCds* cds, struct genePred* gene) /* Convert set cdsStart/end from mrna to genomic coordinates. */ { struct genbankCds genomeCds = genbankCdsToGenome(cds, psl); if (genomeCds.start < genomeCds.end) { gene->cdsStart = genomeCds.start; gene->cdsEnd = genomeCds.end; if (gene->optFields & genePredCdsStatFld) { if (gene->strand[0] == '+') { gene->cdsStartStat = (genomeCds.startComplete) ? cdsComplete : cdsIncomplete; gene->cdsEndStat = (genomeCds.endComplete) ? cdsComplete : cdsIncomplete;; } else { gene->cdsStartStat = (genomeCds.endComplete) ? cdsComplete : cdsIncomplete;; gene->cdsEndStat = (genomeCds.startComplete) ? cdsComplete : cdsIncomplete; } } } else { /* CDS not aligned */ gene->cdsStart = gene->txEnd; gene->cdsEnd = gene->txEnd; if (gene->optFields & genePredCdsStatFld) { gene->cdsStartStat = cdsUnknown; gene->cdsEndStat = cdsUnknown; } } } void genePredAddGenbankCds(struct psl *psl, struct genbankCds* cds, struct genePred *gene) /* Convert cdsStart/End from mrna to genomic coordinates. * Note that the genePred blocks need not be filled in before * this call. */ { if (cds == NULL) { /* no CDS, set to zero-length at end */ gene->cdsStart = psl->tEnd; gene->cdsEnd = psl->tEnd; if (gene->optFields & genePredCdsStatFld) { gene->cdsStartStat = cdsNone; gene->cdsEndStat = cdsNone; } } else if ((cds->start < 0) || (cds->end <= 0)) { /* unknown CDS, set to end */ gene->cdsStart = psl->tEnd; gene->cdsEnd = psl->tEnd; if (gene->optFields & genePredCdsStatFld) { gene->cdsStartStat = cdsUnknown; gene->cdsEndStat = cdsUnknown; } } else { /* have CDS annotation, make sure it's in bounds */ struct genbankCds adjCds = *cds; if (adjCds.end > psl->qSize) { adjCds.end = psl->qSize; adjCds.endComplete = FALSE; } mapCdsToGenome(psl, &adjCds, gene); } } static int getFrame(struct psl *psl, int start, int end, struct genbankCds* cds) /* get the starting frame for an exon of a mRNA. start and end are * the bounds of the exon within the mRNA. This may cover multiple psl * blocks due to merging of small gaps. */ { /* use the 3' end is used if it's complete, as it is more often accurate when * genes are defined from mRNAs sequenced with reverse-transcriptase. */ int frame = -1; /* map to mRNA coords in CDS since frame for an exon is in direction of * transcription. */ if (psl->strand[0] == '-') reverseIntRange(&start, &end, psl->qSize); if (start < cds->start) start = cds->start; if (end > cds->end) end = cds->end; if (start < end) { /* Compute frame from end of RNA if CDS end is marked complete and start * is not complete. This is done as the end of an RNA is more likely * completely due to reverse transcriptase not replicating the entire RNA. * However, code that create CDS from genePreds doesn't always create a * CDS specification that indicates incompleteness. So don't use CDS end * unless we know start is incomplete, mean code tried to set it. This is * not a perfect solution, as handling of CDS specification is naive and * doesn't account for truncated start or stop. Incomplete codons can * result in frame shift even is CDS completeness is set correctly. */ if (cds->endComplete && !cds->startComplete) { int fr = (cds->end-start) % 3; frame = (fr == 2) ? 1 : ((fr == 1) ? 2 : 0); } else frame = (start-cds->start) % 3; } return frame; } static boolean shouldMergeBlocks(struct genePred *gene, unsigned tStart, unsigned prevTEnd, unsigned qStart, unsigned prevQEnd, unsigned options, int cdsMergeSize, int utrMergeSize) /* determine if a new block starting at tStart whould be merged with * the preceeding exon. */ { boolean inCds = (gene->cdsStart <= tStart) && (tStart < gene->cdsEnd); int tGapSize = (tStart - prevTEnd); int qGapSize = (qStart - prevQEnd); if (inCds) { if ((options & genePredPslCdsMod3) && (((tGapSize % 3) != 0) || ((qGapSize % 3) != 0))) return FALSE; /* not a multiple of three */ if ((cdsMergeSize >= 0) && ((tGapSize <= cdsMergeSize) && (qGapSize <= cdsMergeSize))) return TRUE; } else { // qGapSize only matters for CDS where we are trying to keep frame sane if ((utrMergeSize >= 0) && (tGapSize <= utrMergeSize)) return TRUE; } return FALSE; /* don't merge */ } static void pslToExons(struct psl *psl, struct genePred *gene, struct genbankCds* cds, unsigned options, int cdsMergeSize, int utrMergeSize) /* Convert psl alignment blocks to genePred exons, merging together blocks by * the specified paraemeters. Optionally add frame information. */ { int iBlk, iExon; int startIdx, stopIdx, idxIncr; /* this is bigger than needed if blocks merge */ gene->exonStarts = needMem(psl->blockCount*sizeof(unsigned)); gene->exonEnds = needMem(psl->blockCount*sizeof(unsigned)); if (gene->optFields & genePredExonFramesFld) { /* default frames to indicate no frame */ gene->exonFrames = needMem(psl->blockCount*sizeof(unsigned)); for (iExon = 0; iExon < psl->blockCount; iExon++) gene->exonFrames[iExon] = -1; } /* traverse psl in postive target order */ if (psl->strand[1] == '-') { startIdx = psl->blockCount-1; stopIdx = -1; idxIncr = -1; } else { startIdx = 0; stopIdx = psl->blockCount; idxIncr = 1; } iExon = -1; /* indicate none have been added */ unsigned prevQEnd = 0; unsigned prevTEnd = 0; for (iBlk = startIdx; iBlk != stopIdx; iBlk += idxIncr) { int qStart = psl->qStarts[iBlk]; int qEnd = qStart + psl->blockSizes[iBlk]; if (psl->strand[0] == '-') reverseIntRange(&qStart, &qEnd, psl->qSize); int tStart = psl->tStarts[iBlk]; int tEnd = tStart + psl->blockSizes[iBlk]; if (psl->strand[1] == '-') reverseIntRange(&tStart, &tEnd, psl->tSize); if ((iExon < 0) || !shouldMergeBlocks(gene, tStart, prevTEnd, qStart, prevQEnd, options, cdsMergeSize, utrMergeSize)) { /* new exon */ iExon++; gene->exonStarts[iExon] = tStart; } gene->exonEnds[iExon] = tEnd; /* Set the frame. We have to deal with multiple blocks being merged. For * positive strand, the first block frame is used, for negative strand, * the last is used. */ if ((gene->optFields & genePredExonFramesFld) && (cds != NULL) && (((psl->strand[0] == '+') && (gene->exonFrames[iExon] < 0)) || (psl->strand[0] == '-'))) { int fr = getFrame(psl, psl->qStarts[iBlk], psl->qStarts[iBlk]+psl->blockSizes[iBlk], cds); if (fr >= 0) gene->exonFrames[iExon] = fr; } prevTEnd = tEnd; prevQEnd = qEnd; } gene->exonCount = iExon+1; assert(gene->exonCount <= psl->blockCount); } struct genePred *genePredFromPsl3(struct psl *psl, struct genbankCds* cds, unsigned optFields, unsigned options, int cdsMergeSize, int utrMergeSize) /* Convert a PSL of an mRNA alignment to a genePred, converting a genbank CDS * specification string to genomic coordinates. Small genomic inserts are * merged based on the mergeSize parameters. Gaps no larger than the * specified merge sizes result in the adjacent blocks being merged into a * single exon. Gaps in CDS use cdsMergeSize, in UTR use utrMergeSize. If * the genePredPslCdsMod3 option is specified, then CDS gaps are only merged * if a multiple of three. A negative merge sizes disables merging of blocks. * This differs from specifying zero in that adjacent blocks will not be * merged. The optfields field is a set from genePredFields, indicated what * fields to create. Zero-length CDS, or null cds, creates without CDS * annotation. If cds is null, it will set status fields to cdsNone. */ { struct genePred *gene; #if 0 /* FIXME; the browser calls this on a protein psl when rendering the amino * acids of a protein alignment track. Some of this code doesn't really make * sense for proteins and I really don't see how this produces the right * results, however the check is disabled until this can be investigated * because it *seems* to work. markd 2006-05-22 */ if (pslIsProtein(psl)) errAbort("can't convert protein psls to genePreds"); #endif AllocVar(gene); gene->name = cloneString(psl->qName); gene->chrom = cloneString(psl->tName); gene->txStart = psl->tStart; gene->txEnd = psl->tEnd; gene->optFields = optFields; /* get strand in genome that the positive version mRNA aligns to */ if (psl->strand[1] == '\0') { /* assumed pos target strand, so neg query would be pos target */ gene->strand[0] = psl->strand[0]; } else { /* query and target strand are different; will be neg when query pos */ gene->strand[0] = ((psl->strand[0] != psl->strand[1]) ? '-' : '+'); } genePredAddGenbankCds(psl, cds, gene); pslToExons(psl, gene, cds, options, cdsMergeSize, utrMergeSize); return gene; } struct genePred *genePredFromPsl2(struct psl *psl, unsigned optFields, struct genbankCds* cds, int insertMergeSize) /* Compatibility function, genePredFromPsl3 is prefered. See that function's * documentation for details. This calls genePredFromPsl3 with no options * and insertMergeSize set for CDS and UTR. */ { return genePredFromPsl3(psl, cds, optFields, genePredPslDefaults, insertMergeSize, insertMergeSize); } struct genePred *genePredFromPsl(struct psl *psl, int cdsStart, int cdsEnd, int insertMergeSize) /* Compatibility function, genePredFromPsl3 is prefered. See that function's * documentation for details. This calls genePredFromPsl3 with no options. */ { struct genbankCds cds; ZeroVar(&cds); cds.start = cdsStart; cds.end = cdsEnd; return genePredFromPsl3(psl, &cds, 0, genePredPslDefaults, insertMergeSize, insertMergeSize); } char* genePredGetCreateSql(char* table, unsigned optFields, unsigned options, int chromIndexLen) /* Get SQL required to create a genePred table. optFields is a bit set * consisting of the genePredFields values. Options are a bit set of * genePredCreateOpts. Returned string should be freed. This will create all * optional fields that preceed the highest optFields column. chromIndexLen * is now ignored.. */ { /* the >= is used so that we create preceeding fields. */ struct dyString *dy = sqlDyStringCreate( "CREATE TABLE %s (", table); /* bin column goes here */ if (options & genePredWithBin) sqlDyStringPrintf(dy, " bin smallint unsigned not null," " INDEX(chrom,bin),"); else sqlDyStringPrintf(dy, " INDEX(chrom,txStart),"); sqlDyStringPrintf(dy, " name varchar(255) not null," /* mrna accession of gene */ " chrom varchar(255) not null," /* Chromosome name */ " strand char(1) not null," /* + or - for strand */ " txStart int unsigned not null," /* Transcription start position */ " txEnd int unsigned not null," /* Transcription end position */ " cdsStart int unsigned not null," /* Coding region start */ " cdsEnd int unsigned not null," /* Coding region end */ " exonCount int unsigned not null," /* Number of exons */ " exonStarts longblob not null," /* Exon start positions */ " exonEnds longblob not null," /* Exon end positions */ " INDEX(name)" ); if (optFields >= genePredScoreFld) sqlDyStringPrintf(dy, " ,score int"); if (optFields >= genePredName2Fld) sqlDyStringPrintf(dy, " ,name2 varchar(255) not null," /* Secondary name. (e.g. name of gene) or NULL if not available */ " INDEX(name2)"); if (optFields >= genePredCdsStatFld) sqlDyStringPrintf(dy, " ,cdsStartStat enum('none', 'unk', 'incmpl', 'cmpl') not null," /* Status of cdsStart annotation */ " cdsEndStat enum('none', 'unk', 'incmpl', 'cmpl') not null"); /* Status of cdsEnd annotation */ if (optFields >= genePredExonFramesFld) sqlDyStringPrintf(dy, " ,exonFrames longblob not null"); /* List of frame for each exon, or -1 if no frame or not known. NULL if not available. */ sqlDyStringPrintf(dy, ")"); return cloneString(dyStringCannibalize(&dy)); } // FIXME: this really doesn't belong in this module struct genePred *getOverlappingGene(char *db, struct genePred **list, char *table, char *chrom, int cStart, int cEnd, char *name, int *retOverlap) { /* read all genes from a table find the gene with the biggest overlap. Cache the list of genes to so we only read it once */ char query[256]; struct genePred *gene; struct sqlConnection *conn; struct sqlResult *sr; char **row; struct genePred *el = NULL, *bestMatch = NULL, *gp = NULL; int overlap = 0 , bestOverlap = 0, i; struct psl *psl; int *eFrames; if (*list == NULL) { printf("Loading Predictions from %s\n",table); AllocVar(*list); conn = hAllocConn(db); AllocVar(gene); sqlSafef(query, sizeof(query), "select * from %s", table); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { if (!sameString(table,"all_mrna")) { el = genePredLoad(row); } else { psl = pslLoad(row); el = genePredFromPsl2(psl, 0, NULL, genePredStdInsertMergeSize); } slAddHead(list, el); } slReverse(list); sqlFreeResult(&sr); hFreeConn(&conn); } for (el = *list; el != NULL; el = el->next) { if (chrom != NULL && el->chrom != NULL) { overlap = 0; if ( sameString(chrom, el->chrom)) { for (i = 0 ; i<(el->exonCount); i++) { overlap += positiveRangeIntersection(cStart,cEnd, el->exonStarts[i], el->exonEnds[i]) ; } if (overlap > 20 && sameString(name, el->name)) { bestMatch = el; bestOverlap = overlap; *retOverlap = bestOverlap; } if (overlap > bestOverlap) { bestMatch = el; bestOverlap = overlap; *retOverlap = bestOverlap; } } } } if (bestMatch != NULL) { /* Allocate genePred and fill in values. */ AllocVar(gp); gp->name = cloneString(bestMatch->name); gp->chrom = cloneString(bestMatch->chrom); gp->strand[1] = bestMatch->strand[1]; gp->strand[0] = bestMatch->strand[0]; gp->txStart = bestMatch->txStart; gp->txEnd = bestMatch->txEnd; gp->cdsStart = bestMatch->cdsStart; gp->cdsEnd = bestMatch->cdsEnd; gp->exonCount = bestMatch->exonCount; AllocArray(gp->exonStarts, bestMatch->exonCount); AllocArray(gp->exonEnds, bestMatch->exonCount); for (i=0; iexonCount; ++i) { gp->exonStarts[i] = bestMatch->exonStarts[i] ; gp->exonEnds[i] = bestMatch->exonEnds[i] ; } gp->optFields = bestMatch->optFields; gp->score = bestMatch->score; if (bestMatch->optFields & genePredName2Fld) gp->name2 = cloneString(bestMatch->name2); else gp->name2 = NULL; if (bestMatch->optFields & genePredCdsStatFld) { gp->cdsStartStat = bestMatch->cdsStartStat; gp->cdsEndStat = bestMatch->cdsEndStat; } if (bestMatch->optFields & genePredExonFramesFld) { gp->exonFrames = AllocArray(eFrames, bestMatch->exonCount); for (i = 0; i < bestMatch->exonCount; i++) gp->exonFrames[i] = bestMatch->exonFrames[i]; } eFrames = gp->exonFrames; } return gp; } int genePredBases(struct genePred *gp) /* count coding and utr bases in a gene prediction */ { int count = 0, i; if (gp == NULL) return 0; for (i=0; iexonCount; i++) { count += gp->exonEnds[i] - gp->exonStarts[i] ; } return count; } int genePredCodingBases(struct genePred *gp) /* Count up the number of coding bases in gene prediction. */ { int i, exonCount = gp->exonCount; int cdsStart = gp->cdsStart, cdsEnd = gp->cdsEnd; int baseCount = 0; for (i=0; iexonStarts[i], gp->exonEnds[i]); } return baseCount; } boolean genePredCdsExon(struct genePred *gp, int iExon, int *startPtr, int *endPtr) /* Get the CDS range in an exon. If there is no CDS, return FALSE and then * set start == end */ { assert((iExon >= 0) && (iExon < gp->exonCount)); int start = gp->exonStarts[iExon]; int end = gp->exonEnds[iExon]; if (start < gp->cdsStart) start = gp->cdsStart; if (end > gp->cdsEnd) end = gp->cdsEnd; if (startPtr != NULL) *startPtr = start; if (endPtr != NULL) *endPtr = end; return (start < end); } static void gpError(FILE* errFh, int* errorCnt, char *format, ...) /* print and count an error */ { va_list args; fprintf(errFh, "Error: "); va_start(args, format); vfprintf(errFh, format, args); va_end(args); fputc('\n', errFh); (*errorCnt)++; } static void checkExon(FILE* errFh, int* errorCnt, char *desc, struct genePred* gp, int iExon) /* check one exon of a genePred */ { unsigned exonStart = gp->exonStarts[iExon]; unsigned exonEnd = gp->exonEnds[iExon]; if (exonStart >= exonEnd) gpError(errFh, errorCnt, "%s: %s exon %u start %u >= end %u", desc, gp->name, iExon, exonStart, exonEnd); if (exonStart < gp->txStart) gpError(errFh, errorCnt, "%s: %s exon %u start %u < txStart %u", desc, gp->name, iExon, exonStart, gp->txStart); if (exonEnd > gp->txEnd) gpError(errFh, errorCnt, "%s: %s exon %u end %u > txEnd %u", desc, gp->name, iExon, exonEnd, gp->txEnd); if (iExon > 0) { /* other than first exon */ if (exonStart < gp->exonEnds[iExon-1]) gpError(errFh, errorCnt, "%s: %s exon %u (%s:%d-%d) overlaps previous exon (%s:%d-%d)", desc, gp->name, iExon, gp->chrom, exonStart, exonEnd, gp->chrom, gp->exonStarts[iExon-1], gp->exonEnds[iExon-1]); } if (gp->optFields & genePredExonFramesFld) { int frame = gp->exonFrames[iExon]; if ((frame < -1) || (frame > 2)) gpError(errFh, errorCnt, "%s: %s invalid exonFrame: %d", desc, gp->name, frame); if ((exonEnd > gp->cdsStart) && (exonStart < gp->cdsEnd)) { if (frame == -1) gpError(errFh, errorCnt, "%s: %s no exonFrame on CDS exon %d", desc, gp->name, iExon); } else { if (frame != -1) gpError(errFh, errorCnt, "%s: %s exonFrame on non-CDS exon %d", desc, gp->name, iExon); } } } static void checkCdsBounds(FILE* errFh, int* errorCnt, char *desc, struct genePred* gp) /* check cdsStart/cdsEnd */ { int iExon; boolean foundCdsStart = FALSE, foundCdsEnd = FALSE; if (gp->cdsStart > gp->cdsEnd) gpError(errFh, errorCnt, "%s: %s cdsStart %u > cdsEnd %u", desc, gp->name, gp->cdsStart, gp->cdsEnd); if ((gp->cdsStart < gp->txStart) || (gp->cdsStart > gp->txEnd)) gpError(errFh, errorCnt, "%s: %s cdsStart %u not in tx bounds %u-%u", desc, gp->name, gp->cdsStart, gp->txStart, gp->txEnd); if ((gp->cdsEnd < gp->txStart) || (gp->cdsEnd > gp->txEnd)) gpError(errFh, errorCnt, "%s: %s cdsEnd %u not in tx bounds %u-%u", desc, gp->name, gp->cdsEnd, gp->txStart, gp->txEnd); /* is cdsStart/cdsEnd in an exon? */ for (iExon = 0; iExon < gp->exonCount; iExon++) { if ((gp->cdsStart >= gp->exonStarts[iExon]) && (gp->cdsStart < gp->exonEnds[iExon])) foundCdsStart = TRUE; if ((gp->cdsEnd > gp->exonStarts[iExon]) && (gp->cdsEnd <= gp->exonEnds[iExon])) foundCdsEnd = TRUE; } if (!foundCdsStart) gpError(errFh, errorCnt, "%s: %s cdsStart %u not in an exon", desc, gp->name, gp->cdsStart); if (!foundCdsEnd) gpError(errFh, errorCnt, "%s: %s cdsEnd %u not in an exon", desc, gp->name, gp->cdsEnd); } int genePredCheck(char *desc, FILE* errFh, int chromSize, struct genePred* gp) /* Validate a genePred for consistency. desc is printed the error messages * to file errFh (open /dev/null to discard). chromSize should contain * size of chromosome, or 0 if chrom is not valid, or -1 to not check * chromosome bounds. Returns count of errors. */ { int iExon; int errorCnt = 0; if (gp->name == NULL) gpError(errFh, &errorCnt, "%s: name is null", desc); if (!(sameString(gp->strand, "+") || sameString(gp->strand, "-"))) gpError(errFh, &errorCnt, "%s: %s invalid strand: \"%s\"", desc, gp->name, gp->strand); /* check chrom */ if (chromSize == 0) gpError(errFh, &errorCnt, "%s: %s chrom not a valid chromosome: \"%s\"", desc, gp->name, gp->chrom); else if (chromSize > 0) { if (gp->txEnd > chromSize) gpError(errFh, &errorCnt, "%s: %s txEnd %u >= chromSize %u", desc, gp->name, gp->txEnd, chromSize); } /* check internal consistency */ if (gp->txStart >= gp->txEnd) gpError(errFh, &errorCnt, "%s: %s txStart %u >= txEnd %u", desc, gp->name, gp->txStart, gp->txEnd); /* no CDS is indicated by cdsStart == cdsEnd */ if (gp->cdsStart != gp->cdsEnd) checkCdsBounds(errFh, &errorCnt, desc, gp); /* must be at least one exon */ if (gp->exonCount == 0) gpError(errFh, &errorCnt, "%s: %s contains no exons", desc, gp->name); else { /* make sure first/last exons match tx range */ if (gp->txStart != gp->exonStarts[0]) gpError(errFh, &errorCnt, "%s: %s first exon start %u doesn't match txStart %u", desc, gp->name, gp->exonStarts[0], gp->txStart); if (gp->txEnd != gp->exonEnds[gp->exonCount-1]) gpError(errFh, &errorCnt, "%s: %s last exon end %u doesn't match txEnd %u", desc, gp->name, gp->exonEnds[gp->exonCount-1], gp->txEnd); } /* check each exon */ for (iExon = 0; iExon < gp->exonCount; iExon++) checkExon(errFh, &errorCnt, desc, gp, iExon); return errorCnt; } static int lookupChromSize(char *desc, FILE* errFh, char* db, struct genePred* gp, int *errorCnt) /* lookup the chromosome size in the database, -1 if invalid */ { // hGetChromInfo is case independent struct chromInfo *ci = hGetChromInfo(db, gp->chrom); if (ci == NULL) { fprintf(errFh, "Error: %s: %s has invalid chrom for %s: %s\n", desc, gp->name, db, gp->chrom); (*errorCnt)++; return -1; // don't validate } else if (differentString(gp->chrom, ci->chrom)) // verify case dependent equal { fprintf(errFh, "Error: %s: %s has invalid chrom for %s: %s\n", desc, gp->name, db, gp->chrom); (*errorCnt)++; return -1; // don't validate } else return ci->size; } int genePredCheckDb(char *desc, FILE* errFh, char* db, struct genePred* gp) /* Validate a genePred for consistency. desc is printed the error messages * to file errFh (open /dev/null to discard). Lookup chromosome size in database if * db is not NULL. Returns count of errors. */ { int errorCnt = 0; int chromSize = -1; /* default to not checking */ if (db != NULL) chromSize = lookupChromSize(desc, errFh, db, gp, &errorCnt); errorCnt += genePredCheck(desc, errFh, chromSize, gp); return errorCnt; } int genePredCheckChromSizes(char *desc, FILE* errFh, struct genePred* gp, struct hash* chromSizes) /* Validate a genePred for consistency. desc is printed the error messages * to file errFh (open /dev/null to discard). Lookup chromosome size in hash. */ { int errorCnt = 0; int chromSize = hashIntValDefault(chromSizes, gp->chrom, -1); if (chromSize < 0) { fprintf(errFh, "Error: %s: %s has invalid chrom for: %s\n", desc, gp->name, gp->chrom); errorCnt++; } else errorCnt += genePredCheck(desc, errFh, chromSize, gp); return errorCnt; } boolean genePredNmdTarget(struct genePred *gp) /* Return TRUE if cds end is more than 50bp upstream of last intron. */ { int gpSize = 0; int i = 0; int startDist = 0, endDist = 0; int blockSize = 0; if(gp->exonCount < 2 || gp->cdsStart == gp->cdsEnd) return FALSE; for(i = 0; i < gp->exonCount; i++) { int blockStart = gp->exonStarts[i]; int blockEnd = gp->exonEnds[i]; blockSize = blockEnd - blockStart; if( blockStart <= gp->cdsStart && gp->cdsStart <= blockEnd) startDist += blockSize - (blockEnd - gp->cdsStart); else if(blockEnd <= gp->cdsStart) startDist += blockSize; if( blockStart <= gp->cdsEnd && gp->cdsEnd <= blockEnd) endDist += blockSize - (blockEnd - gp->cdsEnd); else if(blockEnd <= gp->cdsEnd) endDist += blockSize; gpSize += blockSize; } /* xxxxxXXX----XXXXXXXXXXXXX--------XXXXXXX----XXXxxxxxxxx-------xxxxxxxxx */ if(sameString(gp->strand, "+")) { blockSize = gp->exonEnds[gp->exonCount-1] - gp->exonStarts[gp->exonCount-1]; return endDist < (gpSize - blockSize - 50); } else if(sameString(gp->strand, "-")) { blockSize = gp->exonEnds[0] - gp->exonStarts[0]; return startDist > (blockSize + 50); } else errAbort("genePredNmdsTarget() - Don't recognize strand: %s\n", gp->strand); return FALSE; } void genePredAddExonFrames(struct genePred *gp) /* Add exonFrames array to a genePred that doesn't have it. Frame is assumed * to be contiguous. NOTE: suggest using genePredFixExonFrames for new code. */ { int iExon, start, end, iBase = 0; int iStart, iEnd, iDir; /* initial array to all -1, for no frame */ AllocArray(gp->exonFrames, gp->exonCount); gp->optFields |= genePredExonFramesFld; for (iExon = 0; iExon < gp->exonCount; iExon++) gp->exonFrames[iExon] = -1; if (sameString(gp->strand, "+")) { iStart = 0; iEnd = gp->exonCount; iDir = 1; } else { iStart = gp->exonCount-1; iEnd = -1; iDir = -1; } for (iExon = iStart; iExon != iEnd; iExon += iDir) { if (genePredCdsExon(gp, iExon, &start, &end)) { gp->exonFrames[iExon] = iBase % 3; iBase += (end - start); } } } void genePredFixExonFrames(struct genePred *gp) /* Add exonFrames array to a genePred that has frame on only some or no * features. Frame is assumed to be contiguous when an existing frame is not * present. */ { int iStart, iEnd, iDir; if (gp->exonFrames == NULL) { /* doesn't currently have frames */ AllocArray(gp->exonFrames, gp->exonCount); gp->optFields |= genePredExonFramesFld; for (int iExon = 0; iExon < gp->exonCount; iExon++) gp->exonFrames[iExon] = -1; } if (sameString(gp->strand, "+")) { iStart = 0; iEnd = gp->exonCount; iDir = 1; } else { iStart = gp->exonCount - 1; iEnd = -1; iDir = -1; } int nextFrame = -1; for (int iExon = iStart; iExon != iEnd; iExon += iDir) { int start, end; if (genePredCdsExon(gp, iExon, &start, &end)) { if (gp->exonFrames[iExon] < 0) { // no existing frame if (nextFrame < 0) nextFrame = 0; // first frame gp->exonFrames[iExon] = nextFrame; } nextFrame = incrFrame(gp->exonFrames[iExon], end - start); } else { gp->exonFrames[iExon] = -1; } } } void genePredRc(struct genePred *gp, int chromSize) /* Reverse complement a genePred (project it to the opposite strand). Useful * when doing analysis that is simplified by having things on the same strand. */ { int iExon; gp->strand[0] = (gp->strand[0] == '+') ? '-' : '+'; reverseUnsignedRange(&gp->txStart, &gp->txEnd, chromSize); reverseUnsignedRange(&gp->cdsStart, &gp->cdsEnd, chromSize); for (iExon = 0; iExon < gp->exonCount; iExon++) reverseUnsignedRange(&gp->exonStarts[iExon], &gp->exonEnds[iExon], chromSize); reverseUnsigned(gp->exonStarts, gp->exonCount); reverseUnsigned(gp->exonEnds, gp->exonCount); if (gp->optFields & genePredCdsStatFld) { enum cdsStatus hold = gp->cdsStartStat; gp->cdsStartStat = gp->cdsEndStat; gp->cdsEndStat = hold; } if (gp->optFields & genePredExonFramesFld) reverseInts(gp->exonFrames, gp->exonCount); } struct genePred *genePredNew(char *name, char *chrom, char strand, unsigned txStart, unsigned txEnd, unsigned cdsStart, unsigned cdsEnd, unsigned optFields, unsigned exonSpace) /* create a new gene with space for the specified number of exons allocated. * genePredGrow maybe used to expand this space if needed. */ { struct genePred *gp; AllocVar(gp); assert(exonSpace > 0); gp->name = cloneString(name); gp->chrom = cloneString(chrom); gp->strand[0] = strand; gp->txStart = txStart; gp->txEnd = txEnd; gp->cdsStart = cdsStart; gp->cdsEnd = cdsEnd; gp->optFields = optFields; AllocArray(gp->exonStarts, exonSpace); AllocArray(gp->exonEnds, exonSpace); if (gp->optFields & genePredExonFramesFld) AllocArray(gp->exonFrames, exonSpace); return gp; } void genePredGrow(struct genePred *gp, unsigned *exonSpacePtr) /* Increase memory allocated to a psl to hold more exons. exonSpacePtr * should point the the current maximum number of exons and will be * updated to with the new amount of space. */ { int exonSpace = *exonSpacePtr; int newSpace = 2 * exonSpace; ExpandArray(gp->exonStarts, exonSpace, newSpace); ExpandArray(gp->exonEnds, exonSpace, newSpace); if (gp->optFields & genePredExonFramesFld) ExpandArray(gp->exonFrames, exonSpace, newSpace); *exonSpacePtr = newSpace; } struct rbTree *genePredToRangeTree(struct genePred *gp, boolean cdsOnly) /* Convert genePred into a range tree. */ { struct rbTree *rangeTree = rangeTreeNew(); int i; for (i=0; iexonCount; ++i) { int start = gp->exonStarts[i]; int end = gp->exonEnds[i]; if (cdsOnly) { start = max(start, gp->cdsStart); end = min(end, gp->cdsEnd); if (start >= end) continue; } rangeTreeAdd(rangeTree, start, end); } return rangeTree; } void gpPartOutAsBed(struct genePred *gp, int start, int end, FILE *f, char *type, int id, int minSize) /* Write out part of gp as bed12. */ { /* Figure out # of blocks and min/max of area inside start/end */ int blockCount = 0; int newStart = gp->txEnd, newEnd = gp->txStart; int size = 0; int i; for (i=0; iexonCount; ++i) { int exonStart = gp->exonStarts[i]; int exonEnd = gp->exonEnds[i]; exonStart = max(start, exonStart); exonEnd = min(end, exonEnd); if (exonStart < exonEnd) { ++blockCount; newStart = min(exonStart, newStart); newEnd = max(exonEnd, newEnd); size += exonEnd - exonStart; } } /* Output first 10 fields of bed12. */ if (size > minSize) { fprintf(f, "%s\t%d\t%d\t", gp->chrom, newStart, newEnd); fprintf(f, "%s_%d_%s\t", type, id, gp->name); fprintf(f, "0\t%s\t0\t0\t0\t%d\t", gp->strand, blockCount); /* Output blockSizes field */ for (i=0; iexonCount; ++i) { int exonStart = gp->exonStarts[i]; int exonEnd = gp->exonEnds[i]; exonStart = max(start, exonStart); exonEnd = min(end, exonEnd); if (exonStart < exonEnd) fprintf(f, "%d,", exonEnd - exonStart); } fprintf(f, "\t"); /* Output chromStarts field */ for (i=0; iexonCount; ++i) { int exonStart = gp->exonStarts[i]; int exonEnd = gp->exonEnds[i]; exonStart = max(start, exonStart); exonEnd = min(end, exonEnd); if (exonStart < exonEnd) fprintf(f, "%d,", exonStart - newStart); } fprintf(f, "\n"); } } boolean codonToPos(struct genePred *gp, unsigned num, int *chromStart, int *chromEnd) { // map 1-based codon to genomic coordinates. If the codon crosses an exon junction, we return just the beginning (LHS) of the codon. // Returns true if we find the codon in given gene predition; chromStart and end are set to appropriate three base region. int pos = -1; int i; int offset = -1; // current 1-based offset in bases (not codons) if(gp->strand[0] == '+') { for(i = 0; i < gp->exonCount; i++) { if(gp->exonEnds[i] > gp->cdsStart && gp->exonStarts[i] < gp->cdsEnd) { int start, end; if(offset == -1 && gp->cdsStart <= gp->exonEnds[i]) { // start counting start = gp->cdsStart; offset = 1; } else start = gp->exonStarts[i]; if(gp->cdsEnd < gp->exonEnds[i]) end = gp->cdsEnd; else end = gp->exonEnds[i]; int next = offset + end - start; if(next > (num * 3 - 2)) { pos = start + (((num - 1) * 3 + 1) - offset); break; } else offset = next; } } } else { for(i = gp->exonCount - 1; i >= 0; i--) { if(gp->exonStarts[i] < gp->cdsEnd && gp->exonEnds[i] > gp->cdsStart) { int start, end; // start here is really the RHS, and end is the LHS if(offset == -1 && gp->cdsEnd >= gp->exonStarts[i]) { // start counting start = gp->cdsEnd; offset = 1; } else start = gp->exonEnds[i]; if(gp->cdsStart > gp->exonStarts[i]) end = gp->cdsStart; else end = gp->exonStarts[i]; int next = offset + start - end; if(next > num * 3) { pos = start - (num*3 - offset) - 1; break; } else offset = next; } } } if(pos == -1) return FALSE; else { *chromStart = pos; *chromEnd = pos + 3; return TRUE; } } boolean exonToPos(struct genePred *gp, unsigned num, int *chromStart, int *chromEnd) { // map 1-based exon number to genomic coordinates. // Returns true if we find the exon in given gene predition; start and end are set to appropriate region. if(num == 0 || num > gp->exonCount) return FALSE; else if(gp->strand[0] == '+') { *chromStart = gp->exonStarts[num - 1]; *chromEnd = gp->exonEnds[num - 1]; } else { *chromStart = gp->exonStarts[gp->exonCount - num]; *chromEnd = gp->exonEnds[gp->exonCount - num]; } return TRUE; } static char *genePredAutoSqlString = "table genePred\n" "\"A gene prediction.\"\n" " (\n" " string name; \"Name of gene\"\n" " string chrom; \"Reference sequence chromosome or scaffold\"\n" " char[1] strand; \"+ or - for strand\"\n" " uint txStart; \"Transcription start position\"\n" " uint txEnd; \"Transcription end position\"\n" " uint cdsStart; \"Coding region start\"\n" " uint cdsEnd; \"Coding region end\"\n" " uint exonCount; \"Number of exons\"\n" " uint[exonCount] exonStarts; \"Exon start positions\"\n" " uint[exonCount] exonEnds; \"Exon end positions\"\n" " )\n"; struct asObject *genePredAsObj() // Return asObject describing fields of genePred { return asParseText(genePredAutoSqlString); } struct dnaSeq *genePredGetDna(char *database, struct genePred *gp, boolean coding, enum dnaCase dnaCase) // Returns the DNA sequence associated with gene prediction. // Negative strand genes will return the sequence as read from the negative strand. // Optionally restrict to coding sequence only { struct dnaSeq *dnaSeq = hDnaFromSeq(database, gp->chrom, gp->txStart, gp->txEnd, dnaCase); if (dnaSeq == NULL) return NULL; DNA *seq = dnaSeq->dna; int len = dnaSeq->size; if (coding) { int cdnaOffset = 0, exonIx; for (exonIx = 0; exonIx < gp->exonCount; exonIx++) { int exonStart = max(gp->exonStarts[exonIx],gp->cdsStart); int exonEnd = min(gp->exonEnds[ exonIx],gp->cdsEnd ); if (exonStart >= exonEnd) // non-coding exon continue; exonStart -= gp->txStart; exonEnd -= gp->txStart; int exonSize = exonEnd - exonStart; assert(cdnaOffset <= exonStart && exonEnd <= len); if (cdnaOffset < exonStart) memmove(seq + cdnaOffset, seq + exonStart, exonSize); cdnaOffset += exonSize; } seq[cdnaOffset] = '\0'; dnaSeq->size = cdnaOffset; } if (gp->strand[0] == '-') reverseComplement(seq,dnaSeq->size); return dnaSeq; } int genePredBaseToCodingPos(struct genePred *gp, int basePos, boolean stranded, boolean *isCoding) // Given a genePred model and a single (0 based) base position, predict the 0-based // DNA (stranded) coding sequence pos. Dividing this number by 3 should give the AA position! // Returns -1 when outside of coding exons unless OPTIONAL isCoding pointer to boolean is // provided. In that case, returns last valid position and sets isCoding to FALSE. { if (isCoding == NULL && (basePos < gp->cdsStart || basePos >= gp->cdsEnd)) return -1; boolean reverse = (stranded && gp->strand[0] == '-'); assert(gp->exonStarts != NULL && gp->exonEnds != NULL); // Walk through exons, advancing depending upon strand int codingBasesSoFar = 0; int exonIx = (reverse ? (gp->exonCount - 1) : 0); while (0 <= exonIx && exonIx < gp->exonCount) { int exonStart = max(gp->exonStarts[exonIx],gp->cdsStart); int exonEnd = min(gp->exonEnds[ exonIx],gp->cdsEnd ); if (exonStart < exonEnd) // coding exon { // Within this exon? if (exonStart <= basePos && basePos < exonEnd) { if (isCoding != NULL) *isCoding = TRUE; if (reverse) return (codingBasesSoFar + (exonEnd - basePos)); else return (codingBasesSoFar + (basePos - exonStart)); } // Past the target base already? if (( reverse && basePos >= exonEnd ) || (!reverse && basePos < exonStart)) break; // Yet to reach the target base... accumulate exon worth codingBasesSoFar += (exonEnd - exonStart); } exonIx += (reverse ? -1 : 1); } if (isCoding != NULL && codingBasesSoFar > 0) { *isCoding = FALSE; return codingBasesSoFar; } return -1; // introns not okay } struct genePredExt *genePredFromBedBigGenePred( char *chrom, struct bed *bed, struct bigBedInterval *bb) /* build a genePred from a bigGenePred and a bed file */ { char *extra = cloneString(bb->rest); int numCols = 12 + 8 - 3; char *row[numCols]; int wordCount = chopByChar(extra, '\t', row, numCols); if (wordCount < numCols) errAbort("expected at least %d columns in bigGenePred, got %d; is this actually a bigGenePred?", numCols, wordCount); struct genePredExt *gp = bedToGenePredExt(bed); gp->name2 = cloneString(row[ 9]); int numBlocks; sqlSignedDynamicArray(row[ 12], &gp->exonFrames, &numBlocks); gp->optFields |= genePredExonFramesFld; //assert (numBlocks == gp->exonCount); gp->type = cloneString(row[13]); gp->geneName = cloneString(row[14]); gp->geneName2 = cloneString(row[15]); return gp; } struct genePredExt *genePredFromBigGenePred( char *chrom, struct bigBedInterval *bb) /* build a genePred from a bigGenePred */ { char *extra = cloneString(bb->rest); int numCols = 12 + 8 - 3; char *row[numCols]; int wordCount = chopByChar(extra, '\t', row, numCols); assert(wordCount == numCols); struct genePredExt *gp; AllocVar(gp); gp->chrom = chrom; gp->txStart = bb->start; gp->txEnd = bb->end; gp->name = cloneString(row[ 0]); gp->strand[0] = row[ 2][0]; gp->strand[1] = row[ 2][1]; gp->cdsStart = atoi(row[ 3]); gp->cdsEnd = atoi(row[ 4]); gp->exonCount = atoi(row[ 6]); int numBlocks; sqlUnsignedDynamicArray(row[ 8], &gp->exonStarts, &numBlocks); assert (numBlocks == gp->exonCount); sqlUnsignedDynamicArray(row[ 7], &gp->exonEnds, &numBlocks); assert (numBlocks == gp->exonCount); int ii; for(ii=0; ii < numBlocks; ii++) { gp->exonStarts[ii] += bb->start; gp->exonEnds[ii] += gp->exonStarts[ii]; } gp->name2 = cloneString(row[ 9]); gp->cdsStartStat = parseCdsStat(row[ 10]); gp->cdsEndStat = parseCdsStat(row[ 11]); sqlSignedDynamicArray(row[ 12], &gp->exonFrames, &numBlocks); gp->optFields |= genePredExonFramesFld; assert (numBlocks == gp->exonCount); gp->type = cloneString(row[13]); gp->geneName = cloneString(row[14]); gp->geneName2 = cloneString(row[15]); return gp; } static void sqlUnsignedDynamicArrayNoClobber(char *s, unsigned **retArray, int *retSize) /* Make a copy of s on stack and chop that up so we don't mangle s. */ { char copy[strlen(s)+1]; safecpy(copy, sizeof(copy), s); sqlUnsignedDynamicArray(copy, retArray, retSize); } struct genePredExt *genePredFromBigGenePredRow(char **row) /* build a genePred from a bigGenePred row */ { struct genePredExt *gp; AllocVar(gp); gp->chrom = cloneString(row[0]); gp->txStart = sqlUnsigned(row[1]); gp->txEnd = sqlUnsigned(row[2]); gp->name = cloneString(row[3]); gp->strand[0] = row[5][0]; gp->strand[1] = row[5][1]; gp->cdsStart = sqlUnsigned(row[6]); gp->cdsEnd = sqlUnsigned(row[7]); gp->exonCount = sqlUnsigned(row[9]); int numBlocks; sqlUnsignedDynamicArrayNoClobber(row[11], &gp->exonStarts, &numBlocks); assert (numBlocks == gp->exonCount); // First put blockSizes in exonEnds: sqlUnsignedDynamicArrayNoClobber(row[10], &gp->exonEnds, &numBlocks); assert (numBlocks == gp->exonCount); // Then add in txStart to relative starts, and add starts to block sizes to get ends: int ii; for(ii=0; ii < numBlocks; ii++) { gp->exonStarts[ii] += gp->txStart; gp->exonEnds[ii] += gp->exonStarts[ii]; } gp->name2 = cloneString(row[12]); gp->cdsStartStat = parseCdsStat(row[13]); gp->cdsEndStat = parseCdsStat(row[14]); gp->optFields |= genePredCdsStatFld; sqlSignedDynamicArray(row[15], &gp->exonFrames, &numBlocks); assert (numBlocks == gp->exonCount); gp->optFields |= genePredExonFramesFld; gp->type = cloneString(row[16]); gp->geneName = cloneString(row[17]); gp->geneName2 = cloneString(row[18]); return gp; } struct cds /* CDS sequence being assembled */ { char *bases; // CDS string being built int iCds; // Index into CDS int nextFrame; // next required frame }; static struct dnaSeq* getGeneRegion(struct nibTwoCache* genomeSeqs, struct genePred *gp, int start, int end, int *chromSize) /* Get the genome sequence covering the a region of a gene, in transcription order */ { struct dnaSeq *dna = nibTwoCacheSeqPart(genomeSeqs, gp->chrom, start, end-start, chromSize); if (gp->strand[0] == '-') reverseComplement(dna->dna, dna->size); return dna; } static void removePartialCodon(struct cds* cds) /* remove partial codon that has already been copied to CDS */ { int iCdsNew = cds->iCds - cds->nextFrame; if (iCdsNew < 0) iCdsNew = 0; zeroBytes(cds->bases+iCdsNew, (cds->iCds - iCdsNew)); cds->iCds = iCdsNew; cds->nextFrame = 0; } static void addCdsExonBases(struct nibTwoCache* genomeSeqs, struct genePred *genePred, int exonCdsStart, int exonCdsEnd, struct cds* cds, int *chromSize) { struct dnaSeq *dna = getGeneRegion(genomeSeqs, genePred, exonCdsStart, exonCdsEnd, chromSize); int iDna = 0; int iCdsStart = cds->iCds; while (iDna < dna->size) cds->bases[cds->iCds++] = dna->dna[iDna++]; cds->nextFrame = ((cds->nextFrame) + (cds->iCds - iCdsStart)) % 3; dnaSeqFree(&dna); } static void addCdsExon(struct nibTwoCache* genomeSeqs, struct genePred *gp, int exonCdsStart, int exonCdsEnd, int frame, struct cds* cds) /* get CDS from one exon */ { // adjust for frame shift, dropping partial codon if (frame != cds->nextFrame) { removePartialCodon(cds); if (gp->strand[0] == '+') exonCdsStart += frame; else exonCdsEnd -= frame; } if (exonCdsStart < exonCdsEnd) { int chromSize = 0; addCdsExonBases(genomeSeqs, gp, exonCdsStart, exonCdsEnd, cds, &chromSize); } } static char* getCdsCodons(struct genePred *gp, struct nibTwoCache* genomeSeqs) /* get the CDS sequence, dropping partial codons */ { struct cds cds; cds.bases = needMem(genePredCdsSize(gp) + 1); cds.iCds = 0; cds.nextFrame = 0; // walk in direction of transcription int dir = (gp->strand[0] == '+') ? 1 : -1; int iExon = (dir > 0) ? 0 : gp->exonCount-1; int iStop = (dir > 0) ? gp->exonCount : -1; int exonCdsStart, exonCdsEnd; for (; iExon != iStop; iExon += dir) { if (genePredCdsExon(gp, iExon, &exonCdsStart, &exonCdsEnd)) addCdsExon(genomeSeqs, gp, exonCdsStart, exonCdsEnd, gp->exonFrames[iExon], &cds); } if (cds.nextFrame != 0) removePartialCodon(&cds); assert((strlen(cds.bases) % 3) == 0); // ;-) return cds.bases; } static char translateCodon(boolean isChrM, char* codon, bool lastCodon, unsigned options) /* translate the first three bases starting at codon, handling weird * biology as requested giving */ { char aa = isChrM ? lookupMitoCodon(codon) : lookupCodon(codon); if (aa == '\0') { // stop, contains `N' or selenocysteine boolean isStopOrSelno = isStopCodon(codon); boolean isRealStop = isReallyStopCodon(codon, !lastCodon); // internal could be selenocysteine if (lastCodon) { if ((options & GENEPRED_TRANSLATE_INCLUDE_STOP) != 0) aa = '*'; else if (!isRealStop) aa = 'X'; // others, \0' will terminate } else if (((options & GENEPRED_TRANSLATE_SELENO) != 0) && isStopOrSelno && !isRealStop) aa = 'U'; else if (isRealStop && ((options & GENEPRED_TRANSLATE_STAR_INFRAME_STOPS) != 0)) aa = '*'; else aa = 'X'; } return aa; } static char* translateCds(char* chrom, char* cds, unsigned options) /* translate the CDS */ { int cdsLen = strlen(cds); char *prot = needMem((cdsLen/3)+1); boolean isChrM = isMito(chrom); int iCds, iProt; for (iCds = 0, iProt = 0; iCds < cdsLen; iCds+=3, iProt++) prot[iProt] = translateCodon(isChrM, cds+iCds, (iCds == cdsLen-3), options); return prot; } void genePredTranslate(struct genePred *gp, struct nibTwoCache* genomeSeqs, unsigned options, char **protRet, char **cdsRet) /* Translate a genePred into a protein. It can also return the CDS part of the * mRNA sequence. If the chrom is chrM, the mitochondrial translation tables are * used. If protRet or cdsRet is NULL, those sequences are not returned. */ { // note: code tests by genePredToProt bool haveFrames = (gp->exonFrames != NULL); if (!haveFrames) genePredAddExonFrames(gp); // assume correct frame if not included char* cds = getCdsCodons(gp, genomeSeqs); char *prot = translateCds(gp->chrom, cds, options); if (protRet != NULL) *protRet = prot; else freeMem(prot); if (cdsRet != NULL) *cdsRet = cds; else freeMem(cds); if (!haveFrames) freez(&gp->exonFrames); } void genePredToCds(struct genePred *gp, struct genbankCds *cds) /* Fill in cds with transcript offsets computed from genePred. */ { /* * Warning: Genbank CDS does't have the ability to represent * partial codons. If we have genePreds created from GFF/GTF, they can have * partial codons, which is indicated in frame. This code does not correctly handle * this case, or frame shifting indels. */ cds->start = cds->end = -1; cds->startComplete = cds->endComplete = cds->complement = FALSE; if (gp->cdsEnd > gp->cdsStart) { int e, off = 0; int qCdsStart = -1, qCdsEnd = -1; for (e = 0; e < gp->exonCount; ++e) { int eCdsStart, eCdsEnd; if (genePredCdsExon(gp, e, &eCdsStart, &eCdsEnd)) { if (qCdsStart < 0) qCdsStart = off + (eCdsStart - gp->exonStarts[e]); qCdsEnd = off + (eCdsEnd - gp->exonStarts[e]); } off += gp->exonEnds[e] - gp->exonStarts[e]; } int qSize = off; if (gp->strand[0] == '-') reverseIntRange(&qCdsStart, &qCdsEnd, qSize); cds->start = qCdsStart; cds->end = qCdsEnd; cds->startComplete = (gp->cdsStartStat != cdsIncomplete); cds->endComplete = (gp->cdsEndStat != cdsIncomplete); } } struct psl *genePredToPsl(struct genePred *gp, int chromSize, int qSize) /* Convert a genePred to psl, assuming perfect concordance between target & query. * If qSize is 0 then the number of aligned bases will be used as qSize. */ { int e = 0, aliSize = 0; for (e = 0; e < gp->exonCount; ++e) aliSize += (gp->exonEnds[e] - gp->exonStarts[e]); struct psl *psl = pslNew(gp->name, qSize ? qSize : aliSize, 0, aliSize, gp->chrom, chromSize, gp->txStart, gp->txEnd, gp->strand, gp->exonCount, 0); // If qSize is greater than aliSize then we assume the extra bases are at the end of // the transcript (poly-A tail). If the alignment is on the '-' strand, then we need // to offset the reversed qStarts by the number of extra bases. int sizeAdjust = (gp->strand[0] == '-' && qSize > aliSize) ? (qSize - aliSize) : 0; int i = -1; for (e = 0; e < gp->exonCount; ++e) { if (e == 0 || gp->exonStarts[e] != gp->exonEnds[e-1]) { i++; psl->blockSizes[i] = (gp->exonEnds[e] - gp->exonStarts[e]); psl->qStarts[i] = i==0 ? 0 + sizeAdjust : psl->qStarts[i-1] + psl->blockSizes[i-1]; psl->tStarts[i] = gp->exonStarts[e]; } else { // Merge "exons" that have a 0-length gap between them to avoid pslCheck failure psl->blockSizes[i] += (gp->exonEnds[e] - gp->exonStarts[e]); } } psl->blockCount = i+1; psl->match = aliSize; psl->tNumInsert = psl->blockCount-1; psl->tBaseInsert = (gp->txEnd - gp->txStart) - aliSize; return psl; }