/* annoFormatVep -- write functional predictions in the same format as Ensembl's
 * Variant Effect Predictor to fileName, interpreting input rows according to config.
 * See http://uswest.ensembl.org/info/docs/variation/vep/vep_formats.html */

/* Copyright (C) 2014 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "annoFormatVep.h"
#include "annoGratorGpVar.h"
#include "annoGratorQuery.h"
#include "annoStreamDbPslPlus.h"
#include "asParse.h"
#include "bigGenePred.h"
#include "dystring.h"
#include "genePred.h"
#include "gpFx.h"
#include "hdb.h"
#include "hgHgvs.h"
#include "htmshell.h"
#include "pgSnp.h"
#include "portable.h"
#include "vcf.h"
#include <time.h>

struct annoFormatVepExtraItem
    // A single input column whose value should be placed in the Extras output column,
    // identified by tag.
    {
    struct annoFormatVepExtraItem *next;
    char *tag;					// Keyword to use in extras column (tag=value;)
    char *description;				// Text description for output header
    int rowIx;					// Offset of column in row from data source
						// (N/A for wig sources)
    boolean isRegulatory;			// TRUE if overlap implies regulatory_region
    boolean isBoolean;				// TRUE if we don't need to print out the value,
						// only whether it is found
    };

struct annoFormatVep;

struct annoFormatVepExtraSource
    // A streamer or grator that supplies at least one value for Extras output column.
    {
    struct annoFormatVepExtraSource *next;
    struct annoStreamer *source;		// streamer or grator: same pointers as below
    struct annoFormatVepExtraItem *items;	// one or more columns of source and their tags

    void (*printExtra)(struct annoFormatVepExtraSource *self,
                       struct annoFormatVep *afVep, struct annoFormatVepExtraItem *extraItem,
		       struct annoRow *extraRows, struct annoRow *gpvRow, boolean *pGotExtra);
    /* Method for printing items from this source; pGotExtra is in/out to keep track of
     * whether any call has actually printed something yet. */
    };

struct annoFormatVepConfig
    // Describe the primary source and grators (whose rows must be delivered in this order)
    // that provide data for VEP output columns.
    {
    struct annoStreamer *variantSource;		// Primary source: variants
    char *variantDescription;			// Description to appear in output header
    struct annoStreamer *gpVarSource;		// annoGratorGpVar makes the core predictions
    char *gpVarDescription;			// Description to appear in output header
    struct annoStreamer *snpSource;		// Latest dbSNP provides IDs of known variants
    char *snpDescription;			// Description to appear in output header
    struct annoFormatVepExtraSource *extraSources;	// Everything else that may be tacked on
    };

enum afVariantDataType
    // What type of primary data (variants) are we integrating with?
    {
    afvdtInvalid,                       // uninitialized/unknown
    afvdtVcf,                           // primary data source is VCF
    afvdtPgSnpTable,                    // primary data source is pgSnp table with bin in .as
    afvdtPgSnpFile                      // primary data source is pgSnp file, no bin
    };

struct annoFormatVep
// Subclass of annoFormatter that writes VEP-equivalent output to a file.
    {
    struct annoFormatter formatter;	// superclass / external interface
    struct annoFormatVepConfig *config;	// Description of input sources and values for Extras col
    char *fileName;			// Output filename
    FILE *f;				// Output file handle
    struct lm *lm;			// localmem for scratch storage
    struct dyString *dyScratch;		// dyString for local temporary use
    struct annoAssembly *assembly;	// Reference assembly (for sequence)
    int lmRowCount;			// counter for periodic localmem cleanup
    int varNameIx;			// Index of name column from variant source, or -1 if N/A
    int varAllelesIx;			// Index of alleles column from variant source, or -1
    int txNameIx;                       // Index of transcript name from (big)genePred/PSL
    int geneNameIx;			// Index of gene name from (big)genePred/PSL+ if included
    int hgvsGIx;			// Index of HGVS g. column from annoGratorGpVar
    int hgvsCNIx;			// Index of HGVS c./n. column from annoGratorGpVar
    int hgvsPIx;			// Index of HGVS p. column from annoGratorGpVar
    int snpNameIx;			// Index of name column from dbSNP source, or -1
    boolean needHeader;			// TRUE if we should print out the header
    enum afVariantDataType variantType; // Are variants VCF or a flavor of pgSnp?
    boolean doHtml;			// TRUE if we should include html tags & make a <table>.
    struct seqWindow *gSeqWin;          // genomic sequence fetcher for HGVS term generation
    boolean hgvsMakeG;                  // Generate genomic (g.) HGVS terms only if this is set
    boolean hgvsBreakDelIns;            // Include deleted sequence (not only ins) e.g. delGGinsAT
    };


INLINE void afVepLineBreak(FILE *f, boolean doHtml)
/* Depending on whether we're printing HTML, print either a newline or <BR>. */
{
if (doHtml)
    fputs("<BR>", f);
fputc('\n', f);
}

INLINE void afVepStartRow(FILE *f, boolean doHtml)
/* If we're printing HTML, print a <TR><TD>. */
{
if (doHtml)
    fputs("<TR><TD>", f);
}

INLINE void afVepNextColumn(FILE *f, boolean doHtml)
/* Depending on whether we're printing HTML, print either a tab or </TD><TD>. */
{
if (doHtml)
    fputs("</TD><TD>", f);
else
    fputc('\t', f);
}

INLINE void afVepEndRow(FILE *f, boolean doHtml)
/* If we're printing HTML, print a </TD></TR>, otherwise, just a newline. */
{
if (doHtml)
    fputs("</TD></TR>\n", f);
else
    fputc('\n', f);
}

INLINE void afVepPuts(char *text, struct annoFormatVep *self)
/* Write text to self->f.  If we're printing HTML, write encoded text, otherwise just write text. */
{
if (self->doHtml)
    htmTextOut(self->f, text);
else
    fputs(text, self->f);
}

static void afVepPrintf(struct annoFormatVep *self, char *format, ...)
/*  Printf to self->f, but if self->doHtml then encode before writing out. */
#ifdef __GNUC__
__attribute__((format(printf, 2, 3)))
#endif
    ;

static void afVepPrintf(struct annoFormatVep *self, char *format, ...)
/*  Printf to self->f, but if self->doHtml then encode before writing out. */
{
va_list args;
va_start(args, format);
if (self->doHtml)
    vaHtmlFprintf(self->f, format, args);
else
    vfprintf(self->f, format, args);
va_end(args);
}

static void afVepPrintHeaderExtraTags(struct annoFormatVep *self, char *bStart, char *bEnd)
/* For each extra column described in config, write out its tag and a brief description.
 * bStart and bEnd are for bold output in HTML, or if not HTML, starting the line with "## ". */
{
struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc;
if (extras == NULL)
    return;
fprintf(self->f, "%sKeys for Extra column items:%s", bStart, bEnd);
afVepLineBreak(self->f, self->doHtml);
for (extraSrc = extras;  extraSrc != NULL;  extraSrc = extraSrc->next)
    {
    struct annoFormatVepExtraItem *extraItem;
    for (extraItem = extraSrc->items;  extraItem != NULL;  extraItem = extraItem->next)
	if (isNotEmpty(extraItem->tag))
	    {
	    fprintf(self->f, "%s%s%s: %s", bStart, extraItem->tag, bEnd, extraItem->description);
	    afVepLineBreak(self->f, self->doHtml);
	    }
    }
}

static void afVepPrintHeaderDate(FILE *f, boolean doHtml)
/* VEP header includes a date formatted like "2012-06-16 16:09:38" */
{
long now = clock1();
struct tm *tm = localtime(&now);
fprintf(f, "## Output produced at %d-%02d-%02d %02d:%02d:%02d",
	1900 + tm->tm_year, 1 + tm->tm_mon, tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec);
afVepLineBreak(f, doHtml);
}

static char *columnLabels[] = { "Uploaded Variation",
				"Location",
				"Allele",
				"Gene",
				"Feature",
				"Feature type",
				"Consequence",
				"Position in cDNA",
				"Position in CDS",
				"Position in protein",
				"Amino acid change",
				"Codon change",
				"Co-located Variation",
				"Extra",
				NULL };

static void afVepPrintColumnLabels(struct annoFormatVep *self)
/* If we're printing HTML, begin a table and use TH for labels; otherwise just
 * print out a tab-sep text line with labels.  VEP doesn't begin this line with a #! */
{
char **pLabel = columnLabels;
if (self->doHtml)
    {
    fputs("<BR><TABLE class='stdTbl'>\n<TR><TH>", self->f);
    fputs(*pLabel++, self->f);
    while (*pLabel != NULL)
	{
	fputs("</TH><TH>", self->f);
	fputs(*pLabel++, self->f);
	}
    fputs("</TH></TR>\n", self->f);
    }
else
    {
    fputs(*pLabel++, self->f);
    while (*pLabel != NULL)
	{
	fputc('\t', self->f);
	fputs(*pLabel++, self->f);
	}
    fputc('\n', self->f);
    }
}

static void afVepPrintHeader(struct annoFormatVep *self, char *db)
/* Print a header that looks almost like a VEP header. */
{
FILE *f = self->f;
boolean doHtml = self->doHtml;
char *bStart = doHtml ? "<B>" : "## ";
char *bEnd = doHtml ? "</B>" : "";
if (!doHtml)
    {
    // Suppress these lines from HTML output -- IMO they're better suited for a file header:
    fputs("## ENSEMBL VARIANT EFFECT PREDICTOR format (UCSC Variant Annotation Integrator)", f);
    afVepLineBreak(f, doHtml);
    afVepPrintHeaderDate(f, doHtml);
    fprintf(f, "## Connected to UCSC database %s", db);
    afVepLineBreak(f, doHtml);
    }
struct annoFormatVepConfig *config = self->config;
fprintf(f, "%sVariants:%s ", bStart, bEnd);
afVepPrintf(self, "%s", config->variantDescription);
if (! strstr(config->variantSource->name, "/trash/"))
    fprintf(f, " (%s)", config->variantSource->name);
afVepLineBreak(f, doHtml);
fprintf(f, "%sTranscripts:%s %s (%s)", bStart, bEnd,
	config->gpVarDescription, config->gpVarSource->name);
afVepLineBreak(f, doHtml);
if (config->snpSource != NULL)
    {
    fprintf(f, "%sdbSNP:%s %s (%s)", bStart, bEnd,
	    config->snpDescription, config->snpSource->name);
    afVepLineBreak(f, doHtml);
    }
afVepPrintHeaderExtraTags(self, bStart, bEnd);
afVepPrintColumnLabels(self);
self->needHeader = FALSE;
}

static void afVepInitialize(struct annoFormatter *fSelf, struct annoStreamer *primarySource,
			    struct annoStreamer *integrators)
/* Print header, regardless of whether we get any data after this. */
{
struct annoFormatVep *self = (struct annoFormatVep *)fSelf;
if (self->needHeader)
    afVepPrintHeader(self, primarySource->assembly->name);
}

static void afVepComment(struct annoFormatter *fSelf, char *content)
/* Print out a comment, either starting with "# " or as a warnBox depending on doHtml. */
{
if (strchr(content, '\n'))
    errAbort("afVepComment: no multi-line input");
struct annoFormatVep *self = (struct annoFormatVep *)fSelf;
if (self->doHtml)
    warn("%s", content);
else
    fprintf(self->f, "# %s\n", content);
}

static void compressDashes(char *string)
/* If string has a run of '-' characters, turn it into single '-'. */
{
char *p = string;
while ((p = strchr(p, '-')) != NULL)
    {
    char *end = p+1;
    while (*end == '-')
	end++;
    if (end > p+1)
	memmove(p+1, end, strlen(end)+1);
    p++;
    }
}

static char *afVepGetAlleles(struct annoFormatVep *self, struct annoRow *varRow)
/* Return a string with slash-separated alleles.  Result is alloc'd by self->lm. */
{
char **varWords = (char **)(varRow->data);
char *alleles = NULL;
if (self->variantType == afvdtVcf)
    alleles = lmCloneString(self->lm, vcfGetSlashSepAllelesFromWords(varWords, self->dyScratch));
else if (self->varAllelesIx >= 0)
    alleles = lmCloneString(self->lm, varWords[self->varAllelesIx]);
else
    errAbort("annoFormatVep: afVepSetConfig didn't specify how to get alleles");
compressDashes(alleles);
return alleles;
}

static void afVepPrintNameAndLoc(struct annoFormatVep *self, struct annoRow *varRow)
/* Print variant name and position in genome. */
{
char **varWords = (char **)(varRow->data);
uint start1Based = varRow->start + 1;
// Use variant name if available, otherwise construct an identifier:
if (self->varNameIx >= 0 && !sameString(varWords[self->varNameIx], "."))
    afVepPuts(varWords[self->varNameIx], self);
else
    {
    char *alleles = afVepGetAlleles(self, varRow);
    afVepPrintf(self, "%s_%u_%s", varRow->chrom, start1Based, alleles);
    }
afVepNextColumn(self->f, self->doHtml);
// Location is chr:start for single-base, chr:start-end for indels:
if (varRow->end == start1Based)
    afVepPrintf(self, "%s:%u", varRow->chrom, start1Based);
else if (start1Based > varRow->end)
    afVepPrintf(self, "%s:%u-%u", varRow->chrom, varRow->end, start1Based);
else
    afVepPrintf(self, "%s:%u-%u", varRow->chrom, start1Based, varRow->end);
afVepNextColumn(self->f, self->doHtml);
}

INLINE void afVepPrintPlaceholders(FILE *f, int count, boolean doHtml)
/* VEP uses "-" for N/A.  Sometimes there are several consecutive N/A columns.
 * Count = 0 means print "-" with no tab. Count > 0 means print that many "-" columns. */
{
if (count == 0)
    fputc('-', f);
else
    {
    int i;
    for (i = 0;  i < count;  i++)
	{
	fputc('-', f);
	afVepNextColumn(f, doHtml);
	}
    }
}

#define afVepPrintPlaceholder(f, doHtml) afVepPrintPlaceholders(f, 1, doHtml)

INLINE char *placeholderForEmpty(char *val)
/* Return VEP placeholder "-" if val is empty, otherwise return val. */
{
return (isEmpty(val) ? "-" : val);
}


static void afVepPrintGene(struct annoFormatVep *self, struct annoRow *gpvRow)
/* If the genePred portion of gpvRow contains a gene name (in addition to transcript name),
 * print it out; otherwise print a placeholder. */
{
if (self->geneNameIx >= 0)
    {
    char **words = (char **)(gpvRow->data);
    afVepPuts(placeholderForEmpty(words[self->geneNameIx]), self);
    afVepNextColumn(self->f, self->doHtml);
    }
else
    afVepPrintPlaceholder(self->f, self->doHtml);
}

static void limitLength(char *seq, int baseCount, char *unit)
/* If seq is longer than an abbreviated version of itself, change it to the abbreviated version. */
{
if (isEmpty(seq))
    return;
int len = strlen(seq);
const int elipsLen = 3;
char lengthNote[512];
safef(lengthNote, sizeof(lengthNote), "(%d %s)", len, unit);
if (len > 2*baseCount + elipsLen + strlen(lengthNote))
    {
    // First baseCount bases, then "...":
    int offset = baseCount;
    safecpy(seq+offset, len+1-offset, "...");
    offset += elipsLen;
    // then last baseCount bases:
    safecpy(seq+offset, len+1-offset, seq+len-baseCount);
    offset += baseCount;
    // then lengthNote:
    safecpy(seq+offset, len+1-offset, lengthNote);
    }
}

static void tweakStopCodonAndLimitLength(char *aaSeq, char *codonSeq)
/* If aa from gpFx has a stop 'Z', replace it with "*".
 * If the strings are very long, truncate with a note about how long they are. */
{
char *earlyStop = strchr(aaSeq, 'Z');
if (earlyStop)
    {
    earlyStop[0] = '*';
    earlyStop[1] = '\0';
    }
limitLength(aaSeq, 5, "aa");
limitLength(codonSeq, 12, "nt");
}

INLINE char *dashForEmpty(char *s)
/* Represent empty alleles in insertions/deletions by the customary "-". */
{
if (isEmpty(s))
    return "-";
else
    return s;
}

static void afVepPrintPredictions(struct annoFormatVep *self, struct annoRow *varRow,
				  struct annoRow *gpvRow, struct gpFx *gpFx)
/* Print VEP columns computed by annoGratorGpVar (or placeholders) */
{
// variant allele used to calculate the consequence (or first alternate allele)
char *abbrevAllele = cloneString(gpFx->gAllele);
limitLength(abbrevAllele, 24, "nt");
afVepPuts(placeholderForEmpty(abbrevAllele), self);
afVepNextColumn(self->f, self->doHtml);
// ID of affected gene
afVepPrintGene(self, gpvRow);
// ID of feature
afVepPrintf(self, "%s", placeholderForEmpty(gpFx->transcript));
afVepNextColumn(self->f, self->doHtml);
// type of feature {Transcript, RegulatoryFeature, MotifFeature}
if (gpFx->soNumber == intergenic_variant)
    afVepPrintPlaceholder(self->f, self->doHtml);
else
    {
    fputs("Transcript", self->f);
    afVepNextColumn(self->f, self->doHtml);
    }
// consequence: SO term e.g. splice_region_variant
afVepPuts(soTermToString(gpFx->soNumber), self);
afVepNextColumn(self->f, self->doHtml);
if (gpFx->detailType == codingChange)
    {
    struct codingChange *change = &(gpFx->details.codingChange);
    uint refLen = strlen(change->txRef), altLen = strlen(change->txAlt);
    boolean isInsertion = (refLen == 0);
    boolean isDeletion = altLen < refLen;
    if (isInsertion)
	{
	fprintf(self->f, "%u-%u", change->cDnaPosition, change->cDnaPosition+1);
	afVepNextColumn(self->f, self->doHtml);
	fprintf(self->f, "%u-%u", change->cdsPosition, change->cdsPosition+1);
	}
    else if (isDeletion && refLen > 1)
	{
	fprintf(self->f, "%u-%u", change->cDnaPosition+1, change->cDnaPosition+refLen);
	afVepNextColumn(self->f, self->doHtml);
	fprintf(self->f, "%u-%u", change->cdsPosition+1, change->cdsPosition+refLen);
	}
    else
	{
	fprintf(self->f, "%u", change->cDnaPosition+1);
	afVepNextColumn(self->f, self->doHtml);
	fprintf(self->f, "%u", change->cdsPosition+1);
	}
    afVepNextColumn(self->f, self->doHtml);
    fprintf(self->f, "%u", change->pepPosition+1);
    afVepNextColumn(self->f, self->doHtml);
    int variantFrame = change->cdsPosition % 3;
    strLower(change->codonOld);
    int upLen = min(strlen(change->codonOld+variantFrame), refLen);
    toUpperN(change->codonOld+variantFrame, upLen);
    strLower(change->codonNew);
    int alleleLength = altLen;
    // watch out for symbolic alleles [actually now that we're using txAlt we shouldn't see these]:
    if (sameString(change->txAlt, "<X>") || sameString(change->txAlt, "<*>"))
        alleleLength = upLen;
    else if (startsWith("<", change->txAlt))
        alleleLength = 0;
    toUpperN(change->codonNew+variantFrame, alleleLength);
    tweakStopCodonAndLimitLength(change->aaOld, change->codonOld);
    tweakStopCodonAndLimitLength(change->aaNew, change->codonNew);
    fprintf(self->f, "%s/%s", dashForEmpty(change->aaOld), dashForEmpty(change->aaNew));
    afVepNextColumn(self->f, self->doHtml);
    fprintf(self->f, "%s/%s", dashForEmpty(change->codonOld), dashForEmpty(change->codonNew));
    afVepNextColumn(self->f, self->doHtml);
    }
else if (gpFx->detailType == nonCodingExon)
    {
    struct nonCodingExon *change = &(gpFx->details.nonCodingExon);
    uint refLen = strlen(change->txRef), altLen = strlen(change->txAlt);
    boolean isInsertion = (refLen == 0);
    boolean isDeletion = altLen < refLen;
    int cDnaPosition = change->cDnaPosition;
    if (isInsertion)
	fprintf(self->f, "%u-%u", cDnaPosition, cDnaPosition+1);
    else if (isDeletion)
	fprintf(self->f, "%u-%u", cDnaPosition+1, cDnaPosition+refLen);
    else
	fprintf(self->f, "%u", cDnaPosition+1);
    afVepNextColumn(self->f, self->doHtml);
    // Coding effect columns (except for cDnaPosition) are N/A:
    afVepPrintPlaceholders(self->f, 4, self->doHtml);
    }
else
    // Coding effect columns are N/A:
    afVepPrintPlaceholders(self->f, 5, self->doHtml);
freez(&abbrevAllele);
}

static void afVepPrintExistingVar(struct annoFormatVep *self, struct annoRow *varRow,
				  struct annoStreamRows gratorData[], int gratorCount)
/* Print existing variant ID (or placeholder) */
{
if (self->snpNameIx >= 0)
    {
    if (gratorCount < 2 || gratorData[1].streamer != self->config->snpSource)
	errAbort("annoFormatVep: config error, snpSource is not where expected");
    struct annoRow *snpRows = gratorData[1].rowList, *row;
    if (snpRows != NULL)
	{
	int count = 0;
	for (row = snpRows;  row != NULL;  row = row->next)
	    {
	    char **snpWords = (char **)(row->data);
	    if (row->start == varRow->start && row->end == varRow->end)
		{
		if (count > 0)
		    fputc(',', self->f);
		afVepPrintf(self, "%s", snpWords[self->snpNameIx]);
		count++;
		}
	    }
	if (count == 0)
	    afVepPrintPlaceholder(self->f, self->doHtml);
	else
	    afVepNextColumn(self->f, self->doHtml);
	}
    else
	afVepPrintPlaceholder(self->f, self->doHtml);
    }
else
    afVepPrintPlaceholder(self->f, self->doHtml);
}

static boolean isCodingSnv(struct annoRow *primaryRow, struct gpFx *gpFx)
/* Return TRUE if this is a single-nucleotide non-synonymous change. */
{
if (gpFx == NULL ||
    gpFx->detailType != codingChange ||
    gpFx->soNumber == synonymous_variant)
    return FALSE;
char *txRef = gpFx->details.codingChange.txRef;
char *txAlt = gpFx->details.codingChange.txAlt;
if (txRef == NULL || txAlt == NULL ||
    strlen(txRef) != 1 || sameString(txRef, "-") ||
    strlen(txAlt) != 1 || sameString(txAlt, "-"))
    return FALSE;
return TRUE;
}

static int commaSepFindIx(char *item, char *s)
/* Treating comma-separated, non-NULL s as an array of words,
 * return the index of item in the array, or -1 if item is not in array. */
{
int itemLen = strlen(item);
int ix = 0;
char *p = strchr(s, ',');
while (p != NULL)
    {
    int elLen = (p - s);
    if (elLen == itemLen && strncmp(s, item, itemLen) == 0)
	return ix;
    s = p+1;
    p = strchr(s, ',');
    ix++;
    }
if (strlen(s) == itemLen && strncmp(s, item, itemLen) == 0)
    return ix;
return -1;
}

static int commaSepFindIntIx(int item, char *s)
/* Treating comma-separated, non-NULL s as an array of words that encode integers,
 * return the index of item in the array, or -1 if item is not in array. */
{
char itemString[64];
safef(itemString, sizeof(itemString), "%d", item);
return commaSepFindIx(itemString, s);
}

static char *commaSepWordFromIxOrWhole(int ix, char *s, struct lm *lm)
/* Treating comma-separated, non-NULL s as an array of words,
 * return the word at ix in the array -- but if s has no commas,
 * just return (lmCloned) s.  errAborts if s is comma-sep but ix is out of bounds. */
{
int i = 0;
char *p = strchr(s, ',');
if (p == NULL)
    return lmCloneString(lm, s);
while (p != NULL)
    {
    if (i == ix)
	return lmCloneStringZ(lm, s, p-s);
    s = p+1;
    p = strchr(s, ',');
    i++;
    }
if (i == ix)
    return lmCloneString(lm, s);
errAbort("commaSepWordFromIxOrWhole: Bad index %d for string '%s'", ix, s);
return NULL;
}

INLINE void afVepNewExtra(struct annoFormatVep *self, boolean *pGotExtra)
/* If we already printed an extra item, print the extra column's separator; set pGotExtra. */
{
assert(pGotExtra);
if (*pGotExtra)
    {
    if (self->doHtml)
	fputs("; ", self->f);
    else
	fputc(';', self->f);
    }
*pGotExtra = TRUE;
}

static void afVepPrintDbNsfpSift(struct annoFormatVep *self,
				 struct annoFormatVepExtraSource *extraSrc,
				 struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId,
				 boolean *pGotExtra)
/* Match the allele from gpFx to the per-allele scores in row from dbNsfpSift. */
{
// Look up column indices only once:
static int ensTxIdIx=-1, altAl1Ix, score1Ix, altAl2Ix, score2Ix, altAl3Ix, score3Ix;
if (ensTxIdIx == -1)
    {
    struct asColumn *columns = extraSrc->source->asObj->columnList;
    ensTxIdIx = asColumnMustFindIx(columns, "ensTxId");
    altAl1Ix = asColumnMustFindIx(columns, "altAl1");
    score1Ix = asColumnMustFindIx(columns, "score1");
    altAl2Ix = asColumnMustFindIx(columns, "altAl2");
    score2Ix = asColumnMustFindIx(columns, "score2");
    altAl3Ix = asColumnMustFindIx(columns, "altAl3");
    score3Ix = asColumnMustFindIx(columns, "score3");
    }

struct annoRow *row;
for (row = extraRows;  row != NULL;  row = row->next)
    {
    // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx
    // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame)
    char **words = row->data;
    if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") &&
	commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0)
	continue;
    struct annoFormatVepExtraItem *extraItem = extraSrc->items;
    char *scoreStr = NULL;
    if (sameString(gpFx->gAllele, words[altAl1Ix]))
	scoreStr = words[score1Ix];
    else if (sameString(gpFx->gAllele, words[altAl2Ix]))
	scoreStr = words[score2Ix];
    else if (sameString(gpFx->gAllele, words[altAl3Ix]))
	scoreStr = words[score3Ix];
    if (isNotEmpty(scoreStr) && differentString(scoreStr, "."))
	{
        double score = atof(scoreStr);
        char prediction = '?';
        if (score < 0.05)
            prediction = 'D';
        else
            prediction = 'T';
	afVepNewExtra(self, pGotExtra);
	fprintf(self->f, "%s=%c(%s)", extraItem->tag, prediction, scoreStr);
	}
    }
}

static void afVepPrintDbNsfpPolyPhen2(struct annoFormatVep *self,
				      struct annoFormatVepExtraSource *extraSrc,
				      struct annoRow *extraRows, struct gpFx *gpFx,
				      boolean *pGotExtra)
/* Match the allele from gpFx to the per-allele scores in each row from dbNsfpPolyPhen2. */
{
// Look up column indices only once:
static int aaPosIx=-1,
    altAl1Ix, hDivScore1Ix, hDivPred1Ix, hVarScore1Ix, hVarPred1Ix,
    altAl2Ix, hDivScore2Ix, hDivPred2Ix, hVarScore2Ix, hVarPred2Ix,
    altAl3Ix, hDivScore3Ix, hDivPred3Ix, hVarScore3Ix, hVarPred3Ix;
if (aaPosIx == -1)
    {
    struct asColumn *columns = extraSrc->source->asObj->columnList;
    aaPosIx = asColumnMustFindIx(columns, "uniProtAaPos");
    altAl1Ix = asColumnMustFindIx(columns, "altAl1");
    hDivScore1Ix = asColumnMustFindIx(columns, "hDivScore1");
    hDivPred1Ix = asColumnMustFindIx(columns, "hDivPred1");
    hVarScore1Ix = asColumnMustFindIx(columns, "hVarScore1");
    hVarPred1Ix = asColumnMustFindIx(columns, "hVarPred1");
    altAl2Ix = asColumnMustFindIx(columns, "altAl2");
    hDivScore2Ix = asColumnMustFindIx(columns, "hDivScore2");
    hDivPred2Ix = asColumnMustFindIx(columns, "hDivPred2");
    hVarScore2Ix = asColumnMustFindIx(columns, "hVarScore2");
    hVarPred2Ix = asColumnMustFindIx(columns, "hVarPred2");
    altAl3Ix = asColumnMustFindIx(columns, "altAl3");
    hDivScore3Ix = asColumnMustFindIx(columns, "hDivScore3");
    hDivPred3Ix = asColumnMustFindIx(columns, "hDivPred3");
    hVarScore3Ix = asColumnMustFindIx(columns, "hVarScore3");
    hVarPred3Ix = asColumnMustFindIx(columns, "hVarPred3");
    }

struct codingChange *cc = &(gpFx->details.codingChange);
int count = 0;
struct annoRow *row;
for (row = extraRows;  row != NULL;  row = row->next)
    {
    char **words = row->data;
//#*** polyphen2 can actually have multiple scores/preds for the same pepPosition...
//#*** so what we really should do is loop on comma-sep words[aaPosIx] and print scores/preds
//#*** whenever pepPosition matches.
    int txIx = commaSepFindIntIx(cc->pepPosition+1, words[aaPosIx]);
    if (txIx < 0)
	continue;
    struct annoFormatVepExtraItem *extraItem = extraSrc->items;
    for (;  extraItem != NULL;  extraItem = extraItem->next)
	{
	boolean isHdiv = (stringIn("HDIV", extraItem->tag) != NULL);
	int predIx = -1, scoreIx = -1;
	if (sameString(gpFx->gAllele, words[altAl1Ix]))
	    {
	    predIx = isHdiv ? hDivPred1Ix : hVarPred1Ix;
	    scoreIx = isHdiv ? hDivScore1Ix : hVarScore1Ix;
	    }
	else if (sameString(gpFx->gAllele, words[altAl2Ix]))
	    {
	    predIx = isHdiv ? hDivPred2Ix : hVarPred2Ix;
	    scoreIx = isHdiv ? hDivScore2Ix : hVarScore2Ix;
	    }
	else if (sameString(gpFx->gAllele, words[altAl3Ix]))
	    {
	    predIx = isHdiv ? hDivPred3Ix : hVarPred3Ix;
	    scoreIx = isHdiv ? hDivScore3Ix : hVarScore3Ix;
	    }
	char *pred = (predIx < 0) ? NULL : words[predIx];
	if (pred == NULL || sameString(pred, "."))
	    continue;
	pred = commaSepWordFromIxOrWhole(txIx, pred, self->lm);
	if (isNotEmpty(pred))
	    {
	    if (count == 0)
		{
		afVepNewExtra(self, pGotExtra);
		fprintf(self->f, "%s=", extraItem->tag);
		}
	    else
		fputc(',', self->f);
	    char *score = (scoreIx < 0) ? "?" : commaSepWordFromIxOrWhole(txIx, words[scoreIx], self->lm);
	    fprintf(self->f, "%s(%s)", pred, score);
	    count++;
	    }
	}
    }
}

static void afVepPrintDbNsfpMutationTA(struct annoFormatVep *self,
				       struct annoFormatVepExtraSource *extraSrc,
				       struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId,
				       boolean *pGotExtra)
/* Match the allele from gpFx to the per-allele scores in row from dbNsfpMutationTaster
 * or dbNsfpMutationAssessor -- they have identical column indices. */
{
// Look up column indices only once:
static int ensTxIdIx=-1, altAl1Ix, score1Ix, pred1Ix,
    altAl2Ix, score2Ix, pred2Ix, altAl3Ix, score3Ix, pred3Ix;
if (ensTxIdIx == -1)
    {
    struct asColumn *columns = extraSrc->source->asObj->columnList;
    ensTxIdIx = asColumnMustFindIx(columns, "ensTxId");
    altAl1Ix = asColumnMustFindIx(columns, "altAl1");
    score1Ix = asColumnMustFindIx(columns, "score1");
    pred1Ix = asColumnMustFindIx(columns, "pred1");
    altAl2Ix = asColumnMustFindIx(columns, "altAl2");
    score2Ix = asColumnMustFindIx(columns, "score2");
    pred2Ix = asColumnMustFindIx(columns, "pred2");
    altAl3Ix = asColumnMustFindIx(columns, "altAl3");
    score3Ix = asColumnMustFindIx(columns, "score3");
    pred3Ix = asColumnMustFindIx(columns, "pred3");
    }

struct annoRow *row;
for (row = extraRows;  row != NULL;  row = row->next)
    {
    // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx
    // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame)
    char **words = row->data;
    if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") &&
	commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0)
	continue;
    struct annoFormatVepExtraItem *extraItem = extraSrc->items;
    char *score = NULL, *pred = NULL;
    if (sameString(gpFx->gAllele, words[altAl1Ix]))
	{
	score = words[score1Ix];
	pred = words[pred1Ix];
	}
    else if (sameString(gpFx->gAllele, words[altAl2Ix]))
	{
	score = words[score2Ix];
	pred = words[pred2Ix];
	}
    else if (sameString(gpFx->gAllele, words[altAl3Ix]))
	{
	score = words[score3Ix];
	pred = words[pred3Ix];
	}
    if (isNotEmpty(score) && differentString(score, "."))
	{
	afVepNewExtra(self, pGotExtra);
	if (isEmpty(pred))
	    pred = "?";
	fprintf(self->f, "%s=%s(%s)", extraItem->tag, pred, score);
	}
    }
}

static void afVepPrintDbNsfpLrt(struct annoFormatVep *self,
				struct annoFormatVepExtraSource *extraSrc,
				struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId,
				boolean *pGotExtra)
/* Match the allele from gpFx to the per-allele scores in row from dbNsfpLrt --
 * it also has omega{1,2,3} columns, but I'm not sure what those mean so am leaving out for now. */
{
// Look up column indices only once:
static int ensTxIdIx=-1, altAl1Ix, score1Ix, pred1Ix,
    altAl2Ix, score2Ix, pred2Ix, altAl3Ix, score3Ix, pred3Ix;
if (ensTxIdIx == -1)
    {
    struct asColumn *columns = extraSrc->source->asObj->columnList;
    ensTxIdIx = asColumnMustFindIx(columns, "ensTxId");
    altAl1Ix = asColumnMustFindIx(columns, "altAl1");
    score1Ix = asColumnMustFindIx(columns, "score1");
    pred1Ix = asColumnMustFindIx(columns, "pred1");
    altAl2Ix = asColumnMustFindIx(columns, "altAl2");
    score2Ix = asColumnMustFindIx(columns, "score2");
    pred2Ix = asColumnMustFindIx(columns, "pred2");
    altAl3Ix = asColumnMustFindIx(columns, "altAl3");
    score3Ix = asColumnMustFindIx(columns, "score3");
    pred3Ix = asColumnMustFindIx(columns, "pred3");
    }

struct annoRow *row;
for (row = extraRows;  row != NULL;  row = row->next)
    {
    // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx
    // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame)
    char **words = row->data;
    if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") &&
	commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0)
	continue;
    struct annoFormatVepExtraItem *extraItem = extraSrc->items;
    char *score = NULL, *pred = NULL;
    if (sameString(gpFx->gAllele, words[altAl1Ix]))
	{
	score = words[score1Ix];
	pred = words[pred1Ix];
	}
    else if (sameString(gpFx->gAllele, words[altAl2Ix]))
	{
	score = words[score2Ix];
	pred = words[pred2Ix];
	}
    else if (sameString(gpFx->gAllele, words[altAl3Ix]))
	{
	score = words[score3Ix];
	pred = words[pred3Ix];
	}
    if (isNotEmpty(score) && differentString(score, "."))
	{
	afVepNewExtra(self, pGotExtra);
	if (isEmpty(pred))
	    pred = "?";
	fprintf(self->f, "%s=%s(%s)", extraItem->tag, pred, score);
	}
    }
}

static void afVepPrintDbNsfpVest(struct annoFormatVep *self,
                                 struct annoFormatVepExtraSource *extraSrc,
                                 struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId,
                                 boolean *pGotExtra)
/* Match the allele from gpFx to the per-allele scores in row from dbNsfpVest */
{
// TODO: also compare var[123] protein change columns to gpFx to make sure we have the right one
// Look up column indices only once:
static int ensTxIdIx=-1, altAl1Ix, score1Ix, altAl2Ix, score2Ix, altAl3Ix, score3Ix;
if (ensTxIdIx == -1)
    {
    struct asColumn *columns = extraSrc->source->asObj->columnList;
    ensTxIdIx = asColumnMustFindIx(columns, "ensTxId");
    altAl1Ix = asColumnMustFindIx(columns, "altAl1");
    score1Ix = asColumnMustFindIx(columns, "score1");
    altAl2Ix = asColumnMustFindIx(columns, "altAl2");
    score2Ix = asColumnMustFindIx(columns, "score2");
    altAl3Ix = asColumnMustFindIx(columns, "altAl3");
    score3Ix = asColumnMustFindIx(columns, "score3");
    }

struct annoRow *row;
for (row = extraRows;  row != NULL;  row = row->next)
    {
    // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx
    char **words = row->data;
    if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") &&
	commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0)
	continue;
    struct annoFormatVepExtraItem *extraItem = extraSrc->items;
    char *score = NULL;
    if (sameString(gpFx->gAllele, words[altAl1Ix]))
	score = words[score1Ix];
    else if (sameString(gpFx->gAllele, words[altAl2Ix]))
	score = words[score2Ix];
    else if (sameString(gpFx->gAllele, words[altAl3Ix]))
	score = words[score3Ix];
    if (isNotEmpty(score) && differentString(score, "."))
	{
	afVepNewExtra(self, pGotExtra);
	fprintf(self->f, "%s=%s", extraItem->tag, score);
	}
    }
}

static void afVepPrintDbNsfpInterPro(struct annoFormatVep *self,
				     struct annoFormatVepExtraSource *extraSrc,
				     struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId,
				     boolean *pGotExtra)
/* Print out any overlapping (comma-sep list, HTML-encoded ',' and '=') InterPro domains. */
{
// Look up column indices only once:
static int domainsIx = -1;
if (domainsIx == -1)
    {
    struct asColumn *columns = extraSrc->source->asObj->columnList;
    domainsIx = asColumnMustFindIx(columns, "domains");
    }

struct annoRow *row;
for (row = extraRows;  row != NULL;  row = row->next)
    {
    char **words = row->data;
    struct annoFormatVepExtraItem *extraItem = extraSrc->items;
    char *domains = words[domainsIx];
    int lastC = strlen(domains) - 1;
    if (domains[lastC] == ',')
	domains[lastC] = '\0';
    if (isNotEmpty(domains) && differentString(domains, "."))
	{
	afVepNewExtra(self, pGotExtra);
	fprintf(self->f, "%s=%s", extraItem->tag, domains);
	}
    }
}

static boolean allelesAgree(char altNt, char altAa, char **words)
/* Return TRUE if dbNsfpSeqChange words have altAa associated with altNt. */
{
//#*** TODO: handle stop codon representation
if ((altNt == words[11][0] && altAa == words[12][0]) ||
    (altNt == words[13][0] && altAa == words[14][0]) ||
    (altNt == words[15][0] && altAa == words[16][0]))
    return TRUE;
return FALSE;
}

static struct annoRow *getRowsFromSource(struct annoStreamer *src,
					 struct annoStreamRows gratorData[], int gratorCount)
/* Search gratorData for src, and return its rows when found. */
{
int i;
for (i = 0;  i < gratorCount;  i++)
    {
    if (gratorData[i].streamer == src)
	return gratorData[i].rowList;
    }
errAbort("annoFormatVep: Can't find source %s in gratorData", src->name);
return NULL;
}

static char *getDbNsfpEnsTx(struct annoFormatVep *self, struct gpFx *gpFx,
			    struct annoStreamRows *gratorData, int gratorCount)
/* Find the Ensembl transcript ID, if any, for which dbNsfp has results consistent
 * with gpFx. */
{
int i;
for (i = 0;  i < gratorCount;  i++)
    {
    struct annoStreamer *source = gratorData[i].streamer;
    if (!sameString(source->asObj->name, "dbNsfpSeqChange")) //#*** need metadata!
	continue;
    struct codingChange *cc = &(gpFx->details.codingChange);
    struct annoRow *extraRows = getRowsFromSource(source, gratorData, gratorCount);
    struct annoRow *row;
    for (row = extraRows;  row != NULL;  row = row->next)
	{
	char **words = row->data;
	if (strcasecmp(cc->codonOld, words[7]) != 0)
	    continue;
	if (!allelesAgree(gpFx->gAllele[0], cc->aaNew[0], words))
	    continue;
	int txIx = commaSepFindIntIx(cc->pepPosition+1, words[10]);
	if (txIx >= 0)
	    {
	    if (sameString(words[4], "."))
		return ".";
	    char *ensTxId = commaSepWordFromIxOrWhole(txIx, words[4], self->lm);
	    return ensTxId;
	    }
	}
    break;
    }
return NULL;
}

static void afVepPrintExtrasDbNsfp(struct annoFormatVep *self, struct annoRow *varRow,
				   struct annoRow *gpvRow, struct gpFx *gpFx,
				   struct annoStreamRows gratorData[], int gratorCount,
				   boolean *pGotExtra)
/* Print the Extra column's tag=value; components from dbNSFP data, if we have any. */
{
// dbNSFP has data only for coding non-synonymous single-nucleotide changes:
if (!isCodingSnv(varRow, gpFx))
    return;
// Does dbNsfpSeqChange have a coding change consistent with gpFx?:
char *ensTxId = getDbNsfpEnsTx(self, gpFx, gratorData, gratorCount);
// Now cycle through selected dbNsfp* sources, printing out scores for ensTxId:
struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc;
for (extraSrc = extras;  extraSrc != NULL;  extraSrc = extraSrc->next)
    {
    char *asObjName = extraSrc->source->asObj->name;
    if (!startsWith("dbNsfp", asObjName))
	continue;
    struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount);
    if (sameString(asObjName, "dbNsfpPolyPhen2"))
	{
	// PolyPhen2 is based on UniProt proteins, not GENCODE/Ensembl transcripts,
	// so ensTxId doesn't apply.
	afVepPrintDbNsfpPolyPhen2(self, extraSrc, extraRows, gpFx, pGotExtra);
	if (ensTxId == NULL)
	    break; // all done now, no need to keep looking
	}
    else if (ensTxId != NULL)
	{
	if (sameString(asObjName, "dbNsfpSift"))
	    afVepPrintDbNsfpSift(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra);
	else if (sameString(asObjName, "dbNsfpMutationTaster") ||
		 sameString(asObjName, "dbNsfpMutationAssessor"))
	    afVepPrintDbNsfpMutationTA(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra);
	else if (sameString(asObjName, "dbNsfpLrt"))
	    afVepPrintDbNsfpLrt(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra);
        else if (sameString(asObjName, "dbNsfpVest"))
	    afVepPrintDbNsfpVest(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra);
	else if (sameString(asObjName, "dbNsfpInterPro"))
	    afVepPrintDbNsfpInterPro(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra);
	else if (!sameString(asObjName, "dbNsfpSeqChange"))
	    errAbort("Unrecognized asObj->name '%s' from dbNSFP source '%s'",
		     asObjName, extraSrc->source->name);
	}
    }
}

static void afVepPrintExtrasHgvs(struct annoFormatVep *self, struct annoRow *gpvRow,
                                 boolean *pGotExtra)
/* If not empty, print HGVS terms (last column of gpvRow). */
{
char **row = gpvRow->data;
if (isNotEmpty(row[self->hgvsGIx]))
    {
    afVepNewExtra(self, pGotExtra);
    fprintf(self->f, "HGVSG=%s", row[self->hgvsGIx]);
    }
if (isNotEmpty(row[self->hgvsCNIx]))
    {
    afVepNewExtra(self, pGotExtra);
    fprintf(self->f, "HGVSCN=%s", row[self->hgvsCNIx]);
    }
if (isNotEmpty(row[self->hgvsPIx]))
    {
    afVepNewExtra(self, pGotExtra);
    fprintf(self->f, "HGVSP=%s", row[self->hgvsPIx]);
    }
}

static void afVepPrintExtraWigVec(struct annoFormatVepExtraSource *extraSrc,
                                  struct annoFormatVep *self,
                                  struct annoFormatVepExtraItem *extraItem,
                                  struct annoRow *extraRows, struct annoRow *gpvRow,
                                  boolean *pGotExtra)
/* Print values from a wig source with type arWigVec (per-base floats) */
{
int i;
struct annoRow *row;
for (i = 0, row = extraRows;  row != NULL;  i++, row = row->next)
    {
    float *vector = row->data;
    int len = row->end - row->start;
    int j;
    for (j = 0;  j < len;  j++)
	{
	if (i+j == 0)
	    {
	    afVepNewExtra(self, pGotExtra);
	    fprintf(self->f, "%s=", extraItem->tag);
	    }
	else
	    fputc(',', self->f);
	fprintf(self->f, "%g", vector[j]);
	}
    }
}

static void afVepPrintExtraWigSingle(struct annoFormatVepExtraSource *extraSrc,
                                     struct annoFormatVep *self,
                                     struct annoFormatVepExtraItem *extraItem,
                                     struct annoRow *extraRows, struct annoRow *gpvRow,
                                     boolean *pGotExtra)
/* Print values from a wig source with type arWigSingle (one double per row, e.g. an average) */
{
struct annoRow *row;
for (row = extraRows;  row != NULL;  row = row->next)
    {
    double *pValue = row->data;
    if (row == extraRows)
        {
        afVepNewExtra(self, pGotExtra);
        fprintf(self->f, "%s=", extraItem->tag);
        }
    else
        fputc(',', self->f);
    fprintf(self->f, "%g", *pValue);
    }
}

static void afVepPrintExtraWords(struct annoFormatVepExtraSource *extraSrc,
                                 struct annoFormatVep *self,
				 struct annoFormatVepExtraItem *extraItem,
				 struct annoRow *extraRows, struct annoRow *gpvRow,
                                 boolean *pGotExtra)
/* Print comma-separated values in the specified column from the usual array-of-words. */
{
if (extraItem->rowIx < 0)
    errAbort("annoFormatVep: invalid rowIx for tag %s", extraItem->tag);
char **gpvWords = gpvRow ? gpvRow->data : NULL;
char *gpvTranscript = gpvWords ? gpvWords[self->txNameIx] : NULL;
boolean gotGpvTx = FALSE;
int i;
struct annoRow *row;
for (i = 0, row = extraRows;  row != NULL;  row = row->next)
    {
    char **words = row->data;
    char *extraTranscript = words[0];
    // If extraSrc happens to be gpVarSource, use only the row that matches the current
    // transcript, and use it only once.
    if (extraSrc->source == self->config->gpVarSource)
        {
        extraTranscript = words[self->txNameIx];
        if (gpvTranscript == NULL || differentString(extraTranscript, gpvTranscript) || gotGpvTx)
            continue;
        gotGpvTx = TRUE;
        }
    char *val = words[extraItem->rowIx];
    if (isEmpty(val))
        continue;
    // Strip trailing comma if found
    int valLen = strlen(val);
    if (val[valLen-1] == ',')
        val[valLen-1] = '\0';
    if (i == 0)
	{
	afVepNewExtra(self, pGotExtra);
	fprintf(self->f, "%s=", extraItem->tag);
        if (extraItem->isBoolean)
            fputs("YES", self->f);
	}
    else if (! extraItem->isBoolean)
	fputc(',', self->f);
    if (! extraItem->isBoolean)
        {
        // Watch out for characters that will mess up parsing of EXTRAS column:
        subChar(val, '=', '_');
        subChar(val, ';', '_');
        afVepPuts(val, self);
        }
    i++;
    }
}

static void afVepPrintExtrasOther(struct annoFormatVep *self, struct annoRow *varRow,
				  struct annoRow *gpvRow, struct gpFx *gpFx,
				  struct annoStreamRows gratorData[], int gratorCount,
				  boolean includeRegulatory, boolean *pGotExtra)
/* Print the Extra column's tag=value; components (other than dbNSFP) if we have any. */
{
struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc;
for (extraSrc = extras;  extraSrc != NULL;  extraSrc = extraSrc->next)
    {
    char *asObjName = extraSrc->source->asObj->name;
    if (startsWith("dbNsfp", asObjName))
	continue;
    struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount);
    if (extraRows != NULL)
	{
	struct annoFormatVepExtraItem *extraItem;
	for (extraItem = extraSrc->items;  extraItem != NULL;  extraItem = extraItem->next)
	    if (includeRegulatory || ! extraItem->isRegulatory)
		extraSrc->printExtra(extraSrc, self, extraItem, extraRows, gpvRow, pGotExtra);
	}
    }
// VEP automatically adds DISTANCE for upstream/downstream variants
if (gpFx && (gpFx->soNumber == upstream_gene_variant || gpFx->soNumber == downstream_gene_variant))
    {
    afVepNewExtra(self, pGotExtra);
    // In order to really understand the boundary conditions of Ensembl's DISTANCE function,
    // I'll have to read the code someday.  Upstream deletions on + strand still not handled
    // quite right.
    int ensemblFudge = 0;
    int distance = gpvRow->start - varRow->start + ensemblFudge;
    if (distance < 0)
	{
	ensemblFudge = 1;
	distance = varRow->start - gpvRow->end + ensemblFudge;
	}
    fprintf(self->f, "DISTANCE=%d", distance);
    }
boolean includeExonNumber = TRUE;  //#*** optional in VEP
if (includeExonNumber && gpFx)
    {
    // Add Exon or intron number if applicable
    enum detailType deType = gpFx->detailType;
    int num = -1, count = 0;
    if (deType == codingChange)
        {
	num = gpFx->details.codingChange.exonNumber;
	count = gpFx->details.codingChange.exonCount;
        }
    else if (deType == nonCodingExon)
        {
	num = gpFx->details.nonCodingExon.exonNumber;
	count = gpFx->details.nonCodingExon.exonCount;
        }
    else if (deType == intron)
        {
	num = gpFx->details.intron.intronNumber;
        count = gpFx->details.intron.intronCount;
        }
    if (num >= 0)
	{
	afVepNewExtra(self, pGotExtra);
        fprintf(self->f, "%s=%d/%d", (deType == intron) ? "INTRON" : "EXON", num+1, count);
        }
    }
if (!*pGotExtra)
    afVepPrintPlaceholders(self->f, 0, self->doHtml);
}

static void afVepLmCleanup(struct annoFormatVep *self)
/* Clean out localmem every 1k rows. */
{
self->lmRowCount++;
if (self->lmRowCount > 1024)
    {
    lmCleanup(&(self->lm));
    self->lm = lmInit(0);
    self->lmRowCount = 0;
    }
}

static void afVepPrintOneLineGpVar(struct annoFormatVep *self, struct annoStreamRows *varData,
				   struct annoRow *gpvRow,
				   struct annoStreamRows gratorData[], int gratorCount)
/* Print one line of VEP with a coding gene consequence: a variant, an allele,
 * functional consequences of that allele, and whatever else is included in the config. */
{
afVepLmCleanup(self);
struct annoRow *varRow = varData->rowList;
struct gpFx *gpFx = annoGratorGpVarGpFxFromRow(self->config->gpVarSource, gpvRow, self->lm);
afVepStartRow(self->f, self->doHtml);
afVepPrintNameAndLoc(self, varRow);
afVepPrintPredictions(self, varRow, gpvRow, gpFx);
afVepPrintExistingVar(self, varRow, gratorData, gratorCount);
boolean gotExtra = FALSE;
afVepPrintExtrasDbNsfp(self, varRow, gpvRow, gpFx, gratorData, gratorCount, &gotExtra);
afVepPrintExtrasHgvs(self, gpvRow, &gotExtra);
afVepPrintExtrasOther(self, varRow, gpvRow, gpFx, gratorData, gratorCount, FALSE, &gotExtra);
afVepEndRow(self->f, self->doHtml);
}

static boolean afVepIsRegulatory(struct annoFormatVep *self,
				 struct annoStreamRows gratorData[], int gratorCount)
/* Return TRUE if one of the extraSource items is a regulatory region. */
{
struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc;
for (extraSrc = extras;  extraSrc != NULL;  extraSrc = extraSrc->next)
    {
    struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount);
    if (extraRows != NULL)
	{
	struct annoFormatVepExtraItem *extraItem;
	for (extraItem = extraSrc->items;  extraItem != NULL;  extraItem = extraItem->next)
	    {
	    if (extraItem->isRegulatory)
		return TRUE;
	    }
	}
    }
return FALSE;
}

static char *afVepGetFirstAltAllele(struct annoFormatVep *self, struct annoRow *varRow)
/* For VEP we must print out some alternate allele, whether the alternate allele is
 * related to any other info (like coding changes) or not.  Pick the first non-reference
 * allele. */
{
int refAlBufSize = varRow->end - varRow->start + 1;
char refAllele[refAlBufSize];
annoAssemblyGetSeq(self->assembly, varRow->chrom, varRow->start, varRow->end,
		   refAllele, sizeof(refAllele));
struct variant *variant = NULL;
if (self->variantType == afvdtVcf)
    variant = variantFromVcfAnnoRow(varRow, refAllele, self->lm, self->dyScratch);
else if (self->variantType == afvdtPgSnpTable)
    variant = variantFromPgSnpAnnoRow(varRow, refAllele, TRUE, self->lm);
else if (self->variantType == afvdtPgSnpFile)
    variant = variantFromPgSnpAnnoRow(varRow, refAllele, FALSE, self->lm);
return firstAltAllele(variant->alleles);
}

static void afVepPrintPredictionsReg(struct annoFormatVep *self, char *alt)
/* Print VEP allele, placeholder gene and item names, 'RegulatoryFeature',
 * 'regulatory_region_variant', and placeholder coding effect columns. */
{
afVepPuts(alt, self);
afVepNextColumn(self->f, self->doHtml);
afVepPrintPlaceholders(self->f, 2, self->doHtml);
fputs("RegulatoryFeature", self->f);
afVepNextColumn(self->f, self->doHtml);
fputs(soTermToString(regulatory_region_variant), self->f);
afVepNextColumn(self->f, self->doHtml);
// Coding effect columns are N/A:
afVepPrintPlaceholders(self->f, 5, self->doHtml);
}

static void afVepPrintExtrasHgvsGOnly(struct annoFormatVep *self, struct annoRow *varRow, char *alt,
                                      boolean *pGotExtra)
/* Make our own HGVS g. term (when we aren't taking HGVS terms from annoGratorGpVar). */
{
if (self->hgvsMakeG)
    {
    struct bed3 *variantBed = (struct bed3 *)varRow;
    char *chromAcc = hRefSeqAccForChrom(self->assembly->name, varRow->chrom);
    char *hgvsG = hgvsGFromVariant(self->gSeqWin, variantBed, alt, chromAcc, self->hgvsBreakDelIns);
    afVepNewExtra(self, pGotExtra);
    fprintf(self->f, "HGVSG=%s", hgvsG);
    freeMem(hgvsG);
    }
}

static void afVepPrintRegulatory(struct annoFormatVep *self, struct annoStreamRows *varData,
				 struct annoStreamRows gratorData[], int gratorCount)
/* If there are EXTRAS column sources that imply regulatory_region_variant, print out
 * lines with that consequence (no gpFx/dbNSFP). */
{
if (afVepIsRegulatory(self, gratorData, gratorCount))
    {
    afVepLmCleanup(self);
    struct annoRow *varRow = varData->rowList;
    afVepStartRow(self->f, self->doHtml);
    afVepPrintNameAndLoc(self, varRow);
    char *alt = afVepGetFirstAltAllele(self, varRow);
    afVepPrintPredictionsReg(self, alt);
    afVepPrintExistingVar(self, varRow, gratorData, gratorCount);
    boolean gotExtra = FALSE;
    afVepPrintExtrasHgvsGOnly(self, varRow, alt, &gotExtra);
    afVepPrintExtrasOther(self, varRow, NULL, NULL, gratorData, gratorCount, TRUE, &gotExtra);
    afVepEndRow(self->f, self->doHtml);
    }
}

static void afVepFormatOne(struct annoFormatter *fSelf, struct annoStreamRows *primaryData,
			   struct annoStreamRows gratorData[], int gratorCount)
/* Print one variant's VEP (possibly multiple lines) using collected rows. */
{
struct annoFormatVep *self = (struct annoFormatVep *)fSelf;
struct annoRow *gpVarRows = gratorData[0].rowList, *gpvRow;
for (gpvRow = gpVarRows;  gpvRow != NULL;  gpvRow = gpvRow->next)
    afVepPrintOneLineGpVar(self, primaryData, gpvRow, gratorData, gratorCount);
afVepPrintRegulatory(self, primaryData, gratorData, gratorCount);
}

static void afVepEndHtml(struct annoFormatVep *self)
{
fputs("</TABLE><BR>\n", self->f);
}

static void afVepClose(struct annoFormatter **pFSelf)
/* Close file handle, free self. */
{
if (pFSelf == NULL)
    return;
struct annoFormatVep *self = *(struct annoFormatVep **)pFSelf;
if (self->doHtml)
    afVepEndHtml(self);
freeMem(self->fileName);
carefulClose(&(self->f));
lmCleanup(&(self->lm));
dyStringFree(&(self->dyScratch));
chromSeqWindowFree(&self->gSeqWin);
annoFormatterFree(pFSelf);
}

static void afVepSetConfig(struct annoFormatVep *self, struct annoFormatVepConfig *config)
/* Check config and figure out where various output columns will come from. */
{
self->config = config;
struct asColumn *varAsColumns = config->variantSource->asObj->columnList;
self->varNameIx = -1;
self->varAllelesIx = -1;
if (asObjectsMatch(config->variantSource->asObj, pgSnpAsObj()))
    self->variantType = afvdtPgSnpTable;
else if (asObjectsMatch(config->variantSource->asObj, pgSnpFileAsObj()))
    self->variantType = afvdtPgSnpFile;
else if (asObjectsMatch(config->variantSource->asObj, vcfAsObj()))
    {
    self->variantType = afvdtVcf;
    self->varNameIx = asColumnMustFindIx(varAsColumns, "id");
    }
else
    errAbort("afVepSetConfig: variant source %s doesn't look like pgSnp or VCF",
	     config->variantSource->name);
if (self->variantType == afvdtPgSnpTable || self->variantType == afvdtPgSnpFile)
    {
    // pgSnp's "name" column actually contains slash-separated alleles
    self->varAllelesIx = asColumnMustFindIx(varAsColumns, "name");
    }
if (config->gpVarSource == NULL)
    errAbort("afVepSetConfig: config must have a gpVarSource");
else if (! asColumnNamesMatchFirstN(config->gpVarSource->asObj, genePredAsObj(), 10) &&
         ! asColumnNamesMatchFirstN(config->gpVarSource->asObj, bigGenePredAsObj(), 16) &&
         ! asColumnNamesMatchFirstN(config->gpVarSource->asObj, annoStreamDbPslPlusAsObj(),
                                    PSLPLUS_NUM_COLS))
    errAbort("afVepSetConfig: gpVarSource %s doesn't look like genePred or pslPlus",
	     config->gpVarSource->name);
// refGene and augmented knownGene have extra name fields that have the HGNC gene symbol:
struct asColumn *gpvAsColumns = config->gpVarSource->asObj->columnList;
self->txNameIx = asColumnFindIx(gpvAsColumns, "name");
if (self->txNameIx < 0)
    self->txNameIx = asColumnFindIx(gpvAsColumns, "qName");
if (self->txNameIx < 0)
    errAbort("afVepSetConfig: can't find name or qName column in gpVarSource %s",
             config->gpVarSource->name);
self->geneNameIx = asColumnFindIx(gpvAsColumns, "geneSymbol");
if (self->geneNameIx < 0)
    self->geneNameIx = asColumnFindIx(gpvAsColumns, "kgXref_geneSymbol");
if (self->geneNameIx < 0)
    self->geneNameIx = asColumnFindIx(gpvAsColumns, "name2");
if (self->geneNameIx < 0)
    self->geneNameIx = asColumnFindIx(gpvAsColumns, "proteinID");
self->hgvsGIx = asColumnMustFindIx(gpvAsColumns, "hgvsG");
self->hgvsCNIx = asColumnMustFindIx(gpvAsColumns, "hgvsCN");
self->hgvsPIx = asColumnMustFindIx(gpvAsColumns, "hgvsP");
if (config->snpSource != NULL)
    {
    struct asColumn *snpAsColumns = config->snpSource->asObj->columnList;
    self->snpNameIx = asColumnMustFindIx(snpAsColumns, "name");
    }
else
    self->snpNameIx = -1;
}

struct annoFormatVepConfig *annoFormatVepConfigNew(struct annoStreamer *variantSource,
						   char *variantDescription,
						   struct annoStreamer *gpVarSource,
						   char *gpVarDescription,
						   struct annoStreamer *snpSource,
						   char *snpDescription)
/* Return a basic configuration for VEP output.  variantSource and gpVarSource must be
 * provided; snpSource can be NULL. */
{
struct annoFormatVepConfig *config;
AllocVar(config);
config->variantSource = variantSource;
config->variantDescription = cloneString(variantDescription);
config->gpVarSource = gpVarSource;
config->gpVarDescription = cloneString(gpVarDescription);
config->snpSource = snpSource;
config->snpDescription = cloneString(snpDescription);
return config;
}

struct annoFormatter *annoFormatVepNew(char *fileName, boolean doHtml,
				       struct annoStreamer *variantSource,
				       char *variantDescription,
				       struct annoStreamer *gpVarSource,
				       char *gpVarDescription,
				       struct annoStreamer *snpSource,
				       char *snpDescription,
				       struct annoAssembly *assembly)
/* Return a formatter that will write functional predictions in the same format as Ensembl's
 * Variant Effect Predictor to fileName (can be "stdout").
 * variantSource and gpVarSource must be provided; snpSource can be NULL. */
{
struct annoFormatVep *self;
AllocVar(self);
struct annoFormatter *fSelf = &(self->formatter);
fSelf->getOptions = annoFormatterGetOptions;
fSelf->setOptions = annoFormatterSetOptions;
fSelf->initialize = afVepInitialize;
fSelf->comment = afVepComment;
fSelf->formatOne = afVepFormatOne;
fSelf->close = afVepClose;
self->fileName = cloneString(fileName);
self->f = mustOpen(fileName, "w");
self->lm = lmInit(0);
self->dyScratch = dyStringNew(0);
self->needHeader = TRUE;
self->doHtml = doHtml;
self->assembly = assembly;
struct annoFormatVepConfig *config = annoFormatVepConfigNew(variantSource, variantDescription,
							    gpVarSource, gpVarDescription,
							    snpSource, snpDescription);
afVepSetConfig(self, config);
return (struct annoFormatter *)self;
}

static void afvAddExtraItemMaybeReg(struct annoFormatter *fSelf, struct annoStreamer *extraSource,
				    char *tag, char *description, char *column, boolean isBoolean,
                                    boolean isReg)
/* Tell annoFormatVep that it should include the given column of extraSource
 * in the EXTRAS column with tag.  The VEP header will include tag's description.
 * If isReg, overlap implies that the variant has consequence regulatory_region_variant.
 * For some special-cased sources e.g. dbNsfp files, column may be NULL/ignored. */
{
struct annoFormatVep *self = (struct annoFormatVep *)fSelf;
struct annoFormatVepExtraSource *src;
for (src = self->config->extraSources;  src != NULL;  src = src->next)
    if (src->source == extraSource)
	break;
if (src == NULL)
    {
    AllocVar(src);
    src->source = extraSource;
    if (extraSource->rowType == arWigVec)
	src->printExtra = afVepPrintExtraWigVec;
    else if (extraSource->rowType == arWigSingle)
	src->printExtra = afVepPrintExtraWigSingle;
    else
	src->printExtra = afVepPrintExtraWords;
    slAddTail(&(self->config->extraSources), src);
    }
struct annoFormatVepExtraItem *item;
AllocVar(item);
item->tag = cloneString(tag);
item->description = cloneString(description);
item->rowIx = -1;
if (isNotEmpty(column))
    item->rowIx = asColumnMustFindIx(extraSource->asObj->columnList, column);
item->isRegulatory = isReg;
item->isBoolean = isBoolean;
slAddTail(&(src->items), item);
}

void annoFormatVepAddExtraItem(struct annoFormatter *fSelf, struct annoStreamer *extraSource,
			       char *tag, char *description, char *column, boolean isBoolean)
/* Tell annoFormatVep that it should include the given column of extraSource
 * in the EXTRAS column with tag.  The VEP header will include tag's description.
 * For some special-cased sources e.g. dbNsfp files, column may be NULL/ignored.
 * If isBoolean, the column's description won't be output, only whether a match was found. */
{
afvAddExtraItemMaybeReg(fSelf, extraSource, tag, description, column, isBoolean, FALSE);
}

void annoFormatVepAddRegulatory(struct annoFormatter *fSelf, struct annoStreamer *regSource,
				char *tag, char *description, char *column)
/* Tell annoFormatVep to use the regulatory_region_variant consequence if there is overlap
 * with regSource, and to include the given column of regSource in the EXTRAS column.
 * The VEP header will include tag's description. */
{
afvAddExtraItemMaybeReg(fSelf, regSource, tag, description, column, FALSE, TRUE);
struct annoFormatVep *self = (struct annoFormatVep *)fSelf;
if (self->gSeqWin == NULL)
    {
    char *db = regSource->assembly->name;
    self->gSeqWin = chromSeqWindowNew(db, NULL, 0, 0);
    }
}

void annoFormatVepSetHgvsOutOptions(struct annoFormatter *fSelf, uint hgvsOutOptions)
/* Import the HGVS output options described in hgHgvs.h */
{
struct annoFormatVep *self = (struct annoFormatVep *)fSelf;
// We only do g. terms, for regulatory region variants, so no need for transcript/protein opts.
if (hgvsOutOptions & HGVS_OUT_G)
    self->hgvsMakeG = TRUE;
if (hgvsOutOptions & HGVS_OUT_BREAK_DELINS)
    self->hgvsBreakDelIns = TRUE;
}
