/* VCF: Variant Call Format, version 4.0 / 4.1
 * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-40
 * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
 * The vcfFile object borrows many memory handling and error reporting tricks from MarkD's
 * gff3File; any local deficiencies are not to reflect poorly on Mark's fine work! :) */

#ifndef vcf_h
#define vcf_h

#include "limits.h"
#include "hash.h"
#include "linefile.h"
#include "asParse.h"

enum vcfInfoType
/* VCF header defines INFO column components; each component has one of these types: */
    {
    vcfInfoNoType,	// uninitialized value (0) or unrecognized type name
    vcfInfoInteger,
    vcfInfoFloat,
    vcfInfoFlag,
    vcfInfoCharacter,
    vcfInfoString,
    };

union vcfDatum
/* Container for a value whose type is specified by an enum vcfInfoType. */
    {
    int datInt;
    double datFloat;
    boolean datFlag;
    char datChar;
    char *datString;
    };

struct vcfInfoDef
/* Definition of INFO column component from VCF header: */
    {
    struct vcfInfoDef *next;
    char *key;			// A short identifier, e.g. MQ for mapping quality
    int fieldCount;		// The number of values to follow the id, or -1 if it varies
    enum vcfInfoType type;	// The type of values that follow the id
    char *description;		// Brief description of info
    };

struct vcfInfoElement
/* A single INFO column component; each row's INFO column may contain multiple components. */
    {
    char *key;			// An identifier described by a struct vcfInfoDef
    int count;			// Number of data values following id
    union vcfDatum *values;	// Array of data values following id
    bool *missingData;		// Array of flags for missing data values ("." instead of number)
    };

struct vcfGenotype
/* A single component of the optional GENOTYPE column. */
    {
    char *id;			// Name of individual/sample (pointer to vcfFile genotypeIds) or .
    char hapIxA;		// Index of one haplotype's allele: 0=reference, 1=alt, 2=other alt
				// *or* if negative, missing data
    char hapIxB;		// Index of other haplotype's allele, or if negative, missing data
    bool isPhased;		// True if haplotypes are phased
    bool isHaploid;		// True if there is only one haplotype (e.g. chrY)
    int infoCount;		// Number of components named in FORMAT column
    struct vcfInfoElement *infoElements;	// Array of info components for this genotype call
    };

struct vcfRecord
/* A VCF data row (or list of rows). */
{
    struct vcfRecord *next;
    char *chrom;		// Reference assembly sequence name
    unsigned int chromStart;	// Start offset in chrom
    unsigned int chromEnd;	// End offset in chrom
    char *name;			// Variant name from ID column
    int alleleCount;		// Number of alleles (reference + alternates)
    char **alleles;		// Alleles: reference first then alternate alleles
    char *qual;			// . or Phred-scaled score, i.e. -10log_10 P(call in ALT is wrong)
    int filterCount;		// Number of ;-separated filter codes in FILTER column
    char **filters;		// Code(s) described in header for failed filters (or PASS or .)
    int infoCount;		// Number of components of INFO column
    struct vcfInfoElement *infoElements;	// Array of INFO column components
    char *format;		// Optional column containing ordered list of genotype components
    char **genotypeUnparsedStrings;	// Temporary array of unparsed optional genotype columns
    struct vcfGenotype *genotypes;	// If built, array of parsed genotype components;
					// call vcfParseGenotypes(record) to build.
    struct vcfFile *file;	// Pointer back to parent vcfFile
};

struct vcfFile
/* Info extracted from a VCF file.  Manages all memory for contents.
 * Clearly borrowing structure from MarkD's gff3File. :) */
{
    char *fileOrUrl;		// VCF local file path or URL
    char *headerString;		// Complete original header including newlines.
    int majorVersion;		// 4 etc.
    int minorVersion;		// 0, 1 etc.
    struct vcfInfoDef *infoDefs;	// Header's definitions of INFO column components
    struct vcfInfoDef *filterDefs;	// Header's definitions of FILTER column failure codes
    struct vcfInfoDef *altDefs;	// Header's defs of symbolic alternate alleles (e.g. DEL, INS)
    struct vcfInfoDef *gtFormatDefs;	// Header's defs of GENOTYPE compnts. listed in FORMAT col.
    bool allPhased;         // True if all record->genotypes have been phased
    int genotypeCount;		// Number of optional genotype columns described in header
    char **genotypeIds;		// Array of optional genotype column names described in header
    struct vcfRecord *records;	// VCF data rows, sorted by position
    struct hash *byName;		// Hash records by name -- not populated until needed.
    struct hash *pool;		// Used to allocate string values that tend to
				// be repeated in the files.  hash's localMem is also used to
				// allocated memory for all other objects (if recordPool null)
    struct lm *reusePool;       // If created with vcfFileMakeReusePool, non-shared record data is
                                // allocated from this pool. Useful when walking through huge files.
    struct lineFile *lf;	// Used only during parsing
    int maxErr;			// Maximum number of errors before aborting
    int errCnt;			// Error count
};

/* Reserved but optional INFO keys: */
extern const char *vcfInfoAncestralAllele;
extern const char *vcfInfoPerAlleleGtCount;	// allele count in genotypes, for each ALT allele,
						// in the same order as listed
extern const char *vcfInfoAlleleFrequency;  	// allele frequency for each ALT allele in the same
						// order as listed: use this when estimated from
						// primary data, not called genotypes
extern const char *vcfInfoNumAlleles;		// total number of alleles in called genotypes
extern const char *vcfInfoBaseQuality;		// RMS base quality at this position
extern const char *vcfInfoCigar;		// cigar string describing how to align an
						// alternate allele to the reference allele
extern const char *vcfInfoIsDbSnp;		// dbSNP membership
extern const char *vcfInfoDepth;		// combined depth across samples, e.g. DP=154
extern const char *vcfInfoEnd;			// end position of the variant described in this
						// record (esp. for CNVs)
extern const char *vcfInfoIsHapMap2;		// membership in hapmap2
extern const char *vcfInfoIsHapMap3;		// membership in hapmap3
extern const char *vcfInfoIs1000Genomes;	// membership in 1000 Genomes
extern const char *vcfInfoMappingQuality;	// RMS mapping quality, e.g. MQ=52
extern const char *vcfInfoMapQual0Count;	// number of MAPQ == 0 reads covering this record
extern const char *vcfInfoNumSamples;		// Number of samples with data
extern const char *vcfInfoStrandBias;		// strand bias at this position
extern const char *vcfInfoIsSomatic;		// indicates that the record is a somatic mutation,
						// for cancer genomics
extern const char *vcfInfoIsValidated;		// validated by follow-up experiment

/* Reserved but optional per-genotype keys: */
extern const char *vcfGtGenotype;	// numeric allele values separated by "/" (unphased)
					// or "|" (phased). Allele values are 0 for
					// reference allele, 1 for the first allele in ALT,
					// 2 for the second allele in ALT and so on.
extern const char *vcfGtDepth;		// read depth at this position for this sample
extern const char *vcfGtFilter;		// analogous to variant's FILTER field
extern const char *vcfGtLikelihoods;	// three floating point log10-scaled likelihoods for
					// AA,AB,BB genotypes where A=ref and B=alt;
					// not applicable if site is not biallelic.
extern const char *vcfGtPhred;		// Phred-scaled genotype likelihoods rounded to closest int
extern const char *vcfGtConditionalQual;	// Conditional genotype quality
					// i.e. phred quality -10log_10 P(genotype call is wrong,
					// conditioned on the site's being variant)
extern const char *vcfGtHaplotypeQualities;	// Two phred qualities comma separated
extern const char *vcfGtPhaseSet;	// Set of phased genotypes to which this genotype belongs
extern const char *vcfGtPhasingQuality;	// Phred-scaled P(alleles ordered wrongly in heterozygote)
extern const char *vcfGtExpectedAltAlleleCount;	// Typically used in association analyses

INLINE void vcfPrintDatum(FILE *f, const union vcfDatum datum, const enum vcfInfoType type)
/* Print datum to f in the format determined by type. */
{
switch (type)
    {
    case vcfInfoInteger:
	fprintf(f, "%d", datum.datInt);
	break;
    case vcfInfoFloat:
	fprintf(f, "%f", datum.datFloat);
	break;
    case vcfInfoFlag:
	fprintf(f, "%s", datum.datString); // Flags could have values in older VCF
	break;
    case vcfInfoCharacter:
	fprintf(f, "%c", datum.datChar);
	break;
    case vcfInfoString:
        if (startsWith("http", datum.datString))
            fprintf(f, "<a target=_blank href='%s'>%s</a>", datum.datString, datum.datString);
        else
            fprintf(f, "%s", datum.datString);
	break;
    default:
	errAbort("vcfPrintDatum: Unrecognized type %d", type);
	break;
    }
}

#define VCF_IGNORE_ERRS (INT_MAX - 1)

struct vcfFile *vcfFileNew();
/* Return a new, empty vcfFile object. */

struct vcfFile *vcfFileFromHeader(char *name, char *headerString, int maxErr);
/* Parse the VCF header string into a vcfFile object with no rows.
 * name is for error reporting.
 * If maxErr is non-negative then continue to parse until maxErr+1 errors have been found.
 * A maxErr less than zero does not stop and reports all errors.
 * Set maxErr to VCF_IGNORE_ERRS for silence. */

struct vcfFile *vcfFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
			       int maxErr, int maxRecords, boolean parseAll);
/* Open fileOrUrl and parse VCF header; return NULL if unable.
 * If chrom is non-NULL, scan past any variants that precede {chrom, chromStart}.
 * Note: this is very inefficient -- it's better to use vcfTabix if possible!
 * If parseAll, then read in all lines in region, parse and store in
 * vcff->records; if maxErr >= zero, then continue to parse until
 * there are maxErr+1 errors.  A maxErr less than zero does not stop
 * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence. */

struct vcfFile *vcfTabixFileAndIndexMayOpen(char *fileOrUrl, char *tbiFileOrUrl, char *chrom, int start, int end,
				    int maxErr, int maxRecords);
/* Open a VCF file that has been compressed and indexed by tabix and
 * parse VCF header, or return NULL if unable. tbiFileOrUrl can be NULL.
 * If chrom is non-NULL, seek to the position range and parse all lines in
 * range into vcff->records.  If maxErr >= zero, then continue to parse until
 * there are maxErr+1 errors.  A maxErr less than zero does not stop
 * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */

struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
				    int maxErr, int maxRecords);
/* Open a VCF file that has been compressed and indexed by tabix and
 * parse VCF header, or return NULL if unable.  If chrom is non-NULL,
 * seek to the position range and parse all lines in range into
 * vcff->records.  If maxErr >= zero, then continue to parse until
 * there are maxErr+1 errors.  A maxErr less than zero does not stop
 * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence. */

long long vcfTabixItemCount(char *fileOrUrl, char *tbiFileOrUrl);
/* Return the total number of items across all sequences in fileOrUrl, using index file.
 * If tbiFileOrUrl is NULL, the index file is assumed to be fileOrUrl.tbi.
 * NOTE: not all tabix index files include mapped item counts, so this may return 0 even for
 * large files. */

int vcfTabixBatchRead(struct vcfFile *vcff, char *chrom, int start, int end,
                      int maxErr, int maxRecords);
// Reads a batch of records from an opened and indexed VCF file, adding them to
// vcff->records and returning the count of new records added in this batch.
// Note: vcff->records will continue to be sorted, even if batches are loaded
// out of order.  Additionally, resulting vcff->records will contain no duplicates
// so returned count refects only the new records added, as opposed to all records
// in range.  If maxErr >= zero, then continue to parse until there are maxErr+1
// errors.  A maxErr less than zero does not stop and reports all errors.  Set
// maxErr to VCF_IGNORE_ERRS for silence.

void vcfFileMakeReusePool(struct vcfFile *vcff, int initialSize);
// Creates a separate memory pool for records.  Establishing this pool allows
// using vcfFileFlushRecords to abandon previously read records and free
// the associated memory. Very useful when reading an entire file in batches.
#define vcfFileLm(vcff) ((vcff)->reusePool ? (vcff)->reusePool : (vcff)->pool->lm)

void vcfFileFlushRecords(struct vcfFile *vcff);
// Abandons all previously read vcff->records and flushes the reuse pool (if it exists).
// USE WITH CAUTION.  All previously allocated record pointers are now invalid.

struct vcfRecord *vcfNextRecord(struct vcfFile *vcff);
/* Parse the words in the next line from vcff into a vcfRecord. Return NULL at end of file.
 * Note: this does not store record in vcff->records! */

struct vcfRecord *vcfRecordFromRow(struct vcfFile *vcff, char **words);
/* Parse words from a VCF data line into a VCF record structure. */

boolean allelesHavePaddingBase(char **alleles, int alleleCount);
/* Examine alleles to see if they either a) all start with the same base or
 * b) include a symbolic or 0-length allele.  In either of those cases, there
 * must be an initial padding base that we'll need to trim from non-symbolic
 * alleles. */

unsigned int vcfRecordTrimIndelLeftBase(struct vcfRecord *rec);
/* For indels, VCF includes the left neighboring base; for example, if the alleles are
 * AA/- following a G base, then the VCF record will start one base to the left and have
 * "GAA" and "G" as the alleles.  That is not nice for display for two reasons:
 * 1. Indels appear one base wider than their dbSNP entries.
 * 2. In pgSnp display mode, the two alleles are always the same color.
 * However, for hgTracks' mapBox we need the correct chromStart for identifying the
 * record in hgc -- so return the original chromStart. */

unsigned int vcfRecordTrimAllelesRight(struct vcfRecord *rec);
/* Some tools output indels with extra base to the right, for example ref=ACC, alt=ACCC
 * which should be ref=A, alt=AC.  When the extra bases make the variant extend from an
 * intron (or gap) into an exon, it can cause a false appearance of a frameshift.
 * To avoid this, when all alleles have identical base(s) at the end, trim all of them,
 * and update rec->chromEnd.
 * For hgTracks' mapBox we need the correct chromStart for identifying the record in hgc,
 * so return the original chromEnd. */

int vcfRecordCmp(const void *va, const void *vb);
/* Compare to sort based on position. */

void vcfFileFree(struct vcfFile **vcffPtr);
/* Free a vcfFile object. */

const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId);
/* Return all records with name=variantId, or NULL if not found. */

const struct vcfInfoElement *vcfRecordFindInfo(const struct vcfRecord *record, char *key);
/* Find an INFO element, or NULL. */

struct vcfInfoDef *vcfInfoDefForKey(struct vcfFile *vcff, const char *key);
/* Return infoDef for key, or NULL if it wasn't specified in the header or VCF spec. */

void vcfParseGenotypes(struct vcfRecord *record);
/* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[].
 * This destroys genotypesUnparsedStrings. */

void vcfParseGenotypesGtOnly(struct vcfRecord *record);
/* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[], but ignore
 * genotype info elements, IDs, etc; parse only the actual genotypes (e.g. for quick display
 * in hgTracks).  This destroys genotypesUnparsedStrings. */

const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId);
/* Find the genotype and associated info for the individual, or return NULL.
 * This calls vcfParseGenotypes if it has not already been called. */

struct vcfInfoDef *vcfInfoDefForGtKey(struct vcfFile *vcff, const char *key);
/* Look up the type of genotype FORMAT component key, in the definitions from the header,
 * and failing that, from the keys reserved in the spec. */

const struct vcfInfoElement *vcfGenotypeFindInfo(const struct vcfGenotype *gt, char *key);
/* Find the genotype infoElement for key, or return NULL. */

int vcfGenotypeIndex(int h0Ix, int h1Ix);
/* Return the index in a linear array of distinct genotypes, given two 0-based allele indexes.
 * This follows the following convention used by GnomAD (GATK?), that has the advantage that
 * gt indexes of small numbers don't change as the number of alleles increases, and also matches
 * the ref/ref, ref/alt, alt/alt convention for biallelic variants:
 * 0/0,
 * 0/1, 1/1,
 * 0/2, 1/2, 2/2,
 * 0/3, 1/3, 2/3, 3/3,
 * ... */

void vcfCountGenotypes(struct vcfRecord *rec, int **retGtCounts, int **retAlleleCounts,
                       int *retPhasedCount, int *retDiploidCount);
/* Tally genotypes and alleles for summary, adding 1 to rec->alleleCount to represent missing data */

char *vcfFilePooledStr(struct vcfFile *vcff, char *str);
/* Allocate memory for a string from vcff's shared string pool. */

#define VCF_NUM_COLS 10
#define VCF_NUM_COLS_BEFORE_GENOTYPES 9

struct asObject *vcfAsObj();
// Return asObject describing fields of VCF

char *vcfGetSlashSepAllelesFromWords(char **words, struct dyString *dy);
/* Overwrite dy with a /-separated allele string from VCF words,
 * skipping the extra initial base that VCF requires for indel alleles if necessary.
 * Return dy->string for convenience. */

void vcfRecordWriteNoGt(FILE *f, struct vcfRecord *rec);
/* Write the first 8 columns of VCF rec to f.  Genotype data will be ignored if present. */

// Characters we expect to see in |-separated parts of an ##INFO description that specifies
// tabular contents:
#define COL_DESC_WORD_REGEX "[A-Za-z_0-9.-]+"
// Series of |-separated words:
#define COL_DESC_REGEX COL_DESC_WORD_REGEX"(\\|"COL_DESC_WORD_REGEX")+"

// Minimum number of |-separated values for interpreting descriptions and values as tabular:
#define MIN_COLUMN_COUNT 3

boolean looksTabular(const struct vcfInfoDef *def, const struct vcfInfoElement *el);
/* Return TRUE if def->description seems to contain a |-separated description of columns
 * and el's first non-empty string value has the same number of |-separated parts. */

#endif // vcf_h
