/* bigDbSnp.c was originally generated by the autoSql program, which also 
 * generated bigDbSnp.h and bigDbSnp.sql.  This module links the database and
 * the RAM representation of objects. */

#include "common.h"
#include "linefile.h"
#include "dystring.h"
#include "jksql.h"
#include "bigDbSnp.h"



char *bigDbSnpCommaSepFieldNames = "chrom,chromStart,chromEnd,name,ref,altCount,alts,shiftBases,freqSourceCount,minorAlleleFreq,majorAllele,minorAllele,maxFuncImpact,class,ucscNotes,_dataOffset,_dataLen";

/* Legal string values for the class column; the list is terminated by NULL. */
static char *values_class[] =
    {
    "snv",
    "mnv",
    "ins",
    "del",
    "delins",
    "identity",
    NULL,
    };

/* Hash that accompanies values_class; its address is handed to sqlEnumParse() and
 * sqlEnumComma().  Starts out NULL (presumably built on first use -- confirm in jksql). */
static struct hash *valhash_class = NULL;

struct bigDbSnp *bigDbSnpLoad(char **row)
/* Build a newly allocated bigDbSnp from row, an array of column strings such as
 * fetched with select * from bigDbSnp.  Dispose of this with bigDbSnpFree(). */
{
struct bigDbSnp *ret;
int sizeOne;    /* element count reported by each array parser */

AllocVar(ret);

/* The two element counts are parsed first; each array read below is checked against them. */
ret->altCount = sqlSigned(row[5]);
ret->freqSourceCount = sqlSigned(row[8]);

ret->chrom = cloneString(row[0]);
ret->chromStart = sqlUnsigned(row[1]);
ret->chromEnd = sqlUnsigned(row[2]);
ret->name = cloneString(row[3]);
ret->ref = cloneString(row[4]);

/* alts must have exactly altCount entries. */
sqlStringDynamicArray(row[6], &ret->alts, &sizeOne);
assert(sizeOne == ret->altCount);

ret->shiftBases = sqlUnsigned(row[7]);

/* The three per-frequency-source arrays must each have freqSourceCount entries. */
sqlDoubleDynamicArray(row[9], &ret->minorAlleleFreq, &sizeOne);
assert(sizeOne == ret->freqSourceCount);
sqlStringDynamicArray(row[10], &ret->majorAllele, &sizeOne);
assert(sizeOne == ret->freqSourceCount);
sqlStringDynamicArray(row[11], &ret->minorAllele, &sizeOne);
assert(sizeOne == ret->freqSourceCount);

ret->maxFuncImpact = sqlUnsigned(row[12]);
ret->class = sqlEnumParse(row[13], values_class, &valhash_class);
ret->ucscNotes = cloneString(row[14]);
ret->_dataOffset = sqlLongLong(row[15]);
ret->_dataLen = sqlSigned(row[16]);
return ret;
}

struct bigDbSnp *bigDbSnpLoadAll(char *fileName)
/* Read every row of the whitespace-separated file fileName into a list of bigDbSnp,
 * kept in file order.  Dispose of this with bigDbSnpFreeList(). */
{
char *fields[17];
struct bigDbSnp *snpList = NULL;
struct lineFile *in = lineFileOpen(fileName, TRUE);

for (;;)
    {
    struct bigDbSnp *snp;
    if (!lineFileRow(in, fields))
        break;
    snp = bigDbSnpLoad(fields);
    slAddHead(&snpList, snp);
    }
lineFileClose(&in);
/* Rows were pushed onto the head, so flip the list back into file order. */
slReverse(&snpList);
return snpList;
}

struct bigDbSnp *bigDbSnpLoadAllByChar(char *fileName, char chopper)
/* Read every row of fileName, whose fields are separated by the chopper character,
 * into a list of bigDbSnp kept in file order.
 * Dispose of this with bigDbSnpFreeList(). */
{
char *fields[17];
struct bigDbSnp *snpList = NULL;
struct lineFile *in = lineFileOpen(fileName, TRUE);

for (;;)
    {
    struct bigDbSnp *snp;
    if (!lineFileNextCharRow(in, chopper, fields, ArraySize(fields)))
        break;
    snp = bigDbSnpLoad(fields);
    slAddHead(&snpList, snp);
    }
lineFileClose(&in);
/* Rows were pushed onto the head, so flip the list back into file order. */
slReverse(&snpList);
return snpList;
}

struct bigDbSnp *bigDbSnpCommaIn(char **pS, struct bigDbSnp *ret)
/* Parse one bigDbSnp out of the comma separated text at *pS and store the updated
 * parse position back into *pS.  Fills in ret when it is non-null; otherwise
 * allocates and returns a new bigDbSnp. */
{
char *cursor = *pS;
int ix;

if (ret == NULL)
    AllocVar(ret);

ret->chrom = sqlStringComma(&cursor);
ret->chromStart = sqlUnsignedComma(&cursor);
ret->chromEnd = sqlUnsignedComma(&cursor);
ret->name = sqlStringComma(&cursor);
ret->ref = sqlStringComma(&cursor);
ret->altCount = sqlSignedComma(&cursor);

/* alts: altCount strings between braces, followed by a comma. */
cursor = sqlEatChar(cursor, '{');
AllocArray(ret->alts, ret->altCount);
for (ix = 0;  ix < ret->altCount;  ix++)
    ret->alts[ix] = sqlStringComma(&cursor);
cursor = sqlEatChar(cursor, '}');
cursor = sqlEatChar(cursor, ',');

ret->shiftBases = sqlUnsignedComma(&cursor);
ret->freqSourceCount = sqlSignedComma(&cursor);

/* minorAlleleFreq: freqSourceCount doubles between braces, followed by a comma. */
cursor = sqlEatChar(cursor, '{');
AllocArray(ret->minorAlleleFreq, ret->freqSourceCount);
for (ix = 0;  ix < ret->freqSourceCount;  ix++)
    ret->minorAlleleFreq[ix] = sqlDoubleComma(&cursor);
cursor = sqlEatChar(cursor, '}');
cursor = sqlEatChar(cursor, ',');

/* majorAllele: freqSourceCount strings between braces, followed by a comma. */
cursor = sqlEatChar(cursor, '{');
AllocArray(ret->majorAllele, ret->freqSourceCount);
for (ix = 0;  ix < ret->freqSourceCount;  ix++)
    ret->majorAllele[ix] = sqlStringComma(&cursor);
cursor = sqlEatChar(cursor, '}');
cursor = sqlEatChar(cursor, ',');

/* minorAllele: freqSourceCount strings between braces, followed by a comma. */
cursor = sqlEatChar(cursor, '{');
AllocArray(ret->minorAllele, ret->freqSourceCount);
for (ix = 0;  ix < ret->freqSourceCount;  ix++)
    ret->minorAllele[ix] = sqlStringComma(&cursor);
cursor = sqlEatChar(cursor, '}');
cursor = sqlEatChar(cursor, ',');

ret->maxFuncImpact = sqlUnsignedComma(&cursor);
ret->class = sqlEnumComma(&cursor, values_class, &valhash_class);
ret->ucscNotes = sqlStringComma(&cursor);
ret->_dataOffset = sqlLongLongComma(&cursor);
ret->_dataLen = sqlSignedComma(&cursor);

*pS = cursor;
return ret;
}

void bigDbSnpFree(struct bigDbSnp **pEl)
/* Release a single dynamically allocated bigDbSnp, such as one made by
 * bigDbSnpLoad(), and set *pEl to NULL.  Does nothing if *pEl is already NULL. */
{
struct bigDbSnp *snp = *pEl;

if (snp == NULL)
    return;

freeMem(snp->chrom);
freeMem(snp->name);
freeMem(snp->ref);

/* Each string array keeps all of its strings in one allocation, so freeing the
 * first string releases them all; the array of pointers is freed separately. */
if (snp->alts != NULL)
    freeMem(snp->alts[0]);
freeMem(snp->alts);

freeMem(snp->minorAlleleFreq);

if (snp->majorAllele != NULL)
    freeMem(snp->majorAllele[0]);
freeMem(snp->majorAllele);

if (snp->minorAllele != NULL)
    freeMem(snp->minorAllele[0]);
freeMem(snp->minorAllele);

freeMem(snp->ucscNotes);
freez(pEl);
}

void bigDbSnpFreeList(struct bigDbSnp **pList)
/* Free every bigDbSnp on the list at *pList, then set *pList to NULL. */
{
struct bigDbSnp *cur = *pList;

while (cur != NULL)
    {
    /* Grab the link before the element is freed. */
    struct bigDbSnp *following = cur->next;
    bigDbSnpFree(&cur);
    cur = following;
    }
*pList = NULL;
}

static void bigDbSnpPutString(FILE *f, char *text, char sep)
/* Print text to f, wrapped in double quotes when sep is a comma. */
{
if (sep == ',')
    fputc('"', f);
fprintf(f, "%s", text);
if (sep == ',')
    fputc('"', f);
}

static void bigDbSnpPutStringArray(FILE *f, char **items, int count, char sep)
/* Print count strings to f, each followed by a comma.  When sep is a comma the
 * strings are quoted and the whole list is enclosed in braces. */
{
int k;

if (sep == ',')
    fputc('{', f);
for (k = 0;  k < count;  k++)
    {
    bigDbSnpPutString(f, items[k], sep);
    fputc(',', f);
    }
if (sep == ',')
    fputc('}', f);
}

void bigDbSnpOutput(struct bigDbSnp *el, FILE *f, char sep, char lastSep)
/* Write all fields of el to f in column order.  Fields are separated by sep and the
 * final field is followed by lastSep.  When sep is a comma, string fields are
 * double-quoted and array fields are enclosed in braces. */
{
int k;

bigDbSnpPutString(f, el->chrom, sep);
fputc(sep, f);
fprintf(f, "%u", el->chromStart);
fputc(sep, f);
fprintf(f, "%u", el->chromEnd);
fputc(sep, f);
bigDbSnpPutString(f, el->name, sep);
fputc(sep, f);
bigDbSnpPutString(f, el->ref, sep);
fputc(sep, f);
fprintf(f, "%d", el->altCount);
fputc(sep, f);
bigDbSnpPutStringArray(f, el->alts, el->altCount, sep);
fputc(sep, f);
fprintf(f, "%u", el->shiftBases);
fputc(sep, f);
fprintf(f, "%d", el->freqSourceCount);
fputc(sep, f);

/* minorAlleleFreq is numeric, so its elements are never quoted. */
if (sep == ',')
    fputc('{', f);
for (k = 0;  k < el->freqSourceCount;  k++)
    {
    fprintf(f, "%g", el->minorAlleleFreq[k]);
    fputc(',', f);
    }
if (sep == ',')
    fputc('}', f);
fputc(sep, f);

bigDbSnpPutStringArray(f, el->majorAllele, el->freqSourceCount, sep);
fputc(sep, f);
bigDbSnpPutStringArray(f, el->minorAllele, el->freqSourceCount, sep);
fputc(sep, f);
fprintf(f, "%u", el->maxFuncImpact);
fputc(sep, f);

/* class is printed as its string value, quoted like other strings. */
if (sep == ',')
    fputc('"', f);
sqlEnumPrint(f, el->class, values_class);
if (sep == ',')
    fputc('"', f);
fputc(sep, f);

bigDbSnpPutString(f, el->ucscNotes, sep);
fputc(sep, f);
fprintf(f, "%lld", el->_dataOffset);
fputc(sep, f);
fprintf(f, "%d", el->_dataLen);
fputc(lastSep, f);
}

/* -------------------------------- End autoSql Generated Code -------------------------------- */

struct symbolDesc
/* Pairs a symbol string with human-readable text describing it.  The table below is
 * initialized positionally, so the member order matters. */
{
    char *symbol;       /* Symbol to match, e.g. one ucscNotes keyword */
    char *description;  /* Sentence(s) explaining what the symbol means */
};

/* Description of each recognized ucscNotes keyword; searched linearly by
 * bigDbSnpDescribeUcscNote().  Every keyword needs its own distinct description. */
struct symbolDesc ucscNotesDesc[] =
    {
    { bdsAltIsAmbiguous,
      "At least one alternate allele "
      "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
    { bdsClassMismatch,
      "Variation class/type is inconsistent with alleles mapped to this genome assembly." },
    { bdsClinvar,
      "Variant is in ClinVar." },
    { bdsClinvarBenign,
      "Variant is in ClinVar with clinical significance of benign and/or likely benign." },
    { bdsClinvarConflicting,
      "Variant is in ClinVar with reports of both benign and pathogenic significance." },
    { bdsClinvarPathogenic,
      "Variant is in ClinVar with clinical significance of pathogenic and/or likely pathogenic." },
    { bdsClusterError,
      "This variant has the same start, end and class as another variant; "
      "they probably should have been merged into one variant." },
    { bdsCommonAll,
      "Variant is \"common\", i.e. has a Minor Allele Frequency of at least 1% "
      "in all projects reporting frequencies." },
    { bdsCommonSome,
      "Variant is \"common\", i.e. has a Minor Allele Frequency of at least 1% "
      "in some, but not all, projects reporting frequencies." },
    { bdsDiffMajor,
      "Different frequency sources have different major alleles "
      "(see table of allele frequencies above)." },
    { bdsFreqIncomplete,
      "At least one project's frequency data is incomplete (only one allele reported)." },
    { bdsFreqIsAmbiguous,
      "At least one allele reported by at least one project "
      "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
    { bdsFreqNotMapped,
      "At least one project reported frequencies on a different assembly, for which dbSNP does not "
      "provide a mapping.  The mapping on this assembly may have an issue." },
    /* Not the same condition as bdsRefIsMinor: this one is about a frequency allele
     * that is neither the reference nor any listed alternate allele. */
    { bdsFreqNotRefAlt,
      "At least one allele reported by at least one project "
      "does not match any of the reference or alternate alleles listed by dbSNP." },
    { bdsMultiMap,
      "This variant has been mapped to more than one distinct genomic location." },
    { bdsOtherMapErr,
      "Another mapping of this variant has illegal coordinates implying indel mapping error." },
    { bdsOverlapDiffClass,
      "This variant overlaps another variant with a different type/class." },
    { bdsOverlapSameClass,
      "This variant overlaps another with the same type/class but different start/end." },
    { bdsRareAll,
      "Variant is \"rare\", i.e. has a Minor Allele Frequency of less than 1% "
      "in all projects reporting frequencies, or has no frequency data." },
    { bdsRareSome,
      "Variant is \"rare\", i.e. has a Minor Allele Frequency of less than 1% "
      "in some, but not all, projects reporting frequencies." },
    { bdsRefIsAmbiguous,
      "The reference genome allele "
      "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
    { bdsRefIsMinor,
      "The reference genome allele is not the major allele in at least one project." },
    { bdsRefIsRare,
      "The reference genome allele is rare (i.e. allele frequency < 1%)." },
    { bdsRefIsSingleton,
      "The reference genome allele has never been observed "
      "in a population sequencing project reporting frequencies." },
    { bdsRefMismatch,
      "The reference genome allele reported by dbSNP differs from the GenBank assembly sequence." },
    { bdsRevStrand,
      "The orientation of the currently viewed reference genome sequence is different from "
      "the orientation of dbSNP's preferred assembly; alleles are "
      "presented on the forward strand of the currently viewed reference sequence." },
    };

char *bigDbSnpDescribeUcscNote(char *ucscNote)
/* Return a string describing ucscNote, unless it is unrecognized in which case return NULL.
 * Do not free returned value. */
{
int i;
for (i = 0;  i < ArraySize(ucscNotesDesc);  i++)
    {
    if (sameString(ucscNote, ucscNotesDesc[i].symbol))
        return ucscNotesDesc[i].description;
    }
return NULL;
}

char *bigDbSnpClassToString(enum bigDbSnpClass class)
/* Return the constant string that names class; errAbort on a value that is not
 * one of the enum bigDbSnpClass members.  Do not free result. */
{
switch (class)
    {
    case bigDbSnpSnv:
        return "snv";
    case bigDbSnpMnv:
        return "mnv";
    case bigDbSnpIns:
        return "ins";
    case bigDbSnpDel:
        return "del";
    case bigDbSnpDelins:
        return "delins";
    case bigDbSnpIdentity:
        return "identity";
    default:
        errAbort("bigDbSnpClassToString: unrecognized value %d", (int)class);
    }
/* Only reached if errAbort returns. */
return NULL;
}

static boolean abbrevNRepeat(char *allele, int n, char *buf, size_t bufLen)
/* If allele starts with a long enough run of a repeated n-base unit, write
 * "(unit)count" followed by the remainder of allele (itself abbreviated with the
 * same n when possible) into buf and return TRUE.  Return FALSE when allele does
 * not qualify or the result does not fit in buf (bufLen bytes including the
 * terminator); buf then holds no usable abbreviation. */
{
boolean canAbbrev = FALSE;
int len = strlen(allele);
// Need at least two copies of the unit (n*2).  "(unit)" plus a one-digit count is n+3
// characters, so n+4 is the shortest run that gets shorter, and the smallest buffer
// that can hold the notation with its terminator.
int minAbbrevLen = max(n*2, n+4);
if (len >= minAbbrevLen && bufLen >= minAbbrevLen)
    {
    // Walk the leading run in which every base equals the base n positions earlier,
    // counting each completed copy of the unit.
    int reps = 1;
    int i;
    for (i = n;  i < len;  i++)
        {
        if (allele[i] != allele[i-n])
            break;
        if (i % n == n-1)
            reps++;
        }
    if (i >= minAbbrevLen)
        {
        // End of repeating section; are there enough repeats to make the notation shorter?
        char repeatUnit[n+1];
        safencpy(repeatUnit, sizeof repeatUnit, allele, n);
        int abbrevLen = snprintf(buf, bufLen, "(%s)%d", repeatUnit, reps);
        if (abbrevLen < 0 || (size_t)abbrevLen >= bufLen)
            {
            // snprintf reports the length it wanted, not what it wrote.  If the notation
            // was truncated, buf+abbrevLen would lie outside buf and bufLen-abbrevLen would
            // wrap around (size_t), so give up before writing anything more.
            buf[0] = '\0';
            return FALSE;
            }
        // Does the rest of the sequence start with a different repeat?
        char *bufRest = buf+abbrevLen;
        size_t bufRestLen = bufLen - abbrevLen;
        char *alRest = allele + (reps * n);
        if (bufRestLen > 5 && abbrevNRepeat(alRest, n, bufRest, bufRestLen))
            abbrevLen = strlen(buf);
        else
            abbrevLen += snprintf(bufRest, bufRestLen, "%s", alRest);
        // abbrevLen is now the full length wanted; it fits only if there is room for the
        // terminator too.
        if (abbrevLen >= 0 && (size_t)abbrevLen < bufLen)
            canAbbrev = TRUE;
        else
            buf[0] = '\0';
        }
    }
return canAbbrev;
}

char *bigDbSnpAbbrevAllele(char *allele, char *buf, size_t bufLen)
/* If allele can be abbreviated to something shorter than itself that fits in buf,
 * and doesn't end up with a tiny bit of abbreviation followed by a bunch of unabbreviated
 * sequence, then put the abbreviation in buf and return buf; otherwise return allele.
 * If allele is the empty string, returns "-" (in buf).
 * buf must have room for bufLen bytes; a buf too small to hold any abbreviation simply
 * results in allele being returned. */
{
if (isEmpty(allele))
    {
    safecpy(buf, bufLen, "-");
    return buf;
    }
char *abbrev = allele;
// bufLen is unsigned, so guard the subtraction: for bufLen < 3 it would wrap around
// and produce an enormous (or implementation-defined) repeat-unit limit.
int maxN = (bufLen > 3) ? (int)((bufLen - 3) / 2) : 0;
int n;
// Try repeat-unit sizes from smallest up and keep the first that abbreviates.
for (n = 1; n <= maxN; n++)
    {
    if (abbrevNRepeat(allele, n, buf, bufLen))
        {
        abbrev = buf;
        break;
        }
    }
if (abbrev == buf)
    {
    // alLen is the length of the abbreviated string in buf; abbrevLen is the length of
    // its prefix up to and including the last ')'.
    int alLen = strlen(buf);
    char *abbrevEnd = strrchr(buf, ')');
    if (abbrevEnd == NULL)
        errAbort("bigDbSnpAbbrevAllele: expect abbreviated allele '%s' to contain at least one ')'",
                 buf);
    int abbrevLen = abbrevEnd + 1 - buf;
    if (abbrevLen < (alLen>>2))
        {
        // Never mind, the abbreviated portion is much smaller than the unabbreviated portion.
        abbrev = allele;
        }
    }
return abbrev;
}
