/* lrg.c was originally generated by the autoSql program, which also 
 * generated lrg.h and lrg.sql.  This module links the database and
 * the RAM representation of objects. */

/* Copyright (C) 2014 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "common.h"
#include "linefile.h"
#include "dystring.h"
#include "jksql.h"
#include "lrg.h"



char *lrgCommaSepFieldNames = "chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,reserved,blockCount,blockSizes,chromStarts,mismatches,indels,lrgSize,hgncId,hgncSymbol,ncbiAcc,lrgSource,lrgSourceUrl,creationDate";

struct lrg *lrgLoad(char **row)
/* Load a lrg from row fetched with select * from lrg
 * from database.  Dispose of this with lrgFree(). */
{
struct lrg *ret;

AllocVar(ret);
ret->blockCount = sqlSigned(row[9]);
ret->chrom = cloneString(row[0]);
ret->chromStart = sqlUnsigned(row[1]);
ret->chromEnd = sqlUnsigned(row[2]);
ret->name = cloneString(row[3]);
ret->score = sqlUnsigned(row[4]);
safecpy(ret->strand, sizeof(ret->strand), row[5]);
ret->thickStart = sqlUnsigned(row[6]);
ret->thickEnd = sqlUnsigned(row[7]);
ret->reserved = sqlUnsigned(row[8]);
{
int sizeOne;
sqlSignedDynamicArray(row[10], &ret->blockSizes, &sizeOne);
assert(sizeOne == ret->blockCount);
}
{
int sizeOne;
sqlSignedDynamicArray(row[11], &ret->chromStarts, &sizeOne);
assert(sizeOne == ret->blockCount);
}
ret->mismatches = cloneString(row[12]);
ret->indels = cloneString(row[13]);
ret->lrgSize = sqlUnsigned(row[14]);
ret->hgncId = sqlSigned(row[15]);
ret->hgncSymbol = cloneString(row[16]);
ret->ncbiAcc = cloneString(row[17]);
ret->lrgSource = cloneString(row[18]);
ret->lrgSourceUrl = cloneString(row[19]);
ret->creationDate = cloneString(row[20]);
return ret;
}

struct lrg *lrgLoadAll(char *fileName) 
/* Load all lrg from a whitespace-separated file.
 * Dispose of this with lrgFreeList(). */
{
struct lrg *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[21];

while (lineFileRow(lf, row))
    {
    el = lrgLoad(row);
    slAddHead(&list, el);
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}

struct lrg *lrgLoadAllByChar(char *fileName, char chopper) 
/* Load all lrg from a chopper separated file.
 * Dispose of this with lrgFreeList(). */
{
struct lrg *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[21];

while (lineFileNextCharRow(lf, chopper, row, ArraySize(row)))
    {
    el = lrgLoad(row);
    slAddHead(&list, el);
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}

struct lrg *lrgCommaIn(char **pS, struct lrg *ret)
/* Create a lrg out of a comma separated string. 
 * This will fill in ret if non-null, otherwise will
 * return a new lrg */
{
char *s = *pS;

if (ret == NULL)
    AllocVar(ret);
ret->chrom = sqlStringComma(&s);
ret->chromStart = sqlUnsignedComma(&s);
ret->chromEnd = sqlUnsignedComma(&s);
ret->name = sqlStringComma(&s);
ret->score = sqlUnsignedComma(&s);
sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand));
ret->thickStart = sqlUnsignedComma(&s);
ret->thickEnd = sqlUnsignedComma(&s);
ret->reserved = sqlUnsignedComma(&s);
ret->blockCount = sqlSignedComma(&s);
{
int i;
s = sqlEatChar(s, '{');
AllocArray(ret->blockSizes, ret->blockCount);
for (i=0; i<ret->blockCount; ++i)
    {
    ret->blockSizes[i] = sqlSignedComma(&s);
    }
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
}
{
int i;
s = sqlEatChar(s, '{');
AllocArray(ret->chromStarts, ret->blockCount);
for (i=0; i<ret->blockCount; ++i)
    {
    ret->chromStarts[i] = sqlSignedComma(&s);
    }
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
}
ret->mismatches = sqlStringComma(&s);
ret->indels = sqlStringComma(&s);
ret->lrgSize = sqlUnsignedComma(&s);
ret->hgncId = sqlSignedComma(&s);
ret->hgncSymbol = sqlStringComma(&s);
ret->ncbiAcc = sqlStringComma(&s);
ret->lrgSource = sqlStringComma(&s);
ret->lrgSourceUrl = sqlStringComma(&s);
ret->creationDate = sqlStringComma(&s);
*pS = s;
return ret;
}

void lrgFree(struct lrg **pEl)
/* Free a single dynamically allocated lrg such as created
 * with lrgLoad(). */
{
struct lrg *el;

if ((el = *pEl) == NULL) return;
freeMem(el->chrom);
freeMem(el->name);
freeMem(el->blockSizes);
freeMem(el->chromStarts);
freeMem(el->mismatches);
freeMem(el->indels);
freeMem(el->hgncSymbol);
freeMem(el->ncbiAcc);
freeMem(el->lrgSource);
freeMem(el->lrgSourceUrl);
freeMem(el->creationDate);
freez(pEl);
}

void lrgFreeList(struct lrg **pList)
/* Free a list of dynamically allocated lrg's */
{
struct lrg *el, *next;

for (el = *pList; el != NULL; el = next)
    {
    next = el->next;
    lrgFree(&el);
    }
*pList = NULL;
}

void lrgOutput(struct lrg *el, FILE *f, char sep, char lastSep) 
/* Print out lrg.  Separate fields with sep. Follow last field with lastSep. */
{
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->chrom);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%u", el->chromStart);
fputc(sep,f);
fprintf(f, "%u", el->chromEnd);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->name);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%u", el->score);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->strand);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%u", el->thickStart);
fputc(sep,f);
fprintf(f, "%u", el->thickEnd);
fputc(sep,f);
fprintf(f, "%u", el->reserved);
fputc(sep,f);
fprintf(f, "%d", el->blockCount);
fputc(sep,f);
{
int i;
if (sep == ',') fputc('{',f);
for (i=0; i<el->blockCount; ++i)
    {
    fprintf(f, "%d", el->blockSizes[i]);
    fputc(',', f);
    }
if (sep == ',') fputc('}',f);
}
fputc(sep,f);
{
int i;
if (sep == ',') fputc('{',f);
for (i=0; i<el->blockCount; ++i)
    {
    fprintf(f, "%d", el->chromStarts[i]);
    fputc(',', f);
    }
if (sep == ',') fputc('}',f);
}
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->mismatches);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->indels);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%u", el->lrgSize);
fputc(sep,f);
fprintf(f, "%d", el->hgncId);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->hgncSymbol);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->ncbiAcc);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->lrgSource);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->lrgSourceUrl);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->creationDate);
if (sep == ',') fputc('"',f);
fputc(lastSep,f);
}

/* -------------------------------- End autoSql Generated Code -------------------------------- */

#include "psl.h"
#include "hdb.h"

static struct lrgDiff *lrgDiffNew(uint chromStart, uint chromEnd, char *chromSeq,
				  uint lrgStart, uint lrgEnd, char *lrgSeq)
/* Alloc, populate and return a new lrgDiff. */
{
struct lrgDiff *diff;
AllocVar(diff);
diff->chromStart = chromStart;
diff->chromEnd = chromEnd;
diff->chromSeq = chromSeq;
diff->lrgStart = lrgStart;
diff->lrgEnd = lrgEnd;
diff->lrgSeq = lrgSeq;
return diff;
}

static struct lrgDiff *lrgParseDiffs(struct lrg *lrg, char *diffStr)
/* Parse diffStr, i.e. a comma-separated sequence of colon-separated elements.
 * Each colon-separated element has chromStart, lrgStart, chromSeq and lrgSeq.
 * Return a list of lrgDiffs. */
{
struct lrgDiff *diffList = NULL;
struct slName *diffEl, *diffStrList = slNameListFromString(diffStr, ',');
for (diffEl = diffStrList;  diffEl != NULL;  diffEl = diffEl->next)
    {
    int expectedCount = 4;
    char *words[expectedCount+1];
    int wordCount = chopByChar(cloneString(diffEl->name), ':', words, ArraySize(words));
    if (wordCount != expectedCount)
	errAbort("lrgParseMismatches: expected %d :-sep words, got %d", expectedCount, wordCount);
    uint chromStart = atoll(words[0]) + lrg->chromStart;
    uint lrgStart = atoll(words[1]);
    char *chromSeq = words[2];
    char *lrgSeq = words[3];
    uint chromEnd = sameString(chromSeq, "-") ? chromStart : chromStart + strlen(chromSeq);
    uint lrgEnd = sameString(lrgSeq, "-") ? lrgStart : lrgStart + strlen(lrgSeq);
    struct lrgDiff *diff = lrgDiffNew(chromStart, chromEnd, chromSeq, lrgStart, lrgEnd, lrgSeq);
    slAddHead(&diffList, diff);
    }
slNameFreeList(&diffStrList);
slReverse(&diffList);
return diffList;
}

struct lrgDiff *lrgParseMismatches(struct lrg *lrg)
/* Parse lrg->mismatches and return a list of lrgDiffs. */
{
return lrgParseDiffs(lrg, lrg->mismatches);
}

struct lrgDiff *lrgParseIndels(struct lrg *lrg)
/* Parse lrg->indels and return a list of lrgDiffs. */
{
return lrgParseDiffs(lrg, lrg->indels);
}

void lrgDiffFreeList(struct lrgDiff **pDiff)
/* Free up a list of parsed diffs. */
{
if (pDiff == NULL || *pDiff == NULL)
    return;
struct lrgDiff *diff;
for (diff = *pDiff;  diff != NULL;  diff = diff->next)
    {
    freeMem(diff->chromSeq);
    freeMem(diff->lrgSeq);
    }
slFreeList(pDiff);
}

static boolean lrgNameParseRange(char *lrgName, int lrgSize, char *buf, size_t bufSize,
                                 int *retStart, int *retEnd)
/* If lrgName is of the format name:start-end, write the name into buf, set retStart and retEnd
 * and return TRUE.  Otherwise leave inputs alone and return FALSE. */
{
if (strchr(lrgName, ':'))
    {
    // Parse lrgStart and lrgEnd out of name field (LRG_*:start-end)
    safecpy(buf, bufSize, lrgName);
    char *p = strchr(buf, ':');
    // Chop at : (after name)
    *p++ = '\0';
    char *num = p;
    p = strchr(num, '-');
    if (p == NULL)
        errAbort("Error parsing LRG name:start-end '%s' -- no '-' following ':'", lrgName);
    *p++ = '\0';
    *retStart = atoi(num) - 1;
    num = p;
    *retEnd = atoi(num);
    if (*retStart < 0 || *retStart >= *retEnd || *retEnd < 0 || *retEnd > lrgSize)
        errAbort("lrgNameParseRange: got bad coords (%d, %d) from '%s' size %d",
                 *retStart, *retEnd, lrgName, lrgSize);
    return TRUE;
    }
return FALSE;
}

static int calcBlockSize(uint nextTStart, uint nextQStart, struct psl *psl, int blockIx)
{
int tLen = nextTStart - psl->tStarts[blockIx];
int qLen = nextQStart - psl->qStarts[blockIx];
return min(tLen, qLen);
}

struct psl *lrgToPsl(struct lrg *lrg, uint chromSize)
/* Use lrg's mismatches and indels to make a PSL. */
{
struct lrgDiff *mismatches = lrgParseMismatches(lrg), *indels = lrgParseIndels(lrg);
int lrgStart = 0, lrgEnd = lrg->lrgSize;
char *lrgName = lrg->name;
char lrgNameBuf[strlen(lrgName)+1];
if (lrgNameParseRange(lrgName, lrg->lrgSize, lrgNameBuf, sizeof(lrgNameBuf), &lrgStart, &lrgEnd))
    lrgName = lrgNameBuf;
int blockCount = slCount(indels) + 1;
unsigned opts = 0;
struct psl *psl = pslNew(lrgName, lrg->lrgSize, lrgStart, lrgEnd,
                         lrg->chrom, chromSize, lrg->chromStart, lrg->chromEnd,
                         lrg->strand, blockCount, opts);
psl->blockCount = blockCount;
psl->misMatch = slCount(mismatches);
boolean isRc = (lrg->strand[0] == '-');
// Translate gap coords from indels into block coords:
psl->qStarts[0] = isRc ? (lrg->lrgSize - lrgEnd) : lrgStart;
psl->tStarts[0] = lrg->chromStart;
int alignedBaseCount = 0;
int blockIx = 1;
struct lrgDiff *diff;
for (diff = indels;  diff != NULL;  diff = diff->next)
    {
    // now we know the size of the previous block:
    int lrgStartPosStrand = isRc ? (lrg->lrgSize - diff->lrgEnd) : diff->lrgStart;
    // Insertion on t or q, or both?
    int lrgLen = diff->lrgEnd - diff->lrgStart;
    int refLen = diff->chromEnd - diff->chromStart;
    if (refLen == 0 || lrgLen == 0) // insertion only on t or q
        {
        if (refLen == 0)
            {
            psl->qNumInsert++;
            psl->qBaseInsert += (lrgLen - refLen);
            }
        else if (lrgLen == 0)
            {
            psl->tNumInsert++;
            psl->tBaseInsert += (refLen - lrgLen);
            }
        psl->qStarts[blockIx] = lrgStartPosStrand + lrgLen;
        psl->tStarts[blockIx] = diff->chromEnd;
        }
    else // double sided insertion
        {
        psl->tNumInsert++;
        psl->tBaseInsert += refLen;
        psl->qNumInsert++;
        psl->qBaseInsert += lrgLen;
        // the qStart needs to account for the lrg insertion AND the prev block
        psl->qStarts[blockIx] = diff->lrgStart + lrgLen;
        }
    psl->tStarts[blockIx] = diff->chromEnd;
    psl->blockSizes[blockIx-1] = calcBlockSize(diff->chromStart, lrgStartPosStrand, psl, blockIx-1);
    alignedBaseCount += psl->blockSizes[blockIx-1];
    blockIx++;
    }
// size of last block:
psl->blockSizes[blockIx-1] = calcBlockSize(lrg->chromEnd, lrg->lrgSize, psl, blockIx-1);
alignedBaseCount += psl->blockSizes[blockIx-1];
if (blockIx != psl->blockCount)
    errAbort("lrgToPsl: expected %d blocks but somehow ended up with %d", psl->blockCount,
	     blockIx);
psl->match = alignedBaseCount - psl->misMatch;
return psl;
}

static int sumInsertions(struct lrgDiff *indels)
/* Return the sum of insertions of lrg into reference sequence, ignoring deletions.
 * That is the max headroom that we will need while shifting bases around. */
{
int sumIns = 0;
struct lrgDiff *diff;
for (diff = indels;  diff != NULL;  diff = diff->next)
    sumIns += (diff->lrgEnd - diff->lrgStart);
return sumIns;
}

struct dnaSeq *lrgReconstructSequence(struct lrg *lrg, char *db)
/* Use genomic sequence, lrg->mismatches and lrg->indels to reconstruct LRG sequence */
{
struct lrgDiff *diff, *indels = lrgParseIndels(lrg);
int refSize = lrg->chromEnd - lrg->chromStart;
// Fetch genomic sequence:
struct dnaSeq *lrgSeq = hDnaFromSeq(db, lrg->chrom, lrg->chromStart, lrg->chromEnd, dnaUpper);
boolean isRc = (lrg->strand[0] == '-');
if (isRc)
    reverseComplement(lrgSeq->dna, refSize);
// If this was only a partial mapping of the LRG, pad N's at the beginning and/or end to get
// the correct size.
int lrgStart = 0, lrgEnd = lrg->lrgSize;
char lrgNameBuf[strlen(lrg->name)+1];
lrgNameParseRange(lrg->name, lrg->lrgSize, lrgNameBuf, sizeof(lrgNameBuf), &lrgStart, &lrgEnd);
// Replace lrgSeq->dna with a larger version that has room for genomic sequence plus inserted bases
// as well as possible padding at start and end for partial LRG mappings
int addedBases = sumInsertions(indels);
int paddedSize = refSize + lrgStart + (lrg->lrgSize - lrgEnd) + addedBases + 1;
char *lrgSeqDna = needMem(paddedSize);
if (lrgStart > 0)
    memset(lrgSeqDna, 'N', lrgStart);
// Copy in the genomic data
safencpy(lrgSeqDna+lrgStart, paddedSize-lrgStart, lrgSeq->dna, refSize);
refSize += lrgStart;
// If the LRG has inserted bases, zero out some extra bytes after genomic seq
if (addedBases)
    zeroBytes(lrgSeqDna+refSize, addedBases);
freeMem(lrgSeq->dna);
lrgSeq->dna = lrgSeqDna;
// Go through indels backwards w.r.t. LRG so we move sequence to the right
// while not changing coords to the left:
if (!isRc)
    slReverse(&indels);
for (diff = indels;  diff != NULL;  diff = diff->next)
    {
    int lrgLen = diff->lrgEnd - diff->lrgStart;
    int refLen = diff->chromEnd - diff->chromStart;
    int refStart = diff->chromStart - lrg->chromStart + lrgStart;
    if (isRc)
	refStart = refSize - (refStart + refLen);
    if (lrgLen > refLen)
	{
	// LRG inserts sequence: shift the rest of the sequence to the right
	int insSize = lrgLen - refLen;
	int moveSize = refSize + addedBases - refStart - insSize;
	memmove(lrgSeqDna+refStart+insSize, lrgSeqDna+refStart, moveSize);
	}
    else
	{
	// LRG deletes sequence: shift the rest of the sequence to the left
	int delSize = refLen - lrgLen;
	int moveSize = refSize + addedBases - refStart - delSize + 1; // '\0' at end too
	memmove(lrgSeqDna+refStart, lrgSeqDna+refStart+delSize, moveSize);
	}
    // If there is LRG sequence, copy it in.
    if (lrgLen > 0)
	memcpy(lrgSeqDna+refStart, diff->lrgSeq, lrgLen);
    }
int len = strlen(lrgSeqDna);
if (len != lrgEnd)
    errAbort("lrgReconstructSequence: expected sequence length to be lrgEnd %d, got %d",
             lrgEnd, len);
// Now that indels have been resolved, use LRG coords to substitute mismatches:
struct lrgDiff *mismatches = lrgParseMismatches(lrg);
for (diff = mismatches;  diff != NULL;  diff = diff->next)
    {
    if (lrgSeqDna[diff->lrgStart] != diff->chromSeq[0])
	errAbort("Mismatch with unexpected assembly sequence: expected '%c' on %c strand, "
		 "got '%c'", diff->chromSeq[0], lrg->strand[0], lrgSeqDna[diff->lrgStart]);
    int size = diff->lrgEnd - diff->lrgStart;
    memcpy(lrgSeqDna+diff->lrgStart, diff->lrgSeq, size);
    }
// Add padding N's at end if applicable
if (lrgEnd < lrg->lrgSize)
    {
    memset(lrgSeqDna+lrgEnd, 'N', lrg->lrgSize - lrgEnd);
    lrgSeqDna[lrg->lrgSize] = '\0';
    }
len = strlen(lrgSeqDna);
if (len != lrg->lrgSize)
    errAbort("lrgReconstructSequence: Error applying LRG indels for '%s': expected size %d, got %d",
             lrg->name, lrg->lrgSize, len);
lrgSeq->size = lrg->lrgSize;
return lrgSeq;
}

