/* dnaMotif.c was originally generated by the autoSql program, which also 
 * generated dnaMotif.h and dnaMotif.sql.  This module links the database and
 * the RAM representation of objects. */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "common.h"
#include "linefile.h"
#include "sqlList.h"
#include "dystring.h"
#include "dnaMotif.h"
#include "portable.h"
#include "pipeline.h"


struct dnaMotif *dnaMotifCommaIn(char **pS, struct dnaMotif *ret)
/* Create a dnaMotif out of a comma separated string. 
 * This will fill in ret if non-null, otherwise will
 * return a new dnaMotif */
{
char *s = *pS;
int i;

if (ret == NULL)
    AllocVar(ret);
ret->name = sqlStringComma(&s);
ret->columnCount = sqlSignedComma(&s);
s = sqlEatChar(s, '{');
AllocArray(ret->aProb, ret->columnCount);
for (i=0; i<ret->columnCount; ++i)
    {
    ret->aProb[i] = sqlSignedComma(&s);
    }
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
s = sqlEatChar(s, '{');
AllocArray(ret->cProb, ret->columnCount);
for (i=0; i<ret->columnCount; ++i)
    {
    ret->cProb[i] = sqlSignedComma(&s);
    }
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
s = sqlEatChar(s, '{');
AllocArray(ret->gProb, ret->columnCount);
for (i=0; i<ret->columnCount; ++i)
    {
    ret->gProb[i] = sqlSignedComma(&s);
    }
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
s = sqlEatChar(s, '{');
AllocArray(ret->tProb, ret->columnCount);
for (i=0; i<ret->columnCount; ++i)
    {
    ret->tProb[i] = sqlSignedComma(&s);
    }
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
*pS = s;
return ret;
}

void dnaMotifFree(struct dnaMotif **pEl)
/* Free a single dynamically allocated dnaMotif such as created
 * with dnaMotifLoad(). */
{
struct dnaMotif *el;

if ((el = *pEl) == NULL) return;
freeMem(el->name);
freeMem(el->aProb);
freeMem(el->cProb);
freeMem(el->gProb);
freeMem(el->tProb);
freez(pEl);
}

void dnaMotifFreeList(struct dnaMotif **pList)
/* Free a list of dynamically allocated dnaMotif's */
{
struct dnaMotif *el, *next;

for (el = *pList; el != NULL; el = next)
    {
    next = el->next;
    dnaMotifFree(&el);
    }
*pList = NULL;
}

void dnaMotifOutput(struct dnaMotif *el, FILE *f, char sep, char lastSep) 
/* Print out dnaMotif.  Separate fields with sep. Follow last field with lastSep. */
{
int i;
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->name);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%d", el->columnCount);
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->columnCount; ++i)
    {
    fprintf(f, "%f", el->aProb[i]);
    fputc(',', f);
    }
if (sep == ',') fputc('}',f);
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->columnCount; ++i)
    {
    fprintf(f, "%f", el->cProb[i]);
    fputc(',', f);
    }
if (sep == ',') fputc('}',f);
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->columnCount; ++i)
    {
    fprintf(f, "%f", el->gProb[i]);
    fputc(',', f);
    }
if (sep == ',') fputc('}',f);
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->columnCount; ++i)
    {
    fprintf(f, "%f", el->tProb[i]);
    fputc(',', f);
    }
if (sep == ',') fputc('}',f);
fputc(lastSep,f);
}

float dnaMotifSequenceProb(struct dnaMotif *motif, DNA *dna)
/* Return probability of dna according to motif.  Make sure
 * motif is probabalistic (with call to dnaMotifMakeProbabalistic
 * if you're not sure) before calling this. */
{
float p = 1.0;
int i;
for (i=0; i<motif->columnCount; ++i)
    {
    switch (dna[i])
        {
	case 'a':
	case 'A':
	    p *= motif->aProb[i];
	    break;
	case 'c':
	case 'C':
	    p *= motif->cProb[i];
	    break;
	case 'g':
	case 'G':
	    p *= motif->gProb[i];
	    break;
	case 't':
	case 'T':
	    p *= motif->tProb[i];
	    break;
	case 0:
	    warn("dna shorter than motif");
	    internalErr();
	    break;
	default:
	    p *= 0.25;
	    break;
	}
    }
return p;
}

static float dnaMotifSequenceProbWithMark0(struct dnaMotif *motif, DNA *dna, double mark0[5])
{
float p = 1.0;
int i;
for (i=0; i<motif->columnCount; ++i)
    {
    int val = ntVal[(int) dna[i]] + 1;
    p *= (mark0[val]/mark0[0]);
    }
return p;
}

static float dnaMotifSequenceProbLog(struct dnaMotif *motif, DNA *dna)
{
float p = 0;
int i;
for (i=0; i<motif->columnCount; ++i)
    {
    switch (dna[i])
        {
	case 'a':
	case 'A':
	    p += motif->aProb[i];
	    break;
	case 'c':
	case 'C':
	    p += motif->cProb[i];
	    break;
	case 'g':
	case 'G':
	    p += motif->gProb[i];
	    break;
	case 't':
	case 'T':
	    p += motif->tProb[i];
	    break;
	case 0:
	    warn("dna shorter than motif");
	    internalErr();
	    break;
	default:
	    p += logBase2(0.25);
	    break;
	}
    }
return p;
}

char dnaMotifBestStrand(struct dnaMotif *motif, DNA *dna)
/* Figure out which strand of DNA is better for probabalistic motif. */
{
float fScore, rScore;
fScore = dnaMotifSequenceProb(motif, dna);
reverseComplement(dna, motif->columnCount);
rScore = dnaMotifSequenceProb(motif, dna);
reverseComplement(dna, motif->columnCount);
if (fScore >= rScore)
    return '+';
else
    return '-';
}

double dnaMotifBitScore(struct dnaMotif *motif, DNA *dna)
/* Return logBase2-odds score of dna given a probabalistic motif. */
{
double p = dnaMotifSequenceProb(motif, dna);
double q = pow(0.25, motif->columnCount);
double odds = p/q;
return logBase2(odds);
}

double dnaMotifBitScoreWithMark0Bg(struct dnaMotif *motif, DNA *dna, double mark0[5])
/* Return logBase2-odds score of dna given a probabalistic motif and using a 0-order markov model for the background. */
{
double p = dnaMotifSequenceProb(motif, dna);
double q = dnaMotifSequenceProbWithMark0(motif, dna, mark0);
double odds = p/q;
return logBase2(odds);
}

double dnaMotifBitScoreWithMarkovBg(struct dnaMotif *motif, DNA *dna, double mark2[5][5][5])
/* Return logBase2-odds score of dna given a probabalistic motif using a 2nd-order markov model for the background.
   motif and markd2 must be in log2 format.
   Seq must contain an extra two bases at the front (i.e. we start scoring from dna + 2). */
{
double p = dnaMotifSequenceProbLog(motif, dna + 2);
double q = 0;
int i, index;
// XXXX assert somehow that length(dna) == motif->columnCount + 2?
dnaUtilOpen();
for(i = 0, index = 2; i < motif->columnCount; i++, index++)
    {
    double tmp = mark2[ntVal5[(int) dna[index - 2]] + 1][ntVal5[(int) dna[index - 1]] + 1][ntVal5[(int) dna[index]] + 1];
#if 0
    char buf[4];
    safencpy(buf, sizeof(buf), dna + index - 2, 3);
    fprintf(stderr, "%s: tmp: %.6f\n", buf, tmp);
#endif
    q += tmp;
    }
return p - q;
}


void dnaMotifMakeLog2(struct dnaMotif *motif)
{
int i;
for (i = 0; i < motif->columnCount; i++)
    {
    motif->aProb[i] = logBase2(motif->aProb[i]);
    motif->cProb[i] = logBase2(motif->cProb[i]);
    motif->gProb[i] = logBase2(motif->gProb[i]);
    motif->tProb[i] = logBase2(motif->tProb[i]);
    }
}

void dnaMotifNormalize(struct dnaMotif *motif)
/* Make all columns of motif sum to one. */
{
int i;
for (i=0; i<motif->columnCount; ++i)
    {
    float sum = motif->aProb[i] + motif->cProb[i] + motif->gProb[i] + motif->tProb[i];
    if (sum < 0)
        errAbort("%s has negative numbers, perhaps it's score not probability based", 
		motif->name);
    if (sum == 0)
         motif->aProb[i] = motif->cProb[i] = motif->gProb[i] = motif->tProb[i] = 0.25;
    motif->aProb[i] /= sum;
    motif->cProb[i] /= sum;
    motif->gProb[i] /= sum;
    motif->tProb[i] /= sum;
    }
}

boolean dnaMotifIsScoreBased(struct dnaMotif *motif)
/* Return TRUE if dnaMotif is score-based (which we decide by
 * the presense of negative values. */
{
int i;
for (i=0; i<motif->columnCount; ++i)
    {
    if (motif->aProb[i] < 0) return TRUE;
    if (motif->cProb[i] < 0) return TRUE;
    if (motif->gProb[i] < 0) return TRUE;
    if (motif->tProb[i] < 0) return TRUE;
    }
return FALSE;
}

void dnaMotifScoreToProb(struct dnaMotif *motif)
/* Convert motif that is log-odds score based to motif
 * that is probability based.  This assumes that the
 * background distribution is simple: 25% for each base */
{
int i;
for (i=0; i<motif->columnCount; ++i)
    {
    motif->aProb[i] = exp(motif->aProb[i]);
    motif->cProb[i] = exp(motif->cProb[i]);
    motif->gProb[i] = exp(motif->gProb[i]);
    motif->tProb[i] = exp(motif->tProb[i]);
    }
dnaMotifNormalize(motif);
}

void dnaMotifMakeProbabalistic(struct dnaMotif *motif)
/* Change motif, which may be score or count based, to 
 * probabalistic one, where each column adds to 1.0 */
{
if (dnaMotifIsScoreBased(motif))
    dnaMotifScoreToProb(motif);
else
    dnaMotifNormalize(motif);
}

static void printProbRow(FILE *f, char *label, float *p, int pCount)
/* Print one row of a probability profile. */
{
int i;
fprintf(f, "%s ", label);
for (i=0; i < pCount; ++i)
    fprintf(f, "%5.2f ", p[i]);
printf("\n");
}

void dnaMotifPrintProb(struct dnaMotif *motif, FILE *f)
/* Print DNA motif probabilities. */
{
printProbRow(f, "A", motif->aProb, motif->columnCount);
printProbRow(f, "C", motif->cProb, motif->columnCount);
printProbRow(f, "G", motif->gProb, motif->columnCount);
printProbRow(f, "T", motif->tProb, motif->columnCount);
}


static double u1(double prob)
/* Calculate partial uncertainty for one base. */
{
if (prob == 0)
    return 0;
return prob * logBase2(prob);
}

static double uncertainty(struct dnaMotif *motif, int pos)
/* Return the uncertainty at pos of motif.  This corresponds
 * to the H function in logo.pm */
{
return -( u1(motif->aProb[pos]) + u1(motif->cProb[pos])
	+ u1(motif->gProb[pos]) +u1(motif->tProb[pos]) );
}

double dnaMotifBitsOfInfo(struct dnaMotif *motif, int pos)
/* Return bits of information at position. */
{
if (pos > motif->columnCount || pos < 0)
    internalErr();
return 2 - uncertainty(motif, pos);
}

struct letterProb
/* A letter tied to a probability. */
    {
    struct letterProb *next;
    double prob;	/* Probability for this letter. */
    char letter;	/* The letter (upper case) */
    };

static struct letterProb *letterProbNew(char letter, double prob)
/* Make a new letterProb. */
{
struct letterProb *lp;
AllocVar(lp);
lp->letter = letter;
lp->prob = prob;
return lp;
}

static int letterProbCmp(const void *va, const void *vb)
/* Compare to sort highest probability first. */
{
const struct letterProb *a = *((struct letterProb **)va);
const struct letterProb *b = *((struct letterProb **)vb);
double dif = a->prob - b->prob;
if (dif < 0)
   return -1;
else if (dif > 0)
   return 1;
else
   return 0;
}

static void addBaseProb(struct letterProb **pList, char letter, double prob)
/* If prob > 0 add letterProb to list. */
{
if (prob > 0)
    {
    struct letterProb *lp = letterProbNew(letter, prob);
    slAddHead(pList, lp);
    }
}

static struct letterProb *letterProbFromMotifColumn(struct dnaMotif *motif, int pos)
/* Return letterProb list corresponding to column of motif. */
{
struct letterProb *lpList = NULL;
addBaseProb(&lpList, 'A', motif->aProb[pos]);
addBaseProb(&lpList, 'C', motif->cProb[pos]);
addBaseProb(&lpList, 'G', motif->gProb[pos]);
addBaseProb(&lpList, 'T', motif->tProb[pos]);
slSort(&lpList, letterProbCmp);
return lpList;
}

static void psOneColumn(struct dnaMotif *motif, int pos,
    double xStart, double yStart, double width, double totalHeight,
    FILE *f)
/* Write one column of logo to postScript. */
{
struct letterProb *lp, *lpList = letterProbFromMotifColumn(motif, pos);
double x = xStart, y = yStart, w = width, h;
for (lp = lpList; lp != NULL; lp = lp->next)
    {
    h = totalHeight * lp->prob;
    if (h >= 1.0)
	{
	fprintf(f, "%cColor ", tolower(lp->letter));
	fprintf(f, "%3.2f ", x);
	fprintf(f, "%3.2f ", y);
	fprintf(f, "%3.2f ", x + w);
	fprintf(f, "%3.2f ", y + h);
	fprintf(f, "(%c) textInBox\n", lp->letter);
	}
    y += h;
    }
fprintf(f, "\n");
slFreeList(&lpList);
}

static void dnaMotifDims(struct dnaMotif *motif, double widthPerBase, double height, 
	int *retWidth, int *retHeight)
/* Calculate dimensions of motif when rendered. */
{
static int widthFudgeFactor = 2, heightFudgeFactor = 2;
*retWidth = ceil(widthPerBase * motif->columnCount) + widthFudgeFactor;
*retHeight = ceil(height) + heightFudgeFactor;
}

void dnaMotifToLogoPs2(struct dnaMotif *motif, double widthPerBase, double height, 
                       double minHeight, char *fileName)
/* Write logo corresponding to motif to postScript file, with extended options. minHeight
 * is the minimum height that is excluded from information content scaling.  This allows
 * something to show up in columns with very little information content.  Setting this
 * to be the same as height creates an frequency-based logo.
 */
{
FILE *f = mustOpen(fileName, "w");
int i;
int xStart = 0;
int w, h;
char *s = 
#include "dnaMotif.pss"
;

dnaMotifDims(motif, widthPerBase, height, &w, &h);
fprintf(f, "%%!PS-Adobe-3.1 EPSF-3.0\n");
fprintf(f, "%%%%BoundingBox: 0 0 %d %d\n\n", w, h);
fprintf(f, "%s", s);

fprintf(f, "%s", "% Start of code for this specific logo\n");

for (i=0; i<motif->columnCount; ++i)
    {
    double infoScale = dnaMotifBitsOfInfo(motif, i)/2.0;
    // only scale part beyond minHeight
    double useHeight = minHeight + (infoScale * (height - minHeight));
    if (useHeight > height)
        useHeight = height;
    psOneColumn(motif, i, xStart, 0, widthPerBase, useHeight, f);
    xStart += widthPerBase;
    }
fprintf(f, "showpage\n");
carefulClose(&f);
}

void dnaMotifToLogoPs(struct dnaMotif *motif, double widthPerBase, double height, char *fileName)
/* Write logo corresponding to motif to postScript file. */
{
dnaMotifToLogoPs2(motif, widthPerBase, height, 0.0, fileName);
}

void dnaMotifToLogoPng(
	struct dnaMotif *motif,	/* Motif to draw. */
	double widthPerBase, 	/* Width of each base. */
	double height, 		/* Max height. */
	char *gsExe, 		/* ghostscript executable, NULL for default */
	char *tempDir,          /* temp dir , NULL for default */
	char *fileName)		/* output png file name. */
/* Write logo corresponding to motif to png file. */
{
char *psName = rTempName(tempDir, "dnaMotif", ".ps");
int w, h;
int sysRet;

if (gsExe == NULL) gsExe = "gs";
if (tempDir == NULL) tempDir = "/tmp";
dnaMotifToLogoPs(motif, widthPerBase, height, psName);
dnaMotifDims(motif, widthPerBase, height, &w, &h);
char geoBuf[1024];
safef(geoBuf, sizeof geoBuf, "-g%dx%d", w, h);
char outputBuf[1024];
safef(outputBuf, sizeof outputBuf, "-sOutputFile=%s", fileName);
char *pipeCmd[] = {gsExe, "-sDEVICE=png16m", outputBuf, "-dBATCH","-dNOPAUSE","-q", geoBuf, psName, NULL};
struct pipeline *pl = pipelineOpen1(pipeCmd, pipelineWrite | pipelineNoAbort, "/dev/null", NULL, 0);
sysRet = pipelineWait(pl);
if (sysRet != 0)
    errAbort("System call returned %d for:\n  %s", sysRet, pipelineDesc(pl));

remove(psName);
}

void dnaMotifToLogoPsW(struct dnaMotif *motif, double widthPerBase, double width, double height, char *fileName)
/* Write logo corresponding to motif to postScript file. */
{
FILE *f = mustOpen(fileName, "w");
int i;
int xStart = 0;
int w, h;
char *s = 
#include "dnaMotif.pss"
;

dnaMotifDims(motif, widthPerBase, height, &w, &h);
fprintf(f, "%%!PS-Adobe-3.1 EPSF-3.0\n");
fprintf(f, "%%%%BoundingBox: 0 0 %d %d\n\n", w, h);
fprintf(f, "%s", s);

fprintf(f, "%s", "% Start of code for this specific logo\n");

for (i=0; i<motif->columnCount; ++i)
    {
    double infoScale = 0.9 ; 
    xStart = i * width / motif->columnCount;
    psOneColumn(motif, i, xStart, 0, widthPerBase, infoScale * height, f);
    }
fprintf(f, "showpage\n");
carefulClose(&f);
}

void dnaMotifToLogoPGM(
	struct dnaMotif *motif,	/* Motif to draw. */
	double widthPerBase, 	/* Width of each base. */
	double width, 		/* Max width. */
	double height, 		/* Max height. */
	char *gsExe, 		/* ghostscript executable, NULL for default */
	char *tempDir,          /* temp dir , NULL for default */
	char *fileName)		/* output png file name. */
/* Write logo corresponding to motif to pgm file. */
{
char *psName = rTempName(tempDir, "dnaMotif", ".ps");
int w, h;
int sysRet;

if (gsExe == NULL) gsExe = "gs";
if (tempDir == NULL) tempDir = "/tmp";
dnaMotifToLogoPsW(motif, widthPerBase, width, height, psName);
dnaMotifDims(motif, widthPerBase, height, &w, &h);
char geoBuf[1024];
safef(geoBuf, sizeof geoBuf, "-g%dx%d ", (int) ceil(width), h);
char outputBuf[1024];
safef(outputBuf, sizeof outputBuf, "-sOutputFile=%s", fileName);
char *pipeCmd[] = {gsExe, "-sDEVICE=pgmraw", outputBuf, "-dBATCH","-dNOPAUSE","-q", geoBuf, psName, NULL};
struct pipeline *pl = pipelineOpen1(pipeCmd, pipelineWrite | pipelineNoAbort, "/dev/null", NULL, 0);
sysRet = pipelineWait(pl);

if (sysRet != 0)
    errAbort("System call returned %d for:\n  %s", sysRet, pipelineDesc(pl));

remove(psName);
}
