/* dnaMarkov - stuff to build 1st, 2nd, 3rd, and coding
 * 3rd degree Markov models for DNA. */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "common.h"
#include "dnautil.h"
#include "dnaseq.h"
#include "slog.h"
#include "dnaMarkov.h"


void dnaMark0(struct dnaSeq *seqList, double mark0[5], int slogMark0[5])
/* Figure out frequency of bases in input.  Results go into
 * mark0 and optionally in scaled log form into slogMark0.
 * Order is N, T, C, A, G.  (TCAG is our normal order) */
{
struct dnaSeq *seq;
int histo[4];
int oneHisto[4];
double total;
int i;
double *freq = mark0+1;

zeroBytes(histo, sizeof(histo));
for (seq = seqList; seq != NULL; seq = seq->next)
    {
    dnaBaseHistogram(seq->dna, seq->size, oneHisto);
    for (i=0; i<4; ++i)
        histo[i] += oneHisto[i];
    }
total = histo[0] + histo[1] + histo[2] + histo[3];
freq[-1] = 1.0;
for (i=0; i<4; ++i)
    freq[i] = (double)histo[i] / total;
if (slogMark0 != NULL)
    {
    int *slogFreq = slogMark0 + 1;
    slogFreq[-1] = 0;
    for (i=0; i<4; ++i)
	slogFreq[i] = slog(freq[i]);
    }
}


void dnaMark1(struct dnaSeq *seqList, double mark0[5], int slogMark0[5], 
	double mark1[5][5], int slogMark1[5][5])
/* Make up 1st order Markov model - probability that one nucleotide
 * will follow another. Input is sequence and 0th order Markov models.
 * Output is first order Markov model. slogMark1 can be NULL. */
{
struct dnaSeq *seq;
DNA *dna, *endDna;
int i,j;
int histo[5][5];
int hist1[5];

zeroBytes(histo, sizeof(histo));
zeroBytes(hist1, sizeof(hist1));
for (seq = seqList; seq != NULL; seq = seq->next)
    {
    dna = seq->dna;
    endDna = dna + seq->size-1;
    for (;dna < endDna; ++dna)
        {
        i = ntVal[(int)dna[0]];
        j = ntVal[(int)dna[1]];
        hist1[i+1] += 1;
        histo[i+1][j+1] += 1;
        }
    }
for (i=0; i<5; ++i)
    {
    for (j=0; j<5; ++j)
        {
        double mark1Val;
        int matVal = histo[i][j] + 1;
        mark1Val = ((double)matVal)/(hist1[i]+5);
        mark1[i][j] = mark1Val;
	if (slogMark1 != NULL)
	    slogMark1[i][j] = slog(mark1Val);
        }
    }
for (i=0; i<5; ++i)
    {
    mark1[i][0] = 1;
    mark1[0][i] = mark0[i];
    if (slogMark1 != NULL)
	{
	slogMark1[i][0] = 0;
	slogMark1[0][i] = slogMark0[i];
	}
    }
}

void dnaMarkTriple(struct dnaSeq *seqList, 
    double mark0[5], int slogMark0[5],
    double mark1[5][5], int slogMark1[5][5],
    double mark2[5][5][5], int slogMark2[5][5][5],
    int offset, int advance, int earlyEnd)
/* Make up a table of how the probability of a nucleotide depends on the previous two.
 * Depending on offset and advance parameters this could either be a straight 2nd order
 * Markov model, or a model for a particular coding frame. */
{
struct dnaSeq *seq;
DNA *dna, *endDna;
int i,j,k;
int histo[5][5][5];
int hist2[5][5];
int total = 0;
zeroBytes(histo, sizeof(histo));
zeroBytes(hist2, sizeof(hist2));
for (seq = seqList; seq != NULL; seq = seq->next)
    {
    dna = seq->dna;
    endDna = dna + seq->size - earlyEnd - 2;
    dna += offset;
    for (;dna < endDna; dna += advance)
        {
        i = ntVal[(int)dna[0]];
        j = ntVal[(int)dna[1]];
        k = ntVal[(int)dna[2]];
        hist2[i+1][j+1] += 1;
        histo[i+1][j+1][k+1] += 1;
        total += 1;
        }
    }
for (i=0; i<5; ++i)
    {
    for (j=0; j<5; ++j)
        {
        for (k=0; k<5; ++k)
            {
            double markVal;
            int matVal = histo[i][j][k]+1;
            if (i == 0 || j == 0 || k == 0)
                {
                if (k == 0)
                    {
                    mark2[i][j][k] = 1;
		    if (slogMark2 != NULL)
			slogMark2[i][j][k] = 0;
                    }
                else if (j == 0)
                    {
                    mark2[i][j][k] = mark0[k];
		    if (slogMark2 != NULL)
			slogMark2[i][j][k] = slogMark0[k];
                    }
                else if (i == 0)
                    {
                    mark2[i][j][k] = mark1[j][k];
		    if (slogMark2 != NULL)
			slogMark2[i][j][k] = slogMark1[j][k];
                    }
                }
            else
                {
                markVal = ((double)matVal)/(hist2[i][j]+5);
                mark2[i][j][k] = markVal;
		if (slogMark2 != NULL)
		    slogMark2[i][j][k] = slog(markVal);
                }
            }
        }
    }
}

void dnaMark2(struct dnaSeq *seqList, double mark0[5], int slogMark0[5],
	double mark1[5][5], int slogMark1[5][5],
	double mark2[5][5][5], int slogMark2[5][5][5])
/* Make up 1st order Markov model - probability that one nucleotide
 * will follow the previous two. */
{
dnaMarkTriple(seqList, mark0, slogMark0, mark1, slogMark1, 
	mark2, slogMark2, 0, 1, 0);
}

#define SIG 10

char *dnaMark2Serialize(double mark2[5][5][5])
// serialize a 2nd order markov model
{
int i, j, k;
int offset = 0;
char *buf = NULL;
int bufLen = 5*5*5 * (SIG + 3) + 1;
buf = needMem(bufLen);
for(i = 0; i < 5; i++)
    for(j = 0; j < 5; j++)
        for(k = 0; k < 5; k++)
            {
            if(offset)
                {
                sprintf(buf + offset, ";%1.*f", SIG, mark2[i][j][k]);
                offset += (SIG + 3);
                }
            else
                {
                sprintf(buf + offset, "%1.*f", SIG, mark2[i][j][k]);
                offset += (SIG + 2);
                }
            }
buf[offset] = 0;
return buf;
}

void dnaMark2Deserialize(char *buf, double mark2[5][5][5])
// deserialize a 2nd order markov model
{
int i, j, k;
int offset = 0;
for(i = 0; i < 5; i++)
    for(j = 0; j < 5; j++)
        for(k = 0; k < 5; k++)
            {
            float f;
            if(offset)
                {
                sscanf(buf + offset, ";%f", &f);
                mark2[i][j][k] = f;
                offset += (SIG + 3);
                }
            else
                {
                sscanf(buf + offset, "%f", &f);
                mark2[i][j][k] = f;
                offset += (SIG + 2);
                }
            }
}

void dnaMarkMakeLog2(double mark2[5][5][5])
// convert a 2nd-order markov array to log2
{
int i, j, k;
for(i = 0; i < 5; i++)
    for(j = 0; j < 5; j++)
        for(k = 0; k < 5; k++)
            mark2[i][j][k] = logBase2(mark2[i][j][k]);
}