/* gfPcrLib - Routines to help do in silico PCR.
 * Copyright 2004-2005 Jim Kent.  All rights reserved. */

#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "dystring.h"
#include "fa.h"
#include "net.h"
#include "genoFind.h"
#include "sqlNum.h"
#include "gfInternal.h"
#include "gfPcrLib.h"


/**** Input and Output Handlers *****/

void gfPcrOutputFree(struct gfPcrOutput **pOut)
/* Free up a gfPcrOutput structure. */
{
struct gfPcrOutput *out = *pOut;
if (pOut != NULL)
    {
    freeMem(out->name);
    freeMem(out->fPrimer);
    freeMem(out->rPrimer);
    freeMem(out->seqName);
    freeMem(out->dna);
    freez(pOut);
    }
}

void gfPcrOutputFreeList(struct gfPcrOutput **pList)
/* Free up a list of gfPcrOutput structures. */
{
struct gfPcrOutput *el, *next;

for (el = *pList; el != NULL; el = next)
    {
    next = el->next;
    gfPcrOutputFree(&el);
    }
*pList = NULL;
}

void gfPcrInputStaticLoad(char **row, struct gfPcrInput *ret)
/* Load a row from gfPcrInput table into ret.  The contents of ret will
 * be replaced at the next call to this function. */
{
ret->name = row[0];
ret->fPrimer = row[1];
ret->rPrimer = row[2];
}

struct gfPcrInput *gfPcrInputLoad(char **row)
/* Load a gfPcrInput from row fetched with select * from gfPcrInput
 * from database.  Dispose of this with gfPcrInputFree(). */
{
struct gfPcrInput *ret;

AllocVar(ret);
ret->name = cloneString(row[0]);
ret->fPrimer = cloneString(row[1]);
ret->rPrimer = cloneString(row[2]);
return ret;
}

struct gfPcrInput *gfPcrInputLoadAll(char *fileName) 
/* Load all gfPcrInput from a whitespace-separated file.
 * Dispose of this with gfPcrInputFreeList(). */
{
struct gfPcrInput *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[3];

while (lineFileRow(lf, row))
    {
    el = gfPcrInputLoad(row);
    slAddHead(&list, el);
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}


void gfPcrInputFree(struct gfPcrInput **pEl)
/* Free a single dynamically allocated gfPcrInput such as created
 * with gfPcrInputLoad(). */
{
struct gfPcrInput *el;

if ((el = *pEl) == NULL) return;
freeMem(el->name);
freeMem(el->fPrimer);
freeMem(el->rPrimer);
freez(pEl);
}

void gfPcrInputFreeList(struct gfPcrInput **pList)
/* Free a list of dynamically allocated gfPcrInput's */
{
struct gfPcrInput *el, *next;

for (el = *pList; el != NULL; el = next)
    {
    next = el->next;
    gfPcrInputFree(&el);
    }
*pList = NULL;
}



/**** Handle Refinement (after Index has given us approximate position) *****/

static boolean goodMatch(char *a, char *b, int size)
/* Return TRUE if there are 2 matches per mismatch. */
{
int score = 0, i;
for (i=0; i<size; ++i)
   {
   if (a[i] == b[i])
       score += 1;
   else
       score -= 2;
   }
return score >= 0;
}

static void upperMatch(char *dna, char *primer, int size)
/* Uppercase DNA where it matches primer. */
{
int i;
for (i=0; i<size; ++i)
    {
    if (dna[i] == primer[i])
        dna[i] = toupper(dna[i]);
    }
}

static void outputFa(struct gfPcrOutput *out, FILE *f, char *url)
/* Output match in fasta format. */
{
int fPrimerSize = strlen(out->fPrimer);
int rPrimerSize = strlen(out->rPrimer);
int productSize = out->rPos - out->fPos;
char *dna = cloneStringZ(out->dna, productSize);
char *rrPrimer = cloneString(out->rPrimer);
char *ffPrimer = cloneString(out->fPrimer);
struct dyString *faLabel = dyStringNew(0);
char *name = out->name;

/* Create fasta header with position, possibly empty name, and upper cased primers with position optionally hyperlinked. */
touppers(rrPrimer);
touppers(ffPrimer);
if (url != NULL)
    {
    dyStringAppend(faLabel, "<A HREF=\"");
    dyStringPrintf(faLabel, url, out->seqName, out->fPos+1, out->rPos);
    dyStringAppend(faLabel, "\">");
    }
dyStringPrintf(faLabel, "%s:%d%c%d", 
	out->seqName, out->fPos+1, out->strand, out->rPos);
if (url != NULL)
    dyStringAppend(faLabel, "</A>");
if (name != NULL)
    dyStringPrintf(faLabel, " %s", name);
dyStringPrintf(faLabel, " %dbp %s %s",
	out->rPos - out->fPos, ffPrimer, rrPrimer);

/* Flip reverse primer to be in same direction and case as sequence. */
reverseComplement(rrPrimer, rPrimerSize);
tolowers(rrPrimer);

/* Capitalize where sequence and primer match, and write out sequence. */
upperMatch(dna, out->fPrimer, fPrimerSize);
upperMatch(dna + productSize - rPrimerSize, rrPrimer, rPrimerSize);
faWriteNext(f, faLabel->string, dna, productSize);

/* Clean up. */
freez(&dna);
freez(&rrPrimer);
freez(&ffPrimer);
dyStringFree(&faLabel);
}

static int countMatch(char *a, char *b, int size)
/* Count number of letters in a, b that match */
{
int count = 0, i;
for (i=0; i<size; ++i)
    if (a[i] == b[i])
       ++count;
return count;
}

static int getBedScore(struct gfPcrOutput *out)
/* Return a score in BED range (0-1000). */
{
int size = out->rPos - out->fPos;
int fPrimerSize = strlen(out->fPrimer);
int rPrimerSize = strlen(out->rPrimer);
int match = countMatch(out->dna, out->fPrimer, fPrimerSize);
assert(size > 0);
reverseComplement(out->rPrimer, rPrimerSize);
match += countMatch(out->dna + size - rPrimerSize, out->rPrimer, rPrimerSize);
reverseComplement(out->rPrimer, rPrimerSize);
return round(1000.0 * match / (double)(fPrimerSize + rPrimerSize));
}

static void outputBed(struct gfPcrOutput *out, FILE *f, char *url)
/* Output match in BED 6 format. */
{
char *name = out->name;
if (name == NULL) name = "n/a";
fprintf(f, "%s\t%d\t%d\t", out->seqName, out->fPos, out->rPos);
fprintf(f, "%s\t", name);
fprintf(f, "%d\t", getBedScore(out));
fprintf(f, "%c\n", out->strand);
}

static void outputBed12(struct gfPcrOutput *out, FILE *f, char *url)
/* Output match in BED 12 format (a block for each primer). */
{
int fPrimerSize = strlen(out->fPrimer);
int rPrimerSize = strlen(out->rPrimer);
char *name = out->name;
if (name == NULL) name = "n/a";
fprintf(f, "%s\t%d\t%d\t%s\t", out->seqName, out->fPos, out->rPos, name);
fprintf(f, "%d\t%c\t", getBedScore(out), out->strand);
fprintf(f, "%d\t%d\t", out->fPos, out->rPos);
fprintf(f, "0\t2\t%d,%d,\t0,%d\n", fPrimerSize, rPrimerSize,
	out->rPos - out->fPos - rPrimerSize);
}

static void outputPsl(struct gfPcrOutput *out, FILE *f, char *url)
/* Output match in PSL format. */
{
int match;
int size = out->rPos - out->fPos;
int fPrimerSize = strlen(out->fPrimer);
int rPrimerSize = strlen(out->rPrimer);
int bothSize = fPrimerSize + rPrimerSize;
int gapSize = size - bothSize;
char *name = out->name;
if (name == NULL)
    {
    struct dyString *dy = dyStringNew(0);
    dyStringPrintf(dy, "%s_%s", out->fPrimer, out->rPrimer);
    name = dyStringCannibalize(&dy);;
    }
match = countMatch(out->dna, out->fPrimer, fPrimerSize);
reverseComplement(out->rPrimer, rPrimerSize);
assert(size > 0);
match += countMatch(out->dna + size - rPrimerSize, out->rPrimer, rPrimerSize);
reverseComplement(out->rPrimer, rPrimerSize);

fprintf(f, "%d\t", match);
fprintf(f, "%d\t", bothSize - match);
fprintf(f, "0\t0\t");	/* repMatch, nCount. */
fprintf(f, "1\t%d\t", gapSize);   /* qNumInsert, qBaseInsert */
fprintf(f, "1\t%d\t", gapSize);   /* tNumInsert, tBaseInsert */
fprintf(f, "%c\t", out->strand);
fprintf(f, "%s\t", name);
fprintf(f, "%d\t", size);
fprintf(f, "0\t%d\t", size);	/* qStart, qEnd */
fprintf(f, "%s\t%d\t", out->seqName, out->seqSize);
fprintf(f, "%d\t%d\t", out->fPos, out->rPos);
fprintf(f, "2\t");
if (out->strand == '+')
    {
    fprintf(f, "%d,%d,\t", fPrimerSize, rPrimerSize);
    fprintf(f, "%d,%d,\t", 0,size - rPrimerSize);
    fprintf(f, "%d,%d,\n", out->fPos, out->rPos - rPrimerSize);
    }
else
    {
    fprintf(f, "%d,%d,\t", rPrimerSize, fPrimerSize);
    fprintf(f, "%d,%d,\t", 0,size - fPrimerSize);
    fprintf(f, "%d,%d,\n", out->fPos, out->rPos - fPrimerSize);
    }
}

typedef void (*outFunction)(struct gfPcrOutput *out, FILE *f, char *url) ;

static outFunction gfPcrOutputFunction(char *outType)
/* Return a pointer to output function for the given output type. */
{
outFunction output = NULL;
if (sameWord(outType, "fa"))
    output = outputFa;
else if (sameWord(outType, "bed"))
    output = outputBed;
else if (sameWord(outType, "bed12"))
    output = outputBed12;
else if (sameWord(outType, "psl"))
    output = outputPsl;
else
    errAbort("Unrecognized pcr output type %s", outType);
return output;
}

void gfPcrOutputWriteList(struct gfPcrOutput *outList, char *outType, 
	char *url, FILE *f)
/* Write list of outputs in specified format (either "fa" or "bed") 
 * to file.  If url is non-null it should be a printf formatted
 * string that takes %s, %d, %d for chromosome, start, end. */
{
outFunction output = gfPcrOutputFunction(outType);
struct gfPcrOutput *out;
for (out = outList; out != NULL; out = out->next)
    {
    output(out, f, url);
    }
}

void gfPcrOutputWriteOne(struct gfPcrOutput *out, char *outType, 
	char *url, FILE *f)
/* Write a single output in specified format (either "fa" or "bed") 
 * to file.  If url is non-null it should be a printf formatted
 * string that takes %s, %d, %d for chromosome, start, end. */
{
outFunction output = gfPcrOutputFunction(outType);
output(out, f, url);
}

void gfPcrOutputWriteAll(struct gfPcrOutput *outList, 
	char *outType, char *url, char *fileName)
/* Create file of outputs in specified format (either "fa" or "bed") 
 * to file.  If url is non-null it should be a printf formatted
 * string that takes %s, %d, %d for chromosome, start, end. */
{
FILE *f = mustOpen(fileName, "a");
gfPcrOutputWriteList(outList, outType, url, f);
carefulClose(&f);
}

static void pcrLocalStrand(char *pcrName, 
	struct dnaSeq *seq,  int seqOffset, char *seqName, int seqSize,
	int maxSize, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize,
	int minPerfect, int minGood,
	char strand, struct gfPcrOutput **pOutList)
/* Do detailed PCR scan on one strand and report results. */
{
char *fDna = seq->dna, *rDna;
char *endDna = fDna + seq->size;
char *fpPerfect = fPrimer + fPrimerSize - minPerfect;
char *rpPerfect = rPrimer;
char *fpMatch, *rpMatch;
int goodSize = minGood - minPerfect;
struct gfPcrOutput  *out;
int matchSize;

reverseComplement(rPrimer, rPrimerSize);
for (;;)
    {
    fpMatch = memMatch(fpPerfect, minPerfect, fDna, endDna - fDna);
    if (fpMatch == NULL)
        break;
    rDna = fpMatch;
    for (;;)
        {
	int fPos, rPos, fGoodPos, rGoodPos, fTrim, rTrim;
	rpMatch = memMatch(rpPerfect, minPerfect, rDna, endDna - rDna);
	if (rpMatch == NULL)
	    break;
	fPos = fpMatch - (fPrimerSize - minPerfect) - seq->dna;
	rPos = rpMatch + rPrimerSize - seq->dna;

	/* deal with primers dangling off either end of the target sequence */
	if (fPos < 0)
	    {
	    fTrim = 0 - fPos;
	    fPos = 0;
	    }
	else
	    fTrim = 0;

	if (rPos > seq->size)
	    {
	    rTrim = rPos - seq->size;
	    rPos = seq->size;
	    }
	else
	    rTrim = 0;

	fGoodPos = fpMatch - goodSize - seq->dna;
	rGoodPos = rpMatch + minPerfect - seq->dna;

	fGoodPos = max(fGoodPos, 0);
	rGoodPos = min(rGoodPos, seq->size);

	matchSize = rPos - fPos;

	int fGoodSize = fpMatch - (seq->dna + fGoodPos);
	int rGoodSize = rPos - rGoodPos;
	if (rGoodSize >= goodSize && fGoodSize >= goodSize)
	    {
	    if (rPos >= 0 && fPos >= 0 && fPos < seq->size && matchSize <= maxSize)
		{
		/* If matches well enough create output record. */
		if (goodMatch(seq->dna + fGoodPos, fpPerfect - fGoodSize, fGoodSize) &&
		    goodMatch(seq->dna + rGoodPos, rpPerfect + minPerfect, rGoodSize))
		    {
		    /* Truncate the copy of the primers going into the out-> record using rTrim and fTrim if needed. */
		    AllocVar(out);
		    out->name  = cloneString(pcrName);
		    out->fPrimer = cloneString(fPrimer + fTrim);
		    out->rPrimer = cloneStringZ(rPrimer, rPrimerSize - rTrim);
		    reverseComplement(out->rPrimer, rPrimerSize - rTrim);
		    out->seqName = cloneString(seqName);
		    out->seqSize = seqSize;
		    out->fPos = fPos + seqOffset;
		    out->rPos = rPos + seqOffset;
		    out->strand = strand;
		    out->dna = cloneStringZ(seq->dna + fPos, matchSize);
		    

		    /* Dealing with the strand of darkness....  Here we just have to swap
		     * forward and reverse primers to flip strands, and reverse complement
		     * the amplified area.. */
		    if (strand == '-')
			{
			char *temp = out->rPrimer;
			out->rPrimer = out->fPrimer;
			out->fPrimer = temp;
			reverseComplement(out->dna, matchSize);
			}
		    slAddHead(pOutList, out);
		    }
		}
	    }
	rDna = rpMatch+1;
	}
    fDna = fpMatch + 1;
    }
reverseComplement(rPrimer, rPrimerSize);
}

void gfPcrLocal(char *pcrName, 
	struct dnaSeq *seq, int seqOffset, char *seqName, int seqSize,
	int maxSize, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize,
	int minPerfect, int minGood, char strand, struct gfPcrOutput **pOutList)
/* Do detailed PCR scan on DNA already loaded into memory and put results
 * (in reverse order) on *pOutList. */
{
/* For PCR primers reversing search strand just means switching
 * order of primers. */
if (strand == '-')
    pcrLocalStrand(pcrName, seq, seqOffset, seqName, seqSize, maxSize, 
	rPrimer, rPrimerSize, fPrimer, fPrimerSize, 
	minPerfect, minGood, strand, pOutList);
else
    pcrLocalStrand(pcrName, seq, seqOffset, seqName, seqSize, maxSize, 
	fPrimer, fPrimerSize, rPrimer, rPrimerSize, 
	minPerfect, minGood, strand, pOutList);
}

struct gfRange *gfPcrGetRanges(struct gfConnection *conn, char *fPrimer, char *rPrimer,
	int maxSize)
/* Query gfServer with primers and convert response to a list of gfRanges. */
{
char buf[4096];
struct gfRange *rangeList = NULL, *range;

/* Query server and put results into rangeList. */
gfBeginRequest(conn);
if (conn->isDynamic)
    safef(buf, sizeof(buf), "%spcr %s %s %s %s %d", gfSignature(), conn->genome, conn->genomeDataDir,
          fPrimer, rPrimer, maxSize);
else
    safef(buf, sizeof(buf), "%spcr %s %s %d", gfSignature(), fPrimer, rPrimer, maxSize);
mustWriteFd(conn->fd, buf, strlen(buf));
for (;;)
    {
    if (netGetString(conn->fd, buf) == NULL)
	break;
    if (sameString(buf, "end"))
	break;
    else if (startsWith("Error:", buf))
	errAbort("%s", buf);
    else
	{
	char *s = buf;
	char *name, *start, *end, *strand;
	name = nextWord(&s);
	start = nextWord(&s);
	end = nextWord(&s);
	strand = nextWord(&s);
	if (strand == NULL)
	    errAbort("Truncated gfServer response");
	AllocVar(range);
	range->tName = cloneString(name);
	range->tStart = atoi(start);
	range->tEnd = atoi(end);
	range->tStrand = strand[0];
	slAddHead(&rangeList, range);
	}
    }
gfEndRequest(conn);
slReverse(&rangeList);
return rangeList;
}

static void gfPcrOneViaNet(
	struct gfConnection *conn, char *seqDir,
	char *pcrName, char *fPrimer, char *rPrimer, int maxSize,
	int minPerfect, int minGood,
	struct hash *tFileCache, struct gfPcrOutput **pOutList)
/* Query gfServer for likely primer hits, load DNA to do detailed
 * examination, and output hits to head of *pOutList.. */
{
struct gfRange *rangeList = NULL, *range;
int fPrimerSize = strlen(fPrimer);
int rPrimerSize = strlen(rPrimer);
int maxPrimerSize = max(fPrimerSize, rPrimerSize);
int minPrimerSize = min(fPrimerSize, rPrimerSize);

tolowers(fPrimer);
tolowers(rPrimer);

/* Check for obvious user mistake. */
if (minPrimerSize < minGood || minPrimerSize < minPerfect)
    errAbort("Need longer primer in pair %s (%dbp) %s (%dbp).  Minimum size is %d\n",
	fPrimer, fPrimerSize, rPrimer, rPrimerSize, minGood);

/* Load ranges and do more detailed snooping. */
rangeList = gfPcrGetRanges(conn, fPrimer, rPrimer, maxSize);
for (range = rangeList; range != NULL; range = range->next)
    {
    int tSeqSize;
    char seqName[PATH_LEN];
    struct dnaSeq *seq = gfiExpandAndLoadCached(range,
	tFileCache, seqDir,  0, &tSeqSize, FALSE, FALSE, maxPrimerSize);
    gfiGetSeqName(range->tName, seqName, NULL);
    gfPcrLocal(pcrName, seq, range->tStart, seqName, tSeqSize, maxSize, 
	    fPrimer, fPrimerSize, rPrimer, rPrimerSize, 
	    minPerfect, minGood, range->tStrand, pOutList);
    dnaSeqFree(&seq);
    }
gfRangeFreeList(&rangeList);
}


struct gfPcrOutput *gfPcrViaNet(struct gfConnection *conn, char *seqDir, 
	struct gfPcrInput *inList, int maxSize, int minPerfect, int minGood)
/* Do PCRs using gfServer index, returning list of results. */
{
struct hash *tFileCache = gfFileCacheNew();
struct gfPcrInput *in;
struct gfPcrOutput *outList = NULL;

for (in = inList; in != NULL; in = in->next)
    {
    gfPcrOneViaNet(conn, seqDir,
    	in->name, in->fPrimer, in->rPrimer, maxSize,
	minPerfect, minGood,
    	tFileCache, &outList);
    }
gfFileCacheFree(&tFileCache);
slReverse(&outList);
return outList;
}

char *gfPcrMakePrimer(char *s)
/* Make primer (lowercased DNA) out of text.  Complain if
 * it is too short or too long. */
{
int size = dnaFilteredSize(s);
int realSize;
char *primer = needMem(size+1);
dnaFilter(s, primer);
realSize = size - countChars(primer, 'n');
if (realSize < 10 || realSize < size/2)
   errAbort("%s does not seem to be a good primer", s);
return primer;
}

