/* intronEnds - Gather stats on intron ends.. */

/* Copyright (C) 2013 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"
#include "dystring.h"
#include "hash.h"
#include "cheapcgi.h"
#include "dnautil.h"
#include "dnaseq.h"
#include "jksql.h"
#include "hdb.h"
#include "genePred.h"


char *chromName;

void usage()
/* Explain usage and exit. */
{
errAbort(
  "intronEnds - Gather stats on intron ends.\n"
  "usage:\n"
  "   intronEnds database geneTable\n"
  "options:\n"
  "   -chrom=chrN - restrict to a particular chromosome\n"
  "   -withUtr - restrict to genes with UTR regions\n"
  );
}

void intronEnds(char *database, char *table)
/* intronEnds - Gather stats on intron ends.. */
{
struct dyString *query = dyStringNew(1024);
struct sqlConnection *conn;
struct sqlResult *sr;
char **row;
struct genePred *gp;
int total = 0;
int gtag = 0;
int gcag = 0;
int atac = 0;
int ctac = 0;
DNA ends[4];
int exonIx, txStart;
struct dnaSeq *seq;
int rowOffset;
char strand;

rowOffset = hOffsetPastBin(database, NULL, table);
conn = hAllocConn(database);
sqlDyStringPrintf(query, "select * from %s", table);
if (chromName != NULL)
    sqlDyStringPrintf(query, " where chrom = '%s'", chromName);
if (cgiBoolean("withUtr"))
    {
    sqlDyStringPrintf(query, " %s txStart != cdsStart", 
        (chromName == NULL ? "where" : "and"));
    }
sr = sqlGetResult(conn, query->string);
while ((row = sqlNextRow(sr)) != NULL)
    {
    gp = genePredLoad(row+rowOffset);
    strand = gp->strand[0];
    txStart = gp->txStart;
    seq = hDnaFromSeq(database, gp->chrom, txStart, gp->txEnd, dnaLower);
    for (exonIx=1; exonIx < gp->exonCount; ++exonIx)
        {
	++total;
	memcpy(ends, seq->dna + gp->exonEnds[exonIx-1] - txStart, 2);
	memcpy(ends+2, seq->dna + gp->exonStarts[exonIx] - txStart - 2, 2);
	if (strand == '-')
	    reverseComplement(ends, 4);
	if (ends[0] == 'g' && ends[1] == 't' && ends[2] == 'a' && ends[3] == 'g')
	   ++gtag;
	if (ends[0] == 'g' && ends[1] == 'c' && ends[2] == 'a' && ends[3] == 'g')
	   ++gcag;
	if (ends[0] == 'a' && ends[1] == 't' && ends[2] == 'a' && ends[3] == 'c')
	   ++atac;
	if (ends[0] == 'c' && ends[1] == 't' && ends[2] == 'a' && ends[3] == 'c')
	   ++ctac;
	}
    freeDnaSeq(&seq);
    genePredFree(&gp);
    }
sqlFreeResult(&sr);
hFreeConn(&conn);
printf("gt/ag %d (%4.2f)\n", gtag, 100.0*gtag/total);
printf("gc/ag %d (%4.2f)\n", gcag, 100.0*gcag/total);
printf("at/ac %d (%4.2f)\n", atac, 100.0*atac/total);
printf("ct/ac %d (%4.2f)\n", ctac, 100.0*ctac/total);
printf("Total %d\n", total);
}

int main(int argc, char *argv[])
/* Process command line. */
{
cgiSpoof(&argc, argv);
chromName = cgiOptionalString("chrom");
if (argc != 3)
    usage();
intronEnds(argv[1], argv[2]);
return 0;
}
