/* findToFixBedGraphViewLimits - Calculate min/max/mean/std and use them to set viewLimits on bedGraphs described in input.ra. */

/* Copyright (C) 2013 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "ra.h"
#include "hmmstats.h"
#include "jksql.h"


void usage()
/* Explain usage and exit. */
{
errAbort(
  "findToFixBedGraphViewLimits - Calculate min/max/mean/std and use them to set viewLimits on bedGraphs described in input.ra\n"
  "usage:\n"
  "   findToFixBedGraphViewLimits input.ra output.ra\n"
  "options:\n"
  "   -xxx=XXX\n"
  );
}

static struct optionSpec options[] = {
   {NULL, 0},
};

static char *mustFindVal(struct slPair *list, char *tag, struct lineFile *lf)
/* Look for tag in list and return value.  If none complain and abort. */
{
char *val = slPairFindVal(list, tag);
if (val == NULL)
    errAbort("missing required %s tag near line %d of %s", tag, lf->lineIx, lf->fileName);
return val;
}

void findToFixBedGraphViewLimits(char *input, char *output)
/* findToFixBedGraphViewLimits - Calculate min/max/mean/std and use them to set viewLimits on bedGraphs described in input.ra. */
{
struct lineFile *lf = lineFileOpen(input, TRUE);
FILE *f = mustOpen(output, "w");
struct slPair *el, *list;
while ((list = raNextRecordAsSlPairList(lf)) != NULL)
    {
    /* Find required fields for calcs. */
    char *db = mustFindVal(list, "db", lf);
    char *track = mustFindVal(list, "track", lf);
    char *type = cloneString(mustFindVal(list, "type", lf));

    /* Parse out type value, which should be "bedGraph 4" and put the 4 or whatever other number
     * in dataFieldIndex. */
    char *typeWords[3];
    int typeWordCount = chopLine(type, typeWords);
    if (typeWordCount != 2 || !sameString(typeWords[0], "bedGraph"))
           errAbort("Not well formed bedGraph type line %d of %s", lf->lineIx, lf->fileName);
    int dataFieldIndex = sqlUnsigned(typeWords[1]);
    freez(&type);

    /* Figure out field corresponding to dataFieldIndex. */
    struct sqlConnection *conn = sqlConnect(db);
    struct slName *fieldList = sqlFieldNames(conn, track);
    struct slName *pastBin = fieldList;
    if (sameString(pastBin->name, "bin"))
         pastBin = pastBin->next;
    struct slName *fieldName = slElementFromIx(pastBin, dataFieldIndex - 1);
    if (fieldName == NULL)
         errAbort("%s doesn't have enough fields", track);
    char *field = fieldName->name;
    assert(sqlFieldIndex(conn, track, field) >= 0);

    /* Print reassuring status message */
    verbose(1, "%s.%s has %d elements.  Data field is %s\n", db, track, sqlTableSize(conn, track), field);
         
    /* Get min/max dataValues in fields.  Do it ourselves rather than using SQL min/max because sometimes
     * the data field is a name column.... */
    char query[512];
    sqlSafef(query, sizeof(query), "select chromStart,chromEnd,%s from %s", field, track);
    struct sqlResult *sr = sqlGetResult(conn, query);
    char **row;
    row = sqlNextRow(sr);
    assert(row != NULL);
    int size = sqlUnsigned(row[1]) - sqlUnsigned(row[0]);
    double val = sqlDouble(row[2]);
    double minLimit = val, maxLimit = val;
    double sumData = val*size;
    double sumSquares = val*val*size;
    bits64 n = size;
    while ((row = sqlNextRow(sr)) != NULL)
        {
	val = sqlDouble(row[2]);
	size = sqlUnsigned(row[1]) - sqlUnsigned(row[0]);
	if (val < minLimit) minLimit = val;
	if (val > maxLimit) maxLimit = val;
	sumData += val*size;
	sumSquares += val*val*size;
	n += size;
	}
    sqlFreeResult(&sr);
    double mean = sumData/n;
    double std = calcStdFromSums(sumData, sumSquares, n);
    verbose(1, "    min=%g max=%g mean=%g std=%g\n",  minLimit, maxLimit, mean, std);

    double viewMax = mean + 6*std;
    if (viewMax > maxLimit) viewMax = maxLimit;
    double viewMin = mean - 6*std;
    if (viewMin < minLimit) viewMin = minLimit;

    /* Output original table plus new minLimit/maxLimit. */
    for (el = list; el != NULL; el = el->next)
	fprintf(f, "%s %s\n", el->name, (char *)el->val);
    fprintf(f, "new_minLimit %g\n", minLimit);
    fprintf(f, "new_maxLimit %g\n", maxLimit);
    fprintf(f, "new_mean %g\n", mean);
    fprintf(f, "new_std %g\n", std);
    fprintf(f, "new_viewLimits %g:%g\n", viewMin, viewMax);
    fprintf(f, "\n");

    sqlDisconnect(&conn);
    slFreeList(&fieldList);
    slPairFreeValsAndList(&list);
    }
lineFileClose(&lf);
carefulClose(&f);
}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 3)
    usage();
findToFixBedGraphViewLimits(argv[1], argv[2]);
return 0;
}
