/* hgGetAnn - get chromosome annotation rows from database tables using
 * browser-style position specification.  */

/* Copyright (C) 2013 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "options.h"
#include "verbose.h"
#include "hdb.h"
#include "hgFind.h"
#include "jksql.h"
#include "cheapcgi.h"


void usage(char *msg)
/* Explain usage and exit.  msg is reported on the first line, followed by
 * the help text; the whole message is passed to errAbort. */
{
errAbort(
    "%s\n"
    "hgGetAnn - get chromosome annotation rows from database tables using\n"
    "           browser-style position specification.\n"
    "\n"
    "usage:\n"
    "   hgGetAnn [options] db table spec tabfile\n"
    "\n"
    "Get chromosome annotation rows from tables.  This takes a browser-style\n"
    "position specification and retrieves matching rows from the database\n"
    "table. Output is a tab-separated file with optional header.  The bin\n"
    "column of the table will not be included in the output. If the spec\n"
    "contains whitespace or shell meta-characters, it must be quoted.\n"
    "For split tables, the leading chrN_ should be omitted.  Use `mrna' or\n"
    "`est' for mRNAs/ESTs.  If spec is \"-\", then all rows are retrieved.\n"
    "This will even work for split tables.\n"
    "\n"
    "Options:\n"
    "   -colHeaders - Include column headers with leading #\n"
    "   -tsvHeaders - Include TSV style column headers\n"
    "   -keepBin - don't exclude bin column\n"
    "   -noMatchOk - don't generate an error if nothing is found\n"
    "   -noRandom - Exclude *_random chromosomes\n"
    "   -verbose=n - 2 is basic info, 3 prints positions found\n",
    msg);
}


/* command line */
/* Options accepted on the command line; all are boolean flags.  The table
 * ends with a {NULL, 0} entry and is handed to optionInit() in main(). */
static struct optionSpec optionSpec[] = {
    {"colHeaders", OPTION_BOOLEAN},
    {"tsvHeaders", OPTION_BOOLEAN},
    {"keepBin", OPTION_BOOLEAN},
    {"noMatchOk", OPTION_BOOLEAN},
    {"noRandom", OPTION_BOOLEAN},
    {NULL, 0}
};

/* Option flags; main() sets each one from optionExists() on the option of
 * the same name. */
boolean colHeaders; /* write a header line that starts with '#' */
boolean tsvHeaders; /* write a header line without the leading '#' */
boolean keepBin; /* leave the bin column in the output rows */
boolean noMatchOk; /* finding nothing is not treated as an error */
boolean noRandom; /* skip rows whose chrom name contains "_random" */

struct cart *cart = NULL; /* hgFind assumes this global */

void prIndent(int indent, char *format, ...)
/* printf-style output to stderr, preceded by two spaces for each level of
 * indent */
{
va_list ap;
int padWidth = 2*indent;

/* an empty string padded out to padWidth produces the leading blanks */
fprintf(stderr, "%*s", padWidth, "");

va_start(ap, format);
vfprintf(stderr, format, ap);
va_end(ap);
}

void printHgPos(int indent, struct hgPos *pos)
/* Write one hgPos to stderr as a single indented line: the name, then the
 * chrom range and the browser name when they are set */
{
char *chrom = pos->chrom;
char *browserName = pos->browserName;

prIndent(indent, "%s", pos->name);
if (chrom != NULL)
    {
    fprintf(stderr, "  %s:%d-%d", chrom, pos->chromStart, pos->chromEnd);
    }
if (browserName != NULL)
    {
    fprintf(stderr, "  %s", browserName);
    }
fputc('\n', stderr);
}

void printHgPosTable(int indent, struct hgPosTable *posTab)
/* Write a hgPosTable to stderr: its name and description, then each of its
 * positions one level further in */
{
struct hgPos *cur = posTab->posList;
int childIndent = indent+1;

prIndent(indent, "name: %s\n", posTab->name);
prIndent(indent, "desc: %s\n", posTab->description);
while (cur != NULL)
    {
    printHgPos(childIndent, cur);
    cur = cur->next;
    }
}

void printHgPositions(int indent, struct hgPositions *pos)
/* Write a hgPositions record to stderr: the query line at indent, with the
 * summary, the single position and the table list nested below it */
{
int inner = indent+1;    /* everything after the query line is one level in */
int deeper = inner+1;
char *aliasStr = "false";
struct hgPosTable *posTab;

if (pos->useAlias)
    aliasStr = "true";

prIndent(indent, "query: %s\n", pos->query);
prIndent(inner, "database: %s, posCount: %d, useAlias: %s\n",
         pos->database, pos->posCount, aliasStr);

if (pos->singlePos != NULL)
    {
    prIndent(inner, "singlePos:\n");
    printHgPos(deeper, pos->singlePos);
    /* a single position is expected to be a one-element list */
    assert(pos->singlePos->next == NULL);
    }

if (pos->tableList == NULL)
    return;
prIndent(inner, "tableList:\n");
for (posTab = pos->tableList; posTab != NULL; posTab = posTab->next)
    printHgPosTable(deeper, posTab);
}

void printPositionsList(struct hgPositions *positions)
/* Write a heading and then every record of a hgPositions list to stderr */
{
struct hgPositions *cur = positions;

fputs("Returned positions:\n", stderr);
while (cur != NULL)
    {
    printHgPositions(1, cur);
    cur = cur->next;
    }
}

boolean inclChrom(char *chrom)
/* Test if chromosome should be included.  When the noRandom option is set,
 * names containing "_random" are excluded; otherwise everything is included.
 * A NULL chrom returns TRUE. */
{
/* check for NULL before searching, strstr must not be handed a NULL string */
if (noRandom && (chrom != NULL))
    return (strstr(chrom, "_random") == NULL);
return TRUE;
}

int getResultCol(struct sqlResult *sr, char *colName)
/* Find the index of colName in a result set.  Returns -1 when colName is
 * NULL or empty; aborts if a name is given but is not in the result set. */
{
int colIx;

/* no column name means there is nothing to look up */
if ((colName == NULL) || (colName[0] == '\0'))
    return -1;

colIx = sqlFieldColumn(sr, colName);
if (colIx < 0)
    errAbort("field %s not in result set", colName);
return colIx;
}

int countFindMatches(struct hgPositions *positions)
/* Sum the posCount of every record in a hgPositions list */
{
struct hgPositions *cur = positions;
int total = 0;

while (cur != NULL)
    {
    total += cur->posCount;
    cur = cur->next;
    }
return total;
}

char *getTableName(char *chrom, struct hTableInfo *tableInfo)
/* Get the actual table name for a chrom: chrom_rootName for a split table,
 * otherwise just rootName.  Note: static return, overwritten by the next
 * call */
{
static char tableName[256];
char *root = tableInfo->rootName;

if (!tableInfo->isSplit)
    {
    safef(tableName, sizeof(tableName), "%s", root);
    }
else
    {
    safef(tableName, sizeof(tableName), "%s_%s", chrom, root);
    }
return tableName;
}

char *getTableDesc(struct hTableInfo *tableInfo)
/* Get a description of a table for use in error messages.  Split tables
 * are shown as chr*_rootName so it is clear a set of tables is meant.
 * Note: static return, overwritten by the next call */
{
static char tableDesc[256];
char *root = tableInfo->rootName;

if (!tableInfo->isSplit)
    {
    safef(tableDesc, sizeof(tableDesc), "%s", root);
    }
else
    {
    safef(tableDesc, sizeof(tableDesc), "chr*_%s", root);
    }
return tableDesc;
}

struct hgPositions* findAllChroms(char *db)
/* Build a hgPositions record holding one position per chromosome, each
 * running from 0 to the chromosome size */
{
struct hgPositions *allPositions;
struct hgPosTable *chromTab;
struct slName *name;

/* set up the hgPositions object, with "-" recorded as the query */
AllocVar(allPositions);
allPositions->query = cloneString("-");
allPositions->database = db;

/* one table entry, named chromInfo, carries all of the positions */
AllocVar(chromTab);
chromTab->name = "chromInfo";
chromTab->description = "all rows";
allPositions->tableList = chromTab;

name = hAllChromNames(db);
while (name != NULL)
    {
    struct hgPos *wholeChrom;
    AllocVar(wholeChrom);
    wholeChrom->chrom = name->name;
    wholeChrom->chromStart = 0;
    wholeChrom->chromEnd = hChromSize(db, wholeChrom->chrom);
    slAddHead(&chromTab->posList, wholeChrom);
    allPositions->posCount++;
    name = name->next;
    }
/* entries were added at the head, so put them back in name-list order */
slReverse(&chromTab->posList);
return allPositions;
}

struct hgPositions *findPositions(char *db, char *spec)
/* Turn a position spec into a hgPositions list.  A spec of "-" selects
 * every chromosome, anything else is looked up with hgPositionsFind.
 * Aborts when nothing matches, unless noMatchOk is set. */
{
struct hgPositions *found;
int matchCount;

verbose(2, "begin position query: %s\n", spec);

if (!sameString(spec, "-"))
    found = hgPositionsFind(db, spec, NULL, "hgGetAnn", NULL, FALSE, FALSE, NULL);
else
    found = findAllChroms(db);

matchCount = countFindMatches(found);
verbose(2, "end position query: %d matches\n", matchCount);
if (verboseLevel() >= 2)
    printPositionsList(found);

if ((matchCount == 0) && !noMatchOk)
    errAbort("Error: no matches to find query");
return found;
}

void checkTableFields(struct hTableInfo *tableInfo, boolean requireName)
/* Abort unless the table has chrom, start and end fields, and also a name
 * field when requireName is set.  An empty field string means the field
 * is missing. */
{
boolean missingRange;

if (tableInfo->chromField[0] == '\0')
    errAbort("Error: table %s doesn't have a chrom name field", getTableDesc(tableInfo));

missingRange = (tableInfo->startField[0] == '\0')
    || (tableInfo->endField[0] == '\0');
if (missingRange)
    errAbort("Error: table %s doesn't have a chromosome start or end fields", getTableDesc(tableInfo));

if (requireName)
    {
    if (tableInfo->nameField[0] == '\0')
        errAbort("Error: table %s doesn't have a name field", getTableDesc(tableInfo));
    }
}

void writeHeaders(char *db, FILE *outFh, struct hTableInfo *tableInfo)
/* Write the table's field names as one tab-separated header line, with a
 * leading '#' when colHeaders is set.  The field list is read from the
 * table for the default chromosome. */
{
char *headerTable = getTableName(hDefaultChrom(db), tableInfo);
struct sqlConnection *conn = hAllocConn(db);
struct slName *firstField, *cur;
char *sep = "";

if (colHeaders)
    fputc('#', outFh);

firstField = sqlListFields(conn, headerTable);
if (tableInfo->hasBin && !keepBin)
    {
    /* the first field is taken to be bin, start after it */
    firstField = firstField->next;
    }

cur = firstField;
while (cur != NULL)
    {
    /* sep is empty for the first field and a tab for the rest */
    fputs(sep, outFh);
    fputs(cur->name, outFh);
    sep = "\t";
    cur = cur->next;
    }
fputc('\n', outFh);
hFreeConn(&conn);
}

FILE* outputOpen(char *db, char *tabFile, struct hTableInfo *tableInfo)
/* open output file and write optional header */
{
FILE* outFh = mustOpen(tabFile, "w");

if (colHeaders || tsvHeaders)
    writeHeaders(db, outFh, tableInfo);

return outFh;
}

void outputRow(FILE *outFh, char **row, int numCols)
/* Output a row as one tab-separated line ending in a newline.  The row
 * should already be adjusted to include/exclude the bin column.  A column
 * whose pointer is NULL is written as an empty field. */
{
int i;
for (i = 0; i < numCols; i++)
    {
    if (i > 0)
        fputc('\t', outFh);
    /* fputs on a NULL pointer is undefined, so skip it.  Presumably a NULL
     * here is an SQL NULL value -- TODO confirm against sqlNextRow */
    if (row[i] != NULL)
        fputs(row[i], outFh);
    }
fputc('\n', outFh);
}

boolean sameStringNull(char *str1, char* str2)
/* Compare two strings for equality.  Either may be NULL, and a NULL never
 * matches anything, not even another NULL. */
{
boolean bothSet = (str1 != NULL) && (str2 != NULL);

if (!bothSet)
    return FALSE;
return sameString(str1, str2);
}

boolean inclRow(char **row, int chromCol, int nameCol, struct hgPos *nameSelect)
/* Decide if a row should be output.  The chrom test is applied when
 * chromCol is not negative.  The name test is applied when nameCol is not
 * negative and nameSelect is given; the row's name must then equal either
 * the name or the browserName of nameSelect. */
{
char *rowName;

if (chromCol >= 0)
    {
    if (!inclChrom(row[chromCol]))
        return FALSE;
    }

/* no name criteria, so the row passes */
if ((nameCol < 0) || (nameSelect == NULL))
    return TRUE;

rowName = row[nameCol];
if (sameStringNull(rowName, nameSelect->name))
    return TRUE;
if (sameStringNull(rowName, nameSelect->browserName))
    return TRUE;
return FALSE;
}

int outputRows(FILE *outFh, struct hTableInfo *tableInfo, struct sqlResult *sr,
               struct hgPos *nameSelect)
/* Read query results and output the rows that pass inclRow.  When
 * nameSelect is not NULL the row name is checked against it.  Returns the
 * number of rows written. */
{
int chromCol = getResultCol(sr, tableInfo->chromField);
int nameCol = (nameSelect != NULL) ? getResultCol(sr, tableInfo->nameField) : -1;
/* the first column is dropped only for a table with bin, and only when
 * keepBin was not requested */
int skipCols = (tableInfo->hasBin && !keepBin) ? 1 : 0;
int outCols = sqlCountColumns(sr) - skipCols;
int written = 0;
char **row;

for (row = sqlNextRow(sr); row != NULL; row = sqlNextRow(sr))
    {
    if (!inclRow(row, chromCol, nameCol, nameSelect))
        continue;
    outputRow(outFh, row+skipCols, outCols);
    written++;
    }
return written;
}

int outputByChromRange(char *db, FILE *outFh, struct hTableInfo *tableInfo, struct hgPos *pos)
/* output a hgPos by overlaping chrom range */
{
struct sqlConnection *conn = hAllocConn(db);
struct sqlResult *sr;
int rowOff = 0;
int rowCnt = 0;

/* start query */
if ((pos->chromStart == 0) && (pos->chromEnd >= hChromSize(db, pos->chrom)))
    {
    /* optimize full chromosome query */
    sr = hChromQuery(conn, tableInfo->rootName, pos->chrom, NULL, &rowOff);
    }
else
    {
    /* chromosome range query */
    sr = hRangeQuery(conn, tableInfo->rootName, pos->chrom, 
                     pos->chromStart, pos->chromEnd,
                     NULL, &rowOff);
    }

rowCnt = outputRows(outFh, tableInfo, sr, NULL);

sqlFreeResult(&sr);
hFreeConn(&conn);
return rowCnt;
}

int outputChromRangeHits(char *db, FILE *outFh, struct hTableInfo *tableInfo, struct hgPosTable *posTab)
/* Output for a chromosome ranges query, where posTab is the chromInfo
 * entry.  Each position is output by range; the table does not need a
 * name field.  Returns the number of rows written. */
{
struct hgPos *cur;
int total = 0;

checkTableFields(tableInfo, FALSE);
cur = posTab->posList;
while (cur != NULL)
    {
    total += outputByChromRange(db, outFh, tableInfo, cur);
    cur = cur->next;
    }
return total;
}

int outputByName(char *db, FILE *outFh, struct hTableInfo *tableInfo, char *realTable, struct hgPos *pos)
/* Output the rows of realTable whose name field equals pos->name.  Used
 * when the hgPos has a name but no chrom range, so the caller has to say
 * which actual table to query.  Returns the number of rows written. */
{
char query[512];
struct sqlConnection *conn;
struct sqlResult *sr;
int written;

conn = hAllocConn(db);
sqlSafef(query, sizeof(query), "select * from %s where (%s = '%s')",
      realTable, tableInfo->nameField, pos->name);
sr = sqlGetResult(conn, query);

/* the query already selected on name, so no name filter is passed on */
written = outputRows(outFh, tableInfo, sr, NULL);

sqlFreeResult(&sr);
hFreeConn(&conn);
return written;
}

int outputByPosition(char *db, FILE *outFh, struct hTableInfo *tableInfo, struct hgPos *pos)
/* Output the rows whose chrom, start and end fields exactly equal those of
 * pos; the rows are then also filtered by name against pos.  Returns the
 * number of rows written. */
{
char query[512];
struct sqlConnection *conn;
struct sqlResult *sr;
int written;

conn = hAllocConn(db);
sqlSafef(query, sizeof(query), "select * from %s where (%s = '%s') and (%s = %d) and (%s = %d)",
      getTableName(pos->chrom, tableInfo), 
      tableInfo->chromField, pos->chrom,
      tableInfo->startField, pos->chromStart, 
      tableInfo->endField, pos->chromEnd);
sr = sqlGetResult(conn, query);

/* the query fixed the location, outputRows checks the name */
written = outputRows(outFh, tableInfo, sr, pos);

sqlFreeResult(&sr);
hFreeConn(&conn);
return written;
}

int outputTablePosHits(char *db, FILE *outFh, struct hTableInfo *tableInfo, struct hgPos *pos)
/* Output results for when query matches requested table.  A position with
 * a chrom is output by exact location.  Without a chrom the rows are
 * selected by name, from the one table if it is not split, or from each
 * chromosome's table if it is.  Returns the number of rows written. */
{
int rowCnt = 0;

/* handle different cases */
if (pos->chrom != NULL)
    {
    /* have exact location */
    rowCnt = outputByPosition(db, outFh, tableInfo, pos);
    }
else if (!tableInfo->isSplit)
    {
    /* no location, table not split */
    rowCnt = outputByName(db, outFh, tableInfo, tableInfo->rootName, pos);
    }
else 
    {
    /* No location and a split table, got to try each chrom.  There used
     * to be a "split table, but we have chrom" case ahead of this one; it
     * could never run, as a non-NULL chrom is taken by the first test. */
    struct slName *chrom;
    for (chrom = hAllChromNames(db); chrom != NULL; chrom = chrom->next)
        {
        /* not every chrom is required to have a table, skip the gaps */
        char *table = getTableName(chrom->name, tableInfo);
        if (hTableExists(db, table))
            rowCnt += outputByName(db, outFh, tableInfo, table, pos);
        }
    }
return rowCnt;
}

int outputTableHits(char *db, FILE *outFh, struct hTableInfo *tableInfo, struct hgPosTable *posTab)
/* Output results for a posTab that refers to the requested table.  The
 * table must have a name field.  Returns the number of rows written. */
{
struct hgPos *cur;
int total = 0;

checkTableFields(tableInfo, TRUE);
cur = posTab->posList;
while (cur != NULL)
    {
    total += outputTablePosHits(db, outFh, tableInfo, cur);
    cur = cur->next;
    }
return total;
}

int outputPositions(char *db, FILE *outFh, struct hTableInfo *tableInfo, struct hgPositions *positions)
/* Output results for a single hgPositions record.  Entries named chromInfo
 * are output by chrom range, entries named after the requested table are
 * output as table hits, and all other entries are skipped.  Returns the
 * number of rows written. */
{
struct hgPosTable *posTab = positions->tableList;
int total = 0;

while (posTab != NULL)
    {
    char *hitTable = posTab->name;
    if (sameString(hitTable, "chromInfo"))
        {
        total += outputChromRangeHits(db, outFh, tableInfo, posTab);
        }
    else if (sameString(hitTable, tableInfo->rootName))
        {
        total += outputTableHits(db, outFh, tableInfo, posTab);
        }
    posTab = posTab->next;
    }
return total;
}

void outputResults(char *db, FILE *outFh, struct hTableInfo *tableInfo, struct hgPositions *positions)
/* Output the table rows for every record in a hgPositions list.  Aborts
 * when no rows at all were written, unless noMatchOk is set. */
{
struct hgPositions *cur = positions;
int total = 0;

while (cur != NULL)
    {
    total += outputPositions(db, outFh, tableInfo, cur);
    cur = cur->next;
    }

if ((total == 0) && !noMatchOk)
    errAbort("Error: no table rows matching query");
}

void hgGetAnn(char *db, char *table, char *spec, char *tabFile)
/* Get chromosome annotation rows from a database table using a
 * browser-style position specification, and write them to tabFile. */
{
/* look the table up first, so a bad table name is reported before the
 * possibly long position search */
struct hTableInfo *tableInfo = hFindTableInfo(db, NULL, table);
struct hgPositions *positions;
FILE* outFh;

if (tableInfo == NULL)
    errAbort("Error: no table: %s or *_%s", table, table);

positions = findPositions(db, spec);
if (positions == NULL)
    return;    /* nothing found, the output file is not created */

outFh = outputOpen(db, tabFile, tableInfo);
outputResults(db, outFh, tableInfo, positions);
carefulClose(&outFh);
}

int main(int argc, char *argv[])
/* Process command line: parse the options, require exactly the four
 * arguments db, table, spec and tabfile, then run hgGetAnn on them. */
{
char *db, *table, *spec, *tabFile;

optionInit(&argc, argv, optionSpec);
if (argc != 5)
    {
    usage("wrong # of args");
    }

/* record the flags in the file-level option variables */
noRandom = optionExists("noRandom");
noMatchOk = optionExists("noMatchOk");
keepBin = optionExists("keepBin");
tsvHeaders = optionExists("tsvHeaders");
colHeaders = optionExists("colHeaders");

cgiSpoof(&argc, argv);

/* arguments are read after cgiSpoof, which is handed argc and argv */
db = argv[1];
table = argv[2];
spec = argv[3];
tabFile = argv[4];
hgGetAnn(db, table, spec, tabFile);
return 0;
}
