/* hgLoadGenePred - Load genePred tables. */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "options.h"
#include "jksql.h"
#include "genePred.h"
#include "genePredReader.h"
#include "hdb.h"
#include "hgRelate.h"
#include "chromInfo.h"


/* command line option specifications; this table is handed to optionInit()
 * in main().  All options are boolean flags and the list ends with a NULL
 * name entry. */
static struct optionSpec optionSpecs[] = {
    {"bin", OPTION_BOOLEAN},          /* add binning (the default) */
    {"nobin", OPTION_BOOLEAN},        /* don't add binning; conflicts with -bin */
    {"genePredExt", OPTION_BOOLEAN},  /* use extended genePred format */
    {"skipInvalid", OPTION_BOOLEAN},  /* warn about and skip invalid genePreds */
    {"noValidate", OPTION_BOOLEAN},   /* don't validate genePreds at all */
    {NULL, 0}
};

/* Settings filled in from the command line by main(); the initializers are
 * the values used when the corresponding option is not given. */
boolean gBin = TRUE;           /* write a leading bin column; cleared by -nobin */
boolean gGenePredExt = FALSE;  /* -genePredExt: keep the optional genePred fields */
boolean gSkipInvalid = FALSE;  /* -skipInvalid: warn instead of aborting on invalid genePreds */
boolean gNoValidate = FALSE;   /* -noValidate: bypass checkGene() entirely */

/* Number of genePreds rejected by checkGene(); examined by mkTabFile() */
int gInvalidCnt = 0;

void usage(char *msg)
/* Report msg followed by the command synopsis and option summary.  The
 * whole text is passed to errAbort(), so this is the error path for a bad
 * command line. */
{
errAbort(
    "%s\nhgLoadGenePred - Load up a mySQL database genePred table\n"
    "usage:\n"
    "   hgLoadGenePred database table genePredFile [...]\n\n"
    "This will sort the input file by chrom+start and validate the genePreds\n\n"
    "Options:\n"
    "   -bin - add binning (the default)\n"
    "   -nobin - don't add binning (you probably don't want this)\n"
    "   -genePredExt - use extended genePred format\n"
    "   -skipInvalid - instead of aborting on genePreds that\n"
    "    don't pass genePredCheck, generate a warning and skip\n"
    "    them.  You really should fix the data instead of using\n"
    "    this option\n"
    "   -noValidate - don't validate genePred; for debugging only!!\n",
    msg);
}

void setupTable(char *db, struct sqlConnection *conn, char *table)
/* Remake the genePred table named by table over conn.  The CREATE statement
 * includes the bin column when gBin is set and all optional fields when
 * gGenePredExt is set. */
{
unsigned createOpts = 0;
unsigned fieldSet = 0;
char *createSql;

if (gBin)
    createOpts = genePredWithBin;
if (gGenePredExt)
    fieldSet = genePredAllFlds;

createSql = genePredGetCreateSql(table, fieldSet, createOpts, hGetMinIndexLength(db));
sqlRemakeTable(conn, table, createSql);
freez(&createSql);  /* the SQL text is released here once the table is made */
}

struct genePred *loadGenes(int numGenePreds, char **genePredFiles)
/* Read every file in genePredFiles (numGenePreds entries), append the
 * records into a single list in file order, then sort that list with
 * genePredCmp.  Returns the head of the sorted list, or NULL if nothing was
 * loaded. */
{
struct genePred *allGenes = NULL;
int fileIx = 0;

while (fileIx < numGenePreds)
    {
    struct genePred *fileGenes = genePredReaderLoadFile(genePredFiles[fileIx], NULL);
    allGenes = slCat(allGenes, fileGenes);
    fileIx++;
    }
slSort(&allGenes, genePredCmp);
return allGenes;
}

boolean checkGene(char *db, struct genePred *gene)
/* Validate one genePred against database db.  Returns TRUE when the chrom
 * is found by hGetChromInfo() and genePredCheck() returns 0.  Otherwise a
 * problem is reported on stderr, gInvalidCnt is incremented and FALSE is
 * returned; deciding whether to abort is left to the caller. */
{
struct chromInfo *chromRec = hGetChromInfo(db, gene->chrom);

if (chromRec != NULL)
    {
    /* genePredCheck writes its own diagnostics to stderr */
    if (genePredCheck("invalid genePred", stderr, chromRec->size, gene) == 0)
        return TRUE;
    }
else
    {
    fprintf(stderr, "Error: %s has invalid chrom for %s: %s\n",
            gene->name, db, gene->chrom);
    }

/* both failure paths end up here */
gInvalidCnt++;
return FALSE;
}

void copyGene(char *db, struct genePred *gene, FILE *tabFh)
/* Copy one gene to the tab file.
 *   db    - database name, used by checkGene() to validate the gene
 *   gene  - record to write; its optFields is modified temporarily and
 *           restored before returning
 *   tabFh - open tab file receiving the row
 * With -genePredExt the gene must carry the score, name2, cdsStat and
 * exonFrames fields, otherwise the program aborts.  Genes rejected by
 * checkGene() are not written (unless -noValidate is in effect). */
{
unsigned holdOptFields = gene->optFields;
unsigned requiredFields = (genePredScoreFld|genePredName2Fld|genePredCdsStatFld|genePredExonFramesFld);

/* Test the gene's own field mask against the required set.  Comparing the
 * required mask with itself can never fail, which would let records lacking
 * the extended columns through. */
if (gGenePredExt && ((holdOptFields & requiredFields) != requiredFields))
    errAbort("genePred %s doesn't have fields required for -genePredExt", gene->name);

if (gNoValidate || checkGene(db, gene))
    {
    if (!gGenePredExt)
        gene->optFields = 0;  /* omit optional fields */

    /* the bin value goes first on the row when binning is enabled */
    if (gBin)
        fprintf(tabFh, "%u\t", hFindBin(gene->txStart, gene->txEnd));
    genePredTabOut(gene, tabFh);

    gene->optFields = holdOptFields; /* restore optional fields */
    }
}

void mkTabFile(char *db, struct genePred *genes, FILE *tabFh)
/* Write the tab file to be loaded by passing each gene on the list to
 * copyGene(), which adds binning or strips extended fields as configured.
 * Once all genes have been processed, any invalid ones counted in
 * gInvalidCnt either produce a warning (-skipInvalid) or abort the run. */
{
struct genePred *cur = genes;

while (cur != NULL)
    {
    copyGene(db, cur, tabFh);
    cur = cur->next;
    }

if (gInvalidCnt <= 0)
    return;

if (!gSkipInvalid)
    errAbort("Error: %d invalid genePreds, database unchanged", gInvalidCnt);
else
    fprintf(stderr, "Warning: skipping %d invalid genePreds\n", gInvalidCnt);
}

void hgLoadGenePred(char *db, char *table, int numGenePreds, char **genePredFiles)
/* hgLoadGenePred - Load up a mySQL database genePred table.
 *   db            - database to connect to
 *   table         - table to remake and load
 *   numGenePreds  - number of entries in genePredFiles
 *   genePredFiles - input genePred file names
 * The tab file is written in the current directory and removed after the
 * load. */
{
char *tabDir = ".";
struct genePred *geneList = loadGenes(numGenePreds, genePredFiles);
struct sqlConnection *dbConn = sqlConnect(db);
FILE *tabFile = hgCreateTabFile(tabDir, table);

/* mkTabFile() aborts on invalid input (unless -skipInvalid), and the table
 * is only remade after it has returned */
mkTabFile(db, geneList, tabFile);
genePredFreeList(&geneList);

setupTable(db, dbConn, table);
hgLoadTabFile(dbConn, tabDir, table, &tabFile);
sqlDisconnect(&dbConn);

hgRemoveTabFile(tabDir, table);
}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, optionSpecs);
if (argc < 4)
    usage("wrong # args");
if (optionExists("bin") && optionExists("nobin"))
    errAbort("can't specify both -bin and -nobin");
gBin = !optionExists("nobin");
gGenePredExt = optionExists("genePredExt");
gSkipInvalid = optionExists("skipInvalid");
gNoValidate = optionExists("noValidate");
hgLoadGenePred(argv[1], argv[2], argc-3, argv+3);
return 0;
}
