/* knownToVisiGene - Create knownToVisiGene table by riffling through various other knownTo tables. */

/* Copyright (C) 2013 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "dystring.h"
#include "options.h"
#include "jksql.h"
#include "hgRelate.h"
#include "genePred.h"
#include "psl.h"
#include "rangeTree.h"
#include "binRange.h"
#include "hdb.h"
#include "visiGene.h"

void usage()
/* Explain usage and exit. */
{
errAbort(
  "knownToVisiGene - Create knownToVisiGene table by riffling through various other knownTo tables\n"
  "usage:\n"
  "   knownToVisiGene database\n"
  "options:\n"
  "   -table=XXX - give another name to table other than knownToVisiGene\n"
  "   -visiDb=XXX - use a VisiGene database other than 'visiGene'\n"
  "   -probesDb=database - use given database for probes (default is to use database)\n"
  );
}

char *outTable = "knownToVisiGene";
char *visiDb = "visiGene";
char *probesDb;
boolean vgProbes = FALSE;
boolean vgAllProbes = FALSE;

static struct optionSpec options[] = {
   {"table", OPTION_STRING},
   {"visiDb", OPTION_STRING},
   {"probesDb", OPTION_STRING},
   {NULL, 0},
};

struct prioritizedImage 
    {
    int imageId;	/* ID of image */
    float priority;	/* Image priority - smaller is more urgent */
    int geneId;         /* visiGene gene.id */
    };

void createTable(struct sqlConnection *conn, char *tableName)
/* Create our name/value table, dropping if it already exists. */
{
struct dyString *dy = dyStringNew(512);
sqlDyStringPrintf(dy, 
"CREATE TABLE  %s (\n"
"    name varchar(255) not null,\n"
"    value varchar(255) not null,\n"
"    geneId int not null,\n"
"              #Indices\n"
"    PRIMARY KEY(name),\n"
"    INDEX(value)\n"
")\n",  tableName);
sqlRemakeTable(conn, tableName, dy->string);
dyStringFree(&dy);
}

void addPrioritizedImage(struct hash *hash, int id, float priority, int geneId, char *key)
/* Add image to hash, replacing what's already there if we have a better priority */
{
struct prioritizedImage *pi = hashFindVal(hash, key);
if (pi == NULL)
    {
    AllocVar(pi);
    pi->imageId = id;
    pi->priority = priority;
    pi->geneId = geneId;
    hashAdd(hash, key, pi);
    }
else if (pi->priority > priority)
    {
    pi->imageId = id;
    pi->priority = priority;
    pi->geneId = geneId;
    }
}

void foldIntoHash(struct sqlConnection *conn, char *table, char *keyField, char *valField,
	struct hash *hash, struct hash *uniqHash, boolean secondary)
/* Add key/value pairs from table into hash */
{
struct sqlResult *sr;
char query[512];
char **row;
sqlSafef(query, sizeof(query), "select %s,%s from %s", keyField, valField, table);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    if (uniqHash != NULL)
        {
	if (hashLookup(uniqHash, row[1]))
	    {
	    if (secondary)
		continue;
	    }
	else
	    hashAdd(uniqHash, row[1], NULL);
	}
    hashAdd(hash, row[0], cloneString(row[1]));
    }
sqlFreeResult(&sr);
}

struct hash *keepersForChroms(struct sqlConnection *conn)
/* Create hash of binKeepers keyed by chromosome */
{
struct hash *keeperHash = hashNew(0);
char query[1024];
sqlSafef(query, sizeof query, "select chrom,size from chromInfo");
struct sqlResult *sr = sqlGetResult(conn, query);
char **row;
while ((row = sqlNextRow(sr)) != NULL)
    {
    char *chrom = row[0];
    int size = sqlUnsigned(row[1]);
    struct binKeeper *bk = binKeeperNew(0, size);
    hashAdd(keeperHash, chrom, bk);
    }
sqlFreeResult(&sr);
return keeperHash;
}

void bestProbeOverlap(struct sqlConnection *conn, char *probeTable, 
	struct genePred *gpList, struct hash *gpToProbeHash)
/* Create hash of most overlapping probe if any for each gene. Require
 * at least 100 base overlap. */
{
/* Create a hash of binKeepers filled with probes. */
struct hash *keeperHash = keepersForChroms(conn);
struct hashCookie it = hashFirst(keeperHash);
struct hashEl *hel;
int pslCount = 0;
while ((hel = hashNext(&it)) != NULL)
    {
    char *chrom = hel->name;
    struct binKeeper *bk = hel->val;
    int rowOffset;
    struct sqlResult *sr = hChromQuery(conn, probeTable, chrom, NULL, &rowOffset);
    char **row;
    while ((row = sqlNextRow(sr)) != NULL)
        {
	struct psl *psl = pslLoad(row+rowOffset);
	binKeeperAdd(bk, psl->tStart, psl->tEnd, psl);
	++pslCount;
	}
    sqlFreeResult(&sr);
    }
verbose(2, "Loaded %d psls from %s\n", pslCount, probeTable);

/* Loop through gene list, finding best probe if any for each gene. */
struct genePred *gp;
for (gp = gpList; gp != NULL; gp = gp->next)
    {
    struct rbTree *rangeTree = genePredToRangeTree(gp, FALSE);
    struct psl *bestPsl = NULL;
    int bestOverlap = 99;	/* MinOverlap - 1 */
    struct binKeeper *bk = hashMustFindVal(keeperHash, gp->chrom);
    struct binElement *bin, *binList = binKeeperFind(bk, gp->txStart, gp->txEnd);
    for (bin = binList; bin != NULL; bin = bin->next)
        {
	struct psl *psl = bin->val;
	if (psl->strand[0] == gp->strand[0])
	    {
	    int overlap = pslRangeTreeOverlap(psl, rangeTree);
	    if (overlap > bestOverlap)
		{
		bestOverlap = overlap;
		bestPsl = psl;
		}
	    }
	}
    if (bestPsl != NULL)
        hashAdd(gpToProbeHash, gp->name, bestPsl->qName);
    }
}

struct prioritizedImage *bestImage(char *kgId, struct hash *kgToHash, struct hash *imageHash)
/* Return best image id if possible, otherwise 0 */
{
struct hashEl *extId = hashLookup(kgToHash, kgId);
struct prioritizedImage *best = NULL;

while (extId)
    {
    struct prioritizedImage *pi = hashFindVal(imageHash, extId->val);
    if (pi)
	{
	if (!best || pi->priority < best->priority)
	    {
	    best = pi;
	    }
	}
    extId = hashLookupNext(extId);
    }
return best;
}

void knownToVisiGene(char *database)
/* knownToVisiGene - Create knownToVisiGene table by riffling through various other knownTo tables. */
{
char *tempDir = ".";
FILE *f = hgCreateTabFile(tempDir, outTable);
struct sqlConnection *hConn = sqlConnect(database);
struct sqlConnection *iConn = sqlConnect(visiDb);
struct sqlResult *sr;
char **row;
struct hash *geneImageHash = newHash(18);
struct hash *locusLinkImageHash = newHash(18);
struct hash *refSeqImageHash = newHash(18);
struct hash *genbankImageHash = newHash(18);
struct hash *probeImageHash = newHash(18);
struct hash *knownToLocusLinkHash = newHash(18);
struct hash *knownToRefSeqHash = newHash(18);
struct hash *knownToGeneHash = newHash(18);
struct hash *favorHugoHash = newHash(18);
struct hash *knownToProbeHash = newHash(18);
struct hash *knownToAllProbeHash = newHash(18);
struct genePred *knownList = NULL, *known;
struct hash *dupeHash = newHash(17);

char query[1024];
probesDb  = optionVal("probesDb", database);
struct sqlConnection *probesConn = sqlConnect(probesDb);
vgProbes = sqlTableExists(probesConn,"vgProbes");
vgAllProbes = sqlTableExists(probesConn,"vgAllProbes");

/* Go through and make up hashes of images keyed by various fields. */
sqlSafef(query, sizeof query, 
        "select image.id,imageFile.priority,gene.name,gene.locusLink,gene.refSeq,gene.genbank"
	",probe.id,submissionSet.privateUser,vgPrbMap.vgPrb,gene.id"
	" from image,imageFile,imageProbe,probe,gene,submissionSet,vgPrbMap"
	" where image.imageFile = imageFile.id"
	" and image.id = imageProbe.image"
	" and imageProbe.probe = probe.id"
	" and probe.gene = gene.id"
	" and image.submissionSet=submissionSet.id"
	" and vgPrbMap.probe = probe.id");
sr = sqlGetResult(iConn, query);

while ((row = sqlNextRow(sr)) != NULL)
    {
    int id = sqlUnsigned(row[0]);
    float priority = atof(row[1]);
    int privateUser = sqlSigned(row[7]);
    char vgPrb_Id[256];
    safef(vgPrb_Id, sizeof(vgPrb_Id), "vgPrb_%s",row[8]);
    int geneId = sqlUnsigned(row[9]);
    if (privateUser == 0)
	{
	addPrioritizedImage(probeImageHash, id, priority, geneId, vgPrb_Id);
	addPrioritizedImage(geneImageHash, id, priority, geneId, row[2]);
	addPrioritizedImage(locusLinkImageHash, id, priority, geneId, row[3]);
	addPrioritizedImage(refSeqImageHash, id, priority, geneId, row[4]);
	addPrioritizedImage(genbankImageHash, id, priority, geneId, row[5]);
	}
    }
verbose(2, "Made hashes of image: geneImageHash %d, locusLinkImageHash %d, refSeqImageHash %d"
           ", genbankImageHash %d probeImageHash %d\n", 
            geneImageHash->elCount, locusLinkImageHash->elCount, refSeqImageHash->elCount, 
	    genbankImageHash->elCount, probeImageHash->elCount);
sqlFreeResult(&sr);

/* Build up list of known genes. */
sqlSafef(query, sizeof query, "select * from knownGene");
sr = sqlGetResult(hConn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    struct genePred *known = genePredLoad(row);
    if (!hashLookup(dupeHash, known->name))
        {
	hashAdd(dupeHash, known->name, NULL);
	slAddHead(&knownList, known);
	}
    }
slReverse(&knownList);
sqlFreeResult(&sr);
verbose(2, "Got %d known genes\n", slCount(knownList));

/* Build up hashes from knownGene to other things. */
if (vgProbes)
    bestProbeOverlap(probesConn, "vgProbes", knownList, knownToProbeHash);
if (vgAllProbes)
    bestProbeOverlap(probesConn, "vgAllProbes", knownList, knownToAllProbeHash);

foldIntoHash(hConn, "knownToLocusLink", "name", "value", knownToLocusLinkHash, NULL, FALSE);
foldIntoHash(hConn, "knownToRefSeq", "name", "value", knownToRefSeqHash, NULL, FALSE);
foldIntoHash(hConn, "kgXref", "kgID", "geneSymbol", knownToGeneHash, favorHugoHash, FALSE);
foldIntoHash(hConn, "kgAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE);
foldIntoHash(hConn, "kgProtAlias", "kgID", "alias", knownToGeneHash, favorHugoHash, TRUE);

verbose(2, "knownToLocusLink %d, knownToRefSeq %d, knownToGene %d knownToProbe %d knownToAllProbe %d\n", 
   knownToLocusLinkHash->elCount, knownToRefSeqHash->elCount, knownToGeneHash->elCount,
   knownToProbeHash->elCount, knownToAllProbeHash->elCount);

/* Try and find an image for each gene. */
for (known = knownList; known != NULL; known = known->next)
    {
    char *name = known->name;
    struct prioritizedImage *best = NULL;
    {
    best = bestImage(name, knownToLocusLinkHash, locusLinkImageHash);
    if (!best)
	best = bestImage(name, knownToRefSeqHash, refSeqImageHash);
    if (!best)
	{
	best = hashFindVal(genbankImageHash, name);
	}
    if (!best)
	best = bestImage(name, knownToGeneHash, geneImageHash);
    if (vgProbes && !best)
	best = bestImage(name, knownToProbeHash, probeImageHash);
    if (vgAllProbes && !best)
	best = bestImage(name, knownToAllProbeHash, probeImageHash);
    }	    
    if (best)
        {
	fprintf(f, "%s\t%d\t%d\n", name, best->imageId, best->geneId);
	}
    }

createTable(hConn, outTable);
hgLoadTabFile(hConn, tempDir, outTable, &f);
hgRemoveTabFile(tempDir, outTable);
}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
outTable = optionVal("table", outTable);
visiDb = optionVal("visiDb", visiDb);
if (argc != 2)
    usage();
knownToVisiGene(argv[1]);
return 0;
}
