/* hgClusterGenes - Cluster overlapping gene predictions. */

/* Copyright (C) 2013 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "dlist.h"
#include "jksql.h"
#include "genePred.h"
#include "hdb.h"
#include "hgRelate.h"
#include "binRange.h"
#include "rbTree.h"


void usage()
/* Explain usage and exit. */
{
errAbort(
  "hgClusterGenes - Cluster overlapping gene predictions\n"
  "usage:\n"
  "   hgClusterGenes database geneTable clusterTable cannonicalTable\n"
  "where clusterTable pairs clusterIds and geneIds, and cannonicalTable\n"
  "pairs the longest protein isoform with the clusterId\n"
  "options:\n"
  "   -chrom=chrN - Just work on one chromosome\n"
  "   -verbose=N - Print copious debugging info. 0 for none, 3 for loads\n"
  "   -noProt - Skip protein field\n"
  "   -sangerLinks - Use sangerLinks table for protein\n"
  "   -protAccQuery='query' - Use query to retrieve gene->protein map\n"
  );
}

boolean noProt = FALSE;
boolean sangerLinks = FALSE;
char *protAccQuery = NULL;

static struct optionSpec options[] = {
   {"chrom", OPTION_STRING},
   {"noProt", OPTION_BOOLEAN},
   {"sangerLinks", OPTION_BOOLEAN},
   {"protAccQuery", OPTION_STRING},
   {NULL, 0},
};

struct cluster
/* A cluster of overlapping genes. */
    {
    struct cluster *next;	/* Next in list. */
    struct hash *geneHash;	/* Associated genes. */
    int start,end;		/* Range covered by cluster. */
    };

void clusterDump(int verbosity, struct cluster *cluster)
/* Dump contents of cluster to log. */
{
struct hashEl *helList = hashElListHash(cluster->geneHash);
struct hashEl *hel;
verbose(verbosity, "%d-%d", cluster->start, cluster->end);
for (hel = helList; hel != NULL; hel = hel->next)
    {
    struct genePred *gp = hel->val;
    verbose(verbosity, " %s", gp->name);
    }
verbose(verbosity, "\n");
}

struct cluster *clusterNew()
/* Create new cluster. */
{
struct cluster *cluster;
AllocVar(cluster);
cluster->geneHash = hashNew(5);
return cluster;
}

void clusterFree(struct cluster **pCluster)
/* Free up a cluster. */
{
struct cluster *cluster = *pCluster;
if (cluster == NULL)
    return;
hashFree(&cluster->geneHash);
freez(pCluster);
}

void clusterFreeList(struct cluster **pList)
/* Free a list of dynamically allocated cluster's */
{
struct cluster *el, *next;

for (el = *pList; el != NULL; el = next)
    {
    next = el->next;
    clusterFree(&el);
    }
*pList = NULL;
}

int clusterCmp(const void *va, const void *vb)
/* Compare to sort based on start. */
{
const struct cluster *a = *((struct cluster **)va);
const struct cluster *b = *((struct cluster **)vb);
return a->start - b->start;
}


void clusterAddExon(struct cluster *cluster,
	int start, int end, struct genePred *gp)
/* Add exon to cluster. */
{
if (!hashLookup(cluster->geneHash, gp->name))
    hashAdd(cluster->geneHash, gp->name, gp);
if (cluster->start == cluster->end)
    {
    cluster->start = start;
    cluster->end = end;
    }
else
    {
    if (start < cluster->start) cluster->start = start;
    if (cluster->end < end) cluster->end = end;
    }
}

void addExon(struct binKeeper *bk, struct dlNode *clusterNode,
	int start, int end, struct genePred *gp)
/* Add exon to cluster and binKeeper. */
{
clusterAddExon(clusterNode->val, start, end, gp);
binKeeperAdd(bk, start, end, clusterNode);
}

void mergeClusters(struct binKeeper *bk, struct binElement *bkRest,
	struct dlNode *aNode, struct dlNode *bNode)
/* Move bNode into aNode. */
{
struct cluster *aCluster = aNode->val;
struct cluster *bCluster = bNode->val;
struct binElement *bkEl;
struct hashEl *hEl, *hList;

verbose(3, " a: ");
clusterDump(3, aCluster);
verbose(3, " b: ");
clusterDump(3, bCluster);

/* First change references to bNode. */
binKeeperReplaceVal(bk, bCluster->start, bCluster->end, bNode, aNode);
for (bkEl = bkRest; bkEl != NULL; bkEl = bkEl->next)
    if (bkEl->val == bNode) 
        bkEl->val = aNode;

/* Add b's genes to a. */
hList = hashElListHash(bCluster->geneHash);
for (hEl = hList; hEl != NULL; hEl = hEl->next)
    hashAdd(aCluster->geneHash, hEl->name, hEl->val);

/* Adjust start/end. */
if (bCluster->start < aCluster->start) 
    aCluster->start = bCluster->start;
if (aCluster->end < bCluster->end)
    aCluster->end = bCluster->end;

/* Remove all traces of bNode. */
dlRemove(bNode);
clusterFree(&bCluster);
verbose(3, " ab: ");
clusterDump(3, aCluster);
}

int totalGeneCount = 0;
int totalClusterCount = 0;

struct cluster *makeCluster(struct genePred *geneList, int chromSize)
/* Create clusters out of overlapping genes. */
{
struct binKeeper *bk = binKeeperNew(0, chromSize);
struct dlList *clusters = newDlList();
struct dlNode *node;
struct cluster *clusterList = NULL, *cluster;
struct genePred *gp;

/* Build up cluster list with aid of binKeeper.
 * For each exon look to see if it overlaps an
 * existing cluster.  If so put it into existing
 * cluster, otherwise make a new cluster.  The hard
 * case is where part of the gene is already in one
 * cluster and the exon overlaps with a new
 * cluster.  In this case merge the new cluster into
 * the old one. */
for (gp = geneList; gp != NULL; gp = gp->next)
    {
    int exonIx;
    struct dlNode *oldNode = NULL;
    verbose(2, "%s %d-%d\n", gp->name, gp->txStart, gp->txEnd);
    for (exonIx = 0; exonIx < gp->exonCount; ++exonIx)
        {
	int exonStart = gp->exonStarts[exonIx];
	int exonEnd = gp->exonEnds[exonIx];
	struct binElement *bEl, *bList = binKeeperFind(bk, exonStart, exonEnd);
	verbose(4, "  %d %d\n", exonIx, exonStart);
	if (bList == NULL)
	    {
	    if (oldNode == NULL)
		{
		struct cluster *cluster = clusterNew();
		oldNode = dlAddValTail(clusters, cluster);
		}
	    addExon(bk, oldNode, exonStart, exonEnd, gp);
	    }
	else
	    {
	    for (bEl = bList; bEl != NULL; bEl = bEl->next)
		{
		struct dlNode *newNode = bEl->val;
		if (newNode != oldNode)
		    {
		    if (oldNode == NULL)
			{
			/* Add to existing cluster. */
			oldNode = newNode;
			}
		    else
			{
			/* Merge new cluster into old one. */
			verbose(3, "Merging %p %p\n", oldNode, newNode);
			mergeClusters(bk, bEl->next, oldNode, newNode);
			}
		    }
		}
	    addExon(bk, oldNode, exonStart, exonEnd, gp);
	    slFreeList(&bList);
	    }
	}
    }

/* We build up the cluster list as a doubly-linked list
 * to make it faster to remove clusters that get merged
 * into another cluster.  At the end though we make
 * a singly-linked list out of it. */
for (node = clusters->tail; !dlStart(node); node=node->prev)
    {
    cluster = node->val;
    slAddHead(&clusterList, cluster);
    }
slSort(&clusterList, clusterCmp);

/* Clean up and go home. */
binKeeperFree(&bk);
dlListFree(&clusters);
return clusterList;
}

void createClusterTable(struct sqlConnection *conn, 
	char *tableName, int longestName)
/* Create cluster table. */
{
struct dyString *dy = dyStringNew(1024);
sqlDyStringPrintf(dy,
    "CREATE TABLE %s (\n"
    " clusterId int unsigned not null,\n"
    " transcript varchar(255) not null,\n"
    " INDEX(transcript(%d)),\n"
    " INDEX(clusterId))\n"
    , tableName, longestName);
sqlRemakeTable(conn, tableName, dy->string);
dyStringFree(&dy);
}

void createCannonicalTable(struct sqlConnection *conn, 
	char *tableName, int longestName)
/* Create cannonical representative of cluster table. */
{
struct dyString *dy = dyStringNew(1024);
sqlDyStringPrintf(dy,
    "CREATE TABLE %s (\n"
    " chrom varchar(255) not null,\n"
    " chromStart int not null,\n"
    " chromEnd int not null,\n"
    " clusterId int unsigned not null,\n"
    " transcript varchar(255) not null,\n"
    " protein varchar(255) not null,\n"
    " UNIQUE(clusterId),\n"
    " INDEX(transcript(%d)),\n"
    " INDEX(protein(8)))\n"
    , tableName, longestName);
sqlRemakeTable(conn, tableName, dy->string);
dyStringFree(&dy);
}

int clusterId = 0;	/* Assign a unique id to each cluster. */
int longestName = 1;	/* Longest gene name. */

void clusterGenesOnStrand(char *database, struct sqlConnection *conn,
	char *geneTable, char *chrom, char strand, 
	FILE *clusterFile, FILE *canFile)
/* Scan through genes on this strand, cluster, and write clusters to file. */
{
struct sqlResult *sr;
char **row;
int rowOffset;
struct genePred *gp, *gpList = NULL;
char extraWhere[64], query[256];
struct cluster *clusterList = NULL, *cluster;
int nameLen;
struct hash *protHash = NULL;

verbose(1, "%s %c\n", chrom, strand);
safef(extraWhere, sizeof(extraWhere), "strand = '%c'", strand);
sr = hChromQuery(conn, geneTable, chrom, extraWhere, &rowOffset);
while ((row = sqlNextRow(sr)) != NULL)
    {
    gp = genePredLoad(row + rowOffset);
    nameLen = strlen(gp->name);
    slAddHead(&gpList, gp);
    if (nameLen > longestName)
        longestName = nameLen;
    ++totalGeneCount;
    }
sqlFreeResult(&sr);

/* Build hash to map between transcript names and protein IDs. */
if (!noProt)
    {
    protHash = newHash(16);
    if (sangerLinks)
	sqlSafef(query, sizeof(query), "select orfName,protName from sangerLinks");
    else if (isNotEmpty(protAccQuery))
	// accepting special user query without checking
	sqlSafef(query, sizeof(query), protAccQuery, NULL);  
    else
	sqlSafef(query, sizeof(query), "select name, proteinId from %s", geneTable);
    sr = sqlGetResult(conn, query);
    while ((row = sqlNextRow(sr)) != NULL)
	hashAdd(protHash, row[0], cloneString(row[1]));
    sqlFreeResult(&sr);
    }

slReverse(&gpList);
clusterList = makeCluster(gpList, hChromSize(database, chrom));
for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
    {
    struct hashEl *helList = hashElListHash(cluster->geneHash);
    struct hashEl *hel;
    struct genePred *cannonical = NULL;
    char *protName;
    int cannonicalSize = -1, size = 0;
    ++clusterId;
    for (hel = helList; hel != NULL; hel = hel->next)
        {
	struct genePred *gp = hel->val;
	fprintf(clusterFile, "%d\t%s\n", clusterId, gp->name);
	size = genePredCodingBases(gp);
	if (size > cannonicalSize)
	    {
	    cannonical = gp;
	    cannonicalSize = size;
	    }
	}
    protName = cannonical->name;
    if (protHash != NULL)
	{
	char *newVal = hashFindVal(protHash, protName);
	if (newVal != NULL)
	    protName = newVal;
	}
    fprintf(canFile, "%s\t%d\t%d\t%d\t%s\t%s\n", 
    	chrom, cannonical->txStart, cannonical->txEnd, clusterId, cannonical->name,
	protName);
    ++totalClusterCount;
    }
genePredFreeList(&gpList);
freeHashAndVals(&protHash);
}

void hgClusterGenes(char *database, char *geneTable, char *clusterTable,
	char *cannonicalTable)
/* hgClusterGenes - Cluster overlapping gene predictions. */
{
struct slName *chromList, *chrom;
FILE *clusterFile = hgCreateTabFile(".", clusterTable);
FILE *canFile = hgCreateTabFile(".", cannonicalTable);
struct sqlConnection *conn;

if (optionExists("chrom"))
    chromList = slNameNew(optionVal("chrom", NULL));
else
    chromList = hAllChromNames(database);
conn = hAllocConn(database);
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    clusterGenesOnStrand(database, conn, geneTable, chrom->name, '+', 
    	clusterFile, canFile);
    clusterGenesOnStrand(database, conn, geneTable, chrom->name, '-', 
    	clusterFile, canFile);
    }
createClusterTable(conn, clusterTable, longestName);
createCannonicalTable(conn, cannonicalTable, longestName);
verbose(1, "Loading database\n");
hgLoadTabFile(conn, ".", clusterTable, &clusterFile);
hgLoadTabFile(conn, ".", cannonicalTable, &canFile);
verbose(1, "Got %d clusters, from %d genes in %d chromosomes\n",
	totalClusterCount, totalGeneCount, slCount(chromList));
}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
noProt = optionExists("noProt");
sangerLinks = optionExists("sangerLinks");
protAccQuery = optionVal("protAccQuery", protAccQuery);
if (argc != 5)
    usage();
hgClusterGenes(argv[1], argv[2], argv[3], argv[4]);
return 0;
}