/* hgKgMrna - is a modified version of hgRefSeqMrna to be used 
   as part of the building process of Known Genes track.  
   It loads all mRNA alignments and other info into refGene 
   tables in a temporary working genome database. */ 

/* Copyright (C) 2013 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "cheapcgi.h"
#include "fa.h"
#include "psl.h"
#include "jksql.h"
#include "hdb.h"
#include "hgRelate.h"
#include "obscure.h"

/* Variables that can be set from command line. */
boolean clTest = FALSE;
boolean clDots = 100;

char *proteinDB;

void usage()
/* Explain usage and exit. */
{
errAbort(
  "hgKgMrna - Load mRNA alignments and other info into refGene tables\n"
  "           into a TEMPORARY database to build Known Genes track.\n"
  "usage:\n"
  "   hgKgMrna rn3Temp mrna.fa mrna.ra tight_mrna.psl loc2ref mrnaPep.fa mim2loc proteins070403\n");
}

struct hash *loadNameTable(struct sqlConnection *conn, 
    char *tableName, int hashSize)
/* Create a hash and load it up from table. */
{
char query[128];
struct sqlResult *sr;
char **row;
struct hash *hash = newHash(hashSize);

sqlSafef(query, sizeof query, "select id,name from %s", tableName);
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    hashAdd(hash, row[1], intToPt(sqlUnsigned(row[0])));
    }
sqlFreeResult(&sr);
return hash;
}

int putInNameTable(struct hash *hash, FILE *f, char *name)
/* Add to name table if it isn't there already. 
 * Return ID of name in table. */
{
struct hashEl *hel;
if (name == NULL)
    return 0;
if ((hel = hashLookup(hash, name)) != NULL)
    return ptToInt(hel->val);
else
    {
    // It appears like this program is dead code that is no longer used.
    // I don't want to make a potential bogus change to track the removal
    // of hgNextId(), so a landmine is added.  Code can be change to determine
    // max id from table if ever needed.  markd 2010-12-15
#if 1
    errAbort("code hasn't been updated to work, please see markd");
    return 0;
#else
    int id = hgNextId();
    fprintf(f, "%u\t%s\n", id, name);
    hashAdd(hash, name, intToPt(id));
    return id;
#endif
    }
}

struct refSeqInfo
/* Information on one refSeq. */
    {
    struct refSeqInfo *next;
    char *mrnaAcc;		/* Accession of mRNA. */
    char *proteinAcc;		/* Accession of protein. */
    char *geneName;		/* Gene name */
    char *productName;		/* Product name (long name for protein) or NULL. */
    int locusLinkId;		/* LocusLink id (of mRNA) */
    int omimId;			/* OMIM id (or 0) */
    int size;                   /* mRNA size. */
    int cdsStart, cdsEnd;       /* start/end of coding region (mRNA coords) */
    int ngi;			/* Nucleotide GI number. */
    struct psl *pslList;	/* List of aligments. */
    int geneNameId;		/* Id of gene name in table. */
    int productNameId;		/* Id of product name in table. */
    };

struct hash *hashNextRa(struct lineFile *lf)
/* Read in a record of a .ra file into a hash */
{
struct hash *hash = newHash(5);
char *line, *firstWord;
int lineSize;
int lineCount = 0;

while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] == 0)
        break;
    ++lineCount;
    firstWord = nextWord(&line);
    hashAdd(hash, firstWord, cloneString(line));
    }
if (lineCount == 0)
    freeHash(&hash);
return hash;
}

void dotOut()
/* Print dot to standard output and flush it so user can
 * see it right away. */
{
fputc('.', stdout);
fflush(stdout);
}

void parseCds(char *gbCds, int start, int end, int *retStart, int *retEnd)
/* Parse genBank style cds string into something we can use... */
{
char *s = gbCds;

char *startP, *endP;

endP = gbCds + strlen(gbCds) - 1;
if (*endP == ',')
    {
    printf("\n^^^%s\n", gbCds);
    fflush(stdout);
    }

while (!isdigit(*endP)) endP--;
while (isdigit(*endP)) endP--;
endP++;

startP = gbCds;
if (*s == '<') 
	{
	s++;
	startP = s;
	}
else
	{
	if (*s == 'j')
		{
		while (!isdigit(*s)) s++;
		startP = s;
		}
	}
s = strchr(s, '.');
if (s == NULL || s[1] != '.')
    {
    //errAbort("Malformed GenBank cds %s", gbCds);
    goto skip;
    }

start = atoi(startP) - 1;
end = atoi(endP);
skip:
*retStart = start;
*retEnd = end;
}

char *unburyAcc(struct lineFile *lf, char *longNcbiName)
/* Return accession given a long style NCBI name.  
 * Cannibalizes the longNcbiName. */
{
char *parts[5];
int partCount;

partCount = chopByChar(longNcbiName, '|', parts, ArraySize(parts));
if (partCount < 4) 
    errAbort("Badly formed longNcbiName line %d of %s\n", lf->lineIx, lf->fileName);
chopSuffix(parts[3]);
return parts[3];
}

char *accWithoutSuffix(char *acc) 
/* 
Function to strip the suffix from an accession in order to make it consistent
with our notation here. We ignore the suffix.
Eg. NM_123456.1 becomes NM_123456
*/
{
char *fixedAcc = acc;
char *dotIndex = strchr(acc, '.');

if (dotIndex)
    {
    char *accNum = NULL;
    int dotPos = dotIndex - acc; /* stupid C pointer arith. No other way to do get the string
                                    length up to the period. */
    accNum = needMem(dotPos + 1);
    strncpy(accNum, acc, dotPos);
    accNum[dotPos] = 0; /* Null terminate */
    fixedAcc = accNum;
    }

return fixedAcc;
}

void writeSeqTable(char *faName, FILE *out, boolean unburyAccession, boolean isPep)
/* Write out contents of fa file to name/sequence pairs in tabbed out file. */
{
struct lineFile *lf = lineFileOpen(faName, TRUE);
bioSeq seq;
int dotMod = 0;

printf("Reading %s\n", faName);
while (faSomeSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name, !isPep))
    {
    if (clDots > 0 && ++dotMod == clDots )
        {
	dotMod = 0;
	dotOut();
	}
    if (unburyAccession) 
        {
	seq.name = unburyAcc(lf, seq.name);
        }
    
    seq.name = accWithoutSuffix(seq.name);
    fprintf(out, "%s\t%s\n", seq.name, seq.dna);
    }
if (clDots) printf("\n");
lineFileClose(&lf);
}

struct exon
/* Keep track of an exon. */
    {
    struct exon *next;
    int start, end;
    };

struct exon *pslToExonList(struct psl *psl)
/* Convert psl to exon list, merging together blocks
 * separated by small inserts as necessary. */
{
struct exon *exonList = NULL, *exon = NULL;
int i, tStart, tEnd;
for (i=0; i<psl->blockCount; ++i)
    {
    tStart = psl->tStarts[i];
    tEnd = tStart + psl->blockSizes[i];
    if (exon == NULL || tStart - exon->end > genePredStdInsertMergeSize)
        {
	AllocVar(exon);
	slAddHead(&exonList, exon);
	exon->start = tStart;
	}
    exon->end = tEnd;
    }
slReverse(&exonList);
return exonList;
}


void processRefSeq(char *database, char *faFile, char *raFile, char *pslFile, char *loc2refFile, 
	char *pepFile, char *mim2locFile)
/* hgRefSeqMrna - Load refSeq mRNA alignments and other info into 
 * refSeqGene table. */
{
struct lineFile *lf;
struct hash *raHash, *rsiHash = newHash(0);
struct hash *loc2mimHash = newHash(0);
struct refSeqInfo *rsiList = NULL, *rsi;
char *s, *line, *row[5];
int wordCount, dotMod = 0;
int noLocCount = 0;
int rsiCount = 0;
int noProtCount = 0;
struct psl *psl;
struct sqlConnection *conn = hgStartUpdate(database);
struct hash *productHash = loadNameTable(conn, "productName", 16);
struct hash *geneHash = loadNameTable(conn, "geneName", 16);
char *kgName = "refGene";

FILE *kgTab = hgCreateTabFile(".", kgName);
FILE *productTab = hgCreateTabFile(".", "productName");
FILE *geneTab = hgCreateTabFile(".", "geneName");
FILE *refLinkTab = hgCreateTabFile(".", "refLink");
FILE *refPepTab = hgCreateTabFile(".", "refPep");
FILE *refMrnaTab = hgCreateTabFile(".", "refMrna");

struct exon *exonList = NULL, *exon;
char *answer;
char cond_str[200];
char query[1024];

sqlSafef(query, sizeof query, 
"CREATE TABLE refLink (\n"
"    name varchar(255) not null,        # Name displayed in UI\n"
"    product varchar(255) not null, 	# Name of protein product\n"
"    mrnaAcc varchar(255) not null,	# mRNA accession\n"
"    protAcc varchar(255) not null,	# protein accession\n"
"    geneName int unsigned not null,	# pointer to geneName table\n"
"    prodName int unsigned not null,	# pointer to product name table\n"
"    locusLinkId int unsigned not null,	# Locus Link ID\n"
"    omimId int unsigned not null,	# Locus Link ID\n"
"              #Indices\n"
"    PRIMARY KEY(mrnaAcc(12)),\n"
"    index(name(10)),\n"
"    index(protAcc(10)),\n"
"    index(locusLinkId),\n"
"    index(omimId),\n"
"    index(prodName),\n"
"    index(geneName)\n"
")");
/* Make refLink and other tables table if they don't exist already. */
sqlMaybeMakeTable(conn, "refLink", query);
sqlSafef(query, sizeof query, "delete from refLink");
sqlUpdate(conn, query);

sqlSafef(query, sizeof query, 
"CREATE TABLE refGene ( \n"
"   name varchar(255) not null,	# mrna accession of gene \n"
"   chrom varchar(255) not null,	# Chromosome name \n"
"   strand char(1) not null,	# + or - for strand \n"
"   txStart int unsigned not null,	# Transcription start position \n"
"   txEnd int unsigned not null,	# Transcription end position \n"
"   cdsStart int unsigned not null,	# Coding region start \n"
"   cdsEnd int unsigned not null,	# Coding region end \n"
"   exonCount int unsigned not null,	# Number of exons \n"
"   exonStarts longblob not null,	# Exon start positions \n"
"   exonEnds longblob not null,	# Exon end positions \n"
          "   #Indices \n"
"   INDEX(name(10)), \n"
"   INDEX(chrom(12),txStart), \n"
"   INDEX(chrom(12),txEnd) \n"
")");
sqlMaybeMakeTable(conn, "refGene", query);
sqlSafef(query, sizeof query, "delete from refGene");
sqlUpdate(conn, query);

sqlSafef(query, sizeof query, 
"CREATE TABLE refPep (\n"
"    name varchar(255) not null,        # Accession of sequence\n"
"    seq longblob not null,     # Peptide sequence\n"
"              #Indices\n"
"    PRIMARY KEY(name(32))\n"
")\n");
sqlMaybeMakeTable(conn, "refPep", query);
sqlSafef(query, sizeof query, "delete from refPep");
sqlUpdate(conn, query);

sqlSafef(query, sizeof query, 
"CREATE TABLE refMrna (\n"
"    name varchar(255) not null,        # Accession of sequence\n"
"    seq longblob not null,     	# Nucleotide sequence\n"
"              #Indices\n"
"    PRIMARY KEY(name(32))\n"
")\n");
sqlMaybeMakeTable(conn, "refMrna", query);
sqlSafef(query, sizeof query, "delete from refMrna");
sqlUpdate(conn, query);

/* Scan through locus link to omim ID file and put in hash. */
    {
    char *row[2];

    printf("Scanning %s\n", mim2locFile);
    lf = lineFileOpen(mim2locFile, TRUE);
    while (lineFileRow(lf, row))
	{
	hashAdd(loc2mimHash, row[1], intToPt(atoi(row[0])));
	}
    lineFileClose(&lf);
    }

/* Scan through .ra file and make up start of refSeqInfo
 * objects in hash and list. */
printf("Scanning %s\n", raFile);
lf = lineFileOpen(raFile, TRUE);
while ((raHash = hashNextRa(lf)) != NULL)
    {
    if (clDots > 0 && ++dotMod == clDots )
        {
	dotMod = 0;
	dotOut();
	}
    AllocVar(rsi);
    slAddHead(&rsiList, rsi);
    if ((s = hashFindVal(raHash, "acc")) == NULL)
        errAbort("No acc near line %d of %s", lf->lineIx, lf->fileName);
    rsi->mrnaAcc = cloneString(s);
    if ((s = hashFindVal(raHash, "siz")) == NULL)
        errAbort("No siz near line %d of %s", lf->lineIx, lf->fileName);
    rsi->size = atoi(s);
    if ((s = hashFindVal(raHash, "gen")) != NULL)
	rsi->geneName = cloneString(s);
    //!!!else
      //!!!  warn("No gene name for %s", rsi->mrnaAcc);
    if ((s = hashFindVal(raHash, "cds")) != NULL)
        parseCds(s, 0, rsi->size, &rsi->cdsStart, &rsi->cdsEnd);
    else
        rsi->cdsEnd = rsi->size;
    if ((s = hashFindVal(raHash, "ngi")) != NULL)
        rsi->ngi = atoi(s);

    rsi->geneNameId = putInNameTable(geneHash, geneTab, rsi->geneName);
    s = hashFindVal(raHash, "pro");
    if (s != NULL)
        rsi->productName = cloneString(s);
    rsi->productNameId = putInNameTable(productHash, productTab, s);
    hashAdd(rsiHash, rsi->mrnaAcc, rsi);

    freeHashAndVals(&raHash);
    }
lineFileClose(&lf);
if (clDots) printf("\n");

/* Scan through loc2ref filling in some gaps in rsi. */
printf("Scanning %s\n", loc2refFile);
lf = lineFileOpen(loc2refFile, TRUE);
while (lineFileNext(lf, &line, NULL))
    {
    char *mrnaAcc;

    if (line[0] == '#')
        continue;
    wordCount = chopTabs(line, row);
    if (wordCount < 5)
        errAbort("Expecting at least 5 tab-separated words line %d of %s",
		lf->lineIx, lf->fileName);
    mrnaAcc = row[1];
    mrnaAcc = accWithoutSuffix(mrnaAcc);

    if (mrnaAcc[2] != '_')
        warn("%s is and odd name %d of %s", 
		mrnaAcc, lf->lineIx, lf->fileName);
    if ((rsi = hashFindVal(rsiHash, mrnaAcc)) != NULL)
        {
	rsi->locusLinkId = lineFileNeedNum(lf, row, 0);
	rsi->omimId = ptToInt(hashFindVal(loc2mimHash, row[0]));
	rsi->proteinAcc = cloneString(accWithoutSuffix(row[4]));
	}
    }
lineFileClose(&lf);

/* Report how many seem to be missing from loc2ref file. 
 * Write out knownInfo file. */
printf("Writing %s\n", "refLink.tab");
for (rsi = rsiList; rsi != NULL; rsi = rsi->next)
    {
    ++rsiCount;
    if (rsi->locusLinkId == 0)
        ++noLocCount;
    if (rsi->proteinAcc == NULL)
        ++noProtCount;
    fprintf(refLinkTab, "%s\t%s\t%s\t%s\t%u\t%u\t%u\t%u\n",
	emptyForNull(rsi->geneName), 
	emptyForNull(rsi->productName),
    	emptyForNull(rsi->mrnaAcc), 
	emptyForNull(rsi->proteinAcc),
	rsi->geneNameId, rsi->productNameId, 
	rsi->locusLinkId, rsi->omimId);
    }
if (noLocCount) 
    printf("Missing locusLinkIds for %d of %d\n", noLocCount, rsiCount);
if (noProtCount)
    printf("Missing protein accessions for %d of %d\n", noProtCount, rsiCount);

/* Process alignments and write them out as genes. */
lf = pslFileOpen(pslFile);
dotMod = 0;
while ((psl = pslNext(lf)) != NULL)
  {
  if (hashFindVal(rsiHash, psl->qName) != NULL)
    {
    if (clDots > 0 && ++dotMod == clDots )
        {
	dotMod = 0;
	dotOut();
	}
   
    sqlSafef(cond_str, sizeof cond_str, "extAC='%s'", psl->qName);
    answer = sqlGetField(proteinDB, "spXref2", "displayID", cond_str);
	       
    if (answer == NULL)
	{
	fprintf(stderr, "%s NOT FOUND.\n", psl->qName);
   	fflush(stderr);
	}

    if (answer != NULL)
    	{	
        struct genePred *gp = NULL;
    	exonList = pslToExonList(psl);
    	fprintf(kgTab, "%s\t%s\t%c\t%d\t%d\t",
	psl->qName, psl->tName, psl->strand[0], psl->tStart, psl->tEnd);
    	rsi = hashMustFindVal(rsiHash, psl->qName);

        gp = genePredFromPsl(psl, rsi->cdsStart, rsi->cdsEnd, genePredStdInsertMergeSize);
        if (!gp)
            errAbort("Cannot convert psl (%s) to genePred.\n", psl->qName);

    	fprintf(kgTab, "%d\t%d\t", gp->cdsStart, gp->cdsEnd);
    	fprintf(kgTab, "%d\t", slCount(exonList));
    
    	fflush(kgTab);
     
    	for (exon = exonList; exon != NULL; exon = exon->next)
        fprintf(kgTab, "%d,", exon->start);
    	fprintf(kgTab, "\t");
    
        for (exon = exonList; exon != NULL; exon = exon->next)
        	fprintf(kgTab, "%d,", exon->end);
    	fprintf(kgTab, "\n");
    	slFreeList(&exonList);
    	}
    }
  else
    {
    fprintf(stderr, "%s found in psl, but not in .fa or .ra data files.\n", psl->qName);
    fflush(stderr);
    }
  }

if (clDots) printf("\n");

if (!clTest)
    {
    writeSeqTable(pepFile, refPepTab, FALSE, TRUE);
    writeSeqTable(faFile, refMrnaTab, FALSE, FALSE);
    }

carefulClose(&kgTab);
carefulClose(&productTab);
carefulClose(&geneTab);
carefulClose(&refLinkTab);
carefulClose(&refPepTab);
carefulClose(&refMrnaTab);

if (!clTest)
    {
    printf("Loading database with %s\n", kgName);
    fflush(stdout);
    
    hgLoadTabFile(conn, ".", kgName, NULL);

    printf("Loading database with %s\n", "productName");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "productName", NULL);
    
    printf("Loading database with %s\n", "geneName");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "geneName", NULL);
    
    printf("Loading database with %s\n", "refLink");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "refLink", NULL);
    
    printf("Loading database with %s\n", "refPep");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "refPep", NULL);
    
    printf("Loading database with %s\n", "refMrna");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "refMrna", NULL);
    }
}

void hgRefSeqMrna(char *database, char *faFile, char *raFile, char *pslFile, 
	char *loc2refFile, char *pepFile, char *mim2locFile)
/* hgRefSeqMrna - Load refSeq mRNA alignments and other info into 
 * refSeqGene table. */
{
//catch anyone trying to load data into actual genome database
if (strstr(database, "Temp") == NULL)
    {
    errAbort("hgKgMrna is meant to load mrna data into a Temporary database only, exiting ...\n");
    }
processRefSeq(database, faFile, raFile, pslFile, loc2refFile, pepFile, mim2locFile);
}

int main(int argc, char *argv[])
/* Process command line. */
{
cgiSpoof(&argc, argv);
clTest = cgiBoolean("test");
clDots = cgiOptionalInt("dots", clDots);
if (argc != 9)
    usage();
proteinDB = argv[8];
hgRefSeqMrna(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7]);
return 0;
}
