/* kgPick - select the best repersentative mRNA/protein pair from KG candidates 
   having identical CDS Structure */

/* Copyright (C) 2013 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "hCommon.h"
#include "hdb.h"

char printedMrna[40], printedProt[40], printedCds[40];

void usage()
/* Explain usage and exit. */
{
errAbort(
  "kgPick - select the best repersentative mRNA/protein pair\n"
  "usage:\n"
  "   kgPick tempDb outfile\n"
  "      tempDb is the temp KG DB name\n"
  "      roDb   is the read only genome DB name\n"
  "      spDb   is the proteinsYYMMDDt DB name\n"
  "      outfile is the output file name\n"
  "      dupOutfile is the duplicate mrna/prot output file name\n"
  "example: kgPick kgHg17ETemp hg17 proteins050415 kg2.tmp dupSpMrna.tmp\n");
}

/* convert a possible compund mrnaID ("protID_mrnaID") to just the mrnaID. */
char *convert(char *compound)
{
char *chp;
    
chp = compound;
if ((*chp == 'N') && (*(chp+1) == 'M') && (*(chp+2) == '_'))
    {
    return compound;
    }
else
    {
    chp = strstr(compound, "_");
    if (chp == NULL) return compound;

    chp ++;
    }
return chp;
}	
    
void printKg(struct sqlConnection *conn, struct sqlConnection *conn2, 
             char *kgTempDb, char *spDb, char *cdsId, char *mrnaID, char *protAcc, 
	     char *alignID, FILE *outf)
{
char query[256];
struct sqlResult *sr;
char **row;
int i;
char condStr[256];
char *displayId;

sqlSafef(query, sizeof(query), "select * from %s.kgCandidate where alignID='%s'", kgTempDb, alignID);
sr = sqlMustGetResult(conn, query);
row = sqlNextRow(sr);
while (row != NULL)
    {
    fprintf(outf, "%s\t", convert(row[0]));
    for (i=1; i<10; i++) 
    	{
	fprintf(outf, "%s\t", row[i]);
	}
    
    /* get protein display ID */
    sqlSafef(condStr, sizeof condStr, "acc='%s'", protAcc);
    displayId = sqlGetField(spDb, "displayId", "val", condStr);
    if (displayId == NULL) displayId = protAcc;
    
    fprintf(outf, "%s\t%s\t%s\n", displayId, alignID, cdsId);
    
    /* remember what have been printed */
    safecpy(printedCds, sizeof(printedCds), cdsId);
    safecpy(printedMrna, sizeof(printedCds), row[0]);
    safecpy(printedProt, sizeof(printedProt), displayId);
    row = sqlNextRow(sr);
    }
sqlFreeResult(&sr);
}

int main(int argc, char *argv[])
{
struct sqlConnection *conn2, *conn3, *conn4, *conn5;
char query2[256], query3[256];
struct sqlResult *sr2, *sr3;
char **row2, **row3;
char condStr[256];
char *kgTempDb;
char *outfileName, *dupOutfileName;
FILE *outf, *dupOutf;
char *score;
char *protAcc;
char *ranking;
char *cdsId, *mrnaID, *alignID;
char *protDbId;
char *spDb;
boolean gotRefseq, gotNonRef;
boolean isRefseq;
int printCnt;
char *roDb;
char *refseqId;

if (argc != 6) usage();
kgTempDb       = argv[1];
roDb           = argv[2];
spDb           = argv[3];
outfileName    = argv[4];
dupOutfileName = argv[5];

outf    = mustOpen(outfileName, "w");
dupOutf = mustOpen(dupOutfileName, "w");

conn2= hAllocConn(roDb);
conn3= hAllocConn(roDb);
conn4= hAllocConn(roDb);
conn5= hAllocConn(roDb);

strcpy(printedCds, "");
strcpy(printedMrna, "");
strcpy(printedProt, "");

/* go through each unique cds structure */
sqlSafef(query2, sizeof(query2), "select distinct cdsId from %s.kgCandidateZ", kgTempDb);
sr2 = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);

while (row2 != NULL)
    {
    /* get all mrna/prot pairs with this CDS structure */
    cdsId = row2[0];
   
    /* ranking reflects CDS quaility and preference for RefSeq and MGC */
    /* Swiss-Prot is prefered than TrEMBL */
    /* finally, higher prot-Mrna alignment score is prefered */
    sqlSafef(query3, sizeof(query3), 
    "select * from %s.kgCandidateZ where cdsId='%s' order by ranking, protDbId, score desc", 
    	    kgTempDb, cdsId);
    sr3  = sqlMustGetResult(conn3, query3);
    row3 = sqlNextRow(sr3);
	      
    gotRefseq    = FALSE;
    gotNonRef    = FALSE;
    
    printCnt = 0;
    while (row3 != NULL)
        {
        cdsId    = row3[0];
	ranking  = row3[1];
	protDbId = row3[2];
	score    = row3[3];
	mrnaID   = row3[4];
	protAcc  = row3[5];
	alignID  = row3[6];

    	isRefseq    = FALSE;
	
	if ((mrnaID[0] == 'N') && (mrnaID[1] == 'M') && (mrnaID[2] == '_'))
	    {
	    isRefseq  = TRUE;
	    }
	
	if (isRefseq)
	    {
	    /* print one qualified RefSeq */
	    if (!gotRefseq)
	    	{
		/* double check again to see if the RefSeq is still valid */
    		sqlSafef(condStr, sizeof condStr, "name='%s'", mrnaID);
    		refseqId = sqlGetField(roDb, "refGene", "name", condStr);
		if (refseqId != NULL)
		    {
		    if (printCnt < 1)
		    	{
		    	printKg(conn4, conn5, kgTempDb, spDb, cdsId, mrnaID, protAcc, 
		    		alignID, outf);
		    	printCnt++;
		    	gotRefseq = TRUE;
		    	}
		    }
		else
		    {
		    fprintf(stderr, "%s no longer found in refGene.\n", mrnaID);
		    }
		}
	    }
	else
	    {
	    /* print out only one non-RefSeq entry for this CDS structure */
	    if (!gotRefseq && (!gotNonRef))
	    	{
	        printKg(conn4, conn5, kgTempDb, spDb, cdsId, mrnaID, protAcc, alignID, outf);
		printCnt++;
		gotNonRef = TRUE;
		}
	    }
	
	/* save duplicates of mrna/prot that has same CDS but does not make it to final KG set */
	if (sameWord(printedCds, cdsId))
	    {
	    if ( (!sameWord(printedMrna, mrnaID)) || (!sameWord(printedProt, protAcc)) )
	    	{
	    	fprintf(dupOutf, "%s\t%s\t%s\t%s\n", convert(printedMrna), printedProt, convert(mrnaID), protAcc);
	   	}
	    }
        row3 = sqlNextRow(sr3);
	}
    
    sqlFreeResult(&sr3);
    row2 = sqlNextRow(sr2);
    }
sqlFreeResult(&sr2);

hFreeConn(&conn2);
hFreeConn(&conn3);
fclose(outf);
fclose(dupOutf);
return(0);
}

