/* rmKGPepMrna generates new .tab files with unused mRNA and protein sequences from known genes db tables removed. */

/* Copyright (C) 2013 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "common.h"
#include "hCommon.h"
#include "hdb.h"


void usage()
/* Explain usage and exit. */
{
errAbort(
  "rmKGPepMrna - generate new .tab files with unused mRNA and protein sequences from known genes db tables removed."
  "usage:\n"
  "   rmKGPepMrna xxxx yyyy\n"
  "      xxxx is the genome  database name\n"
  "      yyyy is the release date of the biosql and proteins DBs\n"
  "example: rmKGPepMrna hg15 0405\n");
}

int main(int argc, char *argv[])
{
struct sqlConnection *conn, *conn2;
char query2[256];
struct sqlResult *sr2;
char **row2;

char cond_str[256];
char *kgID;
char *proteinID;
char *seq;
char *acc;
 
char protDbName[100];
char spDbName[100];
char *dbName;
char *ro_dbName;

FILE *o1, *o2;

struct dnaSeq *kgSeq;
    
if (argc != 4) usage();

o1 = fopen("j.dat",  "w");
o2 = fopen("jj.dat", "w");
    
dbName = argv[1];
ro_dbName = argv[3];
sprintf(protDbName,   "proteins%s", argv[2]);
sprintf(spDbName, "sp%s",   argv[2]);

conn= hAllocConn(ro_dbName);
conn2= hAllocConn(ro_dbName);
sqlSafef(query2, sizeof query2, "select name from %s.knownGene;", dbName);
sr2 = sqlMustGetResult(conn2, query2);
row2 = sqlNextRow(sr2);
while (row2 != NULL)
    {
    kgID    = row2[0];
    
    sqlSafef(cond_str, sizeof cond_str, "name = '%s';", kgID);
    seq = sqlGetField(dbName, "knownGenePep", "seq", cond_str);
    if (seq != NULL)
	{
        fprintf(o1, "%s\t%s\n", kgID, seq);fflush(o1);
	}
    else
	{
        sqlSafef(cond_str, sizeof cond_str, "name = '%s';", kgID);
        proteinID=sqlGetField(dbName, "knownGene", "proteinID", cond_str);
	if (proteinID != NULL)
	    {
            sqlSafef(cond_str, sizeof cond_str, "val = '%s';", proteinID);
            acc = sqlGetField(spDbName, "displayId", "acc", cond_str);
	    if (acc == NULL)
		{
fprintf(stderr, "NO acc.displayId.%s: %s from name.knownGene.%s: %s\n", spDbName, proteinID, dbName, kgID);
		fflush(stderr);
		}
	    else
		{
		sqlSafef(cond_str, sizeof cond_str, "acc = '%s';", acc);
		seq = sqlGetField(spDbName, "protein", "val", cond_str);
		if (seq == NULL)
		    {
		    fprintf(stderr, "NO protein seq for %s\n", kgID);
		    fprintf(stderr, "proteinID.knownGene.%s: %s, acc.displayID.%s: %s\n", dbName, proteinID, spDbName, acc);
		    fflush(stderr);
		    }
		    else
		    {
		    fprintf(o1, "%s\t%s\n", kgID, seq);
		    }
		}
	    } else {
fprintf(stderr, "kgID: %s not in knownGenePep or knownGene\n", kgID);
	    }
	}

    sqlSafef(cond_str, sizeof cond_str, "name = '%s';", kgID);
        
    seq = sqlGetField(dbName, "knownGeneMrna", "seq", cond_str);
    if (seq != NULL)
    	{
        fprintf(o2, "%s\t%s\n", kgID, seq);fflush(o1);
        }
    else
        {
	kgSeq = hGenBankGetMrna(dbName, kgID, NULL);
	   
	if (kgSeq != NULL)
	    {
            fprintf(o2, "%s\t%s\n", kgID, kgSeq->dna);fflush(o1);
            }
	else
	    {
	    fprintf(stderr, "NO mRNA seq for %s\n", kgID);fflush(stderr);
	    }
	}
    row2 = sqlNextRow(sr2);
    }

sqlFreeResult(&sr2);
hFreeConn(&conn);
hFreeConn(&conn2);

fclose(o1);
fclose(o2);
    
mustSystem("cat j.dat |sort|uniq > knownGenePep.tab");
mustSystem("cat jj.dat|sort|uniq > knownGeneMrna.tab");
mustSystem("rm j.dat");
mustSystem("rm jj.dat");

return(0);
}

