/* spToProteins- Create tab delimited data file for proteinsxxxx database */
/* CURRENTLY UNUSED, possibly because it is very slow. */

/* Copyright (C) 2013 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "common.h"
#include "hCommon.h"
#include "hdb.h"
#include "spDb.h"
#include "options.h"

void usage()
/* Report the command-line synopsis via errAbort and exit. */
{
static char *helpText =
    "spToProteins- Create tab delimited data files from spxxxx database for proteinsxxxx database.\n"
    "usage:\n"
    "   spToProteins xxxx\n"
    "      xxxx is the release date of SWISS-PROT data\n"
    "Example: spToProteins 100503\n";

errAbort("%s", helpText);
}

int main(int argc, char *argv[])
/* Export protein records from the spXXXX database as tab-separated files
 * for the proteinsXXXX database.  argv[1] is the release date suffix XXXX.
 *
 * For every row of spXXXX.info this writes:
 *   spXref3.tab         - one line per accession: accession, displayId, taxon,
 *                         running entry number, bioDatabase code, description,
 *                         HGNC symbol and HGNC name
 *   temp_spXref2.dat    - one line per external database cross-reference
 *   temp_spOrganism.dat - one line per item returned by spTaxons()
 * Afterwards the two temp files are sorted, de-duplicated into spXref2.tab
 * and spOrganism.tab, and removed.  Returns 0; failures abort through the
 * library's error handling (mustOpen, sqlMustGetResult, mustSystem, ...). */
{
struct sqlConnection *conn, *conn2, *conn3;
char query[256], query2[256];
struct sqlResult *sr, *sr2;
char **row, **row2;
char cond_str[255];
char proteinDatabaseName[255];
char proteinsDB[255];
char empty_str[1] = {""};
FILE *o1, *o2, *o3;
char *proteinDataDate;
int bioDatabase, bioentryId;
int totalIds;

optionHash(&argc, argv);
if (argc != 2) usage();

proteinDataDate = argv[1];
safef(proteinDatabaseName, sizeof(proteinDatabaseName), "sp%s", proteinDataDate);
safef(proteinsDB, sizeof(proteinsDB), "proteins%s", proteinDataDate);

o1 = mustOpen("temp_spXref2.dat", "w");
o2 = mustOpen("spXref3.tab", "w");
o3 = mustOpen("temp_spOrganism.dat", "w");

/* conn runs the per-accession cross-reference query, conn2 streams the
 * outer scan of the info table, conn3 serves the spTaxon/spTaxons lookups. */
conn  = hAllocConn(hDefaultDb());
conn2 = hAllocConn(hDefaultDb());
conn3 = sqlConnect(proteinDatabaseName);

bioentryId = 0;

/* The total is only used for the progress message. */
sqlSafef(query2, sizeof(query2), "select count(*) from %s.info", proteinDatabaseName);
totalIds = sqlQuickNum(conn2, query2);
sqlSafef(query2, sizeof(query2), "select acc, isCurated from %s.info;", proteinDatabaseName);
sr2 = sqlMustGetResult(conn2, query2);
while ((row2 = sqlNextRow(sr2)) != NULL)
    {
    /* accession and isCurated point into row2, which is only replaced by
     * the sqlNextRow(sr2) call at the top of the loop. */
    char *accession = row2[0];
    char *isCurated = row2[1];
    /* Raw lookup results; any of them may be NULL when no row matched. */
    char *displayId, *desc, *hugoSymbol, *hugoDesc;
    /* Printable views of the above, never NULL. */
    char *displayIdOut, *descOut, *hugoSymbolOut, *hugoDescOut;
    struct slName *taxonList, *name;
    int taxon;

    if (bioentryId%1000 == 0)
        verbose(1, "Processed %d of %d %5.2f%%\n", bioentryId, totalIds, 100.0*bioentryId/totalIds);
    bioentryId++;

    verbose(3, "%d %s %s\n", bioentryId, accession, isCurated);

    /* Curated entries get code 1; uncurated ones get 3 when the accession
     * is longer than 7 characters and 2 otherwise. */
    if (*isCurated == '1')
        bioDatabase = 1;
    else if (strlen(accession) > 7)
        bioDatabase = 3;
    else
        bioDatabase = 2;

    /* displayId and description are keyed by the same condition, so build
     * it once and reuse it for both lookups. */
    sqlSafef(cond_str, sizeof(cond_str), "acc='%s'", accession);
    displayId = sqlGetField(proteinDatabaseName, "displayId", "val", cond_str);
    desc = sqlGetField(proteinDatabaseName, "description", "val", cond_str);

    sqlSafef(cond_str, sizeof(cond_str), "uniProt='%s'", accession);
    hugoSymbol = sqlGetField(proteinsDB, "hgnc", "symbol", cond_str);
    hugoDesc = sqlGetField(proteinsDB, "hgnc", "name", cond_str);

    /* Passing NULL to a %s conversion is undefined behaviour, so every
     * possibly-missing field is printed as an empty column instead. */
    displayIdOut  = (displayId  != NULL) ? displayId  : empty_str;
    descOut       = (desc       != NULL) ? desc       : empty_str;
    hugoSymbolOut = (hugoSymbol != NULL) ? hugoSymbol : empty_str;
    hugoDescOut   = (hugoDesc   != NULL) ? hugoDesc   : empty_str;

    // !!! the division field probably should be eliminated later
    // use the simple 1 taxon returned value function for the time being,
    // could expand into multiple by calling spBinomialNames later
    taxon = spTaxon(conn3, accession);

    taxonList = spTaxons(conn3, accession);
    for (name = taxonList; name != NULL; name = name->next)
        fprintf(o3, "%s\t%s\n", displayIdOut, name->name);

    fprintf(o2, "%s\t%s\t%d\t%d\t%d\t%s\t%s\t%s\n", accession, displayIdOut,
            taxon, bioentryId, bioDatabase, descOut, hugoSymbolOut, hugoDescOut);

    /* One output line per external database reference of this accession. */
    sqlSafef(query, sizeof(query),
            "select extAcc1, extDb.val from sp%s.extDb, sp%s.extDbRef where extDbRef.acc='%s' %s",
            proteinDataDate, proteinDataDate, accession, "and extDb.id = extDbRef.extDb;");
    sr = sqlMustGetResult(conn, query);
    while ((row = sqlNextRow(sr)) != NULL)
        {
        char *extAC = row[0];
        char *extDb = row[1];

        fprintf(o1, "%s\t%s\t%d\t%s\t%s\t%d\t%d\n", accession, displayIdOut, taxon,
                extDb, extAC, bioentryId, bioDatabase);
        }
    sqlFreeResult(&sr);

    /* Release everything allocated for this accession; the loop runs once
     * per info row, so leaking here grows with the size of the database.
     * sqlGetField returns a heap copy (per the kent hdb library) and
     * freeMem accepts NULL; the empty_str fallback is never freed because
     * only the raw pointers are passed. */
    slFreeList(&taxonList);
    freeMem(displayId);
    freeMem(desc);
    freeMem(hugoSymbol);
    freeMem(hugoDesc);
    }

/* carefulClose aborts if the final flush/close fails, so a short write
 * cannot silently feed truncated data into the sort step below. */
carefulClose(&o1);
carefulClose(&o2);
carefulClose(&o3);

sqlFreeResult(&sr2);
hFreeConn(&conn);
hFreeConn(&conn2);
sqlDisconnect(&conn3);

mustSystem("cat temp_spXref2.dat | sort |uniq > spXref2.tab");
mustSystem("rm temp_spXref2.dat");
mustSystem("cat temp_spOrganism.dat | sort |uniq > spOrganism.tab");
mustSystem("rm temp_spOrganism.dat");
return(0);
}

