/* spKgMap - map swissprot accs to known gene ids */

/* Copyright (C) 2013 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "spKgMap.h"
#include "jksql.h"
#include "kgXref.h"
#include "kgProtAlias.h"
#include "localmem.h"
#include "hash.h"
#include "linefile.h"


struct spKgMap
/* Map of swissprot accessions to known gene ids. */
{
    struct hash *spHash;  /* keyed by swissprot acc; value is an slName list
                           * of known gene ids, or NULL if no mapping */
};

static void spKgMapAdd(struct spKgMap *spKgMap, char *spAcc, char *kgId)
/* Record that swissprot accession spAcc maps to known gene kgId.  A pair
 * that is already present is not added a second time. */
{
struct hashEl *hel = hashStore(spKgMap->spHash, spAcc);
struct slName *newId;

/* this kgId is already on the accession's list: nothing to do */
if ((hel->val != NULL) && slNameInList(hel->val, kgId))
    return;

/* the list node is allocated from the hash's own local memory pool; it is
 * kept in a local because slAddHead may evaluate its argument repeatedly */
newId = lmSlName(spKgMap->spHash->lm, kgId);
slAddHead((struct slName**)&hel->val, newId);
}

static void loadKgXRef(struct spKgMap *spKgMap,
                       struct sqlConnection *conn)
/* Load every row of the kgXref table into the map, mapping each row's
 * spID to its kgID.  Aborts if the table does not have KGXREF_NUM_COLS
 * columns. */
{
char query[1024];
struct kgXref kgXref;
struct sqlResult *sr;
char **row;
int fieldCount;

/* Verify that the number of fields present in this kgXref table is what's
 * expected, since more fields were added to the schema recently (10/19/2011) */
struct slName *kgXrefFields = sqlListFields(conn, "kgXref");
fieldCount = slCount(kgXrefFields);
/* slFreeList takes the address of the list pointer; passing the head itself
 * would leave the first node unfreed */
slFreeList(&kgXrefFields);
if (fieldCount != KGXREF_NUM_COLS)
    {
    errAbort("This genome has %d columns in kgXref but %d are expected - old genome?", 
	     fieldCount, KGXREF_NUM_COLS);
    }

/* NOTE(review): if spID can be empty, all such rows are stored under the ""
 * key - confirm that is intended. */
sqlSafef(query, sizeof(query), "SELECT * from kgXref");
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    kgXrefStaticLoad(row, &kgXref);
    spKgMapAdd(spKgMap, kgXref.spID, kgXref.kgID);
    }
sqlFreeResult(&sr);
}

static void loadKgProtAlias(struct spKgMap *spKgMap,
                            struct sqlConnection *conn)
/* Load entries from kgProtAlias whose alias appears to be another
 * swissprot accession.  An alias qualifies if it is more than two
 * characters long, starts with O, P, or Q, and has a digit as its
 * second character.  Each qualifying alias is mapped to the row's kgID. */
{
char query[1024];
struct kgProtAlias kgPA;
struct sqlResult *sr;
char **row;

sqlSafef(query, sizeof(query), "SELECT * from kgProtAlias");
sr = sqlGetResult(conn, query);
while ((row = sqlNextRow(sr)) != NULL)
    {
    kgProtAliasStaticLoad(row, &kgPA);
    /* cast to unsigned char: handing isdigit() a negative char value is
     * undefined behavior */
    if ((strlen(kgPA.alias) > 2)
        && ((kgPA.alias[0] == 'O') || (kgPA.alias[0] == 'P') || (kgPA.alias[0] ==  'Q'))
        && isdigit((unsigned char)kgPA.alias[1]))
        spKgMapAdd(spKgMap, kgPA.alias, kgPA.kgID);
    }
sqlFreeResult(&sr);
}

static void addSpSecondaryAcc(struct spKgMap *spKgMap, char *spAcc, char *spAcc2)
/* Add a secondary accession to the map: spAcc2 is mapped to every known
 * gene id already recorded for spAcc.  Nothing is added if spAcc has no
 * mapping. */
{
struct slName *kgId;

/* walk the whole list, not just its head, so that an accession mapped to
 * several known genes passes all of them on to the secondary accession;
 * spKgMapAdd skips ids that are already present */
for (kgId = spKgMapGet(spKgMap, spAcc); kgId != NULL; kgId = kgId->next)
    spKgMapAdd(spKgMap, spAcc2, kgId->name);
}

static void loadSpSecondaryAcc(struct spKgMap *spKgMap)
/* Read the acc and val columns of the otherAcc table in the swissProt
 * database and register each val as a secondary accession of acc. */
{
struct sqlConnection *spConn = sqlConnect("swissProt");
struct sqlResult *sr;
char query[1024];
char **row;

sqlSafef(query, sizeof(query), "SELECT acc,val FROM otherAcc");
sr = sqlGetResult(spConn, query);
for (row = sqlNextRow(sr); row != NULL; row = sqlNextRow(sr))
    {
    char *acc = row[0];
    char *otherAcc = row[1];
    addSpSecondaryAcc(spKgMap, acc, otherAcc);
    }
sqlFreeResult(&sr);
sqlDisconnect(&spConn);
}

struct spKgMap *spKgMapNew(struct sqlConnection *conn)
/* Build a map from swissprot accs to known gene ids using the kgXref and
 * kgProtAlias tables reached through conn, plus secondary accessions from
 * the swissProt database.  Release the result with spKgMapFree. */
{
struct spKgMap *map;

AllocVar(map);
map->spHash = hashNew(24);

/* secondary accessions go last: they are resolved against the mappings
 * the first two loaders have already stored */
loadKgXRef(map, conn);
loadKgProtAlias(map, conn);
loadSpSecondaryAcc(map);

return map;
}

void spKgMapFree(struct spKgMap **spKgMapPtr)
/* Free a spKgMap structure and set the caller's pointer to NULL.  Does
 * nothing if *spKgMapPtr is already NULL. */
{
struct spKgMap *map = *spKgMapPtr;

if (map == NULL)
    return;

hashFree(&map->spHash);
freeMem(map);
*spKgMapPtr = NULL;
}

struct slName *spKgMapGet(struct spKgMap *spKgMap, char *spAcc)
/* Look up the known gene ids recorded for swissprot accession spAcc.
 * Returns the list, or NULL if the accession is not found. */
{
struct slName *kgIds = hashFindVal(spKgMap->spHash, spAcc);
return kgIds;
}

