/* spMapper - map swissprot accessions to KnownGene ids.  Intended to use in
 * building gene sorter tables.  Can optionally map uniref entry ids.  Used
 * to collect results so that duplication of entries can be avoided.  */

/* Copyright (C) 2005 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#ifndef SPMAPPER_H
#define SPMAPPER_H

struct sqlConnection *conn;

struct spMapper
/* mapper object */
{
    struct spKgMap *spKgMap;      /* hash of swissprot id to kg id. */
    struct unirefTbl *unirefTbl;  /* table of uniref entries. */
    struct hash *noMapTbl;        /* table of ids that could not be mapped,
                                   * value is code indicating the reasons */

    /* counts of ids that count not be found, only counted once */
    int noSpIdMapCnt;       /* couldn't map a sp id */
    int noUnirefMapCnt;     /* couldn't map a uniref entry id */

    /* counts of swissprot query/target pair mappings */
    int qtMapCnt;           /* pairs successfuly mapped */
    int qtNoSpIdMapCnt;     /* pairs not mapped due to no sp mapping */
    int qtNoUnirefMapCnt;   /* pairs not mapped due to no uniref entry mapping */

    int scoreDir;           /* 1 if higher score is better, -1 if lower score
                             * is better */

    /* hash table of best score for query and target ids.  A naive
     * implementation would have the pair as a single string, but this greatly
     * increase the memory usage.  Instead, both are entered separately, point
     * to the same struct. */
    struct hash *kgPairMap;      /* hash to kgEntry objects, which are chained
                                  * together for a given id */
    struct kgPair *kgPairList;   /* list of all spPair objects. */
};

struct kgEntry
/* struct used to store a known gene id with a link to the next entry
 * and the associated spPairScore struct. */
{
    struct kgEntry *next;
    char *id;                /* kd id, memory not owned by this struct */
    struct kgPair *kgPair;   /* containing shared score */
};

struct kgPair
/* structure used to keep the best e-value of a pair of kg ids. */
{
    struct kgPair *next;
    struct kgEntry *kg1Entry;   /* pointers back to each entry */
    struct kgEntry *kg2Entry;
    float score;                 /* best score for the pair */
};

  
struct spMapper *spMapperNew(struct sqlConnection *conn, int scoreDir,
                             char *unirefFile, char *orgFilter);
/* construct a new spMapper object. scoreDir is 1 if higher score is better,
 * -1 if lower score is better If unirefFile is not null, this is a tab file
 * reformatting of the uniref xml file with member data extracted (see
 * uniref.h).  If orgFilter is not null, then only uniref members from this
 * organism are loaded.
 */

void spMapperFree(struct spMapper **smPtr);
/* free smMapper object */

void spMapperMapPair(struct spMapper *sm, char *qSpId, char *tSpId, float score);
/* map a pair of swissprot/uniprot ids to one or more pairs of known gene
 * ids and add them to the table */

void spMapperPrintNoMapInfo(struct spMapper *sm, char *outFile);
/* Print missed id table. First column is id, second column is R if it wasn't
 * found in uniref, or K if it couldn't be associated with a known gene */


#endif
