/* object to track refseqs that are to be retrieved */

/* Copyright (C) 2013 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "refSeqVerInfo.h"
#include "linefile.h"
#include "hash.h"
#include "jksql.h"
#include "sqlNum.h"

enum refSeqVerInfoStatus
/* Result of validating a single accession before it is added to the table. */
{
    refSeqVerInfoOk     = 0,  // passed the check, keep going
    refSeqVerInfoIgnore = 1,  // duplicate, ignore
    refSeqVerInfoError  = 2   // check failed; an error message has been printed
};

static int refSeqVerInfoCmp(const void *va, const void *vb)
/* Sort comparator (passed to slSort in this file): orders two records by
 * accession using strcmp.  Each argument points to a pointer to a
 * refSeqVerInfo. */
{
const struct refSeqVerInfo *const *left = va;
const struct refSeqVerInfo *const *right = vb;
return strcmp((*left)->acc, (*right)->acc);
}


static struct refSeqVerInfo *refSeqVerInfoNewDb(char **row)
/* Allocate a refSeqVerInfo from one query result row: row[0] is the
 * accession (copied), row[1] is the version, parsed with sqlSigned. */
{
struct refSeqVerInfo *info;
AllocVar(info);
info->acc = cloneString(row[0]);
info->ver = sqlSigned(row[1]);
return info;
}

struct hash *refSeqVerInfoFromDb(struct sqlConnection *conn, boolean getNM, boolean getNR, struct refSeqVerInfo **refSeqVerInfoList)
/* Query gbStatus for native RefSeq accessions and their versions.
 *   conn   - open database connection
 *   getNM  - if set and getNR is not, restrict to accessions matching NM%
 *   getNR  - if set and getNM is not, restrict to accessions matching NR%
 *            (both set or both clear: no accession restriction)
 *   refSeqVerInfoList - output: list of the created records, sorted by
 *            accession
 * Returns a hash mapping accession to the same records that are on the list. */
{
struct hash *tbl = hashNew(18);
*refSeqVerInfoList = NULL;

/* Optional extra WHERE clause; only applied when exactly one flag is set. */
char *accRestrict = "";
if (getNM)
    {
    if (!getNR)
        accRestrict = " AND (acc LIKE \"NM%\")";
    }
else if (getNR)
    accRestrict = " AND (acc LIKE \"NR%\")";

/* NOTE(review): accRestrict is an SQL fragment containing quotes and is
 * inserted via %s -- confirm sqlSafef passes it through unescaped. */
char query[128];
sqlSafef(query, sizeof(query),
      "SELECT acc, version FROM gbStatus WHERE (srcDb = \"RefSeq\") AND (orgCat = \"native\")%s", accRestrict);

struct sqlResult *sr = sqlGetResult(conn, query);
char **row;
for (row = sqlNextRow(sr); row != NULL; row = sqlNextRow(sr))
    {
    struct refSeqVerInfo *info = refSeqVerInfoNewDb(row);
    hashAdd(tbl, info->acc, info);
    slAddHead(refSeqVerInfoList, info);
    }
sqlFreeResult(&sr);

slSort(refSeqVerInfoList, refSeqVerInfoCmp);
return tbl;
}

static struct refSeqVerInfo *refSeqVerInfoNewFile(char *acc)
/* Allocate a refSeqVerInfo from an accession string read from a file.
 * Input is either "ACC" or "ACC.VERSION": the part before the first '.' is
 * copied into acc, and the part after it is parsed with sqlUnsigned into
 * requestVer.  Without a '.', requestVer is not assigned here (the rest of
 * this file treats requestVer == 0 as "no version requested"). */
{
struct refSeqVerInfo *info;
AllocVar(info);
char *versionSep = strchr(acc, '.');
if (versionSep == NULL)
    info->acc = cloneStringZ(acc, strlen(acc));
else
    {
    info->acc = cloneStringZ(acc, versionSep - acc);
    info->requestVer = sqlUnsigned(versionSep + 1);
    }
return info;
}

static void prAccReqVer(struct refSeqVerInfo *rsvi, FILE *fh)
/* Write the accession to fh, followed by ".<requestVer>" when a version was
 * requested (requestVer != 0).  No newline is written. */
{
fputs(rsvi->acc, fh);
if (rsvi->requestVer == 0)
    return;  // no version requested: accession only
fprintf(fh, ".%d", rsvi->requestVer);
}

static enum refSeqVerInfoStatus dupCheck(struct hash *refSeqVerInfoTbl, struct refSeqVerInfo *rsvi, struct sqlConnection *conn)
/* Check whether rsvi's accession is already present in refSeqVerInfoTbl.
 * Returns:
 *   refSeqVerInfoOk     - accession not yet in the table
 *   refSeqVerInfoIgnore - already in the table with the same requested version
 *   refSeqVerInfoError  - already in the table with a different requested
 *                         version; a message naming both forms goes to stderr
 * conn is not used by this check. */
{
struct refSeqVerInfo *rsvi0 = hashFindVal(refSeqVerInfoTbl, rsvi->acc);
if (rsvi0 == NULL)
    return refSeqVerInfoOk;
if (rsvi0->requestVer == rsvi->requestVer)
    return refSeqVerInfoIgnore;  // already there with the same version, ignore

/* Conflicting requests: report the earlier entry and then the new one. */
fprintf(stderr, "Error: RefSeq %s specified multiple times in accList, once as ", rsvi->acc);
prAccReqVer(rsvi0, stderr);
fputs(" and once as ", stderr);
prAccReqVer(rsvi, stderr);  // the new entry, not the stored one a second time
fputc('\n', stderr);
return refSeqVerInfoError;
}

int refSeqVerInfoGetVersion(char *acc, struct sqlConnection *conn)
/* Look up the version recorded for acc in hgFixed.gbSeq using conn.
 * Returns the version, or zero if the accession is not found. */
{
char sql[128];
sqlSafef(sql, sizeof(sql),
      "SELECT version FROM hgFixed.gbSeq WHERE (acc = \"%s\")", acc);
int version = sqlQuickNum(conn, sql);
return version;
}

static enum refSeqVerInfoStatus versionGetCheck(struct refSeqVerInfo *rsvi, struct sqlConnection *conn)
/* get or validate the version from the file against the database */
{
int dbVer = refSeqVerInfoGetVersion(rsvi->acc, conn);
if (dbVer == 0)
    {
    fprintf(stderr, "Error: RefSeq %s not in database\n", rsvi->acc);
    return refSeqVerInfoError;
    }
if ((rsvi->requestVer != 0) && (dbVer != rsvi->requestVer))
    {
    fprintf(stderr, "Error: RefSeq %s.%d requested in accList, database contains %s.%d \n", rsvi->acc, rsvi->requestVer, rsvi->acc, dbVer);
    return refSeqVerInfoError;
    }
rsvi->ver = dbVer;
return refSeqVerInfoOk;
}

static enum refSeqVerInfoStatus fromFileAdd(struct hash *refSeqVerInfoTbl, struct refSeqVerInfo *rsvi, struct sqlConnection *conn, struct refSeqVerInfo **refSeqVerInfoList)
/* add a refseq parsed from a file to the table, validating against database
 * and setting the actually version number. */
{
enum refSeqVerInfoStatus stat = dupCheck(refSeqVerInfoTbl, rsvi, conn);
if (stat != refSeqVerInfoOk)
    return stat;
stat = versionGetCheck(rsvi, conn);
if (stat != refSeqVerInfoOk)
    return stat;
hashAdd(refSeqVerInfoTbl, rsvi->acc, rsvi);
slAddHead(refSeqVerInfoList, rsvi);
return refSeqVerInfoOk;
}

struct hash *refSeqVerInfoFromFile(struct sqlConnection *conn, char *accList, struct refSeqVerInfo **refSeqVerInfoList)
/* load refSeqVerInfo table for all native refseqs specified in a file, then validate it against
 * the database. */
{
struct hash *refSeqVerInfoTbl = hashNew(18);
*refSeqVerInfoList = NULL;
struct lineFile *lf = lineFileOpen(accList, TRUE); 
int errCnt = 0;
char *line;
while (lineFileNextReal(lf, &line))
    {
    char *acc = trimSpaces(line);
    if (fromFileAdd(refSeqVerInfoTbl, refSeqVerInfoNewFile(acc), conn, refSeqVerInfoList) == refSeqVerInfoError)
        errCnt++;
    }
lineFileClose(&lf);
if (errCnt > 0)
    errAbort("%d errors detected loading RefSeq accessioned from %s", errCnt, accList);
slSort(refSeqVerInfoList, refSeqVerInfoCmp);
return refSeqVerInfoTbl;
}

