/* pairClo - pair up 3' and 5' ESTs taken from the same clone.
 * Input - an EST .ra file and it's clone-keyed index.
 * Output - a pair list file. */
#include "common.h"
#include "hash.h"
#include "sig.h"
#include "snof.h"
#include "snofmake.h"

enum {
    maxCloNameSize = 64,
    };

void usage()
{
errAbort(
    "pairClo - pair up 3' and 5' ESTs taken from the same clone.\n"
    "usage:\n"
    "   pairClo  est.ra est_clo.ix output.pai\n");
}

struct cloIx
/* Structure to manage an index file. */
    {
    FILE *file;
    int itemSize;
    int maxNameSize;
    char *fileName;
    };

struct cloIx *openIx(char *ixName)
/* Open up index file */
{
struct cloIx *cx;
int sigBuf[4];
FILE *f;

AllocVar(cx);
cx->fileName = ixName;
f = cx->file = mustOpen(ixName, "rb");
mustRead(f, sigBuf, sizeof(sigBuf));
if (!isSnofSig(&sigBuf))
    errAbort("Bad signature on %s", ixName);
mustReadOne(f, cx->maxNameSize);
if (cx->maxNameSize >= maxCloNameSize)
    errAbort("Name field %d chars, can only handle %d", cx->maxNameSize, maxCloNameSize-1);
cx->itemSize = cx->maxNameSize + sizeof(unsigned);
return cx;
}

boolean nextIx(struct cloIx *cx, unsigned *retOff, char retName[maxCloNameSize])
/* Read next record from index file.  Return FALSE at EOF. */
{
int nameSize = cx->maxNameSize;
FILE *f = cx->file;
if (fread(retName, nameSize, 1, f) < 1)
    return FALSE;
retName[nameSize] = 0;
mustRead(f, retOff, sizeof(*retOff));
return TRUE;
}

enum {
    maxValSize = 512,
    };

struct keyVal
/* A list of key/value pairs. */
    {
    struct keyVal *next;
    struct keyVal *listNext;
    char key[16];
    char val[maxValSize];
    };

enum {
    keyHashSize = 32,
};
    
struct keyHash
/* This structure holds one parsed record from
 * a .ra (flat keyed) file. */
    {
    struct keyHash *next;
    struct keyVal *hash[keyHashSize];
    struct keyVal *list;
    };

struct keyVal *freeKeyValList = NULL;
struct keyHash *freeKeyHashList = NULL;

void freeKeyHash(struct keyHash **pKeyHash)
/* Free up keyHash for reuse. */
{
struct keyHash *kh;
if ((kh = *pKeyHash) != NULL)
    {
    struct keyVal *kv, *nextKv;
    for (kv = kh->list; kv != NULL; kv = nextKv)
        {
        nextKv = kv->listNext;
        slAddHead(&freeKeyValList, kv);
        }
    memset(kh, 0, sizeof(*kh));
    slAddHead(&freeKeyHashList, kh);
    *pKeyHash = NULL;
    }
}

struct keyHash *newKeyHash()
/* Get a new keyHash structure, from free list if possible. */
{
struct keyHash *kh;
if ((kh = freeKeyHashList) != NULL)
    {
    freeKeyHashList = kh->next;
    kh->next = NULL;
    return kh;
    }
else
    return needMem(sizeof(struct keyHash));
}

int keyHashFunc(char *key)
/* Hash function optimized for keyHash. */
{
int acc = 0;
char c;
while ((c = *key++) != 0)
    {
    acc <<= 2;
    acc += (c - 'a');
    if (acc > keyHashSize)
        {
        acc += (acc >> 5);
        acc &= (keyHashSize-1);
        }
    }
return acc;
}

void addKey(struct keyHash *kh, char *key, char *val)
/* Add new key/val pair to keyHash. */
{
struct keyVal *kv;
int hashIx;
int keySize, valSize;

/* Get a keyVal. */
if ((kv = freeKeyValList) != NULL)
    {
    freeKeyValList = kv->next;
    kv->next = NULL;
    }
else
    {
    kv = needMem(sizeof(*kv));
    }
/* Make sure that key and val aren't too long. */
keySize = strlen(key);
if (keySize >= sizeof(kv->key))
    errAbort("Key too long in addKey(%s %s)", key, val);
valSize = strlen(val);
if (valSize >= sizeof(kv->val))
    errAbort("Val too long in addKey(%s %s)", key, val);
/* Copy in key and val and add zero tags. */
memcpy(kv->key, key, keySize);
kv->key[keySize] = 0;
memcpy(kv->val, val, valSize);
kv->val[valSize] = 0;
/* Add to hash table. */
hashIx = keyHashFunc(key);
slAddHead(kh->hash+hashIx, kv);
kv->listNext = kh->list;
kh->list = kv;
}

char *getVal(struct keyHash *kh, char *key)
/* Returns val associated with key, or NULL if it doesn't exist. */
{
struct keyVal *kv;
int hashIx = keyHashFunc(key);
for (kv = kh->hash[hashIx]; kv != NULL; kv = kv->next)
    {
    if (sameString(kv->key, key))
        return kv->val;
    }
return NULL;
}


struct keyHash *readKeys(FILE *f)
/* Read in keys from current file position to blank line. */
{
struct keyHash *kh;
char line[2*1024];
int lineCount;
char *key;
int keyCount = 0;
int keySize;
char *val;

if (fgets(line, sizeof(line), f) == NULL)
    return NULL;
kh = newKeyHash();
for (;;)
    {
    ++lineCount;
    key = line;
    val = strchr(line, ' ');
    if (val == NULL)    /* Blank line. */
        break;
    keySize = val-key;
    *val++ = 0;   /* Terminate key string. */
    eraseTrailingSpaces(val);
    ++keyCount;
    addKey(kh, key, val);
    if (fgets(line, sizeof(line), f) == NULL)
        break;
    }
if (keyCount > 0)
    return kh;
else
    {
    freeKeyHash(&kh);
    return NULL;
    }
}

struct estInfo
/* Relevant info about one EST. */
    {
    struct estInfo *next;
    struct keyHash *keys;
    char *clone;
    char *author;
    char *lib;
    char *acc;
    char *dir;
    };

struct estInfo *freeEstInfoList = NULL;

void freeEstInfo(struct estInfo **pEstInfo)
/* Free up estInfo for reuse. */
{
struct estInfo *ei;
if ((ei = *pEstInfo) != NULL)
    {
    freeKeyHash(&ei->keys);
    zeroBytes(ei, sizeof(*ei));
    slAddHead(&freeEstInfoList, ei);
    *pEstInfo = NULL;
    }
}

void freeEstList(struct estInfo **pEstList)
/* Free up list of estInfo's for reuse. */
{
struct estInfo *ei, *nextEi;

for (ei = *pEstList; ei != NULL; ei = nextEi)
    {
    nextEi = ei->next;
    freeEstInfo(&ei);
    }
*pEstList = NULL;
}

boolean sameStringWithNull(char *a, char *b)
/* Returns TRUE if both strings are NULL or both are the same. */
{
if (a == b)
    return TRUE;
if (a == NULL || b == NULL)
    return FALSE;
return sameString(a,b);
}

struct estInfo *findCompatibleClone(struct estInfo *list, struct estInfo *ei)
/* Find a clone that's compatible.  (Assumes that all on list are from same
 * clone, this checks the other fields...). */
{
struct estInfo *el;
char *dir, *opDir;
char *clone = ei->clone;

if ((dir = ei->dir) == NULL)
    return NULL;
if (dir[0] == '3')
    opDir = "5'";
else if (dir[0] == '5')
    opDir = "3'";
else
    errAbort("Unknown dir %s in %s", dir, ei->acc);
for (el = list; el != NULL; el = el->next)
    {
    if ( sameStringWithNull(el->author, ei->author)
      && sameStringWithNull(el->lib, ei->lib)
      && sameStringWithNull(el->dir, opDir) )
        return el;
    }
return NULL;
}

struct estInfo *getNextInfo(struct cloIx *cx, FILE *ra)
/* Read in next info. */
{
unsigned offset;
char cloName[maxCloNameSize];
struct keyHash *kh;
struct estInfo *ei;

if (!nextIx(cx, &offset, cloName))
    return FALSE;
fseek(ra, offset, SEEK_SET);
if ((ei = freeEstInfoList) != NULL)
    {
    freeEstInfoList = ei->next;
    ei->next = NULL;
    }
else
    {
    AllocVar(ei);
    }
if ((kh = ei->keys = readKeys(ra)) == NULL)
    errAbort("Couldn't read keys for %s at %u", cloName, offset);
if ((ei->acc = getVal(kh, "acc")) == NULL)
    errAbort("No accession for %s at %u", cloName, offset);
if ((ei->clone = getVal(kh, "clo")) == NULL)
    errAbort("No clone for %s", ei->acc);
if (!sameString(ei->clone, cloName))
    errAbort("Clone names out of phase %s %s", ei->clone, cloName);
ei->author = getVal(kh, "aut");
ei->lib = getVal(kh, "lib");
ei->dir = getVal(kh, "dir");
return ei;
}

int main(int argc, char *argv[])
{
char *raName, *ixName, *outName;
FILE *ra, *out;
struct cloIx *cx;
struct estInfo *cloneList = NULL, *ei, *mate;
int pairCount = 0;
int estCount;
int cloneCount = 1;

if (argc != 4)
    usage();
raName = argv[1];
ixName = argv[2];
outName = argv[3];

ra = mustOpen(raName, "rb");
cx = openIx(ixName);
out = mustOpen(outName, "w");

while ((ei = getNextInfo(cx, ra)) != NULL)
    {
    ++estCount;
    if ((estCount&0x3fff) == 0)
        {
        putchar('.');
        fflush(stdout);
        }
    if (cloneList != NULL && !sameString(cloneList->clone, ei->clone))
        {
        freeEstList(&cloneList);
        ++cloneCount;
        }
    if ((mate = findCompatibleClone(cloneList, ei)) != NULL)
        {
        ++pairCount;
        if (ei->dir[0] == '5')
            fprintf(out, "%s %s\n", ei->acc, mate->acc);
        else
            fprintf(out, "%s %s\n", mate->acc, ei->acc);
        }
    slAddHead(&cloneList, ei);
    }
printf("Found %d pairs in %d clones in %d ests\n", pairCount, cloneCount, estCount);
return 0;
}

