/* cntGlue - count up gluing and do some stats in general to see
 * how well EST and mrna data is gluing together fragmented BACs. */
#include "common.h"
#include "hash.h"


void usage()
/* Explain how program works and exit. */
{
errAbort(
    "cntGlue - count up gluing data from patSpace runs\n"
    "usage\n"
    "   cntGlue file(s).glu");
}

struct contigTrack 
/* Keep track of a contig. */
    {
    struct contigTrack *next;   /* Next in list. */
    int ix;                     /* Index of contig in bak. */
    char strand;                /* Strand: + or - */
    char dir;                   /* Direction: 5' or 3' */
    int start, end;             /* Start and end of aligning parts of mrna. */
    double cookedScore;         /* Percentage alignment between start and end. */
    };


int cmpContigTrack(const void *va, const void *vb)
/* Compare two contigTracks. */
{
const struct contigTrack *a = *((struct contigTrack **)va);
const struct contigTrack *b = *((struct contigTrack **)vb);
int dif;
dif = b->dir - a->dir;
if (dif == 0)
    dif = a->start - b->start;
return dif;
}

struct contigPair
/* This glues together a pair of contigs. */
    {
    struct contigPair *next;    /* Next in list. */
    int ix1, ix2;
    int useCount;
    };

struct bacTrack
/* Keep track of one BAC. */
    {
    struct bacTrack *next;          /* Next in list. */
    char *name;                     /* Name (allocated in hash table.) */
    struct contigPair *gluePairs;   /* List of gluing pairs. */
    };
struct bacTrack *bacList = NULL;

int cmpBacTrack(const void *va, const void *vb)
/* Compare two bacTracks. */
{
const struct bacTrack *a = *((struct bacTrack **)va);
const struct bacTrack *b = *((struct bacTrack **)vb);
return strcmp(a->name, b->name);
}

struct contigPair *findPair(struct contigPair *pairList, int ix1, int ix2)
/* Find matching pair on list.  Return NULL if not found. */
{
struct contigPair *pair;
for (pair = pairList; pair != NULL; pair = pair->next)
    {
    if (pair->ix1 == ix1 && pair->ix2 == ix2)
        return pair;
    }
return NULL;
}

void addPairs(struct bacTrack *bt, struct contigTrack *ctList)
/* Add in pairs that are contained in ctList. */
{
struct contigTrack *ct;
int lastIx = ctList->ix;
int ix;
int a, b;
struct contigPair *cp;

for (ct = ctList->next; ct != NULL; ct = ct->next)
    {
    ix = ct->ix;
    if (ix < lastIx)
        {
        a = ix;
        b = lastIx;
        }
    else
        {
        a = lastIx;
        b = ix;
        }
    if ((cp = findPair(bt->gluePairs, a, b)) != NULL)
        ++cp->useCount;
    else
        {
        AllocVar(cp);
        cp->ix1 = a;
        cp->ix2 = b;
        cp->useCount = 1;
        slAddHead(&bt->gluePairs, cp);
        }
    lastIx = ix;
    }
}

void printStats()
/* Print out some statistics about gluing. */
{
int bacCount = 0;       /* Count how many different bacs have gluing. */
int pairCount = 0;      /* Count # of gluing pairs. */
int uniqPairCount = 0;  /* Count # of unique gluing pairs. */

struct bacTrack *bt;
struct contigPair *cp;

for (bt = bacList; bt != NULL; bt = bt->next)
    {
    ++bacCount;
    for (cp = bt->gluePairs; cp != NULL; cp = cp->next)
        {
        ++uniqPairCount;
        pairCount += cp->useCount;
        }
    }
printf("%d glued bacs.  %d gluing pairs.  %d uniq pairs\n",
    bacCount, pairCount, uniqPairCount);
}


int main(int argc, char *argv[])
{
struct hash *bacHash;
char line[1024];
int lineCount;
char *words[256];
int wordCount;
int fileIx;
char *fileName;
FILE *f;

if (argc < 2)
    usage();
bacHash = newHash(16);

for (fileIx = 1; fileIx < argc; ++fileIx)
    {
    fileName = argv[fileIx];
    uglyf("Processing %s\n", fileName);
    f = mustOpen(fileName, "r");
    lineCount = 0;
    while (fgets(line, sizeof(line), f))
        {
        ++lineCount;
        wordCount = chopLine(line, words);
        if (wordCount == ArraySize(words))
            errAbort("Too many words line %d of %s\n", lineCount, fileName);
        if (wordCount != 0)
            {
            char *bacName;
            int cIx;
            struct contigTrack *ctList = NULL, *ct;
            struct bacTrack *bt;
            struct hashEl *hel;

            /* Check line syntax and parse it. */
            if (!sameString(words[1], "glues"))
                errAbort("Bad format line %d of %s\n", lineCount, fileName);
            bacName = words[2];
            for (cIx = 4; cIx < wordCount; cIx += 5)
                {
                char *parts[3];
                int partCount;

                AllocVar(ct);
                ct->ix = atoi(words[cIx]);
                ct->strand = words[cIx+1][0];
                ct->dir = words[cIx+2][0];
                partCount = chopString(words[cIx+3], "(-)", parts, ArraySize(parts));
                if (partCount != 2)
                    errAbort("Bad format line %d of %s\n", lineCount, fileName);
                ct->start = atoi(parts[0]);
                ct->end = atoi(parts[1]);
                ct->cookedScore = atof(words[cIx+4]);
                slAddHead(&ctList, ct);                
                }
            slSort(&ctList, cmpContigTrack);
        
            /* Lookup bacTrack and make it if new. */
            hel = hashLookup(bacHash, bacName);
            if (hel == NULL)
                {
                AllocVar(bt);
                hel = hashAdd(bacHash, bacName, bt);
                bt->name = hel->name;
                slAddHead(&bacList, bt);
                }
            else
                {
                bt = hel->val;
                }
            
            /* Process pairs into bacTrack. */
            addPairs(bt, ctList);
            slFreeList(&ctList);
            }
        }
    fclose(f);
    }
slSort(&bacList, cmpBacTrack);

printStats();
return 0;
}