/* lumpRep - lump together repeating elements of various sizes
 * and strands into (hopefully) fewer repeats. */
#include "common.h"
#include "dnautil.h"
#include "fuzzyFind.h"
#include "portable.h"

struct rawRep
/* Struct read in from repeat.txt. */
    {
    struct rawRep *next;
    int repCount;
    DNA *seq;
    int seqSize;
    };

struct rawRep *readRawReps(char *fileName, int maxCount)
/* Read in raw repeats from file. */
{
struct rawRep *rrList = NULL, *rr;
FILE *f = mustOpen(fileName, "r");
char line[512];
int lineCount=0;
char *words[3];
int wordCount;
char *seq;

while (fgets(line, sizeof(line), f))
    {
    ++lineCount;
    if (--maxCount < 0)
        break;
    wordCount = chopLine(line, words);
    if (wordCount == 0)
        continue;   /* Allow blank lines. */
    if (wordCount != 2 || !isdigit(words[0][0]))
        errAbort("Strange line %d of %s\n", lineCount, fileName);
    AllocVar(rr);
    rr->repCount = atoi(words[0]);
    seq = words[1];
    dnaFilter(seq, seq);
    rr->seqSize = strlen(seq);
    rr->seq = cloneStringZ(seq, rr->seqSize);
    slAddHead(&rrList, rr);
    }
fclose(f);
slReverse(&rrList);
return rrList;
}

struct lumpedRep
/* Struct for lumping together raw repeats. */
    {
    struct lumpedRep *next;
    char *seq;
    int seqSize;
    struct rawRep *repList;
    };

void findSkipEnds(struct ffAli *left, struct ffAli *right, 
    DNA *needle, int needleSize, 
    int *retStartSkip, int *retEndSkip,
    int *retFuseStart, int *retFuseEnd)
/* Figure out how much of either end of needle is skipped before reaching solid alignment. */
{
int minMatch = 6;
int blockSize;


/* Skip over bogus bits on left and store start. */
for ( ; left != NULL; left = left->right)
    {
    blockSize = left->nEnd - left->nStart;
    if (blockSize >= minMatch && ffScoreMatch(left->nStart, left->hStart, blockSize) >= minMatch)
        break;
    }
if (left == NULL)
    *retStartSkip = needleSize;
else
    {
    *retStartSkip = left->nStart - needle;
    *retFuseStart = left->hStart - left->nStart;
    }

/* Skip over bogus bits on right and store end. */
for ( ; right != NULL; right = right->left)
    {
    blockSize = right->nEnd - right->nStart;
    if (blockSize >= minMatch && ffScoreMatch(right->nStart, right->hStart, blockSize) >= minMatch)
        break;
    }
if (right == NULL)
    *retEndSkip = needleSize;
else
    {
    *retEndSkip = needle + needleSize - right->nEnd;
    *retFuseEnd = right->hEnd - right->nEnd;
    }
}

const int endMismatchTolerance = 3;

boolean closestLump(struct lumpedRep *lumpedList, DNA *seq, int seqLen,
    struct lumpedRep **retClosest, int *retSkipStart, int *retSkipEnd,
    int *retFuseStart, int *retFuseEnd, boolean *retIsRc, 
    boolean *retTotalIn, boolean *retTotalOut)
/* Return closest lump in list and information needed to fuse
 * sequence to it. */
{
struct lumpedRep *lump;
struct lumpedRep *bestLump = NULL;
int bestSkipStart, bestSkipEnd;
int bestFuseStart, bestFuseEnd;
int bestScore = 0;   /* If best isn't pretty good we're not interested. */
boolean bestTotalIn, bestTotalOut;
boolean bestIsRc;
int score;
int skipStart, skipEnd;
int fuseStart, fuseEnd;
boolean totalIn, totalOut;

for (lump = lumpedList; lump != NULL; lump = lump->next)
    {
    boolean rc;
    struct ffAli *ffAli;
    int lumpSize = lump->seqSize;

    if (ffFindEitherStrandN(seq, seqLen, lump->seq, lump->seqSize, ffTight, &ffAli, &rc) )
        {
        if (rc)
            reverseComplement(seq, seqLen);
        score = ffScoreCdna(ffAli);
        if (score >= bestScore)
            {
            struct ffAli *right, *left = ffAli;
            int nSize, hSize;

            /* Find right end. */
            for (right = ffAli; right->right != NULL; right = right->right)
                ;
            totalIn = totalOut = FALSE;
            nSize = right->nEnd - left->nStart;
            hSize = right->hEnd - left->hStart;
            if (nSize >= seqLen-endMismatchTolerance)
                totalIn = TRUE;
            if (hSize >= lump->seqSize-endMismatchTolerance && score >= lumpSize)
                totalOut = TRUE;
            findSkipEnds(ffAli, right, seq, seqLen, 
                &skipStart, &skipEnd, &fuseStart, &fuseEnd);
            if (totalIn || totalOut || 
                skipStart <= endMismatchTolerance || skipEnd <= endMismatchTolerance)
                {
                int relScore = ((score * 9 + 4) >> 3);
                if (nSize <= relScore && hSize <= relScore)
                    {
                    bestScore = score;
                    bestLump = lump;
                    bestSkipStart = skipStart;
                    bestSkipEnd = skipEnd;
                    bestFuseStart = fuseStart;
                    bestFuseEnd = fuseEnd;
                    bestIsRc = rc;
                    bestTotalIn = totalIn;
                    bestTotalOut = totalOut;
                    }
                }
            }
        if (rc)
            reverseComplement(seq, seqLen);
        ffFreeAli(&ffAli);
        }
    }
if (bestScore < seqLen/2 && !bestTotalOut)
    bestLump = NULL;
if (bestLump == NULL)
    return FALSE;
else
    {
    *retClosest = bestLump;
    *retSkipStart = bestSkipStart;    
    *retSkipEnd = bestSkipEnd;
    *retFuseStart = bestFuseStart;
    *retFuseEnd = bestFuseEnd;
    *retIsRc = bestIsRc;
    *retTotalIn = bestTotalIn;
    *retTotalOut = bestTotalOut;
    return TRUE;
    }
}

void addToLump(struct lumpedRep *lump, struct rawRep *raw,
    int skipStart, int skipEnd, int fuseStart, int fuseEnd)
/* Add rawRep to lump - fusing at start or end depending on
 * whether skipStart or skipEnd is better. */
{
DNA *n, *h;
int nSize, hSize, newSize;
/* Make sure we have a processing buffer big enough. */
static char *fuseBuf = NULL;
static int fuseAlloced = 0;
int fuseNeeded = lump->seqSize + raw->seqSize + 1;
if (fuseNeeded > fuseAlloced)
    {
    freeMem(fuseBuf);
    fuseAlloced = fuseNeeded*2;
    fuseBuf = needMem(fuseAlloced*sizeof(fuseBuf[0]));
    }

if (skipStart <= skipEnd)
    {
    n = raw->seq + skipStart;
    h = n + fuseStart;
    nSize = raw->seqSize - skipStart;
    hSize = h - lump->seq;
    newSize = nSize + hSize;
    if (newSize > lump->seqSize)
        {
        memcpy(fuseBuf, lump->seq, hSize);
        memcpy(fuseBuf+hSize, n, nSize);
        fuseBuf[newSize] = 0;
        lump->seq = cloneStringZ(fuseBuf, newSize);
        lump->seqSize = newSize;
        }
    }
else
    {
    n = raw->seq + raw->seqSize - skipEnd;
    h = n + fuseEnd;
    nSize = n - raw->seq;
    hSize = lump->seq + lump->seqSize - h;
    assert(nSize >= 0 && hSize >= 0);
    newSize = nSize + hSize;
    if (newSize > lump->seqSize)
        {
        memcpy(fuseBuf, raw->seq, nSize);
        memcpy(fuseBuf+nSize, h, hSize);
        fuseBuf[newSize] = 0;
        lump->seq = cloneStringZ(fuseBuf, newSize);
        lump->seqSize = newSize;
        }    
    }
slAddHead(&lump->repList, raw);
}

struct lumpedRep *lumpRaw(struct rawRep *rawList)
/* Lump together repeats that share sequence. */
{
struct lumpedRep *lumpedList = NULL, *lump;
struct rawRep *raw, *rawNext;
int skipStart, skipEnd, fuseStart, fuseEnd;
boolean isRc, totalIn, totalOut;
int rawCount = slCount(rawList);
int rawIx = 0;
int rawMod = 0;

for (raw = rawList; raw != NULL; raw = rawNext)
    {
    ++rawIx;
    if (++rawMod == 10)
        {
        printf("lumping %d of %d\n", rawIx, rawCount);
        rawMod = 0;
        }
    rawNext = raw->next;
    if (closestLump(lumpedList, raw->seq, raw->seqSize, &lump, 
        &skipStart, &skipEnd, &fuseStart, &fuseEnd, 
        &isRc, &totalIn, &totalOut))
        {
        if (isRc)
            reverseComplement(raw->seq, raw->seqSize);
        if (totalIn)
            {
            slAddHead(&lump->repList, raw);
            }
        else if (totalOut)
            {
            slAddHead(&lump->repList, raw);
            lump->seqSize = raw->seqSize;
            lump->seq = raw->seq;
            }
        else
            {
            addToLump(lump, raw, skipStart, skipEnd, fuseStart, fuseEnd);
            }
        }
    else
        {
        AllocVar(lump);
        lump->seq = raw->seq;
        lump->seqSize = raw->seqSize;
        lump->repList = raw;
        raw->next = NULL;
        slAddHead(&lumpedList, lump);
        }
    }
slReverse(&lumpedList);
return lumpedList;
}

void printLumps(struct lumpedRep *lumpList, FILE *f)
/* Print out lump list to file. */
{
struct lumpedRep *lump;
struct rawRep *raw;

for (lump = lumpList; lump != NULL; lump = lump->next)
    {
    fprintf(f, "%s\n", lump->seq);
    for (raw = lump->repList; raw != NULL; raw = raw->next)
        {
        fprintf(f, "   %d %s\n", raw->repCount, raw->seq);
        }
    }
}

int main(int argc, char *argv[])
{
char *inName = "repeats.in";
char *outName = "repeats.out";
FILE *out;
struct rawRep *rawList;
struct lumpedRep *lumpList;
int rawCount, lumpedCount;
long startTime;

dnaUtilOpen();
rawList = readRawReps(inName, 100000);
rawCount = slCount(rawList);
startTime = clock1000();
lumpList = lumpRaw(rawList);
lumpedCount = slCount(lumpList);
out = mustOpen(outName, "w");
printf("%f seconds to lump %d down to %d\n", (clock1000()-startTime)*0.001, rawCount, lumpedCount);
printLumps(lumpList, out);
return 0;
}
