/* This program finds an alignment between a short and a long
 * .fa file and and prints the result as a .html file. */

#include "common.h"         /* String, list, file utilities. */
#include "dnautil.h"       /* Handy utilities on DNA. */
#include "dnaseq.h"         /* DNA sequence - byte per nucleotide. */
#include "fa.h"             /* Read/write fasta files. */
#include "fuzzyFind.h"      /* Local aligner for similar sequences. */
#include "patSpace.h"       /* Fast homology finder for similar sequences. */

void usage()
/* Print usage instructions and exit. */
{
    errAbort("forChuck - Aligns a short (8 kb or less) query sequence to a\n"
             "long target sequence and prints some stats about the alignment.\n"
             "Usage:\n"
             "    forChuck query.fa target.fa\n");
}

int main(int argc, char *argv[])
{
char *queryName, *targetName;
struct dnaSeq *query, *target;
struct patSpace *ps;
struct patClump *clumpList, *clump;

/* Check command line arguments and assign to local variables. */
if (argc != 3)
    usage();
queryName = argv[1];
targetName = argv[2];

/* Read in DNA from fasta files and check query not too big.
 * The 8000 query limit is soft.  Performance gradually degrades 
 * on longer sequences.  It's starts hurting around 8000. */
query = faReadDna(queryName);
if (query->size > 8000)
    errAbort("Query sequence is %d bases, can only handle 8000", query->size);
target = faReadDna(targetName);

/* Make a pattern space query structure and use it once.
 * (This is really optimized for multiple queries to the
 * same target). */
ps = makePatSpace(&target, 1, NULL, 4, 500);
clumpList = patSpaceFindOne(ps, query->dna, query->size);
freePatSpace(&ps);

/* For each homology clump patSpace finds, do a fuzzyFinder
 * alignment of it and print the results. */
printf("Found %d homologous regions\n", slCount(clumpList));
for (clump = clumpList; clump != NULL; clump = clump->next)
    {
    struct ffAli *ali, *a;
    boolean isRc;
    int score;
    struct dnaSeq *t = clump->seq; /* There could be multiple seq's in patSpace. */

    if (ffFindAndScore(query->dna, query->size,
                       t->dna + clump->start, clump->size,
                       ffCdna, &ali, &isRc, &score))
        {
        printf("Found alignment score %d\n", score);
        printf("Blocks are:\n");
        for (a = ali; a != NULL; a = a->right)
            {
            printf("Q %4d - %4d\t T %4d -%4d\n", 
                a->nStart - query->dna, a->nEnd - query->dna,
                a->hStart - t->dna, a->hEnd - t->dna);
            }
        printf("\n");
        ffFreeAli(&ali);
        }
    else
        {
        printf("Couldn't align clump at %s %d-%d\n",
            t->name, clump->start, clump->start + clump->size);
        }
    }
slFreeList(&clumpList);
return 0;
}
