/* 
checkAgpAndFa - take a .agp file and a .fa file and validate
that they are in synch.
*/

#include "common.h"
#include "options.h"
#include "linefile.h"
#include "hash.h"
#include "fa.h"
#include "twoBit.h"
#include "agpFrag.h"
#include "agpGap.h"


/* Command line option variables */
char *exclude=NULL;

/* Command line option specifications */
static struct optionSpec optionSpecs[] = {
    {"exclude", OPTION_STRING},
    {NULL, 0}
};

/* Make the arguments global for convenience: */
char *agpFile = NULL;
char *faFile = NULL;

void usage()
/* 
Explain usage and exit. 
*/
{
    fflush(stdout);
    errAbort("checkAgpAndFa - takes a .agp file and .fa file and ensures that they are in synch\n"
      "usage:\n\n"
      "   checkAgpAndFa in.agp in.fa\n\n"
      "options:\n"
      "   -exclude=seq - Ignore seq (e.g. chrM for which we usually get\n"
      "                  sequence from GenBank but don't have AGP)\n"
      "in.fa can be a .2bit file.  If it is .fa then sequences must appear\n"
      "in the same order in .agp and .fa.\n"
      "\n");
}

boolean isExcluded(char *seqName)
/* Return true if seqName is -exclude'd. */
{
return exclude != NULL && sameString(seqName, exclude);
}

void getNextSeqFromFa(char *agpName, DNA **retDna, int *retSize, char **retName)
/* Get the next sequence in the FASTA file, if any.  If -exclude is given,
 * skip over that sequence.  Complain if sequence name is not what we expect 
 * from AGP, or if it appears that one input has ended before the other. */
{
static struct lineFile *lfFa = NULL;
boolean gotSeq = FALSE;

if (lfFa == NULL)
    lfFa = lineFileOpen(faFile, TRUE);

gotSeq = faSpeedReadNext(lfFa, retDna, retSize, retName);
if (gotSeq && isExcluded(*retName))
    gotSeq = faSpeedReadNext(lfFa, retDna, retSize, retName);
if (!gotSeq && agpName != NULL)
    {
    fflush(stdout);
    errAbort("Unexpected end of FASTA input %s when looking "
	     "for AGP sequence %s", faFile, agpName);
    }
if (gotSeq && agpName == NULL)
    {
    fflush(stdout);
    errAbort("FASTA file %s has sequence %s that is not present in "
	     "AGP file %s", faFile, *retName, agpFile);
    }
if (gotSeq && !sameString(agpName, *retName))
    {
    fflush(stdout);
    errAbort("Expecting to find sequence %s at line %d of %s, but got %s.  "
	     "When input is FASTA, sequences must appear in exactly the same "
	     "order as in AGP.",
	     agpName, lfFa->lineIx, faFile, *retName);
    }
}

void getNextSeqFromTwoBit(char *agpName, DNA **retDna, int *retSize,
			  char **retName)
/* Fetch agpName from twoBit.  Complain if it's not there.  If agpName is NULL
 * then AGP is done -- complain if any sequences not -exclude'd are in twoBit
 * but not in AGP. */
{
static struct twoBitFile *tbf = NULL;
static struct hash *seqNameHash = NULL;

if (tbf == NULL)
    {
    tbf = twoBitOpen(faFile);
    seqNameHash = hashNew(20);
    }

if (agpName != NULL)
    {
    struct dnaSeq *ds = twoBitReadSeqFragLower(tbf, agpName, 0, 0);
    /* That will errAbort if agpName is not in the 2bit. */ 
    *retDna = ds->dna;
    *retSize = ds->size;
    *retName = ds->name;
    hashAddUnique(seqNameHash, ds->name, NULL);
    /* Free the struct -- members will be freed below. */
    freeMem(ds);
    }
else
    {
    /* End of AGP has been reached.  See if any sequences (not -exclude'd) 
     * have been skipped over. */
    struct slName *seqNameList = twoBitSeqNames(faFile);
    struct slName *sn = NULL;
    for (sn = seqNameList;  sn != NULL;  sn = sn->next)
	{
	if (hashLookup(seqNameHash, sn->name) == NULL && !isExcluded(sn->name))
	    {
	    fflush(stdout);
	    errAbort("twoBit file %s has sequence %s that is not present in "
		     "AGP file %s", faFile, sn->name, agpFile);
	    }
	}
    slNameFreeList(&seqNameList);
    }
}

void getNextSeq(char *agpName, DNA **retDna, int *retSize, char **retName)
/* Get next sequence named in AGP, or if agpName is NULL (end of AGP), 
 * make sure there are no extra sequences missing from AGP.  
 * Complain if there's an inconsistency. */
{
if (twoBitIsFile(faFile))
    getNextSeqFromTwoBit(agpName, retDna, retSize, retName);
else
    getNextSeqFromFa(agpName, retDna, retSize, retName);
}

boolean containsOnlyChar(char *string, int offset, int strLength, char searchChar)
/* 
Ensure that the searched string only contains a certain set char, like 'n', for example.

param string - the string to examine
param offset - the starting offset at which to begin examination
param strLength - the number of chars to examine up from the offset
param searchChar - the char to search for in the string

returns true if this string contains only the searchChar
returns false if any other char is in this string 
*/
{
int charIndex = 0;
int stringEnd = offset + strLength;

for (charIndex = offset; charIndex < stringEnd; charIndex++)
    {
    if (searchChar != string[charIndex])
	{
        printf("Bad char %c found at index %d\n", string[charIndex], charIndex);
	return FALSE;
	}
    }
printf("Contains %d Ns\n", strLength);
return TRUE;
}

boolean containsOnlyChars(char *string, int offset, int strLength, char *searchCharList, int numSearchChars)
/* 
Ensure that the searched string only contains a certain set of chars, like 'acgtACGT',
for example.

param string - the string to examine
param offset - the starting offset at which to begin examination
param strLength - the number of chars to examine up from the offset
param searchCharList - the set of chars to search for in the string
param numSearchChars - the size of the searchCharList containing the searched-for chars

returns true if this string contains only the searchChars
returns false if any other char is in this string 
*/
{
int charIndex = 0;
int currentChar = 0;
int numMismatches = 0;
int stringEnd =  offset + strLength;

for(charIndex = offset; charIndex < stringEnd; charIndex++)
    {
    numMismatches = 0;
    for (currentChar = 0; currentChar < numSearchChars; currentChar++)
        {
        if (searchCharList[currentChar] != string[charIndex])
 	    {
	    ++numMismatches;
	    }
        /* If there was not even a single match then return FALSE */
	if(numMismatches == numSearchChars)
	    {
	    printf("Bad char %c found at index %d\n", string[charIndex], charIndex);
	    return FALSE;
	    }
	}
    }

return TRUE;
}

boolean agpMatchesFaEntry(struct agpFrag *agp, int offset, char *dna, int seqSize, char *seqName)
/* Return TRUE if the agp and fasta entries agree, FALSE otherwise. */
{
int fragSize = (agp->chromEnd - agp->chromStart);
boolean result = FALSE;

if (agp->chromEnd > seqSize)
    {
    printf("agp chromEnd (%d) > seqSize (%d)\n", agp->chromEnd, seqSize);
    return FALSE;
    }

if (sameString(agp->chrom, seqName))
    {
    if (agp->type[0] == 'N' || agp->type[0] == 'U')
        {
        printf("FASTA gap entry\n");
        result =  containsOnlyChar(dna, offset, fragSize, 'n');
        }
    else 
	{
        printf("FASTA sequence entry\n");
	/* Sometimes there are ambiguous characters so can't do this: */
	/* containsOnlyChars(dna, offset, fragSize, "acgt", 4); */
	result = TRUE;
	}
    }

return result;
}

struct agpFrag *agpFragOrGapLoadCheck(struct lineFile *lfAgp, char *words[],
				      int wordCount)
/* Read in and check the next line of AGP.  It may be a gap or a fragment 
 * line.  If it is a gap, cast it to a fragment and return (only coords will
 * be used by caller). */
{
struct agpFrag *agpFrag = NULL;
if (words[4][0] != 'N' && words[4][0] != 'U')
    {
    lineFileExpectWords(lfAgp, 9, wordCount);
    agpFrag = agpFragLoad(words);
    /* file is 1-based but agpFragLoad() now assumes 0-based: */
    agpFrag->chromStart -= 1;
    agpFrag->fragStart  -= 1;
    agpFragOutput(agpFrag, stdout, ' ', '\n');
    if (agpFrag->chromEnd - agpFrag->chromStart != 
	agpFrag->fragEnd - agpFrag->fragStart)
	{
	fflush(stdout);
	errAbort("1Sizes don't match in %s and %s line %d of %s\n",
		 agpFrag->chrom, agpFrag->frag, lfAgp->lineIx,
		 lfAgp->fileName);
	}
    if (agpFrag->chromEnd - agpFrag->chromStart <= 0)
	{
	fflush(stdout);
	errAbort("Size is %d in %s and %s line %d of %s\n",
		 agpFrag->chromEnd - agpFrag->chromStart,
		 agpFrag->chrom, agpFrag->frag, lfAgp->lineIx,
		 lfAgp->fileName);
	}
    }
else
    {
    struct agpGap *agpGap = agpGapLoad(words);
    int gapSize = -1;
    agpGap->chromStart--;
    gapSize = (agpGap->chromEnd - agpGap->chromStart);

    if (gapSize != agpGap->size)
	{
	fflush(stdout);
	errAbort("2Sizes don't match in %s, calculated size %d, size %d, "
		 "line %d of %s\n",
		 agpGap->chrom, gapSize, agpGap->size,
		 lfAgp->lineIx, lfAgp->fileName);
	}   
    agpFrag = (struct agpFrag*) agpGap;
    }
return agpFrag;
}

void checkAgpAndFa()
/* Read the .agp file and make sure that it agrees with the .fa file. */
{
struct lineFile *lfAgp = lineFileOpen(agpFile, TRUE);
char *line = NULL;
char *words[9];
int wordCount = 0;
char *agpName = NULL, *prevAgpSeqName = NULL;
struct agpFrag *agpFrag = NULL;
int dnaOffset = -1;
int seqSize = 0;
char *seqName = NULL;
DNA *dna = NULL;

while (lineFileNextReal(lfAgp, &line))
    {
    wordCount = chopLine(line, words);
    if (wordCount < 5)
	{
	fflush(stdout);
	errAbort("Bad line %d of %s\n", lfAgp->lineIx, lfAgp->fileName);
	}
    agpName = words[0];
    if (prevAgpSeqName == NULL || ! sameString(prevAgpSeqName, agpName))
	{
	if (prevAgpSeqName != NULL && dnaOffset != seqSize)
	    {
	    fflush(stdout);
	    errAbort("AGP for %s ends at %d but sequence size is %d",
		     prevAgpSeqName, dnaOffset, seqSize);
	    }
	getNextSeq(agpName, &dna, &seqSize, &seqName);
	dnaOffset = 0;
	printf("\n\nAnalyzing data for sequence: %s, size: %d, "
	       "dnaOffset = %d\n\n",
	       seqName, seqSize, dnaOffset);
	freez(&prevAgpSeqName);
	prevAgpSeqName = cloneString(agpName);
	}

    printf("\nLoop: %s, dnaOffset=%d, seqSize=%d\n",
	   agpName, dnaOffset, seqSize);
    agpFrag = agpFragOrGapLoadCheck(lfAgp, words, wordCount);
    if (dnaOffset != 0 && agpFrag->chromStart != dnaOffset)
	{
	fflush(stdout);
	errAbort("%s start %d doesn't match previous end %d, line %d of %s",
		 agpFrag->chrom, agpFrag->chromStart, dnaOffset, lfAgp->lineIx,
		 lfAgp->fileName);
	}
   
    /* If the agp entry is ignoring gaps at the start of the sequence 
       then we need to fake an agp entry and check the fake entry first 
       against the fasta entry */
    if (0 == dnaOffset && 0 != agpFrag->chromStart)
	{
	int origStart = agpFrag->chromStart;
	int origEnd = agpFrag->chromEnd;
	char origType[2];
	agpFrag->chromStart = 0;
	agpFrag->chromEnd = origStart;	
	strcpy(origType, agpFrag->type);
	strcpy(agpFrag->type, "N");
 
	if (!agpMatchesFaEntry(agpFrag, dnaOffset, dna, seqSize, seqName))
	    {
	    fflush(stdout);
	    errAbort("Invalid Fasta file entry\n");
	    }

	agpFrag->chromStart = origStart;
	agpFrag->chromEnd = origEnd;
	strcpy(agpFrag->type, origType);
	}

    dnaOffset = agpFrag->chromStart;                

    printf("agpFrag->chromStart: %d, agpFrag->chromEnd: %d, dnaOffset: %d\n",
	   agpFrag->chromStart, agpFrag->chromEnd, dnaOffset);
    if (!agpMatchesFaEntry(agpFrag, dnaOffset, dna, seqSize, seqName))
	{
	printf("Invalid Agp or Fasta file entry for sequence %s\n",
	       agpFrag->chrom);
	fflush(stdout);
	errAbort("agpMatchesFaEntry failed; exiting\n");
	}
    else 
	{
	printf("Valid Fasta file entry\n");
	}

    dnaOffset = agpFrag->chromEnd;
    }  

if (prevAgpSeqName == NULL)
    {
    fflush(stdout);
    errAbort("Empty AGP input");
    }
if (dnaOffset != seqSize)
    {
    fflush(stdout);
    errAbort("AGP for %s ends at %d but sequence size is %d",
	     prevAgpSeqName, dnaOffset, seqSize);
    }

/* Check for extra FASTA sequences not described by AGP: */
getNextSeq(NULL, &dna, &seqSize, &seqName);

/* Magic phrase to look for at the end of stdout: */
printf("All AGP and FASTA entries agree - both files are valid\n");
}

int main(int argc, char *argv[])
/* 
Process command line then delegate  main work to checkAgpAndFa().
*/
{
optionInit(&argc, argv, optionSpecs);
if (argc != 3)
    usage();
exclude = optionVal("exclude", exclude);
agpFile = argv[1];
faFile = argv[2];
checkAgpAndFa();
return 0;
}
