/* splatCheck1 - Check that all the test set really is being covered.. */
/* This file is copyright 2008 Jim Kent, but license is hereby
 * granted for all use - public, private or commercial. */

#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "sqlNum.h"
#include "fa.h"


void usage()
/* Explain usage and exit. */
{
errAbort(
  "splatCheck1 - Check that all the test set really is being covered.\n"
  "Works on test sets generated by splatTestSet.\n"
  "usage:\n"
  "   splatCheck1 int.fa in.splat out.miss out.bad\n"
  "where out.miss contains the names of unmapped reads, and out.bad\n"
  "contains the names of reads that don't map to the right place\n"
  "options:\n"
  "   -xxx=XXX\n"
  );
}

static struct optionSpec options[] = {
   {NULL, 0},
};

struct hash *hashFaNames(char *fileName)
/* Return a hash full of the names (but not sequence) of all 
 * records in fa file. */
{
struct hash *hash = hashNew(0);
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line;
while (lineFileNext(lf, &line, NULL))
    {
    line = skipLeadingSpaces(line);
    if (line[0] == '>')
        {
	line += 1;
	char *name = firstWordInLine(line);
	if (name == NULL || name[0] == 0)
	    errAbort("Empty name in fasta file line %d of %s", lf->lineIx, lf->fileName);
	if (hashLookup(hash, name))
	    errAbort("Duplicate name %s in fasta file line %d of %s",
	    	name, lf->lineIx, lf->fileName);
	hashAdd(hash, name, NULL);
	}
    }
lineFileClose(&lf);
return hash;
}

int splatCheck1(char *inFa, char *inSplat, char *outMiss, char *outWrong)
/* splatCheck1 - Check that all the test set really is being covered.. */
{
struct lineFile *lf = lineFileOpen(inSplat, TRUE);
FILE *missF = mustOpen(outMiss, "w");
FILE *badF = mustOpen(outWrong, "w");
char *row[7];
struct hash *allHash = hashFaNames(inFa);
struct hash *mappedHash = hashNew(0);	/* Keep track of reads we've seen here. */
struct hash *goodHash = hashNew(0);	/* Keep track of good reads here. */
while (lineFileRow(lf, row))
    {
    /* Read in line and parse it, track it. */
    int chromStart = sqlUnsigned(row[1]);
    char *strand = row[5];
    char *name = row[6];
    hashStore(mappedHash, name);

    /* Parse out name field to figure out where we expect it to map. */
    char *pt = name;
    char *expectStrand = "+";
    if (startsWith("RC_", pt))
        {
	pt += 3;
	expectStrand = "-";
	}
    char *nameCopy = cloneString(pt);
    char *parts[4];
    int partCount = chopByChar(nameCopy, '_', parts, ArraySize(parts));
    if (partCount != 3)
        errAbort("Can't parse name field line %d of %s", lf->lineIx, lf->fileName);
    int expectStart = sqlUnsigned(parts[1]);
    if (sameString(strand, expectStrand))
        {
	int diff = intAbs(chromStart - expectStart);
	if (diff <= 2)
	    {
	    hashStore(goodHash, name);
	    }
	}
    freeMem(nameCopy);
    }

struct hashEl *hel, *helList = hashElListHash(allHash);
int allCount = allHash->elCount;
int missCount = 0, badCount = 0;
for (hel = helList; hel != NULL; hel = hel->next)
    {
    char *name = hel->name;
    if (!hashLookup(mappedHash, name))
	{
        fprintf(missF, "%s\n", name);
	++missCount;
	}
    else
        {
	if (!hashLookup(goodHash, hel->name))
	    {
	    fprintf(badF, "%s\n", hel->name);
	    ++badCount;
	    }
	}
     
    }
carefulClose(&badF);
carefulClose(&missF);
verbose(1, "Total reads %d\n", allCount);
verbose(1, "Unmapped %d (%5.2f%%)\n", missCount, 100.0*missCount/allCount);
verbose(1, "Mapped wrong %d (%5.2f%%)\n", badCount, 100.0*badCount/allCount);
return -(missCount + badCount);
}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 5)
    usage();
return splatCheck1(argv[1], argv[2], argv[3], argv[4]);
}
