/* agpTweak - Tweak an .agp file. 
 *  Galt Barber 2005-08-20
 *  This is handy for reducing the size of an agp element,
 *  i.e. chrUn.  You can squeeze some wasted space by changing
 *  the size of the clone gap padding, or simply split the chrom
 *  creating chrUn, chrUn2, etc.  You can also split chrUn into 
 *  scaffolds.
*/

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "cheapcgi.h"
#include "fa.h"
#include "agpFrag.h"
#include "agpGap.h"
#include "options.h"


static void usage()
/* Explain usage and exit.  Does not return normally: the text is handed
 * to errAbort(). */
{
errAbort(
  "agpTweak - Create a modified .agp file\n"
  "usage:\n"
  "   agpTweak in.agp out.agp\n"
  "\n"
  "options:\n"
  "   -splitIntoScaffolds - will break obj into scaffolds at 'clone no' boundaries\n"
  "   -splitSize=N - will keep the agp object at about size N in agp file,\n"
  "                split names will be obj, obj2, obj3, etc.\n"
  "                and the split point is chosen to start a next object on a non-gap;\n"
  "                note that you can use splitIntoScaffolds or splitSize but not both\n"
  "   -cloneGapSize=N - force clone gaps to size N in agp file\n"
  "   -agpSeq=xxxx limits changes to where agp object=xxxx matches a sequence name in in.agp\n"
  "   -verbose=N - N=2 display extra information\n"
  );
}

/* Option values.  main() fills these in from the command line before
 * calling agpTweak(), which reads them. */

/* -agpSeq: when non-NULL, only lines whose object name matches this are
 * modified; all other lines are still validated and copied through. */
char *agpSeq = NULL;
/* -cloneGapSize: when > 0, gap lines of type "clone" are resized to this
 * length and later coordinates in the object are shifted accordingly. */
int cloneGapSize = 0;
/* -splitSize: when > 0, a new output object is started at any non-gap
 * line whose (adjusted) end exceeds this value. */
int splitSize= 0;
/* -splitIntoScaffolds: when TRUE, a new output object is started at each
 * gap of type "clone" and that gap line itself is not written. */
boolean splitIntoScaffolds = FALSE;

/* command line option specifications, passed to optionInit() in main().
 * The final {NULL, 0} entry is presumably the end-of-table sentinel
 * expected by optionInit() -- confirm against options.h. */
static struct optionSpec optionSpecs[] = {
    {"splitIntoScaffolds", OPTION_BOOLEAN},
    {"splitSize", OPTION_INT},
    {"cloneGapSize", OPTION_INT},
    {"agpSeq", OPTION_STRING},
    {NULL, 0}
};


static void agpTweak(char *agpFile, char *agpOut)
/* agpToFa - Convert a .agp file to a .fa file. */
{
struct lineFile *lf = lineFileOpen(agpFile, TRUE);
char *line, *words[16];
int lineSize, wordCount;
unsigned lastPos = 0;
struct agpFrag *agp;
struct agpGap *gap;
FILE *f;
int accumDif = 0;
char *lastObj = NULL;
f = mustOpen(agpOut, "w");
int suffixCount = 1;
int lineNo = 1;
char buf[256];
char *newChrom = NULL;
boolean suppressGap = FALSE;

verbose(2,"#\tprocessing AGP file: %s\n", agpFile);
while (lineFileNext(lf, &line, &lineSize))
    {
    if (line[0] == 0 || line[0] == '#' || line[0] == '\n')
        continue;
    //verbose(2,"#\tline: %d\n", lf->lineIx);
    wordCount = chopLine(line, words);
    if (wordCount < 5)
        errAbort("Bad line %d of %s: need at least 5 words, got %d\n",
		 lf->lineIx, lf->fileName, wordCount);

    suppressGap = FALSE;
    if (!lastObj || !sameString(words[0],lastObj))
	{
	freez(&newChrom);
	if (splitIntoScaffolds)
	    newChrom = cloneString("scaffold1");
	else
	    newChrom = cloneString(words[0]);
	}
	
    if (lastObj && !sameString(words[0],lastObj))
	{
	accumDif = 0;
	lastPos = 0;
	suffixCount = 1;
	lineNo = 1;
	}
		 
    if (words[4][0] != 'N')
	{
	lineFileExpectAtLeast(lf, 9, wordCount);
	agp = agpFragLoad(words);
	/* agp is 1-based but agp loaders do not adjust for 0-based: */
    	agp->chromStart -= 1;
	agp->fragStart  -= 1;
	if (agp->chromEnd - agp->chromStart != agp->fragEnd - agp->fragStart)
	    errAbort("Sizes don't match in %s and %s line %d of %s\n",
		agp->chrom, agp->frag, lf->lineIx, lf->fileName);
	}
    else
        {
	lineFileExpectAtLeast(lf, 8, wordCount);
	gap = agpGapLoad(words);
	/* to be consistent with agpFrag */
	gap->chromStart -= 1;
	agp = (struct agp*)gap;
	}

    /* adjust for accumulated difference if any */
    agp->chromStart += accumDif;
    agp->chromEnd += accumDif;
    
    if (agp->chromStart != lastPos)
	errAbort("Start doesn't match previous end line %d of %s\n"
	    "agp->chromStart: %u\n" 
	    "agp->chromEnd: %u\n" 
	    "lastPos: %u\n" 
	    "accumDif: %d\n"
	    ,lf->lineIx, lf->fileName
	    ,agp->chromStart
	    ,agp->chromEnd
	    ,lastPos
	    ,accumDif
	    );

    if ((agpSeq == NULL) || sameWord(agp->chrom, agpSeq))
	{
	if ((splitIntoScaffolds && words[4][0] == 'N' && sameWord(gap->type, "clone")) 
	 || (splitSize > 0 && words[4][0] != 'N' && agp->chromEnd > splitSize))
	    {
	    char *base;
	    if (splitIntoScaffolds)
		base = "scaffold";
	    else
		base = words[0];
	    freez(&newChrom);
	    safef(buf,sizeof(buf),"%s%d",base,++suffixCount);
    	    newChrom = cloneString(buf);
	    accumDif -= agp->chromStart;
	    agp->chromEnd = agp->chromEnd - agp->chromStart;
	    agp->chromStart = 0;
	    if (splitIntoScaffolds)
		{
    		suppressGap = TRUE;
		/* suppress the gap itself */
		accumDif -= agp->chromEnd - agp->chromStart;
		agp->chromEnd = 0;
		lineNo = 0;
		}
	    }
	if (splitSize > 0 || splitIntoScaffolds)
	    {
	    char *temp = agp->chrom;
	    agp->chrom = cloneString(newChrom);
	    freez(&temp);
	    agp->ix = lineNo++;
	    }
	if (cloneGapSize > 0 && words[4][0] == 'N' && sameWord(gap->type, "clone"))
	    {
	    int dif = cloneGapSize - gap->size;
	    gap->size = cloneGapSize;
	    agp->chromEnd += dif;
	    accumDif += dif;
	    }
	}

    lastPos = agp->chromEnd;
    freez(&lastObj);
    lastObj = cloneString(words[0]); /* not agp->chrom which may be modified already */
	
    if (words[4][0] != 'N')
	{
	/* agpFragOutput assumes 0-based-half-open, but writes 1-based for agp */
	agpFragOutput(agp, f, '\t', '\n');
	agpFragFree(&agp);
	}
    else
        {
	/* restore back to 1-based for agp 
	 * because agpGapOutput doesn't compensate */
	gap->chromStart += 1;
	if (!suppressGap)
    	    agpGapOutput(gap, f, '\t', '\n');
	agpGapFree(&gap);
	}
	
    }

carefulClose(&f);
}

int main(int argc, char *argv[])
/* Process command line: parse options into the file-level globals,
 * require exactly two positional arguments (in.agp out.agp), reject the
 * combination of -splitSize and -splitIntoScaffolds, then run agpTweak(). */
{
optionInit(&argc, argv, optionSpecs);

if (argc != 3)
    usage();

/* Each lookup falls back to the global's current (default) value when
 * the option was not given. */
agpSeq = optionVal("agpSeq", agpSeq);
cloneGapSize = optionInt("cloneGapSize", cloneGapSize);
splitSize = optionInt("splitSize", splitSize);
if (optionExists("splitIntoScaffolds"))
    splitIntoScaffolds = TRUE;

/* The two split modes are mutually exclusive. */
if (splitSize > 0 && splitIntoScaffolds)
    usage();

if (verboseLevel() > 1)
    {
    /* Passing NULL for %s is undefined behavior in C, so substitute a
     * literal when -agpSeq was not given. */
    verbose(2,
    "#\t"
    "agpSeq: %s\n"
    "cloneGapSize: %d\n"
    "splitSize: %d\n"
    , (agpSeq != NULL) ? agpSeq : "(null)"
    , cloneGapSize
    , splitSize
    );
    }
agpTweak(argv[1], argv[2]);

return 0;
}
