/* hgMaf.c - Stuff to load up mafs from the browser database. */

/* Copyright (C) 2014 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "dystring.h"
#include "options.h"
#include "jksql.h"
#include "dnautil.h"
#include "dnaseq.h"
#include "maf.h"
#include "hdb.h"
#include "scoredRef.h"
#include "hgMaf.h"


int mafCmp(const void *va, const void *vb)
/* Compare to sort based on start of first component. */
{
const struct mafAli *a = *((struct mafAli **)va);
const struct mafAli *b = *((struct mafAli **)vb);
return a->components->start - b->components->start;
}


struct mafAli *mafLoadInRegion2(struct sqlConnection *conn,
    struct sqlConnection *conn2, char *table, char *chrom,
    int start, int end, char *file)
/* Return list of alignments in region. */
{
char **row;
unsigned int extFileId = 0;
struct mafAli *maf, *mafList = NULL;
struct mafFile *mf = NULL;
int rowOffset;

if (file != NULL)
    mf = mafOpen(file);

struct sqlResult *sr = hRangeQuery(conn, table, chrom,
    start, end, NULL, &rowOffset);

while ((row = sqlNextRow(sr)) != NULL)
    {
    struct scoredRef ref;
    scoredRefStaticLoad(row + rowOffset, &ref);
    if ((file != NULL) && (ref.extFile != 0))
	errAbort("expect extFile to be zero if file specified\n");
    if ((file == NULL) && (ref.extFile == 0))
	errAbort("expect extFile to be not zero or file specified\n");

    if (ref.extFile != extFileId)
	{
	char *path = hExtFileNameC(conn2, "extFile", ref.extFile);
	mafFileFree(&mf);
	mf = mafOpen(path);
	extFileId = ref.extFile;
	}
    lineFileSeek(mf->lf, ref.offset, SEEK_SET);
    maf = mafNext(mf);
    if (maf == NULL)
        internalErr();
    slAddHead(&mafList, maf);
    }
sqlFreeResult(&sr);
mafFileFree(&mf);
slReverse(&mafList);
/* hRangeQuery may return items out-of-order when bin is used in the query,
 * so sort here in order to avoid trouble at base-level view: */
slSort(&mafList, mafCmp);
return mafList;
}

struct mafAli *mafLoadInRegion(struct sqlConnection *conn, char *table,
	char *chrom, int start, int end)
{
struct sqlConnection *conn2 = hAllocConn(sqlGetDatabase(conn));
struct mafAli *ret = mafLoadInRegion2(conn, conn2, table, chrom,
    start, end,NULL);
hFreeConn(&conn2);
return ret;
}

struct mafAli *axtLoadAsMafInRegion(struct sqlConnection *conn, char *table,
        char *chrom, int start, int end,
        char *tPrefix, char *qPrefix, int tSize,  struct hash *qSizeHash)
/* Return list of alignments in region from axt external file as a maf. */
{
char **row;
unsigned int extFileId = 0;
struct lineFile *lf = NULL;
struct mafAli *maf, *mafList = NULL;
struct axt *axt;
int rowOffset;
struct sqlResult *sr = hRangeQuery(conn, table, chrom,
    start, end, NULL, &rowOffset);

while ((row = sqlNextRow(sr)) != NULL)
    {
    struct scoredRef ref;
    scoredRefStaticLoad(row + rowOffset, &ref);
    if (ref.extFile != extFileId)
	{
	char *path = hExtFileName(sqlGetDatabase(conn),"extFile", ref.extFile);
	lf = lineFileOpen(path, TRUE);
	extFileId = ref.extFile;
	}
    lineFileSeek(lf, ref.offset, SEEK_SET);
    axt = axtRead(lf);
    if (axt == NULL)
        internalErr();
    maf = mafFromAxt(axt, tSize, tPrefix, hashIntVal(qSizeHash, axt->qName), qPrefix);
    axtFree(&axt);
    slAddHead(&mafList, maf);
    }
sqlFreeResult(&sr);
lineFileClose(&lf);
slReverse(&mafList);
return mafList;
}

static void mafCheckFirstComponentSrc(struct mafAli *mafList, char *src)
/* Check that the first component of each maf has given src. */
{
struct mafAli *maf;
for (maf = mafList; maf != NULL; maf = maf->next)
    {
    if (!sameString(maf->components->src, src))
        errAbort("maf first component isn't %s", src);
    }
}

static void mafCheckFirstComponentStrand(struct mafAli *mafList, char strand)
/* Check that the first component of each maf has given strand. */
{
struct mafAli *maf;
for (maf = mafList; maf != NULL; maf = maf->next)
    {
    if (maf->components->strand != strand)
        errAbort("maf first component isn't %c", strand);
    }
}

struct oneOrg
/* Info on one organism. */
    {
    struct oneOrg *next;
    char *name;		/* Name - allocated in hash */
    int order;		/* Help sort organisms. */
    struct dyString *dy;	/* Associated alignment for this organism. */
    boolean hit;	/* Flag to see if hit this time around. */
    };

static int oneOrgCmp(const void *va, const void *vb)
/* Compare to sort based on order. */
{
const struct oneOrg *a = *((struct oneOrg **)va);
const struct oneOrg *b = *((struct oneOrg **)vb);
return a->order - b->order;
}

static void fillInMissing(struct oneOrg *nativeOrg, struct oneOrg *orgList,
	struct dnaSeq *native, int seqStart, int curPos, int aliStart)
/* Fill in alignment strings in orgList with native sequence
 * for first organism, and dots for rest. */
{
int fillSize = aliStart - curPos;
int offset = curPos - seqStart;
struct oneOrg *org;
if (nativeOrg == NULL)
    return;
dyStringAppendN(nativeOrg->dy, native->dna + offset, fillSize);
for (org = orgList; org != NULL; org = org->next)
    {
    if (org != nativeOrg)
	dyStringAppendMultiC(org->dy, '.', fillSize);
    }
}

static int countAlpha(char *s)
/* Count up number of alphabetic characters in string */
{
int count = 0;
char c;
while ((c = *s++) != 0)
    if (isalpha(c))
        ++count;
return count;
}

struct mafBaseProbs *hgMafProbsHelper(
        int size,
        struct mafAli *maf
        )
/* calculate the probability of each nucleotide in each column of a maf. */
{
struct mafBaseProbs *probs;

AllocArray(probs, size);
int count = 0;
unsigned letterBox[256]; 

int ii;
for(ii=0; ii < maf->textSize; ii++)
    {
    struct mafComp *comp = maf->components;
    char ch = comp->text[ii];

    if (ch == '-')
        continue;

    memset(letterBox, 0, sizeof letterBox);
    for(; comp; comp = comp->next)
        {
        if (comp->text == NULL)
            continue;
        letterBox[tolower(comp->text[ii])]++;
        }

    double total = 0;
    total += letterBox['a'];
    total += letterBox['c'];
    total += letterBox['g'];
    total += letterBox['t'];
    probs[count].aProb = letterBox['a'] / total;
    probs[count].cProb = letterBox['c'] / total;
    probs[count].gProb = letterBox['g'] / total;
    probs[count].tProb = letterBox['t'] / total;
    count++;
    }   

return probs;
}

struct mafBaseProbs *hgBigMafProbs(
	char *database,     /* Database, must already have hSetDb to this */
        struct bbiFile *bbi,
	char *chrom,        /* Chromosome (in database genome) */
	int start, int end, /* start/end in chromosome */
        char strand         /* strand */
        )
/* calculate the probability of each nucleotide in each column of a bigMaf. */
{
struct mafAli *maf = hgBigMafFrag(database, bbi, chrom, start, end, '+', NULL, NULL);

return hgMafProbsHelper(end - start, maf);
}

struct mafBaseProbs *hgMafProbs(
	char *database,     /* Database, must already have hSetDb to this */
	char *track,        /* Name of MAF track */
	char *chrom,        /* Chromosome (in database genome) */
	int start, int end, /* start/end in chromosome */
        char strand         /* strand */
        )
/* calculate the probability of each nucleotide in each column of a maf. */
{
struct mafAli *maf = hgMafFrag(database, track, chrom, start, end, '+', NULL, NULL);

return hgMafProbsHelper(end - start, maf);
}

static struct mafAli *hgMafFragHelper(
	char *database,     /* Database, must already have hSetDb to this */
	//char *track,        /* Name of MAF track */
	char *chrom,        /* Chromosome (in database genome) */
	int start, int end, /* start/end in chromosome */
	char strand,        /* Chromosome strand. */
        struct mafAli *mafList,
	char *outName,      /* Optional name to use in first component */
	struct slName *orderList /* Optional order of organisms. */
	)
/* hgMafFragHelper- Extract maf sequences for a region from database.
 * This creates a somewhat unusual MAF that extends from start
 * to end whether or not there are actually alignments.  Where
 * there are no alignments (or alignments missing a species)
 * a . character fills in.   The score is always zero, and
 * the sources just indicate the species.  You can mafFree this
 * as normal. */
{
int chromSize = hChromSize(database, chrom);
struct dnaSeq *native = hChromSeq(database, chrom, start, end);
struct mafAli *maf;
char masterSrc[128];
struct hash *orgHash = newHash(10);
struct oneOrg *orgList = NULL, *org, *nativeOrg = NULL;
int curPos = start, symCount = 0;
struct slName *name;
int order = 0;

/* Check that the mafs are really copacetic, the particular
 * subtype we think is in the database that this (relatively)
 * simple code can handle. */
safef(masterSrc, sizeof(masterSrc), "%s.%s", database, chrom);
mafCheckFirstComponentSrc(mafList, masterSrc);
mafCheckFirstComponentStrand(mafList, '+');
slSort(&mafList, mafCmp);

/* Prebuild organisms if possible from input orderList. */
for (name = orderList; name != NULL; name = name->next)
    {
    AllocVar(org);
    slAddHead(&orgList, org);
    hashAddSaveName(orgHash, name->name, org, &org->name);
    org->dy = dyStringNew(native->size*1.5);
    org->order = order++;
    if (nativeOrg == NULL)
        nativeOrg = org;
    }
if (orderList == NULL)
    {
    AllocVar(org);
    slAddHead(&orgList, org);
    hashAddSaveName(orgHash, database, org, &org->name);
    org->dy = dyStringNew(native->size*1.5);
    if (nativeOrg == NULL)
        nativeOrg = org;
    }

/* Go through all mafs in window, mostly building up
 * org->dy strings. */
for (maf = mafList; maf != NULL; maf = maf->next)
    {
    struct mafComp *mc, *mcMaster = maf->components;
    struct mafAli *subMaf = NULL;
    order = 0;
    if (curPos < mcMaster->start)
	{
        fillInMissing(nativeOrg, orgList, native, start,
		curPos, mcMaster->start);
	symCount += mcMaster->start - curPos;
	}
    if (curPos < mcMaster->start + mcMaster->size) /* Prevent worst
                                                    * backtracking */
	{
	if (mafNeedSubset(maf, masterSrc, curPos, end))
	    {
	    subMaf = mafSubset(maf, masterSrc, curPos, end);
	    if (subMaf == NULL)
	        continue;
	    }
	else
	    subMaf = maf;
	for (mc = subMaf->components; mc != NULL; mc = mc->next, ++order)
	    {
	    /* Extract name up to dot into 'orgName' */
	    char buf[128], *e, *orgName;

	    if ((mc->size == 0) || (mc->srcSize == 0)) /* skip over components without sequence */
		continue;

	    mc->leftStatus = mc->rightStatus = 0; /* squash annotation */

	    e = strchr(mc->src, '.');
	    /* Look up dyString corresponding to  org */
	    if (e == NULL)
                {
		orgName = mc->src;
                org = hashFindVal(orgHash, orgName);
                }
	    else
		{
		int len = e - mc->src;
		if (len >= sizeof(buf))
		    errAbort("organism/database name %s too long", mc->src);
		memcpy(buf, mc->src, len);
		buf[len] = 0;
		orgName = buf;
                // if the orderList is present, it may have organism names with dots in them,
                // If we can't find the sequence after one dot, we look again after two
                if (orderList != NULL)  
                    {
                    if ((org = hashFindVal(orgHash, orgName)) == NULL)  // couldn't find this org
                        {
                        e = strchr(e + 1, '.');  // look for another dot following the first dot
                        if (e != NULL)
                            {
                            // if we found a dot, try the longer name 
                            len = e - mc->src;
                            if (len >= sizeof(buf))
                                errAbort("organism/database name %s too long", mc->src);
                            memcpy(buf, mc->src, len);
                            buf[len] = 0;
                            org = hashFindVal(orgHash, orgName);
                            }
                        }
                    }
                else
                    org = hashFindVal(orgHash, orgName);
		}

	    /* create a  new org if necessary. */
	    if (org == NULL)
		{
		if (orderList != NULL)
		   errAbort("%s is not in orderList", orgName);
		AllocVar(org);
		slAddHead(&orgList, org);
		hashAddSaveName(orgHash, orgName, org, &org->name);
		org->dy = dyStringNew(native->size*1.5);
		dyStringAppendMultiC(org->dy, '.', symCount);
		if (nativeOrg == NULL)
		    nativeOrg = org;
		}
	    if (orderList == NULL && order > org->order)
		org->order = order;
	    org->hit = TRUE;

	    /* Fill it up with alignment. */
	    dyStringAppendN(org->dy, mc->text, subMaf->textSize);
	    }
	for (org = orgList; org != NULL; org = org->next)
	    {
	    if (!org->hit)
		dyStringAppendMultiC(org->dy, '.', subMaf->textSize);
	    org->hit = FALSE;
	    }
	symCount += subMaf->textSize;
	curPos = mcMaster->start + mcMaster->size;
	if (subMaf != maf)
	    mafAliFree(&subMaf);
	}
    }
if (curPos < end)
    {
    fillInMissing(nativeOrg, orgList, native, start, curPos, end);
    symCount += end - curPos;
    }
mafAliFreeList(&mafList);

slSort(&orgList, oneOrgCmp);
if (strand == '-')
    {
    for (org = orgList; org != NULL; org = org->next)
	reverseComplement(org->dy->string, org->dy->stringSize);
    }

/* Construct our maf */
AllocVar(maf);
maf->textSize = symCount;
for (org = orgList; org != NULL; org = org->next)
    {
    struct mafComp *mc;
    AllocVar(mc);
    if (org == orgList)
        {
	if (outName != NULL)
	    {
	    mc->src = cloneString(outName);
	    mc->srcSize = native->size;
	    mc->strand = '+';
	    mc->start = 0;
	    mc->size = native->size;
	    }
	else
	    {
	    mc->src = cloneString(masterSrc);
	    mc->srcSize = chromSize;
	    mc->strand = strand;
	    if (strand == '-')
	       reverseIntRange(&start, &end, chromSize);
	    mc->start = start;
	    mc->size = end-start;
	    }
	}
    else
        {
	int size = countAlpha(org->dy->string);
	mc->src = cloneString(org->name);
	mc->srcSize = size;
	mc->strand = '+';
	mc->start = 0;
	mc->size = size;
	}
    mc->text = cloneString(org->dy->string);
    dyStringFree(&org->dy);
    slAddHead(&maf->components, mc);
    }
slReverse(&maf->components);

slFreeList(&orgList);
freeHash(&orgHash);
return maf;
}

struct mafAli *hgBigMafFrag(
	char *database,     /* Database, must already have hSetDb to this */
        struct bbiFile *bbi,
	char *chrom,        /* Chromosome (in database genome) */
	int start, int end, /* start/end in chromosome */
	char strand,        /* Chromosome strand. */
	char *outName,      /* Optional name to use in first component */
	struct slName *orderList /* Optional order of organisms. */
	)
/* hgBigMafFrag - Extract maf sequences for a region from a bigMaf and call hgMafFragHelper. */
{
struct mafAli *mafList = bigMafLoadInRegion( bbi,  chrom, start, end);
return hgMafFragHelper(database, chrom, start, end, strand, mafList, outName, orderList);
}

struct mafAli *hgMafFrag(
	char *database,     /* Database, must already have hSetDb to this */
	char *track,        /* Name of MAF track */
	char *chrom,        /* Chromosome (in database genome) */
	int start, int end, /* start/end in chromosome */
	char strand,        /* Chromosome strand. */
	char *outName,      /* Optional name to use in first component */
	struct slName *orderList /* Optional order of organisms. */
	)
/* hgMafFrag- Extract maf sequences for a region from database and call hgMafFragHelper. */
{
struct sqlConnection *conn = hAllocConn(database);
struct mafAli  *mafList = mafLoadInRegion(conn, track, chrom, start, end);
struct mafAli *ret = hgMafFragHelper(database, chrom, start, end, strand, mafList, outName, orderList);

hFreeConn(&conn);

return ret;
}

struct consWiggle *wigMafWiggles(char *db, struct trackDb *tdb)
/* get conservation wiggle table names and labels from trackDb setting,
   ignoring those where table doesn't exist */
{
char *fields[20];
int fieldCt;
int i;
char *wigTable, *wigLabel;
struct consWiggle *wig, *wigList = NULL;
char *setting = trackDbSetting(tdb, CONS_WIGGLE);
if (!setting)
    return NULL;
fieldCt = chopLine(cloneString(setting), fields);
for (i = 0; i < fieldCt; i += 3)
    {
    wigTable = fields[i];
    if (hTableExists(db, wigTable))
        {
        AllocVar(wig);
        wig->table = cloneString(wigTable);
        wigLabel = (i+1 == fieldCt ? DEFAULT_CONS_LABEL : fields[i+1]);
        wig->leftLabel = cloneString(wigLabel);
        wigLabel = (i+2 >= fieldCt ? wig->leftLabel : fields[i+2]);
        wig->uiLabel = cloneString(wigLabel);
        slAddTail(&wigList, wig);
        }
    }
return wigList;
}

char *wigMafWiggleVar(char *prefix, struct consWiggle *wig,char **suffix)
// Return name of cart variable (and optionally the suffix) for this cons wiggle
{
char option[128];
safef(option, sizeof option, "%s.cons.%s", prefix, wig->leftLabel);
if (suffix != NULL)
    *suffix = option + strlen(prefix) + 1;
return (cloneString(option));
}

struct consWiggle *consWiggleFind(char *db,struct trackDb *parent,char *table)
/* Return conservation wig if it is found in the parent. */
{
if (parent == NULL || !startsWith("wigMaf", parent->type))
    return NULL;

struct consWiggle *wig, *wiggles = wigMafWiggles(db, parent);
for (wig = wiggles;
     wig != NULL && differentString(wig->table,table);
     wig = wig->next) {}
return wig;
}

struct mafAli *bigMafLoadInRegion( struct bbiFile *bbi,  char *chrom, int start, int end)
/* Read in MAF blocks from bigBed. */
{
struct lm *lm = lmInit(0);
struct bigBedInterval *bb, *bbList =  bigBedIntervalQuery(bbi, chrom, start, end, 0, lm);
struct mafAli *mafList = NULL;
for (bb = bbList; bb != NULL; bb = bb->next)
    {
    // the MAF block in the bigBed record has a semi-colon instead of newlines 
    replaceChar(bb->rest, ';','\n');

    struct mafFile mf;
    mf.lf = lineFileOnString(NULL, TRUE, bb->rest);

    struct mafAli *maf = mafNext(&mf);
    slAddHead(&mafList, maf);
    }
slReverse(&mafList);
return mafList;
}

struct hash *mafGetLabelHash(struct trackDb *tdb)
/* Get mapping of sequence name to label. */
{
char *labels = trackDbSetting(tdb, SPECIES_LABELS);
struct hash *labelHash = NULL;

if (labels)
    labelHash = hashFromString(labels);

return labelHash;
}

