/* hmmpfamParse - Parse hmmpfam files.. */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "common.h"
#include "linefile.h"
#include "errAbort.h"
#include "spacedColumn.h"
#include "hmmPfamParse.h"


void hpfModelFree(struct hpfModel **pMod)
/* Free memory associated with hpfModel */
{
struct hpfModel *mod = *pMod;
if (mod != NULL)
    {
    freeMem(mod->name);
    freeMem(mod->description);
    slFreeList(&mod->domainList);
    freez(pMod);
    }
}

void hpfModelFreeList(struct hpfModel **pList)
/* Free a list of dynamically allocated hpfModel's */
{
struct hpfModel *el, *next;
for (el = *pList; el != NULL; el = next)
    {
    next = el->next;
    hpfModelFree(&el);
    }
*pList = NULL;
}


void hpfResultFree(struct hpfResult **pHr)
/* Free memory associated with hpfResult */
{
struct hpfResult *hr = *pHr;
if (hr != NULL)
    {
    freeMem(hr->name);
    hpfModelFreeList(&hr->modelList);
    freez(pHr);
    }
}

void hpfResultFreeList(struct hpfResult **pList)
/* Free a list of dynamically allocated hpfResult's */
{
struct hpfResult *el, *next;
for (el = *pList; el != NULL; el = next)
    {
    next = el->next;
    hpfResultFree(&el);
    }
*pList = NULL;
}

void parseErr(struct lineFile *lf, char *format, ...)
/* Print out a parse error message. */
{
va_list args;
va_start(args, format);
vaWarn(format, args);
va_end(args);
errAbort("line %d of %s", lf->lineIx, lf->fileName);
}

char *needLineStartingWith(struct lineFile *lf, char *start, int maxCount)
/* Get next line that starts as so */
{
char *line = lineFileSkipToLineStartingWith(lf, start, maxCount);
if (line == NULL)
     parseErr(lf, "Missing line starting with \"%s\"", start);
return line;
}

void spacedColumnFatten(struct spacedColumn *colList)
/* Make columns extend all the way to the next column. */
{
struct spacedColumn *col, *nextCol;
for (col = colList; col != NULL; col = nextCol)
    {
    nextCol = col->next;
    if (nextCol == NULL)
        break;
    col->size = nextCol->start - col->start - 1;
    }
}

struct hpfModel *hpfFindResultInModel(struct hpfResult *hr, char *modName)
/* Look for named result in model. */
{
struct hpfModel *mod;
for (mod = hr->modelList; mod != NULL; mod = mod->next)
    if (sameString(mod->name, modName))
	break;
return mod;
}

struct hpfResult *hpfNext(struct lineFile *lf)
/* Parse out next record in hmmpfam result file. */
{
/* Seek to first line that starts with "Query sequence:" and parse name out of it. */
char *queryPat = "Query sequence: ";
char *line = lineFileSkipToLineStartingWith(lf, queryPat, 100);
if (line == NULL)
    return NULL;
line += strlen(queryPat);
char *query = cloneString(nextWord(&line));
if (query == NULL)
    parseErr(lf, "Missing sequence name");

/* Seek to start of model list, figuring out width of fields we need in the process. */
needLineStartingWith(lf, "Scores for sequence family", 10);
needLineStartingWith(lf, "Model ", 2);
char *template = needLineStartingWith(lf, "----", 1);
struct spacedColumn *colList = spacedColumnFromSample(template);
spacedColumnFatten(colList);
int colCount = slCount(colList);
if (colCount < 5)
    parseErr(lf, "Expecting at least 5 columns");

/* Parse out all the models. */
struct hpfResult *hr;
AllocVar(hr);
hr->name = query;
for (;;)
    {
    lineFileNeedNext(lf, &line, NULL);
    line = skipLeadingSpaces(line);
    if (line[0] == 0)
        break;
    if (startsWith("[no hits above thresholds]", line))
        break;
    char *row[colCount];
    if (!spacedColumnParseLine(colList, line, row))
        parseErr(lf, "short line");
    struct hpfModel *mod;
    AllocVar(mod);
    mod->name = cloneString(row[0]);
    mod->description = cloneString(row[1]);
    mod->score = lineFileNeedDouble(lf, row, 2);
    mod->eVal = lineFileNeedDouble(lf, row, 3);
    slAddTail(&hr->modelList, mod);
    }
slFreeList(&colList);

/* Skip over to the section on domains, figuriong out column widths while we're at it. */
needLineStartingWith(lf, "Parsed for domains:", 10);
needLineStartingWith(lf, "Model ", 2);
template = needLineStartingWith(lf, "----", 1);
colList = spacedColumnFromSample(template);
colCount = slCount(colList);
if (colCount < 8)
    parseErr(lf, "Expecting at least 8 columns.");
struct spacedColumn *col2 = colList->next;
colList->size = col2->start - 1;

/* Parse out all the domains. */
for (;;)
    {
    lineFileNeedNext(lf, &line, NULL);
    line = skipLeadingSpaces(line);
    if (line[0] == 0)
        break;
    if (startsWith("[no hits above thresholds]", line))
        break;
    char *row[colCount];
    if (!spacedColumnParseLine(colList, line, row))
        parseErr(lf, "short line");
    struct hpfModel *mod = hpfFindResultInModel(hr, row[0]);
    if (mod == NULL)
        parseErr(lf, "Model %s in domain section but not model section", row[0]);
    struct hpfDomain *dom;
    AllocVar(dom);
    dom->qStart = lineFileNeedNum(lf, row, 2) - 1;
    dom->qEnd = lineFileNeedNum(lf, row, 3);
    dom->hmmStart = lineFileNeedNum(lf, row, 4) - 1;
    dom->hmmEnd = lineFileNeedNum(lf, row, 5);
    dom->score = lineFileNeedDouble(lf, row, 6);
    dom->eVal = lineFileNeedDouble(lf, row, 7);
    slAddTail(&mod->domainList, dom);
    }
slFreeList(&colList);
if (!lineFileSkipToLineStartingWith(lf, "//", 10000000))
    parseErr(lf, "Expecting //");
return hr;
}

