/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

/* Parse EMBL formatted files. EMBL files are basically line
 * oriented.  Each line begins with a short (usually two letter)
 * type word.  Adjacent lines with the same type are generally
 * considered logical extensions of each other.  In many cases
 * lines can be considered fields in an EMBL database.  Records
 * are separated by lines starting with '//'.  Generally lines
 * starting with XX are empty and used to make the records more
 * human readable.   Here is an example record:
 
 AC  M00001
 XX
 ID  V$MYOD_01
 XX
 NA  MyoD
 XX
 DT  EWI (created); 19.10.92.
 DT  ewi (updated); 22.06.95.
 XX
 PO     A     C     G     T
 01     0     0     0     0
 02     0     0     0     0
 03     1     2     2     0
 04     2     1     2     0
 05     3     0     1     1
 06     0     5     0     0
 07     5     0     0     0
 08     0     0     4     1
 09     0     1     4     0
 10     0     0     0     5
 11     0     0     5     0
 12     0     1     2     2
 13     0     2     0     3
 14     1     0     3     1
 15     0     0     0     0
 16     0     0     0     0
 17     0     0     0     0
 XX
 BF  T00526; MyoD                         ; mouse
 XX
 BA  5 functional elements in 3 genes
 XX
 XX
 //
 
 */

#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "emblParse.h"


boolean emblLineGroup(struct lineFile *lf, char type[16], struct dyString *val)
/* Read next line of embl file.  Read line after that too if it
 * starts with the same type field.  The type word is copied into
 * type.  The text that follows the type word on each line of the
 * group is accumulated in val, with a '\n' between lines.  The
 * first line of a different type is pushed back onto lf for the
 * next call.  Aborts on a blank line or on a type word of 16 or
 * more characters.  Return FALSE at EOF. */
{
char *line, *word;
int typeLen = 0;

dyStringClear(val);
while (lineFileNext(lf, &line, NULL))
    {
    line = skipLeadingSpaces(line);

    /* Once leading white space is skipped, the only way a line can
     * lack a type word is to be empty.  Reject it here, before the
     * type parsing below is asked for a word that is not there. */
    if (line[0] == 0)
        errAbort("embl line that doesn't start with type line %d of %s", 
		lf->lineIx, lf->fileName);
    if (typeLen == 0)
        {
	/* First line of the group: its first word is the type. */
	word = nextWord(&line);
	typeLen = strlen(word);
	if (typeLen >= 16)
	    errAbort("Type word at start of line too long for embl file line %d of %s",
	    	lf->lineIx, lf->fileName);
	strcpy(type, word);
	}
    else if (!startsWith(type, line) || !isspace(line[typeLen]))
        {
	/* Different type: leave this line for the next call. */
	lineFileReuse(lf);
	break;
	}
    else
        {
	/* Continuation of the same type: separate from the previous
	 * line and step over the repeated type word. */
	dyStringAppendC(val, '\n');
	nextWord(&line);
	}

    /* line is NULL here when nothing followed the type word. */
    if (line != NULL)
	{
	/* Usually have two spaces after type. */
	if (isspace(line[0]))
	   ++line;
	if (isspace(line[0]))
	   ++line;

	/* Append what's rest of line to return value. */
	dyStringAppend(val, line);
	}
    }
return typeLen > 0;
}

struct hash *emblRecord(struct lineFile *lf)
/* Read next record and return it in hash.   (Free this
 * hash with freeHashAndVals.)   Hash is keyed by type
 * and has string values.  Reading stops after the '//' line
 * that ends the record; that line itself is not stored.
 * Returns NULL if no lines are left in the file.  Warns if
 * the file ends before a '//' line is seen. */
{
struct hash *hash = NULL;
char type[16];
struct dyString *val = dyStringNew(256);
boolean gotEnd = FALSE;

while (emblLineGroup(lf, type, val))
    {
    if (hash == NULL)
        hash = newHash(7);
    if (sameString(type, "//"))
        {
	gotEnd = TRUE;
	break;
	}
    /* Store a copy of the text, since val is cleared and refilled
     * for the next line group. */
    hashAdd(hash, type, cloneString(val->string));
    }
if (hash != NULL && !gotEnd)
    warn("Incomplete last record of embl file %s\n", lf->fileName);

/* The scratch buffer belongs to this call only; release it so that
 * reading many records does not leak one buffer per record. */
dyStringFree(&val);
return hash;
}

static void notEmbl(char *fileName)
/* Report through errAbort that fileName is not an EMBL file. 
 * Shared by the format checks in emblOpen. */
{
errAbort("%s is not an emblFile", fileName);
}

struct lineFile *emblOpen(char *fileName, char type[256])
/* Open fileName as an embl file and read its first record, which
 * must contain a VV line; complain via notEmbl() if it does not.
 * If type is non-NULL the VV value is copied into it, and a value
 * of 256 or more characters is also rejected.  The returned file
 * has already had its first record read.  Close this with
 * lineFileClose(). */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct hash *firstRecord = emblRecord(lf);
char *vvVal = NULL;

/* No record at all means the file cannot be embl. */
if (firstRecord == NULL)
    notEmbl(fileName);

/* The first record has to carry a VV line. */
vvVal = hashFindVal(firstRecord, "VV");
if (vvVal == NULL)
    notEmbl(fileName);

/* Hand the VV value back only when the caller asked for it. */
if (type != NULL)
    {
    if (strlen(vvVal) >= 256)
	notEmbl(fileName);
    strcpy(type, vvVal);
    }

freeHashAndVals(&firstRecord);
return lf;
}