/* omimParseRec - parse text of OMIM records by different field types */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "localmem.h"
#include "dystring.h"
#include "portable.h"
#include "obscure.h"


FILE *fh2; /* 2nd file handle pointing to the OMIM text file; opened in main, printField seeks and reads through it */
FILE *recFh;	/* Output for per-record lines; opened for writing in main from the outRecFile argument */
FILE *fieldFh;	/* Output for per-field lines; opened for writing in main from the outFieldFile argument */

void usage()
/* Report the command-line synopsis through errAbort (explain usage and exit). */
{
static const char *synopsis =
    "omimParseRec - parse OMIM records by different field types\n"
    "usage:\n"
    "   omimParseRec omimText outRecFile outFieldFile\n";

/* The text contains no conversion specifiers, so routing it through "%s"
 * produces exactly the same message. */
errAbort("%s", synopsis);
}

/* Option table handed to optionInit in main.  It holds only the {NULL, 0}
 * entry, i.e. this program defines no command-line options of its own. */
static struct optionSpec options[] = {
   {NULL, 0},
};

struct omimRecord
/* An OMIM record. */
    {
    char *id;		/* OMIM ID */
    char type;		/* OMIM type */
    char *title;	/* OMIM title */
    char *geneSymbol;	/* gene symbol */

    int  startPos;
    int  endPos;
    
    char *av;
    char *cd;
    char *cn;
    char *cs;
    char *ed;
    char *rf;
    char *sa;
    char *tx;
  };

struct omimField
/* An OMIM field. */
    {
    char *omimId;	/* OMIM ID */
    char *type;		/* field type */
    char *title;	/* field title */

    int  startPos;
    int  endPos;
    };

struct omimLitRef 
/* An OMIM literature reference.
 * Not referenced by any other code visible in this file. */
    {
    struct omimLitRef *next;	/* Next in singly linked list. */
    char *title;	/* Title of article. */
    char *cite;		/* Journal/book/patent citation. */
    struct slName *authorList;	/* Author names in lastName, F.M. format. */
    char *rp;		/* Somewhat complex 'Reference Position' line. */
    struct hashEl *rxList; /* Cross-references. */
    struct hashEl *rcList; /* TISSUE=XXX X; STRAIN=YYY; parsed out. */
    char *pubMedId;	/* pubMed ID, may be NULL. */
    char *medlineId;	/* Medline ID, may be NULL. */
    char *doiId;	/* DOI ID, may be NULL. */
    };

/* Scratch area into which printField reads the text of one field.  A field
 * must fit together with its terminating '\0', otherwise printField exits. */
char buffer[80*10000];

void printField(struct omimField *omimFd, FILE *outf)
/* Print out field info: one tab-separated line with OMIM id, field type,
 * start offset and length in bytes (endPos - startPos + 1) goes to outf.
 * The field's text is then loaded from the global fh2 into the global buffer
 * as a NUL-terminated string.
 *
 * A field whose computed length is zero or negative (e.g. a "*FIELD*" line
 * followed immediately by the next one gives -1) is treated as empty:
 * nothing is read and buffer is left as "".  The reported line is written
 * unchanged in that case.
 *
 * Exits with status 1 if the field does not fit in buffer. */
{
int fieldLen = omimFd->endPos - omimFd->startPos + 1;
size_t bytesRead = 0;

fprintf(outf, "%s\t%s\t%d\t%d\n", 
       omimFd->omimId, omimFd->type, omimFd->startPos, fieldLen);
/* NOTE(review): this flushes stdout, not outf -- confirm which was intended. */
fflush(stdout);

if (fieldLen <= 0)
    {
    /* Never hand a negative count to fread: converted to size_t it becomes
     * a huge value and the read would run past the end of buffer. */
    buffer[0] = '\0';
    return;
    }

/* One extra byte is needed for the terminating '\0'. */
if ((size_t)fieldLen + 1 > sizeof(buffer))
    {
    fprintf(stderr, 
    	    "field %s for OMIM record %s needs %d bytes, which exceeded buffer size of %d\n", 
	    omimFd->type, omimFd->omimId, 
	    fieldLen + 1, (int)sizeof(buffer));
    exit(1);
    }

/* If positioning fails there is nothing meaningful to read; leave buffer empty. */
if (fseek(fh2, (long)(omimFd->startPos), SEEK_SET) == 0)
    bytesRead = fread(buffer, (size_t)1, (size_t)fieldLen, fh2);
buffer[bytesRead] = '\0';
}
    
struct omimRecord *omimRecordNext(struct lineFile *lf, 
	struct lm *lm, 	/* Local memory pool for this structure. */
	struct dyString *dy,	/* Scratch string to use. */
	FILE *recFh,
	FILE *fieldFh)
/* Read next record from file and parse it into omimRecord structure
 * that is allocated in memory. */
{
char *line;
struct omimRecord *omimr;
char *chp;
int lineSize;
char *fieldType;
boolean recDone, fieldDone;
char *row[1];

boolean endOfFile = FALSE;
boolean endOfRec  = FALSE;

boolean firstFieldDone = FALSE;

struct omimField *omimFd = NULL;

recDone = FALSE;

/* Parse record number and title lines. */
    if (!lineFileRow(lf, row))
	return NULL;
    if (!sameString(row[0], "*RECORD*"))
	{
	errAbort("Expecting *RECORD* line %d of %s", lf->lineIx, lf->fileName);
    	}
    
    lmAllocVar(lm, omimr);
    omimr->startPos = lf->lineStart + lf->bufOffsetInFile;
    

    if (!lineFileNext(lf, &line, &lineSize))
	return NULL;
    
    if (!sameString(line, "*FIELD* NO"))
	errAbort("Expecting *FIELD* NO line %d of %s", lf->lineIx, lf->fileName);
    
    if (!lineFileNextReal(lf, &line)) errAbort("%s ends in middle of a record", lf->fileName);
    omimr->id = lmCloneString(lm, line);
   
    if (!lineFileNextReal(lf, &line)) errAbort("%s ends in middle of a record", lf->fileName);
    if (!sameString(line, "*FIELD* TI"))
	errAbort("Expecting *FIELD* TI line %d of %s ---%s---", lf->lineIx, lf->fileName, line);
    if (!lineFileNext(lf, &line, &lineSize)) errAbort("%s ends in middle of a record", lf->fileName);
    if (!isdigit(*line)) 
    	{
	omimr->type = *line;
	}
    else
    	{
	omimr->type = '\0';
	}
   
    /* some records may not have gene symbol */
    omimr->geneSymbol = "";
    chp = strstr(line, ";");
    if (chp != NULL)
    	{
	*chp = '\0';
	chp++;
	if (*chp == ' ') chp++;
	if (*chp != '\0') omimr->geneSymbol = cloneString(chp);
	}
    chp = strstr(line, omimr->id);
    if (chp == NULL)
    	{
	errAbort("Expecting TI line for record %s, %d of %s ---%s===", omimr->id, lf->lineIx, 
	lf->fileName, line);
   	} 
    chp = chp + strlen(omimr->id); chp++;
    omimr->title = lmCloneString(lm, chp);
    
    // !!! need to enhance it later to include startPos and length info.
    fprintf(recFh, "%s\t%c\t%s\t%s\n", omimr->id, omimr->type, omimr->geneSymbol, omimr->title);
   
    /* !!! temporarily skip lines before first FIELD after title */
    /* further processing TBD */
    while (strstr(line, "*FIELD*") == NULL)
    	{
	lineFileNext(lf, &line, &lineSize);
	}
    lineFileReuse(lf);
    
    /* process a field */
    firstFieldDone = FALSE;
    fieldDone = FALSE;
    while (!recDone)
    	{
        if (!lineFileNext(lf, &line, &lineSize)) 
	    {
	    endOfFile = TRUE;
	    }
	    
	/* "*THEEND*" signals the end of the OMIM text file */
	if (sameWord(line, "*THEEND*")) endOfFile = TRUE;
	
	if (!endOfFile && (strstr(line, "*RECORD*") != NULL)) 
	    {
	    endOfRec = TRUE;
	    }
	    
	/* handle termination of record here */    
	if (endOfFile || endOfRec)
	    {
	    lineFileReuse(lf);
	    recDone = TRUE;
    	    if (firstFieldDone) 
	    	{
		/* extra minus 1 to get rid of empty line at the end */
		omimr ->endPos = lf->lineStart + lf->bufOffsetInFile - 1 - 1;
		omimFd->endPos = lf->lineStart + lf->bufOffsetInFile - 1 - 1;
		
	   	/* End of record or end of line also means end of a field.  Print the field info */
		printField(omimFd, fieldFh);
		} 
	    break;
	    }
	
	chp = strstr(line, "*FIELD*");
	if (chp != NULL)
	    {
	    fieldType = chp + strlen("*FIELD* ");
	    if (!firstFieldDone)
	    	{
    	    	lmAllocVar(lm, omimFd);
		firstFieldDone = TRUE;
		/* do not print anything at the first "*FIELD*" line */
		}
	    else
	    	{
		/* extra minus 1 to get rid of empty line at the end */
		omimFd->endPos = lf->lineStart + lf->bufOffsetInFile - 1 - 1;
	   	/* print previous field info */
		printField(omimFd, fieldFh);
		}
		
	    /* fill in current field info */		
	    omimFd->omimId   = cloneString(omimr->id);
	    omimFd->type     = cloneString(fieldType);
	    omimFd->startPos = lf->lineEnd + lf->bufOffsetInFile;
            }
	}
if (endOfFile) return NULL;    
return omimr;
}

void omimParseRec(char *omimTextFile, FILE *recFh, FILE *fieldFh)
/* omimParseRec - parse OMIM flat text file. */
{
struct lineFile *lf = lineFileOpen(omimTextFile, TRUE);
struct omimRecord *omimr;
struct dyString *dy = dyStringNew(4096);

for (;;)
    {
    struct lm *lm = lmInit(8*1024);
    omimr = omimRecordNext(lf, lm, dy, recFh, fieldFh);
    if (omimr == NULL)
        break;
    lmCleanup(&lm);
    }
dyStringFree(&dy);
}

int main(int argc, char *argv[])
/* Process command line.
 * Expects exactly three arguments after option processing: the OMIM text
 * file, the record output file and the field output file.
 * Returns 0 on success, 1 if an output file could not be closed cleanly. */
{
int status = 0;

optionInit(&argc, argv, options);
if (argc != 4)
    usage();

/* The input is opened twice: fh2 is used by printField for random access,
 * while omimParseRec opens its own sequential reader on the same path. */
fh2     = mustOpen(argv[1], "r");
recFh   = mustOpen(argv[2], "w");
fieldFh = mustOpen(argv[3], "w");

omimParseRec(argv[1], recFh, fieldFh);

/* Close the outputs explicitly so that a failed final flush (for example a
 * full disk) is reported instead of silently truncating the result. */
if (fclose(recFh) != 0)
    {
    perror(argv[2]);
    status = 1;
    }
if (fclose(fieldFh) != 0)
    {
    perror(argv[3]);
    status = 1;
    }
fclose(fh2);
recFh = fieldFh = fh2 = NULL;

return status;
}
