/* seq.c -- procedures for reading a file of DNA sequences */

/* Copyright (C) 2011 The Regents of the University of California 
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */

#include "util.h"
#include "seq.h"
#include <errno.h>


/* This implementation uses a "growable array" abstraction */

typedef struct charvec {
    char *a;
    unsigned int len;
    unsigned int max;
    void *((*alloc)(void*, size_t));
    void (*free)(void*);
} charvec_t;

#define charvec_INIT(a,f)  {0, 0, 0, a, f}

static int charvec_fini(charvec_t *t);
static int charvec_need(charvec_t *t, unsigned int n);
static int charvec_more(charvec_t *t, unsigned int n);
static int charvec_append(charvec_t *t, char e);
static int charvec_fit(charvec_t *t);

static int charvec_fini(charvec_t *t)
{
    if (t->a && t->free) { t->free(t->a); t->a = 0; t->max = 0; }
    t->len = 0;
    return 1;
}

#ifndef BASE_ALLOC
#define BASE_ALLOC 30
#endif
enum { BASE = BASE_ALLOC };

static int charvec_need(charvec_t *t, unsigned int n)
{
    if (t->a == 0) {
        t->len = 0; 
        t->max = n;
        t->a = t->alloc(0, n * sizeof(char));
        return t->a != 0;
    }

    if (n > t->max) { 
        unsigned int i = BASE + n + (n >> 3); 
        void *p = t->alloc(t->a, i * sizeof(char));
        if (!p)
            return 0;
        t->max = i;
        t->a = p;
    }
    return 1;
}

static int charvec_more(charvec_t *t, unsigned int n)
{
    return charvec_need(t, n + t->len);
}

static int charvec_append(charvec_t *t, char e)
{
    if (!charvec_more(t, 1))
        return 0;
    t->a[t->len++] = e;
    return 1;
}

static int charvec_fit(charvec_t *t)
{
    unsigned int i = t->len;
    void *p = t->alloc(t->a, i * sizeof(char));
    if (!p)
        return 0;
    t->max = i;
    t->a = p;
    return 1;
}

#define _ (-1)

extern const signed char fasta_encoding[256];
const signed char fasta_encoding[] = {
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, 0, _, 1, _, _, _, 2, _, _, _, _, _, _, _, _,
 _, _, _, _, 3, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
};

extern const signed char nfasta_encoding[256];
const signed char nfasta_encoding[] = {
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, 0, _, 1, _, _, _, 2, _, _, _, _, _, _, _, _,
 _, _, _, _, 3, _, _, _, _, _, _, _, _, _, _, _,
 _, 0, _, 1, _, _, _, 2, _, _, _, _, _, _, _, _,
 _, _, _, _, 3, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
 _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
};

static const unsigned char nfasta_ctype[256];
static const unsigned char nfasta_ctype[] = {
 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0,
 0, 1, 3, 1, 3, 0, 0, 1, 3, 0, 0, 3, 0, 3, 1, 0,
 0, 0, 3, 3, 1, 0, 3, 3, 1, 3, 0, 0, 0, 0, 0, 0,
 0, 1, 3, 1, 3, 0, 0, 1, 3, 0, 0, 3, 0, 3, 1, 0,
 0, 0, 3, 3, 1, 0, 3, 3, 1, 3, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

static const unsigned char dna_complement[256];
static const unsigned char dna_complement[256] = 
  "                                                                "
  " TVGH  CD  M KN   YSA BWXR       tvgh  cd  m kn   ysa bwxr      "
  "                                                                "
  "                                                                ";
/* ................................................................ */
/* @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~. */
/* ................................................................ */
/* ................................................................ */

// static const unsigned char fasta_decoding[4];
// static const unsigned char fasta_decoding[] = "ACGT";

static void Fatalfr(const char *fmt, ...)
{
	va_list ap;
	va_start(ap, fmt);
	fflush(stdout);
	print_argv0();
	(void)vfprintf(stderr, fmt, ap);
	(void)fprintf(stderr, ": %s\n", strerror(errno));
	va_end(ap);
	exit(1);
}

void *ckrealloc(void * p, size_t size);

static SEQ *seq_mask_inplace(SEQ *seq);
static int getpair(FILE *fp, int *a, int *b);
static char *byte_fill_range(uchar *p, int l, int c, int a, int b);
static int ws(int c);
static int getnwc(FILE *fp);
static void un_getc(int c, FILE *fp);
static void char_append(charvec_t *s, int c);

enum /* powerset */
{ SEQ_IS_SUBRANGE = (1<<0) /* seq is a subrange of a file */
/* , SEQ_IS_REVCOMP  = (1<<1)  seq is reverse compliment of a file */
, SEQ_IS_SUBSEQ   = (1<<2) /* seq is a reference to another seq */
, SEQ_HAS_MASK    = (1<<3) /* seq has a mask applied */
, SEQ_HAS_PIPE	  = (1<<4) /* input fd is a pipe */
, SEQ_DO_REVCOMP  = (1<<5) /* make it so after open */
, SEQ_DO_SUBRANGE = (1<<6) /* make it so after open */
, SEQ_DO_MASK	  = (1<<7) /* make it so after open */
, SEQ_ALLOW_AMB	  = (1<<8) /* checked while reading */
, SEQ_DISALLOW_AMB  = (1<<9) /* checked while reading */
};

enum { Nfasta_bad=0, Nfasta_nt=1, Nfasta_ws=2, Nfasta_amb=3 };

static int ws(int c)
{
	return (c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r');
}

static int getnwc(FILE *fp)
{
	int c = EOF;
	if (!feof(fp)) do c = getc(fp); while (c != EOF && ws(c));
	return c;
}

static void un_getc(int c, FILE *fp)
{
	if (c != EOF)
		if (ungetc(c, fp) == EOF)
			fatalf("cannot ungetc '%c'", c);
}

static void char_append(charvec_t *s, int c)
{
	if (!charvec_append(s, (char)c))
		fatal("cannot append");
}

int seq_read(SEQ *seq)
{
	int b, c;
	charvec_t shdr = charvec_INIT(ckrealloc,ckfree);
	charvec_t sseq = charvec_INIT(ckrealloc,ckfree);

	if (feof(seq->fp))
		return 0;

	if (seq->count > 0) { 
		if (seq->flags & SEQ_IS_SUBRANGE) {
			return 0;
		} else {
			seq->from = 1;
			seq->slen = -1; /* will be computed below */
		}
	}

	if (seq->header) ZFREE(seq->header);
	if (seq->seq) ZFREE(seq->seq);

	seq->offset = ftell(seq->fp);

	/* --- header --- */
	c = getnwc(seq->fp);
	if (c == '>') {
		while (c != '\n' && c != EOF) {
			char_append(&shdr, c);
			c = getc(seq->fp);
		}
	} else {
		un_getc(c, seq->fp);
	}
	char_append(&shdr, 0);
	seq->header = shdr.a;
	seq->hlen = shdr.len;

	/* --- seq --- */
	b = '\n';
	c = getnwc(seq->fp);
	while ((c != EOF) && !(b == '\n' && c == '>')) {
		switch (nfasta_ctype[c]) {
		case Nfasta_nt:
			char_append(&sseq, c);
			break;
		case Nfasta_ws:
			/* skip space */
			break;
		case Nfasta_amb:
			if (seq->flags & SEQ_ALLOW_AMB) {
				char_append(&sseq, c);
				break;
			}
			/* FALLTHRU */
		default:
			fatalf("non-DNA character '%c' in sequence '%s'",
			  c, seq->fname);
			break;
		}
		b = c;
		c = getc(seq->fp);
	}
	un_getc(c, seq->fp);

	/* check conformance */
	if (SEQ_LEN(seq) == -1) {
		char_append(&sseq, 0);
		charvec_fit(&sseq);
		seq->seq = (uchar*)sseq.a;
		seq->slen = sseq.len;
		if (seq->slen > 0) --seq->slen;  /* don't include '\0' */
	} else {
		charvec_t ssub = charvec_INIT(ckrealloc,ckfree);
		int i;

		if (SEQ_FROM(seq) < 1 ||
		    (int)sseq.len < SEQ_FROM(seq) ||
		    SEQ_TO(seq) < 1 ||
		    (int)sseq.len < SEQ_TO(seq) ||
		    SEQ_TO(seq) < SEQ_FROM(seq))
		    fatalf("range [%d,%d] incommensurate with sequence [%d,%d]",
				SEQ_FROM(seq), SEQ_TO(seq), 1, sseq.len);
		
		for (i = SEQ_FROM(seq); i <= SEQ_TO(seq); ++i)
			char_append(&ssub, sseq.a[i-1]);
		char_append(&ssub, 0);
		charvec_fini(&sseq);
		seq->seq = (uchar*)ssub.a;
	}

	seq->flags = seq->flags &~ SEQ_IS_REVCOMP;
        if (seq->flags & SEQ_DO_REVCOMP) {
	    (void)seq_revcomp_inplace(seq);
	}
	if (seq->flags & SEQ_HAS_MASK) {
	    (void)seq_mask_inplace(seq);
	}

	seq->count++;
	return 1;
}

static SEQ *seq_mask_inplace(SEQ *seq)
{
        int a, b;  
        FILE *fp = fopen(seq->maskname, "r");

        if (fp == 0) {
                Fatalfr("cannot open '%s'", seq->maskname);
                return 0;
        } else {
                while (getpair(fp, &a, &b))
                        byte_fill_range(SEQ_CHARS(seq),SEQ_LEN(seq),'X',a,b);
                fclose(fp);
                seq->flags |= SEQ_HAS_MASK;
                return seq;
        }
}

#define BUF 128
static int getpair(FILE *fp, int *a, int *b)
{
        char buf[BUF];

        /* XXX - should handle comments, etc */
        if (fgets(buf, (int)sizeof(buf), fp) == 0)
                return 0;
        if (sscanf(buf, "%d%d", a, b) != 2)
                return 0;
        return 1;
}

static char *byte_fill_range(uchar *p, int l, int c, int a, int b)
{
        /* fill [a,b] (1-indexed) in p with c */

        a--; b = b-a; /* make it into a 0-indexed, open interval */
        return (b < 0 || l < b) ? 0 : memset(p+a, c, (size_t)b);
}

static int parse_fname(const char* arg, 
		char **fname, int *from, int *len, char **maskfile)
{
        char *p = 0;
        int flags = 0;

	/* "seqfile{maskfile}[from,to]-" */
        *fname = copy_string(arg);

        p = (*fname)+strlen(*fname)-1;
        if (*p == '-') {
            *p = 0;
            flags |= SEQ_DO_REVCOMP;
        }

        if ((p = strchr(*fname, '['))) {
                int to;

		if (sscanf(p+1, "%d,%d", from, &to) != 2)
			return -1;
		if (*from <= 0 || *from > to)
			return -1;
		*p = '\0';
		*len = to - *from + 1;
		flags |= (SEQ_DO_SUBRANGE|SEQ_IS_SUBRANGE);
	} else {
		*from = 1;
		*len = -1;
	}

        if ((p = strchr(*fname, '{'))) {
		char *q = strchr(p+1, '}');
		if (q) {
			*p = *q = 0;
			if (maskfile) {
				*maskfile = copy_string(p+1);
				flags |= SEQ_DO_MASK;
			}
		}
	} else {
		*maskfile = copy_string(""); /* XXX ugh */
	}
	return flags;
}

static int check_flags(int flags)
{
	switch (flags & (SEQ_DISALLOW_AMB|SEQ_ALLOW_AMB)) {
	case 0: 
		/* default is to allow ambiguious */
		flags |= SEQ_ALLOW_AMB;
		break;
	case SEQ_ALLOW_AMB:
	case SEQ_DISALLOW_AMB:
		break;
	case SEQ_DISALLOW_AMB|SEQ_ALLOW_AMB:
		fatalf("seq_open: contradictory flags: SEQ_DISALLOW_AMB|SEQ_ALLOW_AMB");
	}
	return flags;
}

SEQ* seq_open(const char *fname)
{
	SEQ *s = ckallocz(sizeof(SEQ));
	int r, flags = 0;

	r = parse_fname(fname, 
		&(s->fname), &(s->from), &(s->slen), &(s->maskname));
	if (r == -1)
		fatalf("improper positions specification: %s", fname);

	s->flags = check_flags(r|flags);
	s->fp = ckopen(s->fname, "r");
	s->count = 0;
	s->offset = 0;
	return s;
}

SEQ *seq_copy(const SEQ *s)
{
	SEQ *ss = ckallocz(sizeof(SEQ));
	*ss = *s;
	ss->seq = (uchar*)copy_string((const char*)s->seq);
	ss->header = copy_string(s->header);
	ss->fname = copy_string(s->fname);
	ss->maskname = copy_string(s->fname);
	ss->fp = 0; /* XXX - no subsequent seq_read operations allowed */
	ss->offset = 0; /* XXX - no subsequent seq_read operations allowed */
	return ss;
}

SEQ* seq_close(SEQ *s)
{
	if (s) {
		if (!(s->flags & SEQ_IS_SUBSEQ)) {
			if (s->fp)
				fclose(s->fp);
			if (s->fname) ckfree(s->fname);
			if (s->header) ckfree(s->header);
			if (s->seq) ckfree(s->seq);
			if (s->maskname) ckfree(s->maskname);
		}
		memset(s, 0, sizeof(SEQ));
		ckfree(s);
	}
	return 0;
}

uchar dna_cmpl(uchar ch)
{
	/* XXX - assumes ascii, returns space on error. */
	return dna_complement[ch];
}

void do_revcomp(uchar *s, int len)
{
	uchar *p = s + len - 1;

	while (s<=p) {
		uchar c;

		c = dna_cmpl(*s); 
		*s = dna_cmpl(*p); 
		*p = c;
		++s, --p;
	}
}

SEQ *seq_revcomp_inplace(SEQ *seq) 
{
	do_revcomp(SEQ_CHARS(seq), SEQ_LEN(seq));
	seq->flags ^= SEQ_IS_REVCOMP;
	return seq;
}

SEQ *seq_get(const char *fname)
{
	SEQ *s = seq_open(fname);
	int r = seq_read(s);
	if (r < 0)
		Fatalfr("could not read from %s", fname);
	else if (r == 0)
		return 0;
	else
		return s;
	/*NOTREACHED*/
	return 0;
}
