/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "pat.h"

/**
 * Parse a name from fb_ and store in r.  Assume that the next
 * character obtained via fb_.get() is the first character of
 * the sequence and the string stops at the next char upto (could
 * be tab, newline, etc.).
 */
int QseqPatternSource::parseName(
	Read& r,      // buffer for mate 1
	Read* r2,     // buffer for mate 2 (NULL if mate2 is read separately)
	bool append,     // true -> append characters, false -> skip them
	bool clearFirst, // clear the name buffer first
	bool warnEmpty,  // emit a warning if nothing was added to the name
	bool useDefault, // if nothing is read, put readCnt_ as a default value
	int upto)        // stop parsing when we first reach character 'upto'
{
	if(clearFirst) {
		if(r2 != NULL) r2->name.clear();
		r.name.clear();
	}
	while(true) {
		int c;
		if((c = fb_.get()) < 0) {
			// EOF reached in the middle of the name
			return -1;
		}
		if(c == '\n' || c == '\r') {
			// EOL reached in the middle of the name
			return -1;
		}
		if(c == upto) {
			// Finished with field
			break;
		}
		if(append) {
			if(r2 != NULL) r2->name.append(c);
			r.name.append(c);
		}
	}
	// Set up a default name if one hasn't been set
	if(r.name.empty() && useDefault && append) {
		char cbuf[20];
		itoa10(readCnt_, cbuf);
		r.name.append(cbuf);
		if(r2 != NULL) r2->name.append(cbuf);
	}
	if(r.name.empty() && warnEmpty) {
		cerr << "Warning: read had an empty name field" << endl;
	}
	return (int)r.name.length();
}

/**
 * Parse a single sequence from fb_ and store in r.  Assume
 * that the next character obtained via fb_.get() is the first
 * character of the sequence and the sequence stops at the next
 * char upto (could be tab, newline, etc.).
 */
int QseqPatternSource::parseSeq(
	Read& r,
	int& charsRead,
	int& trim5,
	char upto)
{
	int begin = 0;
	int c = fb_.get();
	assert(c != upto);
	r.patFw.clear();
	r.color = gColor;
	if(gColor) {
		// NOTE: clearly this is not relevant for Illumina output, but
		// I'm keeping it here in case there's some reason to put SOLiD
		// data in this format in the future.
	
		// This may be a primer character.  If so, keep it in the
		// 'primer' field of the read buf and parse the rest of the
		// read without it.
		c = toupper(c);
		if(asc2dnacat[c] > 0) {
			// First char is a DNA char
			int c2 = toupper(fb_.peek());
			// Second char is a color char
			if(asc2colcat[c2] > 0) {
				r.primer = c;
				r.trimc = c2;
				trim5 += 2; // trim primer and first color
			}
		}
		if(c < 0) { return -1; }
	}
	while(c != upto) {
		if(c == '.') c = 'N';
		if(gColor) {
			if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
		}
		if(isalpha(c)) {
			assert_in(toupper(c), "ACGTN");
			if(begin++ >= trim5) {
				assert_neq(0, asc2dnacat[c]);
				r.patFw.append(asc2dna[c]);
			}
			charsRead++;
		}
		if((c = fb_.get()) < 0) {
			return -1;
		}
	}
	r.patFw.trimEnd(gTrim3);
	return (int)r.patFw.length();
}

/**
 * Parse a single quality string from fb_ and store in r.
 * Assume that the next character obtained via fb_.get() is
 * the first character of the quality string and the string stops
 * at the next char upto (could be tab, newline, etc.).
 */
int QseqPatternSource::parseQuals(
	Read& r,
	int charsRead,
	int dstLen,
	int trim5,
	char& c2,
	char upto = '\t',
	char upto2 = -1)
{
	int qualsRead = 0;
	int c = 0;
	if (intQuals_) {
		// Probably not relevant
		char buf[4096];
		while (qualsRead < charsRead) {
			qualToks_.clear();
			if(!tokenizeQualLine(fb_, buf, 4096, qualToks_)) break;
			for (unsigned int j = 0; j < qualToks_.size(); ++j) {
				char c = intToPhred33(atoi(qualToks_[j].c_str()), solQuals_);
				assert_geq(c, 33);
				if (qualsRead >= trim5) {
					r.qual.append(c);
				}
				++qualsRead;
			}
		} // done reading integer quality lines
		if (charsRead > qualsRead) tooFewQualities(r.name);
	} else {
		// Non-integer qualities
		while((qualsRead < dstLen + trim5) && c >= 0) {
			c = fb_.get();
			c2 = c;
			if (c == ' ') wrongQualityFormat(r.name);
			if(c < 0) {
				// EOF occurred in the middle of a read - abort
				return -1;
			}
			if(!isspace(c) && c != upto && (upto2 == -1 || c != upto2)) {
				if (qualsRead >= trim5) {
					c = charToPhred33(c, solQuals_, phred64Quals_);
					assert_geq(c, 33);
					r.qual.append(c);
				}
				qualsRead++;
			} else {
				break;
			}
		}
	}
	if(r.qual.length() < (size_t)dstLen) {
		tooFewQualities(r.name);
	}
	// TODO: How to detect too many qualities??
	r.qual.resize(dstLen);
	while(c != -1 && c != upto && (upto2 == -1 || c != upto2)) {
		c = fb_.get();
		c2 = c;
	}
	return qualsRead;
}

/**
 * Read another pattern from a Qseq input file.
 */
bool QseqPatternSource::read(
	Read& r,
	TReadId& rdid,
	TReadId& endid,
	bool& success,
	bool& done)
{
	r.reset();
	r.color = gColor;
	success = true;
	done = false;
	readCnt_++;
	rdid = endid = readCnt_-1;
	peekOverNewline(fb_);
	fb_.resetLastN();
	// 1. Machine name
	if(parseName(r, NULL, true, true,  true, false, '\t') == -1) BAIL_UNPAIRED();
	assert_neq('\t', fb_.peek());
	r.name.append('_');
	// 2. Run number
	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
	assert_neq('\t', fb_.peek());
	r.name.append('_');
	// 3. Lane number
	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
	assert_neq('\t', fb_.peek());
	r.name.append('_');
	// 4. Tile number
	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
	assert_neq('\t', fb_.peek());
	r.name.append('_');
	// 5. X coordinate of spot
	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
	assert_neq('\t', fb_.peek());
	r.name.append('_');
	// 6. Y coordinate of spot
	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
	assert_neq('\t', fb_.peek());
	r.name.append('_');
	// 7. Index
	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
	assert_neq('\t', fb_.peek());
	r.name.append('/');
	// 8. Mate number
	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
	// Empty sequence??
	if(fb_.peek() == '\t') {
		// Get tab that separates seq from qual
		ASSERT_ONLY(int c =) fb_.get();
		assert_eq('\t', c);
		assert_eq('\t', fb_.peek());
		// Get tab that separates qual from filter
		ASSERT_ONLY(c =) fb_.get();
		assert_eq('\t', c);
		// Next char is first char of filter flag
		assert_neq('\t', fb_.peek());
		fb_.resetLastN();
		cerr << "Warning: skipping empty QSEQ read with name '" << r.name << "'" << endl;
	} else {
		assert_neq('\t', fb_.peek());
		int charsRead = 0;
		int mytrim5 = gTrim5;
		// 9. Sequence
		int dstLen = parseSeq(r, charsRead, mytrim5, '\t');
		assert_neq('\t', fb_.peek());
		if(dstLen < 0) BAIL_UNPAIRED();
		char ct = 0;
		// 10. Qualities
		if(parseQuals(r, charsRead, dstLen, mytrim5, ct, '\t', -1) < 0) BAIL_UNPAIRED();
		r.trimmed3 = gTrim3;
		r.trimmed5 = mytrim5;
		if(ct != '\t') {
			cerr << "Error: QSEQ with name " << r.name << " did not have tab after qualities" << endl;
			throw 1;
		}
		assert_eq(ct, '\t');
	}
	// 11. Filter flag
	int filt = fb_.get();
	if(filt == -1) BAIL_UNPAIRED();
	r.filter = filt;
	if(filt != '0' && filt != '1') {
		// Bad value for filt
	}
	if(fb_.peek() != -1 && fb_.peek() != '\n') {
		// Bad value right after the filt field
	}
	fb_.get();
	r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
	fb_.resetLastN();
	if(r.qual.length() < r.patFw.length()) {
		tooFewQualities(r.name);
	} else if(r.qual.length() > r.patFw.length()) {
		tooManyQualities(r.name);
	}
#ifndef NDEBUG
	assert_eq(r.patFw.length(), r.qual.length());
	for(size_t i = 0; i < r.qual.length(); i++) {
		assert_geq((int)r.qual[i], 33);
	}
#endif
	return true;
}
