/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef PAT_H_
#define PAT_H_

#include <cassert>
#include <cmath>
#include <stdexcept>
#include <vector>
#include <string>
#include <cstring>
#include <ctype.h>
#include <fstream>
#include "alphabet.h"
#include "assert_helpers.h"
#include "tokenize.h"
#include "random_source.h"
#include "threading.h"
#include "filebuf.h"
#include "qual.h"
#include "search_globals.h"
#include "sstring.h"
#include "ds.h"
#include "read.h"
#include "util.h"

extern bool threeN;
/**
 * Classes and routines for reading reads from various input sources.
 */

using namespace std;

/**
 * Calculate a per-read random seed based on a combination of
 * the read data (incl. sequence, name, quals) and the global
 * seed in '_randSeed'.
 */
static inline uint32_t genRandSeed(const BTDnaString& qry,
                                   const BTString& qual,
                                   const BTString& name,
                                   uint32_t seed)
{
	// Calculate a per-read random seed based on a combination of
	// the read data (incl. sequence, name, quals) and the global
	// seed
	uint32_t rseed = (seed + 101) * 59 * 61 * 67 * 71 * 73 * 79 * 83;
	size_t qlen = qry.length();
	// Throw all the characters of the read into the random seed
	for(size_t i = 0; i < qlen; i++) {
		int p = (int)qry[i];
		assert_leq(p, 4);
		size_t off = ((i & 15) << 1);
		rseed ^= (p << off);
	}
	// Throw all the quality values for the read into the random
	// seed
	for(size_t i = 0; i < qlen; i++) {
		int p = (int)qual[i];
		assert_leq(p, 255);
		size_t off = ((i & 3) << 3);
		rseed ^= (p << off);
	}
	// Throw all the characters in the read name into the random
	// seed
	size_t namelen = name.length();
	for(size_t i = 0; i < namelen; i++) {
		int p = (int)name[i];
		if(p == '/') break;
		assert_leq(p, 255);
		size_t off = ((i & 3) << 3);
		rseed ^= (p << off);
	}
	return rseed;
}

/**
 * Parameters affecting how reads and read in.
 */
struct PatternParams {
	PatternParams(
		int format_,
		bool fileParallel_,
		uint32_t seed_,
		bool useSpinlock_,
		bool solexa64_,
		bool phred64_,
		bool intQuals_,
		bool fuzzy_,
		int sampleLen_,
		int sampleFreq_,
		uint32_t skip_) :
		format(format_),
		fileParallel(fileParallel_),
		seed(seed_),
		useSpinlock(useSpinlock_),
		solexa64(solexa64_),
		phred64(phred64_),
		intQuals(intQuals_),
		fuzzy(fuzzy_),
		sampleLen(sampleLen_),
		sampleFreq(sampleFreq_),
		skip(skip_) { }

	int format;           // file format
	bool fileParallel;    // true -> wrap files with separate PairedPatternSources
	uint32_t seed;        // pseudo-random seed
	bool useSpinlock;     // use spin locks instead of pthreads
	bool solexa64;        // true -> qualities are on solexa64 scale
	bool phred64;         // true -> qualities are on phred64 scale
	bool intQuals;        // true -> qualities are space-separated numbers
	bool fuzzy;           // true -> try to parse fuzzy fastq
	int sampleLen;        // length of sampled reads for FastaContinuous...
	int sampleFreq;       // frequency of sampled reads for FastaContinuous...
	uint32_t skip;        // skip the first 'skip' patterns
};

/**
 * Encapsulates a synchronized source of patterns; usually a file.
 * Optionally reverses reads and quality strings before returning them,
 * though that is usually more efficiently done by the concrete
 * subclass.  Concrete subclasses should delimit critical sections with
 * calls to lock() and unlock().
 */
class PatternSource {

public:

	PatternSource(const PatternParams& p) :
		seed_(p.seed),
		readCnt_(0),
		numWrappers_(0),
		doLocking_(true),
		useSpinlock_(p.useSpinlock),
		mutex()
	{
	}

	virtual ~PatternSource() { }

	/**
	 * Call this whenever this PatternSource is wrapped by a new
	 * WrappedPatternSourcePerThread.  This helps us keep track of
	 * whether locks will be contended.
	 */
	void addWrapper() {
		lock();
		numWrappers_++;
		unlock();
	}
	
	/**
	 * The main member function for dispensing patterns.
	 *
	 * Returns true iff a pair was parsed succesfully.
	 */
	virtual bool nextReadPair(
		Read& ra,
		Read& rb,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done,
		bool& paired,
		bool fixName);

	/**
	 * The main member function for dispensing patterns.
	 */
	virtual bool nextRead(
		Read& r,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done);

	/**
	 * Implementation to be provided by concrete subclasses.  An
	 * implementation for this member is only relevant for formats that
	 * can read in a pair of reads in a single transaction with a
	 * single input source.  If paired-end input is given as a pair of
	 * parallel files, this member should throw an error and exit.
	 */
	virtual bool nextReadPairImpl(
		Read& ra,
		Read& rb,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done,
		bool& paired) = 0;

	/**
	 * Implementation to be provided by concrete subclasses.  An
	 * implementation for this member is only relevant for formats
	 * where individual input sources look like single-end-read
	 * sources, e.g., formats where paired-end reads are specified in
	 * parallel read files.
	 */
	virtual bool nextReadImpl(
		Read& r,
		TReadId& rdid, 
		TReadId& endid, 
		bool& success,
		bool& done) = 0;

	/// Reset state to start over again with the first read
	virtual void reset() { readCnt_ = 0; }

	/**
	 * Concrete subclasses call lock() to enter a critical region.
	 * What constitutes a critical region depends on the subclass.
	 */
	void lock() {
		if(!doLocking_) return; // no contention
        mutex.lock();
	}

	/**
	 * Concrete subclasses call unlock() to exit a critical region
	 * What constitutes a critical region depends on the subclass.
	 */
	void unlock() {
		if(!doLocking_) return; // no contention
        mutex.unlock();
	}

	/**
	 * Return a new dynamically allocated PatternSource for the given
	 * format, using the given list of strings as the filenames to read
	 * from or as the sequences themselves (i.e. if -c was used).
	 */
	static PatternSource* patsrcFromStrings(
		const PatternParams& p,
		const EList<string>& qs,
        size_t nthreads = 1);

	/**
	 * Return the number of reads attempted.
	 */
	TReadId readCnt() const { return readCnt_ - 1; }

    int paired_type; // 1 - left or unpaird, 2-right
//    int align_times = 0;

protected:

	uint32_t seed_;

	/// The number of reads read by this PatternSource
	TReadId readCnt_;

	int numWrappers_;      /// # threads that own a wrapper for this PatternSource
	bool doLocking_;       /// override whether to lock (true = don't override)
	/// User can ask to use the normal pthreads-style lock even if
	/// spinlocks is enabled and compiled in.  This is sometimes better
	/// if we expect bad I/O latency on some reads.
	bool useSpinlock_;
	MUTEX_T mutex;
};

/**
 * Abstract parent class for synhconized sources of paired-end reads
 * (and possibly also single-end reads).
 */
class PairedPatternSource {
public:
	PairedPatternSource(const PatternParams& p) : mutex_m(), seed_(p.seed) {}
	virtual ~PairedPatternSource() { }

	virtual void addWrapper() = 0;
	virtual void reset() = 0;
	
	virtual bool nextReadPair(
		Read& ra,
		Read& rb,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done,
		bool& paired,
		bool fixName) = 0;
	
	virtual pair<TReadId, TReadId> readCnt() const = 0;

	/**
	 * Lock this PairedPatternSource, usually because one of its shared
	 * fields is being updated.
	 */
	void lock() {
		mutex_m.lock();
	}

	/**
	 * Unlock this PairedPatternSource.
	 */
	void unlock() {
		mutex_m.unlock();
	}

	/**
	 * Given the values for all of the various arguments used to specify
	 * the read and quality input, create a list of pattern sources to
	 * dispense them.
	 */
	static PairedPatternSource* setupPatternSources(
		const EList<string>& si,    // singles, from argv
		const EList<string>& m1,    // mate1's, from -1 arg
		const EList<string>& m2,    // mate2's, from -2 arg
		const EList<string>& m12,   // both mates on each line, from --12 arg
#ifdef USE_SRA
        const EList<string>& sra_accs,
#endif
		const EList<string>& q,     // qualities associated with singles
		const EList<string>& q1,    // qualities associated with m1
		const EList<string>& q2,    // qualities associated with m2
		const PatternParams& p,     // read-in params
        size_t nthreads,
		bool verbose);              // be talkative?

protected:

	MUTEX_T mutex_m; /// mutex for syncing over critical regions
	uint32_t seed_;
};

/**
 * Encapsulates a synchronized source of both paired-end reads and
 * unpaired reads, where the paired-end must come from parallel files.
 */
class PairedSoloPatternSource : public PairedPatternSource {

public:

	PairedSoloPatternSource(
		const EList<PatternSource*>* src,
		const PatternParams& p) :
		PairedPatternSource(p),
		cur_(0),
		src_(src)
	{
		assert(src_ != NULL);
		for(size_t i = 0; i < src_->size(); i++) {
			assert((*src_)[i] != NULL);
		}
	}

	virtual ~PairedSoloPatternSource() { delete src_; }

	/**
	 * Call this whenever this PairedPatternSource is wrapped by a new
	 * WrappedPatternSourcePerThread.  This helps us keep track of
	 * whether locks within PatternSources will be contended.
	 */
	virtual void addWrapper() {
		for(size_t i = 0; i < src_->size(); i++) {
			(*src_)[i]->addWrapper();
		}
	}

	/**
	 * Reset this object and all the PatternSources under it so that
	 * the next call to nextReadPair gets the very first read pair.
	 */
	virtual void reset() {
		for(size_t i = 0; i < src_->size(); i++) {
			(*src_)[i]->reset();
		}
		cur_ = 0;
	}

	/**
	 * The main member function for dispensing pairs of reads or
	 * singleton reads.  Returns true iff ra and rb contain a new
	 * pair; returns false if ra contains a new unpaired read.
	 */
	virtual bool nextReadPair(
		Read& ra,
		Read& rb,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done,
		bool& paired,
		bool fixName);

	/**
	 * Return the number of reads attempted.
	 */
	virtual pair<TReadId, TReadId> readCnt() const {
		uint64_t ret = 0llu;
		for(size_t i = 0; i < src_->size(); i++) ret += (*src_)[i]->readCnt();
		return make_pair(ret, 0llu);
	}

protected:

	volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
	const EList<PatternSource*>* src_; /// PatternSources for paired-end reads
};

/**
 * Encapsulates a synchronized source of both paired-end reads and
 * unpaired reads, where the paired-end must come from parallel files.
 */
class PairedDualPatternSource : public PairedPatternSource {

public:

	PairedDualPatternSource(
		const EList<PatternSource*>* srca,
		const EList<PatternSource*>* srcb,
		const PatternParams& p) :
		PairedPatternSource(p), cur_(0), srca_(srca), srcb_(srcb)
	{
		assert(srca_ != NULL);
		assert(srcb_ != NULL);
		// srca_ and srcb_ must be parallel
		assert_eq(srca_->size(), srcb_->size());
		for(size_t i = 0; i < srca_->size(); i++) {
			// Can't have NULL first-mate sources.  Second-mate sources
			// can be NULL, in the case when the corresponding first-
			// mate source is unpaired.
			assert((*srca_)[i] != NULL);
			for(size_t j = 0; j < srcb_->size(); j++) {
				assert_neq((*srca_)[i], (*srcb_)[j]);
			}
		}
	}

	virtual ~PairedDualPatternSource() {
		delete srca_;
		delete srcb_;
	}

	/**
	 * Call this whenever this PairedPatternSource is wrapped by a new
	 * WrappedPatternSourcePerThread.  This helps us keep track of
	 * whether locks within PatternSources will be contended.
	 */
	virtual void addWrapper() {
		for(size_t i = 0; i < srca_->size(); i++) {
			(*srca_)[i]->addWrapper();
			if((*srcb_)[i] != NULL) {
				(*srcb_)[i]->addWrapper();
			}
		}
	}

	/**
	 * Reset this object and all the PatternSources under it so that
	 * the next call to nextReadPair gets the very first read pair.
	 */
	virtual void reset() {
		for(size_t i = 0; i < srca_->size(); i++) {
			(*srca_)[i]->reset();
			if((*srcb_)[i] != NULL) {
				(*srcb_)[i]->reset();
			}
		}
		cur_ = 0;
	}

	/**
	 * The main member function for dispensing pairs of reads or
	 * singleton reads.  Returns true iff ra and rb contain a new
	 * pair; returns false if ra contains a new unpaired read.
	 */
	virtual bool nextReadPair(
		Read& ra,
		Read& rb,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done,
		bool& paired,
		bool fixName);
	
	/**
	 * Return the number of reads attempted.
	 */
	virtual pair<TReadId, TReadId> readCnt() const;

protected:

	volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
	const EList<PatternSource*>* srca_; /// PatternSources for 1st mates and/or unpaired reads
	const EList<PatternSource*>* srcb_; /// PatternSources for 2nd mates
};

/**
 * Encapsulates a single thread's interaction with the PatternSource.
 * Most notably, this class holds the buffers into which the
 * PatterSource will write sequences.  This class is *not* threadsafe
 * - it doesn't need to be since there's one per thread.  PatternSource
 * is thread-safe.
 */
class PatternSourcePerThread {

public:

	PatternSourcePerThread() :
		buf1_(), buf2_(), rdid_(0xffffffff), endid_(0xffffffff) { }

	virtual ~PatternSourcePerThread() { }

	/**
	 * change 3N plan for both mate1 and mate2
	 */
	void changePlan3N(int mappingCycle) {
        buf1_.changePlan3N(mappingCycle);
        buf2_.changePlan3N(3-mappingCycle);
	}

	/**
	 * Read the next read pair.
	 */
	virtual bool nextReadPair(
		bool& success,
		bool& done,
		bool& paired,
		bool fixName)
	{
		return success;
	}

	Read& bufa()             { return buf1_;    }	
	Read& bufb()             { return buf2_;    }
	const Read& bufa() const { return buf1_;    }
	const Read& bufb() const { return buf2_;    }

	TReadId       rdid()  const { return rdid_;  }
	TReadId       endid() const { return endid_; }
	virtual void  reset()       { rdid_ = endid_ = 0xffffffff;  }
	
	/**
	 * Return the length of mate 1 or mate 2.
	 */
	size_t length(int mate) const {
		return (mate == 1) ? buf1_.length() : buf2_.length();
	}

protected:

	Read  buf1_;    // read buffer for mate a
	Read  buf2_;    // read buffer for mate b
	TReadId rdid_;  // index of read just read
	TReadId endid_; // index of read just read
};

/**
 * Abstract parent factory for PatternSourcePerThreads.
 */
class PatternSourcePerThreadFactory {
public:
	virtual ~PatternSourcePerThreadFactory() { }
	virtual PatternSourcePerThread* create() const = 0;
	virtual EList<PatternSourcePerThread*>* create(uint32_t n) const = 0;

	/// Free memory associated with a pattern source
	virtual void destroy(PatternSourcePerThread* patsrc) const {
		assert(patsrc != NULL);
		// Free the PatternSourcePerThread
		delete patsrc;
	}

	/// Free memory associated with a pattern source list
	virtual void destroy(EList<PatternSourcePerThread*>* patsrcs) const {
		assert(patsrcs != NULL);
		// Free all of the PatternSourcePerThreads
		for(size_t i = 0; i < patsrcs->size(); i++) {
			if((*patsrcs)[i] != NULL) {
				delete (*patsrcs)[i];
				(*patsrcs)[i] = NULL;
			}
		}
		// Free the vector
		delete patsrcs;
	}
};

/**
 * A per-thread wrapper for a PairedPatternSource.
 */
class WrappedPatternSourcePerThread : public PatternSourcePerThread {
public:
	WrappedPatternSourcePerThread(PairedPatternSource& __patsrc) :
		patsrc_(__patsrc)
	{
		patsrc_.addWrapper();
	}

	/**
	 * Get the next paired or unpaired read from the wrapped
	 * PairedPatternSource.
	 */
	virtual bool nextReadPair(
		bool& success,
		bool& done,
		bool& paired,
		bool fixName);

private:

	/// Container for obtaining paired reads from PatternSources
	PairedPatternSource& patsrc_;
};

/**
 * Abstract parent factory for PatternSourcePerThreads.
 */
class WrappedPatternSourcePerThreadFactory : public PatternSourcePerThreadFactory {
public:
	WrappedPatternSourcePerThreadFactory(PairedPatternSource& patsrc) :
		patsrc_(patsrc) { }

	/**
	 * Create a new heap-allocated WrappedPatternSourcePerThreads.
	 */
	virtual PatternSourcePerThread* create() const {
		return new WrappedPatternSourcePerThread(patsrc_);
	}

	/**
	 * Create a new heap-allocated vector of heap-allocated
	 * WrappedPatternSourcePerThreads.
	 */
	virtual EList<PatternSourcePerThread*>* create(uint32_t n) const {
		EList<PatternSourcePerThread*>* v = new EList<PatternSourcePerThread*>;
		for(size_t i = 0; i < n; i++) {
			v->push_back(new WrappedPatternSourcePerThread(patsrc_));
			assert(v->back() != NULL);
		}
		return v;
	}

private:
	/// Container for obtaining paired reads from PatternSources
	PairedPatternSource& patsrc_;
};

/// Skip to the end of the current string of newline chars and return
/// the first character after the newline chars, or -1 for EOF
static inline int getOverNewline(FileBuf& in) {
	int c;
	while(isspace(c = in.get()));
	return c;
}

/// Skip to the end of the current string of newline chars such that
/// the next call to get() returns the first character after the
/// whitespace
static inline int peekOverNewline(FileBuf& in) {
	while(true) {
		int c = in.peek();
		if(c != '\r' && c != '\n') {
			return c;
		}
		in.get();
	}
}

/// Skip to the end of the current line; return the first character
/// of the next line or -1 for EOF
static inline int getToEndOfLine(FileBuf& in) {
	while(true) {
		int c = in.get(); if(c < 0) return -1;
		if(c == '\n' || c == '\r') {
			while(c == '\n' || c == '\r') {
				c = in.get(); if(c < 0) return -1;
			}
			// c now holds first character of next line
			return c;
		}
	}
}

/// Skip to the end of the current line such that the next call to
/// get() returns the first character on the next line
static inline int peekToEndOfLine(FileBuf& in) {
	while(true) {
		int c = in.get(); if(c < 0) return c;
		if(c == '\n' || c == '\r') {
			c = in.peek();
			while(c == '\n' || c == '\r') {
				in.get(); if(c < 0) return c; // consume \r or \n
				c = in.peek();
			}
			// next get() gets first character of next line
			return c;
		}
	}
}

extern void wrongQualityFormat(const BTString& read_name);
extern void tooFewQualities(const BTString& read_name);
extern void tooManyQualities(const BTString& read_name);

/**
 * Encapsulates a source of patterns which is an in-memory vector.
 */
class VectorPatternSource : public PatternSource {

public:

	VectorPatternSource(
		const EList<string>& v,
		const PatternParams& p);
	
	virtual ~VectorPatternSource() { }
	
	virtual bool nextReadImpl(
		Read& r,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done);
	
	/**
	 * This is unused, but implementation is given for completeness.
	 */
	virtual bool nextReadPairImpl(
		Read& ra,
		Read& rb,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done,
		bool& paired);
	
	virtual void reset() {
		PatternSource::reset();
		cur_ = skip_;
		paired_ = false;
	}
	
private:

	size_t cur_;
	uint32_t skip_;
	bool paired_;
	EList<BTDnaString> v_;  // forward sequences
	EList<BTString> quals_; // forward qualities
	EList<BTString> names_; // names
	EList<int> trimmed3_;   // names
	EList<int> trimmed5_;   // names
};

/**
 *
 */
class BufferedFilePatternSource : public PatternSource {
public:
	BufferedFilePatternSource(
		const EList<string>& infiles,
		const PatternParams& p) :
		PatternSource(p),
		infiles_(infiles),
		filecur_(0),
		fb_(),
		skip_(p.skip),
		first_(true)
	{
		assert_gt(infiles.size(), 0);
		errs_.resize(infiles_.size());
		errs_.fill(0, infiles_.size(), false);
		assert(!fb_.isOpen());
		open(); // open first file in the list
		filecur_++;
	}

	virtual ~BufferedFilePatternSource() {
		if(fb_.isOpen()) fb_.close();
	}

	/**
	 * Fill Read with the sequence, quality and name for the next
	 * read in the list of read files.  This function gets called by
	 * all the search threads, so we must handle synchronization.
	 */
	virtual bool nextReadImpl(
		Read& r,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done)
	{
		// We'll be manipulating our file handle/filecur_ state
		lock();
		while(true) {
			do { read(r, rdid, endid, success, done); }
			while(!success && !done);
			if(!success && filecur_ < infiles_.size()) {
				assert(done);
				open();
				resetForNextFile(); // reset state to handle a fresh file
				filecur_++;
				continue;
			}
			break;
		}
		assert(r.repOk());
		// Leaving critical region
		unlock();
		return success;
	}
	
	/**
	 *
	 */
	virtual bool nextReadPairImpl(
		Read& ra,
		Read& rb,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done,
		bool& paired)
	{
		// We'll be manipulating our file handle/filecur_ state
		lock();
		while(true) {
			do { readPair(ra, rb, rdid, endid, success, done, paired); }
			while(!success && !done);
			if(!success && filecur_ < infiles_.size()) {
				assert(done);
				open();
				resetForNextFile(); // reset state to handle a fresh file
				filecur_++;
				continue;
			}
			break;
		}
		assert(ra.repOk());
		assert(rb.repOk());
		// Leaving critical region
		unlock();
		return success;
	}
	
	/**
	 * Reset state so that we read start reading again from the
	 * beginning of the first file.  Should only be called by the
	 * master thread.
	 */
	virtual void reset() {
		PatternSource::reset();
        filecur_ = 0;
		open();
		filecur_++;
	}

protected:

	/// Read another pattern from the input file; this is overridden
	/// to deal with specific file formats
	virtual bool read(
		Read& r,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done) = 0;
	
	/// Read another pattern pair from the input file; this is
	/// overridden to deal with specific file formats
	virtual bool readPair(
		Read& ra,
		Read& rb,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done,
		bool& paired) = 0;
	
	/// Reset state to handle a fresh file
	virtual void resetForNextFile() { }
	
	void open() {
		if(fb_.isOpen()) fb_.close();
		while(filecur_ < infiles_.size()) {
			// Open read
			FILE *in;
			if(infiles_[filecur_] == "-") {
				in = stdin;
			} else if((in = fopen(infiles_[filecur_].c_str(), "rb")) == NULL) {
				if(!errs_[filecur_]) {
					cerr << "Warning: Could not open read file \"" << infiles_[filecur_].c_str() << "\" for reading; skipping..." << endl;
					errs_[filecur_] = true;
				}
				filecur_++;
				continue;
			}
			fb_.newFile(in);
			return;
		}
		cerr << "Error: No input read files were valid" << endl;
		exit(1);
		return;
	}
	
	EList<string> infiles_;  // filenames for read files
	EList<bool> errs_;       // whether we've already printed an error for each file
	size_t filecur_;         // index into infiles_ of next file to read
	FileBuf fb_;             // read file currently being read from
	TReadId skip_;           // number of reads to skip
	bool first_;
};

/**
 * Parse a single quality string from fb and store qualities in r.
 * Assume the next character obtained via fb.get() is the first
 * character of the quality string.  When returning, the next
 * character returned by fb.peek() or fb.get() should be the first
 * character of the following line.
 */
int parseQuals(
	Read& r,
	FileBuf& fb,
	int firstc,
	int readLen,
	int trim3,
	int trim5,
	bool intQuals,
	bool phred64,
	bool solexa64);

/**
 * Synchronized concrete pattern source for a list of FASTA or CSFASTA
 * (if color = true) files.
 */
class FastaPatternSource : public BufferedFilePatternSource {
public:
	FastaPatternSource(const EList<string>& infiles,
	                   const PatternParams& p) :
		BufferedFilePatternSource(infiles, p),
		first_(true), solexa64_(p.solexa64), phred64_(p.phred64), intQuals_(p.intQuals)
	{ }
	virtual void reset() {
		first_ = true;
		BufferedFilePatternSource::reset();
	}
protected:
	/**
	 * Scan to the next FASTA record (starting with >) and return the first
	 * character of the record (which will always be >).
	 */
	static int skipToNextFastaRecord(FileBuf& in) {
		int c;
		while((c = in.get()) != '>') {
			if(in.eof()) return -1;
		}
		return c;
	}

	/// Called when we have to bail without having parsed a read.
	void bail(Read& r) {
		r.reset();
		fb_.resetLastN();
	}

	/// Read another pattern from a FASTA input file
	virtual bool read(
		Read& r,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done);
	
	/// Read another pair of patterns from a FASTA input file
	virtual bool readPair(
		Read& ra,
		Read& rb,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done,
		bool& paired)
	{
		// (For now, we shouldn't ever be here)
		cerr << "In FastaPatternSource.readPair()" << endl;
		throw 1;
		return false;
	}
	
	virtual void resetForNextFile() {
		first_ = true;
	}
	
private:
	bool first_;
    
public:
	bool solexa64_;
	bool phred64_;
	bool intQuals_;
};


/**
 * Tokenize a line of space-separated integer quality values.
 */
static inline bool tokenizeQualLine(
	FileBuf& filebuf,
	char *buf,
	size_t buflen,
	EList<string>& toks)
{
	size_t rd = filebuf.gets(buf, buflen);
	if(rd == 0) return false;
	assert(NULL == strrchr(buf, '\n'));
	tokenize(string(buf), " ", toks);
	return true;
}

/**
 * Synchronized concrete pattern source for a list of files with tab-
 * delimited name, seq, qual fields (or, for paired-end reads,
 * basename, seq1, qual1, seq2, qual2).
 */
class TabbedPatternSource : public BufferedFilePatternSource {

public:

	TabbedPatternSource(
		const EList<string>& infiles,
		const PatternParams& p,
		bool  secondName) :
		BufferedFilePatternSource(infiles, p),
		solQuals_(p.solexa64),
		phred64Quals_(p.phred64),
		intQuals_(p.intQuals),
		secondName_(secondName) { }

protected:

	/// Read another pattern from a FASTA input file
	virtual bool read(
		Read& r,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done);

	/// Read another pair of patterns from a FASTA input file
	virtual bool readPair(
		Read& ra,
		Read& rb,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done,
		bool& paired);
	
private:

	/**
	 * Parse a name from fb_ and store in r.  Assume that the next
	 * character obtained via fb_.get() is the first character of
	 * the sequence and the string stops at the next char upto (could
	 * be tab, newline, etc.).
	 */
	int parseName(Read& r, Read* r2, char upto = '\t');

	/**
	 * Parse a single sequence from fb_ and store in r.  Assume
	 * that the next character obtained via fb_.get() is the first
	 * character of the sequence and the sequence stops at the next
	 * char upto (could be tab, newline, etc.).
	 */
	int parseSeq(Read& r, int& charsRead, int& trim5, char upto = '\t');

	/**
	 * Parse a single quality string from fb_ and store in r.
	 * Assume that the next character obtained via fb_.get() is
	 * the first character of the quality string and the string stops
	 * at the next char upto (could be tab, newline, etc.).
	 */
	int parseQuals(Read& r, int charsRead, int dstLen, int trim5,
	               char& c2, char upto = '\t', char upto2 = -1);

	bool solQuals_;
	bool phred64Quals_;
	bool intQuals_;
	EList<string> qualToks_;
	bool secondName_;
};

/**
 * Synchronized concrete pattern source for Illumina Qseq files.  In
 * Qseq files, each read appears on a separate line and the tab-
 * delimited fields are:
 *
 * 1. Machine name
 * 2. Run number
 * 3. Lane number
 * 4. Tile number
 * 5. X coordinate of spot
 * 6. Y coordinate of spot
 * 7. Index: "Index sequence or 0. For no indexing, or for a file that
 *    has not been demultiplexed yet, this field should have a value of
 *    0."
 * 8. Read number: 1 for unpaired, 1 or 2 for paired
 * 9. Sequence
 * 10. Quality
 * 11. Filter: 1 = passed, 0 = didn't
 */
class QseqPatternSource : public BufferedFilePatternSource {

public:

	QseqPatternSource(
		const EList<string>& infiles,
	    const PatternParams& p) :
		BufferedFilePatternSource(infiles, p),
		solQuals_(p.solexa64),
		phred64Quals_(p.phred64),
		intQuals_(p.intQuals) { }

protected:

#define BAIL_UNPAIRED() { \
	peekOverNewline(fb_); \
	r.reset(); \
	success = false; \
	done = true; \
	return success; \
}

	/**
	 * Parse a name from fb_ and store in r.  Assume that the next
	 * character obtained via fb_.get() is the first character of
	 * the sequence and the string stops at the next char upto (could
	 * be tab, newline, etc.).
	 */
	int parseName(
		Read& r,      // buffer for mate 1
		Read* r2,     // buffer for mate 2 (NULL if mate2 is read separately)
		bool append,     // true -> append characters, false -> skip them
		bool clearFirst, // clear the name buffer first
		bool warnEmpty,  // emit a warning if nothing was added to the name
		bool useDefault, // if nothing is read, put readCnt_ as a default value
		int upto);       // stop parsing when we first reach character 'upto'

	/**
	 * Parse a single sequence from fb_ and store in r.  Assume
	 * that the next character obtained via fb_.get() is the first
	 * character of the sequence and the sequence stops at the next
	 * char upto (could be tab, newline, etc.).
	 */
	int parseSeq(
		Read& r,      // buffer for read
		int& charsRead,
		int& trim5,
		char upto);

	/**
	 * Parse a single quality string from fb_ and store in r.
	 * Assume that the next character obtained via fb_.get() is
	 * the first character of the quality string and the string stops
	 * at the next char upto (could be tab, newline, etc.).
	 */
	int parseQuals(
		Read& r,      // buffer for read
		int charsRead,
		int dstLen,
		int trim5,
		char& c2,
		char upto,
		char upto2);

	/**
	 * Read another pattern from a Qseq input file.
	 */
	virtual bool read(
		Read& r,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done);

	/**
	 * Read a pair of patterns from 1 Qseq file.  Note: this is never used.
	 */
	virtual bool readPair(
		Read& ra,
		Read& rb,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done,
		bool& paired)
	{
		// (For now, we shouldn't ever be here)
		cerr << "In QseqPatternSource.readPair()" << endl;
		throw 1;
		return false;
	}

	bool solQuals_;
	bool phred64Quals_;
	bool intQuals_;
	EList<string> qualToks_;
};

/**
 * Synchronized concrete pattern source for a list of FASTA files where
 * reads need to be extracted from long continuous sequences.
 */
class FastaContinuousPatternSource : public BufferedFilePatternSource {
public:
	FastaContinuousPatternSource(const EList<string>& infiles, const PatternParams& p) :
		BufferedFilePatternSource(infiles, p),
		length_(p.sampleLen), freq_(p.sampleFreq),
		eat_(length_-1), beginning_(true),
		bufCur_(0), subReadCnt_(0llu)
	{
		resetForNextFile();
	}

	virtual void reset() {
		BufferedFilePatternSource::reset();
		resetForNextFile();
	}

protected:

	/// Read another pattern from a FASTA input file
	virtual bool read(
		Read& r,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done)
	{
		success = true;
		done = false;
		r.reset();
		while(true) {
			r.color = gColor;
			int c = fb_.get();
			if(c < 0) { success = false; done = true; return success; }
			if(c == '>') {
				resetForNextFile();
				c = fb_.peek();
				bool sawSpace = false;
				while(c != '\n' && c != '\r') {
					if(!sawSpace) {
						sawSpace = isspace(c);
					}
					if(!sawSpace) {
						nameBuf_.append(c);
					}
					fb_.get();
					c = fb_.peek();
				}
				while(c == '\n' || c == '\r') {
					fb_.get();
					c = fb_.peek();
				}
				nameBuf_.append('_');
			} else {
				int cat = asc2dnacat[c];
				if(cat >= 2) c = 'N';
				if(cat == 0) {
					// Encountered non-DNA, non-IUPAC char; skip it
					continue;
				} else {
					// DNA char
					buf_[bufCur_++] = c;
					if(bufCur_ == 1024) bufCur_ = 0;
					if(eat_ > 0) {
						eat_--;
						// Try to keep readCnt_ aligned with the offset
						// into the reference; that lets us see where
						// the sampling gaps are by looking at the read
						// name
						if(!beginning_) readCnt_++;
						continue;
					}
					for(size_t i = 0; i < length_; i++) {
						if(length_ - i <= bufCur_) {
							c = buf_[bufCur_ - (length_ - i)];
						} else {
							// Rotate
							c = buf_[bufCur_ - (length_ - i) + 1024];
						}
						r.patFw.append(asc2dna[c]);
						r.qual.append('I');
					}
					// Set up a default name if one hasn't been set
					r.name = nameBuf_;
					char cbuf[20];
					itoa10<TReadId>(readCnt_ - subReadCnt_, cbuf);
					r.name.append(cbuf);
					eat_ = freq_-1;
					readCnt_++;
					beginning_ = false;
					rdid = endid = readCnt_-1;
					break;
				}
			}
		}
		return true;
	}
	
	/// Shouldn't ever be here; it's not sensible to obtain read pairs
	// from a continuous input.
	virtual bool readPair(
		Read& ra,
		Read& rb,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done,
		bool& paired)
	{
		cerr << "In FastaContinuousPatternSource.readPair()" << endl;
		throw 1;
		return false;
	}

	/**
	 * Reset state to be read for the next file.
	 */
	virtual void resetForNextFile() {
		eat_ = length_-1;
		beginning_ = true;
		bufCur_ = 0;
		nameBuf_.clear();
		subReadCnt_ = readCnt_;
	}

private:
	size_t length_;     /// length of reads to generate
	size_t freq_;       /// frequency to sample reads
	size_t eat_;        /// number of characters we need to skip before
	                    /// we have flushed all of the ambiguous or
	                    /// non-existent characters out of our read
	                    /// window
	bool beginning_;    /// skipping over the first read length?
	char buf_[1024];    /// read buffer
	BTString nameBuf_;  /// read buffer for name of fasta record being
	                    /// split into mers
	size_t bufCur_;     /// buffer cursor; points to where we should
	                    /// insert the next character
	uint64_t subReadCnt_;/// number to subtract from readCnt_ to get
	                    /// the pat id to output (so it resets to 0 for
	                    /// each new sequence)
};

/**
 * Read a FASTQ-format file.
 * See: http://maq.sourceforge.net/fastq.shtml
 */
class FastqPatternSource : public BufferedFilePatternSource {

public:

	FastqPatternSource(const EList<string>& infiles, const PatternParams& p) :
		BufferedFilePatternSource(infiles, p),
		first_(true),
		solQuals_(p.solexa64),
		phred64Quals_(p.phred64),
		intQuals_(p.intQuals),
		fuzzy_(p.fuzzy)
	{ }
	
	virtual void reset() {
		first_ = true;
		fb_.resetLastN();
		BufferedFilePatternSource::reset();
	}
	
protected:

	/**
	 * Scan to the next FASTQ record (starting with @) and return the first
	 * character of the record (which will always be @).  Since the quality
	 * line may start with @, we keep scanning until we've seen a line
	 * beginning with @ where the line two lines back began with +.
	 */
	static int skipToNextFastqRecord(FileBuf& in, bool sawPlus) {
		int line = 0;
		int plusLine = -1;
		int c = in.get();
		int firstc = c;
		while(true) {
			if(line > 20) {
				// If we couldn't find our desired '@' in the first 20
				// lines, it's time to give up
				if(firstc == '>') {
					// That firstc is '>' may be a hint that this is
					// actually a FASTA file, so return it intact
					return '>';
				}
				// Return an error
				return -1;
			}
			if(c == -1) return -1;
			if(c == '\n') {
				c = in.get();
				if(c == '@' && sawPlus && plusLine == (line-2)) {
					return '@';
				}
				else if(c == '+') {
					// Saw a '+' at the beginning of a line; remember where
					// we saw it
					sawPlus = true;
					plusLine = line;
				}
				else if(c == -1) {
					return -1;
				}
				line++;
			}
			c = in.get();
		}
	}

	/// Read another pattern from a FASTQ input file
	virtual bool read(
		Read& r,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done);
	
	/// Read another read pair from a FASTQ input file
	virtual bool readPair(
		Read& ra,
		Read& rb,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done,
		bool& paired)
	{
		// (For now, we shouldn't ever be here)
		cerr << "In FastqPatternSource.readPair()" << endl;
		throw 1;
		return false;
	}
	
	virtual void resetForNextFile() {
		first_ = true;
	}
	
private:

	/**
	 * Do things we need to do if we have to bail in the middle of a
	 * read, usually because we reached the end of the input without
	 * finishing.
	 */
	void bail(Read& r) {
		r.patFw.clear();
		fb_.resetLastN();
	}

	bool first_;
	bool solQuals_;
	bool phred64Quals_;
	bool intQuals_;
	bool fuzzy_;
	EList<string> qualToks_;
};

/**
 * Read a Raw-format file (one sequence per line).  No quality strings
 * allowed.  All qualities are assumed to be 'I' (40 on the Phred-33
 * scale).
 */
class RawPatternSource : public BufferedFilePatternSource {

public:

	RawPatternSource(const EList<string>& infiles, const PatternParams& p) :
		BufferedFilePatternSource(infiles, p), first_(true) { }

	virtual void reset() {
		first_ = true;
		BufferedFilePatternSource::reset();
	}

protected:

	/// Read another pattern from a Raw input file
	virtual bool read(
		Read& r,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done)
	{
		int c;
		success = true;
		done = false;
		r.reset();
		c = getOverNewline(this->fb_);
		if(c < 0) {
			bail(r); success = false; done = true; return success;
		}
		assert(!isspace(c));
		r.color = gColor;
		int mytrim5 = gTrim5;
		if(first_) {
			// Check that the first character is sane for a raw file
			int cc = c;
			if(gColor) {
				if(cc >= '0' && cc <= '4') cc = "ACGTN"[(int)cc - '0'];
				if(cc == '.') cc = 'N';
			}
			if(asc2dnacat[cc] == 0) {
				cerr << "Error: reads file does not look like a Raw file" << endl;
				if(c == '>') {
					cerr << "Reads file looks like a FASTA file; please use -f" << endl;
				}
				if(c == '@') {
					cerr << "Reads file looks like a FASTQ file; please use -q" << endl;
				}
				throw 1;
			}
			first_ = false;
		}
		if(gColor) {
			// This may be a primer character.  If so, keep it in the
			// 'primer' field of the read buf and parse the rest of the
			// read without it.
			c = toupper(c);
			if(asc2dnacat[c] > 0) {
				// First char is a DNA char
				int c2 = toupper(fb_.peek());
				// Second char is a color char
				if(asc2colcat[c2] > 0) {
					r.primer = c;
					r.trimc = c2;
					mytrim5 += 2; // trim primer and first color
				}
			}
			if(c < 0) {
				bail(r); success = false; done = true; return success;
			}
		}
		// _in now points just past the first character of a sequence
		// line, and c holds the first character
		int chs = 0;
		while(!isspace(c) && c >= 0) {
			if(gColor) {
				if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
				if(c == '.') c = 'N';
			}
			// 5' trimming
			if(isalpha(c) && chs >= mytrim5) {
				//size_t len = chs - mytrim5;
				//if(len >= 1024) tooManyQualities(BTString("(no name)"));
				r.patFw.append(asc2dna[c]);
				r.qual.append('I');
			}
			chs++;
			if(isspace(fb_.peek())) break;
			c = fb_.get();
		}
		// 3' trimming
		r.patFw.trimEnd(gTrim3);
		r.qual.trimEnd(gTrim3);
		c = peekToEndOfLine(fb_);
		r.trimmed3 = gTrim3;
		r.trimmed5 = mytrim5;
		r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
		fb_.resetLastN();

		// Set up name
		char cbuf[20];
		itoa10<TReadId>(readCnt_, cbuf);
		r.name.install(cbuf);
		readCnt_++;

		rdid = endid = readCnt_-1;
		return success;
	}
	
	/// Read another read pair from a FASTQ input file
	virtual bool readPair(
		Read& ra,
		Read& rb,
		TReadId& rdid,
		TReadId& endid,
		bool& success,
		bool& done,
		bool& paired)
	{
		// (For now, we shouldn't ever be here)
		cerr << "In RawPatternSource.readPair()" << endl;
		throw 1;
		return false;
	}
	
	virtual void resetForNextFile() {
		first_ = true;
	}
	
private:

	/**
	 * Do things we need to do if we have to bail in the middle of a
	 * read, usually because we reached the end of the input without
	 * finishing.
	 */
	void bail(Read& r) {
		r.patFw.clear();
		fb_.resetLastN();
	}
	
	bool first_;
};

#ifdef USE_SRA

namespace ngs {
    class ReadCollection;
    class ReadIterator;
}

namespace tthread {
    class thread;
};

struct SRA_Data;

/**
 *
 */
class SRAPatternSource : public PatternSource {
public:
    SRAPatternSource(
                     const EList<string>& sra_accs,
                     const PatternParams& p,
                     const size_t nthreads = 1) :
    PatternSource(p),
    sra_accs_(sra_accs),
    sra_acc_cur_(0),
    skip_(p.skip),
    first_(true),
    nthreads_(nthreads),
    sra_run_(NULL),
    sra_it_(NULL),
    sra_data_(NULL),
    io_thread_(NULL)
    {
        assert_gt(sra_accs_.size(), 0);
        errs_.resize(sra_accs_.size());
        errs_.fill(0, sra_accs_.size(), false);
        open(); // open first file in the list
        sra_acc_cur_++;
    }
    
    virtual ~SRAPatternSource();
    
    /**
     * Fill Read with the sequence, quality and name for the next
     * read in the list of read files.  This function gets called by
     * all the search threads, so we must handle synchronization.
     */
    virtual bool nextReadImpl(
                              Read& r,
                              TReadId& rdid,
                              TReadId& endid,
                              bool& success,
                              bool& done)
    {
        // We'll be manipulating our file handle/filecur_ state
        lock();
        while(true) {
            do { read(r, rdid, endid, success, done); }
            while(!success && !done);
            if(!success && sra_acc_cur_ < sra_accs_.size()) {
                assert(done);
                open();
                resetForNextFile(); // reset state to handle a fresh file
                sra_acc_cur_++;
                continue;
            }
            break;
        }
        assert(r.repOk());
        // Leaving critical region
        unlock();
        return success;
    }
    
    /**
     *
     */
    virtual bool nextReadPairImpl(
                                  Read& ra,
                                  Read& rb,
                                  TReadId& rdid,
                                  TReadId& endid,
                                  bool& success,
                                  bool& done,
                                  bool& paired)
    {
        // We'll be manipulating our file handle/filecur_ state
        lock();
        while(true) {
            do { readPair(ra, rb, rdid, endid, success, done, paired); }
            while(!success && !done);
            if(!success && sra_acc_cur_ < sra_accs_.size()) {
                assert(done);
                open();
                resetForNextFile(); // reset state to handle a fresh file
                sra_acc_cur_++;
                continue;
            }
            break;
        }
        assert(ra.repOk());
        assert(rb.repOk());
        // Leaving critical region
        unlock();
        return success;
    }
    
    /**
     * Reset state so that we read start reading again from the
     * beginning of the first file.  Should only be called by the
     * master thread.
     */
    virtual void reset() {
        PatternSource::reset();
        sra_acc_cur_ = 0,
        open();
        sra_acc_cur_++;
    }
    
    /// Read another pattern from the input file; this is overridden
    /// to deal with specific file formats
    virtual bool read(
                      Read& r,
                      TReadId& rdid,
                      TReadId& endid,
                      bool& success,
                      bool& done)
    {
        return true;
    }
    
    /// Read another pattern pair from the input file; this is
    /// overridden to deal with specific file formats
    virtual bool readPair(
                          Read& ra,
                          Read& rb,
                          TReadId& rdid,
                          TReadId& endid,
                          bool& success,
                          bool& done,
                          bool& paired);
    
protected:
    
    /// Reset state to handle a fresh file
    virtual void resetForNextFile() { }
    
    void open();
    
    EList<string> sra_accs_; // filenames for read files
    EList<bool> errs_;       // whether we've already printed an error for each file
    size_t sra_acc_cur_;     // index into infiles_ of next file to read
    TReadId skip_;           // number of reads to skip
    bool first_;
    
    size_t nthreads_;
    
    ngs::ReadCollection* sra_run_;
    ngs::ReadIterator* sra_it_;
    
    SRA_Data* sra_data_;
    tthread::thread* io_thread_;    
};

#endif

#endif /*PAT_H_*/
