/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "aligner_bt.h"
#include "mask.h"

using namespace std;

#define CHECK_ROW_COL(rowc, colc) \
	if(rowc >= 0 && colc >= 0) { \
		if(!sawcell_[colc].insert(rowc)) { \
			/* was already in there */ \
			abort = true; \
			return; \
		} \
		assert(local || prob_.cper_->debugCell(rowc, colc, hefc)); \
	}

/**
 * Fill in a triangle of the DP table and backtrace from the given cell to
 * a cell in the previous checkpoint, or to the terminal cell.
 */
void BtBranchTracer::triangleFill(
	int64_t rw,          // row of cell to backtrace from
	int64_t cl,          // column of cell to backtrace from
	int hef,             // cell to backtrace from is H (0), E (1), or F (2)
	TAlScore targ,       // score of cell to backtrace from
	TAlScore targ_final, // score of alignment we're looking for
	RandomSource& rnd,   // pseudo-random generator
	int64_t& row_new,    // out: row we ended up in after backtrace
	int64_t& col_new,    // out: column we ended up in after backtrace
	int& hef_new,        // out: H/E/F after backtrace
	TAlScore& targ_new,  // out: score up to cell we ended up in
	bool& done,          // out: finished tracing out an alignment?
	bool& abort)         // out: aborted b/c cell was seen before?
{
	assert_geq(rw, 0);
	assert_geq(cl, 0);
	assert_range(0, 2, hef);
	assert_lt(rw, (int64_t)prob_.qrylen_);
	assert_lt(cl, (int64_t)prob_.reflen_);
	assert(prob_.usecp_ && prob_.fill_);
	int64_t row = rw, col = cl;
	const int64_t colmin = 0;
	const int64_t rowmin = 0;
	const int64_t colmax = prob_.reflen_ - 1;
	const int64_t rowmax = prob_.qrylen_ - 1;
	assert_leq(prob_.reflen_, (TRefOff)sawcell_.size());
	assert_leq(col, (int64_t)prob_.cper_->hicol());
	assert_geq(col, (int64_t)prob_.cper_->locol());
	assert_geq(prob_.cper_->per(), 2);
	size_t mod = (row + col) & prob_.cper_->lomask();
	assert_lt(mod, prob_.cper_->per());
	// Allocate room for diags
	size_t depth = mod+1;
	assert_leq(depth, prob_.cper_->per());
	size_t breadth = depth;
	tri_.resize(depth);
	// Allocate room for each diag
	for(size_t i = 0; i < depth; i++) {
		tri_[i].resize(breadth - i);
	}
	bool upperleft = false;
	size_t off = (row + col) >> prob_.cper_->perpow2();
	if(off == 0) {
		upperleft = true;
	} else {
		off--;
	}
	const TAlScore sc_rdo = prob_.sc_->readGapOpen();
	const TAlScore sc_rde = prob_.sc_->readGapExtend();
	const TAlScore sc_rfo = prob_.sc_->refGapOpen();
	const TAlScore sc_rfe = prob_.sc_->refGapExtend();
	const bool local = !prob_.sc_->monotone;
	int64_t row_lo = row - (int64_t)mod;
	const CpQuad *prev2 = NULL, *prev1 = NULL;
	if(!upperleft) {
		// Read-only pointer to cells in diagonal -2.  Start one row above the
		// target row.
		prev2 = prob_.cper_->qdiag1sPtr() + (off * prob_.cper_->nrow() + row_lo - 1);
		// Read-only pointer to cells in diagonal -1.  Start one row above the
		// target row
		prev1 = prob_.cper_->qdiag2sPtr() + (off * prob_.cper_->nrow() + row_lo - 1);
#ifndef NDEBUG
		if(row >= (int64_t)mod) {
			size_t rowc = row - mod, colc = col;
			if(rowc > 0 && prob_.cper_->isCheckpointed(rowc-1, colc)) {
				TAlScore al = prev1[0].sc[0];
				if(al == MIN_I16) al = MIN_I64;
				assert_eq(prob_.cper_->scoreTriangle(rowc-1, colc, 0), al);
			}
			if(rowc > 0 && colc > 0 && prob_.cper_->isCheckpointed(rowc-1, colc-1)) {
				TAlScore al = prev2[0].sc[0];
				if(al == MIN_I16) al = MIN_I64;
				assert_eq(prob_.cper_->scoreTriangle(rowc-1, colc-1, 0), al);
			}
		}
#endif
	}
	// Pointer to cells in current diagonal
	// For each diagonal we need to fill in
	for(size_t i = 0; i < depth; i++) {
		CpQuad * cur = tri_[i].ptr();
		CpQuad * curc = cur;
		size_t doff = mod - i; // # diagonals we are away from target diag
		//assert_geq(row, (int64_t)doff);
		int64_t rowc = row - doff;
		int64_t colc = col;
		size_t neval = 0; // # cells evaluated in this diag
		ASSERT_ONLY(const CpQuad *last = NULL);
		// Fill this diagonal from upper right to lower left
		for(size_t j = 0; j < breadth; j++) {
			if(rowc >= rowmin && rowc <= rowmax &&
			   colc >= colmin && colc <= colmax)
			{
				neval++;
				int64_t fromend = prob_.qrylen_ - rowc - 1;
				bool allowGaps = fromend >= prob_.sc_->gapbar && rowc >= prob_.sc_->gapbar;
				// Fill this cell
				// Some things we might want to calculate about this cell up front:
				// 1. How many matches are possible from this cell to the cell in
				//    row, col, in case this allows us to prune
				// Get character from read
				int qc = prob_.qry_[rowc];
				// Get quality value from read
				int qq = prob_.qual_[rowc];
				assert_geq(qq, 33);
				// Get character from reference
				int rc = prob_.ref_[colc];
				assert_range(0, 16, rc);
				int16_t sc_diag = prob_.sc_->score(qc, rc, qq - 33);
				int16_t sc_h_up = MIN_I16;
				int16_t sc_f_up = MIN_I16;
				int16_t sc_h_lf = MIN_I16;
				int16_t sc_e_lf = MIN_I16;
				if(allowGaps) {
					if(rowc > 0) {
						assert(local || prev1[j+0].sc[2] < 0);
						if(prev1[j+0].sc[0] > MIN_I16) {
							sc_h_up = prev1[j+0].sc[0] - sc_rfo;
							if(local) sc_h_up = max<int16_t>(sc_h_up, 0);
						}
						if(prev1[j+0].sc[2] > MIN_I16) {
							sc_f_up = prev1[j+0].sc[2] - sc_rfe;
							if(local) sc_f_up = max<int16_t>(sc_f_up, 0);
						}
#ifndef NDEBUG
						TAlScore hup = prev1[j+0].sc[0];
						TAlScore fup = prev1[j+0].sc[2];
						if(hup == MIN_I16) hup = MIN_I64;
						if(fup == MIN_I16) fup = MIN_I64;
						if(local) {
							hup = max<int16_t>(hup, 0);
							fup = max<int16_t>(fup, 0);
						}
						if(prob_.cper_->isCheckpointed(rowc-1, colc)) {
							assert_eq(hup, prob_.cper_->scoreTriangle(rowc-1, colc, 0));
							assert_eq(fup, prob_.cper_->scoreTriangle(rowc-1, colc, 2));
						}
#endif
					}
					if(colc > 0) {
						assert(local || prev1[j+1].sc[1] < 0);
						if(prev1[j+1].sc[0] > MIN_I16) {
							sc_h_lf = prev1[j+1].sc[0] - sc_rdo;
							if(local) sc_h_lf = max<int16_t>(sc_h_lf, 0);
						}
						if(prev1[j+1].sc[1] > MIN_I16) {
							sc_e_lf = prev1[j+1].sc[1] - sc_rde;
							if(local) sc_e_lf = max<int16_t>(sc_e_lf, 0);
						}
#ifndef NDEBUG
						TAlScore hlf = prev1[j+1].sc[0];
						TAlScore elf = prev1[j+1].sc[1];
						if(hlf == MIN_I16) hlf = MIN_I64;
						if(elf == MIN_I16) elf = MIN_I64;
						if(local) {
							hlf = max<int16_t>(hlf, 0);
							elf = max<int16_t>(elf, 0);
						}
						if(prob_.cper_->isCheckpointed(rowc, colc-1)) {
							assert_eq(hlf, prob_.cper_->scoreTriangle(rowc, colc-1, 0));
							assert_eq(elf, prob_.cper_->scoreTriangle(rowc, colc-1, 1));
						}
#endif
					}
				}
				assert(rowc <= 1 || colc <= 0 || prev2 != NULL);
				int16_t sc_h_dg = ((rowc > 0 && colc > 0) ? prev2[j+0].sc[0] : 0);
				if(colc == 0 && rowc > 0 && !local) {
					sc_h_dg = MIN_I16;
				}
				if(sc_h_dg > MIN_I16) {
					sc_h_dg += sc_diag;
				}
				if(local) sc_h_dg = max<int16_t>(sc_h_dg, 0);
				// cerr << sc_diag << " " << sc_h_dg << " " << sc_h_up << " " << sc_f_up << " " << sc_h_lf << " " << sc_e_lf << endl;
				int mask = 0;
				// Calculate best ways into H, E, F cells starting with H.
				// Mask bits:
				// H: 1=diag, 2=hhoriz, 4=ehoriz, 8=hvert, 16=fvert
				// E: 32=hhoriz, 64=ehoriz
				// F: 128=hvert, 256=fvert
				int16_t sc_best = sc_h_dg;
				if(sc_h_dg > MIN_I64) {
					mask = 1;
				}
				if(colc > 0 && sc_h_lf >= sc_best && sc_h_lf > MIN_I64) {
					if(sc_h_lf > sc_best) mask = 0;
					mask |= 2;
					sc_best = sc_h_lf;
				}
				if(colc > 0 && sc_e_lf >= sc_best && sc_e_lf > MIN_I64) {
					if(sc_e_lf > sc_best) mask = 0;
					mask |= 4;
					sc_best = sc_e_lf;
				}
				if(rowc > 0 && sc_h_up >= sc_best && sc_h_up > MIN_I64) {
					if(sc_h_up > sc_best) mask = 0;
					mask |= 8;
					sc_best = sc_h_up;
				}
				if(rowc > 0 && sc_f_up >= sc_best && sc_f_up > MIN_I64) {
					if(sc_f_up > sc_best) mask = 0;
					mask |= 16;
					sc_best = sc_f_up;
				}
				// Calculate best way into E cell
				int16_t sc_e_best = sc_h_lf;
				if(colc > 0) {
					if(sc_h_lf >= sc_e_lf && sc_h_lf > MIN_I64) {
						if(sc_h_lf == sc_e_lf) {
							mask |= 64;
						}
						mask |= 32;
					} else if(sc_e_lf > MIN_I64) {
						sc_e_best = sc_e_lf;
						mask |= 64;
					}
				}
				if(sc_e_best > sc_best) {
					sc_best = sc_e_best;
					mask &= ~31; // don't go diagonal
				}
				// Calculate best way into F cell
				int16_t sc_f_best = sc_h_up;
				if(rowc > 0) {
					if(sc_h_up >= sc_f_up && sc_h_up > MIN_I64) {
						if(sc_h_up == sc_f_up) {
							mask |= 256;
						}
						mask |= 128;
					} else if(sc_f_up > MIN_I64) {
						sc_f_best = sc_f_up;
						mask |= 256;
					}
				}
				if(sc_f_best > sc_best) {
					sc_best = sc_f_best;
					mask &= ~127; // don't go horizontal or diagonal
				}
				// Install results in cur
				assert(!prob_.sc_->monotone || sc_best <= 0);
				assert(!prob_.sc_->monotone || sc_e_best <= 0);
				assert(!prob_.sc_->monotone || sc_f_best <= 0);
				curc->sc[0] = sc_best;
				assert( local || sc_e_best < 0);
				assert( local || sc_f_best < 0);
				assert(!local || sc_e_best >= 0 || sc_e_best == MIN_I16);
				assert(!local || sc_f_best >= 0 || sc_f_best == MIN_I16);
				curc->sc[1] = sc_e_best;
				curc->sc[2] = sc_f_best;
				curc->sc[3] = mask;
				// cerr << curc->sc[0] << " " << curc->sc[1] << " " << curc->sc[2] << " " << curc->sc[3] << endl;
				ASSERT_ONLY(last = curc);
#ifndef NDEBUG
				if(prob_.cper_->isCheckpointed(rowc, colc)) {
					if(local) {
						sc_e_best = max<int16_t>(sc_e_best, 0);
						sc_f_best = max<int16_t>(sc_f_best, 0);
					}
					TAlScore sc_best64   = sc_best;   if(sc_best   == MIN_I16) sc_best64   = MIN_I64;
					TAlScore sc_e_best64 = sc_e_best; if(sc_e_best == MIN_I16) sc_e_best64 = MIN_I64;
					TAlScore sc_f_best64 = sc_f_best; if(sc_f_best == MIN_I16) sc_f_best64 = MIN_I64;
					assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 0), sc_best64);
					assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 1), sc_e_best64);
					assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 2), sc_f_best64);
				}
#endif
			}
			// Update row, col
			assert_lt(rowc, (int64_t)prob_.qrylen_);
			rowc++;
			colc--;
			curc++;
		} // for(size_t j = 0; j < breadth; j++)
		if(i == depth-1) {
			// Final iteration
			assert(last != NULL);
			assert_eq(1, neval);
			assert_neq(0, last->sc[3]);
			assert_eq(targ, last->sc[hef]);
		} else {
			breadth--;
			prev2 = prev1 + 1;
			prev1 = cur;
		}
	} // for(size_t i = 0; i < depth; i++)
	//
	// Now backtrack through the triangle.  Abort as soon as we enter a cell
	// that was visited by a previous backtrace.
	//
	int64_t rowc = row, colc = col;
	size_t curid;
	int hefc = hef;
	if(bs_.empty()) {
		// Start an initial branch
		CHECK_ROW_COL(rowc, colc);
		curid = bs_.alloc();
		assert_eq(0, curid);
		Edit e;
		bs_[curid].init(
			prob_,
			0,      // parent ID
			0,      // penalty
			0,      // score_en
			rowc,   // row
			colc,   // col
			e,      // edit
			0,      // hef
			true,   // I am the root
			false); // don't try to extend with exact matches
		bs_[curid].len_ = 0;
	} else {
		curid = bs_.size()-1;
	}
	size_t idx_orig = (row + col) >> prob_.cper_->perpow2();
	while(true) {
		// What depth are we?
		size_t mod = (rowc + colc) & prob_.cper_->lomask();
		assert_lt(mod, prob_.cper_->per());
		CpQuad * cur = tri_[mod].ptr();
		int64_t row_off = rowc - row_lo - mod;
		assert(!local || cur[row_off].sc[0] > 0);
		assert_geq(row_off, 0);
		int mask = cur[row_off].sc[3];
		assert_gt(mask, 0);
		int sel = -1;
		// Select what type of move to make, which depends on whether we're
		// currently in H, E, F:
		if(hefc == 0) {
			if(       (mask & 1) != 0) {
				// diagonal
				sel = 0;
			} else if((mask & 8) != 0) {
				// up to H
				sel = 3;
			} else if((mask & 16) != 0) {
				// up to F
				sel = 4;
			} else if((mask & 2) != 0) {
				// left to H
				sel = 1;
			} else if((mask & 4) != 0) {
				// left to E
				sel = 2;
			}
		} else if(hefc == 1) {
			if(       (mask & 32) != 0) {
				// left to H
				sel = 5;
			} else if((mask & 64) != 0) {
				// left to E
				sel = 6;
			}
		} else {
			assert_eq(2, hefc);
			if(       (mask & 128) != 0) {
				// up to H
				sel = 7;
			} else if((mask & 256) != 0) {
				// up to F
				sel = 8;
			}
		}
		assert_geq(sel, 0);
		// Get character from read
		int qc = prob_.qry_[rowc], qq = prob_.qual_[rowc];
		// Get character from reference
		int rc = prob_.ref_[colc];
		assert_range(0, 16, rc);
		// Now that we know what type of move to make, make it, updating our
		// row and column and moving updating the branch.
		if(sel == 0) {
			assert_geq(rowc, 0);
			assert_geq(colc, 0);
			TAlScore scd = prob_.sc_->score(qc, rc, qq - 33);
			if((rc & (1 << qc)) == 0) {
				// Mismatch
				size_t id = curid;
				// Check if the previous branch was the initial (bottommost)
				// branch with no matches.  If so, the mismatch should be added
				// to the initial branch, instead of starting a new branch.
				bool empty = (bs_[curid].len_ == 0 && curid == 0);
				if(!empty) {
					id = bs_.alloc();
				}
				Edit e((int)rowc, mask2dna[rc], "ACGTN"[qc], EDIT_TYPE_MM);
				assert_lt(scd, 0);
				TAlScore score_en = bs_[curid].score_st_ + scd;
				bs_[id].init(
					prob_,
					curid,    // parent ID
					-scd,     // penalty
					score_en, // score_en
					rowc,     // row
					colc,     // col
					e,        // edit
					hefc,     // hef
					empty,    // root?
					false);   // don't try to extend with exact matches
				//assert(!local || bs_[id].score_st_ >= 0);
				curid = id;
			} else {
				// Match
				bs_[curid].score_st_ += prob_.sc_->match();
				bs_[curid].len_++;
				assert_leq((int64_t)bs_[curid].len_, bs_[curid].row_ + 1);
			}
			rowc--;
			colc--;
			assert(local || bs_[curid].score_st_ >= targ_final);
			hefc = 0;
		} else if((sel >= 1 && sel <= 2) || (sel >= 5 && sel <= 6)) {
			assert_gt(colc, 0);
			// Read gap
			size_t id = bs_.alloc();
			Edit e((int)rowc+1, mask2dna[rc], '-', EDIT_TYPE_READ_GAP);
			TAlScore gapp = prob_.sc_->readGapOpen();
			if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isReadGap()) {
				gapp = prob_.sc_->readGapExtend();
			}
			TAlScore score_en = bs_[curid].score_st_ - gapp;
			bs_[id].init(
				prob_,
				curid,    // parent ID
				gapp,     // penalty
				score_en, // score_en
				rowc,     // row
				colc-1,   // col
				e,        // edit
				hefc,     // hef
				false,    // root?
				false);   // don't try to extend with exact matches
			colc--;
			curid = id;
			assert( local || bs_[curid].score_st_ >= targ_final);
			//assert(!local || bs_[curid].score_st_ >= 0);
			if(sel == 1 || sel == 5) {
				hefc = 0;
			} else {
				hefc = 1;
			}
		} else {
			assert_gt(rowc, 0);
			// Reference gap
			size_t id = bs_.alloc();
			Edit e((int)rowc, '-', "ACGTN"[qc], EDIT_TYPE_REF_GAP);
			TAlScore gapp = prob_.sc_->refGapOpen();
			if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isRefGap()) {
				gapp = prob_.sc_->refGapExtend();
			}
			TAlScore score_en = bs_[curid].score_st_ - gapp;
			bs_[id].init(
				prob_,
				curid,    // parent ID
				gapp,     // penalty
				score_en, // score_en
				rowc-1,   // row
				colc,     // col
				e,        // edit
				hefc,     // hef
				false,    // root?
				false);   // don't try to extend with exact matches
			rowc--;
			curid = id;
			//assert(!local || bs_[curid].score_st_ >= 0);
			if(sel == 3 || sel == 7) {
				hefc = 0;
			} else {
				hefc = 2;
			}
		}
		CHECK_ROW_COL(rowc, colc);
		size_t mod_new = (rowc + colc) & prob_.cper_->lomask();
		size_t idx = (rowc + colc) >> prob_.cper_->perpow2();
		assert_lt(mod_new, prob_.cper_->per());
		int64_t row_off_new = rowc - row_lo - mod_new;
		CpQuad * cur_new = NULL;
		if(colc >= 0 && rowc >= 0 && idx == idx_orig) {
			cur_new = tri_[mod_new].ptr();
		}
		bool hit_new_tri = (idx < idx_orig && colc >= 0 && rowc >= 0);
		// Check whether we made it to the top row or to a cell with score 0
		if(colc < 0 || rowc < 0 ||
		   (cur_new != NULL && (local && cur_new[row_off_new].sc[0] == 0)))
		{
			done = true;
			assert(bs_[curid].isSolution(prob_));
			addSolution(curid);
#ifndef NDEBUG
			// A check to see if any two adjacent branches in the backtrace
			// overlap.  If they do, the whole alignment will be filtered out
			// in trySolution(...)
			size_t cur = curid;
			if(!bs_[cur].root_) {
				size_t next = bs_[cur].parentId_;
				while(!bs_[next].root_) {
					assert_neq(cur, next);
					if(bs_[next].len_ != 0 || bs_[cur].len_ == 0) {
						assert(!bs_[cur].overlap(prob_, bs_[next]));
					}
					cur = next;
					next = bs_[cur].parentId_;
				}
			}
#endif
			return;
		}
		if(hit_new_tri) {
			assert(rowc < 0 || colc < 0 || prob_.cper_->isCheckpointed(rowc, colc));
			row_new = rowc; col_new = colc;
			hef_new = hefc;
			done = false;
			if(rowc < 0 || colc < 0) {
				assert(local);
				targ_new = 0;
			} else {
				targ_new = prob_.cper_->scoreTriangle(rowc, colc, hefc);
			}
			if(local && targ_new == 0) {
				done = true;
				assert(bs_[curid].isSolution(prob_));
				addSolution(curid);
			}
			assert((row_new >= 0 && col_new >= 0) || done);
			return;
		}
	}
	assert(false);
}

#ifndef NDEBUG
#define DEBUG_CHECK(ss, row, col, hef) { \
	if(prob_.cper_->debug() && row >= 0 && col >= 0) { \
		TAlScore s = ss; \
		if(s == MIN_I16) s = MIN_I64; \
		if(local && s < 0) s = 0; \
		TAlScore deb = prob_.cper_->debugCell(row, col, hef); \
		if(local && deb < 0) deb = 0; \
		assert_eq(s, deb); \
	} \
}
#else
#define DEBUG_CHECK(ss, row, col, hef)
#endif


/**
 * Fill in a square of the DP table and backtrace from the given cell to
 * a cell in the previous checkpoint, or to the terminal cell.
 */
void BtBranchTracer::squareFill(
	int64_t rw,          // row of cell to backtrace from
	int64_t cl,          // column of cell to backtrace from
	int hef,             // cell to backtrace from is H (0), E (1), or F (2)
	TAlScore targ,       // score of cell to backtrace from
	TAlScore targ_final, // score of alignment we're looking for
	RandomSource& rnd,   // pseudo-random generator
	int64_t& row_new,    // out: row we ended up in after backtrace
	int64_t& col_new,    // out: column we ended up in after backtrace
	int& hef_new,        // out: H/E/F after backtrace
	TAlScore& targ_new,  // out: score up to cell we ended up in
	bool& done,          // out: finished tracing out an alignment?
	bool& abort)         // out: aborted b/c cell was seen before?
{
	assert_geq(rw, 0);
	assert_geq(cl, 0);
	assert_range(0, 2, hef);
	assert_lt(rw, (int64_t)prob_.qrylen_);
	assert_lt(cl, (int64_t)prob_.reflen_);
	assert(prob_.usecp_ && prob_.fill_);
	const bool is8_ = prob_.cper_->is8_;
	int64_t row = rw, col = cl;
	assert_leq(prob_.reflen_, (TRefOff)sawcell_.size());
	assert_leq(col, (int64_t)prob_.cper_->hicol());
	assert_geq(col, (int64_t)prob_.cper_->locol());
	assert_geq(prob_.cper_->per(), 2);
	size_t xmod = col & prob_.cper_->lomask();
	size_t ymod = row & prob_.cper_->lomask();
	size_t xdiv = col >> prob_.cper_->perpow2();
	size_t ydiv = row >> prob_.cper_->perpow2();
	size_t sq_ncol = xmod+1, sq_nrow = ymod+1;
	sq_.resize(sq_ncol * sq_nrow);
	bool upper = ydiv == 0;
	bool left  = xdiv == 0;
	const TAlScore sc_rdo = prob_.sc_->readGapOpen();
	const TAlScore sc_rde = prob_.sc_->readGapExtend();
	const TAlScore sc_rfo = prob_.sc_->refGapOpen();
	const TAlScore sc_rfe = prob_.sc_->refGapExtend();
	const bool local = !prob_.sc_->monotone;
	const CpQuad *qup = NULL;
	const __m128i *qlf = NULL;
	size_t per = prob_.cper_->per_;
	ASSERT_ONLY(size_t nrow = prob_.cper_->nrow());
	size_t ncol = prob_.cper_->ncol();
	assert_eq(prob_.qrylen_, nrow);
	assert_eq(prob_.reflen_, (TRefOff)ncol);
	size_t niter = prob_.cper_->niter_;
	if(!upper) {
		qup = prob_.cper_->qrows_.ptr() + (ncol * (ydiv-1)) + xdiv * per;
	}
	if(!left) {
		// Set up the column pointers to point to the first __m128i word in the
		// relevant column
		size_t off = (niter << 2) * (xdiv-1);
		qlf = prob_.cper_->qcols_.ptr() + off;
	}
	size_t xedge = xdiv * per; // absolute offset of leftmost cell in square
	size_t yedge = ydiv * per; // absolute offset of topmost cell in square
	size_t xi = xedge, yi = yedge; // iterators for columns, rows
	size_t ii = 0; // iterator into packed square
	// Iterate over rows, then over columns
	size_t m128mod = yi % prob_.cper_->niter_;
	size_t m128div = yi / prob_.cper_->niter_;
	int16_t sc_h_dg_lastrow = MIN_I16;
	for(size_t i = 0; i <= ymod; i++, yi++) {
		assert_lt(yi, nrow);
 		xi = xedge;
		// Handling for first column is done outside the loop
		size_t fromend = prob_.qrylen_ - yi - 1;
		bool allowGaps = fromend >= (size_t)prob_.sc_->gapbar && yi >= (size_t)prob_.sc_->gapbar;
		// Get character, quality from read
		int qc = prob_.qry_[yi], qq = prob_.qual_[yi];
		assert_geq(qq, 33);
		int16_t sc_h_lf_last = MIN_I16;
		int16_t sc_e_lf_last = MIN_I16;
		for(size_t j = 0; j <= xmod; j++, xi++) {
			assert_lt(xi, ncol);
			// Get character from reference
			int rc = prob_.ref_[xi];
			assert_range(0, 16, rc);
			int16_t sc_diag = prob_.sc_->score(qc, rc, qq - 33);
			int16_t sc_h_up = MIN_I16, sc_f_up = MIN_I16,
			        sc_h_lf = MIN_I16, sc_e_lf = MIN_I16,
					sc_h_dg = MIN_I16;
			int16_t sc_h_up_c = MIN_I16, sc_f_up_c = MIN_I16,
			        sc_h_lf_c = MIN_I16, sc_e_lf_c = MIN_I16,
					sc_h_dg_c = MIN_I16;
			if(yi == 0) {
				// If I'm in the first first row or column set it to 0
				sc_h_dg = 0;
			} else if(xi == 0) {
				// Do nothing; leave it at min
				if(local) {
					sc_h_dg = 0;
				}
			} else if(i == 0 && j == 0) {
				// Otherwise, if I'm in the upper-left square corner, I can get
				// it from the checkpoint 
				sc_h_dg = qup[-1].sc[0];
			} else if(j == 0) {
				// Otherwise, if I'm in the leftmost cell of this row, I can
				// get it from sc_h_lf in first column of previous row
				sc_h_dg = sc_h_dg_lastrow;
			} else {
				// Otherwise, I can get it from qup
				sc_h_dg = qup[j-1].sc[0];
			}
			if(yi > 0 && xi > 0) DEBUG_CHECK(sc_h_dg, yi-1, xi-1, 2);
			
			// If we're in the leftmost column, calculate sc_h_lf regardless of
			// allowGaps.
			if(j == 0 && xi > 0) {
				// Get values for left neighbors from the checkpoint
				if(is8_) {
					size_t vecoff = (m128mod << 6) + m128div;
					sc_e_lf = ((uint8_t*)(qlf + 0))[vecoff];
					sc_h_lf = ((uint8_t*)(qlf + 2))[vecoff];
					if(local) {
						// No adjustment
					} else {
						if(sc_h_lf == 0) sc_h_lf = MIN_I16;
						else sc_h_lf -= 0xff;
						if(sc_e_lf == 0) sc_e_lf = MIN_I16;
						else sc_e_lf -= 0xff;
					}
				} else {
					size_t vecoff = (m128mod << 5) + m128div;
					sc_e_lf = ((int16_t*)(qlf + 0))[vecoff];
					sc_h_lf = ((int16_t*)(qlf + 2))[vecoff];
					if(local) {
						sc_h_lf += 0x8000; assert_geq(sc_h_lf, 0);
						sc_e_lf += 0x8000; assert_geq(sc_e_lf, 0);
					} else {
						if(sc_h_lf != MIN_I16) sc_h_lf -= 0x7fff;
						if(sc_e_lf != MIN_I16) sc_e_lf -= 0x7fff;
					}
				}
				DEBUG_CHECK(sc_e_lf, yi, xi-1, 0);
				DEBUG_CHECK(sc_h_lf, yi, xi-1, 2);
				sc_h_dg_lastrow = sc_h_lf;
			}
			
			if(allowGaps) {
				if(j == 0 /* at left edge */ && xi > 0 /* not extreme */) {
					sc_h_lf_c = sc_h_lf;
					sc_e_lf_c = sc_e_lf;
					if(sc_h_lf_c != MIN_I16) sc_h_lf_c -= sc_rdo;
					if(sc_e_lf_c != MIN_I16) sc_e_lf_c -= sc_rde;
					assert_leq(sc_h_lf_c, prob_.cper_->perf_);
					assert_leq(sc_e_lf_c, prob_.cper_->perf_);
				} else if(xi > 0) {
					// Get values for left neighbors from the previous iteration
					if(sc_h_lf_last != MIN_I16) {
						sc_h_lf = sc_h_lf_last;
						sc_h_lf_c = sc_h_lf - sc_rdo;
					}
					if(sc_e_lf_last != MIN_I16) {
						sc_e_lf = sc_e_lf_last;
						sc_e_lf_c = sc_e_lf - sc_rde;
					}
				}
				if(yi > 0 /* not extreme */) {
					// Get column values
					assert(qup != NULL);
					assert(local || qup[j].sc[2] < 0);
					if(qup[j].sc[0] > MIN_I16) {
						DEBUG_CHECK(qup[j].sc[0], yi-1, xi, 2);
						sc_h_up = qup[j].sc[0];
						sc_h_up_c = sc_h_up - sc_rfo;
					}
					if(qup[j].sc[2] > MIN_I16) {
						DEBUG_CHECK(qup[j].sc[2], yi-1, xi, 1);
						sc_f_up = qup[j].sc[2];
						sc_f_up_c = sc_f_up - sc_rfe;
					}
				}
				if(local) {
					sc_h_up_c = max<int16_t>(sc_h_up_c, 0);
					sc_f_up_c = max<int16_t>(sc_f_up_c, 0);
					sc_h_lf_c = max<int16_t>(sc_h_lf_c, 0);
					sc_e_lf_c = max<int16_t>(sc_e_lf_c, 0);
				}
			}
			
			if(sc_h_dg > MIN_I16) {
				sc_h_dg_c = sc_h_dg + sc_diag;
			}
			if(local) sc_h_dg_c = max<int16_t>(sc_h_dg_c, 0);
			
			int mask = 0;
			// Calculate best ways into H, E, F cells starting with H.
			// Mask bits:
			// H: 1=diag, 2=hhoriz, 4=ehoriz, 8=hvert, 16=fvert
			// E: 32=hhoriz, 64=ehoriz
			// F: 128=hvert, 256=fvert
			int16_t sc_best = sc_h_dg_c;
			if(sc_h_dg_c > MIN_I64) {
				mask = 1;
			}
			if(xi > 0 && sc_h_lf_c >= sc_best && sc_h_lf_c > MIN_I64) {
				if(sc_h_lf_c > sc_best) mask = 0;
				mask |= 2;
				sc_best = sc_h_lf_c;
			}
			if(xi > 0 && sc_e_lf_c >= sc_best && sc_e_lf_c > MIN_I64) {
				if(sc_e_lf_c > sc_best) mask = 0;
				mask |= 4;
				sc_best = sc_e_lf_c;
			}
			if(yi > 0 && sc_h_up_c >= sc_best && sc_h_up_c > MIN_I64) {
				if(sc_h_up_c > sc_best) mask = 0;
				mask |= 8;
				sc_best = sc_h_up_c;
			}
			if(yi > 0 && sc_f_up_c >= sc_best && sc_f_up_c > MIN_I64) {
				if(sc_f_up_c > sc_best) mask = 0;
				mask |= 16;
				sc_best = sc_f_up_c;
			}
			// Calculate best way into E cell
			int16_t sc_e_best = sc_h_lf_c;
			if(xi > 0) {
				if(sc_h_lf_c >= sc_e_lf_c && sc_h_lf_c > MIN_I64) {
					if(sc_h_lf_c == sc_e_lf_c) {
						mask |= 64;
					}
					mask |= 32;
				} else if(sc_e_lf_c > MIN_I64) {
					sc_e_best = sc_e_lf_c;
					mask |= 64;
				}
			}
			if(sc_e_best > sc_best) {
				sc_best = sc_e_best;
				mask &= ~31; // don't go diagonal
			}
			// Calculate best way into F cell
			int16_t sc_f_best = sc_h_up_c;
			if(yi > 0) {
				if(sc_h_up_c >= sc_f_up_c && sc_h_up_c > MIN_I64) {
					if(sc_h_up_c == sc_f_up_c) {
						mask |= 256;
					}
					mask |= 128;
				} else if(sc_f_up_c > MIN_I64) {
					sc_f_best = sc_f_up_c;
					mask |= 256;
				}
			}
			if(sc_f_best > sc_best) {
				sc_best = sc_f_best;
				mask &= ~127; // don't go horizontal or diagonal
			}
			// Install results in cur
			assert( local || sc_best <= 0);
			sq_[ii+j].sc[0] = sc_best;
			assert( local || sc_e_best < 0);
			assert( local || sc_f_best < 0);
			assert(!local || sc_e_best >= 0 || sc_e_best == MIN_I16);
			assert(!local || sc_f_best >= 0 || sc_f_best == MIN_I16);
			sq_[ii+j].sc[1] = sc_e_best;
			sq_[ii+j].sc[2] = sc_f_best;
			sq_[ii+j].sc[3] = mask;
			DEBUG_CHECK(sq_[ii+j].sc[0], yi, xi, 2); // H
			DEBUG_CHECK(sq_[ii+j].sc[1], yi, xi, 0); // E
			DEBUG_CHECK(sq_[ii+j].sc[2], yi, xi, 1); // F
			// Update sc_h_lf_last, sc_e_lf_last
			sc_h_lf_last = sc_best;
			sc_e_lf_last = sc_e_best;
		}
		// Update m128mod, m128div
		m128mod++;
		if(m128mod == prob_.cper_->niter_) {
			m128mod = 0;
			m128div++;
		}
		// update qup
		ii += sq_ncol;
		// dimensions of sq_
		qup = sq_.ptr() + sq_ncol * i;
	}
	assert_eq(targ, sq_[ymod * sq_ncol + xmod].sc[hef]);
	//
	// Now backtrack through the triangle.  Abort as soon as we enter a cell
	// that was visited by a previous backtrace.
	//
	int64_t rowc = row, colc = col;
	size_t curid;
	int hefc = hef;
	if(bs_.empty()) {
		// Start an initial branch
		CHECK_ROW_COL(rowc, colc);
		curid = bs_.alloc();
		assert_eq(0, curid);
		Edit e;
		bs_[curid].init(
			prob_,
			0,      // parent ID
			0,      // penalty
			0,      // score_en
			rowc,   // row
			colc,   // col
			e,      // edit
			0,      // hef
			true,   // root?
			false); // don't try to extend with exact matches
		bs_[curid].len_ = 0;
	} else {
		curid = bs_.size()-1;
	}
	size_t ymodTimesNcol = ymod * sq_ncol;
	while(true) {
		// What depth are we?
		assert_eq(ymodTimesNcol, ymod * sq_ncol);
		CpQuad * cur = sq_.ptr() + ymodTimesNcol + xmod;
		int mask = cur->sc[3];
		assert_gt(mask, 0);
		int sel = -1;
		// Select what type of move to make, which depends on whether we're
		// currently in H, E, F:
		if(hefc == 0) {
			if(       (mask & 1) != 0) {
				// diagonal
				sel = 0;
			} else if((mask & 8) != 0) {
				// up to H
				sel = 3;
			} else if((mask & 16) != 0) {
				// up to F
				sel = 4;
			} else if((mask & 2) != 0) {
				// left to H
				sel = 1;
			} else if((mask & 4) != 0) {
				// left to E
				sel = 2;
			}
		} else if(hefc == 1) {
			if(       (mask & 32) != 0) {
				// left to H
				sel = 5;
			} else if((mask & 64) != 0) {
				// left to E
				sel = 6;
			}
		} else {
			assert_eq(2, hefc);
			if(       (mask & 128) != 0) {
				// up to H
				sel = 7;
			} else if((mask & 256) != 0) {
				// up to F
				sel = 8;
			}
		}
		assert_geq(sel, 0);
		// Get character from read
		int qc = prob_.qry_[rowc], qq = prob_.qual_[rowc];
		// Get character from reference
		int rc = prob_.ref_[colc];
		assert_range(0, 16, rc);
		bool xexit = false, yexit = false;
		// Now that we know what type of move to make, make it, updating our
		// row and column and moving updating the branch.
		if(sel == 0) {
			assert_geq(rowc, 0);
			assert_geq(colc, 0);
			TAlScore scd = prob_.sc_->score(qc, rc, qq - 33);
			if((rc & (1 << qc)) == 0) {
				// Mismatch
				size_t id = curid;
				// Check if the previous branch was the initial (bottommost)
				// branch with no matches.  If so, the mismatch should be added
				// to the initial branch, instead of starting a new branch.
				bool empty = (bs_[curid].len_ == 0 && curid == 0);
				if(!empty) {
					id = bs_.alloc();
				}
				Edit e((int)rowc, mask2dna[rc], "ACGTN"[qc], EDIT_TYPE_MM);
				assert_lt(scd, 0);
				TAlScore score_en = bs_[curid].score_st_ + scd;
				bs_[id].init(
					prob_,
					curid,    // parent ID
					-scd,     // penalty
					score_en, // score_en
					rowc,     // row
					colc,     // col
					e,        // edit
					hefc,     // hef
					empty,    // root?
					false);   // don't try to extend with exact matches
				curid = id;
				//assert(!local || bs_[curid].score_st_ >= 0);
			} else {
				// Match
				bs_[curid].score_st_ += prob_.sc_->match();
				bs_[curid].len_++;
				assert_leq((int64_t)bs_[curid].len_, bs_[curid].row_ + 1);
			}
			if(xmod == 0) xexit = true;
			if(ymod == 0) yexit = true;
			rowc--; ymod--; ymodTimesNcol -= sq_ncol;
			colc--; xmod--;
			assert(local || bs_[curid].score_st_ >= targ_final);
			hefc = 0;
		} else if((sel >= 1 && sel <= 2) || (sel >= 5 && sel <= 6)) {
			assert_gt(colc, 0);
			// Read gap
			size_t id = bs_.alloc();
			Edit e((int)rowc+1, mask2dna[rc], '-', EDIT_TYPE_READ_GAP);
			TAlScore gapp = prob_.sc_->readGapOpen();
			if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isReadGap()) {
				gapp = prob_.sc_->readGapExtend();
			}
			//assert(!local || bs_[curid].score_st_ >= gapp);
			TAlScore score_en = bs_[curid].score_st_ - gapp;
			bs_[id].init(
				prob_,
				curid,    // parent ID
				gapp,     // penalty
				score_en, // score_en
				rowc,     // row
				colc-1,   // col
				e,        // edit
				hefc,     // hef
				false,    // root?
				false);   // don't try to extend with exact matches
			if(xmod == 0) xexit = true;
			colc--; xmod--;
			curid = id;
			assert( local || bs_[curid].score_st_ >= targ_final);
			//assert(!local || bs_[curid].score_st_ >= 0);
			if(sel == 1 || sel == 5) {
				hefc = 0;
			} else {
				hefc = 1;
			}
		} else {
			assert_gt(rowc, 0);
			// Reference gap
			size_t id = bs_.alloc();
			Edit e((int)rowc, '-', "ACGTN"[qc], EDIT_TYPE_REF_GAP);
			TAlScore gapp = prob_.sc_->refGapOpen();
			if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isRefGap()) {
				gapp = prob_.sc_->refGapExtend();
			}
			//assert(!local || bs_[curid].score_st_ >= gapp);
			TAlScore score_en = bs_[curid].score_st_ - gapp;
			bs_[id].init(
				prob_,
				curid,    // parent ID
				gapp,     // penalty
				score_en, // score_en
				rowc-1,   // row
				colc,     // col
				e,        // edit
				hefc,     // hef
				false,    // root?
				false);   // don't try to extend with exact matches
			if(ymod == 0) yexit = true;
			rowc--; ymod--; ymodTimesNcol -= sq_ncol;
			curid = id;
			assert( local || bs_[curid].score_st_ >= targ_final);
			//assert(!local || bs_[curid].score_st_ >= 0);
			if(sel == 3 || sel == 7) {
				hefc = 0;
			} else {
				hefc = 2;
			}
		}
		CHECK_ROW_COL(rowc, colc);
		CpQuad * cur_new = NULL;
		if(!xexit && !yexit) {
			cur_new = sq_.ptr() + ymodTimesNcol + xmod;
		}
		// Check whether we made it to the top row or to a cell with score 0
		if(colc < 0 || rowc < 0 ||
		   (cur_new != NULL && local && cur_new->sc[0] == 0))
		{
			done = true;
			assert(bs_[curid].isSolution(prob_));
			addSolution(curid);
#ifndef NDEBUG
			// A check to see if any two adjacent branches in the backtrace
			// overlap.  If they do, the whole alignment will be filtered out
			// in trySolution(...)
			size_t cur = curid;
			if(!bs_[cur].root_) {
				size_t next = bs_[cur].parentId_;
				while(!bs_[next].root_) {
					assert_neq(cur, next);
					if(bs_[next].len_ != 0 || bs_[cur].len_ == 0) {
						assert(!bs_[cur].overlap(prob_, bs_[next]));
					}
					cur = next;
					next = bs_[cur].parentId_;
				}
			}
#endif
			return;
		}
		assert(!xexit || hefc == 0 || hefc == 1);
		assert(!yexit || hefc == 0 || hefc == 2);
		if(xexit || yexit) {
			//assert(rowc < 0 || colc < 0 || prob_.cper_->isCheckpointed(rowc, colc));
			row_new = rowc; col_new = colc;
			hef_new = hefc;
			done = false;
			if(rowc < 0 || colc < 0) {
				assert(local);
				targ_new = 0;
			} else {
				// TODO: Don't use scoreSquare
				targ_new = prob_.cper_->scoreSquare(rowc, colc, hefc);
				assert(local || targ_new >= targ);
				assert(local || targ_new >= targ_final);
			}
			if(local && targ_new == 0) {
				assert_eq(0, hefc);
				done = true;
				assert(bs_[curid].isSolution(prob_));
				addSolution(curid);
			}
			assert((row_new >= 0 && col_new >= 0) || done);
			return;
		}
	}
	assert(false);
}

/**
 * Caller gives us score_en, row and col.  We figure out score_st and len_
 * by comparing characters from the strings.
 *
 * If this branch comes after a mismatch, (row, col) describe the cell that the
 * mismatch occurs in.  len_ is initially set to 1, and the next cell we test
 * is the next cell up and to the left (row-1, col-1).
 *
 * If this branch comes after a read gap, (row, col) describe the leftmost cell
 * involved in the gap.  len_ is initially set to 0, and the next cell we test
 * is the current cell (row, col).
 *
 * If this branch comes after a reference gap, (row, col) describe the upper
 * cell involved in the gap.  len_ is initially set to 0, and the next cell we
 * test is the current cell (row, col).
 */
void BtBranch::init(
	const BtBranchProblem& prob,
	size_t parentId,
	TAlScore penalty,
	TAlScore score_en,
	int64_t row,
	int64_t col,
	Edit e,
	int hef,
	bool root,
	bool extend)
{
	score_en_ = score_en;
	penalty_ = penalty;
	score_st_ = score_en_;
	row_ = row;
	col_ = col;
	parentId_ = parentId;
	e_ = e;
	root_ = root;
	assert(!root_ || parentId == 0);
	assert_lt(row, (int64_t)prob.qrylen_);
	assert_lt(col, (int64_t)prob.reflen_);
	// First match to check is diagonally above and to the left of the cell
	// where the edit occurs
	int64_t rowc = row;
	int64_t colc = col;
	len_ = 0;
	if(e.inited() && e.isMismatch()) {
		rowc--; colc--;
		len_ = 1;
	}
	int64_t match = prob.sc_->match();
	bool cp = prob.usecp_;
	size_t iters = 0;
	curtailed_ = false;
	if(extend) {
		while(rowc >= 0 && colc >= 0) {
			int rfm = prob.ref_[colc];
			assert_range(0, 16, rfm);
			int rdc = prob.qry_[rowc];
			bool matches = (rfm & (1 << rdc)) != 0;
			if(!matches) {
				// What's the mismatch penalty?
				break;
			}
			// Get score from checkpointer
			score_st_ += match;
			if(cp && rowc - 1 >= 0 && colc - 1 >= 0 &&
			   prob.cper_->isCheckpointed(rowc - 1, colc - 1))
			{
				// Possibly prune
				int16_t cpsc;
				cpsc = prob.cper_->scoreTriangle(rowc - 1, colc - 1, hef);
				if(cpsc + score_st_ < prob.targ_) {
					curtailed_ = true;
					break;
				}
			}
			iters++;
			rowc--; colc--;
		}
	}
	assert_geq(rowc, -1);
	assert_geq(colc, -1);
	len_ = (int64_t)row - rowc;
	assert_leq((int64_t)len_, row_+1);
	assert_leq((int64_t)len_, col_+1);
	assert_leq((int64_t)score_st_, (int64_t)prob.qrylen_ * match);
}

/**
 * Given a potential branch to add to the queue, see if we can follow the
 * branch a little further first.  If it's still valid, or if we reach a
 * choice between valid outgoing paths, go ahead and add it to the queue.
 */
void BtBranchTracer::examineBranch(
	int64_t row,
	int64_t col,
	const Edit& e,
	TAlScore pen,  // penalty associated with edit
	TAlScore sc,
	size_t parentId)
{
	size_t id = bs_.alloc();
	bs_[id].init(prob_, parentId, pen, sc, row, col, e, 0, false, true);
	if(bs_[id].isSolution(prob_)) {
		assert(bs_[id].isValid(prob_));
		addSolution(id);
	} else {
		// Check if this branch is legit
		if(bs_[id].isValid(prob_)) {
			add(id);
		} else {
			bs_.pop();
		}
	}
}

/**
 * Take all possible ways of leaving the given branch and add them to the
 * branch queue.
 */
void BtBranchTracer::addOffshoots(size_t bid) {
	BtBranch& b = bs_[bid];
	TAlScore sc = b.score_en_;
	int64_t match = prob_.sc_->match();
	int64_t scoreFloor = prob_.sc_->monotone ? MIN_I64 : 0;
	bool cp = prob_.usecp_; // Are there are any checkpoints?
	ASSERT_ONLY(TAlScore perfectScore = prob_.sc_->perfectScore(prob_.qrylen_));
	assert_leq(prob_.targ_, perfectScore);
	// For each cell in the branch
	for(size_t i = 0 ; i < b.len_; i++) {
		assert_leq((int64_t)i, b.row_+1);
		assert_leq((int64_t)i, b.col_+1);
		int64_t row = b.row_ - i, col = b.col_ - i;
		int64_t bonusLeft = (row + 1) * match;
		int64_t fromend = prob_.qrylen_ - row - 1;
		bool allowGaps = fromend >= prob_.sc_->gapbar && row >= prob_.sc_->gapbar;
		if(allowGaps && row >= 0 && col >= 0) {
			if(col > 0) {
				// Try a read gap - it's either an extension or an open
				bool extend = b.e_.inited() && b.e_.isReadGap() && i == 0;
				TAlScore rdgapPen = extend ?
					prob_.sc_->readGapExtend() : prob_.sc_->readGapOpen();
				bool prune = false;
				assert_gt(rdgapPen, 0);
				if(cp && prob_.cper_->isCheckpointed(row, col - 1)) {
					// Possibly prune
					int16_t cpsc = (int16_t)prob_.cper_->scoreTriangle(row, col - 1, 0);
					assert_leq(cpsc, perfectScore);
					assert_geq(prob_.sc_->readGapOpen(), prob_.sc_->readGapExtend());
					TAlScore bonus = prob_.sc_->readGapOpen() - prob_.sc_->readGapExtend();
					assert_geq(bonus, 0);
					if(cpsc + bonus + sc - rdgapPen < prob_.targ_) {
						prune = true;
					}
				}
				if(prune) {
					if(extend) { nrdexPrune_++; } else { nrdopPrune_++; }
				} else if(sc - rdgapPen >= scoreFloor && sc - rdgapPen + bonusLeft >= prob_.targ_) {
					// Yes, we can introduce a read gap here
					Edit e((int)row + 1, mask2dna[(int)prob_.ref_[col]], '-', EDIT_TYPE_READ_GAP);
					assert(e.isReadGap());
					examineBranch(row, col - 1, e, rdgapPen, sc - rdgapPen, bid);
					if(extend) { nrdex_++; } else { nrdop_++; }
				}
			}
			if(row > 0) {
				// Try a reference gap - it's either an extension or an open
				bool extend = b.e_.inited() && b.e_.isRefGap() && i == 0;
				TAlScore rfgapPen = (b.e_.inited() && b.e_.isRefGap()) ?
					prob_.sc_->refGapExtend() : prob_.sc_->refGapOpen();
				bool prune = false;
				assert_gt(rfgapPen, 0);
				if(cp && prob_.cper_->isCheckpointed(row - 1, col)) {
					// Possibly prune
					int16_t cpsc = (int16_t)prob_.cper_->scoreTriangle(row - 1, col, 0);
					assert_leq(cpsc, perfectScore);
					assert_geq(prob_.sc_->refGapOpen(), prob_.sc_->refGapExtend());
					TAlScore bonus = prob_.sc_->refGapOpen() - prob_.sc_->refGapExtend();
					assert_geq(bonus, 0);
					if(cpsc + bonus + sc - rfgapPen < prob_.targ_) {
						prune = true;
					}
				}
				if(prune) {
					if(extend) { nrfexPrune_++; } else { nrfopPrune_++; }
				} else if(sc - rfgapPen >= scoreFloor && sc - rfgapPen + bonusLeft >= prob_.targ_) {
					// Yes, we can introduce a ref gap here
					Edit e((int)row, '-', "ACGTN"[(int)prob_.qry_[row]], EDIT_TYPE_REF_GAP);
					assert(e.isRefGap());
					examineBranch(row - 1, col, e, rfgapPen, sc - rfgapPen, bid);
					if(extend) { nrfex_++; } else { nrfop_++; }
				}
			}
		}
		// If we're at the top of the branch but not yet at the top of
		// the DP table, a mismatch branch is also possible.
		if(i == b.len_ && !b.curtailed_ && row >= 0 && col >= 0) {
			int rfm = prob_.ref_[col];
			assert_lt(row, (int64_t)prob_.qrylen_);
			int rdc = prob_.qry_[row];
			int rdq = prob_.qual_[row];
			int scdiff = prob_.sc_->score(rdc, rfm, rdq - 33);
			assert_lt(scdiff, 0); // at end of branch, so can't match
			bool prune = false;
			if(cp && row > 0 && col > 0 && prob_.cper_->isCheckpointed(row - 1, col - 1)) {
				// Possibly prune
				int16_t cpsc = prob_.cper_->scoreTriangle(row - 1, col - 1, 0);
				assert_leq(cpsc, perfectScore);
				assert_leq(cpsc + scdiff + sc, perfectScore);
				if(cpsc + scdiff + sc < prob_.targ_) {
					prune = true;
				}
			}
			if(prune) {
				nmm_++;
			} else  {
				// Yes, we can introduce a mismatch here
				if(sc + scdiff >= scoreFloor && sc + scdiff + bonusLeft >= prob_.targ_) {
					Edit e((int)row, mask2dna[rfm], "ACGTN"[rdc], EDIT_TYPE_MM);
					bool nmm = (mask2dna[rfm] == 'N' || rdc > 4);
					assert_neq(e.chr, e.qchr);
					assert_lt(scdiff, 0);
					examineBranch(row - 1, col - 1, e, -scdiff, sc + scdiff, bid);
					if(nmm) { nnmm_++; } else { nmm_++; }
				}
			}
		}
		sc += match;
	}
}

/**
 * Sort unsorted branches, merge them with master sorted list.
 */
void BtBranchTracer::flushUnsorted() {
	if(unsorted_.empty()) {
		return;
	}
	unsorted_.sort();
	unsorted_.reverse();
#ifndef NDEBUG
	for(size_t i = 1; i < unsorted_.size(); i++) {
		assert_leq(bs_[unsorted_[i].second].score_st_, bs_[unsorted_[i-1].second].score_st_);
	}
#endif
	EList<size_t> *src2 = sortedSel_ ? &sorted1_ : &sorted2_;
	EList<size_t> *dest = sortedSel_ ? &sorted2_ : &sorted1_;
	// Merge src1 and src2 into dest
	dest->clear();
	size_t cur1 = 0, cur2 = cur_;
	while(cur1 < unsorted_.size() || cur2 < src2->size()) {
		// Take from 1 or 2 next?
		bool take1 = true;
		if(cur1 == unsorted_.size()) {
			take1 = false;
		} else if(cur2 == src2->size()) {
			take1 = true;
		} else {
			assert_neq(unsorted_[cur1].second, (*src2)[cur2]);
			take1 = bs_[unsorted_[cur1].second] < bs_[(*src2)[cur2]];
		}
		if(take1) {
			dest->push_back(unsorted_[cur1++].second); // Take from list 1
		} else {
			dest->push_back((*src2)[cur2++]); // Take from list 2
		}
	}
	assert_eq(cur1, unsorted_.size());
	assert_eq(cur2, src2->size());
	sortedSel_ = !sortedSel_;
	cur_ = 0;
	unsorted_.clear();
}

/**
 * Try all the solutions accumulated so far.  Solutions might be rejected
 * if they, for instance, overlap a previous solution, have too many Ns,
 * fail to overlap a core diagonal, etc.
 */
bool BtBranchTracer::trySolutions(
	bool lookForOlap,
	SwResult& res,
	size_t& off,
	size_t& nrej,
	RandomSource& rnd,
	bool& success)
{
	if(solutions_.size() > 0) {
		for(size_t i = 0; i < solutions_.size(); i++) {
			int ret = trySolution(solutions_[i], lookForOlap, res, off, nrej, rnd);
			if(ret == BT_FOUND) {
				success = true;
				return true; // there were solutions and one was good
			}
		}
		solutions_.clear();
		success = false;
		return true; // there were solutions but none were good
	}
	return false; // there were no solutions to check
}

/**
 * Given the id of a branch that completes a successful backtrace, turn the
 * chain of branches into 
 */
int BtBranchTracer::trySolution(
	size_t id,
	bool lookForOlap,
	SwResult& res,
	size_t& off,
	size_t& nrej,
	RandomSource& rnd)
{
	AlnScore score;
	BtBranch *br = &bs_[id];
	// 'br' corresponds to the leftmost edit in a right-to-left
	// chain of edits.  
	EList<Edit>& ned = res.alres.ned();
	const BtBranch *cur = br, *prev = NULL;
	size_t ns = 0, nrefns = 0;
	size_t ngap = 0;
	while(true) {
		if(cur->e_.inited()) {
			if(cur->e_.isMismatch()) {
				if(cur->e_.qchr == 'N' || cur->e_.chr == 'N') {
					ns++;
				}
			} else if(cur->e_.isGap()) {
				ngap++;
			}
			if(cur->e_.chr == 'N') {
				nrefns++;
			}
			ned.push_back(cur->e_);
		}
		if(cur->root_) {
			break;
		}
		cur = &bs_[cur->parentId_];
	}
	if(ns > prob_.nceil_) {
		// Alignment has too many Ns in it!
		res.reset();
		assert(res.alres.ned().empty());
		nrej++;
		return BT_REJECTED_N;
	}
	// Update 'seenPaths_'
	cur = br;
	bool rejSeen = false; // set =true if we overlap prev path
	bool rejCore = true; // set =true if we don't touch core diag
	while(true) {
		// Consider row, col, len, then do something
		int64_t row = cur->row_, col = cur->col_;
		assert_lt(row, (int64_t)prob_.qrylen_);
		size_t fromend = prob_.qrylen_ - row - 1;
		size_t diag = fromend + col;
		// Calculate the diagonal within the *trimmed* rectangle,
		// i.e. the rectangle we dealt with in align, gather and
		// backtrack.
		int64_t diagi = col - row;
		// Now adjust to the diagonal within the *untrimmed*
		// rectangle by adding on the amount trimmed from the left.
		diagi += prob_.rect_->triml;
		assert_lt(diag, seenPaths_.size());
		// Does it overlap a core diagonal?
		if(diagi >= 0) {
			size_t diag = (size_t)diagi;
			if(diag >= prob_.rect_->corel &&
			   diag <= prob_.rect_->corer)
			{
				// Yes it does - it's OK
				rejCore = false;
			}
		}
		if(lookForOlap) {
			int64_t newlo, newhi;
			if(cur->len_ == 0) {
				if(prev != NULL && prev->len_ > 0) {
					// If there's a gap at the base of a non-0 length branch, the
					// gap will appear to overlap the branch if we give it length 1.
					newhi = newlo = 0;
				} else {
					// Read or ref gap with no matches coming off of it
					newlo = row;
					newhi = row + 1;
				}
			} else {
				// Diagonal with matches
				newlo = row - (cur->len_ - 1);
				newhi = row + 1;
			}
			assert_geq(newlo, 0);
			assert_geq(newhi, 0);
			// Does the diagonal cover cells?
			if(newhi > newlo) {
				// Check whether there is any overlap with previously traversed
				// cells
				bool added = false;
				const size_t sz = seenPaths_[diag].size();
				for(size_t i = 0; i < sz; i++) {
					// Does the new interval overlap this already-seen
					// interval?  Also of interest: does it abut this
					// already-seen interval?  If so, we should merge them.
					size_t lo = seenPaths_[diag][i].first;
					size_t hi = seenPaths_[diag][i].second;
					assert_lt(lo, hi);
					size_t lo_sm = newlo, hi_sm = newhi;
					if(hi - lo < hi_sm - lo_sm) {
						swap(lo, lo_sm);
						swap(hi, hi_sm);
					}
					if((lo <= lo_sm && hi > lo_sm) ||
					   (lo <  hi_sm && hi >= hi_sm))
					{
						// One or both of the shorter interval's end points
						// are contained in the longer interval - so they
						// overlap.
						rejSeen = true;
						// Merge them into one longer interval
						seenPaths_[diag][i].first = min(lo, lo_sm);
						seenPaths_[diag][i].second = max(hi, hi_sm);
#ifndef NDEBUG
						for(int64_t ii = seenPaths_[diag][i].first;
							ii < (int64_t)seenPaths_[diag][i].second;
							ii++)
						{
							//cerr << "trySolution rejected (" << ii << ", " << (ii + col - row) << ")" << endl;
						}
#endif
						added = true;
						break;
					} else if(hi == lo_sm || lo == hi_sm) {
						// Merge them into one longer interval
						seenPaths_[diag][i].first = min(lo, lo_sm);
						seenPaths_[diag][i].second = max(hi, hi_sm);
#ifndef NDEBUG
						for(int64_t ii = seenPaths_[diag][i].first;
							ii < (int64_t)seenPaths_[diag][i].second;
							ii++)
						{
							//cerr << "trySolution rejected (" << ii << ", " << (ii + col - row) << ")" << endl;
						}
#endif
						added = true;
						// Keep going in case it overlaps one of the other
						// intervals
					}
				}
				if(!added) {
					seenPaths_[diag].push_back(make_pair(newlo, newhi));
				}
			}
		}
		// After the merging that may have occurred above, it's no
		// longer guarnateed that all the overlapping intervals in
		// the list have been merged.  That's OK though.  We'll
		// still get correct answers to overlap queries.
		if(cur->root_) {
			assert_eq(0, cur->parentId_);
			break;
		}
		prev = cur;
		cur = &bs_[cur->parentId_];
	} // while(cur->e_.inited())
	if(rejSeen) {
		res.reset();
		assert(res.alres.ned().empty());
		nrej++;
		return BT_NOT_FOUND;
	}
	if(rejCore) {
		res.reset();
		assert(res.alres.ned().empty());
		nrej++;
		return BT_REJECTED_CORE_DIAG;
	}
	off = br->leftmostCol();
	score.score_ = prob_.targ_;
	score.ns_    = ns;
	score.gaps_  = ngap;
	res.alres.setScore(score);
	res.alres.setRefNs(nrefns);
	size_t trimBeg = br->uppermostRow();
	size_t trimEnd = prob_.qrylen_ - prob_.row_ - 1;
	assert_leq(trimBeg, prob_.qrylen_);
	assert_leq(trimEnd, prob_.qrylen_);
	TRefOff refoff = off + prob_.refoff_ + prob_.rect_->refl;
	res.alres.setShape(
		prob_.refid_,                   // ref id
		refoff,                         // 0-based ref offset
		prob_.treflen(),                // ref length
		prob_.fw_,                      // aligned to Watson?
		prob_.qrylen_,                  // read length
        0,                              // read id
		true,                           // pretrim soft?
		0,                              // pretrim 5' end
		0,                              // pretrim 3' end
		true,                           // alignment trim soft?
		prob_.fw_ ? trimBeg : trimEnd,  // alignment trim 5' end
		prob_.fw_ ? trimEnd : trimBeg); // alignment trim 3' end
	return BT_FOUND;
}

/**
 * Get the next valid alignment given a backtrace problem.  Return false
 * if there is no valid solution.  Use a backtracking search to find the
 * solution.  This can be very slow.
 */
bool BtBranchTracer::nextAlignmentBacktrace(
	size_t maxiter,
	SwResult& res,
	size_t& off,
	size_t& nrej,
	size_t& niter,
	RandomSource& rnd)
{
	assert(!empty() || !emptySolution());
	assert(prob_.inited());
	// There's a subtle case where we might fail to backtracing in
	// local-alignment mode.  The basic fact to remember is that when we're
	// backtracing from the highest-scoring cell in the table, we're guaranteed
	// to be able to backtrace without ever dipping below 0.  But if we're
	// backtracing from a cell other than the highest-scoring cell in the
	// table, we might dip below 0.  Dipping below 0 implies that there's a
	// shorted local alignment with a better score.  In which case, it's
	// perfectly fair for us to abandon any path that dips below the floor, and
	// this might result in the queue becoming empty before we finish.
	bool result = false;
	niter = 0;
	while(!empty()) {
		if(trySolutions(true, res, off, nrej, rnd, result)) {
			return result;
		}
		if(niter++ >= maxiter) {
			break;
		}
		size_t brid = best(rnd); // put best branch in 'br'
		assert(!seen_.contains(brid));
		ASSERT_ONLY(seen_.insert(brid));
#if 0
		BtBranch *br = &bs_[brid];
		cerr << brid
		     << ": targ:" << prob_.targ_
		     << ", sc:" << br->score_st_
		     << ", row:" << br->uppermostRow()
			 << ", nmm:" << nmm_
			 << ", nnmm:" << nnmm_
			 << ", nrdop:" << nrdop_
			 << ", nrfop:" << nrfop_
			 << ", nrdex:" << nrdex_
			 << ", nrfex:" << nrfex_
			 << ", nrdop_pr: " << nrdopPrune_
			 << ", nrfop_pr: " << nrfopPrune_
			 << ", nrdex_pr: " << nrdexPrune_
			 << ", nrfex_pr: " << nrfexPrune_
			 << endl;
#endif
		addOffshoots(brid);
	}
	if(trySolutions(true, res, off, nrej, rnd, result)) {
		return result;
	}
	return false;
}

/**
 * Get the next valid alignment given a backtrace problem.  Return false
 * if there is no valid solution.  Use a triangle-fill backtrace to find
 * the solution.  This is usually fast (it's O(m + n)).
 */
bool BtBranchTracer::nextAlignmentFill(
	size_t maxiter,
	SwResult& res,
	size_t& off,
	size_t& nrej,
	size_t& niter,
	RandomSource& rnd)
{
	assert(prob_.inited());
	assert(!emptySolution());
	bool result = false;
	if(trySolutions(false, res, off, nrej, rnd, result)) {
		return result;
	}
	return false;
}

/**
 * Get the next valid alignment given the backtrace problem.  Return false
 * if there is no valid solution, e.g., if 
 */
bool BtBranchTracer::nextAlignment(
	size_t maxiter,
	SwResult& res,
	size_t& off,
	size_t& nrej,
	size_t& niter,
	RandomSource& rnd)
{
	if(prob_.fill_) {
		return nextAlignmentFill(
			maxiter,
			res,
			off,
			nrej,
			niter,
			rnd);
	} else {
		return nextAlignmentBacktrace(
			maxiter,
			res,
			off,
			nrej,
			niter,
			rnd);
	}
}

#ifdef MAIN_ALIGNER_BT

#include <iostream>

int main(int argc, char **argv) {
	size_t off = 0;
	RandomSource rnd(77);
	BtBranchTracer tr;
	Scoring sc = Scoring::base1();
	SwResult res;
	tr.init(
		"ACGTACGT", // in: read sequence
		"IIIIIIII", // in: quality sequence
		8,          // in: read sequence length
		"ACGTACGT", // in: reference sequence
		8,          // in: reference sequence length
		0,          // in: reference id
		0,          // in: reference offset
		true,       // in: orientation
		sc,         // in: scoring scheme
		0,          // in: N ceiling
		8,          // in: alignment score
		7,          // start in this row
		7,          // start in this column
		rnd);       // random gen, to choose among equal paths
	size_t nrej = 0;
	tr.nextAlignment(
		res,
		off,
		nrej,
		rnd);
}

#endif /*def MAIN_ALIGNER_BT*/
