#ifndef MULTIKEY_QSORT_H_
#define MULTIKEY_QSORT_H_

#include <iostream>
#include <seqan/basic.h>
#include <seqan/file.h>
#include <seqan/sequence.h>
#include "sequence_io.h"
#include "alphabet.h"
#include "assert_helpers.h"
#include "diff_sample.h"

using namespace std;
using namespace seqan;

/**
 * Swap elements a and b in seqan::String s
 */
template <typename TStr, typename TPos>
static inline void swap(TStr& s, size_t slen, TPos a, TPos b) {
	typedef typename Value<TStr>::Type TAlphabet;
	assert_lt(a, slen);
	assert_lt(b, slen);
	TAlphabet tmp = s[a];
	s[a] = s[b];
	s[b] = tmp;
}

/**
 * Swap elements a and b in array s
 */
template <typename TVal, typename TPos>
static inline void swap(TVal* s, size_t slen, TPos a, TPos b) {
	assert_lt(a, slen);
	assert_lt(b, slen);
	TVal tmp = s[a];
	s[a] = s[b];
	s[b] = tmp;
}

/**
 * Helper macro for swapping elements a and b in seqan::String s.  Does
 * some additional sainty checking w/r/t begin and end (which are
 * parameters to the sorting routines below).
 */
#define SWAP(s, a, b) { \
	assert_geq(a, begin); \
	assert_geq(b, begin); \
	assert_lt(a, end); \
	assert_lt(b, end); \
	swap(s, slen, a, b); \
}

/**
 * Helper macro for swapping the same pair of elements a and b in
 * two different seqan::Strings s and s2.  This is a helpful variant
 * if, for example, the caller would like to see how their input was
 * permuted by the sort routine (in that case, the caller would let s2
 * be an array s2[] where s2 is the same length as s and s2[i] = i).
 */
#define SWAP2(s, s2, a, b) { \
	SWAP(s, a, b); \
	swap(s2, slen, a, b); \
}

#define SWAP1(s, s2, a, b) { \
	SWAP(s, a, b); \
}

/**
 * Helper macro that swaps a range of elements [i, i+n) with another
 * range [j, j+n) in seqan::String s.
 */
#define VECSWAP(s, i, j, n) { \
	if(n > 0) { vecswap(s, slen, i, j, n, begin, end); } \
}

/**
 * Helper macro that swaps a range of elements [i, i+n) with another
 * range [j, j+n) both in seqan::String s and seqan::String s2.
 */
#define VECSWAP2(s, s2, i, j, n) { \
	if(n > 0) { vecswap2(s, slen, s2, i, j, n, begin, end); } \
}

/**
 * Helper function that swaps a range of elements [i, i+n) with another
 * range [j, j+n) in seqan::String s.  begin and end represent the
 * current range under consideration by the caller (one of the
 * recursive multikey_quicksort routines below).
 */
template <typename TStr, typename TPos>
static inline void vecswap(TStr& s, size_t slen, TPos i, TPos j, TPos n, TPos begin, TPos end) {
	assert_geq(i, begin);
	assert_geq(j, begin);
	assert_lt(i, end);
	assert_lt(j, end);
	while(n-- > 0) {
		assert_geq(n, 0);
		TPos a = i+n;
		TPos b = j+n;
		assert_geq(a, begin);
		assert_geq(b, begin);
		assert_lt(a, end);
		assert_lt(b, end);
		swap(s, slen, a, b);
	}
}

template <typename TVal, typename TPos>
static inline void vecswap(TVal *s, size_t slen, TPos i, TPos j, TPos n, TPos begin, TPos end) {
	assert_geq(i, begin);
	assert_geq(j, begin);
	assert_lt(i, end);
	assert_lt(j, end);
	while(n-- > 0) {
		assert_geq(n, 0);
		TPos a = i+n;
		TPos b = j+n;
		assert_geq(a, begin);
		assert_geq(b, begin);
		assert_lt(a, end);
		assert_lt(b, end);
		swap(s, slen, a, b);
	}
}

/**
 * Helper function that swaps a range of elements [i, i+n) with another
 * range [j, j+n) both in seqan::String s and seqan::String s2.  begin
 * and end represent the current range under consideration by the
 * caller (one of the recursive multikey_quicksort routines below).
 */
template <typename TStr, typename TPos>
static inline void vecswap2(TStr& s, size_t slen, TStr& s2, TPos i, TPos j, TPos n, TPos begin, TPos end) {
	assert_geq(i, begin);
	assert_geq(j, begin);
	assert_lt(i, end);
	assert_lt(j, end);
	while(n-- > 0) {
		assert_geq(n, 0);
		TPos a = i+n;
		TPos b = j+n;
		assert_geq(a, begin);
		assert_geq(b, begin);
		assert_lt(a, end);
		assert_lt(b, end);
		swap(s, slen, a, b);
		swap(s2, slen, a, b);
	}
}

template <typename TVal, typename TPos>
static inline void vecswap2(TVal* s, size_t slen, TVal* s2, TPos i, TPos j, TPos n, TPos begin, TPos end) {
	assert_geq(i, begin);
	assert_geq(j, begin);
	assert_lt(i, end);
	assert_lt(j, end);
	while(n-- > 0) {
		assert_geq(n, 0);
		TPos a = i+n;
		TPos b = j+n;
		assert_geq(a, begin);
		assert_geq(b, begin);
		assert_lt(a, end);
		assert_lt(b, end);
		swap(s, slen, a, b);
		swap(s2, slen, a, b);
	}
}

/// Retrieve an int-ized version of the ath character of string s, or,
/// if a goes off the end of s, return a (user-specified) int greater
/// than any TAlphabet character - 'hi'.
#define CHAR_AT(ss, aa) ((length(s[ss]) > aa) ? (int)(Dna)(s[ss][aa]) : hi)

/// Retrieve an int-ized version of the ath character of string s, or,
/// if a goes off the end of s, return a (user-specified) int greater
/// than any TAlphabet character - 'hi'.
#define CHAR_AT_SUF(si, off) (((off+s[si]) < hlen) ? ((int)(Dna)(host[off+s[si]])) : (hi))

/// Retrieve an int-ized version of the ath character of string s, or,
/// if a goes off the end of s, return a (user-specified) int greater
/// than any TAlphabet character - 'hi'.

#define CHAR_AT_SUF_U8(si, off) char_at_suf_u8(host, hlen, s, si, off, hi)

// Note that CHOOSE_AND_SWAP_RANDOM_PIVOT is unused
#define CHOOSE_AND_SWAP_RANDOM_PIVOT(sw, ch) {                            \
	/* Note: rand() didn't really cut it here; it seemed to run out of */ \
	/* randomness and, after a time, returned the same thing over and */  \
	/* over again */                                                      \
	a = (rand() % n) + begin; /* choose pivot between begin and end */  \
	assert_lt(a, end); assert_geq(a, begin);                              \
	sw(s, s2, begin, a); /* move pivot to beginning */                    \
}

/**
 * Ad-hoc DNA-centric way of choose a pretty good pivot without using
 * the pseudo-random number generator.  We try to get a 1 or 2 if
 * possible, since they'll split things more evenly than a 0 or 4.  We
 * also avoid swapping in the event that we choose the first element.
 */
#define CHOOSE_AND_SWAP_SMART_PIVOT(sw, ch) {                                    \
	a = begin; /* choose first elt */                                            \
	/* now try to find a better elt */                                           \
	if(n >= 5) { /* n is the difference between begin and end */                 \
		if     (ch(begin+1, depth) == 1 || ch(begin+1, depth) == 2) a = begin+1; \
		else if(ch(begin+2, depth) == 1 || ch(begin+2, depth) == 2) a = begin+2; \
		else if(ch(begin+3, depth) == 1 || ch(begin+3, depth) == 2) a = begin+3; \
		else if(ch(begin+4, depth) == 1 || ch(begin+4, depth) == 2) a = begin+4; \
		if(a != begin) sw(s, s2, begin, a); /* move pivot to beginning */        \
	}                                                                            \
	/* the element at [begin] now holds the pivot value */                       \
}

#define CHOOSE_AND_SWAP_PIVOT CHOOSE_AND_SWAP_SMART_PIVOT

#ifndef NDEBUG

/**
 * Assert that the range of chars at depth 'depth' in strings 'begin'
 * to 'end' in string-of-suffix-offsets s is parititioned properly
 * according to the ternary paritioning strategy of Bentley and McIlroy
 * (*prior to* swapping the = regions to the center)
 */
template<typename THost>
bool assertPartitionedSuf(const THost& host,
                          uint32_t *s,
                          size_t slen,
                          int hi,
                          int pivot,
                          size_t begin,
                          size_t end,
                          size_t depth)
{
	size_t hlen = length(host);
	int state = 0; // 0 -> 1st = section, 1 -> < section, 2 -> > section, 3 -> 2nd = section
	for(size_t i = begin; i < end; i++) {
		switch(state) {
		case 0:
			if       (CHAR_AT_SUF(i, depth) < pivot)  { state = 1; break; }
			else if  (CHAR_AT_SUF(i, depth) > pivot)  { state = 2; break; }
			assert_eq(CHAR_AT_SUF(i, depth), pivot);  break;
		case 1:
			if       (CHAR_AT_SUF(i, depth) > pivot)  { state = 2; break; }
			else if  (CHAR_AT_SUF(i, depth) == pivot) { state = 3; break; }
			assert_lt(CHAR_AT_SUF(i, depth), pivot);  break;
		case 2:
			if       (CHAR_AT_SUF(i, depth) == pivot) { state = 3; break; }
			assert_gt(CHAR_AT_SUF(i, depth), pivot);	 break;
		case 3:
			assert_eq(CHAR_AT_SUF(i, depth), pivot);	 break;
		}
	}
	return true;
}

/**
 * Assert that the range of chars at depth 'depth' in strings 'begin'
 * to 'end' in string-of-suffix-offsets s is parititioned properly
 * according to the ternary paritioning strategy of Bentley and McIlroy
 * (*after* swapping the = regions to the center)
 */
template<typename THost>
bool assertPartitionedSuf2(const THost& host,
                           uint32_t *s,
                           size_t slen,
                           int hi,
                           int pivot,
                           size_t begin,
                           size_t end,
                           size_t depth)
{
	size_t hlen = length(host);
	int state = 0; // 0 -> < section, 1 -> = section, 2 -> > section
	for(size_t i = begin; i < end; i++) {
		switch(state) {
		case 0:
			if       (CHAR_AT_SUF(i, depth) == pivot) { state = 1; break; }
			else if  (CHAR_AT_SUF(i, depth) > pivot)  { state = 2; break; }
			assert_lt(CHAR_AT_SUF(i, depth), pivot);  break;
		case 1:
			if       (CHAR_AT_SUF(i, depth) > pivot)  { state = 2; break; }
			assert_eq(CHAR_AT_SUF(i, depth), pivot);  break;
		case 2:
			assert_gt(CHAR_AT_SUF(i, depth), pivot);  break;
		}
	}
	return true;
}
#endif

/**
 * Assert that the seqan::String s of suffix offsets into seqan::String
 * 'host' is a seemingly legitimate suffix-offset list (at this time,
 * we just check that it doesn't list any suffix twice).
 */
static void sanityCheckInputSufs(uint32_t *s, size_t slen) {
	assert_gt(slen, 0);
	for(size_t i = 0; i < slen; i++) {
		// Actually, it's convenient to allow the caller to provide
		// suffix offsets thare are off the end of the host string.
		// See, e.g., build() in diff_sample.cpp.
		//assert_lt(s[i], length(host));
		for(size_t j = i+1; j < slen; j++) {
			assert_neq(s[i], s[j]);
		}
	}
}

/**
 * Assert that the seqan::String s of suffix offsets into seqan::String
 * 'host' really are in lexicographical order up to depth 'upto'.
 */
template <typename T>
void sanityCheckOrderedSufs(const T& host,
                            size_t hlen,
                            uint32_t *s,
                            size_t slen,
                            size_t upto,
                            uint32_t lower = 0,
                            uint32_t upper = 0xffffffff)
{
	assert_lt(s[0], hlen);
	upper = min<uint32_t>(upper, slen-1);
	for(size_t i = lower; i < upper; i++) {
		// Allow s[i+t] to point off the end of the string; this is
		// convenient for some callers
		if(s[i+1] >= hlen) continue;
		if(upto == 0xffffffff) {
			assert(dollarLt(suffix(host, s[i]), suffix(host, s[i+1])));
		} else {
#ifndef NDEBUG
			if(prefix(suffix(host, s[i]), upto) > prefix(suffix(host, s[i+1]), upto)) {
				// operator > treats shorter strings as
				// lexicographically smaller, but we want to opposite
				assert(isPrefix(suffix(host, s[i+1]), suffix(host, s[i])));
			}
#endif
		}
	}
}

/**
 * Main multikey quicksort function for suffixes.  Based on Bentley &
 * Sedgewick's algorithm on p.5 of their paper "Fast Algorithms for
 * Sorting and Searching Strings".  That algorithm has been extended in
 * three ways:
 *
 *  1. Deal with keys of different lengths by checking bounds and
 *     considering off-the-end values to be 'hi' (b/c our goal is the
 *     BWT transform, we're biased toward considring prefixes as
 *     lexicographically *greater* than their extensions).
 *  2. The multikey_qsort_suffixes version takes a single host string
 *     and a list of suffix offsets as input.  This reduces memory
 *     footprint compared to an approach that treats its input
 *     generically as a set of strings (not necessarily suffixes), thus
 *     requiring that we store at least two integers worth of
 *     information for each string.
 *  3. Sorting functions take an extra "upto" parameter that upper-
 *     bounds the depth to which the function sorts.
 *
 * TODO: Consult a tie-breaker (like a difference cover sample) if two
 * keys share a long prefix.
 */
template<typename T>
void mkeyQSortSuf(const T& host,
                  size_t hlen,
                  uint32_t *s,
                  size_t slen,
                  int hi,
                  size_t begin,
                  size_t end,
                  size_t depth,
                  size_t upto = 0xffffffff)
{
	// Helper for making the recursive call; sanity-checks arguments to
	// make sure that the problem actually got smaller.
	#define MQS_RECURSE_SUF(nbegin, nend, ndepth) { \
		assert(nbegin > begin || nend < end || ndepth > depth); \
		if(ndepth < upto) { /* don't exceed depth of 'upto' */ \
			mkeyQSortSuf(host, hlen, s, slen, hi, nbegin, nend, ndepth, upto); \
		} \
	}
	assert_leq(begin, slen);
	assert_leq(end, slen);
	size_t a, b, c, d, /*e,*/ r;
	size_t n = end - begin;
	if(n <= 1) return;                 // 1-element list already sorted
	CHOOSE_AND_SWAP_PIVOT(SWAP1, CHAR_AT_SUF); // pick pivot, swap it into [begin]
	int v = CHAR_AT_SUF(begin, depth); // v <- randomly-selected pivot value
	#ifndef NDEBUG
	{
		bool stillInBounds = false;
		for(size_t i = begin; i < end; i++) {
			if(depth < (hlen-s[i])) {
				stillInBounds = true;
				break;
			} else { /* already fell off this suffix */ }
		}
		assert(stillInBounds); // >=1 suffix must still be in bounds
	}
	#endif
	a = b = begin;
	c = d = end-1;
	while(true) {
		// Invariant: everything before a is = pivot, everything
		// between a and b is <
		int bc = 0; // shouldn't have to init but gcc on Mac complains
		while(b <= c && v >= (bc = CHAR_AT_SUF(b, depth))) {
			if(v == bc) {
				SWAP(s, a, b); a++;
			}
			b++;
		}
		// Invariant: everything after d is = pivot, everything
		// between c and d is >
		int cc = 0; // shouldn't have to init but gcc on Mac complains
		while(b <= c && v <= (cc = CHAR_AT_SUF(c, depth))) {
			if(v == cc) {
				SWAP(s, c, d); d--;
			}
			c--;
		}
		if(b > c) break;
		SWAP(s, b, c);
		b++;
		c--;
	}
	assert(a > begin || c < end-1);                      // there was at least one =s
	assert_lt(d-c, n); // they can't all have been > pivot
	assert_lt(b-a, n); // they can't all have been < pivot
	assert(assertPartitionedSuf(host, s, slen, hi, v, begin, end, depth));  // check pre-=-swap invariant
	r = min(a-begin, b-a); VECSWAP(s, begin, b-r,   r);  // swap left = to center
	r = min(d-c, end-d-1); VECSWAP(s, b,     end-r, r);  // swap right = to center
	assert(assertPartitionedSuf2(host, s, slen, hi, v, begin, end, depth)); // check post-=-swap invariant
	r = b-a; // r <- # of <'s
	if(r > 0) {
		MQS_RECURSE_SUF(begin, begin + r, depth); // recurse on <'s
	}
	// Do not recurse on ='s if the pivot was the off-the-end value;
	// they're already fully sorted
	if(v != hi) {
		MQS_RECURSE_SUF(begin + r, begin + r + (a-begin) + (end-d-1), depth+1); // recurse on ='s
	}
	r = d-c; // r <- # of >'s excluding those exhausted
	if(r > 0 && v < hi-1) {
		MQS_RECURSE_SUF(end-r, end, depth); // recurse on >'s
	}
}

/**
 * Toplevel function for multikey quicksort over suffixes.
 */
template<typename T>
void mkeyQSortSuf(const T& host,
                  uint32_t *s,
                  size_t slen,
                  int hi,
                  bool verbose = false,
                  bool sanityCheck = false,
                  size_t upto = 0xffffffff)
{
	size_t hlen = length(host);
	assert(!empty(s));
	if(sanityCheck) sanityCheckInputSufs(s, slen);
	mkeyQSortSuf(host, hlen, s, slen, hi, (size_t)0, slen, (size_t)0, upto);
	if(sanityCheck) sanityCheckOrderedSufs(host, hlen, s, slen, upto);
}

/**
 * Just like mkeyQSortSuf but all swaps are applied to s2 as well as s.
 * This is a helpful variant if, for example, the caller would like to
 * see how their input was permuted by the sort routine (in that case,
 * the caller would let s2 be an array s2[] where s2 is the same length
 * as s and s2[i] = i).
 */
template<typename T>
void mkeyQSortSuf2(const T& host,
                   size_t hlen,
                   uint32_t *s,
                   size_t slen,
                   uint32_t *s2,
                   int hi,
                   size_t begin,
                   size_t end,
                   size_t depth,
                   size_t upto = 0xffffffff)
{
	// Helper for making the recursive call; sanity-checks arguments to
	// make sure that the problem actually got smaller.
	#define MQS_RECURSE_SUF_DS(nbegin, nend, ndepth) { \
		assert(nbegin > begin || nend < end || ndepth > depth); \
		if(ndepth < upto) { /* don't exceed depth of 'upto' */ \
			mkeyQSortSuf2(host, hlen, s, slen, s2, hi, nbegin, nend, ndepth, upto); \
		} \
	}
	assert_leq(begin, slen);
	assert_leq(end, slen);
	size_t a, b, c, d, /*e,*/ r;
	size_t n = end - begin;
	if(n <= 1) return;                 // 1-element list already sorted
	CHOOSE_AND_SWAP_PIVOT(SWAP2, CHAR_AT_SUF); // pick pivot, swap it into [begin]
	int v = CHAR_AT_SUF(begin, depth); // v <- randomly-selected pivot value
	#ifndef NDEBUG
	{
		bool stillInBounds = false;
		for(size_t i = begin; i < end; i++) {
			if(depth < (hlen-s[i])) {
				stillInBounds = true;
				break;
			} else { /* already fell off this suffix */ }
		}
		assert(stillInBounds); // >=1 suffix must still be in bounds
	}
	#endif
	a = b = begin;
	c = d = /*e =*/ end-1;
	while(true) {
		// Invariant: everything before a is = pivot, everything
		// between a and b is <
		int bc = 0; // shouldn't have to init but gcc on Mac complains
		while(b <= c && v >= (bc = CHAR_AT_SUF(b, depth))) {
			if(v == bc) {
				SWAP2(s, s2, a, b); a++;
			}
			b++;
		}
		// Invariant: everything after d is = pivot, everything
		// between c and d is >
		int cc = 0; // shouldn't have to init but gcc on Mac complains
		while(b <= c && v <= (cc = CHAR_AT_SUF(c, depth))) {
			if(v == cc) {
				SWAP2(s, s2, c, d); d--; /*e--;*/
			}
			//else if(c == e && v == hi) e--;
			c--;
		}
		if(b > c) break;
		SWAP2(s, s2, b, c);
		b++;
		c--;
	}
	assert(a > begin || c < end-1);                      // there was at least one =s
	assert_lt(/*e*/d-c, n); // they can't all have been > pivot
	assert_lt(b-a, n); // they can't all have been < pivot
	assert(assertPartitionedSuf(host, s, slen, hi, v, begin, end, depth));  // check pre-=-swap invariant
	r = min(a-begin, b-a); VECSWAP2(s, s2, begin, b-r,   r);  // swap left = to center
	r = min(d-c, end-d-1); VECSWAP2(s, s2, b,     end-r, r);  // swap right = to center
	assert(assertPartitionedSuf2(host, s, slen, hi, v, begin, end, depth)); // check post-=-swap invariant
	r = b-a; // r <- # of <'s
	if(r > 0) {
		MQS_RECURSE_SUF_DS(begin, begin + r, depth); // recurse on <'s
	}
	// Do not recurse on ='s if the pivot was the off-the-end value;
	// they're already fully sorted
	if(v != hi) {
		MQS_RECURSE_SUF_DS(begin + r, begin + r + (a-begin) + (end-d-1), depth+1); // recurse on ='s
	}
	r = d-c;   // r <- # of >'s excluding those exhausted
	if(r > 0 && v < hi-1) {
		MQS_RECURSE_SUF_DS(end-r, end, depth); // recurse on >'s
	}
}

/**
 * Toplevel function for multikey quicksort over suffixes with double
 * swapping.
 */
template<typename T>
void mkeyQSortSuf2(const T& host,
                   uint32_t *s,
                   size_t slen,
                   uint32_t *s2,
                   int hi,
                   bool verbose = false,
                   bool sanityCheck = false,
                   size_t upto = 0xffffffff)
{
	size_t hlen = length(host);
	if(sanityCheck) sanityCheckInputSufs(s, slen);
	uint32_t *sOrig = NULL;
	if(sanityCheck) {
		sOrig = new uint32_t[slen];
		memcpy(sOrig, s, 4 * slen);
	}
	mkeyQSortSuf2(host, hlen, s, slen, s2, hi, (size_t)0, slen, (size_t)0, upto);
	if(sanityCheck) {
		sanityCheckOrderedSufs(host, hlen, s, slen, upto);
		for(size_t i = 0; i < slen; i++) {
			assert_eq(s[i], sOrig[s2[i]]);
		}
	}
}

// Ugly but necessary; otherwise the compiler chokes dramatically on
// the DifferenceCoverSample<> template args to the next few functions
template <typename T>
class DifferenceCoverSample;

/**
 * Constant time
 */
template<typename T1, typename T2> inline
bool sufDcLt(const T1& host,
             const T2& s1,
             const T2& s2,
             const DifferenceCoverSample<T1>& dc,
             bool sanityCheck = false)
{
	uint32_t diff = dc.tieBreakOff(s1, s2);
	assert_lt(diff, dc.v());
	assert_lt(diff, length(host)-s1);
	assert_lt(diff, length(host)-s2);
	if(sanityCheck) {
		for(uint32_t i = 0; i < diff; i++) {
			assert_eq(host[s1+i], host[s2+i]);
		}
	}
	bool ret = dc.breakTie(s1+diff, s2+diff) < 0;
#ifndef NDEBUG
	if(sanityCheck && ret != dollarLt(suffix(host, s1), suffix(host, s2))) {
		assert(false);
	}
#endif
	return ret;
}

/**
 * k log(k)
 */
template<typename T> inline
void qsortSufDc(const T& host,
                size_t hlen,
                uint32_t* s,
                size_t slen,
                const DifferenceCoverSample<T>& dc,
                size_t begin,
                size_t end,
                bool sanityCheck = false)
{
	assert_leq(end, slen);
	assert_lt(begin, slen);
	assert_gt(end, begin);
	size_t n = end - begin;
	if(n <= 1) return;                 // 1-element list already sorted
	// Note: rand() didn't really cut it here; it seemed to run out of
	// randomness and, after a time, returned the same thing over and
	// over again
	size_t a = (rand() % n) + begin; // choose pivot between begin and end
	assert_lt(a, end);
	assert_geq(a, begin);
	SWAP(s, end-1, a); // move pivot to end
	size_t cur = 0;
	for(size_t i = begin; i < end-1; i++) {
		if(sufDcLt(host, s[i], s[end-1], dc, sanityCheck)) {
			if(sanityCheck)
				assert(dollarLt(suffix(host, s[i]), suffix(host, s[end-1])));
			assert_lt(begin + cur, end-1);
			SWAP(s, i, begin + cur);
			cur++;
		}
	}
	// Put pivot into place
	assert_lt(cur, end-begin);
	SWAP(s, end-1, begin+cur);
	if(begin+cur > begin) qsortSufDc(host, hlen, s, slen, dc, begin, begin+cur);
	if(end > begin+cur+1) qsortSufDc(host, hlen, s, slen, dc, begin+cur+1, end);
}

/**
 * Toplevel function for multikey quicksort over suffixes.
 */
template<typename T1, typename T2>
void mkeyQSortSufDcU8(const T1& seqanHost,
                      const T2& host,
                      size_t hlen,
                      uint32_t* s,
                      size_t slen,
                      const DifferenceCoverSample<T1>& dc,
                      int hi,
                      bool verbose = false,
                      bool sanityCheck = false)
{
	if(sanityCheck) sanityCheckInputSufs(s, slen);
	mkeyQSortSufDcU8(seqanHost, host, hlen, s, slen, dc, hi, 0, slen, 0, sanityCheck);
	if(sanityCheck) sanityCheckOrderedSufs(seqanHost, hlen, s, slen, 0xffffffff);
}

/**
 * Return a boolean indicating whether s1 < s2 using the difference
 * cover to break the tie.
 */
template<typename T1, typename T2> inline
bool sufDcLtU8(const T1& seqanHost,
               const T2& host,
               size_t hlen,
               uint32_t s1,
               uint32_t s2,
               const DifferenceCoverSample<T1>& dc,
               bool sanityCheck = false)
{
	hlen += 0;
	uint32_t diff = dc.tieBreakOff(s1, s2);
	assert_lt(diff, dc.v());
	assert_lt(diff, hlen-s1);
	assert_lt(diff, hlen-s2);
	if(sanityCheck) {
		for(uint32_t i = 0; i < diff; i++) {
			assert_eq(host[s1+i], host[s2+i]);
		}
	}
	bool ret = dc.breakTie(s1+diff, s2+diff) < 0;
	// Sanity-check return value using dollarLt
#ifndef NDEBUG
	if(sanityCheck &&
	   ret != dollarLt(suffix(seqanHost, s1), suffix(seqanHost, s2)))
	{
		assert(false);
	}
#endif
	return ret;
}

/**
 * k log(k)
 */
template<typename T1, typename T2> inline
void qsortSufDcU8(const T1& seqanHost,
                  const T2& host,
                  size_t hlen,
                  uint32_t* s,
                  size_t slen,
                  const DifferenceCoverSample<T1>& dc,
                  size_t begin,
                  size_t end,
                  bool sanityCheck = false)
{
	assert_leq(end, slen);
	assert_lt(begin, slen);
	assert_gt(end, begin);
	size_t n = end - begin;
	if(n <= 1) return;                 // 1-element list already sorted
	// Note: rand() didn't really cut it here; it seemed to run out of
	// randomness and, after a time, returned the same thing over and
	// over again
	size_t a = (rand() % n) + begin; // choose pivot between begin and end
	assert_lt(a, end);
	assert_geq(a, begin);
	SWAP(s, end-1, a); // move pivot to end
	size_t cur = 0;
	for(size_t i = begin; i < end-1; i++) {
		if(sufDcLtU8(seqanHost, host, hlen, s[i], s[end-1], dc, sanityCheck)) {
#ifndef NDEBUG
			if(sanityCheck)
				assert(dollarLt(suffix(seqanHost, s[i]), suffix(seqanHost, s[end-1])));
			assert_lt(begin + cur, end-1);
#endif
			SWAP(s, i, begin + cur);
			cur++;
		}
	}
	// Put pivot into place
	assert_lt(cur, end-begin);
	SWAP(s, end-1, begin+cur);
	if(begin+cur > begin) qsortSufDcU8(seqanHost, host, hlen, s, slen, dc, begin, begin+cur);
	if(end > begin+cur+1) qsortSufDcU8(seqanHost, host, hlen, s, slen, dc, begin+cur+1, end);
}

#define BUCKET_SORT_CUTOFF (4 * 1024 * 1024)
#define SELECTION_SORT_CUTOFF 6

// 5 64-element buckets for bucket-sorting A, C, G, T, $
static uint32_t bkts[4][4 * 1024 * 1024];

/**
 * Straightforwardly obtain a uint8_t-ized version of t[off].  This
 * works fine as long as TStr is not packed.
 */
template<typename TStr>
inline uint8_t get_uint8(const TStr& t, uint32_t off) {
	return t[off];
}

/**
 * For incomprehensible generic-programming reasons, getting a uint8_t
 * version of a character in a packed String<> requires casting first
 * to Dna then to uint8_t.
 */
template<>
inline uint8_t get_uint8(const String<Dna, Packed<> >& t, uint32_t off) {
	return (uint8_t)(Dna)t[off];
}

/**
 * Return character at offset 'off' from the 'si'th suffix in the array
 * 's' of suffixes.  If the character is out-of-bounds, return hi.
 */
template<typename TStr>
static inline int char_at_suf_u8(const TStr& host,
                                 uint32_t hlen,
                                 uint32_t* s,
                                 uint32_t si,
                                 uint32_t off,
                                 uint8_t hi)
{
	return ((off+s[si]) < hlen) ? get_uint8(host, off+s[si]) : (hi);
}

template<typename T1, typename T2>
static void selectionSortSufDcU8(
		const T1& seqanHost,
		const T2& host,
        size_t hlen,
        uint32_t* s,
        size_t slen,
        const DifferenceCoverSample<T1>& dc,
        uint8_t hi,
        size_t begin,
        size_t end,
        size_t depth,
        bool sanityCheck = false)
{
#define ASSERT_SUF_LT(l, r) \
	if(sanityCheck && \
	   !dollarLt(suffix(seqanHost, s[l]), \
	             suffix(seqanHost, s[r]))) { \
		cout << "l: " << suffixStr(seqanHost, s[l]) << endl; \
		cout << "r: " << suffixStr(seqanHost, s[r]) << endl; \
		assert(false); \
	}

	assert_gt(end, begin+1);
	assert_leq(end-begin, SELECTION_SORT_CUTOFF);
	assert_eq(hi, 4);
	uint32_t v = dc.v();
	if(end == begin+2) {
		uint32_t off = dc.tieBreakOff(s[begin], s[begin+1]);
		if(off + s[begin] >= hlen ||
		   off + s[begin+1] >= hlen)
		{
			off = 0xffffffff;
		}
		if(off != 0xffffffff) {
			if(off < depth) {
				qsortSufDcU8<T1,T2>(seqanHost, host, hlen, s, slen, dc,
				                    begin, end, sanityCheck);
				// It's helpful for debugging if we call this here
				if(sanityCheck) {
					sanityCheckOrderedSufs(seqanHost, hlen, s, slen,
					                       0xffffffff, begin, end);
				}
				return;
			}
			v = off - depth + 1;
		}
	}
	assert_leq(v, dc.v());
	uint32_t lim = v;
	assert_geq(lim, 0);
	for(size_t i = begin; i < end-1; i++) {
		uint32_t targ = i;
		uint32_t targoff = depth + s[i];
		for(size_t j = i+1; j < end; j++) {
			assert_neq(j, targ);
			uint32_t joff = depth + s[j];
			uint32_t k;
			for(k = 0; k <= lim; k++) {
				assert_neq(j, targ);
				uint8_t jc = (k + joff < hlen)    ? get_uint8(host, k + joff)    : hi;
				uint8_t tc = (k + targoff < hlen) ? get_uint8(host, k + targoff) : hi;
				assert(jc != hi || tc != hi);
				if(jc > tc) {
					// the jth suffix is greater than the current
					// smallest suffix
					ASSERT_SUF_LT(targ, j);
					break;
				} else if(jc < tc) {
					// the jth suffix is less than the current smallest
					// suffix, so update smallest to be j
					ASSERT_SUF_LT(j, targ);
					targ = j;
					targoff = joff;
					break;
				} else if(k == lim) {
					// Check whether either string ends immediately
					// after this character
					assert_leq(k + joff + 1, hlen);
					assert_leq(k + targoff + 1, hlen);
					if(k + joff + 1 == hlen) {
						// targ < j
						assert_neq(k + targoff + 1, hlen);
						ASSERT_SUF_LT(targ, j);
						break;
					} else if(k + targoff + 1 == hlen) {
						// j < targ
						ASSERT_SUF_LT(j, targ);
						targ = j;
						targoff = joff;
						break;
					}
				} else {
					// They're equal so far, keep going
				}
			}
			// The jth suffix was equal to the current smallest suffix
			// up to the difference-cover period, so disambiguate with
			// difference cover
			if(k == lim+1) {
				assert_neq(j, targ);
				if(sufDcLtU8(seqanHost, host, hlen, s[j], s[targ], dc, sanityCheck)) {
					// j < targ
					assert(!sufDcLtU8(seqanHost, host, hlen, s[targ], s[j], dc, sanityCheck));
					ASSERT_SUF_LT(j, targ);
					targ = j;
					targoff = joff;
				} else {
					assert(sufDcLtU8(seqanHost, host, hlen, s[targ], s[j], dc, sanityCheck));
					ASSERT_SUF_LT(targ, j); // !
				}
			}
		}
		if(i != targ) {
			ASSERT_SUF_LT(targ, i);
			// swap i and targ
			uint32_t tmp = s[i];
			s[i] = s[targ];
			s[targ] = tmp;
		}
		for(size_t j = i+1; j < end; j++) {
			ASSERT_SUF_LT(i, j);
		}
	}
	if(sanityCheck) {
		sanityCheckOrderedSufs(seqanHost, hlen, s, slen,
		                       0xffffffff, begin, end);
	}
}

template<typename T1, typename T2>
static void bucketSortSufDcU8(
		const T1& seqanHost,
		const T2& host,
        size_t hlen,
        uint32_t* s,
        size_t slen,
        const DifferenceCoverSample<T1>& dc,
        uint8_t hi,
        size_t begin,
        size_t end,
        size_t depth,
        bool sanityCheck = false)
{
	uint32_t cnts[] = { 0, 0, 0, 0, 0 };
	#define BKT_RECURSE_SUF_DC_U8(nbegin, nend) { \
		bucketSortSufDcU8<T1,T2>(seqanHost, host, hlen, s, slen, dc, hi, \
		                         (nbegin), (nend), depth+1, sanityCheck); \
	}
	assert_gt(end, begin);
	assert_leq(end-begin, BUCKET_SORT_CUTOFF);
	assert_eq(hi, 4);
	if(end == begin+1) return; // 1-element list already sorted
	if(depth > dc.v()) {
		// Quicksort the remaining suffixes using difference cover
		// for constant-time comparisons; this is O(k*log(k)) where
		// k=(end-begin)
		qsortSufDcU8<T1,T2>(seqanHost, host, hlen, s, slen, dc, begin, end, sanityCheck);
		return;
	}
	if(end-begin <= SELECTION_SORT_CUTOFF) {
		// Bucket sort remaining items
		selectionSortSufDcU8(seqanHost, host, hlen, s, slen, dc, hi,
		                     begin, end, depth, sanityCheck);
		if(sanityCheck) {
			sanityCheckOrderedSufs(seqanHost, hlen, s, slen,
			                       0xffffffff, begin, end);
		}
		return;
	}
	for(size_t i = begin; i < end; i++) {
		uint32_t off = depth+s[i];
		uint8_t c = (off < hlen) ? get_uint8(host, off) : hi;
		assert_leq(c, 4);
		if(c == 0) {
			s[begin + cnts[0]++] = s[i];
		} else {
			bkts[c-1][cnts[c]++] = s[i];
		}
	}
	assert_eq(cnts[0] + cnts[1] + cnts[2] + cnts[3] + cnts[4], end - begin);
	uint32_t cur = begin + cnts[0];
	if(cnts[1] > 0) { memcpy(&s[cur], bkts[0], cnts[1] << 2); cur += cnts[1]; }
	if(cnts[2] > 0) { memcpy(&s[cur], bkts[1], cnts[2] << 2); cur += cnts[2]; }
	if(cnts[3] > 0) { memcpy(&s[cur], bkts[2], cnts[3] << 2); cur += cnts[3]; }
	if(cnts[4] > 0) { memcpy(&s[cur], bkts[3], cnts[4] << 2); }
	// This frame is now totally finished with bkts[][], so recursive
	// callees can safely clobber it; we're not done with cnts[], but
	// that's local to the stack frame.
	cur = begin;
	if(cnts[0] > 0) {
		BKT_RECURSE_SUF_DC_U8(cur, cur + cnts[0]); cur += cnts[0];
	}
	if(cnts[1] > 0) {
		BKT_RECURSE_SUF_DC_U8(cur, cur + cnts[1]); cur += cnts[1];
	}
	if(cnts[2] > 0) {
		BKT_RECURSE_SUF_DC_U8(cur, cur + cnts[2]); cur += cnts[2];
	}
	if(cnts[3] > 0) {
		BKT_RECURSE_SUF_DC_U8(cur, cur + cnts[3]);
	}
	// Done
}

/**
 * Main multikey quicksort function for suffixes.  Based on Bentley &
 * Sedgewick's algorithm on p.5 of their paper "Fast Algorithms for
 * Sorting and Searching Strings".  That algorithm has been extended in
 * three ways:
 *
 *  1. Deal with keys of different lengths by checking bounds and
 *     considering off-the-end values to be 'hi' (b/c our goal is the
 *     BWT transform, we're biased toward considring prefixes as
 *     lexicographically *greater* than their extensions).
 *  2. The multikey_qsort_suffixes version takes a single host string
 *     and a list of suffix offsets as input.  This reduces memory
 *     footprint compared to an approach that treats its input
 *     generically as a set of strings (not necessarily suffixes), thus
 *     requiring that we store at least two integers worth of
 *     information for each string.
 *  3. Sorting functions take an extra "upto" parameter that upper-
 *     bounds the depth to which the function sorts.
 */
template<typename T1, typename T2>
void mkeyQSortSufDcU8(const T1& seqanHost,
                      const T2& host,
                      size_t hlen,
                      uint32_t* s,
                      size_t slen,
                      const DifferenceCoverSample<T1>& dc,
                      int hi,
                      size_t begin,
                      size_t end,
                      size_t depth,
                      bool sanityCheck = false)
{
	// Helper for making the recursive call; sanity-checks arguments to
	// make sure that the problem actually got smaller.
	#define MQS_RECURSE_SUF_DC_U8(nbegin, nend, ndepth) { \
		assert(nbegin > begin || nend < end || ndepth > depth); \
		mkeyQSortSufDcU8(seqanHost, host, hlen, s, slen, dc, hi, nbegin, nend, ndepth, sanityCheck); \
	}
	assert_leq(begin, slen);
	assert_leq(end, slen);
	size_t n = end - begin;
	if(n <= 1) return; // 1-element list already sorted
	if(depth > dc.v()) {
		// Quicksort the remaining suffixes using difference cover
		// for constant-time comparisons; this is O(k*log(k)) where
		// k=(end-begin)
		qsortSufDcU8<T1,T2>(seqanHost, host, hlen, s, slen, dc, begin, end, sanityCheck);
		if(sanityCheck) {
			sanityCheckOrderedSufs(seqanHost, hlen, s, slen, 0xffffffff, begin, end);
		}
		return;
	}
	if(n <= BUCKET_SORT_CUTOFF) {
		// Bucket sort remaining items
		bucketSortSufDcU8(seqanHost, host, hlen, s, slen, dc,
		                  (uint8_t)hi, begin, end, depth, sanityCheck);
		if(sanityCheck) {
			sanityCheckOrderedSufs(seqanHost, hlen, s, slen, 0xffffffff, begin, end);
		}
		return;
	}
	size_t a, b, c, d, r;
	CHOOSE_AND_SWAP_PIVOT(SWAP1, CHAR_AT_SUF_U8); // choose pivot, swap to begin
	int v = CHAR_AT_SUF_U8(begin, depth); // v <- pivot value
	#ifndef NDEBUG
	{
		bool stillInBounds = false;
		for(size_t i = begin; i < end; i++) {
			if(depth < (hlen-s[i])) {
				stillInBounds = true;
				break;
			} else { /* already fell off this suffix */ }
		}
		assert(stillInBounds); // >=1 suffix must still be in bounds
	}
	#endif
	a = b = begin;
	c = d = end-1;
	while(true) {
		// Invariant: everything before a is = pivot, everything
		// between a and b is <
		int bc = 0; // shouldn't have to init but gcc on Mac complains
		while(b <= c && v >= (bc = CHAR_AT_SUF_U8(b, depth))) {
			if(v == bc) {
				SWAP(s, a, b); a++;
			}
			b++;
		}
		// Invariant: everything after d is = pivot, everything
		// between c and d is >
		int cc = 0; // shouldn't have to init but gcc on Mac complains
		bool hiLatch = true;
		while(b <= c && v <= (cc = CHAR_AT_SUF_U8(c, depth))) {
			if(v == cc) {
				SWAP(s, c, d); d--;
			}
			else if(hiLatch && cc == hi) {

			}
			c--;
		}
		if(b > c) break;
		SWAP(s, b, c);
		b++;
		c--;
	}
	assert(a > begin || c < end-1);                      // there was at least one =s
	assert_lt(d-c, n); // they can't all have been > pivot
	assert_lt(b-a, n); // they can't all have been < pivot
	//assert(assertPartitionedSuf(host, s, slen, hi, v, begin, end, depth));  // check pre-=-swap invariant
	r = min(a-begin, b-a); VECSWAP(s, begin, b-r,   r);  // swap left = to center
	r = min(d-c, end-d-1); VECSWAP(s, b,     end-r, r);  // swap right = to center
	//assert(assertPartitionedSuf2(host, s, slen, hi, v, begin, end, depth)); // check post-=-swap invariant
	r = b-a; // r <- # of <'s
	if(r > 0) {
		MQS_RECURSE_SUF_DC_U8(begin, begin + r, depth); // recurse on <'s
	}
	// Do not recurse on ='s if the pivot was the off-the-end value;
	// they're already fully sorted
	if(v != hi) {
		MQS_RECURSE_SUF_DC_U8(begin + r, begin + r + (a-begin) + (end-d-1), depth+1); // recurse on ='s
	}
	r = d-c; // r <- # of >'s excluding those exhausted
	if(r > 0 && v < hi-1) {
		MQS_RECURSE_SUF_DC_U8(end-r, end, depth); // recurse on >'s
	}
}


#endif /*MULTIKEY_QSORT_H_*/
