#ifndef EBWT_H_ #define EBWT_H_ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef BOWTIE_MM #include #include #endif #include "auto_array.h" #include "shmem.h" #include "alphabet.h" #include "assert_helpers.h" #include "bitpack.h" #include "blockwise_sa.h" #include "endian_swap.h" #include "word_io.h" #include "random_source.h" #include "hit.h" #include "ref_read.h" #include "threading.h" #include "bitset.h" #include "str_util.h" #include "mm.h" #include "timer.h" #include "refmap.h" #include "color_dec.h" #include "reference.h" using namespace std; using namespace seqan; #ifndef PREFETCH_LOCALITY // No locality by default #define PREFETCH_LOCALITY 2 #endif // From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl extern uint8_t cCntLUT_4[4][4][256]; #ifndef VMSG_NL #define VMSG_NL(args...) \ if(this->verbose()) { \ stringstream tmp; \ tmp << args << endl; \ this->verbose(tmp.str()); \ } #endif #ifndef VMSG #define VMSG(args...) \ if(this->verbose()) { \ stringstream tmp; \ tmp << args; \ this->verbose(tmp.str()); \ } #endif /** * Flags describing type of Ebwt. */ enum EBWT_FLAGS { EBWT_COLOR = 2, // true -> Ebwt is colorspace EBWT_ENTIRE_REV = 4 // true -> reverse Ebwt is the whole // concatenated string reversed, rather than // each stretch reversed }; /** * Extended Burrows-Wheeler transform header. This together with the * actual data arrays and other text-specific parameters defined in * class Ebwt constitute the entire Ebwt. 
*/ class EbwtParams { public: EbwtParams() { } EbwtParams(uint32_t len, int32_t lineRate, int32_t linesPerSide, int32_t offRate, int32_t isaRate, int32_t ftabChars, bool color, bool entireReverse) { init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireReverse); } EbwtParams(const EbwtParams& eh) { init(eh._len, eh._lineRate, eh._linesPerSide, eh._offRate, eh._isaRate, eh._ftabChars, eh._color, eh._entireReverse); } void init(uint32_t len, int32_t lineRate, int32_t linesPerSide, int32_t offRate, int32_t isaRate, int32_t ftabChars, bool color, bool entireReverse) { _color = color; _entireReverse = entireReverse; _len = len; _bwtLen = _len + 1; _sz = (len+3)/4; _bwtSz = (len/4 + 1); _lineRate = lineRate; _linesPerSide = linesPerSide; _origOffRate = offRate; _offRate = offRate; _offMask = 0xffffffff << _offRate; _isaRate = isaRate; _isaMask = 0xffffffff << ((_isaRate >= 0) ? _isaRate : 0); _ftabChars = ftabChars; _eftabLen = _ftabChars*2; _eftabSz = _eftabLen*4; _ftabLen = (1 << (_ftabChars*2))+1; _ftabSz = _ftabLen*4; _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate; _offsSz = _offsLen*4; _isaLen = (_isaRate == -1)? 
0 : ((_bwtLen + (1 << _isaRate) - 1) >> _isaRate); _isaSz = _isaLen*4; _lineSz = 1 << _lineRate; _sideSz = _lineSz * _linesPerSide; _sideBwtSz = _sideSz - 8; _sideBwtLen = _sideBwtSz*4; _numSidePairs = (_bwtSz+(2*_sideBwtSz)-1)/(2*_sideBwtSz); _numSides = _numSidePairs*2; _numLines = _numSides * _linesPerSide; _ebwtTotLen = _numSidePairs * (2*_sideSz); _ebwtTotSz = _ebwtTotLen; assert(repOk()); } uint32_t len() const { return _len; } uint32_t bwtLen() const { return _bwtLen; } uint32_t sz() const { return _sz; } uint32_t bwtSz() const { return _bwtSz; } int32_t lineRate() const { return _lineRate; } int32_t linesPerSide() const { return _linesPerSide; } int32_t origOffRate() const { return _origOffRate; } int32_t offRate() const { return _offRate; } uint32_t offMask() const { return _offMask; } int32_t isaRate() const { return _isaRate; } uint32_t isaMask() const { return _isaMask; } int32_t ftabChars() const { return _ftabChars; } uint32_t eftabLen() const { return _eftabLen; } uint32_t eftabSz() const { return _eftabSz; } uint32_t ftabLen() const { return _ftabLen; } uint32_t ftabSz() const { return _ftabSz; } uint32_t offsLen() const { return _offsLen; } uint32_t offsSz() const { return _offsSz; } uint32_t isaLen() const { return _isaLen; } uint32_t isaSz() const { return _isaSz; } uint32_t lineSz() const { return _lineSz; } uint32_t sideSz() const { return _sideSz; } uint32_t sideBwtSz() const { return _sideBwtSz; } uint32_t sideBwtLen() const { return _sideBwtLen; } uint32_t numSidePairs() const { return _numSidePairs; } uint32_t numSides() const { return _numSides; } uint32_t numLines() const { return _numLines; } uint32_t ebwtTotLen() const { return _ebwtTotLen; } uint32_t ebwtTotSz() const { return _ebwtTotSz; } bool color() const { return _color; } bool entireReverse() const { return _entireReverse; } /** * Set a new suffix-array sampling rate, which involves updating * rate, mask, sample length, and sample size. 
*/ void setOffRate(int __offRate) { _offRate = __offRate; _offMask = 0xffffffff << _offRate; _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate; _offsSz = _offsLen*4; } /** * Set a new inverse suffix-array sampling rate, which involves * updating rate, mask, sample length, and sample size. */ void setIsaRate(int __isaRate) { _isaRate = __isaRate; _isaMask = 0xffffffff << _isaRate; _isaLen = (_bwtLen + (1 << _isaRate) - 1) >> _isaRate; _isaSz = _isaLen*4; } /// Check that this EbwtParams is internally consistent bool repOk() const { assert_gt(_len, 0); assert_gt(_lineRate, 3); assert_geq(_offRate, 0); assert_leq(_ftabChars, 16); assert_geq(_ftabChars, 1); assert_lt(_lineRate, 32); assert_lt(_linesPerSide, 32); assert_lt(_ftabChars, 32); assert_eq(0, _ebwtTotSz % (2*_lineSz)); return true; } /** * Pretty-print the header contents to the given output stream. */ void print(ostream& out) const { out << "Headers:" << endl << " len: " << _len << endl << " bwtLen: " << _bwtLen << endl << " sz: " << _sz << endl << " bwtSz: " << _bwtSz << endl << " lineRate: " << _lineRate << endl << " linesPerSide: " << _linesPerSide << endl << " offRate: " << _offRate << endl << " offMask: 0x" << hex << _offMask << dec << endl << " isaRate: " << _isaRate << endl << " isaMask: 0x" << hex << _isaMask << dec << endl << " ftabChars: " << _ftabChars << endl << " eftabLen: " << _eftabLen << endl << " eftabSz: " << _eftabSz << endl << " ftabLen: " << _ftabLen << endl << " ftabSz: " << _ftabSz << endl << " offsLen: " << _offsLen << endl << " offsSz: " << _offsSz << endl << " isaLen: " << _isaLen << endl << " isaSz: " << _isaSz << endl << " lineSz: " << _lineSz << endl << " sideSz: " << _sideSz << endl << " sideBwtSz: " << _sideBwtSz << endl << " sideBwtLen: " << _sideBwtLen << endl << " numSidePairs: " << _numSidePairs << endl << " numSides: " << _numSides << endl << " numLines: " << _numLines << endl << " ebwtTotLen: " << _ebwtTotLen << endl << " ebwtTotSz: " << _ebwtTotSz << endl << " 
reverse: " << _entireReverse << endl; } uint32_t _len; uint32_t _bwtLen; uint32_t _sz; uint32_t _bwtSz; int32_t _lineRate; int32_t _linesPerSide; int32_t _origOffRate; int32_t _offRate; uint32_t _offMask; int32_t _isaRate; uint32_t _isaMask; int32_t _ftabChars; uint32_t _eftabLen; uint32_t _eftabSz; uint32_t _ftabLen; uint32_t _ftabSz; uint32_t _offsLen; uint32_t _offsSz; uint32_t _isaLen; uint32_t _isaSz; uint32_t _lineSz; uint32_t _sideSz; uint32_t _sideBwtSz; uint32_t _sideBwtLen; uint32_t _numSidePairs; uint32_t _numSides; uint32_t _numLines; uint32_t _ebwtTotLen; uint32_t _ebwtTotSz; bool _color; bool _entireReverse; }; /** * Exception to throw when a file-realted error occurs. */ class EbwtFileOpenException : public std::runtime_error { public: EbwtFileOpenException(const std::string& msg = "") : std::runtime_error(msg) { } }; /** * Calculate size of file with given name. */ static inline int64_t fileSize(const char* name) { std::ifstream f; f.open(name, std::ios_base::binary | std::ios_base::in); if (!f.good() || f.eof() || !f.is_open()) { return 0; } f.seekg(0, std::ios_base::beg); std::ifstream::pos_type begin_pos = f.tellg(); f.seekg(0, std::ios_base::end); return static_cast(f.tellg() - begin_pos); } // Forward declarations for Ebwt class struct SideLocus; template class EbwtSearchParams; /** * Extended Burrows-Wheeler transform data. * * An Ebwt may be transferred to and from RAM with calls to * evictFromMemory() and loadIntoMemory(). By default, a newly-created * Ebwt is not loaded into memory; if the user would like to use a * newly-created Ebwt to answer queries, they must first call * loadIntoMemory(). 
*/ template class Ebwt { public: typedef typename Value::Type TAlphabet; #define Ebwt_INITS \ _toBigEndian(currentlyBigEndian()), \ _overrideOffRate(__overrideOffRate), \ _overrideIsaRate(__overrideIsaRate), \ _verbose(verbose), \ _passMemExc(passMemExc), \ _sanity(sanityCheck), \ _fw(__fw), \ _in1(MM_FILE_INIT), \ _in2(MM_FILE_INIT), \ _zOff(0xffffffff), \ _zEbwtByteOff(0xffffffff), \ _zEbwtBpOff(-1), \ _nPat(0), \ _nFrag(0), \ _plen(NULL), \ _rstarts(NULL), \ _fchr(NULL), \ _ftab(NULL), \ _eftab(NULL), \ _offs(NULL), \ _isa(NULL), \ _ebwt(NULL), \ _useMm(false), \ useShmem_(false), \ _refnames(), \ rmap_(NULL), \ mmFile1_(NULL), \ mmFile2_(NULL) #ifdef EBWT_STATS #define Ebwt_STAT_INITS \ ,mapLFExs_(0llu), \ mapLFs_(0llu), \ mapLFcs_(0llu), \ mapLF1cs_(0llu), \ mapLF1s_(0llu) #else #define Ebwt_STAT_INITS #endif /// Construct an Ebwt from the given input file Ebwt(const string& in, int color, int needEntireReverse, bool __fw, int32_t __overrideOffRate = -1, int32_t __overrideIsaRate = -1, bool useMm = false, bool useShmem = false, bool mmSweep = false, bool loadNames = false, const ReferenceMap* rmap = NULL, bool verbose = false, bool startVerbose = false, bool passMemExc = false, bool sanityCheck = false) : Ebwt_INITS Ebwt_STAT_INITS { assert(!useMm || !useShmem); rmap_ = rmap; _useMm = useMm; useShmem_ = useShmem; _in1Str = in + ".1.ebwt"; _in2Str = in + ".2.ebwt"; readIntoMemory( color, // expect colorspace reference? __fw ? -1 : needEntireReverse, // need REF_READ_REVERSE true, // stop after loading the header portion? 
&_eh, // params structure to fill in mmSweep, // mmSweep loadNames, // loadNames startVerbose); // startVerbose // If the offRate has been overridden, reflect that in the // _eh._offRate field if(_overrideOffRate > _eh._offRate) { _eh.setOffRate(_overrideOffRate); assert_eq(_overrideOffRate, _eh._offRate); } // Same with isaRate if(_overrideIsaRate > _eh._isaRate) { _eh.setIsaRate(_overrideIsaRate); assert_eq(_overrideIsaRate, _eh._isaRate); } assert(repOk()); } /// Construct an Ebwt from the given header parameters and string /// vector, optionally using a blockwise suffix sorter with the /// given 'bmax' and 'dcv' parameters. The string vector is /// ultimately joined and the joined string is passed to buildToDisk(). Ebwt(int color, int32_t lineRate, int32_t linesPerSide, int32_t offRate, int32_t isaRate, int32_t ftabChars, const string& file, // base filename for EBWT files bool __fw, bool useBlockwise, uint32_t bmax, uint32_t bmaxSqrtMult, uint32_t bmaxDivN, int dcv, vector& is, vector& szs, vector& plens, uint32_t sztot, const RefReadInParams& refparams, uint32_t seed, int32_t __overrideOffRate = -1, int32_t __overrideIsaRate = -1, bool verbose = false, bool passMemExc = false, bool sanityCheck = false) : Ebwt_INITS Ebwt_STAT_INITS, _eh(joinedLen(szs), lineRate, linesPerSide, offRate, isaRate, ftabChars, color, refparams.reverse == REF_READ_REVERSE) { _in1Str = file + ".1.ebwt"; _in2Str = file + ".2.ebwt"; // Open output files ofstream fout1(_in1Str.c_str(), ios::binary); if(!fout1.good()) { cerr << "Could not open index file for writing: \"" << _in1Str << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "Bowtie." << endl; throw 1; } ofstream fout2(_in2Str.c_str(), ios::binary); if(!fout2.good()) { cerr << "Could not open index file for writing: \"" << _in2Str << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "Bowtie." 
<< endl; throw 1; } // Build initFromVector( is, szs, plens, sztot, refparams, fout1, fout2, useBlockwise, bmax, bmaxSqrtMult, bmaxDivN, dcv, seed); // Close output files fout1.flush(); int64_t tellpSz1 = (int64_t)fout1.tellp(); VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary EBWT file: " << _in1Str); fout1.close(); bool err = false; if(tellpSz1 > fileSize(_in1Str.c_str())) { err = true; cerr << "Index is corrupt: File size for " << _in1Str << " should have been " << tellpSz1 << " but is actually " << fileSize(_in1Str.c_str()) << "." << endl; } fout2.flush(); int64_t tellpSz2 = (int64_t)fout2.tellp(); VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary EBWT file: " << _in2Str); fout2.close(); if(tellpSz2 > fileSize(_in2Str.c_str())) { err = true; cerr << "Index is corrupt: File size for " << _in2Str << " should have been " << tellpSz2 << " but is actually " << fileSize(_in2Str.c_str()) << "." << endl; } if(err) { cerr << "Please check if there is a problem with the disk or if disk is full." << endl; throw 1; } // Reopen as input streams VMSG_NL("Re-opening _in1 and _in2 as input streams"); if(_sanity) { VMSG_NL("Sanity-checking Ebwt"); assert(!isInMemory()); readIntoMemory( color, __fw ? -1 : refparams.reverse == REF_READ_REVERSE, false, NULL, false, true, false); sanityCheckAll(refparams.reverse); evictFromMemory(); assert(!isInMemory()); } VMSG_NL("Returning from Ebwt constructor"); } bool isPacked(); /** * Write the rstarts array given the szs array for the reference. 
*/ void szsToDisk(const vector& szs, ostream& os, int reverse) { size_t seq = 0; uint32_t off = 0; uint32_t totlen = 0; for(unsigned int i = 0; i < szs.size(); i++) { if(szs[i].len == 0) continue; if(szs[i].first) off = 0; off += szs[i].off; #ifdef ACCOUNT_FOR_ALL_GAP_REFS if(szs[i].first && szs[i].len > 0) seq++; #else if(szs[i].first) seq++; #endif size_t seqm1 = seq-1; assert_lt(seqm1, _nPat); size_t fwoff = off; if(reverse == REF_READ_REVERSE) { // Invert pattern idxs seqm1 = _nPat - seqm1 - 1; // Invert pattern idxs assert_leq(off + szs[i].len, _plen[seqm1]); fwoff = _plen[seqm1] - (off + szs[i].len); } writeU32(os, totlen, this->toBe()); // offset from beginning of joined string writeU32(os, seqm1, this->toBe()); // sequence id writeU32(os, fwoff, this->toBe()); // offset into sequence totlen += szs[i].len; off += szs[i].len; } } /** * Helper for the constructors above. Takes a vector of text * strings and joins them into a single string with a call to * joinToDisk, which does a join (with padding) and writes some of * the resulting data directly to disk rather than keep it in * memory. It then constructs a suffix-array producer (what kind * depends on 'useBlockwise') for the resulting sequence. The * suffix-array producer can then be used to obtain chunks of the * joined string's suffix array. 
*/ void initFromVector( vector& is, vector& szs, vector& plens, uint32_t sztot, const RefReadInParams& refparams, ofstream& out1, ofstream& out2, bool useBlockwise, uint32_t bmax, uint32_t bmaxSqrtMult, uint32_t bmaxDivN, int dcv, uint32_t seed) { // Compose text strings into single string VMSG_NL("Calculating joined length"); TStr s; // holds the entire joined reference after call to joinToDisk uint32_t jlen; jlen = joinedLen(szs); assert_geq(jlen, sztot); VMSG_NL("Writing header"); writeFromMemory(true, out1, out2); try { VMSG_NL("Reserving space for joined string"); seqan::reserve(s, jlen, Exact()); VMSG_NL("Joining reference sequences"); if(refparams.reverse == REF_READ_REVERSE) { { Timer timer(cout, " Time to join reference sequences: ", _verbose); joinToDisk(is, szs, plens, sztot, refparams, s, out1, out2, seed); } { Timer timer(cout, " Time to reverse reference sequence: ", _verbose); vector tmp; reverseInPlace(s); reverseRefRecords(szs, tmp, false, false); szsToDisk(tmp, out1, refparams.reverse); } } else { Timer timer(cout, " Time to join reference sequences: ", _verbose); joinToDisk(is, szs, plens, sztot, refparams, s, out1, out2, seed); szsToDisk(szs, out1, refparams.reverse); } // Joined reference sequence now in 's' } catch(bad_alloc& e) { // If we throw an allocation exception in the try block, // that means that the joined version of the reference // string itself is too larger to fit in memory. The only // alternatives are to tell the user to give us more memory // or to try again with a packed representation of the // reference (if we haven't tried that already). cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl; if(!isPacked() && _passMemExc) { // Pass the exception up so that we can retry using a // packed string representation throw e; } // There's no point passing this exception on. 
The fact // that we couldn't allocate the joined string means that // --bmax is irrelevant - the user should re-run with // ebwt-build-packed if(isPacked()) { cerr << "Please try running bowtie-build on a computer with more memory." << endl; } else { cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl << "mode (-a/--auto), or try again on a computer with more memory." << endl; } if(sizeof(void*) == 4) { cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl << "this executable is 32-bit." << endl; } throw 1; } // Succesfully obtained joined reference string assert_geq(length(s), jlen); if(bmax != 0xffffffff) { VMSG_NL("bmax according to bmax setting: " << bmax); } else if(bmaxSqrtMult != 0xffffffff) { bmax *= bmaxSqrtMult; VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax); } else if(bmaxDivN != 0xffffffff) { bmax = max(jlen / bmaxDivN, 1); VMSG_NL("bmax according to bmaxDivN setting: " << bmax); } else { bmax = (uint32_t)sqrt(length(s)); VMSG_NL("bmax defaulted to: " << bmax); } int iter = 0; bool first = true; // Look for bmax/dcv parameters that work. while(true) { if(!first && bmax < 40 && _passMemExc) { cerr << "Could not find approrpiate bmax/dcv settings for building this index." << endl; if(!isPacked()) { // Throw an exception exception so that we can // retry using a packed string representation throw bad_alloc(); } else { cerr << "Already tried a packed string representation." << endl; } cerr << "Please try indexing this reference on a computer with more memory." << endl; if(sizeof(void*) == 4) { cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl << "this executable is 32-bit." 
<< endl; } throw 1; } if(dcv > 4096) dcv = 4096; if((iter % 6) == 5 && dcv < 4096 && dcv != 0) { dcv <<= 1; // double difference-cover period } else { bmax -= (bmax >> 2); // reduce by 25% } VMSG("Using parameters --bmax " << bmax); if(dcv == 0) { VMSG_NL(" and *no difference cover*"); } else { VMSG_NL(" --dcv " << dcv); } iter++; try { { VMSG_NL(" Doing ahead-of-time memory usage test"); // Make a quick-and-dirty attempt to force a bad_alloc iff // we would have thrown one eventually as part of // constructing the DifferenceCoverSample dcv <<= 1; size_t sz = DifferenceCoverSample::simulateAllocs(s, dcv >> 1); AutoArray tmp(sz); dcv >>= 1; // Likewise with the KarkkainenBlockwiseSA sz = KarkkainenBlockwiseSA::simulateAllocs(s, bmax); AutoArray tmp2(sz); // Now throw in the 'ftab' and 'isaSample' structures // that we'll eventually allocate in buildToDisk AutoArray ftab(_eh._ftabLen * 2); AutoArray side(_eh._sideSz); // Grab another 20 MB out of caution AutoArray extra(20*1024*1024); // If we made it here without throwing bad_alloc, then we // passed the memory-usage stress test VMSG(" Passed! Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv); if(isPacked()) { VMSG(" --packed"); } VMSG_NL(""); } VMSG_NL("Constructing suffix-array element generator"); KarkkainenBlockwiseSA bsa(s, bmax, dcv, seed, _sanity, _passMemExc, _verbose); assert(bsa.suffixItrIsReset()); assert_eq(bsa.size(), length(s)+1); VMSG_NL("Converting suffix-array elements to index image"); buildToDisk(bsa, s, out1, out2); out1.flush(); out2.flush(); if(out1.fail() || out2.fail()) { cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl; throw 1; } break; } catch(bad_alloc& e) { if(_passMemExc) { VMSG_NL(" Ran out of memory; automatically trying more memory-economical parameters."); } else { cerr << "Out of memory while constructing suffix array. 
Please try using a smaller" << endl << "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl; throw 1; } } first = false; } assert(repOk()); // Now write reference sequence names on the end #ifdef ACCOUNT_FOR_ALL_GAP_REFS assert_geq(this->_refnames.size(), this->_nPat); #else assert_eq(this->_refnames.size(), this->_nPat); #endif for(size_t i = 0; i < this->_refnames.size(); i++) { out1 << this->_refnames[i] << endl; } out1 << '\0'; out1.flush(); out2.flush(); if(out1.fail() || out2.fail()) { cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl; throw 1; } VMSG_NL("Returning from initFromVector"); } /** * Return the length that the joined string of the given string * list will have. Note that this is indifferent to how the text * fragments correspond to input sequences - it just cares about * the lengths of the fragments. */ uint32_t joinedLen(vector& szs) { uint32_t ret = 0; for(unsigned int i = 0; i < szs.size(); i++) { ret += szs[i].len; } return ret; } /// Destruct an Ebwt ~Ebwt() { // Only free buffers if we're *not* using memory-mapped files if(!_useMm) { // Delete everything that was allocated in read(false, ...) if(_fchr != NULL) delete[] _fchr; _fchr = NULL; if(_ftab != NULL) delete[] _ftab; _ftab = NULL; if(_eftab != NULL) delete[] _eftab; _eftab = NULL; if(_offs != NULL && !useShmem_) { delete[] _offs; _offs = NULL; } else if(_offs != NULL && useShmem_) { FREE_SHARED(_offs); } if(_isa != NULL) delete[] _isa; _isa = NULL; if(_plen != NULL) delete[] _plen; _plen = NULL; if(_rstarts != NULL) delete[] _rstarts; _rstarts = NULL; if(_ebwt != NULL && !useShmem_) { delete[] _ebwt; _ebwt = NULL; } else if(_ebwt != NULL && useShmem_) { FREE_SHARED(_ebwt); } } MM_FILE_CLOSE(_in1); MM_FILE_CLOSE(_in2); #ifdef EBWT_STATS cout << (_fw ? 
"Forward index:" : "Mirror index:") << endl; cout << " mapLFEx: " << mapLFExs_ << endl; cout << " mapLF: " << mapLFs_ << endl; cout << " mapLF(c): " << mapLFcs_ << endl; cout << " mapLF1(c): " << mapLF1cs_ << endl; cout << " mapLF(c): " << mapLF1s_ << endl; #endif } /// Accessors const EbwtParams& eh() const { return _eh; } uint32_t zOff() const { return _zOff; } uint32_t zEbwtByteOff() const { return _zEbwtByteOff; } int zEbwtBpOff() const { return _zEbwtBpOff; } uint32_t nPat() const { return _nPat; } uint32_t nFrag() const { return _nFrag; } uint32_t* fchr() const { return _fchr; } uint32_t* ftab() const { return _ftab; } uint32_t* eftab() const { return _eftab; } uint32_t* offs() const { return _offs; } uint32_t* isa() const { return _isa; } uint32_t* plen() const { return _plen; } uint32_t* rstarts() const { return _rstarts; } uint8_t* ebwt() const { return _ebwt; } const ReferenceMap* rmap() const { return rmap_; } bool toBe() const { return _toBigEndian; } bool verbose() const { return _verbose; } bool sanityCheck() const { return _sanity; } vector& refnames() { return _refnames; } bool fw() const { return _fw; } /// Return true iff the Ebwt is currently in memory bool isInMemory() const { if(_ebwt != NULL) { assert(_eh.repOk()); assert(_ftab != NULL); assert(_eftab != NULL); assert(_fchr != NULL); assert(_offs != NULL); assert(_isa != NULL); assert(_rstarts != NULL); assert_neq(_zEbwtByteOff, 0xffffffff); assert_neq(_zEbwtBpOff, -1); return true; } else { assert(_ftab == NULL); assert(_eftab == NULL); assert(_fchr == NULL); assert(_offs == NULL); assert(_rstarts == NULL); assert_eq(_zEbwtByteOff, 0xffffffff); assert_eq(_zEbwtBpOff, -1); return false; } } /// Return true iff the Ebwt is currently stored on disk bool isEvicted() const { return !isInMemory(); } /** * Load this Ebwt into memory by reading it in from the _in1 and * _in2 streams. 
*/ void loadIntoMemory( int color, int needEntireReverse, bool loadNames, bool verbose) { readIntoMemory( color, // expect index to be colorspace? needEntireReverse, // require reverse index to be concatenated reference reversed false, // stop after loading the header portion? NULL, // params false, // mmSweep loadNames, // loadNames verbose); // startVerbose } /** * Frees memory associated with the Ebwt. */ void evictFromMemory() { assert(isInMemory()); if(!_useMm) { delete[] _fchr; delete[] _ftab; delete[] _eftab; if(!useShmem_) delete[] _offs; delete[] _isa; // Keep plen; it's small and the client may want to query it // even when the others are evicted. //delete[] _plen; delete[] _rstarts; if(!useShmem_) delete[] _ebwt; } _fchr = NULL; _ftab = NULL; _eftab = NULL; _offs = NULL; _isa = NULL; // Keep plen; it's small and the client may want to query it // even when the others are evicted. //_plen = NULL; _rstarts = NULL; _ebwt = NULL; _zEbwtByteOff = 0xffffffff; _zEbwtBpOff = -1; } /** * Non-static facade for static function ftabHi. */ uint32_t ftabHi(uint32_t i) const { return Ebwt::ftabHi(_ftab, _eftab, _eh._len, _eh._ftabLen, _eh._eftabLen, i); } /** * Get "high interpretation" of ftab entry at index i. The high * interpretation of a regular ftab entry is just the entry * itself. The high interpretation of an extended entry is the * second correpsonding ui32 in the eftab. * * It's a static member because it's convenient to ask this * question before the Ebwt is fully initialized. */ static uint32_t ftabHi(uint32_t *ftab, uint32_t *eftab, uint32_t len, uint32_t ftabLen, uint32_t eftabLen, uint32_t i) { assert_lt(i, ftabLen); if(ftab[i] <= len) { return ftab[i]; } else { uint32_t efIdx = ftab[i] ^ 0xffffffff; assert_lt(efIdx*2+1, eftabLen); return eftab[efIdx*2+1]; } } /** * Non-static facade for static function ftabLo. 
*/ uint32_t ftabLo(uint32_t i) const { return Ebwt::ftabLo(_ftab, _eftab, _eh._len, _eh._ftabLen, _eh._eftabLen, i); } /** * Get "low interpretation" of ftab entry at index i. The low * interpretation of a regular ftab entry is just the entry * itself. The low interpretation of an extended entry is the * first correpsonding ui32 in the eftab. * * It's a static member because it's convenient to ask this * question before the Ebwt is fully initialized. */ static uint32_t ftabLo(uint32_t *ftab, uint32_t *eftab, uint32_t len, uint32_t ftabLen, uint32_t eftabLen, uint32_t i) { assert_lt(i, ftabLen); if(ftab[i] <= len) { return ftab[i]; } else { uint32_t efIdx = ftab[i] ^ 0xffffffff; assert_lt(efIdx*2+1, eftabLen); return eftab[efIdx*2]; } } /** * When using read() to create an Ebwt, we have to set a couple of * additional fields in the Ebwt object that aren't part of the * parameter list and are not stored explicitly in the file. Right * now, this just involves initializing _zEbwtByteOff and * _zEbwtBpOff from _zOff. */ void postReadInit(EbwtParams& eh) { uint32_t sideNum = _zOff / eh._sideBwtLen; uint32_t sideCharOff = _zOff % eh._sideBwtLen; uint32_t sideByteOff = sideNum * eh._sideSz; _zEbwtByteOff = sideCharOff >> 2; assert_lt(_zEbwtByteOff, eh._sideBwtSz); _zEbwtBpOff = sideCharOff & 3; assert_lt(_zEbwtBpOff, 4); if((sideNum & 1) == 0) { // This is an even (backward) side _zEbwtByteOff = eh._sideBwtSz - _zEbwtByteOff - 1; _zEbwtBpOff = 3 - _zEbwtBpOff; assert_lt(_zEbwtBpOff, 4); } _zEbwtByteOff += sideByteOff; assert(repOk(eh)); // Ebwt should be fully initialized now } /** * Pretty-print the Ebwt to the given output stream. */ void print(ostream& out) const { print(out, _eh); } /** * Pretty-print the Ebwt and given EbwtParams to the given output * stream. */ void print(ostream& out, const EbwtParams& eh) const { eh.print(out); // print params out << "Ebwt (" << (isInMemory()? 
"memory" : "disk") << "):" << endl << " zOff: " << _zOff << endl << " zEbwtByteOff: " << _zEbwtByteOff << endl << " zEbwtBpOff: " << _zEbwtBpOff << endl << " nPat: " << _nPat << endl << " plen: "; if(_plen == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << _plen[0] << endl; } out << " rstarts: "; if(_rstarts == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << _rstarts[0] << endl; } out << " ebwt: "; if(_ebwt == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << _ebwt[0] << endl; } out << " fchr: "; if(_fchr == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << _fchr[0] << endl; } out << " ftab: "; if(_ftab == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << _ftab[0] << endl; } out << " eftab: "; if(_eftab == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << _eftab[0] << endl; } out << " offs: "; if(_offs == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << _offs[0] << endl; } } // Building static TStr join(vector& l, uint32_t seed); static TStr join(vector& l, vector& szs, uint32_t sztot, const RefReadInParams& refparams, uint32_t seed); void joinToDisk(vector& l, vector& szs, vector& plens, uint32_t sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2, uint32_t seed = 0); void buildToDisk(InorderBlockwiseSA& sa, const TStr& s, ostream& out1, ostream& out2); // I/O void readIntoMemory(int color, int needEntireReverse, bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, bool startVerbose); void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const; void writeFromMemory(bool justHeader, const string& out1, const string& out2) const; // Sanity checking void printRangeFw(uint32_t begin, uint32_t end) const; void printRangeBw(uint32_t begin, uint32_t end) const; void sanityCheckUpToSide(int upToSide) const; void sanityCheckAll(int reverse) const; void restore(TStr& s) const; void 
checkOrigs(const vector >& os, bool color, bool mirror) const; // Searching and reporting void joinedToTextOff(uint32_t qlen, uint32_t off, uint32_t& tidx, uint32_t& textoff, uint32_t& tlen) const; inline bool report(const String& query, String* quals, String* name, bool color, char primer, char trimc, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector& mmui32, const std::vector& refcs, size_t numMms, uint32_t off, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams& params) const; inline bool reportChaseOne(const String& query, String* quals, String* name, bool color, char primer, char trimc, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector& mmui32, const std::vector& refcs, size_t numMms, uint32_t i, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams& params, SideLocus *l = NULL) const; inline bool reportReconstruct(const String& query, String* quals, String* name, String& lbuf, String& rbuf, const uint32_t *mmui32, const char* refcs, size_t numMms, uint32_t i, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, const EbwtSearchParams& params, SideLocus *l = NULL) const; inline int rowL(const SideLocus& l) const; inline uint32_t countUpTo(const SideLocus& l, int c) const; inline void countUpToEx(const SideLocus& l, uint32_t* pairs) const; inline uint32_t countFwSide(const SideLocus& l, int c) const; inline void countFwSideEx(const SideLocus& l, uint32_t *pairs) const; inline uint32_t countBwSide(const SideLocus& l, int c) const; inline void countBwSideEx(const SideLocus& l, uint32_t *pairs) const; inline uint32_t mapLF(const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const; inline void mapLFEx(const SideLocus& l, uint32_t *pairs ASSERT_ONLY(, bool overrideSanity = false)) const; inline void mapLFEx(const SideLocus& ltop, const SideLocus& lbot, 
uint32_t *tops, uint32_t *bots ASSERT_ONLY(, bool overrideSanity = false)) const; inline uint32_t mapLF(const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const; inline uint32_t mapLF1(uint32_t row, const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const; inline int mapLF1(uint32_t& row, const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const; /// Check that in-memory Ebwt is internally consistent with respect /// to given EbwtParams; assert if not bool inMemoryRepOk(const EbwtParams& eh) const { assert_leq(ValueSize::VALUE, 4); assert_geq(_zEbwtBpOff, 0); assert_lt(_zEbwtBpOff, 4); assert_lt(_zEbwtByteOff, eh._ebwtTotSz); assert_lt(_zOff, eh._bwtLen); assert(_rstarts != NULL); assert_geq(_nFrag, _nPat); return true; } /// Check that in-memory Ebwt is internally consistent; assert if /// not bool inMemoryRepOk() const { return repOk(_eh); } /// Check that Ebwt is internally consistent with respect to given /// EbwtParams; assert if not bool repOk(const EbwtParams& eh) const { assert(_eh.repOk()); if(isInMemory()) { return inMemoryRepOk(eh); } return true; } /// Check that Ebwt is internally consistent; assert if not bool repOk() const { return repOk(_eh); } bool _toBigEndian; int32_t _overrideOffRate; int32_t _overrideIsaRate; bool _verbose; bool _passMemExc; bool _sanity; bool _fw; // true iff this is a forward index MM_FILE _in1; // input fd for primary index file MM_FILE _in2; // input fd for secondary index file string _in1Str; // filename for primary index file string _in2Str; // filename for secondary index file uint32_t _zOff; uint32_t _zEbwtByteOff; int _zEbwtBpOff; uint32_t _nPat; /// number of reference texts uint32_t _nFrag; /// number of fragments uint32_t* _plen; uint32_t* _rstarts; // starting offset of fragments / text indexes // _fchr, _ftab and _eftab are expected to be relatively small // (usually < 1MB, perhaps a few MB if _fchr is particularly large // - like, say, 11). 
// For this reason, we don't bother with writing
	// them to disk through separate output streams.
	uint32_t*  _fchr;
	uint32_t*  _ftab;
	uint32_t*  _eftab; // "extended" entries for _ftab
	// _offs may be extremely large.  E.g. for DNA w/ offRate=4 (one
	// offset every 16 rows), the total size of _offs is the same as
	// the total size of the input sequence
	uint32_t*  _offs;
	uint32_t*  _isa;
	// _ebwt is the Extended Burrows-Wheeler Transform itself, and thus
	// is at least as large as the input sequence.
	uint8_t*   _ebwt;
	bool       _useMm;        /// use memory-mapped files to hold the index
	bool       useShmem_;     /// use shared memory to hold large parts of the index
	vector _refnames; /// names of the reference sequences
	const ReferenceMap* rmap_; /// mapping into another reference coordinate space
	char *mmFile1_;
	char *mmFile2_;
	EbwtParams _eh;

#ifdef EBWT_STATS
	// Call counters, only present in EBWT_STATS builds; mapLFExs_ and
	// mapLFs_ are bumped by mapLFEx() and mapLF() respectively.
	uint64_t   mapLFExs_;
	uint64_t   mapLFs_;
	uint64_t   mapLFcs_;
#endif

private:

	/// Stream that verbose messages are written to (currently cout)
	ostream& log() const {
		return cout; // TODO: turn this into a parameter
	}

	/// Print a verbose message and flush (flushing is helpful for
	/// debugging).  Does nothing unless verbose() is true.
	void verbose(const string& s) const {
		if(this->verbose()) {
			this->log() << s;
			this->log().flush();
		}
	}
};

/// Specialization for packed Ebwts - return true
template<>
bool Ebwt > >::isPacked() {
	return true;
}

/// By default, Ebwts are not packed
template
bool Ebwt::isPacked() {
	return false;
}

/**
 * Structure encapsulating search parameters, such as whether and how
 * to backtrack and how to deal with multiple equally-good hits.
 *
 * Holds references to the per-thread hit sink and to the original
 * texts (an empty vector when they are not available), plus the id and
 * orientation of the read currently being searched.
 */
template
class EbwtSearchParams {
public:
	/// Note: the 'ebwtFw' argument is accepted but not stored; the index
	/// orientation is passed to reportHit() on each call instead.
	EbwtSearchParams(HitSinkPerThread& sink,
	                 const vector >& texts,
	                 bool fw = true,
	                 bool ebwtFw = true) :
		_sink(sink),
		_texts(texts),
		_patid(0xffffffff), // 0xffffffff == "no read id set yet"
		_fw(fw) { }

	/// Sink that hits are reported to
	HitSinkPerThread& sink() const { return _sink; }
	/// Set / get the id of the read currently being searched
	void setPatId(uint32_t patid) { _patid = patid; }
	uint32_t patId() const { return _patid; }
	/// Set / get whether the current read is forward-oriented
	void setFw(bool fw) { _fw = fw; }
	bool fw() const { return _fw; }

	/**
	 * Report a hit.  Returns true iff caller can call off the search.
*/ bool reportHit(const String& query, // read sequence String* quals, // read quality values String* name, // read name bool color, // true -> read is colorspace char primer, // primer base trimmed from beginning char trimc, // first color trimmed from beginning bool colExEnds, // true -> exclude nucleotides at extreme ends after decoding int snpPhred, // penalty for a SNP const BitPairReference* ref, // reference (= NULL if not necessary) const ReferenceMap* rmap, // map to another reference coordinate system bool ebwtFw, // whether index is forward (true) or mirror (false) const std::vector& mmui32, // mismatch list const std::vector& refcs, // reference characters size_t numMms, // # mismatches U32Pair h, // ref coords U32Pair mh, // mate's ref coords bool mfw, // mate's orientation uint16_t mlen, // mate length U32Pair a, // arrow pair uint32_t tlen, // length of text uint32_t qlen, // length of query int stratum, // alignment stratum uint16_t cost, // cost of alignment uint32_t oms, // approx. 
# other valid alignments uint32_t patid, uint32_t seed, uint8_t mate) const { #ifndef NDEBUG // Check that no two elements of the mms array are the same for(size_t i = 0; i < numMms; i++) { for(size_t j = i+1; j < numMms; j++) { assert_neq(mmui32[i], mmui32[j]); } } #endif // If ebwtFw is true, then 'query' and 'quals' are reversed // If _fw is false, then 'query' and 'quals' are reverse complemented assert(!color || ref != NULL); assert(quals != NULL); assert(name != NULL); assert_eq(mmui32.size(), refcs.size()); assert_leq(numMms, mmui32.size()); assert_gt(qlen, 0); Hit hit; hit.stratum = stratum; hit.cost = cost; hit.patSeq = query; hit.quals = *quals; if(!ebwtFw) { // Re-reverse the pattern and the quals back to how they // appeared in the read file ::reverseInPlace(hit.patSeq); ::reverseInPlace(hit.quals); } if(color) { hit.colSeq = hit.patSeq; hit.colQuals = hit.quals; hit.crefcs.resize(qlen, 0); // Turn the mmui32 and refcs arrays into the mm FixedBitset and // the refc vector for(size_t i = 0; i < numMms; i++) { if (ebwtFw != _fw) { // The 3' end is on the left but the mm vector encodes // mismatches w/r/t the 5' end, so we flip uint32_t off = qlen - mmui32[i] - 1; hit.cmms.set(off); hit.crefcs[off] = refcs[i]; } else { hit.cmms.set(mmui32[i]); hit.crefcs[i] = refcs[i]; } } assert(ref != NULL); char read[1024]; uint32_t rfbuf[(1024+16)/4]; ASSERT_ONLY(char rfbuf2[1024]); char qual[1024]; char ns[1024]; char cmm[1024]; char nmm[1024]; int cmms = 0; int nmms = 0; // TODO: account for indels when calculating these bounds size_t readi = 0; size_t readf = seqan::length(hit.patSeq); size_t refi = 0; size_t reff = readf + 1; bool maqRound = false; for(size_t i = 0; i < qlen + 1; i++) { if(i < qlen) { read[i] = (int)hit.patSeq[i]; qual[i] = mmPenalty(maqRound, phredCharToPhredQual(hit.quals[i])); } ASSERT_ONLY(rfbuf2[i] = ref->getBase(h.first, h.second + i)); } int offset = ref->getStretch(rfbuf, h.first, h.second, qlen + 1); char *rf = (char*)rfbuf + offset; 
for(size_t i = 0; i < qlen + 1; i++) { assert_eq(rf[i], rfbuf2[i]); rf[i] = (1 << rf[i]); } decodeHit( read, // ASCII colors, '0', '1', '2', '3', '.' qual, // ASCII quals, Phred+33 encoded readi, // offset of first character within 'read' to consider readf, // offset of last char (exclusive) in 'read' to consider rf, // reference sequence, as masks refi, // offset of first character within 'ref' to consider reff, // offset of last char (exclusive) in 'ref' to consider snpPhred, // penalty incurred by a SNP ns, // decoded nucleotides are appended here cmm, // where the color mismatches are in the string nmm, // where nucleotide mismatches are in the string cmms, // number of color mismatches nmms);// number of nucleotide mismatches size_t nqlen = qlen + (colExEnds ? -1 : 1); seqan::resize(hit.patSeq, nqlen); seqan::resize(hit.quals, nqlen); hit.refcs.resize(nqlen); size_t lo = colExEnds ? 1 : 0; size_t hi = colExEnds ? qlen : qlen+1; size_t destpos = 0; for(size_t i = lo; i < hi; i++, destpos++) { // Set sequence character assert_leq(ns[i], 4); assert_geq(ns[i], 0); hit.patSeq[destpos] = (Dna5)(int)ns[i]; // Set initial quality hit.quals[destpos] = '!'; // Color mismatches penalize quality if(i > 0) { if(cmm[i-1] == 'M') { if((int)hit.quals[destpos] + (int)qual[i-1] > 126) { hit.quals[destpos] = 126; } else { hit.quals[destpos] += qual[i-1]; } } else if((int)hit.colSeq[i-1] != 4) { hit.quals[destpos] -= qual[i-1]; } } if(i < qlen) { if(cmm[i] == 'M') { if((int)hit.quals[destpos] + (int)qual[i] > 126) { hit.quals[destpos] = 126; } else { hit.quals[destpos] += qual[i]; } } else if((int)hit.patSeq[i] != 4) { hit.quals[destpos] -= qual[i]; } } if(hit.quals[destpos] < '!') { hit.quals[destpos] = '!'; } if(nmm[i] != 'M') { uint32_t off = i - (colExEnds? 
1:0); if(!_fw) off = nqlen - off - 1; assert_lt(off, nqlen); hit.mms.set(off); hit.refcs[off] = "ACGT"[ref->getBase(h.first, h.second+i)]; } } if(colExEnds) { // Extreme bases have been removed; that makes the // nucleotide alignment one character shorter than the // color alignment qlen--; mlen--; // It also shifts the alignment's offset up by 1 h.second++; } else { // Extreme bases are included; that makes the // nucleotide alignment one character longer than the // color alignment qlen++; mlen++; } } else { // Turn the mmui32 and refcs arrays into the mm FixedBitset and // the refc vector hit.refcs.resize(qlen, 0); for(size_t i = 0; i < numMms; i++) { if (ebwtFw != _fw) { // The 3' end is on the left but the mm vector encodes // mismatches w/r/t the 5' end, so we flip uint32_t off = qlen - mmui32[i] - 1; hit.mms.set(off); hit.refcs[off] = refcs[i]; } else { hit.mms.set(mmui32[i]); hit.refcs[mmui32[i]] = refcs[i]; } } } // Check the hit against the original text, if it's available if(_texts.size() > 0) { assert_lt(h.first, _texts.size()); FixedBitset<1024> diffs; // This type of check assumes that only mismatches are // possible. If indels are possible, then we either need // the caller to provide information about indel locations, // or we need to extend this to a more complicated check. 
assert_leq(h.second + qlen, length(_texts[h.first])); for(size_t i = 0; i < qlen; i++) { assert_neq(4, (int)_texts[h.first][h.second + i]); // Forward pattern appears at h if((int)hit.patSeq[i] != (int)_texts[h.first][h.second + i]) { uint32_t qoff = i; // if ebwtFw != _fw the 3' end is on on the // left end of the pattern, but the diff vector // should encode mismatches w/r/t the 5' end, // so we flip if (_fw) diffs.set(qoff); else diffs.set(qlen - qoff - 1); } } if(diffs != hit.mms) { // Oops, mismatches were not where we expected them; // print a diagnostic message before asserting cerr << "Expected " << hit.mms.str() << " mismatches, got " << diffs.str() << endl; cerr << " Pat: " << hit.patSeq << endl; cerr << " Tseg: "; for(size_t i = 0; i < qlen; i++) { cerr << _texts[h.first][h.second + i]; } cerr << endl; cerr << " mmui32: "; for(size_t i = 0; i < numMms; i++) { cerr << mmui32[i] << " "; } cerr << endl; cerr << " FW: " << _fw << endl; cerr << " Ebwt FW: " << ebwtFw << endl; } if(diffs != hit.mms) assert(false); } hit.h = h; if(rmap != NULL) rmap->map(hit.h); hit.patId = ((patid == 0xffffffff) ? _patid : patid); hit.patName = *name; hit.mh = mh; hit.fw = _fw; hit.mfw = mfw; hit.mlen = mlen; hit.oms = oms; hit.mate = mate; hit.color = color; hit.primer = primer; hit.trimc = trimc; hit.seed = seed; assert(hit.repOk()); return sink().reportHit(hit, stratum); } private: HitSinkPerThread& _sink; const vector >& _texts; // original texts, if available (if not // available, _texts.size() == 0) uint32_t _patid; // id of current read bool _fw; // current read is forward-oriented }; /** * Encapsulates a location in the bwt text in terms of the side it * occurs in and its offset within the side. */ struct SideLocus { SideLocus() : _sideByteOff(0), _sideNum(0), _charOff(0), _fw(true), _by(-1), _bp(-1) { } /** * Construct from row and other relevant information about the Ebwt. 
*/ SideLocus(uint32_t row, const EbwtParams& ep, const uint8_t* ebwt) { initFromRow(row, ep, ebwt); } /** * Init two SideLocus objects from a top/bot pair, using the result * from one call to initFromRow to possibly avoid a second call. */ static void initFromTopBot(uint32_t top, uint32_t bot, const EbwtParams& ep, const uint8_t* ebwt, SideLocus& ltop, SideLocus& lbot) { const uint32_t sideBwtLen = ep._sideBwtLen; const uint32_t sideBwtSz = ep._sideBwtSz; assert_gt(bot, top); ltop.initFromRow(top, ep, ebwt); uint32_t spread = bot - top; if(ltop._charOff + spread < sideBwtLen) { lbot._charOff = ltop._charOff + spread; lbot._sideNum = ltop._sideNum; lbot._sideByteOff = ltop._sideByteOff; lbot._fw = ltop._fw; lbot._by = lbot._charOff >> 2; assert_lt(lbot._by, (int)sideBwtSz); if(!lbot._fw) lbot._by = sideBwtSz - lbot._by - 1; lbot._bp = lbot._charOff & 3; if(!lbot._fw) lbot._bp ^= 3; } else { lbot.initFromRow(bot, ep, ebwt); } } /** * Calculate SideLocus based on a row and other relevant * information about the shape of the Ebwt. */ void initFromRow(uint32_t row, const EbwtParams& ep, const uint8_t* ebwt) { const uint32_t sideSz = ep._sideSz; // Side length is hard-coded for now; this allows the compiler // to do clever things to accelerate / and %. 
_sideNum = row / 224; _charOff = row % 224; _sideByteOff = _sideNum * sideSz; assert_leq(row, ep._len); assert_leq(_sideByteOff + sideSz, ep._ebwtTotSz); #ifndef NO_PREFETCH __builtin_prefetch((const void *)(ebwt + _sideByteOff), 0 /* prepare for read */, PREFETCH_LOCALITY); #endif // prefetch tjside too _fw = (_sideNum & 1) != 0; // odd-numbered sides are forward _by = _charOff >> 2; // byte within side assert_lt(_by, (int)ep._sideBwtSz); _bp = _charOff & 3; // bit-pair within byte if(!_fw) { _by = ep._sideBwtSz - _by - 1; _bp ^= 3; } } /// Return true iff this is an initialized SideLocus bool valid() { return _bp != -1; } /// Make this look like an invalid SideLocus void invalidate() { _bp = -1; } const uint8_t *side(const uint8_t* ebwt) const { return ebwt + _sideByteOff; } const uint8_t *oside(const uint8_t* ebwt) const { return ebwt + _sideByteOff + (_fw? (-128) : (128)); } uint32_t _sideByteOff; // offset of top side within ebwt[] uint32_t _sideNum; // index of side uint16_t _charOff; // character offset within side bool _fw; // side is forward or backward? int16_t _by; // byte within side (not adjusted for bw sides) int8_t _bp; // bitpair within byte (not adjusted for bw sides) }; #include "ebwt_search_backtrack.h" /////////////////////////////////////////////////////////////////////// // // Functions for printing and sanity-checking Ebwts // /////////////////////////////////////////////////////////////////////// /** * Given a range of positions in the EBWT array within the BWT portion * of a forward side, print the characters at those positions along * with a summary occ[] array. 
*/ template void Ebwt::printRangeFw(uint32_t begin, uint32_t end) const { assert(isInMemory()); uint32_t occ[] = {0, 0, 0, 0}; assert_gt(end, begin); for(uint32_t i = begin; i < end; i++) { uint8_t by = this->_ebwt[i]; for(int j = 0; j < 4; j++) { // Unpack from lowest to highest bit pair int twoBit = unpack_2b_from_8b(by, j); occ[twoBit]++; cout << "ACGT"[twoBit]; } assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3); } cout << ":{" << occ[0] << "," << occ[1] << "," << occ[2] << "," << occ[3] << "}" << endl; } /** * Given a range of positions in the EBWT array within the BWT portion * of a backward side, print the characters at those positions along * with a summary occ[] array. */ template void Ebwt::printRangeBw(uint32_t begin, uint32_t end) const { assert(isInMemory()); uint32_t occ[] = {0, 0, 0, 0}; assert_gt(end, begin); for(uint32_t i = end-1; i >= begin; i--) { uint8_t by = this->_ebwt[i]; for(int j = 3; j >= 0; j--) { // Unpack from lowest to highest bit pair int twoBit = unpack_2b_from_8b(by, j); occ[twoBit]++; cout << "ACGT"[twoBit]; } assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3); if(i == 0) break; } cout << ":{" << occ[0] << "," << occ[1] << "," << occ[2] << "," << occ[3] << "}" << endl; } /** * Check that the ebwt array is internally consistent up to (and not * including) the given side index by re-counting the chars and * comparing against the embedded occ[] arrays. */ template void Ebwt::sanityCheckUpToSide(int upToSide) const { assert(isInMemory()); uint32_t occ[] = {0, 0, 0, 0}; ASSERT_ONLY(uint32_t occ_save[] = {0, 0}); uint32_t cur = 0; // byte pointer const EbwtParams& eh = this->_eh; bool fw = false; while(cur < (upToSide * eh._sideSz)) { assert_leq(cur + eh._sideSz, eh._ebwtTotLen); for(uint32_t i = 0; i < eh._sideBwtSz; i++) { uint8_t by = this->_ebwt[cur + (fw ? i : eh._sideBwtSz-i-1)]; for(int j = 0; j < 4; j++) { // Unpack from lowest to highest bit pair int twoBit = unpack_2b_from_8b(by, fw ? 
j : 3-j); occ[twoBit]++; //if(_verbose) cout << "ACGT"[twoBit]; } assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % 4); } assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % eh._sideBwtLen); if(fw) { // Finished forward bucket; check saved [G] and [T] // against the two uint32_ts encoded here ASSERT_ONLY(uint32_t *u32ebwt = reinterpret_cast(&this->_ebwt[cur + eh._sideBwtSz])); ASSERT_ONLY(uint32_t gs = u32ebwt[0]); ASSERT_ONLY(uint32_t ts = u32ebwt[1]); assert_eq(gs, occ_save[0]); assert_eq(ts, occ_save[1]); fw = false; } else { // Finished backward bucket; check current [A] and [C] // against the two uint32_ts encoded here ASSERT_ONLY(uint32_t *u32ebwt = reinterpret_cast(&this->_ebwt[cur + eh._sideBwtSz])); ASSERT_ONLY(uint32_t as = u32ebwt[0]); ASSERT_ONLY(uint32_t cs = u32ebwt[1]); assert(as == occ[0] || as == occ[0]-1); // one 'a' is a skipped '$' and doesn't count toward occ[] assert_eq(cs, occ[1]); ASSERT_ONLY(occ_save[0] = occ[2]); // save gs ASSERT_ONLY(occ_save[1] = occ[3]); // save ts fw = true; } cur += eh._sideSz; } } /** * Sanity-check various pieces of the Ebwt */ template void Ebwt::sanityCheckAll(int reverse) const { const EbwtParams& eh = this->_eh; assert(isInMemory()); // Check ftab for(uint32_t i = 1; i < eh._ftabLen; i++) { assert_geq(this->ftabHi(i), this->ftabLo(i-1)); assert_geq(this->ftabLo(i), this->ftabHi(i-1)); assert_leq(this->ftabHi(i), eh._bwtLen+1); } assert_eq(this->ftabHi(eh._ftabLen-1), eh._bwtLen); // Check offs int seenLen = (eh._bwtLen + 31) >> 5; uint32_t *seen; try { seen = new uint32_t[seenLen]; // bitvector marking seen offsets } catch(bad_alloc& e) { cerr << "Out of memory allocating seen[] at " << __FILE__ << ":" << __LINE__ << endl; throw e; } memset(seen, 0, 4 * seenLen); uint32_t offsLen = eh._offsLen; for(uint32_t i = 0; i < offsLen; i++) { assert_lt(this->_offs[i], eh._bwtLen); int w = this->_offs[i] >> 5; int r = this->_offs[i] & 31; assert_eq(0, (seen[w] >> r) & 1); // shouldn't have been seen before seen[w] |= (1 
<< r); } delete[] seen; // Check nPat assert_gt(this->_nPat, 0); // Check plen, flen for(uint32_t i = 0; i < this->_nPat; i++) { assert_geq(this->_plen[i], 0); } // Check rstarts for(uint32_t i = 0; i < this->_nFrag-1; i++) { assert_gt(this->_rstarts[(i+1)*3], this->_rstarts[i*3]); if(reverse == REF_READ_REVERSE) { assert(this->_rstarts[(i*3)+1] >= this->_rstarts[((i+1)*3)+1]); } else { assert(this->_rstarts[(i*3)+1] <= this->_rstarts[((i+1)*3)+1]); } } // Check ebwt sanityCheckUpToSide(eh._numSides); VMSG_NL("Ebwt::sanityCheck passed"); } /////////////////////////////////////////////////////////////////////// // // Functions for searching Ebwts // /////////////////////////////////////////////////////////////////////// /** * Return the final character in row i (i.e. the i'th character in the * BWT transform). Note that the 'L' in the name of the function * stands for 'last', as in the literature. */ template inline int Ebwt::rowL(const SideLocus& l) const { // Extract and return appropriate bit-pair #ifdef SIXTY4_FORMAT return (((uint64_t*)l.side(this->_ebwt))[l._by >> 3] >> ((((l._by & 7) << 2) + l._bp) << 1)) & 3; #else return unpack_2b_from_8b(l.side(this->_ebwt)[l._by], l._bp); #endif } /** * Inline-function version of the above. This does not always seem to * be inlined */ #if 0 // Use gcc's intrinsic popcountll. I don't recommend it because it // seems to be somewhat slower than the bit-bashing pop64 routine both // on an AMD server and on an Intel workstation. On the other hand, // perhaps when the builtin is used GCC is smart enough to insert a // pop-count instruction on architectures that have one (e.g. Itanium). // For now, it's disabled. 
#define pop64(x) __builtin_popcountll(x)
#elif 0
// Disabled MSVC/MMX implementation of the same parallel bit count
// (naked function, argument read from the stack, result in EAX).
__declspec naked int __stdcall pop64
(uint64_t v)
{
	static const uint64_t C55 = 0x5555555555555555ll;
	static const uint64_t C33 = 0x3333333333333333ll;
	static const uint64_t C0F = 0x0F0F0F0F0F0F0F0Fll;
	__asm {
		MOVD      MM0, [ESP+4] ;v_low
		PUNPCKLDQ MM0, [ESP+8] ;v
		MOVQ      MM1, MM0     ;v
		PSRLD     MM0, 1       ;v >> 1
		PAND      MM0, [C55]   ;(v >> 1) & 0x55555555
		PSUBD     MM1, MM0     ;w = v - ((v >> 1) & 0x55555555)
		MOVQ      MM0, MM1     ;w
		PSRLD     MM1, 2       ;w >> 2
		PAND      MM0, [C33]   ;w & 0x33333333
		PAND      MM1, [C33]   ;(w >> 2) & 0x33333333
		PADDD     MM0, MM1     ;x = (w & 0x33333333) +
		                       ; ((w >> 2) & 0x33333333)
		MOVQ      MM1, MM0     ;x
		PSRLD     MM0, 4       ;x >> 4
		PADDD     MM0, MM1     ;x + (x >> 4)
		PAND      MM0, [C0F]   ;y = (x + (x >> 4) & 0x0F0F0F0F)
		PXOR      MM1, MM1     ;0
		PSADBW    (MM0, MM1)   ;sum across all 8 bytes
		MOVD      EAX, MM0     ;result in EAX per calling
		                       ; convention
		EMMS                   ;clear MMX state
		RET 8                  ;pop 8-byte argument off stack
		                       ; and return
	}
}
#elif 0
// Use a bytewise LUT version of popcount.  This is slower than the
// bit-bashing pop64 routine both on an AMD server and on an Intel
// workstation.  It seems to be about the same speed as the GCC builtin
// on Intel, and a bit faster than it on AMD.  For now, it's disabled.
const int popcntU8Table[256] = { 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8 }; // Use this bytewise population count table inline static int pop64(uint64_t x) { const unsigned char * p = (const unsigned char *) &x; return popcntU8Table[p[0]] + popcntU8Table[p[1]] + popcntU8Table[p[2]] + popcntU8Table[p[3]] + popcntU8Table[p[4]] + popcntU8Table[p[5]] + popcntU8Table[p[6]] + popcntU8Table[p[7]]; } #else // Use this standard bit-bashing population count inline static int pop64(uint64_t x) { x = x - ((x >> 1) & 0x5555555555555555llu); x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu); x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu; x = x + (x >> 8); x = x + (x >> 16); x = x + (x >> 32); return x & 0x3F; } #endif /** * Tricky-bit-bashing bitpair counting for given two-bit value (0-3) * within a 64-bit argument. */ inline static int countInU64(int c, uint64_t dw) { uint64_t dwA = dw & 0xAAAAAAAAAAAAAAAAllu; uint64_t dwNA = dw & ~0xAAAAAAAAAAAAAAAAllu; uint64_t tmp; switch(c) { case 0: tmp = (dwA >> 1) | dwNA; break; case 1: tmp = ~(dwA >> 1) & dwNA; break; case 2: tmp = (dwA >> 1) & ~dwNA; break; case 3: tmp = (dwA >> 1) & dwNA; break; default: throw; } tmp = pop64(tmp); // Gets 7.62% in profile if(c == 0) { tmp = 32 - tmp; } assert_leq(tmp, 32); assert_geq(tmp, 0); return (int)tmp; } /** * Tricky-bit-bashing bitpair counting for given two-bit value (0-3) * within a 64-bit argument. 
*
 * Adds, for each of the four two-bit values, the number of bit pairs
 * in dw with that value to the corresponding slot of arrs[0..3].
 *
 * Function gets 2.32% in profile
 */
inline static void countInU64Ex(uint64_t dw, uint32_t* arrs) {
	uint64_t dwA  = dw &  0xAAAAAAAAAAAAAAAAllu;
	uint64_t dwNA = dw & ~0xAAAAAAAAAAAAAAAAllu;
	arrs[0] += (32 - pop64((dwA >> 1) | dwNA));
	arrs[1] += pop64(~(dwA >> 1) & dwNA);
	arrs[2] += pop64((dwA >> 1) & ~dwNA);
	arrs[3] += pop64((dwA >> 1) & dwNA);
}

/**
 * Counts the number of occurrences of character 'c' in the given Ebwt
 * side up to (but not including) the given byte/bitpair (by/bp).
 *
 * This is a performance-critical function.  This is the top search-
 * related hit in the time profile.
 *
 * Function gets 11.09% in profile
 */
template
inline uint32_t Ebwt::countUpTo(const SideLocus& l, int c) const {
	// Count occurrences of c in each 64-bit (using bit trickery);
	// Someday countInU64() and pop() functions should be
	// vectorized/SSE-ized in case that helps.
	uint32_t cCnt = 0;
	const uint8_t *side = l.side(this->_ebwt);
	int i = 0;
#if 1
	// Whole 8-byte words that lie entirely before byte l._by
	for(; i + 7 < l._by; i += 8) {
		cCnt += countInU64(c, *(uint64_t*)&side[i]);
	}
#else
	for(; i + 2 < l._by; i += 2) {
		cCnt += cCntLUT_16b_4[c][*(uint16_t*)&side[i]];
	}
#endif
#ifdef SIXTY4_FORMAT
	// Calculate number of bit pairs to shift off the end
	const int bpShiftoff = 32 - (((l._by & 7) << 2) + l._bp);
	if(bpShiftoff < 32) {
		assert_lt(bpShiftoff, 32);
		const uint64_t sw = (*(uint64_t*)&side[i]) << (bpShiftoff << 1);
		cCnt += countInU64(c, sw);
		if(c == 0) cCnt -= bpShiftoff; // we turned these into As
	}
#else
	// Count occurrences of c in the rest of the side (using LUT)
	for(; i < l._by; i++) {
		cCnt += cCntLUT_4[0][c][side[i]];
	}
	// Count occurrences of c in the rest of the byte; the first LUT
	// index is the number of leading bit pairs of the byte to consider
	if(l._bp > 0) {
		cCnt += cCntLUT_4[(int)l._bp][c][side[i]];
	}
#endif
	return cCnt;
}

/**
 * Counts the number of occurrences of character 'c' in the given Ebwt
 * side up to (but not including) the given byte/bitpair (by/bp).
*/
// This variant accumulates the counts for all four characters into
// arrs[0..3] (added to whatever the caller left there).
template
inline void Ebwt::countUpToEx(const SideLocus& l, uint32_t* arrs) const {
	int i = 0;
	// Count occurrences of c in each 64-bit (using bit trickery);
	// note: this does not seem to lend a significant boost to
	// performance.  If you comment out this whole loop (which won't
	// affect correctness - it will just cause the following loop to
	// take up the slack) then runtime does not change noticeably.
	// Someday the countInU64() and pop() functions should be
	// vectorized/SSE-ized in case that helps.
	const uint8_t *side = l.side(this->_ebwt);
	for(; i+7 < l._by; i += 8) {
		countInU64Ex(*(uint64_t*)&side[i], arrs);
	}
#ifdef SIXTY4_FORMAT
	// Calculate number of bit pairs to shift off the end
	const int bpShiftoff = 32 - (((l._by & 7) << 2) + l._bp);
	assert_leq(bpShiftoff, 32);
	if(bpShiftoff < 32) {
		const uint64_t sw = (*(uint64_t*)&l.side(this->_ebwt)[i]) << (bpShiftoff << 1);
		countInU64Ex(sw, arrs);
		// The shifted-in zero pairs were counted as As; take them back out
		arrs[0] -= bpShiftoff;
	}
#else
	// Count occurrences of each character in the rest of the side
	// (using LUT)
	for(; i < l._by; i++) {
		arrs[0] += cCntLUT_4[0][0][side[i]];
		arrs[1] += cCntLUT_4[0][1][side[i]];
		arrs[2] += cCntLUT_4[0][2][side[i]];
		arrs[3] += cCntLUT_4[0][3][side[i]];
	}
	// Count occurrences of each character in the rest of the byte
	if(l._bp > 0) {
		arrs[0] += cCntLUT_4[(int)l._bp][0][side[i]];
		arrs[1] += cCntLUT_4[(int)l._bp][1][side[i]];
		arrs[2] += cCntLUT_4[(int)l._bp][2][side[i]];
		arrs[3] += cCntLUT_4[(int)l._bp][3][side[i]];
	}
#endif
}

/**
 * Count all occurrences of character c from the beginning of the
 * forward side to the locus's byte/bit-pair position and add in the
 * occ[] count up to the side break just prior to the side.
*/
// The returned value also includes _fchr[c].
template
inline uint32_t Ebwt::countFwSide(const SideLocus& l, int c) const {
	assert_lt(c, 4);
	assert_geq(c, 0);
	assert_lt(l._by, (int)this->_eh._sideBwtSz);
	assert_geq(l._by, 0);
	assert_lt(l._bp, 4);
	assert_geq(l._bp, 0);
	const uint8_t *side = l.side(this->_ebwt);
	uint32_t cCnt = countUpTo(l, c);
	assert_leq(cCnt, this->_eh._sideBwtLen);
	if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
		// Adjust for the fact that we represented $ with an 'A', but
		// shouldn't count it as an 'A' here
		if((l._sideByteOff + l._by > _zEbwtByteOff) ||
		   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
		{
			cCnt--; // Adjust for '$' looking like an 'A'
		}
	}
	uint32_t ret;
	// Now factor in the occ[] count at the side break
	if(c < 2) {
		// [A]/[C] totals are the two uint32_ts just before this side
		const uint32_t *ac = reinterpret_cast(side - 8);
		assert_leq(ac[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
		assert_leq(ac[1], this->_eh._len);
		ret = ac[c] + cCnt + this->_fchr[c];
	} else {
		// [G]/[T] totals are the last two uint32_ts of this side
		const uint32_t *gt = reinterpret_cast(side + this->_eh._sideSz - 8); // next
		assert_leq(gt[0], this->_eh._len);
		assert_leq(gt[1], this->_eh._len);
		ret = gt[c-2] + cCnt + this->_fchr[c];
	}
#ifndef NDEBUG
	assert_leq(ret, this->_fchr[c+1]); // can't have jumped into next char's section
	if(c == 0) {
		assert_leq(cCnt, this->_eh._sideBwtLen);
	} else {
		assert_leq(ret, this->_eh._bwtLen);
	}
#endif
	return ret;
}

/**
 * Count all occurrences of all four characters from the beginning of
 * the forward side to the locus's byte/bit-pair position and add in
 * the occ[] counts up to the side break just prior to the side.
*/
// Results are accumulated into arrs[0..3]; each slot also receives the
// corresponding _fchr[] entry.
template
inline void Ebwt::countFwSideEx(const SideLocus& l, uint32_t* arrs) const
{
	assert_lt(l._by, (int)this->_eh._sideBwtSz);
	assert_geq(l._by, 0);
	assert_lt(l._bp, 4);
	assert_geq(l._bp, 0);
	countUpToEx(l, arrs);
#ifndef NDEBUG
	assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section
	assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section
	assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section
	assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section
#endif
	assert_leq(arrs[0], this->_eh._sideBwtLen);
	assert_leq(arrs[1], this->_eh._sideBwtLen);
	assert_leq(arrs[2], this->_eh._sideBwtLen);
	assert_leq(arrs[3], this->_eh._sideBwtLen);
	const uint8_t *side = l.side(this->_ebwt);
	if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
		// Adjust for the fact that we represented $ with an 'A', but
		// shouldn't count it as an 'A' here
		if((l._sideByteOff + l._by > _zEbwtByteOff) ||
		   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
		{
			arrs[0]--; // Adjust for '$' looking like an 'A'
		}
	}
	// Now factor in the occ[] count at the side break
	const uint32_t *ac = reinterpret_cast(side - 8);
	const uint32_t *gt = reinterpret_cast(side + this->_eh._sideSz - 8);
#ifndef NDEBUG
	assert_leq(ac[0], this->_fchr[1] + this->_eh.sideBwtLen());
	assert_leq(ac[1], this->_fchr[2]-this->_fchr[1]);
	assert_leq(gt[0], this->_fchr[3]-this->_fchr[2]);
	assert_leq(gt[1], this->_fchr[4]-this->_fchr[3]);
#endif
	assert_leq(ac[0], this->_eh._len + this->_eh.sideBwtLen());
	assert_leq(ac[1], this->_eh._len);
	assert_leq(gt[0], this->_eh._len);
	assert_leq(gt[1], this->_eh._len);
	arrs[0] += (ac[0] + this->_fchr[0]);
	arrs[1] += (ac[1] + this->_fchr[1]);
	arrs[2] += (gt[0] + this->_fchr[2]);
	arrs[3] += (gt[1] + this->_fchr[3]);
#ifndef NDEBUG
	assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section
	assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section
	assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section
	assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section
#endif
}

/**
 * Count all instances of character c from the locus's byte/bit-pair
 * position to the logical end (actual beginning) of the backward side,
 * and subtract that from the occ[] count up to the side break.
 */
template
inline uint32_t Ebwt::countBwSide(const SideLocus& l, int c) const {
	assert_lt(c, 4);
	assert_geq(c, 0);
	assert_lt(l._by, (int)this->_eh._sideBwtSz);
	assert_geq(l._by, 0);
	assert_lt(l._bp, 4);
	assert_geq(l._bp, 0);
	const uint8_t *side = l.side(this->_ebwt);
	uint32_t cCnt = countUpTo(l, c);
	// countUpTo() excludes the character at the locus itself; include it
	if(rowL(l) == c) cCnt++;
	assert_leq(cCnt, this->_eh._sideBwtLen);
	if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
		// Adjust for the fact that we represented $ with an 'A', but
		// shouldn't count it as an 'A' here
		if((l._sideByteOff + l._by > _zEbwtByteOff) ||
		   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp >= _zEbwtBpOff))
		{
			cCnt--;
		}
	}
	uint32_t ret;
	// Now factor in the occ[] count at the side break
	if(c < 2) {
		// [A]/[C] totals are the last two uint32_ts of this side
		const uint32_t *ac = reinterpret_cast(side + this->_eh._sideSz - 8);
		assert_leq(ac[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
		assert_leq(ac[1], this->_eh._len);
		ret = ac[c] - cCnt + this->_fchr[c];
	} else {
		// [G]/[T] totals are the last two uint32_ts of the following side
		const uint32_t *gt = reinterpret_cast(side + (2*this->_eh._sideSz) - 8); // next
		assert_leq(gt[0], this->_eh._len);
		assert_leq(gt[1], this->_eh._len);
		ret = gt[c-2] - cCnt + this->_fchr[c];
	}
#ifndef NDEBUG
	assert_leq(ret, this->_fchr[c+1]); // can't have jumped into next char's section
	if(c == 0) {
		assert_leq(cCnt, this->_eh._sideBwtLen);
	} else {
		assert_lt(ret, this->_eh._bwtLen);
	}
#endif
	return ret;
}

/**
 * Count all instances of all four characters from the locus's
 * byte/bit-pair position to the logical end (actual beginning) of the
 * backward side, and subtract that from the occ[] count up to the side
 * break.
*/
// On return arrs[0..3] hold the final values (they are overwritten, not
// merely incremented); each also includes the matching _fchr[] entry.
template
inline void Ebwt::countBwSideEx(const SideLocus& l, uint32_t* arrs) const {
	assert_lt(l._by, (int)this->_eh._sideBwtSz);
	assert_geq(l._by, 0);
	assert_lt(l._bp, 4);
	assert_geq(l._bp, 0);
	const uint8_t *side = l.side(this->_ebwt);
	countUpToEx(l, arrs);
	// countUpToEx() excludes the character at the locus itself; include it
	arrs[rowL(l)]++;
	assert_leq(arrs[0], this->_eh._sideBwtLen);
	assert_leq(arrs[1], this->_eh._sideBwtLen);
	assert_leq(arrs[2], this->_eh._sideBwtLen);
	assert_leq(arrs[3], this->_eh._sideBwtLen);
	if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
		// Adjust for the fact that we represented $ with an 'A', but
		// shouldn't count it as an 'A' here
		if((l._sideByteOff + l._by > _zEbwtByteOff) ||
		   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp >= _zEbwtBpOff))
		{
			arrs[0]--; // Adjust for '$' looking like an 'A'
		}
	}
	// Now factor in the occ[] count at the side break
	const uint32_t *ac = reinterpret_cast(side + this->_eh._sideSz - 8);
	const uint32_t *gt = reinterpret_cast(side + (2*this->_eh._sideSz) - 8);
#ifndef NDEBUG
	assert_leq(ac[0], this->_fchr[1] + this->_eh.sideBwtLen());
	assert_leq(ac[1], this->_fchr[2]-this->_fchr[1]);
	assert_leq(gt[0], this->_fchr[3]-this->_fchr[2]);
	assert_leq(gt[1], this->_fchr[4]-this->_fchr[3]);
#endif
	assert_leq(ac[0], this->_eh._len + this->_eh.sideBwtLen());
	assert_leq(ac[1], this->_eh._len);
	assert_leq(gt[0], this->_eh._len);
	assert_leq(gt[1], this->_eh._len);
	arrs[0] = (ac[0] - arrs[0] + this->_fchr[0]);
	arrs[1] = (ac[1] - arrs[1] + this->_fchr[1]);
	arrs[2] = (gt[0] - arrs[2] + this->_fchr[2]);
	arrs[3] = (gt[1] - arrs[3] + this->_fchr[3]);
#ifndef NDEBUG
	assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section
	assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section
	assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section
	assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section
#endif
}

/**
 * Given top and bot loci, calculate counts of all four DNA
chars up to * those loci. Used for more advanced backtracking-search. */ template inline void Ebwt::mapLFEx(const SideLocus& ltop, const SideLocus& lbot, uint32_t *tops, uint32_t *bots ASSERT_ONLY(, bool overrideSanity) ) const { // TODO: Where there's overlap, reuse the count for the overlapping // portion #ifdef EBWT_STATS const_cast*>(this)->mapLFExs_++; #endif assert_eq(0, tops[0]); assert_eq(0, bots[0]); assert_eq(0, tops[1]); assert_eq(0, bots[1]); assert_eq(0, tops[2]); assert_eq(0, bots[2]); assert_eq(0, tops[3]); assert_eq(0, bots[3]); if(ltop._fw) countFwSideEx(ltop, tops); // Forward side else countBwSideEx(ltop, tops); // Backward side if(lbot._fw) countFwSideEx(lbot, bots); // Forward side else countBwSideEx(lbot, bots); // Backward side #ifndef NDEBUG if(_sanity && !overrideSanity) { // Make sure results match up with individual calls to mapLF; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion assert_eq(mapLF(ltop, 0, true), tops[0]); assert_eq(mapLF(ltop, 1, true), tops[1]); assert_eq(mapLF(ltop, 2, true), tops[2]); assert_eq(mapLF(ltop, 3, true), tops[3]); assert_eq(mapLF(lbot, 0, true), bots[0]); assert_eq(mapLF(lbot, 1, true), bots[1]); assert_eq(mapLF(lbot, 2, true), bots[2]); assert_eq(mapLF(lbot, 3, true), bots[3]); } #endif } #ifndef NDEBUG /** * Given top and bot loci, calculate counts of all four DNA chars up to * those loci. Used for more advanced backtracking-search. 
*/ template inline void Ebwt::mapLFEx(const SideLocus& l, uint32_t *arrs ASSERT_ONLY(, bool overrideSanity) ) const { assert_eq(0, arrs[0]); assert_eq(0, arrs[1]); assert_eq(0, arrs[2]); assert_eq(0, arrs[3]); if(l._fw) countFwSideEx(l, arrs); // Forward side else countBwSideEx(l, arrs); // Backward side #ifndef NDEBUG if(_sanity && !overrideSanity) { // Make sure results match up with individual calls to mapLF; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion assert_eq(mapLF(l, 0, true), arrs[0]); assert_eq(mapLF(l, 1, true), arrs[1]); assert_eq(mapLF(l, 2, true), arrs[2]); assert_eq(mapLF(l, 3, true), arrs[3]); } #endif } #endif /** * Given row i, return the row that the LF mapping maps i to. */ template inline uint32_t Ebwt::mapLF(const SideLocus& l ASSERT_ONLY(, bool overrideSanity) ) const { #ifdef EBWT_STATS const_cast*>(this)->mapLFs_++; #endif uint32_t ret; assert(l.side(this->_ebwt) != NULL); int c = rowL(l); assert_lt(c, 4); assert_geq(c, 0); if(l._fw) ret = countFwSide(l, c); // Forward side else ret = countBwSide(l, c); // Backward side assert_lt(ret, this->_eh._bwtLen); #ifndef NDEBUG if(_sanity && !overrideSanity) { // Make sure results match up with results from mapLFEx; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion uint32_t arrs[] = { 0, 0, 0, 0 }; mapLFEx(l, arrs, true); assert_eq(arrs[c], ret); } #endif return ret; } /** * Given row i and character c, return the row that the LF mapping maps * i to on character c. 
*/ template inline uint32_t Ebwt::mapLF(const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity) ) const { #ifdef EBWT_STATS const_cast*>(this)->mapLFcs_++; #endif uint32_t ret; assert_lt(c, 4); assert_geq(c, 0); if(l._fw) ret = countFwSide(l, c); // Forward side else ret = countBwSide(l, c); // Backward side assert_lt(ret, this->_eh._bwtLen); #ifndef NDEBUG if(_sanity && !overrideSanity) { // Make sure results match up with results from mapLFEx; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion uint32_t arrs[] = { 0, 0, 0, 0 }; mapLFEx(l, arrs, true); assert_eq(arrs[c], ret); } #endif return ret; } /** * Given row i and character c, return the row that the LF mapping maps * i to on character c. */ template inline uint32_t Ebwt::mapLF1(uint32_t row, const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity) ) const { #ifdef EBWT_STATS const_cast*>(this)->mapLF1cs_++; #endif if(rowL(l) != c || row == _zOff) return 0xffffffff; uint32_t ret; assert_lt(c, 4); assert_geq(c, 0); if(l._fw) ret = countFwSide(l, c); // Forward side else ret = countBwSide(l, c); // Backward side assert_lt(ret, this->_eh._bwtLen); #ifndef NDEBUG if(_sanity && !overrideSanity) { // Make sure results match up with results from mapLFEx; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion uint32_t arrs[] = { 0, 0, 0, 0 }; mapLFEx(l, arrs, true); assert_eq(arrs[c], ret); } #endif return ret; } /** * Given row i and character c, return the row that the LF mapping maps * i to on character c. 
*/ template inline int Ebwt::mapLF1(uint32_t& row, const SideLocus& l ASSERT_ONLY(, bool overrideSanity) ) const { #ifdef EBWT_STATS const_cast*>(this)->mapLF1s_++; #endif if(row == _zOff) return -1; int c = rowL(l); assert_lt(c, 4); assert_geq(c, 0); if(l._fw) row = countFwSide(l, c); // Forward side else row = countBwSide(l, c); // Backward side assert_lt(row, this->_eh._bwtLen); #ifndef NDEBUG if(_sanity && !overrideSanity) { // Make sure results match up with results from mapLFEx; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion uint32_t arrs[] = { 0, 0, 0, 0 }; mapLFEx(l, arrs, true); assert_eq(arrs[c], row); } #endif return c; } /** * Take an offset into the joined text and translate it into the * reference of the index it falls on, the offset into the reference, * and the length of the reference. Use a binary search through the * sorted list of reference fragment ranges t */ template void Ebwt::joinedToTextOff(uint32_t qlen, uint32_t off, uint32_t& tidx, uint32_t& textoff, uint32_t& tlen) const { uint32_t top = 0; uint32_t bot = _nFrag; // 1 greater than largest addressable element uint32_t elt = 0xffffffff; // Begin binary search while(true) { ASSERT_ONLY(uint32_t oldelt = elt); elt = top + ((bot - top) >> 1); assert_neq(oldelt, elt); // must have made progress uint32_t lower = _rstarts[elt*3]; uint32_t upper; if(elt == _nFrag-1) { upper = _eh._len; } else { upper = _rstarts[((elt+1)*3)]; } assert_gt(upper, lower); uint32_t fraglen = upper - lower; if(lower <= off) { if(upper > off) { // not last element, but it's within // off is in this range; check if it falls off if(off + qlen > upper) { // it falls off; signal no-go and return tidx = 0xffffffff; assert_lt(elt, _nFrag-1); return; } tidx = _rstarts[(elt*3)+1]; assert_lt(tidx, this->_nPat); assert_leq(fraglen, this->_plen[tidx]); // it doesn't fall off; now calculate textoff. 
// Initially it's the number of characters that precede // the alignment in the fragment uint32_t fragoff = off - _rstarts[(elt*3)]; if(!this->_fw) { fragoff = fraglen - fragoff - 1; fragoff -= (qlen-1); } // Add the alignment's offset into the fragment // ('fragoff') to the fragment's offset within the text textoff = fragoff + _rstarts[(elt*3)+2]; assert_lt(textoff, this->_plen[tidx]); break; // done with binary search } else { // 'off' belongs somewhere in the region between elt // and bot top = elt; } } else { // 'off' belongs somewhere in the region between top and // elt bot = elt; } // continue with binary search } tlen = this->_plen[tidx]; } /** * Report a potential match at offset 'off' with pattern length * 'qlen'. Filter out spurious matches that span texts. */ template inline bool Ebwt::report(const String& query, String* quals, String* name, bool color, char primer, char trimc, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector& mmui32, const std::vector& refcs, size_t numMms, uint32_t off, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams& params) const { VMSG_NL("In report"); assert_geq(cost, (uint32_t)(stratum << 14)); assert_lt(off, this->_eh._len); uint32_t tidx; uint32_t textoff; uint32_t tlen; joinedToTextOff(qlen, off, tidx, textoff, tlen); if(tidx == 0xffffffff) { return false; } return params.reportHit( query, // read sequence quals, // read quality values name, // read name color, // true -> read is colorspace primer, trimc, colExEnds, // true -> exclude nucleotides on ends snpPhred, // phred probability of SNP ref, // reference sequence rmap_, // map to another reference coordinate system _fw, // true = index is forward; false = mirror mmui32, // mismatch positions refcs, // reference characters for mms numMms, // # mismatches make_pair(tidx, textoff), // position make_pair(0, 0), // (bogus) mate position true, // (bogus) mate orientation 0, 
// (bogus) mate length make_pair(top, bot), // arrows tlen, // textlen qlen, // qlen stratum, // alignment stratum cost, // cost, including stratum & quality penalty bot-top-1, // # other hits patid, // pattern id seed, // pseudo-random seed 0); // mate (0 = unpaired) } #include "row_chaser.h" /** * Report a result. Involves walking backwards along the original * string by way of the LF-mapping until we reach a marked SA row or * the row corresponding to the 0th suffix. A marked row's offset * into the original string can be read directly from the this->_offs[] * array. */ template inline bool Ebwt::reportChaseOne(const String& query, String* quals, String* name, bool color, char primer, char trimc, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector& mmui32, const std::vector& refcs, size_t numMms, uint32_t i, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams& params, SideLocus *l) const { VMSG_NL("In reportChaseOne"); uint32_t off; uint32_t jumps = 0; ASSERT_ONLY(uint32_t origi = i); SideLocus myl; const uint32_t offMask = this->_eh._offMask; const uint32_t offRate = this->_eh._offRate; const uint32_t* offs = this->_offs; // If the caller didn't give us a pre-calculated (and prefetched) // locus, then we have to do that now if(l == NULL) { l = &myl; l->initFromRow(i, this->_eh, this->_ebwt); } assert(l != NULL); assert(l->valid()); // Walk along until we reach the next marked row to the left while(((i & offMask) != i) && i != _zOff) { // Not a marked row; walk left one more char uint32_t newi = mapLF(*l); // calc next row assert_neq(newi, i); i = newi; // update row l->initFromRow(i, this->_eh, this->_ebwt); // update locus jumps++; } // This is a marked row if(i == _zOff) { // Special case: it's the row corresponding to the // lexicographically smallest suffix, which is implicitly // marked 0 off = jumps; VMSG_NL("reportChaseOne found zoff off=" << off << " 
(jumps=" << jumps << ")"); } else { // Normal marked row, calculate offset of row i off = offs[i >> offRate] + jumps; VMSG_NL("reportChaseOne found off=" << off << " (jumps=" << jumps << ")"); } #ifndef NDEBUG { uint32_t rcoff = RowChaser::toFlatRefOff(this, qlen, origi); assert_eq(rcoff, off); } #endif return report(query, quals, name, color, primer, trimc, colExEnds, snpPhred, ref, mmui32, refcs, numMms, off, top, bot, qlen, stratum, cost, patid, seed, params); } /** * Report a result. Involves walking backwards along the original * string by way of the LF-mapping until we reach a marked SA row or * the row corresponding to the 0th suffix. A marked row's offset * into the original string can be read directly from the this->_offs[] * array. */ template inline bool Ebwt::reportReconstruct(const String& query, String* quals, String* name, String& lbuf, String& rbuf, const uint32_t *mmui32, const char* refcs, size_t numMms, uint32_t i, uint32_t top, uint32_t bot, uint32_t qlen, int stratum, const EbwtSearchParams& params, SideLocus *l) const { VMSG_NL("In reportReconstruct"); assert_gt(_eh._isaLen, 0); // Must have inverse suffix array to reconstruct uint32_t off; uint32_t jumps = 0; SideLocus myl; const uint32_t offMask = this->_eh._offMask; const uint32_t offRate = this->_eh._offRate; const uint32_t* offs = this->_offs; const uint32_t* isa = this->_isa; assert(isa != NULL); if(l == NULL) { l = &myl; myl.initFromRow(i, this->_eh, this->_ebwt); } assert(l != NULL); clear(lbuf); clear(rbuf); // Walk along until we reach the next marked row to the left while(((i & offMask) != i) && i != _zOff) { // Not a marked row; walk left one more char int c = rowL(*l); appendValue(lbuf, (Dna5)c); uint32_t newi; assert_lt(c, 4); assert_geq(c, 0); if(l->_fw) newi = countFwSide(*l, c); // Forward side else newi = countBwSide(*l, c); // Backward side assert_lt(newi, this->_eh._bwtLen); assert_neq(newi, i); i = newi; // update row l->initFromRow(i, this->_eh, this->_ebwt); // update 
locus jumps++; } // This is a marked row if(i == _zOff) { // Special case: it's the row corresponding to the // lexicographically smallest suffix, which is implicitly // marked 0 off = jumps; VMSG_NL("reportChaseOne found zoff off=" << off << " (jumps=" << jumps << ")"); } else { // Normal marked row, calculate offset of row i off = offs[i >> offRate] + jumps; VMSG_NL("reportChaseOne found off=" << off << " (jumps=" << jumps << ")"); } // 'off' now holds the text offset of the first (leftmost) position // involved in the alignment. Next we call joinedToTextOff to // check whether the seed is valid (i.e., does not straddle a // boundary between two reference seuqences) and to obtain its // extents uint32_t tidx; // the index (id) of the reference we hit in uint32_t textoff; // the offset of the alignment within the reference uint32_t tlen; // length of reference seed hit in joinedToTextOff(qlen, off, tidx, textoff, tlen); if(tidx == 0xffffffff) { // The seed straddled a reference boundary, and so is spurious. // Return false, indicating that we shouldn't stop. return false; } if(jumps > textoff) { // In our progress toward a marked row, we passed the boundary // between the reference sequence containing the seed and the // reference sequence to the left of it. That's OK, we just // need to knock off the extra characters we added to 'lbuf'. 
assert_eq(jumps, length(lbuf)); _setLength(lbuf, textoff); jumps = textoff; assert_eq(textoff, length(lbuf)); } else if(jumps < textoff) { // Keep walking until we reach the end of the reference assert_neq(i, _zOff); uint32_t diff = textoff-jumps; for(size_t j = 0; j < diff; j++) { // Not a marked row; walk left one more char int c = rowL(*l); appendValue(lbuf, (Dna5)c); uint32_t newi; assert_lt(c, 4); assert_geq(c, 0); if(l->_fw) newi = countFwSide(*l, c); // Forward side else newi = countBwSide(*l, c); // Backward side assert_lt(newi, this->_eh._bwtLen); assert_neq(newi, i); i = newi; // update row assert_neq(i, _zOff); l->initFromRow(i, this->_eh, this->_ebwt); // update locus jumps++; } assert_eq(textoff, jumps); assert_eq(textoff, length(lbuf)); } assert_eq(textoff, jumps); assert_eq(textoff, length(lbuf)); // Calculate the right-hand extent of the reference uint32_t ref_right = off - textoff + tlen; // Round the right-hand extent to the nearest ISA element that maps // to it or a character to its right uint32_t ref_right_rounded = ref_right; if((ref_right_rounded & _eh._isaMask) != ref_right_rounded) { ref_right_rounded = ((ref_right_rounded >> _eh._isaRate)+1) << _eh._isaRate; } // TODO: handle case where ref_right_rounded is off the end of _isa // Let the current suffix-array elt be determined by the ISA if((ref_right_rounded >> _eh._isaRate) >= _eh._isaLen) { i = _eh._len; ref_right_rounded = _eh._len; } else { i = isa[ref_right_rounded >> _eh._isaRate]; } uint32_t right_steps_rounded = ref_right_rounded - (off + qlen); uint32_t right_steps = ref_right - (off + qlen); l->initFromRow(i, this->_eh, this->_ebwt); // update locus for(size_t j = 0; j < right_steps_rounded; j++) { // Not a marked row; walk left one more char int c = rowL(*l); appendValue(rbuf, (Dna5)c); uint32_t newi; assert_lt(c, 4); assert_geq(c, 0); if(l->_fw) newi = countFwSide(*l, c); // Forward side else newi = countBwSide(*l, c); // Backward side assert_lt(newi, this->_eh._bwtLen); 
assert_neq(newi, i); i = newi; // update row assert_neq(i, _zOff); l->initFromRow(i, this->_eh, this->_ebwt); // update locus jumps++; } if(right_steps_rounded > right_steps) { jumps -= (right_steps_rounded - right_steps); _setLength(rbuf, right_steps); } assert_eq(right_steps, length(rbuf)); assert_eq(tlen, jumps + qlen); ::reverseInPlace(lbuf); ::reverseInPlace(rbuf); { cout << "reportReconstruct:" << endl << " " << lbuf << query << rbuf << endl; cout << " "; for(size_t i = 0; i < length(lbuf); i++) cout << " "; cout << query << endl; } // Now we've reconstructed the return false; } /** * Transform this Ebwt into the original string in linear time by using * the LF mapping to walk backwards starting at the row correpsonding * to the end of the string. The result is written to s. The Ebwt * must be in memory. */ template void Ebwt::restore(TStr& s) const { assert(isInMemory()); resize(s, this->_eh._len, Exact()); uint32_t jumps = 0; uint32_t i = this->_eh._len; // should point to final SA elt (starting with '$') SideLocus l(i, this->_eh, this->_ebwt); while(i != _zOff) { assert_lt(jumps, this->_eh._len); //if(_verbose) cout << "restore: i: " << i << endl; // Not a marked row; go back a char in the original string uint32_t newi = mapLF(l); assert_neq(newi, i); s[this->_eh._len - jumps - 1] = rowL(l); i = newi; l.initFromRow(i, this->_eh, this->_ebwt); jumps++; } assert_eq(jumps, this->_eh._len); } /** * Check that this Ebwt, when restored via restore(), matches up with * the given array of reference sequences. For sanity checking. 
*/ template void Ebwt::checkOrigs(const vector >& os, bool color, bool mirror) const { TStr rest; restore(rest); uint32_t restOff = 0; size_t i = 0, j = 0; if(mirror) { // TODO: FIXME return; } while(i < os.size()) { size_t olen = length(os[i]); int lastorig = -1; for(; j < olen; j++) { size_t joff = j; if(mirror) joff = olen - j - 1; if((int)os[i][joff] == 4) { // Skip over Ns lastorig = -1; if(!mirror) { while(j < olen && (int)os[i][j] == 4) j++; } else { while(j < olen && (int)os[i][olen-j-1] == 4) j++; } j--; continue; } if(lastorig == -1 && color) { lastorig = os[i][joff]; continue; } if(color) { assert_neq(-1, lastorig); assert_eq(dinuc2color[(int)os[i][joff]][lastorig], rest[restOff]); } else { assert_eq(os[i][joff], rest[restOff]); } lastorig = (int)os[i][joff]; restOff++; } if(j == length(os[i])) { // Moved to next sequence i++; j = 0; } else { // Just jumped over a gap } } } /////////////////////////////////////////////////////////////////////// // // Functions for reading and writing Ebwts // /////////////////////////////////////////////////////////////////////// /** * Read an Ebwt from file with given filename. 
*/ template void Ebwt::readIntoMemory( int color, int needEntireRev, bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, bool startVerbose) { bool switchEndian; // dummy; caller doesn't care #ifdef BOWTIE_MM char *mmFile[] = { NULL, NULL }; #endif if(_in1Str.length() > 0) { if(_verbose || startVerbose) { cerr << " About to open input files: "; logTime(cerr); } #ifdef BOWTIE_MM // Initialize our primary and secondary input-stream fields if(_in1 != -1) close(_in1); if(_verbose || startVerbose) { cerr << "Opening \"" << _in1Str << "\"" << endl; } if((_in1 = open(_in1Str.c_str(), O_RDONLY)) < 0) { cerr << "Could not open index file " << _in1Str << endl; } if(_in2 != -1) close(_in2); if(_verbose || startVerbose) { cerr << "Opening \"" << _in2Str << "\"" << endl; } if((_in2 = open(_in2Str.c_str(), O_RDONLY)) < 0) { cerr << "Could not open index file " << _in2Str << endl; } #else // Initialize our primary and secondary input-stream fields if(_in1 != NULL) fclose(_in1); if(_verbose || startVerbose) cerr << "Opening \"" << _in1Str << "\"" << endl; if((_in1 = fopen(_in1Str.c_str(), "rb")) == NULL) { cerr << "Could not open index file " << _in1Str << endl; } if(_in2 != NULL) fclose(_in2); if(_verbose || startVerbose) cerr << "Opening \"" << _in2Str << "\"" << endl; if((_in2 = fopen(_in2Str.c_str(), "rb")) == NULL) { cerr << "Could not open index file " << _in2Str << endl; } #endif if(_verbose || startVerbose) { cerr << " Finished opening input files: "; logTime(cerr); } #ifdef BOWTIE_MM if(_useMm /*&& !justHeader*/) { const char *names[] = {_in1Str.c_str(), _in2Str.c_str()}; int fds[] = { _in1, _in2 }; for(int i = 0; i < 2; i++) { if(_verbose || startVerbose) { cerr << " Memory-mapping input file " << (i+1) << ": "; logTime(cerr); } struct stat sbuf; if (stat(names[i], &sbuf) == -1) { perror("stat"); cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl; throw 1; } mmFile[i] = (char*)mmap((void *)0, sbuf.st_size, 
PROT_READ, MAP_SHARED, fds[i], 0); if(mmFile[i] == (void *)(-1)) { perror("mmap"); cerr << "Error: Could not memory-map the index file " << names[i] << endl; throw 1; } if(mmSweep) { int sum = 0; for(off_t j = 0; j < sbuf.st_size; j += 1024) { sum += (int) mmFile[i][j]; } if(startVerbose) { cerr << " Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": "; logTime(cerr); } } } mmFile1_ = mmFile[0]; mmFile2_ = mmFile[1]; } #endif } #ifdef BOWTIE_MM else if(_useMm && !justHeader) { mmFile[0] = mmFile1_; mmFile[1] = mmFile2_; } if(_useMm && !justHeader) { assert(mmFile[0] == mmFile1_); assert(mmFile[1] == mmFile2_); } #endif if(_verbose || startVerbose) { cerr << " Reading header: "; logTime(cerr); } // Read endianness hints from both streams size_t bytesRead = 0; switchEndian = false; uint32_t one = readU32(_in1, switchEndian); // 1st word of primary stream bytesRead += 4; #ifndef NDEBUG assert_eq(one, readU32(_in2, switchEndian)); // should match! #else readU32(_in2, switchEndian); #endif if(one != 1) { assert_eq((1u<<24), one); assert_eq(1, endianSwapU32(one)); switchEndian = true; } // Can't switch endianness and use memory-mapped files; in order to // support this, someone has to modify the file to switch // endiannesses appropriately, and we can't do this inside Bowtie // or we might be setting up a race condition with other processes. if(switchEndian && _useMm) { cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl; throw 1; } // Reads header entries one by one from primary stream uint32_t len = readU32(_in1, switchEndian); bytesRead += 4; int32_t lineRate = readI32(_in1, switchEndian); bytesRead += 4; int32_t linesPerSide = readI32(_in1, switchEndian); bytesRead += 4; int32_t offRate = readI32(_in1, switchEndian); bytesRead += 4; // TODO: add isaRate to the actual file format (right now, the // user has to tell us whether there's an ISA sample and what the // sampling rate is. 
int32_t isaRate = _overrideIsaRate; int32_t ftabChars = readI32(_in1, switchEndian); bytesRead += 4; // chunkRate was deprecated in an earlier version of Bowtie; now // we use it to hold flags. int32_t flags = readI32(_in1, switchEndian); bool entireRev = false; if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) { if(color != -1 && !color) { cerr << "Error: -C was not specified when running bowtie, but index is in colorspace. If" << endl << "your reads are in colorspace, please use the -C option. If your reads are not" << endl << "in colorspace, please use a normal index (one built without specifying -C to" << endl << "bowtie-build)." << endl; throw 1; } color = 1; } else if(flags < 0) { if(color != -1 && color) { cerr << "Error: -C was specified when running bowtie, but index is not in colorspace. If" << endl << "your reads are in colorspace, please use a colorspace index (one built using" << endl << "bowtie-build -C). If your reads are not in colorspace, don't specify -C when" << endl << "running bowtie." << endl; throw 1; } color = 0; } if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) == 0)) { if(needEntireRev != -1 && needEntireRev != 0) { cerr << "Error: This index is not compatible with this version of bowtie. Please use a" << endl << "current version of bowtie-build." 
<< endl; throw 1; } } else entireRev = true; bytesRead += 4; // Create a new EbwtParams from the entries read from primary stream EbwtParams *eh; bool deleteEh = false; if(params != NULL) { params->init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireRev); if(_verbose || startVerbose) params->print(cerr); eh = params; } else { eh = new EbwtParams(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireRev); deleteEh = true; } // Set up overridden suffix-array-sample parameters uint32_t offsLen = eh->_offsLen; uint32_t offRateDiff = 0; uint32_t offsLenSampled = offsLen; if(_overrideOffRate > offRate) { offRateDiff = _overrideOffRate - offRate; } if(offRateDiff > 0) { offsLenSampled >>= offRateDiff; if((offsLen & ~(0xffffffff << offRateDiff)) != 0) { offsLenSampled++; } } // Set up overridden inverted-suffix-array-sample parameters uint32_t isaLen = eh->_isaLen; uint32_t isaRateDiff = 0; uint32_t isaLenSampled = isaLen; if(_overrideIsaRate > isaRate) { isaRateDiff = _overrideIsaRate - isaRate; } if(isaRateDiff > 0) { isaLenSampled >>= isaRateDiff; if((isaLen & ~(0xffffffff << isaRateDiff)) != 0) { isaLenSampled++; } } // Can't override the offrate or isarate and use memory-mapped // files; ultimately, all processes need to copy the sparser sample // into their own memory spaces. 
if(_useMm && (offRateDiff || isaRateDiff)) { cerr << "Error: Can't use memory-mapped files when the offrate or isarate is overridden" << endl; throw 1; } // Read nPat from primary stream this->_nPat = readI32(_in1, switchEndian); bytesRead += 4; if(this->_plen != NULL && !_useMm) { // Delete it so that we can re-read it delete[] this->_plen; this->_plen = NULL; } // Read plen from primary stream if(_useMm) { #ifdef BOWTIE_MM this->_plen = (uint32_t*)(mmFile[0] + bytesRead); bytesRead += this->_nPat*4; lseek(_in1, this->_nPat*4, SEEK_CUR); #endif } else { try { if(_verbose || startVerbose) { cerr << "Reading plen (" << this->_nPat << "): "; logTime(cerr); } this->_plen = new uint32_t[this->_nPat]; if(switchEndian) { for(uint32_t i = 0; i < this->_nPat; i++) { this->_plen[i] = readU32(_in1, switchEndian); } } else { MM_READ_RET r = MM_READ(_in1, (void*)this->_plen, this->_nPat*4); if(r != (MM_READ_RET)(this->_nPat*4)) { cerr << "Error reading _plen[] array: " << r << ", " << (this->_nPat*4) << endl; throw 1; } } } catch(bad_alloc& e) { cerr << "Out of memory allocating plen[] in Ebwt::read()" << " at " << __FILE__ << ":" << __LINE__ << endl; throw e; } } bool shmemLeader; // TODO: I'm not consistent on what "header" means. Here I'm using // "header" to mean everything that would exist in memory if we // started to build the Ebwt but stopped short of the build*() step // (i.e. everything up to and including join()). 
if(justHeader) goto done; this->_nFrag = readU32(_in1, switchEndian); bytesRead += 4; if(_verbose || startVerbose) { cerr << "Reading rstarts (" << this->_nFrag*3 << "): "; logTime(cerr); } assert_geq(this->_nFrag, this->_nPat); if(_useMm) { #ifdef BOWTIE_MM this->_rstarts = (uint32_t*)(mmFile[0] + bytesRead); bytesRead += this->_nFrag*4*3; lseek(_in1, this->_nFrag*4*3, SEEK_CUR); #endif } else { this->_rstarts = new uint32_t[this->_nFrag*3]; if(switchEndian) { for(uint32_t i = 0; i < this->_nFrag*3; i += 3) { // fragment starting position in joined reference // string, text id, and fragment offset within text this->_rstarts[i] = readU32(_in1, switchEndian); this->_rstarts[i+1] = readU32(_in1, switchEndian); this->_rstarts[i+2] = readU32(_in1, switchEndian); } } else { MM_READ_RET r = MM_READ(_in1, (void *)this->_rstarts, this->_nFrag*4*3); if(r != (MM_READ_RET)(this->_nFrag*4*3)) { cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*4*3) << endl; throw 1; } } } if(_useMm) { #ifdef BOWTIE_MM this->_ebwt = (uint8_t*)(mmFile[0] + bytesRead); bytesRead += eh->_ebwtTotLen; lseek(_in1, eh->_ebwtTotLen, SEEK_CUR); #endif } else { // Allocate ebwt (big allocation) if(_verbose || startVerbose) { cerr << "Reading ebwt (" << eh->_ebwtTotLen << "): "; logTime(cerr); } bool shmemLeader = true; if(useShmem_) { shmemLeader = ALLOC_SHARED_U8( (_in1Str + "[ebwt]"), eh->_ebwtTotLen, &this->_ebwt, "ebwt[]", (_verbose || startVerbose)); if(_verbose || startVerbose) { cerr << " shared-mem " << (shmemLeader ? "leader" : "follower") << endl; } } else { try { this->_ebwt = new uint8_t[eh->_ebwtTotLen]; } catch(bad_alloc& e) { cerr << "Out of memory allocating the ebwt[] array for the Bowtie index. Please try" << endl << "again on a computer with more memory." 
<< endl; throw 1; } } if(shmemLeader) { // Read ebwt from primary stream MM_READ_RET r = MM_READ(_in1, (void *)this->_ebwt, eh->_ebwtTotLen); if(r != (MM_READ_RET)eh->_ebwtTotLen) { cerr << "Error reading ebwt array: returned " << r << ", length was " << (eh->_ebwtTotLen) << endl << "Your index files may be corrupt; please try re-building or re-downloading." << endl << "A complete index consists of 6 files: XYZ.1.ebwt, XYZ.2.ebwt, XYZ.3.ebwt," << endl << "XYZ.4.ebwt, XYZ.rev.1.ebwt, and XYZ.rev.2.ebwt. The XYZ.1.ebwt and " << endl << "XYZ.rev.1.ebwt files should have the same size, as should the XYZ.2.ebwt and" << endl << "XYZ.rev.2.ebwt files." << endl; throw 1; } if(switchEndian) { uint8_t *side = this->_ebwt; for(size_t i = 0; i < eh->_numSides; i++) { uint32_t *cums = reinterpret_cast(side + eh->_sideSz - 8); cums[0] = endianSwapU32(cums[0]); cums[1] = endianSwapU32(cums[1]); side += this->_eh._sideSz; } } if(useShmem_) NOTIFY_SHARED(this->_ebwt, eh->_ebwtTotLen); } else { // Seek past the data and wait until master is finished MM_SEEK(_in1, eh->_ebwtTotLen, SEEK_CUR); if(useShmem_) WAIT_SHARED(this->_ebwt, eh->_ebwtTotLen); } } // Read zOff from primary stream _zOff = readU32(_in1, switchEndian); bytesRead += 4; assert_lt(_zOff, len); try { // Read fchr from primary stream if(_verbose || startVerbose) cerr << "Reading fchr (5)" << endl; if(_useMm) { #ifdef BOWTIE_MM this->_fchr = (uint32_t*)(mmFile[0] + bytesRead); bytesRead += 5*4; lseek(_in1, 5*4, SEEK_CUR); #endif } else { this->_fchr = new uint32_t[5]; for(int i = 0; i < 5; i++) { this->_fchr[i] = readU32(_in1, switchEndian); assert_leq(this->_fchr[i], len); if(i > 0) assert_geq(this->_fchr[i], this->_fchr[i-1]); } } assert_gt(this->_fchr[4], this->_fchr[0]); // Read ftab from primary stream if(_verbose || startVerbose) { cerr << "Reading ftab (" << eh->_ftabLen << "): "; logTime(cerr); } if(_useMm) { #ifdef BOWTIE_MM this->_ftab = (uint32_t*)(mmFile[0] + bytesRead); bytesRead += eh->_ftabLen*4; 
lseek(_in1, eh->_ftabLen*4, SEEK_CUR); #endif } else { this->_ftab = new uint32_t[eh->_ftabLen]; if(switchEndian) { for(uint32_t i = 0; i < eh->_ftabLen; i++) this->_ftab[i] = readU32(_in1, switchEndian); } else { MM_READ_RET r = MM_READ(_in1, (void *)this->_ftab, eh->_ftabLen*4); if(r != (MM_READ_RET)(eh->_ftabLen*4)) { cerr << "Error reading _ftab[] array: " << r << ", " << (eh->_ftabLen*4) << endl; throw 1; } } } // Read etab from primary stream if(_verbose || startVerbose) { cerr << "Reading eftab (" << eh->_eftabLen << "): "; logTime(cerr); } if(_useMm) { #ifdef BOWTIE_MM this->_eftab = (uint32_t*)(mmFile[0] + bytesRead); bytesRead += eh->_eftabLen*4; lseek(_in1, eh->_eftabLen*4, SEEK_CUR); #endif } else { this->_eftab = new uint32_t[eh->_eftabLen]; if(switchEndian) { for(uint32_t i = 0; i < eh->_eftabLen; i++) this->_eftab[i] = readU32(_in1, switchEndian); } else { MM_READ_RET r = MM_READ(_in1, (void *)this->_eftab, eh->_eftabLen*4); if(r != (MM_READ_RET)(eh->_eftabLen*4)) { cerr << "Error reading _eftab[] array: " << r << ", " << (eh->_eftabLen*4) << endl; throw 1; } } } for(uint32_t i = 0; i < eh->_eftabLen; i++) { if(i > 0 && this->_eftab[i] > 0) { assert_geq(this->_eftab[i], this->_eftab[i-1]); } else if(i > 0 && this->_eftab[i-1] == 0) { assert_eq(0, this->_eftab[i]); } } } catch(bad_alloc& e) { cerr << "Out of memory allocating fchr[], ftab[] or eftab[] arrays for the Bowtie index." << endl << "Please try again on a computer with more memory." 
<< endl; throw 1; } // Read reference sequence names from primary index file (or not, // if --refidx is specified) if(loadNames) { while(true) { char c = '\0'; if(MM_READ(_in1, (void *)(&c), (size_t)1) != (MM_READ_RET)1) break; bytesRead++; if(c == '\0') break; else if(c == '\n') { this->_refnames.push_back(""); } else { if(this->_refnames.size() == 0) { this->_refnames.push_back(""); } this->_refnames.back().push_back(c); } } } bytesRead = 4; // reset for secondary index file (already read 1-sentinel) shmemLeader = true; if(_verbose || startVerbose) { cerr << "Reading offs (" << offsLenSampled << " 32-bit words): "; logTime(cerr); } if(!_useMm) { if(!useShmem_) { // Allocate offs_ try { this->_offs = new uint32_t[offsLenSampled]; } catch(bad_alloc& e) { cerr << "Out of memory allocating the offs[] array for the Bowtie index." << endl << "Please try again on a computer with more memory." << endl; throw 1; } } else { shmemLeader = ALLOC_SHARED_U32( (_in2Str + "[offs]"), offsLenSampled*4, &this->_offs, "offs", (_verbose || startVerbose)); } } if(_overrideOffRate < 32) { if(shmemLeader) { // Allocate offs (big allocation) if(switchEndian || offRateDiff > 0) { assert(!_useMm); const uint32_t blockMaxSz = (2 * 1024 * 1024); // 2 MB block size const uint32_t blockMaxSzU32 = (blockMaxSz >> 2); // # U32s per block char *buf = new char[blockMaxSz]; for(uint32_t i = 0; i < offsLen; i += blockMaxSzU32) { uint32_t block = min(blockMaxSzU32, offsLen - i); MM_READ_RET r = MM_READ(_in2, (void *)buf, block << 2); if(r != (MM_READ_RET)(block << 2)) { cerr << "Error reading block of offs array: " << r << ", " << (block << 2) << endl << "Your index files may be corrupt; please try re-building or re-downloading." << endl << "A complete index consists of 6 files: XYZ.1.ebwt, XYZ.2.ebwt, XYZ.3.ebwt," << endl << "XYZ.4.ebwt, XYZ.rev.1.ebwt, and XYZ.rev.2.ebwt. 
The XYZ.1.ebwt and " << endl << "XYZ.rev.1.ebwt files should have the same size, as should the XYZ.2.ebwt and" << endl << "XYZ.rev.2.ebwt files." << endl; throw 1; } uint32_t idx = i >> offRateDiff; for(uint32_t j = 0; j < block; j += (1 << offRateDiff)) { assert_lt(idx, offsLenSampled); this->_offs[idx] = ((uint32_t*)buf)[j]; if(switchEndian) { this->_offs[idx] = endianSwapU32(this->_offs[idx]); } idx++; } } delete[] buf; } else { if(_useMm) { #ifdef BOWTIE_MM this->_offs = (uint32_t*)(mmFile[1] + bytesRead); bytesRead += (offsLen << 2); lseek(_in2, (offsLen << 2), SEEK_CUR); #endif } else { // If any of the high two bits are set if((offsLen & 0xf0000000) != 0) { if(sizeof(char *) <= 4) { cerr << "Sanity error: sizeof(char *) <= 4 but offsLen is " << hex << offsLen << endl; throw 1; } // offsLen << 4 overflows sometimes, so do it in four reads char *offs = (char *)this->_offs; for(int i = 0; i < 16; i++) { MM_READ_RET r = MM_READ(_in2, (void*)offs, offsLen >> 2); if(r != (MM_READ_RET)(offsLen >> 2)) { cerr << "Error reading block of _offs[] array: " << r << ", " << (offsLen >> 2) << endl; throw 1; } offs += (offsLen >> 2); } } else { // Do it all in one read MM_READ_RET r = MM_READ(_in2, (void*)this->_offs, offsLen << 2); if(r != (MM_READ_RET)(offsLen << 2)) { cerr << "Error reading _offs[] array: " << r << ", " << (offsLen << 2) << endl; throw 1; } } } } { ASSERT_ONLY(Bitset offsSeen(len+1)); for(uint32_t i = 0; i < offsLenSampled; i++) { assert(!offsSeen.test(this->_offs[i])); ASSERT_ONLY(offsSeen.set(this->_offs[i])); assert_leq(this->_offs[i], len); } } if(useShmem_) NOTIFY_SHARED(this->_offs, offsLenSampled*4); } else { // Not the shmem leader MM_SEEK(_in2, offsLenSampled*4, SEEK_CUR); if(useShmem_) WAIT_SHARED(this->_offs, offsLenSampled*4); } } // Allocate _isa[] (big allocation) if(_verbose || startVerbose) { cerr << "Reading isa (" << isaLenSampled << "): "; logTime(cerr); } if(!_useMm) { try { this->_isa = new uint32_t[isaLenSampled]; } catch(bad_alloc& 
e) { cerr << "Out of memory allocating the isa[] array for the Bowtie index." << endl << "Please try again on a computer with more memory." << endl; throw 1; } } // Read _isa[] if(switchEndian || isaRateDiff > 0) { assert(!_useMm); for(uint32_t i = 0; i < isaLen; i++) { if((i & ~(0xffffffff << isaRateDiff)) != 0) { char tmp[4]; MM_READ_RET r = MM_READ(_in2, (void *)tmp, 4); if(r != (MM_READ_RET)4) { cerr << "Error reading a word of the _isa[] array: " << r << ", 4" << endl; throw 1; } } else { uint32_t idx = i >> isaRateDiff; assert_lt(idx, isaLenSampled); this->_isa[idx] = readU32(_in2, switchEndian); } } } else { if(_useMm) { #ifdef BOWTIE_MM this->_isa = (uint32_t*)(mmFile[1] + bytesRead); bytesRead += (isaLen << 2); lseek(_in2, (isaLen << 2), SEEK_CUR); #endif } else { MM_READ_RET r = MM_READ(_in2, (void *)this->_isa, isaLen*4); if(r != (MM_READ_RET)(isaLen*4)) { cerr << "Error reading _isa[] array: " << r << ", " << (isaLen*4) << endl; throw 1; } } } { ASSERT_ONLY(Bitset isasSeen(len+1)); for(uint32_t i = 0; i < isaLenSampled; i++) { assert(!isasSeen.test(this->_isa[i])); ASSERT_ONLY(isasSeen.set(this->_isa[i])); assert_leq(this->_isa[i], len); } } this->postReadInit(*eh); // Initialize fields of Ebwt not read from file if(_verbose || startVerbose) print(cerr, *eh); // The fact that _ebwt and friends actually point to something // (other than NULL) now signals to other member functions that the // Ebwt is loaded into memory. done: // Exit hatch for both justHeader and !justHeader // Be kind if(deleteEh) delete eh; #ifdef BOWTIE_MM lseek(_in1, 0, SEEK_SET); lseek(_in2, 0, SEEK_SET); #else rewind(_in1); rewind(_in2); #endif } /** * Read reference names from an input stream 'in' for an Ebwt primary * file and store them in 'refnames'. */ static inline void readEbwtRefnames(istream& in, vector& refnames) { // _in1 must already be open with the get cursor at the // beginning and no error flags set. 
assert(in.good()); assert_eq((streamoff)in.tellg(), ios::beg); // Read endianness hints from both streams bool switchEndian = false; uint32_t one = readU32(in, switchEndian); // 1st word of primary stream if(one != 1) { assert_eq((1u<<24), one); switchEndian = true; } // Reads header entries one by one from primary stream uint32_t len = readU32(in, switchEndian); int32_t lineRate = readI32(in, switchEndian); int32_t linesPerSide = readI32(in, switchEndian); int32_t offRate = readI32(in, switchEndian); int32_t ftabChars = readI32(in, switchEndian); // BTL: chunkRate is now deprecated int32_t flags = readI32(in, switchEndian); bool color = false; bool entireReverse = false; if(flags < 0) { color = (((-flags) & EBWT_COLOR) != 0); entireReverse = (((-flags) & EBWT_ENTIRE_REV) != 0); } // Create a new EbwtParams from the entries read from primary stream EbwtParams eh(len, lineRate, linesPerSide, offRate, -1, ftabChars, color, entireReverse); uint32_t nPat = readI32(in, switchEndian); // nPat in.seekg(nPat*4, ios_base::cur); // skip plen // Skip rstarts uint32_t nFrag = readU32(in, switchEndian); in.seekg(nFrag*4*3, ios_base::cur); // Skip ebwt in.seekg(eh._ebwtTotLen, ios_base::cur); // Skip zOff from primary stream readU32(in, switchEndian); // Skip fchr in.seekg(5 * 4, ios_base::cur); // Skip ftab in.seekg(eh._ftabLen*4, ios_base::cur); // Skip eftab in.seekg(eh._eftabLen*4, ios_base::cur); // Read reference sequence names from primary index file while(true) { char c = '\0'; in.read(&c, 1); if(in.eof()) break; if(c == '\0') break; else if(c == '\n') { refnames.push_back(""); } else { if(refnames.size() == 0) { refnames.push_back(""); } refnames.back().push_back(c); } } if(refnames.back().empty()) { refnames.pop_back(); } // Be kind in.clear(); in.seekg(0, ios::beg); assert(in.good()); } /** * Read reference names from the index with basename 'in' and store * them in 'refnames'. 
*/ static inline void readEbwtRefnames(const string& instr, vector& refnames) { ifstream in; // Initialize our primary and secondary input-stream fields in.open((instr + ".1.ebwt").c_str(), ios_base::in | ios::binary); if(!in.is_open()) { throw EbwtFileOpenException("Cannot open file " + instr); } assert(in.is_open()); assert(in.good()); assert_eq((streamoff)in.tellg(), ios::beg); readEbwtRefnames(in, refnames); } /** * Read just enough of the Ebwt's header to get its flags */ static inline int32_t readFlags(const string& instr) { ifstream in; // Initialize our primary and secondary input-stream fields in.open((instr + ".1.ebwt").c_str(), ios_base::in | ios::binary); if(!in.is_open()) { throw EbwtFileOpenException("Cannot open file " + instr); } assert(in.is_open()); assert(in.good()); bool switchEndian = false; uint32_t one = readU32(in, switchEndian); // 1st word of primary stream if(one != 1) { assert_eq((1u<<24), one); assert_eq(1, endianSwapU32(one)); switchEndian = true; } readU32(in, switchEndian); readI32(in, switchEndian); readI32(in, switchEndian); readI32(in, switchEndian); readI32(in, switchEndian); int32_t flags = readI32(in, switchEndian); return flags; } /** * Read just enough of the Ebwt's header to determine whether it's * colorspace. */ static inline bool readEbwtColor(const string& instr) { int32_t flags = readFlags(instr); if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) { return true; } else { return false; } } /** * Read just enough of the Ebwt's header to determine whether it's * entirely reversed. */ static inline bool readEntireReverse(const string& instr) { int32_t flags = readFlags(instr); if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) != 0)) { return true; } else { return false; } } /** * Write an extended Burrows-Wheeler transform to a pair of output * streams. * * @param out1 output stream to primary file * @param out2 output stream to secondary file * @param be write in big endian? 
*/ template void Ebwt::writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const { const EbwtParams& eh = this->_eh; assert(eh.repOk()); uint32_t be = this->toBe(); assert(out1.good()); assert(out2.good()); // When building an Ebwt, these header parameters are known // "up-front", i.e., they can be written to disk immediately, // before we join() or buildToDisk() writeI32(out1, 1, be); // endian hint for priamry stream writeI32(out2, 1, be); // endian hint for secondary stream writeU32(out1, eh._len, be); // length of string (and bwt and suffix array) writeI32(out1, eh._lineRate, be); // 2^lineRate = size in bytes of 1 line writeI32(out1, eh._linesPerSide, be); // not used writeI32(out1, eh._offRate, be); // every 2^offRate chars is "marked" writeI32(out1, eh._ftabChars, be); // number of 2-bit chars used to address ftab int32_t flags = 1; if(eh._color) flags |= EBWT_COLOR; if(eh._entireReverse) flags |= EBWT_ENTIRE_REV; writeI32(out1, -flags, be); // BTL: chunkRate is now deprecated if(!justHeader) { assert(isInMemory()); // These Ebwt parameters are known after the inputs strings have // been joined() but before they have been built(). These can // written to the disk next and then discarded from memory. writeU32(out1, this->_nPat, be); for(uint32_t i = 0; i < this->_nPat; i++) writeU32(out1, this->_plen[i], be); assert_geq(this->_nFrag, this->_nPat); writeU32(out1, this->_nFrag, be); for(uint32_t i = 0; i < this->_nFrag*3; i++) writeU32(out1, this->_rstarts[i], be); // These Ebwt parameters are discovered only as the Ebwt is being // built (in buildToDisk()). Of these, only 'offs' and 'ebwt' are // terribly large. 'ebwt' is written to the primary file and then // discarded from memory as it is built; 'offs' is similarly // written to the secondary file and discarded. 
out1.write((const char *)this->ebwt(), eh._ebwtTotLen); writeU32(out1, this->zOff(), be); uint32_t offsLen = eh._offsLen; for(uint32_t i = 0; i < offsLen; i++) writeU32(out2, this->_offs[i], be); uint32_t isaLen = eh._isaLen; for(uint32_t i = 0; i < isaLen; i++) writeU32(out2, this->_isa[i], be); // 'fchr', 'ftab' and 'eftab' are not fully determined until the // loop is finished, so they are written to the primary file after // all of 'ebwt' has already been written and only then discarded // from memory. for(int i = 0; i < 5; i++) writeU32(out1, this->_fchr[i], be); for(uint32_t i = 0; i < eh._ftabLen; i++) writeU32(out1, this->ftab()[i], be); for(uint32_t i = 0; i < eh._eftabLen; i++) writeU32(out1, this->eftab()[i], be); } } /** * Given a pair of strings representing output filenames, and assuming * this Ebwt object is currently in memory, write out this Ebwt to the * specified files. * * If sanity-checking is enabled, then once the streams have been * fully written and closed, we reopen them and read them into a * (hopefully) exact copy of this Ebwt. We then assert that the * current Ebwt and the copy match in all of their fields. */ template void Ebwt::writeFromMemory(bool justHeader, const string& out1, const string& out2) const { const EbwtParams& eh = this->_eh; assert(isInMemory()); assert(eh.repOk()); ofstream fout1(out1.c_str(), ios::binary); ofstream fout2(out2.c_str(), ios::binary); writeFromMemory(justHeader, fout1, fout2); fout1.close(); fout2.close(); // Read the file back in and assert that all components match if(_sanity) { if(_verbose) cout << "Re-reading \"" << out1 << "\"/\"" << out2 << "\" for sanity check" << endl; Ebwt copy(out1, out2, _verbose, _sanity); assert(!isInMemory()); copy.loadIntoMemory(eh._color ? 
1 : 0, -1, false, false); assert(isInMemory()); assert_eq(eh._lineRate, copy.eh()._lineRate); assert_eq(eh._linesPerSide, copy.eh()._linesPerSide); assert_eq(eh._offRate, copy.eh()._offRate); assert_eq(eh._isaRate, copy.eh()._isaRate); assert_eq(eh._ftabChars, copy.eh()._ftabChars); assert_eq(eh._len, copy.eh()._len); assert_eq(_zOff, copy.zOff()); assert_eq(_zEbwtBpOff, copy.zEbwtBpOff()); assert_eq(_zEbwtByteOff, copy.zEbwtByteOff()); assert_eq(_nPat, copy.nPat()); for(uint32_t i = 0; i < _nPat; i++) assert_eq(this->_plen[i], copy.plen()[i]); assert_eq(this->_nFrag, copy.nFrag()); for(uint32_t i = 0; i < this->nFrag*3; i++) { assert_eq(this->_rstarts[i], copy.rstarts()[i]); } for(uint32_t i = 0; i < 5; i++) assert_eq(this->_fchr[i], copy.fchr()[i]); for(uint32_t i = 0; i < eh._ftabLen; i++) assert_eq(this->ftab()[i], copy.ftab()[i]); for(uint32_t i = 0; i < eh._eftabLen; i++) assert_eq(this->eftab()[i], copy.eftab()[i]); for(uint32_t i = 0; i < eh._offsLen; i++) assert_eq(this->_offs[i], copy.offs()[i]); for(uint32_t i = 0; i < eh._isaLen; i++) assert_eq(this->_isa[i], copy.isa()[i]); for(uint32_t i = 0; i < eh._ebwtTotLen; i++) assert_eq(this->ebwt()[i], copy.ebwt()[i]); //copy.sanityCheckAll(); if(_verbose) cout << "Read-in check passed for \"" << out1 << "\"/\"" << out2 << "\"" << endl; } } /////////////////////////////////////////////////////////////////////// // // Functions for building Ebwts // /////////////////////////////////////////////////////////////////////// /** * Join several text strings together in a way that's compatible with * the text-chunking scheme dictated by chunkRate parameter. * * The non-static member Ebwt::join additionally builds auxilliary * arrays that maintain a mapping between chunks in the joined string * and the original text strings. 
*/ template TStr Ebwt::join(vector& l, uint32_t seed) { RandomSource rand; // reproducible given same seed rand.init(seed); TStr ret; size_t guessLen = 0; for(size_t i = 0; i < l.size(); i++) { guessLen += length(l[i]); } reserve(ret, guessLen, Exact()); for(size_t i = 0; i < l.size(); i++) { TStr& s = l[i]; assert_gt(length(s), 0); append(ret, s); } return ret; } /** * Join several text strings together in a way that's compatible with * the text-chunking scheme dictated by chunkRate parameter. * * The non-static member Ebwt::join additionally builds auxilliary * arrays that maintain a mapping between chunks in the joined string * and the original text strings. */ template TStr Ebwt::join(vector& l, vector& szs, uint32_t sztot, const RefReadInParams& refparams, uint32_t seed) { RandomSource rand; // reproducible given same seed rand.init(seed); RefReadInParams rpcp = refparams; TStr ret; size_t guessLen = sztot; reserve(ret, guessLen, Exact()); ASSERT_ONLY(size_t szsi = 0); for(size_t i = 0; i < l.size(); i++) { // For each sequence we can pull out of istream l[i]... assert(!l[i]->eof()); bool first = true; while(!l[i]->eof()) { RefRecord rec = fastaRefReadAppend(*l[i], first, ret, rpcp); #ifndef ACCOUNT_FOR_ALL_GAP_REFS if(rec.first && rec.len == 0) rec.first = false; #endif first = false; size_t bases = rec.len; assert_eq(rec.off, szs[szsi].off); assert_eq(rec.len, szs[szsi].len); assert_eq(rec.first, szs[szsi].first); ASSERT_ONLY(szsi++); if(bases == 0) continue; } } return ret; } /** * Join several text strings together according to the text-chunking * scheme specified in the EbwtParams. Ebwt fields calculated in this * function are written directly to disk. * * It is assumed, but not required, that the header values have already * been written to 'out1' before this function is called. * * The static member Ebwt::join just returns a joined version of a * list of strings without building any of the auxiliary arrays. 
* Because the pseudo-random number generator is the same, we expect * this function and the static function to give the same result given * the same seed. */ template void Ebwt::joinToDisk( vector& l, vector& szs, vector& plens, uint32_t sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2, uint32_t seed) { RandomSource rand; // reproducible given same seed rand.init(seed); RefReadInParams rpcp = refparams; assert_gt(szs.size(), 0); assert_gt(l.size(), 0); assert_gt(sztot, 0); // Not every fragment represents a distinct sequence - many // fragments may correspond to a single sequence. Count the // number of sequences here by counting the number of "first" // fragments. this->_nPat = 0; this->_nFrag = 0; #ifdef ACCOUNT_FOR_ALL_GAP_REFS int nGapFrag = 0; #endif for(size_t i = 0; i < szs.size(); i++) { if(szs[i].len > 0) this->_nFrag++; #ifdef ACCOUNT_FOR_ALL_GAP_REFS if(szs[i].len == 0 && szs[i].off > 0) nGapFrag++; if(szs[i].first && szs[i].len > 0) this->_nPat++; #else // For all records where len=0 and first=1, set first=0 assert(szs[i].len > 0 || !szs[i].first); if(szs[i].first) this->_nPat++; #endif } assert_gt(this->_nPat, 0); assert_geq(this->_nFrag, this->_nPat); this->_rstarts = NULL; writeU32(out1, this->_nPat, this->toBe()); assert_eq(plens.size(), this->_nPat); // Allocate plen[] try { this->_plen = new uint32_t[this->_nPat]; } catch(bad_alloc& e) { cerr << "Out of memory allocating plen[] in Ebwt::join()" << " at " << __FILE__ << ":" << __LINE__ << endl; throw e; } // For each pattern, set plen for(size_t i = 0; i < plens.size(); i++) { this->_plen[i] = plens[i]; writeU32(out1, this->_plen[i], this->toBe()); } // Write the number of fragments writeU32(out1, this->_nFrag, this->toBe()); size_t seqsRead = 0; ASSERT_ONLY(uint32_t szsi = 0); ASSERT_ONLY(uint32_t entsWritten = 0); // For each filebuf for(unsigned int i = 0; i < l.size(); i++) { assert(!l[i]->eof()); bool first = true; uint32_t patoff = 0; // For each *fragment* 
(not necessary an entire sequence) we // can pull out of istream l[i]... while(!l[i]->eof()) { string name; // Push a new name onto our vector _refnames.push_back(""); //uint32_t oldRetLen = length(ret); RefRecord rec = fastaRefReadAppend(*l[i], first, ret, rpcp, &_refnames.back()); #ifndef ACCOUNT_FOR_ALL_GAP_REFS if(rec.first && rec.len == 0) rec.first = false; #endif first = false; if(rec.first) { if(_refnames.back().length() == 0) { // If name was empty, replace with an index ostringstream stm; stm << (_refnames.size()-1); _refnames.back() = stm.str(); } } else { // This record didn't actually start a new sequence so // no need to add a name //assert_eq(0, _refnames.back().length()); _refnames.pop_back(); } assert_lt(szsi, szs.size()); assert(szs[szsi].first == 0 || szs[szsi].first == 1); assert_eq(rec.off, szs[szsi].off); assert_eq(rec.len, szs[szsi].len); // szs[szsi].first == 2 sometimes?!?! g++ is unable to do // the following correctly, regardless of how I write it //assert((rec.first == 0) == (szs[szsi].first == 0)); assert(rec.first || rec.off > 0); ASSERT_ONLY(szsi++); #ifdef ACCOUNT_FOR_ALL_GAP_REFS if(rec.len == 0) continue; if(rec.first && rec.len > 0) seqsRead++; assert_leq(rec.len, this->_plen[seqsRead-1]); #else if(rec.first) seqsRead++; if(rec.len == 0) continue; assert_leq(rec.len, this->_plen[seqsRead-1]); #endif // Reset the patoff if this is the first fragment if(rec.first) patoff = 0; patoff += rec.off; // add fragment's offset from end of last frag. 
// Adjust rpcps //uint32_t seq = seqsRead-1; ASSERT_ONLY(entsWritten++); // This is where rstarts elements are written to the output stream //writeU32(out1, oldRetLen, this->toBe()); // offset from beginning of joined string //writeU32(out1, seq, this->toBe()); // sequence id //writeU32(out1, patoff, this->toBe()); // offset into sequence patoff += rec.len; } assert_gt(szsi, 0); l[i]->reset(); assert(!l[i]->eof()); #ifndef NDEBUG int c = l[i]->get(); assert_eq('>', c); assert(!l[i]->eof()); l[i]->reset(); assert(!l[i]->eof()); #endif } assert_eq(entsWritten, this->_nFrag); } /** * Build an Ebwt from a string 's' and its suffix array 'sa' (which * might actually be a suffix array *builder* that builds blocks of the * array on demand). The bulk of the Ebwt, i.e. the ebwt and offs * arrays, is written directly to disk. This is by design: keeping * those arrays in memory needlessly increases the footprint of the * building process. Instead, we prefer to build the Ebwt directly * "to disk" and then read it back into memory later as necessary. * * It is assumed that the header values and join-related values (nPat, * plen) have already been written to 'out1' before this function * is called. When this function is finished, it will have * additionally written ebwt, zOff, fchr, ftab and eftab to the primary * file and offs to the secondary file. * * Assume DNA/RNA/any alphabet with 4 or fewer elements. * Assume occ array entries are 32 bits each. 
* * @param sa the suffix array to convert to a Ebwt * @param buildISA whether to output an ISA sample into out2 after * the SA sample * @param s the original string * @param out */ template void Ebwt::buildToDisk(InorderBlockwiseSA& sa, const TStr& s, ostream& out1, ostream& out2) { const EbwtParams& eh = this->_eh; assert(eh.repOk()); assert_eq(length(s)+1, sa.size()); assert_eq(length(s), eh._len); assert_gt(eh._lineRate, 3); assert(sa.suffixItrIsReset()); assert_leq((int)ValueSize::VALUE, 4); uint32_t len = eh._len; uint32_t ftabLen = eh._ftabLen; uint32_t sideSz = eh._sideSz; uint32_t ebwtTotSz = eh._ebwtTotSz; uint32_t fchr[] = {0, 0, 0, 0, 0}; uint32_t* ftab = NULL; uint32_t zOff = 0xffffffff; // Save # of occurrences of each character as we walk along the bwt uint32_t occ[4] = {0, 0, 0, 0}; // Save 'G' and 'T' occurrences between backward and forward buckets uint32_t occSave[2] = {0, 0}; // Record rows that should "absorb" adjacent rows in the ftab. // The absorbed rows represent suffixes shorter than the ftabChars // cutoff. uint8_t absorbCnt = 0; uint8_t *absorbFtab; try { VMSG_NL("Allocating ftab, absorbFtab"); ftab = new uint32_t[ftabLen]; memset(ftab, 0, 4 * ftabLen); absorbFtab = new uint8_t[ftabLen]; memset(absorbFtab, 0, ftabLen); } catch(bad_alloc &e) { cerr << "Out of memory allocating ftab[] or absorbFtab[] " << "in Ebwt::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } assert(ftab != NULL); assert(absorbFtab != NULL); // Allocate the side buffer; holds a single side as its being // constructed and then written to disk. Reused across all sides. 
#ifdef SIXTY4_FORMAT uint64_t *ebwtSide = NULL; #else uint8_t *ebwtSide = NULL; #endif try { #ifdef SIXTY4_FORMAT ebwtSide = new uint64_t[sideSz >> 3]; #else ebwtSide = new uint8_t[sideSz]; #endif } catch(bad_alloc &e) { cerr << "Out of memory allocating ebwtSide[] in " << "Ebwt::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } assert(ebwtSide != NULL); // Allocate a buffer to hold the ISA sample, which we accumulate in // the loop and then output at the end. We can't write output the // ISA right away because the order in which we calculate its // elements is based on the suffix array, which we only see bit by // bit uint32_t *isaSample = NULL; if(eh._isaRate >= 0) { try { isaSample = new uint32_t[eh._isaLen]; } catch(bad_alloc &e) { cerr << "Out of memory allocating isaSample[] in " << "Ebwt::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } assert(isaSample != NULL); } // Points to the base offset within ebwt for the side currently // being written uint32_t side = 0; // Points to a byte offset from 'side' within ebwt[] where next // char should be written #ifdef SIXTY4_FORMAT int sideCur = (eh._sideBwtSz >> 3) - 1; #else int sideCur = eh._sideBwtSz - 1; #endif // Whether we're assembling a forward or a reverse bucket bool fw = false; // Did we just finish writing a forward bucket? (Must be true when // we exit the loop.) ASSERT_ONLY(bool wroteFwBucket = false); // Have we skipped the '$' in the last column yet? 
ASSERT_ONLY(bool dollarSkipped = false); uint32_t si = 0; // string offset (chars) ASSERT_ONLY(uint32_t lastSufInt = 0); ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix // array (as opposed to the padding at the // end) // Iterate over packed bwt bytes VMSG_NL("Entering Ebwt loop"); ASSERT_ONLY(uint32_t beforeEbwtOff = (uint32_t)out1.tellp()); while(side < ebwtTotSz) { ASSERT_ONLY(wroteFwBucket = false); // Sanity-check our cursor into the side buffer assert_geq(sideCur, 0); assert_lt(sideCur, (int)eh._sideBwtSz); assert_eq(0, side % sideSz); // 'side' must be on side boundary ebwtSide[sideCur] = 0; // clear assert_lt(side + sideCur, ebwtTotSz); // Iterate over bit-pairs in the si'th character of the BWT #ifdef SIXTY4_FORMAT for(int bpi = 0; bpi < 32; bpi++, si++) #else for(int bpi = 0; bpi < 4; bpi++, si++) #endif { int bwtChar; bool count = true; if(si <= len) { // Still in the SA; extract the bwtChar uint32_t saElt = sa.nextSuffix(); // (that might have triggered sa to calc next suf block) if(isaSample != NULL && (saElt & eh._isaMask) == saElt) { // This element belongs in the ISA sample. 
Add // an entry mapping the text offset to the offset // into the suffix array that holds the suffix // beginning with the character at that text offset assert_lt((saElt >> eh._isaRate), eh._isaLen); isaSample[saElt >> eh._isaRate] = si; } if(saElt == 0) { // Don't add the '$' in the last column to the BWT // transform; we can't encode a $ (only A C T or G) // and counting it as, say, an A, will mess up the // LR mapping bwtChar = 0; count = false; ASSERT_ONLY(dollarSkipped = true); zOff = si; // remember the SA row that // corresponds to the 0th suffix } else { bwtChar = (int)(Dna)(s[saElt-1]); assert_lt(bwtChar, 4); // Update the fchr fchr[bwtChar]++; } // Update ftab if((len-saElt) >= (uint32_t)eh._ftabChars) { // Turn the first ftabChars characters of the // suffix into an integer index into ftab uint32_t sufInt = 0; for(int i = 0; i < eh._ftabChars; i++) { sufInt <<= 2; assert_lt(i, (int)(len-saElt)); sufInt |= (unsigned char)(Dna)(s[saElt+i]); } // Assert that this prefix-of-suffix is greater // than or equal to the last one (true b/c the // suffix array is sorted) #ifndef NDEBUG if(lastSufInt > 0) assert_geq(sufInt, lastSufInt); lastSufInt = sufInt; #endif // Update ftab assert_lt(sufInt+1, ftabLen); ftab[sufInt+1]++; if(absorbCnt > 0) { // Absorb all short suffixes since the last // transition into this transition absorbFtab[sufInt] = absorbCnt; absorbCnt = 0; } } else { // Otherwise if suffix is fewer than ftabChars // characters long, then add it to the 'absorbCnt'; // it will be absorbed into the next transition assert_lt(absorbCnt, 255); absorbCnt++; } // Suffix array offset boundary? 
- update offset array if((si & eh._offMask) == si) { assert_lt((si >> eh._offRate), eh._offsLen); // Write offsets directly to the secondary output // stream, thereby avoiding keeping them in memory writeU32(out2, saElt, this->toBe()); } } else { // Strayed off the end of the SA, now we're just // padding out a bucket #ifndef NDEBUG if(inSA) { // Assert that we wrote all the characters in the // string before now assert_eq(si, len+1); inSA = false; } #endif // 'A' used for padding; important that padding be // counted in the occ[] array bwtChar = 0; } if(count) occ[bwtChar]++; // Append BWT char to bwt section of current side if(fw) { // Forward bucket: fill from least to most #ifdef SIXTY4_FORMAT ebwtSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1)); if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0); #else pack_2b_in_8b(bwtChar, ebwtSide[sideCur], bpi); assert_eq((ebwtSide[sideCur] >> (bpi*2)) & 3, bwtChar); #endif } else { // Backward bucket: fill from most to least #ifdef SIXTY4_FORMAT ebwtSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1)); if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0); #else pack_2b_in_8b(bwtChar, ebwtSide[sideCur], 3-bpi); assert_eq((ebwtSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar); #endif } } // end loop over bit-pairs assert_eq(dollarSkipped ? 
3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3); #ifdef SIXTY4_FORMAT assert_eq(0, si & 31); #else assert_eq(0, si & 3); #endif if(fw) sideCur++; else sideCur--; #ifdef SIXTY4_FORMAT if(sideCur == (int)eh._sideBwtSz >> 3) #else if(sideCur == (int)eh._sideBwtSz) #endif { // Forward side boundary assert_eq(0, si % eh._sideBwtLen); #ifdef SIXTY4_FORMAT sideCur = (eh._sideBwtSz >> 3) - 1; #else sideCur = eh._sideBwtSz - 1; #endif assert(fw); fw = false; ASSERT_ONLY(wroteFwBucket = true); // Write 'G' and 'T' assert_leq(occSave[0], occ[2]); assert_leq(occSave[1], occ[3]); uint32_t *u32side = reinterpret_cast(ebwtSide); side += sideSz; assert_leq(side, eh._ebwtTotSz); u32side[(sideSz >> 2)-2] = endianizeU32(occSave[0], this->toBe()); u32side[(sideSz >> 2)-1] = endianizeU32(occSave[1], this->toBe()); // Write forward side to primary file out1.write((const char *)ebwtSide, sideSz); } else if (sideCur == -1) { // Backward side boundary assert_eq(0, si % eh._sideBwtLen); sideCur = 0; assert(!fw); fw = true; // Write 'A' and 'C' uint32_t *u32side = reinterpret_cast(ebwtSide); side += sideSz; assert_leq(side, eh._ebwtTotSz); u32side[(sideSz >> 2)-2] = endianizeU32(occ[0], this->toBe()); u32side[(sideSz >> 2)-1] = endianizeU32(occ[1], this->toBe()); occSave[0] = occ[2]; // save 'G' count occSave[1] = occ[3]; // save 'T' count // Write backward side to primary file out1.write((const char *)ebwtSide, sideSz); } } VMSG_NL("Exited Ebwt loop"); assert(ftab != NULL); assert_neq(zOff, 0xffffffff); if(absorbCnt > 0) { // Absorb any trailing, as-yet-unabsorbed short suffixes into // the last element of ftab absorbFtab[ftabLen-1] = absorbCnt; } // Assert that our loop counter got incremented right to the end assert_eq(side, eh._ebwtTotSz); // Assert that we wrote the expected amount to out1 assert_eq(((uint32_t)out1.tellp() - beforeEbwtOff), eh._ebwtTotSz); // assert that the last thing we did was write a forward bucket assert(wroteFwBucket); // // Write zOff to primary stream // 
writeU32(out1, zOff, this->toBe()); // // Finish building fchr // // Exclusive prefix sum on fchr for(int i = 1; i < 4; i++) { fchr[i] += fchr[i-1]; } assert_eq(fchr[3], len); // Shift everybody up by one for(int i = 4; i >= 1; i--) { fchr[i] = fchr[i-1]; } fchr[0] = 0; if(_verbose) { for(int i = 0; i < 5; i++) cout << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl; } // Write fchr to primary file for(int i = 0; i < 5; i++) { writeU32(out1, fchr[i], this->toBe()); } // // Finish building ftab and build eftab // // Prefix sum on ftable uint32_t eftabLen = 0; assert_eq(0, absorbFtab[0]); for(uint32_t i = 1; i < ftabLen; i++) { if(absorbFtab[i] > 0) eftabLen += 2; } assert_leq(eftabLen, (uint32_t)eh._ftabChars*2); eftabLen = eh._ftabChars*2; uint32_t *eftab = NULL; try { eftab = new uint32_t[eftabLen]; memset(eftab, 0, 4 * eftabLen); } catch(bad_alloc &e) { cerr << "Out of memory allocating eftab[] " << "in Ebwt::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } assert(eftab != NULL); uint32_t eftabCur = 0; for(uint32_t i = 1; i < ftabLen; i++) { uint32_t lo = ftab[i] + Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, i-1); if(absorbFtab[i] > 0) { // Skip a number of short pattern indicated by absorbFtab[i] uint32_t hi = lo + absorbFtab[i]; assert_lt(eftabCur*2+1, eftabLen); eftab[eftabCur*2] = lo; eftab[eftabCur*2+1] = hi; ftab[i] = (eftabCur++) ^ 0xffffffff; // insert pointer into eftab assert_eq(lo, Ebwt::ftabLo(ftab, eftab, len, ftabLen, eftabLen, i)); assert_eq(hi, Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, i)); } else { ftab[i] = lo; } } assert_eq(Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, ftabLen-1), len+1); // Write ftab to primary file for(uint32_t i = 0; i < ftabLen; i++) { writeU32(out1, ftab[i], this->toBe()); } // Write eftab to primary file for(uint32_t i = 0; i < eftabLen; i++) { writeU32(out1, eftab[i], this->toBe()); } // Write isa to primary file if(isaSample != NULL) { ASSERT_ONLY(Bitset sawISA(eh._len+1)); 
for(uint32_t i = 0; i < eh._isaLen; i++) { uint32_t s = isaSample[i]; assert_leq(s, eh._len); assert(!sawISA.test(s)); ASSERT_ONLY(sawISA.set(s)); writeU32(out2, s, this->toBe()); } delete[] isaSample; } delete[] ftab; delete[] eftab; delete[] absorbFtab; // Note: if you'd like to sanity-check the Ebwt, you'll have to // read it back into memory first! assert(!isInMemory()); VMSG_NL("Exiting Ebwt::buildToDisk()"); } /** * Try to find the Bowtie index specified by the user. First try the * exact path given by the user. Then try the user-provided string * appended onto the path of the "indexes" subdirectory below this * executable, then try the provided string appended onto * "$BOWTIE_INDEXES/". */ string adjustEbwtBase(const string& cmdline, const string& ebwtFileBase, bool verbose = false); #endif /*EBWT_H_*/