/* * Copyright 2015, Daehwan Kim * * This file is part of HISAT 2. * * HISAT 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HISAT 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HISAT 2. If not, see . */ #ifndef GFM_H_ #define GFM_H_ #include #include #include #include #include #include #include #include #include #include #include #include #ifdef BOWTIE_MM #include #include #endif #include "shmem.h" #include "alphabet.h" #include "assert_helpers.h" #include "bitpack.h" #include "blockwise_sa.h" #include "endian_swap.h" #include "word_io.h" #include "random_source.h" #include "ref_read.h" #include "threading.h" #include "str_util.h" #include "mm.h" #include "timer.h" #include "reference.h" #include "search_globals.h" #include "ds.h" #include "random_source.h" #include "mem_ids.h" #include "btypes.h" #include "tokenize.h" #include "repeat.h" #include "repeat_kmer.h" #ifdef POPCNT_CAPABILITY #include "processor_support.h" #endif #include "gbwt_graph.h" using namespace std; // From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl extern uint8_t cCntLUT_4[4][4][256]; extern uint8_t cCntLUT_4_rev[4][4][256]; extern uint8_t cCntBIT[8][256]; extern bool threeN; static const uint64_t c_table[4] = { 0xffffffffffffffff, 0xaaaaaaaaaaaaaaaa, 0x5555555555555555, 0x0000000000000000 }; #ifndef VMSG_NL #define VMSG_NL(...) \ if(this->verbose()) { \ stringstream tmp; \ tmp << __VA_ARGS__ << endl; \ this->verbose(tmp.str()); \ } #endif #ifndef VMSG #define VMSG(...) 
\ if(this->verbose()) { \ stringstream tmp; \ tmp << __VA_ARGS__; \ this->verbose(tmp.str()); \ } #endif /** * Flags describing type of Ebwt. */ enum GFM_FLAGS { GFM_ENTIRE_REV = 4 // true -> reverse Ebwt is the whole // concatenated string reversed, rather than // each stretch reversed }; /** * Extended Burrows-Wheeler transform header. This together with the * actual data arrays and other text-specific parameters defined in * class Ebwt constitute the entire Ebwt. */ template class GFMParams { public: GFMParams() { } GFMParams( index_t len, index_t gbwtLen, index_t numNodes, int32_t lineRate, int32_t offRate, int32_t ftabChars, index_t eftabLen, bool entireReverse) { init(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireReverse); } GFMParams(const GFMParams& gh) { init(gh._len, gh._gbwtLen, gh._numNodes, gh._lineRate, gh._offRate, gh._ftabChars, gh._eftabLen, gh._entireReverse); } void init( index_t len, index_t gbwtLen, index_t numNodes, int32_t lineRate, int32_t offRate, int32_t ftabChars, index_t eftabLen, bool entireReverse) { _entireReverse = entireReverse; _linearFM = (len + 1 == gbwtLen || gbwtLen == 0); _len = len; _gbwtLen = (gbwtLen == 0 ? len + 1 : gbwtLen); _numNodes = (numNodes == 0 ? 
len + 1 : numNodes); if(_linearFM) { _sz = (len+3)/4; _gbwtSz = _gbwtLen/4 + 1; } else { _sz = (len+1)/2; _gbwtSz = _gbwtLen/2 + 1; } _lineRate = lineRate; _origOffRate = offRate; _offRate = offRate; _offMask = std::numeric_limits::max() << _offRate; _ftabChars = ftabChars; _eftabLen = eftabLen; _eftabSz = _eftabLen*sizeof(index_t); _ftabLen = (1 << (_ftabChars*2))+1; _ftabSz = _ftabLen*sizeof(index_t); _offsLen = (_numNodes + (1 << _offRate) - 1) >> _offRate; _offsSz = _offsLen*sizeof(index_t); _lineSz = 1 << _lineRate; _sideSz = _lineSz * 1 /* lines per side */; if(_linearFM) { _sideGbwtSz = _sideSz - (sizeof(index_t) * 4); _sideGbwtLen = _sideGbwtSz << 2; } else { _sideGbwtSz = _sideSz - (sizeof(index_t) * 6); _sideGbwtLen = _sideGbwtSz << 1; } _numSides = (_gbwtSz+(_sideGbwtSz)-1)/(_sideGbwtSz); _numLines = _numSides * 1 /* lines per side */; _gbwtTotLen = _numSides * _sideSz; _gbwtTotSz = _gbwtTotLen; assert(repOk()); } index_t len() const { return _len; } index_t lenNucs() const { return _len; } index_t gbwtLen() const { return _gbwtLen; } index_t sz() const { return _sz; } index_t gbwtSz() const { return _gbwtSz; } int32_t lineRate() const { return _lineRate; } int32_t origOffRate() const { return _origOffRate; } int32_t offRate() const { return _offRate; } index_t offMask() const { return _offMask; } int32_t ftabChars() const { return _ftabChars; } index_t eftabLen() const { return _eftabLen; } index_t eftabSz() const { return _eftabSz; } index_t ftabLen() const { return _ftabLen; } index_t ftabSz() const { return _ftabSz; } index_t offsLen() const { return _offsLen; } index_t offsSz() const { return _offsSz; } index_t lineSz() const { return _lineSz; } index_t sideSz() const { return _sideSz; } index_t sideGbtSz() const { return _sideGbwtSz; } index_t sideGbwtLen() const { return _sideGbwtLen; } index_t numSides() const { return _numSides; } index_t numLines() const { return _numLines; } index_t gbwtTotLen() const { return _gbwtTotLen; } index_t 
gbwtTotSz() const { return _gbwtTotSz; } bool entireReverse() const { return _entireReverse; } bool linearFM() const { return _linearFM; } index_t numNodes() const { return _numNodes; } /** * Set a new suffix-array sampling rate, which involves updating * rate, mask, sample length, and sample size. */ void setOffRate(int __offRate) { _offRate = __offRate; _offMask = std::numeric_limits::max() << _offRate; _offsLen = (_gbwtLen + (1 << _offRate) - 1) >> _offRate; _offsSz = _offsLen * sizeof(index_t); } #ifndef NDEBUG /// Check that this EbwtParams is internally consistent bool repOk() const { // assert_gt(_len, 0); assert_gt(_lineRate, 3); assert_geq(_offRate, 0); assert_leq(_ftabChars, 16); assert_geq(_ftabChars, 1); assert_lt(_lineRate, 32); assert_lt(_ftabChars, 32); assert_eq(0, _gbwtTotSz % _lineSz); return true; } #endif /** * Pretty-print the header contents to the given output stream. */ void print(ostream& out) const { out << "Headers:" << endl << " len: " << _len << endl << " gbwtLen: " << _gbwtLen << endl << " nodes: " << _numNodes << endl << " sz: " << _sz << endl << " gbwtSz: " << _gbwtSz << endl << " lineRate: " << _lineRate << endl << " offRate: " << _offRate << endl << " offMask: 0x" << hex << _offMask << dec << endl << " ftabChars: " << _ftabChars << endl << " eftabLen: " << _eftabLen << endl << " eftabSz: " << _eftabSz << endl << " ftabLen: " << _ftabLen << endl << " ftabSz: " << _ftabSz << endl << " offsLen: " << _offsLen << endl << " offsSz: " << _offsSz << endl << " lineSz: " << _lineSz << endl << " sideSz: " << _sideSz << endl << " sideGbwtSz: " << _sideGbwtSz << endl << " sideGbwtLen: " << _sideGbwtLen << endl << " numSides: " << _numSides << endl << " numLines: " << _numLines << endl << " gbwtTotLen: " << _gbwtTotLen << endl << " gbwtTotSz: " << _gbwtTotSz << endl << " reverse: " << _entireReverse << endl << " linearFM: " << (_linearFM ? 
/**
 * Exception to throw when a file-related error occurs.
 */
class GFMFileOpenException : public std::runtime_error {
public:
    GFMFileOpenException(const std::string& msg = "") :
        std::runtime_error(msg) { }
};

/**
 * Calculate size of file with given name.
 *
 * @param name path of the file to measure
 * @return the file size in bytes, or 0 if the file cannot be opened
 *         or its size cannot be determined
 */
static inline int64_t fileSize(const char* name) {
    std::ifstream f(name, std::ios_base::binary | std::ios_base::in);
    if(!f.is_open() || !f.good()) {
        return 0;
    }
    f.seekg(0, std::ios_base::beg);
    const std::ifstream::pos_type begin_pos = f.tellg();
    f.seekg(0, std::ios_base::end);
    const std::ifstream::pos_type end_pos = f.tellg();
    // tellg() reports failure as pos_type(-1); do not let that leak
    // into the subtraction below and produce a bogus size.
    const std::ifstream::pos_type failed(-1);
    if(begin_pos == failed || end_pos == failed) {
        return 0;
    }
    return static_cast<int64_t>(end_pos - begin_pos);
}
*/ static void initFromTopBot( index_t top, index_t bot, const GFMParams& gp, const uint8_t* gfm, SideLocus& ltop, SideLocus& lbot) { const index_t sideGbwtLen = gp._sideGbwtLen; assert_gt(bot, top); ltop.initFromRow(top, gp, gfm); index_t spread = bot - top; // Many cache misses on the following lines if(ltop._charOff + spread < sideGbwtLen) { lbot._charOff = ltop._charOff + spread; lbot._sideNum = ltop._sideNum; lbot._sideByteOff = ltop._sideByteOff; lbot._by = lbot._charOff >> 2; assert_lt(lbot._by, (int)gp._sideGbwtSz); lbot._bp = lbot._charOff & 0x3; } else { lbot.initFromRow(bot, gp, gfm); } } /** * Calculate SideLocus based on a row and other relevant * information about the shape of the Ebwt. */ void initFromRow( index_t row, const GFMParams& gp, const uint8_t* gfm) { const index_t sideSz = gp._sideSz; // Side length is hard-coded for now; this allows the compiler // to do clever things to accelerate / and %. _sideNum = row / gp._sideGbwtLen; assert_lt(_sideNum, gp._numSides); _charOff = row % gp._sideGbwtLen; _sideByteOff = _sideNum * sideSz; assert_leq(row, gp._gbwtLen); assert_leq(_sideByteOff + sideSz, gp._gbwtTotSz); // Tons of cache misses on the next line _by = _charOff >> 2; // byte within side assert_lt(_by, (int)gp._sideGbwtSz); _bp = _charOff & 0x3; // bit-pair within byte } /** * Init two SideLocus objects from a top/bot pair, using the result * from one call to initFromRow to possibly avoid a second call. 
*/ static void initFromTopBot_bit( index_t top, index_t bot, const GFMParams& gp, const uint8_t* gfm, SideLocus& ltop, SideLocus& lbot) { const index_t sideGbwtLen = gp._sideGbwtLen; // assert_gt(bot, top); ltop.initFromRow_bit(top, gp, gfm); index_t spread = bot - top; // Many cache misses on the following lines if(ltop._charOff + spread < sideGbwtLen) { lbot._charOff = ltop._charOff + spread; lbot._sideNum = ltop._sideNum; lbot._sideByteOff = ltop._sideByteOff; lbot._by = lbot._charOff >> 3; assert_lt(lbot._by, (int)gp._sideGbwtSz); lbot._bp = lbot._charOff & 0x7; } else { lbot.initFromRow_bit(bot, gp, gfm); } } /** * Calculate SideLocus based on a row and other relevant * information about the shape of the Ebwt. */ void initFromRow_bit( index_t row, const GFMParams& gp, const uint8_t* gfm) { const index_t sideSz = gp._sideSz; // Side length is hard-coded for now; this allows the compiler // to do clever things to accelerate / and %. _sideNum = row / gp._sideGbwtLen; assert_lt(_sideNum, gp._numSides); _charOff = row % gp._sideGbwtLen; _sideByteOff = _sideNum * sideSz; assert_lt(row, gp._gbwtLen); assert_leq(_sideByteOff + sideSz, gp._gbwtTotSz); // Tons of cache misses on the next line _by = _charOff >> 3; // byte within side assert_lt(_by, (int)gp._sideGbwtSz); _bp = _charOff & 0x7; // bit-pair within byte } /** * Transform this SideLocus to refer to the next side (i.e. the one * corresponding to the next side downstream). Set all cursors to * point to the beginning of the side. */ void nextSide(const GFMParams& gp) { assert(valid()); _sideByteOff += gp.sideSz(); _sideNum++; _by = _bp = _charOff = 0; assert(valid()); } /** * Return true iff this is an initialized SideLocus */ bool valid() const { if(_bp != -1) { return true; } return false; } /** * Convert locus to BW row it corresponds to. */ index_t toBWRow(const GFMParams& gp) const; #ifndef NDEBUG /** * Check that SideLocus is internally consistent and consistent * with the (provided) EbwtParams. 
*/ bool repOk(const GFMParams& gp) const { ASSERT_ONLY(index_t row = toBWRow(gp)); assert_leq(row, gp._gbwtLen); assert_range(-1, 3, _bp); assert_range(0, (int)gp._sideGbwtSz, _by); return true; } #endif /// Make this look like an invalid SideLocus void invalidate() { _bp = -1; } /** * Return a read-only pointer to the beginning of the top side. */ const uint8_t *side(const uint8_t* gbwt) const { return gbwt + _sideByteOff; } /** * Return a read-only pointer to the beginning of the top side. */ const uint8_t *next_side(const GFMParams& gp, const uint8_t* gbwt) const { if(_sideByteOff + gp._sideSz < gp._ebwtTotSz) { return gbwt + _sideByteOff + gp._sideSz; } else { return NULL; } } index_t _sideByteOff; // offset of top side within ebwt[] index_t _sideNum; // index of side index_t _charOff; // character offset within side int32_t _by; // byte within side (not adjusted for bw sides) int32_t _bp; // bitpair within byte (not adjusted for bw sides) }; /** * Convert locus to BW row it corresponds to. */ template inline index_t SideLocus::toBWRow(const GFMParams& gp) const { return _sideNum * (gp._sideGbwtSz << (gp.linearFM() ? 
#ifdef POPCNT_CAPABILITY // wrapping of "struct"
struct USE_POPCNT_GENERIC {
#endif
    /**
     * Standard bit-bashing population count: return the number of
     * set bits (0..64) in x.
     */
    inline static int pop64(uint64_t x) {
        // Lots of cache misses on following lines (>10K)
        x = x - ((x >> 1) & 0x5555555555555555llu);            // 2-bit sums
        x = (x & 0x3333333333333333llu) +
            ((x >> 2) & 0x3333333333333333llu);                // 4-bit sums
        x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu;            // 8-bit sums
        x = x + (x >> 8);
        x = x + (x >> 16);
        x = x + (x >> 32);
        // The low byte now holds the total, which can be as large as
        // 64 and therefore needs 7 bits; a 6-bit mask would turn a
        // count of 64 into 0.
        return (int)(x & 0x7Fllu);
    }
#ifdef POPCNT_CAPABILITY // wrapping a "struct"
};
#endif
*/ #ifdef POPCNT_CAPABILITY template #endif inline static int countInU64_bits(uint64_t dw) { #ifdef POPCNT_CAPABILITY uint64_t tmp = Operation().pop64(dw); #else uint64_t tmp = pop6464(dw); #endif return (int) tmp; } // Forward declarations for Ebwt class class GFMSearchParams; /** * Extended Burrows-Wheeler transform data. * * An Ebwt may be transferred to and from RAM with calls to * evictFromMemory() and loadIntoMemory(). By default, a newly-created * Ebwt is not loaded into memory; if the user would like to use a * newly-created Ebwt to answer queries, they must first call * loadIntoMemory(). */ template class GFM { public: #define GFM_INITS \ _toBigEndian(currentlyBigEndian()), \ _overrideOffRate(overrideOffRate), \ _verbose(verbose), \ _passMemExc(passMemExc), \ _sanity(sanityCheck), \ fw_(fw), \ _in1(NULL), \ _in2(NULL), \ _nPat(0), \ _nFrag(0), \ _plen(EBWT_CAT), \ _rstarts(EBWT_CAT), \ _fchr(EBWT_CAT), \ _ftab(EBWT_CAT), \ _eftab(EBWT_CAT), \ _offs(EBWT_CAT), \ _gfm(EBWT_CAT), \ _useMm(false), \ useShmem_(false), \ _refnames(EBWT_CAT), \ mmFile1_(NULL), \ mmFile2_(NULL), \ _nthreads(1) GFM() {} /// Construct a GFM from the given input file GFM(const string& in, ALTDB* altdb, RepeatDB* repeatdb, EList* readLens, int needEntireReverse, bool fw, int32_t overrideOffRate, // = -1, int32_t offRatePlus, // = -1, bool useMm, // = false, bool useShmem, // = false, bool mmSweep, // = false, bool loadNames, // = false, bool loadSASamp, // = true, bool loadFtab, // = true, bool loadRstarts, // = true, bool loadSpliceSites, // = true, bool verbose, // = false, bool startVerbose, // = false, bool passMemExc, // = false, bool sanityCheck, // = false) bool useHaplotype, // = false bool skipLoading = false) : GFM_INITS { assert(!useMm || !useShmem); #ifdef POPCNT_CAPABILITY ProcessorSupport ps; _usePOPCNTinstruction = ps.POPCNTenabled(); #endif packed_ = false; _useMm = useMm; useShmem_ = useShmem; _in1Str = in + ".1." + gfm_ext; _in2Str = in + ".2." 
+ gfm_ext; if(skipLoading) return; if(repeatdb == NULL) { readIntoMemory( fw ? -1 : needEntireReverse, // need REF_READ_REVERSE loadSASamp, // load the SA sample portion? loadFtab, // load the ftab & eftab? loadRstarts, // load the rstarts array? true, // stop after loading the header portion? &_gh, // params mmSweep, // mmSweep loadNames, // loadNames startVerbose); // startVerbose // If the offRate has been overridden, reflect that in the // _eh._offRate field if(offRatePlus > 0 && _overrideOffRate == -1) { _overrideOffRate = _gh._offRate + offRatePlus; } if(_overrideOffRate > _gh._offRate) { _gh.setOffRate(_overrideOffRate); assert_eq(_overrideOffRate, _gh._offRate); } } // Read ALTs EList >& alts = altdb->alts(); EList >& haplotypes = altdb->haplotypes(); EList& altnames = altdb->altnames(); alts.clear(); altnames.clear(); string in7Str = in + ".7." + gfm_ext; string in8Str = in + ".8." + gfm_ext; // open alts if(verbose || startVerbose) cerr << "Opening \"" << in7Str.c_str() << "\"" << endl; ifstream in7(in7Str.c_str(), ios::binary); if(!in7.good()) { cerr << "Could not open index file " << in7Str.c_str() << endl; } EList to_alti; index_t to_alti_far = 0; readI32(in7, this->toBe()); index_t numAlts = readIndex(in7, this->toBe()); // open altnames if(verbose || startVerbose) cerr << "Opening \"" << in8Str.c_str() << "\"" << endl; ifstream in8(in8Str.c_str(), ios::binary); if(!in8.good()) { cerr << "Could not open index file " << in8Str.c_str() << endl; } readI32(in8, this->toBe()); index_t numAltnames = readIndex(in8, this->toBe()); assert_eq(numAlts, numAltnames); if(numAlts > 0) { alts.resizeExact(numAlts); alts.clear(); to_alti.resizeExact(numAlts); to_alti.clear(); while(!in7.eof() && !in8.eof()) { alts.expand(); alts.back().read(in7, this->toBe()); to_alti.push_back(to_alti_far); to_alti_far++; altnames.expand(); in8 >> altnames.back(); if(!loadSpliceSites) { if(alts.back().splicesite()) { alts.pop_back(); assert_gt(numAlts, 0); altnames.pop_back(); 
assert_gt(numAltnames, 0); numAlts--; numAltnames--; to_alti.back() = std::numeric_limits::max(); to_alti_far--; } } if(alts.size() == numAlts) break; } } assert_eq(alts.size(), numAlts); assert_eq(to_alti_far, numAlts); assert_eq(alts.size(), altnames.size()); // Check if it hits the end of file, and this routine is needed for backward compatibility if(in7.peek() != std::ifstream::traits_type::eof()) { index_t numHaplotypes = readIndex(in7, this->toBe()); if(numHaplotypes > 0) { haplotypes.resizeExact(numHaplotypes); haplotypes.clear(); while(!in7.eof()) { haplotypes.expand(); haplotypes.back().read(in7, this->toBe()); Haplotype& ht = haplotypes.back(); for(index_t h = 0; h < ht.alts.size(); h++) { ht.alts[h] = to_alti[ht.alts[h]]; } if(haplotypes.size() == numHaplotypes) break; } } if(!useHaplotype) { haplotypes.nullify(); } } // Read repeats _repeat = false; if(repeatdb != NULL) { _repeat = true; // Number of repeat groups in the index index_t numRepeatIndex = readIndex(in7, this->toBe()); assert_gt(numRepeatIndex, 0); EList > repeatLens; repeatLens.resizeExact(numRepeatIndex); for(size_t k = 0; k < numRepeatIndex; k++) { repeatLens[k].first = readIndex(in7, this->toBe()); repeatLens[k].second = readIndex(in7, this->toBe()); } if (readLens != NULL && !readLens->empty()) { // Load subset of repeat groups. 
size_t k = 0; size_t k2 = 0; _repeatIncluded.resizeExact(numRepeatIndex); _repeatIncluded.fillZero(); while(k < numRepeatIndex && k2 < readLens->size()) { if (repeatLens[k].first >= (*readLens)[k2]) { _repeatIncluded[k] = true; k2++; } else { k++; } } // at least last repeat group is included _repeatIncluded[numRepeatIndex - 1] = true; _repeatLens.clear(); for(size_t i = 0; i < numRepeatIndex; i++) { if (_repeatIncluded[i]) { _repeatLens.push_back(repeatLens[i]); } } } else { // Load all repeat groups _repeatLens = repeatLens; _repeatIncluded.resizeExact(numRepeatIndex); _repeatIncluded.fill(true); } repeatdb->read(in7, this->toBe(), _repeatIncluded); index_t numKmertables = readIndex(in7, this->toBe()); EList filePos; filePos.resizeExact(numKmertables); for(size_t k = 0; k < numKmertables; k++) { filePos[k] = readIndex(in7, this->toBe()); } for(size_t k = 0; k < numKmertables; k++) { if(!_repeatIncluded[k]) continue; if(k > 0) { in7.seekg(filePos[k-1]); } _repeat_kmertables.expand(); _repeat_kmertables.back().read(in7, this->toBe()); } in7.seekg(filePos.back()); } in7.close(); in8.close(); // Sort SNPs and Splice Sites based on positions index_t nalts = (index_t)alts.size(); for(index_t s = 0; s < nalts; s++) { ALT alt = alts[s]; if(alt.snp()) altdb->setSNPs(true); if(alt.exon()) altdb->setExons(true); if(alt.splicesite()) { altdb->setSpliceSites(true); alts.push_back(alt); alts.back().left = alt.right; alts.back().right = alt.left; altnames.push_back("ssr"); } else if(alt.deletion()) { alts.push_back(alt); alts.back().pos = alt.pos + alt.len - 1; alts.back().reversed = true; string altname = altnames[s]; altnames.push_back(altname); } } if(alts.size() > 1 && alts.size() > nalts) { assert_eq(alts.size(), altnames.size()); EList, index_t> > buf; buf.resize(alts.size()); EList buf2; buf2.resize(alts.size()); for(size_t i = 0; i < alts.size(); i++) { buf[i].first = alts[i]; buf[i].second = (index_t)i; buf2[i] = altnames[i]; } buf.sort(); for(size_t i = 0; i < 
alts.size(); i++) { alts[i] = buf[i].first; altnames[i] = buf2[buf[i].second]; if(buf[i].second < numAlts) { to_alti[buf[i].second] = i; } } } if(useHaplotype) { EList& haplotype_maxrights = altdb->haplotype_maxrights(); haplotype_maxrights.resizeExact(haplotypes.size()); for(index_t h = 0; h < haplotypes.size(); h++) { Haplotype& ht = haplotypes[h]; for(index_t h2 = 0; h2 < ht.alts.size(); h2++) { ht.alts[h2] = to_alti[ht.alts[h2]]; } if(h == 0) { haplotype_maxrights[h] = ht.right; } else { haplotype_maxrights[h] = std::max(haplotype_maxrights[h - 1], ht.right); } } } assert(repeatdb != NULL || repOk()); } /// Construct an Ebwt from the given header parameters and string /// vector, optionally using a blockwise suffix sorter with the /// given 'bmax' and 'dcv' parameters. The string vector is /// ultimately joined and the joined string is passed to buildToDisk(). GFM( bool packed, int needEntireReverse, int32_t lineRate, int32_t offRate, int32_t ftabChars, const string& file, // base filename for GFM files bool fw, int dcv, EList& szs, index_t sztot, const RefReadInParams& refparams, uint32_t seed, int32_t overrideOffRate = -1, bool verbose = false, bool passMemExc = false, bool sanityCheck = false) : GFM_INITS, _gh( joinedLen(szs), 0, 0, lineRate, offRate, ftabChars, 0, refparams.reverse == REF_READ_REVERSE) { #ifdef POPCNT_CAPABILITY ProcessorSupport ps; _usePOPCNTinstruction = ps.POPCNTenabled(); #endif packed_ = packed; } /// Construct an Ebwt from the given header parameters and string /// vector, optionally using a blockwise suffix sorter with the /// given 'bmax' and 'dcv' parameters. The string vector is /// ultimately joined and the joined string is passed to buildToDisk(). 
template GFM( TStr& s, bool packed, int needEntireReverse, int32_t lineRate, int32_t offRate, int32_t ftabChars, int nthreads, const string& snpfile, const string& htfile, const string& ssfile, const string& exonfile, const string& svfile, const string& repeatfile, const string& outfile, // base filename for GFM files bool fw, bool useBlockwise, index_t bmax, index_t bmaxSqrtMult, index_t bmaxDivN, int dcv, EList& is, EList& szs, index_t sztot, const RefReadInParams& refparams, EList* parent_szs, EList* parent_refnames, uint32_t seed, int32_t overrideOffRate = -1, bool verbose = false, bool passMemExc = false, bool sanityCheck = false) : GFM_INITS, _gh( joinedLen(szs), 0, 0, lineRate, offRate, ftabChars, 0, refparams.reverse == REF_READ_REVERSE) { assert_gt(nthreads, 0); _nthreads = nthreads; #ifdef POPCNT_CAPABILITY ProcessorSupport ps; _usePOPCNTinstruction = ps.POPCNTenabled(); #endif _in1Str = outfile + ".1." + gfm_ext; _in2Str = outfile + ".2." + gfm_ext; packed_ = packed; // Open output files ofstream fout1(_in1Str.c_str(), ios::binary); if(!fout1.good()) { cerr << "Could not open index file for writing: \"" << _in1Str.c_str() << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "HISAT2." << endl; throw 1; } ofstream fout2(_in2Str.c_str(), ios::binary); if(!fout2.good()) { cerr << "Could not open index file for writing: \"" << _in2Str.c_str() << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "HISAT2." 
<< endl; throw 1; } // Build initFromVector( s, snpfile, htfile, ssfile, exonfile, svfile, repeatfile, is, szs, sztot, refparams, fout1, fout2, outfile, useBlockwise, bmax, bmaxSqrtMult, bmaxDivN, dcv, parent_szs, parent_refnames, seed, verbose); // Close output files fout1.flush(); int64_t tellpSz1 = (int64_t)fout1.tellp(); VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary GFM file: " << _in1Str.c_str()); fout1.close(); bool err = false; if(tellpSz1 > fileSize(_in1Str.c_str())) { err = true; cerr << "Index is corrupt: File size for " << _in1Str.c_str() << " should have been " << tellpSz1 << " but is actually " << fileSize(_in1Str.c_str()) << "." << endl; } fout2.flush(); int64_t tellpSz2 = (int64_t)fout2.tellp(); VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary GFM file: " << _in2Str.c_str()); fout2.close(); if(tellpSz2 > fileSize(_in2Str.c_str())) { err = true; cerr << "Index is corrupt: File size for " << _in2Str.c_str() << " should have been " << tellpSz2 << " but is actually " << fileSize(_in2Str.c_str()) << "." << endl; } if(err) { cerr << "Please check if there is a problem with the disk or if disk is full." << endl; throw 1; } // Reopen as input streams VMSG_NL("Re-opening _in1 and _in2 as input streams"); if(_sanity) { VMSG_NL("Sanity-checking Bt2"); assert(!isInMemory()); readIntoMemory( fw ? -1 : needEntireReverse, // 1 -> need the reverse to be reverse-of-concat true, // load SA sample (_offs[])? true, // load ftab (_ftab[] & _eftab[])? true, // load r-starts (_rstarts[])? false, // just load header? NULL, // Params object to fill false, // mm sweep? true, // load names? false); // verbose startup? // sanityCheckAll(refparams.reverse); evictFromMemory(); assert(!isInMemory()); } VMSG_NL("Returning from GFM constructor"); } /** * Static constructor for a pair of forward/reverse indexes for the * given reference string. 
*/ template static pair fromString( const char* str, bool packed, int reverse, bool bigEndian, int32_t lineRate, int32_t offRate, int32_t ftabChars, const string& file, bool useBlockwise, index_t bmax, index_t bmaxSqrtMult, index_t bmaxDivN, int dcv, uint32_t seed, bool verbose, bool autoMem, bool sanity) { EList strs(EBWT_CAT); strs.push_back(std::string(str)); return fromStrings( strs, packed, reverse, bigEndian, lineRate, offRate, ftabChars, file, useBlockwise, bmax, bmaxSqrtMult, bmaxDivN, dcv, seed, verbose, autoMem, sanity); } /** * Static constructor for a pair of forward/reverse indexes for the * given list of reference strings. */ template static pair fromStrings( const EList& strs, bool packed, int reverse, bool bigEndian, int32_t lineRate, int32_t offRate, int32_t ftabChars, const string& file, bool useBlockwise, index_t bmax, index_t bmaxSqrtMult, index_t bmaxDivN, int dcv, uint32_t seed, bool verbose, bool autoMem, bool sanity) { assert(!strs.empty()); EList is(EBWT_CAT); RefReadInParams refparams(false /* color */, REF_READ_FORWARD, false, false); // Adapt sequence strings to stringstreams open for input auto_ptr ss(new stringstream()); for(index_t i = 0; i < strs.size(); i++) { (*ss) << ">" << i << endl << strs[i] << endl; } auto_ptr fb(new FileBuf(ss.get())); assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb.get()); // Vector for the ordered list of "records" comprising the input // sequences. A record represents a stretch of unambiguous // characters in one of the input sequences. EList szs(EBWT_CAT); std::pair sztot; sztot = BitPairReference::szsFromFasta(is, file, bigEndian, refparams, szs, sanity); // Construct Ebwt from input strings and parameters GFM *gfmFw = new GFM( TStr(), packed, -1, // fw lineRate, offRate, // suffix-array sampling rate ftabChars, // number of chars in initial arrow-pair calc file, // basename for .?.ebwt files true, // fw? 
useBlockwise, // useBlockwise bmax, // block size for blockwise SA builder bmaxSqrtMult, // block size as multiplier of sqrt(len) bmaxDivN, // block size as divisor of len dcv, // difference-cover period is, // list of input streams szs, // list of reference sizes sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed, // pseudo-random number generator seed -1, // override offRate verbose, // be talkative autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically sanity); // verify results and internal consistency refparams.reverse = reverse; szs.clear(); sztot = BitPairReference::szsFromFasta(is, file, bigEndian, refparams, szs, sanity); // Construct Ebwt from input strings and parameters GFM *gfmBw = new GFM( TStr(), packed, reverse == REF_READ_REVERSE, lineRate, offRate, // suffix-array sampling rate ftabChars, // number of chars in initial arrow-pair calc file + ".rev",// basename for .?.ebwt files false, // fw? useBlockwise, // useBlockwise bmax, // block size for blockwise SA builder bmaxSqrtMult, // block size as multiplier of sqrt(len) bmaxDivN, // block size as divisor of len dcv, // difference-cover period is, // list of input streams szs, // list of reference sizes sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed, // pseudo-random number generator seed -1, // override offRate verbose, // be talkative autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically sanity); // verify results and internal consistency return make_pair(gfmFw, gfmBw); } /// Return true iff the Ebwt is packed bool isPacked() { return packed_; } /** * Write the rstarts array given the szs array for the reference. 
*/ void szsToDisk(const EList& szs, ostream& os, int reverse); bool checkPosToSzs(const EList& szs, index_t start_idx, index_t pos) { assert(szs[start_idx].first); for(index_t i = start_idx; i < szs.size(); i++) { if((i != start_idx) && (szs[i].first)) { // span to next chr return false; } if(pos < szs[i].off) { return false; } else { pos -= szs[i].off; if(pos < szs[i].len) { return true; } pos -= szs[i].len; } } assert(false); return false; } /** * Helper for the constructors above. Takes a vector of text * strings and joins them into a single string with a call to * joinToDisk, which does a join (with padding) and writes some of * the resulting data directly to disk rather than keep it in * memory. It then constructs a suffix-array producer (what kind * depends on 'useBlockwise') for the resulting sequence. The * suffix-array producer can then be used to obtain chunks of the * joined string's suffix array. */ template void initFromVector(TStr& s, const string& snpfile, const string& htfile, const string& ssfile, const string& exonfile, const string& svfile, const string& repeatfile, EList& is, EList& szs, index_t sztot, const RefReadInParams& refparams, ofstream& out1, ofstream& out2, const string& outfile, bool useBlockwise, index_t bmax, index_t bmaxSqrtMult, index_t bmaxDivN, int dcv, EList* parent_szs, EList* parent_refnames, uint32_t seed, bool verbose) { // Compose text strings into single string VMSG_NL("Calculating joined length"); index_t jlen; jlen = joinedLen(szs); _repeat = (parent_szs != NULL); assert_geq(jlen, sztot); VMSG_NL("Writing header"); writeFromMemory(true, out1, out2); try { VMSG_NL("Reserving space for joined string"); s.resize(jlen); VMSG_NL("Joining reference sequences"); if(refparams.reverse == REF_READ_REVERSE) { { Timer timer(cerr, " Time to join reference sequences: ", _verbose); joinToDisk(is, szs, sztot, refparams, s, out1, out2); } { Timer timer(cerr, " Time to reverse reference sequence: ", _verbose); EList tmp(EBWT_CAT); 
s.reverse(); reverseRefRecords(szs, tmp, false, verbose); szsToDisk(tmp, out1, refparams.reverse); } } else { Timer timer(cerr, " Time to join reference sequences: ", _verbose); joinToDisk(is, szs, sztot, refparams, s, out1, out2); szsToDisk(szs, out1, refparams.reverse); } { Timer timer(cerr, " Time to read SNPs and splice sites: ", _verbose); _alts.clear(); _altnames.clear(); EList > chr_szs; index_t tmp_len = 0; for(index_t i = 0; i < szs.size(); i++) { if(szs[i].first) { chr_szs.expand(); chr_szs.back().first = tmp_len; chr_szs.back().second = i; } tmp_len += (index_t)szs[i].len; } // Write SNPs into 7.ht2 and 8.ht2 string file7 = outfile + ".7." + gfm_ext; string file8 = outfile + ".8." + gfm_ext; // Open output stream for the '.7.gfm_ext' file which will // hold SNPs (except IDs). ofstream fout7(file7.c_str(), ios::binary); if(!fout7.good()) { cerr << "Could not open index file for writing: \"" << file7.c_str() << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "HISAT2." << endl; throw 1; } // Open output stream for the '.8.gfm_ext' file which will // hold SNP IDs. ofstream fout8(file8.c_str(), ios::binary); if(!fout8.good()) { cerr << "Could not open index file for writing: \"" << file8.c_str() << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "HISAT2." 
<< endl; throw 1; } writeIndex(fout7, 1, this->toBe()); // endianness sentinel writeIndex(fout8, 1, this->toBe()); // endianness sentinel for(index_t i = 0; i < _refnames.size(); i++) { _refnames_nospace.push_back(""); for(index_t j = 0; j < _refnames[i].size(); j++) { char c = _refnames[i][j]; if(c == ' ') break; _refnames_nospace.back().push_back(c); } } map snpID2num; if(snpfile != "") { ifstream snp_file(snpfile.c_str(), ios::in); if(!snp_file.is_open()) { cerr << "Error: could not open " << snpfile.c_str() << endl; throw 1; } while(!snp_file.eof()) { // rs73387790 single 22:20000001-21000000 145 A string snp_id; snp_file >> snp_id; if(snp_id.empty() || snp_id[0] == '#') { string line; getline(snp_file, line); continue; } string type, chr; index_t genome_pos; char snp_ch = '\0'; string ins_seq; index_t del_len = 0; snp_file >> type >> chr >> genome_pos; if(type == "single") { snp_file >> snp_ch; } else if(type == "deletion") { snp_file >> del_len; } else if(type == "insertion") { snp_file >> ins_seq; } index_t chr_idx = 0; for(; chr_idx < _refnames_nospace.size(); chr_idx++) { if(chr == _refnames_nospace[chr_idx]) break; } if(chr_idx >= _refnames_nospace.size()) { continue; } assert_eq(chr_szs.size(), _refnames_nospace.size()); assert_lt(chr_idx, chr_szs.size()); pair tmp_pair = chr_szs[chr_idx]; const index_t sofar_len = tmp_pair.first; const index_t szs_idx = tmp_pair.second; bool involve_Ns = false; index_t pos = genome_pos; index_t add_pos = 0; assert(szs[szs_idx].first); for(index_t i = szs_idx; i < szs.size(); i++) { if(i != szs_idx && szs[i].first) { break; } if(pos < szs[i].off) { involve_Ns = true; break; } else { pos -= szs[i].off; if(pos == 0) { if(type == "deletion" || type == "insertion") { involve_Ns = true; break; } } if(pos < szs[i].len) { break; } else { pos -= szs[i].len; add_pos += szs[i].len; } } } if(involve_Ns) { continue; } pos = sofar_len + add_pos + pos; if(chr_idx + 1 < chr_szs.size()) { if(pos >= chr_szs[chr_idx + 1].first) { 
continue; } } else { if(pos >= jlen){ continue; } } _alts.expand(); ALT& snp = _alts.back(); snp.pos = pos; if(type == "single") { snp.type = ALT_SNP_SGL; snp_ch = toupper(snp_ch); if(snp_ch != 'A' && snp_ch != 'C' && snp_ch != 'G' && snp_ch != 'T') { _alts.pop_back(); continue; } uint64_t bp = asc2dna[(int)snp_ch]; assert_lt(bp, 4); if((int)bp == s[pos]) { if (!threeN) { cerr << "Warning: single type should have a different base than " << "ACGTN"[(int)s[pos]] << " (" << snp_id << ") at " << genome_pos << " on " << chr << endl; } _alts.pop_back(); continue; // throw 1; } snp.len = 1; snp.seq = bp; } else if(type == "deletion") { snp.type = ALT_SNP_DEL; snp.len = del_len; snp.seq = 0; snp.reversed = false; } else if(type == "insertion") { snp.type = ALT_SNP_INS; snp.len = (index_t)ins_seq.size(); if(snp.len > sizeof(snp.seq) * 4) { _alts.pop_back(); continue; } snp.seq = 0; bool failed = false; for(size_t i = 0; i < ins_seq.size(); i++) { char ch = toupper(ins_seq[i]); if(ch != 'A' && ch != 'C' && ch != 'G' && ch != 'T') { failed = true; break; } uint64_t bp = asc2dna[(int)ch]; assert_lt(bp, 4); snp.seq = (snp.seq << 2) | bp; } if(failed) { _alts.pop_back(); continue; } } else { cerr << "Error: unknown snp type " << type << endl; throw 1; } _altnames.push_back(snp_id); assert_eq(_alts.size(), _altnames.size()); snpID2num[snp_id] = (index_t)_alts.size() - 1; } snp_file.close(); assert_eq(_alts.size(), _altnames.size()); } _haplotypes.clear(); if(_alts.size() > 0 && htfile != "") { ifstream ht_file(htfile.c_str(), ios::in); if(!ht_file.is_open()) { cerr << "Error: could not open "<< htfile.c_str() << endl; throw 1; } while(!ht_file.eof()) { // ht66 A*01:01:01:01 371 533 66,69,72,75,76,77,84,88,90,92,95 string ht_id; ht_file >> ht_id; if(ht_id.empty() || ht_id[0] == '#') { string line; getline(ht_file, line); continue; } string chr, alt_list; index_t left, right; // inclusive [left, right] ht_file >> chr >> left >> right >> alt_list; assert_leq(left, right); index_t 
chr_idx = 0; for(; chr_idx < _refnames_nospace.size(); chr_idx++) { if(chr == _refnames_nospace[chr_idx]) break; } if(chr_idx >= _refnames_nospace.size()) { continue; } assert_eq(chr_szs.size(), _refnames_nospace.size()); assert_lt(chr_idx, chr_szs.size()); pair tmp_pair = chr_szs[chr_idx]; const index_t sofar_len = tmp_pair.first; const index_t szs_idx = tmp_pair.second; bool inside_Ns = false; index_t add_pos = 0; assert(szs[szs_idx].first); for(index_t i = szs_idx; i < szs.size(); i++) { if(i != szs_idx && szs[i].first) break; if(left < szs[i].off) { inside_Ns = true; break; } else { left -= szs[i].off; right -= szs[i].off; if(left < szs[i].len) { if(right >= szs[i].len) { inside_Ns = true; } break; } else { left -= szs[i].len; right -= szs[i].len; add_pos += szs[i].len; } } } if(inside_Ns) { continue; } left = sofar_len + add_pos + left; right = sofar_len + add_pos + right; if(chr_idx + 1 < chr_szs.size()) { if(right >= chr_szs[chr_idx + 1].first) { continue; } } else { if(right >= jlen) { continue; } } _haplotypes.expand(); _haplotypes.back().left = left; _haplotypes.back().right = right; EList alts; tokenize(alt_list, ",", alts); assert_gt(alts.size(), 0); _haplotypes.back().alts.clear(); for(size_t i = 0; i < alts.size(); i++) { const string& alt = alts[i]; if(snpID2num.find(alt) != snpID2num.end()) { _haplotypes.back().alts.push_back(snpID2num[alt]); } } if(_haplotypes.back().alts.size() <= 0) { _haplotypes.pop_back(); } } _haplotypes.sort(); ht_file.close(); } else { for(index_t a = 0; a < _alts.size(); a++) { const ALT& alt = _alts[a]; if(!alt.snp()) continue; _haplotypes.expand(); _haplotypes.back().left = alt.pos; if(alt.deletion()) { _haplotypes.back().right = alt.pos + alt.len - 1; } else { _haplotypes.back().right = alt.pos; } _haplotypes.back().alts.clear(); _haplotypes.back().alts.push_back(a); } } if(ssfile != "") { ifstream ss_file(ssfile.c_str(), ios::in); if(!ss_file.is_open()) { cerr << "Error: could not open " << ssfile.c_str() << endl; throw 
1; } map ss_seq; while(!ss_file.eof()) { // 22 16062315 16062810 + string chr; ss_file >> chr; if(chr.empty() || chr[0] == '#') { string line; getline(ss_file, line); continue; } index_t left, right; char strand; ss_file >> left >> right >> strand; // Convert exonic position to intronic position left += 1; right -= 1; if(left >= right) continue; index_t chr_idx = 0; for(; chr_idx < _refnames_nospace.size(); chr_idx++) { if(chr == _refnames_nospace[chr_idx]) break; } if(chr_idx >= _refnames_nospace.size()) continue; assert_eq(chr_szs.size(), _refnames_nospace.size()); assert_lt(chr_idx, chr_szs.size()); pair tmp_pair = chr_szs[chr_idx]; const index_t sofar_len = tmp_pair.first; const index_t szs_idx = tmp_pair.second; // check whether ambiguous base is in exon's last and first base if(!checkPosToSzs(szs, szs_idx, left - 1) || !checkPosToSzs(szs, szs_idx, right + 1)) { //cerr << "Skip ss. " << chr << ", " << left - 1 << ", " << right + 1 << endl; continue; } bool inside_Ns = false; index_t add_pos = 0; assert(szs[szs_idx].first); for(index_t i = szs_idx; i < szs.size(); i++) { if(i != szs_idx && szs[i].first) break; if(left < szs[i].off) { inside_Ns = true; break; } else { left -= szs[i].off; right -= szs[i].off; if(left < szs[i].len) { if(right >= szs[i].len) { inside_Ns = true; } break; } else { left -= szs[i].len; right -= szs[i].len; add_pos += szs[i].len; } } } if(inside_Ns) continue; left = sofar_len + add_pos + left; right = sofar_len + add_pos + right; if(chr_idx + 1 < chr_szs.size()) { if(right >= chr_szs[chr_idx + 1].first) continue; } else { if(right >= jlen) continue; } // Avoid splice sites in repetitive sequences // Otherwise, it will likely explode due to an exponential number of combinations index_t seqlen = 16; assert_leq(seqlen, 16); if(left >= seqlen && right + 1 + seqlen <= s.length()) { uint64_t seq = 0; for(index_t si = left - seqlen; si < left; si++) { seq = seq << 2 | s[si]; } for(index_t si = right + 1; si < right + 1 + seqlen; si++) { seq = 
seq << 2 | s[si]; } if(_alts.size() > 0) { if(_alts.back().left == left && _alts.back().right == right) continue; } if(ss_seq.find(seq) == ss_seq.end()) ss_seq[seq] = 1; else ss_seq[seq]++; } _alts.expand(); ALT& alt = _alts.back(); alt.type = ALT_SPLICESITE; alt.left = left; alt.right = right; alt.fw = (strand == '+' ? true : false); alt.excluded = false; _altnames.push_back("ss"); } ss_file.close(); assert_eq(_alts.size(), _altnames.size()); for(size_t i = 0; i < _alts.size(); i++) { ALT& alt = _alts[i]; if(!alt.splicesite()) continue; index_t seqlen = 16; assert_leq(seqlen, 16); if(alt.left >= seqlen && alt.right + 1 + seqlen <= s.length()) { uint64_t seq = 0; for(index_t si = alt.left - seqlen; si < alt.left; si++) { seq = seq << 2 | s[si]; } for(index_t si = alt.right + 1; si < alt.right + 1 + seqlen; si++) { seq = seq << 2 | s[si]; } assert(ss_seq.find(seq) != ss_seq.end()); alt.excluded = ss_seq[seq] > 1; } } } if(exonfile != "") { ifstream exon_file(exonfile.c_str(), ios::in); if(!exon_file.is_open()) { cerr << "Error: could not open " << ssfile.c_str() << endl; throw 1; } while(!exon_file.eof()) { // 22 16062156 16062315 + string chr; exon_file >> chr; if(chr.empty() || chr[0] == '#') { string line; getline(exon_file, line); continue; } index_t left, right; char strand; exon_file >> left >> right >> strand; // Convert exonic position to intronic position left += 1; right -= 1; if(left >= right) continue; index_t chr_idx = 0; for(; chr_idx < _refnames_nospace.size(); chr_idx++) { if(chr == _refnames_nospace[chr_idx]) break; } if(chr_idx >= _refnames_nospace.size()) continue; assert_eq(chr_szs.size(), _refnames_nospace.size()); assert_lt(chr_idx, chr_szs.size()); pair tmp_pair = chr_szs[chr_idx]; const index_t sofar_len = tmp_pair.first; const index_t szs_idx = tmp_pair.second; bool inside_Ns = false; index_t add_pos = 0; assert(szs[szs_idx].first); for(index_t i = szs_idx; i < szs.size(); i++) { if(i != szs_idx && szs[i].first) break; if(left < szs[i].off) 
{ inside_Ns = true; break; } else { left -= szs[i].off; right -= szs[i].off; if(left < szs[i].len) { if(right >= szs[i].len) { inside_Ns = true; } break; } else { left -= szs[i].len; right -= szs[i].len; add_pos += szs[i].len; } } } if(inside_Ns) continue; left = sofar_len + add_pos + left; right = sofar_len + add_pos + right; if(chr_idx + 1 < chr_szs.size()) { if(right >= chr_szs[chr_idx + 1].first) continue; } else { if(right >= jlen) continue; } _alts.expand(); ALT& alt = _alts.back(); alt.type = ALT_EXON; alt.left = left; alt.right = right; alt.fw = (strand == '+' ? true : false); _altnames.push_back("exon"); } exon_file.close(); } // Todo - implement structural variations if(svfile != "") { cerr << "Warning: SV option is not implemented " << svfile.c_str() << endl; } // Sort SNPs and Splice Sites based on positions if(_alts.size() > 1) { assert_eq(_alts.size(), _altnames.size()); EList, index_t> > buf; buf.resize(_alts.size()); EList buf2; buf2.resize(_alts.size()); for(size_t i = 0; i < _alts.size(); i++) { buf[i].first = _alts[i]; buf[i].second = (index_t)i; buf2[i] = _altnames[i]; } buf.sort(); for(size_t i = 0; i < _alts.size(); i++) { _alts[i] = buf[i].first; _altnames[i] = buf2[buf[i].second]; } EList buf3; buf3.resize(_alts.size()); for(size_t i = 0; i < buf3.size(); i++) { index_t before = buf[i].second; assert_lt(before, buf3.size()); buf3[before] = (index_t)i; } for(size_t h = 0; h < _haplotypes.size(); h++) { EList& alts = _haplotypes[h].alts; for(size_t a = 0; a < alts.size(); a++) { index_t before = alts[a]; assert_lt(before, buf3.size()); alts[a] = buf3[before]; } } #ifndef NDEBUG for(size_t i = 0; i < _alts.size(); i++) { if(i + 1 < _alts.size()) { assert(_alts[i] < _alts[i+1]); } const ALT& alt = _alts[i]; if(alt.snp()) { assert(_altnames[i] != ""); } else if(alt.splicesite()) { assert(_altnames[i] == "ss"); } else if(alt.exon()) { assert(_altnames[i] == "exon"); } else { assert(false); } } #endif } writeIndex(fout7, (index_t)_alts.size(), 
this->toBe()); writeIndex(fout8, (index_t)_alts.size(), this->toBe()); for(index_t i = 0; i < _alts.size(); i++) { _alts[i].write(fout7, this->toBe()); fout8 << _altnames[i] << endl; } writeIndex(fout7, (index_t)_haplotypes.size(), this->toBe()); for(index_t i = 0; i < _haplotypes.size(); i++) { _haplotypes[i].write(fout7, this->toBe()); } EList >& repeats = _repeatdb.repeats(); if(_repeat) { ifstream repeat_file(repeatfile.c_str(), ios::in); if(!repeat_file.is_open()) { cerr << "Error: could not open " << ssfile.c_str() << endl; throw 1; } if(parent_szs == NULL) { throw 1; } if(parent_refnames == NULL) { throw 1; } EList > parent_chr_szs; index_t tmp_len = 0; for(index_t i = 0; i < parent_szs->size(); i++) { if((*parent_szs)[i].first) { parent_chr_szs.expand(); parent_chr_szs.back().first = tmp_len; parent_chr_szs.back().second = i; } tmp_len += (index_t)(*parent_szs)[i].len; } index_t parent_jlen = joinedLen(*parent_szs); string prev_repName = ""; while(!repeat_file.eof()) { // >rep1*0 rep 0 100 470 0 // 20_rep:26622650:+ 20_rep:26628088:+ 20_rep:26632508:+ 20_rep:26635636:+ // 20_rep:26669936:+ 20_rep:26672654:+ 20_rep:26675373:+ 20_rep:26678095:+ string repName, repAlleleName; repeat_file >> repAlleleName; if(repAlleleName.empty()) // Reached the end of file break; if(repAlleleName[0] != '>') { cerr << "Error: the file format is not correct" << endl; throw 1; } repAlleleName = repAlleleName.substr(1); // Remove '>' index_t alleleID = 0; size_t star_pos = repAlleleName.find('*'); if(star_pos >= repAlleleName.length()) { repName = repAlleleName; } else { repName = repAlleleName.substr(0, star_pos); string strID = repAlleleName.substr(star_pos + 1); istringstream(strID) >> alleleID; } string refRepName; index_t repPos, repLen; repeat_file >> refRepName >> repPos >> repLen; index_t rep_idx = 0; for(; rep_idx < _refnames_nospace.size(); rep_idx++) { if(refRepName == _refnames_nospace[rep_idx]) break; } if(rep_idx >= _refnames_nospace.size()) { cerr << "Error: " << 
refRepName << " is not found in " << endl; throw 1; } if(repeats.size() == 0 || repeats.back().repID != rep_idx || repeats.back().repName != repName) { if(repeats.size() > 0) { repeats.back().positions.sort(); } repeats.expand(); repeats.back().init(repName, rep_idx, repPos, repLen); } // update repPos and repLen if(repPos < repeats.back().repPos) { repeats.back().repLen += (repeats.back().repPos - repPos); repeats.back().repPos = repPos; } if(repPos + repLen > repeats.back().repPos + repeats.back().repLen) { repeats.back().repLen = repPos + repLen - repeats.back().repPos; } size_t baseOff = 0; if(repeats.size() > 1 && repeats[repeats.size() - 2].repID == rep_idx) { baseOff = repeats[repeats.size() - 2].repPos + repeats[repeats.size() - 2].repLen; } index_t numCoords, numAlts; repeat_file >> numCoords >> numAlts; EList snpIDs; EList snpStrIDs; if(numAlts > 0) { string snpStrID; repeat_file >> snpStrID; tokenize(snpStrID, ",", snpStrIDs); if(snpStrIDs.size() != numAlts) { assert(false); cerr << "Error: the number of SNPs (" << snpIDs.size() << ", " << snpStrID << ") does not equal to " << numAlts << endl; throw 1; } for(index_t i = 0; i < snpStrIDs.size(); i++) { if(snpID2num.find(snpStrIDs[i]) == snpID2num.end()) { cerr << "Error: " << snpStrIDs[i] << " is not found" << endl; throw 1; } index_t numID = snpID2num[snpStrIDs[i]]; snpIDs.push_back(numID); } } EList >& positions = repeats.back().positions; size_t sofar_numCoords = positions.size(); while(positions.size() - sofar_numCoords < numCoords) { string chr_pos; repeat_file >> chr_pos; size_t colon_pos = chr_pos.find(':'); if(colon_pos + 1 >= chr_pos.length()) { cerr << "Error: : is not found in " << chr_pos << endl; throw 1; } string chr = chr_pos.substr(0, colon_pos); string strPos = chr_pos.substr(colon_pos + 1, chr_pos.length() - colon_pos - 3); bool repfw = (chr_pos[chr_pos.length() - 1] == '+'); index_t pos = 0; istringstream(strPos) >> pos; index_t chr_idx = 0; for(; chr_idx < parent_refnames->size(); 
chr_idx++) { if(chr == (*parent_refnames)[chr_idx]) break; } if(chr_idx >= parent_refnames->size()) { cerr << "Error: " << chr << " is not found in " << endl; throw 1; } assert_eq(parent_chr_szs.size(), parent_refnames->size()); assert_lt(chr_idx, parent_chr_szs.size()); positions.expand(); positions.back().tid = chr_idx; positions.back().toff = pos; positions.back().fw = repfw; positions.back().alleleID = alleleID; pair tmp_pair = parent_chr_szs[chr_idx]; const index_t sofar_len = tmp_pair.first; const index_t szs_idx = tmp_pair.second; bool involve_Ns = false; index_t add_pos = 0; assert((*parent_szs)[szs_idx].first); for(index_t i = szs_idx; i < parent_szs->size(); i++) { if(i != szs_idx && (*parent_szs)[i].first) { break; } if(pos < (*parent_szs)[i].off) { involve_Ns = true; break; } else { pos -= (*parent_szs)[i].off; if(pos < (*parent_szs)[i].len) { break; } else { pos -= (*parent_szs)[i].len; add_pos += (*parent_szs)[i].len; } } } if(involve_Ns) { assert(false); throw 1; } pos = sofar_len + add_pos + pos; if(chr_idx + 1 < parent_chr_szs.size()) { if(pos >= parent_chr_szs[chr_idx + 1].first) { assert(false); throw 1; } } else { if(pos >= parent_jlen){ assert(false); throw 1; } } positions.back().joinedOff = pos; } repeats.back().alleles.expand(); assert_geq(repPos, baseOff); repeats.back().alleles.back().init(repPos - baseOff, repLen); } if(repeats.size() > 0) { repeats.back().positions.sort(); } repeat_file.close(); index_t total_repeat_len = 0; for(size_t r = 0; r + 1 < repeats.size(); r++) { if(repeats[r].repID != repeats[r+1].repID) { index_t repeat_len = repeats[r].repPos + repeats[r].repLen; total_repeat_len += repeat_len; } } index_t repeat_len = repeats.back().repPos + repeats.back().repLen; total_repeat_len += repeat_len; if(total_repeat_len != s.length()) { cerr << "Error: repeat length (" << repeats.back().repPos + repeats.back().repLen; cerr << ") does not match sequence length (" << s.length() << ")" << endl; throw 1; } 
_repeatLens.resizeExact(szs.size()); for(size_t i = 0; i < _repeatLens.size(); i++) { _repeatLens[i].first = numeric_limits::max(); _repeatLens[i].second = 0; } for(size_t i = 0; i < repeats.size(); i++) { index_t id = repeats[i].repID; index_t len = repeats[i].repLen; assert_lt(id, _repeatLens.size()); if(_repeatLens[id].first > len) { _repeatLens[id].first = len; } if(_repeatLens[id].second < len) { _repeatLens[id].second = len; } } writeIndex(fout7, _repeatLens.size(), this->toBe()); for(size_t i = 0; i < _repeatLens.size(); i++) { writeIndex(fout7, _repeatLens[i].first, this->toBe()); writeIndex(fout7, _repeatLens[i].second, this->toBe()); } _repeatdb.write(fout7, this->toBe()); writeIndex(fout7, chr_szs.size(), this->toBe()); // number of repeat indexes EList seqs; EList tableFilePos; streampos filepos = fout7.tellp(); for(size_t i = 0; i < chr_szs.size(); i++) { writeIndex(fout7, 0, this->toBe()); } for(size_t i = 0; i < repeats.size(); i++) { const Repeat& repeat = repeats[i]; assert_lt(repeat.repID, chr_szs.size()); index_t template_len = 0; if(repeat.repID + 1 < chr_szs.size()) { template_len = chr_szs[repeat.repID + 1].first - chr_szs[repeat.repID].first; } else { template_len = s.length() - chr_szs[repeat.repID].first; } assert_leq(repeat.repPos + repeat.repLen, template_len); index_t pos = chr_szs[repeat.repID].first + repeat.repPos; assert_leq(pos + repeat.repLen, s.length()); seqs.expand(); seqs.back().clear(); for(index_t j = 0; j < repeat.repLen; j++) { int c = s[pos + j]; assert_range(0, 3, c); seqs.back().push_back("ACGT"[c]); } if(i + 1 == repeats.size() || repeats[i].repID != repeats[i+1].repID) { const size_t w = RB_Minimizer::default_w, k = RB_Minimizer::default_k; RB_KmerTable kmer_table; kmer_table.build(seqs, w, k); kmer_table.write(fout7, this->toBe()); seqs.clear(); tableFilePos.push_back(fout7.tellp()); } } assert_eq(tableFilePos.size(), chr_szs.size()); streampos origpos = fout7.tellp(); fout7.seekp(filepos); for(size_t i = 0; i < 
tableFilePos.size(); i++) { writeIndex(fout7, tableFilePos[i], this->toBe()); } fout7.seekp(origpos); } fout7.close(); fout8.close(); } // Joined reference sequence now in 's' } catch(bad_alloc& e) { // If we throw an allocation exception in the try block, // that means that the joined version of the reference // string itself is too larger to fit in memory. The only // alternatives are to tell the user to give us more memory // or to try again with a packed representation of the // reference (if we haven't tried that already). cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl; if(!isPacked() && _passMemExc) { // Pass the exception up so that we can retry using a // packed string representation throw e; } // There's no point passing this exception on. The fact // that we couldn't allocate the joined string means that // --bmax is irrelevant - the user should re-run with // ebwt-build-packed if(isPacked()) { cerr << "Please try running bowtie-build on a computer with more memory." << endl; } else { cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl << "mode (-a/--auto), or try again on a computer with more memory." << endl; } if(sizeof(void*) == 4) { cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl << "this executable is 32-bit." 
<< endl; } throw 1; } // Succesfully obtained joined reference string assert_geq(s.length(), jlen); if(bmax != (index_t)OFF_MASK) { // VMSG_NL("bmax according to bmax setting: " << bmax); } else if(bmaxSqrtMult != (index_t)OFF_MASK) { bmax *= bmaxSqrtMult; // VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax); } else if(bmaxDivN != (index_t)OFF_MASK) { bmax = max(jlen / (bmaxDivN * _nthreads), 1); // VMSG_NL("bmax according to bmaxDivN setting: " << bmax); } else { bmax = (uint32_t)sqrt(s.length()); // VMSG_NL("bmax defaulted to: " << bmax); } int iter = 0; bool first = true; streampos out1pos = out1.tellp(); streampos out2pos = out2.tellp(); if(!_repeat) { // Look for bmax/dcv parameters that work. while(true) { if(!first && bmax < 40 && _passMemExc) { cerr << "Could not find approrpiate bmax/dcv settings for building this index." << endl; if(!isPacked()) { // Throw an exception exception so that we can // retry using a packed string representation throw bad_alloc(); } else { cerr << "Already tried a packed string representation." << endl; } cerr << "Please try indexing this reference on a computer with more memory." << endl; if(sizeof(void*) == 4) { cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl << "this executable is 32-bit." 
<< endl; } throw 1; } if(!first) { out1.seekp(out1pos); out2.seekp(out2pos); } if(dcv > 4096) dcv = 4096; if((iter % 6) == 5 && dcv < 4096 && dcv != 0) { dcv <<= 1; // double difference-cover period } else { bmax -= (bmax >> 2); // reduce by 25% } iter++; try { if(_alts.empty()) { VMSG("Using parameters --bmax " << bmax); if(dcv == 0) { VMSG_NL(" and *no difference cover*"); } else { VMSG_NL(" --dcv " << dcv); } { VMSG_NL(" Doing ahead-of-time memory usage test"); // Make a quick-and-dirty attempt to force a bad_alloc iff // we would have thrown one eventually as part of // constructing the DifferenceCoverSample dcv <<= 1; index_t sz = (index_t)DifferenceCoverSample::simulateAllocs(s, dcv >> 1); if(_nthreads > 1) sz *= (_nthreads + 1); AutoArray tmp(sz, EBWT_CAT); dcv >>= 1; // Likewise with the KarkkainenBlockwiseSA sz = (index_t)KarkkainenBlockwiseSA::simulateAllocs(s, bmax); AutoArray tmp2(sz, EBWT_CAT); // Now throw in the 'ftab' and 'isaSample' structures // that we'll eventually allocate in buildToDisk AutoArray ftab(_gh._ftabLen * 2, EBWT_CAT); AutoArray side(_gh._sideSz, EBWT_CAT); // Grab another 20 MB out of caution AutoArray extra(20*1024*1024, EBWT_CAT); // If we made it here without throwing bad_alloc, then we // passed the memory-usage stress test VMSG(" Passed! Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv); if(isPacked()) { VMSG(" --packed"); } VMSG_NL(""); } VMSG_NL("Constructing suffix-array element generator"); KarkkainenBlockwiseSA bsa(s, bmax, _nthreads, dcv, seed, _sanity, _passMemExc, _verbose, outfile); assert(bsa.suffixItrIsReset()); assert_eq(bsa.size(), s.length()+1); VMSG_NL("Converting suffix-array elements to index image"); buildToDisk(bsa, s, out1, out2); } else { RefGraph* graph = new RefGraph( s, szs, _alts, _haplotypes, outfile, _nthreads, verbose); PathGraph* pg = new PathGraph( *graph, outfile, std::numeric_limits::max(), _nthreads, verbose); if(verbose) { cerr << "Generating edges... 
" << endl; } if(!pg->generateEdges(*graph)) { return; } // Re-initialize GFM parameters to reflect real number of edges (gbwt string) _gh.init( _gh.len(), pg->getNumEdges(), pg->getNumNodes(), _gh.lineRate(), _gh.offRate(), _gh.ftabChars(), 0, _gh.entireReverse()); buildToDisk(*pg, s, out1, out2); delete pg; pg = NULL; delete graph; graph = NULL; } out1.flush(); out2.flush(); if(out1.fail() || out2.fail()) { cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl; throw 1; } break; } catch(bad_alloc& e) { if(_passMemExc) { VMSG_NL(" Ran out of memory; automatically trying more memory-economical parameters."); } else { cerr << "Out of memory while constructing suffix array. Please try using a smaller" << endl << "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl; throw 1; } } first = false; } assert(repOk()); // Now write reference sequence names on the end assert_eq(this->_refnames.size(), this->_nPat); for(index_t i = 0; i < this->_refnames.size(); i++) { out1 << this->_refnames[i].c_str() << endl; } out1 << '\0'; out1.flush(); out2.flush(); if(out1.fail() || out2.fail()) { cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl; throw 1; } } VMSG_NL("Returning from initFromVector"); } /** * Return the length that the joined string of the given string * list will have. Note that this is indifferent to how the text * fragments correspond to input sequences - it just cares about * the lengths of the fragments. 
*/ index_t joinedLen(EList& szs) { index_t ret = 0; for(unsigned int i = 0; i < szs.size(); i++) { ret += (index_t)szs[i].len; } return ret; } /// Destruct an Ebwt ~GFM() { _fchr.reset(); _ftab.reset(); _eftab.reset(); _plen.reset(); _rstarts.reset(); _offs.reset(); _gfm.reset(); if(offs() != NULL && useShmem_) { FREE_SHARED(offs()); } if(gfm() != NULL && useShmem_) { FREE_SHARED(gfm()); } if (_in1 != NULL) fclose(_in1); if (_in2 != NULL) fclose(_in2); } /// Accessors inline const GFMParams& gh() const { return _gh; } index_t numZOffs() const { return _zOffs.size(); } index_t zOff(index_t i) const { assert_lt(i, _zOffs.size()); return _zOffs[i]; } index_t zGbwtByteOff(index_t i) const { assert_lt(i, _zGbwtByteOffs.size()); return _zGbwtByteOffs[i]; } int zGbwtBpOff(index_t i) const { assert_lt(i, _zGbwtBpOffs.size()); return _zGbwtBpOffs[i]; } index_t nPat() const { return _nPat; } index_t nFrag() const { return _nFrag; } inline index_t* fchr() { return _fchr.get(); } inline index_t* ftab() { return _ftab.get(); } inline index_t* eftab() { return _eftab.get(); } inline index_t* offs() { return _offs.get(); } inline index_t* plen() { return _plen.get(); } inline index_t* rstarts() { return _rstarts.get(); } inline uint8_t* gfm() { return _gfm.get(); } inline const index_t* fchr() const { return _fchr.get(); } inline const index_t* ftab() const { return _ftab.get(); } inline const index_t* eftab() const { return _eftab.get(); } inline const index_t* offs() const { return _offs.get(); } inline const index_t* plen() const { return _plen.get(); } inline const index_t* rstarts() const { return _rstarts.get(); } inline const uint8_t* gfm() const { return _gfm.get(); } inline const EList >& alts() const { return _alts; } inline const EList& altnames() const { return _altnames; } bool toBe() const { return _toBigEndian; } bool verbose() const { return _verbose; } bool sanityCheck() const { return _sanity; } EList& refnames() { return _refnames; } bool fw() const { return 
fw_; } bool repeat() const { return _repeat; } const EList& getRepeatIncluded() const { return _repeatIncluded; } #ifdef POPCNT_CAPABILITY bool _usePOPCNTinstruction; #endif /** * Returns true iff the index contains the given string (exactly). The * given string must contain only unambiguous characters. TODO: * support skipping of ambiguous characters. */ bool contains( const BTDnaString& str, index_t *top = NULL, index_t *bot = NULL) const; /** * Returns true iff the index contains the given string (exactly). The * given string must contain only unambiguous characters. TODO: * support skipping of ambiguous characters. */ bool contains( const char *str, index_t *top = NULL, index_t *bot = NULL) const { return contains(BTDnaString(str, true), top, bot); } /// Return true iff the Ebwt is currently in memory bool isInMemory() const { if(gfm() != NULL) { // Note: We might have skipped loading _offs, _ftab, // _eftab, and _rstarts depending on whether this is the // reverse index and what algorithm is being used. assert(_gh.repOk()); //assert(_ftab != NULL); //assert(_eftab != NULL); assert(fchr() != NULL); //assert(_offs != NULL); //assert(_rstarts != NULL); // assert_neq(_zGbwtByteOff, INDEX_MAX); // assert_neq(_zGbwtBpOff, -1); return true; } else { assert(ftab() == NULL); assert(eftab() == NULL); assert(fchr() == NULL); assert(offs() == NULL); assert(rstarts() == NULL); assert_eq(_zOffs.size(), 0); assert_eq(_zGbwtByteOffs.size(), 0); assert_eq(_zGbwtBpOffs.size(), 0); return false; } } /// Return true iff the Ebwt is currently stored on disk bool isEvicted() const { return !isInMemory(); } /** * Load this Ebwt into memory by reading it in from the _in1 and * _in2 streams. */ void loadIntoMemory( int needEntireReverse, bool loadSASamp, bool loadFtab, bool loadRstarts, bool loadNames, bool verbose) { readIntoMemory( needEntireReverse, // require reverse index to be concatenated reference reversed loadSASamp, // load the SA sample portion? 
loadFtab, // load the ftab (_ftab[] and _eftab[])? loadRstarts, // load the r-starts (_rstarts[])? false, // stop after loading the header portion? NULL, // params false, // mmSweep loadNames, // loadNames verbose); // startVerbose } /** * Frees memory associated with the Ebwt. */ void evictFromMemory() { assert(isInMemory()); _fchr.free(); _ftab.free(); _eftab.free(); _rstarts.free(); _offs.free(); // might not be under control of APtrWrap _gfm.free(); // might not be under control of APtrWrap // Keep plen; it's small and the client may want to seq it // even when the others are evicted. //_plen = NULL; _zOffs.clear(); _zGbwtByteOffs.clear(); _zGbwtBpOffs.clear(); } /** * Turn a substring of 'seq' starting at offset 'off' and having * length equal to the index's 'ftabChars' into an int that can be * used to index into the ftab array. */ index_t ftabSeqToInt( const BTDnaString& seq, index_t off, bool rev) const { int fc = _gh._ftabChars; index_t lo = off, hi = lo + fc; assert_leq(hi, seq.length()); index_t ftabOff = 0; for(int i = 0; i < fc; i++) { bool fwex = fw(); if(rev) fwex = !fwex; // We add characters to the ftabOff in the order they would // have been consumed in a normal search. For BWT, this // means right-to-left order; for BWT' it's left-to-right. int c = (fwex ? seq[lo + i] : seq[hi - i - 1]); if(c > 3) { return std::numeric_limits::max(); } assert_range(0, 3, c); ftabOff <<= 2; ftabOff |= c; } return ftabOff; } /** * Non-static facade for static function ftabHi. */ index_t ftabHi(index_t i) const { return GFM::ftabHi( ftab(), eftab(), _gh.linearFM() ? _gh._len : _gh._gbwtLen, _gh._ftabLen, _gh._eftabLen, i); } /** * Get "high interpretation" of ftab entry at index i. The high * interpretation of a regular ftab entry is just the entry * itself. The high interpretation of an extended entry is the * second correpsonding ui32 in the eftab. * * It's a static member because it's convenient to ask this * question before the Ebwt is fully initialized. 
*/
    // An entry is treated as regular when ftab[i] <= gbwtLen.  Otherwise
    // the entry XORed with INDEX_MAX is a pair index into eftab, and the
    // second element of that pair (slot 2*idx+1) is returned.
    static index_t ftabHi(
        const index_t *ftab,
        const index_t *eftab,
        index_t gbwtLen,
        index_t ftabLen,
        index_t eftabLen,
        index_t i)
    {
        assert_lt(i, ftabLen);
        if(ftab[i] <= gbwtLen) {
            return ftab[i];
        } else {
            index_t efIdx = ftab[i] ^ (index_t)INDEX_MAX;
            assert_lt(efIdx*2+1, eftabLen);
            return eftab[efIdx*2+1];
        }
    }

    /**
     * Non-static facade for static function ftabLo.
     *
     * Passes _gh._len as the length bound when the index is a linear FM
     * index and _gh._gbwtLen otherwise.
     */
    index_t ftabLo(index_t i) const {
        return GFM::ftabLo(
            ftab(),
            eftab(),
            _gh.linearFM() ? _gh._len : _gh._gbwtLen,
            _gh._ftabLen,
            _gh._eftabLen,
            i);
    }

    /**
     * Get low bound of ftab range for the ftabChars-long substring of
     * 'seq' starting at 'off' (extracted with rev = false).
     */
    index_t ftabLo(const BTDnaString& seq, index_t off) const {
        return ftabLo(ftabSeqToInt(seq, off, false));
    }

    /**
     * Get high bound of ftab range for the ftabChars-long substring of
     * 'seq' starting at 'off' (extracted with rev = false).
     */
    index_t ftabHi(const BTDnaString& seq, index_t off) const {
        return ftabHi(ftabSeqToInt(seq, off, false));
    }

    /**
     * Extract characters from seq starting at offset 'off' and going either
     * forward or backward, depending on 'rev'.  Order matters when compiling
     * the integer that gets looked up in the ftab.  Each successive character
     * is ORed into the least significant bit-pair, and characters are
     * integrated in the direction of the search.
     *
     * Returns false (leaving 'top' and 'bot' untouched) when
     * ftabSeqToInt() yields its sentinel value; otherwise sets
     * top = ftabHi(fi), bot = ftabLo(fi+1) and returns true.
     */
    bool ftabLoHi(
        const BTDnaString& seq, // sequence to extract from
        index_t off,            // offset into seq to begin extracting
        bool rev,               // reverse while extracting
        index_t& top,
        index_t& bot) const
    {
        index_t fi = ftabSeqToInt(seq, off, rev);
        if(fi == std::numeric_limits::max()) {
            return false;
        }
        top = ftabHi(fi);
        bot = ftabLo(fi+1);
        assert_geq(bot, top);
        return true;
    }

    /**
     * Get "low interpretation" of ftab entry at index i.  The low
     * interpretation of a regular ftab entry is just the entry
     * itself.  The low interpretation of an extended entry is the
     * first corresponding ui32 in the eftab.
     *
     * It's a static member because it's convenient to ask this
     * question before the Ebwt is fully initialized.
*/
    // An entry is treated as regular when ftab[i] <= gbwtLen.  Otherwise
    // the entry XORed with INDEX_MAX is a pair index into eftab, and the
    // first element of that pair (slot 2*idx) is returned.
    static index_t ftabLo(
        const index_t *ftab,
        const index_t *eftab,
        index_t gbwtLen,
        index_t ftabLen,
        index_t eftabLen,
        index_t i)
    {
        assert_lt(i, ftabLen);
        if(ftab[i] <= gbwtLen) {
            return ftab[i];
        } else {
            index_t efIdx = ftab[i] ^ (index_t)INDEX_MAX;
            assert_lt(efIdx*2+1, eftabLen);
            return eftab[efIdx*2];
        }
    }

    /**
     * Try to resolve the reference offset of the BW element 'elt'.  If
     * it can be resolved immediately, return the reference offset.  If
     * it cannot be resolved immediately, return (index_t)INDEX_MAX.
     *
     * Resolution order:
     *   1. if 'elt' equals any entry of _zOffs, the offset is 0;
     *   2. else, if 'node' has no bits set outside _gh._offMask, the
     *      offset is read from offs()[node >> _gh._offRate];
     *   3. otherwise the offset is not sampled and INDEX_MAX is returned.
     */
    index_t tryOffset(index_t elt, index_t node) const {
        assert(offs() != NULL);
        for(index_t i = 0; i < _zOffs.size(); i++) {
            if(elt == _zOffs[i]) return 0;
        }
        if((node & _gh._offMask) == node) {
            index_t nodeOff = node >> _gh._offRate;
            assert_lt(nodeOff, _gh._offsLen);
            index_t off = offs()[nodeOff];
            return off;
        } else {
            // Try looking at zoff
            return (index_t)INDEX_MAX;
        }
    }

    /**
     * Try to resolve the reference offset of the BW element 'elt' such
     * that the offset returned is at the right-hand side of the
     * forward reference substring involved in the hit.
     *
     * When an offset is found and 'fw' is false, the offset is mirrored
     * within [0, _gh._len) and moved back by hitlen-1.
     *
     * NOTE(review): the call below passes a single argument, but the only
     * other tryOffset overload visible here takes (elt, node) with no
     * default for 'node'.  Confirm which node value is intended before
     * relying on this function.
     */
    index_t tryOffset(
        index_t elt,
        bool fw,
        index_t hitlen) const
    {
        index_t off = tryOffset(elt);
        if(off != (index_t)INDEX_MAX && !fw) {
            assert_lt(off, _gh._len);
            off = _gh._len - off - 1;
            assert_geq(off, hitlen-1);
            off -= (hitlen-1);
            assert_lt(off, _gh._len);
        }
        return off;
    }

    /**
     * Walk 'steps' steps to the left and return the row arrived at.
     */
    index_t walkLeft(index_t row, index_t steps) const;

    /**
     * Resolve the reference offset of the BW element 'row'.  'node'
     * defaults to 0.
     */
    index_t getOffset(index_t row, index_t node = 0) const;

    /**
     * Resolve the reference offset of the BW element 'elt' such that
     * the offset returned is at the right-hand side of the forward
     * reference substring involved in the hit.
*/ index_t getOffset( index_t elt, bool fw, index_t hitlen) const; /** * When using read() to create an Ebwt, we have to set a couple of * additional fields in the Ebwt object that aren't part of the * parameter list and are not stored explicitly in the file. Right * now, this just involves initializing _zEbwtByteOff and * _zEbwtBpOff from _zOff. */ void postReadInit(const GFMParams& gh) { _zGbwtByteOffs.resizeExact(_zOffs.size()); _zGbwtBpOffs.resizeExact(_zOffs.size()); for(index_t i = 0; i < _zOffs.size(); i++) { index_t sideNum = _zOffs[i] / gh._sideGbwtLen; index_t sideCharOff = _zOffs[i] % gh._sideGbwtLen; index_t sideByteOff = sideNum * gh._sideSz; _zGbwtByteOffs[i] = sideCharOff >> 2; assert_lt(_zGbwtByteOffs[i], gh._sideGbwtSz); _zGbwtBpOffs[i] = sideCharOff & 3; assert_lt(_zGbwtBpOffs[i], 4); _zGbwtByteOffs[i] += sideByteOff; } assert(repOk(gh)); // Ebwt should be fully initialized now } /** * Given basename of an Ebwt index, read and return its flag. */ static int32_t readVersionFlags(const string& instr, int& major, int& minor, string& extra_version); static void readProgramVersion(int& major_version, int& minor_version, string& extra_version) { char extra[256] = {0,}; int second_version; sscanf(HISAT2_VERSION, "%d.%d.%d-%s", &second_version, &major_version, &minor_version, extra); extra_version = extra; } static void readIndexVersion(int index_version, int& major_version, int& minor_version, string& extra_version) { major_version = (index_version >> 16) & 0xff; minor_version = (index_version >> 8) & 0xff; if((index_version & 0xff) == 1) { extra_version = "alpha"; } else if((index_version & 0xff) == 2) { extra_version = "beta"; } else { extra_version = ""; } } static int getIndexVersion() { int major_version = 0, minor_version = 0; string extra_version; readProgramVersion(major_version, minor_version, extra_version); int version = 2; // HISAT2 version = (version << 8) | (major_version & 0xff); version = (version << 8) | (minor_version & 0xff); version = 
version << 8; if(extra_version == "alpha") { version |= 0x1; } else if(extra_version == "beta") { version |= 0x2; } return version; } /** * Pretty-print the Ebwt to the given output stream. */ void print(ostream& out) const { print(out, _gh); } /** * Pretty-print the Ebwt and given EbwtParams to the given output * stream. */ void print(ostream& out, const GFMParams& gh) const { gh.print(out); // print params return; out << "Ebwt (" << (isInMemory()? "memory" : "disk") << "):" << endl; for(index_t i = 0; i < _zOffs.size(); i++) { out << " " << (i+1) << " zOffs: " << _zOffs[i] << endl << " " << (i+1) << " zGbwtByteOff: " << _zGbwtByteOffs[i] << endl << " " << (i+1) << " zGbwtBpOff: " << _zGbwtBpOffs[i] << endl; } out << " nPat: " << _nPat << endl << " plen: "; if(plen() == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << plen()[0] << endl; } out << " rstarts: "; if(rstarts() == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << rstarts()[0] << endl; } out << " ebwt: "; if(gfm() == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << gfm()[0] << endl; } out << " fchr: "; if(fchr() == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << fchr()[0] << endl; } out << " ftab: "; if(ftab() == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << ftab()[0] << endl; } out << " eftab: "; if(eftab() == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << eftab()[0] << endl; } out << " offs: "; if(offs() == NULL) { out << "NULL" << endl; } else { out << "non-NULL, [0] = " << offs()[0] << endl; } } // Building template static TStr join(EList& l, uint32_t seed); template static void join(EList& l, EList& szs, index_t sztot, const RefReadInParams& refparams, uint32_t seed, TStr& s, bool include_rc = false, bool CGtoTG = false); template void joinToDisk(EList& l, EList& szs, index_t sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2); template void 
buildToDisk(PathGraph& gbwt, const TStr& s, ostream& out1, ostream& out2, streampos headerPos = -1); template void buildToDisk(InorderBlockwiseSA& sa, const TStr& s, ostream& out1, ostream& out2, streampos headerPos = -1); // I/O void readIntoMemory(int needEntireRev, bool loadSASamp, bool loadFtab, bool loadRstarts, bool justHeader, GFMParams *params, bool mmSweep, bool loadNames, bool startVerbose, bool subIndex = false); void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const; void writeFromMemory(bool justHeader, const string& out1, const string& out2) const; // Sanity checking void sanityCheckUpToSide(int upToSide) const; void sanityCheckAll(int reverse) const; void restore(SString& s) const; void checkOrigs(const EList >& os, bool mirror) const; // Searching and reporting bool joinedToTextOff(index_t qlen, index_t off, index_t& tidx, index_t& textoff, index_t& tlen, bool rejectStraddle, bool& straddled) const; bool textOffToJoined(index_t tid, index_t tlen, index_t& off) const; #define WITHIN_BWT_LEN(x) \ assert_leq(x[0], this->_gh._sideGbwtLen); \ assert_leq(x[1], this->_gh._sideGbwtLen); \ assert_leq(x[2], this->_gh._sideGbwtLen); \ assert_leq(x[3], this->_gh._sideGbwtLen) #define WITHIN_FCHR(x) \ assert_leq(x[0], this->fchr()[1]); \ assert_leq(x[1], this->fchr()[2]); \ assert_leq(x[2], this->fchr()[3]); \ assert_leq(x[3], this->fchr()[4]) #define WITHIN_FCHR_DOLLARA(x) \ assert_leq(x[0], this->fchr()[1]+1); \ assert_leq(x[1], this->fchr()[2]); \ assert_leq(x[2], this->fchr()[3]); \ assert_leq(x[3], this->fchr()[4]) /** * Count all occurrences of character c from the beginning of the * forward side to and add in the occ[] count up to the side * break just prior to the side. 
*
     * A Bowtie 2 side is shaped like:
     *
     * XXXXXXXXXXXXXXXX [A] [C] [G] [T]
     * --------48------ -4- -4- -4- -4-  (numbers in bytes)
     */
    // Returns acgt[c] + (in-side count of c before the locus) + fchr()[c],
    // where acgt[] is the per-side cumulative count array stored right
    // after the side's packed characters (two further index_t slots are
    // skipped first when the index is not a linear FM index).
    inline index_t countBt2Side(const SideLocus& l, int c) const {
        assert_range(0, 3, c);
        assert_range(0, (int)this->_gh._sideGbwtSz-1, (int)l._by);
        assert_range(0, 3, (int)l._bp);
        const uint8_t *side = l.side(this->gfm());
        index_t cCnt = countUpTo(l, c);
        assert_leq(cCnt, l.toBWRow(_gh));
        assert_leq(cCnt, this->_gh._sideGbwtLen);
        assert_eq(_zGbwtByteOffs.size(), _zGbwtBpOffs.size());
        // Only relevant for c == 0 ('A'): discount every '$' stored in this
        // side strictly before the locus.
        for(index_t i = 0; i < _zGbwtByteOffs.size(); i++) {
            index_t zGbwtByteOff = _zGbwtByteOffs[i];
            if(c == 0 && l._sideByteOff <= zGbwtByteOff && l._sideByteOff + l._by >= zGbwtByteOff) {
                // Adjust for the fact that we represented $ with an 'A', but
                // shouldn't count it as an 'A' here
                int zGbwtBpOff = _zGbwtBpOffs[i];
                if((l._sideByteOff + l._by > zGbwtByteOff) ||
                   (l._sideByteOff + l._by == zGbwtByteOff && l._bp > zGbwtBpOff))
                {
                    cCnt--; // Adjust for '$' looking like an 'A'
                }
            }
        }
        index_t ret;
        // Now factor in the occ[] count at the side break
        const uint8_t *acgt8 = side + _gh._sideGbwtSz;
        if(!_gh._linearFM) {
            // Skip two index_t-sized slots that precede the A/C/G/T counts
            // in the non-linear layout.
            acgt8 += (sizeof(index_t) << 1);
        }
        const index_t *acgt = reinterpret_cast(acgt8);
        assert_leq(acgt[0], this->_gh._numSides * this->_gh._sideGbwtLen); // b/c it's used as padding
        assert_lt(acgt[1], this->_gh._gbwtLen);
        assert_lt(acgt[2], this->_gh._gbwtLen);
        assert_lt(acgt[3], this->_gh._gbwtLen);
        ret = acgt[c] + cCnt + this->fchr()[c];
#ifndef NDEBUG
        assert_leq(ret, this->fchr()[c+1]); // can't have jumped into next char's section
        if(c == 0) {
            assert_leq(cCnt, this->_gh._sideGbwtLen);
        } else {
            assert_leq(ret, this->_gh._gbwtLen);
        }
#endif
        return ret;
    }

    /**
     * Count all occurrences of all four nucleotides up to the starting
     * point (which must be in a forward side) given by 'l' storing the
     * result in 'cntsUpto', then count nucleotide occurrences within the
     * range of length 'num' storing the result in 'cntsIn'.
Also, keep * track of the characters occurring within the range by setting * 'masks' accordingly (masks[1][10] == true -> 11th character is a * 'C', and masks[0][10] == masks[2][10] == masks[3][10] == false. */ inline void countBt2SideRange( const SideLocus& l, // top locus index_t num, // number of elts in range to tall index_t* cntsUpto, // A/C/G/T counts up to top index_t* cntsIn, // A/C/G/T counts within range EList *masks) const // masks indicating which range elts = A/C/G/T { assert_gt(num, 0); assert_range(0, (int)this->_gh._sideGbwtSz-1, (int)l._by); assert_range(0, 3, (int)l._bp); countUpToEx(l, cntsUpto); WITHIN_FCHR_DOLLARA(cntsUpto); WITHIN_BWT_LEN(cntsUpto); const uint8_t *side = l.side(this->gfm()); assert_eq(_zGbwtByteOffs.size(), _zGbwtBpOffs.size()); for(index_t i = 0; i < _zGbwtByteOffs.size(); i++) { index_t zGbwtByteOff = _zGbwtByteOffs[i]; if(l._sideByteOff <= zGbwtByteOff && l._sideByteOff + l._by >= zGbwtByteOff) { // Adjust for the fact that we represented $ with an 'A', but // shouldn't count it as an 'A' here int zGbwtBpOff = _zGbwtBpOffs[i]; if((l._sideByteOff + l._by > zGbwtByteOff) || (l._sideByteOff + l._by == zGbwtByteOff && l._bp > zGbwtBpOff)) { cntsUpto[0]--; // Adjust for '$' looking like an 'A' } } } // Now factor in the occ[] count at the side break const index_t *acgt = reinterpret_cast(side + _gh._sideGbwtSz); if(!this->_gh.linearFM()) acgt += 2; assert_leq(acgt[0], this->fchr()[1] + this->_gh.sideGbwtLen()); assert_leq(acgt[1], this->fchr()[2]-this->fchr()[1]); assert_leq(acgt[2], this->fchr()[3]-this->fchr()[2]); assert_leq(acgt[3], this->fchr()[4]-this->fchr()[3]); assert_leq(acgt[0], this->_gh._gbwtLen + this->_gh.sideGbwtLen()); assert_leq(acgt[1], this->_gh._gbwtLen); assert_leq(acgt[2], this->_gh._gbwtLen); assert_leq(acgt[3], this->_gh._gbwtLen); cntsUpto[0] += (acgt[0] + this->fchr()[0]); cntsUpto[1] += (acgt[1] + this->fchr()[1]); cntsUpto[2] += (acgt[2] + this->fchr()[2]); cntsUpto[3] += (acgt[3] + this->fchr()[3]); 
masks[0].resize(num); masks[1].resize(num); masks[2].resize(num); masks[3].resize(num); WITHIN_FCHR_DOLLARA(cntsUpto); WITHIN_FCHR_DOLLARA(cntsIn); // 'cntsUpto' is complete now. // Walk forward until we've tallied the entire 'In' range index_t nm = 0; // Rest of this side nm += countBt2SideRange2(l, true, num - nm, cntsIn, masks, nm); assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]); assert_leq(nm, num); SideLocus lcopy = l; while(nm < num) { // Subsequent sides, if necessary lcopy.nextSide(this->_gh); nm += countBt2SideRange2(lcopy, false, num - nm, cntsIn, masks, nm); WITHIN_FCHR_DOLLARA(cntsIn); assert_leq(nm, num); assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]); } assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]); WITHIN_FCHR_DOLLARA(cntsIn); } /** * Count all occurrences of character c from the beginning of the * forward side to and add in the occ[] count up to the side * break just prior to the side. * * A forward side is shaped like: * * [A] [C] XXXXXXXXXXXXXXXX * -4- -4- --------56------ (numbers in bytes) * ^ * Side ptr (result from SideLocus.side()) * * And following it is a reverse side shaped like: * * [G] [T] XXXXXXXXXXXXXXXX * -4- -4- --------56------ (numbers in bytes) * ^ * Side ptr (result from SideLocus.side()) * */ inline void countBt2SideEx(const SideLocus& l, index_t* arrs) const { assert_range(0, (int)this->_gh._sideGbwtSz-1, (int)l._by); assert_range(0, 3, (int)l._bp); countUpToEx(l, arrs); assert_eq(_zGbwtByteOffs.size(), _zGbwtBpOffs.size()); for(index_t i = 0; i < _zGbwtByteOffs.size(); i++) { index_t zGbwtByteOff = _zGbwtByteOffs[i]; if(l._sideByteOff <= zGbwtByteOff && l._sideByteOff + l._by >= zGbwtByteOff) { // Adjust for the fact that we represented $ with an 'A', but // shouldn't count it as an 'A' here int zGbwtBpOff = _zGbwtBpOffs[i]; if((l._sideByteOff + l._by > zGbwtByteOff) || (l._sideByteOff + l._by == zGbwtByteOff && l._bp > zGbwtBpOff)) { arrs[0]--; // Adjust for '$' looking like an 'A' 
} } } WITHIN_FCHR(arrs); WITHIN_BWT_LEN(arrs); // Now factor in the occ[] count at the side break const uint8_t *side = l.side(this->gfm()); const uint8_t *acgt16 = side + this->_gh._sideSz - sizeof(index_t) * 4; const index_t *acgt = reinterpret_cast(acgt16); assert_leq(acgt[0], this->fchr()[1] + this->_gh.sideGbwtLen()); assert_leq(acgt[1], this->fchr()[2]-this->fchr()[1]); assert_leq(acgt[2], this->fchr()[3]-this->fchr()[2]); assert_leq(acgt[3], this->fchr()[4]-this->fchr()[3]); assert_leq(acgt[0], this->_gh._len + this->_gh.sideGbwtLen()); assert_leq(acgt[1], this->_gh._len); assert_leq(acgt[2], this->_gh._len); assert_leq(acgt[3], this->_gh._len); arrs[0] += (acgt[0] + this->fchr()[0]); arrs[1] += (acgt[1] + this->fchr()[1]); arrs[2] += (acgt[2] + this->fchr()[2]); arrs[3] += (acgt[3] + this->fchr()[3]); WITHIN_FCHR(arrs); } /** * Count all occurrences of character 1 from the beginning of the * forward side to and add in the occ[] count up to the side * break just prior to the side. * */ inline index_t countMSide(const SideLocus& l) const { assert_range(0, (int)this->_gh._sideGbwtSz-1, (int)l._by); assert_range(0, 7, (int)l._bp); index_t cCnt = countUpTo_bits(l, false /* F? */); const uint8_t *side = l.side(this->gfm()); cCnt += *(index_t*)(side + _gh._sideGbwtSz + sizeof(index_t)); assert_leq(cCnt, l.toBWRow(_gh)); assert_leq(cCnt, this->_gh._numNodes); return cCnt; } /** * Counts the number of occurrences of character 'c' in the given Ebwt * side up to (but not including) the given byte/bitpair (by/bp). * * This is a performance-critical function. This is the top search- * related hit in the time profile. * * Function gets 11.09% in profile */ inline index_t countUpTo(const SideLocus& l, int c) const { // Count occurrences of c in each 64-bit (using bit trickery); // Someday countInU64() and pop() functions should be // vectorized/SSE-ized in case that helps. 
bool usePOPCNT = false; index_t cCnt = 0; const uint8_t *side = l.side(this->gfm()); int i = 0; #ifdef POPCNT_CAPABILITY if(_usePOPCNTinstruction) { usePOPCNT = true; int by = l._by + (l._bp > 0 ? 1 : 0); for(; i < by; i += 8) { if(i + 8 < by) { cCnt += countInU64(c, *(uint64_t*)&side[i]); } else { index_t by_shift = 8 - (by - i); index_t bp_shift = (l._bp > 0 ? 4 - l._bp : 0); index_t shift = (by_shift << 3) + (bp_shift << 1); uint64_t side_i = *(uint64_t*)&side[i]; side_i = (_toBigEndian ? side_i >> shift : side_i << shift); index_t cCnt_add = countInU64(c, side_i); if(c == 0) cCnt_add -= (shift >> 1); #ifndef NDEBUG index_t cCnt_temp = 0; for(int j = i; j < l._by; j++) { cCnt_temp += cCntLUT_4[0][c][side[j]]; } if(l._bp > 0) { cCnt_temp += cCntLUT_4[(int)l._bp][c][side[l._by]]; } assert_eq(cCnt_add, cCnt_temp); #endif cCnt += cCnt_add; break; } } } else { for(; i + 7 < l._by; i += 8) { cCnt += countInU64(c, *(uint64_t*)&side[i]); } } #else for(; i + 7 < l._by; i += 8) { cCnt += countInU64(c, *(uint64_t*)&side[i]); } #endif if(!usePOPCNT) { // Count occurences of c in the rest of the side (using LUT) for(; i < l._by; i++) { cCnt += cCntLUT_4[0][c][side[i]]; } // Count occurences of c in the rest of the byte if(l._bp > 0) { cCnt += cCntLUT_4[(int)l._bp][c][side[i]]; } } return cCnt; } /** * Counts the number of occurrences of character 'c' in the given Ebwt * side down to the given byte/bitpair (by/bp). * */ inline index_t countDownTo(const SideLocus& l, int c) const { // Count occurrences of c in each 64-bit (using bit trickery); // Someday countInU64() and pop() functions should be // vectorized/SSE-ized in case that helps. 
index_t cCnt = 0; const uint8_t *side = l.side(this->gfm()); int i = 64 - 4 * sizeof(index_t) - 1; #ifdef POPCNT_CAPABILITY if ( _usePOPCNTinstruction) { for(; i - 7 > l._by; i -= 8) { cCnt += countInU64(c, *(uint64_t*)&side[i-7]); } } else { for(; i + 7 > l._by; i -= 8) { cCnt += countInU64(c, *(uint64_t*)&side[i-7]); } } #else for(; i + 7 > l._by; i -= 8) { cCnt += countInU64(c, *(uint64_t*)&side[i-7]); } #endif // Count occurences of c in the rest of the side (using LUT) for(; i > l._by; i--) { cCnt += cCntLUT_4_rev[0][c][side[i]]; } // Count occurences of c in the rest of the byte if(l._bp > 0) { cCnt += cCntLUT_4_rev[4-(int)l._bp][c][side[i]]; } else { cCnt += cCntLUT_4_rev[0][c][side[i]]; } return cCnt; } /** * Tricky-bit-bashing bitpair counting for given two-bit value (0-3) * within a 64-bit argument. * * Function gets 2.32% in profile */ #ifdef POPCNT_CAPABILITY template #endif inline static void countInU64Ex(uint64_t dw, index_t* arrs) { uint64_t c0 = c_table[0]; uint64_t x0 = dw ^ c0; uint64_t x1 = (x0 >> 1); uint64_t x2 = x1 & (0x5555555555555555llu); uint64_t x3 = x0 & x2; #ifdef POPCNT_CAPABILITY uint64_t tmp = Operation().pop64(x3); #else uint64_t tmp = pop64(x3); #endif arrs[0] += (uint32_t) tmp; c0 = c_table[1]; x0 = dw ^ c0; x1 = (x0 >> 1); x2 = x1 & (0x5555555555555555llu); x3 = x0 & x2; #ifdef POPCNT_CAPABILITY tmp = Operation().pop64(x3); #else tmp = pop64(x3); #endif arrs[1] += (uint32_t) tmp; c0 = c_table[2]; x0 = dw ^ c0; x1 = (x0 >> 1); x2 = x1 & (0x5555555555555555llu); x3 = x0 & x2; #ifdef POPCNT_CAPABILITY tmp = Operation().pop64(x3); #else tmp = pop64(x3); #endif arrs[2] += (uint32_t) tmp; c0 = c_table[3]; x0 = dw ^ c0; x1 = (x0 >> 1); x2 = x1 & (0x5555555555555555llu); x3 = x0 & x2; #ifdef POPCNT_CAPABILITY tmp = Operation().pop64(x3); #else tmp = pop64(x3); #endif arrs[3] += (uint32_t) tmp; } /** * Counts the number of occurrences of all four nucleotides in the * given side up to (but not including) the given byte/bitpair (by/bp). 
* Count for 'a' goes in arrs[0], 'c' in arrs[1], etc.
     *
     * Counts are accumulated into 'arrs' (+=), so the caller decides the
     * starting values.
     */
    inline void countUpToEx(const SideLocus& l, index_t* arrs) const {
        int i = 0;
        // Count occurrences of each nucleotide in each 64-bit word using
        // bit trickery; note: this does not seem to lend a
        // significant boost to performance in practice.  If you comment
        // out this whole loop (which won't affect correctness - it will
        // just cause the following loop to take up the slack) then runtime
        // does not change noticeably. Someday the countInU64() and pop()
        // functions should be vectorized/SSE-ized in case that helps.
        const uint8_t *side = l.side(this->gfm());
#ifdef POPCNT_CAPABILITY
        if (_usePOPCNTinstruction) {
            for(; i+7 < l._by; i += 8) {
                countInU64Ex(*(uint64_t*)&side[i], arrs);
            }
        }
        else {
            for(; i+7 < l._by; i += 8) {
                countInU64Ex(*(uint64_t*)&side[i], arrs);
            }
        }
#else
        for(; i+7 < l._by; i += 8) {
            countInU64Ex(*(uint64_t*)&side[i], arrs);
        }
#endif
        // Count occurrences of nucleotides in the rest of the side (using LUT)
        // Many cache misses on following lines (~20K)
        for(; i < l._by; i++) {
            arrs[0] += cCntLUT_4[0][0][side[i]];
            arrs[1] += cCntLUT_4[0][1][side[i]];
            arrs[2] += cCntLUT_4[0][2][side[i]];
            arrs[3] += cCntLUT_4[0][3][side[i]];
        }
        // Count occurrences of each nucleotide in the rest of the byte
        // (only the first l._bp bit-pairs of the locus byte)
        if(l._bp > 0) {
            arrs[0] += cCntLUT_4[(int)l._bp][0][side[i]];
            arrs[1] += cCntLUT_4[(int)l._bp][1][side[i]];
            arrs[2] += cCntLUT_4[(int)l._bp][2][side[i]];
            arrs[3] += cCntLUT_4[(int)l._bp][3][side[i]];
        }
    }

    /**
     * Counts the number of set bits in one of the two bit arrays stored
     * in the given side, up to (but not including) the given byte/bit
     * (by/bp).
     *
     * 'F' selects the array: when true the scan starts at byte offset
     * _sideGbwtSz/2 of the side, otherwise at _sideGbwtSz - _sideGbwtSz/4.
     *
     * This is a performance-critical function.  This is the top search-
     * related hit in the time profile.
     */
    inline index_t countUpTo_bits(const SideLocus& l, bool F) const {
        // Count set bits in each 64-bit word (using bit trickery);
        // Someday countInU64() and pop() functions should be
        // vectorized/SSE-ized in case that helps.
        bool usePOPCNT = false;
        index_t cCnt = 0;
        const uint8_t *side = l.side(this->gfm());
        if(F) {
            side += (_gh._sideGbwtSz >> 1);
        } else {
            side += (_gh._sideGbwtSz - (_gh._sideGbwtSz >> 2));
        }
        int i = 0;
#ifdef POPCNT_CAPABILITY
        if(_usePOPCNTinstruction) {
            usePOPCNT = true;
            // Number of bytes to inspect, counting a partially used final
            // byte as a whole byte.
            int by = l._by + (l._bp > 0 ? 1 : 0);
            for(; i < by; i += 8) {
                if(i + 8 < by) {
                    cCnt += countInU64_bits(*(uint64_t*)&side[i]);
                } else {
                    // Final word: shift out the bytes/bits at or past the
                    // locus so that they are not counted.
                    index_t by_shift = 8 - (by - i);
                    index_t bp_shift = (l._bp > 0 ? 8 - l._bp : 0);
                    index_t shift = (by_shift << 3) + bp_shift;
                    uint64_t side_i = *(uint64_t*)&side[i];
                    side_i = (_toBigEndian ? side_i >> shift : side_i << shift);
                    index_t cCnt_add = countInU64_bits(side_i);
#ifndef NDEBUG
                    // Cross-check against the lookup-table implementation.
                    index_t cCnt_temp = 0;
                    for(int j = i; j < l._by; j++) {
                        cCnt_temp += cCntBIT[0][side[j]];
                    }
                    if(l._bp > 0) {
                        cCnt_temp += cCntBIT[(int)l._bp][side[l._by]];
                    }
                    assert_eq(cCnt_add, cCnt_temp);
#endif
                    cCnt += cCnt_add;
                    break;
                }
            }
        } else {
            for(; i + 7 < l._by; i += 8) {
                cCnt += countInU64_bits(*(uint64_t*)&side[i]);
            }
        }
#else
        for(; i + 7 < l._by; i += 8) {
            cCnt += countInU64_bits(*(uint64_t*)&side[i]);
        }
#endif

        if(!usePOPCNT) {
            // Count set bits in the rest of the side (using LUT)
            for(; i < l._by; i++) {
                cCnt += cCntBIT[0][side[i]];
            }

            // Count set bits in the rest of the byte
            if(l._bp > 0) {
                cCnt += cCntBIT[(int)l._bp][side[i]];
            }
        }

        return cCnt;
    }

#ifndef NDEBUG
    /**
     * Given top and bot loci, calculate counts of all four DNA chars up to
     * those loci.  Used for more advanced backtracking-search.
*/
    // Single-locus variant, compiled only when NDEBUG is not defined.
    // 'arrs' must be zeroed on entry (asserted); on return arrs[c] holds
    // the countBt2SideEx() result for character c.
    inline void mapLFEx(
        const SideLocus& l,
        index_t *arrs
        ASSERT_ONLY(, bool overrideSanity = false)
        ) const
    {
        assert_eq(0, arrs[0]);
        assert_eq(0, arrs[1]);
        assert_eq(0, arrs[2]);
        assert_eq(0, arrs[3]);
        countBt2SideEx(l, arrs);
        if(_sanity && !overrideSanity) {
            // Make sure results match up with individual calls to mapLF;
            // be sure to override sanity-checking in the callee, or we'll
            // have infinite recursion
            assert_eq(mapLF(l, 0, true), arrs[0]);
            assert_eq(mapLF(l, 1, true), arrs[1]);
            assert_eq(mapLF(l, 2, true), arrs[2]);
            assert_eq(mapLF(l, 3, true), arrs[3]);
        }
    }
#endif

    /**
     * Given top and bot rows, calculate counts of all four DNA chars up to
     * those loci.
     *
     * Builds the two SideLocus objects with SideLocus::initFromTopBot()
     * and forwards to the locus-based overload.
     */
    inline void mapLFEx(
        index_t top,
        index_t bot,
        index_t *tops,
        index_t *bots
        ASSERT_ONLY(, bool overrideSanity = false)
        ) const
    {
        SideLocus ltop, lbot;
        SideLocus::initFromTopBot(top, bot, _gh, gfm(), ltop, lbot);
        mapLFEx(ltop, lbot, tops, bots ASSERT_ONLY(, overrideSanity));
    }

    /**
     * Given top and bot loci, calculate counts of all four DNA chars up to
     * those loci.  Used for more advanced backtracking-search.
*/
    // 'tops' and 'bots' must be zeroed on entry (asserted); they receive
    // the countBt2SideEx() results for 'ltop' and 'lbot' respectively.
    inline void mapLFEx(
        const SideLocus& ltop,
        const SideLocus& lbot,
        index_t *tops,
        index_t *bots
        ASSERT_ONLY(, bool overrideSanity = false)
        ) const
    {
        assert(ltop.repOk(this->gh()));
        assert(lbot.repOk(this->gh()));
        assert_eq(0, tops[0]); assert_eq(0, bots[0]);
        assert_eq(0, tops[1]); assert_eq(0, bots[1]);
        assert_eq(0, tops[2]); assert_eq(0, bots[2]);
        assert_eq(0, tops[3]); assert_eq(0, bots[3]);
        countBt2SideEx(ltop, tops);
        countBt2SideEx(lbot, bots);
#ifndef NDEBUG
        if(_sanity && !overrideSanity) {
            // Make sure results match up with individual calls to mapLF;
            // be sure to override sanity-checking in the callee, or we'll
            // have infinite recursion
            assert_eq(mapLF(ltop, 0, true), tops[0]);
            assert_eq(mapLF(ltop, 1, true), tops[1]);
            assert_eq(mapLF(ltop, 2, true), tops[2]);
            assert_eq(mapLF(ltop, 3, true), tops[3]);
            assert_eq(mapLF(lbot, 0, true), bots[0]);
            assert_eq(mapLF(lbot, 1, true), bots[1]);
            assert_eq(mapLF(lbot, 2, true), bots[2]);
            assert_eq(mapLF(lbot, 3, true), bots[3]);
        }
#endif
    }

    /**
     * Counts the number of occurrences of all four nucleotides in the
     * given side from the given byte/bitpair (l->_by/l->_bp) (or the
     * beginning of the side if startAtLocus is false).  Count for 'a'
     * goes in arrs[0], 'c' in arrs[1], etc.
     *
     * At most 'num' characters are tallied; the walk also stops when it
     * reaches the end of the side's character data.  Returns the number
     * of characters actually tallied.
     *
     * Note: must account for $.
     *
     * Must fill in masks: for each tallied character at position
     * maskOff + k, exactly one of masks[0..3][maskOff + k] is set to true.
     */
    inline index_t countBt2SideRange2(
        const SideLocus& l,
        bool startAtLocus,
        index_t num,
        index_t* arrs,
        EList *masks,
        index_t maskOff) const
    {
        assert(!masks[0].empty());
        assert_eq(masks[0].size(), masks[1].size());
        assert_eq(masks[0].size(), masks[2].size());
        assert_eq(masks[0].size(), masks[3].size());
        ASSERT_ONLY(index_t myarrs[4] = {0, 0, 0, 0});
        index_t nm = 0; // number of nucleotides tallied so far
        int iby = 0;    // initial byte offset
        int ibp = 0;    // initial base-pair offset
        if(startAtLocus) {
            iby = l._by;
            ibp = l._bp;
        } else {
            // Start at beginning
        }
        int by = iby, bp = ibp;
        assert_lt(bp, 4);
        // In the non-linear layout only the first half of _sideGbwtSz is
        // walked here.
        index_t sideGbwtSz = this->_gh._sideGbwtSz >> (this->_gh.linearFM() ? 0 : 1);
        assert_lt(by, (int)sideGbwtSz);
        const uint8_t *side = l.side(this->gfm());
        while(nm < num) {
            // Extract the two-bit character at (by, bp).
            int c = (side[by] >> (bp * 2)) & 3;
            assert_lt(maskOff + nm, masks[c].size());
            masks[0][maskOff + nm] = masks[1][maskOff + nm] =
            masks[2][maskOff + nm] = masks[3][maskOff + nm] = false;
            assert_range(0, 3, c);
            // Note: we tally $ just like an A
            arrs[c]++; // tally it
            ASSERT_ONLY(myarrs[c]++);
            masks[c][maskOff + nm] = true; // not dead
            nm++;
            if(++bp == 4) {
                bp = 0;
                by++;
                assert_leq(by, (int)sideGbwtSz);
                if(by == (int)sideGbwtSz) {
                    // Fell off the end of the side
                    break;
                }
            }
        }
        WITHIN_FCHR_DOLLARA(arrs);
#ifndef NDEBUG
        if(_sanity) {
            // Make sure results match up with a call to mapLFEx.
            index_t tops[4] = {0, 0, 0, 0};
            index_t bots[4] = {0, 0, 0, 0};
            index_t top = l.toBWRow(gh());
            index_t bot = top + nm;
            mapLFEx(top, bot, tops, bots, false);
            // The 'A' tally may be one higher than the mapLFEx difference.
            assert(myarrs[0] == (bots[0] - tops[0]) || myarrs[0] == (bots[0] - tops[0])+1);
            assert_eq(myarrs[1], bots[1] - tops[1]);
            assert_eq(myarrs[2], bots[2] - tops[2]);
            assert_eq(myarrs[3], bots[3] - tops[3]);
        }
#endif
        return nm;
    }

    /**
     * Return the final character in row i (i.e. the i'th character in the
     * BWT transform).  Note that the 'L' in the name of the function
     * stands for 'last', as in the literature.
     *
     * This overload takes an already-initialized locus.
     */
    inline int rowL(const SideLocus& l) const {
        // Extract and return appropriate bit-pair
        return unpack_2b_from_8b(l.side(this->gfm())[l._by], l._bp);
    }

    /**
     * Return the final character in row i (i.e. the i'th character in the
     * BWT transform).  Note that the 'L' in the name of the function
     * stands for 'last', as in the literature.
     *
     * This overload builds the locus from the row number first.
     */
    inline int rowL(index_t i) const {
        // Extract and return appropriate bit-pair
        SideLocus l;
        l.initFromRow(i, _gh, gfm());
        return rowL(l);
    }

    /**
     * Given top and bot loci, calculate counts of all four DNA chars up to
     * those loci.  Used for more advanced backtracking-search.
*/
    // Range variant: 'num' must equal the row distance between 'lbot' and
    // 'ltop' (asserted).  Both count arrays must be zeroed on entry
    // (asserted).  All work is delegated to countBt2SideRange(); 'lbot' is
    // only used by the assertions and the sanity check.
    inline void mapLFRange(
        const SideLocus& ltop,
        const SideLocus& lbot,
        index_t num,        // Number of elts
        index_t* cntsUpto,  // A/C/G/T counts up to top
        index_t* cntsIn,    // A/C/G/T counts within range
        EList *masks
        ASSERT_ONLY(, bool overrideSanity = false)
        ) const
    {
        assert(ltop.repOk(this->gh()));
        assert(lbot.repOk(this->gh()));
        assert_eq(num, lbot.toBWRow(this->gh()) - ltop.toBWRow(this->gh()));
        assert_eq(0, cntsUpto[0]); assert_eq(0, cntsIn[0]);
        assert_eq(0, cntsUpto[1]); assert_eq(0, cntsIn[1]);
        assert_eq(0, cntsUpto[2]); assert_eq(0, cntsIn[2]);
        assert_eq(0, cntsUpto[3]); assert_eq(0, cntsIn[3]);
        countBt2SideRange(ltop, num, cntsUpto, cntsIn, masks);
        assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
#ifndef NDEBUG
        if(_sanity && !overrideSanity) {
            // Make sure results match up with individual calls to mapLF;
            // be sure to override sanity-checking in the callee, or we'll
            // have infinite recursion
            index_t tops[4] = {0, 0, 0, 0};
            index_t bots[4] = {0, 0, 0, 0};
            assert(ltop.repOk(this->gh()));
            assert(lbot.repOk(this->gh()));
            mapLFEx(ltop, lbot, tops, bots, false);
            for(int i = 0; i < 4; i++) {
                assert(cntsUpto[i] == tops[i] || tops[i] == bots[i]);
                if(i == 0) {
                    // The 'A' tally may be one higher than the mapLFEx
                    // difference.
                    assert(cntsIn[i] == bots[i]-tops[i] ||
                           cntsIn[i] == bots[i]-tops[i]+1);
                } else {
                    assert_eq(cntsIn[i], bots[i]-tops[i]);
                }
            }
        }
#endif
    }

    /**
     * Given row i, return the row that the LF mapping maps i to.
*/
    // The character is read from the row itself via rowL(l), then the
    // target row is computed with countBt2Side().
    inline index_t mapLF(
        const SideLocus& l
        ASSERT_ONLY(, bool overrideSanity = false)
        ) const
    {
        ASSERT_ONLY(index_t srcrow = l.toBWRow(_gh));
        index_t ret;
        assert(l.side(this->gfm()) != NULL);
        int c = rowL(l);
        assert_lt(c, 4);
        assert_geq(c, 0);
        ret = countBt2Side(l, c);
        assert_lt(ret, this->_gh._gbwtLen);
        assert_neq(srcrow, ret);
#ifndef NDEBUG
        if(_sanity && !overrideSanity) {
            // Make sure results match up with results from mapLFEx;
            // be sure to override sanity-checking in the callee, or we'll
            // have infinite recursion
            index_t arrs[] = { 0, 0, 0, 0 };
            mapLFEx(l, arrs, true);
            assert_eq(arrs[c], ret);
        }
#endif
        return ret;
    }

    /**
     * Given row i and character c, return the row that the LF mapping maps
     * i to on character c.
     *
     * Unlike the overload above, the character is supplied by the caller
     * rather than read from the row.
     */
    inline index_t mapLF(
        const SideLocus& l, int c
        ASSERT_ONLY(, bool overrideSanity = false)
        ) const
    {
        index_t ret;
        assert_lt(c, 4);
        assert_geq(c, 0);
        ret = countBt2Side(l, c);
        assert_lt(ret, this->_gh._gbwtLen);
#ifndef NDEBUG
        if(_sanity && !overrideSanity) {
            // Make sure results match up with results from mapLFEx;
            // be sure to override sanity-checking in the callee, or we'll
            // have infinite recursion
            index_t arrs[] = { 0, 0, 0, 0 };
            mapLFEx(l, arrs, true);
            assert_eq(arrs[c], ret);
        }
#endif
        return ret;
    }

    /**
     * Given a top and a bottom locus and character c, apply the plain LF
     * mapping on c to both and return the resulting (top, bot) pair.
     * When 'node_range' is non-NULL it receives the same pair.
     */
    inline pair mapLF(
        SideLocus& tloc, SideLocus& bloc, int c,
        pair* node_range = NULL
        ASSERT_ONLY(, bool overrideSanity = false)
        ) const
    {
        assert_lt(c, 4);
        assert_geq(c, 0);
        index_t top = mapLF(tloc, c);
        index_t bot = mapLF(bloc, c);
        if(node_range != NULL) {
            node_range->first = top; node_range->second = bot;
        }
        return pair(top, bot);
    }

    /**
     * Given row i and character c, return the row that the GLF mapping maps
     * i to on character c.
*/
    // Returns the mapped (top, bot) row pair; (0, 0) when the mapped range is
    // empty.  For a linear index (gh().linearFM()) the plain LF result is
    // returned unchanged and node_range receives the same pair.  Otherwise
    // the LF result is translated through the M and F bit-vectors:
    // node_range (if non-NULL) receives the M-rank pair (node_top, node_bot)
    // and the returned pair is the corresponding F positions.  When
    // node_iedges is non-NULL and the node range is at most k wide but
    // narrower than the row range, getInEdgeCount() fills it.
    //
    // NOTE(review): `pair` and `EList >` have lost their template arguments
    // in this listing -- confirm against upstream.
    inline pair mapGLF(
        SideLocus& tloc, SideLocus& bloc, int c,
        pair* node_range = NULL,
        EList >* node_iedges = NULL,
        index_t k = 5
        ASSERT_ONLY(, bool overrideSanity = false)
        ) const
    {
        assert_lt(c, 4);
        assert_geq(c, 0);
        index_t top = mapLF(tloc, c);
        index_t bot = mapLF(bloc, c);
        if(gh().linearFM()) {
            if(node_range != NULL) {
                node_range->first = top; node_range->second = bot;
            }
            if(node_iedges != NULL) {
                node_iedges->clear();
            }
            return pair(top, bot);
        }
        if(top + 1 >= gh()._gbwtLen || top >= bot) {
            assert_eq(top, bot);
            return pair(0, 0);
        }

        // --- top: rank in M, then locate the matching position in F ---
        tloc.initFromRow_bit(top + 1, gh(), gfm());
        index_t node_top = rank_M(tloc) - 1;

        index_t top_F_loc = 0, top_M_occ = 0;
        size_t iter = 0;
        while(true) {
            // Read the two index_t values stored right after the side's
            // payload (offset _sideGbwtSz); step back one whole side per
            // iteration until the stored M count is <= node_top.
            const uint8_t *side = tloc.side(gfm()) + gh()._sideGbwtSz - gh()._sideSz * iter;
            top_F_loc = *((index_t*)side);
            side += sizeof(index_t);
            top_M_occ = *((index_t*)side);
            assert_leq(top_M_occ, node_top + 1);
            if(top_M_occ <= node_top) break;
            iter++;
        }
        if(top_M_occ > 0) top_F_loc++;
        tloc.initFromRow_bit(top_F_loc, gh(), gfm());
        if(node_top + 1 > top_M_occ) {
            top = select_F(tloc, node_top + 1 - top_M_occ);
        } else {
            top = top_F_loc;
        }

        // --- bot: same translation, but without the backwards search ---
        bloc.initFromRow_bit(bot, gh(), gfm());
        index_t node_bot = rank_M(bloc);

        const uint8_t *side = bloc.side(gfm()) + gh()._sideGbwtSz;
        index_t bot_F_loc = *((index_t*)side);
        side += sizeof(index_t);
        index_t bot_M_occ = *((index_t*)side);
        assert_leq(bot_M_occ, node_bot + 1);
        if(bot_M_occ > 0) bot_F_loc++;
        bloc.initFromRow_bit(bot_F_loc, gh(), gfm());

        if(node_bot + 1 > bot_M_occ) {
            bot = select_F(bloc, node_bot + 1 - bot_M_occ);
        } else {
            bot = bot_F_loc;
        }

        if(node_range != NULL) {
            (*node_range).first = node_top; (*node_range).second = node_bot;
        }
        assert_leq(node_bot - node_top, bot - top);
        if(node_iedges != NULL && node_bot - node_top <= k && node_bot - node_top < bot - top) {
            getInEdgeCount(top, bot, *node_iedges);
        }

        return pair(top, bot);
    }

    /**
     * Given top and bot loci, calculate counts of all four DNA chars up to
     * those loci.
Also, update a set of tops and bots for the reverse * index/direction using the idea from the bi-directional BWT paper. */ inline void mapBiLFEx( const SideLocus& ltop, const SideLocus& lbot, index_t *tops, index_t *bots, index_t *topsP, // topsP[0] = top index_t *botsP ASSERT_ONLY(, bool overrideSanity = false) ) const { #ifndef NDEBUG for(int i = 0; i < 4; i++) { assert_eq(0, tops[0]); assert_eq(0, bots[0]); } #endif countBt2SideEx(ltop, tops); countBt2SideEx(lbot, bots); #ifndef NDEBUG if(_sanity && !overrideSanity) { // Make sure results match up with individual calls to mapLF; // be sure to override sanity-checking in the callee, or we'll // have infinite recursion assert_eq(mapLF(ltop, 0, true), tops[0]); assert_eq(mapLF(ltop, 1, true), tops[1]); assert_eq(mapLF(ltop, 2, true), tops[2]); assert_eq(mapLF(ltop, 3, true), tops[3]); assert_eq(mapLF(lbot, 0, true), bots[0]); assert_eq(mapLF(lbot, 1, true), bots[1]); assert_eq(mapLF(lbot, 2, true), bots[2]); assert_eq(mapLF(lbot, 3, true), bots[3]); } #endif // bots[0..3] - tops[0..3] = # of ways to extend the suffix with an // A, C, G, T botsP[0] = topsP[0] + (bots[0] - tops[0]); topsP[1] = botsP[0]; botsP[1] = topsP[1] + (bots[1] - tops[1]); topsP[2] = botsP[1]; botsP[2] = topsP[2] + (bots[2] - tops[2]); topsP[3] = botsP[2]; botsP[3] = topsP[3] + (bots[3] - tops[3]); } /** * Given row and its locus information, proceed on the given character * and return the next row, or all-fs if we can't proceed on that * character. Returns 0xffffffff if this row ends in $. 
*/
    // Returns INDEX_MAX both when the row's last-column character differs
    // from c and when 'row' is one of the recorded _zOffs rows.
    inline index_t mapLF1(
        index_t row,          // starting row
        const SideLocus& l,   // locus for starting row
        int c                 // character to proceed on
        ASSERT_ONLY(, bool overrideSanity = false)
        ) const
    {
        if(rowL(l) != c) return (index_t)INDEX_MAX;
        // Linear scan over the z-offset rows.
        for(index_t i = 0; i < _zOffs.size(); i++) {
            if(row == _zOffs[i]) return (index_t)INDEX_MAX;
        }
        index_t ret;
        assert_lt(c, 4);
        assert_geq(c, 0);
        ret = countBt2Side(l, c);
        assert_lt(ret, this->_gh._gbwtLen);
#ifndef NDEBUG
        if(_sanity && !overrideSanity) {
            // Make sure results match up with results from mapLFEx;
            // be sure to override sanity-checking in the callee, or we'll
            // have infinite recursion
            index_t arrs[] = { 0, 0, 0, 0 };
            mapLFEx(l, arrs, true);
            assert_eq(arrs[c], ret);
        }
#endif
        return ret;
    }

    /**
     * Given row and its locus information, set the row to LF(row) and
     * return the character that was in the final column.
     *
     * If 'row' is one of the recorded _zOffs rows, -1 is returned and
     * 'row' is left unchanged.
     */
    inline int mapLF1(
        index_t& row,         // starting row
        const SideLocus& l    // locus for starting row
        ASSERT_ONLY(, bool overrideSanity = false)
        ) const
    {
        for(index_t i = 0; i < _zOffs.size(); i++) {
            if(row == _zOffs[i]) return -1;
        }
        int c = rowL(l);
        assert_range(0, 3, c);
        row = countBt2Side(l, c);
        assert_lt(row, this->_gh._gbwtLen);
#ifndef NDEBUG
        if(_sanity && !overrideSanity) {
            // Make sure results match up with results from mapLFEx;
            // be sure to override sanity-checking in the callee, or we'll
            // have infinite recursion
            index_t arrs[] = { 0, 0, 0, 0 };
            mapLFEx(l, arrs, true);
            assert_eq(arrs[c], row);
        }
#endif
        return c;
    }

    /**
     * Given row and its locus information, proceed on the given character
     * and return the next row, or all-fs if we can't proceed on that
     * character.  Returns 0xffffffff if this row ends in $.
*/
    // As written, failure (mapLF1 yielding INDEX_MAX) is reported as the
    // pair (0, 0), not as all-fs.  On success the result is a (top, bot)
    // pair: (row, row + 1) for a linear index, otherwise the F positions of
    // node_top and node_top + 1, with those two node ids stored in
    // node_range when it is non-NULL.  The locus 'l' is re-initialised
    // along the way.
    //
    // NOTE(review): the bare `pair` types have lost their template
    // arguments in this listing -- confirm against upstream.
    inline pair mapGLF1(
        index_t row,          // starting row
        SideLocus& l,         // locus for starting row
        int c,                // character to proceed
        pair* node_range = NULL
        ASSERT_ONLY(, bool overrideSanity = false)
        ) const
    {
        assert_lt(c, 4);
        assert_geq(c, 0);
        index_t top = mapLF1(row, l, c);
        if(top == (index_t)INDEX_MAX) return pair(0, 0);
        if(gh().linearFM()) {
            if(node_range != NULL) {
                node_range->first = top; node_range->second = top + 1;
            }
            return pair(top, top + 1);
        }
        index_t bot = top;

        l.initFromRow_bit(top + 1, gh(), gfm());
        index_t node_top = rank_M(l) - 1;

        index_t F_loc = 0, M_occ = 0;
        size_t iter = 0;
        while(true) {
            // Read the two index_t values stored right after the side's
            // payload; step back one side per iteration until the stored
            // M count is <= node_top.
            const uint8_t *side = l.side(gfm()) + gh()._sideGbwtSz - gh()._sideSz * iter;
            F_loc = *((index_t*)side);
            side += sizeof(index_t);
            M_occ = *((index_t*)side);
            assert_leq(M_occ, node_top + 1);
            if(M_occ <= node_top) break;
            iter++;
        }
        if(M_occ > 0) F_loc++;
        l.initFromRow_bit(F_loc, gh(), gfm());
        if(node_top + 1 > M_occ) {
            top = select_F(l, node_top + 1 - M_occ);
        } else {
            top = F_loc;
        }

        index_t node_bot = node_top + 1;
        if(node_bot + 1 > M_occ) {
            // l2 is only referenced by the disabled branch below.
            SideLocus l2;
#if 0
            l2.initFromRow_bit(top + 1, gh(), gfm());
            bot = select_F(l2, 1);
            ASSERT_ONLY(index_t bot2 = select_F(l, node_bot + 1 - M_occ));
            assert_eq(bot, bot2);
#else
            bot = select_F(l, node_bot + 1 - M_occ);
#endif
        } else {
            bot = F_loc;
        }

        if(node_range != NULL) {
            (*node_range).first = node_top; (*node_range).second = node_bot;
        }

        return pair(top, bot);
    }

    /**
     * Given row and its locus information, proceed on the given character
     * and return the next row, or all-fs if we can't proceed on that
     * character.  Returns 0xffffffff if this row ends in $.
*/
    // This overload takes no character: it follows whatever character sits
    // in the row's last column (via mapLF1(row, l)).  A row listed in
    // _zOffs yields (INDEX_MAX, INDEX_MAX).
    //
    // NOTE(review): the bare `pair` types have lost their template
    // arguments in this listing -- confirm against upstream.
    inline pair mapGLF1(
        index_t row,          // starting row
        SideLocus& l,         // locus for starting row
        pair* node_range = NULL
        ASSERT_ONLY(, bool overrideSanity = false)
        ) const
    {
        for(index_t i = 0; i < _zOffs.size(); i++) {
            if(row == _zOffs[i]) return pair((index_t)INDEX_MAX, (index_t)INDEX_MAX);
        }

        // 'row' is a by-value copy; mapLF1 advances it in place and its
        // return value (the character) is not needed here.
        mapLF1(row, l);
        index_t top = row;
        if(top == (index_t)INDEX_MAX) return pair(0, 0);
        if(gh().linearFM()) {
            if(node_range != NULL) {
                node_range->first = top; node_range->second = top + 1;
            }
            return pair(top, top + 1);
        }
        index_t bot = top;

        l.initFromRow_bit(top + 1, gh(), gfm());
        index_t node_top = rank_M(l) - 1;

        index_t F_loc = 0, M_occ = 0;
        size_t iter = 0;
        while(true) {
            // Read the two index_t values stored right after the side's
            // payload; step back one side per iteration until the stored
            // M count is <= node_top.
            const uint8_t *side = l.side(gfm()) + gh()._sideGbwtSz - gh()._sideSz * iter;
            F_loc = *((index_t*)side);
            side += sizeof(index_t);
            M_occ = *((index_t*)side);
            assert_leq(M_occ, node_top + 1);
            if(M_occ <= node_top) break;
            iter++;
        }
        if(M_occ > 0) F_loc++;
        l.initFromRow_bit(F_loc, gh(), gfm());
        if(node_top + 1 > M_occ) {
            top = select_F(l, node_top + 1 - M_occ);
        } else {
            top = F_loc;
        }

        index_t node_bot = node_top + 1;
        if(node_bot + 1 > M_occ) {
            // NOTE(review): the disabled branch refers to 'l2', which is not
            // declared in this overload; it would not compile if enabled.
#if 0
            l2.initFromRow_bit(top + 1, gh(), gfm());
            bot = select_F(l2, 1);
            ASSERT_ONLY(index_t bot2 = select_F(l, node_bot + 1 - M_occ));
            assert_eq(bot, bot2);
#else
            bot = select_F(l, node_bot + 1 - M_occ);
#endif
        } else {
            bot = F_loc;
        }

        if(node_range != NULL) {
            (*node_range).first = node_top; (*node_range).second = node_bot;
        }

        return pair(top, bot);
    }

    /**
     * Given row i, return rank
     *
     * Thin wrapper: returns countMSide(l), asserted to be <= _numNodes.
     */
    inline index_t rank_M(
        const SideLocus& l
        ASSERT_ONLY(, bool overrideSanity = false)
        ) const
    {
        index_t ret = countMSide(l);
        assert_leq(ret, this->_gh._numNodes);
        return ret;
    }

    /**
     * Given row i, return select
     *
     * Starting at locus 'l' (taken by value, so the caller's locus is not
     * modified), scan forward through the bit-vector that begins
     * (_sideGbwtSz >> 1) bytes into each side, consuming 'count' (> 0)
     * counted bits, and return the BW row of the final locus.
     */
    inline index_t select_F(
        SideLocus l,
        index_t count
        ASSERT_ONLY(, bool overrideSanity = false)
        ) const
    {
        assert_gt(count, 0);
        const uint8_t *side = l.side(this->gfm()) + (_gh._sideGbwtSz >> 1);
        while(true) {
            // Rows left in this side, counting from the current offset.
            index_t remainingBitsSide = (_gh._sideGbwtSz << 1) - l._charOff;
            assert_gt(remainingBitsSide, 0);
            index_t minSide = (count < remainingBitsSide ? count : remainingBitsSide);
            // Load 64 bits starting at the current byte, then drop the bits
            // below the current bit offset.
            uint64_t bits = *(uint64_t*)&side[l._by];
            uint8_t advance = 64;
            if(l._bp > 0) {
                bits >>= l._bp;
                advance -= l._bp;
            }
            // Never look at more than minSide bits in one step; the shift
            // discards everything above the low minSide bits.
            if(minSide < advance) {
                advance = minSide;
                bits <<= (64 - minSide);
            }
            uint8_t tmp_count = 0;
            // NOTE(review): both POPCNT branches read identically here;
            // presumably they differed by a template argument lost from
            // this listing -- confirm against upstream.
#ifdef POPCNT_CAPABILITY
            if(_usePOPCNTinstruction) {
                tmp_count = countInU64_bits(bits);
            } else {
                tmp_count = countInU64_bits(bits);
            }
#else
            tmp_count = countInU64_bits(bits);
#endif
            assert_leq(tmp_count, count);
            count -= tmp_count;
            if(count == 0) {
                // Done: step to the last bit examined and stop.
                assert_gt(advance, 0);
                l._charOff += (advance - 1);
                assert_lt(l._charOff, _gh._sideGbwtSz << 1);
                l._by = l._charOff >> 3;
                l._bp = l._charOff & 0x7;
                break;
            }
            assert_leq(l._charOff + advance, (_gh._sideGbwtSz << 1));
            if(l._charOff + advance == (_gh._sideGbwtSz << 1)) {
                // Ran off the end of this side; continue in the next one.
                l.nextSide(_gh);
                side = l.side(this->gfm()) + (_gh._sideGbwtSz >> 1);
            } else {
                l._charOff += advance;
                l._by = l._charOff >> 3;
                l._bp = l._charOff & 0x7;
            }
        }
        return l.toBWRow(_gh);
    }

    /**
     * Walk rows (top, bot) of the same bit-vector select_F reads and record
     * runs of 0 bits in 'node_iedges' (cleared first).  Each entry's
     * 'first' is the number of 1 bits seen before the run and its 'second'
     * is the run length.  The bit at 'top' itself is asserted to be 1 and
     * is not examined again.
     *
     * NOTE(review): `EList >&` has lost its template arguments in this
     * listing -- confirm against upstream.
     */
    inline void getInEdgeCount(
        index_t top,
        index_t bot,
        EList >& node_iedges) const
    {
        assert_lt(top, bot);
        node_iedges.clear();
        SideLocus l; l.initFromRow_bit(top, _gh, gfm());
        const uint8_t *side = l.side(this->gfm()) + (_gh._sideGbwtSz >> 1);
        assert_lt(l._by, (_gh._sideGbwtSz >> 2));
        assert_eq((side[l._by] >> l._bp) & 0x1, 0x1);
        bool first = true;
        index_t curr_node = 0;
        index_t num0s = 0;
        while(top < bot) {
            if(first) {
                // Skip the bit at 'top'.
                first = false;
            } else {
                int bit = (side[l._by] >> l._bp) & 0x1;
                if(bit == 0x1) {
                    curr_node++;
                    num0s = 0;
                } else {
                    num0s++;
                    if(num0s == 1) {
                        // First 0 of a new run: open an entry for it.
                        node_iedges.expand();
                        node_iedges.back().first = curr_node;
                    }
                    node_iedges.back().second = num0s;
                }
            }
            // Advance one bit, rolling over to the next side when needed.
            if(l._charOff + 1 == (_gh._sideGbwtSz << 1)) {
                l.nextSide(_gh);
                side = l.side(this->gfm()) + (_gh._sideGbwtSz >> 1);
            } else {
                l._charOff++;
                l._by = l._charOff >> 3;
                l._bp = l._charOff & 0x7;
            }
            top++;
        }
    }

#ifndef NDEBUG
    /// Check that in-memory Ebwt is internally consistent with respect
    /// to given
/// EbwtParams; assert if not
    // NOTE(review): `GFMParams`, `EList`, `APtrWrap`, `RepeatDB` below carry
    // no template arguments in this listing; they appear to have been lost
    // -- confirm against the upstream header.
    bool inMemoryRepOk(const GFMParams& gh) const {
        // The three z-offset lists must stay parallel.
        assert_eq(_zOffs.size(), _zGbwtByteOffs.size());
        assert_eq(_zOffs.size(), _zGbwtBpOffs.size());
        for(index_t i = 0; i < _zOffs.size(); i++) {
            assert_geq(_zGbwtBpOffs[i], 0);
            assert_lt(_zGbwtBpOffs[i], 4);
            assert_lt(_zGbwtByteOffs[i], gh._gbwtTotSz);
            assert_lt(_zOffs[i], gh._gbwtLen);
        }
        assert_geq(_nFrag, _nPat);
        assert_eq(_alts.size(), _altnames.size());
        return true;
    }

    /// Check that in-memory Ebwt is internally consistent; assert if
    /// not
    bool inMemoryRepOk() const {
        // Goes through repOk(), which only reaches inMemoryRepOk(gh) when
        // isInMemory() is true.
        return repOk(_gh);
    }

    /// Check that Ebwt is internally consistent with respect to given
    /// EbwtParams; assert if not
    bool repOk(const GFMParams& gh) const {
        assert(_gh.repOk());
        if(isInMemory()) {
            return inMemoryRepOk(gh);
        }
        return true;
    }

    /// Check that Ebwt is internally consistent; assert if not
    bool repOk() const {
        return repOk(_gh);
    }
#endif

    bool       _toBigEndian;
    int32_t    _overrideOffRate;
    bool       _verbose;
    bool       _passMemExc;
    bool       _sanity;
    bool       fw_;     // true iff this is a forward index
    FILE       *_in1;    // input fd for primary index file
    FILE       *_in2;    // input fd for secondary index file
    string     _in1Str; // filename for primary index file
    string     _in2Str; // filename for secondary index file
    // Rows recorded as z-offsets, with two parallel lists of the same
    // length (see inMemoryRepOk).
    EList _zOffs;
    EList _zGbwtByteOffs;
    EList _zGbwtBpOffs;
    index_t    _nPat;  /// number of reference texts
    index_t    _nFrag; /// number of fragments
    APtrWrap _plen;
    APtrWrap _rstarts; // starting offset of fragments / text indexes
    // _fchr, _ftab and _eftab are expected to be relatively small
    // (usually < 1MB, perhaps a few MB if _fchr is particularly large
    // - like, say, 11).  For this reason, we don't bother with writing
    // them to disk through separate output streams; we
    APtrWrap _fchr;
    APtrWrap _ftab;
    APtrWrap _eftab; // "extended" entries for _ftab
    // _offs may be extremely large.  E.g. for DNA w/ offRate=4 (one
    // offset every 16 rows), the total size of _offs is the same as
    // the total size of the input sequence
    APtrWrap _offs;
    // _ebwt is the Extended Burrows-Wheeler Transform itself, and thus
    // is at least as large as the input sequence.
    APtrWrap _gfm;
    bool       _useMm;        /// use memory-mapped files to hold the index
    bool       useShmem_;     /// use shared memory to hold large parts of the index
    EList _refnames; /// names of the reference sequences
    EList _refnames_nospace; // names of the reference sequences (names stop at space)
    char *mmFile1_;
    char *mmFile2_;
    int _nthreads;
    GFMParams _gh;
    bool packed_;

    // Default build parameters.
    static const uint64_t default_bmax = INDEX_MAX;
    static const uint64_t default_bmaxMultSqrt = INDEX_MAX;
    static const uint64_t default_bmaxDivN = 4;
    static const int      default_dcv = 1024;
    static const bool     default_noDc = false;
    static const bool     default_useBlockwise = true;
    static const uint32_t default_seed = 0;
    // The default line rates are one larger when the 64-bit index is
    // compiled in.
#ifdef BOWTIE_64BIT_INDEX
    static const int      default_lineRate_gfm = 8;
    static const int      default_lineRate_fm = 7;
#else
    static const int      default_lineRate_gfm = 7;
    static const int      default_lineRate_fm = 6;
#endif
    static const int      default_offRate = 5;
    static const int      default_offRatePlus = 0;
    static const int      default_ftabChars = 10;
    static const bool     default_bigEndian = false;

    // data used to build an index
    EList > _alts;
    EList _altnames;
    EList > _haplotypes;
    RepeatDB _repeatdb;
    EList _repeat_kmertables;
    bool _repeat;
    EList > _repeatLens;
    EList _repeatIncluded;

protected:

    /// Stream that verbose messages are written to (currently always cerr).
    ostream& log() const {
        return cerr; // TODO: turn this into a parameter
    }

    /// Print a verbose message and flush (flushing is helpful for
    /// debugging)
    void verbose(const string& s) const {
        if(this->verbose()) {
            this->log() << s.c_str();
            this->log().flush();
        }
    }
};

/**
 * Read reference names from an input stream 'in' for an Ebwt primary
 * file and store them in 'refnames'.
*/ template void readEbwtRefnames(istream& in, EList& refnames) { // _in1 must already be open with the get cursor at the // beginning and no error flags set. assert(in.good()); assert_eq((streamoff)in.tellg(), ios::beg); // Read endianness hints from both streams bool switchEndian = false; uint32_t one = readU32(in, switchEndian); // 1st word of primary stream if(one != 1) { assert_eq((1u<<24), one); switchEndian = true; } readU32(in, switchEndian); // version // Reads header entries one by one from primary stream index_t len = readIndex(in, switchEndian); index_t gbwtLen = readIndex(in, switchEndian); index_t numNodes = readIndex(in, switchEndian); int32_t lineRate = readI32(in, switchEndian); /*int32_t linesPerSide =*/ readI32(in, switchEndian); int32_t offRate = readI32(in, switchEndian); int32_t ftabChars = readI32(in, switchEndian); index_t eftabLen = readIndex(in, switchEndian); // BTL: chunkRate is now deprecated int32_t flags = readI32(in, switchEndian); bool entireReverse = false; if(flags < 0) { entireReverse = (((-flags) & GFM_ENTIRE_REV) != 0); } // Create a new EbwtParams from the entries read from primary stream GFMParams gh(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireReverse); index_t nPat = readIndex(in, switchEndian); // nPat in.seekg(nPat*sizeof(index_t), ios_base::cur); // skip plen // Skip rstarts index_t nFrag = readIndex(in, switchEndian); in.seekg(nFrag*sizeof(index_t)*3, ios_base::cur); // Skip ebwt in.seekg(gh._gbwtTotLen, ios_base::cur); // Skip zOff from primary stream index_t numZOffs = readIndex(in, switchEndian); in.seekg(numZOffs * sizeof(index_t), ios_base::cur); // Skip fchr in.seekg(5 * sizeof(index_t), ios_base::cur); // Skip ftab in.seekg(gh._ftabLen*sizeof(index_t), ios_base::cur); // Skip eftab in.seekg(gh._eftabLen*sizeof(index_t), ios_base::cur); // Read reference sequence names from primary index file while(true) { char c = '\0'; in.read(&c, 1); if(in.eof()) break; if(c == '\0') break; else if(c == 
'\n') { refnames.push_back(""); } else { if(refnames.size() == 0) { refnames.push_back(""); } refnames.back().push_back(c); } } if(refnames.back().empty()) { refnames.pop_back(); } // Be kind in.clear(); in.seekg(0, ios::beg); assert(in.good()); } /** * Read reference names from the index with basename 'in' and store * them in 'refnames'. */ template void readEbwtRefnames(const string& instr, EList& refnames) { ifstream in; // Initialize our primary and secondary input-stream fields in.open((instr + ".1." + gfm_ext).c_str(), ios_base::in | ios::binary); if(!in.is_open()) { throw GFMFileOpenException("Cannot open file " + instr); } assert(in.is_open()); assert(in.good()); assert_eq((streamoff)in.tellg(), ios::beg); readEbwtRefnames(in, refnames); } /////////////////////////////////////////////////////////////////////// // // Functions for building Ebwts // /////////////////////////////////////////////////////////////////////// /** * Join several text strings together in a way that's compatible with * the text-chunking scheme dictated by chunkRate parameter. * * The non-static member Ebwt::join additionally builds auxilliary * arrays that maintain a mapping between chunks in the joined string * and the original text strings. */ template template TStr GFM::join(EList& l, uint32_t seed) { RandomSource rand; // reproducible given same seed rand.init(seed); TStr ret; index_t guessLen = 0; for(index_t i = 0; i < l.size(); i++) { guessLen += length(l[i]); } ret.resize(guessLen); index_t off = 0; for(size_t i = 0; i < l.size(); i++) { TStr& s = l[i]; assert_gt(s.length(), 0); for(size_t j = 0; j < s.size(); j++) { ret.set(s[j], off++); } } return ret; } /** * Join several text strings together in a way that's compatible with * the text-chunking scheme dictated by chunkRate parameter. * * The non-static member Ebwt::join additionally builds auxilliary * arrays that maintain a mapping between chunks in the joined string * and the original text strings. 
*/
// Reads every record from the FASTA sources in 'l' into 's' (via
// fastaRefReadAppend), optionally rewrites C->T in every CG pair, and
// optionally appends the reverse complement of the first 'sztot' bases.
// 's' is resized to sztot (or 2 * sztot with include_rc) before reading.
// 'szs' is only consulted by debug assertions; 'rand' is seeded but not
// otherwise used.
//
// NOTE(review): `template template`, `GFM::` and `EList&` carry no template
// arguments in this listing; they appear to have been lost -- confirm
// against the upstream header.
template template void GFM::join(EList& l,
                                 EList& szs,
                                 index_t sztot,
                                 const RefReadInParams& refparams,
                                 uint32_t seed,
                                 TStr& s,
                                 bool include_rc,
                                 bool CGtoTG)
{
    RandomSource rand; // reproducible given same seed
    rand.init(seed);
    // Private copy of the read parameters, passed to the reader below.
    RefReadInParams rpcp = refparams;
    index_t guessLen = sztot;
    if(include_rc) {
        s.resize(guessLen << 1);
    } else {
        s.resize(guessLen);
    }
    ASSERT_ONLY(index_t szsi = 0);
    TIndexOffU dstoff = 0;
    for(index_t i = 0; i < l.size(); i++) {
        // For each sequence we can pull out of istream l[i]...
        assert(!l[i]->eof());
        bool first = true;
        while(!l[i]->eof()) {
            RefRecord rec = fastaRefReadAppend(*l[i], first, s, dstoff, rpcp);
            first = false;
            index_t bases = (index_t)rec.len;
            // Each record must match the pre-scanned entry in 'szs'.
            assert_eq(rec.off, szs[szsi].off);
            assert_eq(rec.len, szs[szsi].len);
            assert_eq(rec.first, szs[szsi].first);
            ASSERT_ONLY(szsi++);
            // Last statement of the loop body, so this has no effect.
            if(bases == 0) continue;
        }
    }

    // Change 'C' in CG to 'T' so that CG becomes TG
    if(CGtoTG) {
        for(TIndexOffU i = 0; i + 1 < guessLen; i++) {
            int nt1 = s[i], nt2 = s[i+1];
            // Codes 1, 2, 3 are C, G, T respectively, per the comment above.
            if(nt1 == 1 && nt2 == 2) {
                s[i] = 3;
            }
        }
    }

    // Append reverse complement
    if(include_rc) {
        for (TIndexOffU i = 0; i < guessLen; i++) {
            // Walk the forward strand backwards, complementing each base.
            int nt = s[guessLen - i - 1];
            assert_range(0, 3, nt);
            s[guessLen + i] = dnacomp[nt];
        }
    }
}

/**
 * Join several text strings together according to the text-chunking
 * scheme specified in the EbwtParams.  Ebwt fields calculated in this
 * function are written directly to disk.
 *
 * It is assumed, but not required, that the header values have already
 * been written to 'out1' before this function is called.
 *
 * The static member Ebwt::join just returns a joined version of a
 * list of strings without building any of the auxilliary arrays.
*/ template template void GFM::joinToDisk( EList& l, EList& szs, index_t sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2) { RefReadInParams rpcp = refparams; assert_gt(szs.size(), 0); assert_gt(l.size(), 0); assert_gt(sztot, 0); // Not every fragment represents a distinct sequence - many // fragments may correspond to a single sequence. Count the // number of sequences here by counting the number of "first" // fragments. this->_nPat = 0; this->_nFrag = 0; for(index_t i = 0; i < szs.size(); i++) { if(szs[i].len > 0) this->_nFrag++; if(szs[i].first && szs[i].len > 0) this->_nPat++; } assert_gt(this->_nPat, 0); assert_geq(this->_nFrag, this->_nPat); _rstarts.reset(); writeIndex(out1, this->_nPat, this->toBe()); // Allocate plen[] try { this->_plen.init(new index_t[this->_nPat], this->_nPat); } catch(bad_alloc& e) { cerr << "Out of memory allocating plen[] in Ebwt::join()" << " at " << __FILE__ << ":" << __LINE__ << endl; throw e; } // For each pattern, set plen int npat = -1; for(index_t i = 0; i < szs.size(); i++) { if(szs[i].first && szs[i].len > 0) { if(npat >= 0) { writeIndex(out1, this->plen()[npat], this->toBe()); } npat++; this->plen()[npat] = (szs[i].len + szs[i].off); } else { this->plen()[npat] += (szs[i].len + szs[i].off); } } assert_eq((index_t)npat, this->_nPat-1); writeIndex(out1, this->plen()[npat], this->toBe()); // Write the number of fragments writeIndex(out1, this->_nFrag, this->toBe()); index_t seqsRead = 0; ASSERT_ONLY(index_t szsi = 0); ASSERT_ONLY(index_t entsWritten = 0); index_t dstoff = 0; // For each filebuf for(unsigned int i = 0; i < l.size(); i++) { assert(!l[i]->eof()); bool first = true; index_t patoff = 0; // For each *fragment* (not necessary an entire sequence) we // can pull out of istream l[i]... 
while(!l[i]->eof()) { string name; // Push a new name onto our vector _refnames.push_back(""); RefRecord rec = fastaRefReadAppend( *l[i], first, ret, dstoff, rpcp, &_refnames.back()); first = false; index_t bases = rec.len; if(rec.first && rec.len > 0) { if(_refnames.back().length() == 0) { // If name was empty, replace with an index ostringstream stm; stm << seqsRead; _refnames.back() = stm.str(); } } else { // This record didn't actually start a new sequence so // no need to add a name //assert_eq(0, _refnames.back().length()); _refnames.pop_back(); } // Increment seqsRead if this is the first fragment if(rec.first && rec.len > 0) seqsRead++; assert_lt(szsi, szs.size()); assert_eq(rec.off, szs[szsi].off); assert_eq(rec.len, szs[szsi].len); assert_eq(rec.first, szs[szsi].first); assert(rec.first || rec.off > 0); ASSERT_ONLY(szsi++); assert_leq(bases, this->plen()[seqsRead-1]); // Reset the patoff if this is the first fragment if(rec.first) patoff = 0; patoff += rec.off; // add fragment's offset from end of last frag. // Adjust rpcps //index_t seq = seqsRead-1; #ifndef NDEBUG if(bases > 0) { ASSERT_ONLY(entsWritten++); } #endif // This is where rstarts elements are written to the output stream //writeU32(out1, oldRetLen, this->toBe()); // offset from beginning of joined string //writeU32(out1, seq, this->toBe()); // sequence id //writeU32(out1, patoff, this->toBe()); // offset into sequence patoff += (index_t)bases; } assert_gt(szsi, 0); l[i]->reset(); assert(!l[i]->eof()); #ifndef NDEBUG int c = l[i]->get(); assert_eq('>', c); assert(!l[i]->eof()); l[i]->reset(); assert(!l[i]->eof()); #endif } assert_eq(entsWritten, this->_nFrag); } /** * Build an Ebwt from a string 's' and its suffix array 'sa' (which * might actually be a suffix array *builder* that builds blocks of the * array on demand). The bulk of the Ebwt, i.e. the ebwt and offs * arrays, is written directly to disk. 
This is by design: keeping
 * those arrays in memory needlessly increases the footprint of the
 * building process.  Instead, we prefer to build the Ebwt directly
 * "to disk" and then read it back into memory later as necessary.
 *
 * It is assumed that the header values and join-related values (nPat,
 * plen) have already been written to 'out1' before this function
 * is called.  When this function is finished, it will have
 * additionally written ebwt, zOff, fchr, ftab and eftab to the primary
 * file and offs to the secondary file.
 *
 * Assume DNA/RNA/any alphabet with 4 or fewer elements.
 * Assume occ array entries are 32 bits each.
 *
 * @param gbwt          source of the rows; each nextRow() call yields a
 *                      character, an F bit, an M bit and a position
 * @param s             the original string (only consulted by assertions)
 * @param out1          primary output stream
 * @param out2          secondary output stream; receives sampled offsets
 * @param headerPos     stream position of the header's gbwtLen field, or a
 *                      negative value to use the default header layout at
 *                      the start of the stream
 *
 * NOTE(review): `template template`, `GFM::`, `PathGraph&`, `GFMParams&`,
 * `EList`, `pair`, `writeIndex`, `numeric_limits::max()` and
 * `reinterpret_cast(` carry no template arguments in this listing; they
 * appear to have been lost -- confirm against the upstream header.
 */
template template void GFM::buildToDisk(
                                        PathGraph& gbwt,
                                        const TStr& s,
                                        ostream& out1,
                                        ostream& out2,
                                        streampos headerPos)
{
    const GFMParams& gh = this->_gh;
    assert(gh.repOk());
    assert_lt(s.length(), gh.gbwtLen());
    assert_eq(s.length(), gh._len);
    assert_gt(gh._lineRate, 3);

    index_t gbwtLen = gh._gbwtLen;
    // Patch gbwtLen and numNodes into the already-written header, then
    // return to where we were.  With the default layout gbwtLen sits after
    // two 32-bit words and one index_t.
    streampos out1pos = out1.tellp();
    if(headerPos < 0) {
        out1.seekp(8 + sizeof(index_t));
    } else {
        out1.seekp(headerPos);
    }
    writeIndex(out1, gbwtLen, this->toBe());
    writeIndex(out1, gh._numNodes, this->toBe());
    out1.seekp(out1pos);

    index_t ftabLen = gh._ftabLen;
    index_t sideSz = gh._sideSz;
    index_t gbwtTotSz = gh._gbwtTotSz;
    index_t fchr[] = {0, 0, 0, 0, 0};
    EList ftab(EBWT_CAT);
    EList zOffs;

    // Save # of occurrences of each character as we walk along the bwt
    index_t occ[4] = {0, 0, 0, 0};
    index_t occSave[4] = {0, 0, 0, 0};
    // # of occurrences of 1 in M arrays
    index_t M_occ = 0, M_occSave = 0;
    // Location in F that corresponds to 1 in M
    index_t F_loc = 0, F_locSave = 0;

    // Record rows that should "absorb" adjacent rows in the ftab.
    try {
        VMSG_NL("Allocating ftab, absorbFtab");
        ftab.resize(ftabLen);
        ftab.fillZero();
    } catch(bad_alloc &e) {
        cerr << "Out of memory allocating ftab[] "
        << "in GFM::buildToDisk() at " << __FILE__ << ":"
        << __LINE__ << endl;
        throw e;
    }

    // Allocate the side buffer; holds a single side as its being
    // constructed and then written to disk.  Reused across all sides.
#ifdef SIXTY4_FORMAT
    EList gfmSide(EBWT_CAT);
#else
    EList gfmSide(EBWT_CAT);
#endif
    try {
        // Used to calculate ftab and eftab, but having gfm costs a lot of memory
        _gfm.init(new uint8_t[gh._gbwtTotLen], gh._gbwtTotLen, true);
#ifdef SIXTY4_FORMAT
        gfmSide.resize(sideSz >> 3);
#else
        gfmSide.resize(sideSz);
#endif
    } catch(bad_alloc &e) {
        cerr << "Out of memory allocating ebwtSide[] in "
        << "GFM::buildToDisk() at " << __FILE__ << ":"
        << __LINE__ << endl;
        throw e;
    }

    // Points to the base offset within ebwt for the side currently
    // being written
    index_t side = 0;

    // Whether we're assembling a forward or a reverse bucket
    bool fw = true;
    int sideCur = 0;

    index_t si = 0;   // string offset (chars)
    ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix
    // array (as opposed to the padding at the
    // end)
    // Iterate over packed bwt bytes
    VMSG_NL("Entering GFM loop");
    ASSERT_ONLY(index_t beforeGbwtOff = (index_t)out1.tellp());
    while(side < gbwtTotSz) {
        // Sanity-check our cursor into the side buffer
        assert_geq(sideCur, 0);
        assert_lt(sideCur, (int)gh._sideGbwtSz);
        assert_eq(0, side % sideSz); // 'side' must be on side boundary
        if(sideCur == 0) {
            memset(gfmSide.ptr(), 0, gh._sideGbwtSz);
            gfmSide[sideCur] = 0; // clear
        }
        assert_lt(side + sideCur, gbwtTotSz);
        // Iterate over bit-pairs in the si'th character of the BWT
#ifdef SIXTY4_FORMAT
        for(int bpi = 0; bpi < 32; bpi++, si++)
#else
        for(int bpi = 0; bpi < 4; bpi++, si++)
#endif
        {
            int gbwtChar = 0; // one of A, C, G, T, and Z
            int F= 0, M = 0; // either 0 or 1
            index_t pos = 0; // pos on joined string
            bool count = true;
            if(si < gbwtLen) {
                gbwt.nextRow(gbwtChar, F, M, pos);

                // (that might have triggered sa to calc next suf block)
                if(gbwtChar == 'Z') {
                    // Don't add the 'Z' in the last column to the BWT
                    // transform; we can't encode a $ (only A C T or G)
                    // and counting it as, say, an A, will mess up the
                    // LF mapping
                    gbwtChar = 0; count = false;
#ifndef NDEBUG
                    if(zOffs.size() > 0) {
                        assert_gt(si, zOffs.back());
                    }
#endif
                    zOffs.push_back(si); // remember GBWT row that corresponds to the 0th suffix
                } else {
                    gbwtChar = asc2dna[gbwtChar];
                    assert_lt(gbwtChar, 4);
                    // Update the fchr
                    fchr[gbwtChar]++;
                }
                assert_lt(F, 2);
                assert_lt(M, 2);
                if(M == 1) {
                    assert_neq(F_loc, numeric_limits::max());
                    F_loc = gbwt.nextFLocation();
#ifndef NDEBUG
                    if(F_loc > 0) {
                        assert_gt(F_loc, F_locSave);
                    }
#endif
                }
                // Suffix array offset boundary? - update offset array
                if(M == 1 && (M_occ & gh._offMask) == M_occ) {
                    assert_lt((M_occ >> gh._offRate), gh._offsLen);
                    // Write offsets directly to the secondary output
                    // stream, thereby avoiding keeping them in memory
                    writeIndex(out2, pos, this->toBe());
                }
            } else {
                // Strayed off the end of the SA, now we're just
                // padding out a bucket
#ifndef NDEBUG
                if(inSA) {
                    // Assert that we wrote all the characters in the
                    // string before now
                    assert_eq(si, gbwtLen);
                    inSA = false;
                }
#endif
                // 'A' used for padding; important that padding be
                // counted in the occ[] array
                gbwtChar = 0;
                F = M = 0;
            }
            if(count) occ[gbwtChar]++;
            if(M) M_occ++;
            // Append BWT char to bwt section of current side
            if(fw) {
                // Forward bucket: fill from least to most
#ifdef SIXTY4_FORMAT
                gfmSide[sideCur] |= ((uint64_t)gbwtChar << (bpi << 1));
                if(gbwtChar > 0) assert_gt(gfmSide[sideCur], 0);
                // To be implemented ...
                assert(false);
                cerr << "Not implemented" << endl;
                exit(1);
#else
                // Characters: 2 bits each, four per byte, in the first
                // half of the payload.
                pack_2b_in_8b(gbwtChar, gfmSide[sideCur], bpi);
                assert_eq((gfmSide[sideCur] >> (bpi*2)) & 3, gbwtChar);

                // F bits: one per row, starting at byte _sideGbwtSz / 2.
                // Two character bytes (8 rows) share one F byte.
                int F_sideCur = (gh._sideGbwtSz + sideCur) >> 1;
                int F_bpi = bpi + ((sideCur & 0x1) << 2); // Can be used as M_bpi as well
                pack_1b_in_8b(F, gfmSide[F_sideCur], F_bpi);
                assert_eq((gfmSide[F_sideCur] >> F_bpi) & 1, F);

                // M bits: same layout, a further _sideGbwtSz / 4 bytes on.
                int M_sideCur = F_sideCur + (gh._sideGbwtSz >> 2);
                pack_1b_in_8b(M, gfmSide[M_sideCur], F_bpi);
                assert_eq((gfmSide[M_sideCur] >> F_bpi) & 1, M);
#endif
            } else {
                // Backward bucket: fill from most to least
#ifdef SIXTY4_FORMAT
                gfmSide[sideCur] |= ((uint64_t)gbwtChar << ((31 - bpi) << 1));
                if(gbwtChar > 0) assert_gt(gfmSide[sideCur], 0);
                // To be implemented ...
                assert(false);
                cerr << "Not implemented" << endl;
                exit(1);
#else
                pack_2b_in_8b(gbwtChar, gfmSide[sideCur], 3-bpi);
                assert_eq((gfmSide[sideCur] >> ((3-bpi)*2)) & 3, gbwtChar);
                // To be implemented ...
                assert(false);
                cerr << "Not implemented" << endl;
                exit(1);
#endif
            }
        } // end loop over bit-pairs
        assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3] + zOffs.size()) & 3);
#ifdef SIXTY4_FORMAT
        assert_eq(0, si & 31);
#else
        assert_eq(0, si & 3);
#endif

        sideCur++;
        // The character section is full once half of the payload bytes
        // have been filled; finish the side and flush it.
        if((sideCur << 1) == (int)gh._sideGbwtSz) {
            sideCur = 0;
            index_t *uside = reinterpret_cast(gfmSide.ptr());
            // Write 'A', 'C', 'G', 'T', and '1' in M tallies
            side += sideSz;
            assert_leq(side, gh._gbwtTotSz);
            // The last six index_t slots of the side hold the tallies as
            // they stood at the *start* of this side (the *Save values).
            uside[(sideSz / sizeof(index_t))-6] = endianizeIndex(F_locSave, this->toBe());
            uside[(sideSz / sizeof(index_t))-5] = endianizeIndex(M_occSave, this->toBe());
            uside[(sideSz / sizeof(index_t))-4] = endianizeIndex(occSave[0], this->toBe());
            uside[(sideSz / sizeof(index_t))-3] = endianizeIndex(occSave[1], this->toBe());
            uside[(sideSz / sizeof(index_t))-2] = endianizeIndex(occSave[2], this->toBe());
            uside[(sideSz / sizeof(index_t))-1] = endianizeIndex(occSave[3], this->toBe());
            F_locSave = F_loc;
            M_occSave = M_occ;
            occSave[0] = occ[0];
            occSave[1] = occ[1];
            occSave[2] = occ[2];
            occSave[3] = occ[3];
            // Write backward side to primary file
            out1.write((const char *)gfmSide.ptr(), sideSz);

            // NOTE(review): in the collapsed listing this statement directly
            // follows a bare `//`.  It is treated as live code here because
            // the ftab pass below reads the side data back through gfm() --
            // confirm against upstream.
            //
            memcpy(((char*)_gfm.get()) + side - sideSz, (const char *)gfmSide.ptr(), sideSz);
        }
    }
    VMSG_NL("Exited GFM loop");
    // Assert that our loop counter got incremented right to the end
    assert_eq(side, gh._gbwtTotSz);
    // Assert that we wrote the expected amount to out1
    assert_eq(((index_t)out1.tellp() - beforeGbwtOff), gh._gbwtTotSz);
    // assert that the last thing we did was write a forward bucket

    //
    // Write zOffs to primary stream
    //
    assert_gt(zOffs.size(), 0);
    writeIndex(out1, (index_t)zOffs.size(), this->toBe());
    for(size_t i = 0; i < zOffs.size(); i++) {
        writeIndex(out1, zOffs[i], this->toBe());
    }

    //
    // Finish building fchr
    //
    // Exclusive prefix sum on fchr
    for(int i = 1; i < 4; i++) {
        fchr[i] += fchr[i-1];
    }
    assert_lt(fchr[3], gbwtLen);
    // Shift everybody up by one
    for(int i = 4; i >= 1; i--) {
        fchr[i] = fchr[i-1];
    }
    fchr[0] = 0;
    if(_verbose) {
        for(int i = 0; i < 5; i++)
            cerr << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl;
    }
    // Write fchr to primary file
    for(int i = 0; i < 5; i++) {
        writeIndex(out1, fchr[i], this->toBe());
    }
    // Keep an in-memory copy of fchr for the ftab pass below.
    _fchr.init(new index_t[5], 5, true);
    memcpy(_fchr.get(), fchr, sizeof(index_t) * 5);

    // Initialize _zGbwtByteOffs and _zGbwtBpOffs
    _zOffs = zOffs;
    postReadInit(gh);

    // Build ftab and eftab
    EList > tFtab;
    tFtab.resizeExact(ftabLen - 1);
    for(index_t i = 0; i + 1 < ftabLen; i++) {
        // 'i' encodes a string of _ftabChars characters, 2 bits each,
        // consumed from the low bits upward.
        index_t q = i;
        pair range(0, gh._gbwtLen);
        SideLocus tloc, bloc;
        SideLocus::initFromTopBot(range.first, range.second, gh, gfm(), tloc, bloc);
        index_t j = 0;
        for(; j < (index_t)gh._ftabChars; j++) {
            int nt = q & 0x3; q >>= 2;
            if(bloc.valid()) {
                range = mapGLF(tloc, bloc, nt);
            } else {
                range = mapGLF1(range.first, tloc, nt);
            }
            if(range.first == (index_t)INDEX_MAX || range.first >= range.second) {
                break;
            }
            if(range.first + 1 == range.second) {
                // Single-row range: track only the top locus from here on.
                tloc.initFromRow(range.first, gh, gfm());
                bloc.invalidate();
            } else {
                SideLocus::initFromTopBot(range.first, range.second, gh, gfm(), tloc, bloc);
            }
        }

        if(range.first >= range.second || j < (index_t)gh._ftabChars) {
            // No match: record an empty range positioned at the end of the
            // previous entry.
            if(i == 0) {
                tFtab[i].first = tFtab[i].second = 0;
            } else {
                tFtab[i].first = tFtab[i].second = tFtab[i-1].second;
            }
        } else {
            tFtab[i].first = range.first;
            tFtab[i].second = range.second;
        }

#ifndef NDEBUG
        if(gbwt.ftab.size() > i) {
            assert_eq(tFtab[i].first, gbwt.ftab[i].first);
            assert_eq(tFtab[i].second, gbwt.ftab[i].second);
        }
#endif
    }

    // Clear memory
    _gfm.reset();
    _fchr.reset();
    _zOffs.clear();
    _zGbwtByteOffs.clear();
    _zGbwtBpOffs.clear();

    //
    // Finish building ftab and build eftab
    //
    // Prefix sum on ftable
    // Count the entries whose start differs from the previous entry's end;
    // each needs a (lo, hi) pair in eftab.
    index_t eftabLen = 0;
    for(index_t i = 1; i + 1 < ftabLen; i++) {
        if(tFtab[i-1].second != tFtab[i].first) {
            eftabLen += 2;
        }
    }

    // Unsigned wrap-around check.
    if(gh._gbwtLen + (eftabLen >> 1) < gh._gbwtLen) {
        cerr << "Too many eftab entries: "
        << gh._gbwtLen << " + " << (eftabLen >> 1)
        << " > " << (index_t)INDEX_MAX << endl;
        throw 1;
    }

    EList eftab(EBWT_CAT);
    try {
        eftab.resize(eftabLen);
        eftab.fillZero();
    } catch(bad_alloc &e) {
        cerr << "Out of memory allocating eftab[] "
        << "in GFM::buildToDisk() at " << __FILE__ << ":"
        << __LINE__ << endl;
        throw e;
    }
    index_t eftabCur = 0;
    ftab[0] = tFtab[0].first;
    ftab[1] = tFtab[0].second;
    for(index_t i = 1; i + 1 < ftabLen; i++) {
        if(ftab[i] != tFtab[i].first) {
            index_t lo = ftab[i];
            index_t hi = tFtab[i].first;
            assert_lt(eftabCur*2+1, eftabLen);
            eftab[eftabCur*2] = lo;
            eftab[eftabCur*2+1] = hi;
            // one node can be shared, and one node can have at most four incoming edges
            assert_leq(lo, hi + 4);
            ftab[i] = (eftabCur++) ^ (index_t)INDEX_MAX; // insert pointer into eftab
            assert_eq(lo, GFM::ftabLo(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i));
            assert_eq(hi, GFM::ftabHi(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i));
        }
        ftab[i+1] = tFtab[i].second;
    }
#ifndef NDEBUG
    for(index_t i = 0; i + 1 < ftabLen; i++ ){
        assert_eq(tFtab[i].first, GFM::ftabHi(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i));
        assert_eq(tFtab[i].second, GFM::ftabLo(ftab.ptr(), eftab.ptr(), gbwtLen, ftabLen, eftabLen, i+1));
    }
#endif
    // Write ftab to primary file
    for(index_t i = 0; i < ftabLen; i++) {
        writeIndex(out1, ftab[i], this->toBe());
    }
    // Write eftab to primary file
    // First patch eftabLen into the header.  Relative to gbwtLen it lies
    // two index_t fields and four 32-bit fields further on.
    out1pos = out1.tellp();
    if(headerPos < 0) {
        out1.seekp(24 + sizeof(index_t) * 3);
    } else {
        out1.seekp((int)headerPos + 16 + sizeof(index_t) * 2);
    }
    writeIndex(out1, eftabLen, this->toBe());
    out1.seekp(out1pos);
    for(index_t i = 0; i < eftabLen; i++) {
        writeIndex(out1, eftab[i], this->toBe());
    }
    // Note: if you'd like to sanity-check the Ebwt, you'll have to
    // read it back into memory first!
    assert(!isInMemory());
    VMSG_NL("Exiting GFM::buildToDisk()");
}

/**
 * Build an Ebwt from a string 's' and its suffix array 'sa' (which
 * might actually be a suffix array *builder* that builds blocks of the
 * array on demand).  The bulk of the Ebwt, i.e. the ebwt and offs
 * arrays, is written directly to disk.  This is by design: keeping
 * those arrays in memory needlessly increases the footprint of the
 * building process.  Instead, we prefer to build the Ebwt directly
 * "to disk" and then read it back into memory later as necessary.
 *
 * It is assumed that the header values and join-related values (nPat,
 * plen) have already been written to 'out1' before this function
 * is called.  When this function is finished, it will have
 * additionally written ebwt, zOff, fchr, ftab and eftab to the primary
 * file and offs to the secondary file.
 *
 * Assume DNA/RNA/any alphabet with 4 or fewer elements.
 * Assume occ array entries are 32 bits each.
* * @param sa the suffix array to convert to a Ebwt * @param s the original string * @param out */ template template void GFM::buildToDisk( InorderBlockwiseSA& sa, const TStr& s, ostream& out1, ostream& out2, streampos headerPos) { const GFMParams& gh = this->_gh; assert(gh.repOk()); assert(gh.linearFM()); assert_lt(s.length(), gh.gbwtLen()); assert_eq(s.length(), gh._len); assert_gt(gh._lineRate, 3); index_t len = gh._len; index_t gbwtLen = gh._gbwtLen; assert_eq(len + 1, gbwtLen); streampos out1pos = out1.tellp(); if(headerPos < 0) { out1.seekp(8 + sizeof(index_t)); } else { out1.seekp(headerPos); } writeIndex(out1, gbwtLen, this->toBe()); writeIndex(out1, gh._numNodes, this->toBe()); out1.seekp(out1pos); index_t ftabLen = gh._ftabLen; index_t sideSz = gh._sideSz; index_t gbwtTotSz = gh._gbwtTotSz; index_t fchr[] = {0, 0, 0, 0, 0}; EList ftab(EBWT_CAT); EList zOffs; // Save # of occurrences of each character as we walk along the bwt index_t occ[4] = {0, 0, 0, 0}; index_t occSave[4] = {0, 0, 0, 0}; // Record rows that should "absorb" adjacent rows in the ftab. // The absorbed rows represent suffixes shorter than the ftabChars // cutoff. uint8_t absorbCnt = 0; EList absorbFtab(EBWT_CAT); try { VMSG_NL("Allocating ftab, absorbFtab"); ftab.resize(ftabLen); ftab.fillZero(); absorbFtab.resize(ftabLen); absorbFtab.fillZero(); } catch(bad_alloc &e) { cerr << "Out of memory allocating ftab[] or absorbFtab[] " << "in GFM::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } // Allocate the side buffer; holds a single side as its being // constructed and then written to disk. Reused across all sides. 
#ifdef SIXTY4_FORMAT EList gfmSide(EBWT_CAT); #else EList gfmSide(EBWT_CAT); #endif try { #ifdef SIXTY4_FORMAT gfmSide.resize(sideSz >> 3); #else gfmSide.resize(sideSz); #endif } catch(bad_alloc &e) { cerr << "Out of memory allocating gfmSide[] in " << "GFM::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } // Points to the base offset within ebwt for the side currently // being written index_t side = 0; // Whether we're assembling a forward or a reverse bucket bool fw = true; int sideCur = 0; // Have we skipped the '$' in the last column yet? ASSERT_ONLY(bool dollarSkipped = false); index_t si = 0; // string offset (chars) ASSERT_ONLY(index_t lastSufInt = 0); ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix // array (as opposed to the padding at the // end) // Iterate over packed bwt bytes VMSG_NL("Entering GFM loop"); ASSERT_ONLY(index_t beforeGbwtOff = (index_t)out1.tellp()); while(side < gbwtTotSz) { // Sanity-check our cursor into the side buffer assert_geq(sideCur, 0); assert_lt(sideCur, (int)gh._sideGbwtSz); assert_eq(0, side % sideSz); // 'side' must be on side boundary gfmSide[sideCur] = 0; // clear assert_lt(side + sideCur, gbwtTotSz); // Iterate over bit-pairs in the si'th character of the BWT #ifdef SIXTY4_FORMAT for(int bpi = 0; bpi < 32; bpi++, si++) #else for(int bpi = 0; bpi < 4; bpi++, si++) #endif { int bwtChar; bool count = true; if(si <= len) { // Still in the SA; extract the bwtChar index_t saElt = sa.nextSuffix(); // (that might have triggered sa to calc next suf block) if(saElt == 0) { // Don't add the '$' in the last column to the BWT // transform; we can't encode a $ (only A C T or G) // and counting it as, say, an A, will mess up the // LR mapping bwtChar = 0; count = false; ASSERT_ONLY(dollarSkipped = true); zOffs.push_back(si); // remember the SA row that // corresponds to the 0th suffix } else { bwtChar = (int)(s[saElt-1]); assert_lt(bwtChar, 4); // Update the fchr fchr[bwtChar]++; } // 
Update ftab if((len-saElt) >= (index_t)gh._ftabChars) { // Turn the first ftabChars characters of the // suffix into an integer index into ftab. The // leftmost (lowest index) character of the suffix // goes in the most significant bit pair if the // integer. index_t sufInt = 0; for(int i = 0; i < gh._ftabChars; i++) { sufInt <<= 2; assert_lt((index_t)i, len-saElt); sufInt |= (unsigned char)(s[saElt+i]); } // Assert that this prefix-of-suffix is greater // than or equal to the last one (true b/c the // suffix array is sorted) #ifndef NDEBUG if(lastSufInt > 0) assert_geq(sufInt, lastSufInt); lastSufInt = sufInt; #endif // Update ftab assert_lt(sufInt+1, ftabLen); ftab[sufInt+1]++; if(absorbCnt > 0) { // Absorb all short suffixes since the last // transition into this transition absorbFtab[sufInt] = absorbCnt; absorbCnt = 0; } } else { // Otherwise if suffix is fewer than ftabChars // characters long, then add it to the 'absorbCnt'; // it will be absorbed into the next transition assert_lt(absorbCnt, 255); absorbCnt++; } // Suffix array offset boundary? 
- update offset array if((si & gh._offMask) == si) { assert_lt((si >> gh._offRate), gh._offsLen); // Write offsets directly to the secondary output // stream, thereby avoiding keeping them in memory writeIndex(out2, saElt, this->toBe()); } } else { // Strayed off the end of the SA, now we're just // padding out a bucket #ifndef NDEBUG if(inSA) { // Assert that we wrote all the characters in the // string before now assert_eq(si, len+1); inSA = false; } #endif // 'A' used for padding; important that padding be // counted in the occ[] array bwtChar = 0; } if(count) occ[bwtChar]++; // Append BWT char to bwt section of current side if(fw) { // Forward bucket: fill from least to most #ifdef SIXTY4_FORMAT ebwtSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1)); if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0); #else pack_2b_in_8b(bwtChar, gfmSide[sideCur], bpi); assert_eq((gfmSide[sideCur] >> (bpi*2)) & 3, bwtChar); #endif } else { // Backward bucket: fill from most to least #ifdef SIXTY4_FORMAT ebwtSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1)); if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0); #else pack_2b_in_8b(bwtChar, gfmSide[sideCur], 3-bpi); assert_eq((gfmSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar); #endif } } // end loop over bit-pairs assert_eq(dollarSkipped ? 
3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3); #ifdef SIXTY4_FORMAT assert_eq(0, si & 31); #else assert_eq(0, si & 3); #endif sideCur++; if(sideCur == (int)gh._sideGbwtSz) { sideCur = 0; index_t *uside = reinterpret_cast(gfmSide.ptr()); // Write 'A', 'C', 'G', 'T', and '1' in M tallies side += sideSz; assert_leq(side, gh._gbwtTotSz); uside[(sideSz / sizeof(index_t))-4] = endianizeIndex(occSave[0], this->toBe()); uside[(sideSz / sizeof(index_t))-3] = endianizeIndex(occSave[1], this->toBe()); uside[(sideSz / sizeof(index_t))-2] = endianizeIndex(occSave[2], this->toBe()); uside[(sideSz / sizeof(index_t))-1] = endianizeIndex(occSave[3], this->toBe()); occSave[0] = occ[0]; occSave[1] = occ[1]; occSave[2] = occ[2]; occSave[3] = occ[3]; // Write backward side to primary file out1.write((const char *)gfmSide.ptr(), sideSz); } } VMSG_NL("Exited GFM loop"); if(absorbCnt > 0) { // Absorb any trailing, as-yet-unabsorbed short suffixes into // the last element of ftab absorbFtab[ftabLen-1] = absorbCnt; } // Assert that our loop counter got incremented right to the end assert_eq(side, gh._gbwtTotSz); // Assert that we wrote the expected amount to out1 assert_eq(((index_t)out1.tellp() - beforeGbwtOff), gh._gbwtTotSz); // assert that the last thing we did was write a forward bucket // // Write zOffs to primary stream // assert_eq(zOffs.size(), 1); writeIndex(out1, (index_t)zOffs.size(), this->toBe()); for(size_t i = 0; i < zOffs.size(); i++) { assert_neq(zOffs[i], (index_t)OFF_MASK); writeIndex(out1, zOffs[i], this->toBe()); } // // Finish building fchr // // Exclusive prefix sum on fchr for(int i = 1; i < 4; i++) { fchr[i] += fchr[i-1]; } assert_lt(fchr[3], gbwtLen); // Shift everybody up by one for(int i = 4; i >= 1; i--) { fchr[i] = fchr[i-1]; } fchr[0] = 0; if(_verbose) { for(int i = 0; i < 5; i++) cerr << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl; } // Write fchr to primary file for(int i = 0; i < 5; i++) { writeIndex(out1, fchr[i], this->toBe()); } // // Finish 
building ftab and build eftab // // Prefix sum on ftable index_t eftabLen = 0; assert_eq(0, absorbFtab[0]); for(index_t i = 1; i < ftabLen; i++) { if(absorbFtab[i] > 0) eftabLen += 2; } assert_leq(eftabLen, (index_t)gh._ftabChars*2); eftabLen = gh._ftabChars*2; EList eftab(EBWT_CAT); try { eftab.resize(eftabLen); eftab.fillZero(); } catch(bad_alloc &e) { cerr << "Out of memory allocating eftab[] " << "in GFM::buildToDisk() at " << __FILE__ << ":" << __LINE__ << endl; throw e; } index_t eftabCur = 0; for(index_t i = 1; i < ftabLen; i++) { index_t lo = ftab[i] + GFM::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i-1); if(absorbFtab[i] > 0) { // Skip a number of short pattern indicated by absorbFtab[i] index_t hi = lo + absorbFtab[i]; assert_lt(eftabCur*2+1, eftabLen); eftab[eftabCur*2] = lo; eftab[eftabCur*2+1] = hi; ftab[i] = (eftabCur++) ^ (index_t)OFF_MASK; // insert pointer into eftab assert_eq(lo, GFM::ftabLo(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i)); assert_eq(hi, GFM::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i)); } else { ftab[i] = lo; } } assert_eq(GFM::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, ftabLen-1), len+1); // Write ftab to primary file for(index_t i = 0; i < ftabLen; i++) { writeIndex(out1, ftab[i], this->toBe()); } // Write eftab to primary file out1pos = out1.tellp(); if(headerPos < 0) { out1.seekp(24 + sizeof(index_t) * 3); } else { out1.seekp((int)headerPos + 16 + sizeof(index_t) * 2); } writeIndex(out1, eftabLen, this->toBe()); out1.seekp(out1pos); for(index_t i = 0; i < eftabLen; i++) { writeIndex(out1, eftab[i], this->toBe()); } // Note: if you'd like to sanity-check the Ebwt, you'll have to // read it back into memory first! assert(!isInMemory()); VMSG_NL("Exiting GFM::buildToDisk()"); } extern string gLastIOErrMsg; /* Checks whether a call to read() failed or not. 
*/ inline bool is_read_err(int fdesc, ssize_t ret, size_t count) { if (ret < 0) { std::stringstream sstm; sstm << "ERRNO: " << errno << " ERR Msg:" << strerror(errno) << std::endl; gLastIOErrMsg = sstm.str(); return true; } return false; } /* Checks whether a call to fread() failed or not. */ inline bool is_fread_err(FILE* file_hd, size_t ret, size_t count) { if (ferror(file_hd)) { gLastIOErrMsg = "Error Reading File!"; return true; } return false; } /////////////////////////////////////////////////////////////////////// // // Functions for searching Ebwts // (But most of them are defined in the header) // /////////////////////////////////////////////////////////////////////// /** * Take an offset into the joined text and translate it into the * reference of the index it falls on, the offset into the reference, * and the length of the reference. Use a binary search through the * sorted list of reference fragment ranges t */ template bool GFM::joinedToTextOff( index_t qlen, index_t off, index_t& tidx, index_t& textoff, index_t& tlen, bool rejectStraddle, bool& straddled) const { assert(rstarts() != NULL); // must have loaded rstarts index_t top = 0; index_t bot = _nFrag; // 1 greater than largest addressable element index_t elt = (index_t)INDEX_MAX; // Begin binary search while(true) { index_t oldelt = elt; elt = top + ((bot - top) >> 1); if(oldelt == elt) { tidx = (index_t)INDEX_MAX; return false; } index_t lower = rstarts()[elt*3]; index_t upper; if(elt == _nFrag-1) { upper = _gh._len; } else { upper = rstarts()[((elt+1)*3)]; } assert_gt(upper, lower); index_t fraglen = upper - lower; if(lower <= off) { if(upper > off) { // not last element, but it's within // off is in this range; check if it falls off if(off + qlen > upper) { straddled = true; if(rejectStraddle) { // it falls off; signal no-go and return tidx = (index_t)INDEX_MAX; return false; } } // This is the correct text idx whether the index is // forward or reverse tidx = rstarts()[(elt*3)+1]; 
assert_lt(tidx, this->_nPat); assert_leq(fraglen, this->plen()[tidx]); // it doesn't fall off; now calculate textoff. // Initially it's the number of characters that precede // the alignment in the fragment index_t fragoff = off - rstarts()[(elt*3)]; if(!this->fw_) { fragoff = fraglen - fragoff - 1; fragoff -= (qlen-1); } // Add the alignment's offset into the fragment // ('fragoff') to the fragment's offset within the text textoff = fragoff + rstarts()[(elt*3)+2]; assert_lt(textoff, this->plen()[tidx]); break; // done with binary search } else { // 'off' belongs somewhere in the region between elt // and bot top = elt; } } else { // 'off' belongs somewhere in the region between top and // elt bot = elt; } // continue with binary search } tlen = this->plen()[tidx]; return true; } template bool GFM::textOffToJoined( index_t tid, index_t textoff, index_t& off) const { assert(rstarts() != NULL); // must have loaded rstarts index_t top = 0; index_t bot = _nFrag; // 1 greater than largest addressable element index_t elt = (index_t)INDEX_MAX; // Begin binary search while(true) { ASSERT_ONLY(index_t oldelt = elt); elt = top + ((bot - top) >> 1); assert_neq(oldelt, elt); // must have made progress index_t elt_tid = rstarts()[elt*3 + 1]; if(elt_tid == tid) { while(true) { if(tid != rstarts()[elt*3+1]) { return false; } if(rstarts()[elt*3 + 2] <= textoff) break; if(elt == 0) return false; elt--; } while(true) { assert_leq(rstarts()[elt*3+2], textoff); if(elt + 1 == _nFrag || tid + 1 == rstarts()[(elt+1)*3 + 1] || textoff < rstarts()[(elt+1)*3 + 2]) { off = rstarts()[elt*3] + (textoff - rstarts()[elt*3 + 2]); if(elt + 1 < _nFrag && tid == rstarts()[(elt+1)*3 + 1] && off >= rstarts()[(elt+1)*3]) { return false; } break; } elt++; } break; // done with binary search } else if(elt_tid < tid) { top = elt; } else { bot = elt; } // continue with binary search } return true; } /** * Walk 'steps' steps to the left and return the row arrived at. 
If we * walk through the dollar sign, return 0xffffffff. */ template index_t GFM::walkLeft(index_t row, index_t steps) const { assert(offs() != NULL); assert_neq((index_t)INDEX_MAX, row); SideLocus l; if(steps > 0) l.initFromRow(row, _gh, gfm()); while(steps > 0) { for(index_t i = 0; i < _zOffs.size(); i++) { if(row == _zOffs[i]) return (index_t)INDEX_MAX; } pair range = this->mapGLF1(row, l, (pair *)NULL ASSERT_ONLY(, false)); index_t newrow = range.first; assert_neq((index_t)INDEX_MAX, newrow); assert_neq(newrow, row); row = newrow; steps--; if(steps > 0) l.initFromRow(row, _gh, gfm()); } return row; } /** * Resolve the reference offset of the BW element 'elt'. */ template index_t GFM::getOffset(index_t row, index_t node) const { assert(offs() != NULL); assert_neq((index_t)INDEX_MAX, row); for(index_t i = 0; i < _zOffs.size(); i++) { if(row == _zOffs[i]) return 0; } if((node & _gh._offMask) == node) { index_t off = this->offs()[node >> _gh._offRate]; if(off != (index_t)INDEX_MAX) return off; } index_t jumps = 0; SideLocus l; l.initFromRow(row, _gh, gfm()); while(true) { pair node_range(0, 0); pair range = this->mapGLF1(row, l, &node_range ASSERT_ONLY(, false)); index_t newrow = range.first; jumps++; assert_neq((index_t)INDEX_MAX, newrow); assert_neq(newrow, row); row = newrow; for(index_t i = 0; i < _zOffs.size(); i++) { if(row == _zOffs[i]) return jumps; } if((node_range.first & _gh._offMask) == node_range.first) { index_t off = this->offs()[node_range.first >> _gh._offRate]; if(off != (index_t)INDEX_MAX) return jumps + off; } l.initFromRow(row, _gh, gfm()); } } /** * Resolve the reference offset of the BW element 'elt' such that * the offset returned is at the right-hand side of the forward * reference substring involved in the hit. 
*/ template index_t GFM::getOffset( index_t elt, bool fw, index_t hitlen) const { index_t off = getOffset(elt); assert_neq((index_t)INDEX_MAX, off); if(!fw) { assert_lt(off, _gh._len); off = _gh._len - off - 1; assert_geq(off, hitlen-1); off -= (hitlen-1); assert_lt(off, _gh._len); } return off; } /** * Returns true iff the index contains the given string (exactly). The given * string must contain only unambiguous characters. TODO: support ambiguous * characters in 'str'. */ template bool GFM::contains( const BTDnaString& str, index_t *otop, index_t *obot) const { assert(isInMemory()); SideLocus tloc, bloc; if(str.empty()) { if(otop != NULL && obot != NULL) *otop = *obot = 0; return true; } int c = str[str.length()-1]; assert_range(0, 4, c); index_t top = 0, bot = 0; if(c < 4) { top = fchr()[c]; bot = fchr()[c+1]; } else { bool set = false; for(int i = 0; i < 4; i++) { if(fchr()[c] < fchr()[c+1]) { if(set) { return false; } else { set = true; top = fchr()[c]; bot = fchr()[c+1]; } } } } assert_geq(bot, top); tloc.initFromRow(top, gh(), gfm()); bloc.initFromRow(bot, gh(), gfm()); ASSERT_ONLY(index_t lastDiff = bot - top); for(int64_t i = (int64_t)str.length()-2; i >= 0; i--) { c = str[i]; assert_range(0, 4, c); if(c <= 3) { top = mapLF(tloc, c); bot = mapLF(bloc, c); } else { index_t sz = bot - top; int c1 = mapLF1(top, tloc ASSERT_ONLY(, false)); bot = mapLF(bloc, c1); assert_leq(bot - top, sz); if(bot - top < sz) { // Encountered an N and could not proceed through it because // there was more than one possible nucleotide we could replace // it with return false; } } assert_geq(bot, top); assert_leq(bot-top, lastDiff); ASSERT_ONLY(lastDiff = bot-top); if(i > 0) { tloc.initFromRow(top, gh(), gfm()); bloc.initFromRow(bot, gh(), gfm()); } } if(otop != NULL && obot != NULL) { *otop = top; *obot = bot; } return bot > top; } /////////////////////////////////////////////////////////////////////// // // Functions for reading and writing Ebwts // 
/////////////////////////////////////////////////////////////////////// /** * Read an Ebwt from file with given filename. */ template void GFM::readIntoMemory( int needEntireRev, bool loadSASamp, bool loadFtab, bool loadRstarts, bool justHeader, GFMParams *params, bool mmSweep, bool loadNames, bool startVerbose, bool subIndex) { bool switchEndian; // dummy; caller doesn't care #ifdef BOWTIE_MM char *mmFile[] = { NULL, NULL }; #endif if(_in1Str.length() > 0 && !subIndex) { if(_verbose || startVerbose) { cerr << " About to open input files: "; logTime(cerr); } // Initialize our primary and secondary input-stream fields if(_in1 != NULL) fclose(_in1); if(_verbose || startVerbose) cerr << "Opening \"" << _in1Str.c_str() << "\"" << endl; if((_in1 = fopen(_in1Str.c_str(), "rb")) == NULL) { cerr << "Could not open index file " << _in1Str.c_str() << endl; } if(loadSASamp) { if(_in2 != NULL) fclose(_in2); if(_verbose || startVerbose) cerr << "Opening \"" << _in2Str.c_str() << "\"" << endl; if((_in2 = fopen(_in2Str.c_str(), "rb")) == NULL) { cerr << "Could not open index file " << _in2Str.c_str() << endl; } } if(_verbose || startVerbose) { cerr << " Finished opening input files: "; logTime(cerr); } #ifdef BOWTIE_MM if(_useMm /*&& !justHeader*/) { const char *names[] = {_in1Str.c_str(), _in2Str.c_str()}; int fds[] = { fileno(_in1), fileno(_in2) }; for(int i = 0; i < (loadSASamp ? 
2 : 1); i++) { if(_verbose || startVerbose) { cerr << " Memory-mapping input file " << (i+1) << ": "; logTime(cerr); } struct stat sbuf; if (stat(names[i], &sbuf) == -1) { perror("stat"); cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl; throw 1; } mmFile[i] = (char*)mmap((void *)0, (size_t)sbuf.st_size, PROT_READ, MAP_SHARED, fds[(size_t)i], 0); if(mmFile[i] == (void *)(-1)) { perror("mmap"); cerr << "Error: Could not memory-map the index file " << names[i] << endl; throw 1; } if(mmSweep) { int sum = 0; for(off_t j = 0; j < sbuf.st_size; j += 1024) { sum += (int) mmFile[i][j]; } if(startVerbose) { cerr << " Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": "; logTime(cerr); } } } mmFile1_ = mmFile[0]; mmFile2_ = loadSASamp ? mmFile[1] : NULL; } #endif } #ifdef BOWTIE_MM else if(_useMm && !justHeader) { mmFile[0] = mmFile1_; mmFile[1] = mmFile2_; } if(_useMm && !justHeader) { assert(mmFile[0] == mmFile1_); assert(mmFile[1] == mmFile2_); } #endif if(_verbose || startVerbose) { cerr << " Reading header: "; logTime(cerr); } // Read endianness hints from both streams size_t bytesRead = 0; if(!subIndex) { switchEndian = false; uint32_t one = readU32(_in1, switchEndian); // 1st word of primary stream bytesRead += 4; if(loadSASamp) { #ifndef NDEBUG assert_eq(one, readU32(_in2, switchEndian)); // should match! #else readU32(_in2, switchEndian); #endif } if(one != 1) { assert_eq((1u<<24), one); assert_eq(1, endianSwapU32(one)); switchEndian = true; } _toBigEndian = switchEndian; // Can't switch endianness and use memory-mapped files; in order to // support this, someone has to modify the file to switch // endiannesses appropriately, and we can't do this inside Bowtie // or we might be setting up a race condition with other processes. 
if(switchEndian && _useMm) { cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl; throw 1; } // Reads header entries one by one from primary stream int index_version = (int)readU32(_in1, switchEndian); bytesRead += 4; int major_index_version, minor_index_version; string index_version_extra; readIndexVersion(index_version, major_index_version, minor_index_version, index_version_extra); int major_program_version, minor_program_version; string program_version_extra; readProgramVersion(major_program_version, minor_program_version, program_version_extra); if(major_program_version < major_index_version || (major_program_version == major_index_version && minor_program_version < minor_index_version)) { cerr << "Warning: the current version of HISAT2 (" << HISAT2_VERSION << ") is older than the version (2." << major_index_version << "." << minor_index_version; if(index_version_extra.length() > 0) { cerr << "-" << index_version_extra; } cerr << ") used to build the index." << endl; cerr << " Users are strongly recommended to update HISAT2 to the latest version." << endl; } } else { switchEndian = _toBigEndian; } index_t len = readIndex(_in1, switchEndian); bytesRead += sizeof(index_t); index_t gbwtLen = readIndex(_in1, switchEndian); bytesRead += sizeof(index_t); assert_lt(len, gbwtLen); index_t numNodes = readIndex(_in1, switchEndian); bytesRead += sizeof(index_t); int32_t lineRate = readI32(_in1, switchEndian); bytesRead += 4; /*int32_t linesPerSide =*/ readI32(_in1, switchEndian); bytesRead += 4; int32_t offRate = readI32(_in1, switchEndian); bytesRead += 4; // TODO: add isaRate to the actual file format (right now, the // user has to tell us whether there's an ISA sample and what the // sampling rate is. 
int32_t ftabChars = readI32(_in1, switchEndian); bytesRead += 4; index_t eftabLen = readIndex(_in1, switchEndian); bytesRead += sizeof(index_t); // chunkRate was deprecated in an earlier version of Bowtie; now // we use it to hold flags. int32_t flags = readI32(_in1, switchEndian); bool entireRev = false; if(flags < 0 && (((-flags) & GFM_ENTIRE_REV) == 0)) { if(needEntireRev != -1 && needEntireRev != 0) { cerr << "Error: This index is compatible with 0.* versions of Bowtie, but not with 2.*" << endl << "versions. Please build or download a version of the index that is compitble" << endl << "with Bowtie 2.* (i.e. built with bowtie-build 2.* or later)" << endl; throw 1; } } else entireRev = true; bytesRead += 4; // Create a new EbwtParams from the entries read from primary stream GFMParams *gh; bool deleteGh = false; if(params != NULL) { params->init(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireRev); if(_verbose || startVerbose) params->print(cerr); gh = params; } else { gh = new GFMParams(len, gbwtLen, numNodes, lineRate, offRate, ftabChars, eftabLen, entireRev); deleteGh = true; } // Set up overridden suffix-array-sample parameters index_t offsLen = gh->_offsLen; index_t offRateDiff = 0; index_t offsLenSampled = offsLen; if(_overrideOffRate > offRate) { offRateDiff = _overrideOffRate - offRate; } if(offRateDiff > 0) { offsLenSampled >>= offRateDiff; if((offsLen & ~(((index_t)INDEX_MAX) << offRateDiff)) != 0) { offsLenSampled++; } } // Can't override the offrate or isarate and use memory-mapped // files; ultimately, all processes need to copy the sparser sample // into their own memory spaces. 
#if 0 if(_useMm && (offRateDiff)) { cerr << "Error: Can't use memory-mapped files when the offrate is overridden" << endl; throw 1; } #endif // Read nPat from primary stream this->_nPat = readIndex(_in1, switchEndian); bytesRead += sizeof(index_t); _plen.reset(); // Read plen from primary stream if(_useMm) { #ifdef BOWTIE_MM _plen.init((index_t*)(mmFile[0] + bytesRead), _nPat, false); bytesRead += _nPat*sizeof(index_t); fseek(_in1, _nPat*sizeof(index_t), SEEK_CUR); #endif } else { try { if(_verbose || startVerbose) { cerr << "Reading plen (" << this->_nPat << "): "; logTime(cerr); } _plen.init(new index_t[_nPat], _nPat, true); if(switchEndian) { for(index_t i = 0; i < this->_nPat; i++) { plen()[i] = readIndex(_in1, switchEndian); } } else { size_t r = MM_READ(_in1, (void*)(plen()), _nPat*sizeof(index_t)); if(r != (size_t)(_nPat*sizeof(index_t))) { cerr << "Error reading _plen[] array: " << r << ", " << _nPat*sizeof(index_t) << endl; throw 1; } } } catch(bad_alloc& e) { cerr << "Out of memory allocating plen[] in Ebwt::read()" << " at " << __FILE__ << ":" << __LINE__ << endl; throw e; } } // TODO: I'm not consistent on what "header" means. Here I'm using // "header" to mean everything that would exist in memory if we // started to build the Ebwt but stopped short of the build*() step // (i.e. everything up to and including join()). 
if(justHeader) { // Be kind if(deleteGh) delete gh; #ifdef BOWTIE_MM fseek(_in1, 0, SEEK_SET); if(loadSASamp) fseek(_in2, 0, SEEK_SET); #else rewind(_in1); if(loadSASamp) rewind(_in2); #endif return; } bool shmemLeader; this->_nFrag = readIndex(_in1, switchEndian); bytesRead += sizeof(index_t); if(_verbose || startVerbose) { cerr << "Reading rstarts (" << this->_nFrag*3 << "): "; logTime(cerr); } assert_geq(this->_nFrag, this->_nPat); _rstarts.reset(); if(loadRstarts) { if(_useMm) { #ifdef BOWTIE_MM _rstarts.init((index_t*)(mmFile[0] + bytesRead), _nFrag*3, false); bytesRead += this->_nFrag*sizeof(index_t)*3; fseek(_in1, this->_nFrag*sizeof(index_t)*3, SEEK_CUR); #endif } else { _rstarts.init(new index_t[_nFrag*3], _nFrag*3, true); if(switchEndian) { for(size_t i = 0; i < (size_t)(this->_nFrag*3); i += 3) { // fragment starting position in joined reference // string, text id, and fragment offset within text this->rstarts()[i] = readIndex(_in1, switchEndian); this->rstarts()[i+1] = readIndex(_in1, switchEndian); this->rstarts()[i+2] = readIndex(_in1, switchEndian); } } else { size_t r = MM_READ(_in1, (void *)rstarts(), this->_nFrag*sizeof(index_t)*3); if(r != (size_t)(this->_nFrag*sizeof(index_t)*3)) { cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*sizeof(index_t)*3) << endl; throw 1; } } } } else { // Skip em assert(rstarts() == NULL); bytesRead += this->_nFrag*sizeof(index_t)*3; fseek(_in1, this->_nFrag*sizeof(index_t)*3, SEEK_CUR); } _gfm.reset(); if(_useMm) { #ifdef BOWTIE_MM _gfm.init((uint8_t*)(mmFile[0] + bytesRead), gh->_gbwtTotLen, false); bytesRead += gh->_gbwtTotLen; fseek(_in1, gh->_gbwtTotLen, SEEK_CUR); #endif } else { // Allocate ebwt (big allocation) if(_verbose || startVerbose) { cerr << "Reading ebwt (" << gh->_gbwtTotLen << "): "; logTime(cerr); } bool shmemLeader = true; if(useShmem_) { uint8_t *tmp = NULL; shmemLeader = ALLOC_SHARED_U8( (_in1Str + "[ebwt]"), gh->_gbwtTotLen, &tmp, "gfm[]", (_verbose || startVerbose)); 
assert(tmp != NULL); _gfm.init(tmp, gh->_gbwtTotLen, false); if(_verbose || startVerbose) { cerr << " shared-mem " << (shmemLeader ? "leader" : "follower") << endl; } } else { try { _gfm.init(new uint8_t[gh->_gbwtTotLen], gh->_gbwtTotLen, true); } catch(bad_alloc& e) { cerr << "Out of memory allocating the gfm[] array for the Bowtie index. Please try" << endl << "again on a computer with more memory." << endl; throw 1; } } if(shmemLeader) { // Read ebwt from primary stream uint64_t bytesLeft = gh->_gbwtTotLen; char *pgbwt = (char*)this->gfm(); while (bytesLeft>0){ size_t r = MM_READ(this->_in1, (void *)pgbwt, bytesLeft); if(MM_IS_IO_ERR(this->_in1, r, bytesLeft)) { cerr << "Error reading _ebwt[] array: " << r << ", " << bytesLeft << endl; throw 1; } pgbwt += r; bytesLeft -= r; } if(switchEndian) { uint8_t *side = this->gfm(); for(size_t i = 0; i < gh->_numSides; i++) { index_t *cums = reinterpret_cast(side + gh->_sideSz - sizeof(index_t)*2); cums[0] = endianSwapIndex(cums[0]); cums[1] = endianSwapIndex(cums[1]); side += this->_gh._sideSz; } } #ifdef BOWTIE_SHARED_MEM if(useShmem_) NOTIFY_SHARED(gfm(), gh->_gbwtTotLen); #endif } else { // Seek past the data and wait until master is finished fseek(_in1, gh->_gbwtTotLen, SEEK_CUR); #ifdef BOWTIE_SHARED_MEM if(useShmem_) WAIT_SHARED(gfm(), gh->_gbwtTotLen); #endif } } // Read zOff from primary stream _zOffs.clear(); index_t num_zOffs = readIndex(_in1, switchEndian); bytesRead += sizeof(index_t); for(index_t i = 0; i < num_zOffs; i++) { index_t zOff = readIndex(_in1, switchEndian); bytesRead += sizeof(index_t); assert_lt(zOff, gbwtLen); _zOffs.push_back(zOff); } try { // Read fchr from primary stream if(_verbose || startVerbose) cerr << "Reading fchr (5)" << endl; _fchr.reset(); if(_useMm) { #ifdef BOWTIE_MM _fchr.init((index_t*)(mmFile[0] + bytesRead), 5, false); bytesRead += 5*sizeof(index_t); fseek(_in1, 5*sizeof(index_t), SEEK_CUR); #endif } else { _fchr.init(new index_t[5], 5, true); for(int i = 0; i < 5; i++) { 
this->fchr()[i] = readIndex(_in1, switchEndian); assert_leq(this->fchr()[i], gbwtLen); assert(i <= 0 || this->fchr()[i] >= this->fchr()[i-1]); } } assert_gt(this->fchr()[4], this->fchr()[0]); // Read ftab from primary stream if(_verbose || startVerbose) { if(loadFtab) { cerr << "Reading ftab (" << gh->_ftabLen << "): "; logTime(cerr); } else { cerr << "Skipping ftab (" << gh->_ftabLen << "): "; } } _ftab.reset(); if(loadFtab) { if(_useMm) { #ifdef BOWTIE_MM _ftab.init((index_t*)(mmFile[0] + bytesRead), gh->_ftabLen, false); bytesRead += gh->_ftabLen*sizeof(index_t); fseek(_in1, gh->_ftabLen*sizeof(index_t), SEEK_CUR); #endif } else { _ftab.init(new index_t[gh->_ftabLen], gh->_ftabLen, true); if(switchEndian) { for(size_t i = 0; i < gh->_ftabLen; i++) this->ftab()[i] = readIndex(_in1, switchEndian); } else { size_t r = MM_READ(_in1, (void *)ftab(), gh->_ftabLen*sizeof(index_t)); if(r != (size_t)(gh->_ftabLen*sizeof(index_t))) { cerr << "Error reading _ftab[] array: " << r << ", " << (gh->_ftabLen*sizeof(index_t)) << endl; throw 1; } } } // Read etab from primary stream if(_verbose || startVerbose) { if(loadFtab) { cerr << "Reading eftab (" << gh->_eftabLen << "): "; logTime(cerr); } else { cerr << "Skipping eftab (" << gh->_eftabLen << "): "; } } _eftab.reset(); if(_useMm) { #ifdef BOWTIE_MM _eftab.init((index_t*)(mmFile[0] + bytesRead), gh->_eftabLen, false); bytesRead += gh->_eftabLen*sizeof(index_t); fseek(_in1, gh->_eftabLen*sizeof(index_t), SEEK_CUR); #endif } else { _eftab.init(new index_t[gh->_eftabLen], gh->_eftabLen, true); if(switchEndian) { for(size_t i = 0; i < gh->_eftabLen; i++) this->eftab()[i] = readIndex(_in1, switchEndian); } else { size_t r = MM_READ(_in1, (void *)this->eftab(), gh->_eftabLen*sizeof(index_t)); if(r != (size_t)(gh->_eftabLen*sizeof(index_t))) { cerr << "Error reading _eftab[] array: " << r << ", " << (gh->_eftabLen*sizeof(index_t)) << endl; throw 1; } } } for(index_t i = 0; i < gh->_eftabLen; i++) { if(i > 0 && this->eftab()[i] > 
0) { assert_geq(this->eftab()[i] + 4, this->eftab()[i-1]); } else if(i > 0 && this->eftab()[i-1] == 0) { assert_eq(0, this->eftab()[i]); } } } else { assert(ftab() == NULL); assert(eftab() == NULL); // Skip ftab bytesRead += gh->_ftabLen*sizeof(index_t); fseek(_in1, gh->_ftabLen*sizeof(index_t), SEEK_CUR); // Skip eftab bytesRead += sizeof(index_t); bytesRead += gh->_eftabLen*sizeof(index_t); fseek(_in1, gh->_eftabLen*sizeof(index_t), SEEK_CUR); } } catch(bad_alloc& e) { cerr << "Out of memory allocating fchr[], ftab[] or eftab[] arrays for the Bowtie index." << endl << "Please try again on a computer with more memory." << endl; throw 1; } // Read reference sequence names from primary index file (or not, // if --refidx is specified) if(loadNames) { while(true) { char c = '\0'; if(MM_READ(_in1, (void *)(&c), (size_t)1) != (size_t)1) break; bytesRead++; if(c == '\0') break; else if(c == '\n') { this->_refnames.push_back(""); } else { if(this->_refnames.size() == 0) { this->_refnames.push_back(""); } this->_refnames.back().push_back(c); } } } _offs.reset(); if(loadSASamp) { bytesRead = 4; // reset for secondary index file (already read 1-sentinel) shmemLeader = true; if(_verbose || startVerbose) { cerr << "Reading offs (" << offsLenSampled << " " << std::setw(2) << sizeof(index_t)*8 << "-bit words): "; logTime(cerr); } if(!_useMm) { if(!useShmem_) { // Allocate offs_ try { _offs.init(new index_t[offsLenSampled], offsLenSampled, true); } catch(bad_alloc& e) { cerr << "Out of memory allocating the offs[] array for the Bowtie index." << endl << "Please try again on a computer with more memory." 
<< endl; throw 1; } } else { index_t *tmp = NULL; shmemLeader = ALLOC_SHARED_U32( (_in2Str + "[offs]"), offsLenSampled*sizeof(index_t), &tmp, "offs", (_verbose || startVerbose)); _offs.init((index_t*)tmp, offsLenSampled, false); } } if(_overrideOffRate < 32) { if(shmemLeader) { // Allocate offs (big allocation) if(switchEndian || offRateDiff > 0) { assert(!_useMm); const index_t blockMaxSz = (index_t)(2 * 1024 * 1024); // 2 MB block size const index_t blockMaxSzU = (blockMaxSz / sizeof(index_t)); // # U32s per block char *buf; try { buf = new char[blockMaxSz]; } catch(std::bad_alloc& e) { cerr << "Error: Out of memory allocating part of _offs array: '" << e.what() << "'" << endl; throw e; } for(index_t i = 0; i < offsLen; i += blockMaxSzU) { index_t block = min