/* * Copyright 2011, Ben Langmead * * This file is part of Bowtie 2. * * Bowtie 2 is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Bowtie 2 is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Bowtie 2. If not, see . */ #include #include #include "reference.h" #include "mem_ids.h" using namespace std; /** * Load from .3.gfm_ext/.4.gfm_ext HISAT2 index files. */ BitPairReference::BitPairReference( const string& in, const EList* included, bool color, bool sanity, EList* infiles, EList >* origs, bool infilesSeq, bool useMm, bool useShmem, bool mmSweep, bool verbose, bool startVerbose) : buf_(NULL), sanityBuf_(NULL), loaded_(true), sanity_(sanity), useMm_(useMm), useShmem_(useShmem), verbose_(verbose) { string s3 = in + ".3." + gfm_ext; string s4 = in + ".4." + gfm_ext; FILE *f3, *f4; if((f3 = fopen(s3.c_str(), "rb")) == NULL) { cerr << "Could not open reference-string index file " << s3 << " for reading." << endl; cerr << "This is most likely because your index was built with an older version" << endl << "(<= 0.9.8.1) of bowtie-build. Please re-run bowtie-build to generate a new" << endl << "index (or download one from the Bowtie website) and try again." << endl; loaded_ = false; return; } if((f4 = fopen(s4.c_str(), "rb")) == NULL) { cerr << "Could not open reference-string index file " << s4 << " for reading." << endl; loaded_ = false; return; } #ifdef BOWTIE_MM char *mmFile = NULL; if(useMm_) { if(verbose_ || startVerbose) { cerr << " Memory-mapping reference index file " << s4.c_str() << ": "; logTime(cerr); } struct stat sbuf; if (stat(s4.c_str(), &sbuf) == -1) { perror("stat"); cerr << "Error: Could not stat index file " << s4.c_str() << " prior to memory-mapping" << endl; throw 1; } mmFile = (char*)mmap((void *)0, (size_t)sbuf.st_size, PROT_READ, MAP_SHARED, fileno(f4), 0); if(mmFile == (void *)(-1) || mmFile == NULL) { perror("mmap"); cerr << "Error: Could not memory-map the index file " << s4.c_str() << endl; throw 1; } if(mmSweep) { TIndexOff sum = 0; for(off_t i = 0; i < sbuf.st_size; i += 1024) { sum += (TIndexOff) mmFile[i]; } if(startVerbose) { cerr << " Swept the memory-mapped ref index file; checksum: " << sum << ": "; logTime(cerr); } } } #endif // Read endianness sentinel, set 'swap' uint32_t one; bool swap = false; one = readIndex(f3, swap); if(one != 1) { if(useMm_) { cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl; throw 1; } assert_eq(0x1000000, one); swap = true; // have to endian swap U32s } // Read # records TIndexOffU sz; sz = readIndex(f3, swap); if(sz == 0) { cerr << "Error: number of reference records is 0 in " << s3.c_str() << endl; throw 1; } // Read records nrefs_ = 0; // Cumulative count of all unambiguous characters on a per- // stretch 8-bit alignment (i.e. count of bytes we need to // allocate in buf_) TIndexOffU cumsz = 0; TIndexOffU cumlen = 0; EList seq_poss; TIndexOffU seq_cumpos = 0; TIndexOffU skips = 0; // For each unambiguous stretch... for(TIndexOffU i = 0; i < sz; i++) { recs_.push_back(RefRecord(f3, swap)); if(included != NULL && !(*included)[i]) { seq_cumpos += recs_.back().len; recs_.pop_back(); skips++; continue; } seq_poss.push_back(seq_cumpos); if(recs_.back().first) { // This is the first record for this reference sequence (and the // last record for the one before) refRecOffs_.push_back((TIndexOffU)recs_.size()-1); // refOffs_ links each reference sequence with the total number of // unambiguous characters preceding it in the pasted reference refOffs_.push_back(cumsz); if(nrefs_ > 0) { // refLens_ links each reference sequence with the total number // of ambiguous and unambiguous characters in it. refLens_.push_back(cumlen); } cumlen = 0; nrefs_++; } else if(i == 0) { cerr << "First record in reference index file was not marked as " << "'first'" << endl; throw 1; } cumUnambig_.push_back(cumsz); cumRefOff_.push_back(cumlen); cumsz += recs_.back().len; cumlen += recs_.back().off; cumlen += recs_.back().len; seq_cumpos += recs_.back().len; } if(verbose_ || startVerbose) { cerr << "Read " << nrefs_ << " reference strings from " << sz << " records: "; logTime(cerr); } // Store a cap entry for the end of the last reference seq refRecOffs_.push_back((TIndexOffU)recs_.size()); refOffs_.push_back(cumsz); refLens_.push_back(cumlen); cumUnambig_.push_back(cumsz); cumRefOff_.push_back(cumlen); bufSz_ = cumsz; assert_eq(nrefs_, refLens_.size()); assert_eq(sz, recs_.size() + skips); if (f3 != NULL) fclose(f3); // done with .3.gfm_ext file // Round cumsz up to nearest byte boundary if((cumsz & 3) != 0) { cumsz += (4 - (cumsz & 3)); } bufAllocSz_ = cumsz >> 2; assert_eq(0, cumsz & 3); // should be rounded up to nearest 4 if(useMm_) { #ifdef BOWTIE_MM buf_ = (uint8_t*)mmFile; if(sanity_) { FILE *ftmp = fopen(s4.c_str(), "rb"); sanityBuf_ = new uint8_t[cumsz >> 2]; size_t ret = fread(sanityBuf_, 1, cumsz >> 2, ftmp); if(ret != (cumsz >> 2)) { cerr << "Only read " << ret << " bytes (out of " << (cumsz >> 2) << ") from reference index file " << s4.c_str() << endl; throw 1; } fclose(ftmp); for(size_t i = 0; i < (cumsz >> 2); i++) { assert_eq(sanityBuf_[i], buf_[i]); } } #else cerr << "Shouldn't be at " << __FILE__ << ":" << __LINE__ << " without BOWTIE_MM defined" << endl; throw 1; #endif } else { bool shmemLeader = true; if(!useShmem_) { // Allocate a buffer to hold the reference string try { buf_ = new uint8_t[cumsz >> 2]; if(buf_ == NULL) throw std::bad_alloc(); } catch(std::bad_alloc& e) { cerr << "Error: Ran out of memory allocating space for the bitpacked reference. Please" << endl << "re-run on a computer with more memory." << endl; throw 1; } } else { shmemLeader = ALLOC_SHARED_U8( (s4 + "[ref]"), (cumsz >> 2), &buf_, "ref", (verbose_ || startVerbose)); } if(shmemLeader) { // Open the bitpair-encoded reference file FILE *f4 = fopen(s4.c_str(), "rb"); if(f4 == NULL) { cerr << "Could not open reference-string index file " << s4.c_str() << " for reading." << endl; cerr << "This is most likely because your index was built with an older version" << endl << "(<= 0.9.8.1) of bowtie-build. Please re-run bowtie-build to generate a new" << endl << "index (or download one from the Bowtie website) and try again." << endl; loaded_ = false; return; } if(included == NULL) { // Read the whole thing in size_t ret = fread(buf_, 1, cumsz >> 2, f4); // Didn't read all of it? if(ret != (cumsz >> 2)) { cerr << "Only read " << ret << " bytes (out of " << (cumsz >> 2) << ") from reference index file " << s4.c_str() << endl; throw 1; } // Make sure there's no more char c; ret = fread(&c, 1, 1, f4); assert_eq(0, ret); // should have failed } else { TIndexOffU buf_pos = 0; uint8_t four_buf = 0, four_buf2 = 0; for(size_t i = 0; i < seq_poss.size(); i++) { TIndexOffU seq_pos = seq_poss[i]; TIndexOffU cur_len = refLens_[i]; TIndexOffU seq_pos2 = seq_pos + cur_len; TIndexOffU left_pad = seq_pos & 3; assert_eq((seq_pos - left_pad) & 3, 0); TIndexOffU right_pad = 4 - (seq_pos2 & 3); if(right_pad == 4) right_pad = 0; assert_eq((seq_pos2 + right_pad) & 3, 0); TIndexOffU cur_len2 = left_pad + cur_len + right_pad; assert_eq(cur_len2 & 3, 0); uint8_t *buf2_ = new uint8_t[cur_len2 >> 2]; // Read sequences selectively fseek(f4, (seq_pos - left_pad) >> 2, SEEK_SET); size_t ret = fread(buf2_, 1, cur_len2 >> 2, f4); // Didn't read all of it? if(ret != (cur_len2 >> 2)) { cerr << "Only read " << ret << " bytes (out of " << (cur_len2 >> 2) << ") from reference index file " << s4.c_str() << endl; throw 1; } four_buf2 = buf2_[0] >> (left_pad << 1); for(TIndexOffU j = seq_pos; j < seq_pos2; j++, buf_pos++) { if((j & 3) == 0) { four_buf2 = buf2_[(j - (seq_pos - left_pad)) >> 2]; } uint8_t nt = four_buf2 & 3; four_buf2 >>= 2; four_buf |= (nt << ((buf_pos & 3) << 1)); if((buf_pos & 3) == 3) { buf_[buf_pos >> 2] = four_buf; four_buf = 0; } } delete [] buf2_; seq_pos += cur_len; } #ifndef NDEBUG TIndexOffU cumsz2 = 0; for(size_t i = 0; i < refLens_.size(); i++) { cumsz2 += refLens_[i]; } assert_eq(buf_pos, cumsz2); #endif if((buf_pos & 3) != 0) { buf_[buf_pos >> 2] = four_buf; } assert_eq(nrefs_, refLens_.size()); } fclose(f4); #ifdef BOWTIE_SHARED_MEM if(useShmem_) NOTIFY_SHARED(buf_, (cumsz >> 2)); #endif } else { #ifdef BOWTIE_SHARED_MEM if(useShmem_) WAIT_SHARED(buf_, (cumsz >> 2)); #endif } } // Populate byteToU32_ bool big = currentlyBigEndian(); for(int i = 0; i < 256; i++) { uint32_t word = 0; if(big) { word |= ((i >> 0) & 3) << 24; word |= ((i >> 2) & 3) << 16; word |= ((i >> 4) & 3) << 8; word |= ((i >> 6) & 3) << 0; } else { word |= ((i >> 0) & 3) << 0; word |= ((i >> 2) & 3) << 8; word |= ((i >> 4) & 3) << 16; word |= ((i >> 6) & 3) << 24; } byteToU32_[i] = word; } #ifndef NDEBUG if(sanity_) { // Compare the sequence we just read from the compact index // file to the true reference sequence. EList > *os; // for holding references EList > osv(DEBUG_CAT); // for holding ref seqs EList > osn(DEBUG_CAT); // for holding ref names EList osvLen(DEBUG_CAT); // for holding ref seq lens EList osnLen(DEBUG_CAT); // for holding ref name lens SStringExpandable tmp_destU32_; if(infiles != NULL) { if(infilesSeq) { for(size_t i = 0; i < infiles->size(); i++) { // Remove initial backslash; that's almost // certainly being used to protect the first // character of the sequence from getopts (e.g., // when the first char is -) if((*infiles)[i].at(0) == '\\') { (*infiles)[i].erase(0, 1); } osv.push_back(SString((*infiles)[i])); } } else { parseFastas(*infiles, osn, osnLen, osv, osvLen); } os = &osv; } else { assert(origs != NULL); os = origs; } // Go through the loaded reference files base-by-base and // sanity check against what we get by calling getBase and // getStretch for(size_t i = 0; i < os->size(); i++) { size_t olen = ((*os)[i]).length(); size_t olenU32 = (olen + 12) / 4; uint32_t *buf = new uint32_t[olenU32]; uint8_t *bufadj = (uint8_t*)buf; bufadj += getStretch(buf, i, 0, olen, tmp_destU32_); for(size_t j = 0; j < olen; j++) { assert_eq((int)(*os)[i][j], (int)bufadj[j]); assert_eq((int)(*os)[i][j], (int)getBase(i, j)); } delete[] buf; } } #endif // generate minkRepeat long long int genomeLen = approxLen(0); minkRepeat = 0; while(genomeLen > 0) { genomeLen >>= 2; minkRepeat++; } } BitPairReference::~BitPairReference() { if(buf_ != NULL && !useMm_ && !useShmem_) delete[] buf_; if(sanityBuf_ != NULL) delete[] sanityBuf_; } /** * Return a single base of the reference. Calling this repeatedly * is not an efficient way to retrieve bases from the reference; * use loadStretch() instead. * * This implementation scans linearly through the records for the * unambiguous stretches of the target reference sequence. When * there are many records, binary search would be more appropriate. */ int BitPairReference::getBase(size_t tidx, size_t toff) const { uint64_t reci = refRecOffs_[tidx]; // first record for target reference sequence uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq assert_gt(recf, reci); uint64_t bufOff = refOffs_[tidx]; uint64_t off = 0; // For all records pertaining to the target reference sequence... for(uint64_t i = reci; i < recf; i++) { assert_geq(toff, off); off += recs_[i].off; if(toff < off) { return 4; } assert_geq(toff, off); uint64_t recOff = off + recs_[i].len; if(toff < recOff) { toff -= off; bufOff += (uint64_t)toff; assert_lt(bufOff, bufSz_); const uint64_t bufElt = (bufOff) >> 2; const uint64_t shift = (bufOff & 3) << 1; return ((buf_[bufElt] >> shift) & 3); } bufOff += recs_[i].len; off = recOff; assert_geq(toff, off); } // end for loop over records return 4; } /** * Load a stretch of the reference string into memory at 'dest'. * * This implementation scans linearly through the records for the * unambiguous stretches of the target reference sequence. When * there are many records, binary search would be more appropriate. */ int BitPairReference::getStretchNaive( uint32_t *destU32, size_t tidx, size_t toff, size_t count) const { uint8_t *dest = (uint8_t*)destU32; uint64_t reci = refRecOffs_[tidx]; // first record for target reference sequence uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq assert_gt(recf, reci); uint64_t cur = 0; uint64_t bufOff = refOffs_[tidx]; uint64_t off = 0; // For all records pertaining to the target reference sequence... for(uint64_t i = reci; i < recf; i++) { assert_geq(toff, off); off += recs_[i].off; for(; toff < off && count > 0; toff++) { dest[cur++] = 4; count--; } if(count == 0) break; assert_geq(toff, off); if(toff < off + recs_[i].len) { bufOff += (TIndexOffU)(toff - off); // move bufOff pointer forward } else { bufOff += recs_[i].len; } off += recs_[i].len; for(; toff < off && count > 0; toff++) { assert_lt(bufOff, bufSz_); const uint64_t bufElt = (bufOff) >> 2; const uint64_t shift = (bufOff & 3) << 1; dest[cur++] = (buf_[bufElt] >> shift) & 3; bufOff++; count--; } if(count == 0) break; assert_geq(toff, off); } // end for loop over records // In any chars are left after scanning all the records, // they must be ambiguous while(count > 0) { count--; dest[cur++] = 4; } assert_eq(0, count); return 0; } /** * Load a stretch of the reference string into memory at 'dest'. */ int BitPairReference::getStretch( uint32_t *destU32, size_t tidx, size_t toff, size_t count ASSERT_ONLY(, SStringExpandable& destU32_2)) const { ASSERT_ONLY(size_t origCount = count); ASSERT_ONLY(size_t origToff = toff); if(count == 0) return 0; uint8_t *dest = (uint8_t*)destU32; #ifndef NDEBUG destU32_2.clear(); uint8_t *dest_2 = NULL; int off2; if((rand() % 10) == 0) { destU32_2.resize((origCount >> 2) + 2); off2 = getStretchNaive(destU32_2.wbuf(), tidx, origToff, origCount); dest_2 = ((uint8_t*)destU32_2.wbuf()) + off2; } #endif destU32[0] = 0x04040404; // Add Ns, which we might end up using later uint64_t reci = refRecOffs_[tidx]; // first record for target reference sequence uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq assert_gt(recf, reci); uint64_t cur = 4; // keep a cushion of 4 bases at the beginning uint64_t bufOff = refOffs_[tidx]; uint64_t off = 0; int64_t offset = 4; bool firstStretch = true; bool binarySearched = false; uint64_t left = reci; uint64_t right = recf; uint64_t mid = 0; // For all records pertaining to the target reference sequence... for(uint64_t i = reci; i < recf; i++) { uint64_t origBufOff = bufOff; assert_geq(toff, off); if (firstStretch && recf > reci + 16){ // binary search finds smallest i s.t. toff >= cumRefOff_[i] while (left < right-1) { mid = left + ((right - left) >> 1); if (cumRefOff_[mid] <= toff) left = mid; else right = mid; } off = cumRefOff_[left]; bufOff = cumUnambig_[left]; origBufOff = bufOff; i = left; assert(cumRefOff_[i+1] == 0 || cumRefOff_[i+1] > toff); binarySearched = true; } off += recs_[i].off; // skip Ns at beginning of stretch assert_gt(count, 0); if(toff < off) { size_t cpycnt = min((size_t)(off - toff), count); memset(&dest[cur], 4, cpycnt); count -= cpycnt; toff += cpycnt; cur += cpycnt; if(count == 0) break; } assert_geq(toff, off); if(toff < off + recs_[i].len) { bufOff += toff - off; // move bufOff pointer forward } else { bufOff += recs_[i].len; } off += recs_[i].len; assert(off == cumRefOff_[i+1] || cumRefOff_[i+1] == 0); assert(!binarySearched || toff < off); if(toff < off) { if(firstStretch) { if(toff + 8 < off && count > 8) { // We already added some Ns, so we have to do // a fixup at the beginning of the buffer so // that we can start clobbering at cur >> 2 if(cur & 3) { offset -= (cur & 3); } uint64_t curU32 = cur >> 2; // Do the initial few bases if(bufOff & 3) { const uint64_t bufElt = (bufOff) >> 2; const int64_t low2 = bufOff & 3; // Lots of cache misses on the following line destU32[curU32] = byteToU32_[buf_[bufElt]]; for(int j = 0; j < low2; j++) { ((char *)(&destU32[curU32]))[j] = 4; } curU32++; offset += low2; const int64_t chars = 4 - low2; count -= chars; bufOff += chars; toff += chars; } assert_eq(0, bufOff & 3); uint64_t bufOffU32 = bufOff >> 2; uint64_t countLim = count >> 2; uint64_t offLim = ((off - (toff + 4)) >> 2); uint64_t lim = min(countLim, offLim); // Do the fast thing for as far as possible for(uint64_t j = 0; j < lim; j++) { // Lots of cache misses on the following line destU32[curU32] = byteToU32_[buf_[bufOffU32++]]; #ifndef NDEBUG if(dest_2 != NULL) { assert_eq(dest[(curU32 << 2) + 0], dest_2[(curU32 << 2) - offset + 0]); assert_eq(dest[(curU32 << 2) + 1], dest_2[(curU32 << 2) - offset + 1]); assert_eq(dest[(curU32 << 2) + 2], dest_2[(curU32 << 2) - offset + 2]); assert_eq(dest[(curU32 << 2) + 3], dest_2[(curU32 << 2) - offset + 3]); } #endif curU32++; } toff += (lim << 2); assert_leq(toff, off); assert_leq((lim << 2), count); count -= (lim << 2); bufOff = bufOffU32 << 2; cur = curU32 << 2; } // Do the slow thing for the rest for(; toff < off && count > 0; toff++) { assert_lt(bufOff, bufSz_); const uint64_t bufElt = (bufOff) >> 2; const uint64_t shift = (bufOff & 3) << 1; dest[cur++] = (buf_[bufElt] >> shift) & 3; bufOff++; count--; } firstStretch = false; } else { // Do the slow thing for(; toff < off && count > 0; toff++) { assert_lt(bufOff, bufSz_); const uint64_t bufElt = (bufOff) >> 2; const uint64_t shift = (bufOff & 3) << 1; dest[cur++] = (buf_[bufElt] >> shift) & 3; bufOff++; count--; } } } if(count == 0) break; assert_eq(recs_[i].len, bufOff - origBufOff); assert_geq(toff, off); } // end for loop over records // In any chars are left after scanning all the records, // they must be ambiguous while(count > 0) { count--; dest[cur++] = 4; } assert_eq(0, count); return (int)offset; } /** * Parse the input fasta files, populating the szs list and writing the * .3.gfm_ext and .4.gfm_ext portions of the index as we go. */ pair BitPairReference::szsFromFasta( EList& is, const string& outfile, bool bigEndian, const RefReadInParams& refparams, EList& szs, bool sanity, EList *names) { RefReadInParams parms = refparams; std::pair sztot; if(!outfile.empty()) { string file3 = outfile + ".3." + gfm_ext; string file4 = outfile + ".4." + gfm_ext; // Open output stream for the '.3.gfm_ext' file which will // hold the size records. ofstream fout3(file3.c_str(), ios::binary); if(!fout3.good()) { cerr << "Could not open index file for writing: \"" << file3.c_str() << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "HISAT2." << endl; throw 1; } BitpairOutFileBuf bpout(file4.c_str()); // Read in the sizes of all the unambiguous stretches of the genome // into a vector of RefRecords. The input streams are reset once // it's done. writeIndex(fout3, 1, bigEndian); // endianness sentinel TIndexOff numSeqs = 0; sztot = fastaRefReadSizes(is, szs, parms, &bpout, numSeqs); writeIndex(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records for(size_t i = 0; i < szs.size(); i++) szs[i].write(fout3, bigEndian); if(sztot.first == 0) { cerr << "Error: No unambiguous stretches of characters in the input. Aborting..." << endl; throw 1; } assert_gt(sztot.first, 0); assert_gt(sztot.second, 0); bpout.close(); fout3.close(); } else { // Read in the sizes of all the unambiguous stretches of the // genome into a vector of RefRecords TIndexOff numSeqs = 0; //sztot = fastaRefReadSizes(is, szs, parms, NULL, numSeqs); sztot = fastaRefReadFragsNames(is, szs, parms, NULL, numSeqs, *names); #ifndef NDEBUG if(parms.color) { parms.color = false; EList szs2(EBWTB_CAT); TIndexOff numSeqs2 = 0; ASSERT_ONLY(std::pair sztot2 =) fastaRefReadSizes(is, szs2, parms, NULL, numSeqs2); assert_eq(numSeqs, numSeqs2); // One less color than base assert_geq(sztot2.second, sztot.second + numSeqs); parms.color = true; } #endif } return sztot; }