/*
 * Copyright 2011, Ben Langmead
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * aligner_sw.h
 *
 * Classes and routines for solving dynamic programming problems in aid of read
 * alignment.  Goals include the ability to handle:
 *
 * - Both read alignment, where the query must align end-to-end, and local
 *   alignment, where we seek a high-scoring alignment that need not involve
 *   the entire query.
 * - Situations where: (a) we've found a seed hit and are trying to extend it
 *   into a larger hit, (b) we've found an alignment for one mate of a pair and
 *   are trying to find a nearby alignment for the other mate, (c) we're
 *   aligning against an entire reference sequence.
 * - Caller-specified indicators for what columns of the dynamic programming
 *   matrix we are allowed to start in or end in.
 *
 * TODO:
 *
 * - A slicker way to filter out alignments that violate a ceiling placed on
 *   the number of Ns permitted in the reference portion of the alignment.
 *   Right now we accomplish this by masking out ending columns that correspond
 *   to *ungapped* alignments with too many Ns.  This results in false
 *   positives and false negatives for gapped alignments.  The margin of error
 *   (# of Ns by which we might miscount) is bounded by the number of gaps.
*/ /** * |-maxgaps-| * ***********oooooooooooooooooooooo - * ***********ooooooooooooooooooooo | * ***********oooooooooooooooooooo | * ***********ooooooooooooooooooo | * ***********oooooooooooooooooo | * ***********ooooooooooooooooo read len * ***********oooooooooooooooo | * ***********ooooooooooooooo | * ***********oooooooooooooo | * ***********ooooooooooooo | * ***********oooooooooooo - * |-maxgaps-| * |-readlen-| * |-------skip--------| */ #ifndef ALIGNER_SW_H_ #define ALIGNER_SW_H_ #define INLINE_CUPS #include #include #include #include "threading.h" #include #include "aligner_sw_common.h" #include "aligner_sw_nuc.h" #include "ds.h" #include "aligner_seed.h" #include "reference.h" #include "random_source.h" #include "mem_ids.h" #include "aligner_result.h" #include "mask.h" #include "dp_framer.h" #include "aligner_swsse.h" #include "aligner_bt.h" #define QUAL2(d, f) sc_->mm((int)(*rd_)[rdi_ + d], \ (int) rf_ [rfi_ + f], \ (int)(*qu_)[rdi_ + d] - 33) #define QUAL(d) sc_->mm((int)(*rd_)[rdi_ + d], \ (int)(*qu_)[rdi_ + d] - 33) #define N_SNP_PEN(c) (((int)rf_[rfi_ + c] > 15) ? sc_->n(30) : sc_->penSnp) /** * SwAligner * ========= * * Ensapsulates facilities for alignment using dynamic programming. Handles * alignment of nucleotide reads against known reference nucleotides. * * The class is stateful. First the user must call init() to initialize the * object with details regarding the dynamic programming problem to be solved. * Next, the user calls align() to fill the dynamic programming matrix and * calculate summaries describing the solutions. Finally the user calls * nextAlignment(...), perhaps repeatedly, to populate the SwResult object with * the next result. Results are dispensend in best-to-worst, left-to-right * order. * * The class expects the read string, quality string, and reference string * provided by the caller live at least until the user is finished aligning and * obtaining alignments from this object. 
*
 * There is a design tradeoff between hiding/exposing details of the genome and
 * its strands to the SwAligner.  In a sense, a better design is to hide
 * details such as the id of the reference sequence aligned to, or whether
 * we're aligning the read in its original forward orientation or its reverse
 * complement.  But this means that any alignment results returned by SwAligner
 * have to be extended to include those details before they're useful to the
 * caller.  We opt for messy but expedient - the reference id and orientation
 * of the read are given to SwAligner, remembered, and used to populate
 * SwResults.
 *
 * LOCAL VS GLOBAL
 *
 * The dynamic programming aligner supports both local and global alignment,
 * and one option in between.  To implement these modes, the aligner can (a)
 * allow negative scores (i.e. not necessarily clamp them up to 0), (b) check
 * in rows other than the last row for acceptable solutions, and (c)
 * optionally add a bonus to the score for matches.  Each mode below is a
 * particular combination of these three choices.
 *
 * For global alignment, we:
 *
 *   (a) Allow negative scores
 *   (b) Check only in the last row
 *   (c) Either add a bonus for matches or not (doesn't matter)
 *
 * For local alignment, we:
 *
 *   (a) Clamp scores to 0
 *   (b) Check in any row for a sufficiently high score
 *   (c) Add a bonus for matches
 *
 * An in-between solution is to allow alignments to be curtailed on the
 * right-hand side if a better score can be achieved thereby, but not on the
 * left.  For this, we:
 *
 *   (a) Allow negative scores
 *   (b) Check in any row for a sufficiently high score
 *   (c) Either add a bonus for matches or not (doesn't matter)
 *
 * REDUNDANT ALIGNMENTS
 *
 * When are two alignments distinct and when are they redundant (not distinct)?
 * At one extreme, we might say the best alignment from any given dynamic
 * programming problem is redundant with all other alignments from that
 * problem.  At the other extreme, we might say that any two alignments with
 * distinct starting points and edits are distinct.
The former is probably too
 * conservative for mate-finding DP problems.  The latter is certainly too
 * permissive, since two alignments that differ only in how gaps are arranged
 * should not be considered distinct.
 *
 * Some in-between solutions are:
 *
 *   (a) If two alignments share an end point on either end, they are
 *       redundant.  Otherwise, they are distinct.
 *   (b) If two alignments share *both* end points, they are redundant.
 *   (c) If two alignments share any cells in the DP table, they are redundant.
 *   (d) Two alignments are redundant if either end of one is within N
 *       positions of the corresponding end of the other.
 *   (e) Like (d), but both ends instead of either.
 *   (f, g) Like (d) and (e), but where N is tied to maxgaps somehow.
 *
 * Why not (a)?  One reason is that it's possible for two alignments to have
 * different start & end positions but share many cells.  Consider alignments 1
 * and 2 below; their end-points are labeled.
 *
 *  1 2
 *  \ \
 *    -\
 *      \
 *       \
 *        \
 *        -\
 *        \ \
 *        1 2
 *
 * 1 and 2 are distinct according to (a) but they share many cells in common.
 *
 * Why not (f, g)?
It fixes the problem with (a) above by forcing the * alignments to be spread so far that they can't possibly share diagonal cells * in common */ class SwAligner { typedef std::pair SizeTPair; // States that the aligner can be in enum { STATE_UNINIT, // init() hasn't been called yet STATE_INITED, // init() has been called, but not align() STATE_ALIGNED, // align() has been called }; const static size_t ALPHA_SIZE = 5; public: explicit SwAligner() : sseU8fw_(DP_CAT), sseU8rc_(DP_CAT), sseI16fw_(DP_CAT), sseI16rc_(DP_CAT), state_(STATE_UNINIT), initedRead_(false), readSse16_(false), initedRef_(false), rfwbuf_(DP_CAT), btnstack_(DP_CAT), btcells_(DP_CAT), btdiag_(), btncand_(DP_CAT), btncanddone_(DP_CAT), btncanddoneSucc_(0), btncanddoneFail_(0), cper_(), cperMinlen_(), cperPerPow2_(), cperEf_(), cperTri_(), colstop_(0), lastsolcol_(0), cural_(0) ASSERT_ONLY(, cand_tmp_(DP_CAT)) { } /** * Prepare the dynamic programming driver with a new read and a new scoring * scheme. */ void initRead( const BTDnaString& rdfw, // read sequence for fw read const BTDnaString& rdrc, // read sequence for rc read const BTString& qufw, // read qualities for fw read const BTString& qurc, // read qualities for rc read size_t rdi, // offset of first read char to align size_t rdf, // offset of last read char to align const Scoring& sc); // scoring scheme /** * Initialize with a new alignment problem. */ void initRef( bool fw, // whether to forward or revcomp read is aligning TRefId refidx, // id of reference aligned against const DPRect& rect, // DP rectangle char *rf, // reference sequence size_t rfi, // offset of first reference char to align to size_t rff, // offset of last reference char to align to TRefOff reflen, // length of reference sequence const Scoring& sc, // scoring scheme TAlScore minsc, // minimum score bool enable8, // use 8-bit SSE if possible? 
size_t cminlen, // minimum length for using checkpointing scheme size_t cpow2, // interval b/t checkpointed diags; 1 << this bool doTri, // triangular mini-fills? bool extend); // true iff this is a seed extension /** * Given a read, an alignment orientation, a range of characters in a * referece sequence, and a bit-encoded version of the reference, * execute the corresponding dynamic programming problem. * * Here we expect that the caller has already narrowed down the relevant * portion of the reference (e.g. using a seed hit) and all we do is * banded dynamic programming in the vicinity of that portion. This is not * the function to call if we are trying to solve the whole alignment * problem with dynamic programming (that is TODO). * * Returns true if an alignment was found, false otherwise. */ void initRef( bool fw, // whether to forward or revcomp read aligned TRefId refidx, // reference aligned against const DPRect& rect, // DP rectangle const BitPairReference& refs, // Reference strings TRefOff reflen, // length of reference sequence const Scoring& sc, // scoring scheme TAlScore minsc, // minimum alignment score bool enable8, // use 8-bit SSE if possible? size_t cminlen, // minimum length for using checkpointing scheme size_t cpow2, // interval b/t checkpointed diags; 1 << this bool doTri, // triangular mini-fills? bool extend, // true iff this is a seed extension size_t upto, // count the number of Ns up to this offset size_t& nsUpto); // output: the number of Ns up to 'upto' /** * Given a read, an alignment orientation, a range of characters in a * referece sequence, and a bit-encoded version of the reference, set up * and execute the corresponding ungapped alignment problem. There can * only be one solution. * * The caller has already narrowed down the relevant portion of the * reference using, e.g., the location of a seed hit, or the range of * possible fragment lengths if we're searching for the opposite mate in a * pair. 
*/ int ungappedAlign( const BTDnaString& rd, // read sequence (could be RC) const BTString& qu, // qual sequence (could be rev) const Coord& coord, // coordinate aligned to const BitPairReference& refs, // Reference strings size_t reflen, // length of reference sequence const Scoring& sc, // scoring scheme bool ohang, // allow overhang? TAlScore minsc, // minimum score SwResult& res); // put alignment result here /** * Align read 'rd' to reference using read & reference information given * last time init() was called. Uses dynamic programming. */ bool align(RandomSource& rnd, TAlScore& best); /** * Populate the given SwResult with information about the "next best" * alignment if there is one. If there isn't one, false is returned. Note * that false might be returned even though a call to done() would have * returned false. */ bool nextAlignment( SwResult& res, TAlScore minsc, RandomSource& rnd); /** * Print out an alignment result as an ASCII DP table. */ void printResultStacked( const SwResult& res, std::ostream& os) { res.alres.printStacked(*rd_, os); } /** * Return true iff there are no more solution cells to backtace from. * Note that this may return false in situations where there are actually * no more solutions, but that hasn't been discovered yet. */ bool done() const { assert(initedRead() && initedRef()); return cural_ == btncand_.size(); } /** * Return true iff this SwAligner has been initialized with a read to align. */ inline bool initedRef() const { return initedRef_; } /** * Return true iff this SwAligner has been initialized with a reference to * align against. */ inline bool initedRead() const { return initedRead_; } /** * Reset, signaling that we're done with this dynamic programming problem * and won't be asking for any more alignments. */ inline void reset() { initedRef_ = initedRead_ = false; } #ifndef NDEBUG /** * Check that aligner is internally consistent. 
*/ bool repOk() const { assert_gt(dpRows(), 0); // Check btncand_ for(size_t i = 0; i < btncand_.size(); i++) { assert(btncand_[i].repOk()); assert_geq(btncand_[i].score, minsc_); } return true; } #endif /** * Return the number of alignments given out so far by nextAlignment(). */ size_t numAlignmentsReported() const { return cural_; } /** * Merge tallies in the counters related to filling the DP table. */ void merge( SSEMetrics& sseU8ExtendMet, SSEMetrics& sseU8MateMet, SSEMetrics& sseI16ExtendMet, SSEMetrics& sseI16MateMet, uint64_t& nbtfiltst, uint64_t& nbtfiltsc, uint64_t& nbtfiltdo) { sseU8ExtendMet.merge(sseU8ExtendMet_); sseU8MateMet.merge(sseU8MateMet_); sseI16ExtendMet.merge(sseI16ExtendMet_); sseI16MateMet.merge(sseI16MateMet_); nbtfiltst += nbtfiltst_; nbtfiltsc += nbtfiltsc_; nbtfiltdo += nbtfiltdo_; } /** * Reset all the counters related to filling in the DP table to 0. */ void resetCounters() { sseU8ExtendMet_.reset(); sseU8MateMet_.reset(); sseI16ExtendMet_.reset(); sseI16MateMet_.reset(); nbtfiltst_ = nbtfiltsc_ = nbtfiltdo_ = 0; } /** * Return the size of the DP problem. */ size_t size() const { return dpRows() * (rff_ - rfi_); } protected: /** * Return the number of rows that will be in the dynamic programming table. */ inline size_t dpRows() const { assert(initedRead_); return rdf_ - rdi_; } /** * Align nucleotides from read 'rd' to the reference string 'rf' using * vector instructions. Return the score of the best alignment found, or * the minimum integer if an alignment could not be found. Flag is set to * 0 if an alignment is found, -1 if no valid alignment is found, or -2 if * the score saturated at any point during alignment. 
*/ TAlScore alignNucleotidesEnd2EndSseU8( // unsigned 8-bit elements int& flag, bool debug); TAlScore alignNucleotidesLocalSseU8( // unsigned 8-bit elements int& flag, bool debug); TAlScore alignNucleotidesEnd2EndSseI16( // signed 16-bit elements int& flag, bool debug); TAlScore alignNucleotidesLocalSseI16( // signed 16-bit elements int& flag, bool debug); /** * Aligns by filling a dynamic programming matrix with the SSE-accelerated, * banded DP approach of Farrar. As it goes, it determines which cells we * might backtrace from and tallies the best (highest-scoring) N backtrace * candidate cells per diagonal. Also returns the alignment score of the best * alignment in the matrix. * * This routine does *not* maintain a matrix holding the entire matrix worth of * scores, nor does it maintain any other dense O(mn) data structure, as this * would quickly exhaust memory for queries longer than about 10,000 kb. * Instead, in the fill stage it maintains two columns worth of scores at a * time (current/previous, or right/left) - these take O(m) space. When * finished with the current column, it determines which cells from the * previous column, if any, are candidates we might backtrace from to find a * full alignment. A candidate cell has a score that rises above the threshold * and isn't improved upon by a match in the next column. The best N * candidates per diagonal are stored in a O(m + n) data structure. */ TAlScore alignGatherEE8( // unsigned 8-bit elements int& flag, bool debug); TAlScore alignGatherLoc8( // unsigned 8-bit elements int& flag, bool debug); TAlScore alignGatherEE16( // signed 16-bit elements int& flag, bool debug); TAlScore alignGatherLoc16( // signed 16-bit elements int& flag, bool debug); /** * Build query profile look up tables for the read. 
The query profile look * up table is organized as a 1D array indexed by [i][j] where i is the * reference character in the current DP column (0=A, 1=C, etc), and j is * the segment of the query we're currently working on. */ void buildQueryProfileEnd2EndSseU8(bool fw); void buildQueryProfileLocalSseU8(bool fw); /** * Build query profile look up tables for the read. The query profile look * up table is organized as a 1D array indexed by [i][j] where i is the * reference character in the current DP column (0=A, 1=C, etc), and j is * the segment of the query we're currently working on. */ void buildQueryProfileEnd2EndSseI16(bool fw); void buildQueryProfileLocalSseI16(bool fw); bool gatherCellsNucleotidesLocalSseU8(TAlScore best); bool gatherCellsNucleotidesEnd2EndSseU8(TAlScore best); bool gatherCellsNucleotidesLocalSseI16(TAlScore best); bool gatherCellsNucleotidesEnd2EndSseI16(TAlScore best); bool backtraceNucleotidesLocalSseU8( TAlScore escore, // in: expected score SwResult& res, // out: store results (edits and scores) here size_t& off, // out: store diagonal projection of origin size_t& nbts, // out: # backtracks size_t row, // start in this rectangle row size_t col, // start in this rectangle column RandomSource& rand); // random gen, to choose among equal paths bool backtraceNucleotidesLocalSseI16( TAlScore escore, // in: expected score SwResult& res, // out: store results (edits and scores) here size_t& off, // out: store diagonal projection of origin size_t& nbts, // out: # backtracks size_t row, // start in this rectangle row size_t col, // start in this rectangle column RandomSource& rand); // random gen, to choose among equal paths bool backtraceNucleotidesEnd2EndSseU8( TAlScore escore, // in: expected score SwResult& res, // out: store results (edits and scores) here size_t& off, // out: store diagonal projection of origin size_t& nbts, // out: # backtracks size_t row, // start in this rectangle row size_t col, // start in this rectangle column 
RandomSource& rand); // random gen, to choose among equal paths bool backtraceNucleotidesEnd2EndSseI16( TAlScore escore, // in: expected score SwResult& res, // out: store results (edits and scores) here size_t& off, // out: store diagonal projection of origin size_t& nbts, // out: # backtracks size_t row, // start in this rectangle row size_t col, // start in this rectangle column RandomSource& rand); // random gen, to choose among equal paths bool backtrace( TAlScore escore, // in: expected score bool fill, // in: use mini-fill? bool usecp, // in: use checkpoints? SwResult& res, // out: store results (edits and scores) here size_t& off, // out: store diagonal projection of origin size_t row, // start in this rectangle row size_t col, // start in this rectangle column size_t maxiter,// max # extensions to try size_t& niter, // # extensions tried RandomSource& rnd) // random gen, to choose among equal paths { bter_.initBt( escore, // in: alignment score row, // in: start in this row col, // in: start in this column fill, // in: use mini-fill? usecp, // in: use checkpoints? cperTri_, // in: triangle-shaped mini-fills? 
rnd); // in: random gen, to choose among equal paths assert(bter_.inited()); size_t nrej = 0; if(bter_.emptySolution()) { return false; } else { return bter_.nextAlignment(maxiter, res, off, nrej, niter, rnd); } } const BTDnaString *rd_; // read sequence const BTString *qu_; // read qualities const BTDnaString *rdfw_; // read sequence for fw read const BTDnaString *rdrc_; // read sequence for rc read const BTString *qufw_; // read qualities for fw read const BTString *qurc_; // read qualities for rc read TReadOff rdi_; // offset of first read char to align TReadOff rdf_; // offset of last read char to align bool fw_; // true iff read sequence is original fw read TRefId refidx_; // id of reference aligned against TRefOff reflen_; // length of entire reference sequence const DPRect* rect_; // DP rectangle char *rf_; // reference sequence TRefOff rfi_; // offset of first ref char to align to TRefOff rff_; // offset of last ref char to align to (excl) size_t rdgap_; // max # gaps in read size_t rfgap_; // max # gaps in reference bool enable8_;// enable 8-bit sse bool extend_; // true iff this is a seed-extend problem const Scoring *sc_; // penalties for edit types TAlScore minsc_; // penalty ceiling for valid alignments int nceil_; // max # Ns allowed in ref portion of aln bool sse8succ_; // whether 8-bit worked bool sse16succ_; // whether 16-bit worked SSEData sseU8fw_; // buf for fw query, 8-bit score SSEData sseU8rc_; // buf for rc query, 8-bit score SSEData sseI16fw_; // buf for fw query, 16-bit score SSEData sseI16rc_; // buf for rc query, 16-bit score bool sseU8fwBuilt_; // built fw query profile, 8-bit score bool sseU8rcBuilt_; // built rc query profile, 8-bit score bool sseI16fwBuilt_; // built fw query profile, 16-bit score bool sseI16rcBuilt_; // built rc query profile, 16-bit score SSEMetrics sseU8ExtendMet_; SSEMetrics sseU8MateMet_; SSEMetrics sseI16ExtendMet_; SSEMetrics sseI16MateMet_; int state_; // state bool initedRead_; // true iff initialized with 
initRead bool readSse16_; // true -> sse16 from now on for read bool initedRef_; // true iff initialized with initRef EList rfwbuf_; // buffer for wordized ref stretches EList btnstack_; // backtrace stack for nucleotides EList btcells_; // cells involved in current backtrace NBest btdiag_; // per-diagonal backtrace candidates EList btncand_; // cells we might backtrace from EList btncanddone_; // candidates that we investigated size_t btncanddoneSucc_; // # investigated and succeeded size_t btncanddoneFail_; // # investigated and failed BtBranchTracer bter_; // backtracer Checkpointer cper_; // structure for saving checkpoint cells size_t cperMinlen_; // minimum length for using checkpointer size_t cperPerPow2_; // checkpoint every 1 << perpow2 diags (& next) bool cperEf_; // store E and F in addition to H? bool cperTri_; // checkpoint for triangular mini-fills? size_t colstop_; // bailed on DP loop after this many cols size_t lastsolcol_; // last DP col with valid cell size_t cural_; // index of next alignment to be given uint64_t nbtfiltst_; // # candidates filtered b/c starting cell was seen uint64_t nbtfiltsc_; // # candidates filtered b/c score uninteresting uint64_t nbtfiltdo_; // # candidates filtered b/c dominated by other cell ASSERT_ONLY(SStringExpandable tmp_destU32_); ASSERT_ONLY(BTDnaString tmp_editstr_, tmp_refstr_); ASSERT_ONLY(EList cand_tmp_); }; #endif /*ALIGNER_SW_H_*/