/*========================================================================== RazerS - Fast Read Mapping with Controlled Loss Rate http://www.seqan.de/projects/razers.html ============================================================================ Copyright (C) 2008 by David Weese This program is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your options) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . ==========================================================================*/ #ifndef SEQAN_HEADER_RAZERS_H #define SEQAN_HEADER_RAZERS_H #include #include #include #include #ifdef RAZERS_PARALLEL #include "tbb/spin_mutex.h" #endif namespace SEQAN_NAMESPACE_MAIN { ////////////////////////////////////////////////////////////////////////////// // Default options #ifdef RAZERS_OPENADDRESSING typedef OpenAddressing TQGramIndexSpec; #else typedef Default TQGramIndexSpec; #endif template < bool DONT_VERIFY_ = false, bool DONT_DUMP_RESULTS_ = false > struct RazerSSpec { enum { DONT_VERIFY = DONT_VERIFY_ }; // omit verifying potential matches enum { DONT_DUMP_RESULTS = DONT_DUMP_RESULTS_ }; // omit dumping results }; template < typename TSpec = RazerSSpec<> > struct RazerSOptions { // main options TSpec spec; bool forward; // compute forward oriented read matches bool reverse; // compute reverse oriented read matches double errorRate; // Criteria 1 threshold unsigned maxHits; // output at most maxHits many matches unsigned distanceRange; // output only the best, second best, ..., distanceRange best matches bool purgeAmbiguous; // true..remove reads with more than maxHits best matches, false..keep them CharString output; // name of result file int _debugLevel; // level of verbosity bool printVersion; // print version number bool hammingOnly; // no indels int trimLength; // if >0, cut reads to #trimLength characters CharString outputUnmapped; // name of file storing unmapped reads // output format options unsigned outputFormat; // 0..Razer format // 1..enhanced Fasta // 2..ELAND format bool dumpAlignment; // compute and dump the match alignments in the result files unsigned genomeNaming; // 0..use Fasta id // 1..enumerate reads beginning with 1 unsigned readNaming; // 0..use Fasta id // 1..enumerate reads beginning with 1 // 2..use the read sequence (only for short reads!) unsigned sortOrder; // 0..sort keys: 1. read number, 2. genome position // 1.. 1. genome pos50ition, 2. read number unsigned positionFormat; // 0..gap space // 1..position space const char *runID; // runID needed for gff output // filtration parameters ::std::string shape; // shape (e.g. 11111111111) int threshold; // threshold int tabooLength; // taboo length int repeatLength; // repeat length threshold double abundanceCut; // abundance threshold // mate-pair parameters int libraryLength; // offset between two mates int libraryError; // offset tolerance unsigned nextMatePairId; // use this id for the next mate-pair // verification parameters bool matchN; // false..N is always a mismatch, true..N matches with all unsigned char compMask[5]; // statistics __int64 FP; // false positives (threshold reached, no match) __int64 TP; // true positives (threshold reached, match) double timeLoadFiles; // time for loading input files double timeMapReads; // time for mapping reads double timeDumpResults; // time for dumping the results #ifdef RAZERS_DIRECT_MAQ_MAPPING bool maqMapping; int maxMismatchQualSum; int mutationRateQual; int absMaxQualSumErrors; unsigned artSeedLength; bool noBelowIdentity; #endif bool readsWithQualities; #ifdef RAZERS_MICRO_RNA bool microRNA; unsigned rnaSeedLength; bool exactSeed; #endif bool lowMemory; // set maximum shape weight to 13 to limit size of q-gram index bool fastaIdQual; // hidden option for special fasta+quality format we use int minClippedLen; // misc unsigned compactThresh; // compact match array if larger than compactThresh int maxReadLength; int penaltyC; // puts penalty on the existence on a middle gap in split mapping mode unsigned minMatchLen; // minimum prefix/suffix match length in split mapping mode int maxPrefixErrors; // int maxSuffixErrors; int maxGap; int minGap; ::std::string shapeR; // shape (e.g. 11111111111) int thresholdR; // threshold unsigned int specifiedGenomeLen; bool anchored; int maxReadRegionsEnd; int minReadRegionsStart; // multi-threading #ifdef RAZERS_PARALLEL typedef ::tbb::spin_mutex TMutex; TMutex *patternMutex; TMutex optionsMutex; TMutex matchMutex; #endif RazerSOptions() { forward = true; reverse = true; errorRate = 0.08; maxHits = 100; distanceRange = 0; // disabled purgeAmbiguous = false; output = ""; _debugLevel = 0; printVersion = false; hammingOnly = false; outputUnmapped = ""; trimLength = 0; outputFormat = 0; dumpAlignment = false; genomeNaming = 0; readNaming = 0; sortOrder = 0; positionFormat = 0; runID = "s"; // matchN = false; shape = "11111111111"; threshold = 1; tabooLength = 1; repeatLength = 1000; abundanceCut = 1; libraryLength = 220; libraryError = 50; nextMatePairId = 1; for (unsigned i = 0; i < 4; ++i) compMask[i] = 1 << i; compMask[4] = 0; compactThresh = 1024; #ifdef RAZERS_DIRECT_MAQ_MAPPING maqMapping = false; maxMismatchQualSum = 70; mutationRateQual = 30; artSeedLength = 28; // the "artificial" seed length that is used for mapping quality assignment // (28bp is maq default) absMaxQualSumErrors = 100; // maximum for sum of mism qualities in total readlength noBelowIdentity = false; #endif readsWithQualities = false; #ifdef RAZERS_MICRO_RNA microRNA = false; rnaSeedLength = 16; exactSeed = true; #endif penaltyC = 2; minMatchLen = 18; shapeR = "11111111111"; thresholdR = 1; maxGap = 10000; minGap = 0; maxPrefixErrors = 1; maxSuffixErrors = 1; specifiedGenomeLen = 3000000000U; //whole human genome default anchored = false; maxReadRegionsEnd = 0; minReadRegionsStart = 0; // maxReadLength = 0; lowMemory = false; // set maximum shape weight to 13 to limit size of q-gram index fastaIdQual = false; minClippedLen = 0; } }; struct MicroRNA{}; #ifdef RAZERS_MICRO_RNA #define RAZERS_EXTENDED_MATCH #endif #ifdef RAZERS_DIRECT_MAQ_MAPPING #define RAZERS_EXTENDED_MATCH #endif #ifdef RAZERS_SPLICED #define RAZERS_EXTENDED_MATCH #endif ////////////////////////////////////////////////////////////////////////////// // Typedefs // definition of a Read match template struct ReadMatch { typedef typename MakeSigned_::Type TGPos; unsigned gseqNo; // genome seqNo unsigned rseqNo; // read seqNo TGPos gBegin; // begin position of the match in the genome TGPos gEnd; // end position of the match in the genome #ifdef RAZERS_MATEPAIRS unsigned pairId; // unique id for the two mate-pair matches (0 if unpaired) int mateDelta:23; // outer coordinate delta to the other mate unsigned pairScore:9; // combined score of both mates max 256 #endif unsigned short editDist; // Levenshtein distance #ifdef RAZERS_EXTENDED_MATCH short mScore; short seedEditDist:8; #endif #ifdef RAZERS_SPLICED char traceExtension; short gSeedLen:8;// used as gMinMatchLen to store genomic end position of seed match #endif char orientation; // 'F'..forward strand, 'R'..reverse comp. strand }; enum RAZERS_ERROR { RAZERS_INVALID_OPTIONS = -1, RAZERS_READS_FAILED = -2, RAZERS_GENOME_FAILED = -3, RAZERS_INVALID_SHAPE = -4 }; ////////////////////////////////////////////////////////////////////////////// // Definitions typedef Dna5String TGenome; typedef StringSet TGenomeSet; // typedef Dna5String TRead; typedef String TRead; #ifdef RAZERS_CONCATREADS typedef StringSet > > TReadSet; #else typedef StringSet TReadSet; #endif typedef ReadMatch::Type> TMatch; // a single match typedef String*/ > TMatches; // array of matches //chrNo, startPos, endPos // typedef String::Type>::Type > > TReadRegions; typedef MakeSigned_::Type>::Type TSignedPos; typedef Pair::VALUE> > TFlagPos; //chrNo, flag to remember flag, startPos of mate typedef String > TReadRegions; template struct Cargo< Index > { typedef struct { double abundanceCut; int _debugLevel; } Type; }; template struct Cargo< Index > { typedef struct { double abundanceCut; int _debugLevel; } Type; }; ////////////////////////////////////////////////////////////////////////////// // Memory tuning #ifdef RAZERS_MEMOPT template struct SAValue< Index > { typedef Pair< unsigned, unsigned, BitCompressed<24, 8> // max. 16M reads of length < 256 > Type; }; //454 // template // struct SAValue< Index > { // typedef Pair< // unsigned, // unsigned, // BitCompressed<22, 10> // max. 4M reads of length < 1024 // > Type; // }; #else template struct SAValue< Index > { typedef Pair< unsigned, // many reads unsigned, // of arbitrary length Compressed > Type; }; #endif template <> struct Size { typedef unsigned Type; }; template struct Size< Index > > { typedef unsigned Type; }; #ifdef RAZERS_PRUNE_QGRAM_INDEX ////////////////////////////////////////////////////////////////////////////// // Repeat masker template inline bool _qgramDisableBuckets(Index > &index) { typedef Index > TReadIndex; typedef typename Fibre::Type TDir; typedef typename Iterator::Type TDirIterator; typedef typename Value::Type TSize; TDir &dir = indexDir(index); bool result = false; unsigned counter = 0; TSize thresh = (TSize)(length(index) * cargo(index).abundanceCut); if (thresh < 100) thresh = 100; TDirIterator it = begin(dir, Standard()); TDirIterator itEnd = end(dir, Standard()); for (; it != itEnd; ++it) if (*it > thresh) { *it = (TSize)-1; result = true; ++counter; } if (counter > 0 && cargo(index)._debugLevel >= 1) ::std::cerr << "Removed " << counter << " k-mers" << ::std::endl; return result; } #endif template bool _clipReads(TIdString & fastaID, TSeqString & seq, TQString & qual, TOptions & options) { typedef typename Value::Type TChar; int tagStart = length(fastaID); int clipFront = -1; int clipBack = -1; for(unsigned i = 0; i < length(fastaID); ++i) { TChar c = fastaID[i]; if(c == 'c') { tagStart = i-1; if(infix(fastaID,i,i+5)=="clip=") { i += 5; String clipStr = ""; while(i < length(fastaID) && _parseIsDigit(fastaID[i])) { append(clipStr,fastaID[i]); ++i; } std::istringstream istr(toCString(clipStr)); istr >> clipFront; if(i < length(fastaID) && fastaID[i] == ',') ++i; clipStr = ""; while(i < length(fastaID) && _parseIsDigit(fastaID[i])) { append(clipStr,fastaID[i]); ++i; } std::istringstream istr2(toCString(clipStr)); istr2 >> clipBack; break; } } } if(clipFront<0 && clipBack<0) return true; //no clip tag found if(clipFront<0) clipFront = 0; if(clipBack<0) clipBack = 0; resize(fastaID,tagStart); // only resize fastaID, as it might contain the quality string if(options.minClippedLen == 0) // clipping tag was found, but clipping option is turned off return true; if(((int)length(seq)-clipBack-clipFront) < options.minClippedLen) // sequence is too short after clipping return false; // int newLen = (int)length(seq)-clipBack-clipFront; //clip if(length(qual) == 0) // meaning that quality is in fasta header { // first adapt the fasta header TIdString tmp = fastaID; fastaID = infix(tmp,0,length(tmp)-length(seq)); append(fastaID,infix(tmp,length(tmp)-length(seq)+clipFront,length(tmp)-clipBack)); } else qual = infix(qual,clipFront,length(qual)-clipBack); // now adapt the sequence seq = infix(seq,clipFront,length(seq)-clipBack); return true; } ////////////////////////////////////////////////////////////////////////////// // Load multi-Fasta sequences with or w/o quality values template bool loadReads( TReadSet &reads, TNameSet &fastaIDs, const char *fileName, TRazerSOptions &options) { bool countN = !(options.matchN || options.outputFormat == 1); if (!empty(options.outputUnmapped)) countN = false; #ifdef RAZERS_MICRO_RNA if(options.microRNA) countN = false; #endif MultiFasta multiFasta; if (!open(multiFasta.concat, fileName, OPEN_RDONLY)) return false; AutoSeqFormat format; guessFormat(multiFasta.concat, format); split(multiFasta, format); unsigned seqCount = length(multiFasta); #ifndef RAZERS_CONCATREADS resize(reads, seqCount, Exact()); #endif if (options.readNaming == 0) resize(fastaIDs, seqCount, Exact()); String seq; CharString qual; unsigned kickoutcount = 0; for(unsigned i = 0; i < seqCount; ++i) { if (options.readNaming == 0 #ifdef RAZERS_DIRECT_MAQ_MAPPING || options.fastaIdQual #endif ) assignSeqId(fastaIDs[i], multiFasta[i], format); // read Fasta id assignSeq(seq, multiFasta[i], format); // read Read sequence assignQual(qual, multiFasta[i], format); // read ascii quality values if(!empty(qual)) options.readsWithQualities = true; #ifdef RAZERS_DIRECT_MAQ_MAPPING //check if sequence has a clip tag if(options.minClippedLen > 0) { if(!_clipReads(fastaIDs[i],seq,qual,options)) { clear(seq); clear(qual); ++kickoutcount; } } if(options.fastaIdQual && !empty(seq)) { if(options.minClippedLen == 0)_clipReads(fastaIDs[i],seq,qual,options); // if the header wasnt clipped before, then clip now!! necessary for quality in header qual = suffix(fastaIDs[i],length(fastaIDs[i])-length(seq)); if(options.readNaming == 0) fastaIDs[i] = prefix(fastaIDs[i],length(fastaIDs[i])-length(seq)); else clear(fastaIDs[i]); } #endif if (countN) { int count = 0; int cutoffCount = (int)(options.errorRate * length(seq)); for (unsigned j = 0; j < length(seq); ++j) if (getValue(seq, j) == 'N') if (++count > cutoffCount) { clear(seq); clear(qual); ++kickoutcount; break; } // low qual. reads are empty to output them and their id later as LQ reads // if (count > cutoffCount) continue; } #ifdef RAZERS_MICRO_RNA if(options.microRNA && length(seq)3) setValue(hybridSeq[j],(unsigned int)164); //N=164 such that &3 == 0 else setValue(hybridSeq[j],(unsigned int)((ordValue(qual[j]) - 33) * 4 + ordValue(seq[j]))); */ if (options.trimLength > 0 && length(seq) > (unsigned)options.trimLength) resize(seq, options.trimLength); if((int)length(seq) > options.maxReadLength) options.maxReadLength = length(seq); #ifdef RAZERS_CONCATREADS appendValue(reads, seq, Generous()); #else assign(reads[i], seq, Exact()); #endif } #ifdef RAZERS_CONCATREADS reserve(reads.concat, length(reads.concat), Exact()); #endif if (options._debugLevel > 1 && kickoutcount > 0) ::std::cerr << "Ignoring " << kickoutcount << " low quality reads.\n"; return (seqCount > 0); } ////////////////////////////////////////////////////////////////////////////// // Read the first sequence of a multi-sequence file // and return its length inline int estimateReadLength(char const *fileName) { MultiFasta multiFasta; if (!open(multiFasta.concat, fileName, OPEN_RDONLY)) // open the whole file return RAZERS_READS_FAILED; AutoSeqFormat format; guessFormat(multiFasta.concat, format); // guess file format split(multiFasta, format); // divide into single sequences if (length(multiFasta) == 0) return 0; Dna5String firstRead; assignSeq(firstRead, multiFasta[0], format); // read the first sequence return length(firstRead); } /* ////////////////////////////////////////////////////////////////////////////// // Load multi-Fasta sequences template bool loadGenomes(TGenomeSet &genomes, const char *fileName) { MultiFasta multiFasta; if (!open(multiFasta.concat, fileName, OPEN_RDONLY)) return false; split(multiFasta, Fasta()); unsigned seqCount = length(multiFasta); resize(genomes, seqCount, Exact()); for(unsigned i = 0; i < seqCount; ++i) assignSeq(genomes[i], multiFasta[i], Fasta()); // read Genome sequence return (seqCount > 0); }*/ ////////////////////////////////////////////////////////////////////////////// // Load multi-Fasta sequences from multiple files template bool loadGenomes(TGenomeSet &genomes, StringSet &fileNameList) { unsigned gSeqNo = 0; unsigned filecount = 0; while(filecount < length(fileNameList)) { MultiFasta multiFasta; if (!open(multiFasta.concat, toCString(fileNameList[filecount]), OPEN_RDONLY)) return false; split(multiFasta, Fasta()); unsigned seqCount = length(multiFasta); if(length(genomes) < gSeqNo+seqCount) resize(genomes,gSeqNo+seqCount); for(unsigned i = 0; i < seqCount; ++i) assignSeq(genomes[gSeqNo+i], multiFasta[i], Fasta()); // read Genome sequence gSeqNo += seqCount; ++filecount; } resize(genomes,gSeqNo); return (gSeqNo > 0); } #ifdef RAZERS_MICRO_RNA template struct LessRNoGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool > { inline bool operator() (TReadMatch const &a, TReadMatch const &b) const { // read number if (a.rseqNo < b.rseqNo) return true; if (a.rseqNo > b.rseqNo) return false; // genome position and orientation if (a.gseqNo < b.gseqNo) return true; if (a.gseqNo > b.gseqNo) return false; if (a.gBegin < b.gBegin) return true; if (a.gBegin > b.gBegin) return false; if (a.orientation < b.orientation) return true; if (a.orientation > b.orientation) return false; if(a.editDist < b.editDist) return true; if(a.editDist > b.editDist) return false; return (a.mScore > b.mScore); } }; template struct LessRNoEdistHLen : public ::std::binary_function < TReadMatch, TReadMatch, bool > { inline bool operator() (TReadMatch const &a, TReadMatch const &b) const { // read number if (a.rseqNo < b.rseqNo) return true; if (a.rseqNo > b.rseqNo) return false; if(a.editDist < b.editDist) return true; if(a.editDist > b.editDist) return false; return (a.mScore > b.mScore); } }; #else template struct LessRNoGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool > { inline bool operator() (TReadMatch const &a, TReadMatch const &b) const { // read number if (a.rseqNo < b.rseqNo) return true; if (a.rseqNo > b.rseqNo) return false; // genome position and orientation if (a.gseqNo < b.gseqNo) return true; if (a.gseqNo > b.gseqNo) return false; if (a.gBegin < b.gBegin) return true; if (a.gBegin > b.gBegin) return false; if (a.orientation < b.orientation) return true; if (a.orientation > b.orientation) return false; // quality #ifdef RAZERS_MATEPAIRS return a.pairScore > b.pairScore; #else return a.editDist < b.editDist; #endif } }; #endif // ... to sort matches and remove duplicates with equal gEnd template struct LessRNoGEndPos : public ::std::binary_function < TReadMatch, TReadMatch, bool > { inline bool operator() (TReadMatch const &a, TReadMatch const &b) const { // read number if (a.rseqNo < b.rseqNo) return true; if (a.rseqNo > b.rseqNo) return false; // genome position and orientation if (a.gseqNo < b.gseqNo) return true; if (a.gseqNo > b.gseqNo) return false; if (a.gEnd < b.gEnd) return true; if (a.gEnd > b.gEnd) return false; if (a.orientation < b.orientation) return true; if (a.orientation > b.orientation) return false; // quality #ifdef RAZERS_MATEPAIRS return a.pairScore > b.pairScore; #else return a.editDist < b.editDist; #endif } }; template struct LessErrors : public ::std::binary_function < TReadMatch, TReadMatch, bool > { inline bool operator() (TReadMatch const &a, TReadMatch const &b) const { // read number if (a.rseqNo < b.rseqNo) return true; if (a.rseqNo > b.rseqNo) return false; // quality #ifdef RAZERS_MATEPAIRS return a.pairScore > b.pairScore; #else return a.editDist < b.editDist; #endif } }; template struct LessSplicedScore : public ::std::binary_function < TReadMatch, TReadMatch, bool > { inline bool operator() (TReadMatch const &a, TReadMatch const &b) const { // read number if (a.rseqNo < b.rseqNo ) return true; if (a.rseqNo > b.rseqNo ) return false; //if ((a.rseqNo >> 1) < (b.rseqNo >> 1)) return true; //if ((a.rseqNo >> 1) > (b.rseqNo >> 1)) return false; // quality if (a.pairScore > b.pairScore) return true; if (a.pairScore < b.pairScore) return false; //CHECK: if (abs(a.mateDelta) < abs(b.mateDelta)) return true; //CHECK: if (abs(a.mateDelta) > abs(b.mateDelta)) return falsee; return a.pairId < b.pairId; } }; template struct LessSplicedScoreGPos : public ::std::binary_function < TReadMatch, TReadMatch, bool > { inline bool operator() (TReadMatch const &a, TReadMatch const &b) const { // read number if (a.rseqNo < b.rseqNo ) return true; if (a.rseqNo > b.rseqNo ) return false; //if ((a.rseqNo >> 1) < (b.rseqNo >> 1)) return true; //if ((a.rseqNo >> 1) > (b.rseqNo >> 1)) return false; // quality if (a.pairScore > b.pairScore) return true; if (a.pairScore < b.pairScore) return false; if (a.pairId < b.pairId) return true; if (a.pairId > b.pairId) return false; if (a.gBegin < b.gBegin) return true; if (a.gBegin > b.gBegin) return false; return a.editDist < b.editDist; } }; #ifdef RAZERS_DIRECT_MAQ_MAPPING struct QualityBasedScoring{}; template struct LessRNoMQ : public ::std::binary_function < TReadMatch, TReadMatch, bool > { inline bool operator() (TReadMatch const &a, TReadMatch const &b) const { // read number if (a.rseqNo < b.rseqNo) return true; if (a.rseqNo > b.rseqNo) return false; // quality if (a.mScore < b.mScore) return true; // sum of quality values of mismatches (the smaller the better) if (a.mScore > b.mScore) return false; return (a.editDist < b.editDist); // seedEditDist? // genome position and orientation /* if (a.gseqNo < b.gseqNo) return true; if (a.gseqNo > b.gseqNo) return false; if (a.gBegin < b.gBegin) return true; if (a.gBegin > b.gBegin) return false; if (a.orientation < b.orientation) return true; if (a.orientation > b.orientation) return false; */ } }; #endif ////////////////////////////////////////////////////////////////////////////// // Remove duplicate matches and leave at most maxHits many distanceRange // best matches per read template < typename TMatches > void maskDuplicates(TMatches &matches) { typedef typename Value::Type TMatch; typedef typename Iterator::Type TIterator; ////////////////////////////////////////////////////////////////////////////// // remove matches with equal ends ::std::sort( begin(matches, Standard()), end(matches, Standard()), LessRNoGEndPos()); typename TMatch::TGPos gBegin = -1; typename TMatch::TGPos gEnd = -1; unsigned gseqNo = -1; unsigned readNo = -1; char orientation = '-'; TIterator it = begin(matches, Standard()); TIterator itEnd = end(matches, Standard()); for (; it != itEnd; ++it) { #ifdef RAZERS_MATEPAIRS if ((*it).pairId != 0) continue; #endif if (gEnd == (*it).gEnd && orientation == (*it).orientation && gseqNo == (*it).gseqNo && readNo == (*it).rseqNo) { (*it).orientation = '-'; continue; } readNo = (*it).rseqNo; gseqNo = (*it).gseqNo; gEnd = (*it).gEnd; orientation = (*it).orientation; } ////////////////////////////////////////////////////////////////////////////// // remove matches with equal begins ::std::sort( begin(matches, Standard()), end(matches, Standard()), LessRNoGPos()); orientation = '-'; it = begin(matches, Standard()); itEnd = end(matches, Standard()); for (; it != itEnd; ++it) { if ((*it).orientation == '-' #ifdef RAZERS_MATEPAIRS || ((*it).pairId != 0) #endif ) continue; if (gBegin == (*it).gBegin && readNo == (*it).rseqNo && gseqNo == (*it).gseqNo && orientation == (*it).orientation) { (*it).orientation = '-'; continue; } readNo = (*it).rseqNo; gseqNo = (*it).gseqNo; gBegin = (*it).gBegin; orientation = (*it).orientation; } ::std::sort( begin(matches, Standard()), end(matches, Standard()), LessErrors()); } ////////////////////////////////////////////////////////////////////////////// // Count matches for each number of errors template < typename TMatches, typename TCounts > void countMatches(TMatches &matches, TCounts &cnt) { typedef typename Value::Type TMatch; typedef typename Iterator::Type TIterator; typedef typename Value::Type TRow; typedef typename Value::Type TValue; TIterator it = begin(matches, Standard()); TIterator itEnd = end(matches, Standard()); unsigned readNo = -1; short editDist = -1; __int64 count = 0; __int64 maxVal = MaxValue::VALUE; for (; it != itEnd; ++it) { if ((*it).orientation == '-') continue; if (readNo == (*it).rseqNo && editDist == (*it).editDist) ++count; else { if (readNo != (unsigned)-1 && (unsigned)editDist < length(cnt)) cnt[editDist][readNo] = (maxVal < count)? maxVal : count; readNo = (*it).rseqNo; editDist = (*it).editDist; count = 1; } } if (readNo != (unsigned)-1 && (unsigned)editDist < length(cnt)) cnt[editDist][readNo] = count; } ////////////////////////////////////////////////////////////////////////////// template < typename TReadNo, typename TMaxErrors > inline void setMaxErrors(Nothing &, TReadNo, TMaxErrors) { } template < typename TSwift, typename TReadNo, typename TMaxErrors > inline void setMaxErrors(TSwift &swift, TReadNo readNo, TMaxErrors maxErrors) { int minT = _qgramLemma(swift, readNo, maxErrors); if (minT > 1) { if (maxErrors < 0) minT = MaxValue::VALUE; // ::std::cout<<" read:"< void compactMatches(TMatches &matches, TCounts & #ifdef RAZERS_DIRECT_MAQ_MAPPING cnts #endif , RazerSOptions &options , bool compactFinal , TSwift & #if defined RAZERS_DIRECT_MAQ_MAPPING || defined RAZERS_MASK_READS swift #endif ) { // Get rid of "unused variable" warnings. This is hard to read // and should not be done anywhere else. Better not use ifdefs. (void)compactFinal; #ifdef RAZERS_DIRECT_MAQ_MAPPING if(options.maqMapping) compactMatches(matches, cnts,options,compactFinal,swift,true); #endif typedef typename Value::Type TMatch; typedef typename Iterator::Type TIterator; #ifdef RAZERS_MICRO_RNA if(options.microRNA) ::std::sort( begin(matches, Standard()), end(matches, Standard()), LessRNoEdistHLen()); int bestMScore = 0; #endif unsigned readNo = -1; unsigned hitCount = 0; unsigned hitCountCutOff = options.maxHits; __int64 disabled = 0; #ifdef RAZERS_MICRO_RNA if(options.microRNA && options.purgeAmbiguous) ++hitCountCutOff; // we keep one more match than we actually want, so we can later decide // whether the read mapped more than maxhits times #endif int editDistCutOff = MaxValue::VALUE; TIterator it = begin(matches, Standard()); TIterator itEnd = end(matches, Standard()); TIterator dit = it; TIterator ditBeg = it; for (; it != itEnd; ++it) { if ((*it).orientation == '-') continue; if (readNo == (*it).rseqNo #ifdef RAZERS_MATEPAIRS && (*it).pairId == 0 #endif ) { if ((*it).editDist >= editDistCutOff) continue; #ifdef RAZERS_MICRO_RNA if ( (*it).mScore < bestMScore) continue; #endif if (++hitCount >= hitCountCutOff) { #ifdef RAZERS_MASK_READS if (hitCount == hitCountCutOff) { // we have enough, now look for better matches int maxErrors = (*it).editDist - 1; if (options.purgeAmbiguous && (options.distanceRange == 0 || (*it).editDist < options.distanceRange)) maxErrors = -1; setMaxErrors(swift, readNo, maxErrors); if (maxErrors == -1) ++disabled; // ::std::cerr << "(read #" << readNo << " disabled)"; if (options.purgeAmbiguous) { if (options.distanceRange == 0 || (*it).editDist < options.distanceRange || compactFinal) dit = ditBeg; else { *dit = *it; ++dit; } } } #endif continue; } } else { readNo = (*it).rseqNo; hitCount = 0; if (options.distanceRange > 0) editDistCutOff = (*it).editDist + options.distanceRange; #ifdef RAZERS_MICRO_RNA bestMScore = (*it).mScore; #endif ditBeg = dit; } *dit = *it; ++dit; } if (options._debugLevel >= 2) { std::cerr << " [" << length(matches) - (dit - begin(matches, Standard())) << " matches removed]"; std::cerr << " [" << disabled << " reads disabled]"; } resize(matches, dit - begin(matches, Standard())); } #ifdef RAZERS_DIRECT_MAQ_MAPPING ////////////////////////////////////////////////////////////////////////////// // Remove low quality matches template < typename TMatches, typename TCounts, typename TSpec, typename TSwift > void compactMatches(TMatches &matches, TCounts &cnts, RazerSOptions &, bool, TSwift &, bool dontCountFirstTwo) { typedef typename Value::Type TMatch; typedef typename Iterator::Type TIterator; ::std::sort( begin(matches, Standard()), end(matches, Standard()), LessRNoMQ()); unsigned readNo = -1; TIterator it = begin(matches, Standard()); TIterator itEnd = end(matches, Standard()); TIterator dit = it; //number of errors may not exceed 31! bool second = true; for (; it != itEnd; ++it) { if ((*it).orientation == '-') continue; if (readNo == (*it).rseqNo) { //second best match if (second) { second = false; if((cnts[1][(*it).rseqNo] & 31) > (*it).editDist) { //this second best match is better than any second best match before cnts[1][(*it).rseqNo] = (*it).editDist; // second best dist is this editDist // count is 0 (will be updated if countFirstTwo) } if(!dontCountFirstTwo) if((cnts[1][(*it).rseqNo]>>5) != 2047) cnts[1][(*it).rseqNo] += 32; } else { if ((*it).editDist <= (cnts[0][(*it).rseqNo] & 31) ) if(cnts[0][(*it).rseqNo]>>5 != 2047) cnts[0][(*it).rseqNo] +=32; if ((*it).editDist <= (cnts[1][(*it).rseqNo] & 31) ) if((cnts[1][(*it).rseqNo]>>5) != 2047) cnts[1][(*it).rseqNo] +=32; continue; } } else { //best match second = true; readNo = (*it).rseqNo; //cnts has 16bits, 11:5 for count:dist if((cnts[0][(*it).rseqNo] & 31) > (*it).editDist) { //this match is better than any match before cnts[1][(*it).rseqNo] = cnts[0][(*it).rseqNo]; // best before is now second best // (count will be updated when match is removed) cnts[0][(*it).rseqNo] = (*it).editDist; // best dist is this editDist // count is 0 (will be updated if countFirstTwo) } if(!dontCountFirstTwo) if((cnts[0][(*it).rseqNo]>>5) != 2047) cnts[0][(*it).rseqNo] += 32; // shift 5 to the right, add 1, shift 5 to the left, and keep count } *dit = *it; ++dit; } resize(matches, dit - begin(matches, Standard())); } #endif #ifdef RAZERS_MICRO_RNA template < typename TMatches, typename TSpec > void purgeAmbiguousRnaMatches(TMatches &matches, RazerSOptions &options) { typedef typename Value::Type TMatch; typedef typename Iterator::Type TIterator; ::std::sort(begin(matches, Standard()),end(matches, Standard()),LessRNoEdistHLen()); int bestMScore = 0; unsigned readNo = -1; unsigned hitCount = 0; unsigned hitCountCutOff = options.maxHits; int editDistCutOff = MaxValue::VALUE; TIterator it = begin(matches, Standard()); TIterator itEnd = end(matches, Standard()); TIterator dit = it; TIterator ditBeg = it; for (; it != itEnd; ++it) { if ((*it).orientation == '-') continue; if (readNo == (*it).rseqNo) { if ((*it).editDist >= editDistCutOff) continue; if ( (*it).mScore < bestMScore) continue; if (++hitCount >= hitCountCutOff) { if (hitCount == hitCountCutOff) dit = ditBeg; continue; } } else { readNo = (*it).rseqNo; hitCount = 0; if (options.distanceRange > 0) editDistCutOff = (*it).editDist + options.distanceRange; bestMScore = (*it).mScore; ditBeg = dit; } *dit = *it; ++dit; } resize(matches, dit - begin(matches, Standard())); } ////////////////////////////////////////////////////////////////////////////// // Hamming verification recording sum of mismatching qualities in m.mScore template < typename TMatch, typename TGenome, typename TReadSet, typename TSpec > inline bool matchVerify( TMatch &m, // resulting match Segment inf, // potential match genome region unsigned rseqNo, // read number TReadSet &readSet, // reads RazerSOptions const &options, // RazerS options MicroRNA) // MaqMapping { typedef Segment TGenomeInfix; typedef typename Value::Type const TRead; typedef typename Iterator::Type TGenomeIterator; typedef typename Iterator::Type TReadIterator; unsigned ndlLength = sequenceLength(rseqNo, readSet); if (length(inf) < ndlLength) return false; // verify TRead &read = readSet[rseqNo]; TReadIterator ritBeg = begin(read, Standard()); TReadIterator ritEnd = end(read, Standard()); TGenomeIterator git = begin(inf, Standard()); TGenomeIterator gitEnd = end(inf, Standard()) - (ndlLength - 1); // this is max number of errors the seed should have unsigned maxErrorsSeed = (unsigned)(options.rnaSeedLength * options.errorRate); unsigned minSeedErrors = maxErrorsSeed + 1; unsigned bestHitLength = 0; for (; git < gitEnd; ++git) { bool hit = true; unsigned hitLength = 0; unsigned count = 0; unsigned seedErrors = 0; TGenomeIterator g = git; //maq would count errors in the first 28bp only (at least initially. not for output) for(TReadIterator r = ritBeg; r != ritEnd; ++r, ++g) { if ((options.compMask[ordValue(*g)] & options.compMask[ordValue(*r)]) == 0) { if (count < options.rnaSeedLength) // the maq (28bp-)seed { if(++seedErrors > maxErrorsSeed) { hit = false; break; } } else break; } ++count; } if (hit) hitLength = count; if (hitLength > bestHitLength ) //simply take the longest hit { minSeedErrors = seedErrors; bestHitLength = hitLength; m.gBegin = git - begin(host(inf), Standard()); } } // std::cout << "options.absMaxQualSumErrors" << options.absMaxQualSumErrors << std::endl; // std::cout << "maxSeedErrors" << maxErrorsSeed << std::endl; // std::cout << "minErrors" << minSeedErrors << std::endl; // if(derBesgte) ::std::cout << minErrors <<"minErrors\n"; if (minSeedErrors > maxErrorsSeed) return false; m.gEnd = m.gBegin + bestHitLength; m.editDist = minSeedErrors; // errors in seed or total number of errors? m.mScore = bestHitLength; m.seedEditDist = minSeedErrors; return true; } #endif ////////////////////////////////////////////////////////////////////////////// // Hamming verification template < typename TMatch, typename TGenome, typename TReadSet, typename TMyersPatterns, typename TSpec > inline bool matchVerify( TMatch &m, // resulting match Segment inf, // potential match genome region unsigned rseqNo, // read number TReadSet &readSet, // reads TMyersPatterns const &, // MyersBitVector preprocessing data RazerSOptions const &options, // RazerS options SwiftSemiGlobalHamming) // Hamming only { #ifdef RAZERS_DIRECT_MAQ_MAPPING if(options.maqMapping) return matchVerify(m,inf,rseqNo,readSet,options,QualityBasedScoring()); #endif #ifdef RAZERS_MICRO_RNA if(options.microRNA) return matchVerify(m,inf,rseqNo,readSet,options,MicroRNA()); #endif typedef Segment TGenomeInfix; typedef typename Value::Type const TRead; typedef typename Iterator::Type TGenomeIterator; typedef typename Iterator::Type TReadIterator; #ifdef RAZERS_DEBUG ::std::cout<<"Verify: "<<::std::endl; ::std::cout<<"Genome: "< maxErrors) break; if (minErrors > errors) { minErrors = errors; m.gBegin = git - begin(host(inf), Standard()); } } if (minErrors > maxErrors) return false; m.gEnd = m.gBegin + ndlLength; m.editDist = minErrors; return true; } ////////////////////////////////////////////////////////////////////////////// // Edit distance verification template < typename TMatch, typename TGenome, typename TReadSet, typename TMyersPatterns, typename TSpec > inline bool matchVerify( TMatch &m, // resulting match Segment inf, // potential match genome region unsigned rseqNo, // read number TReadSet &readSet, // reads TMyersPatterns &forwardPatterns, // MyersBitVector preprocessing data RazerSOptions const &options, // RazerS options SwiftSemiGlobal) // Swift specialization { typedef Segment TGenomeInfix; typedef typename Value::Type TRead; // find read match end typedef Finder TMyersFinder; typedef typename Value::Type TMyersPattern; // find read match begin typedef ModifiedString TGenomeInfixRev; typedef ModifiedString TReadRev; typedef Finder TMyersFinderRev; typedef Pattern TMyersPatternRev; TMyersFinder myersFinder(inf); TMyersPattern &myersPattern = forwardPatterns[rseqNo]; #ifdef RAZERS_DEBUG ::std::cout<<"Verify: "<<::std::endl; ::std::cout<<"Genome: "<::VALUE; int minScore = -(int)(ndlLength * options.errorRate); TMyersFinder maxPos; // find end of best semi-global alignment while (find(myersFinder, myersPattern, minScore)) if (maxScore <= getScore(myersPattern)) { maxScore = getScore(myersPattern); maxPos = myersFinder; } if (maxScore < minScore) return false; m.editDist = (unsigned)-maxScore; setEndPosition(inf, m.gEnd = (beginPosition(inf) + position(maxPos) + 1)); // limit the beginning to needle length plus errors (== -maxScore) if (length(inf) > ndlLength - maxScore) setBeginPosition(inf, endPosition(inf) - ndlLength + maxScore); // find beginning of best semi-global alignment TGenomeInfixRev infRev(inf); TReadRev readRev(readSet[rseqNo]); TMyersFinderRev myersFinderRev(infRev); TMyersPatternRev myersPatternRev(readRev); _patternMatchNOfPattern(myersPatternRev, options.matchN); _patternMatchNOfFinder(myersPatternRev, options.matchN); while (find(myersFinderRev, myersPatternRev, maxScore)) m.gBegin = m.gEnd - (position(myersFinderRev) + 1); return true; } #ifdef RAZERS_DIRECT_MAQ_MAPPING ////////////////////////////////////////////////////////////////////////////// // Hamming verification recording sum of mismatching qualities in m.mScore template < typename TMatch, typename TGenome, typename TReadSet, typename TSpec > inline bool matchVerify( TMatch &m, // resulting match Segment inf, // potential match genome region unsigned rseqNo, // read number TReadSet &readSet, // reads RazerSOptions const &options, // RazerS options QualityBasedScoring) // MaqMapping { typedef Segment TGenomeInfix; typedef typename Value::Type const TRead; typedef typename Iterator::Type TGenomeIterator; typedef typename Iterator::Type TReadIterator; // #ifdef RAZERS_DEBUG // cout<<"Verify: "<<::std::endl; // cout<<"Genome: "< maxErrorsTotal) { hit = false; break; } //int qualityValue = (int)((unsigned char)*r >> 3); //qualSumErrors += (qualityValue < options.mutationRateQual) ? qualityValue : options.mutationRateQual; qualSumErrors += (getQualityValue(*r) < options.mutationRateQual) ? getQualityValue(*r) : options.mutationRateQual; if(qualSumErrors > options.absMaxQualSumErrors || qualSumErrors > minQualSumErrors) { hit = false; break; } if (count < options.artSeedLength) // the maq (28bp-)seed { if(++seedErrors > maxErrorsSeed) { hit = false; break; } if(qualSumErrors > options.maxMismatchQualSum) { hit = false; break; }// discard match, if 'seed' is bad (later calculations are done with the quality sum over the whole read) } } ++count; } if (hit && (qualSumErrors < minQualSumErrors /*&& seedErrors <=maxErrorsSeed*/) ) //oder (seedErrors < minSeedErrors) { minErrors = errors; minSeedErrors = seedErrors; minQualSumErrors = qualSumErrors; m.gBegin = git - begin(host(inf), Standard()); bestIt = git; } } if (minQualSumErrors > options.absMaxQualSumErrors || minSeedErrors > maxErrorsSeed || minErrors > maxErrorsTotal) return false; m.gEnd = m.gBegin + ndlLength; m.editDist = minErrors; // errors in seed or total number of errors? m.mScore = minQualSumErrors; m.seedEditDist = minSeedErrors; return true; } ////////////////////////////////////////////////////////////////////////////// // Edit distance verification template < typename TMatch, typename TGenome, typename TReadSet, typename TMyersPatterns, typename TSpec > inline bool matchVerify( TMatch &m, // resulting match Segment inf, // potential match genome region unsigned rseqNo, // read number TReadSet &readSet, // reads TMyersPatterns const & pat, // MyersBitVector preprocessing data RazerSOptions const &options, // RazerS options SwiftSemiGlobal const &swiftsemi, // Hamming only QualityBasedScoring) // Swift specialization { //if(!options.maqMapping) return matchVerify(m,inf,rseqNo,readSet,pat,options,swiftsemi); //else // return matchVerify(m,inf,rseqNo,readSet,pat,options,swiftsemi); // todo! } #endif #ifndef RAZERS_PARALLEL ////////////////////////////////////////////////////////////////////////////// // Find read matches in one genome sequence template < typename TMatches, typename TGenome, typename TReadIndex, typename TSwiftSpec, typename TVerifier, typename TCounts, typename TSpec > void mapSingleReads( TMatches &matches, // resulting matches TGenome &genome, // genome ... unsigned gseqNo, // ... and its sequence number Pattern > &swiftPattern, TVerifier &forwardPatterns, TCounts & cnts, char orientation, // q-gram index of reads RazerSOptions &options) { typedef typename Fibre::Type TReadSet; typedef typename Size::Type TSize; typedef typename Value::Type TMatch; // FILTRATION typedef Finder > TSwiftFinder; typedef Pattern > TSwiftPattern; // iterate all genomic sequences if (options._debugLevel >= 1) { ::std::cerr << ::std::endl << "Process genome seq #" << gseqNo; if (orientation == 'F') ::std::cerr << "[fwd]"; else ::std::cerr << "[rev]"; } TReadSet &readSet = host(host(swiftPattern)); TSwiftFinder swiftFinder(genome, options.repeatLength, 1); TMatch m = { // to supress uninitialized warnings 0, 0, 0, 0, #ifdef RAZERS_MATEPAIRS 0, 0, 0, #endif 0, #ifdef RAZERS_EXTENDED_MATCH 0, 0, #endif #ifdef RAZERS_SPLICED 0, 0, #endif 0 }; SEQAN_PROTIMESTART(map_contig_time); // iterate all verification regions returned by SWIFT TSize gLength = length(genome); __int64 localTP = 0; __int64 localFP = 0; #ifdef RAZERS_MICRO_RNA while (find(swiftFinder, swiftPattern, 0.2)) #else while (find(swiftFinder, swiftPattern, options.errorRate)) #endif { unsigned rseqNo = (*swiftFinder.curHit).ndlSeqNo; if (!options.spec.DONT_VERIFY && matchVerify(m, infix(swiftFinder), rseqNo, readSet, forwardPatterns, options, TSwiftSpec())) { // transform coordinates to the forward strand if (orientation == 'R') { TSize temp = m.gBegin; m.gBegin = gLength - m.gEnd; m.gEnd = gLength - temp; } m.gseqNo = gseqNo; m.rseqNo = rseqNo; m.orientation = orientation; #ifdef RAZERS_MATEPAIRS m.pairId = 0; m.pairScore = 0 - m.editDist; #endif if (!options.spec.DONT_DUMP_RESULTS) { appendValue(matches, m, Generous()); if (length(matches) > options.compactThresh) { #ifndef RAZERS_MICRO_RNA typename Size::Type oldSize = length(matches); #endif maskDuplicates(matches); // overlapping parallelograms cause duplicates #ifdef RAZERS_DIRECT_MAQ_MAPPING if(options.maqMapping) compactMatches(matches, cnts, options, false, swiftPattern, true); else #endif compactMatches(matches, cnts, options, false, swiftPattern); #ifdef RAZERS_MICRO_RNA options.compactThresh = 2 * length(matches); #else if (length(matches) * 4 > oldSize) // the threshold should not be raised options.compactThresh += (options.compactThresh >> 1); // if too many matches were removed #endif } } ++localTP; // ::std::cerr << "\"" << infix(swiftFinder) << "\" "; // ::std::cerr << hstkPos << " + "; // ::std::cerr << ::std::endl; } else { ++localFP; // ::std::cerr << "\"" << infx(swiftFinder) << "\" \"" << infix(swiftPattern) << "\" "; // ::std::cerr << rseqNo << " : "; // ::std::cerr << hstkPos << " + "; // ::std::cerr << bucketWidth << " " << TP << ::std::endl; } } options.TP += localTP; options.FP += localFP; if (options._debugLevel >= 2) { double spec = 100; double time = SEQAN_PROTIMEDIFF(map_contig_time); if (localFP+localTP != 0) spec = (1000*localTP/(localFP+localTP))/10.0; ::std::cerr << " [" << (int)((gLength / time)/100.0)/10.0 << "kbp/s " << time << "s] [" << spec << "%SP " << localFP+localTP << "V]"; } } #endif #ifdef RAZERS_MICRO_RNA // multiple sequences template < typename TSA, typename TStringSet, typename TShape, typename TDir, typename TValue, typename TWithConstraints > inline void _qgramFillSuffixArray( TSA &sa, TStringSet &stringSet, TShape &shape, TDir &dir, Nothing, TWithConstraints const, TValue prefixLen, MicroRNA) { SEQAN_CHECKPOINT typedef typename Value::Type TString; typedef typename Iterator::Type TIterator; typedef typename Value::Type TSize; typedef typename Value::Type THash; for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) { TString const &sequence = value(stringSet, seqNo); if (length(sequence) < length(shape)) continue; int num_qgrams = prefixLen - (int)length(shape) + 1; typename Value::Type localPos; assignValueI1(localPos, seqNo); assignValueI2(localPos, 0); TIterator itText = begin(sequence, Standard()); if (TWithConstraints::VALUE) { THash h = hash(shape, itText) + 1; // first hash if (dir[h] != (TSize)-1) sa[dir[h]++] = localPos; // if bucket is enabled } else sa[dir[hash(shape, itText) + 1]++] = localPos; // first hash for(int i = 1; i < num_qgrams; ++i) { ++itText; assignValueI2(localPos, i); if (TWithConstraints::VALUE) { THash h = hashNext(shape, itText) + 1; // next hash if (dir[h] != (TSize)-1) sa[dir[h]++] = localPos; // if bucket is enabled } else sa[dir[hashNext(shape, itText) + 1]++] = localPos; // next hash } } } template < typename TDir, typename TStringSet, typename TShape, typename TValue > inline void _qgramCountQGrams(TDir &dir, TStringSet &stringSet, TShape &shape, TValue prefixLen, MicroRNA) { SEQAN_CHECKPOINT typedef typename Value::Type TString; typedef typename Iterator::Type TIterator; typedef typename Value::Type TSize; for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) { TString const &sequence = value(stringSet, seqNo); if (length(sequence) < length(shape)) continue; TSize num_qgrams = prefixLen - length(shape) + 1; TIterator itText = begin(sequence, Standard()); ++dir[hash(shape, itText)]; for(TSize i = 1; i < num_qgrams; ++i) { ++itText; ++dir[hashNext(shape, itText)]; } } } template < typename TIndex, typename TValue> void createQGramIndex(TIndex &index, TValue prefixLen, MicroRNA) { SEQAN_CHECKPOINT typename Fibre::Type &text = indexText(index); typename Fibre::Type &sa = indexSA(index); typename Fibre::Type &dir = indexDir(index); typename Fibre::Type &shape = indexShape(index); Nothing nothing; // 1. clear counters arrayFill(begin(dir, Standard()), end(dir, Standard()), 0); // 2. count q-grams _qgramCountQGrams(dir, text, shape, prefixLen,MicroRNA()); if (_qgramDisableBuckets(index)) { // 3. cumulative sum _qgramCummulativeSum(dir, True()); // 4. fill suffix array _qgramFillSuffixArray(sa, text, shape, dir, nothing, True(), prefixLen, MicroRNA()); // 5. correct disabled buckets _qgramPostprocessBuckets(dir); } else { // 3. cumulative sum _qgramCummulativeSum(dir, False()); // 4. fill suffix array _qgramFillSuffixArray(sa, text, shape, dir, nothing, False(),prefixLen, MicroRNA()); } } #endif ////////////////////////////////////////////////////////////////////////////// // Find read matches in many genome sequences (import from Fasta) template < typename TMatches, typename TReadSet, typename TCounts, typename TSpec, typename TShape, typename TSwiftSpec > int mapSingleReads( TMatches & matches, StringSet & genomeFileNameList, StringSet & genomeNames, // genome names, taken from the Fasta file ::std::map > & gnoToFileMap, TReadSet const & readSet, TCounts & cnts, RazerSOptions & options, TShape const & shape, Swift const) { typedef typename Value::Type TRead; typedef Index > TIndex; // q-gram index typedef Pattern > TSwiftPattern; // filter typedef Pattern TMyersPattern; // verifier /* // try opening each genome file once before running the whole mapping procedure int filecount = 0; int numFiles = length(genomeFileNameList); while(filecount < numFiles) { ::std::ifstream file; file.open(toCString(genomeFileNameList[filecount]), ::std::ios_base::in | ::std::ios_base::binary); if (!file.is_open()) return RAZERS_GENOME_FAILED; file.close(); ++filecount; } */ // configure q-gram index TIndex swiftIndex(readSet, shape); #ifdef RAZERS_OPENADDRESSING swiftIndex.alpha = 2; #endif cargo(swiftIndex).abundanceCut = options.abundanceCut; cargo(swiftIndex)._debugLevel = options._debugLevel; // configure Swift TSwiftPattern swiftPattern(swiftIndex); swiftPattern.params.minThreshold = options.threshold; swiftPattern.params.tabooLength = options.tabooLength; swiftPattern.params.printDots = options._debugLevel > 0; // init edit distance verifiers unsigned readCount = countSequences(swiftIndex); String forwardPatterns; options.compMask[4] = (options.matchN)? 15: 0; if (!options.hammingOnly) { resize(forwardPatterns, readCount, Exact()); for(unsigned i = 0; i < readCount; ++i) { setHost(forwardPatterns[i], indexText(swiftIndex)[i]); _patternMatchNOfPattern(forwardPatterns[i], options.matchN); _patternMatchNOfFinder(forwardPatterns[i], options.matchN); } } #ifdef RAZERS_MICRO_RNA typename Size::Type qgram_count = 0; if(options.microRNA) { for(unsigned i = 0; i < countSequences(swiftIndex); ++i) if (sequenceLength(i, swiftIndex) >= options.rnaSeedLength) qgram_count += options.rnaSeedLength - (length(shape) - 1); resize(indexSA(swiftIndex), qgram_count, Exact()); resize(indexDir(swiftIndex), _fullDirLength(swiftIndex), Exact()); createQGramIndex(swiftIndex,options.rnaSeedLength,MicroRNA()); } #endif #ifdef RAZERS_DIRECT_MAQ_MAPPING if(options.maqMapping) { resize(cnts, 2); for (unsigned i = 0; i < length(cnts); ++i) resize(cnts[i], readCount, 31); //initialize with maxeditDist, 11:5 for count:dist } #endif // clear stats options.FP = 0; options.TP = 0; options.timeMapReads = 0; options.timeDumpResults = 0; unsigned filecount = 0; unsigned numFiles = length(genomeFileNameList); unsigned gseqNo = 0; // open genome files, one by one while (filecount < numFiles) { // open genome file ::std::ifstream file; file.open(toCString(genomeFileNameList[filecount]), ::std::ios_base::in | ::std::ios_base::binary); if (!file.is_open()) return RAZERS_GENOME_FAILED; // remove the directory prefix of current genome file ::std::string genomeFile(toCString(genomeFileNameList[filecount])); size_t lastPos = genomeFile.find_last_of('/') + 1; if (lastPos == genomeFile.npos) lastPos = genomeFile.find_last_of('\\') + 1; if (lastPos == genomeFile.npos) lastPos = 0; ::std::string genomeName = genomeFile.substr(lastPos); CharString id; Dna5String genome; unsigned gseqNoWithinFile = 0; // iterate over genome sequences SEQAN_PROTIMESTART(find_time); for(; !_streamEOF(file); ++gseqNo) { if (options.genomeNaming == 0) { //readID(file, id, Fasta()); // read Fasta id readShortID(file, id, Fasta()); // read Fasta id up to first whitespace appendValue(genomeNames, id, Generous()); } read(file, genome, Fasta()); // read Fasta sequence gnoToFileMap.insert(::std::make_pair >(gseqNo,::std::make_pair< ::std::string,unsigned>(genomeName,gseqNoWithinFile))); if (options.forward) mapSingleReads(matches, genome, gseqNo, swiftPattern, forwardPatterns, cnts, 'F', options); if (options.reverse) { reverseComplement(genome); mapSingleReads(matches, genome, gseqNo, swiftPattern, forwardPatterns, cnts, 'R', options); } ++gseqNoWithinFile; } options.timeMapReads += SEQAN_PROTIMEDIFF(find_time); file.close(); ++filecount; } if (options._debugLevel >= 1) ::std::cerr << ::std::endl << "Finding reads took \t" << options.timeMapReads << " seconds" << ::std::endl; if (options._debugLevel >= 2) { ::std::cerr << ::std::endl; ::std::cerr << "___FILTRATION_STATS____" << ::std::endl; ::std::cerr << "Swift FP: " << options.FP << ::std::endl; ::std::cerr << "Swift TP: " << options.TP << ::std::endl; } return 0; } ////////////////////////////////////////////////////////////////////////////// // Find read matches in many genome sequences (given as StringSet) template < typename TMatches, typename TGenomeSet, typename TReadSet, typename TCounts, typename TSpec, typename TShape, typename TSwiftSpec > int mapSingleReads( TMatches & matches, TGenomeSet & genomeSet, TReadSet const & readSet, TCounts & cnts, RazerSOptions & options, TShape const & shape, Swift const) { typedef typename Value::Type TRead; typedef Index > TIndex; // q-gram index typedef Pattern > TSwiftPattern; // filter typedef Pattern TMyersPattern; // verifier // configure q-gram index TIndex swiftIndex(readSet, shape); cargo(swiftIndex).abundanceCut = options.abundanceCut; cargo(swiftIndex)._debugLevel = options._debugLevel; // configure Swift TSwiftPattern swiftPattern(swiftIndex); swiftPattern.params.minThreshold = options.threshold; swiftPattern.params.tabooLength = options.tabooLength; // init edit distance verifiers String forwardPatterns; options.compMask[4] = (options.matchN)? 15: 0; if (!options.hammingOnly) { unsigned readCount = countSequences(swiftIndex); resize(forwardPatterns, readCount, Exact()); for(unsigned i = 0; i < readCount; ++i) { setHost(forwardPatterns[i], indexText(swiftIndex)[i]); _patternMatchNOfPattern(forwardPatterns[i], options.matchN); _patternMatchNOfFinder(forwardPatterns[i], options.matchN); } } // clear stats options.FP = 0; options.TP = 0; options.timeMapReads = 0; options.timeDumpResults = 0; CharString id; #ifdef RAZERS_DIRECT_MAQ_MAPPING if(options.maqMapping) { resize(cnts, 2); for (unsigned i = 0; i < length(cnts); ++i) resize(cnts[i], length(readSet), 31); } #endif // iterate over genome sequences SEQAN_PROTIMESTART(find_time); for(unsigned gseqNo = 0; gseqNo < length(genomeSet); ++gseqNo) { if (options.forward) mapSingleReads(matches, genomeSet[gseqNo], gseqNo, swiftPattern, forwardPatterns, cnts, 'F', options); if (options.reverse) { reverseComplement(genomeSet[gseqNo]); mapSingleReads(matches, genomeSet[gseqNo], gseqNo, swiftPattern, forwardPatterns, cnts, 'R', options); reverseComplement(genomeSet[gseqNo]); } } options.timeMapReads += SEQAN_PROTIMEDIFF(find_time); if (options._debugLevel >= 1) ::std::cerr << ::std::endl << "Finding reads took \t" << options.timeMapReads << " seconds" << ::std::endl; if (options._debugLevel >= 2) { ::std::cerr << ::std::endl; ::std::cerr << "___FILTRATION_STATS____" << ::std::endl; ::std::cerr << "Swift FP: " << options.FP << ::std::endl; ::std::cerr << "Swift TP: " << options.TP << ::std::endl; } return 0; } ////////////////////////////////////////////////////////////////////////////// // Wrapper for single/mate-pair mapping template < typename TMatches, typename TReadSet, typename TCounts, typename TSpec, typename TShape, typename TSwiftSpec > int mapReads( TMatches & matches, StringSet & genomeFileNameList, StringSet & genomeNames, // genome names, taken from the Fasta file ::std::map > & gnoToFileMap, TReadSet & readSet, TCounts & cnts, RazerSOptions & options, TShape const & shape, Swift const) { #ifdef RAZERS_MATEPAIRS if (options.libraryLength >= 0) return mapMatePairReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, shape, Swift()); else #endif return mapSingleReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, shape, Swift()); } template < typename TMatches, typename TGenomeSet, typename TReadSet, typename TCounts, typename TSpec, typename TShape, typename TSwiftSpec > int mapReads( TMatches & matches, TGenomeSet & genomeSet, TReadSet & readSet, TCounts & cnts, RazerSOptions & options, TShape const & shape, Swift const) { #ifdef RAZERS_MATEPAIRS if (options.libraryLength >= 0) return mapMatePairReads(matches, genomeSet, readSet, cnts, options, shape, Swift()); else #endif return mapSingleReads(matches, genomeSet, readSet, cnts, options, shape, Swift()); } ////////////////////////////////////////////////////////////////////////////// // Wrapper for different template specializations template int mapReads( TMatches & matches, StringSet & genomeFileNameList, StringSet & genomeNames, // genome names, taken from the Fasta file ::std::map > & gnoToFileMap, TReadSet & readSet, TReadRegions & readRegions, TCounts & cnts, RazerSOptions & options) { // first check whether split mapping (--> two shapes) or normal mapping #ifdef RAZERS_SPLICED if (options.minMatchLen > 0) return mapSplicedReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, readRegions, cnts, options); #else (void)readRegions; #endif Shape ungapped; Shape onegapped; Shape gapped; // 2x3 SPECIALIZATION if (options.hammingOnly) { // select best-fitting shape if (stringToShape(ungapped, options.shape)) return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, ungapped, Swift()); if (stringToShape(onegapped, options.shape)) return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, onegapped, Swift()); if (stringToShape(gapped, options.shape)) return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, gapped, Swift()); } else { if (stringToShape(ungapped, options.shape)) return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, ungapped, Swift()); if (stringToShape(onegapped, options.shape)) return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, onegapped, Swift()); if (stringToShape(gapped, options.shape)) return mapReads(matches, genomeFileNameList, genomeNames, gnoToFileMap, readSet, cnts, options, gapped, Swift()); } return RAZERS_INVALID_SHAPE; } template int mapReads( TMatches & matches, TGenomeSet & genomeSet, TReadSet & readSet, TReadRegions & #ifdef RAZERS_SPLICED readRegions #endif , TCounts & cnts, RazerSOptions & options) { // first check whether split mapping (--> two shapes) or normal mapping #ifdef RAZERS_SPLICED if (options.minMatchLen > 0) return mapSplicedReads(matches, genomeSet, readSet, readRegions, cnts, options); #endif Shape ungapped; Shape onegapped; Shape gapped; // 2x3 SPECIALIZATION if (options.hammingOnly) { // select best-fitting shape if (stringToShape(ungapped, options.shape)) return mapReads(matches, genomeSet, readSet, cnts, options, ungapped, Swift()); if (stringToShape(onegapped, options.shape)) return mapReads(matches, genomeSet, readSet, cnts, options, onegapped, Swift()); if (stringToShape(gapped, options.shape)) return mapReads(matches, genomeSet, readSet, cnts, options, gapped, Swift()); } else { if (stringToShape(ungapped, options.shape)) return mapReads(matches, genomeSet, readSet, cnts, options, ungapped, Swift()); if (stringToShape(onegapped, options.shape)) return mapReads(matches, genomeSet, readSet, cnts, options, onegapped, Swift()); if (stringToShape(gapped, options.shape)) return mapReads(matches, genomeSet, readSet, cnts, options, gapped, Swift()); } return RAZERS_INVALID_SHAPE; } } #endif