// ========================================================================== // SeqAn - The Library for Sequence Analysis // ========================================================================== // Copyright (c) 2006-2010, Knut Reinert, FU Berlin // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of Knut Reinert or the FU Berlin nor the names of // its contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH // DAMAGE. // // ========================================================================== #ifndef SEQAN_HEADER_EM_ALGORITHM_H #define SEQAN_HEADER_EM_ALGORITHM_H namespace SEQAN_NAMESPACE_MAIN { ////////////////////////////////////////////////////////////////////////////// // Functions ////////////////////////////////////////////////////////////////////////////// /** .Function.em: ..summary:Represents the EM algorithm as used by MEME. ..cat:Motif Search ..signature:em(profile,dataset_start,t,l,oops_model) ..signature:em(profile,dataset_start,t,l,gamma,zoops_model) ..signature:em(profile,dataset_start,t,l,lambda,tcm_model) ..param.profile:A StringSet of @Class.FrequencyDistribution|frequency distributions@. ...type:Class.StringSet ..param.dataset_start:An iterator pointing to the first input sequence of a given dataset. ...type:Concept.RandomAccessIteratorConcept ..param.t:The number of input sequences. ..param.l:The size of the motif. ..param.oops_model:The oops_model object. ...type:Tag.Oops ..param.zoops_model:The zoops_model object. ...type:Tag.Zoops ..param.tcm_model:The tcm_model object. ...type:Tag.Tcm ..param.gamma:The probability of sequence having a motif occurence. ..param.lambda:The probability of starting a motif occurence ...remarks:$lambda$ is calculated by dividing $gamma$ by the length of the corresponding sequence. ..remarks:This version of EM is used in the MEME program of Bailey and Elkan. It is a Bayesian variant of the basic EM which allows multiple occurrences of a motif in any sequence and can therefore be performed on sequences of one of the model types @Tag.Oops@, @Tag.Zoops@ and @Tag.Tcm@. We use the EM algorithm of MEME for the refinement step of PROJECTION. ..include:seqan/find_motif.h */ ////////////////////////////////////////////////////////////////////////////// // Oops model ////////////////////////////////////////////////////////////////////////////// template double em(TProfile & profile, TIter dataset_start, TType const & t, TType const & l, Oops const & oops) { // matrix w - allocate space (memory) TType row_size = t; double ** matrix_w = new double*[row_size]; for(TType pos=0; pos ..param.dataset_start:n iterator pointing to the first input sequence of a given dataset. ..param.t:The number of input sequences. ..param.l:The size of the motif. ..param.oops_model:The oops_model object. ...type:Tag.Oops ..remarks:The joint log likelihood is not computed in the M-step as MEME does, but rather in the E-step of the algorithm. ..include:seqan/find_motif.h */ template void _computeEStep(TMatrix & matrix_w, double & joint_log_likelihood, TProfile & profile, TIter dataset_start, //start iterator of dataset TType const & t, TType const & l, Oops const & /*oops*/) { typedef typename Value::Type TFrequencyDist; typedef typename Value::Type TValue; typedef String TString; typedef typename Position::Type TPos; for(TPos pos=0; pos::Type seq_iter, seq_end; seq_iter = begin(*dataset_start); seq_end = seq_iter+m; while(seq_iter!=seq_end) { TFrequencyDist fd = absLetterFrequencies; double sum_of_log_probs = 0; for(TType h=0; hDBL_MIN_EXP ) { log_of_sums += log(1+exp(matrix_w[seq_nr][j]-log_of_sums)); } } for(j=0; j ..param.dataset_start:n iterator pointing to the first input sequence of a given dataset. ..param.matrix_w:The matrix_w object. ...remarks:w(i,j) is the probability that the motif pattern starts at position j in the i-th input sequence. ..param.t:The number of input sequences. ..param.l:The size of the motif. ..param.oops_model:The oops_model object. ...type:Tag.Oops ..remarks:The joint log likelihood is computed in the E-step of the algorithm. ..include:seqan/find_motif.h */ template void _computeMStep(TProfile & profile, TIter dataset_start, TMatrix const & matrix_w, TType const & t, TType const & l, Oops const & /*oops*/) { typedef typename Value::Type TFrequencyDist; typedef typename Value::Type TValue; typedef typename Position::Type TPos; TPos k = 0; typedef String TString; // total_counts_of_letters is used for the computation of the background frequency TFrequencyDist total_counts_of_letters; //<-c absFreqOfLettersInSetOfSeqs(total_counts_of_letters,dataset_start, dataset_start+t); // for(k=0; k::Type seq_iter, seq_end, ptr; seq_iter = begin(*dataset_start); seq_end = seq_iter+m; while(seq_iter!=seq_end) { ptr = seq_iter; for(TType i=0; i(epsilon)); // matrix w - deallocate space (memory) for(TType pos=0; pos double em(TProfile & profile, TIter dataset_start, TType const & t, TType const & l, double & gamma, Zoops const & zoops) { // matrix w - allocate space (memory) TType row_size = t; double ** matrix_w = new double*[row_size]; for(TType pos=0; pos ..param.dataset_start:n iterator pointing to the first input sequence of a given dataset. ..param.gamma:The probability of sequence having a motif occurence. ...type:$double$ ..param.t:The number of input sequences. ..param.l:The size of the motif. ..param.zoops_model:The oops_model object. ...type:Tag.Zoops ..remarks:The joint log likelihood is not computed in the M-step as MEME does, but rather in the E-step of the algorithm. ..remarks:The parameter gamma is reestimated during the E-step to save calculation time. ..include:seqan/find_motif.h */ template void _computeEStep(TMatrix & matrix_w, double & joint_log_likelihood, TProfile & profile, TIter dataset_start, double & gamma, TType const & t, TType const & l, Zoops const & /*zoops*/) { typedef typename Value::Type TFrequencyDist; typedef typename Value::Type TValue; typedef String TString; typedef typename Position::Type TPos; for(TPos pos=0; pos::Type seq_iter, seq_end; seq_iter = begin(*dataset_start); seq_end = seq_iter+m; while(seq_iter!=seq_end) { TFrequencyDist fd = absLetterFrequencies; double sum_of_log_probs = 0; for(TType h=0; hDBL_MIN_EXP ) { log_of_sums += log(1+exp(matrix_w[seq_nr][j]-log_of_sums)); } } for(j=0; jDBL_MIN_EXP ) { matrix_w[seq_nr][j] -= (log_probs_of_motifless_sequences[seq_nr]+log(1-gamma)+log(1+exp(exponent))); } else { matrix_w[seq_nr][j] -= (log_probs_of_motifless_sequences[seq_nr]+log(1-gamma)); } matrix_w[seq_nr][j] = exp(matrix_w[seq_nr][j]); joint_log_likelihood += matrix_w[seq_nr][j]*prob; } double Q_i = std::accumulate(matrix_w[seq_nr],matrix_w[seq_nr]+m,(double)0); sum_of_Q_i += Q_i; joint_log_likelihood += ((1-Q_i)*log_probs_of_motifless_sequences[seq_nr]) +(Q_i*log(lambda_i)) +((1-Q_i)*log(1-gamma)); ++dataset_start; } delete[] log_probs_of_motifless_sequences; // refine value of gamma gamma = sum_of_Q_i*((double)1/(double)t); } ////////////////////////////////////////////////////////////////////////////// /* .Function._computeStep_M: ..summary:Represents the M-step of the EM algorithm for Zoops models. Refines the background and motif component. ..cat:Motif Search ..signature:_computeStep_M(profile,dataset_start,matrix_w,t,l,zoops_model) ..param.profile:The profile object which is a set of frequency distributions. ...type:Class.String ....signature:String ..param.dataset_start:n iterator pointing to the first input sequence of a given dataset. ..param.matrix_w:The matrix_w object. ...remarks:w(i,j) is the probability that the motif pattern starts at position j in the i-th input sequence. ..param.t:The number of input sequences. ..param.l:The size of the motif. ..param.zoops_model:The zoops_model object. ...type:Tag.Zoops ..remarks:The joint log likelihood is computed in the E-step of the algorithm. ..include:seqan/find_motif.h */ template void _computeMStep(TProfile & profile, TIter dataset_start, TMatrix const & matrix_w, TType const & t, TType const & l, Zoops const & /*zoops*/) { typedef typename Value::Type TFrequencyDist; typedef typename Value::Type TValue; typedef typename Position::Type TPos; TPos k = 0; typedef String TString; // total_counts_of_letters is used for the computtation of the background frequency TFrequencyDist total_counts_of_letters; //<-c absFreqOfLettersInSetOfSeqs(total_counts_of_letters,dataset_start, dataset_start+t); // for(k=0; k::Type seq_iter, seq_end, ptr; seq_iter = begin(*dataset_start); seq_end = seq_iter+m; while(seq_iter!=seq_end) { ptr = seq_iter; for(TType i=0; i(epsilon)); // matrix w - deallocate space (memory) for(TType pos=0; pos double em(TProfile & profile, TIter dataset_start, TType const & t, TType const & l, double & lambda, Tcm const & tcm) { // matrix z - allocate space (memory) TType row_size = t; double ** matrix_w = new double*[row_size]; for(TType pos=0; pos ..param.dataset_start:n iterator pointing to the first input sequence of a given dataset. ..param.lambda:The probability of starting a motif occurence ...type:$double$ ..param.t:The number of input sequences. ..param.l:The size of the motif. ..param.tcm_model:The tcm_model object. ...type:Tag.Tcm ..remarks:The joint log likelihood is not computed in the M-step as MEME does, but rather in the E-step of the algorithm. ..remarks:The parameter lambda is reestimated during the E-step to save calculation time. ..include:seqan/find_motif.h */ template void _computeEStep(TMatrix & matrix_w, double & joint_log_likelihood, TProfile & profile, TIter dataset_start, double & lambda, TType const & t, TType const & l, Tcm const & /*tcm*/) { typedef typename Value::Type TFrequencyDist; typedef typename Value::Type TValue; typedef String TString; typedef typename Position::Type TPos; for(TPos pos=0; pos::Type seq_iter, seq_end, ptr; seq_iter = begin(*dataset_start); seq_end = seq_iter+m; while(seq_iter!=seq_end) { TType seq_pos = m-(seq_end-seq_iter); ptr = seq_iter; double log_prob_given_theta0 = 0; double log_prob_given_theta1 = 0; for(TType i=0; iDBL_MIN_EXP) { subtrahend = log_prob_given_theta0+log((double)(1-prev_lambda)) +log((double)(1+exp(exponent))); } else { subtrahend = log_prob_given_theta0+log((double)(1-prev_lambda)); } matrix_w[seq_nr][seq_pos] = exp(minuend-subtrahend); Q_i += matrix_w[seq_nr][seq_pos]; joint_log_likelihood += ((double)(1-matrix_w[seq_nr][seq_pos]))*log_prob_given_theta0 +matrix_w[seq_nr][seq_pos]*log_prob_given_theta1 +((double)(1-matrix_w[seq_nr][seq_pos]))*log((double)(1-prev_lambda)) +matrix_w[seq_nr][seq_pos]*log(prev_lambda); ++seq_iter; } lambda += (Q_i/((double)m)); ++dataset_start; } lambda = lambda/((double)t); // apply a smoothing step to reduce the degree to which any two overlapping // subsequences can both be assigned to the motif component dataset_start -= t; _smoothingStep(matrix_w, dataset_start, t, l); } ////////////////////////////////////////////////////////////////////////////// /* .Function._computeStep_M: ..summary:Represents the M-step of the EM algorithm for Tcm models. Refines the background and motif component. ..cat:Motif Search ..signature:_computeStep_M(profile,dataset_start,matrix_w,t,l,tcm_model) ..param.profile:The profile object which is a set of frequency distributions. ...type:Class.String ....signature:String ..param.dataset_start:n iterator pointing to the first input sequence of a given dataset. ..param.matrix_w:The matrix_w object. ...remarks:w(i,j) is the probability that the motif pattern starts at position j in the i-th input sequence. ..param.t:The number of input sequences. ..param.l:The size of the motif. ..param.tcm_model:The tcm_model object. ...type:Tag.Tcm ..remarks:The joint log likelihood is computed in the E-step of the algorithm. ..include:seqan/find_motif.h */ template void _computeMStep(TProfile & profile, TIter dataset_start, TMatrix const & matrix_w, TType const & t, TType const & l, Tcm const & /*tcm*/) { typedef typename Value::Type TFrequencyDist; typedef typename Value::Type TValue; typedef typename Position::Type TPos; TPos k = 0; typedef String TString; // for(k=0; k::Type seq_iter, seq_end, ptr; seq_iter = begin(*dataset_start); seq_end = seq_iter+m; while(seq_iter!=seq_end) { ptr = seq_iter; for(TType i=0; i(epsilon)); // matrix w - deallocate space (memory) for(TType pos=0; pos void _smoothingStep(TMatrix & matrix_w, TIter dataset_start, TType const & t, TType const & l) { TType h = 0; for(TType i=0; i=1 ) { w_sum = 0; w_max = 0; factor = 0; // w_sum:=sum of Zij in the current window of size l (l:=l) w_sum = (double)(std::accumulate(matrix_w[i]+k, matrix_w[i]+k+l, (double)0)); // w_max:=the largest Zij in the current window w_max = (double)(*std::max_element(matrix_w[i]+k,matrix_w[i]+k+l)); if(w_sum>1.0) { factor = ((double)1-w_max)/(w_sum-w_max); for(h=k; h1 ) { w_sum = 0; w_max = 0; factor = 0; // w_sum:=sum of Zij in the current window of size l (l:=l) w_sum = (double)(std::accumulate(matrix_w[i]+k, matrix_w[i]+m, (double)0)); // w_max:=the largest Zij in the current window w_max = (double)(*std::max_element(matrix_w[i]+k,matrix_w[i]+m)); if(w_sum>1.0) { factor = ((double)1-w_max)/(w_sum-w_max); for(h=k; h