ó
üIc           @   sÃ  d  Z  d d l Z d „  Z e j j ƒ  d Z e j e ƒ Z d d d „  ƒ  YZ d „  Z	 d „  Z
 d	 „  Z d d d d d
 „ Z d Z d d d d d d d d „ Z d „  Z d „  Z d „  Z d d d d „ Z d „  Z d d „ Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z y d d l  Z  Wn e! k
 reZ" nZ Xd d l# Z# e# j$ e% Z& x= e  j' j( ƒ  D], Z) e) j* d ƒ se  j' e) e& j' e) <qqWd S(   sÕ  
This is an implementation of a state-emitting MarkovModel.  I am using
terminology similar to Manning and Schutze.



Functions:
train_bw        Train a markov model using the Baum-Welch algorithm.
train_visible   Train a visible markov model using MLE.
find_states     Find the a state sequence that explains some observations.

load            Load a MarkovModel.
save            Save a MarkovModel.

Classes:
MarkovModel     Holds the description of a markov model
iÿÿÿÿNc         C   sX   i  } t  |  d  d  d … ƒ } t |  ƒ d } x" | D] \ } } | | | | <q6 W| S(   Niÿÿÿÿi   (   t	   enumeratet   len(   t   valuest   dt   entriest   nt   indext   key(    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyt	   itemindex   s     gYóøÂn¥t   MarkovModelc           B   s#   e  Z d d d d  „ Z d „  Z RS(   c         C   s1   | |  _  | |  _ | |  _ | |  _ | |  _ d  S(   N(   t   statest   alphabett	   p_initialt   p_transitiont
   p_emission(   t   selfR
   R   R   R   R   (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyt   __init__#   s
    				c         C   s<   d d  l  } | j  ƒ  } t |  | ƒ | j d ƒ | j ƒ  S(   Niÿÿÿÿi    (   t   StringIOt   savet   seekt   read(   R   R   t   handle(    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyt   __str__*   s
    N(   t   __name__t
   __module__t   NoneR   R   (    (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyR	   "   s   c         C   s8   |  j  ƒ  } | j | ƒ s4 t d | | f ƒ ‚ n  | S(   Ns   I expected %r but got %r(   t   readlinet
   startswitht
   ValueError(   R   t   startt   line(    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyt   _readline_and_check_start1   s    c         C   sê  t  |  d ƒ } | j ƒ  d } t  |  d ƒ } | j ƒ  d } t | | ƒ } t | ƒ t | ƒ } } t j | ƒ | _ t  |  d ƒ } xN t t | ƒ ƒ D]: } t  |  d | | ƒ } t | j ƒ  d ƒ | j | <qš Wt j | | f ƒ | _	 t  |  d ƒ } x] t t | ƒ ƒ D]I } t  |  d | | ƒ } t
 t | j ƒ  d ƒ | j	 | d d … f <qWt j | | f ƒ | _ t  |  d	 ƒ } x] t t | ƒ ƒ D]I } t  |  d | | ƒ } t
 t | j ƒ  d ƒ | j | d d … f <q™W| S(
   s   load(handle) -> MarkovModel()s   STATES:i   s	   ALPHABET:s   INITIAL:s     %s:iÿÿÿÿs   TRANSITION:Ns	   EMISSION:(   R   t   splitR	   R   t   numpyt   zerosR   t   ranget   floatR   t   mapR   (   R   R   R
   R   t   mmt   Nt   Mt   i(    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyt   load7   s,    !00c         C   sU  | j  } | d d j |  j ƒ ƒ | d d j |  j ƒ ƒ | d ƒ x? t t |  j ƒ ƒ D]( } | d |  j | |  j | f ƒ q] W| d ƒ xW t t |  j ƒ ƒ D]@ } t t	 |  j | ƒ } | d |  j | d j | ƒ f ƒ q© W| d ƒ xW t t |  j
 ƒ ƒ D]@ } t t	 |  j
 | ƒ } | d |  j | d j | ƒ f ƒ qWd	 S(
   s   save(mm, handle)s   STATES: %s
t    s   ALPHABET: %s
s	   INITIAL:
s	     %s: %g
s   TRANSITION:
s	     %s: %s
s
   EMISSION:
N(   t   writet   joinR
   R   R#   R   R   R   R%   t   strR   (   R&   R   t   wR)   t   x(    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyR   [   s    	
&
(
c         C   s¶  t  |  ƒ t  | ƒ } } | s. t d ƒ ‚ n  | d k rj t | ƒ } | j | f k rj t d ƒ ‚ qj n  | d k r­ t | ƒ } | j | | f k r­ t d d ƒ ‚ q­ n  | d k rð t | ƒ } | j | | f k rð t d d ƒ ‚ qð n  g  }	 t | ƒ }
 x2 | D]* } |	 j g  | D] } |
 | ^ qƒ q	Wt t  |	 ƒ } t | ƒ d k rgt d ƒ ‚ n  t	 | | |	 d	 | d
 | d | d | ƒ} | \ } } } t
 |  | | | | ƒ S(   s^  train_bw(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission][, update_fn]) -> MarkovModel

    Train a MarkovModel using the Baum-Welch algorithm.  states is a list
    of strings that describe the names of each state.  alphabet is a
    list of objects that indicate the allowed outputs.  training_data
    is a list of observations.  Each observation is a list of objects
    from the alphabet.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    different matrices.  They should be matrices of the appropriate
    size that contain numbers to add to each parameter matrix, before
    normalization.

    update_fn is an optional callback that takes parameters
    (iteration, log_likelihood).  It is called once per iteration.

    s   No training data given.s$   pseudo_initial not shape len(states)s   pseudo_transition not shape s   len(states) X len(states)s   pseudo_emission not shape s   len(states) X len(alphabet)i    s,   I got training data with outputs of length 0t   pseudo_initialt   pseudo_transitiont   pseudo_emissiont	   update_fnN(   R   R   R   t   asarrayt   shapeR   t   appendR%   t   mint   _baum_welchR	   (   R
   R   t   training_dataR1   R2   R3   R4   R'   R(   t   training_outputst   indexest   outputsR0   t   lengthsR   R   R   (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyt   train_bwn   s>    (	iè  c
         C   sÿ  | d  k r t |  ƒ } n t | |  f ƒ } | d  k rN t |  |  f ƒ } n t | |  |  f ƒ } | d  k r„ t |  | f ƒ } n t | |  | f ƒ } t t j | | | f ƒ \ }
 } } | d  k rÛ t j | ƒ } n d  } | d  k rÿ t j | ƒ } n d  } | d  k r#t j | ƒ } n d  } d  } x´ t t ƒ D]– } t } x< | D]4 } t	 |  | | |
 | | | | | ƒ	 } | | 7} qOW|	 d  k	 r£|	 | | ƒ n  | d  k	 rÌt j
 | | ƒ d k  rÌPn  | } q<Wt d t ƒ ‚ t t j |
 | | f ƒ S(   Ngš™™™™™¹?s%   HMM did not converge in %d iterations(   R   t   _random_normt   _copy_and_checkR%   R!   t   logR#   t   MAX_ITERATIONSt   LOG0t   _baum_welch_onet   fabst   RuntimeErrort   exp(   R'   R(   R;   R   R   R   R1   R2   R3   R4   t
   lp_initialt   lp_transitiont   lp_emissiont   lpseudo_initialt   lpseudo_transitiont   lpseudo_emissiont	   prev_llikR)   t   llikR=   R0   (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyR9   ®   sJ    !		%

c	      	   C   s³  t  | ƒ }	 t |  |	 | | | | ƒ }
 t |  |	 | | | ƒ } t j |  |  |	 f ƒ } xÎ t |	 ƒ D]À } | | } t j |  |  f ƒ } xo t |  ƒ D]a } xX t |  ƒ D]J } |
 | | | | | | | | | | | d } | | | | <q© Wq– W| t | ƒ | d  d  … d  d  … | f <qd Wt j |  |	 f ƒ } xR t |	 ƒ D]D } x; t |  ƒ D]- } t | | d  d  … | f ƒ | | | <q]WqJWt j |  ƒ } x4 t |  ƒ D]& } t | | d  d  … f ƒ | | <q®W| d  d  … d f } | d  k rt | | ƒ } | t | ƒ } n  xœ t |  ƒ D]Ž } xC t |  ƒ D]5 } t | | | d  d  … f ƒ | | | | | <q<W| d  k r)t | | | ƒ | | <| | t | | ƒ | | <q)q)WxÛ t |  ƒ D]Í } t j | ƒ t	 } xY t |	 ƒ D]K } | | } x8 t |  ƒ D]* } t
 | | | | | | f ƒ | | <qWqîW| t | ƒ } | d  k rt | | | ƒ } | t | ƒ } n  | | | d  d  … f <qÈWt |
 d  d  … |	 f ƒ S(   Ni   i    (   R   t   _forwardt	   _backwardR!   R"   R#   t   _logsumR   t
   _logvecaddRD   t   _logadd(   R'   R(   R=   RI   RJ   RK   RL   RM   RN   t   Tt   fmatt   bmatt   lp_arct   tt   kt   lp_traverseR)   t   jt   lpt   lp_arcout_tt	   lp_arcoutt   ksum(    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyRE   é   sT    
#-/$3#
,	c         C   sÚ   t  j |  | d f ƒ } | | d  d  … d f <x¤ t d | d ƒ D] } | | d } xx t |  ƒ D]j }	 t }
 xM t |  ƒ D]? } | | | d | | |	 | | | } t |
 | ƒ }
 q} W|
 | |	 | <qd WqC W| S(   Ni   i    (   R!   R"   R#   RD   RU   (   R'   RV   RI   RJ   RK   R=   t   matrixRZ   R[   R]   t   lprobR)   R^   (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyRQ   ;  s    c         C   sÃ   t  j |  | d f ƒ } x£ t | d d d ƒ D]‹ } | | } xx t |  ƒ D]j } t }	 xM t |  ƒ D]? }
 | |
 | d | | |
 | | | } t |	 | ƒ }	 qf W|	 | | | <qM Wq0 W| S(   Ni   iÿÿÿÿ(   R!   R"   R#   RD   RU   (   R'   RV   RJ   RK   R=   Rb   RZ   R[   R)   Rc   R]   R^   (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyRR   R  s    
c         C   sÉ  t  |  ƒ t  | ƒ } } | d k rU t | ƒ } | j | f k rU t d ƒ ‚ qU n  | d k r˜ t | ƒ } | j | | f k r˜ t d d ƒ ‚ q˜ n  | d k rÛ t | ƒ } | j | | f k rÛ t d d ƒ ‚ qÛ n  g  g  } }	 t |  ƒ }
 t | ƒ } xƒ | D]{ \ } } t  | ƒ t  | ƒ k r:t d ƒ ‚ n  | j g  | D] } |
 | ^ qGƒ |	 j g  | D] } | | ^ qkƒ qWt | | |	 | | | | ƒ } | \ } } } t |  | | | | ƒ S(   s.  train_visible(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission]) -> MarkovModel

    Train a visible MarkovModel using maximum likelihoood estimates
    for each of the parameters.  states is a list of strings that
    describe the names of each state.  alphabet is a list of objects
    that indicate the allowed outputs.  training_data is a list of
    (outputs, observed states) where outputs is a list of the emission
    from the alphabet, and observed states is a list of states from
    states.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    different matrices.  They should be matrices of the appropriate
    size that contain numbers to add to each parameter matrix

    s$   pseudo_initial not shape len(states)s   pseudo_transition not shape s   len(states) X len(states)s   pseudo_emission not shape s   len(states) X len(alphabet)s   states and outputs not alignedN(	   R   R   R5   R6   R   R   R7   t   _mleR	   (   R
   R   R:   R1   R2   R3   R'   R(   t   training_statesR;   t   states_indexest   outputs_indexest   toutputst   tstatesR0   R   R   R   (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyt   train_visibleb  s6    $(c         C   s6  t  j |  ƒ } | r" | | } n  x" | D] } | | d c d 7<q) Wt | ƒ } t  j |  |  f ƒ }	 | r{ |	 | }	 n  x^ | D]V } xM t t | ƒ d ƒ D]5 }
 | |
 | |
 d } } |	 | | f c d 7<qŸ Wq‚ WxZ t t |	 ƒ ƒ D]F } |	 | d  d  … f t |	 | d  d  … f ƒ |	 | d  d  … f <qï Wt  j |  | f ƒ } | ra| | } n  t  j |  | f ƒ } xS t | | ƒ D]B \ } } x3 t | | ƒ D]" \ } } | | | f c d 7<q¢Wq†WxZ t t | ƒ ƒ D]F } | | d  d  … f t | | d  d  … f ƒ | | d  d  … f <qßW| |	 | f S(   Ni    i   (   R!   R"   t
   _normalizeR#   R   t   sumt   onest   zip(   R'   R(   R;   Re   R1   R2   R3   R   R
   R   R   R)   R]   R   R=   t   ot   s(    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyRd   ˜  s2    DDc         C   s   t  j |  ƒ g S(   N(   R!   t   argmax(   t   vectort	   allowance(    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyt	   _argmaxes½  s    c         C   s  |  } t  | j ƒ } | j t } | j t } | j t } t t j | | | f ƒ \ } } }	 t	 | j
 ƒ }
 g  | D] } |
 | ^ qv } t | | | |	 | ƒ } x] t t  | ƒ ƒ D]I } | | \ } } g  | D] } | j | ^ qÔ t j | ƒ f | | <q· W| S(   s<   find_states(markov_model, output) -> list of (states, score)(   R   R
   R   t   VERY_SMALL_NUMBERR   R   R%   R!   RB   R   R   t   _viterbiR#   RH   (   t   markov_modelt   outputR&   R'   R0   t   yt   zRI   RJ   RK   R<   t   resultsR)   R
   t   score(    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyt   find_statesÀ  s    $7c         C   s  t  | ƒ } g  } x( t |  ƒ D] } | j d  g | ƒ q Wt j |  | f ƒ } | | d  d  … | d f | d  d  … d f <x¦ t d | ƒ D]• }	 | |	 }
 x‚ t |  ƒ D]t } | d  d  … |	 d f | d  d  … | f | | |
 f } t | ƒ } | | d | | |	 f <| | | |	 <q­ Wq Wg  } g  } t | d  d  … | d f ƒ } x7 | D]/ } | j | d | g | | | d f ƒ q\Wx… | r| j ƒ  \ }	 } } |	 d k rÏ| j | | f ƒ q’| | d |	 } x/ | D]' } | j |	 d | g | | f ƒ qèWq’W| S(   Ni    i   (   R   R#   R7   R   R!   R"   Rt   t   pop(   R'   RI   RJ   RK   Rx   RV   t	   backtraceR)   t   scoresRZ   R[   R]   t   i_scoresR<   t
   in_processR{   R
   R|   (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyRv   ×  s6    .
+ -	)c         C   s³   t  |  j ƒ d k r. |  t t |  ƒ ƒ }  n t  |  j ƒ d k r£ xi t t  |  ƒ ƒ D]F } |  | d  d  … f t |  | d  d  … f ƒ |  | d  d  … f <qV Wn t d ƒ ‚ |  S(   Ni   i   s&   I cannot handle matrixes of that shape(   R   R6   R$   Rl   R#   R   (   Rb   R)   (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyRk     s    Gc         C   s   t  j |  ƒ } t | ƒ S(   N(   R!   Rm   Rk   (   R6   Rb   (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyt   _uniform_norm  s    c         C   s   t  j j |  ƒ } t | ƒ S(   N(   R!   t   randomRk   (   R6   Rb   (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyR@     s    c         C   sô   t  j |  d d ƒ}  |  j | k r3 t d ƒ ‚ n  t |  j ƒ d k ry t  j t |  ƒ d ƒ d k rð t d ƒ ‚ qð nw t |  j ƒ d k rä x_ t t |  ƒ ƒ D]< } t  j t |  | ƒ d ƒ d k r¡ t d | ƒ ‚ q¡ q¡ Wn t d	 ƒ ‚ |  S(
   Nt   copyi   s   Incorrect dimensiong      ð?g{®Gáz„?s   matrix not normalized to 1.0i   s   matrix %d not normalized to 1.0s&   I don't handle matrices > 2 dimensions(	   R!   t   arrayR6   t	   ValuErrorR   RF   Rl   R   R#   (   Rb   t   desired_shapeR)   (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyRA     s    #c         C   sf   | |  d k r | S|  | d k r( |  St  |  | ƒ } | t j t j |  | ƒ t j | | ƒ ƒ S(   Nid   (   R8   R!   RB   RH   (   t   logxt   logyt   minxy(    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyRU   )  s    c         C   si   t  |  j ƒ d k r9 t j |  t j |  j ƒ f ƒ } n |  } t } x | D] } t | | ƒ } qL W| S(   Ni   (   R   R6   R!   t   reshapet   productRD   RU   (   Rb   t   vecRl   t   num(    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyRS   1  s    $c         C   su   t  |  ƒ t  | ƒ k s$ t d ƒ ‚ t j t  |  ƒ ƒ } x5 t t  |  ƒ ƒ D]! } t |  | | | ƒ | | <qL W| S(   Ns   vectors aren't the same length(   R   t   AssertionErrorR!   R"   R#   RU   (   t   logvec1t   logvec2t   sumvecR)   (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyRT   ;  s
    $c         C   s   t  |  ƒ } t j | ƒ S(   N(   RS   R!   RH   (   t   numbersRl   (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyt   _exp_logsumB  s    t   __(    (+   t   __doc__R!   R   R„   t   seedRu   RB   RD   R	   R   R*   R   R   R?   RC   R9   RE   RQ   RR   Rj   Rd   Rt   R}   Rv   Rk   Rƒ   R@   RA   RU   RS   RT   R•   t   cMarkovModelt   ImportErrorR0   t   syst   modulesR   t   this_modulet   __dict__t   keyst   nameR   (    (    (    s‚   /oak/stanford/groups/akundaje/marinovg/programs/biopython-1.50.tar.gz/biopython-1.50/build/lib.linux-x86_64-2.7/Bio/MarkovModel.pyt   <module>   sR   			$		=	8	R		4	%		,						
		