ó
K,TXc           @   s|  d  d l  Z  d  d l Z d  d l Z d  d l Z d  d l Z d  d l m Z m Z m Z m	 Z	 e j
 j d ƒ d  d l Z d  d l Z d  d l m Z d Z e d ƒ Z d „  Z d „  Z d	 „  Z d
 „  Z d „  Z e d „ Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z e d „ Z  d „  Z! d „  Z" d „  Z# d „  Z$ d „  Z% d „  Z& e' d k rxe& ƒ  n  d S(   iÿÿÿÿN(   t   logt   powt   floort   exps@   /woldlab/castor/home/georgi/programs/meme_4.11.2/meme/lib/python(   t   getLogFETPvalues   %4.1fe%+04.0fi
   c         C   sn   |  t  } t | ƒ } t d | | ƒ } | d t d | ƒ d k rZ d } | d 7} n  | | | f } | S(   s•    Print x with given format given logx.  Handles very large
    and small numbers with prec digits after the decimal.
    Returns the string to print.i
   g      à?i   (   t   _log10R   R   (   t   logxt   prect   formatt   log10xt   et   mt   str(    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   sprint_logx   s    
c         C   s@   g  } x* t  |  ƒ D] } | j | j | ƒ ƒ q Wd j | ƒ S(   Nt    (   t   reversedt   appendt   getComplementt   join(   t   wordt   alpht   rcwordt   sym(    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   get_rc   s    c         C   s.   g  } x! |  D] } | j  | j ƒ  ƒ q W| S(   s2    Extract strings from FASTA sequence records.
    (   R   t	   getString(   t   seqst   stringst   s(    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   get_strings_from_seqs%   s    c         C   s&  | } t  | ƒ } | j ƒ  } g  | D] } | j | ƒ ^ q& } | rr g  t | | ƒ D] } | j | ƒ ^ qW n d }	 g  t | d ƒ D] }
 d ^ q‰ } g  } g  } xo|  D]g} | j | ƒ } | } d } xt d t  | ƒ | d ƒ D]ú } xá d d g D]Ó } | d k r| n |	 } d } xo t | ƒ D]a } | | } | | | } | | k ri| d } Pn  | | k r1| d 7} | | k r’Pq’q1q1W| | k  rÉ| } | | d } | d k rÉPqÉn  | s Pq q W| d k rí Pqí qí W| | c d 7<| j | ƒ | j | ƒ q® W| | | f S(   s¹  
    Return a list of the number of sequences whose minimum
    Hamming distance to the given word or its reverse complement
    is X, for X in [0..w].

    Also returns a list with the best distance for each sequence.

    Also returns a list with the offset (1-relative) of the leftmost
    match to the word in each sequence. (Matches to reverse
    complement of the word have negative offsets.)

    Returns: counts, dists, offsets
    i   i    iÿÿÿÿN(   t   lent   getLent   getComprisingIndexesR   t   Nonet   xranget   encodeStringR   (   R   R   R   t
   given_onlyt   use_rct   wt   alenR   t   eword_givent   eword_rct   _t   countst   distst   offsetst   seqR   t   min_distt   best_offsett   it   strandt   ewordt   distt   jt   esymst   esym(    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   get_min_hamming_dists-   sN    "7#$


c         C   sˆ   t  | |  | | ƒ \ } } } t  | |  | | ƒ \ }	 }
 } t t |  ƒ t | ƒ t | ƒ | |	 ƒ \ } } } } | | | | | | f S(   sX  
    Find the most enriched Hamming distance for given word.
    Returns (best_dist, best_log_pvalue, pos_dists, pos_offsets, best_p, best_n)
    Pos_dists[s] is best distance for sequence s.
    Pos_offsets[s] is offset to the leftmost best match in sequence s.
    Offsets are 1-relative; match on reverse complement has negative offset.
    (   R7   t'   get_best_hamming_enrichment_from_countsR   (   R   t   pos_seqst   neg_seqsR   R#   t   print_distst
   pos_countst	   pos_distst   pos_offsetst
   neg_countst	   neg_distst   neg_offsetst	   best_distt   best_log_pvaluet   best_pt   best_n(    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   get_best_hamming_enrichmentp   s
    c         C   s<  t  j  | ƒ } t  j  | ƒ } xM t d t | ƒ ƒ D]6 } | | c | | d 7<| | c | | d 7<q4 W|  }	 d }
 d } d } x£ t t | ƒ ƒ D] } | | } | | } t | | | | |
 ƒ } | |
 k  rò | }
 | }	 | } | } n  | r™ t | d t ƒ } d | | | | | | f GHq™ q™ W|	 |
 | | f S(   s|   
    Find the most enriched Hamming distance for given counts.
    Returns (best_dist, best_log_pvalue, best_p, best_n)
    i   i    s   d %d : %d %d %d %d %s(   t   copyt   rangeR   R   R   t
   _pv_format(   R%   t   Pt   NR<   R?   R;   t   cum_pos_countst   cum_neg_countsR0   RB   RC   RD   RE   t   pt   nt
   log_pvaluet	   pv_string(    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyR8   …   s,    

	"c   "      C   sd  t  | |  | | ƒ \ } } } t  | |  | | ƒ \ }	 }
 } t | ƒ } t | ƒ } t |  ƒ } t | | | | |	 ƒ \ } } } } | | | | | | g } i  } t j d I|  Id It | d t ƒ I| IJ| | k ré | | f St | | | | | ƒ \ } } t | |
 | | | ƒ \ } } t j d IJt |  | | | | ƒ } t j d IJt |  | |	 | | ƒ } t j d IJ| j	 ƒ  } xÏ t
 | ƒ D]Á } x¸ t
 | ƒ D]ª } | | j |  | ƒ k rÍq¨n  |  |  | j | ƒ |  | d } | | | } | | | }  t | | | | |  ƒ \ } } } } | | | | | | g }! |! | | <q¨Wq•W| | f S(   s!  
    Find the Hamming enrichment of the given word.
    If pvalue under given pvalue, estimate the Hamming
    enrichment for each distance-1 neighbor of the given word.
    Returns word_pvalue_record, neighbor_pvalue_records (dict).
    pvalue_record = [p, P, n, N, log_pvalue, dist]
    s   Exact p-value oft   isi   s;   Estimating Hamming distance counts in positive sequences...s;   Estimating Hamming distance counts in negative sequences...s   Finding best neighbors...(   R7   R   R8   t   syst   stderrR   RI   t   get_freqs_and_nsitest   estimate_new_hamming_countst
   getFullLenRH   t   getIndext	   getSymbol("   R   t   min_log_pvalueR9   R:   R   R#   R<   R=   R>   R?   R@   RA   RJ   RK   R%   R3   RP   RN   RO   t   pvalue_recordt	   neighborst	   pos_freqst
   pos_nsitest	   neg_freqst
   neg_nsitest   pos_neighbor_countst   neg_neighbor_countsR&   t   colt   lett   new_wordt   p_countst   n_countst   new_pvalue_record(    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   get_enrichment_and_neighbors§   s<    $1
#$c   
   	   C   s‘   g  } g  } xx t  |  d ƒ D]f } t |  | | | | | | ƒ } t j | ƒ }	 |	 j | ƒ | j |	 j ƒ  ƒ | j t | ƒ ƒ q W| | f S(   sk   
    Get PWMs and numbers of sites for each exact Hamming distance.
    Returns (freqs[d], nsites[d]).
    i   (   RH   t   get_aln_from_dists_and_offsetst   sequencet   PWMt   setFromAlignmentR   t   getFreqR   (
   R%   R+   R,   R   R   t   freqst   nsitest   dt   alnt   pwm(    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyRU   à   s    c         C   s¢   t  |  ƒ } g  } x‰ t | ƒ D]{ } | j g  ƒ | j |  | ƒ } xR t | j ƒ  ƒ D]> }	 | j |	 ƒ }
 | | j t | | | | | | |
 ƒ ƒ qX Wq W| S(   s¶   
    Return neighbor_counts[col][let][dist] list estimated if old_let
    were replaced by let.  List contains estimated counts of sequences
    containing the neighboring word.
    (   R   R!   R   R   RW   t#   estimate_new_hamming_counts_col_let(   R   R   R*   Ro   Rp   R%   t   neighbor_countsRc   t   old_compriset   symit   new_comprise(    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyRV   ö   s    
$c      	   C   sN  t  j  |  ƒ } | | k r | Sx(t | d ƒ D]} d }	 }
 | | } t | | ƒ d k rf q0 n  | | | } t t | t g  | j | ƒ D] } | | ^ q ƒ ƒ ƒ }	 t t | t g  | j | ƒ D] } | | ^ qÌ ƒ ƒ ƒ }
 | | c |	 |
 8<| d k r#| | d c |	 7<n  | | k  r0 | | d c |
 7<q0 q0 W| S(   s  
    Return counts[dist] list estimated if old_comprise were replaced by new_comprise
    in column col.
FIXME: could add columns on either side of freqs array so we can do "lengthen" and "shift" moves.
    freqs[d][col] is the frequency column of the alignment of nsites[d] sites with
    Hamming distance d (PWM of sites at Hamming distance d).
    old/new_comprise are the sets of the indexes of the comprising core symbols
    so for example DNA's N is frozenset([0,1,2,3]) and DNA's S is frozenset([1,2]) .
    i   i    (   RG   R!   R   t   intt   roundt   sumt
   difference(   R*   Ro   Rp   t   max_distRc   Rv   Rx   t
   new_countsRq   t   upt   downRO   t   fRw   (    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyRt     s"    

<<c         C   sÌ   g  } x¿ t  t | ƒ ƒ D]« } | | }	 |	 | k r | |	 k r | | }
 |
 d k r] q n  |
 d k rv |
 d } n |
 d } | | | | |  !} |
 d k  r´ t | | ƒ } n  | j | ƒ q q W| S(   so   
    Get an alignment of the leftmost best site in each sequence.
    Site must have d1 <= distance <= d2.
    i    i   (   RH   R   R   R   (   R%   t   d1t   d2R+   R,   R   R   Rr   R0   R3   t   offsett   startt   match(    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyRj   5  s    

c         C   s  t  | |  | | ƒ \ } } } t |  ƒ } | | j ƒ  }	 xÎ t t | ƒ ƒ D]º }
 | |
 } d } | | k rM | |
 } | d k r‹ qM n  | d k r¶ | | k  r© qM n  | d } n | | k  rÉ qM n  | d } | | } | |
 |  |	 | |
 | | | |
 <qM qM Wd S(   s½   
    Erase all non-overlapping matches to word in seqs by changing to 'N'.
    Greedy algorithm erases leftmost site first.
    Site must be within given Hamming distance to be erased.
    i   i    N(   R7   R   t   getWildcardRH   (   R   R3   R   R   R#   R*   R+   R,   R%   t   ensR0   Rq   t
   min_offsetR„   R…   (    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   erase_word_distanceM  s&    

   
c   	      C   s@   t  | |  | | ƒ \ } } } t t |  ƒ | | | | | | ƒ S(   so   
    Get an alignment of the leftmost best site in each sequence.
    Site must have d1 <= distance <= d2.
    (   R7   Rj   R   (	   R   R‚   Rƒ   R   R   R#   R*   R+   R,   (    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   get_aln_from_wordh  s    c         C   sv   t  |  | | | | | ƒ \ } } } }	 }
 } t t |  ƒ d | | |	 | | ƒ } | | |
 t | ƒ | t | ƒ | f S(   s¢   
    Find the most enriched Hamming distance to the word.
    Get the alignment of the matching sites (ZOOPS).

    Returns dist, log_pvalue, p, P, n, N, aln
    i    (   RF   Rj   R   (   R   R9   R:   R   R#   R;   RB   RC   R+   R,   RN   RO   Rr   (    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   get_best_hamming_alignmentr  s    	-$c         C   s9  t  j j d ƒ |  t j d ƒ k s: |  t j d ƒ k re t  j j d j d j |  j ƒ  ƒ ƒ ƒ nN t  j j d j t j	 |  j
 ƒ  ƒ ƒ ƒ t  j j |  j ƒ  ƒ t  j j d ƒ |  j ƒ  rÒ t  j j d ƒ n  t  j j d	 ƒ d
 |  j ƒ  } x0 |  j ƒ  D]" } t  j j d j | | ƒ ƒ qÿ Wt  j j d ƒ d  S(   Ns   
MEME version 4

t   DNAt   Proteins   ALPHABET= {}

R   s   ALPHABET {}
s   END ALPHABET

s   strands: + -

s9   Background letter frequencies (from uniform background):
g      ð?s   {:s} {:.4f} s   
(   RS   t   stdoutt   writet   alphabett	   getByNameR   R   t
   getSymbolst   jsont   dumpst   getNamet   asTextt   isComplementableR   (   R   t   freqR   (    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   print_meme_headerƒ  s    *+( c   	      C   sq   t  j | ƒ } | j | ƒ | j ƒ  } t |  ƒ } d |  | | | | f GHx | j ƒ  D] } | GHqY Wd GHd  S(   NsG   
MOTIF %s
letter-probability matrix: alength= %d w= %d nsites= %d E= %sR   (   Rk   Rl   Rm   R   R   t   pretty(	   R   Rp   t	   ev_stringRr   R   Rs   R&   R%   t   row(    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   print_meme_motif”  s    	c            s0   ˆ  s
 g  Sˆ  j  ƒ  } | j ‡  f d †  ƒ | S(   sG    Return the keys of a p-value dictionary, sorted by increasing p-value c            s,   t  ˆ  |  d ˆ  | d ƒ p+ t  |  | ƒ S(   Ni   (   t   cmp(   t   xt   y(   t
   re_pvalues(    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   <lambda>§  s    (   t   keyst   sort(   R¢   R¤   (    (   R¢   sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   sorted_re_pvalue_keys£  s
     c         C   s¼   t  |  | | | | | ƒ \ } } | d | k s8 | rF | d d f St g  | D] } | | d | f ^ qP ƒ \ }	 }
 d G|  Gt | d d t ƒ Gd G|
 Gt |	 d t ƒ GH| d |
 f S(   Ni   R   t   originali   s   best neighbor (estimated)(   Ri   t   minR   RI   (   R   RZ   R9   R:   R   R#   R[   R\   t   tmpt   best_pvaluet	   best_word(    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   get_best_neighborª  s    !35c   	      C   s`   d } |  } xG t  |  | | | | | ƒ \ } } | | k r@ Pq |  } | } | }  q W| | f S(   Ni   (   R¬   (	   R   R9   R:   R   R#   RC   R«   RP   t   neighbor(    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   refine_consensus¹  s    !
c          C   s/  d  }  d  } d  } t } t } d t j d } t t j ƒ d k ra t j | IJt j d ƒ n  d } xè| t t j ƒ k  rQt j | } | d k rÚ | d 7} y t j | } WqDt j | IJt j d ƒ qDXnj| d k r(| d 7} y t j | }  WqDt j | IJt j d ƒ qDXn| d k rv| d 7} y t j | } WqDt j | IJt j d ƒ qDXnÎ | d k rÄ| d 7} y t j | } WqDt j | IJt j d ƒ qDXn€ | d k ry
 t } WqDt j | IJt j d ƒ qDXnC | d	 k r*t j | IJt j d ƒ n t j | IJt j d ƒ | d 7} qj W|  d  k sj| d  k r‡t j | IJt j d ƒ n  t j ƒ  }	 d  }
 | d  k r·t	 j
 | ƒ }
 n t	 j ƒ  }
 t j d
 IJt t j |  |
 ƒ ƒ } t t j | |
 ƒ ƒ } | r't | | | |
 | ƒ \ } } n | } t j d IJt | | | |
 | ƒ \ } } } } } } } t | d t ƒ } t | ƒ } t j d I| I| I| I| I| Id IJt j d | | | | f IJt |
 ƒ t | | | | |
 ƒ t j ƒ  } | |	 } t j d | IJt j d | IJd  S(   Ns“  USAGE:
    %s [options]

    -w <word>               word (required)
    -p <file_name>          positive sequences FASTA file name (required)
    -n <file_name>          negative sequences FASTA file name (required)
    -a <file_name>          alphabet definition file
    -r                      refine consensus by branching search
                            (distance 1 steps; beam size = 1).
    -h                      print this usage message

    Compute the Hamming distance from <word> to each FASTA sequence
    in the positive and negative files.  Apply Fisher's Exact test to
    each distance.
    <word> may contain ambiguous characters.

    i    i   s   -ws   -ps   -ns   -as   -rs   -hs   Reading sequences...s   Computing Hamming alignment...t   [t   ]sE   Best ZOOPs alignment has %d sites / %d at distance %d with p-value %ss   elapsed time: %.2f secondss   #elapsed time: %.2f seconds(   R    t   FalseRS   t   argvR   RT   t   exitt   Truet   timeR‘   t   loadFromFilet   dnaR   Rk   t	   readFASTAR®   RŒ   R   RI   Rš   Rž   R   (   t   pos_seq_file_namet   neg_seq_file_namet   alphabet_file_namet   refineR#   t   usageR0   t   argR   t
   start_timeR   R9   R:   R«   RC   R3   RP   RN   RJ   RO   RK   Rr   RQ   Rp   t   end_timet   elapsed(    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   mainÈ  sœ     
   
   
   
    
     !-1

t   __main__((   R”   RS   t   stringRG   Rµ   t   mathR    R   R   R   t   pathR   R‘   Rk   t   hypergeometricR   RI   R   R   R   R   R7   RF   R±   R8   Ri   RU   RV   Rt   Rj   RŠ   R‹   RŒ   Rš   Rž   R¦   R¬   R®   RÂ   t   __name__(    (    (    sP   /woldlab/castor/home/georgi/programs/meme_4.11.2/scripts/fasta-hamming-enrich.pyt   <module>   s:   <"				C	"	9			'			
						i 