ó
)†Wc           @   sŠ   d  d l  Z d  d l m Z d  d l m Z d „  Z d „  Z d „  Z d „  Z	 d „  Z
 d	 „  Z d
 „  Z d „  Z d „  Z d „  Z d S(   iÿÿÿÿN(   t   utils(   t   fclusterdatac         C   s"   t  |  ƒ t  | ƒ t  |  | ƒ S(   N(   t   log_factorial(   t   nt   x(    (    s   DP_GP/cluster_tools.pyt   log_binomial_coefficient   s    c         C   s   t  j |  d ƒ S(   Ni   (   t   npt   log(   R   (    (    s   DP_GP/cluster_tools.pyR      s    c         C   sˆ  | j  d } t j t | d ƒ ƒ } d } xT t | ƒ D]F } x= t | ƒ D]/ } |  | |  | k rK | | | | 7} qK qK Wq8 Wd } xf t | ƒ D]X } xO t | ƒ D]A } |  | |  | k r¨ | | d | d … | f j ƒ  7} q¨ q¨ Wq• W| | :} d } x^ t | ƒ D]P } xG t | ƒ D]9 } | | | | 7} |  | |  | k r!| d 7} q!q!WqW| d :} | | }	 | | }
 |	 |
 S(   sÛ  
    Compute MPEAR (Fritsch and Ickstadt 2009, DOI:10.1214/09-BA414).
    This function and accessory routines were taken with little 
    modification from Pyclone (Roth et al. 2014 DOI:10.1038/nmeth.2883).
    
    :param cluster_labels: cluster labels
    :type cluster_labels: numpy array of ints
    :param sim_mat: sim_mat[i,j] = (# samples gene i in cluster with gene j)/(# total samples)
    :type sim_mat: numpy array of (0-1) floats
    
    :rtype: float
    
    i    i   Ni   (   t   shapeR   t   expR   t   ranget   sum(   t   cluster_labelst   sim_matt   Nt   ct
   num_term_1t   jt   it
   num_term_2t
   den_term_1t   numt   den(    (    s   DP_GP/cluster_tools.pyt   compute_mpear   s.    /



c         C   s}   i  } d } g  } xd t  |  ƒ D]V } | | k rU | j | ƒ | | | <| d 7} q | | k r | j | | ƒ q q W| S(   sV   
    Given some cluster labels, relabel (equivalently) so that labels start at 1.
    i   (   t   listt   append(   R   t
   clust_dictt	   new_labelt
   new_labelst   label(    (    s   DP_GP/cluster_tools.pyt   relabel_clusteringE   s    
c         C   s)   |  | } t  j t  j | | ƒ ƒ } | S(   s@   
    Compute the squared distance between two matrices.    
    (   R   R   t   dot(   t   St   S_newt   difft   sq_dist(    (    s   DP_GP/cluster_tools.pyt   compute_sq_distX   s    
c         C   sh   d } xO t  t |  ƒ ƒ D]; } t |  | f | ƒ } | | k r | } |  | } q q Wt | ƒ } | S(   sš  
    Find the optimal clustering according to the MPEAR criterion.
    
    :param clusterings: clusterings[i,j] is the cluster to which gene j belongs at sample i
    :type clusterings: numpy array of ints
    :param sim_mat: sim_mat[i,j] = (# samples gene i in cluster with gene j)/(# total samples)
    :type sim_mat: numpy array of (0-1) floats
    
    :rtype: numpy array of best cluster labels
    
    i    (   R
   t   lenR   R   (   t   clusteringsR   t   max_pearR   t   peart   best_cluster_labelsR   (    (    s   DP_GP/cluster_tools.pyt   best_clustering_by_mpearc   s    c         C   se   t  j } xH t t t |  ƒ ƒ | ƒ D]+ \ } } | | k r& | } |  | } q& q& Wt | ƒ } | S(   sÁ  
    Find the optimal clustering according to log posterior likelihood
    (i.e. maximum a posteriori clustering).
    
    :param clusterings: clusterings[i,j] is the cluster to which gene j belongs at sample i
    :type clusterings: numpy array of ints
    :param log_post_list: list of log posterior likelihood over the course of Gibbs sampling
    :type log_post_list: list of floats
    
    :rtype: numpy array of best cluster labels
    
    (   R   t   inft   zipR
   R%   R   (   R&   t   log_post_listt   max_postR   t   postR)   R   (    (    s   DP_GP/cluster_tools.pyt!   best_clustering_by_log_likelihood   s    
(c         C   s  t  j } xð t t |  ƒ ƒ D]Ü } |  | d d … f } t  j t | ƒ t | ƒ f ƒ } xn t t | ƒ ƒ D]Z } xQ t t | ƒ ƒ D]= } | | | | k r² d | | | f <q… d | | | f <q… Wql Wt | | ƒ } | | k  r | } |  | }	 q q Wt |	 ƒ }
 |
 S(   s  
    Find the optimal clustering according to the Dahl 2006 least-squares criterion
    ("Model-based clustering for expression data via a Dirichlet process mixture model").
    
    :param clusterings: clusterings[i,j] is the cluster to which gene j belongs at sample i
    :type clusterings: numpy array of ints
    :param sim_mat: sim_mat[i,j] = (# samples gene i in cluster with gene j)/(# total samples)
    :type sim_mat: numpy array of (0-1) floats
    
    :rtype: numpy array of best cluster labels
    
    Ni   i    (   R   R+   R
   R%   t   zerosR$   R   (   R&   R   t   min_distR   t
   clusteringR    R   t   kt   distR)   R   (    (    s   DP_GP/cluster_tools.pyt   best_clustering_by_sq_distš   s    	!c         C   s+   t  |  d d | d d ƒ} t | ƒ } | S(   sã  
    Find the optimal clustering by hierarchical clustering. For more details, see
    description of scipy.cluster.hierarchy.fclusterdata.
    
    :param clusterings: clusterings[i,j] is the cluster to which gene j belongs at sample i
    :type clusterings: numpy array of ints
    :param sim_mat: sim_mat[i,j] = (# samples gene i in cluster with gene j)/(# total samples)
    :type sim_mat: numpy array of (0-1) floats
    
    :rtype: numpy array of best cluster labels
    
    g®Gáz®ï?t   methodt   metrict   hamming(   R   R   (   R&   R7   R)   R   (    (    s   DP_GP/cluster_tools.pyt   best_clustering_by_h_clust¾   s    c         C   s†   t  | d d ƒ } | j d ƒ xU t |  ƒ D]G } |  | } t j | ƒ } x% | D] } | j d | | f ƒ qS Wq- W| j ƒ  d S(   s–   
    Save cluster membership information in the form:
    cluster<tab>gene
    (e.g.) 1<tab>gene1
              .
              .
              .
    s   _optimal_clustering.txtt   ws   cluster	gene
s   %s	%s
N(   t   opent   writet   sortedR    t   sorted_nicelyt   close(   t   optimal_cluster_labelst   output_path_prefixt   handlet   clustert   genest   gene(    (    s   DP_GP/cluster_tools.pyt#   save_cluster_membership_informationÑ   s    

(   t   numpyR   t   DP_GPR    t   scipy.cluster.hierarchyR   R   R   R   R   R$   R*   R0   R6   R:   RG   (    (    (    s   DP_GP/cluster_tools.pyt   <module>   s   			4					$	