ó
jF‰\c           @   sY  d  d l  m Z d  d l Z d  d l Z d  d l Z d  d l m Z m	 Z	 m
 Z
 i d d 6d d 6d d 6d d 6d d	 6d
 d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6Z e j e ƒ j Z d d e d „ Z d  d! „ Z d d" „ Z d d# „ Z d d$ „ Z d d% „ Z d& „  Z d' „  Z d( „  Z d) d* „ Z d+ d, d- „ Z d S(.   iÿÿÿÿ(   t   divisionN(   t   validate_matrixt   validate_probability_matt   validate_information_matt   At   Ct   Gt   Tt   AGt   Rt   CTt   Yt   GCt   St   ATt   Wt   GTt   Kt   ACt   Mt   CGTt   Bt   AGTt   Dt   ACTt   Ht   ACGt   Vt   ACGTt   Ni   c   
      C   s–  d d d d h } d d d h } t  |  ƒ }  | | k rH |  j ƒ  } n| | k sd t d | ƒ ‚ | | k s€ t d | ƒ ‚ | d k rË | d k rª t |  | ƒ } qU| d k rUt |  | ƒ } qUnŠ | d k ré t |  | ƒ }	 nN | d k rt |  | ƒ }	 n0 | d k r%t |  | ƒ }	 n t s7t d ƒ ‚ t	 |	 d d d	 | d
 | ƒ} | r†| d k swt d d ƒ ‚ t
 | ƒ } n  t  | ƒ } | S(   s¿  
    Transforms a matrix of one type into a matrix of another type.

    i = position
    c, d = character

    l = pseudocount
    C = number of characters

    N_ic = counts matrix element
    P_ic = probability matrix element
    Q_ic = background probability matrix element
    W_ic = weight matrix element
    I_ic = information matrix element

    counts -> probability:
        P_ic = (N_ic + l)/(N_i + C*l), N_i = sum_c(N_ic)

    probability -> weight:
        W_ic = log_2(P_ic / Q_ic)

    weight -> probability:
        P_ic = Q_ic * 2^(W_ic)

    probability -> information:
        I_ic = P_ic * sum_d(P_id * log2(P_id / W_id))

    information -> probability:
        P_ic = I_ic / sum_d(I_id)


    parameters
    ----------

    df: (dataframe)
        The matrix to be transformed.

    from_type: (str)
        Type of input matrix. Must be one of 'counts', 'probability',
        'weight', or 'information'.

    to_type: (str)
        Type of output matrix. Must be one of 'probability', 'weight', or
        'information'. Can NOT be 'counts'.

    background: (array, or df)
        Specification of background probabilities. If array, should be the
        same length as df.columns and correspond to the probability of each
        column's character. If df, should be a probability matrix the same
        shape as df.

    pseudocount: (number >= 0)
        Pseudocount to use when transforming from a count matrix to a
        probability matrix.

    center: (bool)
        Whether to center the output matrix. Note: this only works when
        to_type = 'weight', as centering a matrix doesn't make sense otherwise.

    returns
    -------
    out_df: (dataframe)
        Transformed matrix
    t   countst   probabilityt   weightt   informations   Error: invalid from_type=%ss   Error: invalid to_type="%s"s   THIS SHOULD NEVER HAPPENt	   from_typet   to_typet
   backgrounds6   Error: the option center=True is only compatible with s   to_type == "weight"(   R   t   copyt   AssertionErrort   _probability_mat_to_weight_matt#   _probability_mat_to_information_matt   _counts_mat_to_probability_matt   _weight_mat_to_probability_matt#   _information_mat_to_probability_matt   Falset   transform_matrixt   center_matrix(
   t   dfR"   R#   R$   t   pseudocountt   centert
   FROM_TYPESt   TO_TYPESt   out_dft   prob_df(    (    s   ../../logomaker/src/data.pyR-   !   s@    E		
g      ð?c         C   s›   | d k s t  d ƒ ‚ t |  ƒ }  |  j ƒ  } |  j | } | | j d d ƒ d d … t j f | j d d … d d … f <t | ƒ } t	 | ƒ } | S(   s:   
    Converts a counts matrix to a probability matrix
    i    s!   Error: Pseudocount must be >= 0. t   axisi   N(
   R&   R   R%   t   valuest   sumt   npt   newaxist   loct   normalize_matrixR   (   R/   R0   R4   t   vals(    (    s   ../../logomaker/src/data.pyR)   £   s    Bc         C   st   t  |  ƒ }  t |  | ƒ } |  j ƒ  } t j |  t ƒ t j | t ƒ | j d d … d d … f <t | ƒ } | S(   s:   
    Converts a probability matrix to a weight matrix
    N(   R   t   _get_background_matR%   R9   t   log2t   SMALLR;   R   (   R/   R$   t   bg_dfR4   (    (    s   ../../logomaker/src/data.pyR'   ¸   s    =c         C   sx   t  |  ƒ }  t |  | ƒ } |  j ƒ  } | j t j d |  j ƒ | j d d … d d … f <t | ƒ } t | ƒ } | S(   s:   
    Converts a probability matrix to a weight matrix
    i   N(	   R   R>   R%   R7   R9   t   powerR;   R<   R   (   R/   R$   RA   R4   (    (    s   ../../logomaker/src/data.pyR*   Ì   s    5c         C   s¹   t  |  ƒ }  t |  | ƒ } |  j ƒ  } |  j } | j } | t j | t ƒ t j | t ƒ } | j d d ƒ } | | d d … t j f | j	 d d … d d … f <t
 | ƒ } | S(   s@   
    Converts a probability matrix to an information matrix
    R6   i   N(   R   R>   R%   R7   R9   R?   R@   R8   R:   R;   R   (   R/   R$   RA   R4   t   fg_valst   bg_valst   tmp_valst   info_vec(    (    s   ../../logomaker/src/data.pyR(   ä   s    		(6c         C   s(   t  |  ƒ }  t |  ƒ } t | ƒ } | S(   s@   
    Converts a probability matrix to an information matrix
    (   R   R<   R   (   R/   R$   R4   (    (    s   ../../logomaker/src/data.pyR+   ý   s    c         C   sÂ   t  |  ƒ }  t |  j j ƒ  d k ƒ s3 t d ƒ ‚ |  j d d ƒ j } t t j | d ƒ ƒ sm t d ƒ ‚ |  j	 ƒ  } |  j | d d … t j
 f | j d d … d d … f <t | ƒ } | S(   s?   
    Normalizes a matrix df to a probability matrix out_df
    i    s,   Error: Some data frame entries are negative.R6   i   g        s-   Error: some columns in df sum to nearly zero.N(   R   t   allR7   t   ravelR&   R8   t   anyR9   t   iscloseR%   R:   R;   R   (   R/   t   sumsR4   (    (    s   ../../logomaker/src/data.pyR<     s    		9c         C   sv   t  |  ƒ }  |  j d d ƒ j } |  j ƒ  } |  j | d d … t j f | j d d … d d … f <t  | ƒ } | S(   sN   
    Centers each row of a matrix about zero by subtracting out the mean.
    R6   i   N(   R   t   meanR7   R%   R9   R:   R;   (   R/   t   meansR4   (    (    s   ../../logomaker/src/data.pyR.   *  s    9c         C   sm  |  j  \ } } |  j ƒ  } | d k rM d | | j d d … d d … f <nt | t j t t f ƒ rÖ t j	 | ƒ } t
 | ƒ | k s• t d ƒ ‚ | t j d d … f | j d d … d d … f <t | ƒ } n‡ t | t j j j ƒ r]t | ƒ } t |  j | j k ƒ st d ƒ ‚ t |  j | j k ƒ sBt d ƒ ‚ | j ƒ  } t | ƒ } n  t | ƒ } | S(   sl  
    Creates a background matrix given a background specification. There
    are three possiblities:

    1. background is None => out_df represents a uniform background
    2. background is a vector => this vector is normalized then used as
        the entries of the rows of out_df
    3. background is a dataframe => it is then normalized and use as out_df
    i   Ns4   Error: df and background have mismatched dimensions.s,   Error: df and bg_mat have different indexes.s,   Error: df and bg_mat have different columns.(   t   shapeR%   t   NoneR;   t
   isinstanceR9   t   ndarrayt   listt   tuplet   arrayt   lenR&   R:   R<   t   pdt   coret   framet	   DataFrameR   RG   t   indext   columnsR   (   R/   R$   t   num_post   num_colsR4   (    (    s   ../../logomaker/src/data.pyR>   >  s(    &	2		R   c         K   s»   t  |  ƒ } t d ƒ } t t | ƒ ƒ } t j d d d | d | ƒ } xN t t |  ƒ ƒ D]: \ } } t | }	 x! |	 D] }
 d | j | |
 f <qx Wq[ Wt | d d d	 d
 d | ƒ} | S(   sÎ  
    Generates a matrix corresponding to a (DNA) IUPAC string.

    parameters
    ----------
    iupac_seq: (str)
        An IUPAC sequence.

    to_type: (str)
        The type of matrix to convert to. Must be 'probability', 'weight',
        or 'information'

    **kwargs:
        Additional arguments to send to transform_matrix, e.g. background
        or center

    returns
    -------
    out_df: (dataframe)
        A matrix of the requested type.
    R   t   datag        R[   RZ   i   R0   i    R"   R   R#   (	   RU   RR   t   rangeRV   RY   t	   enumeratet
   iupac_dictR;   R-   (   t	   iupac_seqR#   t   kwargst   Lt   colsRZ   t
   counts_matt   it   ct   bst   bR4   (    (    s   ../../logomaker/src/data.pyt   iupac_to_matrixj  s    
		R   s   .-c         K   s  t  j g  |  D] } t  j t | ƒ ƒ ^ q ƒ } | j d } t  j | j ƒ  ƒ } | j ƒ  g  | D] } | | k rd | ^ qd }	 t t | ƒ ƒ }
 t j	 d d d |	 d |
 ƒ } xH |	 D]@ } | | k j
 t ƒ j d d ƒ j ƒ  | j d d … | f <q¹ Wt | d d	 d
 | | } | S(   só  
    Generates matrix from a sequence alignment

    parameters
    ----------
    sequences: (list of str)
        An list of sequences, all of which must be the same length

    to_type: (str)
        The type of matrix to output. Must be 'counts', 'probability',
        'weight', or 'information'

    **kwargs:
        Other arguments to pass to logomaker.transform_matrix(), e.g.
        pseudocount

    returns
    -------
    out_df: (dataframe)
        A matrix of the requested type.
    i   R^   i    R[   RZ   R6   NR"   R   R#   (   R9   RT   RR   RN   t   uniqueRH   t   sortR_   RV   RY   t   astypet   floatR8   R;   R-   (   t	   sequencesR#   t   characters_to_ignoreRc   t   seqt
   char_arrayRd   t   unique_charactersRh   R[   RZ   t	   counts_dfR4   (    (    s   ../../logomaker/src/data.pyt   alignment_to_matrix“  s    1
%>		(   t
   __future__R    t   numpyR9   t   pandasRV   t   pdbt   logomaker.src.validateR   R   R   Ra   t   finfoRo   t   tinyR@   RO   R,   R-   R)   R'   R*   R(   R+   R<   R.   R>   Rk   Rv   (    (    (    s   ../../logomaker/src/data.pyt   <module>   sF   
			,*