ó
jÜxec           @   sÌ  d  d l  m Z d  d l Z d  d l Z d  d l m Z m Z d  d l	 m
 Z
 i d d 6d d 6d	 d
 6Z i d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d  6d! d" 6d d# 6Z e j e ƒ j Z d$ d% d& d' h Z e e e d d d d( d) „ ƒ Z d* d+ „ Z d d, „ Z d d- „ Z d d. „ Z d d/ „ Z d0 „  Z d1 „  Z d2 „  Z e d d$ d d3 e d* d4 „ ƒ Z e d d e d% e d5 „ ƒ Z e d d d6 „ ƒ Z d S(7   iÿÿÿÿ(   t   divisionN(   t   checkt   handle_errors(   t   validate_matrixt   ACGTt   dnat   ACGUt   rnat   ACDEFGHIKLMNPQRSTVWYt   proteint   At   Ct   Gt   Tt   AGt   Rt   CTt   Yt   GCt   St   ATt   Wt   GTt   Kt   ACt   Mt   CGTt   Bt   AGTt   Dt   ACTt   Ht   ACGt   Vt   Nt   countst   probabilityt   weightt   informationi   c   	      C   sö  t  |  ƒ }  t t | t ƒ d t | ƒ ƒ t t | t ƒ d t | ƒ ƒ t | t k pd | d k d | t f ƒ t | t k p | d k d | t f ƒ t t | t g  ƒ t j t	 j
 f ƒ pÎ | d k d t | ƒ ƒ t t | t t f ƒ d t | ƒ ƒ t | d k d | ƒ | t k r`t | d k o@| d k d	 | | f ƒ t |  ƒ } n†| t k r¤t | d k o„| d k d
 | | f ƒ t |  ƒ } nB| | k r¿|  j ƒ  } n't | d k	 o×| d k	 d | | f ƒ t | d k d | ƒ | d k r\| d k r)t |  | ƒ } qæ| d k rGt |  | ƒ } qæt sæt d ƒ ‚ nŠ | d k rzt |  | ƒ } nN | d k r˜t |  | ƒ } n0 | d k r¶t |  | ƒ } n t sÈt d ƒ ‚ t | d d d | d | ƒ} t  | ƒ } | S(   s,  
    Performs transformations on a matrix. There are three types of
    transformations that can be performed:

    1. Center values:
        Subtracts the mean from each row in df. This is common for weight
        matrices or energy matrices. To do this, set center_values=True.

    2. Normalize values:
        Divides each row by the sum of the row. This is needed for probability
        matrices. To do this, set normalize_values=True.

    3. From/To transformations:
        Transforms from one type of matrix (e.g. 'counts') to another type
        of matrix (e.g. 'information'). To do this, set from_type and to_type
        arguments.

    Here are the mathematical formulas invoked by From/To transformations:

        from_type='counts' ->  to_type='probability':
            P_ic = (N_ic + l)/(N_i + C*l), N_i = sum_c(N_ic)

        from_type='probability' -> to_type='weight':
            W_ic = log_2(P_ic / Q_ic)

        from_type='weight' -> to_type='probability':
            P_ic = Q_ic * 2^(W_ic)

        from_type='probability' -> to_type='information':
            I_ic = P_ic * sum_d(P_id * log2(P_id / W_id))

        from_type='information' -> to_type='probability':
            P_ic = I_ic / sum_d(I_id)

        notation:
            i = position
            c, d = character
            l = pseudocount
            C = number of characters
            N_ic = counts matrix element
            P_ic = probability matrix element
            Q_ic = background probability matrix element
            W_ic = weight matrix element
            I_ic = information matrix element

    Using these five 1-step transformations, 2-step transformations
    are also enabled, e.g., from_type='counts' -> to_type='information'.

    parameters
    ----------

    df: (dataframe)
        The matrix to be transformed.

    center_values: (bool)
        Whether to center matrix values, i.e., subtract the mean from each
        row.

    normalize_values: (bool)
        Whether to normalize each row, i.e., divide each row by
        the sum of that row.

    from_type: (str)
        Type of input matrix. Must be one of 'counts', 'probability',
        'weight', or 'information'.

    to_type: (str)
        Type of output matrix. Must be one of 'probability', 'weight', or
        'information'. Can be 'counts' ONLY if from_type is 'counts' too.

    background: (array, or df)
        Specification of background probabilities. If array, should be the
        same length as df.columns and correspond to the probability of each
        column's character. If df, should be a probability matrix the same
        shape as df.

    pseudocount: (number >= 0)
        Pseudocount to use when transforming from a counts matrix to a
        probability matrix.

    returns
    -------
    out_df: (dataframe)
        Transformed matrix
    s-   type(center_values) = %s must be of type bools0   type(normalize_values) = %s must be of type bools$   from_type = %s must be None or in %ss"   to_type = %s must be None or in %ss@   type(background) = %s must be None or array-like or a dataframe.s'   type(pseudocount) = %s must be a numberi    s   pseudocount=%s must be >= 0s`   If center_values is True, both from_type and to_typemust be None. Here, from_type=%s, to_type=%ssc   If normalize_values is True, both from_type and to_typemust be None. Here, from_type=%s, to_type=%sso   Unless center_values is True or normalize_values is True,Neither from_type (=%s) nor to_type (=%s) can be None.R#   sS   Can only have to_type='counts' if from_type='counts'. Here, however, from_type='%s'R$   R%   R&   s   THIS SHOULD NEVER EXECUTEt	   from_typet   to_typet
   backgroundN(   R   R   t
   isinstancet   boolt   typet   MATRIX_TYPESt   Nonet   npt   ndarrayt   pdt	   DataFramet   intt   floatt   Truet   _center_matrixt   _normalize_matrixt   copyt   _probability_mat_to_weight_matt#   _probability_mat_to_information_matt   Falset   AssertionErrort   _counts_mat_to_probability_matt   _weight_mat_to_probability_matt#   _information_mat_to_probability_matt   transform_matrix(	   t   dft   center_valuest   normalize_valuesR'   R(   R)   t   pseudocountt   out_dft   prob_df(    (    s9   /tmp/pip-install-l3LICk/logomaker/logomaker/src/matrix.pyR@   (   sr    _'			g      ð?c         C   sœ   t  |  ƒ }  t | d k d ƒ |  j ƒ  } |  j | } | | j d d ƒ d d … t j f | j d d … d d … f <t | ƒ } t  | d d ƒ} | S(   s:   
    Converts a counts matrix to a probability matrix
    i    s   pseudocount must be >= 0.t   axisi   Nt   matrix_typeR$   (	   R   R   R8   t   valuest   sumR/   t   newaxist   locR7   (   t	   counts_dfRD   RF   t   vals(    (    s9   /tmp/pip-install-l3LICk/logomaker/logomaker/src/matrix.pyR=      s    Bc         C   sz   t  |  d d ƒ}  t |  | ƒ } |  j ƒ  } t j |  t ƒ t j | t ƒ | j d d … d d … f <t  | ƒ } | S(   s:   
    Converts a probability matrix to a weight matrix
    RH   R$   N(   R   t   _get_background_matR8   R/   t   log2t   SMALLRL   (   RF   R)   t   bg_dft	   weight_df(    (    s9   /tmp/pip-install-l3LICk/logomaker/logomaker/src/matrix.pyR9     s    =c         C   s~   t  |  ƒ }  t |  | ƒ } |  j ƒ  } | j t j d |  j ƒ | j d d … d d … f <t | ƒ } t  | d d ƒ} | S(   s:   
    Converts a weight matrix to a probability matrix
    i   NRH   R$   (   R   RO   R8   RI   R/   t   powerRL   R7   (   RS   R)   RR   RF   (    (    s9   /tmp/pip-install-l3LICk/logomaker/logomaker/src/matrix.pyR>   +  s    5c         C   sÅ   t  |  d d ƒ}  t |  | ƒ } |  j ƒ  } |  j } | j } | t j | t ƒ t j | t ƒ } | j d d ƒ } | | d d … t j f | j	 d d … d d … f <t  | d d ƒ} | S(   s@   
    Converts a probability matrix to an information matrix
    RH   R$   RG   i   NR&   (
   R   RO   R8   RI   R/   RP   RQ   RJ   RK   RL   (   RF   R)   RR   t   info_dft   fg_valst   bg_valst   tmp_valst   info_vec(    (    s9   /tmp/pip-install-l3LICk/logomaker/logomaker/src/matrix.pyR:   B  s    		(6c         C   s   t  |  d d ƒ}  t |  | ƒ } t j |  j d d ƒ d ƒ } | j | d d … f |  j | d d … f <t |  ƒ } t  | d d ƒ} | S(   sA   
    Converts an information matrix to an probability matrix
    RH   R&   RG   i   g        NR$   (   R   RO   R/   t   iscloseRJ   RL   R7   (   RU   R)   RR   t   zero_indicesRF   (    (    s9   /tmp/pip-install-l3LICk/logomaker/logomaker/src/matrix.pyR?   Z  s    ,c         C   s¾   t  |  ƒ }  t t |  j j ƒ  d k ƒ d ƒ |  j d d ƒ j } t t t j | d ƒ ƒ d ƒ |  j	 ƒ  } |  j | d d … t j
 f | j d d … d d … f <t  | d d	 ƒ} | S(
   s@   
    Normalizes a matrix df to a probability matrix prob_df
    i    s%   Some data frame entries are negative.RG   i   g        s&   Some columns in df sum to nearly zero.NRH   R$   (   R   R   t   allRI   t   ravelRJ   t   anyR/   RZ   R8   RK   RL   (   RA   t   sumsRF   (    (    s9   /tmp/pip-install-l3LICk/logomaker/logomaker/src/matrix.pyR7   u  s    9c         C   sv   t  |  ƒ }  |  j d d ƒ j } |  j ƒ  } |  j | d d … t j f | j d d … d d … f <t  | ƒ } | S(   sN   
    Centers each row of a matrix about zero by subtracting out the mean.
    RG   i   N(   R   t   meanRI   R8   R/   RK   RL   (   RA   t   meansRE   (    (    s9   /tmp/pip-install-l3LICk/logomaker/logomaker/src/matrix.pyR6   ‘  s    9c         C   sX  |  j  \ } } |  j ƒ  } | d k rM d | | j d d … d d … f <nõ t | t j t t f ƒ rÑ t j	 | ƒ } t
 t | ƒ | k d ƒ | t j d d … f | j d d … d d … f <t | ƒ } nq t | t j j j ƒ rBt | ƒ } t
 t |  j | j k ƒ d ƒ t
 t |  j | j k ƒ d ƒ t | ƒ } n  t | d d ƒ} | S(   sü  
    Creates a background matrix given a background specification. There
    are three possiblities:

    1. background is None => out_df represents a uniform background
    2. background is a vector => this vector is normalized then used as
        the entries of the rows of out_df. Vector must be the same length
        as the number of columns in df
    3. background is a dataframe => it is then normalized and use as out_df.
        In this case, background must have the same rows and cols as df
    i   Ns-   df and background have mismatched dimensions.s,   Error: df and bg_mat have different indexes.s,   Error: df and bg_mat have different columns.RH   R$   (   t   shapeR8   R.   RL   R*   R/   R0   t   listt   tuplet   arrayR   t   lenRK   R7   R1   t   coret   frameR2   R   R\   t   indext   columns(   RA   R)   t   num_post   num_colsRR   (    (    s9   /tmp/pip-install-l3LICk/logomaker/logomaker/src/matrix.pyRO   ¥  s&    &2s   .-c      
   C   s<  t  t |  t t t j t j f ƒ d ƒ t |  ƒ }  t  t |  ƒ d k d ƒ t  t	 d „  |  Dƒ ƒ d ƒ t  t | t
 ƒ d t | ƒ ƒ t  t | t ƒ d t | ƒ ƒ t |  d ƒ } t  t	 g  |  D] } t | ƒ | k ^ qÇ ƒ d ƒ t  t | t t t j t j f ƒ p| d k d	 ƒ | d k rDt j t |  ƒ ƒ } n5 t  t | ƒ t |  ƒ k d
 t | ƒ t |  ƒ f ƒ t  t | t g  ƒ t j t j f ƒ p©| d k d t | ƒ ƒ t j ƒ  }	 t  | |	 k d | |	 f ƒ t j g  |  D] }
 t j t |
 ƒ ƒ ^ qðƒ } t j | j ƒ  ƒ } | j ƒ  g  | D] } | | k r:| ^ q:} t t | ƒ ƒ } t j d d d | d | ƒ } xb | D]Z } | | k j t ƒ | d d … t j f } | j d d ƒ j | j d d … | f <qWt | d d d | d | d | ƒ} | r8| d k r8t | d t ƒ} n  | S(   s  
    Generates matrix from a sequence alignment

    parameters
    ----------
    sequences: (list of strings)
        A list of sequences, all of which must be the same length

    counts: (None or list of numbers)
        If not None, must be a list of numbers the same length os sequences,
        containing the (nonnegative) number of times that each sequence was
        observed. If None, defaults to 1.

    to_type: (str)
        The type of matrix to output. Must be 'counts', 'probability',
        'weight', or 'information'

    background: (array, or df)
        Specification of background probabilities. If array, should be the
        same length as df.columns and correspond to the probability of each
        column's character. If df, should be a probability matrix the same
        shape as df.

    characters_to_ignore: (str)
        Characters to ignore within sequences. This is often needed when
        creating matrices from gapped alignments.

    center_weights: (bool)
        Whether to subtract the mean of each row, but only if to_type=='weight'.

    pseudocount: (number >= 0.0)
        Pseudocount to use when converting from counts to probabilities.

    returns
    -------
    out_df: (dataframe)
        A matrix of the requested type.
    s:   sequences must be a list, tuple, np.ndarray, or pd.Series.i    s   sequences must have length > 0.c         s   s   |  ] } t  | t ƒ Vq d  S(   N(   R*   t   str(   t   .0t   seq(    (    s9   /tmp/pip-install-l3LICk/logomaker/logomaker/src/matrix.pys	   <genexpr>  s    s$   sequences must all be of type strings"   type(seq) = %s must be of type strs(   type(center_weights) = %s; must be bool.s4   all elements of sequences must have the same length.s?   counts must be None or a list, tuple, np.ndarray, or pd.Series.sQ   counts must be the same length as sequences;len(counts) = %d; len(sequences) = %ds@   type(background) = %s must be None or array-like or a dataframe.s   to_type=%s; must be in %st   dataRj   Ri   NRG   R'   R#   R(   RD   R)   R%   RB   (   R   R*   Rc   Rd   R/   R0   R1   t   SeriesRf   R\   Rm   R,   R+   R.   t   onesR2   R-   R8   Re   t   uniqueR]   t   sortt   ranget   astypeR4   RK   RJ   R   RL   R@   R5   (   t	   sequencesR#   R(   R)   t   characters_to_ignoret   center_weightsRD   t   Lt   st   valid_typesRo   t
   char_arrayt   unique_characterst   cRj   Ri   RM   t   tmp_matRE   (    (    s9   /tmp/pip-install-l3LICk/logomaker/logomaker/src/matrix.pyt   alignment_to_matrixÒ  sZ    2!+$	'	1
%,,		c         C   sÁ  t  j ƒ  } | j d ƒ t t |  t ƒ d t |  ƒ ƒ t t | t ƒ d t | ƒ ƒ | d k r„ t	 t
 |  ƒ ƒ } | j ƒ  n+ t t	 t
 t j f } t t | | ƒ d ƒ | d k	 rý t	 t j ƒ  ƒ } t | | k d | | f ƒ t	 t | ƒ } n  t | | k d | | f ƒ t t | t ƒ d t | ƒ ƒ | rft | d k d ƒ t	 t d	 ƒ } n  t |  ƒ }	 t	 t |	 ƒ ƒ }
 t j d
 d d | d |
 ƒ } | r(t	 t j ƒ  ƒ } x» t |  ƒ D]Z \ } } t | | k d | | | f ƒ t | } x! | D] } d | j | | f <qWqÇWnP xM t |  ƒ D]? \ } } t | | k d | | | f ƒ d | j | | f <q5Wt | d d d d d | ƒ} | r½| d k r½t | d t ƒ} n  | S(   s¬  
    Generates a matrix from a sequence. With default keyword arguments,
    this is a one-hot-encoded version of the sequence provided. Alternatively,
    is_iupac=True allows users to get matrix models based in IUPAC motifs.

    parameters
    ----------

    seq: (str)
        Sequence from which to construct matrix.

    cols: (str or array-like or None)
        The characters to use for the matrix columns. If None, cols is
        constructed from the unqiue characters in seq. Overriden by alphabet
        and is_iupac.

    alphabet: (str or None)
        The alphabet used to determine the columns of the matrix.
        Options are: 'dna', 'rna', 'protein'. Ignored if None. Overrides cols.

    is_iupac: (bool)
        If True, it is assumed that the sequence represents an IUPAC DNA
        string. In this case, cols is overridden, and alphabet must be None.

    to_type: (str)
        The type of matrix to output. Must be 'probability', 'weight',
        or 'information'

    center_weights: (bool)
        Whether to subtract the mean of each row, but only if to_type='weight'.

    returns
    -------
    seq_df: (dataframe)
        the matrix returned to the user.
    R#   s"   type(seq) = %s must be of type strs(   type(center_weights) = %s; must be bool.s<   cols = %s must be None or a string, set, list, or np.ndarrays   alphabet = %s; must be in %s.s)   invalid to_type=%s; to_type must be in %ss"   type(is_iupac) = %s; must be bool.s(   must have alphabet=None if is_iupac=TrueR   Rp   g        Rj   Ri   sL   character %s at position %d is not a valid IUPAC character;must be one of %sg      ð?s-   character %s at position %d is not in cols=%sRD   i    R'   R(   R%   RB   N(   R-   R8   t   removeR   R*   Rm   R,   R+   R.   Rc   t   setRt   R/   R0   t   ALPHABET_DICTt   keysRf   Ru   R1   R2   t
   IUPAC_DICTt	   enumerateRL   R@   R5   (   Ro   t   colst   alphabett   is_iupacR(   Ry   R|   t
   cols_typest   valid_alphabetsRz   Ri   RM   t   iupac_characterst   iR   t   bst   bRE   (    (    s9   /tmp/pip-install-l3LICk/logomaker/logomaker/src/matrix.pyt   sequence_to_matrixX  s`    -
		c   	      C   sq  t  |  t t j t j f ƒ rn y, d j g  |  D] } t | ƒ ^ q. ƒ }  WqŸ t t	 d t
 t ƒ ƒ qŸ Xn1 y t |  ƒ }  Wn t t	 d t
 t ƒ ƒ n Xt t  |  t ƒ d t |  ƒ ƒ t t  | t g  ƒ t j t j f ƒ d t | ƒ ƒ t | ƒ } t t |  ƒ t | ƒ k d ƒ | d k rJt t |  ƒ ƒ } | j ƒ  nu t t t t j f } t t  | | ƒ d ƒ t t t | ƒ ƒ t t |  ƒ ƒ k d ƒ t t | ƒ t |  ƒ k d ƒ | d k	 rt t j ƒ  ƒ } t | | k d	 | | f ƒ t t | ƒ } n  t |  d
 | ƒ} | j ƒ  } | j t j | ƒ d d … t j f | j d d … d d … f <| S(   s$  
    Takes a sequence string and an array of values values and outputs a
    values dataframe. The returned dataframe is a L by C matrix where C is
    the number ofcharacters and L is sequence length.  If matrix is denoted as
    S, i indexes positions and c indexes characters, then S_ic will be non-zero
    (equal to the value in the values array at position p) only if character c
    occurs at position p in sequence. All other elements of S are zero.

    example usage:

    saliency_mat = logomaker.saliency_to_matrix(sequence,values)
    logomaker.Logo(saliency_mat)

    parameters
    ----------

    seq: (str or array-like list of single characters)
        sequence for which values matrix is constructed

    values: (array-like list of numbers)
        array of values values for each character in sequence

    cols: (str or array-like or None)
        The characters to use for the matrix columns. If None, cols is
        constructed from the unqiue characters in seq. Overridden by alphabet
        and is_iupac.

    alphabet: (str or None)
        The alphabet used to determine the columns of the matrix.
        Options are: 'dna', 'rna', 'protein'. Ignored if None. Overrides cols.

    returns
    -------
    saliency_df: (dataframe)
        values matrix in the form of a dataframe

    t    s    could not convert %s to type strs"   type(seq) = %s must be of type strs&   type(values) = %s must be of type lists,   length of seq and values list must be equal.s<   cols = %s must be None or a string, set, list, or np.ndarraysF   length of set of unique characters must be equal for "cols " and "seq"s5   unique characters for "cols" and "seq" must be equal.s   alphabet = %s; must be in %s.Rˆ   N(   R*   Rc   R/   R0   R1   Rq   t   joinRm   R   R;   t   reprR,   Rf   R.   Rƒ   Rt   R„   R…   R‘   R8   RI   Re   RK   RL   (	   Ro   RI   Rˆ   R‰   t   xR‹   RŒ   t   ohe_sequencet   saliency_df(    (    s9   /tmp/pip-install-l3LICk/logomaker/logomaker/src/matrix.pyt   saliency_to_matrixé  sH    ),$$<(    t
   __future__R    t   numpyR/   t   pandasR1   t   logomaker.src.error_handlingR   R   t   logomaker.src.validateR   R„   R†   t   finfoR4   t   tinyRQ   R-   R;   R.   R@   R=   R9   R>   R:   R?   R7   R6   RO   R   R‘   R˜   (    (    (    s9   /tmp/pip-install-l3LICk/logomaker/logomaker/src/matrix.pyt   <module>   sp   

Ñ			-‹