3
jF\A                 @   s   d dl mZ d dlZd dlZd dlZd dlmZm	Z	m
Z
 dddddd	d
dddddddddZejejZd0ddZd1ddZd2ddZd3ddZd4dd Zd5d!d"Zd#d$ Zd%d& Zd'd( Zd6d*d+Zd7d.d/ZdS )8    )divisionN)validate_matrixvalidate_probability_matvalidate_information_matACGTZAGZCTGCATGTZACZCGTZAGTZACTZACGACGT)r   r   r   r	   RYSWKMBDHVN   Fc       
      C   s  ddddh}dddh}t | } ||kr0| j }n||ksDtd| ||ksXtd| |dkr|dkrtt| |}q|dkrt| |}nX|dkrt| |}	n4|dkrt| |}	n |dkrt| |}	ndstdt|	d||d	}|r|dkstd
d t	|}t |}|S )a  
    Transforms a matrix of one type into a matrix of another type.

    i = position
    c, d = character

    l = pseudocount
    C = number of characters

    N_ic = counts matrix element
    P_ic = probability matrix element
    Q_ic = background probability matrix element
    W_ic = weight matrix element
    I_ic = information matrix element

    counts -> probability:
        P_ic = (N_ic + l)/(N_i + C*l), N_i = sum_c(N_ic)

    probability -> weight:
        W_ic = log_2(P_ic / Q_ic)

    weight -> probability:
        P_ic = Q_ic * 2^(W_ic)

    probability -> information:
        I_ic = P_ic * sum_d(P_id * log2(P_id / W_id))

    information -> probability:
        P_ic = I_ic / sum_d(I_id)


    parameters
    ----------

    df: (dataframe)
        The matrix to be transformed.

    from_type: (str)
        Type of input matrix. Must be one of 'counts', 'probability',
        'weight', or 'information'.

    to_type: (str)
        Type of output matrix. Must be one of 'probability', 'weight', or
        'information'. Can NOT be 'counts'.

    background: (array, or df)
        Specification of background probabilities. If array, should be the
        same length as df.columns and correspond to the probability of each
        column's character. If df, should be a probability matrix the same
        shape as df.

    pseudocount: (number >= 0)
        Pseudocount to use when transforming from a count matrix to a
        probability matrix.

    center: (bool)
        Whether to center the output matrix. Note: this only works when
        to_type = 'weight', as centering a matrix doesn't make sense otherwise.

    returns
    -------
    out_df: (dataframe)
        Transformed matrix
    countsprobabilityweightZinformationzError: invalid from_type=%szError: invalid to_type="%s"FzTHIS SHOULD NEVER HAPPEN)	from_typeto_type
backgroundz6Error: the option center=True is only compatible with zto_type == "weight")
r   copyAssertionError_probability_mat_to_weight_mat#_probability_mat_to_information_mat_counts_mat_to_probability_mat_weight_mat_to_probability_mat#_information_mat_to_probability_mattransform_matrixcenter_matrix)
dfr   r   r   pseudocountcenterZ
FROM_TYPESZTO_TYPESout_dfZprob_df r-   ../../logomaker/src/data.pyr'   !   s@    E






r'         ?c             C   sn   |dkst dt| } | j }| j| }||jddddtjf  |jddddf< t|}t	|}|S )z:
    Converts a counts matrix to a probability matrix
    r   z!Error: Pseudocount must be >= 0. r   )axisN)
r!   r   r    valuessumnpnewaxislocnormalize_matrixr   )r)   r*   r,   valsr-   r-   r.   r$      s    
0r$   c             C   sT   t | } t| |}| j }tj| t tj|t  |jddddf< t|}|S )z:
    Converts a probability matrix to a weight matrix
    N)r   _get_background_matr    r3   log2SMALLr5   r   )r)   r   bg_dfr,   r-   r-   r.   r"      s    
.r"   c             C   sT   t | } t| |}| j }|jtjd| j |jddddf< t|}t|}|S )z:
    Converts a probability matrix to a weight matrix
       N)	r   r8   r    r1   r3   powerr5   r6   r   )r)   r   r;   r,   r-   r-   r.   r%      s    
&r%   c             C   s   t | } t| |}| j }| j}|j}|tj|t tj|t   }|jdd}||ddtjf  |j	ddddf< t
|}|S )z@
    Converts a probability matrix to an information matrix
    r   )r0   N)r   r8   r    r1   r3   r9   r:   r2   r4   r5   r   )r)   r   r;   r,   Zfg_valsZbg_valsZtmp_valsZinfo_vecr-   r-   r.   r#      s    
 (r#   c             C   s   t | } t| }t|}|S )z@
    Converts a probability matrix to an information matrix
    )r   r6   r   )r)   r   r,   r-   r-   r.   r&      s    r&   c             C   s   t | } t| jj dks"td| jddj}ttj|d sJtd| j	 }| j|ddtj
f  |jddddf< t|}|S )z?
    Normalizes a matrix df to a probability matrix out_df
    r   z,Error: Some data frame entries are negative.r   )r0   g        z-Error: some columns in df sum to nearly zero.N)r   allr1   ravelr!   r2   anyr3   iscloser    r4   r5   r   )r)   sumsr,   r-   r-   r.   r6     s    *r6   c             C   sT   t | } | jddj}| j }| j|ddtjf  |jddddf< t |}|S )zN
    Centers each row of a matrix about zero by subtracting out the mean.
    r   )r0   N)r   meanr1   r    r3   r4   r5   )r)   meansr,   r-   r-   r.   r(   *  s    *r(   c             C   s   | j \}}| j }|dkr6d| |jddddf< nt|tjttfrtj|}t	||ksft
d|tjddf |jddddf< t|}nXt|tjjjrt|}t| j|jkst
dt| j|jkst
d|j }t|}t|}|S )al  
    Creates a background matrix given a background specification. There
    are three possiblities:

    1. background is None => out_df represents a uniform background
    2. background is a vector => this vector is normalized then used as
        the entries of the rows of out_df
    3. background is a dataframe => it is then normalized and use as out_df
    Nr   z4Error: df and background have mismatched dimensions.z,Error: df and bg_mat have different indexes.z,Error: df and bg_mat have different columns.)shaper    r5   
isinstancer3   ndarraylisttuplearraylenr!   r4   r6   pdcoreframe	DataFramer   r>   indexcolumnsr   )r)   r   num_posZnum_colsr,   r-   r-   r.   r8   >  s(    

$
r8   r   c             K   s~   t | }td}tt|}tjd||d}x<tt| D ],\}}t| }	x|	D ]}
d|j||
f< qPW q:W t|dd|d}|S )a  
    Generates a matrix corresponding to a (DNA) IUPAC string.

    parameters
    ----------
    iupac_seq: (str)
        An IUPAC sequence.

    to_type: (str)
        The type of matrix to convert to. Must be 'probability', 'weight',
        or 'information'

    **kwargs:
        Additional arguments to send to transform_matrix, e.g. background
        or center

    returns
    -------
    out_df: (dataframe)
        A matrix of the requested type.
    r   g        )datarQ   rP   r   r   r   )r*   r   r   )	rK   rH   rangerL   rO   	enumerate
iupac_dictr5   r'   )Z	iupac_seqr   kwargsLcolsrP   Z
counts_maticbsbr,   r-   r-   r.   iupac_to_matrixj  s    
r^   r   .-c                s   t jdd | D }|jd }t j|j }|j   fdd|D }tt|}tj	d||d}	x4|D ],}
||
kj
tjddj |	jdd|
f< qhW t|	fd	|d
|}|S )a  
    Generates matrix from a sequence alignment

    parameters
    ----------
    sequences: (list of str)
        An list of sequences, all of which must be the same length

    to_type: (str)
        The type of matrix to output. Must be 'counts', 'probability',
        'weight', or 'information'

    **kwargs:
        Other arguments to pass to logomaker.transform_matrix(), e.g.
        pseudocount

    returns
    -------
    out_df: (dataframe)
        A matrix of the requested type.
    c             S   s   g | ]}t jt|qS r-   )r3   rJ   rH   ).0seqr-   r-   r.   
<listcomp>  s    z'alignment_to_matrix.<locals>.<listcomp>r   c                s   g | ]}| kr|qS r-   r-   )r`   r[   )characters_to_ignorer-   r.   rb     s    r   )rS   rQ   rP   )r0   Nr   )r   r   )r3   rJ   rE   uniquer?   sortrH   rT   rL   rO   astypefloatr2   r5   r'   )	sequencesr   rc   rW   Z
char_arrayrX   Zunique_charactersrQ   rP   Z	counts_dfr[   r,   r-   )rc   r.   alignment_to_matrix  s    

,ri   )Nr   F)r/   )N)N)N)N)r   )r   r_   )
__future__r   numpyr3   pandasrL   pdblogomaker.src.validater   r   r   rV   finforg   tinyr:   r'   r$   r"   r%   r#   r&   r6   r(   r8   r^   ri   r-   r-   r-   r.   <module>   sD     





,
* 