3
"G\8m                 @   s  d dl mZ d dlZd dlZd dlmZmZ d dl	m
Z
 ddddZd	d
ddddddddddddddZejejZddddhZed8ddZd9d!d"Zd:d#d$Zd;d%d&Zd<d'd(Zd=d)d*Zd+d, Zd-d. Zd/d0 Zed>d2d3Zed?d4d5Zed@d6d7ZdS )A    )divisionN)checkhandle_errors)validate_matrixACGTACGUACDEFGHIKLMNPQRSTVWY)dnarnaproteinACGTZAGZCTGCATZGTZACZCGTZAGTZACTZACG)r   r   r   r   RYSWKMBDHVNcountsprobabilityweightinformationF   c       	      C   s0  t | } tt|tdt|  tt|tdt|  t|tkpH|dkd|tf  t|tkpf|dkd|tf  tt|tg tjtj	fp|dkdt|  tt|t
tfdt|  t|dkd	|  |d
krt|dko|dkd||f  t| }n |d
kr8t|dko |dkd||f  t| }n||krL| j }nt|dk	o^|dk	d||f  t|dkd|  |dkr|dkrt| |}n$|dkrt| |}nds$tdn`|dkrt| |}n:|dkrt| |}n$|dkrt| |}ndstdt|d||d}t |}|S )a,  
    Performs transformations on a matrix. There are three types of
    transformations that can be performed:

    1. Center values:
        Subtracts the mean from each row in df. This is common for weight
        matrices or energy matrices. To do this, set center_values=True.

    2. Normalize values:
        Divides each row by the sum of the row. This is needed for probability
        matrices. To do this, set normalize_values=True.

    3. From/To transformations:
        Transforms from one type of matrix (e.g. 'counts') to another type
        of matrix (e.g. 'information'). To do this, set from_type and to_type
        arguments.

    Here are the mathematical formulas invoked by From/To transformations:

        from_type='counts' ->  to_type='probability':
            P_ic = (N_ic + l)/(N_i + C*l), N_i = sum_c(N_ic)

        from_type='probability' -> to_type='weight':
            W_ic = log_2(P_ic / Q_ic)

        from_type='weight' -> to_type='probability':
            P_ic = Q_ic * 2^(W_ic)

        from_type='probability' -> to_type='information':
            I_ic = P_ic * sum_d(P_id * log2(P_id / W_id))

        from_type='information' -> to_type='probability':
            P_ic = I_ic / sum_d(I_id)

        notation:
            i = position
            c, d = character
            l = pseudocount
            C = number of characters
            N_ic = counts matrix element
            P_ic = probability matrix element
            Q_ic = background probability matrix element
            W_ic = weight matrix element
            I_ic = information matrix element

    Using these five 1-step transformations, 2-step transformations
    are also enabled, e.g., from_type='counts' -> to_type='information'.

    parameters
    ----------

    df: (dataframe)
        The matrix to be transformed.

    center_values: (bool)
        Whether to center matrix values, i.e., subtract the mean from each
        row.

    normalize_values: (bool)
        Whether to normalize each row, i.e., divide each row by
        the sum of that row.

    from_type: (str)
        Type of input matrix. Must be one of 'counts', 'probability',
        'weight', or 'information'.

    to_type: (str)
        Type of output matrix. Must be one of 'probability', 'weight', or
        'information'. Can be 'counts' ONLY if from_type is 'counts' too.

    background: (array, or df)
        Specification of background probabilities. If array, should be the
        same length as df.columns and correspond to the probability of each
        column's character. If df, should be a probability matrix the same
        shape as df.

    pseudocount: (number >= 0)
        Pseudocount to use when transforming from a counts matrix to a
        probability matrix.

    returns
    -------
    out_df: (dataframe)
        Transformed matrix
    z-type(center_values) = %s must be of type boolz0type(normalize_values) = %s must be of type boolNz$from_type = %s must be None or in %sz"to_type = %s must be None or in %sz@type(background) = %s must be None or array-like or a dataframe.z'type(pseudocount) = %s must be a numberr   zpseudocount=%s must be >= 0Tz`If center_values is True, both from_type and to_typemust be None. Here, from_type=%s, to_type=%szcIf normalize_values is True, both from_type and to_typemust be None. Here, from_type=%s, to_type=%szoUnless center_values is True or normalize_values is True,Neither from_type (=%s) nor to_type (=%s) can be None.r   zSCan only have to_type='counts' if from_type='counts'. Here, however, from_type='%s'r   r   r    FzTHIS SHOULD NEVER EXECUTE)	from_typeto_type
background)r   r   
isinstancebooltypeMATRIX_TYPESnpndarraypd	DataFrameintfloat_center_matrix_normalize_matrixcopy_probability_mat_to_weight_mat#_probability_mat_to_information_matAssertionError_counts_mat_to_probability_mat_weight_mat_to_probability_mat#_information_mat_to_probability_mattransform_matrix)	dfcenter_valuesnormalize_valuesr"   r#   r$   pseudocountout_dfprob_df r?   ../../logomaker/src/matrix.pyr8   (   sr    _














r8         ?c             C   sp   t | } t|dkd | j }| j| }||jddddtjf  |jddddf< t|}t |dd}|S )z:
    Converts a counts matrix to a probability matrix
    r   zpseudocount must be >= 0.r!   )axisNr   )matrix_type)	r   r   r1   valuessumr)   newaxislocr0   )	counts_dfr<   r>   valsr?   r?   r@   r5      s    
0r5   c             C   sX   t | dd} t| |}| j }tj| t tj|t  |jddddf< t |}|S )z:
    Converts a probability matrix to a weight matrix
    r   )rC   N)r   _get_background_matr1   r)   log2SMALLrG   )r>   r$   bg_df	weight_dfr?   r?   r@   r2     s    
.r2   c             C   sX   t | } t| |}| j }|jtjd| j |jddddf< t|}t |dd}|S )z:
    Converts a weight matrix to a probability matrix
       Nr   )rC   )r   rJ   r1   rD   r)   powerrG   r0   )rN   r$   rM   r>   r?   r?   r@   r6   +  s    
&r6   c             C   s   t | dd} t| |}| j }| j}|j}|tj|t tj|t   }|jdd}||ddtjf  |j	ddddf< t |dd}|S )z@
    Converts a probability matrix to an information matrix
    r   )rC   r!   )rB   Nr    )
r   rJ   r1   rD   r)   rK   rL   rE   rF   rG   )r>   r$   rM   info_dfZfg_valsZbg_valsZtmp_valsZinfo_vecr?   r?   r@   r3   B  s    
 (r3   c             C   sb   t | dd} t| |}tj| jddd}|j|ddf | j|ddf< t| }t |dd}|S )zA
    Converts an information matrix to an probability matrix
    r    )rC   r!   )rB   g        Nr   )r   rJ   r)   iscloserE   rG   r0   )rQ   r$   rM   Zzero_indicesr>   r?   r?   r@   r7   Z  s    
 r7   c             C   s   t | } tt| jj dkd | jddj}tttj|d d | j	 }| j|ddtj
f  |jddddf< t |dd	}|S )
z@
    Normalizes a matrix df to a probability matrix prob_df
    r   z%Some data frame entries are negative.r!   )rB   g        z&Some columns in df sum to nearly zero.Nr   )rC   )r   r   allrD   ravelrE   anyr)   rR   r1   rF   rG   )r9   Zsumsr>   r?   r?   r@   r0   u  s    *r0   c             C   sT   t | } | jddj}| j }| j|ddtjf  |jddddf< t |}|S )zN
    Centers each row of a matrix about zero by subtracting out the mean.
    r!   )rB   N)r   meanrD   r1   r)   rF   rG   )r9   meansr=   r?   r?   r@   r/     s    *r/   c             C   s   | j \}}| j }|dkr6d| |jddddf< nt|tjttfrtj|}t	t
||kd |tjddf |jddddf< t|}nLt|tjjjrt|}t	t| j|jkd t	t| j|jkd t|}t|dd}|S )a  
    Creates a background matrix given a background specification. There
    are three possiblities:

    1. background is None => out_df represents a uniform background
    2. background is a vector => this vector is normalized then used as
        the entries of the rows of out_df. Vector must be the same length
        as the number of columns in df
    3. background is a dataframe => it is then normalized and use as out_df.
        In this case, background must have the same rows and cols as df
    Nr!   z-df and background have mismatched dimensions.z,Error: df and bg_mat have different indexes.z,Error: df and bg_mat have different columns.r   )rC   )shaper1   rG   r%   r)   r*   listtuplearrayr   lenrF   r0   r+   coreframer,   r   rS   indexcolumns)r9   r$   num_posZnum_colsrM   r?   r?   r@   rJ     s&    

$
rJ   .-c                s,  t t| tttjtjfd t| } t t| dkd t t	dd | D d t tt
dt  t t|tdt|  t| d  t t	 fd	d
| D d t t|tttjtjfp|dkd |dkrtjt| }n&t t|t| kdt|t| f  t t|tg tjtjfp*|dkdt|  tj }t ||kd||f  tjdd
 | D }tj|j }	|	j  fdd
|	D }
tt }tjd|
|d}xJ|
D ]B}||kjt|ddtjf  }|jddj|jdd|f< qW t|d|||d}|r(|dkr(t|dd}|S )a  
    Generates matrix from a sequence alignment

    parameters
    ----------
    sequences: (list of strings)
        A list of sequences, all of which must be the same length

    counts: (None or list of numbers)
        If not None, must be a list of numbers the same length os sequences,
        containing the (nonnegative) number of times that each sequence was
        observed. If None, defaults to 1.

    to_type: (str)
        The type of matrix to output. Must be 'counts', 'probability',
        'weight', or 'information'

    background: (array, or df)
        Specification of background probabilities. If array, should be the
        same length as df.columns and correspond to the probability of each
        column's character. If df, should be a probability matrix the same
        shape as df.

    characters_to_ignore: (str)
        Characters to ignore within sequences. This is often needed when
        creating matrices from gapped alignments.

    center_weights: (bool)
        Whether to subtract the mean of each row, but only if to_type=='weight'.

    pseudocount: (number >= 0.0)
        Pseudocount to use when converting from counts to probabilities.

    returns
    -------
    out_df: (dataframe)
        A matrix of the requested type.
    z:sequences must be a list, tuple, np.ndarray, or pd.Series.r   zsequences must have length > 0.c             s   s   | ]}t |tV  qd S )N)r%   str).0seqr?   r?   r@   	<genexpr>  s    z&alignment_to_matrix.<locals>.<genexpr>z$sequences must all be of type stringz"type(seq) = %s must be of type strz(type(center_weights) = %s; must be bool.c                s   g | ]}t | kqS r?   )r\   )rd   s)Lr?   r@   
<listcomp>  s    z'alignment_to_matrix.<locals>.<listcomp>z4all elements of sequences must have the same length.Nz?counts must be None or a list, tuple, np.ndarray, or pd.Series.zQcounts must be the same length as sequences;len(counts) = %d; len(sequences) = %dz@type(background) = %s must be None or array-like or a dataframe.zto_type=%s; must be in %sc             S   s   g | ]}t jt|qS r?   )r)   r[   rY   )rd   re   r?   r?   r@   ri   :  s    c                s   g | ]}| kr|qS r?   r?   )rd   c)characters_to_ignorer?   r@   ri   A  s    )datar`   r_   )rB   r   )r"   r#   r<   r$   r   T)r:   )r   r%   rY   rZ   r)   r*   r+   Seriesr\   rS   rc   r'   r&   onesr,   r(   r1   r[   uniquerT   sortrangeastyper.   rF   rE   r   rG   r8   )	sequencesr   r#   r$   rk   center_weightsr<   valid_typesZ
char_arrayZunique_charactersr`   r_   rH   rj   Ztmp_matr=   r?   )rh   rk   r@   alignment_to_matrix  sZ    2


 "rv   c             C   s  t j }|jd tt| tdt|   tt|tdt|  |dkr`tt	| }|j
  nttt	tjf}tt||d |dk	rttj }t||kd||f  tt| }t||kd||f  tt|tdt|  |rt|dkd	 ttd
 }t| }	tt|	}
tjd||
d}|rttj }xt| D ]H\}}t||kd|||f  t| }x|D ]}d|j||f< qpW qBW n>x<t| D ]0\}}t||kd|||f  d|j||f< qW t|dd|d}|r|dkrt|dd}|S )a  
    Generates a matrix from a sequence. With default keyword arguments,
    this is a one-hot-encoded version of the sequence provided. Alternatively,
    is_iupac=True allows users to get matrix models based in IUPAC motifs.

    parameters
    ----------

    seq: (str)
        Sequence from which to construct matrix.

    cols: (str or array-like or None)
        The characters to use for the matrix columns. If None, cols is
        constructed from the unqiue characters in seq. Overriden by alphabet
        and is_iupac.

    alphabet: (str or None)
        The alphabet used to determine the columns of the matrix.
        Options are: 'dna', 'rna', 'protein'. Ignored if None. Overrides cols.

    is_iupac: (bool)
        If True, it is assumed that the sequence represents an IUPAC DNA
        string. In this case, cols is overridden, and alphabet must be None.

    to_type: (str)
        The type of matrix to output. Must be 'probability', 'weight',
        or 'information'

    center_weights: (bool)
        Whether to subtract the mean of each row, but only if to_type='weight'.

    returns
    -------
    seq_df: (dataframe)
        the matrix returned to the user.
    r   z"type(seq) = %s must be of type strz(type(center_weights) = %s; must be bool.Nz<cols = %s must be None or a string, set, list, or np.ndarrayzalphabet = %s; must be in %s.z)invalid to_type=%s; to_type must be in %sz"type(is_iupac) = %s; must be bool.z(must have alphabet=None if is_iupac=Truer	   g        )rl   r`   r_   zLcharacter %s at position %d is not a valid IUPAC character;must be one of %sg      ?z-character %s at position %d is not in cols=%sr   )r<   r"   r#   r   T)r:   )r(   r1   remover   r%   rc   r'   r&   rY   setrp   r)   r*   ALPHABET_DICTkeysr\   rq   r+   r,   
IUPAC_DICT	enumeraterG   r8   )re   colsalphabetis_iupacr#   rt   ru   
cols_typesvalid_alphabetsrh   r_   rH   Ziupac_charactersirj   bsbr=   r?   r?   r@   sequence_to_matrixX  s`    -






r   c             C   s  t | ttjtjfrNydjdd | D } W qz   tddtt	  Y qzX n,yt	| } W n   tddtt	  Y nX tt | t	dt
|   tt |t
g tjtjfdt
|  t|}tt| t|kd |d	krtt| }|j  nRt	tttjf}tt ||d
 ttt|tt| kd tt|t| kd |d	k	r~ttj }t||kd||f  tt| }t| |d}|j }|jtj|d	d	tjf  |jd	d	d	d	f< |S )a$  
    Takes a sequence string and an array of values values and outputs a
    values dataframe. The returned dataframe is a L by C matrix where C is
    the number ofcharacters and L is sequence length.  If matrix is denoted as
    S, i indexes positions and c indexes characters, then S_ic will be non-zero
    (equal to the value in the values array at position p) only if character c
    occurs at position p in sequence. All other elements of S are zero.

    example usage:

    saliency_mat = logomaker.saliency_to_matrix(sequence,values)
    logomaker.Logo(saliency_mat)

    parameters
    ----------

    seq: (str or array-like list of single characters)
        sequence for which values matrix is constructed

    values: (array-like list of numbers)
        array of values values for each character in sequence

    cols: (str or array-like or None)
        The characters to use for the matrix columns. If None, cols is
        constructed from the unqiue characters in seq. Overridden by alphabet
        and is_iupac.

    alphabet: (str or None)
        The alphabet used to determine the columns of the matrix.
        Options are: 'dna', 'rna', 'protein'. Ignored if None. Overrides cols.

    returns
    -------
    saliency_df: (dataframe)
        values matrix in the form of a dataframe

     c             S   s   g | ]}t |qS r?   )rc   )rd   xr?   r?   r@   ri     s    z&saliency_to_matrix.<locals>.<listcomp>Fz could not convert %s to type strz"type(seq) = %s must be of type strz&type(values) = %s must be of type listz,length of seq and values list must be equal.Nz<cols = %s must be None or a string, set, list, or np.ndarrayzFlength of set of unique characters must be equal for "cols " and "seq"z5unique characters for "cols" and "seq" must be equal.zalphabet = %s; must be in %s.)r}   )r%   rY   r)   r*   r+   rm   joinr   reprrc   r'   r\   rx   rp   ry   rz   r   r1   rD   r[   rF   rG   )re   rD   r}   r~   r   r   Zohe_sequenceZsaliency_dfr?   r?   r@   saliency_to_matrix  sH    )



,r   )FFNNNr!   )rA   )N)N)N)N)Nr   Nrb   FrA   )NNFr   F)NN)
__future__r   numpyr)   pandasr+   logomaker.src.error_handlingr   r   logomaker.src.validater   ry   r{   finfor.   tinyrL   r(   r8   r5   r2   r6   r3   r7   r0   r/   rJ   rv   r   r   r?   r?   r?   r@   <module>   sp         R




-          