U
    f8m                     @   s  d dl mZ d dlZd dlZd dlmZmZ d dl	m
Z
 ddddZd	d
ddddddddddddddZeejZddddhZed8ddZd9d!d"Zd:d#d$Zd;d%d&Zd<d'd(Zd=d)d*Zd+d, Zd-d. Zd/d0 Zed>d2d3Zed?d4d5Zed@d6d7ZdS )A    )divisionN)checkhandle_errors)validate_matrixZACGTZACGUZACDEFGHIKLMNPQRSTVWY)dnaZrnaZproteinACGTZAGZCTZGCATGTZACZCGTZAGTZACTZACG)r   r   r	   r
   RYSWKMBDHVNcountsprobabilityweightinformationF   c           	      C   s0  t | } tt|tdt|  tt|tdt|  t|tkpH|dkd|tf  t|tkpf|dkd|tf  tt|tg tjtj	fp|dkdt|  tt|t
tfdt|  t|dkd	|  |d
krt|dko|dkd||f  t| }n |d
kr8t|dko |dkd||f  t| }n||krL|  }nt|dk	o^|dk	d||f  t|dkd|  |dkr|dkrt| |}n$|dkrt| |}nds$tdn`|dkrt| |}n:|dkrt| |}n$|dkrt| |}ndstdt|d||d}t |}|S )a,  
    Performs transformations on a matrix. There are three types of
    transformations that can be performed:

    1. Center values:
        Subtracts the mean from each row in df. This is common for weight
        matrices or energy matrices. To do this, set center_values=True.

    2. Normalize values:
        Divides each row by the sum of the row. This is needed for probability
        matrices. To do this, set normalize_values=True.

    3. From/To transformations:
        Transforms from one type of matrix (e.g. 'counts') to another type
        of matrix (e.g. 'information'). To do this, set from_type and to_type
        arguments.

    Here are the mathematical formulas invoked by From/To transformations:

        from_type='counts' ->  to_type='probability':
            P_ic = (N_ic + l)/(N_i + C*l), N_i = sum_c(N_ic)

        from_type='probability' -> to_type='weight':
            W_ic = log_2(P_ic / Q_ic)

        from_type='weight' -> to_type='probability':
            P_ic = Q_ic * 2^(W_ic)

        from_type='probability' -> to_type='information':
            I_ic = P_ic * sum_d(P_id * log2(P_id / W_id))

        from_type='information' -> to_type='probability':
            P_ic = I_ic / sum_d(I_id)

        notation:
            i = position
            c, d = character
            l = pseudocount
            C = number of characters
            N_ic = counts matrix element
            P_ic = probability matrix element
            Q_ic = background probability matrix element
            W_ic = weight matrix element
            I_ic = information matrix element

    Using these five 1-step transformations, 2-step transformations
    are also enabled, e.g., from_type='counts' -> to_type='information'.

    parameters
    ----------

    df: (dataframe)
        The matrix to be transformed.

    center_values: (bool)
        Whether to center matrix values, i.e., subtract the mean from each
        row.

    normalize_values: (bool)
        Whether to normalize each row, i.e., divide each row by
        the sum of that row.

    from_type: (str)
        Type of input matrix. Must be one of 'counts', 'probability',
        'weight', or 'information'.

    to_type: (str)
        Type of output matrix. Must be one of 'probability', 'weight', or
        'information'. Can be 'counts' ONLY if from_type is 'counts' too.

    background: (array, or df)
        Specification of background probabilities. If array, should be the
        same length as df.columns and correspond to the probability of each
        column's character. If df, should be a probability matrix the same
        shape as df.

    pseudocount: (number >= 0)
        Pseudocount to use when transforming from a counts matrix to a
        probability matrix.

    returns
    -------
    out_df: (dataframe)
        Transformed matrix
    z-type(center_values) = %s must be of type boolz0type(normalize_values) = %s must be of type boolNz$from_type = %s must be None or in %sz"to_type = %s must be None or in %s@type(background) = %s must be None or array-like or a dataframe.z'type(pseudocount) = %s must be a numberr   zpseudocount=%s must be >= 0Tz`If center_values is True, both from_type and to_typemust be None. Here, from_type=%s, to_type=%szcIf normalize_values is True, both from_type and to_typemust be None. Here, from_type=%s, to_type=%szoUnless center_values is True or normalize_values is True,Neither from_type (=%s) nor to_type (=%s) can be None.r   zSCan only have to_type='counts' if from_type='counts'. Here, however, from_type='%s'r   r   r   FzTHIS SHOULD NEVER EXECUTE)	from_typeto_type
background)r   r   
isinstancebooltypeMATRIX_TYPESnpndarraypd	DataFrameintfloat_center_matrix_normalize_matrixcopy_probability_mat_to_weight_mat#_probability_mat_to_information_matAssertionError_counts_mat_to_probability_mat_weight_mat_to_probability_mat#_information_mat_to_probability_mattransform_matrix)	dfcenter_valuesZnormalize_valuesr   r   r    pseudocountout_dfprob_df r:   ;/tmp/pip-target-lpfmz8o1/lib/python/logomaker/src/matrix.pyr4   (   s    _














r4         ?c                 C   sp   t | } t|dkd |  }| j| }||jddddtjf  |jddddf< t|}t |dd}|S )z:
    Converts a counts matrix to a probability matrix
    r   zpseudocount must be >= 0.r   ZaxisNr   Zmatrix_type)	r   r   r-   valuessumr%   newaxislocr,   )	counts_dfr7   r9   valsr:   r:   r;   r1      s    
0r1   c                 C   sX   t | dd} t| |}|  }t| t t|t  |jddddf< t |}|S )z:
    Converts a probability matrix to a weight matrix
    r   r>   N)r   _get_background_matr-   r%   log2SMALLrB   )r9   r    bg_df	weight_dfr:   r:   r;   r.     s    
.r.   c                 C   sX   t | } t| |}|  }|jtd| j |jddddf< t|}t |dd}|S )z:
    Converts a weight matrix to a probability matrix
       Nr   r>   )r   rE   r-   r?   r%   powerrB   r,   )rI   r    rH   r9   r:   r:   r;   r2   +  s    
&r2   c                 C   s   t | dd} t| |}|  }| j}|j}|t|t t|t   }|jdd}||ddtjf  |j	ddddf< t |dd}|S )z@
    Converts a probability matrix to an information matrix
    r   r>   r   r=   Nr   )
r   rE   r-   r?   r%   rF   rG   r@   rA   rB   )r9   r    rH   info_dfZfg_valsZbg_valsZtmp_valsZinfo_vecr:   r:   r;   r/   B  s    
 (r/   c                 C   sb   t | dd} t| |}t| jddd}|j|ddf | j|ddf< t| }t |dd}|S )zA
    Converts an information matrix to an probability matrix
    r   r>   r   r=           Nr   )r   rE   r%   iscloser@   rB   r,   )rL   r    rH   Zzero_indicesr9   r:   r:   r;   r3   Z  s    
 r3   c                 C   s   t | } tt| j dkd | jddj}ttt|d d | 	 }| j|ddtj
f  |jddddf< t |dd	}|S )
z@
    Normalizes a matrix df to a probability matrix prob_df
    r   z%Some data frame entries are negative.r   r=   rM   z&Some columns in df sum to nearly zero.Nr   r>   )r   r   allr?   ravelr@   anyr%   rN   r-   rA   rB   )r5   Zsumsr9   r:   r:   r;   r,   u  s    *r,   c                 C   sT   t | } | jddj}|  }| j|ddtjf  |jddddf< t |}|S )zN
    Centers each row of a matrix about zero by subtracting out the mean.
    r   r=   N)r   Zmeanr?   r-   r%   rA   rB   )r5   Zmeansr8   r:   r:   r;   r+     s    *r+   c                 C   s   | j \}}|  }|dkr6d| |jddddf< nt|tjttfrt|}t	t
||kd |tjddf |jddddf< t|}nLt|tjjjrt|}t	t| j|jkd t	t| j|jkd t|}t|dd}|S )a  
    Creates a background matrix given a background specification. There
    are three possiblities:

    1. background is None => out_df represents a uniform background
    2. background is a vector => this vector is normalized then used as
        the entries of the rows of out_df. Vector must be the same length
        as the number of columns in df
    3. background is a dataframe => it is then normalized and use as out_df.
        In this case, background must have the same rows and cols as df
    Nr   z-df and background have mismatched dimensions.z,Error: df and bg_mat have different indexes.z,Error: df and bg_mat have different columns.r   r>   )shaper-   rB   r!   r%   r&   listtuplearrayr   lenrA   r,   r'   coreframer(   r   rO   indexcolumns)r5   r    num_posZnum_colsrH   r:   r:   r;   rE     s,    

$
rE   .-c                    s(  t t| tttjtjfd t| } t t| dkd t t	dd | D d t tt
dt  t t|tdt|  t| d  t t	 fd	d
| D d t t|tttjtjfp|dkd |dkrtt| }n&t t|t| kdt|t| f  t t|tg tjtjfp*|dkdt|  t }t ||kd||f  tdd
 | D }t| }	|	  fdd
|	D }
tt }tjd|
|d}|
D ]B}||kt|ddtjf  }|jddj|jdd|f< qt|d|||d}|r$|dkr$t|dd}|S )a  
    Generates matrix from a sequence alignment

    parameters
    ----------
    sequences: (list of strings)
        A list of sequences, all of which must be the same length

    counts: (None or list of numbers)
        If not None, must be a list of numbers the same length os sequences,
        containing the (nonnegative) number of times that each sequence was
        observed. If None, defaults to 1.

    to_type: (str)
        The type of matrix to output. Must be 'counts', 'probability',
        'weight', or 'information'

    background: (array, or df)
        Specification of background probabilities. If array, should be the
        same length as df.columns and correspond to the probability of each
        column's character. If df, should be a probability matrix the same
        shape as df.

    characters_to_ignore: (str)
        Characters to ignore within sequences. This is often needed when
        creating matrices from gapped alignments.

    center_weights: (bool)
        Whether to subtract the mean of each row, but only if to_type=='weight'.

    pseudocount: (number >= 0.0)
        Pseudocount to use when converting from counts to probabilities.

    returns
    -------
    out_df: (dataframe)
        A matrix of the requested type.
    z:sequences must be a list, tuple, np.ndarray, or pd.Series.r   zsequences must have length > 0.c                 s   s   | ]}t |tV  qd S )N)r!   str.0seqr:   r:   r;   	<genexpr>  s     z&alignment_to_matrix.<locals>.<genexpr>z$sequences must all be of type string"type(seq) = %s must be of type str(type(center_weights) = %s; must be bool.c                    s   g | ]}t | kqS r:   )rV   )r_   s)Lr:   r;   
<listcomp>  s     z'alignment_to_matrix.<locals>.<listcomp>z4all elements of sequences must have the same length.Nz?counts must be None or a list, tuple, np.ndarray, or pd.Series.zQcounts must be the same length as sequences;len(counts) = %d; len(sequences) = %dr   zto_type=%s; must be in %sc                 S   s   g | ]}t t|qS r:   )r%   rU   rS   r^   r:   r:   r;   rf   :  s     c                    s   g | ]}| kr|qS r:   r:   )r_   c)characters_to_ignorer:   r;   rf   A  s      datarZ   rY   r=   r   )r   r   r7   r    r   Tr6   )r   r!   rS   rT   r%   r&   r'   SeriesrV   rO   r]   r#   r"   Zonesr(   r$   r-   rU   uniquerP   sortrangeZastyper*   rA   r@   r
   rB   r4   )	sequencesr   r   r    rh   center_weightsr7   valid_typesZ
char_arrayZunique_charactersrZ   rY   rC   rg   Ztmp_matr8   r:   )re   rh   r;   alignment_to_matrix  sr    2




  rs   c                 C   s  t  }|d tt| tdt|   tt|tdt|  |dkr`tt	| }|
  nttt	tjf}tt||d |dk	rtt }t||kd||f  tt| }t||kd||f  tt|tdt|  |rt|dkd	 ttd
 }t| }	tt|	}
tjd||
d}|rtt }t| D ]D\}}t||kd|||f  t| }|D ]}d|j||f< qlq@n:t| D ]0\}}t||kd|||f  d|j||f< qt|dd|d}|r|dkrt|dd}|S )a  
    Generates a matrix from a sequence. With default keyword arguments,
    this is a one-hot-encoded version of the sequence provided. Alternatively,
    is_iupac=True allows users to get matrix models based in IUPAC motifs.

    parameters
    ----------

    seq: (str)
        Sequence from which to construct matrix.

    cols: (str or array-like or None)
        The characters to use for the matrix columns. If None, cols is
        constructed from the unqiue characters in seq. Overriden by alphabet
        and is_iupac.

    alphabet: (str or None)
        The alphabet used to determine the columns of the matrix.
        Options are: 'dna', 'rna', 'protein'. Ignored if None. Overrides cols.

    is_iupac: (bool)
        If True, it is assumed that the sequence represents an IUPAC DNA
        string. In this case, cols is overridden, and alphabet must be None.

    to_type: (str)
        The type of matrix to output. Must be 'probability', 'weight',
        or 'information'

    center_weights: (bool)
        Whether to subtract the mean of each row, but only if to_type='weight'.

    returns
    -------
    seq_df: (dataframe)
        the matrix returned to the user.
    r   rb   rc   N<cols = %s must be None or a string, set, list, or np.ndarrayalphabet = %s; must be in %s.z)invalid to_type=%s; to_type must be in %sz"type(is_iupac) = %s; must be bool.z(must have alphabet=None if is_iupac=Truer   rM   ri   zLcharacter %s at position %d is not a valid IUPAC character;must be one of %sr<   z-character %s at position %d is not in cols=%sr   )r7   r   r   r   Trk   )r$   r-   remover   r!   r]   r#   r"   rS   setrn   r%   r&   ALPHABET_DICTkeysrV   ro   r'   r(   
IUPAC_DICT	enumeraterB   r4   )r`   colsalphabetZis_iupacr   rq   rr   
cols_typesvalid_alphabetsre   rY   rC   Ziupac_charactersirg   bsbr8   r:   r:   r;   sequence_to_matrixX  sv    -










r   c                 C   s  t | ttjtjfrNzddd | D } W qz   tddtt	  Y qzX n,zt	| } W n   tddtt	  Y nX tt | t	dt
|   tt |t
g tjtjfdt
|  t|}tt| t|kd |d	krtt| }|  nRt	tttjf}tt ||d
 ttt|tt| kd tt|t| kd |d	k	r~tt }t||kd||f  tt| }t| |d}| }|jt|d	d	tjf  |jd	d	d	d	f< |S )a$  
    Takes a sequence string and an array of values values and outputs a
    values dataframe. The returned dataframe is a L by C matrix where C is
    the number ofcharacters and L is sequence length.  If matrix is denoted as
    S, i indexes positions and c indexes characters, then S_ic will be non-zero
    (equal to the value in the values array at position p) only if character c
    occurs at position p in sequence. All other elements of S are zero.

    example usage:

    saliency_mat = logomaker.saliency_to_matrix(sequence,values)
    logomaker.Logo(saliency_mat)

    parameters
    ----------

    seq: (str or array-like list of single characters)
        sequence for which values matrix is constructed

    values: (array-like list of numbers)
        array of values values for each character in sequence

    cols: (str or array-like or None)
        The characters to use for the matrix columns. If None, cols is
        constructed from the unqiue characters in seq. Overridden by alphabet
        and is_iupac.

    alphabet: (str or None)
        The alphabet used to determine the columns of the matrix.
        Options are: 'dna', 'rna', 'protein'. Ignored if None. Overrides cols.

    returns
    -------
    saliency_df: (dataframe)
        values matrix in the form of a dataframe

     c                 S   s   g | ]}t |qS r:   )r]   )r_   xr:   r:   r;   rf     s     z&saliency_to_matrix.<locals>.<listcomp>Fz could not convert %s to type strrb   z&type(values) = %s must be of type listz,length of seq and values list must be equal.Nrt   zFlength of set of unique characters must be equal for "cols " and "seq"z5unique characters for "cols" and "seq" must be equal.ru   )r|   )r!   rS   r%   r&   r'   rl   joinr   reprr]   r#   rV   rw   rn   rx   ry   r   r-   r?   rU   rA   rB   )r`   r?   r|   r}   r~   r   Zohe_sequenceZsaliency_dfr:   r:   r;   saliency_to_matrix  sX    )






r   )FFNNNr   )r<   )N)N)N)N)Nr   Nr\   Fr<   )NNFr   F)NN)
__future__r   numpyr%   Zpandasr'   Zlogomaker.src.error_handlingr   r   Zlogomaker.src.validater   rx   rz   Zfinfor*   ZtinyrG   r$   r4   r1   r.   r2   r/   r3   r,   r+   rE   rs   r   r   r:   r:   r:   r;   <module>   s|          X




-             