U
    Ùf8m  ã                   @   s  d dl mZ d dlZd dlZd dlmZmZ d dl	m
Z
 ddddœZd	d
ddddddddddddddœZe e¡jZddddhZed8dd„ƒZd9d!d"„Zd:d#d$„Zd;d%d&„Zd<d'd(„Zd=d)d*„Zd+d,„ Zd-d.„ Zd/d0„ Zed>d2d3„ƒZed?d4d5„ƒZed@d6d7„ƒZdS )Aé    )ÚdivisionN)ÚcheckÚhandle_errors)Úvalidate_matrixZACGTZACGUZACDEFGHIKLMNPQRSTVWY)ÚdnaZrnaZproteinÚAÚCÚGÚTZAGZCTZGCÚATÚGTZACZCGTZAGTZACTZACG)r   r   r	   r
   ÚRÚYÚSÚWÚKÚMÚBÚDÚHÚVÚNÚcountsÚprobabilityÚweightÚinformationFé   c           	      C   s0  t | ƒ} tt|tƒdt|ƒ ƒ tt|tƒdt|ƒ ƒ t|tkpH|dkd|tf ƒ t|tkpf|dkd|tf ƒ tt|tg ƒtjtj	fƒp”|dkdt|ƒ ƒ tt|t
tfƒdt|ƒ ƒ t|dkd	| ƒ |d
krt|dkoê|dkd||f ƒ t| ƒ}n |d
kr8t|dko |dkd||f ƒ t| ƒ}nì||krL|  ¡ }nØt|dk	o^|dk	d||f ƒ t|dkd| ƒ |dkrÄ|dkržt| |ƒ}n$|dkr´t| |ƒ}nds$tdƒ‚n`|dkrÚt| |ƒ}n:|dkrðt| |ƒ}n$|dkrt| |ƒ}ndstdƒ‚t|d||d}t |ƒ}|S )a,  
    Performs transformations on a matrix. There are three types of
    transformations that can be performed:

    1. Center values:
        Subtracts the mean from each row in df. This is common for weight
        matrices or energy matrices. To do this, set center_values=True.

    2. Normalize values:
        Divides each row by the sum of the row. This is needed for probability
        matrices. To do this, set normalize_values=True.

    3. From/To transformations:
        Transforms from one type of matrix (e.g. 'counts') to another type
        of matrix (e.g. 'information'). To do this, set from_type and to_type
        arguments.

    Here are the mathematical formulas invoked by From/To transformations:

        from_type='counts' ->  to_type='probability':
            P_ic = (N_ic + l)/(N_i + C*l), N_i = sum_c(N_ic)

        from_type='probability' -> to_type='weight':
            W_ic = log_2(P_ic / Q_ic)

        from_type='weight' -> to_type='probability':
            P_ic = Q_ic * 2^(W_ic)

        from_type='probability' -> to_type='information':
            I_ic = P_ic * sum_d(P_id * log2(P_id / W_id))

        from_type='information' -> to_type='probability':
            P_ic = I_ic / sum_d(I_id)

        notation:
            i = position
            c, d = character
            l = pseudocount
            C = number of characters
            N_ic = counts matrix element
            P_ic = probability matrix element
            Q_ic = background probability matrix element
            W_ic = weight matrix element
            I_ic = information matrix element

    Using these five 1-step transformations, 2-step transformations
    are also enabled, e.g., from_type='counts' -> to_type='information'.

    parameters
    ----------

    df: (dataframe)
        The matrix to be transformed.

    center_values: (bool)
        Whether to center matrix values, i.e., subtract the mean from each
        row.

    normalize_values: (bool)
        Whether to normalize each row, i.e., divide each row by
        the sum of that row.

    from_type: (str)
        Type of input matrix. Must be one of 'counts', 'probability',
        'weight', or 'information'.

    to_type: (str)
        Type of output matrix. Must be one of 'probability', 'weight', or
        'information'. Can be 'counts' ONLY if from_type is 'counts' too.

    background: (array, or df)
        Specification of background probabilities. If array, should be the
        same length as df.columns and correspond to the probability of each
        column's character. If df, should be a probability matrix the same
        shape as df.

    pseudocount: (number >= 0)
        Pseudocount to use when transforming from a counts matrix to a
        probability matrix.

    returns
    -------
    out_df: (dataframe)
        Transformed matrix
    z-type(center_values) = %s must be of type boolz0type(normalize_values) = %s must be of type boolNz$from_type = %s must be None or in %sz"to_type = %s must be None or in %sú@type(background) = %s must be None or array-like or a dataframe.z'type(pseudocount) = %s must be a numberr   zpseudocount=%s must be >= 0Tz`If center_values is True, both from_type and to_typemust be None. Here, from_type=%s, to_type=%szcIf normalize_values is True, both from_type and to_typemust be None. Here, from_type=%s, to_type=%szoUnless center_values is True or normalize_values is True,Neither from_type (=%s) nor to_type (=%s) can be None.r   zSCan only have to_type='counts' if from_type='counts'. Here, however, from_type='%s'r   r   r   FzTHIS SHOULD NEVER EXECUTE)Ú	from_typeÚto_typeÚ
background)r   r   Ú
isinstanceÚboolÚtypeÚMATRIX_TYPESÚnpÚndarrayÚpdÚ	DataFrameÚintÚfloatÚ_center_matrixÚ_normalize_matrixÚcopyÚ_probability_mat_to_weight_matÚ#_probability_mat_to_information_matÚAssertionErrorÚ_counts_mat_to_probability_matÚ_weight_mat_to_probability_matÚ#_information_mat_to_probability_matÚtransform_matrix)	ÚdfÚcenter_valuesZnormalize_valuesr   r   r    ÚpseudocountÚout_dfÚprob_df© r:   ú;/tmp/pip-target-lpfmz8o1/lib/python/logomaker/src/matrix.pyr4   (   sš    _
ÿÿ
ÿÿÿÿÿÿÿþ
ÿÿ
þÿ
þÿ


þÿ
þ





ýr4   ç      ð?c                 C   sp   t | ƒ} t|dkdƒ |  ¡ }| j| }||jdddd…tjf  |jdd…dd…f< t|ƒ}t |dd}|S )z:
    Converts a counts matrix to a probability matrix
    r   zpseudocount must be >= 0.r   ©ZaxisNr   ©Zmatrix_type)	r   r   r-   ÚvaluesÚsumr%   ÚnewaxisÚlocr,   )Ú	counts_dfr7   r9   Úvalsr:   r:   r;   r1      s    
0r1   c                 C   sX   t | dd} t| |ƒ}|  ¡ }t | t ¡t |t ¡ |jdd…dd…f< t |ƒ}|S )z:
    Converts a probability matrix to a weight matrix
    r   r>   N)r   Ú_get_background_matr-   r%   Úlog2ÚSMALLrB   )r9   r    Úbg_dfÚ	weight_dfr:   r:   r;   r.     s    
.r.   c                 C   sX   t | ƒ} t| |ƒ}|  ¡ }|jt d| j¡ |jdd…dd…f< t|ƒ}t |dd}|S )z:
    Converts a weight matrix to a probability matrix
    é   Nr   r>   )r   rE   r-   r?   r%   ÚpowerrB   r,   )rI   r    rH   r9   r:   r:   r;   r2   +  s    
&r2   c                 C   sŽ   t | dd} t| |ƒ}|  ¡ }| j}|j}|t |t ¡t |t ¡  }|jdd}||dd…tjf  |j	dd…dd…f< t |dd}|S )z@
    Converts a probability matrix to an information matrix
    r   r>   r   r=   Nr   )
r   rE   r-   r?   r%   rF   rG   r@   rA   rB   )r9   r    rH   Úinfo_dfZfg_valsZbg_valsZtmp_valsZinfo_vecr:   r:   r;   r/   B  s    
 (r/   c                 C   sb   t | dd} t| |ƒ}t | jddd¡}|j|dd…f | j|dd…f< t| ƒ}t |dd}|S )zA
    Converts an information matrix to an probability matrix
    r   r>   r   r=   ç        Nr   )r   rE   r%   Úiscloser@   rB   r,   )rL   r    rH   Zzero_indicesr9   r:   r:   r;   r3   Z  s    
 r3   c                 C   sˆ   t | ƒ} tt| j ¡ dkƒdƒ | jddj}ttt |d¡ƒ dƒ |  	¡ }| j|dd…tj
f  |jdd…dd…f< t |dd	}|S )
z@
    Normalizes a matrix df to a probability matrix prob_df
    r   z%Some data frame entries are negative.r   r=   rM   z&Some columns in df sum to nearly zero.Nr   r>   )r   r   Úallr?   Úravelr@   Úanyr%   rN   r-   rA   rB   )r5   Zsumsr9   r:   r:   r;   r,   u  s    ÿÿ*r,   c                 C   sT   t | ƒ} | jddj}|  ¡ }| j|dd…tjf  |jdd…dd…f< t |ƒ}|S )zN
    Centers each row of a matrix about zero by subtracting out the mean.
    r   r=   N)r   Zmeanr?   r-   r%   rA   rB   )r5   Zmeansr8   r:   r:   r;   r+   ‘  s    *r+   c                 C   sî   | j \}}|  ¡ }|dkr6d| |jdd…dd…f< n¨t|tjttfƒr’t |¡}t	t
|ƒ|kdƒ |tjdd…f |jdd…dd…f< t|ƒ}nLt|tjjjƒrÞt|ƒ}t	t| j|jkƒdƒ t	t| j|jkƒdƒ t|ƒ}t|dd}|S )aü  
    Creates a background matrix given a background specification. There
    are three possiblities:

    1. background is None => out_df represents a uniform background
    2. background is a vector => this vector is normalized then used as
        the entries of the rows of out_df. Vector must be the same length
        as the number of columns in df
    3. background is a dataframe => it is then normalized and use as out_df.
        In this case, background must have the same rows and cols as df
    Nr   z-df and background have mismatched dimensions.z,Error: df and bg_mat have different indexes.z,Error: df and bg_mat have different columns.r   r>   )Úshaper-   rB   r!   r%   r&   ÚlistÚtupleÚarrayr   ÚlenrA   r,   r'   ÚcoreÚframer(   r   rO   ÚindexÚcolumns)r5   r    Únum_posZnum_colsrH   r:   r:   r;   rE   ¥  s,    

ÿ$
ÿÿrE   ú.-c                    s(  t t| tttjtjfƒdƒ t| ƒ} t t| ƒdkdƒ t t	dd„ | D ƒƒdƒ t tˆt
ƒdtˆƒ ƒ t t|tƒdt|ƒ ƒ t| d ƒ‰ t t	‡ fd	d
„| D ƒƒdƒ t t|tttjtjfƒpÄ|dkdƒ |dkrât t| ƒ¡}n&t t|ƒt| ƒkdt|ƒt| ƒf ƒ t t|tg ƒtjtjfƒp*|dkdt|ƒ ƒ t ¡ }t ||kd||f ƒ t dd
„ | D ƒ¡}t | ¡ ¡}	|	 ¡  ‡fdd
„|	D ƒ}
ttˆ ƒƒ}tjd|
|d}|
D ]B}||k t¡|dd…tjf  }|jddj|jdd…|f< q²t|d|||d}|r$|dkr$t|dd}|S )a  
    Generates matrix from a sequence alignment

    parameters
    ----------
    sequences: (list of strings)
        A list of sequences, all of which must be the same length

    counts: (None or list of numbers)
        If not None, must be a list of numbers the same length os sequences,
        containing the (nonnegative) number of times that each sequence was
        observed. If None, defaults to 1.

    to_type: (str)
        The type of matrix to output. Must be 'counts', 'probability',
        'weight', or 'information'

    background: (array, or df)
        Specification of background probabilities. If array, should be the
        same length as df.columns and correspond to the probability of each
        column's character. If df, should be a probability matrix the same
        shape as df.

    characters_to_ignore: (str)
        Characters to ignore within sequences. This is often needed when
        creating matrices from gapped alignments.

    center_weights: (bool)
        Whether to subtract the mean of each row, but only if to_type=='weight'.

    pseudocount: (number >= 0.0)
        Pseudocount to use when converting from counts to probabilities.

    returns
    -------
    out_df: (dataframe)
        A matrix of the requested type.
    z:sequences must be a list, tuple, np.ndarray, or pd.Series.r   zsequences must have length > 0.c                 s   s   | ]}t |tƒV  qd S )N)r!   Ústr©Ú.0Úseqr:   r:   r;   Ú	<genexpr>  s     z&alignment_to_matrix.<locals>.<genexpr>z$sequences must all be of type stringú"type(seq) = %s must be of type strú(type(center_weights) = %s; must be bool.c                    s   g | ]}t |ƒˆ k‘qS r:   )rV   )r_   Ús)ÚLr:   r;   Ú
<listcomp>  s     z'alignment_to_matrix.<locals>.<listcomp>z4all elements of sequences must have the same length.Nz?counts must be None or a list, tuple, np.ndarray, or pd.Series.zQcounts must be the same length as sequences;len(counts) = %d; len(sequences) = %dr   zto_type=%s; must be in %sc                 S   s   g | ]}t  t|ƒ¡‘qS r:   )r%   rU   rS   r^   r:   r:   r;   rf   :  s     c                    s   g | ]}|ˆ kr|‘qS r:   r:   )r_   Úc)Úcharacters_to_ignorer:   r;   rf   A  s      ©ÚdatarZ   rY   r=   r   )r   r   r7   r    r   T©r6   )r   r!   rS   rT   r%   r&   r'   ÚSeriesrV   rO   r]   r#   r"   Zonesr(   r$   r-   rU   ÚuniquerP   ÚsortÚrangeZastyper*   rA   r@   r
   rB   r4   )Ú	sequencesr   r   r    rh   Úcenter_weightsr7   Úvalid_typesZ
char_arrayZunique_charactersrZ   rY   rC   rg   Ztmp_matr8   r:   )re   rh   r;   Úalignment_to_matrixÒ  sr    2ÿÿ

ÿ

ÿÿþþÿÿþ
ÿ  ürs   c                 C   sò  t  ¡ }| d¡ tt| tƒdt| ƒ ƒ tt|tƒdt|ƒ ƒ |dkr`tt	| ƒƒ}| 
¡  nttt	tjf}tt||ƒdƒ |dk	r´tt ¡ ƒ}t||kd||f ƒ tt| ƒ}t||kd||f ƒ tt|tƒdt|ƒ ƒ |rt|dkd	ƒ ttd
 ƒ}t| ƒ}	tt|	ƒƒ}
tjd||
d}|rˆtt ¡ ƒ}t| ƒD ]D\}}t||kd|||f ƒ t| }|D ]}d|j||f< qlq@n:t| ƒD ]0\}}t||kd|||f ƒ d|j||f< qt|dd|d}|rî|dkrît|dd}|S )a¬  
    Generates a matrix from a sequence. With default keyword arguments,
    this is a one-hot-encoded version of the sequence provided. Alternatively,
    is_iupac=True allows users to get matrix models based in IUPAC motifs.

    parameters
    ----------

    seq: (str)
        Sequence from which to construct matrix.

    cols: (str or array-like or None)
        The characters to use for the matrix columns. If None, cols is
        constructed from the unqiue characters in seq. Overriden by alphabet
        and is_iupac.

    alphabet: (str or None)
        The alphabet used to determine the columns of the matrix.
        Options are: 'dna', 'rna', 'protein'. Ignored if None. Overrides cols.

    is_iupac: (bool)
        If True, it is assumed that the sequence represents an IUPAC DNA
        string. In this case, cols is overridden, and alphabet must be None.

    to_type: (str)
        The type of matrix to output. Must be 'probability', 'weight',
        or 'information'

    center_weights: (bool)
        Whether to subtract the mean of each row, but only if to_type='weight'.

    returns
    -------
    seq_df: (dataframe)
        the matrix returned to the user.
    r   rb   rc   Nú<cols = %s must be None or a string, set, list, or np.ndarrayúalphabet = %s; must be in %s.z)invalid to_type=%s; to_type must be in %sz"type(is_iupac) = %s; must be bool.z(must have alphabet=None if is_iupac=Truer   rM   ri   zLcharacter %s at position %d is not a valid IUPAC character;must be one of %sr<   z-character %s at position %d is not in cols=%sr   )r7   r   r   r   Trk   )r$   r-   Úremover   r!   r]   r#   r"   rS   Úsetrn   r%   r&   ÚALPHABET_DICTÚkeysrV   ro   r'   r(   Ú
IUPAC_DICTÚ	enumeraterB   r4   )r`   ÚcolsÚalphabetZis_iupacr   rq   rr   Ú
cols_typesÚvalid_alphabetsre   rY   rC   Ziupac_charactersÚirg   ÚbsÚbr8   r:   r:   r;   Úsequence_to_matrixX  sv    -


ÿ

ÿ

ÿ
ÿ
ÿ

ÿþÿÿÿýrƒ   c                 C   sÆ  t | ttjtjfƒrNzd dd„ | D ƒ¡} W qz   tddtt	ƒ ƒ Y qzX n,zt	| ƒ} W n   tddtt	ƒ ƒ Y nX tt | t	ƒdt
| ƒ ƒ tt |t
g ƒtjtjfƒdt
|ƒ ƒ t|ƒ}tt| ƒt|ƒkdƒ |d	krôtt| ƒƒ}| ¡  nRt	tttjf}tt ||ƒd
ƒ ttt|ƒƒtt| ƒƒkdƒ tt|ƒt| ƒkdƒ |d	k	r~tt ¡ ƒ}t||kd||f ƒ tt| ƒ}t| |d}| ¡ }|jt |¡d	d	…tjf  |jd	d	…d	d	…f< |S )a$  
    Takes a sequence string and an array of values values and outputs a
    values dataframe. The returned dataframe is a L by C matrix where C is
    the number ofcharacters and L is sequence length.  If matrix is denoted as
    S, i indexes positions and c indexes characters, then S_ic will be non-zero
    (equal to the value in the values array at position p) only if character c
    occurs at position p in sequence. All other elements of S are zero.

    example usage:

    saliency_mat = logomaker.saliency_to_matrix(sequence,values)
    logomaker.Logo(saliency_mat)

    parameters
    ----------

    seq: (str or array-like list of single characters)
        sequence for which values matrix is constructed

    values: (array-like list of numbers)
        array of values values for each character in sequence

    cols: (str or array-like or None)
        The characters to use for the matrix columns. If None, cols is
        constructed from the unqiue characters in seq. Overridden by alphabet
        and is_iupac.

    alphabet: (str or None)
        The alphabet used to determine the columns of the matrix.
        Options are: 'dna', 'rna', 'protein'. Ignored if None. Overrides cols.

    returns
    -------
    saliency_df: (dataframe)
        values matrix in the form of a dataframe

    Ú c                 S   s   g | ]}t |ƒ‘qS r:   )r]   )r_   Úxr:   r:   r;   rf     s     z&saliency_to_matrix.<locals>.<listcomp>Fz could not convert %s to type strrb   z&type(values) = %s must be of type listz,length of seq and values list must be equal.Nrt   zFlength of set of unique characters must be equal for "cols " and "seq"z5unique characters for "cols" and "seq" must be equal.ru   )r|   )r!   rS   r%   r&   r'   rl   Újoinr   Úreprr]   r#   rV   rw   rn   rx   ry   rƒ   r-   r?   rU   rA   rB   )r`   r?   r|   r}   r~   r   Zohe_sequenceZsaliency_dfr:   r:   r;   Úsaliency_to_matrixé  sX    )

ÿ
ÿÿ

ÿÿÿ

ÿÿrˆ   )FFNNNr   )r<   )N)N)N)N)Nr   Nr\   Fr<   )NNFr   F)NN)Ú
__future__r   Únumpyr%   Zpandasr'   Zlogomaker.src.error_handlingr   r   Zlogomaker.src.validater   rx   rz   Zfinfor*   ZtinyrG   r$   r4   r1   r.   r2   r/   r3   r,   r+   rE   rs   rƒ   rˆ   r:   r:   r:   r;   Ú<module>   s|   ýñ      ú X




-      ú      û 