3
"GÔ\8m  ã               @   s  d dl mZ d dlZd dlZd dlmZmZ d dl	m
Z
 ddddœZd	d
ddddddddddddddœZejeƒjZddddhZed8dd„ƒZd9d!d"„Zd:d#d$„Zd;d%d&„Zd<d'd(„Zd=d)d*„Zd+d,„ Zd-d.„ Zd/d0„ Zed>d2d3„ƒZed?d4d5„ƒZed@d6d7„ƒZdS )Aé    )ÚdivisionN)ÚcheckÚhandle_errors)Úvalidate_matrixÚACGTÚACGUÚACDEFGHIKLMNPQRSTVWY)ÚdnaÚrnaÚproteinÚAÚCÚGÚTZAGZCTÚGCÚATZGTZACZCGTZAGTZACTZACG)r   r   r   r   ÚRÚYÚSÚWÚKÚMÚBÚDÚHÚVÚNÚcountsÚprobabilityÚweightÚinformationFé   c       	      C   s0  t | ƒ} tt|tƒdt|ƒ ƒ tt|tƒdt|ƒ ƒ t|tkpH|dkd|tf ƒ t|tkpf|dkd|tf ƒ tt|tg ƒtjtj	fƒp”|dkdt|ƒ ƒ tt|t
tfƒdt|ƒ ƒ t|dkd	| ƒ |d
krt|dkoê|dkd||f ƒ t| ƒ}n |d
kr8t|dko |dkd||f ƒ t| ƒ}nì||krL| jƒ }nØt|dk	o^|dk	d||f ƒ t|dkd| ƒ |dkrÄ|dkržt| |ƒ}n$|dkr´t| |ƒ}nds$tdƒ‚n`|dkrÚt| |ƒ}n:|dkrðt| |ƒ}n$|dkrt| |ƒ}ndstdƒ‚t|d||d}t |ƒ}|S )a,  
    Performs transformations on a matrix. There are three types of
    transformations that can be performed:

    1. Center values:
        Subtracts the mean from each row in df. This is common for weight
        matrices or energy matrices. To do this, set center_values=True.

    2. Normalize values:
        Divides each row by the sum of the row. This is needed for probability
        matrices. To do this, set normalize_values=True.

    3. From/To transformations:
        Transforms from one type of matrix (e.g. 'counts') to another type
        of matrix (e.g. 'information'). To do this, set from_type and to_type
        arguments.

    Here are the mathematical formulas invoked by From/To transformations:

        from_type='counts' ->  to_type='probability':
            P_ic = (N_ic + l)/(N_i + C*l), N_i = sum_c(N_ic)

        from_type='probability' -> to_type='weight':
            W_ic = log_2(P_ic / Q_ic)

        from_type='weight' -> to_type='probability':
            P_ic = Q_ic * 2^(W_ic)

        from_type='probability' -> to_type='information':
            I_ic = P_ic * sum_d(P_id * log2(P_id / W_id))

        from_type='information' -> to_type='probability':
            P_ic = I_ic / sum_d(I_id)

        notation:
            i = position
            c, d = character
            l = pseudocount
            C = number of characters
            N_ic = counts matrix element
            P_ic = probability matrix element
            Q_ic = background probability matrix element
            W_ic = weight matrix element
            I_ic = information matrix element

    Using these five 1-step transformations, 2-step transformations
    are also enabled, e.g., from_type='counts' -> to_type='information'.

    parameters
    ----------

    df: (dataframe)
        The matrix to be transformed.

    center_values: (bool)
        Whether to center matrix values, i.e., subtract the mean from each
        row.

    normalize_values: (bool)
        Whether to normalize each row, i.e., divide each row by
        the sum of that row.

    from_type: (str)
        Type of input matrix. Must be one of 'counts', 'probability',
        'weight', or 'information'.

    to_type: (str)
        Type of output matrix. Must be one of 'probability', 'weight', or
        'information'. Can be 'counts' ONLY if from_type is 'counts' too.

    background: (array, or df)
        Specification of background probabilities. If array, should be the
        same length as df.columns and correspond to the probability of each
        column's character. If df, should be a probability matrix the same
        shape as df.

    pseudocount: (number >= 0)
        Pseudocount to use when transforming from a counts matrix to a
        probability matrix.

    returns
    -------
    out_df: (dataframe)
        Transformed matrix
    z-type(center_values) = %s must be of type boolz0type(normalize_values) = %s must be of type boolNz$from_type = %s must be None or in %sz"to_type = %s must be None or in %sz@type(background) = %s must be None or array-like or a dataframe.z'type(pseudocount) = %s must be a numberr   zpseudocount=%s must be >= 0Tz`If center_values is True, both from_type and to_typemust be None. Here, from_type=%s, to_type=%szcIf normalize_values is True, both from_type and to_typemust be None. Here, from_type=%s, to_type=%szoUnless center_values is True or normalize_values is True,Neither from_type (=%s) nor to_type (=%s) can be None.r   zSCan only have to_type='counts' if from_type='counts'. Here, however, from_type='%s'r   r   r    FzTHIS SHOULD NEVER EXECUTE)Ú	from_typeÚto_typeÚ
background)r   r   Ú
isinstanceÚboolÚtypeÚMATRIX_TYPESÚnpÚndarrayÚpdÚ	DataFrameÚintÚfloatÚ_center_matrixÚ_normalize_matrixÚcopyÚ_probability_mat_to_weight_matÚ#_probability_mat_to_information_matÚAssertionErrorÚ_counts_mat_to_probability_matÚ_weight_mat_to_probability_matÚ#_information_mat_to_probability_matÚtransform_matrix)	ÚdfÚcenter_valuesÚnormalize_valuesr"   r#   r$   ÚpseudocountÚout_dfÚprob_df© r?   ú../../logomaker/src/matrix.pyr8   (   sr    _














r8   ç      ð?c             C   sp   t | ƒ} t|dkdƒ | jƒ }| j| }||jdddd…tjf  |jdd…dd…f< t|ƒ}t |dd}|S )z:
    Converts a counts matrix to a probability matrix
    r   zpseudocount must be >= 0.r!   )ÚaxisNr   )Úmatrix_type)	r   r   r1   ÚvaluesÚsumr)   ÚnewaxisÚlocr0   )Ú	counts_dfr<   r>   Úvalsr?   r?   r@   r5      s    
0r5   c             C   sX   t | dd} t| |ƒ}| jƒ }tj| t ƒtj|t ƒ |jdd…dd…f< t |ƒ}|S )z:
    Converts a probability matrix to a weight matrix
    r   )rC   N)r   Ú_get_background_matr1   r)   Úlog2ÚSMALLrG   )r>   r$   Úbg_dfÚ	weight_dfr?   r?   r@   r2     s    
.r2   c             C   sX   t | ƒ} t| |ƒ}| jƒ }|jtjd| jƒ |jdd…dd…f< t|ƒ}t |dd}|S )z:
    Converts a weight matrix to a probability matrix
    é   Nr   )rC   )r   rJ   r1   rD   r)   ÚpowerrG   r0   )rN   r$   rM   r>   r?   r?   r@   r6   +  s    
&r6   c             C   sŽ   t | dd} t| |ƒ}| jƒ }| j}|j}|tj|t ƒtj|t ƒ  }|jdd}||dd…tjf  |j	dd…dd…f< t |dd}|S )z@
    Converts a probability matrix to an information matrix
    r   )rC   r!   )rB   Nr    )
r   rJ   r1   rD   r)   rK   rL   rE   rF   rG   )r>   r$   rM   Úinfo_dfZfg_valsZbg_valsZtmp_valsZinfo_vecr?   r?   r@   r3   B  s    
 (r3   c             C   sb   t | dd} t| |ƒ}tj| jdddƒ}|j|dd…f | j|dd…f< t| ƒ}t |dd}|S )zA
    Converts an information matrix to an probability matrix
    r    )rC   r!   )rB   g        Nr   )r   rJ   r)   ÚiscloserE   rG   r0   )rQ   r$   rM   Zzero_indicesr>   r?   r?   r@   r7   Z  s    
 r7   c             C   sˆ   t | ƒ} tt| jjƒ dkƒdƒ | jddj}tttj|dƒƒ dƒ | j	ƒ }| j|dd…tj
f  |jdd…dd…f< t |dd	}|S )
z@
    Normalizes a matrix df to a probability matrix prob_df
    r   z%Some data frame entries are negative.r!   )rB   g        z&Some columns in df sum to nearly zero.Nr   )rC   )r   r   ÚallrD   ÚravelrE   Úanyr)   rR   r1   rF   rG   )r9   Zsumsr>   r?   r?   r@   r0   u  s    *r0   c             C   sT   t | ƒ} | jddj}| jƒ }| j|dd…tjf  |jdd…dd…f< t |ƒ}|S )zN
    Centers each row of a matrix about zero by subtracting out the mean.
    r!   )rB   N)r   ÚmeanrD   r1   r)   rF   rG   )r9   Úmeansr=   r?   r?   r@   r/   ‘  s    *r/   c             C   sî   | j \}}| jƒ }|dkr6d| |jdd…dd…f< n¨t|tjttfƒr’tj|ƒ}t	t
|ƒ|kdƒ |tjdd…f |jdd…dd…f< t|ƒ}nLt|tjjjƒrÞt|ƒ}t	t| j|jkƒdƒ t	t| j|jkƒdƒ t|ƒ}t|dd}|S )aü  
    Creates a background matrix given a background specification. There
    are three possiblities:

    1. background is None => out_df represents a uniform background
    2. background is a vector => this vector is normalized then used as
        the entries of the rows of out_df. Vector must be the same length
        as the number of columns in df
    3. background is a dataframe => it is then normalized and use as out_df.
        In this case, background must have the same rows and cols as df
    Nr!   z-df and background have mismatched dimensions.z,Error: df and bg_mat have different indexes.z,Error: df and bg_mat have different columns.r   )rC   )Úshaper1   rG   r%   r)   r*   ÚlistÚtupleÚarrayr   ÚlenrF   r0   r+   ÚcoreÚframer,   r   rS   ÚindexÚcolumns)r9   r$   Únum_posZnum_colsrM   r?   r?   r@   rJ   ¥  s&    

$
rJ   ú.-c                s,  t t| tttjtjfƒdƒ t| ƒ} t t| ƒdkdƒ t t	dd„ | D ƒƒdƒ t tˆt
ƒdtˆƒ ƒ t t|tƒdt|ƒ ƒ t| d ƒ‰ t t	‡ fd	d
„| D ƒƒdƒ t t|tttjtjfƒpÄ|dkdƒ |dkrâtjt| ƒƒ}n&t t|ƒt| ƒkdt|ƒt| ƒf ƒ t t|tg ƒtjtjfƒp*|dkdt|ƒ ƒ tjƒ }t ||kd||f ƒ tjdd
„ | D ƒƒ}tj|jƒ ƒ}	|	jƒ  ‡fdd
„|	D ƒ}
ttˆ ƒƒ}tjd|
|d}xJ|
D ]B}||kjtƒ|dd…tjf  }|jddj|jdd…|f< q´W t|d|||d}|r(|dkr(t|dd}|S )a  
    Generates matrix from a sequence alignment

    parameters
    ----------
    sequences: (list of strings)
        A list of sequences, all of which must be the same length

    counts: (None or list of numbers)
        If not None, must be a list of numbers the same length os sequences,
        containing the (nonnegative) number of times that each sequence was
        observed. If None, defaults to 1.

    to_type: (str)
        The type of matrix to output. Must be 'counts', 'probability',
        'weight', or 'information'

    background: (array, or df)
        Specification of background probabilities. If array, should be the
        same length as df.columns and correspond to the probability of each
        column's character. If df, should be a probability matrix the same
        shape as df.

    characters_to_ignore: (str)
        Characters to ignore within sequences. This is often needed when
        creating matrices from gapped alignments.

    center_weights: (bool)
        Whether to subtract the mean of each row, but only if to_type=='weight'.

    pseudocount: (number >= 0.0)
        Pseudocount to use when converting from counts to probabilities.

    returns
    -------
    out_df: (dataframe)
        A matrix of the requested type.
    z:sequences must be a list, tuple, np.ndarray, or pd.Series.r   zsequences must have length > 0.c             s   s   | ]}t |tƒV  qd S )N)r%   Ústr)Ú.0Úseqr?   r?   r@   ú	<genexpr>  s    z&alignment_to_matrix.<locals>.<genexpr>z$sequences must all be of type stringz"type(seq) = %s must be of type strz(type(center_weights) = %s; must be bool.c                s   g | ]}t |ƒˆ k‘qS r?   )r\   )rd   Ús)ÚLr?   r@   ú
<listcomp>  s    z'alignment_to_matrix.<locals>.<listcomp>z4all elements of sequences must have the same length.Nz?counts must be None or a list, tuple, np.ndarray, or pd.Series.zQcounts must be the same length as sequences;len(counts) = %d; len(sequences) = %dz@type(background) = %s must be None or array-like or a dataframe.zto_type=%s; must be in %sc             S   s   g | ]}t jt|ƒƒ‘qS r?   )r)   r[   rY   )rd   re   r?   r?   r@   ri   :  s    c                s   g | ]}|ˆ kr|‘qS r?   r?   )rd   Úc)Úcharacters_to_ignorer?   r@   ri   A  s    )Údatar`   r_   )rB   r   )r"   r#   r<   r$   r   T)r:   )r   r%   rY   rZ   r)   r*   r+   ÚSeriesr\   rS   rc   r'   r&   Úonesr,   r(   r1   r[   ÚuniquerT   ÚsortÚrangeÚastyper.   rF   rE   r   rG   r8   )Ú	sequencesr   r#   r$   rk   Úcenter_weightsr<   Úvalid_typesZ
char_arrayZunique_charactersr`   r_   rH   rj   Ztmp_matr=   r?   )rh   rk   r@   Úalignment_to_matrixÒ  sZ    2


 "rv   c             C   sþ  t jƒ }|jdƒ tt| tƒdt| ƒ ƒ tt|tƒdt|ƒ ƒ |dkr`tt	| ƒƒ}|j
ƒ  nttt	tjf}tt||ƒdƒ |dk	r´ttjƒ ƒ}t||kd||f ƒ tt| ƒ}t||kd||f ƒ tt|tƒdt|ƒ ƒ |rt|dkd	ƒ ttd
 ƒ}t| ƒ}	tt|	ƒƒ}
tjd||
d}|rttjƒ ƒ}x”t| ƒD ]H\}}t||kd|||f ƒ t| }x|D ]}d|j||f< qpW qBW n>x<t| ƒD ]0\}}t||kd|||f ƒ d|j||f< qšW t|dd|d}|rú|dkrút|dd}|S )a¬  
    Generates a matrix from a sequence. With default keyword arguments,
    this is a one-hot-encoded version of the sequence provided. Alternatively,
    is_iupac=True allows users to get matrix models based in IUPAC motifs.

    parameters
    ----------

    seq: (str)
        Sequence from which to construct matrix.

    cols: (str or array-like or None)
        The characters to use for the matrix columns. If None, cols is
        constructed from the unqiue characters in seq. Overriden by alphabet
        and is_iupac.

    alphabet: (str or None)
        The alphabet used to determine the columns of the matrix.
        Options are: 'dna', 'rna', 'protein'. Ignored if None. Overrides cols.

    is_iupac: (bool)
        If True, it is assumed that the sequence represents an IUPAC DNA
        string. In this case, cols is overridden, and alphabet must be None.

    to_type: (str)
        The type of matrix to output. Must be 'probability', 'weight',
        or 'information'

    center_weights: (bool)
        Whether to subtract the mean of each row, but only if to_type='weight'.

    returns
    -------
    seq_df: (dataframe)
        the matrix returned to the user.
    r   z"type(seq) = %s must be of type strz(type(center_weights) = %s; must be bool.Nz<cols = %s must be None or a string, set, list, or np.ndarrayzalphabet = %s; must be in %s.z)invalid to_type=%s; to_type must be in %sz"type(is_iupac) = %s; must be bool.z(must have alphabet=None if is_iupac=Truer	   g        )rl   r`   r_   zLcharacter %s at position %d is not a valid IUPAC character;must be one of %sg      ð?z-character %s at position %d is not in cols=%sr   )r<   r"   r#   r   T)r:   )r(   r1   Úremover   r%   rc   r'   r&   rY   Úsetrp   r)   r*   ÚALPHABET_DICTÚkeysr\   rq   r+   r,   Ú
IUPAC_DICTÚ	enumeraterG   r8   )re   ÚcolsÚalphabetÚis_iupacr#   rt   ru   Ú
cols_typesÚvalid_alphabetsrh   r_   rH   Ziupac_charactersÚirj   ÚbsÚbr=   r?   r?   r@   Úsequence_to_matrixX  s`    -






r…   c             C   sÆ  t | ttjtjfƒrNydjdd„ | D ƒƒ} W qz   tddtt	ƒ ƒ Y qzX n,yt	| ƒ} W n   tddtt	ƒ ƒ Y nX tt | t	ƒdt
| ƒ ƒ tt |t
g ƒtjtjfƒdt
|ƒ ƒ t|ƒ}tt| ƒt|ƒkdƒ |d	krôtt| ƒƒ}|jƒ  nRt	tttjf}tt ||ƒd
ƒ ttt|ƒƒtt| ƒƒkdƒ tt|ƒt| ƒkdƒ |d	k	r~ttjƒ ƒ}t||kd||f ƒ tt| ƒ}t| |d}|jƒ }|jtj|ƒd	d	…tjf  |jd	d	…d	d	…f< |S )a$  
    Takes a sequence string and an array of values values and outputs a
    values dataframe. The returned dataframe is a L by C matrix where C is
    the number ofcharacters and L is sequence length.  If matrix is denoted as
    S, i indexes positions and c indexes characters, then S_ic will be non-zero
    (equal to the value in the values array at position p) only if character c
    occurs at position p in sequence. All other elements of S are zero.

    example usage:

    saliency_mat = logomaker.saliency_to_matrix(sequence,values)
    logomaker.Logo(saliency_mat)

    parameters
    ----------

    seq: (str or array-like list of single characters)
        sequence for which values matrix is constructed

    values: (array-like list of numbers)
        array of values values for each character in sequence

    cols: (str or array-like or None)
        The characters to use for the matrix columns. If None, cols is
        constructed from the unqiue characters in seq. Overridden by alphabet
        and is_iupac.

    alphabet: (str or None)
        The alphabet used to determine the columns of the matrix.
        Options are: 'dna', 'rna', 'protein'. Ignored if None. Overrides cols.

    returns
    -------
    saliency_df: (dataframe)
        values matrix in the form of a dataframe

    Ú c             S   s   g | ]}t |ƒ‘qS r?   )rc   )rd   Úxr?   r?   r@   ri     s    z&saliency_to_matrix.<locals>.<listcomp>Fz could not convert %s to type strz"type(seq) = %s must be of type strz&type(values) = %s must be of type listz,length of seq and values list must be equal.Nz<cols = %s must be None or a string, set, list, or np.ndarrayzFlength of set of unique characters must be equal for "cols " and "seq"z5unique characters for "cols" and "seq" must be equal.zalphabet = %s; must be in %s.)r}   )r%   rY   r)   r*   r+   rm   Újoinr   Úreprrc   r'   r\   rx   rp   ry   rz   r…   r1   rD   r[   rF   rG   )re   rD   r}   r~   r€   r   Zohe_sequenceZsaliency_dfr?   r?   r@   Úsaliency_to_matrixé  sH    )



,rŠ   )FFNNNr!   )rA   )N)N)N)N)Nr   Nrb   FrA   )NNFr   F)NN)Ú
__future__r   Únumpyr)   Úpandasr+   Úlogomaker.src.error_handlingr   r   Úlogomaker.src.validater   ry   r{   Úfinfor.   ÚtinyrL   r(   r8   r5   r2   r6   r3   r7   r0   r/   rJ   rv   r…   rŠ   r?   r?   r?   r@   Ú<module>   sp         R




-          