B
    ¦	ˆ\€\  ã               @   s˜  d dl Z d dlZd dlmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dlm
Z
 d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ dd„ Zdd„ Ze j ddddg¡dd„ ƒZe j ddddg¡dd„ ƒZe j dddddg¡dd„ ƒZd d!„ Zd"d#„ Z d$d%„ Z!d&d'„ Z"e j dddg¡e j d(de#e$g¡d)d*„ ƒƒZ%e j dddg¡e j d(e$e &d+¡e &d,¡g¡d-d.„ ƒƒZ'd/d0„ Z(e j d1dej)d2d3d g¡d4d5„ ƒZ*e j d(e#d6g¡d7d8„ ƒZ+e j d9d:d;ej)fg¡d<d=„ ƒZ,d>d?„ Z-e j d@ej.ej/g¡dAdB„ ƒZ0e j d1dej)d2d3d g¡dCdD„ ƒZ1e j d(e#d6g¡dEdF„ ƒZ2e j 3dG¡e j 3dH¡dIdJ„ ƒƒZ4dKdL„ Z5e j dMe 6dNdOgdOdPgg¡e 6dNdOgdOdNgg¡dQdRdSœdTfe 6dNdOgdOdPgg¡e 6dNdOgdOdPgg¡dUdRdSœdVfe 6dNdOgdOdPgg¡e 6dNdOgdOdPgg¡dWdUdSœdXfej6dYdZgd[dYgge$d\ej6dYdZgd[dYgge$d\i d]fg¡d^d_„ ƒZ7e j d`ej)ej8fd ej9fdNej9fg¡e j daej6ej:ej.ej;ej<ej=g¡e j dbdQdPe 6d dOg¡fdWdce 6d dOdPg¡fg¡ddde„ ƒƒƒZ>e j dfdgdhdRg¡e j diej)d g¡e j daej6ej:ej.ej;g¡djdk„ ƒƒƒZ?dldm„ Z@e j dnej6dYdZgdZdYgge#d\dYej6dZdZdgdhgdZdZdhdggge#d\fe 6ej)d;gd;ej)gg¡ej)e 6d;d;dgdhgd;d;dhdggg¡fej6ej)dZgdZej)gge#d\ej)ej6dZdZdgdhgdZdZdhdggge#d\fej6ddZgdZdgge#d\dej6dZdZdgdhgdZdZdhdggge#d\fg¡dodp„ ƒZAe j dqeg¡e j drdsej)dtfdug¡dvdw„ ƒƒZBdS )xé    N)Úsparse)Úassert_allclose)Úassert_allclose_dense_sparse)Úassert_array_equal)Úassert_array_almost_equal)Úassert_false)ÚMissingIndicator)ÚSimpleImputer)ÚPipeline)Ú
make_union)ÚGridSearchCV)Útree)Úsparse_random_matrixc       	      C   sæ   d||f }t }| jjdks(|jjdkr,t}t||d}| | ¡ |  ¡ ¡}||j|| 	d¡d |||| 	d¡d t||d}| t
 | ¡¡ | t
 |  ¡ ¡¡}t
 |¡r¸| ¡ }||j|| 	d¡d |||| 	d¡d dS )zâUtility function for testing imputation for a given strategy.

    Test with dense and sparse arrays

    Check that:
        - the statistics (mean, median, mode) are correct
        - the missing values are imputed correctlyz<Parameters: strategy = %s, missing_values = %s, sparse = {0}Úf)ÚstrategyF)Úerr_msgTN)r   ÚdtypeZkindr   r	   ÚfitÚ	transformÚcopyZstatistics_Úformatr   Ú
csc_matrixÚissparseÚtoarray)	ÚXÚX_truer   Z
statisticsÚmissing_valuesr   Z	assert_aeÚimputerÚX_trans© r   ú8lib/python3.7/site-packages/sklearn/tests/test_impute.pyÚ_check_statistics   s$    


r!   c              C   sp   t j dd¡} t j| d d d…< xLdD ]D}t|d}| t | ¡¡}|jdksPt	‚| | ¡}|jdks$t	‚q$W d S )Né
   é   )ÚmeanÚmedianÚmost_frequentÚconstant)r   )r"   r#   )
ÚnpÚrandomÚrandnÚnanr	   Úfit_transformr   Ú
csr_matrixÚshapeÚAssertionError)r   r   r   Z	X_imputedr   r   r    Útest_imputation_shape=   s    


r0   r   Zconstée   c          	   C   sJ   t  d¡}t j|d< tjtt| ƒd t| d}| |¡ W d Q R X d S )N)é   é   )r   r   )Úmatch)r   )	r(   Úonesr+   ÚpytestÚraisesÚ
ValueErrorÚstrr	   r,   )r   r   r   r   r   r    Ú&test_imputation_error_invalid_strategyJ   s
    


r:   r$   r%   r&   c          	   C   sP   t  d¡}t j|d d …df< tjtdd t| dd}| |¡ W d Q R X d S )N)r2   r3   r   ZDeleting)r4   T)r   Úverbose)r(   r5   r+   r6   ZwarnsÚUserWarningr	   r,   )r   r   r   r   r   r    Ú test_imputation_deletion_warningT   s
    
r=   r'   c          	   C   s‚   t  d¡}d|d< t |¡}t| dd}tjtdd | |¡ W d Q R X | | 	¡ ¡ tjtdd | 
|¡ W d Q R X d S )N)r2   r3   r   )r   r   zProvide a dense array)r4   )r(   r5   r   r   r	   r6   r7   r8   r   r   r   )r   r   r   r   r   r    Útest_imputation_error_sparse_0^   s    

r>   c             O   s8   t | dƒr| jnt| ƒ}|dkr&tjS tj| f|ž|ŽS )NÚsizer   )Úhasattrr?   Úlenr(   r+   r%   )ÚarrÚargsÚkwargsÚlengthr   r   r    Úsafe_mediano   s    rF   c             O   s8   t | dƒr| jnt| ƒ}|dkr&tjS tj| f|ž|ŽS )Nr?   r   )r@   r?   rA   r(   r+   r$   )rB   rC   rD   rE   r   r   r    Ú	safe_meanu   s    rG   c           
   C   s‚  t j d¡} d}d}|| || f}t  |d ¡}t  d|d d ¡}|dd d…  |dd d…< dt jdd„ fd	t jd
d„ fg}xú|D ]ð\}}}	t  |¡}
t  |¡}t  |d ¡}xlt|d ƒD ]Z}|| d dk|| d  || d  }t|d ||  || ||   dƒ}|d | | }|d |… }t  	||¡}||  
t|ƒ¡d |…  }|	|||ƒ||< t  |||f¡|
d d …|f< d|kr¾t  |t  	|| || ¡f¡|d d …|f< n(t  ||t  	|| |¡f¡|d d …|f< t j |¡ |
d d …|f ¡ t j |¡ |d d …|f ¡ qÆW |d	krFt  |¡jdd }nt  |¡jdd }|d d …|f }t|
||||ƒ qˆW d S )Nr   r"   é   é   r#   r$   c             S   s   t t | |f¡ƒS )N)rG   r(   Úhstack)ÚzÚvÚpr   r   r    Ú<lambda>ˆ   s    z-test_imputation_mean_median.<locals>.<lambda>r%   c             S   s   t t | |f¡ƒS )N)rF   r(   rJ   )rK   rL   rM   r   r   r    rN   Š   s    )Zaxis)r(   r)   ÚRandomStateÚzerosZaranger+   ÚemptyÚrangeÚmaxÚrepeatZpermutationrA   rJ   ZshuffleZisnanÚanyÚallr!   )ÚrngZdimZdecr.   rP   ÚvaluesZtestsr   Ztest_missing_valuesZtrue_value_funr   r   Ztrue_statisticsÚjZnb_zerosZnb_missing_valuesZ	nb_valuesrK   rM   rL   Zcols_to_keepr   r   r    Útest_imputation_mean_median{   sR    

($

"
rZ   c              C   sÚ   t  dt jt jgdt jt jgddt jgddt jgddt jgddt jgddt jgddt jgg¡ ¡ } t  dddgdddgdddgdddgddd	gddd
gdddgdddgg¡ ¡ }ddddd	d
ddg}t| |d|t jƒ d S )Nr   r3   éûÿÿÿrI   éüÿÿÿéÿÿÿÿr#   g      Àg      @g      @g      Àg      à?r%   )r(   Úarrayr+   Z	transposer!   )r   ZX_imputed_medianZstatistics_medianr   r   r    Ú$test_imputation_median_special_casesÀ   s*    




r_   r   c          	   C   sX   t jdddgdddgddd	gg|d
}tjtdd t| d}| |¡ W d Q R X d S )NÚaÚbr2   rI   Úeé   ÚgÚhé	   )r   znon-numeric data)r4   )r   )r(   r^   r6   r7   r8   r	   r,   )r   r   r   r   r   r   r    Ú.test_imputation_mean_median_error_invalid_typeÝ   s    
rg   ÚUÚSc          	   C   s€   t jt jt jddgt jdt jdgt jddt jgt jdddgg|d}d}tjt|d	  t| d
}| |¡ |¡ W d Q R X d S )Nr`   r   ÚcÚdra   re   )r   z#SimpleImputer does not support data)r4   )r   )	r(   r^   r+   r6   r7   r8   r	   r   r   )r   r   r   r   r   r   r   r    Ú/test_imputation_const_mostf_error_invalid_typesé   s    
rl   c           	   C   sz   t  ddddgddddgddddgddddgg¡} t  dddgdddgdddgdddgg¡}t| |dt jdddgdƒ d S )	Nr]   r   r3   r#   r2   rH   é   r&   )r(   r^   r!   r+   )r   r   r   r   r    Útest_imputation_most_frequentû   s    


rn   ÚmarkerZNANÚ c             C   sŽ   t j| | ddg| d| dg| dd| g| dddggtd}t jdddgdddgdddgdddggtd}t| dd	}| |¡ |¡}t||ƒ d S )
Nr`   r   rj   rk   ra   re   )r   r&   )r   r   )r(   r^   Úobjectr	   r   r   r   )ro   r   r   r   r   r   r   r    Ú%test_imputation_most_frequent_objects  s     



rr   Úcategoryc             C   sr   t  d¡}t d¡}|j|| d}tjdddgdddgdddgd	ddggtd}td
d}| 	|¡}t
||ƒ d S )NÚpandasz,Cat1,Cat2,Cat3,Cat4
,i,x,
a,,y,
a,j,,
b,j,x,)r   r`   ÚiÚxrY   Úyra   r&   )r   )r6   ÚimportorskipÚioÚStringIOÚread_csvr(   r^   rq   r	   r,   r   )r   Úpdr   Údfr   r   r   r   r   r    Ú$test_imputation_most_frequent_pandas*  s    




r~   zX_data, missing_value)rH   r   g      ð?c          	   C   sN   t jd| td}||d< tjtdd t|ddd}| |¡ W d Q R X d S )	N)r2   r3   )r   )r   r   zimputing numerical)r4   r'   rv   )r   r   Ú
fill_value)r(   ZfullÚfloatr6   r7   r8   r	   r,   )ZX_dataÚmissing_valuer   r   r   r   r    Ú+test_imputation_constant_error_invalid_typeD  s    r‚   c           	   C   sŠ   t  ddddgddddgddddgdd	d
dgg¡} t  d
ddd
gdd
dd
gddd
d
gdd	d
d
gg¡}tddd
d}| | ¡}t||ƒ d S )Nr]   r#   r2   rI   r3   rc   rm   é   rf   r   r'   )r   r   r   )r(   r^   r	   r,   r   )r   r   r   r   r   r   r    Ú test_imputation_constant_integerQ  s    






r„   Úarray_constructorc          	   C   s¦   t  t jddt jgdt jdt jgddt jt jgdddt jgg¡}t  ddddgddddgddddgddddgg¡}| |ƒ}| |ƒ}tddd	}| |¡}t||ƒ d S )
Ngš™™™™™ñ?r   g333333ó?gÍÌÌÌÌÌô?gffffffö?g      ø?r]   r'   )r   r   )r(   r^   r+   r	   r,   r   )r…   r   r   r   r   r   r   r    Útest_imputation_constant_floath  s    



r†   c             C   s’   t j| dd| gd| d| gdd| | gddd	| ggtd
}t jddddgddddgddddgddd	dggtd
}t| ddd}| |¡}t||ƒ d S )Nr`   ra   rj   rk   rb   r   rd   re   ru   )r   Zmissingr'   )r   r   r   )r(   r^   rq   r	   r,   r   )ro   r   r   r   r   r   r   r    Útest_imputation_constant_objectƒ  s     






r‡   c             C   sz   t  d¡}t d¡}|j|| d}tjddddgddddgdd	ddgd
d	ddggtd}tdd}| 	|¡}t
||ƒ d S )Nrt   z,Cat1,Cat2,Cat3,Cat4
,i,x,
a,,y,
a,j,,
b,j,x,)r   r   ru   rv   r`   rw   rY   ra   r'   )r   )r6   rx   ry   rz   r{   r(   r^   rq   r	   r,   r   )r   r|   r   r}   r   r   r   r   r   r    Útest_imputation_constant_pandas›  s    






rˆ   z ignore: The default of the `iid`z"ignore: You should specify a valuec              C   st   t dddd} | jd }tdt|dfdtjddfgƒ}d	d
ddgi}t dddd ¡ }t||ƒ}| | |¡ d S )Néd   gš™™™™™¹?)Údensityr   r   )r   r   )Úrandom_stateZimputer__strategyr$   r%   r&   rH   )	r   Údatar
   r	   r   ZDecisionTreeRegressorr   r   r   )r   r   ZpipelineZ
parametersÚYZgsr   r   r    Ú$test_imputation_pipeline_grid_searchµ  s    


rŽ   c              C   st  t ddddd} |  ¡  ¡ }tdddd}| |¡ |¡}d|d	< tt ||k¡ƒ |  ¡ }t|j	d ddd}| |¡ |¡}d|j	d< tt |j	|j	k¡ƒ |  ¡  ¡ }tddd
d}| |¡ |¡}d|d	< t
||ƒ |  ¡  ¡ }t|j	d dd
d}| |¡ |¡}d|j	d< t
|j	|j	ƒ |  ¡ }t|j	d dd
d}| |¡ |¡}d|j	d< tt |j	|j	k¡ƒ d S )Nr3   g      è?r   )rŠ   r‹   r$   T)r   r   r   r]   )r   r   F)r   r   r   r	   r   r   r   r(   rV   rŒ   r   Ztocsc)ZX_origr   r   ZXtr   r   r    Útest_imputation_copyÊ  s:    



r   zX_fit, X_trans, params, msg_errr]   rH   r#   zmissing-onlyÚauto)Úfeaturesr   zBhave missing values in transform but have no missing values in fitr)   z3'features' has to be either 'missing-only' or 'all'rV   z&'sparse' has to be a boolean or 'auto'r`   ra   rj   )r   z1MissingIndicator does not support data with dtypec          	   C   sD   t dd}|jf |Ž tjt|d | | ¡ |¡ W d Q R X d S )Nr]   )r   )r4   )r   Ú
set_paramsr6   r7   r8   r   r   )ÚX_fitr   ZparamsZmsg_errÚ	indicatorr   r   r    Útest_missing_indicator_errorø  s    
r•   zmissing_values, dtypeÚarr_typez,param_features, n_features, features_indicesr2   c             C   sÚ  t  | | dgd| dgg¡}t  | | dgdddgg¡}t  dddgdddgg¡}t  dddgdddgg¡}	||ƒ |¡}||ƒ |¡}| |¡}|	 |¡}	t| |dd}
|
 |¡}|
 |¡}|jd |ksÌt‚|jd |ksÞt‚t|
j	|ƒ t
||d d …|f ƒ t
||	d d …|f ƒ |jtks&t‚|jtks6t‚t|t jƒsHt‚t|t jƒsZt‚|
jd	d
 |
 |¡}|
 |¡}|jtksŠt‚|jtksšt‚|jdksªt‚|jdksºt‚t
| ¡ |ƒ t
| ¡ |ƒ d S )NrH   rI   r#   é   r"   r   F)r   r‘   r   T)r   Úcsc)r(   r^   Úastyper   r,   r   r.   r/   r   Z	features_r   r   ÚboolÚ
isinstanceÚndarrayr’   r   r   )r   r–   r   Zparam_featuresZ
n_featuresZfeatures_indicesr“   r   ZX_fit_expectedZX_trans_expectedr”   Ú
X_fit_maskÚX_trans_maskZX_fit_mask_sparseZX_trans_mask_sparser   r   r    Útest_missing_indicator_new  sB    





rŸ   Úparam_sparseTFr   c             C   sL  t  ||dgd|dgg¡}t  ||dgdddgg¡}| |ƒ t j¡}| |ƒ t j¡}t||d}| |¡}| |¡}|dkrš|jdksŠt‚|jdks˜t‚n®|d	krÌ|d
krÌt	|t j
ƒsºt‚t	|t j
ƒsÊt‚n||dkröt	|t j
ƒsät‚t	|t j
ƒsôt‚nRt |¡r$|jdkst‚|jdksHt‚n$t	|t j
ƒs6t‚t	|t j
ƒsHt‚d S )NrH   rI   r#   r—   r"   )r   r   Tr˜   r   r   F)r(   r^   r™   Úfloat64r   r,   r   r   r/   r›   rœ   r   r   )r–   r   r    r“   r   r”   r   rž   r   r   r    Ú#test_missing_indicator_sparse_paramH  s0    

r¢   c              C   sX   t jdddgdddggtd} tddd}| | ¡}t|t  dddgdddgg¡ƒ d S )	Nr`   ra   rj   )r   rV   )r   r‘   TF)r(   r^   rq   r   r,   r   )r   r”   r   r   r   r    Útest_missing_indicator_stringn  s
    
r£   zX, missing_values, X_trans_expc             C   s0   t t|ddt|dƒ}| | ¡}t||ƒ d S )Nr&   )r   r   )r   )r   r	   r   r,   r   )r   r   ZX_trans_expZtransr   r   r   r    Ú#test_missing_indicator_with_imputerv  s
    

r¤   Úimputer_constructorz.imputer_missing_values, missing_value, err_msgZNaNzInput contains NaN)z-1r]   z(types are expected to be both numerical.c          	   C   sR   t j d¡}| dd¡}||d< | |d}tjt|d | |¡ W d Q R X d S )Né*   r"   )r   r   )r   )r4   )r(   r)   rO   r*   r6   r7   r8   r,   )r¥   Zimputer_missing_valuesr   r   rW   r   r   r   r   r    Ú(test_inconsistent_dtype_X_missing_values  s    
r§   )Cr6   Znumpyr(   Zscipyr   ry   Zsklearn.utils.testingr   r   r   r   r   Zsklearn.imputer   r	   Zsklearn.pipeliner
   r   Zsklearn.model_selectionr   Zsklearnr   Zsklearn.random_projectionr   r!   r0   ZmarkZparametrizer:   r=   r>   rF   rG   rZ   r_   rq   r9   rg   r   rl   rn   r+   rr   r~   r‚   r„   r-   Zasarrayr†   r‡   rˆ   ÚfilterwarningsrŽ   r   r^   r•   r¡   Zint32r   Z
coo_matrixZ
lil_matrixZ
bsr_matrixrŸ   r¢   r£   r¤   r§   r   r   r   r    Ú<module>   s¸   &

E*" "
.(((	
&."" 
