U
    Ùf~"  ã                   @   sâ   d dl Zd dlZd dlZd dlZzd dlmZ W n  e	k
rP   d dl
mZ Y nX dZd"dd„Zd#dd	„Zd$d
d„Zd%dd„Zd&dd„Zd'dd„Zd(dd„Zd)dd„Zd*dd„Zd+dd„Zd,dd„Zdd„ Zdd„ Zd-d d!„ZdS ).é    N)Úurlretrievez2https://github.com/slundberg/shap/raw/master/data/Féà   c                 C   sD   t d }t t|d||f  ƒ¡ tj¡}t t|d ƒ¡}||fS )aj   This is a set of 50 images representative of ImageNet images.

    This dataset was collected by randomly finding a working ImageNet link and then pasting the
    original ImageNet image into Google image search restricted to images licensed for reuse. A
    similar image (now with rights to reuse) was downloaded as a rough replacment for the original
    ImageNet image. The point is to have a random sample of ImageNet for use as a background
    distribution for explaining models trained on ImageNet data.

    Note that because the images are only rough replacements the labels might no longer be correct.
    Zimagenet50_z	%sx%s.npyz
labels.csv)Úgithub_data_urlÚnpÚloadÚcacheZastypeÚfloat32Úloadtxt)ÚdisplayÚ
resolutionÚprefixÚXÚy© r   ú4/tmp/pip-target-lpfmz8o1/lib/python/shap/datasets.pyÚ
imagenet50   s    "r   c                 C   s&   t j ¡ }tj|j|jd}||jfS )z3 Return the boston housing data in a nice package. ©ÚdataÚcolumns)ÚsklearnÚdatasetsZload_bostonÚpdÚ	DataFramer   Úfeature_namesÚtarget©r
   ÚdÚdfr   r   r   Úboston   s    
r   c              	   C   sH   t ttd ƒƒ}| ¡ }W 5 Q R X tjdtjd}d|dd…< ||fS )zø Return the clssic IMDB sentiment analysis training data in a nice package.

    Full data is at: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    Paper to cite when using the data is: http://www.aclweb.org/anthology/P11-1015
    zimdb_train.txti¨a  ©Údtyper   NiÔ0  )Úopenr   r   Ú	readlinesr   ZonesÚbool)r
   Úfr   r   r   r   r   Úimdb%   s
    r%   c              	   C   s¤   t jttd ƒdd}t t t |jdd…df ¡¡¡d }tj	|j|df tj
d}|j|dd	…f }t t |j¡ d¡dk¡d }|jdd…|f }||fS )
zÚ Predict total number of non-violent crimes per 100K popuation.

    This dataset is from the classic UCI Machine Learning repository:
    https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime+Unnormalized
    z CommViolPredUnnormalizedData.txtú?)Ú	na_valuesNéþÿÿÿr   r   é   iîÿÿÿ)r   Úread_csvr   r   r   ÚwhereÚinvertÚisnanZilocÚarrayÚfloatÚvaluesÚsum)r
   Úraw_dataZ
valid_indsr   r   Z
valid_colsr   r   r   Úcommunitiesandcrime2   s    
þ( r3   c                 C   s&   t j ¡ }tj|j|jd}||jfS )z- Return the diabetes data in a nice package. r   )r   r   Zload_diabetesr   r   r   r   r   r   r   r   r   ÚdiabetesI   s    
r4   c                    sF   t j ¡ ‰ tjˆ jˆ jd}| r8|‡ fdd„ˆ jD ƒfS |ˆ jfS dS )z1 Return the classic iris data in a nice package. r   c                    s   g | ]}ˆ j | ‘qS r   )Ztarget_names©Ú.0Úv©r   r   r   Ú
<listcomp>W   s     ziris.<locals>.<listcomp>N)r   r   Z	load_irisr   r   r   r   r   )r
   r   r   r8   r   ÚirisQ   s
    
r:   c                    s"  ddddddddd	d
dddddg}t jttd ƒdd„ |D ƒdt|ƒd}|jdgdd}ttdd„ |ƒƒ}|d dk|d< dddddd d!œ‰ |D ]J\}}|d"kr’|d#krÌt 	‡ fd$d„|| D ƒ¡||< q’|| j
j||< q’| r|jddd%gdd|d jfS |jdd%gdd|d jfS d&S )'z1 Return the Adult census data in a nice package. )ZAger   )Z	WorkclassÚcategory)Úfnlwgtr   )Ú	Educationr;   )zEducation-Numr   )zMarital Statusr;   )Z
Occupationr;   )ÚRelationshipr;   )ZRacer;   )ÚSexr;   )zCapital Gainr   )zCapital Lossr   )zHours per weekr   )ZCountryr;   )ÚTargetr;   z
adult.datac                 S   s   g | ]}|d  ‘qS )r   r   )r6   r   r   r   r   r9   g   s     zadult.<locals>.<listcomp>r&   )Únamesr'   r    r=   é   )Zaxisc                 S   s   | d dkS )Nr   )r@   r=   r   )Úxr   r   r   Ú<lambda>l   ó    zadult.<locals>.<lambda>r@   z >50Kr   é   é   é   r)   )zNot-in-familyZ	UnmarriedzOther-relativez	Own-childZHusbandZWifer;   r>   c                    s   g | ]}ˆ |  ¡  ‘qS r   )Ústripr5   ©Zrcoder   r   r9   y   s     r<   N)r   r*   r   r   ÚdictZdropÚlistÚfilterr   r.   ÚcatÚcodesr0   )r
   Zdtypesr2   r   Zfilt_dtypesÚkr    r   rJ   r   Úadult\   sP              û
üú"rQ   c                 C   sj   t  ttd ƒ¡}t  ttd ƒ¡d }| rX| ¡ }dd„ |d D ƒ|d< |t |¡fS |t |¡fS dS )zP A nicely packaged version of NHANES I data with surivival times as labels.
    zNHANESI_subset_X.csvzNHANESI_subset_y.csvr   c                 S   s   g | ]}|d krdnd‘qS )rB   ZMaleZFemaler   r5   r   r   r   r9   Š   s     znhanesi.<locals>.<listcomp>r?   N)r   r*   r   r   Úcopyr   r.   ©r
   r   r   Z	X_displayr   r   r   Únhanesiƒ   s    rT   c                 C   sD   t  ttd ƒ¡}t ttd ƒ¡}| r8| ¡ }||fS ||fS dS )zb A nicely packaged version of CRIC data with progression to ESRD within 4 years as the label.
    zCRIC_time_4yearESRD_X.csvzCRIC_time_4yearESRD_y.csvN)r   r*   r   r   r   r	   rR   rS   r   r   r   Úcric   s    rU   c                    sœ  t j ¡ }t j d¡ d}d}t  |¡‰ dˆ ddd…< t  |¡}tdddƒD ]l}d |||d f< ||d |f< d |||d f< ||d |f< d ||d |d f< ||d |d f< qL‡ fd	d
„}t j ||¡}|| d¡ }t  |j	|¡|j
d  }	t j t j |	¡¡j	}
t  ||
j	¡}t j t  t  ||
j	¡j	¡t  |¡ ¡dk sPt‚t  |t j |¡j	¡}|}||ƒt j |¡d  }t j |¡ t |¡|fS )zr Correlated Groups 60
    
    A simulated dataset with tight correlations among distinct groups of features.
    r   éè  é<   rB   é   rG   g®Gáz®ï?rF   c                    s   t  | ˆ ¡S ©N©r   Úmatmul©r   ©Úbetar   r   rD   ³   rE   zcorrgroups60.<locals>.<lambda>gíµ ÷Æ°>ç{®Gáz„?)r   ÚrandomÚseedÚzerosÚeyeÚrangeÚrandnÚmeanr[   ÚTÚshapeZlinalgZcholeskyÚinvZnormZcorrcoefÚAssertionErrorr   r   )r
   Úold_seedÚNÚMÚCÚir$   ÚX_startZ
X_centeredÚSigmaÚWZX_whiteZX_finalr   r   r   r]   r   Úcorrgroups60›   s.    


  *2rs   c                    s   t j ¡ }t j d¡ d}d}t  |¡‰ dˆ ddd…< ‡ fdd„}t j ||¡}|| d¡ }||ƒt j |¡d	  }t j |¡ t |¡|fS )
zT A simulated dataset with tight correlations among distinct groups of features.
    r   rV   rW   rB   rX   rG   c                    s   t  | ˆ ¡S rY   rZ   r\   r]   r   r   rD   ×   rE   z%independentlinear60.<locals>.<lambda>r_   )r   r`   ra   rb   re   rf   r   r   )r
   rk   rl   rm   r$   rp   r   r   r   r]   r   Úindependentlinear60È   s    

rt   c                   C   s   t j ttd ƒ¡S )z2 A sparse dataset in scipy csr matrix format.
    za1a.svmlight)r   r   Úload_svmlight_filer   r   r   r   r   r   Úa1aä   s    rv   c                  C   sh   d} t j t| d ƒ¡\}}t j t| d ƒ¡\}}t t| d ƒ¡}t t| d ƒ¡}||||||fS )z0 Ranking datasets from lightgbm repository.
    zPhttps://raw.githubusercontent.com/Microsoft/LightGBM/master/examples/lambdarank/z
rank.trainz	rank.testzrank.train.queryzrank.test.query)r   r   ru   r   r   r	   )Zrank_data_urlZx_trainZy_trainZx_testZy_testZq_trainZq_testr   r   r   Úrankê   s    rw   c                 C   sh   |d krt j | ¡}t j t j t¡d¡}t j |¡s@t  |¡ t j ||¡}t j |¡sdt	| |ƒ |S )NZcached_data)
ÚosÚpathÚbasenameÚjoinÚdirnameÚ__file__ÚisdirÚmkdirÚisfiler   )ÚurlÚ	file_nameÚdata_dirÚ	file_pathr   r   r   r   õ   s    

r   )Fr   )F)F)F)F)F)F)F)F)F)F)N)Zpandasr   Únumpyr   Zsklearn.datasetsr   rx   Úurllib.requestr   ÚImportErrorÚurllibr   r   r   r%   r3   r4   r:   rQ   rT   rU   rs   rt   rv   rw   r   r   r   r   r   Ú<module>   s,   






'


-
