U
    fb                     @   sl   d dl Zd dlZd dlZddlmZ zd dlZW n& ek
rD   Y n   ed Y nX G dd deZ	dS )    N   )	Explainerz*xgboost is installed...but failed to load!c                   @   s.   e Zd ZdZdi fddZdd Zdd Zd	S )
MimicExplainera  Fits a mimic model to the original model and then explains predictions using the mimic model.

    Tree SHAP allows for very fast SHAP value explainations of flexible gradient boosted decision
    tree (GBDT) models. Since GBDT models are so flexible we can train them to mimic any black-box
    model and then using Tree SHAP we can explain them. This won't work well for images, but for
    any type of problem that GBDTs do reasonable well on, they should also be able to learn how to
    explain black-box models on the data. This mimic explainer also allows you to use a linear model,
    but keep in mind that will not do as well at explaining typical non-linear black-box models. In
    the future we could include other mimic model types given enough demand/help. Finally, we would
    like to note that this explainer is vaugely inspired by https://arxiv.org/abs/1802.07814 where
    they learn an explainer that can be applied to any input.


    Parameters
    ----------
    model : function or iml.Model
        User supplied function that takes a matrix of samples (# samples x # features) and
        computes a the output of the model for those samples. The output can be a vector
        (# samples) or a matrix (# samples x # model outputs).

    data : numpy.array or pandas.DataFrame or iml.DenseData
        The background dataset to use for integrating out features. To determine the impact
        of a feature, that feature is set to "missing" and the change in the model output
        is observed. Since most models aren't designed to handle arbitrary missing data at test
        time, we simulate "missing" by replacing the feature with the values it takes in the
        background dataset. So if the background dataset is a simple sample of all zeros, then
        we would approximate a feature being missing by setting it to zero. For small problems
        this background datset can be the whole training set, but for larger problems consider
        using a single reference value or using the kmeans function to summarize the dataset.
    xgboostc                 C   s   || _ || _tt| _t|| _tdd| _t	|| jd| _
t| j| j
 | j|j
| _t| j
tsptd| j
jrtdt| j
jdk rtdtt| j
j d d	 d
  |   d S )N
keep_indexF)r   z;Shap explainer only supports the DenseData input currently.z?Shap explainer does not support transposed DenseData currently.d   zUsing only z# training data samples could cause zSthe mimic model poorly to fit the real model. Consider using more training samples zPor if you don't have more samples, using shap.inflate(data, N) to generate more.)mimic_model_typemimic_model_paramsZconvert_to_linklinkZconvert_to_modelmodelkwargsgetr   Zconvert_to_datadataZmatch_model_to_datafZ	model_out
isinstanceZ	DenseDataAssertionErrorZ
transposedlenweightslogwarningstr_train_mimic_model)selfr   r   mimic_modelr	    r   </tmp/pip-target-lpfmz8o1/lib/python/shap/explainers/mimic.py__init__.   s"    

zMimicExplainer.__init__c                 C   s&   | j dkr"t| jttj| _d S )Nr   )r   r   trainr	   DMatrixr   r   )r   r   r   r   r   G   s    
z!MimicExplainer._train_mimic_modelc                    st   d | j dkr:tt|ds*t|}| jj|dd  dk	rpt j	dkrl fddt
 j	d	 D S  S dS )
a   Estimate the SHAP values for a set of samples.

        Parameters
        ----------
        X : numpy.array or pandas.DataFrame
            A matrix of samples (# samples x # features) on which to explain the model's output.

        Returns
        -------
        For a models with a single output this returns a matrix of SHAP values
        (# samples x # features + 1). The last column is the base value of the model, which is
        the expected value of the model applied to the background dataset. This causes each row to
        sum to the model output for that sample. For models with vector outputs this returns a list
        of such matrices, one for each output.
        Nr   zxgboost.core.DMatrix'>T)Zpred_contribs   c                    s"   g | ]} d d |d d f qS )Nr   ).0iphir   r   
<listcomp>d   s     z.MimicExplainer.shap_values.<locals>.<listcomp>r   )r   r   typeendswithr   r   ZtreesZpredictr   shaperange)r   Xr   r   r"   r   shap_valuesK   s    

zMimicExplainer.shap_valuesN)__name__
__module____qualname____doc__r   r   r*   r   r   r   r   r      s   r   )
numpynpmultiprocessingsysZ	explainerr   r   ImportErrorprintr   r   r   r   r   <module>   s   