U
    fcu                     @   s   d dl mZmZmZmZmZmZmZmZm	Z	m
Z
 ddlmZ ddlZddlZddlZddlZddlZddlZddlZddlmZmZmZ ddlmZ ddlmZ dd	lm Z  e!d
Z"dddZ#G dd de Z$dS )   )
convert_to_instanceconvert_to_modelmatch_instance_to_datamatch_model_to_dataconvert_to_instance_with_indexconvert_to_linkIdentityLinkconvert_to_data	DenseData
SparseData    )binomN)LassoLarsICLasso	lars_path)KMeans)tqdm   )	ExplainerZshapTc              
   C   s   dd t | jd D }tt| dr6| j}| j} t|dd| }|rt |D ]X}t | jd D ]D}t	
t	| dd|f |j||f  }| ||f |j||f< qfqTt|j|ddt	|j S )	a'   Summarize a dataset with k mean samples weighted by the number of data points they
    each represent.

    Parameters
    ----------
    X : numpy.array or pandas.DataFrame
        Matrix of data samples to summarize (# samples x # features)

    k : int
        Number of means to use for approximation.

    round_values : bool
        For all i, round the ith dimension of each mean sample to match the nearest value
        from X[:,i]. This ensures discrete features always get a valid value.

    Returns
    -------
    DenseData object.
    c                 S   s   g | ]}t |qS  )str.0ir   r   =/tmp/pip-target-lpfmz8o1/lib/python/shap/explainers/kernel.py
<listcomp>'   s     zkmeans.<locals>.<listcomp>r   'pandas.core.frame.DataFrame'>r   )Z
n_clustersZrandom_stateN      ?)rangeshaper   typeendswithcolumnsvaluesr   fitnpZargminabsZcluster_centers_r
   ZbincountZlabels_)XkZround_valuesgroup_nameskmeansr   jindr   r   r   r*      s    *r*   c                   @   sV   e Zd ZdZe fddZdd Zdd Zdd	 Zd
d Z	dd Z
dd Zdd ZdS )KernelExplainera  Uses the Kernel SHAP method to explain the output of any function.

    Kernel SHAP is a method that uses a special weighted linear regression
    to compute the importance of each feature. The computed importance values
    are Shapley values from game theory and also coefficents from a local linear
    regression.


    Parameters
    ----------
    model : function or iml.Model
        User supplied function that takes a matrix of samples (# samples x # features) and
        computes a the output of the model for those samples. The output can be a vector
        (# samples) or a matrix (# samples x # model outputs).

    data : numpy.array or pandas.DataFrame or shap.common.DenseData or any scipy.sparse matrix
        The background dataset to use for integrating out features. To determine the impact
        of a feature, that feature is set to "missing" and the change in the model output
        is observed. Since most models aren't designed to handle arbitrary missing data at test
        time, we simulate "missing" by replacing the feature with the values it takes in the
        background dataset. So if the background dataset is a simple sample of all zeros, then
        we would approximate a feature being missing by setting it to zero. For small problems
        this background dataset can be the whole training set, but for larger problems consider
        using a single reference value or using the kmeans function to summarize the dataset.
        Note: for sparse case we accept any sparse matrix but convert to lil format for
        performance. 

    link : "identity" or "logit"
        A generalized linear model link to connect the feature importance values to the model
        output. Since the feature importance values, phi, sum up to the model output, it often makes
        sense to connect them to the ouput with a link function where link(outout) = sum(phi).
        If the model output is a probability then the LogitLink link function makes the feature
        importance values have log-odds units.
    c                 K   s  t || _t|| _|dd| _|dd| _t|| jd| _t	| j| j}t
| jtsnt
| jtsntd| jjr~tdt| jjdkrtdtt| jj d	 d
 d  | jjjd | _| jjjd | _t| jj| _d| _d| _t
|tjtjfrt |j!}t"|j#| jj j#d| _$| | j$| _%d| _&t| j$jdkrnd| _&t'| j$g| _$d| _(n| j$jd | _(d S )N
keep_indexFkeep_index_ordered)r.   zJShap explainer only supports the DenseData and SparseData input currently.zMShap explainer does not support transposed DenseData or SparseData currently.d   zUsing z% background data samples could cause zRslower run times. Consider using shap.kmeans(data, K) to summarize the background zas K weighted samples.r   r   T))r   linkr   modelgetr.   r/   r	   datar   
isinstancer
   r   AssertionErrorZ
transposedlenweightslogwarningr   r   NPr%   Z	vectorizeflinkfvnsamplesAddednsamplesRunpd	DataFrameSeriessqueezer#   sumTfnullZexpected_value
vector_outarrayD)selfr2   r4   r1   kwargsZ
model_nullr   r   r   __init__Y   s>    

zKernelExplainer.__init__c                    s  t t dr j n8t t drR| jrL jj} jj}t j} j t t }d}t	j
 rt	j
 s   ||st	j
 std| t jdkst jdkstdt jdkr d jd f}| jrt||||}| j|f|}	|	jtdkrjfd	d
td D }
td D ]}|	dd|f |
|< qJ|
S td }|	|dd< |S nDt jdkrg }tt jd |dddD ]T} ||d ddf }| jrt|||||d  |}|| j|f| q|d jtdkr fdd
td D }
t jd D ]6}td D ]"}|| dd|f |
| |< qfqV|
S t jd d f}t jd D ]}|| ||< q|S dS )a   Estimate the SHAP values for a set of samples.

        Parameters
        ----------
        X : numpy.array or pandas.DataFrame or any scipy.sparse matrix
            A matrix of samples (# samples x # features) on which to explain the model's output.

        nsamples : "auto" or int
            Number of times to re-evaluate the model when explaining each prediction. More samples
            lead to lower variance estimates of the SHAP values. The "auto" setting uses
            `nsamples = 2 * X.shape[1] + 2048`.

        l1_reg : "num_features(int)", "auto" (default for now, but deprecated), "aic", "bic", or float
            The l1 regularization to use for feature selection (the estimation procedure is based on
            a debiased lasso). The auto option currently uses "aic" when less that 20% of the possible sample
            space is enumerated, otherwise it uses no regularization. THE BEHAVIOR OF "auto" WILL CHANGE
            in a future version to be based on num_features instead of AIC.
            The "aic" and "bic" options use the AIC and BIC rules for regularization.
            Using "num_features(int)" selects a fix number of top features. Passing a float directly sets the
            "alpha" parameter of the sklearn.linear_model.Lasso model used for feature selection.

        Returns
        -------
        For models with a single output this returns a matrix of SHAP values
        (# samples x # features). Each row sums to the difference between the model output for that
        sample and the expected value of the model output (which is stored as expected_value
        attribute of the explainer). For models with vector outputs this returns a list
        of such matrices, one for each output.
        zpandas.core.series.Series'>r   z'numpy.ndarray'>zUnknown instance type: r   r   z%Instance must have 1 or 2 dimensions!r   c                    s   g | ]}t  d  qS r   )r%   zerosr   r+   )sr   r   r      s     z/KernelExplainer.shap_values.<locals>.<listcomp>NZsilentF)disablec                    s$   g | ]}t  jd  d  fqS rN   )r%   rO   r   rP   r'   rQ   r   r   r      s     )r   r    r!   r#   r.   indexnamelistr"   spsparseissparseZisspmatrix_liltolilr6   r7   r   reshaper   explainr   r%   rO   r   r3   append)rK   r'   rL   index_value
index_nameZcolumn_nameZx_typeZarr_typer4   ZexplanationZoutsr+   outZexplanationsr   r   rS   r   shap_values   s\     
"$"
$zKernelExplainer.shap_valuesc               	      sx  t |}t|j |j_jjd krVtdd jD _	j	j
d _nrfddjD _	tj	_jj j	rt fddjD rtj	_	j	j
d dkrȈj	 _	jrj| }nj|j}t|tjtjfr|j}|d _js*tjg_jdkrbtjjjf}tjjjf}njdkrtjjjf}tjjjf}jjjj }tjD ]}|| |jd |f< qnr| dd	_!| d
d	_"j"d	krdj d _"d_#jdkrNdj d _#j"j#krNj#_"$  t%t&jd d }	t%t'jd d }
tfddtd|	d D }|d |
  d9  < |t(| }t)*d+| t)*d+|	 t)*d+|
 t)*d+j d}j"}tj,jdd}tj}t--|}td|	d D ]t}t.j|}||
krr|d9 }t)*d+| t)*d+| t)*d+|||d    t)*d+|||d   |  |||d   | dkr|d7 }||8 }||d  dk r|d||d    }||d  t.j| }||
krH|d }t/0||D ]d}d|d d < d|tj|dd< 1|j|| ||
krTt2|d |d d < 1|j|| qTn qƐqNt)3d+| j4}j"j4 }t)*d+| ||	krt--|}|d |
  d  < ||d  }|t(| }t)3d +| t)3d+|
 tj5j6t|d!| |d"}d}i }|dkr|t|k r|7d || }|d7 }|| d }d|tj58jd | < t9|}d#}||krd$}j4||< |d8 }1|j|d nj:||   d7  < |dkr|||
kr|t2|d |d d < |rr|d8 }1|j|d nj:|| d   d7  < q|t(||d  }t)3d%+| j:|d   |j:|d  (  9  < ;  tjjjf}tjjjf}tjD ]:}<j"j# |\}}||j|f< ||j|f< qjsttj=|dd&}tj=|dd&}|S )'Nc                 S   s   g | ]}|qS r   r   r   r   r   r   r      s     z+KernelExplainer.explain.<locals>.<listcomp>r   c                    s   g | ]} j j| qS r   )r4   groupsr   rK   r   r   r      s     c                 3   s&   | ]}t  | t  d  kV  qdS )r   N)r7   r   )rb   r   r   	<genexpr>   s     z*KernelExplainer.explain.<locals>.<genexpr>r   l1_regautonsamplesr   i   i   @   g       @c                    s$   g | ]} j d  | j |   qS )r   )Mr   rc   r   r   r   (  s     zweight_vector = {0}znum_subset_sizes = {0}znum_paired_subset_sizes = {0}zM = {0}Zint64dtypezsubset_size = {0}znsubsets = {0}z0self.nsamples*weight_vector[subset_size-1] = {0}z9self.nsamples*weight_vector[subset_size-1]/nsubsets = {0}gG?r   g        znum_full_subsets = {0}zsamples_left = {0}zremaining_weight_vector = {0}   )pFTzweight_left = {0}Zaxis)>r   r   r4   varying_groupsxZvaryingIndsrb   r%   rI   varyingFeatureGroupsr   ri   r7   allflattenr.   r2   r=   Zconvert_to_dfr5   rA   rB   rC   r#   fxrH   rO   groups_sizerJ   r1   rG   r   r3   re   rg   Zmax_samplesallocateintceilfloorrE   r9   debugformatarangecopyr   	itertoolscombinations	addsampler&   infor?   randomchoicefillZpermutationtuplekernelWeightsrunsolverD   ) rK   Zincoming_instancerL   instanceZ	model_outphiZphi_vardiffdZnum_subset_sizesZnum_paired_subset_sizesZweight_vectorZnum_full_subsetsZnum_samples_leftZ
group_indsmaskZremaining_weight_vectorZsubset_sizeZnsubsetswindsZnfixed_samplesZsamples_leftZind_setZind_set_posZ
used_masksr,   Z
mask_tupleZ
new_sampleZweight_leftZvphiZvphi_varr   )rb   rK   r   r\      s   
"

 
 






(zKernelExplainer.explainc              
      s  t j st| jj}td| jjD ]}| jj| } d|f }t j|rxt	 fdd|D rpd||< q(|
 }tttj|| jjd d |f dd}|dk||< q(t|d }|S g }tt| jj d   d }g }tdt|D ]}|| }	| jjd d |	gf }
|
 d }|jdkr tt|
|   d|	f  dk}|dkr t d|	gf d	 dkrt||
jd k s || q tjt|td
}d||< || }|S d S )Nr   c                 3   s   | ]}|   d  kV  qdS )r   N)nonzerorP   rp   r   r   rd     s     z1KernelExplainer.varying_groups.<locals>.<genexpr>FT)Z	equal_nanr   gHz>)r   r   rj   )rW   rX   rY   r%   rO   r4   ru   r   rb   rr   ZtodenserE   invertiscloser   uniqueZunion1dr7   sizer&   Ztoarrayr   r]   onesbool)rK   rp   Zvaryingr   r   Zx_groupZnum_mismatchesZvarying_indicesZremove_unvarying_indicesZvarying_index	data_rowsZnonzero_rowsr   r   r   r   ro     sB    ,&(
zKernelExplainer.varying_groupsc                 C   s  t j| jjr(| jjj}| jjj}|\}}|| j }||f}|dkrft jj|| jjjd	 | _
n| jjj}| jjj}| jjj}|t|d  }	|d d }
g }td| jd D ]}||
||	   q||| jd |	   t|}t|| j}t|| j}t jj|||f|d	 | _
nt| jj| jdf| _
t| j| jf| _t| j| _t| j| j | jf| _t| j| jf| _t| j| _d| _d| _| jrt| jj| j| _ d S )Nr   rj   r   )r   )!rW   rX   rY   r4   r   nnzrg   Z
csr_matrixrk   rZ   
synth_dataindicesindptrr7   r   r]   r%   ZconcatenateZtilerO   ri   
maskMatrixr   r;   rJ   yeyZlastMaskr?   r@   r.   r^   synth_data_index)rK   r   r   r   Z	data_colsrowsr4   r   r   Zlast_indptr_idxZindptr_wo_lastZnew_indptrsr   Z
new_indptrZnew_dataZnew_indicesr   r   r   rv     s>    






zKernelExplainer.allocatec           
      C   s  | j | j }t| jtfrht| jD ]@}| j| D ]0}|| dkr2|d|f | j||| j |f< q2q$nl|dk}| j| }t|j	dkr|D ]$}	|d|	f | j||| j |	f< qn |d|f | j||| j |f< || j
| j d d f< || j| j < |  j d7  _ d S )Nr   r   r   r   )r?   r;   r5   rq   rV   r   ri   r   r7   r   r   r   )
rK   rp   mr   offsetr+   r(   r   rb   groupr   r   r   r     s    &
$ zKernelExplainer.addsamplec                 C   s  | j | j | j| j  }| j| j| j | j | j d d f }| jr| j| j| j | j | j  }tj|| jj	gd}tj|| jj
d}tj||gdd| jj	}| jr| }| j|}t|tjtjfr|j}t||| jf| j| j| j | j | j d d f< t| j| j D ]r}t| j}td| jD ]2}|| j|| j | d d f | jj|  7 }q4|| j|d d f< |  jd7  _qd S )N)r"   r   rn   r   )r?   r;   r@   r   r.   r   rA   rB   r4   r_   r)   concatZ	set_indexr/   Z
sort_indexr2   r=   r5   rC   r#   r%   r[   rJ   r   r   rO   r8   r   )rK   Z
num_to_runr4   rT   ZmodelOutr   ZeyValr+   r   r   r   r      s&    &40zKernelExplainer.runc              
   C   s^  |  | jd d |f | j| j|  }t| jd}t| j	}t
d| | jdkrhtd | jdks|dk r | jdkr t| j| j	|  | j| f}t
dt| t
dt| j t|}t||| j| j| | j| j|   f}||9 }t|tt| j| jd f }	t| jtr| jd	rt| jtd	d
 }
t|	||
dd }nz| jdks| jdks| jdkr| jdkrdn| j}tt|d|	|j d }n tt!| jd|	|j d }t|dkr&t"| j	t#| j	fS || jd d |d
 f | j| j| | j| j|    }tt| jd d |d d
 f | jd d |d
 f  }t
d|d dd d f  tt|t| j }tj$%t&t||}t&|t&t||}t
dt| t
d| j| j| | j| j|   t
d| j|  t
d| j| j|  t
d| j|  t
d| j| j|  t"| j	}|||d d
 < | j| j| | j| j|  t| ||d
 < t
d| t'| j	D ]"}t(|| dk r(d||< q(|t#t|fS )Nr   zfraction_evaluated = {0}rf   zl1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!)rf   Fr   g?znp.sum(w_aug) = {0}z np.sum(self.kernelWeights) = {0}znum_features(r   )Zmax_iterZbicZaic)	criterionr   )alphazetmp[:4,:] {0}rl   znp.sum(w) = {0}z0self.link(self.fx) - self.link(self.fnull) = {0}zself.fx = {0}zself.link(self.fx) = {0}zself.fnull = {0}zself.link(self.fnull) = {0}z	phi = {0}g|=))r>   r   r1   r=   rG   r%   rE   r   r|   ri   r9   rz   r{   re   warningswarnZhstackr   r   sqrtrt   Z	transposeZvstackr5   r   
startswithrw   r7   r   r   r   r$   Zcoef_r   rO   r   Zlinalginvdotr   r&   )rK   Zfraction_evaluateddimZeyAdjrQ   Znonzero_indsZw_augZ
w_sqrt_augZ	eyAdj_augZmask_augrcZeyAdj2ZetmptmpZtmp2r   r   r   r   r   r   r     sb    *
  
2&$  "< "4zKernelExplainer.solveN)__name__
__module____qualname____doc__r   rM   ra   r\   ro   rv   r   r   r   r   r   r   r   r-   5   s   #+_ =&&r-   )T)%commonr   r   r   r   r   r   r   r	   r
   r   Zscipy.specialr   numpyr%   ZpandasrA   ZscipyrW   loggingr}   r~   r   Zsklearn.linear_modelr   r   r   Zsklearn.clusterr   Z	tqdm.autor   Z	explainerr   	getLoggerr9   r*   r-   r   r   r   r   <module>   s   0

#