U
    fj                  
   @   s  d dl Zd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 ddlmZ ddlmZmZmZ zddlmZ W n. ek
r Z zedd	e W 5 dZ[X Y nX zd dlZW n. ek
r Z zed
de W 5 dZ[X Y nX zd dlZW n0 ek
r Z zedde W 5 dZ[X Y nX zd dlZW n0 ek
rZ Z zedde W 5 dZ[X Y nX d ddddZd dddZG dd deZG dd dZG dd dZdd ZG dd deZG dd dZ dS )    N)LooseVersion   )	Explainer   )assert_importrecord_import_error	DenseData)_cextcextz)C extension was not built during install!xgboostzXGBoost could not be imported!lightgbmzLightGBM could not be imported!catboostzCatBoost could not be imported!   )identitylogisticlogistic_nloglosssquared_loss)Zindependenttree_path_dependentZglobal_path_dependentc                   @   s6   e Zd ZdZdddZdd Zdd
dZdddZdS )TreeExplainera  Uses Tree SHAP algorithms to explain the output of ensemble tree models.

    Tree SHAP is a fast and exact method to estimate SHAP values for tree models and ensembles of trees,
    under several different possible assumptions about feature dependence. It depends on fast C++
    implementations either inside an externel model package or in the local compiled C extention.

    Parameters
    ----------
    model : model object
        The tree based machine learning model that we want to explain. XGBoost, LightGBM, CatBoost,
        and most tree-based scikit-learn models are supported.

    data : numpy.array or pandas.DataFrame
        The background dataset to use for integrating out features. This argument is optional when
        feature_dependence="tree_path_dependent", since in that case we can use the number of training
        samples that went down each tree path as our background dataset (this is recorded in the model object).

    feature_dependence : "tree_path_dependent" (default) or "independent"
        Since SHAP values rely on conditional expectations we need to decide how to handle correlated
        (or otherwise dependent) input features. The default "tree_path_dependent" approach is to just
        follow the trees and use the number of training examples that went down each leaf to represent
        the background distribution. This approach repects feature dependecies along paths in the trees.
        However, for non-linear marginal transforms (like explaining the model loss)  we don't yet
        have fast algorithms that respect the tree path dependence, so instead we offer an "independent"
        approach that breaks the dependencies between features, but allows us to explain non-linear
        transforms of the model's output. Note that the "independent" option requires a background
        dataset and its runtime scales linearly with the size of the background dataset you use. Anywhere
        from 100 to 1000 random background samples are good sizes to use.
    
    model_output : "margin", "probability", or "log_loss"
        What output of the model should be explained. If "margin" then we explain the raw output of the
        trees, which varies by model (for binary classification in XGBoost this is the log odds ratio).
        If "probability" then we explain the output of the model transformed into probability space
        (note that this means the SHAP values now sum to the probability output of the model). If "log_loss"
        then we explain the log base e of the model loss function, so that the SHAP values sum up to the
        log loss of the model for each sample. This is helpful for breaking down model performance by feature.
        Currently the probability and log_loss options are only supported when feature_dependence="independent".
    Nmarginr   c                 C   s  t t|dr|j| _nt|tr0|j| _n|| _| jd krDd n
t| j| _	|| _
|| _d | _t|| j| j	| _|tkstd|dkr|dkstdn|d k	std|dkr| jjd kr| jjd krtdt t|dr|dkrtd	 ttjtd
kstd| j
dkr0| j| _n|d k	r| jj| j|dd| _t| jdrt| jdkr| jd | _n@t| jdr| jjd d df dd | _|  j| jj7  _d S )Npandas.core.frame.DataFrame'>z"Invalid feature_dependence option!r   r   zROnly margin model_output is supported for feature_dependence="tree_path_dependent"zdA background dataset must be provided unless you are using feature_dependence="tree_path_dependent"!zModel does have a known objective or output type! When model_output is not "margin" then we need to know the model's objective or link function.xgboost.sklearn.XGBClassifier'>r   z0.81ztA bug in XGBoost fixed in v0.81 makes XGBClassifier fail to give margin outputs! Please upgrade to XGBoost >= v0.81!loglossoutputr   __len__r   node_sample_weight)strtypeendswithvaluesdata
isinstancer   npisnandata_missingmodel_outputfeature_dependenceexpected_valueTreeEnsemblemodelfeature_dependence_codesAssertionError	objectivetree_output	Exceptionr   r   r   __version__&_TreeExplainer__dynamic_expected_valuepredictmeanhasattrlensumbase_offset)selfr*   r!   r&   r'    r9   ;/tmp/pip-target-lpfmz8o1/lib/python/shap/explainers/tree.py__init__U   s>    




 zTreeExplainer.__init__c                 C   s.   | j j| jt| jjd | | jddS )zP This computes the expected value conditioned on the given label value.
        r   r   )r*   r2   r!   r#   onesshaper&   r3   )r8   yr9   r9   r:   Z__dynamic_expected_value   s    z&TreeExplainer.__dynamic_expected_valueFc           	         s  |dkr | j jdkrdn| j j}| jdkr| j jdkr| jdkrd | j jdkrtd tt|dszt	
|}|dkrd}| j jj||d|d	d
 n| j jdkr|rtd| j jj||dd  jd |jd d krv |jd  jd |jd d  |jd d  nZ| j jdkrv|r8td|dksJtdt|tjkrdt|}| j jj|dd  dk	rt jdkrʇ fddt jd D | _ fddt jd D S  d | _ ddddf S |}tt|dr|j}ntt|dr"|j}d	}t|jdkrLd}|d|jd }|j| j jkrj|| j j}tj|tjd}tt|dstdtt| t|jdkstd|dk s|| j jjd kr| j jjd }| jd kr2|dk	std!|jd t|ks2td"t||jd f | j | j}| jdkr^| j js^td#td$ t|jd |jd d | j j f |st!"| j j#| j j$| j j%| j j&| j j'| j j| j j(| j j)|||| j| j*|| j j+ t,| j t-| d	 nHt!.| j j#| j j$| j j%| j j&| j j'| j j| j j)|| j j+t-| |||  | j j dkr| jd kr^ d% | _|rv ddddf S  dddddf S nd| jd kr fd&dt jd D | _|r؇ fd'dt| j j D S  fd(dt| j j D S dS ))a   Estimate the SHAP values for a set of samples.

        Parameters
        ----------
        X : numpy.array, pandas.DataFrame or catboost.Pool (for catboost)
            A matrix of samples (# samples x # features) on which to explain the model's output.

        y : numpy.array
            An array of label values for each sample. Used when explaining loss functions.

        tree_limit : None (default) or int 
            Limit the number of trees used by the model. By default None means no use the limit of the
            original model, and -1 means no limit.

        approximate : bool
            Run fast, but only roughly approximate the Tree SHAP values. This runs a method
            previously proposed by Saabas which only considers a single feature ordering. Take care
            since this does not have the consistency guarantees of Shapley values and places too
            much weight on lower splits in the tree.

        Returns
        -------
        For models with a single output this returns a matrix of SHAP values
        (# samples x # features). Each row sums to the difference between the model output for that
        sample and the expected value of the model output (which is stored in the expected_value
        attribute of the explainer when it is constant). For models with vector outputs this returns
        a list of such matrices, one for each output.
        Nr   internalr   xgboost.core.DMatrix'>r   TF)ntree_limitZpred_contribsZapprox_contribsZvalidate_featuresr   z6approximate=True is not supported for LightGBM models!)Znum_iterationZpred_contribr   r   z6approximate=True is not supported for CatBoost models!z4tree_limit is not yet supported for CatBoost models!Z
ShapValues)r!   Z	fstr_typer   c                    s   g | ]} d |df qS r   r?   r9   .0iphir9   r:   
<listcomp>   s     z-TreeExplainer.shap_values.<locals>.<listcomp>c                    s"   g | ]} d d |d df qS Nr?   r9   rD   rG   r9   r:   rI      s     rC   pandas.core.series.Series'>r   dtype'numpy.ndarray'>Unknown instance type: r   7Passed input data matrix X must have 1 or 2 dimensions!r   gBoth samples and labels must be provided when explaining the loss (i.e. `explainer.shap_values(X, y)`)!OThe number of labels (%d) does not match the number of samples to explain (%d)!zThe background dataset you provided does not cover all the leaves in the model, so TreeExplainer cannot run with the feature_dependence="tree_path_dependent" option! Try providing a larger background dataset, or using feature_dependence="independent".r
   )r   r?   r   c                    s   g | ]} d d|f qS rC   r9   rD   rG   r9   r:   rI     s     c                    s   g | ]} d dd|f qS r   Nr?   r9   rD   rG   r9   r:   rI     s     c                    s"   g | ]} d d d d|f qS rJ   r9   rD   rG   r9   r:   rI   
  s     )/r*   
tree_limitr'   
model_typer!   r   r   r   r   r   DMatrixoriginal_modelr2   r,   r=   reshaper   ZPoolZget_feature_importancer5   ranger(   r    rM   astyper#   r$   boolr&   get_transformfully_defined_weightingzeros	n_outputsr	   dense_tree_shapchildren_leftchildren_rightchildren_defaultfeatures
thresholdsr   	max_depthr%   r7   r+   output_transform_codesZdense_tree_saabas)	r8   Xr>   rT   ZapproximateZorig_Xflat_output	X_missing	transformr9   rG   r:   shap_values   s    &
   4


(.$                        
zTreeExplainer.shap_valuesc                    sH  | j dkstd| jdks$tdd}|dkrH| jjdkr@dn| jj}| jjdkrtd tt|	d	szt
|}|dkrd
}| jjj||dd t jdkr fddt jd D | _ fddt jd D S  d | _ ddddddf S tt|	dr"|j}ntt|	dr<|j}d}t|jdkrfd}|d|jd
 }|j| jjkr|| jj}tj|tjd}tt|	dstdtt| t|jdkstd|d
k s|| jjjd
 kr | jjjd
 }td t|jd
 |jd d |jd d | jjf t| jj| jj| jj| jj | jj!| jj| jj"| jj#|||| j$| j%|| jj& t'| j t(| d | jjdkr d | _|rΈ d
ddddd
f S  ddddddd
f S nX fddt jd D | _|r* fddt| jjD S  fd dt| jjD S dS )!a   Estimate the SHAP interaction values for a set of samples.

        Parameters
        ----------
        X : numpy.array, pandas.DataFrame or catboost.Pool (for catboost)
            A matrix of samples (# samples x # features) on which to explain the model's output.

        y : numpy.array
            An array of label values for each sample. Used when explaining loss functions (not yet supported).

        tree_limit : None (default) or int 
            Limit the number of trees used by the model. By default None means no use the limit of the
            original model, and -1 means no limit.

        Returns
        -------
        For models with a single output this returns a tensor of SHAP values
        (# samples x # features x # features). The matrix (# features x # features) for each sample sums
        to the difference between the model output for that sample and the expected value of the model output
        (which is stored in the expected_value attribute of the explainer). Each row of this matrix sums to the
        SHAP value for that feature for that sample. The diagonal entries of the matrix represent the
        "main effect" of that feature on the prediction and the symmetric off-diagonal entries represent the
        interaction effects between all pairs of features for that sample. For models with vector outputs
        this returns a list of tensors, one for each output.
        r   zPOnly model_output = "margin" is supported for SHAP interaction values right now!r   zcOnly feature_dependence = "tree_path_dependent" is supported for SHAP interaction values right now!r   Nr?   r   rA   r   T)rB   Zpred_interactions   c                    s   g | ]} d |ddf qS rC   r9   rD   rG   r9   r:   rI   :  s     z9TreeExplainer.shap_interaction_values.<locals>.<listcomp>r   c                    s(   g | ] } d d |d dd df qS rJ   r9   rD   rG   r9   r:   rI   ;  s     )r   r?   r?   rK   r   FrL   rN   rO   r   rP   r
   )r   r?   r?   r   c                    s   g | ]} d dd|f qS rC   r9   rD   rG   r9   r:   rI   e  s     r   c                    s$   g | ]} d dddd|f qS rS   r9   rD   rG   r9   r:   rI   g  s     c                    s(   g | ] } d d d dd d|f qS rJ   r9   rD   rG   r9   r:   rI   i  s     ))r&   r,   r'   r*   rT   rU   r   r   r   r   r   rV   rW   r2   r5   r=   rY   r(   r    rX   rM   rZ   r#   r$   r[   r^   r_   r	   r`   ra   rb   rc   rd   re   r   rf   r!   r%   r7   r+   rg   )r8   rh   r>   rT   rk   ri   rj   r9   rG   r:   shap_interaction_values  s    

(0              	
z%TreeExplainer.shap_interaction_values)Nr   r   )NNF)NN)__name__
__module____qualname____doc__r;   r1   rl   rn   r9   r9   r9   r:   r   -   s   '
+
 r   c                   @   s,   e Zd ZdZd
ddZdd Zddd	ZdS )r)   zt An ensemble of decision trees.

    This object provides a common interface to many different types of models.
    Nc                    s  d| _ d | _d}d| _d | _d | _tj| _ | _| _	d| _
d | _dddddddddddd}dddd	d	d	d
}ttkrtd tkr| _	nttdrtj| _dtj  fddjD | _|jd | _d| _	nttdrPtj| _dtj  fddjD | _|jd | _d| _	n$ttdrtj| _dtj  fddjD | _|jd | _d| _nttdrtj| _dtj  fddjD | _|jd | _d| _nhttdrVtj| _tj dg| _|jd | _d| _nttdrtj| _tjd dg| _|jd | _d| _nttdr tj| _dtj  fddjD | _|jd | _d| _ntttdr^tj| _dtj  fddjD | _|jd | _d| _nttdr@tj| _ttjdrjj| _ndttjd rjj| _nBttjd!rjjd | _nd"s td#ttj  fd$djd d df D | _|jd | _d| _n4ttd%r.tj| _jjd& d&kr|d"s|td'ttjd(rjj| _d	| _nPttjd)rtj jj!d& | _d	| _nd"std#ttj  fd*djd d df D | _|jd | _nFttd+rt"d, | _#d,| _ t$| j#}|j% d| _|j&| _d"}||j'd | _||j'd | _nttd-r,t"d, tj| _d,| _ ( | _#t$| j#}|j% d| _|j&| _d"}||j'd | _||j'd | _t)d.d | _nHttd/rt"d, ( | _#d,| _ t$| j#}|j% d| _|j&| _d"}||j'd | _||j'd | _t)d.d | _nttd0rt"d, ( | _#d,| _ t$| j#}|j% d| _|j&| _d"}t)d.d | _n`ttd1rt"d2 d2| _ | _#| j#* d3 }z fd4d|D | _W n   d | _Y nX |j+d5d6d | _|j+d5d6d | _nttd7rNt"d2 d2| _ j,| _#| j#* d3 }z fd8d|D | _W n   d | _Y nX |jd | _|jd | _jd k
rtd| _d| _n&ttd9rt"d2 d2| _ j,| _#| j#* d3 }z fd:d|D | _W n   d | _Y nX nttd;	rZt"d2 d2| _ j,| _#| j#* d3 }z fd<d|D | _W n   d | _Y nX |jd | _|jd | _jd k
rtd| _d	| _nttd=	rt"d> d>| _ | _#nttd?	rt"d> d>| _ | _#tj| _t-}	|	j% d| _d	| _d| _nttd@
rt"d> d>| _ | _#npttdA
r`tj| _dtj  fdBdjD | _|jd | _d| _nt.dCtt | jd k	rt/dDd | jD }
tt0dEd | jD d&k
stdFt| j}| jd j1jd& | _2tj3||
ftj4dG | _5tj3||
ftj4dG | _6tj3||
ftj4dG | _7tj3||
ftj4dG | _8tj9||
f| jdG| _:tj9||
| jd j1jd& f| jdG| _1tj9||
f| jdG| _;t<|D ]}t| j| j8}| j| j5| j5|d |f< | j| j6| j6|d |f< | j| j7| j7|d |f< | j| j8| j8|d |f< | j| j:| j:|d |f< | j| j1| j1|d |d d f< | j| j;| j;|d |f< t=| j| j;dkrd"| _
q|st>| j:tj? | _:tj@dHd | jD tj4dG| _At/dId | jD | _Bd S )JNr@   Tr   squared_errorZabsolute_errorbinary_crossentropy)ZmseZfriedman_mse
reg:linear
regressionregression_l2ZmaeZginiZentropybinary:logisticbinary_loglossbinaryZ	raw_valuelog_odds)rv   rw   ru   rx   ry   rz   z/sklearn.ensemble.forest.RandomForestRegressor'>      ?c                    s   g | ]}t |j d qS )scalingr!   r%   Treetree_rE   er!   r%   r~   r9   r:   rI     s     z)TreeEnsemble.__init__.<locals>.<listcomp>z-skopt.learning.forest.RandomForestRegressor'>c                    s   g | ]}t |j d qS r}   r   r   r   r9   r:   rI     s     z-sklearn.ensemble.forest.ExtraTreesRegressor'>c                    s   g | ]}t |j d qS r}   r   r   r   r9   r:   rI     s     z+skopt.learning.forest.ExtraTreesRegressor'>c                    s   g | ]}t |j d qS r}   r   r   r   r9   r:   rI     s     z)sklearn.tree.tree.DecisionTreeRegressor'>r!   r%   z*sklearn.tree.tree.DecisionTreeClassifier'>)	normalizer!   r%   probabilityz0sklearn.ensemble.forest.RandomForestClassifier'>c              	      s    g | ]}t |jd  dqS T)r   r~   r!   r%   r   r   r   r9   r:   rI     s     z.sklearn.ensemble.forest.ExtraTreesClassifier'>c              	      s    g | ]}t |jd  dqS r   r   r   r   r9   r:   rI     s     z>sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>z*ensemble.gradient_boosting.MeanEstimator'>z.ensemble.gradient_boosting.QuantileEstimator'>zsklearn.dummy.DummyRegressor'>FzUnsupported init model type: c                    s    g | ]}t |jj d qS r}   r   r   Zlearning_rater   r!   r%   r*   r9   r:   rI     s     z?sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>r   zQGradientBoostingClassifier is only supported for binary classification right now!z-ensemble.gradient_boosting.LogOddsEstimator'>zsklearn.dummy.DummyClassifier'>c                    s    g | ]}t |jj d qS r}   r   r   r   r9   r:   rI     s     zxgboost.core.Booster'>r   r   Zbest_ntree_limitzxgboost.sklearn.XGBRegressor'>zxgboost.sklearn.XGBRanker'>zlightgbm.basic.Booster'>r   	tree_infoc                    s   g | ]}t | d qS r   r   r   r   r9   r:   rI     s     r-   rv   z lightgbm.sklearn.LGBMRegressor'>c                    s   g | ]}t | d qS r   r   r   r   r9   r:   rI   )  s     zlightgbm.sklearn.LGBMRanker'>c                    s   g | ]}t | d qS r   r   r   r   r9   r:   rI   7  s     z!lightgbm.sklearn.LGBMClassifier'>c                    s   g | ]}t | d qS r   r   r   r   r9   r:   rI   B  s     z!catboost.core.CatBoostRegressor'>r   z"catboost.core.CatBoostClassifier'>zcatboost.core.CatBoost'>z:imblearn.ensemble._forest.BalancedRandomForestClassifier'>c              	      s    g | ]}t |jd  dqS r   r   r   r   r9   r:   rI   ^  s     z/Model type not yet supported by TreeExplainer: c                 S   s   g | ]}t |jqS r9   r5   r    rE   tr9   r9   r:   rI   f  s     c                 S   s   g | ]}|j jd  qS )r   )r    r=   r   r9   r9   r:   rI   g  s     z>All trees in the ensemble must have the same output dimension!rL   c                 S   s   g | ]}t |jqS r9   r   r   r9   r9   r:   rI     s     c                 S   s   g | ]
}|j qS r9   )rf   r   r9   r9   r:   rI     s     )CrU   treesr7   r-   r.   r#   float64rM   r!   r%   r]   rT   r   listr   r   r   float32r5   Zestimators_get	criterionr   Zinit_r3   ZquantileZ	constant_r,   r=   ZpriorscipyspecialZlogitZclass_prior_r   rW   XGBTreeModelLoader	get_trees
base_scorename_objZget_boostergetattrZ
dump_modelparamsZbooster_CatBoostTreeModelLoaderr/   maxuniquer    r_   r<   int32ra   rb   rc   rd   r^   re   r   rY   minZ	nextafterinfarray	num_nodesrf   )r8   r*   r!   r%   Zless_than_or_equalZobjective_name_mapZtree_output_name_mapZ
xgb_loaderr   Z	cb_loaderZ	max_nodesZntreesrF   lr9   )r!   r%   r*   r~   r:   r;   r  s   	








&

&








(
& 
zTreeEnsemble.__init__c                 C   s   |dkrd}nv|dkrJ| j dkr&d}q| j dkr6d}qtd| j  d n:|dkr| jd	krbd
}n"| jdkrrd}ntd| j d |S )zE A consistent interface to make predictions from this model.
        r   r   r   r{   r   zLmodel_output = "probability" is not yet supported when model.tree_output = "z"!r   rs   r   rt   r   zFmodel_output = "logloss" is not yet supported when model.objective = ")r.   r/   r-   )r8   r&   rk   r9   r9   r:   r\     s    



zTreeEnsemble.get_transformr   c                 C   s,  |dkr| j dkrdn| j }tt|dr6|j}ntt|drN|j}d}t|jdkrvd}|d|jd }|j| jkr|	| j}t
j|t
jd	}tt|d
stdtt| t|jdkstd|dk s|| jjd kr| jjd }|dkrH|dk	std|jd t|ksHtdt||jd f | |}dsd| jdkrt
|jd | jf}td t| j| j| j| j| j| j| j|| jt| |||| n&| jdkrtd | jj|d|d}|r| jdkr | d S |d| jS n| jdkr$| S |S dS )a2   A consistent interface to make predictions from this model.

        Parameters
        ----------
        tree_limit : None (default) or int 
            Limit the number of trees used by the model. By default None means no use the limit of the
            original model, and -1 means no limit.
        Nr?   rK   r   Fr   Tr   rL   rN   rO   r   rP   r   rQ   rR   r@   r
   r   )Zoutput_marginrT   ) rT   r   r   r   r    r5   r=   rX   rM   rZ   r#   r$   r[   r,   r\   rU   r^   r_   r   r	   Zdense_tree_predictra   rb   rc   rd   re   rf   r7   rg   rW   r2   flatten)r8   rh   r>   r   rT   ri   rj   rk   r9   r9   r:   r2     sd    &
.
          zTreeEnsemble.predict)NN)Nr   N)ro   rp   rq   rr   r;   r\   r2   r9   r9   r9   r:   r)   l  s   
  r)   c                   @   s   e Zd ZdZdddZdS )r   z A single decision tree.

    The primary point of this object is to parse many different tree types into a common format.
    Fr|   Nc           #         s  t d tt|dr|jtj| _|jtj| _| j| _	|j
tj| _|jtj| _|j|jjd |jjd |jjd  | _|r| jj| jd j| _| j | _|jtj| _nt|tkrPd|krP|d tj| _|d tj| _|d tj| _	|d	 tj| _|d
 | _|d  | _|d | _nt|tkrd|kr|d }|d d }tjd| d tjd| _tjd| d tjd| _tjd| d tjd| _	tjd| d tjd| _tjd| d tjd| _dgd| d  | _tjd| d tjd| _g |g }}	|	rr|	d}
d|
 kr|
d |krnd|
d  kr|
d d | j|
d < n|
d d | | j|
d < d|
d  kr|
d d | j|
d < n|
d d | | j|
d < |
d r| j|
d  | j	|
d < n| j|
d  | j	|
d < |
d | j|
d < |
d
 | j|
d < |
d g| j|
d < |
d | j|
d < ||
d  |	|
d  |	|
d  nd| j|
d | < d| j|
d | < d| j	|
d | < d| j|
d | < d| j|
d | < d| j|
d | < d| j	|
d | < d| j|
d | < d| j|
d | < |
d g| j|
d | < |
d | j|
d | < q8t| j| _t| j| _nt|tkrjd|krjfdd|d }tj|tjd | _tj|tjd | _tj|tjd | _	tj|tjd | _tj|tjd| _tj|dftjd| _tj|tjd| _ fdd   ||  nt|tkrBd!d" |d d  d#D }i }|D ]&}| d$d |t!| d$d < qt"| d }dtj|d%d }dtj|d%d }dtj|d%d }dtj|d%d }dtj|d&d }dtj|d&d }tj|d&d}t#| }t#| }t$dt%|D ]}|| }|| }d'|krt&| d(d  d)d }t&| d*d }|||< |||< qtt!| d+d  d)d }t!| d,d  d)d }t!| d-d  d)d }| d.d } d/| krvt!|  d/d dd  }!t&|  d/d d d }"d0| krt!|  d0d dd  }!t&|  d0d d d }"t&| d*d  d)d }|||< |||< |||< |!||< |"||< |||< qt|| _|| _|| _	|| _|| _|d d tj'f  | _|| _nt(d1|d k	r|d k	rd2| jd d < t)*| j| j| j	| j| j| jd| j||
 t)+| j| j| j| j| _,d S )3Nr
   z'sklearn.tree._tree.Tree'>r   r   r   ra   rb   rc   feature	thresholdvaluer   Ztree_structureZ
num_leavesrL   Zsplit_indexZ
left_childZ
leaf_indexZright_childZdefault_leftZsplit_featureZinternal_valueZinternal_countr?   Z
leaf_valueZ
leaf_countnodeidc                    s8   d| kr,t | d f fdd| d D  S | d S d S )Nchildrenr   c                    s   g | ]} |qS r9   r9   )rE   nmax_idr9   r:   rI   4  s     z1Tree.__init__.<locals>.max_id.<locals>.<listcomp>)r   )noder   r9   r:   r   2  s    $zTree.__init__.<locals>.max_idc                    s   | d }| d |j |< d| kr~| d |j|< | d |j|< | d |j|< | d |j|< | d |j|< | d D ]} || qlnd	| kr| d	  |j|< d S )
Nr   Zcoverr   yesnomissingsplitZsplit_conditionleaf)r   ra   rb   rc   rd   re   r    )r   treerF   r   )extract_datar~   r9   r:   r   A  s    z#Tree.__init__.<locals>.extract_datac                 S   s   g | ]}|  qS r9   )lstripr   r9   r9   r:   rI   W  s     z!Tree.__init__.<locals>.<listcomp>
:r   r   r   zleaf=,zcover=zyes=zno=zmissing= <=z"Unknown input to Tree constructor!g        )-r   r   r   r   ra   rZ   r#   r   rb   rc   r   rd   r   r   re   r   rX   r=   r    Tr6   Zweighted_n_node_samplesr   dictemptypopkeysappendZasarraymultiplyr<   r^   r   intr   r   rY   r5   floatZnewaxisr/   r	   Zdense_tree_update_weightsZcompute_expectationsrf   )#r8   r   r   r~   r!   r%   startZnum_parentsvisitedqueueZvertexmZnodesZ
nodes_dictr   ra   rb   rc   rd   re   r    r   Z
values_lstZkeys_lstrF   r   keyvalZnode_sample_weight_valZc_leftZc_rightZ	c_defaultZ
feat_thresr   r   r9   )r   r   r~   r:   r;     s    ,


 $


          zTree.__init__)Fr|   NN)ro   rp   rq   rr   r;   r9   r9   r9   r:   r     s   r   c                 C   s@   | j }d| _ | jddd}|| _ dd |D }dd |D }|S )zd This gets a JSON dump of an XGBoost model while ensuring the features names are their indexes.
    NTjson)Z
with_statsZdump_formatc                 S   s   g | ]}| d dqS )z: inf,z: 1000000000000.0,replacer   r9   r9   r:   rI     s     z$get_xgboost_json.<locals>.<listcomp>c                 S   s   g | ]}| d dqS )z: -inf,z: -1000000000000.0,r   r   r9   r9   r:   rI     s     )Zfeature_namesZget_dump)r*   fnamesZ
json_treesr9   r9   r:   get_xgboost_json  s    r   c                   @   sB   e Zd ZdZdd ZdddZdd Zd	d
 Zdd Zdd Z	dS )r   z This loads an XGBoost model directly from a raw memory dump.

    We can't use the JSON dump because due to numerical precision issues those
    tree can actually be wrong when feature values land almost on a threshold.
    c                 C   s$  |  | _d| _| d| _| d| _| d| _| d| _| d| _| 	dd | d| _
| | j
| _| d| _| | j| _| jdkstd| j | d| _| d| _| d| _| d| _| d| _| d| _| d| _| 	dd	 tj| jtjd
| _tj| jtjd
| _tj| jtjd
| _tj| jtjd
| _tj| jtjd
| _tj| jtjd
| _g | _g | _g | _g | _g | _ g | _!g | _"g | _#g | _$t%| jD ]`}| d| j|< | d| j|< | d| j|< | d| j|< | d| j|< | d| j|< | 	dd | j&tj| j| tjd
 | j&tj| j| tjd
 | j&tj| j| tjd
 | j&tj| j| tj'd
 | j &tj| j| tj(d
 t%| j| D ]j}| d| jd |< | d| jd |< | d| jd |< | d| jd |< | d| j d |< q| j!&tj| j| tj(d
 | j"&tj| j| tj(d
 | j#&tj| j| tj(d
 | j$&tj| j| tj)d
 t%| j| D ]V}| d| j!d |< | d| j"d |< | d| j#d |< | d| j$d |< qĐqd S )Nr   fIrF      QZgbtreez4Only the 'gbtree' model type is supported, not '%s'!    rL      r?   )*Zsave_rawbufposreadr   num_feature	num_classcontain_extra_attrscontain_eval_metricsread_arrname_obj_lenread_strr   name_gbm_lenname_gbmr,   	num_trees	num_roots	pad_32bitnum_pbuffer_deprecatednum_output_groupsize_leaf_vectorr#   r^   r   r   Znum_deletedrf   Znode_parents
node_cleftnode_crightnode_sindex	node_infoZloss_chgsum_hessZbase_weightZleaf_child_cntrY   r   uint32r   r   )r8   Z	xgb_modelrF   jr9   r9   r:   r;     s    
zXGBTreeModelLoader.__init__Nc                 C   s  | j | j f}tj|tjd| _tj|tjd| _tj|tjd| _	tj|d |d dftjd| _
g }t| j D ]b}t| j| D ]}t| j| | tddkr| j| | | j||f< n| j| | | j||f< | j| | tdtd> td @ | j||f< | j| | dkrB| j| | | j	||f< q| j| | | j
||f< qt| j| }|t| j| | j| | j|d |f | j|d |f | j	|d |f | j
|d |f | j| d||d qt|S )NrL   r   r   r   ra   rb   rc   r   r   r   r   r   )r   r   r   r#   r^   r   rc   rd   r   re   r    rY   Zright_shiftr   r   r   r   r   r5   r   r   r   )r8   r!   r%   r=   r   rF   r   r   r9   r9   r:   r   
  s:      6 
	zXGBTreeModelLoader.get_treesc                 C   s>   t |}t || j| j| j|  d }|  j|7  _|S )Nr   structcalcsizeunpackr   r   )r8   rM   sizer   r9   r9   r:   r   *  s    
"zXGBTreeModelLoader.readc                 C   sJ   d||f }t |}t || j| j| j|  d }|  j|7  _|S )Nz%d%sr   r   )r8   rM   Zn_itemsformatr   r   r9   r9   r:   r   0  s
    
"zXGBTreeModelLoader.read_arrc                 C   s.   | j | j| j|  d}|  j|7  _|S )Nzutf-8)r   r   decode)r8   r   r   r9   r9   r:   r   7  s    zXGBTreeModelLoader.read_strc                 C   s   t d t d| j t d| j t d| j t d| j t d| j t d| j t d| j t d	| j t d
| j	 t   t d t d| j
 t d| j t d| j t d| j t d| j t d| j t d| j d S )Nz--- global parmeters ---zbase_score =znum_feature =znum_class =zcontain_extra_attrs =zcontain_eval_metrics =zname_obj_len =z
name_obj =zname_gbm_len =z
name_gbm =z"--- gbtree specific parameters ---znum_trees =znum_roots =zpad_32bit =znum_pbuffer_deprecated =znum_output_group =zsize_leaf_vector =)printr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r8   r9   r9   r:   
print_info<  s&    zXGBTreeModelLoader.print_info)NN)
ro   rp   rq   rr   r;   r   r   r   r   r   r9   r9   r9   r:   r     s   Y
 r   c                   @   s   e Zd Zdd ZdddZdS )r   c                 C   sT   |j ddd ttdd| _| jd d d d | _| jd d d	 d
 | _d S )Nzcb_model.jsonr   )r   rZ
model_infor   Zboosting_optionsZ
iterationsZtree_learner_optionsdepth)Z
save_modelr   loadopenloaded_cb_modelr   rf   )r8   Zcb_modelr9   r9   r:   r;   T  s    z CatBoostTreeModelLoader.__init__Nc                 C   sj  g }t | jD ]T}t| jd | d }| jd | d }dgt|d  | }t||d< t t|d ddD ](}|d| d  |d| d   ||< qr| jd | d }	dgt|	d  |	 }
d	d
 t t|	d D }|dgt|	 7 }dd
 t dt|	D }|dgt|	 7 }dd
 t t|	d D }|dgt|	 7 }g }g }| jd | d D ]"}||d  ||d  q`g }t|d d d D ]\}}||gd|  7 }q|dgt|	 7 }g }t|d d d D ]\}}||gd|  7 }q|dgt|	 7 }|tt	|t	|t	|t	|t	|t	|

dt	|d||d q|S )NZoblivious_treesZsplitsleaf_weightsr   r   r   r?   leaf_valuesc                 S   s   g | ]}|d  d qS r   r   r9   rD   r9   r9   r:   rI   p  s     z5CatBoostTreeModelLoader.get_trees.<locals>.<listcomp>c                 S   s   g | ]}|d  qS )r   r9   rD   r9   r9   r:   rI   s  s     c                 S   s   g | ]}|d  d qS r  r9   rD   r9   r9   r:   rI   v  s     Zfloat_feature_indexborder)r?   r   r   r   )rY   r   r5   r  r6   r   	enumerater   r#   r   rX   )r8   r!   r%   r   Z
tree_indexr   r  Zleaf_weights_unraveledindexr  Zleaf_values_unraveledra   rb   rc   Zsplit_features_indexZborderselemZsplit_features_index_unraveledcounterZfeature_indexZborders_unraveledr  r9   r9   r:   r   \  sR    & 
	z!CatBoostTreeModelLoader.get_trees)NN)ro   rp   rq   r;   r   r9   r9   r9   r:   r   S  s   r   )!numpyr#   Zscipy.specialr   multiprocessingsysr   osr   Zdistutils.versionr   Z	explainerr   commonr   r   r    r	   ImportErrorr   r   r   r   rg   r+   r   r)   r   r   objectr   r   r9   r9   r9   r:   <module>   s\     A  v = )