B
    Zi                 @   s   d Z ddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZ dddZdddZdd ZG dd deZdddZdS )zkInfluence and Outlier Measures

Created on Sun Jan 29 11:16:09 2012

Author: Josef Perktold
License: BSD-3
    )lzip)defaultdictN)OLS)cache_readonly)multipletests)maybe_unwrap_resultsbonf皙?Fc             C   s@  ddl m} |dkr$t| jjdd}t| dd}|dkrPt| }td|jj | j	}	|rt
|	 ddd }
|	|
 }	|dk	rt
||
 }| jd }|jt
|	|d	 }t|||d
}t
j|	||d f }|dk	r|dddf |k }|| }ntd}|dk	r<ddlm} ||dd|d gt
|| dS |S )a^  
    Outlier Tests for RegressionResults instances.

    Parameters
    ----------
    model_results : RegressionResults instance
        Linear model results
    method : str
        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` :
        - `holm` :
        - `simes-hochberg` :
        - `hommel` :
        - `fdr_bh` : Benjamini/Hochberg
        - `fdr_by` : Benjamini/Yekutieli
        See `statsmodels.stats.multitest.multipletests` for details.
    alpha : float
        familywise error rate
    labels : None or array_like
        If `labels` is not None, then it will be used as index to the
        returned pandas DataFrame. See also Returns below
    order : bool
        Whether or not to order the results by the absolute value of the
        studentized residuals. If labels are provided they will also be sorted.
    cutoff : None or float in [0, 1]
        If cutoff is not None, then the return only includes observations with
        multiple testing corrected p-values strictly below the cutoff. The
        returned array or dataframe can be empty if there are no outlier
        candidates at the specified cutoff.

    Returns
    -------
    table : ndarray or DataFrame
        Returns either an ndarray or a DataFrame if labels is not None.
        Will attempt to get labels from model_results if available. The
        columns are the Studentized residuals, the unadjusted p-value,
        and the corrected p-value according to method.

    Notes
    -----
    The unadjusted p-value is stats.t.sf(abs(resid), df) where
    df = df_resid - 1.
    r   )statsN
row_labelsZget_influencez=model_results object %s does not have a get_influence method.      )alphamethod)	DataFramestudent_residunadj_pz(p))columnsindex)scipyr
   getattrmodeldatar   AttributeError	__class____name__resid_studentized_externalnpabsZargsortasarraydf_residtsfr   Zc_slicepandasr   )Zmodel_resultsr   r   labelsordercutoffr
   inflresultsresididxZdfr   Zadj_pr   maskr    r.   Clib/python3.7/site-packages/statsmodels/stats/outliers_influence.pyoutlier_test   s8    .


r0      c             C   sz   |d }| j jjd }t| j|ddddf }t| j j|f}t| j j|	 }t
|d |jd |}||S )a  Ramsey's RESET specification test for linear models

    This is a general specification test, for additional non-linear effects
    in a model.


    Notes
    -----
    The test fits an auxiliary OLS regression where the design matrix, exog,
    is augmented by powers 2 to degree of the fitted values. Then it performs
    an F-test whether these additional terms are significant.

    If the p-value of the f-test is below a threshold, e.g. 0.1, then this
    indicates that there might be additional non-linear effects in the model
    and that the linear model is mis-specified.


    References
    ----------
    http://en.wikipedia.org/wiki/Ramsey_RESET_test

    r   N)r   exogshaper   Zvanderfittedvaluescolumn_stackr   endogfitZeyeZf_test)resZdegreer'   k_varsZy_fitted_vanderr3   Zres_auxZr_matrixr.   r.   r/   reset_ramseyd   s    r;   c             C   sX   | j d }| dd|f }t||k}| dd|f }t|| j}dd|  }|S )a~  variance inflation factor, VIF, for one exogenous variable

    The variance inflation factor is a measure for the increase of the
    variance of the parameter estimates if an additional variable, given by
    exog_idx is added to the linear regression. It is a measure for
    multicollinearity of the design matrix, exog.

    One recommendation is that if VIF is greater than 5, then the explanatory
    variable given by exog_idx is highly collinear with the other explanatory
    variables, and the parameter estimates will have large standard errors
    because of this.

    Parameters
    ----------
    exog : ndarray
        design matrix with all explanatory variables, as for example used in
        regression
    exog_idx : int
        index of the exogenous variable in the columns of exog

    Returns
    -------
    vif : float
        variance inflation factor

    Notes
    -----
    This function does not save the auxiliary regression.

    See Also
    --------
    xxx : class for regression diagnostics  TODO: doesn't exist yet

    References
    ----------
    http://en.wikipedia.org/wiki/Variance_inflation_factor

    r   Ng      ?)r4   r   aranger   r8   Zrsquared)r3   Zexog_idxr:   x_ir-   x_notiZr_squared_iZvifr.   r.   r/   variance_inflation_factor   s    '
r?   c               @   s  e Zd ZdZdd Zedd Zedd Zedd	 Zed
d Z	edd Z
edd Zedd Zd6ddZedd Zedd Zedd Zedd Zedd Zedd  Zed!d" Zed#d$ Zed%d& Zed'd( Zd7d+d,Zd-d. Zed/d0 Zd1d2 Zd8d4d5ZdS )9OLSInfluenceaf  class to calculate outlier and influence measures for OLS result

    Parameters
    ----------
    results : Regression Results instance
        currently assumes the results are from an OLS regression

    Notes
    -----
    One part of the results can be calculated without any auxiliary regression
    (some of which have the `_internal` postfix in the name. Other statistics
    require leave-one-observation-out (LOOO) auxiliary regression, and will be
    slower (mainly results with `_external` postfix in the name).
    The auxiliary LOOO regression only the required results are stored.

    Using the LOO measures is currently only recommended if the data set
    is not too large. One possible approach for LOOO measures would be to
    identify possible problem observations with the _internal measures, and
    then run the leave-one-observation-out only with observations that are
    possible outliers. (However, this is not yet available in an automized way.)

    This should be extended to general least squares.

    The leave-one-variable-out (LOVO) auxiliary regression are currently not
    used.

    c             C   sX   t || _|jjj\| _| _|jj| _|jj| _|jj| _	t
|j| _i | _i | _d S )N)r   r*   r   r3   r4   nobsr:   r7   r   model_classr   sqrt	mse_residZ	sigma_estaux_regression_exogaux_regression_endog)selfr*   r.   r.   r/   __init__   s    



zOLSInfluence.__init__c             C   s   | j | jjjj dS )z(cached attribute) diagonal of the hat_matrix for OLS

        Notes
        -----
        temporarily calculated here, this should go to model class
        r   )r3   r*   r   Z
pinv_wexogTsum)rG   r.   r.   r/   hat_matrix_diag   s    zOLSInfluence.hat_matrix_diagc             C   s   | j }| jjd|  S )z+(cached attribute) PRESS residuals
        r   )rK   r*   r+   )rG   hiir.   r.   r/   resid_press   s    zOLSInfluence.resid_pressc             C   s   | j }| jj| d|  S )z(cached attribute) influence measure

        matches the influence measure that gretl reports
        u * h / (1 - h)
        where u are the residuals and h is the diagonal of the hat_matrix
        r   )rK   r*   r+   )rG   rL   r.   r.   r/   	influence   s    zOLSInfluence.influencec             C   s   | j }|d|  S )z(cached attribute) factor of diagonal of hat_matrix used in influence

        this might be useful for internal reuse
        h / (1 - h)
        r   )rK   )rG   rL   r.   r.   r/   hat_diag_factor   s    zOLSInfluence.hat_diag_factorc             C   s   t | j| jS )zC(cached attribute) error sum of squares of PRESS residuals
        )r   dotrM   )rG   r.   r.   r/   	ess_press	  s    zOLSInfluence.ess_pressc             C   s   | j ddS )z(cached attribute) studentized residuals using variance from OLS

        this uses sigma from original estimate
        does not require leave one out loop
        N)sigma)get_resid_studentized_external)rG   r.   r.   r/   resid_studentized_internal  s    z'OLSInfluence.resid_studentized_internalc             C   s   t | j}| j|dS )z(cached attribute) studentized residuals using LOOO variance

        this uses sigma from leave-one-out estimates

        requires leave one out loop for observations
        )rR   )r   rC   sigma2_not_obsirS   )rG   Z
sigma_looor.   r.   r/   r     s    z'OLSInfluence.resid_studentized_externalNc             C   s:   | j }|dkr | jj}t|}| jj| td|  S )a  calculate studentized residuals

        Parameters
        ----------
        sigma : None or float
            estimate of the standard deviation of the residuals. If None, then
            the estimate from the regression results is used.

        Returns
        -------
        stzd_resid : ndarray
            studentized residuals

        Notes
        -----
        studentized residuals are defined as ::

           resid / sigma / np.sqrt(1 - hii)

        where resid are the residuals from the regression, sigma is an
        estimate of the standard deviation of the residuals, and hii is the
        diagonal of the hat_matrix.

        Nr   )rK   r*   rD   r   rC   r+   )rG   rR   rL   Z
sigma2_estr.   r.   r/   rS   $  s
    
z+OLSInfluence.get_resid_studentized_externalc             C   s@   | j }| jt|d|   }dt| jd | j  }||fS )z(cached attribute) dffits measure for influence of an observation

        based on resid_studentized_internal
        uses original results, no nobs loop

        r   r   g      ?)rK   rT   r   rC   r:   rA   )rG   rL   dffits_dffits_thresholdr.   r.   r/   dffits_internalE  s    zOLSInfluence.dffits_internalc             C   s@   | j }| jt|d|   }dt| jd | j  }||fS )a  (cached attribute) dffits measure for influence of an observation

        based on resid_studentized_external,
        uses results from leave-one-observation-out loop

        It is recommended that observations with dffits large than a
        threshold of 2 sqrt{k / n} where k is the number of parameters, should
        be investigated.

        Returns
        -------
        dffits: float
        dffits_threshold : float

        References
        ----------
        `Wikipedia <http://en.wikipedia.org/wiki/DFFITS>`_

        r   r   g      ?)rK   r   r   rC   r:   rA   )rG   rL   rV   rW   r.   r.   r/   dffitsU  s    zOLSInfluence.dffitsc             C   sF   | j j| j }|t| jdddf  }|tt| j j }|S )z](cached attribute) dfbetas

        uses results from leave-one-observation-out loop
        N)r*   paramsparams_not_obsir   rC   rU   ZdiagZnormalized_cov_params)rG   dfbetasr.   r.   r/   r\   r  s    zOLSInfluence.dfbetasc             C   s   t | jd S )z(cached attribute) error variance for all LOOO regressions

        This is 'mse_resid' from each auxiliary regression.

        uses results from leave-one-observation-out loop
        rD   )r   r    	_res_looo)rG   r.   r.   r/   rU   }  s    zOLSInfluence.sigma2_not_obsic             C   s   t | jd S )z(cached attribute) parameter estimates for all LOOO regressions

        uses results from leave-one-observation-out loop
        rZ   )r   r    r]   )rG   r.   r.   r/   r[     s    zOLSInfluence.params_not_obsic             C   s   t | jd S )z(cached attribute) determinant of cov_params of all LOOO regressions

        uses results from leave-one-observation-out loop
        det_cov_params)r   r    r]   )rG   r.   r.   r/   det_cov_params_not_obsi  s    z$OLSInfluence.det_cov_params_not_obsic             C   sP   | j }| jd | j }||d|  9 }ddlm} |j|| j| jj}||fS )zX(cached attribute) Cooks distance

        uses original results, no nobs loop

        r   r   r   )r
   )	rK   rT   r:   r   r
   fr#   r*   r!   )rG   rL   Zcooks_d2r
   Zpvalsr.   r.   r/   cooks_distance  s    zOLSInfluence.cooks_distancec             C   s   | j tj| j  }|S )z(cached attribute) covariance ratio between LOOO and original

        This uses determinant of the estimate of the parameter covariance
        from leave-one-out estimates.
        requires leave one out loop for observations

        )r_   r   linalgdetr*   
cov_params)rG   	cov_ratior.   r.   r/   re     s    
zOLSInfluence.cov_ratioc             C   s   | j jd| j  S )z(cached attribute) estimate of variance of the residuals

        ::

           sigma2 = sigma2_OLS * (1 - hii)

        where hii is the diagonal of the hat matrix

        r   )r*   rD   rK   )rG   r.   r.   r/   	resid_var  s    zOLSInfluence.resid_varc             C   s   t | jS )z(cached attribute) estimate of standard deviation of the residuals

        See Also
        --------
        resid_var

        )r   rC   rf   )rG   r.   r.   r/   	resid_std  s    	zOLSInfluence.resid_stdr7   Tc       
      C   s   |dkr,| j }t||r || S | jjj}nHy| j| |  W n tk
rR   Y nX | j| }i }| jdd|f }| jjd }t	
||k}| jdd|f }t|| }	|r|	||< |	S )a  regression results from LOVO auxiliary regression with cache


        The result instances are stored, which could use a large amount of
        memory if the datasets are large. There are too many combinations to
        store them all, except for small problems.

        Parameters
        ----------
        drop_idx : int
            index of exog that is dropped from the regression
        endog_idx : 'endog' or int
            If 'endog', then the endogenous variable of the result instance
            is regressed on the exogenous variables, excluding the one at
            drop_idx. If endog_idx is an integer, then the exog with that
            index is regressed with OLS on all other exogenous variables.
            (The latter is the auxiliary regression for the variance inflation
            factor.)

        this needs more thought, memory versus speed
        not yet used in any other parts, not sufficiently tested
        r7   Nr   )rF   hasattrr*   r   r7   rE   KeyErrorr3   r4   r   r<   r   r8   )
rG   Zdrop_idxZ	endog_idxZstoreZstoredr=   r:   r-   r>   r9   r.   r.   r/   
_ols_xnoti  s&    

zOLSInfluence._ols_xnotic             C   s   ddl m} | jjj}| j}|| j}tt}xN|D ]F\}}x<|D ]4}	| 	||dd|f 
 }
||	 t|
|	 qBW q4W |S )ao  regress endog on exog without one of the variables

        This uses a k_vars loop, only attributes of the OLS instance are stored.

        Parameters
        ----------
        attributes : list of strings
           These are the names of the attributes of the auxiliary OLS results
           instance that are stored and returned.

        not yet used
        r   )LeaveOneOutN)#statsmodels.sandbox.tools.cross_valrk   r*   r   r7   r3   r:   r   listrB   r8   appendr   )rG   Z
attributesrk   r7   r3   cv_iterZres_looinidxoutidxZattres_ir.   r.   r/   _get_drop_vari  s    


zOLSInfluence._get_drop_varic             C   s   ddl m} dd }| j}| j}tj|jtjd}tj|jtjd}tj|jtjd}|| j}xH|D ]@\}	}
| 	||	 ||	 
 }|j||
< |j||
< ||||
< qfW t|||dS )a9  collect required results from the LOOO loop

        all results will be attached.
        currently only 'params', 'mse_resid', 'det_cov_params' are stored

        regresses endog on exog dropping one observation at a time

        this uses a nobs loop, only attributes of the OLS instance are stored.
        r   )rk   c             S   s   t j|  S )N)r   rb   rc   rd   )r9   r.   r.   r/   <lambda>-  s    z(OLSInfluence._res_looo.<locals>.<lambda>)Zdtype)rZ   rD   r^   )rl   rk   r7   r3   r   Zzerosr4   floatrA   rB   r8   rZ   rD   dict)rG   rk   Zget_det_cov_paramsr7   r3   rZ   rD   r^   ro   rp   rq   rr   r.   r.   r/   r]   !  s    


zOLSInfluence._res_loooc          	   C   sz   ddl m} | jjj}|j}dd |jD }|t| jd | j	| j
| jd | j| jd d|d}|| j||d}||S )a  
        Creates a DataFrame with all available influence results.

        Returns
        -------
        frame : DataFrame
            A DataFrame with all results.

        Notes
        -----
        The resultant DataFrame contains six variables in addition to the
        DFBETAS. These are:

        * cooks_d : Cook's Distance defined in `Influence.cooks_distance`
        * standard_resid : Standardized residuals defined in
          `Influence.resid_studentized_internal`
        * hat_diag : The diagonal of the projection, or hat, matrix defined in
          `Influence.hat_matrix_diag`
        * dffits_internal : DFFITS statistics using internally Studentized
          residuals defined in `Influence.dffits_internal`
        * dffits : DFFITS statistics using externally Studentized residuals
          defined in `Influence.dffits`
        * student_resid : Externally Studentized residuals defined in
          `Influence.resid_studentized_external`
        r   )r   c             S   s   g | ]}d | qS )Zdfb_r.   ).0ir.   r.   r/   
<listcomp>_  s    z.OLSInfluence.summary_frame.<locals>.<listcomp>)Zcooks_dZstandard_residZhat_diagrX   r   rY   )r   )r   r   )r%   r   r*   r   r   r   Zxnamesrv   ra   rT   rK   rX   r   rY   r\   join)rG   r   r   r   Zbeta_labelsZsummary_dataZdfbetar.   r.   r/   summary_frame@  s    
zOLSInfluence.summary_frame%6.3fc             C   s   dt | jfd| jfd| jjfd| jd fd| jfd| jfd| j	d fd	| j
fd
| jd fg	}t| \}}t |}|| _ddlm}m} ddlm} ddlm} ||}	||}
dg|g|jd d   |	d< ||||	|
dS )a  create a summary table with all influence and outlier measures

        This does currently not distinguish between statistics that can be
        calculated from the original regression results and for which a
        leave-one-observation-out loop is needed

        Returns
        -------
        res : SimpleTable instance
           SimpleTable instance with the results, can be printed

        Notes
        -----
        This also attaches table_data to the instance.



        Zobsr7   zfitted
valuezCook's
dr   zstudent.
residualzhat diagzdffits 
internalzext.stud.
residualrY   )SimpleTabledefault_html_fmt)fmt_base)deepcopyz%4dr   	data_fmts)headerstxt_fmthtml_fmt)r   r<   rA   r7   r*   r5   ra   rT   rK   rX   r   rY   r   r6   Z
table_datastatsmodels.iolib.tabler}   r~   !statsmodels.iolib.tableformattingr   copyr   r4   )rG   Z	float_fmtZ	table_rawcolnamesr   r}   r~   r   r   fmtfmt_htmlr.   r.   r/   summary_tableq  s(     

zOLSInfluence.summary_table)N)r7   T)r|   )r   
__module____qualname____doc__rH   r   rK   rM   rN   rO   rQ   rT   r   rS   rX   rY   r\   rU   r[   r_   ra   re   rf   rg   rj   rs   r]   r{   r   r.   r.   r.   r/   r@      s2   



!

41r@   c             C   s  ddl m} ddlm} t| }t|j| j }|j	
|d | j}t| j||  | j||  g}|| |d}|\}	}
}t|
|f}t| jd|j  }tt| jd | jj| j||dddf |dddf |dddf |dddf | j||j|jd g}|}dd	d
dddddddddg}|}ddlm}m} ddlm} ddlm} ||}||}dgdg|jd d   |d< |||||d}|||fS )a  
    Generate summary table of outlier and influence similar to SAS

    Parameters
    ----------
    alpha : float
       significance level for confidence interval

    Returns
    -------
    st : SimpleTable instance
       table with results that can be printed
    data : ndarray
       calculated measures and statistics for the table
    ss2 : list of strings
       column_names for table (Note: rows of table are observations)
    r   )r
   )wls_prediction_stdg       @)r   r   NZObszDep Var
PopulationzPredicted
ValuezStd Error
Mean PredictzMean ci
95% lowzMean ci
95% uppzPredict ci
95% lowzPredict ci
95% uppZResidualzStd Error
ResidualzStudent
ResidualzCook's
D)r}   r~   )r   )r   z%4dz%6.3fr   )r   r   r   )r   r
   Z&statsmodels.sandbox.regression.predstdr   r@   r   rC   rK   rD   r"   Zisfr!   r6   r5   r<   rA   r   r7   r+   rT   ra   r   r}   r~   r   r   r   r   r4   )r9   r   r
   r   r)   Zpredict_mean_seZtppfZpredict_mean_ciZtmpZ
predict_seZpredict_ci_lowZpredict_ci_uppZ
predict_ciZresid_seZtable_smr   Zss2r   r}   r~   r   r   r   r   str.   r.   r/   r     sJ    
r   )r   r	   NFN)r1   )r	   )r   Zstatsmodels.compat.pythonr   collectionsr   Znumpyr   Z#statsmodels.regression.linear_modelr   Zstatsmodels.tools.decoratorsr   Zstatsmodels.stats.multitestr   Zstatsmodels.tools.toolsr   r0   r;   r?   objectr@   r   r.   r.   r.   r/   <module>   s     
O
%0   s