B
    Zg                 @   s~   d Z ddlmZ ddlmZmZ ddlZddlm	Z	 ddl
mZmZmZmZmZ dd	d
gZG dd deZG dd	 d	eZdS )aZ  
Multivariate Conditional and Unconditional Kernel Density Estimation
with Mixed Data Types.

References
----------
[1] Racine, J., Li, Q. Nonparametric econometrics: theory and practice.
    Princeton University Press. (2007)
[2] Racine, Jeff. "Nonparametric Econometrics: A Primer," Foundation
    and Trends in Econometrics: Vol 3: No 1, pp1-88. (2008)
    http://dx.doi.org/10.1561/0800000009
[3] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
    with Categorical and Continuous Data." Working Paper. (2000)
[4] Racine, J. Li, Q. "Kernel Estimation of Multivariate Conditional
    Distributions Annals of Economics and Finance 5, 211-235 (2004)
[5] Liu, R., Yang, L. "Kernel estimation of multivariate
    cumulative distribution function."
    Journal of Nonparametric Statistics (2008)
[6] Li, R., Ju, G. "Nonparametric Estimation of Multivariate CDF
    with Categorical and Continuous Data." Working Paper
[7] Li, Q., Racine, J. "Cross-validated local linear nonparametric
    regression" Statistica Sinica 14(2004), pp. 485-512
[8] Racine, J.: "Consistent Significance Testing for Nonparametric
        Regression" Journal of Business & Economics Statistics
[9] Racine, J., Hart, J., Li, Q., "Testing the Significance of
        Categorical Predictor Variables in Nonparametric Regression
        Models", 2006, Econometric Reviews 25, 523-544

    )division)rangenextN   )kernels)
GenericKDEEstimatorSettingsgpkeLeaveOneOut_adjust_shapeKDEMultivariateKDEMultivariateConditionalr   c               @   s\   e Zd ZdZde fddZdd Zdd fd	d
ZdddZdddZ	dd Z
dd ZdS )r   a  
    Multivariate kernel density estimator.

    This density estimator can handle univariate as well as multivariate data,
    including mixed continuous / ordered discrete / unordered discrete data.
    It also provides cross-validated bandwidth selection methods (least
    squares, maximum likelihood).

    Parameters
    ----------
    data: list of ndarrays or 2-D ndarray
        The training data for the Kernel Density Estimation, used to determine
        the bandwidth(s).  If a 2-D array, should be of shape
        (num_observations, num_variables).  If a list, each list element is a
        separate observation.
    var_type: str
        The type of the variables:

            - c : continuous
            - u : unordered (discrete)
            - o : ordered (discrete)

        The string should contain a type specifier for each variable, so for
        example ``var_type='ccuo'``.
    bw: array_like or str, optional
        If an array, it is a fixed user-specified bandwidth.  If a string,
        should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    defaults: EstimatorSettings instance, optional
        The default values for (efficient) bandwidth estimation.

    Attributes
    ----------
    bw: array_like
        The bandwidth parameters.

    See Also
    --------
    KDEMultivariateConditional

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> nobs = 300
    >>> np.random.seed(1234)  # Seed random generator
    >>> c1 = np.random.normal(size=(nobs,1))
    >>> c2 = np.random.normal(2, 1, size=(nobs,1))

    Estimate a bivariate distribution and display the bandwidth found:

    >>> dens_u = sm.nonparametric.KDEMultivariate(data=[c1,c2],
    ...     var_type='cc', bw='normal_reference')
    >>> dens_u.bw
    array([ 0.39967419,  0.38423292])
    Nc             C   s|   || _ t| j | _t|| j| _|| _t| j\| _| _| j| jkrNt	d| 
| | jsl| || _n| || _d S )NzGThe number of observations must be larger than the number of variables.)var_typelenk_varsr   data	data_typenpshapenobs
ValueError_set_defaults	efficient_compute_bwbw_compute_efficient)selfr   r   r   defaults r   Glib/python3.7/site-packages/statsmodels/nonparametric/kernel_density.py__init__h   s    
zKDEMultivariate.__init__c             C   sX   d}|dt | j d 7 }|dt | j d 7 }|d| j d 7 }|d| j d 7 }|S )z Provide something sane to print.zKDE instance
zNumber of variables: k_vars = 
zNumber of samples:   nobs = zVariable types:      zBW selection method: )strr   r   r   
_bw_method)r   rprr   r   r   __repr__x   s    zKDEMultivariate.__repr__c             C   s   | S )Nr   )xr   r   r   <lambda>   s    zKDEMultivariate.<lambda>c             C   sZ   t | j}d}xDt|D ]8\}}t|| | j|ddf  | jd}|||7 }qW | S )aV  
        Returns the leave-one-out likelihood function.

        The leave-one-out likelihood function for the unconditional KDE.

        Parameters
        ----------
        bw: array_like
            The value for the bandwidth parameter(s).
        func: callable, optional
            Function to transform the likelihood values (before summing); for
            the log likelihood, use ``func=np.log``.  Default is ``f(x) = x``.

        Notes
        -----
        The leave-one-out kernel estimator of :math:`f_{-i}` is:

        .. math:: f_{-i}(X_{i})=\frac{1}{(n-1)h}
                    \sum_{j=1,j\neq i}K_{h}(X_{i},X_{j})

        where :math:`K_{h}` represents the generalized product kernel
        estimator:

        .. math:: K_{h}(X_{i},X_{j}) =
            \prod_{s=1}^{q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
        r   N)r   data_predictr   )r
   r   	enumerater	   r   )r   r   funcLOOLiX_not_if_ir   r   r   loo_likelihood   s    

zKDEMultivariate.loo_likelihoodc          
   C   sx   |dkr| j }nt|| j}g }xHtt|d D ]2}|t| j| j ||ddf | j	d| j
  q4W t|}|S )aR  
        Evaluate the probability density function.

        Parameters
        ----------
        data_predict: array_like, optional
            Points to evaluate at.  If unspecified, the training data is used.

        Returns
        -------
        pdf_est: array_like
            Probability density function evaluated at `data_predict`.

        Notes
        -----
        The probability density is given by the generalized product kernel
        estimator:

        .. math:: K_{h}(X_{i},X_{j}) =
            \prod_{s=1}^{q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
        Nr   )r   r(   r   )r   r   r   r   r   r   appendr	   r   r   r   squeeze)r   r(   pdf_estr-   r   r   r   pdf   s    
zKDEMultivariate.pdfc             C   s~   |dkr| j }nt|| j}g }xNtt|d D ]8}|t| j| j ||ddf | j	dddd| j
  q4W t|}|S )a  
        Evaluate the cumulative distribution function.

        Parameters
        ----------
        data_predict: array_like, optional
            Points to evaluate at.  If unspecified, the training data is used.

        Returns
        -------
        cdf_est: array_like
            The estimate of the cdf.

        Notes
        -----
        See http://en.wikipedia.org/wiki/Cumulative_distribution_function
        For more details on the estimation see Ref. [5] in module docstring.

        The multivariate CDF for mixed data (continuous and ordered/unordered
        discrete) is estimated by:

        .. math:: 
        
            F(x^{c},x^{d})=n^{-1}\sum_{i=1}^{n}\left[G(\frac{x^{c}-X_{i}}{h})\sum_{u\leq x^{d}}L(X_{i}^{d},x_{i}^{d}, \lambda)\right]

        where G() is the product kernel CDF estimator for the continuous
        and L() for the discrete variables.

        Used bandwidth is ``self.bw``.
        Nr   gaussian_cdfaitchisonaitken_cdfwangryzin_cdf)r   r(   r   ckertypeukertypeokertype)r   r   r   r   r   r   r1   r	   r   r   r   r2   )r   r(   cdf_estr-   r   r   r   cdf   s    
zKDEMultivariate.cdfc             C   s  d}t tjtjtjd}| j}| j }| j}t	dd |D }|| 
 }t|j}	x~t|D ]r}
xHt|D ]<\}}|| || |dd|f ||
|f |	dd|f< qpW |	j
dd| }|jdd}||7 }qbW t tjtjtjd}t| j}d}t|jd d |jd f}	xt|D ]x\}
}xLt|D ]@\}}|| || |dd|f  ||
|f |	dd|f< q4W |	j
dd| }||jdd7 }q"W ||d  d| ||d    S )	a  
        Returns the Integrated Mean Square Error for the unconditional KDE.

        Parameters
        ----------
        bw: array_like
            The bandwidth parameter(s).

        Returns
        -------
        CV: float
            The cross-validation objective function.

        Notes
        -----
        See p. 27 in [1]_ for details on how to handle the multivariate
        estimation with mixed data types see p.6 in [2]_.

        The formula for the cross-validation objective function is:

        .. math:: CV=\frac{1}{n^{2}}\sum_{i=1}^{n}\sum_{j=1}^{N}
            \bar{K}_{h}(X_{i},X_{j})-\frac{2}{n(n-1)}\sum_{i=1}^{n}
            \sum_{j=1,j\neq i}^{N}K_{h}(X_{i},X_{j})

        Where :math:`\bar{K}_{h}` is the multivariate product convolution
        kernel (consult [2]_ for mixed data types).

        References
        ----------
        .. [1] Racine, J., Li, Q. Nonparametric econometrics: theory and
                practice. Princeton University Press. (2007)
        .. [2] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
                with Categorical and Continuous Data." Working Paper. (2000)
        r   )couc             S   s   g | ]}|d kqS )r=   r   ).0r=   r   r   r   
<listcomp>3  s    z(KDEMultivariate.imse.<locals>.<listcomp>Nr   )axis   )dictr   Zgaussian_convolutionZwang_ryzin_convolutionZaitchison_aitken_convolutionr   r   r   r   ZarrayZprodemptyr   r   r)   sumZgaussianZ
wang_ryzinZaitchison_aitkenr
   )r   r   FZkertypesr   r   r   Zix_contZ_bw_cont_productZKvalr-   iiZvtypeZdensZ	k_bar_sumr+   r,   r.   r   r   r   imse   s@    3


 zKDEMultivariate.imsec             C   s   d}| j f}||fS )z@Helper method to be able to pass needed vars to _compute_subset.r   )r   )r   
class_type
class_varsr   r   r   _get_class_vars_typeQ  s    z$KDEMultivariate._get_class_vars_type)N)N)__name__
__module____qualname____doc__r   r    r%   r0   r4   r<   rI   rL   r   r   r   r   r   ,   s   ;	$
$
0Xc               @   sZ   e Zd ZdZe fddZdd Zdd fdd	ZdddZdddZ	dd Z
dd Zd
S )r   au  
    Conditional multivariate kernel density estimator.

    Calculates ``P(Y_1,Y_2,...Y_n | X_1,X_2...X_m) =
    P(X_1, X_2,...X_n, Y_1, Y_2,..., Y_m)/P(X_1, X_2,..., X_m)``.
    The conditional density is by definition the ratio of the two densities,
    see [1]_.

    Parameters
    ----------
    endog: list of ndarrays or 2-D ndarray
        The training data for the dependent variables, used to determine
        the bandwidth(s).  If a 2-D array, should be of shape
        (num_observations, num_variables).  If a list, each list element is a
        separate observation.
    exog: list of ndarrays or 2-D ndarray
        The training data for the independent variable; same shape as `endog`.
    dep_type: str
        The type of the dependent variables:

            c : Continuous
            u : Unordered (Discrete)
            o : Ordered (Discrete)

        The string should contain a type specifier for each variable, so for
        example ``dep_type='ccuo'``.
    indep_type: str
        The type of the independent variables; specifed like `dep_type`.
    bw: array_like or str, optional
        If an array, it is a fixed user-specified bandwidth.  If a string,
        should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    defaults: Instance of class EstimatorSettings
        The default values for the efficient bandwidth estimation

    Attributes
    ---------
    bw: array_like
        The bandwidth parameters

    See Also
    --------
    KDEMultivariate

    References
    ----------
    .. [1] http://en.wikipedia.org/wiki/Conditional_probability_distribution

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> nobs = 300
    >>> c1 = np.random.normal(size=(nobs,1))
    >>> c2 = np.random.normal(2,1,size=(nobs,1))

    >>> dens_c = sm.nonparametric.KDEMultivariateConditional(endog=[c1],
    ...     exog=[c2], dep_type='c', indep_type='c', bw='normal_reference')
    >>> dens_c.bw   # show computed bandwidth
    array([ 0.41223484,  0.40976931])
    c             C   s   || _ || _|| | _t| j | _t| j| _t|| j| _t|| j| _t	
| j\| _| _t	| j| jf| _t	
| jd | _| | | js| || _n| || _d S )Nr   )dep_type
indep_typer   r   k_depk_indepr   endogexogr   r   r   column_stackr   r   r   r   r   r   r   )r   rU   rV   rQ   rR   r   r   r   r   r   r      s    

z#KDEMultivariateConditional.__init__c             C   s   d}|dt | j d 7 }|dt | j d 7 }|dt | j d 7 }|d| j d 7 }|d| j d 7 }|d| j d 7 }|S )	z Provide something sane to print.z$KDEMultivariateConditional instance
z+Number of independent variables: k_indep = r!   z'Number of dependent variables: k_dep = zNumber of observations: nobs = z!Independent variable types:      zDependent variable types:      zBW selection method: )r"   rT   rS   r   rR   rQ   r#   )r   r$   r   r   r   r%     s    z#KDEMultivariateConditional.__repr__c             C   s   | S )Nr   )r&   r   r   r   r'     s    z#KDEMultivariateConditional.<lambda>c             C   s   t | j}t | j }d}xt|D ]|\}}t|}t|| | j|ddf  | j| j d}	t|| j	d | | j|ddf  | jd}
|	|
 }|||7 }q&W | S )a  
        Returns the leave-one-out conditional likelihood of the data.

        If `func` is not equal to the default, what's calculated is a function
        of the leave-one-out conditional likelihood.

        Parameters
        ----------
        bw: array_like
            The bandwidth parameter(s).
        func: callable, optional
            Function to transform the likelihood values (before summing); for
            the log likelihood, use ``func=np.log``.  Default is ``f(x) = x``.

        Returns
        -------
        L: float
            The value of the leave-one-out function for the data.

        Notes
        -----
        Similar to ``KDE.loo_likelihood`, but substitute ``f(y|x)=f(x,y)/f(x)``
        for ``f(x)``.
        r   N)r   r(   r   )
r
   r   rV   __iter__r)   r   r	   rQ   rR   rS   )r   r   r*   ZyLOOZxLOOr,   r-   ZY_jr.   f_yxf_xr/   r   r   r   r0     s    

z)KDEMultivariateConditional.loo_likelihoodNc             C   s   |dkr| j }nt|| j}|dkr,| j}nt|| j}g }t||f}x|tt|d D ]f}t	| j
| j||ddf | j| j d}t	| j
| jd | j||ddf | jd}|||  q^W t|S )aN  
        Evaluate the probability density function.

        Parameters
        ----------
        endog_predict: array_like, optional
            Evaluation data for the dependent variables.  If unspecified, the
            training data is used.
        exog_predict: array_like, optional
            Evaluation data for the independent variables.

        Returns
        -------
        pdf: array_like
            The value of the probability density at `endog_predict` and `exog_predict`.

        Notes
        -----
        The formula for the conditional probability density is:

        .. math:: f(y|x)=\frac{f(x,y)}{f(x)}

        with

        .. math:: f(x)=\prod_{s=1}^{q}h_{s}^{-1}k
                            \left(\frac{x_{is}-x_{js}}{h_{s}}\right)

        where :math:`k` is the appropriate kernel for each variable.
        Nr   )r   r(   r   )rU   r   rS   rV   rT   r   rW   r   r   r	   r   r   rQ   rR   r1   r2   )r   endog_predictexog_predictr3   r(   r-   rY   rZ   r   r   r   r4     s"    

zKDEMultivariateConditional.pdfc       
      C   s&  |dkr| j }nt|| j}|dkr,| j}nt|| j}t|d }t|}xt|D ]}t	| j
| jd | j||ddf | jd| j }t|}t	| j
d| j | j ||ddf | jddddd}t	| j
| jd | j||ddf | jdd	}|| jdd
}	|	| j|  ||< qZW |S )a  
        Cumulative distribution function for the conditional density.

        Parameters
        ----------
        endog_predict: array_like, optional
            The evaluation dependent variables at which the cdf is estimated.
            If not specified the training dependent variables are used.
        exog_predict: array_like, optional
            The evaluation independent variables at which the cdf is estimated.
            If not specified the training independent variables are used.

        Returns
        -------
        cdf_est: array_like
            The estimate of the cdf.

        Notes
        -----
        For more details on the estimation see [2]_, and p.181 in [1]_.

        The multivariate conditional CDF for mixed data (continuous and
        ordered/unordered discrete) is estimated by:

        .. math:: 
            
            F(y|x)=\frac{n^{-1}\sum_{i=1}^{n}G(\frac{y-Y_{i}}{h_{0}}) W_{h}(X_{i},x)}{\widehat{\mu}(x)}

        where G() is the product kernel CDF estimator for the dependent (y)
        variable(s) and W() is the product kernel CDF estimator for the
        independent variable(s).

        References
        ----------
        .. [1] Racine, J., Li, Q. Nonparametric econometrics: theory and
                practice. Princeton University Press. (2007)
        .. [2] Liu, R., Yang, L. "Kernel estimation of multivariate cumulative
                    distribution function." Journal of Nonparametric
                    Statistics (2008)
        Nr   )r   r(   r   r5   r6   r7   F)r   r(   r   r8   r9   r:   tosum)r   r(   r   r]   )rB   )rU   r   rS   rV   rT   r   r   rE   r   r	   r   rR   r   r2   rQ   rF   )
r   r[   r\   ZN_data_predictr;   r-   Zmu_xZ	cdf_endogZcdf_exogSr   r   r   r<     s2    )


zKDEMultivariateConditional.cdfc             C   s  t | j}d}t| j}t| jd df}xt|D ]t\}}|dd| jdf }|ddd| jf }	t|	|}
t||	}t||}t||}t	|| jd || j
|ddf | jdd}t	|| jd || j
|ddf | jdd}t	|d| j |
|| jddddd	}|| |  |d
  }t	|| | j|ddf  | j| j d| }t	|| jd | | j
|ddf  | jd| }|||d
  d
||   7 }q8W || S )a  
        The integrated mean square error for the conditional KDE.

        Parameters
        ----------
        bw: array_like
            The bandwidth parameter(s).

        Returns
        -------
        CV: float
            The cross-validation objective function.

        Notes
        -----
        For more details see pp. 156-166 in [1]_. For details on how to
        handle the mixed variable types see [2]_.

        The formula for the cross-validation objective function for mixed
        variable types is:

        .. math:: CV(h,\lambda)=\frac{1}{n}\sum_{l=1}^{n}
            \frac{G_{-l}(X_{l})}{\left[\mu_{-l}(X_{l})\right]^{2}}-
            \frac{2}{n}\sum_{l=1}^{n}\frac{f_{-l}(X_{l},Y_{l})}{\mu_{-l}(X_{l})}

        where

        .. math:: G_{-l}(X_{l}) = n^{-2}\sum_{i\neq l}\sum_{j\neq l}
                        K_{X_{i},X_{l}} K_{X_{j},X_{l}}K_{Y_{i},Y_{j}}^{(2)}

        where :math:`K_{X_{i},X_{l}}` is the multivariate product kernel and
        :math:`\mu_{-l}(X_{l})` is the leave-one-out estimator of the pdf.

        :math:`K_{Y_{i},Y_{j}}^{(2)}` is the convolution kernel.

        The value of the function is minimized by the ``_cv_ls`` method of the
        `GenericKDE` class to return the bw estimates that minimize the
        distance between the estimated and "true" probability density.

        References
        ----------
        .. [1] Racine, J., Li, Q. Nonparametric econometrics: theory and
                practice. Princeton University Press. (2007)
        .. [2] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
                with Categorical and Continuous Data." Working Paper. (2000)
        r   r   NF)r   r(   r   r]   Zgauss_convolutionZwangryzin_convolutionZaitchisonaitken_convolution)r   r(   r   r8   r:   r9   r]   rC   )r   r(   r   )r
   r   floatr   r   Zonesr)   rS   Zkronr	   rV   rR   rQ   rF   )r   r   ZzLOOZCVr   ZexpanderrH   ZXYZYe_LZYe_RZXe_LZXe_RZK_Xi_XlZK_Xj_XlZK2_Yi_YjGZf_X_YZm_xr   r   r   rI   ]  s>    /

 zKDEMultivariateConditional.imsec             C   s   d}| j | j| jf}||fS )z@Helper method to be able to pass needed vars to _compute_subset.r   )rS   rQ   rR   )r   rJ   rK   r   r   r   rL     s    z/KDEMultivariateConditional._get_class_vars_type)NN)NN)rM   rN   rO   rP   r   r    r%   r0   r4   r<   rI   rL   r   r   r   r   r   X  s   @(
4
HP)rP   Z
__future__r   Zstatsmodels.compat.pythonr   r   Znumpyr    r   Z_kernel_baser   r   r	   r
   r   __all__r   r   r   r   r   r   <module>   s   
  .