ó
áp7]c           @   sˆ  d  Z  d d l m Z m Z d d l Z d d l Z d d l m Z d d l	 m
 Z
 y d d l Z e Z Wn e k
 rƒ e Z n Xd d l m Z e d e j d	 e j d
 e j d e j d e j d e j d e j d e j d e j d e j d e j d e j ƒ Z d „  Z  d „  Z! d e" f d „  ƒ  YZ# d e" f d „  ƒ  YZ$ d e" f d „  ƒ  YZ% d „  Z& d „  Z' d
 d d	 e d „ Z( d S(   sh   
Module containing the base object for multivariate kernel density and
regression, plus some utilities.
iÿÿÿÿ(   t   ranget   string_typesN(   t   optimize(   t
   mquantilesi   (   t   kernelst	   wangryzint   aitchisonaitkent   gaussiant   aitchison_aitken_regt   wangryzin_regt   gauss_convolutiont   wangryzin_convolutiont   aitchisonaitken_convolutiont   gaussian_cdft   aitchisonaitken_cdft   wangryzin_cdft
   d_gaussianc         C   sq   t  j |  d d ƒ} t |  d d d ƒj d } t |  d d d ƒj d } | | d } t  j | | ƒ } | S(   s1   Compute minimum of std and IQR for each variable.t   axisi    g      è?g      Ð?g/Ý$•õ?(   t   npt   stdR   t   datat   minimum(   R   t   s1t   q75t   q25t   s2t
   dispersion(    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyt   _compute_min_std_IQR#   s    c         C   sƒ  |
 r5 t  j j | ƒ | d | … d d … f } n$ | | d | d … d d … f } |  d k r¦ d d l m } |	 d } | | | d | d t d t ƒ ƒ} nD|  d	 k r6d d
 l m } |	 \ } } } | d d … d | … f } | d d … | d … f } | | | | | d | d t d t ƒ ƒ} n´ |  d k rÞd d l m	 } |	 \ } } } t
 | d d … d f d ƒ } t
 | d d … d d … f | ƒ } | d | d | d | d | d | d t d t ƒ ƒ } n t d ƒ ‚ |  d k r| d d … d d … f } n  t | ƒ } | | d | | } | d | | | | <| d | | | | <| j | } | j } | | f S(   s¾   "Compute bw on subset of data.

    Called from ``GenericKDE._compute_efficient_*``.

    Notes
    -----
    Needs to be outside the class in order for joblib to be able to pickle it.

    Ni    i   t   KDEMultivariate(   R   t   bwt   defaultst	   efficientt   KDEMultivariateConditional(   R    t	   KernelReg(   R!   t   endogt   exogt   reg_typet   var_typesd   class_type not recognized, should be one of {KDEMultivariate, KDEMultivariateConditional, KernelReg}g      ð¿g       À(   R   t   randomt   shufflet   kernel_densityR   t   EstimatorSettingst   FalseR    t   kernel_regressionR!   t   _adjust_shapet
   ValueErrorR   R   (   t
   class_typeR   R   t   cot   dot   n_cvarst   ix_ordt   ix_unordt   n_subt
   class_varst	   randomizet   boundt   sub_dataR   R%   t	   sub_modelR    t   k_dept   dep_typet
   indep_typeR"   R#   R!   t   k_varsR$   R   t   fctt   sample_scale_subt   bw_sub(    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyt   _compute_subset-   sD    $
!%	t
   GenericKDEc           B   sh   e  Z d  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z	 d „  Z
 d	 „  Z d
 „  Z RS(   sG   
    Base class for density estimation and regression KDE classes.
    c         C   sŽ   | d k r d } n  t | t ƒ s? d |  _ t j | ƒ } nK | |  _ | d k r` |  j } n! | d k rx |  j } n	 |  j } | ƒ  } | S(   sé  
        Computes the bandwidth of the data.

        Parameters
        ----------
        bw: array_like or str
            If array_like: user-specified bandwidth.
            If a string, should be one of:

                - cv_ml: cross validation maximum likelihood
                - normal_reference: normal reference rule of thumb
                - cv_ls: cross validation least squares

        Notes
        -----
        The default values for bw is 'normal_reference'.
        t   normal_references   user-specifiedt   cv_mlN(	   t   Nonet
   isinstanceR   t
   _bw_methodR   t   asarrayt   _normal_referencet   _cv_mlt   _cv_ls(   t   selfR   t   rest   bwfunc(    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyt   _compute_bwh   s    					c         C   s
   t  | ƒ S(   sß  
        Computes the measure of dispersion.

        The minimum of the standard deviation and interquartile range / 1.349

        Notes
        -----
        Reimplemented in `KernelReg`, because the first column of `data` has to
        be removed.

        References
        ----------
        See the user guide for the np package in R.
        In the notes on bwscaling option in npreg, npudens, npcdens there is
        a discussion on the measure of dispersion
        (   R   (   RL   R   (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyt   _compute_dispersionŽ   s    c         C   s   d S(   sp   Helper method to be able to pass needed vars to _compute_subset.

        Needs to be implemented by subclasses.N(    (   RL   (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyt   _get_class_vars_type¡   s    c            s6  ˆ d	 k r d ˆ _ n  t ˆ t ƒ r3 ˆ ˆ _ n d ˆ _ ˆ Sˆ j } ˆ j ‰
 t j ˆ j ƒ ‰ ˆ j	 j
 d ƒ ‰	 d ‰ d ‰ t ˆ j	 ƒ \ } ‰ ‰ ˆ j r¶ d	 g ˆ j ‰  nc g  t | ˆ
 ƒ D] } | ˆ
 | d ˆ
 f ^ qÇ ‰  | ˆ
 d k rˆ  j | | ˆ
 | f ƒ n  ˆ j r+ˆ j n	 t ˆ  ƒ } t j | ˆ j f ƒ } t j | ˆ j f ƒ } ˆ j ƒ  \ ‰ ‰ t rÔt j d ˆ j ƒ ‡  ‡ ‡ ‡ ‡ ‡ ‡ ‡ ‡ ‡	 ‡
 ‡ f d †  t | ƒ Dƒ ƒ } nX g  } xO t | ƒ D]A } | j t ˆ ˆ ˆ ˆ ˆ ˆ	 ˆ ˆ ˆ
 ˆ ˆ j ˆ  | ƒ ƒ qçWxP t | ƒ D]B } | | d | | d	 d	 … f <| | d | | d	 d	 … f <q9Wˆ j ˆ ƒ }	 ˆ j r t j n t j }
 |
 | d
 d ƒ} | |	 | d ˆ	 ˆ ‰ | ˆ | d ˆ	 ˆ ˆ ˆ <| ˆ | d ˆ	 ˆ ˆ ˆ <ˆ j r2t j | d
 d ƒ‰ n  ˆ S(   s\  
        Computes the bandwidth by estimating the scaling factor (c)
        in n_res resamples of size ``n_sub`` (in `randomize` case), or by
        dividing ``nobs`` into as many ``n_sub`` blocks as needed (if
        `randomize` is False).

        References
        ----------
        See p.9 in socserv.mcmaster.ca/racine/np_faq.pdf
        RC   s   user-specifiedt   ci   i   i    t   n_jobsc         3   sL   |  ]B } t  j t ƒ ˆ ˆ ˆ ˆ ˆ ˆ	 ˆ ˆ ˆ
 ˆ ˆ j ˆ  | ƒ Vq d  S(   N(   t   joblibt   delayedRA   R6   (   t   .0t   i(   t   boundsR   R.   R5   R/   R   R0   R2   R3   R1   R4   RL   (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pys	   <genexpr>Ô   s   NR   g      ð¿g       À(   RE   RG   RF   R   t   nobsR4   t   copyt   deepcopyR   t	   data_typet   countt   _get_type_posR6   t   n_resR    t   appendt   lenR   t   emptyR=   RQ   t
   has_joblibRT   t   ParallelRS   RA   RP   t   return_mediant   mediant   meant   return_only_bw(   RL   R   RY   t   _RW   t   n_blockst   sample_scalet   only_bwRM   t   st
   order_funct   m_scale(    (   RX   R   R.   R5   R/   R   R0   R2   R3   R1   R4   RL   sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyt   _compute_efficient§   sX    				5-	"	c         C   sX   | j  |  _  | j |  _ | j |  _ | j |  _ | j |  _ | j |  _ | j |  _ d S(   s4   Sets the default values for the efficient estimationN(   R_   R4   R6   Re   R   Rh   RS   (   RL   R   (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyt   _set_defaultsñ   s    c         C   s=   t  j |  j d d ƒ} d | |  j d d |  j j d S(   s\  
        Returns Scott's normal reference rule of thumb bandwidth parameter.

        Notes
        -----
        See p.13 in [2] for an example and discussion.  The formula for the
        bandwidth is

        .. math:: h = 1.06n^{-1/(4+q)}

        where ``n`` is the number of observations and ``q`` is the number of
        variables.
        R   i    gö(\Âõð?g      ð¿i   i   (   R   R   R   RY   t   shape(   RL   t   X(    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyRI   û   s    c         C   s`   d | | d k  <t  |  j ƒ \ } } } t j | | d ƒ | | <t j | | d ƒ | | <| S(   s{   
        Sets bandwidth lower bound to effectively zero )1e-10), and for
        discrete values upper bound to 1.
        g»½×Ùß|Û=i    g      ð?(   R^   R\   R   R   (   RL   R   Ri   R2   R3   (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyt   _set_bw_bounds  s
    c         C   s[   |  j  ƒ  } t j |  j d | d t j f d d d d d d d d	 ƒ} |  j | ƒ } | S(
   s  
        Returns the cross validation maximum likelihood bandwidth parameter.

        Notes
        -----
        For more details see p.16, 18, 27 in Ref. [1] (see module docstring).

        Returns the bandwidth estimate that maximizes the leave-out-out
        likelihood.  The leave-one-out log likelihood function is:

        .. math:: \ln L=\sum_{i=1}^{n}\ln f_{-i}(X_{i})

        The leave-one-out kernel estimator of :math:`f_{-i}` is:

        .. math:: f_{-i}(X_{i})=\frac{1}{(n-1)h}
                        \sum_{j=1,j\neq i}K_{h}(X_{i},X_{j})

        where :math:`K_{h}` represents the Generalized product kernel
        estimator:

        .. math:: K_{h}(X_{i},X_{j})=\prod_{s=1}^
                        {q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
        t   x0t   argst   maxiterg     @@t   maxfunt   dispi    t   xtolgü©ñÒMbP?(   RI   R   t   fmint   loo_likelihoodR   t   logRt   (   RL   t   h0R   (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyRJ     s
    !c         C   sO   |  j  ƒ  } t j |  j d | d d d d d d d d ƒ} |  j | ƒ } | S(	   s‘  
        Returns the cross-validation least squares bandwidth parameter(s).

        Notes
        -----
        For more details see pp. 16, 27 in Ref. [1] (see module docstring).

        Returns the value of the bandwidth that maximizes the integrated mean
        square error between the estimated and actual distribution.  The
        integrated mean square error (IMSE) is given by:

        .. math:: \int\left[\hat{f}(x)-f(x)\right]^{2}dx

        This is the general formula for the IMSE.  The IMSE differs for
        conditional (``KDEMultivariateConditional``) and unconditional
        (``KDEMultivariate``) kernel density estimation.
        Ru   Rw   g     @@Rx   Ry   i    Rz   gü©ñÒMbP?(   RI   R   R{   t   imseRt   (   RL   R~   R   (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyRK   7  s
    '	c         C   s
   t  ‚ d  S(   N(   t   NotImplementedError(   RL   (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyR|   O  s    (   t   __name__t
   __module__t   __doc__RO   RP   RQ   Rp   Rq   RI   Rt   RJ   RK   R|   (    (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyRB   d   s   	&			J	
				R)   c           B   s,   e  Z d  Z e e d d e e d d „ Z RS(   s®  
    Object to specify settings for density estimation or regression.

    `EstimatorSettings` has several proporties related to how bandwidth
    estimation for the `KDEMultivariate`, `KDEMultivariateConditional`,
    `KernelReg` and `CensoredKernelReg` classes behaves.

    Parameters
    ----------
    efficient: bool, optional
        If True, the bandwidth estimation is to be performed
        efficiently -- by taking smaller sub-samples and estimating
        the scaling factor of each subsample.  This is useful for large
        samples (nobs >> 300) and/or multiple variables (k_vars > 3).
        If False (default), all data is used at the same time.
    randomize: bool, optional
        If True, the bandwidth estimation is to be performed by
        taking `n_res` random resamples (with replacement) of size `n_sub` from
        the full sample.  If set to False (default), the estimation is
        performed by slicing the full sample in sub-samples of size `n_sub` so
        that all samples are used once.
    n_sub: int, optional
        Size of the sub-samples.  Default is 50.
    n_res: int, optional
        The number of random re-samples used to estimate the bandwidth.
        Only has an effect if ``randomize == True``.  Default value is 25.
    return_median: bool, optional
        If True (default), the estimator uses the median of all scaling factors
        for each sub-sample to estimate the bandwidth of the full sample.
        If False, the estimator uses the mean.
    return_only_bw: bool, optional
        If True, the estimator is to use the bandwidth and not the
        scaling factor.  This is *not* theoretically justified.
        Should be used only for experimenting.
    n_jobs : int, optional
        The number of jobs to use for parallel estimation with
        ``joblib.Parallel``.  Default is -1, meaning ``n_cores - 1``, with
        ``n_cores`` the number of available CPU cores.
        See the `joblib documentation
        <https://pythonhosted.org/joblib/parallel.html>`_ for more details.

    Examples
    --------
    >>> settings = EstimatorSettings(randomize=True, n_jobs=3)
    >>> k_dens = KDEMultivariate(data, var_type, defaults=settings)

    i   i2   iÿÿÿÿc         C   sC   | |  _  | |  _ | |  _ | |  _ | |  _ | |  _ | |  _ d  S(   N(   R   R6   R_   R4   Re   Rh   RS   (   RL   R   R6   R_   R4   Re   Rh   RS   (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyt   __init__ƒ  s    						(   R   R‚   Rƒ   R*   t   TrueR„   (    (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyR)   S  s   /t   LeaveOneOutc           B   s    e  Z d  Z d „  Z d „  Z RS(   sŽ  
    Generator to give leave-one-out views on X.

    Parameters
    ----------
    X : array-like
        2-D array.

    Examples
    --------
    >>> X = np.random.normal(0, 1, [10,2])
    >>> loo = LeaveOneOut(X)
    >>> for x in loo:
    ...    print x

    Notes
    -----
    A little lighter weight than sklearn LOO. We don't need test index.
    Also passes views on X, not the index.
    c         C   s   t  j | ƒ |  _ d  S(   N(   R   RH   Rs   (   RL   Rs   (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyR„   £  s    c         c   sp   |  j  } t j | ƒ \ } } xK t | ƒ D]= } t j | d t j ƒ} t | | <| | d  d  … f Vq+ Wd  S(   Nt   dtype(   Rs   R   Rr   R    t   onest   boolR*   (   RL   Rs   RY   R=   RW   t   index(    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyt   __iter__¦  s    	
(   R   R‚   Rƒ   R„   R‹   (    (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyR†   Ž  s   	c         C   s…   t  j g  |  D] } | d k ^ q ƒ } t  j g  |  D] } | d k ^ q5 ƒ } t  j g  |  D] } | d k ^ q] ƒ } | | | f S(   NRR   t   ot   u(   R   t   array(   R%   RR   t   ix_contR2   R3   (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyR^   °  s    (((c         C   së   t  j |  ƒ }  |  j d k r0 t  j |  ƒ }  n  |  j d k rT | d k rT d } n{ |  j d k r~ | d k r~ t |  ƒ } nQ t  j |  ƒ d | k r¼ t  j |  ƒ d | k r¼ |  j }  n  t  j |  ƒ d } t  j |  | | f ƒ }  |  S(   s>    Returns an array of shape (nobs, k_vars) for use with `gpke`.i   i   i    (   R   RH   t   ndimt   squeezeRa   Rr   t   Tt   reshape(   t   datR=   RY   (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyR,   ·  s    	2c         C   sû   t  d | d | d | ƒ } t j | j ƒ }	 xb t | ƒ D]T \ }
 } t | | } | |  |
 | d d … |
 f | |
 ƒ |	 d d … |
 f <q: Wt j g  | D] } | d k ^ qŸ ƒ } |	 j d d ƒ t j |  | ƒ } | ró | j d d ƒ S| Sd S(   sS  
    Returns the non-normalized Generalized Product Kernel Estimator

    Parameters
    ----------
    bw: 1-D ndarray
        The user-specified bandwidth parameters.
    data: 1D or 2-D ndarray
        The training data.
    data_predict: 1-D ndarray
        The evaluation points at which the kernel estimation is performed.
    var_type: str, optional
        The variable type (continuous, ordered, unordered).
    ckertype: str, optional
        The kernel used for the continuous variables.
    okertype: str, optional
        The kernel used for the ordered discrete variables.
    ukertype: str, optional
        The kernel used for the unordered discrete variables.
    tosum : bool, optional
        Whether or not to sum the calculated array of densities.  Default is
        True.

    Returns
    -------
    dens: array-like
        The generalized product kernel density estimator.

    Notes
    -----
    The formula for the multivariate kernel estimator for the pdf is:

    .. math:: f(x)=\frac{1}{nh_{1}...h_{q}}\sum_{i=1}^
                        {n}K\left(\frac{X_{i}-x}{h}\right)

    where

    .. math:: K\left(\frac{X_{i}-x}{h}\right) =
                k\left( \frac{X_{i1}-x_{1}}{h_{1}}\right)\times
                k\left( \frac{X_{i2}-x_{2}}{h_{2}}\right)\times...\times
                k\left(\frac{X_{iq}-x_{q}}{h_{q}}\right)
    RR   RŒ   R   NR   i   i    (	   t   dictR   Rb   Rr   t	   enumeratet   kernel_funcRŽ   t   prodt   sum(   R   R   t   data_predictR%   t   ckertypet   okertypet   ukertypet   tosumt   kertypest   Kvalt   iit   vtypet   funcRR   t   iscontinuoust   dens(    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyt   gpkeÊ  s    ,>(#()   Rƒ   t   statsmodels.compat.pythonR    R   RZ   t   numpyR   t   scipyR   t   scipy.stats.mstatsR   RT   R…   Rc   t   ImportErrorR*   t    R   R•   t
   wang_ryzint   aitchison_aitkenR   R   t   wang_ryzin_regt   gaussian_convolutiont   wang_ryzin_convolutiont   aitchison_aitken_convolutionR   t   aitchison_aitken_cdft   wang_ryzin_cdfR   R—   R   RA   t   objectRB   R)   R†   R^   R,   R¦   (    (    (    sE   lib/python2.7/site-packages/statsmodels/nonparametric/_kernel_base.pyt   <module>   s@   

											
	7ï;"		