ó
‡ˆ\c           @` sR  d  Z  d d l m Z m Z m Z d d l Z d d l m Z d d l Z	 d d l
 m Z d d l m Z d d l m Z d d	 l m Z d
 d l m Z d
 d l m Z d
 d l m Z d
 d l m Z m Z m Z d
 d l m Z d
 d l m Z e	 j e	 j  ƒ j! Z" d „  Z# d d d „ Z$ d „  Z% d „  Z& d e e f d „  ƒ  YZ' d S(   s<   
A Theil-Sen Estimator for Multiple Linear Regression Model
i    (   t   divisiont   print_functiont   absolute_importN(   t   combinations(   t   linalg(   t   binom(   t   get_lapack_funcsi   (   t   LinearModeli   (   t   RegressorMixin(   t   check_random_state(   t	   check_X_y(   t   Parallelt   delayedt   effective_n_jobs(   t   xrange(   t   ConvergenceWarningc         C` s)  |  | } t  j t  j | d d d ƒƒ } | t k } t | j ƒ  |  j d k  ƒ } | | } | | d d … t  j f } t j t  j | | d d ƒƒ } | t k rï t  j |  | d d … f | d d ƒt  j d | d d ƒ} n d } d } t	 d d | | ƒ | t
 d | | ƒ | S(   sø  Modified Weiszfeld step.

    This function defines one iteration step in order to approximate the
    spatial median (L1 median). It is a form of an iteratively re-weighted
    least squares method.

    Parameters
    ----------
    X : array, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    x_old : array, shape = [n_features]
        Current start vector.

    Returns
    -------
    x_new : array, shape = [n_features]
        New iteration step.

    References
    ----------
    - On Computation of Spatial Median for Robust Data Mining, 2005
      T. KÃ¤rkkÃ¤inen and S. Ã„yrÃ¤mÃ¶
      http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf
    i   t   axisi   i    Ng      ð?g        (   t   npt   sqrtt   sumt   _EPSILONt   intt   shapet   newaxisR   t   normt   maxt   min(   t   Xt   x_oldt   difft	   diff_normt   maskt   is_x_old_in_Xt   quotient_normt   new_direction(    (    s=   lib/python2.7/site-packages/sklearn/linear_model/theil_sen.pyt   _modified_weiszfeld_step   s    
"
"&i,  gü©ñÒMbP?c         C` s¾   |  j  d d k r, d t j |  j ƒ  ƒ f S| d C} t j |  d d ƒ} xf t | ƒ D]< } t |  | ƒ } t j | | d ƒ | k  rŽ PqX | } qX Wt j	 d j
 d | ƒ t ƒ | | f S(   s  Spatial median (L1 median).

    The spatial median is member of a class of so-called M-estimators which
    are defined by an optimization problem. Given a number of p points in an
    n-dimensional space, the point x minimizing the sum of all distances to the
    p other points is called spatial median.

    Parameters
    ----------
    X : array, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    max_iter : int, optional
        Maximum number of iterations.  Default is 300.

    tol : float, optional
        Stop the algorithm if spatial_median has converged. Default is 1.e-3.

    Returns
    -------
    spatial_median : array, shape = [n_features]
        Spatial median.

    n_iter : int
        Number of iterations needed.

    References
    ----------
    - On Computation of Spatial Median for Robust Data Mining, 2005
      T. KÃ¤rkkÃ¤inen and S. Ã„yrÃ¤mÃ¶
      http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf
    i   i   R   i    sY   Maximum number of iterations {max_iter} reached in spatial median for TheilSen regressor.t   max_iter(   R   R   t   mediant   ravelt   meant   rangeR#   R   t   warningst   warnt   formatR   (   R   R$   t   tolt   spatial_median_oldt   n_itert   spatial_median(    (    s=   lib/python2.7/site-packages/sklearn/linear_model/theil_sen.pyt   _spatial_medianO   s    "

c         C` s(   d d d | |  | d | d |  S(   s  Approximation of the breakdown point.

    Parameters
    ----------
    n_samples : int
        Number of samples.

    n_subsamples : int
        Number of subsamples to consider.

    Returns
    -------
    breakdown_point : float
        Approximation of breakdown point.
    i   g      à?(    (   t	   n_samplest   n_subsamples(    (    s=   lib/python2.7/site-packages/sklearn/linear_model/theil_sen.pyt   _breakdown_point…   s    c         C` s  t  | ƒ } |  j d | } | j d } t j | j d | f ƒ } t j | | f ƒ } t j t | | ƒ ƒ } t d | | f ƒ \ }	 xo t | ƒ D]a \ }
 } |  | d d … f | d d … | d … f <| | | | *|	 | | ƒ d |  | |
 <q˜ W| S(   s‹  Least Squares Estimator for TheilSenRegressor class.

    This function calculates the least squares method on a subset of rows of X
    and y defined by the indices array. Optionally, an intercept column is
    added if intercept is set to true.

    Parameters
    ----------
    X : array, shape = [n_samples, n_features]
        Design matrix, where n_samples is the number of samples and
        n_features is the number of features.

    y : array, shape = [n_samples]
        Target vector, where n_samples is the number of samples.

    indices : array, shape = [n_subpopulation, n_subsamples]
        Indices of all subsamples with respect to the chosen subpopulation.

    fit_intercept : bool
        Fit intercept or not.

    Returns
    -------
    weights : array, shape = [n_subpopulation, n_features + intercept]
        Solution matrix of n_subpopulation solved least square problems.
    i   i    t   gelssN(   R4   (	   R   R   R   t   emptyt   onest   zerosR   R   t	   enumerate(   R   t   yt   indicest   fit_interceptt
   n_featuresR2   t   weightst   X_subpopulationt   y_subpopulationt   lstsqt   indext   subset(    (    s=   lib/python2.7/site-packages/sklearn/linear_model/theil_sen.pyt   _lstsq™   s    ,t   TheilSenRegressorc        
   B` sD   e  Z d  Z e e d d d d d d e d „	 Z d „  Z d „  Z RS(   sÀ  Theil-Sen Estimator: robust multivariate regression model.

    The algorithm calculates least square solutions on subsets with size
    n_subsamples of the samples in X. Any value of n_subsamples between the
    number of features and samples leads to an estimator with a compromise
    between robustness and efficiency. Since the number of least square
    solutions is "n_samples choose n_subsamples", it can be extremely large
    and can therefore be limited with max_subpopulation. If this limit is
    reached, the subsets are chosen randomly. In a final step, the spatial
    median (or L1 median) is calculated of all least square solutions.

    Read more in the :ref:`User Guide <theil_sen_regression>`.

    Parameters
    ----------
    fit_intercept : boolean, optional, default True
        Whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations.

    copy_X : boolean, optional, default True
        If True, X will be copied; else, it may be overwritten.

    max_subpopulation : int, optional, default 1e4
        Instead of computing with a set of cardinality 'n choose k', where n is
        the number of samples and k is the number of subsamples (at least
        number of features), consider only a stochastic subpopulation of a
        given maximal size if 'n choose k' is larger than max_subpopulation.
        For other than small problem sizes this parameter will determine
        memory usage and runtime if n_subsamples is not changed.

    n_subsamples : int, optional, default None
        Number of samples to calculate the parameters. This is at least the
        number of features (plus 1 if fit_intercept=True) and the number of
        samples as a maximum. A lower number leads to a higher breakdown
        point and a low efficiency while a high number leads to a low
        breakdown point and a high efficiency. If None, take the
        minimum number of subsamples leading to maximal robustness.
        If n_subsamples is set to n_samples, Theil-Sen is identical to least
        squares.

    max_iter : int, optional, default 300
        Maximum number of iterations for the calculation of spatial median.

    tol : float, optional, default 1.e-3
        Tolerance when calculating spatial median.

    random_state : int, RandomState instance or None, optional, default None
        A random number generator instance to define the state of the random
        permutations generator.  If int, random_state is the seed used by the
        random number generator; If RandomState instance, random_state is the
        random number generator; If None, the random number generator is the
        RandomState instance used by `np.random`.

    n_jobs : int or None, optional (default=None)
        Number of CPUs to use during the cross validation.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : boolean, optional, default False
        Verbose mode when fitting the model.

    Attributes
    ----------
    coef_ : array, shape = (n_features)
        Coefficients of the regression model (median of distribution).

    intercept_ : float
        Estimated intercept of regression model.

    breakdown_ : float
        Approximated breakdown point.

    n_iter_ : int
        Number of iterations needed for the spatial median.

    n_subpopulation_ : int
        Number of combinations taken into account from 'n choose k', where n is
        the number of samples and k is the number of subsamples.

    Examples
    --------
    >>> from sklearn.linear_model import TheilSenRegressor
    >>> from sklearn.datasets import make_regression
    >>> X, y = make_regression(
    ...     n_samples=200, n_features=2, noise=4.0, random_state=0)
    >>> reg = TheilSenRegressor(random_state=0).fit(X, y)
    >>> reg.score(X, y) # doctest: +ELLIPSIS
    0.9884...
    >>> reg.predict(X[:1,])
    array([-31.5871...])

    References
    ----------
    - Theil-Sen Estimators in a Multiple Linear Regression Model, 2009
      Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang
      http://home.olemiss.edu/~xdang/papers/MTSE.pdf
    g     ˆÃ@i,  gü©ñÒMbP?c
   
      C` s[   | |  _  | |  _ t | ƒ |  _ | |  _ | |  _ | |  _ | |  _ | |  _ |	 |  _	 d  S(   N(
   R;   t   copy_XR   t   max_subpopulationR2   R$   R,   t   random_statet   n_jobst   verbose(
   t   selfR;   RE   RF   R2   R$   R,   RG   RH   RI   (    (    s=   lib/python2.7/site-packages/sklearn/linear_model/theil_sen.pyt   __init__*  s    							c         C` sL  |  j  } |  j r | d } n | } | d  k	 rÐ | | k rX t d j | | ƒ ƒ ‚ n  | | k r¦ | | k rÍ |  j r d n d } t d j | | | ƒ ƒ ‚ qÍ qß | | k rß t d j | | ƒ ƒ ‚ qß n t | | ƒ } |  j d k r	t d j |  j ƒ ƒ ‚ n  t d t j	 t
 | | ƒ ƒ ƒ } t t |  j | ƒ ƒ } | | f S(	   Ni   s=   Invalid parameter since n_subsamples > n_samples ({0} > {1}).s   +1t    sA   Invalid parameter since n_features{0} > n_subsamples ({1} > {2}).s\   Invalid parameter since n_subsamples != n_samples ({0} != {1}) while n_samples < n_features.i    s3   Subpopulation must be strictly positive ({0} <= 0).(   R2   R;   t   Nonet
   ValueErrorR+   R   RF   R   R   t   rintR   R   (   RJ   R1   R<   R2   t   n_dimt   plus_1t   all_combinationst   n_subpopulation(    (    s=   lib/python2.7/site-packages/sklearn/linear_model/theil_sen.pyt   _check_subparams7  s2    						!c         ` s  t  ˆ j ƒ } t ˆ  ˆ d t ƒ\ ‰  ‰ ˆ  j \ } } ˆ j | | ƒ \ } ˆ _ t | | ƒ ˆ _ ˆ j	 r× t
 d j ˆ j ƒ ƒ t
 d j | ƒ ƒ t ˆ j | ƒ } t
 d j | ƒ ƒ t
 d j ˆ j ƒ ƒ n  t j t | | ƒ ƒ ˆ j k rt t t | ƒ | ƒ ƒ } n7 g  t ˆ j ƒ D]! }	 | j | d | d t ƒ^ q&} t ˆ j ƒ }
 t j | |
 ƒ ‰ t d |
 d	 ˆ j	 ƒ ‡  ‡ ‡ ‡ f d
 †  t |
 ƒ Dƒ ƒ } t j | ƒ } t | d ˆ j d ˆ j ƒ\ ˆ _ } ˆ j r| d ˆ _  | d ˆ _! n d ˆ _  | ˆ _! ˆ S(   s(  Fit linear model.

        Parameters
        ----------
        X : numpy array of shape [n_samples, n_features]
            Training data
        y : numpy array of shape [n_samples]
            Target values

        Returns
        -------
        self : returns an instance of self.
        t	   y_numerics   Breakdown point: {0}s   Number of samples: {0}s   Tolerable outliers: {0}s   Number of subpopulations: {0}t   sizet   replaceRH   RI   c         3` s1   |  ]' } t  t ƒ ˆ  ˆ ˆ | ˆ j ƒ Vq d  S(   N(   R   RC   R;   (   t   .0t   job(   R   t
   index_listRJ   R9   (    s=   lib/python2.7/site-packages/sklearn/linear_model/theil_sen.pys	   <genexpr>…  s   R$   R,   i    i   g        ("   R	   RG   R
   t   TrueR   RT   t   n_subpopulation_R3   t
   breakdown_RI   t   printR+   R   R   RO   R   RF   t   listR   R(   t   choicet   FalseR   RH   t   array_splitR   t   vstackR0   R$   R,   t   n_iter_R;   t
   intercept_t   coef_(   RJ   R   R9   RG   R1   R<   R2   t   tol_outliersR:   t   _RH   R=   t   coefs(    (   R   RZ   RJ   R9   s=   lib/python2.7/site-packages/sklearn/linear_model/theil_sen.pyt   fit\  sB    			!4						N(	   t   __name__t
   __module__t   __doc__R[   RM   Ra   RK   RT   Rj   (    (    (    s=   lib/python2.7/site-packages/sklearn/linear_model/theil_sen.pyRD   Æ   s   b		%((   Rm   t
   __future__R    R   R   R)   t	   itertoolsR   t   numpyR   t   scipyR   t   scipy.specialR   t   scipy.linalg.lapackR   t   baseR   R   t   utilsR	   R
   t   utils._joblibR   R   R   t   externals.six.movesR   R(   t
   exceptionsR   t   finfot   doublet   epsR   R#   R0   R3   RC   RD   (    (    (    s=   lib/python2.7/site-packages/sklearn/linear_model/theil_sen.pyt   <module>   s(   	06		-