ó
‡ˆ\c           @   sT  d  Z  d d l Z d d l Z d d l Z d d l j Z d d l m Z d d l m	 Z	 d d l
 m Z m Z d d l m Z d d l m Z d d	 l m Z d d
 l m Z d d l m Z d d l m Z d d l m Z e j j Z e j j Z d d g Z d „  Z d „  Z d „  Z d e e f d „  ƒ  YZ  d e e f d „  ƒ  YZ! d S(   s)   Transformers for missing value imputationiÿÿÿÿN(   t   sparse(   t   statsi   (   t   BaseEstimatort   TransformerMixin(   t   check_array(   t   _get_median(   t   check_is_fitted(   t   FLOAT_DTYPES(   t   _object_dtype_isnan(   t   is_scalar_nan(   t   sixt   MissingIndicatort   SimpleImputerc         C   sM   |  j  j d k rI t | t j ƒ rI t d j |  j  t | ƒ ƒ ƒ ‚ n  d  S(   Nt   ft   it   usn   'X' and 'missing_values' types are expected to be both numerical. Got X.dtype={} and  type(missing_values)={}.(   R   R   R   (   t   dtypet   kindt
   isinstancet   numberst   Realt
   ValueErrort   formatt   type(   t   Xt   missing_values(    (    s-   lib/python2.7/site-packages/sklearn/impute.pyt   _check_inputs_dtype!   s    	c         C   st   t  | ƒ r` |  j j d k r+ t j |  ƒ S|  j j d k rS t j |  j d t ƒSt |  ƒ Sn t j	 |  | ƒ Sd S(   s-   Compute the boolean mask X == missing_values.R   R   R   R   N(   R   R   (
   R	   R   R   t   npt   isnant   zerost   shapet   boolR   t   equal(   R   t   value_to_mask(    (    s-   lib/python2.7/site-packages/sklearn/impute.pyt	   _get_mask*   s    c         C   sÒ   |  j  d k r` t j ƒ  $ t j d t ƒ t j |  ƒ } Wd QX| d d } | d d } n d } d } | d k r‹ | d k r‹ t j S| | k  r› | S| | k r« | S| | k rÎ | | k  rÇ | S| Sn  d S(   s   Compute the most frequent value in a 1d array extended with
       [extra_value] * n_repeat, where extra_value is assumed to be not part
       of the array.i    t   ignoreNi   (	   t   sizet   warningst   catch_warningst   simplefiltert   RuntimeWarningR   t   modeR   t   nan(   t   arrayt   extra_valuet   n_repeatR)   t   most_frequent_valuet   most_frequent_count(    (    s-   lib/python2.7/site-packages/sklearn/impute.pyt   _most_frequent;   s$    c           B   sY   e  Z d  Z e j d d	 d e d „ Z d „  Z d	 d „ Z	 d „  Z
 d „  Z d „  Z RS(
   s_
  Imputation transformer for completing missing values.

    Read more in the :ref:`User Guide <impute>`.

    Parameters
    ----------
    missing_values : number, string, np.nan (default) or None
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed.

    strategy : string, optional (default="mean")
        The imputation strategy.

        - If "mean", then replace missing values using the mean along
          each column. Can only be used with numeric data.
        - If "median", then replace missing values using the median along
          each column. Can only be used with numeric data.
        - If "most_frequent", then replace missing using the most frequent
          value along each column. Can be used with strings or numeric data.
        - If "constant", then replace missing values with fill_value. Can be
          used with strings or numeric data.

        .. versionadded:: 0.20
           strategy="constant" for fixed value imputation.

    fill_value : string or numerical value, optional (default=None)
        When strategy == "constant", fill_value is used to replace all
        occurrences of missing_values.
        If left to the default, fill_value will be 0 when imputing numerical
        data and "missing_value" for strings or object data types.

    verbose : integer, optional (default=0)
        Controls the verbosity of the imputer.

    copy : boolean, optional (default=True)
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible. Note that, in the following cases,
        a new copy will always be made, even if `copy=False`:

        - If X is not an array of floating values;
        - If X is encoded as a CSR matrix.

    Attributes
    ----------
    statistics_ : array of shape (n_features,)
        The imputation fill value for each feature.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.impute import SimpleImputer
    >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
    ... # doctest: +NORMALIZE_WHITESPACE
    SimpleImputer(copy=True, fill_value=None, missing_values=nan,
           strategy='mean', verbose=0)
    >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
    >>> print(imp_mean.transform(X))
    ... # doctest: +NORMALIZE_WHITESPACE
    [[ 7.   2.   3. ]
     [ 4.   3.5  6. ]
     [10.   3.5  9. ]]

    Notes
    -----
    Columns which only contained missing values at `fit` are discarded upon
    `transform` if strategy is not "constant".

    t   meani    c         C   s1   | |  _  | |  _ | |  _ | |  _ | |  _ d  S(   N(   R   t   strategyt
   fill_valuet   verboset   copy(   t   selfR   R2   R3   R4   R5   (    (    s-   lib/python2.7/site-packages/sklearn/impute.pyt   __init__£   s
    				c      
   C   s9  d d d d g } |  j  | k r? t d j | |  j  ƒ ƒ ‚ n  |  j  d k rW d  } n t } t |  j ƒ su t } n d } y+ t | d d d	 | d
 | d |  j	 ƒ} WnO t k
 r÷ } d t
 | ƒ k rî t d j |  j  | j j ƒ ƒ ‚ qø | ‚ n Xt | |  j ƒ | j j d k r5t d j | j ƒ ƒ ‚ n  | S(   NR1   t   mediant   most_frequentt   constants4   Can only use these strategies: {0}  got strategy={1}s	   allow-nant   accept_sparset   cscR   t   force_all_finiteR5   s   could not convertsF   Cannot use {0} strategy with non-numeric data. Received datatype :{1}.R   R   R   t   Osû   SimpleImputer does not support data with dtype {0}. Please provide either a numeric array (with a floating point or integer dtype) or categorical data represented either as an array with integer dtype or an array of string values with an object dtype.(   R9   R:   (   R   R   R   R>   (   R2   R   R   t   NoneR   R	   R   t   TrueR   R5   t   strR   R   R   (   R6   R   t   allowed_strategiesR   R=   t   ve(    (    s-   lib/python2.7/site-packages/sklearn/impute.pyt   _validate_input«   s0    				
	c         C   s  |  j  | ƒ } |  j d	 k rB | j j d
 k r9 d } qK d } n	 |  j } |  j d k r— | j j d k r— t | t j ƒ r— t	 d j
 | ƒ ƒ ‚ n  t j | ƒ rè |  j d k rÄ t	 d ƒ ‚ q	|  j | |  j |  j | ƒ |  _ n! |  j | |  j |  j | ƒ |  _ |  S(   sI  Fit the imputer on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : SimpleImputer
        R   R   R   i    t   missing_valueR:   sT   'fill_value'={0} is invalid. Expected a numerical value when imputing numerical datasd   Imputation not possible when missing_values == 0 and input is sparse. Provide a dense array instead.N(   R   R   R   (   R   R   R   (   RD   R3   R?   R   R   R2   R   R   R   R   R   R    t   issparseR   t   _sparse_fitt   statistics_t
   _dense_fit(   R6   R   t   yR3   (    (    s-   lib/python2.7/site-packages/sklearn/impute.pyt   fitÒ   s.    						c         C   s  t  | j | ƒ } | j d t j | j ƒ } t j | j d ƒ } | d k ra | j | ƒ n*x't | j d ƒ D]} | j | j | | j | d !}	 | | j | | j | d !}
 |	 |
 }	 t  |	 d ƒ } |	 | }	 | j	 ƒ  } | | | } | d k r@|	 j
 | } | d k r)t j n |	 j	 ƒ  | | | <qu | d k rbt |	 | ƒ | | <qu | d k ru t |	 d | ƒ | | <qu qu W| S(   s#   Fit the transformer on sparse data.i    i   R:   R1   R8   R9   (   R"   t   dataR   R   t   difft   indptrt   emptyt   fillt   ranget   sumR$   R*   R   R0   (   R6   R   R2   R   R3   t	   mask_datat   n_implicit_zerost
   statisticsR   t   columnt   mask_columnt
   mask_zerost   n_explicit_zerost   n_zerost   s(    (    s-   lib/python2.7/site-packages/sklearn/impute.pyRG     s0    ",c         C   sÕ  t  | | ƒ } t j | d | ƒ} | d k rw t j j | d d ƒ} t j j | ƒ } t j | t j j | ƒ <| S| d k rÊ t j j | d d ƒ}	 t j j |	 ƒ }
 t j |
 t j j	 |	 ƒ <|
 S| d k r¥| j
 ƒ  } | j
 ƒ  } | j j d k rt j | j d d t ƒ} n t j | j d ƒ } xi t t | | ƒ ƒ D]P \ } \ } } t j | ƒ j t j ƒ } | | } t | t j d ƒ | | <qMW| S| d	 k rÑt j | j d
 | d | j ƒSd S(   s"   Fit the transformer on dense data.t   maskR1   t   axisi    R8   R9   R>   R   R:   i   N(   R"   t   mat   masked_arrayR   R1   t   getdataR*   t   getmaskR8   t   getmaskarrayt	   transposeR   R   RO   R   t   objectt	   enumeratet   zipt   logical_nott   astypeR   R0   t   full(   R6   R   R2   R   R3   R\   t   masked_Xt   mean_maskedR1   t   median_maskedR8   R9   R   t   rowt   row_mask(    (    s-   lib/python2.7/site-packages/sklearn/impute.pyRI   -  s2    *
c         C   s)  t  |  d ƒ |  j | ƒ } |  j } | j d | j d k ri t d | j d |  j j d f ƒ ‚ n  |  j d k r | } n– t | t j ƒ } t j	 | ƒ } | | } t j
 | ƒ } | j ƒ  rt j | j d ƒ | } |  j rþ t j d | ƒ n  | d d … | f } n  t j | ƒ rÀ|  j d k rDt d ƒ ‚ q%t | j |  j ƒ } t j t j t | j ƒ d d	 t j ƒt j | j ƒ ƒ | }	 | |	 j | j d
 t ƒ| j | <ne t | |  j ƒ } t j | d d ƒ}
 t j | |
 ƒ } t j | j ƒ  ƒ d d d … } | | | <| S(   s¾   Impute all missing values in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data to complete.
        RH   i   i    s)   X has %d features per sample, expected %dR:   s-   Deleting features without observed values: %sNsd   Imputation not possible when missing_values == 0 and input is sparse. Provide a dense array instead.R   R5   R]   iÿÿÿÿ(   R   RD   RH   R   R   R2   R"   R   R*   Rg   t   flatnonzerot   anyt   arangeR4   R%   t   warnR    RF   R   RL   t   repeatt   lenRN   t   intRM   Rh   R   t   FalseRR   t   whereRc   (   R6   R   RU   t   valid_statisticst   invalid_maskt
   valid_maskt   valid_statistics_indexest   missingR\   t   indexest	   n_missingt   valuest   coordinates(    (    s-   lib/python2.7/site-packages/sklearn/impute.pyt	   transforma  s@    	$	
		("
N(   t   __name__t
   __module__t   __doc__R   R*   R?   R@   R7   RD   RK   RG   RI   R   (    (    (    s-   lib/python2.7/site-packages/sklearn/impute.pyR   ]   s   E		'5	&	4c           B   sY   e  Z d  Z e j d d e d „ Z d „  Z d „  Z d	 d „ Z
 d „  Z d	 d „ Z RS(
   s	  Binary indicators for missing values.

    Note that this component typically should not not be used in a vanilla
    :class:`Pipeline` consisting of transformers and a classifier, but rather
    could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`.

    Read more in the :ref:`User Guide <impute>`.

    Parameters
    ----------
    missing_values : number, string, np.nan (default) or None
        The placeholder for the missing values. All occurrences of
        `missing_values` will be indicated (True in the output array), the
        other values will be marked as False.

    features : str, optional
        Whether the imputer mask should represent all or a subset of
        features.

        - If "missing-only" (default), the imputer mask will only represent
          features containing missing values during fit time.
        - If "all", the imputer mask will represent all features.

    sparse : boolean or "auto", optional
        Whether the imputer mask format should be sparse or dense.

        - If "auto" (default), the imputer mask will be of same type as
          input.
        - If True, the imputer mask will be a sparse matrix.
        - If False, the imputer mask will be a numpy array.

    error_on_new : boolean, optional
        If True (default), transform will raise an error when there are
        features with missing values in transform that have no missing values
        in fit. This is applicable only when ``features="missing-only"``.

    Attributes
    ----------
    features_ : ndarray, shape (n_missing_features,) or (n_features,)
        The features indices which will be returned when calling ``transform``.
        They are computed during ``fit``. For ``features='all'``, it is
        to ``range(n_features)``.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.impute import MissingIndicator
    >>> X1 = np.array([[np.nan, 1, 3],
    ...                [4, 0, np.nan],
    ...                [8, 1, 0]])
    >>> X2 = np.array([[5, 1, np.nan],
    ...                [np.nan, 2, 3],
    ...                [2, 4, 0]])
    >>> indicator = MissingIndicator()
    >>> indicator.fit(X1)
    MissingIndicator(error_on_new=True, features='missing-only',
             missing_values=nan, sparse='auto')
    >>> X2_tr = indicator.transform(X2)
    >>> X2_tr
    array([[False,  True],
           [ True, False],
           [False, False]])

    s   missing-onlyt   autoc         C   s(   | |  _  | |  _ | |  _ | |  _ d  S(   N(   R   t   featuresR    t   error_on_new(   R6   R   R†   R    R‡   (    (    s-   lib/python2.7/site-packages/sklearn/impute.pyR7   Þ  s    			c         C   s‘  t  j | ƒ r|  j d k rt | j |  j ƒ } | j d k rK t  j n t  j } | | | j j	 ƒ  | j
 j	 ƒ  f d | j d t ƒ} | j	 ƒ  } | j ƒ  | j d k rÊ t j t j | j
 ƒ ƒ n t j | j ƒ } |  j  t k rú | j ƒ  } q‡| j d k r‡| j ƒ  } q‡nl t  j | ƒ r9| j ƒ  } n  t | |  j ƒ } t j | j d d ƒ ƒ } |  j  t k r‡t  j | ƒ } n  | | f S(   sµ  Compute the imputer mask and the indices of the features
        containing missing values.

        Parameters
        ----------
        X : {ndarray or sparse matrix}, shape (n_samples, n_features)
            The input data with missing values. Note that ``X`` has been
            checked in ``fit`` and ``transform`` before to call this function.

        Returns
        -------
        imputer_mask : {ndarray or sparse matrix}, shape (n_samples, n_features) or (n_samples, n_features_with_missing)
            The imputer mask of the original data.

        features_with_missing : ndarray, shape (n_features_with_missing)
            The features containing missing values.

        i    t   csrR   R   R<   R]   (   R    RF   R   R"   RL   R   t
   csr_matrixt
   csc_matrixt   indicesR5   RN   R   R   t   eliminate_zerosR   Ro   RM   t   uniqueRv   t   toarrayt   tocscRR   R@   (   R6   R   R\   t   sparse_constructort   imputer_maskt   missing_values_maskt   features_with_missing(    (    s-   lib/python2.7/site-packages/sklearn/impute.pyt   _get_missing_features_infoå  s,    	!
*c         C   s}   t  |  j ƒ s t } n d } t | d d d d  d | ƒ} t | |  j ƒ | j j d k ry t d j	 | j ƒ ƒ ‚ n  | S(   Ns	   allow-nanR;   R<   Rˆ   R   R=   R   R   R   R>   sþ   MissingIndicator does not support data with dtype {0}. Please provide either a numeric array (with a floating point or integer dtype) or categorical data represented either as an array with integer dtype or an array of string values with an object dtype.(   R<   Rˆ   (   R   R   R   R>   (
   R	   R   R@   R   R?   R   R   R   R   R   (   R6   R   R=   (    (    s-   lib/python2.7/site-packages/sklearn/impute.pyRD     s    			c         C   sÕ   |  j  | ƒ } | j d |  _ |  j d k rI t d j |  j ƒ ƒ ‚ n  t |  j t j	 ƒ rm |  j d k p| t |  j t
 ƒ sš t d j |  j ƒ ƒ ‚ n  |  j d k r¼ |  j | ƒ d n t j |  j ƒ |  _ |  S(   s`  Fit the transformer on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : object
            Returns self.
        i   s   missing-onlyt   allsD   'features' has to be either 'missing-only' or 'all'. Got {} instead.R…   s9   'sparse' has to be a boolean or 'auto'. Got {!r} instead.(   s   missing-onlyR•   (   RD   R   t   _n_featuresR†   R   R   R   R    R
   t   string_typesR   R”   R   Rq   t	   features_(   R6   R   RJ   (    (    s-   lib/python2.7/site-packages/sklearn/impute.pyRK   -  s    	!	"c         C   sô   t  |  d ƒ |  j | ƒ } | j d |  j k rA t d ƒ ‚ n  |  j | ƒ \ } } |  j d k rð t j | |  j	 ƒ } |  j
 rª | j d k rª t d j | ƒ ƒ ‚ n  |  j	 j d k rð |  j	 j |  j k  rð | d d … |  j	 f } qð n  | S(   s—  Generate missing values indicator for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        Xt : {ndarray or sparse matrix}, shape (n_samples, n_features)
            The missing indicator for input data. The data type of ``Xt``
            will be boolean.

        R˜   i   s9   X has a different number of features than during fitting.s   missing-onlyi    sS   The features {} have missing values in transform but have no missing values in fit.N(   R   RD   R   R–   R   R”   R†   R   t	   setdiff1dR˜   R‡   R$   R   (   R6   R   R‘   R†   t   features_diff_fit_trans(    (    s-   lib/python2.7/site-packages/sklearn/impute.pyR   M  s    	c         C   s   |  j  | | ƒ j | ƒ S(   s—  Generate missing values indicator for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        Xt : {ndarray or sparse matrix}, shape (n_samples, n_features)
            The missing indicator for input data. The data type of ``Xt``
            will be boolean.

        (   RK   R   (   R6   R   RJ   (    (    s-   lib/python2.7/site-packages/sklearn/impute.pyt   fit_transformr  s    N(   R‚   Rƒ   R„   R   R*   R@   R7   R”   RD   R?   RK   R   R›   (    (    (    s-   lib/python2.7/site-packages/sklearn/impute.pyR   œ  s   @		7	 	%("   R„   R%   R   t   numpyR   t   numpy.maR^   t   scipyR    R   t   baseR   R   t   utilsR   t   utils.sparsefuncsR   t   utils.validationR   R   t   utils.fixesR   R	   t	   externalsR
   t   movesRf   t   mapt   __all__R   R"   R0   R   R   (    (    (    s-   lib/python2.7/site-packages/sklearn/impute.pyt   <module>   s0   					"ÿ @