B
    	\`                 @   s   d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ejjZejjZddgZdd Zdd Zdd ZG dd deeZ G dd deeZ!dS )z)Transformers for missing value imputation    N)sparse)stats   )BaseEstimatorTransformerMixin)check_array)_get_median)check_is_fitted)FLOAT_DTYPES)_object_dtype_isnan)is_scalar_nan)sixMissingIndicatorSimpleImputerc             C   s2   | j jdkr.t|tjs.td| j t|d S )N)fiuzn'X' and 'missing_values' types are expected to be both numerical. Got X.dtype={} and  type(missing_values)={}.)dtypekind
isinstancenumbersReal
ValueErrorformattype)Xmissing_values r   -lib/python3.7/site-packages/sklearn/impute.py_check_inputs_dtype!   s    r   c             C   sT   t |rD| jjdkrt| S | jjdkr:tj| jtdS t| S nt	| |S dS )z-Compute the boolean mask X == missing_values.r   )r   r   )r   N)
r   r   r   npZisnanZzerosshapeboolr   Zequal)r   Zvalue_to_maskr   r   r   	_get_mask*   s    

r#   c          	   C   s   | j dkrNt  tdt t| }W dQ R X |d d }|d d }nd}d}|dkrl|dkrltjS ||k rx|S ||kr|S ||kr||k r|S |S dS )zCompute the most frequent value in a 1d array extended with
       [extra_value] * n_repeat, where extra_value is assumed to be not part
       of the array.r   ignoreNr   )	sizewarningscatch_warningssimplefilterRuntimeWarningr   moder    nan)ZarrayZextra_valueZn_repeatr*   Zmost_frequent_valueZmost_frequent_countr   r   r   _most_frequent;   s$    

r,   c               @   sP   e Zd ZdZejddddfddZdd	 Zdd
dZdd Z	dd Z
dd ZdS )r   a_
  Imputation transformer for completing missing values.

    Read more in the :ref:`User Guide <impute>`.

    Parameters
    ----------
    missing_values : number, string, np.nan (default) or None
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed.

    strategy : string, optional (default="mean")
        The imputation strategy.

        - If "mean", then replace missing values using the mean along
          each column. Can only be used with numeric data.
        - If "median", then replace missing values using the median along
          each column. Can only be used with numeric data.
        - If "most_frequent", then replace missing using the most frequent
          value along each column. Can be used with strings or numeric data.
        - If "constant", then replace missing values with fill_value. Can be
          used with strings or numeric data.

        .. versionadded:: 0.20
           strategy="constant" for fixed value imputation.

    fill_value : string or numerical value, optional (default=None)
        When strategy == "constant", fill_value is used to replace all
        occurrences of missing_values.
        If left to the default, fill_value will be 0 when imputing numerical
        data and "missing_value" for strings or object data types.

    verbose : integer, optional (default=0)
        Controls the verbosity of the imputer.

    copy : boolean, optional (default=True)
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible. Note that, in the following cases,
        a new copy will always be made, even if `copy=False`:

        - If X is not an array of floating values;
        - If X is encoded as a CSR matrix.

    Attributes
    ----------
    statistics_ : array of shape (n_features,)
        The imputation fill value for each feature.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.impute import SimpleImputer
    >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
    ... # doctest: +NORMALIZE_WHITESPACE
    SimpleImputer(copy=True, fill_value=None, missing_values=nan,
           strategy='mean', verbose=0)
    >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
    >>> print(imp_mean.transform(X))
    ... # doctest: +NORMALIZE_WHITESPACE
    [[ 7.   2.   3. ]
     [ 4.   3.5  6. ]
     [10.   3.5  9. ]]

    Notes
    -----
    Columns which only contained missing values at `fit` are discarded upon
    `transform` if strategy is not "constant".

    meanNr   Tc             C   s"   || _ || _|| _|| _|| _d S )N)r   strategy
fill_valueverbosecopy)selfr   r.   r/   r0   r1   r   r   r   __init__   s
    zSimpleImputer.__init__c          
   C   s   ddddg}| j |kr(td|| j | j dkr8d }nt}t| jsLd}nd}yt|d	||| jd
}W nJ tk
r } z,dt|krtd| j |j	j
n|W d d }~X Y nX t|| j |j	j
dkrtd|j	|S )Nr-   medianmost_frequentconstantz4Can only use these strategies: {0}  got strategy={1})r5   r6   Tz	allow-nancsc)accept_sparser   force_all_finiter1   zcould not convertzFCannot use {0} strategy with non-numeric data. Received datatype :{1}.)r   r   r   OzSimpleImputer does not support data with dtype {0}. Please provide either a numeric array (with a floating point or integer dtype) or categorical data represented either as an array with integer dtype or an array of string values with an object dtype.)r.   r   r   r
   r   r   r   r1   strr   r   r   )r2   r   Zallowed_strategiesr   r9   Zver   r   r   _validate_input   s0    




zSimpleImputer._validate_inputc             C   s   |  |}| jdkr,|jjdkr&d}q2d}n| j}| jdkrb|jjdkrbt|tjsbtd	|t
|r| jdkrtdq| || j| j|| _n| || j| j|| _| S )aI  Fit the imputer on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : SimpleImputer
        N)r   r   r   r   Zmissing_valuer6   zT'fill_value'={0} is invalid. Expected a numerical value when imputing numerical datazdImputation not possible when missing_values == 0 and input is sparse. Provide a dense array instead.)r<   r/   r   r   r.   r   r   r   r   r   r   issparser   _sparse_fitstatistics_
_dense_fit)r2   r   yr/   r   r   r   fit   s.    






zSimpleImputer.fitc             C   s2  t |j|}|jd t|j }t|jd }|dkrF|| nxt|jd D ]}|j|j| |j|d   }	||j| |j|d   }
|	|
  }	t |	d}|	|  }	|	 }|| | }|dkr|	j
| }|dkrtjn
|		 | ||< qV|dkrt|	|||< qV|dkrVt|	d|||< qVW |S )z#Fit the transformer on sparse data.r   r   r6   r-   r4   r5   )r#   datar!   r    diffindptremptyZfillrangesumr%   r+   r   r,   )r2   r   r.   r   r/   Z	mask_dataZn_implicit_zeros
statisticsr   columnZmask_columnZ
mask_zerosZn_explicit_zerosZn_zerossr   r   r   r>     s0    



 
zSimpleImputer._sparse_fitc             C   s^  t ||}tj||d}|dkrRtjj|dd}tj|}tj|tj|< |S |dkrtjj|dd}	tj|	}
tj|
tj	|	< |
S |dkr8|
 }|
 }|jjdkrtj|jd td}nt|jd }xZtt|d	d	 |d	d	 D ]8\}\}}t|tj}|| }t|tjd||< qW |S |d
krZtj|jd ||jdS d	S )z"Fit the transformer on dense data.)maskr-   r   )axisr4   r5   r:   )r   Nr6   r   )r#   maZmasked_arrayr    r-   Zgetdatar+   Zgetmaskr4   Zgetmaskarray	transposer   r   rF   r!   object	enumerateziplogical_notastyper"   r,   Zfull)r2   r   r.   r   r/   rL   Zmasked_XZmean_maskedr-   Zmedian_maskedr4   r5   r   rowZrow_maskr   r   r   r@   -  s2    

,
zSimpleImputer._dense_fitc             C   s  t | d | |}| j}|jd |jd krLtd|jd | jjd f | jdkr\|}nht|tj}t	|}|| }t
|}| rt|jd | }| jrtd|  |dd|f }t|r<| jdkrtdnVt|j| j}ttjt|jd tjd	t|j| }	||	 j|jd
d|j|< nFt|| j}tj|dd}
t||
}t| ddd }|||< |S )zImpute all missing values in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data to complete.
        r?   r   r   z)X has %d features per sample, expected %dr6   z-Deleting features without observed values: %sNzdImputation not possible when missing_values == 0 and input is sparse. Provide a dense array instead.)r   F)r1   )rM   )r	   r<   r?   r!   r   r.   r#   r    r+   rS   flatnonzeroanyaranger0   r&   warnr   r=   r   rC   repeatlenrE   intrD   rT   r   rH   whererO   )r2   r   rI   Zvalid_statisticsZinvalid_maskZ
valid_maskZvalid_statistics_indexesZmissingrL   ZindexesZ	n_missingvaluesZcoordinatesr   r   r   	transforma  s@    






zSimpleImputer.transform)N)__name__
__module____qualname____doc__r    r+   r3   r<   rB   r>   r@   r`   r   r   r   r   r   ]   s   E'
5&4c               @   sP   e Zd ZdZejdddfddZdd Zd	d
 ZdddZ	dd Z
dddZdS )r   a	  Binary indicators for missing values.

    Note that this component typically should not not be used in a vanilla
    :class:`Pipeline` consisting of transformers and a classifier, but rather
    could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`.

    Read more in the :ref:`User Guide <impute>`.

    Parameters
    ----------
    missing_values : number, string, np.nan (default) or None
        The placeholder for the missing values. All occurrences of
        `missing_values` will be indicated (True in the output array), the
        other values will be marked as False.

    features : str, optional
        Whether the imputer mask should represent all or a subset of
        features.

        - If "missing-only" (default), the imputer mask will only represent
          features containing missing values during fit time.
        - If "all", the imputer mask will represent all features.

    sparse : boolean or "auto", optional
        Whether the imputer mask format should be sparse or dense.

        - If "auto" (default), the imputer mask will be of same type as
          input.
        - If True, the imputer mask will be a sparse matrix.
        - If False, the imputer mask will be a numpy array.

    error_on_new : boolean, optional
        If True (default), transform will raise an error when there are
        features with missing values in transform that have no missing values
        in fit. This is applicable only when ``features="missing-only"``.

    Attributes
    ----------
    features_ : ndarray, shape (n_missing_features,) or (n_features,)
        The features indices which will be returned when calling ``transform``.
        They are computed during ``fit``. For ``features='all'``, it is
        to ``range(n_features)``.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.impute import MissingIndicator
    >>> X1 = np.array([[np.nan, 1, 3],
    ...                [4, 0, np.nan],
    ...                [8, 1, 0]])
    >>> X2 = np.array([[5, 1, np.nan],
    ...                [np.nan, 2, 3],
    ...                [2, 4, 0]])
    >>> indicator = MissingIndicator()
    >>> indicator.fit(X1)
    MissingIndicator(error_on_new=True, features='missing-only',
             missing_values=nan, sparse='auto')
    >>> X2_tr = indicator.transform(X2)
    >>> X2_tr
    array([[False,  True],
           [ True, False],
           [False, False]])

    zmissing-onlyautoTc             C   s   || _ || _|| _|| _d S )N)r   featuresr   error_on_new)r2   r   rf   r   rg   r   r   r   r3     s    zMissingIndicator.__init__c             C   s  t |r| jdkrt|j| j}|jdkr2t jnt j}|||j	 |j
	 f|jtd}|	 }|  |jdkrtt|j
n
t|j}| j dkr| }q|jdkr| }nDt |r| }t|| j}t|jdd}| j dkrt |}||fS )a  Compute the imputer mask and the indices of the features
        containing missing values.

        Parameters
        ----------
        X : {ndarray or sparse matrix}, shape (n_samples, n_features)
            The input data with missing values. Note that ``X`` has been
            checked in ``fit`` and ``transform`` before to call this function.

        Returns
        -------
        imputer_mask : {ndarray or sparse matrix}, shape (n_samples, n_features) or (n_samples, n_features_with_missing)
            The imputer mask of the original data.

        features_with_missing : ndarray, shape (n_features_with_missing)
            The features containing missing values.

        r   csr)r!   r   r7   F)rM   T)r   r=   r   r#   rC   r   Z
csr_matrixZ
csc_matrixindicesr1   rE   r!   r"   Zeliminate_zerosr    rW   rD   uniqueZtoarrayZtocscrH   )r2   r   rL   Zsparse_constructorimputer_maskZmissing_values_maskZfeatures_with_missingr   r   r   _get_missing_features_info  s,    






z+MissingIndicator._get_missing_features_infoc             C   sP   t | jsd}nd}t|dd |d}t|| j |jjdkrLtd|j|S )NTz	allow-nan)r7   rh   )r8   r   r9   )r   r   r   r:   zMissingIndicator does not support data with dtype {0}. Please provide either a numeric array (with a floating point or integer dtype) or categorical data represented either as an array with integer dtype or an array of string values with an object dtype.)r   r   r   r   r   r   r   r   )r2   r   r9   r   r   r   r<     s    

z MissingIndicator._validate_inputNc             C   s   |  |}|jd | _| jdkr0td| jt| jtj	rH| jdksdt| jt
sdtd| j| jdkr|| |d n
t| j| _| S )a`  Fit the transformer on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : object
            Returns self.
        r   )zmissing-onlyallzD'features' has to be either 'missing-only' or 'all'. Got {} instead.re   z9'sparse' has to be a boolean or 'auto'. Got {!r} instead.zmissing-only)r<   r!   _n_featuresrf   r   r   r   r   r   Zstring_typesr"   rl   r    rY   	features_)r2   r   rA   r   r   r   rB   -  s    



zMissingIndicator.fitc             C   s   t | d | |}|jd | jkr,td| |\}}| jdkrt|| j	}| j
rp|jdkrptd|| j	jdkr| j	j| jk r|dd| j	f }|S )a  Generate missing values indicator for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        Xt : {ndarray or sparse matrix}, shape (n_samples, n_features)
            The missing indicator for input data. The data type of ``Xt``
            will be boolean.

        ro   r   z9X has a different number of features than during fitting.zmissing-onlyr   zSThe features {} have missing values in transform but have no missing values in fit.N)r	   r<   r!   rn   r   rl   rf   r    Z	setdiff1dro   rg   r%   r   )r2   r   rk   rf   Zfeatures_diff_fit_transr   r   r   r`   M  s    


zMissingIndicator.transformc             C   s   |  |||S )a  Generate missing values indicator for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        Xt : {ndarray or sparse matrix}, shape (n_samples, n_features)
            The missing indicator for input data. The data type of ``Xt``
            will be boolean.

        )rB   r`   )r2   r   rA   r   r   r   fit_transformr  s    zMissingIndicator.fit_transform)N)N)ra   rb   rc   rd   r    r+   r3   rl   r<   rB   r`   rp   r   r   r   r   r     s   @7
 %)"rd   r&   r   Znumpyr    Znumpy.marN   Zscipyr   r   baser   r   Zutilsr   Zutils.sparsefuncsr   Zutils.validationr	   r
   Zutils.fixesr   r   Z	externalsr   ZmovesrR   map__all__r   r#   r,   r   r   r   r   r   r   <module>   s2   	"  A