B
    T\y$                 @   s   d dl mZmZmZ d dlZd dlZddlm	Z	m
Z
mZmZ ddlmZ ddlmZmZmZmZ ddddddejfd	d
ZdddZdddZdS )    )absolute_importdivisionprint_functionN   )Series	DataFramemap_partitionsapply_concat_apply)methods)is_categorical_dtype	is_scalarhas_known_categoriesPANDAS_VERSION_Fc          
      sX  t dkrd|i}n|tjkr,tdt ni }t tjtjfrbtj	 f||||||d|S d}	d}
t trt
 st|	t st|
n|t tr|dkrԈ jdk rt|	 jjd	gd
j}nt fdd|D st|	t fdd|D st|
tj	 jf||||||d|}ttj	 f|||||||d|S )a
  
    Convert categorical variable into dummy/indicator variables.

    Data must have category dtype to infer result's ``columns``.

    Parameters
    ----------
    data : Series, or DataFrame
        For Series, the dtype must be categorical.
        For DataFrame, at least one column must be categorical.
    prefix : string, list of strings, or dict of strings, default None
        String to append DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix.`
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy columns should be sparse or not.  Returns
        SparseDataFrame if `data` is a Series or if all columns are included.
        Otherwise returns a DataFrame with some SparseBlocks.

        .. versionadded:: 0.18.2

    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.

    dtype : dtype, default np.uint8
        Data type for new columns. Only a single dtype is allowed.
        Only valid if pandas is 0.23.0 or newer.

        .. versionadded:: 0.18.2

    Returns
    -------
    dummies : DataFrame

    Examples
    --------
    Dask's version only works with Categorical data, as this is the only way to
    know the output shape without computing all the data.

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> s = dd.from_pandas(pd.Series(list('abca')), npartitions=2)
    >>> dd.get_dummies(s)
    Traceback (most recent call last):
        ...
    NotImplementedError: `get_dummies` with non-categorical dtypes is not supported...

    With categorical data:

    >>> s = dd.from_pandas(pd.Series(list('abca'), dtype='category'), npartitions=2)
    >>> dd.get_dummies(s)  # doctest: +NORMALIZE_WHITESPACE
    Dask DataFrame Structure:
                       a      b      c
    npartitions=2
    0              uint8  uint8  uint8
    2                ...    ...    ...
    3                ...    ...    ...
    Dask Name: get_dummies, 4 tasks
    >>> dd.get_dummies(s).compute()
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    See Also
    --------
    pandas.get_dummies
    z0.23.0dtypezOYour version of pandas is '{}'. The 'dtype' keyword was added in pandas 0.23.0.)prefix
prefix_sepdummy_nacolumnssparse
drop_firstz`get_dummies` with non-categorical dtypes is not supported. Please use `df.categorize()` beforehand to convert to categorical dtype.z`get_dummies` with unknown categories is not supported. Please use `column.cat.as_known()` or `df.categorize()` beforehand to ensure known categoriesNobjectcategory)Zincludec             3   s   | ]}t  | V  qd S )N)r   ).0c)data 5lib/python3.7/site-packages/dask/dataframe/reshape.py	<genexpr>   s    zget_dummies.<locals>.<genexpr>c             3   s   | ]}t  | V  qd S )N)r   )r   r   )r   r   r   r      s    )r   r   r   r   r   r   meta)r   npuint8
ValueErrorformat
isinstancepdr   r   get_dummiesr   NotImplementedErrorr   ZdtypesanyZ_metaZselect_dtypesr   allr   )r   r   r   r   r   r   r   r   kwargsZnot_cat_msgZunknown_cat_msgr   r   )r   r   r&      sJ    S






r&   meanc       
      C   s,  t |r|dkrtdt |r(|dkr0tdt| | sDtdt| | sXtdt |rh|dkrptdt |r|dkrtdtj| | jj|d	}tj|t	j
d
}||j_|||d}t| gtjtj|d|d}t| gtjtj|d|d}	|dkr|S |dkr|	S |dkr$||	 S tdS )a  
    Create a spreadsheet-style pivot table as a DataFrame. Target ``columns``
    must have category dtype to infer result's ``columns``.
    ``index``, ``columns``, ``values`` and ``aggfunc`` must be all scalar.

    Parameters
    ----------
    data : DataFrame
    values : scalar
        column to aggregate
    index : scalar
        column to be index
    columns : scalar
        column to be columns
    aggfunc : {'mean', 'sum', 'count'}, default 'mean'

    Returns
    -------
    table : DataFrame
    Nz.'index' must be the name of an existing columnz0'columns' must be the name of an existing columnz 'columns' must be category dtypezs'columns' must have known categories. Please use `df[columns].cat.as_known()` beforehand to ensure known categoriesz/'values' must be the name of an existing column)r+   sumcountz/aggfunc must be either 'mean', 'sum' or 'count')name)r   r   )indexr   valuesZpivot_table_sum)chunkZ	aggregater   tokenZchunk_kwargsZpivot_table_countr,   r-   r+   )r   r"   r   r   r%   ZCategoricalIndexcatZ
categoriesr   r    Zfloat64r/   r.   r	   r
   Z	pivot_sumZ	pivot_aggZpivot_count)
Zdfr/   r   r0   ZaggfuncZnew_columnsr   r*   Zpv_sumZpv_countr   r   r   pivot_table   sF    


r4   valuec          
   C   s(   ddl m} | jtj||||||ddS )Nr   )
no_defaultmelt)r   id_vars
value_varsvar_name
value_name	col_levelr2   )Zdask.dataframe.corer6   r   r%   r7   )framer8   r9   r:   r;   r<   r6   r   r   r   r7      s
    r7   )NNNr+   )NNNr5   N)Z
__future__r   r   r   Znumpyr    Zpandasr%   Zcorer   r   r   r	    r
   Zutilsr   r   r   r   r!   r&   r4   r7   r   r   r   r   <module>   s   
  
I 