ó
¦–Õ\c           @` sç   d  d l  m Z m Z m Z d  d l m Z d  d l Z d  d l m	 Z	 d  d l
 m Z d d l m Z m Z d d	 l m Z d d
 l m Z m Z m Z m Z d „  Z d „  Z d „  Z d d d d „ Z d e f d „  ƒ  YZ d S(   i    (   t   absolute_importt   divisiont   print_function(   t   defaultdictN(   t   partition_all(   t   Integrali   (   t   tokenizet   compute_as_if_collectioni   (   t   Accessor(   t   has_known_categoriest   clear_known_categoriest	   is_scalart   is_categorical_dtypec         C` sæ   |  j  ƒ  }  xj | j ƒ  D]\ \ } } t |  | ƒ rR |  | j j | ƒ |  | <q t j |  | d | d t ƒ|  | <q W| d k	 râ t |  j	 ƒ r© |  j	 j | ƒ } n t j |  j	 d | d t ƒ} |  j	 j
 | _
 | |  _	 n  |  S(   sˆ    Categorize a dataframe with given categories

    df: DataFrame
    categories: dict mapping column name to iterable of categories
    t
   categoriest   orderedN(   t   copyt   itemsR   t   catt   set_categoriest   pdt   Categoricalt   Falset   Nonet   indext   name(   t   dfR   R   t   colt   valst   ind(    (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pyt   _categorize_block   s    'c         C` s§   i  } xV | D]N } |  | } t  | ƒ rE t j | j j ƒ | | <q | j ƒ  j ƒ  | | <q W| r t  |  j ƒ r„ | |  j j f S| |  j j ƒ  j ƒ  f S| d  f S(   N(	   R   R   t   SeriesR   R   t   dropnat   drop_duplicatesR   R   (   R   t   columnsR   t   resR   t   x(    (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pyt   _get_categories$   s    
c         C` s·   t  t ƒ } g  } xQ |  D]I } x/ | d j ƒ  D] \ } } | | j | ƒ q0 W| j | d ƒ q Wd „  | j ƒ  Dƒ } | d d  k r– | d  f S| | d j | d ƒ j ƒ  f S(   Ni    i   c         S` s4   i  |  ]* \ } } t  j | d  t ƒj ƒ  | “ q S(   t   ignore_index(   R   t   concatt   TrueR    (   t   .0t   kt   v(    (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pys
   <dictcomp>:   s   	(   R   t   listR   t   appendR   R    (   t   partsR"   t   res_indt   pR)   R*   (    (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pyt   _get_categories_agg3   s    
c         ` sÅ  |  j  } ˆ d k r6 t | j d d g ƒ j ƒ ‰ n t ˆ ƒ rN ˆ g ‰ n  g  ˆ D], } t | | ƒ ox t | | ƒ sU | ^ qU ‰ ˆ t k	 rÙ t | j	 ƒ rµ t | j	 ƒ ‰ qÙ ˆ d k rÙ | j	 j
 t k ‰ qÙ n  t ˆ ƒ rö ˆ t k rö |  S| d k rd } nC | t k r#|  j } n+ t | t ƒ s?| d k  rNt d ƒ ‚ n  t |  ˆ ˆ | ƒ } d | ‰  ‡  ‡ ‡ f d †  t |  j ƒ  ƒ Dƒ } d | }	 |  j }
 d	 } x• |
 | k rE|	 t | ƒ } xX t t | t |
 ƒ ƒ ƒ D]; \ } } t g  | D] } ˆ  | f ^ qÿf | | | f <qéW| d
 }
 | ‰  | d
 7} q±Wt g  t |
 ƒ D] } ˆ  | f ^ qVf | |	 d	 f <| j |  j ƒ t t |  ƒ | |	 d	 f |  \ } ‰ |  j t | ˆ ƒ S(   s0  Convert columns of the DataFrame to category dtype.

    Parameters
    ----------
    columns : list, optional
        A list of column names to convert to categoricals. By default any
        column with an object dtype is converted to a categorical, and any
        unknown categoricals are made known.
    index : bool, optional
        Whether to categorize the index. By default, object indices are
        converted to categorical, and unknown categorical indices are made
        known. Set True to always categorize the index, False to never.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used.
        Default is 16.
    kwargs
        Keyword arguments are passed on to compute.
    t   objectt   categoryi   i   s#   split_every must be an integer >= 2s   get-categories-chunk-c         ` s1   i  |  ]' \ } } t  | ˆ ˆ f ˆ  | f “ q S(    (   R$   (   R(   t   it   key(   t   aR!   R   (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pys
   <dictcomp>r   s   	s   get-categories-agg-i    i   N(   t   _metaR   R+   t   select_dtypesR!   R   R   R	   R   R   t   dtypeR1   t   lent   npartitionst
   isinstanceR   t
   ValueErrorR   t	   enumeratet   __dask_keys__t   strR   t   rangeR0   t   updatet   daskR   t   typet   map_partitionsR   (   R   R!   R   t   split_everyt   kwargst   metat   ct   tokent   dskt   prefixR)   t   deptht   bt   part_it   indsR3   R   (    (   R5   R!   R   s9   lib/python2.7/site-packages/dask/dataframe/categorical.pyt
   categorizeA   sN    	!	

	(3
5t   CategoricalAccessorc           B` s€   e  Z d  Z e j j Z d Z d „  Z e	 d „  ƒ Z
 d „  Z d „  Z e	 d „  ƒ Z e	 d „  ƒ Z e	 d „  ƒ Z d	 „  Z RS(
   sÒ  
    Accessor object for categorical properties of the Series values.

    Examples
    --------
    >>> s.cat.categories  # doctest: +SKIP

    Notes
    -----
    Attributes that depend only on metadata are eager

    * categories
    * ordered

    Attributes depending on the entire dataset are lazy

    * codes
    * ...

    So `df.a.cat.categories` <=> `df.a._meta.cat.categories`
    So `df.a.cat.codes` <=> `df.a.map_partitions(lambda x: x.cat.codes)`
    R   c         C` s"   t  | j ƒ s t d ƒ ‚ n  d  S(   Ns2   Can only use .cat accessor with a 'category' dtype(   R   R8   t   AttributeError(   t   selft   series(    (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pyt	   _validate¥   s    c         C` s   t  |  j ƒ S(   s&   Whether the categories are fully known(   R	   t   _series(   RS   (    (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pyt   knownª   s    c         K` s>   |  j  r |  j S|  j d ƒ j ƒ  j |   } |  j | j ƒ S(   sL  Ensure the categories in this series are known.

        If the categories are known, this is a no-op. If unknown, the
        categories are computed, and a new series with known categories is
        returned.

        Parameters
        ----------
        kwargs
            Keywords to pass on to the call to `compute`.
        R   (   RW   RV   t   _property_mapt   uniquet   computeR   t   values(   RS   RF   R   (    (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pyt   as_known¯   s    	c         C` s5   |  j  s |  j S|  j j ƒ  } t | j ƒ | _ | S(   s0   Ensure the categories in this series are unknown(   RW   RV   R   R
   R6   (   RS   t   out(    (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pyt
   as_unknownÀ   s
    	c         C` s   |  j  |  j j d d ƒ S(   NR   R   (   t   _delegate_propertyRV   R6   (   RS   (    (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pyR   È   s    c         C` s7   |  j  s d } t | ƒ ‚ n  |  j |  j j d d ƒ S(   sZ   The categories of this categorical.

        If categories are unknown, an error is raiseds£   `df.column.cat.categories` with unknown categories is not supported.  Please use `column.cat.as_known()` or `df.categorize()` beforehand to ensure known categoriesR   R   (   RW   t   NotImplementedErrorR_   RV   R6   (   RS   t   msg(    (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pyR   Ì   s    	c         C` s+   |  j  s d } t | ƒ ‚ n  |  j d ƒ S(   sU   The codes of this categorical.

        If categories are unknown, an error is raisedsž   `df.column.cat.codes` with unknown categories is not supported.  Please use `column.cat.as_known()` or `df.categorize()` beforehand to ensure known categoriest   codes(   RW   R`   RX   (   RS   Ra   (    (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pyRb   Ø   s    	c      
   C` så   |  j  j ƒ  j ƒ  } t j | j ƒ  ƒ } t |  j  j t j ƒ rQ |  j  j } n |  j  j j	 } | j
 | j ƒ \ } } | d	 k r‹ |  j  S| | d k } | j | d | j ƒ} |  j  j |  j d d d
 i | d 6d | d d ƒS(   sÇ   
        Removes categories which are not used

        Notes
        -----
        This method requires a full scan of the data to compute the
        unique values, which can be expensive.
        iÿÿÿÿR   R   R   t   new_categoriesRG   RI   s   cat-set_categoriesN(    (   RV   R   RY   R   t   IndexRZ   R;   R6   t   CategoricalIndexR   t   reindexR   R   R   R   RD   t   _delegate_method(   RS   t   presentt   meta_catR   t   maskRc   RG   (    (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pyt   remove_unused_categoriesä   s    
(   t   __name__t
   __module__t   __doc__R   R   R   t	   _accessort   _accessor_nameRU   t   propertyRW   R\   R^   R   R   Rb   Rk   (    (    (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pyRQ   ‹   s   			(   t
   __future__R    R   R   t   collectionsR   t   pandasR   t   toolzR   t   numbersR   t   baseR   R   t   accessorR   t   utilsR	   R
   R   R   R   R$   R0   R   RP   RQ   (    (    (    s9   lib/python2.7/site-packages/dask/dataframe/categorical.pyt   <module>   s   "			J