ó
¦–Õ\c           @` sr   d  Z  d d l m Z m Z m Z d d l Z d d l Z d d l	 m
 Z
 d „  Z d „  Z d „  Z d „  Z d S(	   uc  Implementation of HyperLogLog

This implements the HyperLogLog algorithm for cardinality estimation, found
in

    Philippe Flajolet, Ã‰ric Fusy, Olivier Gandouet and FrÃ©dÃ©ric Meunier.
        "HyperLogLog: the analysis of a near-optimal cardinality estimation
        algorithm". 2007 Conference on Analysis of Algorithms. Nice, France
        (2007)

i    (   t   absolute_importt   divisiont   print_functionN(   t   hash_pandas_objectc         C` sT   t  j j |  d t  j d ƒ >ƒ } | j d d ƒ j t  j ƒ } d | j d d ƒ S(   sG   Compute the position of the first nonzero bit for each int in an array.i   i    t   axisi!   (   t   npt   bitwise_andt   outert   aranget   cumsumt   astypet   boolt   sum(   t   at   bits(    (    s9   lib/python2.7/site-packages/dask/dataframe/hyperloglog.pyt   compute_first_bit   s    "c   	      C` sø   d | k o d k n s+ t  d ƒ ‚ n  d | } d | >} t |  d t ƒ} t | t j ƒ ro | j } n  | j t j	 ƒ } | | ?} t
 | ƒ } t j i | d 6| d 6ƒ } | j d ƒ j ƒ  d } | j t j | ƒ d	 d
 ƒj j t j ƒ S(   Ni   i   s   b should be between 8 and 16i    i   t   indext   jt	   first_bitt
   fill_valuei    (   t
   ValueErrorR   t   Falset
   isinstancet   pdt   Seriest   _valuesR
   R   t   uint32R   t	   DataFramet   groupbyt   maxt   reindexR   t   valuest   uint8(	   t   objt   bt   num_bits_discardedt   mt   hashesR   R   t   dft   series(    (    s9   lib/python2.7/site-packages/dask/dataframe/hyperloglog.pyt   compute_hll_array   s    


c         C` s6   d | >} |  j  t |  ƒ | | ƒ }  |  j d d ƒ S(   Ni   R   i    (   t   reshapet   lenR   (   t   MsR"   R$   (    (    s9   lib/python2.7/site-packages/dask/dataframe/hyperloglog.pyt   reduce_state7   s    
c         C` s·   d | >} t  |  | ƒ } d d d | } | | d | j d ƒ j ƒ  | } | d | k  r‘ | d k j ƒ  } | r‘ | t j | | ƒ Sn  | d k r³ d t j | d ƒ S| S(   Ni   g†ZÓ¼ãç?gÝ$•Cñ?g       @t   f8g      @i    i   i    g      >@I       g¡AI       I    ÿÿÿÿI       (   R,   R
   R   R   t   logt   log1p(   R+   R"   R$   t   Mt   alphat   Et   V(    (    s9   lib/python2.7/site-packages/dask/dataframe/hyperloglog.pyt   estimate_count@   s    
&(   t   __doc__t
   __future__R    R   R   t   numpyR   t   pandasR   t   pandas.utilR   R   R(   R,   R4   (    (    (    s9   lib/python2.7/site-packages/dask/dataframe/hyperloglog.pyt   <module>   s   				