B
    Y>9              
   @   s  d Z ddlmZ ddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZmZmZmZ dd	lmZmZmZ G d
d deZG dd deZG dd deZG dd deZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%G d d! d!eZ&G d"d# d#eZ'G d$d% d%e'Z(G d&d' d'e'Z)d+d(d)Z*eeeee e!e"e#e$d*	Z+dS ),a   Statistical methods used to define or modify position of glyphs.

References:
    Wilkinson L. The Grammer of Graphics, sections 7, 7.1

Method Types:
    - Bin: Partitions a space before statistical calculation
    - Summary: Produces a single value comprising a statistical summary
    - Region: Produces two values bounding an interval.
    - Smooth: Produces values representing smoothed versions of the input data.
    - Link: Produces edges from pairs of nodes in a graph.

    )absolute_importN)string_types)ColumnDataSource)HasProps)	BoolDateDatetimeEitherFloatInstanceIntListString   )ColumnColumnLabelEitherColumnc            
       s   e Zd ZdZeddZeeddZe	e
ee
ee
ee
ee
ee
edddZeddZ fd	d
Zdd ZdddZdddZdd Zdd Z  ZS )StatzRepresents a statistical operation to summarize a column of data.

    Can be computed from either a ColumnLabel with a ColumnDataSource, *or*, a
    discrete column of data.
    zfA column to use for the stat calculation. Required
        when providing a ColumnDataSource as input.)helpzFOne option for providing the data
        source for stat calculation.Nz
                  Second option for providing values for stat calculation is by
                  passing the actual column of data.)defaultr   z{The value calculated for the stat. Some stats could use
        multiple properties to provide the calculation if required.c                sN   | dd }|d k	r0t|tjr(t|}||d< tt| jf | |   d S )Nsource)	pop
isinstancepd	DataFramer   superr   __init___refresh)self
propertiesr   )	__class__ -lib/python3.7/site-packages/bkcharts/stats.pyr   /   s    zStat.__init__c             C   s    |   dk	r|   |   dS )z;Lazy update of properties, used for initial transform init.N)get_dataupdate	calculate)r   r!   r!   r"   r   9   s    zStat._refreshc             C   sN   t |tjrt|}t |tr4|| _|dk	r:|| _n|| _|   |   dS )z8Set data properties and update all dependent properties.N)	r   r   r   r   r   columnvaluesr$   r%   )r   datar&   r!   r!   r"   set_data?   s    
zStat.set_datac             C   s   | j dk	rB| jdk	s|dk	rB|dk	r*|}n| j}t| j j| S | jdkrh| j dk	rht| j  jS | jdk	rx| jS dS dS )zAReturns the available columnlabel/source values or column values.N)r   r&   r   ZSeriesr(   r'   to_dfindex)r   r&   colr!   r!   r"   r#   N   s    
zStat.get_datac             C   s   t ddS )zFReturn transformed value from column label/source or column-like data.z;You must implement the calculate method for each stat type.N)NotImplementedError)r   r!   r!   r"   r%   ^   s    zStat.calculatec             C   s   dS )zDPerform any initial work before the actual calculation is performed.Nr!   )r   r!   r!   r"   r$   c   s    zStat.update)N)N)__name__
__module____qualname____doc__r   r&   r   r   r   r   r   r
   r   r   r   r   r   r'   valuer   r   r)   r#   r%   r$   __classcell__r!   r!   )r    r"   r      s    


r   c               @   s   e Zd Zdd ZdS )Sumc             C   s   |    | _d S )N)r#   sumr2   )r   r!   r!   r"   r%   i   s    zSum.calculateN)r.   r/   r0   r%   r!   r!   r!   r"   r4   h   s   r4   c               @   s   e Zd Zdd ZdS )Meanc             C   s   |    | _d S )N)r#   meanr2   )r   r!   r!   r"   r%   n   s    zMean.calculateN)r.   r/   r0   r%   r!   r!   r!   r"   r6   m   s   r6   c               @   s   e Zd Zdd ZdS )Countc             C   s   |    | _d S )N)r#   countr2   )r   r!   r!   r"   r%   s   s    zCount.calculateN)r.   r/   r0   r%   r!   r!   r!   r"   r8   r   s   r8   c               @   s   e Zd Zdd ZdS )CountDistinctc             C   s   |    | _d S )N)r#   nuniquer2   )r   r!   r!   r"   r%   x   s    zCountDistinct.calculateN)r.   r/   r0   r%   r!   r!   r!   r"   r:   w   s   r:   c               @   s   e Zd Zdd ZdS )Medianc             C   s   |    | _d S )N)r#   medianr2   )r   r!   r!   r"   r%   }   s    zMedian.calculateN)r.   r/   r0   r%   r!   r!   r!   r"   r<   |   s   r<   c               @   s   e Zd Zdd ZdS )StdDeviationc             C   s   |    | _d S )N)r#   stdr2   )r   r!   r!   r"   r%      s    zStdDeviation.calculateN)r.   r/   r0   r%   r!   r!   r!   r"   r>      s   r>   c               @   s   e Zd Zdd ZdS )Minc             C   s   |    | _d S )N)r#   minr2   )r   r!   r!   r"   r%      s    zMin.calculateN)r.   r/   r0   r%   r!   r!   r!   r"   r@      s   r@   c               @   s   e Zd Zdd ZdS )Maxc             C   s   |    | _d S )N)r#   maxr2   )r   r!   r!   r"   r%      s    zMax.calculateN)r.   r/   r0   r%   r!   r!   r!   r"   rB      s   rB   c               @   s"   e Zd ZdZeddZdd ZdS )QuantilezProduces the cutpoint that divides the input data by the interval.

    Quartiles are a special case of quartiles that divide a dataset into four
    equal-size groups. (https://en.wikipedia.org/wiki/Quantile)
    g      ?)r   c             C   s   |   | j| _d S )N)r#   quantileintervalr2   )r   r!   r!   r"   r%      s    zQuantile.calculateN)r.   r/   r0   r1   r
   rF   r%   r!   r!   r!   r"   rD      s   
rD   c                   s   e Zd ZdZeeeeZeeeeZ	eeeeZ
e Ze ZeeeeZeee dZe Zd fdd	Zedd Zdd	 Zd
d Zdd Z  ZS )BinzARepresents a single bin of data values and attributes of the bin.)r   Nc       	         s   t |trt|}nt |tr&|g}n
t|g}||d< | |}t| \}}dd t||D }t|dkr|d }|d }|d }nt|}t|}t|}||d< ||d< ||d< ||d	< tt	| j
f | d S )
Nlabelc             S   s   g | ]\}}|| d  qS )g       @r!   ).0startstopr!   r!   r"   
<listcomp>   s    z Bin.__init__.<locals>.<listcomp>r   r   rJ   rK   centerr'   )r   tuplelistr   strprocess_boundsziplenr   rG   r   )	r   	bin_labelr'   r   r   ZboundsZstartsZstopscenters)r    r!   r"   r      s*    





zBin.__init__c             C   s6   |  d}dd |D }dd |D }|d |d fS )z.Produce a consistent display of a bin of data.,c             S   s0   g | ](}| d d dd dd ddqS )[ ]())replace)rI   valr!   r!   r"   rL      s    z&Bin.binstr_to_list.<locals>.<listcomp>c             S   s   g | ]}t |qS r!   )float)rI   r2   r!   r!   r"   rL      s    r   r   )split)binsZvalue_chunks
bin_valuesr!   r!   r"   binstr_to_list   s    
zBin.binstr_to_listc                s,   t |tr fdd|D S  |gS d S )Nc                s   g | ]}  |qS r!   )rb   )rI   Zdim)r   r!   r"   rL      s    z&Bin.process_bounds.<locals>.<listcomp>)r   rO   rb   )r   rT   r!   )r   r"   rQ      s    
zBin.process_boundsc             C   s   | j | j d S )N)statr)   r'   )r   r!   r!   r"   r$      s    z
Bin.updatec             C   s   | j j| _d S )N)rc   r2   )r   r!   r!   r"   r%      s    zBin.calculate)NN)r.   r/   r0   r1   r	   r   r   rH   r
   rJ   rK   Zstart_labelZ
stop_labelrM   r   r   r8   rc   widthr   staticmethodrb   rQ   r$   r%   r3   r!   r!   )r    r"   rG      s   	rG   c                   sx   e Zd ZdZeeeeedddZedddZ	e
ddZe
ddZeeZd fd	d
	Zdd Zdd Zdd Z  ZS )BinStatszA set of statistical calculations for binning values.

    Bin counts using: https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
    Na  
    If bins is an int, it defines the number of equal-width bins in the
    given range. If bins is a sequence, it defines the
    bin edges, including the rightmost edge, allowing for non-uniform
    bin widths.

    (default: None, use Freedman-Diaconis rule)
    )r   r   z#Use Freedman-Diaconis rule if None.g      ?)rF   g      ?c                s*   ||d< |pd|d< t t| jf | d S )Nr'   r&   )r   rf   r   )r   r'   r&   r   )r    r!   r"   r      s    zBinStats.__init__c             C   s8   |   }| j| | j| | jd kr4| | d S )N)r#   q1r)   q3r`   calc_num_bins)r   r'   r!   r!   r"   r$      s
    
zBinStats.updatec             C   st   | j j| jj }|dkr(t|j| _nd| t|d  | _tt	|
 |  | j | _| jdkrpd| _dS )zCalculate optimal number of bins using IQR.

        From: http://stats.stackexchange.com/questions/114490/optimal-bin-width-for-two-dimensional-histogram

        r      gUUUUUUտr      N)rh   r2   rg   npZsqrtsize	bin_widthrS   intZceilrC   rA   r`   )r   r'   Ziqrr!   r!   r"   ri      s    "
zBinStats.calc_num_binsc             C   s   d S )Nr!   )r   r!   r!   r"   r%     s    zBinStats.calculate)NN)r.   r/   r0   r1   r	   r   r
   r   r`   rn   rD   rg   rh   r   labelsr   r$   ri   r%   r3   r!   r!   )r    r"   rf      s   

rf   c                   s   e Zd ZdZeeddZeeeddZ	ee
e ddZe Ze ZeddZed	dZe Zd fdd	Zdd Zdd Z  ZS )
BinnedStatzc Base class for shared functionality accross bins and aggregates
    dimensions for plotting.

    zW
        A mapping between each dimension and associated binning calculations.
        )r   z
        A list of the `Bin` instances that were produced as result of the inputs.
        Iterating over `Bins` will iterate over this list. Each `Bin` can be inspected
        for metadata about the bin and the values associated with it.
        zQ
        The statistical operation to be used on the values in each bin.
        )r   r   T)r   FNr9   c                sT   t |trt|  }|pd|d< ||d< ||d< ||d< || _tt| jf | d S )NZvalsr&   rc   r'   r   )r   rP   stats_binsr   rq   r   )r   r'   r&   r`   rc   r   r   )r    r!   r"   r   .  s    

zBinnedStat.__init__c             C   sL   i }| j d k	r$| j |d< | j|d< n| jd k	r8| j|d< | j|d< tf |S )Nr   r&   r'   r`   )r   r&   r'   rs   rf   )r   Zstat_kwargsr!   r!   r"   	_get_stat<  s    




zBinnedStat._get_statc             C   s   |   | _| j  d S )N)rt   bin_statr$   )r   r!   r!   r"   r$   J  s    
zBinnedStat.update)NNNr9   N)r.   r/   r0   r1   r   rf   ru   r   rG   r`   r   r8   rc   r   
bin_columncenters_columnr   Z	aggregatera   r
   rn   r   rt   r$   r3   r!   r!   )r    r"   rq     s    

 rq   c               @   s2   e Zd ZdZdd Zdd Zdd Zdd	d
ZdS )BinszBins and aggregates dimensions for plotting.

    Takes the inputs and produces a list of bins that can be iterated over and
    inspected for their metadata. The bins provide easy access to consistent labeling,
    bounds, and values.
    c             C   s  d}| j | | _g }| j }| jj}|jdk r8td|jdkr| dkrdt	t
|d  pdd}t|d | |d | |d }tj||dddd\}}t|d |d  d| _| jd k	r| jj| | jd	 | j }nt| j | j| j|i}x6|| jD ]&\}	}
|t|	|
| j  | jd
 qW || _| }|t}x | jD ]}|j|||jk< qVW | j d | _ | jd k	r| jj| | j d	 n
||| j < d S )N_binrj   z/Histogram data must have at least two elements.r   r   g{Gz?T)ZretbinsZinclude_lowestZ	precision)name)rT   r'   rc   Z_center)!r&   rv   ru   r#   r`   rm   
ValueErrorndimr?   absr^   rl   Zlinspacer   Zcutroundrn   r   addtolistr*   r   r'   groupbyappendrG   rc   copyZastyperP   rM   rH   rw   )r   bin_strZ
bin_modelsr(   r`   Zmarginbinned
bin_boundsZdfrz   grouprU   binr!   r!   r"   r%   W  s:    

"

zBins.calculatec             C   s
   | j | S )N)r`   )r   itemr!   r!   r"   __getitem__  s    zBins.__getitem__c             C   s   |  |j | j S )N)r)   r   r*   )r   r(   r!   r!   r"   apply  s    z
Bins.applyTc             C   s*   | j d k	r&tt| j dd | d| _ d S )Nc             S   s   | j S )N)rM   )xr!   r!   r"   <lambda>  s    zBins.sort.<locals>.<lambda>)keyreverse)r`   rO   sorted)r   Z	ascendingr!   r!   r"   sort  s    
z	Bins.sortN)T)r.   r/   r0   r1   r%   r   r   r   r!   r!   r!   r"   rx   O  s
   .rx   c               @   s$   e Zd ZdZedddZdd ZdS )	HistogramzBins and aggregates dimensions for plotting.

    Takes the inputs and produces a list of bins that can be iterated over and
    inspected for their metadata. The bins provide easy access to consistent labeling,
    bounds, and values.
    Faf  
    Whether to normalize the histogram.

    If True, the result is the value of the probability *density* function
    at the bin, normalized such that the *integral* over the range is 1. If
    False, the result will contain the number of samples in each bin.

    For more info check ``numpy.histogram`` function documentation.

    (default: False)
    )r   c       
   	   C   s   d}| j | | _| j }| jj}tjt|| j|d\}}t	|d |d  d| _
g | _xt|D ]v\}}||d  ||  }|dkrd|| ||d  f }	nd|| ||d  f }	| jt|	|| gt |d qhW d S )	Nry   )densityr`   rj   r   r   z[%f, %f]z(%f, %f])rT   r'   rc   rd   )r&   rv   ru   r#   r`   rl   Z	histogramZarrayr   r~   rn   	enumerater   rG   rB   )
r   r   r(   r`   r   r   ibrd   Zlblr!   r!   r"   r%     s    
zHistogram.calculateN)r.   r/   r0   r1   r   r   r%   r!   r!   r!   r"   r     s   
r   c             K   s.   t | tr| }d}nd}tf |||d|S )z8Specify binning or bins to be used for column or values.N)r'   r&   r`   )r   rP   rx   )r(   r'   r&   r`   rp   kwargsr!   r!   r"   r`     s
    
r`   )	r5   r7   r9   r;   r=   ZstddevrA   rC   rE   )NNNN),r1   Z
__future__r   Znumpyrl   Zpandasr   Zsixr   Zbokeh.models.sourcesr   Zbokeh.core.has_propsr   Zbokeh.core.propertiesr   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r4   r6   r8   r:   r<   r>   r@   rB   rD   rG   rf   rq   rx   r   r`   rr   r!   r!   r!   r"   <module>   sB   ,NA5=C,
