ó
áp7]c           @  s~  d  Z  d d l m Z d d l j Z d d l Z d „  Z d „  Z d d „ Z	 d e
 f d	 „  ƒ  YZ e d
 k rzd d l j Z d Z e j j e ƒ Z d g Z d e k rÈe	 e ƒ e e e d ƒ ƒ e e e d ƒ ƒ e j e j ƒ  e j ƒ  ƒ Z e e e ƒ Z e j e e ƒ e j e e e ƒ e d ƒ e j e j e ƒ e j e	 e ƒ ƒ d d ƒZ e j g  e D] Z  e j! e  ƒ d ^ qtƒ Z" e j# ƒ  e j e e" ƒ e e ƒ Z$ e j# ƒ  e j e e$ ƒ e j# ƒ  e j% e d  e j& e$ ƒ e j& e ƒ ƒ e j e j ƒ  e j ƒ  d ƒ Z' e j# ƒ  e j% e' d  e j& e e' ƒ ƒ e j& e' ƒ ƒ e j e ƒ Z( e( d d e d … Z) e j# ƒ  e j% e) d  e j& e e) ƒ ƒ e j& e) ƒ ƒ n  e e ƒ Z* e e* j+ ƒ  ƒ e e* j, e* j- ƒ ƒ e e* j. d d d g ƒ ƒ e e* j, d d d d d g ƒ ƒ e j e j ƒ  e j ƒ  d ƒ Z e j e j e ƒ e j e	 e ƒ ƒ d d ƒZ e e ƒ Z$ e j# ƒ  e j e e$ ƒ e j e$ e d d ƒZ/ e/ e$ ƒ Z0 e j e0 e$ ƒ e j1 e j e	 e ƒ ƒ e j e ƒ d d d d ƒZ2 e2 e$ ƒ Z3 e j e3 e$ ƒ e d ƒ e d e j& e0 ƒ j ƒ  ƒ e d e j& e$ ƒ j ƒ  ƒ n  d S(   s
  
from David Huard's scipy sandbox, also attached to a ticket and
in the matplotlib-user mailinglist  (links ???)


Notes
=====

out of bounds interpolation raises exception and wouldn't be completely
defined ::

>>> scoreatpercentile(x, [0,25,50,100])
Traceback (most recent call last):
...
    raise ValueError("A value in x_new is below the interpolation "
ValueError: A value in x_new is below the interpolation range.
>>> percentileofscore(x, [-50, 50])
Traceback (most recent call last):
...
    raise ValueError("A value in x_new is below the interpolation "
ValueError: A value in x_new is below the interpolation range.


idea
====

histogram and empirical interpolated distribution
-------------------------------------------------

dual constructor
* empirical cdf : cdf on all observations through linear interpolation
* binned cdf : based on histogram
both should work essentially the same, although pdf of empirical has
many spikes, fluctuates a lot
- alternative: binning based on interpolated cdf : example in script
* ppf: quantileatscore based on interpolated cdf
* rvs : generic from ppf
* stats, expectation ? how does integration wrt cdf work - theory?

Problems
* limits, lower and upper bound of support
  does not work or is undefined with empirical cdf and interpolation
* extending bounds ?
  matlab has pareto tails for empirical distribution, breaks linearity

empirical distribution with higher order interpolation
------------------------------------------------------

* should work easily enough with interpolating splines
* not piecewise linear
* can use pareto (or other) tails
* ppf how do I get the inverse function of a higher order spline?
  Chuck: resample and fit spline to inverse function
  this will have an approximation error in the inverse function
* -> doesn't work: higher order spline doesn't preserve monotonicity
  see mailing list for response to my question
* pmf from derivative available in spline

-> forget this and use kernel density estimator instead


bootstrap/empirical distribution:
---------------------------------

discrete distribution on real line given observations
what's defined?
* cdf : step function
* pmf : points with equal weight 1/nobs
* rvs : resampling
* ppf : quantileatscore on sample?
* moments : from data ?
* expectation ? sum_{all observations x} [func(x) * pmf(x)]
* similar for discrete distribution on real line
* References : ?
* what's the point? most of it is trivial, just for the record ?


Created on Monday, May 03, 2010, 11:47:03 AM
Author: josef-pktd, parts based on David Huard
License: BSD

iÿÿÿÿ(   t   print_functionNc         C  sM   t  j | ƒ } t |  ƒ } t j t  j | ƒ t  j |  ƒ ƒ } | | d ƒ S(   sÄ   Return the score at the given percentile of the data.

    Example:
        >>> data = randn(100)
            >>> scoreatpercentile(data, 50)

        will return the median of sample `data`.
    g      Y@(   t   npt   arrayt   empiricalcdft   interpolatet   interp1dt   sort(   t   datat
   percentilet   pert   cdft   interpolator(    (    sE   lib/python2.7/site-packages/statsmodels/sandbox/stats/stats_dhuard.pyt   scoreatpercentileW   s    	$c         C  s>   t  |  ƒ } t j t j |  ƒ t j | ƒ ƒ } | | ƒ d S(   sD  Return the percentile-position of score relative to data.

    score: Array of scores at which the percentile is computed.

    Return percentiles (0-100).

    Example
            r = randn(50)
        x = linspace(-2,2,100)
        percentileofscore(r,x)

    Raise an error if the score is outside the range of data.
    g      Y@(   R   R   R   R   R   (   R   t   scoreR
   R   (    (    sE   lib/python2.7/site-packages/statsmodels/sandbox/stats/stats_dhuard.pyt   percentileofscoree   s    $t   Hazenc         C  sþ   t  j t  j |  ƒ ƒ d } t |  ƒ } | j ƒ  } | d k rQ | d | } n© | d k rn | | d } nŒ | d k r‹ | d | } no | d k r¬ | d | d } nN | d	 k rÍ | d | d
 } n- | d k rî | d | d } n t d ƒ ‚ | S(   s  Return the empirical cdf.

    Methods available:
        Hazen:       (i-0.5)/N
            Weibull:     i/(N+1)
        Chegodayev:  (i-.3)/(N+.4)
        Cunnane:     (i-.4)/(N+.2)
        Gringorten:  (i-.44)/(N+.12)
        California:  (i-1)/N

    Where i goes from 1 to N.
    g      ð?t   hazeng      à?t   weibullt
   californiat
   chegodayevg333333Ó?gš™™™™™Ù?t   cunnanegš™™™™™É?t
   gringorteng)\Âõ(Ü?g¸…ëQ¸¾?s[   Unknown method. Choose among Weibull, Hazen,Chegodayev, Cunnane, Gringorten and California.(   R   t   argsortt   lent   lowert
   ValueError(   R   t   methodt   it   NR
   (    (    sE   lib/python2.7/site-packages/statsmodels/sandbox/stats/stats_dhuard.pyR   w   s"    t   HistDistc           B  sD   e  Z d  Z d „  Z d d d „ Z d „  Z d „  Z d d „ Z RS(	   s»   Distribution with piecewise linear cdf, pdf is step function

    can be created from empiricial distribution or from a histogram (not done yet)

    work in progress, not finished


    c         C  sÂ   t  j | ƒ |  _ t  j |  j j ƒ  |  j j ƒ  g ƒ |  _ t  j | ƒ } | | |  _ t  j | ƒ |  _	 |  j
 ƒ  } t  j | ƒ |  _ t j |  j |  j ƒ |  _ t j |  j |  j ƒ |  _ d  S(   N(   R   t
   atleast_1dR   R   t   mint   maxt   binlimitR   t   _datasortedt   rankingR   R   t   _empcdfsortedR   R   t   cdfintpt   ppfintp(   t   selfR   t   sortindR
   (    (    sE   lib/python2.7/site-packages/statsmodels/sandbox/stats/stats_dhuard.pyt   __init__¥   s    *R   c         C  s  | d k r! |  j } |  j } n t j t j | ƒ ƒ d } t | ƒ } | j ƒ  } | d k rr | d | } n© | d k r | | d } nŒ | d k r¬ | d | } no | d k rÍ | d | d } nN | d	 k rî | d | d
 } n- | d k r| d | d } n t d ƒ ‚ | S(   sA  Return the empirical cdf.

        Methods available:
            Hazen:       (i-0.5)/N
                Weibull:     i/(N+1)
            Chegodayev:  (i-.3)/(N+.4)
            Cunnane:     (i-.4)/(N+.2)
            Gringorten:  (i-.44)/(N+.12)
            California:  (i-1)/N

        Where i goes from 1 to N.
        g      ð?R   g      à?R   R   R   g333333Ó?gš™™™™™Ù?R   gš™™™™™É?R   g)\Âõ(Ü?g¸…ëQ¸¾?s[   Unknown method. Choose among Weibull, Hazen,Chegodayev, Cunnane, Gringorten and California.N(   t   NoneR   R#   R   R   R   R   R   (   R'   R   R   R   R   R
   (    (    sE   lib/python2.7/site-packages/statsmodels/sandbox/stats/stats_dhuard.pyR   ±   s(    	c         C  s   |  j  | ƒ S(   s&   
        this is score in dh

        (   R%   (   R'   R   (    (    sE   lib/python2.7/site-packages/statsmodels/sandbox/stats/stats_dhuard.pyt   cdf_empÚ   s    c         C  s   |  j  | ƒ S(   s&   
        this is score in dh

        (   R&   (   R'   t   quantile(    (    sE   lib/python2.7/site-packages/statsmodels/sandbox/stats/stats_dhuard.pyt   ppf_empâ   s    t   Freedmanc         C  s¡   t  |  j ƒ } | d k rP |  j d ƒ |  j d ƒ } d | | d d } n1 | d k r d t j |  j ƒ | d d } n  t j |  j ƒ | |  _ |  j S(	   s”   Find the optimal number of bins and update the bin countaccordingly.
        Available methods : Freedman
                            Scott
        R.   g      è?g      Ð?i   g      ð¿i   t   ScottgìQ¸…ë@(   R   R   R-   R   t   stdt   ptpR!   t   nbin(   R'   R   t   nobst   IQRt   width(    (    sE   lib/python2.7/site-packages/statsmodels/sandbox/stats/stats_dhuard.pyt   optimize_binningì   s    %N(	   t   __name__t
   __module__t   __doc__R)   R*   R   R+   R-   R6   (    (    (    sE   lib/python2.7/site-packages/statsmodels/sandbox/stats/stats_dhuard.pyR   ›   s   	)		
t   __main__id   i   i   g      à?i2   t   ki   g      Ð?g      è?g      à¿g      Ð¿i    iô  i   t   sg¸…ëQ¸ž?s   negative densitys   (np.diff(ppfs)).min()s   (np.diff(cdf_ongrid)).min()(4   R9   t
   __future__R    t   scipy.interpolateR   t   numpyR   R   R   R   t   objectR   R7   t   matplotlib.pyplott   pyplott   pltR3   t   randomt   randnt   xt   examplest   printt   linspaceR   R    t   xsuppt   post   plott   InterpolatedUnivariateSplineR   t   empR   t   xit   derivativest   pdfempt   figuret
   cdf_ongridt   stept   difft   xsupp2t   xsot   xst   histdR6   R+   R!   R-   R&   t   ppfst   UnivariateSplinet   ppfempt   ppfe(    (    (    sE   lib/python2.7/site-packages/statsmodels/sandbox/stats/stats_dhuard.pyt   <module>R   sn   		$d	
0/


*!
0
3"!0
6
