ó
áp7]c        	   @   s5  d  Z  d d l Z d d l m Z d d l m Z d d l j j	 Z	 e
 d „ Z e
 d „ Z e
 d „ Z e d k r1d d l j Z d	 d
 g d Z d Z d Z e j d e j j e ƒ ƒ Z e j e d e
 ƒZ e d
 k rd e d e e j j e ƒ Z n  e d	 k r1d e e e j j e ƒ Z n  d GHe j e e ƒ d' GHd e j e e ƒ f GHd e j e e ƒ f GHd e j e e ƒ f GHe j  e e d d ƒ\ Z! Z" Z# e j$ e d e" ƒ\ Z% Z& e j$ e d e# ƒ\ Z' Z( d e	 j) e% d e e' d e d e! d e d e j* ƒf GHd e e e ƒ f GHd e e e d e+ ƒf GHe e e d d e
 ƒ\ Z, \ Z- Z. Z/ Z0 Z1 Z2 d e, f GHd e2 j3 ƒ  f GHe e e d  d e
 ƒ\ Z, \ Z- Z. Z/ Z0 Z1 Z2 d  GHd e, f GHd e2 j3 ƒ  f GHe j e ƒ Z4 e j e ƒ Z5 e4 e d e j6 d d! d" d# d$ d g ƒ j7 e8 ƒ Z9 e5 e d e j6 d d! d" d# d$ d g ƒ j7 e8 ƒ Z: e e e e9 e: f d e
 ƒ\ Z, \ Z- Z. Z/ Z0 Z1 Z2 d% GHd e, f GHd e2 j3 ƒ  f GHd Z; e; r1d d l< j= Z> e> j? e e d& ƒ e j@ e e ƒ jA ƒ  ZB e> j? e eB jC ƒ q1n  d S((   sæ  using multivariate dependence and divergence measures

The standard correlation coefficient measures only linear dependence between
random variables.
kendall's tau measures any monotonic relationship also non-linear.

mutual information measures any kind of dependence, but does not distinguish
between positive and negative relationship


mutualinfo_kde and mutualinfo_binning follow Khan et al. 2007

Shiraj Khan, Sharba Bandyopadhyay, Auroop R. Ganguly, Sunil Saigal,
David J. Erickson, III, Vladimir Protopopescu, and George Ostrouchov,
Relative performance of mutual information estimation methods for
quantifying the dependence among short and noisy data,
Phys. Rev. E 76, 026209 (2007)
http://pre.aps.org/abstract/PRE/v76/i2/e026209


iÿÿÿÿN(   t   stats(   t   gaussian_kdec         C   s  t  | ƒ } t  |  ƒ | k s- t d ƒ ‚ n  t j | t ƒ } t j |  t ƒ }  t j |  | f ƒ } t | ƒ | ƒ } t |  ƒ |  ƒ } t | ƒ | ƒ } t j | ƒ t j | ƒ t j | ƒ } | j ƒ  | }	 | rÿ t j	 d t j
 d |	 ƒ ƒ }
 |
 S|	 Sd S(   sC   mutual information of two random variables estimated with kde

    s+   both data arrays need to have the same sizeg      ð?iþÿÿÿN(   t   lent
   ValueErrort   npt   asarrayt   floatt   vstackR   t   logt   sumt   sqrtt   exp(   t   yt   xt   normedt   nobst   yxt   kde_xt   kde_yt   kde_yxt   mi_obst   mit	   mi_normed(    (    sL   lib/python2.7/site-packages/statsmodels/sandbox/distributions/mv_measures.pyt   mutualinfo_kde   s    ) c   	      C   sÛ   t  | ƒ } t j | t ƒ } t j |  t ƒ }  t | j ƒ | j ƒ } t |  j ƒ | j ƒ } t j | ƒ t j | ƒ } t  | ƒ | k r t d ƒ ‚ n  | j ƒ  } | rÓ t j	 d t j
 d | ƒ ƒ } | S| Sd S(   sC   mutual information of two random variables estimated with kde

    s   Wrong number of observationsg      ð?iþÿÿÿN(   R   R   R   R   R   t   TR   R   t   meanR
   R   (	   R   R   R   R   R   R   R   R   R   (    (    sL   lib/python2.7/site-packages/statsmodels/sandbox/distributions/mv_measures.pyt   mutualinfo_kde_2sample4   s     c         C   sf  t  | ƒ } t  |  ƒ | k s- t d ƒ ‚ n  t j | t ƒ } t j |  t ƒ }  | d k rt j |  ƒ } t j | ƒ } t j d | ƒ } t j d d d | ƒ } | d | j t	 ƒ }	 d t j
 | j ƒ }
 |
 d c d 8<| |	 |
 } | |	 |
 } nE t j | ƒ d k r'| } | } n! t  | ƒ d k rH| \ } } n  t j | d	 | ƒ\ } } t j |  d	 | ƒ\ } } t j |  | d	 | | f ƒ\ } } } | d | } | d | } | d | } | t j | d
 ƒ t j | ƒ d d … d f t j | ƒ } | j ƒ  } | r^t j d t j d | ƒ ƒ } | | | | | | f | f S| Sd S(   s0  mutual information of two random variables estimated with kde



    Notes
    -----
    bins='auto' selects the number of bins so that approximately 5 observations
    are expected to be in each bin under the assumption of independence. This
    follows roughly the description in Kahn et al. 2007

    s+   both data arrays need to have the same sizet   autog      @i    i   g      ð?gíµ ÷Æ°>i   t   binsg»½×Ùß|Û=Niþÿÿÿgíµ ÷ÆÀ>(   R   R   R   R   R   t   sortR
   t   linspacet   astypet   intt   onest   shapet   sizet	   histogramt   histogram2dR   t   NoneR	   R   (   R   R   R   R   R   t   yst   xst   qbin_sqrt	   quantilest   quantile_indext   shiftt   binsyt   binsxt   fxt   fyt   fyxt   pyxt   pxt   pyR   R   R   (    (    sL   lib/python2.7/site-packages/statsmodels/sandbox/distributions/mv_measures.pyt   mutualinfo_binnedJ   s@    	'A t   __main__t   lineart	   quadratici   iÈ   i   i   t   prependi    t   correlationt   pearsonrt	   spearmanrt
   kendalltauR   i   t
   mutualinfog      ð?gVçž¯Ò<t   logbases   mutualinfo_kde normeds   mutualinfo_kde       R   s   mutualinfo_binned normeds   mutualinfo_binned       R   g      Ð?gš™™™™™Ù?g333333ã?g      è?R*   t   o(   i    i   (D   t   __doc__t   numpyR   t   scipyR    t   scipy.statsR   t   statsmodels.sandbox.infotheot   sandboxt   infotheot   TrueR   R   R5   t   __name__t   statsmodels.apit   apit   smt   funtypeR   t   sigR   t   randomt   randnR   t   add_constantt   exogR   t   corrcoefR;   R<   R=   R%   t   pxyR.   R-   R$   R3   t   binsx_R4   t   binsy_R>   t   et   FalseR   t   pyx2t   py2t   px2t   binsy2t   binsx2R   R	   R'   R(   t   arrayR   R    t   byt   bxt   doplott   matplotlib.pyplott   pyplott   pltt   plott   OLSt   fitt   olsrest   fittedvalues(    (    (    sL   lib/python2.7/site-packages/statsmodels/sandbox/distributions/mv_measures.pyt   <module>   sf   B%!! 00666