
p7]c        	   @   s  d  d l  Z d  d l m Z d  d l Z d  d l m Z m Z d  d l Z d  d l	 m
 Z
 d  d l m Z d  d l m Z m Z m Z d  d l m Z d   Z d	   Z d
   Z d   Z d   Z d   Z d   Z d   Z d e f d     YZ d e f d     YZ e d k rd  d l Z d  d l m  Z  e j! d d d d d d d d g Z" e  d d e" j#   Z$ e  d  d e" j#   Z% e e$ d! d" Z& n  d S(#   iN(   t   stats(   t	   DataFramet   Index(   t   OLS(   t   lrange(   t   _remove_intercept_patsyt   _has_interceptt   _intercept_idx(   t   summary2c         C   s   | d  k r |  j   S| d k r2 |  j } |  j S| d k rN |  j } |  j S| d k rj |  j } |  j S| d k r |  j } |  j	 St
 d |   d  S(   Nt   hc0t   hc1t   hc2t   hc3s    robust options %s not understood(   t   Nonet
   cov_paramst   HC0_set   cov_HC0t   HC1_set   cov_HC1t   HC2_set   cov_HC2t   HC3_set   cov_HC3t
   ValueError(   t   modelt   robustt   se(    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyt   _get_covariance   s    
				c         K   s  | j  d d  } | j  d d  } | j  d d  } | j  d d  } | r] | j   } n  |  j j } |  j j } | j d } |  j j }	 |  j j j	 }
 |  j j
 } t |
 j  t |
  d } d | } d	 d
 d | | g } t t j | d f  d | } | d k r7t |  | | | |
 | | | | | 
 S| d k r\t |  |
 | | | |  S| d k rt |  |
 | | | |  S| d k rt d   n t d t |    d S(   s9  
    Anova table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model
    typ : int or str {1,2,3} or {"I","II","III"}
        Type of sum of squares to use.

    **kwargs**

    scale : float
        Estimate of variance, If None, will be estimated from the largest
    model. Default is None.
        test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.
    t   testt   Ft   scalet   typi   R   i    s   PR(>%s)t   dft   sum_sqt   mean_sqi   t   columnst   Ii   t   IIi   t   IIIi   t   IVs   Type IV not yet implementeds   Type %s not understoodN(   i   R$   (   i   R%   (   i   R&   (   i   R'   (   t   getR   t   lowerR   t   endogt   exogt   shapet   endog_namest   datat   design_infot
   exog_namest   lent   termsR   R   t   npt   zerost   anova1_lm_singlet   anova2_lm_singlet   anova3_lm_singlet   NotImplementedErrorR   t   str(   R   t   kwargsR   R   R   R   R*   R+   t   nobst   response_nameR/   R0   t   n_rowst   pr_testt   namest   table(    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyt   anova_single!   s8    
!c
         C   s%  t  |  d d	  }
 |
 d	 k rN t j j |  \ } } t j | j |  }
 n  t j t | j	  t | j
  f  } g  | j D] } | j |  ^ q } x* t |  D] \ } } d | | | f <q Wt j | |
 d  } t |  } | | } t j | j  } | | } | j   } t | d g  | _ t j | | j d  | f | j | d d g f <|  j |  j f | j d d d g f <| d k r| d | d |  j |  j | | <t j j | d | d |  j  | | <t j t j f | j d | | g f <n  | d | d | d <| S(
   s  
    Anova table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model

    **kwargs**

    scale : float
        Estimate of variance, If None, will be estimated from the largest
    model. Default is None.
        test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.
    t   effectsi   i   t   ResidualR    R!   R   R"   N(   t   getattrR   R3   t   linalgt   qrt   dott   TR4   R1   R2   t   column_namest
   term_namest   slicet	   enumerateR   t   arrayt   tolistR   t   indext   c_t   sumt   loct   ssrt   df_residR    t   ft   sft   nan(   R   R*   R+   R;   R/   R@   R=   R   R>   R   RB   t   qt   rt   arrt   namet   slicest   it   slice_R!   t   idxRJ   RO   (    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyR5   ]   s2    '%4%(c         C   s  | j  } t |  } d d | | g } t t j | d f  d | } t |  d	  }	 t |  |  }
 g  } g  } xat |  D]S\ } } | j |  } t	 | j
 | j  } g  } t | j  } x | D]{ } t | j  } | j |  r | | k r | j |  } | j t	 | j
 | j   | j t	 | j
 | j   q q Wt j |  j j j d  | } t j |  j j j d  | } | j r!t j t j | |
  | j  } d d l m } | j |  \ } } | j d | j d } t j | d	 d	  | d	  f j |  } n | } | j d } | d
 k r|  j | d |
 } | j | j | j | | f <} | j | j | j | | f <n  | | j | j | d f <| j | j
  | j | j     q Wt! | d g  | _ | j" t j# | |  j j j d d g  } | | | d |  j$ |  j% } | | d <|  j$ |  j% t j& t j& f | j d d d | | g f <| S(   s  
    Anova type II table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model

    **kwargs**

    scale : float
        Estimate of variance, If None, will be estimated from the largest
    model. Default is None.
        test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.

    Type II
    Sum of Squares compares marginal contribution of terms. Thus, it is
    not particularly useful for models with significant interaction terms.
    R!   R    i   R#   i   i(   RE   i    NR   t   cov_pRC   ('   R2   R   R   R3   R4   R   R   RL   RK   R   t   startt   stopt   sett   factorst   issubsett   extendt   eyeR   R+   R,   t   sizeRG   RH   t   scipyRE   RF   t   f_testt   fvalueRR   RO   t   pvaluet   appendR[   R   t   iloct   argsortRS   RT   RW   (   R   R/   R=   R   R>   R   t
   terms_infoR?   R@   t   covt
   robust_covt	   col_orderRO   R]   t   termt   colst   L1t   L2t   term_sett   tt	   other_sett   colt   LVLRE   t
   orth_complt   _RY   t   L12RU   t
   test_valueRS   (    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyR6      sX    
!#  	!/! . 
+c         C   s  | t  |  7} | j } d d | | g } t t j | d f  d | } t |  |  }	 g  }
 g  } x t |  D] \ } } | j |  } t j |  j	 j
 j d  | } | } | j d } | d k r$|  j | d |	 } | j | j | j | | f <} | j | j | j | | f <n  | | j | j | d f <| j | j    qt Wt | d	 g  | _ | | | d |  j |  j } | | d <|  j |  j t j t j f | j d	 d d | | g f <| S(
   NR!   R    i   R#   i   i    R   R`   RC   (   R   R2   R   R3   R4   R   RL   RK   Rg   R   R+   R,   Rj   Rk   RR   RO   Rl   Rm   R[   R   RS   RT   RW   (   R   R/   R=   R   R>   R   Rp   R?   R@   Rq   Rs   RO   R]   Rt   Ru   Rv   R   RY   RU   R   RS   (    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyR7      s2    	! !  
+c          O   s  | j  d d  } t |   d k r; |  d } t | |  S| d k r` t d t |    n  | j  d d  } | j  d d  } t |   } d	 | } d
 d d d | | g } t t j | d f  d | }	 | s |  d j	 } n  g  |  D] }
 |
 j
 ^ q |	 d <g  |  D] }
 |
 j ^ q|	 d
 <t j |	 d
 j  |	 j |	 j d d f <|	 d j   |	 d <| d k r|	 d |	 d | |	 d <t j j |	 d |	 d |	 d
  |	 | <t j |	 | |	 d j   <n  |	 S(   s	  
    Anova table for one or more fitted linear models.

    Parameters
    ----------
    args : fitted linear model results instance
        One or more fitted linear models
    scale : float
        Estimate of variance, If None, will be estimated from the largest
        model. Default is None.
    test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".
    typ : str or int {"I","II","III"} or {1,2,3}
        The type of Anova test to perform. See notes.
    robust : {None, "hc0", "hc1", "hc2", "hc3"}
        Use heteroscedasticity-corrected coefficient covariance matrix.
        If robust covariance is desired, it is recommended to use `hc3`.

    Returns
    -------
    anova : DataFrame
        When args is a single model, return is DataFrame with columns:

        sum_sq : float64
            Sum of squares for model terms.
        df : float64
            Degrees of freedom for model terms.
        F : float64
            F statistic value for significance of adding model terms.
        PR(>F) : float64
            P-value for significance of adding model terms.

        When args is multiple models, return is DataFrame with columns:

        df_resid : float64
            Degrees of freedom of residuals in models.
        ssr : float64
            Sum of squares of residuals in models.
        df_diff : float64
            Degrees of freedom difference from previous model in args
        ss_dff : float64
            Difference in ssr from previous model in args
        F : float64
            F statistic comparing to previous model in args
        PR(>F): float64
            P-value for significance comparing to previous model in args

    Notes
    -----
    Model statistics are given in the order of args. Models must have been fit
    using the formula api.

    See Also
    --------
    model_results.compare_f_test, model_results.compare_lm_test

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> from statsmodels.formula.api import ols
    >>> moore = sm.datasets.get_rdataset("Moore", "carData", cache=True) # load
    >>> data = moore.data
    >>> data = data.rename(columns={"partner.status" :
    ...                             "partner_status"}) # make name pythonic
    >>> moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
    ...                 data=data).fit()
    >>> table = sm.stats.anova_lm(moore_lm, typ=2) # Type 2 Anova DataFrame
    >>> print(table)
    R   i   i    R$   s6   Multiple models only supported for type I. Got type %sR   R   R   s   Pr(>%s)RT   RS   t   df_difft   ss_diffi   R#   i(   i   R$   N(   R(   R1   RA   R   R9   R   R   R3   R4   R   RS   RT   t   difft   valuesRR   RO   R    RU   RV   RW   t   isnull(   t   argsR:   R   R   R   R   t   n_modelsR>   R?   R@   t   mdl(    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyt   anova_lm  s2    F

!  +c         C   s?   t  j t g |  } x" | D] } |  | } t | | <q W| S(   N(   R3   RM   t   Truet   False(   R\   t   slices_to_excludet   nt   indRt   t   s(    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyt
   _not_slice}  s
    
c   	      C   s   t  | | | j d  } | | } t j |  | d d  | f j |   } | j j |  } t |   t |  } | | f S(   si  
    Residual sum of squares of OLS model excluding factors in `keys`
    Assumes x matrix is orthogonal

    Parameters
    ----------
    y : array_like
        dependent variable
    x : array_like
        independent variables
    term_slices : a dict of slices
        term_slices[key] is a boolean array specifies the parameters
        associated with the factor `key`
    params : ndarray
        OLS solution of y = x * params
    keys : keys for term_slices
        factors to be excluded

    Returns
    -------
    rss : float
        residual sum of squares
    df : int
        degrees of freedom

    i   N(   R   R,   R3   t   subtractRG   RH   R1   (	   t   yt   xt   term_slicest   paramst   keysR   t   params1RS   RT   (    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyt   _ssr_reduced_model  s    
+t   AnovaRMc           B   s;   e  Z d  Z d d d d  Z d   Z d   Z d   Z RS(   s  
    Repeated measures Anova using least squares regression

    The full model regression residual sum of squares is
    used to compare with the reduced model for calculating the
    within-subject effect sum of squares [1].

    Currently, only fully balanced within-subject designs are supported.
    Calculation of between-subject effects and corrections for violation of
    sphericity are not yet implemented.

    Parameters
    ----------
    data : DataFrame
    depvar : string
        The dependent variable in `data`
    subject : string
        Specify the subject id
    within : a list of string(s)
        The within-subject factors
    between : a list of string(s)
        The between-subject factors, this is not yet implemented
    aggregate_func : None, 'mean', or function
        If the data set contains more than a single observation per subject
        and cell of the specified model, this function will be used to
        aggregate the data before running the Anova. `None` (the default) will
        not perform any aggregation; 'mean' is s shortcut to `numpy.mean`.
        An exception will be raised if aggregation is required, but no
        aggregation function was specified.

    Returns
    -------
    results: AnovaResults instance

    Raises
    ------
    ValueError
        If the data need to be aggregated, but `aggregate_func` was not
        specified.

    Notes
    -----
    This implementation currently only supports fully balanced designs. If the
    data contain more than one observation per subject and cell of the design,
    these observations need to be aggregated into a single observation
    before the Anova is calculated, either manually or by passing an aggregation
    function via the `aggregate_func` keyword argument.
    Note that if the input data set was not balanced before performing the
    aggregation, the implied heteroscedasticity of the data is ignored.

    References
    ----------
    .. [*] Rutherford, Andrew. Anova and ANCOVA: a GLM approach. John Wiley & Sons, 2011.

    c         C   s   | |  _  | |  _ | |  _ d | k r6 t d   n  | |  _ | d  k	 rZ t d   n  | |  _ | d k r~ t j	 |  _
 n	 | |  _
 | j | j d | g |   s |  j
 d  k	 r |  j   q d } t |   n  |  j   d  S(   Nt   CsS   Factor name cannot be 'C'! This is in conflict with patsy's contrast function name.s)   Between subject effect not yet supported!t   meant   subsets   The data set contains more than one observation per subject and cell. Either aggregate the data manually, or pass the `aggregate_func` parameter.(   R.   t   depvart   withinR   t   betweenR   R8   t   subjectR3   R   t   aggregate_funct   equalst   drop_duplicatest
   _aggregatet   _check_data_balanced(   t   selfR.   R   R   R   R   R   t   msg(    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyt   __init__  s$    						"c         C   s?   |  j  j |  j g |  j d t |  j j |  j  |  _  d  S(   Nt   as_index(   R.   t   groupbyR   R   R   R   t   aggR   (   R   (    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyR     s    	c   	      C   sX  d } x. |  j  D]# } | t |  j | j    9} q Wi  } x t |  j j d  D]r } g  } x, |  j  D]! } | j |  j | j |  qj Wt |  } | | k r | | d | | <qT d | | <qT Wd } t |  | k r t	 |   n  | | } x- | D]% } | | | k rt	 |   qqW|  j j d | | k rTt	 d   n  d S(   s   raise if data is not balanced

        This raises a ValueError if the data is not balanced, and
        returns None if it is balance

        Return might change

        i   i    s   Data is unbalanced.s9   There are more than 1 element in a cell! Missing factors?N(
   R   R1   R.   t   uniquet   rangeR,   Rm   Rn   t   tupleR   (	   R   t   factor_levelst   wit
   cell_countRO   t   keyR{   t   error_messaget   count(    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyR     s*    	!
c         C   sU  |  j  |  j j } g  |  j D] } d | ^ q } d |  j } | | g } t j d j |  d |  j  } | j j	 } xL | D]D } t
 j t g | j d  }	 t |	 | | <t
 j |	  | | <q Wd j |  g }
 t | |
 | j d  }	 | d d  |	 f } t | |  } | j   } | j | j d k  rJt d   n  x |
 D] } | j |  qQWx  | D] } | | |	 | | <qoW| j } | j } | j } d d	 d
 d g } t j t
 j d  d | } xr| D]j} |  j | k r| d k rt | | | | | g  \ } } | | } | | | } | d j | d   k sd| d | | k rw| | } | } nA t | | | | | d | g  \ } } | | } | | | } | | } t j j | | |  } | j d d  j d d  } | | j  | d f <| | j  | d	 f <| | j  | d
 f <| | j  | d f <qqWt! |  S(   sw   estimate the model and compute the Anova table

        Returns
        -------
        AnovaResults instance

        s
   C(%s, Sum)t   *R.   i   t   :Ns$   Independent variables are collinear.s   F Values   Num DFs   Den DFs   Pr > Fi    i   R#   t	   Interceptis   C(t    s   , Sum)(   i    i   ("   R.   R   R   R   R   t   patsyt   dmatrixt   joinR/   t   term_name_slicesR3   RM   R   R,   R   R   R   t   fitt   rankR   t   popR   RT   RS   t   pdR   R4   R   R    RU   RV   t   replaceRR   t   AnovaResults(   R   R   R]   R   R   Rd   R   R   R   R   t   term_excludeR   t   resultsR   RT   RS   R#   t   anova_tablet   ssr1t	   df_resid1t   df1t   msmt   mset   df2R   t   pRt   (    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyR   '  sb     !			

	

N(   t   __name__t
   __module__t   __doc__R   R   R   R   R   (    (    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyR     s   7		"R   c           B   s)   e  Z d  Z d   Z d   Z d   Z RS(   sX   
    Anova results class

    Attributes
    ----------
    anova_table : DataFrame
    c         C   s   | |  _  d  S(   N(   R   (   R   R   (    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyR   s  s    c         C   s   |  j    j   S(   N(   t   summaryt   __str__(   R   (    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyR   v  s    c         C   s-   t  j   } | j d  | j |  j  | S(   sm   create summary results

        Returns
        -------
        summary : summary2.Summary instance

        t   Anova(   R   t   Summaryt	   add_titlet   add_dfR   (   R   t   summ(    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyR   y  s    (   R   R   R   R   R   R   (    (    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyR   k  s   		t   __main__(   t   olss	   moore.csvt   skiprowsi   R?   t   partner_statust
   conformityt	   fcategoryt   fscores5   conformity ~ C(fcategory, Sum)*C(partner_status, Sum)R.   s#   conformity ~ C(partner_status, Sum)R   i   ('   t   numpyR3   Ri   R    t   pandasR   R   R   R   t#   statsmodels.regression.linear_modelR   t   statsmodels.compat.pythonR   t    statsmodels.formula.formulatoolsR   R   R   t   statsmodels.iolibR   R   RA   R5   R6   R7   R   R   R   t   objectR   R   R   t   statsmodels.formula.apiR   t   read_csvt   mooreR   t   moore_lmt   mooreBR@   (    (    (    s6   lib/python2.7/site-packages/statsmodels/stats/anova.pyt   <module>   s8   		<	7	X	'	j		#		