ó
áp7]c        
   @  s®  d  Z  d d l m Z d d l m Z d d l Z d d l j Z	 e
 d „ Z d „  Z d „  Z d Z d	 Z d
 „  Z d „  Z d „  Z e d k rªd Z e j j d d e d f ƒj d e f d e f d e f d e f g ƒ Z e j j d e d f ƒ j d e f d e f g ƒ Z d d l Z e j j j  e e f d e! ƒZ" e j# e d f e" ƒ Z$ x" e j% j& D] Z' e e' e$ e' <qkWx" e j% j& D] Z' e e' e$ e' <qWe d e$ ƒ \ Z( Z& e d e$ ƒ \ Z( Z& e j) g  e& D] Z* e( e* ^ qßƒ Z+ e+ j, d ƒ d e j j d e ƒ Z- e	 j. e- e+ ƒ j/ ƒ  Z0 e1 e0 j2 ƒ e1 e e e0 ƒ ƒ e j) g  e d e& ƒ D] Z* e( e* ^ qmƒ Z+ e+ j, d ƒ d e j j d e ƒ Z- e	 j. e- e+ ƒ j/ ƒ  Z0 e1 e0 j2 ƒ e1 e e e0 ƒ ƒ e j% d e f d  e f d! e f d" e f d# e f d$ e f d% e f d& e f g ƒ Z3 e j4 d' e3 d( d) d* e! ƒZ5 e1 d( g  e5 j% j& D] Z6 e5 j7 e6 j, ƒ  ^ qmƒ e5 j7 j e8 ƒ Z9 e9 j: d e; e5 j% j& ƒ ƒ j< d ƒ Z= e5 e= d d … f j> Z? e1 e? j@ ƒ e1 e? j% ƒ e d+ e? ƒ \ ZA ZB e j) g  e d, eB ƒ D] Z* eA e* ^ q%ƒ ZC e? d& ZD e	 j. eD eC ƒ j/ ƒ  ZE e1 eE j2 ƒ e1 e e eE ƒ ƒ d- jF e5 j% j& d  ƒ ZG e d. e? ƒ \ ZH ZI e j) g  e d, eI ƒ D] Z* eH e* ^ qÅƒ ZJ e? d& ZK e	 j. eK eJ ƒ j/ ƒ  ZL e1 eL j2 ƒ e1 e e eL ƒ ƒ x† eI D]{ ZM e1 d/ eM f ƒ e j) g  e eM eI ƒ D] Z* eH e* ^ qTƒ ZN e? d& ZO e	 j. eO eN ƒ j/ ƒ  ZP e1 e e eP ƒ ƒ q(Wn  d S(0   s   convenience functions for ANOVA type analysis with OLS

Note: statistical results of ANOVA are not checked, OLS is
checked but not whether the reported results are the ones used
in ANOVA

includes form2design for creating dummy variables

TODO:
 * ...
 *

iÿÿÿÿ(   t   print_function(   t   lmapNc         C  s   |  j  ƒ  }  t j |  ƒ } | rD |  d d … d f | k j t ƒ S|  d d … d f | k j t ƒ d d … d d … f Sd S(   s|   convert array of categories to dummy variables
    by default drops dummy variable for last category
    uses ravel, 1d onlyNiÿÿÿÿ(   t   ravelt   npt   uniquet   Nonet   astypet   int(   t   xt	   returnallt   groups(    (    sK   lib/python2.7/site-packages/statsmodels/sandbox/regression/try_ols_anova.pyt
   data2dummy   s
    #c         C  sl   t  j t t |  j ƒ  ƒ ƒ } |  | d d … d d d … f k j d ƒ j j t	 ƒ d d … d d … f S(   sÝ   creates product dummy variables from 2 columns of 2d array

    drops last dummy variable, but not from each category
    singular with simple dummy variable but not with constant

    quickly written, no safeguards

    Niÿÿÿÿ(
   R   R   R   t   tuplet   tolistR   t   allt   TR   R   (   R   R
   (    (    sK   lib/python2.7/site-packages/statsmodels/sandbox/regression/try_ols_anova.pyt   data2proddummy    s    c         C  sB   | j  d k r( | d d … d f } n  t |  d t ƒ} | | S(   sý   create dummy continuous variable

    Parameters
    ----------
    x1 : 1d array
        label or group array
    x2 : 1d array (float)
        continuous variable

    Notes
    -----
    useful for group specific slope coefficients in regression
    i   NR	   (   t   ndimR   R   t   True(   t   x1t   x2t   dummy(    (    sK   lib/python2.7/site-packages/statsmodels/sandbox/regression/try_ols_anova.pyt   data2groupcont/   s    sW  
ANOVA statistics (model sum of squares excludes constant)
Source    DF  Sum Squares   Mean Square    F Value    Pr > F
Model     %(df_model)i        %(ess)f       %(mse_model)f   %(fvalue)f %(f_pvalue)f
Error     %(df_resid)i     %(ssr)f       %(mse_resid)f
CTotal    %(nobs)i    %(uncentered_tss)f     %(mse_total)f

R squared  %(rsquared)f
s]  
ANOVA statistics (model sum of squares includes constant)
Source    DF  Sum Squares   Mean Square    F Value    Pr > F
Model     %(df_model)i      %(ssmwithmean)f       %(mse_model)f   %(fvalue)f %(f_pvalue)f
Error     %(df_resid)i     %(ssr)f       %(mse_resid)f
CTotal    %(nobs)i    %(uncentered_tss)f     %(mse_total)f

R squared  %(rsquared)f
c         C  s‰   i  } | j  |  j ƒ d d d d d d d d d	 d
 d g } x! | D] } t |  | ƒ | | <qD W|  j j | d <|  j |  j | d <| S(   sj   update regression results dictionary with ANOVA specific statistics

    not checked for completeness
    t   df_modelt   df_residt   esst   ssrt   uncentered_tsst	   mse_modelt	   mse_residt	   mse_totalt   fvaluet   f_pvaluet   rsquaredt   nobst   ssmwithmean(   t   updatet   __dict__t   getattrt   modelR"   R   R   (   t   rest   adt
   anova_attrt   key(    (    sK   lib/python2.7/site-packages/statsmodels/sandbox/regression/try_ols_anova.pyt	   anovadict\   s    	c         C  sÌ  i  } g  } x³|  j  ƒ  D]¥} | d k rU t j | j d ƒ | d <| j d ƒ q d | k r | | | | <| j | ƒ q | d  d k rÆ | j  d ƒ d } t | | ƒ | | <| j | ƒ q | d  d k rA| j  d ƒ d j  d	 ƒ } t t j | | d | | d f ƒ | d
 j | ƒ <| j d
 j | ƒ ƒ q | d  d k r²| j  d ƒ d j  d	 ƒ } t	 | | d | | d ƒ | d
 j | ƒ <| j d
 j | ƒ ƒ q t
 d ƒ ‚ q W| | f S(   s  convert string formula to data dictionary

    ss : string
     * I : add constant
     * varname : for simple varnames data is used as is
     * F:varname : create dummy variables for factor varname
     * P:varname1*varname2 : create product dummy variables for
       varnames
     * G:varname1*varname2 : create product between factor and
       continuous variable
    data : dict or structured array
       data set, access of variables by name as in dictionaries

    Returns
    -------
    vars : dictionary
        dictionary of variables with converted dummy variables
    names : list
        list of names, product (P:) and grouped continuous
        variables (G:) have name by joining individual names
        sorted according to input

    Examples
    --------
    >>> xx, n = form2design('I a F:b P:c*d G:c*f', testdata)
    >>> xx.keys()
    ['a', 'b', 'const', 'cf', 'cd']
    >>> n
    ['const', 'a', 'b', 'cd', 'cf']

    Notes
    -----

    with sorted dict, separate name list wouldn't be necessary
    t   Ii    t   constt   :i   s   F:i   s   P:t   *t    s   G:s   unknown expression in formula(   t   splitR   t   onest   shapet   appendR   R   t   c_t   joinR   t
   ValueError(   t   sst   datat   varst   namest   itemt   v(    (    sK   lib/python2.7/site-packages/statsmodels/sandbox/regression/try_ols_anova.pyt   form2designm   s.    $6,c         C  s/   | } x! |  j  ƒ  D] } | j | ƒ q W| S(   sw   drop names from a list of strings,
    names to drop are in space delimeted list
    does not change original list
    (   R2   t   remove(   R9   t   lit   newliR=   (    (    sK   lib/python2.7/site-packages/statsmodels/sandbox/regression/try_ols_anova.pyt   dropnameª   s    t   __main__iè  i   t   sizei   t   at   bt   ct   di   t   et   ft   flatteni   s   I a F:b P:c*ds   I a F:b P:c*d G:a*e fg{®Gáz„?s   ae ft   breedt   sext   littert   pent   pigt   aget   baget   ys   dftest3.datat   missingt   .t   usemasks   I F:sex ageR1   t    s'   I F:breed F:sex F:litter F:pen age bages   
Results dropping(Q   t   __doc__t
   __future__R    t   statsmodels.compat.pythonR   t   numpyR   t   statsmodels.apit   apit   smt   FalseR   R   R   t
   anova_str0t	   anova_strR,   R?   RC   t   __name__R"   t   randomt   randintt   viewR   t   testdataintt   normalt   floatt   testdatacontt   numpy.lib.recfunctionst   libt   recfunctionst	   zip_descrR   t   dt2t   emptyt   testdatat   dtypeR<   t   namet   xxt   column_stackt   nnt   Xt   sumRT   t   OLSt   fitt   rest1t   printt   paramst   dt_bt
   genfromtxtt   dtat   kt   maskt   boolt   mt   reshapet   lent   anyt   droprowsR:   t
   dta_use_b1R4   t   xx_b1t   names_b1t   X_b1t   y_b1t   rest_b1R7   t   allexogt   xx_b1at	   names_b1at   X_b1at   y_b1at   rest_b1at   dropnt   X_b1a_t   y_b1a_t	   rest_b1a_(    (    (    sK   lib/python2.7/site-packages/statsmodels/sandbox/regression/try_ols_anova.pyt   <module>   s€   		 
		=	
K6!&&/&!3'/
/
/
