B
    ZY                 @   s8  d dl Zd dlmZ d dlZd dlmZmZ d dlZd dl	m
Z
 d dlmZmZ d dlmZmZmZ d dlmZ dd	 Zd
d Zdd Zdd Zdd Zdd Zdd Zdd ZG dd deZG dd deZedkr4d dlZd dl m!Z! ej"ddd d!d"d#d$gd%Z#e!d&e#d'$ Z%e!d(e#d'$ Z&ee%d)d*Z'dS )+    N)stats)	DataFrameIndex)OLS)lrangelmap)_remove_intercept_patsy_has_intercept_intercept_idx)summary2c             C   sp   |d kr|   S |dkr$| j}| jS |dkr8| j}| jS |dkrL| j}| jS |dkr`| j}| jS t	d| d S )NZhc0Zhc1Zhc2Zhc3z robust options %s not understood)
Z
cov_paramsZHC0_seZcov_HC0ZHC1_seZcov_HC1ZHC2_seZcov_HC2ZHC3_seZcov_HC3
ValueError)modelrobustZse r   6lib/python3.7/site-packages/statsmodels/stats/anova.py_get_covariance   s    r   c             K   s2  | dd}| dd}| dd}| dd}|r<| }| jj}| jj}|jd }| jj}	| jjj}
| jj	}t
|
jt|
 d }d	| }d
dd||g}tt|df|d}|dkrt| ||||
|||||
S |dkrt| |
||||S |dkr
t| |
||||S |dkrtdntdt| dS )a9  
    Anova table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model
    typ : int or str {1,2,3} or {"I","II","III"}
        Type of sum of squares to use.

    **kwargs**

    scale : float
        Estimate of variance, If None, will be estimated from the largest
    model. Default is None.
        test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.
    testFscaleNtyp   r   r   zPR(>%s)dfsum_sqmean_sq   )columns)r   I)   ZII)   ZIII)   ZIVzType IV not yet implementedzType %s not understood)getlowerr   endogexogshapeZendog_namesdatadesign_info
exog_nameslentermsr	   r   npzerosanova1_lm_singleanova2_lm_singleanova3_lm_singleNotImplementedr   str)r   kwargsr   r   r   r   r"   r#   nobsZresponse_namer&   r'   n_rowspr_testnamestabler   r   r   anova_single!   s8    




r7   c
                s  t | dd}
|
dkr2tj|\}}t|j|}
tt jt j	f} fdd j
D }x t|D ]\}}d|||f< qjW t||
d }t }||  }t j
}||  }| }t|dg |_tj||  d|f |j|dd	gf< | j| jf|jdd	dgf< |d
krv|d	 |d  | j| j  ||< tj|d
 |d | j||< tjtjf|jd||gf< |d	 |d  |d< |S )a  
    Anova table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model

    **kwargs**

    scale : float
        Estimate of variance, If None, will be estimated from the largest
    model. Default is None.
        test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.
    effectsNc                s   g | ]}  |qS r   )slice).0name)r&   r   r   
<listcomp>y   s    z$anova1_lm_single.<locals>.<listcomp>r   r   Residualr   r   r   r   )getattrr*   linalgqrdotTr+   r(   r)   Zcolumn_names
term_names	enumerater
   arraytolistr   indexZc_sumlocssrdf_residr   fsfnan)r   r"   r#   r2   r&   r6   r3   r   r4   r   r8   qrZarrslicesiZslice_r   idxrC   rG   r   )r&   r   r,   \   s2    

(
r,   c             C   s  |j dd }t|}dd||g}tt|df|d}t| d}	t| |}
g }g }xt|D ]\}}||}t|j	|j
}g }t|j}xZ|D ]R}t|j}||r||ks||}|t|j	|j
 |t|j	|j
 qW t| jjjd | }t| jjjd | }|jrtt||
|j}ddlm} ||\}}|jd |jd  }t|dd| df j|}n|}|jd }|d	kr| j||
d
}|j |j|j| |f< }|j|j|j| |f< ||j|j| df< ||j	 ||  q`W t |dg |_|j!t"|| jjjd d g  }|| |d  | j# | j$ }||d< | j#| j$tj%tj%f|jddd||gf< |S )a  
    Anova type II table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model

    **kwargs**

    scale : float
        Estimate of variance, If None, will be estimated from the largest
    model. Default is None.
        test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.

    Type II
    Sum of Squares compares marginal contribution of terms. Thus, it is
    not particularly useful for models with significant interaction terms.
    Nr   r   r   )r   r   r   )r?   r   )cov_pr=   )&r)   r   r   r*   r+   r   rD   r9   r   startstopsetfactorsissubsetextendeyer   r#   r$   sizerA   rB   scipyr?   r@   f_testfvaluerI   rG   pvalueappendr;   r   ilocZargsortrJ   rK   rN   )r   r&   r3   r   r4   r   
terms_infor5   r6   covZ
robust_cov	col_orderrG   rR   termcolsL1ZL2Zterm_settZ	other_setcolZLVLr?   Z
orth_compl_rP   L12rL   
test_valuerJ   r   r   r   r-      sX    






"

$r-   c             C   sR  |t |7 }|j}dd||g}tt|df|d}t| |}	g }
g }xt|D ]\}}||}t| j	j
jd | }|}|jd }|dkr| j||	d}|j |j|j| |f< }|j|j|j| |f< ||j|j| df< ||  qPW t|d	g |_|| |d  | j | j }||d< | j| jtjtjf|jd	dd||gf< |S )
Nr   r   r   )r   r   r   r   )rT   r=   )r	   r)   r   r*   r+   r   rD   r9   r[   r   r#   r$   r^   r_   rI   rG   r`   ra   r;   r   rJ   rK   rN   )r   r&   r3   r   r4   r   rc   r5   r6   rd   re   rG   rR   rf   rg   rh   rl   rP   rL   rm   rJ   r   r   r   r.      s2    


r.   c        
      O   sv  | dd}t| dkr,| d }t|f|S y|dks:tW n   tdt| Y nX | dd}| dd	}t| }d
| }dddd||g}tt|df|d}	|s| d j	}t
t| dg| |	d< t
t| dg| |	d< t|	d j |	j|	jdd	 df< |	d   |	d< |dkrr|	d |	d  | |	d< tj|	d |	d |	d |	|< tj|	| |	d  < |	S )a  
    Anova table for one or more fitted linear models.

    Parameters
    ----------
    args : fitted linear model results instance
        One or more fitted linear models
    scale : float
        Estimate of variance, If None, will be estimated from the largest
        model. Default is None.
    test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".
    typ : str or int {"I","II","III"} or {1,2,3}
        The type of Anova test to perform. See notes.
    robust : {None, "hc0", "hc1", "hc2", "hc3"}
        Use heteroscedasticity-corrected coefficient covariance matrix.
        If robust covariance is desired, it is recommended to use `hc3`.

    Returns
    -------
    anova : DataFrame
    A DataFrame containing.

    Notes
    -----
    Model statistics are given in the order of args. Models must have
    been fit using the formula api.

    See Also
    --------
    model_results.compare_f_test, model_results.compare_lm_test

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> from statsmodels.formula.api import ols
    >>> moore = sm.datasets.get_rdataset("Moore", "car", cache=True) # load
    >>> data = moore.data
    >>> data = data.rename(columns={"partner.status" :
    ...                             "partner_status"}) # make name pythonic
    >>> moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
    ...                 data=data).fit()
    >>> table = sm.stats.anova_lm(moore_lm, typ=2) # Type 2 Anova DataFrame
    >>> print(table)
    r   r   r   )r   r   z6Multiple models only supported for type I. Got type %sr   r   r   NzPr(>%s)rK   rJ   Zdf_diffZss_diff   )r   )r    r(   r7   AssertionErrorr   r0   r   r*   r+   r   r   r>   ZdiffvaluesrI   rG   r   rL   rM   rN   Zisnull)
argsr1   r   r   r   r   Zn_modelsr4   r5   r6   r   r   r   anova_lm  s6    .
&
rs   c             C   s2   t dg| }x|D ]}| | }d||< qW |S )NTF)r*   rE   )rQ   Zslices_to_excludenindrf   sr   r   r   
_not_slicef  s
    
rw   c       	      C   s\   t |||jd }|| }t| |dd|f |}|j|}t| t| }||fS )ai  
    Residual sum of squares of OLS model excluding factors in `keys`
    Assumes x matrix is orthogonal

    Parameters
    ----------
    y : array_like
        dependent variable
    x : array_like
        independent variables
    term_slices : a dict of slices
        term_slices[key] is a boolean array specifies the parameters
        associated with the factor `key`
    params : ndarray
        OLS solution of y = x * params
    keys : keys for term_slices
        factors to be excluded

    Returns
    -------
    rss : float
        residual sum of squares
    df : int
        degrees of freedom

    r   N)rw   r$   r*   subtractrA   rB   r(   )	yxterm_slicesparamskeysru   Zparams1rJ   rK   r   r   r   _ssr_reduced_modeln  s    r~   c               @   s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )AnovaRMa  
    Repeated measures Anova using least squares regression

    The full model regression residual sum of squares is
    used to compare with the reduced model for calculating the
    within-subject effect sum of squares [1].

    Currently, only fully balanced within-subject designs are supported.
    Calculation of between-subject effects and corrections for violation of
    sphericity are not yet implemented.

    Parameters
    ----------
    data : DataFrame
    depvar : string
        The dependent variable in `data`
    subject : string
        Specify the subject id
    within : a list of string(s)
        The within-subject factors
    between : a list of string(s)
        The between-subject factors, this is not yet implemented
    aggregate_func : None, 'mean', or function
        If the data set contains more than a single observation per subject
        and cell of the specified model, this function will be used to
        aggregate the data before running the Anova. `None` (the default) will
        not perform any aggregation; 'mean' is s shortcut to `numpy.mean`.
        An exception will be raised if aggregation is required, but no
        aggregation function was specified.

    Returns
    -------
    results: AnovaResults instance

    Raises
    ------
    ValueError
        If the data need to be aggregated, but `aggregate_func` was not
        specified.

    Notes
    -----
    This implementation currently only supports fully balanced designs. If the
    data contain more than one observation per subject and cell of the design,
    these observations need to be aggregated into a single observation
    before the Anova is calculated, either manually or by passing an aggregation
    function via the `aggregate_func` keyword argument.
    Note that if the input data set was not balanced before performing the
    aggregation, the implied heteroscedasticity of the data is ignored.

    References
    ----------
    .. [*] Rutherford, Andrew. Anova and ANCOVA: a GLM approach. John Wiley & Sons, 2011.

    Nc             C   s   || _ || _|| _d|kr"td|| _|d k	r8td|| _|dkrPtj| _	n|| _	|
|j|g| ds| j	d k	r|   nd}t||   d S )NCzSFactor name cannot be 'C'! This is in conflict with patsy's contrast function name.z)Between subject effect not yet supported!mean)ZsubsetzThe data set contains more than one observation per subject and cell. Either aggregate the data manually, or pass the `aggregate_func` parameter.)r%   depvarwithinr   betweenNotImplementedErrorsubjectr*   r   aggregate_funcZequalsZdrop_duplicates
_aggregate_check_data_balanced)selfr%   r   r   r   r   r   msgr   r   r   __init__  s$    


zAnovaRM.__init__c             C   s.   | j j| jg| j dd| j | j| _ d S )NF)Zas_index)r%   groupbyr   r   r   Zaggr   )r   r   r   r   r     s    zAnovaRM._aggregatec       	      C   s   d}x$| j D ]}|t| j|  9 }qW i }xlt| jjd D ]X}g }x$| j D ]}|| j| j|  qPW t|}||kr|| d ||< q@d||< q@W d}t||krt	||| }x |D ]}||| krt	|qW | jjd || krt	ddS )zraise if data is not balanced

        This raises a ValueError if the data is not balanced, and
        returns None if it is balance

        Return might change

        r   r   zData is unbalanced.z9There are more than 1 element in a cell! Missing factors?N)
r   r(   r%   uniqueranger$   ra   rb   tupler   )	r   Zfactor_levelsZwiZ
cell_countrG   keyrj   Zerror_messagecountr   r   r   r     s*    	
zAnovaRM._check_data_balancedc             C   s  | j | j j}dd | jD }d| j }||g }tjd|| j d}|jj	}x<|D ]4}t
dg|jd  }d||| < t
|||< qVW d	|g}	t||	|jd }|d
d
|f }t||}
|
 }|
j|jd k rtdx|	D ]}|| qW x|D ]}|| | ||< qW |j}|j}|j}tg g g g d}x|D ]}| j|krL|dkrLt|||||g\}}|| }|| | }|d	|d
d ks|d	 | |kr|| }|}n2t|||||d	 | g\}}|| }|| | }|| }tj|||}|dddd}||j|df< ||j|df< ||j|df< ||j|df< qLW t|jd
d
ddddgf S )zwestimate the model and compute the Anova table

        Returns
        -------
        AnovaResults instance

        c             S   s   g | ]}d | qS )z
C(%s, Sum)r   )r:   rR   r   r   r   r<     s    zAnovaRM.fit.<locals>.<listcomp>z
C(%s, Sum)*)r%   Fr   T:Nz$Independent variables are collinear.)zF ValuezNum DFzDen DFzPr > FZ	Interceptro   zC( z, Sum)zF ValuezNum DFzDen DFzPr > Fr   r   r   ) r%   r   rq   r   r   patsyZdmatrixjoinr&   Zterm_name_slicesr*   rE   r$   rw   r   fitZrankr   popr|   rK   rJ   pdr   r~   r   rL   rM   replacerI   AnovaResultsrb   )r   ry   r   r   rX   rz   r{   r   ru   Zterm_excluder   ZresultsrR   r|   rK   rJ   anova_tableZssr1Z	df_resid1Zdf1ZmsmZmseZdf2r   prf   r   r   r   r     sb    





zAnovaRM.fit)NNN)__name__
__module____qualname____doc__r   r   r   r   r   r   r   r   r     s   7 
"r   c               @   s(   e Zd ZdZdd Zdd Zdd ZdS )	r   zX
    Anova results class

    Attributes
    ----------
    anova_table : DataFrame
    c             C   s
   || _ d S )N)r   )r   r   r   r   r   r   \  s    zAnovaResults.__init__c             C   s   |    S )N)summary__str__)r   r   r   r   r   _  s    zAnovaResults.__str__c             C   s"   t  }|d || j |S )zdcreate summary results

        Returns
        -------
        summary : Summary instance

        ZAnova)r   ZSummaryZ	add_titleZadd_dfr   )r   Zsummr   r   r   r   b  s    
zAnovaResults.summaryN)r   r   r   r   r   r   r   r   r   r   r   r   T  s   r   __main__)olsz	moore.csv,r   Zpartner_statusZ
conformityZ	fcategoryZfscore)Z	delimiterZskiprowsr5   z5conformity ~ C(fcategory, Sum)*C(partner_status, Sum))r%   z#conformity ~ C(partner_status, Sum)r   )r   )(Znumpyr*   r]   r   Zpandasr   r   r   r   Z#statsmodels.regression.linear_modelr   Zstatsmodels.compat.pythonr   r   Z statsmodels.formula.formulatoolsr   r	   r
   Zstatsmodels.iolibr   r   r7   r,   r-   r.   rs   rw   r~   objectr   r   r   Zstatsmodels.formula.apir   Z
read_tableZmoorer   Zmoore_lmZmooreBr6   r   r   r   r   <module>   s:   ;7X'T# D

	