B
    ZE                 @   s   d Z ddlmZmZmZmZmZmZ ddlZ	ddl
m  mZ ddlmZ ddlmZ ddlZddlmZ ddlmZmZ ddlmZ d-dd	Zd.ddZd/ddZd0ddZdd Z d1ddZ!dd Z"dd Z#dd Z$d2dd Z%d!d" Z&d#d$ Z'd%d& Z(d'd( Z)G d)d* d*e*Z+d3d+d,Z,dS )4z
Utility functions models code
    )reducelziplmapasstr2rangelongN)svdvals)webuse)_is_using_pandas_is_recarray)np_matrix_rankc             C   s0   i }x&t | D ]\}}||| |i qW |S )zd
    Helper function to create a dictionary mapping a column number
    to the name in tmp_arr.
    )	enumerateupdate)tmp_arroffsetcol_mapiZcol_name r   6lib/python3.7/site-packages/statsmodels/tools/tools.py_make_dictnames   s    r      c             C   s   t | } | jdkr$| dddf } |dk	rt |}|jdkrP|dddf }t t | | t || }| | || fS t | | }| | S dS )a  
    Returns views on the arrays Y and X where missing observations are dropped.

    Y : array-like
    X : array-like, optional
    axis : int
        Axis along which to look for missing observations.  Default is 1, ie.,
        observations in rows.

    Returns
    -------
    Y : array
        All Y where the
    X : array

    Notes
    -----
    If either Y or X is 1d, it is reshaped to be 2d.
    r   N)npasarrayndimarrayZlogical_andisnanany)YXaxisZkeepidxr   r   r   drop_missing   s    



r    Fc       
         s  t  ttfr@yt dks t d  W n   tdY nX | jjsV| jt	j
krP srt	| jdkrrtdt  ttfr| jj    dkr| jjrt| jjdkr| jjd  t	|   }d}|   jdkr|dddf }d}||   kt}|rt	|dd}|jjs6dd	 t	|D }n"|jjrXd
d	 t	| D } dkry| jjd  W n   d Y nX  fdd	|D }|dkr.t| jdkr|jd |jd k rt	|dd}t||jjgt| }t	jtt| |dt| S tj|  dt| t	j
kd} tj| ||dt| t	j
kd} | S | jt	jkslt | t	jst | t	jst dt  ttfr<| jd }t	| dd f }|ddt	j!f | dd f kt}|dd}|dkr|d8 }t	j"|  ddt} t	#| |f} |dkr8t$||}	| |	fS | S  dkrt	| jdkrt	| }|dddf | kt}|dd}|dkr|dkrt$|}	||	fS |S t	#| |f} |dkrt$|dd}	| |	fS | S ntd  dS )a%  
    Returns a dummy matrix given an array of categorical variables.

    Parameters
    ----------
    data : array
        A structured array, recarray, or array.  This can be either
        a 1d vector of the categorical variable or a 2d array with
        the column specifying the categorical variable specified by the col
        argument.
    col : 'string', int, or None
        If data is a structured array or a recarray, `col` can be a string
        that is the name of the column that contains the variable.  For all
        arrays `col` can be an int that is the (zero-based) column index
        number.  `col` can only be None for a 1d array.  The default is None.
    dictnames : bool, optional
        If True, a dictionary mapping the column number to the categorical
        name is returned.  Used to have information about plain arrays.
    drop : bool
        Whether or not keep the categorical variable in the returned matrix.

    Returns
    --------
    dummy_matrix, [dictnames, optional]
        A matrix of dummy (indicator/binary) float variables for the
        categorical data.  If dictnames is True, then the dictionary
        is returned as well.

    Notes
    -----
    This returns a dummy variable for EVERY distinct variable.  If a
    a structured or recarray is provided, the names for the new variable is the
    old variable name - underscore - category name.  So if the a variable
    'vote' had answers as 'yes' or 'no' then the returned array would have to
    new variables-- 'vote_yes' and 'vote_no'.  There is currently
    no name checking.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm

    Univariate examples

    >>> import string
    >>> string_var = [string.ascii_lowercase[0:5],                       string.ascii_lowercase[5:10],                       string.ascii_lowercase[10:15],                       string.ascii_lowercase[15:20],                         string.ascii_lowercase[20:25]]
    >>> string_var *= 5
    >>> string_var = np.asarray(sorted(string_var))
    >>> design = sm.tools.categorical(string_var, drop=True)

    Or for a numerical categorical variable

    >>> instr = np.floor(np.arange(10,60, step=2)/10)
    >>> design = sm.tools.categorical(instr, drop=True)

    With a structured array

    >>> num = np.random.randn(25,2)
    >>> struct_ar = np.zeros((25,1), dtype=[('var1', 'f4'),('var2', 'f4'),                      ('instrument','f4'),('str_instr','a5')])
    >>> struct_ar['var1'] = num[:,0][:,None]
    >>> struct_ar['var2'] = num[:,1][:,None]
    >>> struct_ar['instrument'] = instr[:,None]
    >>> struct_ar['str_instr'] = string_var[:,None]
    >>> design = sm.tools.categorical(struct_ar, col='instrument', drop=True)

    Or

    >>> design2 = sm.tools.categorical(struct_ar, col='str_instr', drop=True)
    r   r   z%Can only convert one column at a timez)col is None and the input array is not 1dNTFc             S   s   g | ]}t |qS r   )r   ).0itemr   r   r   
<listcomp>   s    zcategorical.<locals>.<listcomp>c             S   s   g | ]}t |qS r   )r   )r!   r"   r   r   r   r#      s    varc                s   g | ]} d  | qS )_r   )r!   r"   )colr   r   r#      s    )dtype)usemask
asrecarray)datar(   r)   z$Array-like objects are not supported)r   )r   zThe index %s is not understood)%
isinstancelisttuplelenAssertionError
ValueErrorr'   names	__class__r   ZrecarrayZsqueezer   
IndexErrorintr   uniqueastypefloatZswapaxestolistshaper   strr   r   ZviewtypenprfZdrop_fieldsZappend_fieldsndarrayNotImplementedErrornewaxisdeletecolumn_stackr   )
r*   r&   Z	dictnamesZdropr   Z_swapZ	tmp_dummyZdtr   r   r   )r&   r   categoricalB   s    K 







(






rB   Tskipc             C   s   t | dst| r.ddlm} || d||dS t| }|jdkrT|dddf }n|jdkrftdtj|dd	dk}|tj	|d
kdd	M }|
 r|dkr|S |dkrtdt|jd |g}|r|n|ddd }t|S )a  
    Adds a column of ones to an array

    Parameters
    ----------
    data : array-like
        ``data`` is the column-ordered design matrix
    prepend : bool
        If true, the constant is in the first column.  Else the constant is
        appended (last column).
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data`` already has a constant. The default will return
        data without adding another constant. If 'raise', will raise an
        error if a constant is present. Using 'add' will duplicate the
        constant, if one is present.

    Returns
    -------
    data : array, recarray or DataFrame
        The original values with a constant (column of ones) as the first or
        last column. Returned value depends on input type.

    Notes
    -----
    When the input is recarray or a pandas Series or DataFrame, the added
    column's name is 'const'.
    Nr   )	add_trendc)Ztrendprependhas_constantr      z$Only implementd 2-dimensional arrays)r   g        rC   raisez data already contains a constant)r
   r   Zstatsmodels.tsa.tsatoolsrD   r   Z
asanyarrayr   r0   Zptpallr   Zonesr9   rA   )r*   rF   rG   rD   xZis_nonzero_constr   r   r   add_constant   s$    


rM   c             C   sz   t | } t |}| jdkr.| dddf } | jd |jd krTtd|jd  t | |g}t|t|krvdS dS )a    True if (Q, P) contrast `C` is estimable for (N, P) design `D`

    From an Q x P contrast matrix `C` and an N x P design matrix `D`, checks if
    the contrast `C` is estimable by looking at the rank of ``vstack([C,D])``
    and verifying it is the same as the rank of `D`.

    Parameters
    ----------
    C : (Q, P) array-like
        contrast matrix. If `C` has is 1 dimensional assume shape (1, P)
    D: (N, P) array-like
        design matrix

    Returns
    -------
    tf : bool
        True if the contrast `C` is estimable on design `D`

    Examples
    --------
    >>> D = np.array([[1, 1, 1, 0, 0, 0],
    ...               [0, 0, 0, 1, 1, 1],
    ...               [1, 1, 1, 1, 1, 1]]).T
    >>> isestimable([1, 0, 0], D)
    False
    >>> isestimable([1, -1, 0], D)
    True
    r   NzContrast should have %d columnsFT)r   r   r   r9   r0   Zvstackr   )CDnewr   r   r   isestimable$  s    


rQ   V瞯<c          	   C   s   t | } |  } t j| d\}}}t |}|jd }|jd }|t j| }x<t	t
||D ]*}	||	 |krd||	  ||	< qdd||	< qdW t t |t |ddt jjf t |}
|
|fS )z}
    Return the pinv of an array X as well as the singular values
    used in computation.

    Code adapted from numpy.
    r   r   g      ?g        N)r   r   	conjugatelinalgsvdcopyr9   Zmaximumr   r   mindot	transposeZmultiplyZcorer?   )r   ZrcondusZvtZs_origmncutoffr   Zresr   r   r   pinv_extendedM  s    



"r_   c             C   sj   t | } t j| t jd}t | j}| }|| | j| dk@ ||< d| j|  |j|< t j|j|< |S )z
    Return the reciprocal of an array, setting all entries less than or
    equal to 0 to 0. Therefore, it presumes that X should be positive in
    general.
    )r'   r   g      ?)r   r   
zeros_likefloat64r   flatnan)rL   outnansposr   r   r   recipre  s    
rg   c             C   sj   t | } t j| t jd}t | j}| }|| | j| dk@ ||< d| j|  |j|< t j|j|< |S )z
    Return the reciprocal of an array, setting all entries equal to 0
    as 0. It does not assume that X should be positive in
    general.
    )r'   r   g      ?)r   r   r`   ra   r   rb   rc   )rL   rd   re   Znon_zeror   r   r   recipr0v  s    
rh   c                s:   t j d d} fddt |D }t t |S )zF
    Erase columns of zeros: can save some time in pseudoinverse.
    rH   r   c                s   g | ]} d d |f qS )Nr   )r!   r   )matrixr   r   r#     s    zclean0.<locals>.<listcomp>)r   addr   Zflatnonzeror   rY   )ri   Zcolsumvalr   )ri   r   clean0  s    rl   c             C   s   |dkrt | }tj| dd\}}}t|}|ddd }g }x*t|D ]}||dd|| f  qJW tt|	tj
S )z
    Return a matrix whose column span is the same as X.

    If the rank of X is known it can be specified as r -- no check
    is made to ensure that this really is the rank of X.

    Nr   )Zfull_matricesrJ   )r   LrU   r   Zargsortr   appendr   rY   r6   ra   )r   rVrO   Uordervaluer   r   r   r   fullrank  s    	
rt   c             C   s   t |}d||< | |S )a  
    Unsqueeze a collapsed array

    >>> from numpy import mean
    >>> from numpy.random import standard_normal
    >>> x = standard_normal((3,4,5))
    >>> m = mean(x, axis=1)
    >>> m.shape
    (3, 5)
    >>> m = unsqueeze(m, 1, x.shape)
    >>> m.shape
    (3, 1, 5)
    >>>
    r   )r,   Zreshape)r*   r   ZoldshapeZnewshaper   r   r   	unsqueeze  s    ru   c              G   s   t dd | ddd S )a  
    Returns the dot product of the given matrices.

    Parameters
    ----------
    arrs: argument list of ndarray

    Returns
    -------
    Dot product of all arguments.

    Examples
    --------
    >>> import numpy as np
    >>> from statsmodels.tools import chain_dot
    >>> A = np.arange(1,13).reshape(3,4)
    >>> B = np.arange(3,15).reshape(4,3)
    >>> C = np.arange(5,8).reshape(3,1)
    >>> chain_dot(A,B,C)
    array([[1820],
       [4300],
       [6780]])
    c             S   s   t || S )N)r   rX   )rL   yr   r   r   <lambda>  s    zchain_dot.<locals>.<lambda>NrJ   )r   )Zarrsr   r   r   	chain_dot  s    rx   c             C   sZ   t t | |dk}t | dkt |}|| }t t | t |}t j||< |S )z
    Returns np.dot(left_matrix, right_matrix) with the convention that
    nan * 0 = 0 and nan * x = nan if x != 0.

    Parameters
    ----------
    A, B : np.ndarrays
    r   )r   rX   r   Z
nan_to_numrc   )ABZshould_be_nan_1Zshould_be_nan_2Zshould_be_nanrN   r   r   r   nan_dot  s    

r{   c             C   s   t | d| S )z
    Gets raw results back from wrapped results.

    Can be used in plotting functions or other post-estimation type
    routines.
    Z_results)getattr)Zresultsr   r   r   maybe_unwrap_results  s    r}   c                   s    e Zd ZdZ fddZ  ZS )BunchzO
    Returns a dict-like object with keys accessible via attribute lookup.
    c                s   t t| j|| | | _d S )N)superr~   __init____dict__)selfargskwargs)r2   r   r   r     s    zBunch.__init__)__name__
__module____qualname____doc__r   __classcell__r   r   )r2   r   r~     s   r~   c             C   s   | dkr| S t | d}| jdkr8|r.| | jfS | dfS n| jdkrJtd|rT| jnd}|rvt| dddf |fS t| |fS dS )a  

    Parameters
    ----------
    x : array, Series, DataFrame or None
        Input to verify dimensions, and to transform as necesary
    ndarray : bool
        Flag indicating whether to always return a NumPy array. Setting False
        will return an pandas DataFrame when the input is a Series or a
        DataFrame.

    Returns
    -------
    out : array, DataFrame or None
        array or DataFrame with 2 dimensiona.  One dimensional arrays are
        returned as nobs by 1. None is returned if x is None.
    names : list of str or None
        list containing variables names when the input is a pandas datatype.
        Returns None if the input is an ndarray.

    Notes
    -----
    Accepts None for simplicity
    NrH   zx mst be 1 or 2-dimensional.)	r
   r   columnsr0   namer   r   pdZ	DataFrame)rL   r=   Z	is_pandasr   r   r   r   
_ensure_2d  s    




r   )r   )Nr   )NFF)TrC   )rR   )N)F)-r   Zstatsmodels.compat.pythonr   r   r   r   r   r   Znumpyr   Znumpy.lib.recfunctionslibZrecfunctionsr<   Znumpy.linalgrT   rm   Zscipy.linalgr   Zpandasr   Zstatsmodels.datasetsr	   Zstatsmodels.tools.datar
   r   Zstatsmodels.compat.numpyr   r   r    rB   rM   rQ   r_   rg   rh   rl   rt   ru   rx   r{   r}   dictr~   r   r   r   r   r   <module>   s4    

'
 /
4)
	
		