B
    •xŠ\?ª ã               @   sä  d Z ddlmZ ddlZddlZddlZddlZddlZddlm	Z	 ddl
ZddlmZ ddlmZmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZ dd	lmZmZ dd
l m!Z!m"Z"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4m5Z5 dZ6dZ7dZ8dZ9dZ:dZ;de7e8e9e:e;f Z<de7e9f Z=de7e9f Z>de7e9e8e:f Z?ee<ƒedddeddddad!d"„ƒƒƒZ@d#d$d%d&d'd(d)d*d+g	ZAe d,d-d-¡ZBd.d/„ ZCd0d1„ ZDd2ZEG d3d4„ d4eFƒZGd5ZHG d6d7„ d7eFƒZId8ZJG d9d:„ d:eFƒZKd;ZLd<d=„ ZMG d>d?„ d?eNƒZOG d@dA„ dAe-ƒZPG dBdC„ dCeNƒZQG dDdE„ dEeQe3ƒZRdFdG„ ZSdHdI„ ZTdJdK„ ZUdLdM„ ZVdNdO„ ZWdPdQ„ ZXdbdSdT„ZYG dUdV„ dVeQƒZZdWdX„ Z[dYdZ„ Z\d[d\„ Z]G d]d^„ d^eNƒZ^G d_d`„ d`eZƒZ_dS )ca¯  
Module contains tools for processing Stata files into DataFrames

The StataReader below was originally written by Joe Presbrey as part of PyDTA.
It has been extended and improved by Skipper Seabold from the Statsmodels
project who also developed the StataWriter and was finally added to pandas in
a once again improved version.

You can find more information on http://presbrey.mit.edu/PyDTA and
http://www.statsmodels.org/devel/
é    )ÚOrderedDictN)Úrelativedelta)Úinfer_dtype)ÚNaTÚ	Timestamp)Úmax_len_string_array)	ÚBytesIOÚResourceWarningÚlmapÚlrangeÚlzipÚrangeÚstring_typesÚ	text_typeÚzip)ÚAppenderÚdeprecate_kwarg)Úensure_objectÚis_categorical_dtypeÚis_datetime64_dtype)ÚDatetimeIndexÚcompatÚisnaÚto_datetimeÚto_timedelta)ÚCategorical)ÚStringMixin)Ú	DataFrame)ÚSeries)ÚBaseIteratorÚ_stringify_pathÚget_filepath_or_bufferz˜Version of given Stata file is not 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)zÚconvert_dates : boolean, defaults to True
    Convert date variables to DataFrame time values.
convert_categoricals : boolean, defaults to True
    Read value labels and convert columns to Categorical/Factor variables.zcencoding : string, None or encoding
    Encoding used to parse the files. None defaults to latin-1.a6  index_col : string, optional, default: None
    Column to set as index.
convert_missing : boolean, defaults to False
    Flag indicating whether to convert missing values to their Stata
    representations.  If False, missing values are replaced with nan.
    If True, columns containing missing values are returned with
    object data types and missing values are represented by
    StataMissingValue objects.
preserve_dtypes : boolean, defaults to True
    Preserve Stata datatypes. If False, numeric data are upcast to pandas
    default types for foreign data (float64 or int64).
columns : list or None
    Columns to retain.  Columns will be returned in the given order.  None
    returns all columns.
order_categoricals : boolean, defaults to True
    Flag indicating whether converted categorical data are ordered.zzchunksize : int, default None
    Return StataReader object for iterations, returns chunks with
    given number of lines.z@iterator : boolean, default False
    Return StataReader object.aa  
Read Stata file into DataFrame.

Parameters
----------
filepath_or_buffer : string or file-like object
    Path to .dta file or object implementing a binary read() functions.
%s
%s
%s
%s
%s

Returns
-------
DataFrame or StataReader

See Also
--------
pandas.io.stata.StataReader : Low-level reader for Stata data files.
pandas.DataFrame.to_stata: Export Stata data files.

Examples
--------
Read a Stata dta file:

>>> df = pd.read_stata('filename.dta')

Read a Stata dta file in 10,000 line chunks:

>>> itr = pd.read_stata('filename.dta', chunksize=10000)
>>> for chunk in itr:
...     do_something(chunk)
zÄReads observations from Stata file, converting them into a dataframe

.. deprecated::
    This is a legacy method.  Use `read` in new code.

Parameters
----------
%s
%s

Returns
-------
DataFrame
zÎReads observations from Stata file, converting them into a dataframe

Parameters
----------
nrows : int
    Number of lines to read from data file, if None read whole file.
%s
%s

Returns
-------
DataFrame
a.  Class for reading Stata dta files.

Parameters
----------
path_or_buf : path (string), buffer or path object
    string, path object (pathlib.Path or py._path.local.LocalPath) or object
    implementing a binary read() functions.

    .. versionadded:: 0.23.0 support for pathlib, py.path.
%s
%s
%s
%s
Úencoding)Úold_arg_nameÚnew_arg_nameÚindexÚ	index_colTFc             C   sD   t | ||||||||	d	}|
s"|	r(|}nz| ¡ }W d | ¡  X |S )N)Úconvert_datesÚconvert_categoricalsr&   Úconvert_missingÚpreserve_dtypesÚcolumnsÚorder_categoricalsÚ	chunksize)ÚStataReaderÚreadÚclose)Zfilepath_or_bufferr'   r(   r"   r&   r)   r*   r+   r,   r-   ÚiteratorÚreaderÚdata© r4   ú.lib/python3.7/site-packages/pandas/io/stata.pyÚ
read_stata«   s    
r6   z%tcz%tCz%tdz%dz%twz%tmz%tqz%thz%tyi¨  é   c                sV  t jjt jj ‰‰t jt ddd¡ j‰ t jt ddd¡ j‰ˆd d d ‰ˆ d d d ‰‡‡fdd„}‡‡fdd	„}‡ ‡‡‡fd
d„}t | ¡}d}| ¡ r¶d}t	| ƒ}d||< |  
tj¡} | d¡rät}| }	|||	dƒ}
n`| d¡rt d¡ t	| tjd}
|rt|
|< |
S | d¡r>t}| }|||dƒ}
n| d¡rptj| d  }| d d }|||ƒ}
nÔ| d¡r¢tj| d  }| d d }|||ƒ}
n¢| d¡rØtj| d  }| d d d }|||ƒ}
nl| d¡rtj| d  }| d d  d }|||ƒ}
n6| d!¡r4| }t | ¡}|||ƒ}
ntd"j|d#ƒ‚|rRt|
|< |
S )$af  
    Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime

    Parameters
    ----------
    dates : Series
        The Stata Internal Format date to convert to datetime according to fmt
    fmt : str
        The format to convert to. Can be, tc, td, tw, tm, tq, th, ty
        Returns

    Returns
    -------
    converted : Series
        The converted dates

    Examples
    --------
    >>> dates = pd.Series([52])
    >>> _stata_elapsed_date_to_datetime_vec(dates , "%tw")
    0   1961-01-01
    dtype: datetime64[ns]

    Notes
    -----
    datetime/c - tc
        milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
    datetime/C - tC - NOT IMPLEMENTED
        milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
    date - td
        days since 01jan1960 (01jan1960 = 0)
    weekly date - tw
        weeks since 1960w1
        This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
        The datetime value is the start of the week in terms of days in the
        year, not ISO calendar weeks.
    monthly date - tm
        months since 1960m1
    quarterly date - tq
        quarters since 1960q1
    half-yearly date - th
        half-years since 1960h1 yearly
    date - ty
        years since 0000

    If you don't have pandas with datetime support, then you can't do
    milliseconds accurately.
    i¨  r7   é   i  iè  c                sX   |   ¡ ˆ k r,|  ¡ ˆkr,td|  | ddS t| ddƒ}tdd„ t| |ƒD ƒ|dS dS )	zú
        Convert year and month to datetimes, using pandas vectorized versions
        when the date range falls within the range supported by pandas.
        Otherwise it falls back to a slower but more robust method
        using datetime.
        éd   z%Y%m)Úformatr%   Nc             S   s   g | ]\}}t   ||d ¡‘qS )r7   )Údatetime)Ú.0ÚyÚmr4   r4   r5   ú
<listcomp>  s    zX_stata_elapsed_date_to_datetime_vec.<locals>.convert_year_month_safe.<locals>.<listcomp>)r%   )ÚmaxÚminr   Úgetattrr   r   )ÚyearÚmonthr%   )ÚMAX_YEARÚMIN_YEARr4   r5   Úconvert_year_month_safe  s    zD_stata_elapsed_date_to_datetime_vec.<locals>.convert_year_month_safec                sd   |   ¡ ˆ d k r4|  ¡ ˆkr4t| ddt|dd S t| ddƒ}dd	„ t| |ƒD ƒ}t||d
S dS )z{
        Converts year (e.g. 1999) and days since the start of the year to a
        datetime or datetime64 Series
        r7   z%Y)r:   Úd)Úunitr%   Nc             S   s,   g | ]$\}}t   |d d ¡tt|ƒd ‘qS )r7   )Údays)r;   r   Úint)r<   r=   rH   r4   r4   r5   r?     s   zW_stata_elapsed_date_to_datetime_vec.<locals>.convert_year_days_safe.<locals>.<listcomp>)r%   )r@   rA   r   r   rB   r   r   )rC   rJ   r%   Úvalue)rE   rF   r4   r5   Úconvert_year_days_safe  s    
zC_stata_elapsed_date_to_datetime_vec.<locals>.convert_year_days_safec                s°   t |ddƒ}|dkrL| ¡ ˆks,| ¡ ˆk r”‡ fdd„|D ƒ}t||dS nH|dkrŒ| ¡ ˆksl| ¡ ˆk r”‡ fdd„|D ƒ}t||dS ntd	ƒ‚tˆ ƒ‰ t||d
}ˆ | S )z¾
        Convert base dates and deltas to datetimes, using pandas vectorized
        versions if the deltas satisfy restrictions required to be expressed
        as dates in pandas.
        r%   NrH   c                s   g | ]}ˆ t t|ƒd  ‘qS ))rJ   )r   rK   )r<   rH   )Úbaser4   r5   r?   )  s    zS_stata_elapsed_date_to_datetime_vec.<locals>.convert_delta_safe.<locals>.<listcomp>)r%   Úmsc                s"   g | ]}ˆ t t|ƒd  d ‘qS )iè  )Úmicroseconds)r   rK   )r<   rH   )rN   r4   r5   r?   -  s   zformat not understood)rI   )rB   r@   rA   r   Ú
ValueErrorr   r   )rN   ZdeltasrI   r%   Úvalues)ÚMAX_DAY_DELTAÚMAX_MS_DELTAÚMIN_DAY_DELTAÚMIN_MS_DELTA)rN   r5   Úconvert_delta_safe   s    
z?_stata_elapsed_date_to_datetime_vec.<locals>.convert_delta_safeFTg      ð?)z%tcÚtcrO   )z%tCÚtCz9Encountered %tC format. Leaving in Stata Internal Format.)Údtype)z%tdÚtdz%drH   rH   )z%twÚtwé4   é   )z%tmÚtmé   )z%tqÚtqé   é   )z%thÚthé   é   )z%tyÚtyzDate fmt {fmt} not understood)Úfmt)r   rA   rC   r@   r;   rJ   ÚnpZisnanÚanyr   ÚastypeÚint64Ú
startswithÚstata_epochÚwarningsÚwarnÚobjectr   Z	ones_likerQ   r:   )Údatesrh   rG   rM   rW   Zbad_locsZhas_bad_valuesZdata_colrN   rO   Ú
conv_datesrJ   rC   rD   r4   )rS   rT   rE   rU   rV   rF   r5   Ú#_stata_elapsed_date_to_datetime_vecÌ   sj    1



rt   c                sò  | j ‰d‰ ˆ d ‰d"‡ ‡‡fdd„	}t| ƒ}| j ‰| ¡ r`t| ƒ} t| ƒrXttƒ| |< nt| |< |dkr‚|| dd}|jd }n>|d	krœt 	d
¡ | }n$|dkr¾|| dd}|jˆ }n|dkrð|| ddd}d|j
tj
  |jd  }nÐ|dkr"|| dd}d|j
tj
  |j d }nž|dkrX|| dd}d|j
tj
  |jd d  }nh|dkr’|| dd}d|j
tj
  |jdk tj¡ }n.|dkr°|| dd}|j
}ntdj|dƒ‚t|tjd}t dd¡d  }|||< t|ˆd!S )#aW  
    Convert from datetime to SIF. http://www.stata.com/help.cgi?datetime

    Parameters
    ----------
    dates : Series
        Series or array containing datetime.datetime or datetime64[ns] to
        convert to the Stata Internal Format given by fmt
    fmt : str
        The format to convert to. Can be, tc, td, tw, tm, tq, th, ty
    l     ž"R: iè  Fc                s@  i }t | jƒrŒ|r0| t }|j tj¡d |d< |s8|rVt| ƒ} | j| j |d< |d< |rŠ|  tj¡t	|d dd tj¡ }|ˆ  |d< n¨t
| dd	d
kr,|rÎ| jt }‡fdd„}t |¡}||ƒ|d< |r|  dd„ ¡}|jd |d< |j|d d  |d< |r4dd„ }t |¡}|| ƒ|d< ntdƒ‚t|ˆdS )Niè  ÚdeltarC   rD   z%Y)r:   rJ   F)Úskipnar;   c                s   ˆ | j  d| j  | j S )Ni@B )rJ   ZsecondsrP   )Úx)Ú
US_PER_DAYr4   r5   Ú<lambda>’  s    zJ_datetime_to_stata_elapsed_vec.<locals>.parse_dates_safe.<locals>.<lambda>c             S   s   d| j  | j S )Nr9   )rC   rD   )rw   r4   r4   r5   ry   —  s    r9   c             S   s   | t   | jdd¡ jS )Nr7   )r;   rC   rJ   )rw   r4   r4   r5   ry   ›  s    zZColumns containing dates must contain either datetime64, datetime.datetime or null values.)r%   )r   rR   rn   rk   ri   rl   r   rC   rD   r   r   Z	vectorizeÚapplyrQ   r   )rr   ru   rC   rJ   rH   ÚfÚvZ
year_month)Ú
NS_PER_DAYrx   r%   r4   r5   Úparse_dates_safe€  s:    




z8_datetime_to_stata_elapsed_vec.<locals>.parse_dates_safe)z%tcrX   T)ru   )z%tCrY   z'Stata Internal Format tC not supported.)z%tdr[   )z%twr\   )rC   rJ   r]   r^   )z%tmr_   )rC   r`   r7   )z%tqra   rb   rc   )z%thrd   re   rf   )z%tyrg   z-Format {fmt} is not a known Stata date format)rh   )rZ   z<ds         àr   )r%   )FFF)r%   r   rj   r   r   r   rn   ru   ro   rp   rC   rJ   rD   rk   ri   rK   rQ   r:   Úfloat64ÚstructÚunpack)rr   rh   r~   Zbad_locrH   rs   Úmissing_valuer4   )r}   rx   r%   r5   Ú_datetime_to_stata_elapsed_vecp  sT    $


 

rƒ   zÓ
Fixed width strings in Stata .dta files are limited to 244 (or fewer)
characters.  Column '%s' does not satisfy this restriction. Use the
'version=117' parameter to write the newer (Stata 13 and later) format.
c               @   s   e Zd ZdS )ÚPossiblePrecisionLossN)Ú__name__Ú
__module__Ú__qualname__r4   r4   r4   r5   r„   Ø  s   r„   z—
Column converted from %s to %s, and some data are outside of the lossless
conversion range. This may result in a loss of precision in the saved data.
c               @   s   e Zd ZdS )ÚValueLabelTypeMismatchN)r…   r†   r‡   r4   r4   r4   r5   rˆ   â  s   rˆ   zä
Stata value labels (pandas categories) must be strings. Column {0} contains
non-string labels which will be converted to strings.  Please check that the
Stata data file created has not lost information due to duplicate labels.
c               @   s   e Zd ZdS )ÚInvalidColumnNameN)r…   r†   r‡   r4   r4   r4   r5   r‰   í  s   r‰   a;  
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    {0}

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)
c       	      C   s°  d}t jt jt jft jt jt jft jt jt jft jt jt jff}t	 
dd¡d }t	 
dd¡d }x6| D ],}| | j}x~|D ]v}||d kr~| |  ¡ t  |d ¡jkr´|d }n|d }|d t jkrâ| |  ¡ d	krâtd
 }| |  |¡| |< q~W |t jkr>| |  ¡ dks(| |  ¡ dk r–| |  t j¡| |< qh|t jkr„| |  ¡ dksn| |  ¡ dk r–| |  t j¡| |< qh|t jkr| |  ¡ dkrÊ| |  ¡ dkrÊ| |  t j¡| |< n@| |  t j¡| |< | |  ¡ d	ks| |  ¡ dkr–td }qh|t jt jfkrh| |  ¡ }t  |¡rDtdj|dƒ‚|t jkrp||krp| |  t j¡| |< qh|t jkrh||krhtdj|||dƒ‚qhW |r¬t |t¡ | S )a(  Checks the dtypes of the columns of a pandas DataFrame for
    compatibility with the data types and ranges supported by Stata, and
    converts if necessary.

    Parameters
    ----------
    data : DataFrame
        The DataFrame to check and convert

    Notes
    -----
    Numeric columns in Stata must be one of int8, int16, int32, float32 or
    float64, with some additional value restrictions.  int8 and int16 columns
    are checked for violations of the value restrictions and upcast if needed.
    int64 data is not usable in Stata, and so it is downcast to int32 whenever
    the value are in the int32 range, and sidecast to float64 when larger than
    this range.  If the int64 values are outside of the range of those
    perfectly representable as float64 values, a warning is raised.

    bool columns are cast to int8.  uint columns are converted to int of the
    same size if there is no loss in precision, otherwise are upcast to a
    larger type.  uint64 is currently not supported since it is concerted to
    object in a DataFrame.
    Ú z<fs   ÿÿÿ~r   z<ds   ÿÿÿÿÿÿßr7   re   l          )Úuint64r   r9   iÿÿÿiä  i€ÿÿiäÿÿi  €lüÿÿÿ       )rl   r   z[Column {col} has a maximum value of infinity which is outside the range supported by Stata.)Úcolz]Column {col} has a maximum value ({val}) outside the range supported by Stata ({float64_max}))rŒ   ÚvalÚfloat64_max)ri   ÚboolÚint8Úuint8Úint16Zuint16Úint32Zuint32rl   r€   r   rZ   r@   Ziinfor   Úprecision_loss_docrk   rA   Úfloat32ZisinfrQ   r:   ro   rp   r„   )	r3   ÚwsZconversion_dataÚfloat32_maxrŽ   rŒ   rZ   Zc_datarL   r4   r4   r5   Ú_cast_to_stata_typesý  s\    


$$$


r˜   c               @   s(   e Zd ZdZdd„ Zdd„ Zdd„ ZdS )	ÚStataValueLabela¶  
    Parse a categorical column and prepare formatted output

    Parameters
    -----------
    value : int8, int16, int32, float32 or float64
        The Stata missing value code

    Attributes
    ----------
    string : string
        String representation of the Stata missing value
    value : int8, int16, int32, float32 or float64
        The original encoded missing value

    Methods
    -------
    generate_value_label

    c             C   sJ  |j | _|jj}ttt t|ƒ¡|ƒƒ| _	| j	j
dd„ d t d¡| _g | _g | _g | _d| _xŠ| j	D ]€}|d }t|tƒsšt|ƒ}t t |j ¡t¡ | j | j¡ |  jt|ƒd 7  _| j |d ¡ | j |¡ |  jd7  _qhW | jdkrþtdƒ‚tj| jtjd| _tj| jtjd| _d	d
| j  d
| j  | j | _d S )Nc             S   s   | d S )Nr   r4   )rw   r4   r4   r5   ry   p  s    z*StataValueLabel.__init__.<locals>.<lambda>)Úkeyr   r7   i }  zaStata value labels for a single variable must have a combined length less than 32,000 characters.)rZ   é   rb   )ÚnameÚlabnameÚcatÚ
categoriesÚlistr   ri   ÚarangeÚlenÚvalue_labelsÚsortr“   Útext_lenÚoffr   ÚtxtÚnÚ
isinstancer   Ústrro   rp   Úvalue_label_mismatch_docr:   rˆ   ÚappendrQ   Úarray)ÚselfZcatarrayrŸ   ÚvlÚcategoryr4   r4   r5   Ú__init__j  s2    

zStataValueLabel.__init__c             C   s   t jr| | j¡S |S dS )z-
        Python 3 compatibility shim
        N)r   ÚPY3ÚencodeÚ	_encoding)r®   Úsr4   r4   r5   Ú_encode‘  s    zStataValueLabel._encodec             C   s&  || _ tƒ }d}d}| t |d | j¡¡ |  t| jdd… dƒ¡}| |¡ x"t	dƒD ]}| t d|¡¡ qZW | t |d | j
¡¡ | t |d | j¡¡ x$| jD ]}| t |d |¡¡ q¬W x$| jD ]}	| t |d |	¡¡ qÒW x"| jD ]}
| |  |
| ¡¡ qøW | d	¡ | ¡ S )
a  
        Parameters
        ----------
        byteorder : str
            Byte order of the output
        encoding : str
            File encoding

        Returns
        -------
        value_label : bytes
            Bytes containing the formatted value label
        ú ó    ÚiNé    é!   rc   Úcr   )r´   r   Úwriter€   Úpackr¢   r¶   Ú
_pad_bytesr   r   r¨   r¥   r¦   r   r§   Úseekr/   )r®   Ú	byteorderr"   ÚbioZnull_stringÚ	null_byter   r¹   ÚoffsetrL   Útextr4   r4   r5   Úgenerate_value_labelš  s&    

z$StataValueLabel.generate_value_labelN)r…   r†   r‡   Ú__doc__r±   r¶   rÆ   r4   r4   r4   r5   r™   T  s   '	r™   c               @   sê  e Zd ZdZi ZdZxLeD ]DZdee e¡< x0e	ddƒD ]"Z
dede
 ƒ ee e
e ¡< q8W qW dZe dd	¡d
 Zxpe	dƒD ]dZ
e de¡d
 Zdee< e
d
kr¼ee  ede
 ƒ7  < e de de¡¡d
 e Ze de¡Zq€W dZe dd¡d
 Zxte	dƒD ]hZ
e de¡d
 Zdee< e
d
krDee  ede
 ƒ7  < e de de¡¡d
 e Ze de¡ZqW ddde de¡d
 e de¡d
 dœZdd„ Zedd„ ddZedd„ ddZdd„ Zdd„ Zd d!„ Zed"d#„ ƒZd$S )%ÚStataMissingValueax  
    An observation's missing value.

    Parameters
    -----------
    value : int8, int16, int32, float32 or float64
        The Stata missing value code

    Attributes
    ----------
    string : string
        String representation of the Stata missing value
    value : int8, int16, int32, float32 or float64
        The original encoded missing value

    Notes
    -----
    More information: <http://www.stata.com/help.cgi?missing>

    Integer missing values make the code '.', '.a', ..., '.z' to the ranges
    101 ... 127 (for int8), 32741 ... 32767  (for int16) and 2147483621 ...
    2147483647 (for int32).  Missing values for floating point data types are
    more complex but the pattern is simple to discern from the following table.

    np.float32 missing values (float in Stata)
    0000007f    .
    0008007f    .a
    0010007f    .b
    ...
    00c0007f    .x
    00c8007f    .y
    00d0007f    .z

    np.float64 missing values (double in Stata)
    000000000000e07f    .
    000000000001e07f    .a
    000000000002e07f    .b
    ...
    000000000018e07f    .x
    000000000019e07f    .y
    00000000001ae07f    .z
    )ée   iå  iåÿÿÚ.r7   é   é`   s      z<is      r   z<fs         àÚqs          z<drÉ   iå  iåÿÿ)r   r’   r“   r•   r   c             C   s0   || _ |dk rt |¡nt|ƒ}| j| | _d S )Nl        )Ú_valuer   ÚlongÚfloatÚMISSING_VALUESÚ_str)r®   rL   r4   r4   r5   r±      s    zStataMissingValue.__init__c             C   s   | j S )N)rÒ   )r®   r4   r4   r5   ry   &  s    zStataMissingValue.<lambda>z>The Stata representation of the missing value: '.', '.a'..'.z')Údocc             C   s   | j S )N)rÎ   )r®   r4   r4   r5   ry   )  s    z/The binary representation of the missing value.c             C   s   | j S )N)Ústring)r®   r4   r4   r5   Ú__unicode__,  s    zStataMissingValue.__unicode__c             C   s   dj | j| dS )Nz{cls}({obj}))ÚclsÚobj)r:   Ú	__class__)r®   r4   r4   r5   Ú__repr__/  s    zStataMissingValue.__repr__c             C   s$   t || jƒo"| j|jko"| j|jkS )N)r©   rØ   rÔ   rL   )r®   Úotherr4   r4   r5   Ú__eq__3  s    zStataMissingValue.__eq__c             C   sz   |t jkr| jd }n`|t jkr,| jd }nJ|t jkrB| jd }n4|t jkrX| jd }n|t jkrn| jd }ntdƒ‚|S )Nr   r’   r“   r•   r   zUnsupported dtype)ri   r   ÚBASE_MISSING_VALUESr’   r“   r•   r   rQ   )rÖ   rZ   rL   r4   r4   r5   Úget_base_missing_value7  s    




z(StataMissingValue.get_base_missing_valueN)r…   r†   r‡   rÇ   rÑ   ÚbasesÚbr   rÏ   r   r¹   ÚchrZfloat32_baser€   r   Z	incrementrL   r¾   Z	int_valueZfloat64_baserÜ   r±   ÚpropertyrÔ   rÕ   rÙ   rÛ   ÚclassmethodrÝ   r4   r4   r4   r5   rÈ   Ð  sP   *
&

rÈ   c               @   s   e Zd Zdd„ ZdS )ÚStataParserc             C   sŠ  t ttddƒdd„ tddƒD ƒƒdtjfdtjfdtjfdtjfd	tjfg ƒ| _	t d
tj
fdtjfdtjfdtjfdtjfdtjfgƒ| _tdƒtdƒ | _t ddddddgƒ| _d}d}d}d}dddt t d|¡d ¡t t d|¡d ¡ft t d |¡d ¡t t d |¡d ¡fd!œ| _ddddd"œ| _d#d$d%t t dd&¡d ¡t t d d'¡d ¡d!œ| _d(d)d*d+d,d-d.œ| _d/| _d S )0Nr7   éõ   c             S   s   g | ]}d t |ƒ ‘qS )Úa)rª   )r<   r¹   r4   r4   r5   r?   ]  s    z(StataParser.__init__.<locals>.<listcomp>éû   éü   éý   éþ   éÿ   i €  iöÿ  i÷ÿ  iøÿ  iùÿ  iúÿ  Zbhlfd)i €  ÚQ)iöÿ  rH   )i÷ÿ  r{   )iøÿ  Úl)iùÿ  Úh)iúÿ  rß   s   ÿÿÿþs   ÿÿÿ~s   ÿÿÿÿÿÿïÿs   ÿÿÿÿÿÿß)iÿÿÿr9   )i€ÿÿiä  )i  €iäÿÿz<fr   z<d)rß   rí   rì   r{   rH   )éb   éi   él   éf   rÉ   iå  iåÿÿs      s         àZi1Zi2Úi4Zf4Zf8Úu8)rß   rí   rì   r{   rH   rë   ),Z	aggregater­   ZbooleanÚbreakZbyteZcaseZcatchÚclassZ	colvectorÚcomplexZconstÚcontinueÚdefaultZdelegateÚdeleteZdoZdoubleÚelseZ	eltypedefÚendÚenumZexplicitZexportZexternalrÐ   ÚforZfriendZfunctionÚglobalZgotoÚifZinlinerK   ZlocalrÏ   ZNULLZpragmaZ	protectedZquadZ	rowvectorZshortZtypedefÚtypenameZvirtual)Údictr   r   ri   r   r’   r“   r•   r   Ú	DTYPE_MAPr‘   ÚDTYPE_MAP_XMLr   r    ÚTYPE_MAPÚTYPE_MAP_XMLr€   r   ÚVALID_RANGEÚOLD_TYPE_MAPPINGrÑ   ÚNUMPY_TYPE_MAPÚRESERVED_WORDS)r®   Zfloat32_minr—   Zfloat64_minrŽ   r4   r4   r5   r±   J  sd    


zStataParser.__init__N)r…   r†   r‡   r±   r4   r4   r4   r5   rã   H  s   rã   c                   sb  e Zd ZeZedddeddddL‡ fdd	„	ƒƒZd
d„ Zdd„ Zdd„ Z	dd„ Z
dd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zd d!„ Zd"d#„ Zd$d%„ Zd&d'„ Zd(d)„ Zd*d+„ Zd,d-„ Zd.d/„ Zd0d1„ Zd2d3„ Zd4d5„ Zeeƒd6d7„ ƒZd8d9„ Z dMd:d;„Z!ee"ƒeddddNd<d=„ƒƒZ#d>d?„ Z$d@dA„ Z%dBdC„ Z&dDdE„ Z'dFdG„ Z(dHdI„ Z)dJdK„ Z*‡  Z+S )Or.   r"   N)r#   r$   r%   r&   TFc                sì   t t| ƒ ¡  d| _|| _|| _|| _|| _|| _|| _	|| _
d | _|
| _d| _d| _d| _d| _d| _d| _d | _d| _ttjƒ| _t|ƒ}t|tƒr¨t|ƒ\}}	}}t|tttfƒrÆt|dƒ| _ n| !¡ }t"|ƒ| _ |  #¡  |  $¡  d S )Nr4   Fr   Úrb)%Úsuperr.   r±   Ú	col_sizesÚ_convert_datesÚ_convert_categoricalsÚ
_index_colÚ_convert_missingÚ_preserve_dtypesÚ_columnsÚ_order_categoricalsr´   Ú
_chunksizeZ_has_string_dataZ_missing_valuesÚ_can_read_value_labelsÚ_column_selector_setÚ_value_labels_readÚ
_data_readÚ_dtypeÚ_lines_readÚ_set_endiannessÚsysrÁ   Ú_native_byteorderr    r©   rª   r!   r   ÚbytesÚopenÚpath_or_bufr/   r   Ú_read_headerÚ_setup_dtype)r®   r   r'   r(   r&   r)   r*   r+   r,   r"   r-   Ú_Zshould_closeÚcontents)rØ   r4   r5   r±   »  s<    

zStataReader.__init__c             C   s   | S )z enter context manager r4   )r®   r4   r4   r5   Ú	__enter__ë  s    zStataReader.__enter__c             C   s   |   ¡  dS )z exit context manager N)r0   )r®   Úexc_typeÚ	exc_valueÚ	tracebackr4   r4   r5   Ú__exit__ï  s    zStataReader.__exit__c             C   s(   y| j  ¡  W n tk
r"   Y nX dS )z close the handle if its open N)r   r0   ÚIOError)r®   r4   r4   r5   r0   ó  s    zStataReader.closec             C   s   | j dk rd| _nd| _dS )zC
        Set string encoding which depends on file version
        év   zlatin-1zutf-8N)Úformat_versionr´   )r®   r4   r4   r5   Ú_set_encodingú  s    
zStataReader._set_encodingc                sj   ˆ j  d¡}t d|¡d dkr,ˆ  |¡ n
ˆ  |¡ tdd„ ˆ jD ƒƒdkˆ _t	‡ fdd„ˆ jƒˆ _
d S )	Nr7   r¼   r   ó   <c             S   s   g | ]}t |ƒtkr|‘qS r4   )ÚtyperK   )r<   rw   r4   r4   r5   r?   
  s    z,StataReader._read_header.<locals>.<listcomp>c                s
   ˆ   | ¡S )N)Ú	_calcsize)rw   )r®   r4   r5   ry     s    z*StataReader._read_header.<locals>.<lambda>)r   r/   r€   r   Ú_read_new_headerÚ_read_old_headerr¢   ÚtyplistZhas_string_datar
   r  )r®   Ú
first_charr4   )r®   r5   r!    s    
zStataReader._read_headerc             C   sä  | j  d¡ t| j  d¡ƒ| _| jdkr0ttƒ‚|  ¡  | j  d¡ | j  d¡dkrXdpZd| _| j  d¡ t 	| jd	 | j  d
¡¡d | _
| j  d¡ |  ¡ | _| j  d¡ |  ¡ | _| j  d¡ |  ¡ | _| j  d¡ | j  d¡ | j  d¡ t 	| jd | j  d¡¡d d | _t 	| jd | j  d¡¡d d | _t 	| jd | j  d¡¡d d | _t 	| jd | j  d¡¡d d | _t 	| jd | j  d¡¡d d | _|  ¡ | _| j  d¡ t 	| jd | j  d¡¡d d | _t 	| jd | j  d¡¡d d | _t 	| jd | j  d¡¡d d | _|  | j¡\| _| _| j  | j¡ |  ¡ | _ | j  | j¡ t 	| jd| j
d   | j  d
| j
d  ¡¡d d… | _!| j  | j¡ |  "¡ | _#| j  | j¡ |  $¡ | _%| j  | j¡ |  &¡ | _'d S )NrË   rc   )éu   r+  é   s   MSFú>ú<é   ÚHre   r   r^   é   é   é   r›   rÍ   é   é
   é	   rf   é   rí   r7   éÿÿÿÿ)(r   r/   rK   r,  rQ   Ú_version_errorr-  rÁ   r€   r   ÚnvarÚ	_get_nobsÚnobsÚ_get_data_labelÚ
data_labelÚ_get_time_stampÚ
time_stampZ_seek_vartypesZ_seek_varnamesZ_seek_sortlistZ_seek_formatsÚ_seek_value_label_namesÚ_get_seek_variable_labelsZ_seek_variable_labelsÚdata_locationÚ
seek_strlsÚseek_value_labelsÚ_get_dtypesr3  ÚdtyplistrÀ   Ú_get_varlistÚvarlistÚsrtlistÚ_get_fmtlistÚfmtlistÚ_get_lbllistÚlbllistÚ_get_variable_labelsÚ_variable_labels)r®   r4  r4   r4   r5   r1    sf    



     
   



zStataReader._read_new_headerc                sh   ˆj  |¡ ‡fdd„tˆjƒD ƒ}‡fdd„‰ ‡ fdd„|D ƒ}‡fdd„‰ ‡ fdd„|D ƒ}||fS )Nc                s*   g | ]"}t  ˆ jd  ˆ j d¡¡d ‘qS )r:  re   r   )r€   r   rÁ   r   r/   )r<   r¹   )r®   r4   r5   r?   U  s   z+StataReader._get_dtypes.<locals>.<listcomp>c                s>   | dkr| S y
ˆ j |  S  tk
r8   td | ¡ƒ‚Y nX d S )Niý  z cannot convert stata types [{0}])r  ÚKeyErrorrQ   r:   )Útyp)r®   r4   r5   r{   Y  s    
z"StataReader._get_dtypes.<locals>.fc                s   g | ]}ˆ |ƒ‘qS r4   r4   )r<   rw   )r{   r4   r5   r?   b  s    c                sB   | dkrt | ƒS y
ˆ j|  S  tk
r<   td | ¡ƒ‚Y nX d S )Niý  z cannot convert stata dtype [{0}])rª   r  r[  rQ   r:   )r\  )r®   r4   r5   r{   d  s    
c                s   g | ]}ˆ |ƒ‘qS r4   r4   )r<   rw   )r{   r4   r5   r?   m  s    )r   rÀ   r   rD  )r®   Zseek_vartypesZraw_typlistr3  rQ  r4   )r{   r®   r5   rP  R  s    
		zStataReader._get_dtypesc                s8   ˆj dkrd‰ nˆj dkrd‰ ‡ ‡fdd„tˆjƒD ƒS )Nr5  r»   r+  é   c                s   g | ]}ˆ  ˆj ˆ ¡¡‘qS r4   )Ú_null_terminater   r/   )r<   r¹   )rß   r®   r4   r5   r?   w  s   z,StataReader._get_varlist.<locals>.<listcomp>)r,  r   rD  )r®   r4   )rß   r®   r5   rR  q  s    

zStataReader._get_varlistc                sN   ˆj dkrd‰ n$ˆj dkr d‰ nˆj dkr0d‰ nd‰ ‡ ‡fdd	„tˆjƒD ƒS )
Nr+  é9   éq   é1   éh   r`   r^   c                s   g | ]}ˆ  ˆj ˆ ¡¡‘qS r4   )r^  r   r/   )r<   r¹   )rß   r®   r4   r5   r?   …  s   z,StataReader._get_fmtlist.<locals>.<listcomp>)r,  r   rD  )r®   r4   )rß   r®   r5   rU  {  s    


zStataReader._get_fmtlistc                s>   ˆj dkrd‰ nˆj dkr d‰ nd‰ ‡ ‡fdd„tˆjƒD ƒS )Nr+  r]  rð   r»   r@  c                s   g | ]}ˆ  ˆj ˆ ¡¡‘qS r4   )r^  r   r/   )r<   r¹   )rß   r®   r4   r5   r?     s   z,StataReader._get_lbllist.<locals>.<listcomp>)r,  r   rD  )r®   r4   )rß   r®   r5   rW  ‰  s    

zStataReader._get_lbllistc                sd   ˆ j dkr$‡ fdd„tˆ jƒD ƒ}n<ˆ j dkrH‡ fdd„tˆ jƒD ƒ}n‡ fdd„tˆ jƒD ƒ}|S )Nr+  c                s   g | ]}ˆ   ˆ j d ¡¡‘qS )iA  )Ú_decoder   r/   )r<   r¹   )r®   r4   r5   r?   •  s   z4StataReader._get_variable_labels.<locals>.<listcomp>rï   c                s   g | ]}ˆ   ˆ j d ¡¡‘qS )éQ   )r^  r   r/   )r<   r¹   )r®   r4   r5   r?   ˜  s   c                s   g | ]}ˆ   ˆ j d ¡¡‘qS )rº   )r^  r   r/   )r<   r¹   )r®   r4   r5   r?   ›  s   )r,  r   rD  )r®   Zvlblistr4   )r®   r5   rY  “  s    




z StataReader._get_variable_labelsc             C   sJ   | j dkr(t | jd | j d¡¡d S t | jd | j d¡¡d S d S )Nr+  rë   r›   r   ÚIrb   )r,  r€   r   rÁ   r   r/   )r®   r4   r4   r5   rE  Ÿ  s
    
zStataReader._get_nobsc             C   s    | j dkr:t | jd | j d¡¡d }|  | j |¡¡S | j dkrnt d| j d¡¡d }|  | j |¡¡S | j dkrŠ|  | j d	¡¡S |  | j d
¡¡S d S )Nr+  r:  re   r   r5  rß   r7   rï   rd  rº   )r,  r€   r   rÁ   r   r/   rc  r^  )r®   Ústrlenr4   r4   r5   rG  §  s    


zStataReader._get_data_labelc             C   sŽ   | j dkr4t d| j d¡¡d }| j |¡ d¡S | j dkrht d| j d¡¡d }|  | j |¡¡S | j dkr„|  | j d¡¡S tƒ ‚d S )	Nr+  rß   r7   r   zutf-8r5  rb  é   )r,  r€   r   r   r/   Údecoder^  rQ   )r®   rf  r4   r4   r5   rI  ´  s    


zStataReader._get_time_stampc             C   sd   | j dkr.| j d¡ | jd| j  d d S | j dkrZt | jd | j d¡¡d d S tƒ ‚d S )	Nr5  r›   r»   é   é   r+  rÍ   r   )	r,  r   r/   rK  rD  r€   r   rÁ   rQ   )r®   r4   r4   r5   rL  À  s    

z%StataReader._get_seek_variable_labelsc          
      s  t  d|¡d ˆ _ˆ jdkr$ttƒ‚ˆ  ¡  t  dˆ j d¡¡d dkrLdpNdˆ _t  dˆ j d¡¡d ˆ _	ˆ j d¡ t  ˆ jd ˆ j d¡¡d ˆ _
ˆ  ¡ ˆ _ˆ  ¡ ˆ _ˆ  ¡ ˆ _ˆ jd	krÚ‡ fd
d„tˆ j
ƒD ƒ}n^ˆ j ˆ j
¡}tj|tjd}g }x:|D ]2}|ˆ jkr$| ˆ j| ¡ n| |d ¡ qW y‡ fdd„|D ƒˆ _W n4 tk
r„   td d dd„ |D ƒ¡¡ƒ‚Y nX y‡ fdd„|D ƒˆ _W n4 tk
rÒ   td d dd„ |D ƒ¡¡ƒ‚Y nX ˆ jd	krü‡ fdd„tˆ j
ƒD ƒˆ _n‡ fdd„tˆ j
ƒD ƒˆ _t  ˆ jdˆ j
d   ˆ j dˆ j
d  ¡¡d d… ˆ _ˆ  ¡ ˆ _ˆ  ¡ ˆ _ ˆ  !¡ ˆ _"ˆ jdkr x†t  ˆ jd ˆ j d¡¡d }ˆ jd	krÄt  ˆ jd ˆ j d¡¡d }nt  ˆ jd ˆ j d¡¡d }|dkrîP ˆ j |¡ qzW ˆ j #¡ ˆ _$d S )Nrß   r   )rb  rï   rð   éo   r`  ér   és   r7   r7  r8  r:  re   rð   c                s   g | ]}t ˆ j d ¡ƒ‘qS )r7   )Úordr   r/   )r<   r¹   )r®   r4   r5   r?   á  s   z0StataReader._read_old_header.<locals>.<listcomp>)rZ   é   c                s   g | ]}ˆ j | ‘qS r4   )r  )r<   r\  )r®   r4   r5   r?   î  s    z cannot convert stata types [{0}]ú,c             s   s   | ]}t |ƒV  qd S )N)rª   )r<   rw   r4   r4   r5   ú	<genexpr>ñ  s    z/StataReader._read_old_header.<locals>.<genexpr>c                s   g | ]}ˆ j | ‘qS r4   )r  )r<   r\  )r®   r4   r5   r?   ó  s    z!cannot convert stata dtypes [{0}]c             s   s   | ]}t |ƒV  qd S )N)rª   )r<   rw   r4   r4   r5   rq  ö  s    c                s   g | ]}ˆ   ˆ j d ¡¡‘qS )r»   )r^  r   r/   )r<   r¹   )r®   r4   r5   r?   ù  s   c                s   g | ]}ˆ   ˆ j d ¡¡‘qS )r@  )r^  r   r/   )r<   r¹   )r®   r4   r5   r?   ü  s   rí   rB  rb  r¹   rb   )%r€   r   r,  rQ   rC  r-  r   r/   rÁ   ZfiletyperD  rE  rF  rG  rH  rI  rJ  r   ri   Ú
frombufferr‘   r  r¬   r3  r:   ÚjoinrQ  rS  rT  rU  rV  rW  rX  rY  rZ  ÚtellrM  )r®   r4  r3  ÚbufZtyplistbÚtpZ	data_typeZdata_lenr4   )r®   r5   r2  Í  st    













zStataReader._read_old_headerc             C   sŽ   | j dk	r| j S g }xbt| jƒD ]T\}}|| jkrV| dt|ƒ | j| j|  f¡ q | dt|ƒ dt|ƒ f¡ q W t |¡}|| _ | j S )z"Map between numpy and state dtypesNrµ   ÚS)	r  Ú	enumerater3  r  r¬   rª   rÁ   ri   rZ   )r®   rZ   r¹   r\  r4   r4   r5   r"    s    

"
zStataReader._setup_dtypec             C   s    t |ƒtkr|pt | j| ¡S )N)r/  rK   r€   ZcalcsizerÁ   )r®   rh   r4   r4   r5   r0  0  s    zStataReader._calcsizec             C   s   |  d¡d }| d¡S )Nr¸   r   zutf-8)Ú	partitionrh  )r®   rµ   r4   r4   r5   rc  4  s    zStataReader._decodec             C   s   |  d¡d }| | j¡S )Nr¸   r   )ry  rh  r´   )r®   rµ   r4   r4   r5   r^  8  s    zStataReader._null_terminatec             C   s:  | j r
d S | jdkr&d| _ tƒ | _d S | jdkr@| j | j¡ n | j| jj	 }| j | j
| ¡ d| _ tƒ | _x¾| jdkrŽ| j d¡dkrŽP | j d¡}|s P | jdkr¾|  | j d¡¡}n|  | j d¡¡}| j d	¡ t | jd
 | j d¡¡d }t | jd
 | j d¡¡d }tj| j d| ¡| jd |d}tj| j d| ¡| jd |d}t |¡}|| }|| }| j |¡}	tƒ | j|< x†t|ƒD ]z}
|
|d k r¶||
d  n|}| jdkrì|  |	||
 |… ¡| j| ||
 < n$|  |	||
 |… ¡| j| ||
 < q˜W | jdkrr| j d¡ qrW d| _ d S )Nrð   Tr5  é   s   </valrb   r»   r]  rc   re  r   rò   )rZ   Úcountr7   rf   )r  r,  r  Úvalue_label_dictr   rÀ   rO  rF  r  ÚitemsizerM  r/   r^  rc  r€   r   rÁ   ri   rr  Zargsortr   )r®   rÄ   Zslengthr   r¨   Ztxtlenr¦   r   Ziir§   r¹   rû   r4   r4   r5   Ú_read_value_labels=  s\    




&*
zStataReader._read_value_labelsc             C   s&  | j  | j¡ ddi| _x| j  d¡dkr.P | jdkrXt | jd | j  d¡¡d }nX| j  d	¡}| jd
krˆ|dd… |dd…  }n|dd… |dd …  }t d|¡d }t d| j  d¡¡d }t | jd | j  d¡¡d }| j  |¡}|dkr|dd…  	| j
¡}|| jt|ƒ< qW d S )NÚ0rŠ   rc   s   GSOr5  rë   r›   r   r`   r8  re   rb   r?  rf   ÚBr7   re  é‚   rB  )r   rÀ   rN  ÚGSOr/   r,  r€   r   rÁ   rh  r´   rª   )r®   Zv_oru  r\  ÚlengthZvar4   r4   r5   Ú_read_strlsy  s(    



zStataReader._read_strlsc             K   s*   t  d¡ | jrtdƒ‚d| _| jd|ŽS )Nz('data' is deprecated, use 'read' insteadzData has already been read.T)N)ro   rp   r  Ú	Exceptionr/   )r®   Úkwargsr4   r4   r5   r3   –  s
    
zStataReader.datac             C   s   | j | jpddS )Nr7   )Únrows)r/   r  )r®   r4   r4   r5   Ú__next__¡  s    zStataReader.__next__c             C   s   |dkr| j }| j|dS )a  
        Reads lines from Stata file and returns as dataframe

        Parameters
        ----------
        size : int, defaults to None
            Number of lines to read.  If None, reads whole file.

        Returns
        -------
        DataFrame
        N)r‡  )r  r/   )r®   Úsizer4   r4   r5   Ú	get_chunk¤  s    zStataReader.get_chunkc	          	   C   s   | j dkr2|d kr2d| _d| _|  ¡  t| jdS |d kr@| j}|d krN| j}|d kr\| j}|d krj| j	}|d krx| j
}|d kr†| j}|d kr”| j}|d kr¢| j }| jdkrÀ| jsÀd| _|  ¡  | j}	| j | j |	j }
||	j }t||
ƒ}|dkr|r|  ¡  |  ¡  t‚| j|	j }| j | j| ¡ t|| j | j ƒ}tj| j |¡|	|d}|  j|7  _| j| j kr€d| _d| _| j| jkrš| ¡  ¡ }|r¨|  ¡  t |ƒdkrÄt| jd}nt !|¡}| j|_"|d krþt #| j| | j¡}| $|¡}|d k	r:y|  %||¡}W n  t&k
r8   |  ¡  ‚ Y nX x@t'|| j(ƒD ]0\}}t)|ƒt*krH|| j+| j,dd||< qHW |  -|¡}t .| j/¡d }|j0}d}g }xŠ|D ]‚}| j/| d k	rª|j"| }|| j1}	|	t 1t2¡kr|	| j/| krd}| 3|t4|| || j/| ƒf¡ n| 3||| f¡ qªW |rDt 5t6|ƒ¡}~|  7||¡ |rÊt .t8dd	„ | j9ƒ¡d }xV|D ]N}|j"| }yt:|| | j9| ƒ||< W n  t&k
rÂ   |  ¡  ‚ Y nX qxW |rð| jd
krð|  ;|| j<| j=|¡}|s‚g }d}xn|D ]f}|| j1}	|	tj>tj?fkr0tj@}	d}n |	tjAtjBtjCfkrPtjD}	d}| 3|||  E|	¡f¡ qW |r‚t 5t6|ƒ¡}|d k	rœ| $| F|¡¡}|S )Nr   T)r+   r5  )rZ   r{  )Zconvert_dtypeFc                s   t ‡ fdd„tD ƒƒS )Nc             3   s   | ]}ˆ   |¡V  qd S )N)rm   )r<   rh   )rw   r4   r5   rq  *  s   z5StataReader.read.<locals>.<lambda>.<locals>.<genexpr>)rj   Ú_date_formats)rw   r4   )rw   r5   ry   *  s   z"StataReader.read.<locals>.<lambda>rð   )GrF  r  r  r0   r   rS  r  r  r  r  r  r  r  r,  r  r„  r  r  r}  rA   r~  ÚStopIterationr   rÀ   rM  ri   rr  r/   rÁ   r  ZbyteswapÚnewbyteorderr¢   Zfrom_recordsr+   r¡   Z	set_indexÚ_do_select_columnsrQ   r   r3  r/  rK   rz   r^  Ú_insert_strlsÚwhererQ  r%   rZ   rq   r¬   r   Ú	from_dictr   Ú_do_convert_missingr
   rV  rt   Ú_do_convert_categoricalsr|  rX  Zfloat16r•   r   r   r’   r“   rl   rk   Úpop)r®   r‡  r'   r(   r&   r)   r*   r+   r,   rZ   Zmax_read_lenZread_lenrÄ   Z
read_linesr3   ZixrŒ   r\  Zcols_Zrequires_type_conversionÚdata_formattedr¹   ZcolsZretyped_dataZconvertr4   r4   r5   r/   µ  sæ    	










 




zStataReader.readc             C   s
  xt |ƒD ]ö\}}| j| }|| jkr*q| j| \}}|| }t ||k ||k¡}	|	 ¡ s^q|rÈt |	j¡}
tj||	 dd\}}t	|tj
d}xft |ƒD ]&\}}t|ƒ}|
||k }||j|< qœW n2|j}|tjtjfkrätj}t	||d}tj||	< |||< qW d S )NT)Zreturn_inverse)rZ   )rx  r3  r  ri   Z
logical_orrj   ZargwhereZ_ndarray_valuesÚuniquer   rq   rÈ   ÚilocrZ   r•   r   Únan)r®   r3   r)   r¹   Zcolnamerh   ZnminZnmaxZseriesZmissingZmissing_locZumissingZumissing_locÚreplacementÚjZumr‚   ZlocrZ   r4   r4   r5   r’  Q  s0    



zStataReader._do_convert_missingc                sp   t ˆ dƒrtˆ jƒdkr|S xNtˆ jƒD ]@\}}|dkr:q(‡ fdd„|jd d …|f D ƒ|jd d …|f< q(W |S )Nr‚  r   rë   c                s   g | ]}ˆ j t|ƒ ‘qS r4   )r‚  rª   )r<   Úk)r®   r4   r5   r?   {  s    z-StataReader._insert_strls.<locals>.<listcomp>)Úhasattrr¢   r‚  rx  r3  r—  )r®   r3   r¹   r\  r4   )r®   r5   r  t  s    2zStataReader._insert_strlsc             C   sÜ   | j sÔt|ƒ}t|ƒt|ƒkr&tdƒ‚| |j¡}|rLtdd t|ƒ¡ ƒ‚g }g }g }g }xX|D ]P}	|j |	¡}
| 	| j
|
 ¡ | 	| j|
 ¡ | 	| j|
 ¡ | 	| j|
 ¡ qbW || _
|| _|| _|| _d| _ || S )Nz"columns contains duplicate entriesz<The following columns were not found in the Stata data set: z, T)r  Úsetr¢   rQ   Ú
differencer+   rs  r    Zget_locr¬   rQ  r3  rV  rX  )r®   r3   r+   Z
column_setZ	unmatchedrQ  r3  rV  rX  rŒ   r¹   r4   r4   r5   rŽ  ~  s0    
zStataReader._do_select_columnsc          	   C   s   t t |¡ƒ}g }xút||ƒD ]ì\}}||krøt|| |d}	g }
x8|	jD ].}||| krn|
 || | ¡ qJ|
 |¡ qJW y
|
|	_W nR tk
rØ   t|
ƒ 	¡ }t |j
|dk ƒ}dd |¡ }tdj||dƒ‚Y nX t|	|j
d}	| ||	f¡ q| ||| f¡ qW t t|ƒ¡}|S )zC
        Converts categorical columns to Categorical type.
        )Zorderedr7   zQ
--------------------------------------------------------------------------------Ú
zPValue labels for column {col} are not unique. The repeated labels are:
{repeats})rŒ   Úrepeats)r%   )r    r   Ziterkeysr   r   rŸ   r¬   rQ   r   Zvalue_countsr%   rs  r:   r   r‘  r   )r®   r3   r|  rX  r,   r£   Zcat_converted_datarŒ   ÚlabelZcat_datarŸ   r°   Zvcr   r4   r4   r5   r“    s.    
z$StataReader._do_convert_categoricalsc             C   s   | j S )z Returns data label of Stata file)rH  )r®   r4   r4   r5   rH  À  s    zStataReader.data_labelc             C   s   t t| j| jƒƒS )zkReturns variable labels as a dict, associating each variable name
        with corresponding label
        )r  r   rS  rZ  )r®   r4   r4   r5   Úvariable_labelsÄ  s    zStataReader.variable_labelsc             C   s   | j s|  ¡  | jS )zvReturns a dict, associating each variable name a dict, associating
        each value its corresponding label
        )r  r~  r|  )r®   r4   r4   r5   r£   Ê  s    zStataReader.value_labels)	TTNFTNTNN)N)NNNNNNNN),r…   r†   r‡   Ú_stata_reader_docrÇ   r   r±   r%  r)  r0   r-  r!  r1  rP  rR  rU  rW  rY  rE  rG  rI  rL  r2  r"  r0  rc  r^  r~  r„  r   Ú_data_method_docr3   rˆ  rŠ  Ú_read_method_docr/   r’  r  rŽ  r“  rH  r¢  r£   Ú__classcell__r4   r4   )rØ   r5   r.   ¸  s^   

    *	B

R<

    #
#r.   c             C   s    t | dƒr| dfS t| dƒdfS )a  
    Open a binary file or no-op if file-like

    Parameters
    ----------
    fname : string path, path object or buffer

    Returns
    -------
    file : file-like object
        File object supporting write
    own : bool
        True if the file was created, otherwise False
    r½   FÚwbT)rœ  r  )Úfnamer4   r4   r5   Ú_open_file_binary_writeÔ  s    
r©  c             C   s4   |   ¡ dkrdS |   ¡ dkr dS tdj| dƒ‚d S )N)r8  Úlittler8  )r7  Zbigr7  z"Endianness {endian} not understood)Zendian)ÚlowerrQ   r:   )Z
endiannessr4   r4   r5   r  é  s    r  c             C   s   | d|t | ƒ   S )zQ
    Takes a char string and pads it with null bytes until it's length chars
    r·   )r¢   )rœ   rƒ  r4   r4   r5   r¿   ó  s    r¿   c             C   s"   | dkrt jS tdj| dƒ‚dS )zK
    Converts from one of the stata date formats to a type in TYPE_MAP
    )rX   z%tcr[   z%tdr\   z%twr_   z%tmra   z%tqrd   z%thrg   z%tyzFormat {fmt} not implemented)rh   N)ri   r   ÚNotImplementedErrorr:   )rh   r4   r4   r5   Ú_convert_datetime_to_stata_typeú  s    r­  c             C   sz   i }xp| D ]h}| |   d¡s,d| |  | |< ||krN| | |¡| | i¡ q
t|tƒs`tdƒ‚| || | i¡ q
W |S )Nú%z0convert_dates key must be a column or an integer)rm   Úupdater%   r©   rK   rQ   )r'   rS  Znew_dictrš   r4   r4   r5   Ú_maybe_convert_to_int_keys  s    

r°  c             C   s~   | j tjkr$tt|jƒƒ}t|dƒS | tjkr2dS | tjkr@dS | tj	krNdS | tj
kr\dS | tjkrjdS tdj| dƒ‚d	S )
aö  
    Converts dtype types to stata types. Returns the byte of the given ordinal.
    See TYPE_MAP and comments for an explanation. This is also explained in
    the dta spec.
    1 - 244 are strings of this length
                         Pandas    Stata
    251 - for int8      byte
    252 - for int16     int
    253 - for int32     long
    254 - for float32   float
    255 - for double    double

    If there are dates to convert, then dtype will already have the correct
    type inserted.
    r7   rê   ré   rè   rç   ræ   z Data type {dtype} not supported.)rZ   N)r/  ri   Úobject_r   r   rR   r@   r   r•   r“   r’   r   r¬  r:   )rZ   Úcolumnr}  r4   r4   r5   Ú_dtype_to_stata_type  s    





r³  rl  c             C   sô   |dk rd}nd}|rdS | j tjkržt|dd}|dksXt|ƒdksXtd	j|jd
ƒ‚tt	|j
ƒƒ}||krˆ|dkrzdS tt|j ƒ‚dtt|dƒƒ d S | tjkr¬dS | tjkrºdS | tjkrÈdS | tjksÜ| tjkràdS tdj| dƒ‚dS )a£  
    Maps numpy dtype to stata's default format for this type. Not terribly
    important since users can change this in Stata. Semantics are

    object  -> "%DDs" where DD is the length of the string.  If not a string,
                raise ValueError
    float64 -> "%10.0g"
    float32 -> "%9.0g"
    int64   -> "%9.0g"
    int32   -> "%12.0g"
    int16   -> "%8.0g"
    int8    -> "%8.0g"
    strl    -> "%9s"
    r5  éô   iý  z%9sT)rv   )rÔ   Zunicoder   a!  Column `{col}` cannot be exported.

Only string-like object arrays containing all strings or a mix of strings and None can be exported. Object arrays containing only null values are prohibited. Other object typescannot be exported and must first be converted to one of the supported types.)rŒ   r®  r7   rµ   z%10.0gz%9.0gz%12.0gz%8.0gz Data type {dtype} not supported.)rZ   N)r/  ri   r±  r   r¢   rQ   r:   rœ   r   r   rR   Úexcessive_string_length_errorrª   r@   r   r•   r“   r   r’   r¬  )rZ   r²  Údta_versionÚ
force_strlZmax_str_lenZinferred_dtyper}  r4   r4   r5   Ú_dtype_to_default_stata_fmt:  s6    


r¸  c                   s  e Zd ZdZdZedddd?‡ fdd	„	ƒZd
d„ Zdd„ Zdd„ Z	dd„ Z
dd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zd d!„ Zd"d#„ Zd$d%„ Zd&d'„ Zd@d(d)„Zd*d+„ Zd,d-„ Zd.d/„ Zd0d1„ Zd2d3„ Zd4d5„ Zd6d7„ Zd8d9„ Zd:d;„ ZdAd=d>„Z ‡  Z!S )BÚStataWriteraÒ	  
    A class for writing Stata binary dta files

    Parameters
    ----------
    fname : path (string), buffer or path object
        string, path object (pathlib.Path or py._path.local.LocalPath) or
        object implementing a binary write() functions. If using a buffer
        then the buffer will not be automatically closed after the file
        is written.

        .. versionadded:: 0.23.0 support for pathlib, py.path.

    data : DataFrame
        Input to save
    convert_dates : dict
        Dictionary mapping columns containing datetime types to stata internal
        format to use when writing the dates. Options are 'tc', 'td', 'tm',
        'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
        Datetime columns that do not have a conversion type specified will be
        converted to 'tc'. Raises NotImplementedError if a datetime column has
        timezone information
    write_index : bool
        Write the index to Stata dataset.
    encoding : str
        Default is latin-1. Only latin-1 and ascii are supported.
    byteorder : str
        Can be ">", "<", "little", or "big". default is `sys.byteorder`
    time_stamp : datetime
        A datetime to use as file creation date.  Default is the current time
    data_label : str
        A label for the data set.  Must be 80 characters or smaller.
    variable_labels : dict
        Dictionary containing columns as keys and variable labels as values.
        Each label must be 80 characters or smaller.

        .. versionadded:: 0.19.0

    Returns
    -------
    writer : StataWriter instance
        The StataWriter instance has a write_file method, which will
        write the file to the given `fname`.

    Raises
    ------
    NotImplementedError
        * If datetimes contain timezone information
    ValueError
        * Columns listed in convert_dates are neither datetime64[ns]
          or datetime.datetime
        * Column dtype is not representable in Stata
        * Column listed in convert_dates is not in DataFrame
        * Categorical label contains more than 32,000 characters

    Examples
    --------
    >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b'])
    >>> writer = StataWriter('./data_file.dta', data)
    >>> writer.write_file()

    Or with dates
    >>> from datetime import datetime
    >>> data = pd.DataFrame([[datetime(2000,1,1)]], columns=['date'])
    >>> writer = StataWriter('./date_data_file.dta', data, {'date' : 'tw'})
    >>> writer.write_file()
    r´  r"   N)r#   r$   Túlatin-1c
       
         sŽ   t t| ƒ ¡  |d kri n|| _|| _d| _|| _|| _|	| _d| _	|  
|¡ |d kr\tj}t|ƒ| _t|ƒ| _tjtjtjdœ| _i | _d S )Nzlatin-1T)rè   rç   ræ   )r  r¹  r±   r  Ú_write_indexr´   Ú_time_stampÚ_data_labelrZ  Ú	_own_fileÚ_prepare_pandasr  rÁ   r  Ú
_byteorderr    Ú_fnameri   r“   r’   r   Ztype_convertersÚ_converted_names)
r®   r¨  r3   r'   Úwrite_indexr"   rÁ   rJ  rH  r¢  )rØ   r4   r5   r±   ¹  s    


zStataWriter.__init__c             C   s2   t jr"| j | | jp| j¡¡ n| j |¡ dS )zS
        Helper to call encode before writing to file for Python 3 compat.
        N)r   r²   Ú_filer½   r³   r´   Z_default_encoding)r®   Zto_writer4   r4   r5   Ú_writeÏ  s    zStataWriter._writec       	         s  ‡ fdd„ˆ D ƒ}|| _ g | _t|ƒs*ˆ S tj}g }xÚtˆ |ƒD ]Ì\}}|rú| j tˆ | ƒ¡ ˆ | jj	j
}|tjkr€tdƒ‚ˆ | jj	j ¡ }| ¡ ||ƒkrÚ|tjkr´tj}n|tjkrÆtj}ntj}tj||d}||ƒ||dk< | ||f¡ q@| |ˆ | f¡ q@W t t|ƒ¡S )zxCheck for categorical columns, retain categorical information for
        Stata file and convert categorical data to intc                s   g | ]}t ˆ | ƒ‘qS r4   )r   )r<   rŒ   )r3   r4   r5   r?   Ý  s    z5StataWriter._prepare_categoricals.<locals>.<listcomp>zCIt is not possible to export int64-based categorical data to Stata.)rZ   rB  )Ú_is_col_catÚ_value_labelsrj   rÈ   rÝ   r   r¬   r™   rž   ZcodesrZ   ri   rl   rQ   rR   Úcopyr@   r   r’   r“   r   r­   r   r‘  r   )	r®   r3   Zis_catrÝ   r•  rŒ   Z
col_is_catrZ   rR   r4   )r3   r5   Ú_prepare_categoricalsÙ  s2    


z!StataWriter._prepare_categoricalsc             C   s^   xX|D ]P}|| j }|tjtjfkr|tjkr:| jd }n
| jd }||  |¡||< qW |S )ztChecks floating point data columns for nans, and replaces these with
        the generic Stata for missing value (.)r{   rH   )rZ   ri   r•   r   rÑ   Úfillna)r®   r3   r¼   rZ   r™  r4   r4   r5   Ú_replace_nansÿ  s    



zStataWriter._replace_nansc             C   s   dS )zNo-op, forward compatibilityNr4   )r®   r4   r4   r5   Ú_update_strl_names  s    zStataWriter._update_strl_namesc          
   C   s   i }t |jƒ}|dd… }d}x$t|ƒD ]\}}|}t|tƒsJt|ƒ}xP|D ]H}	|	dk sd|	dkrP|	dk st|	dkrP|	dk s„|	dkrP|	d	krP| |	d	¡}qPW || jkr®d	| }|d dkrÎ|d dkrÎd	| }|dtt	|ƒd
ƒ… }||ks:xB| 
|¡dkr0d	t|ƒ | }|dtt	|ƒd
ƒ… }|d7 }qðW |||< |||< q*W ||_| jrx:t||ƒD ],\}	}
|	|
kr`| j|
 | j|	< | j|
= q`W |rg }xV| ¡ D ]J\}}y| d¡}W n ttfk
rÔ   Y nX d ||¡}| |¡ q¤W t d |¡¡}t |t¡ || _|  ¡  |S )aÌ  
        Checks column names to ensure that they are valid Stata column names.
        This includes checks for:
            * Non-string names
            * Stata keywords
            * Variables that start with numbers
            * Variables with names that are too long

        When an illegal variable name is detected, it is converted, and if
        dates are exported, the variable name is propagated to the date
        conversion dictionary
        Nr   ÚAÚZrå   Úzr  Ú9r#  rº   r7   zutf-8z{0}   ->   {1}z
    )r    r+   rx  r©   r   r   Úreplacer	  rA   r¢   r{  rª   r  r   Úitemsr³   ÚUnicodeDecodeErrorÚAttributeErrorr:   r¬   Úinvalid_name_docrs  ro   rp   r‰   rÂ  rÌ  )r®   r3   Zconverted_namesr+   Zoriginal_columnsZduplicate_var_idrš  rœ   Z	orig_namer¼   ÚoZconversion_warningÚmsgr–   r4   r4   r5   Ú_check_column_names  sX    


 


zStataWriter._check_column_namesc             C   sR   g | _ g | _x@| ¡ D ]4\}}| j t||| ƒ¡ | j  t||| ƒ¡ qW d S )N)r3  rV  Ú	iteritemsr¬   r¸  r³  )r®   r3   ÚdtypesrŒ   rZ   r4   r4   r5   Ú_set_formats_and_types_  s
    z"StataWriter._set_formats_and_typesc             C   s
  |  ¡ }| jr| ¡ }|  |¡}t|ƒ}|  |¡}|  |¡}|j\| _| _	|| _
|j ¡ | _|j}x.|D ]&}|| jkrxqht|| ƒrhd| j|< qhW t| j| jƒ| _x*| jD ] }t| j| ƒ}t |¡||< qªW |  ||¡ | jd k	rx| jD ]}| j| | j|< qîW d S )NrX   )rÈ  r»  Zreset_indexrØ  r˜   rË  rÉ  ÚshaperF  rD  r3   r+   ÚtolistrS  rÚ  r  r   r°  r­  ri   rZ   rÛ  rV  )r®   r3   rÚ  rŒ   rš   Znew_typer4   r4   r5   r¿  f  s4    





zStataWriter._prepare_pandasc             C   s  t | jƒ\| _| _yŽ| j| j| jd |  ¡  |  ¡  |  	¡  |  
¡  |  ¡  |  ¡  |  ¡  |  ¡  |  ¡  |  ¡  |  ¡  |  ¡  |  ¡  |  ¡  |  ¡  W np tk
r } zP|  ¡  y| jrÐt | j¡ W n( tk
rú   t d | j¡t¡ Y nX |‚W d d }~X Y n
X |  ¡  d S )N)rJ  rH  zSThis save was not successful but {0} could not be deleted.  This file is not valid.)r©  rÁ  rÄ  r¾  Ú_write_headerr¼  r½  Ú
_write_mapÚ_write_variable_typesÚ_write_varnamesÚ_write_sortlistÚ_write_formatsÚ_write_value_label_namesÚ_write_variable_labelsÚ_write_expansion_fieldsÚ_write_characteristicsÚ_prepare_dataÚ_write_dataÚ_write_strlsÚ_write_value_labelsÚ_write_file_close_tagr…  Ú_closeÚosÚunlinkro   rp   r:   r	   )r®   Úexcr4   r4   r5   Ú
write_file˜  s:    
zStataWriter.write_filec             C   s8   y| j  ¡  W n tk
r"   Y nX | jr4| j  ¡  dS )aA  
        Close the file if it was created by the writer.

        If a buffer or file-like object was passed in, for example a GzipFile,
        then leave this file open for the caller to close. In either case,
        attempt to flush the file contents to ensure they are written to disk
        (if supported)
        N)rÄ  ÚflushrÔ  r¾  r0   )r®   r4   r4   r5   rí  ¹  s    
zStataWriter._closec             C   s   dS )zNo-op, future compatibilityNr4   )r®   r4   r4   r5   rß  Ê  s    zStataWriter._write_mapc             C   s   dS )zNo-op, future compatibilityNr4   )r®   r4   r4   r5   rì  Î  s    z!StataWriter._write_file_close_tagc             C   s   dS )zNo-op, future compatibilityNr4   )r®   r4   r4   r5   rç  Ò  s    z"StataWriter._write_characteristicsc             C   s   dS )zNo-op, future compatibilityNr4   )r®   r4   r4   r5   rê  Ö  s    zStataWriter._write_strlsc             C   s   |   tddƒ¡ dS )z"Write 5 zeros for expansion fieldsrŠ   rz  N)rÅ  r¿   )r®   r4   r4   r5   ræ  Ú  s    z#StataWriter._write_expansion_fieldsc             C   s,   x&| j D ]}| j | | j| j¡¡ qW d S )N)rÇ  rÄ  r½   rÆ   rÀ  r´   )r®   r¯   r4   r4   r5   rë  Þ  s    zStataWriter._write_value_labelsc             C   sT  | j }| j t dd¡¡ |  |dkr*dp,d¡ |  d¡ |  d¡ | j t |d | j¡d d… ¡ | j t |d	 | j¡d d
… ¡ |d krª| j |  t	ddƒ¡¡ n | j |  t	|d d… dƒ¡¡ |d krÞt
j
 ¡ }nt|t
j
ƒsòtdƒ‚ddddddddddddg}dd„ t|ƒD ƒ}| d¡||j  | d¡ }| j |  |¡¡ d S )Nrß   rl  r7  úúr·   rí   re   r¹   rb   rŠ   éP   z"time_stamp should be datetime typeÚJanÚFebÚMarÚAprÚMayÚJunÚJulÚAugÚSepÚOctÚNovÚDecc             S   s   i | ]\}}||d  “qS )r7   r4   )r<   r¹   rD   r4   r4   r5   ú
<dictcomp>	  s    z-StataWriter._write_header.<locals>.<dictcomp>z%d z	 %Y %H:%M)rÀ  rÄ  r½   r€   r¾   rÅ  rD  rF  r^  r¿   r;   Únowr©   rQ   rx  ÚstrftimerD   )r®   rH  rJ  rÁ   ÚmonthsÚmonth_lookupÚtsr4   r4   r5   rÞ  ã  s*    

""zStataWriter._write_headerc             C   s(   x"| j D ]}| j t d|¡¡ qW d S )Nr€  )r3  rÄ  r½   r€   r¾   )r®   r\  r4   r4   r5   rà  	  s    z!StataWriter._write_variable_typesc             C   s<   x6| j D ],}|  |d¡}t|d d… dƒ}|  |¡ qW d S )NTrº   r»   )rS  r^  r¿   rÅ  )r®   rœ   r4   r4   r5   rá  	  s    zStataWriter._write_varnamesc             C   s"   t dd| jd  ƒ}|  |¡ d S )NrŠ   re   r7   )r¿   rD  rÅ  )r®   rT  r4   r4   r5   râ  	  s    zStataWriter._write_sortlistc             C   s$   x| j D ]}|  t|dƒ¡ qW d S )Nra  )rV  rÅ  r¿   )r®   rh   r4   r4   r5   rã  	  s    zStataWriter._write_formatsc             C   sf   x`t | jƒD ]R}| j| rN| j| }|  |d¡}t|d d… dƒ}|  |¡ q|  tddƒ¡ qW d S )NTrº   r»   rŠ   )r   rD  rÆ  rS  r^  r¿   rÅ  )r®   r¹   rœ   r4   r4   r5   rä  	  s    

z$StataWriter._write_value_label_namesc             C   s¬   t ddƒ}| jd kr6xt| jƒD ]}|  |¡ q W d S xp| jD ]f}|| jkrš| j| }t|ƒdkrjtdƒ‚tdd„ |D ƒƒ}|sˆtdƒ‚|  t |dƒ¡ q>|  |¡ q>W d S )NrŠ   rd  rõ  z.Variable labels must be 80 characters or fewerc             s   s   | ]}t |ƒd k V  qdS )é   N)rn  )r<   r¼   r4   r4   r5   rq  9	  s    z5StataWriter._write_variable_labels.<locals>.<genexpr>zKVariable labels must contain only characters that can be encoded in Latin-1)	r¿   rZ  r   rD  rÅ  r3   r¢   rQ   Úall)r®   Úblankr¹   rŒ   r¡  Ú	is_latin1r4   r4   r5   rå  *	  s    



z"StataWriter._write_variable_labelsc             C   s   |S )zNo-op, future compatibilityr4   )r®   r3   r4   r4   r5   Ú_convert_strlsB	  s    zStataWriter._convert_strlsc             C   s  | j }| j}| j}| jd k	rRx4t|ƒD ](\}}||kr&t|| | j| ƒ||< q&W |  |¡}g }g }d}| jtt	j
ƒk}	xØt|ƒD ]Ì\}}|| }
|
| jkrd}||  d¡jt|
fd||< dj|
d}| dt|ƒ |f¡ || j | j¡}| |j |¡¡ q‚|| j}|| j}|	s.| | j¡}| dt|ƒ |f¡ | |¡ q‚W t |¡}|sh|	s~tjt|Ž |d| _ n|jdd	| _ d S )
NFTrŠ   )ÚargszS{type})r/  r¼   )rZ   )r%   )r3   r3  r  rx  rƒ   rV  r  rÀ  r  r  rÁ   Ú_max_string_lengthrÊ  rz   r¿   r:   r¬   rª   r³   r´   rR   rk   rZ   r  ri   Zfromiterr   Z
to_records)r®   r3   r3  r'   r¹   rŒ   rÚ  Z	data_colsZhas_stringsZnative_byteorderr\  ÚstyperÔ   rR   rZ   r4   r4   r5   rè  F	  s@    




zStataWriter._prepare_datac             C   s   | j }| j | ¡ ¡ d S )N)r3   rÄ  r½   Útobytes)r®   r3   r4   r4   r5   ré  o	  s    zStataWriter._write_dataFc             C   s2   d}t jr"|s"||7 }| | j¡S ||7 }|S d S )Nr·   )r   r²   r³   r´   )r®   rµ   Z	as_stringrÃ   r4   r4   r5   r^  s	  s    
zStataWriter._null_terminate)NTrº  NNNN)NN)F)"r…   r†   r‡   rÇ   r  r   r±   rÅ  rÉ  rË  rÌ  rØ  rÛ  r¿  rñ  rí  rß  rì  rç  rê  ræ  rë  rÞ  rà  rá  râ  rã  rä  rå  r  rè  ré  r^  r¦  r4   r4   )rØ   r5   r¹  r  s@   C
  
&M2!
%)r¹  c             C   s’   |rdS | j tjkr<tt|jƒƒ}t|dƒ}|dkr8|S dS | tjkrJdS | tjkrXdS | tj	krfdS | tj
krtdS | tjkr‚dS td	|  ƒ‚d
S )a  
    Converts dtype types to stata types. Returns the byte of the given ordinal.
    See TYPE_MAP and comments for an explanation. This is also explained in
    the dta spec.
    1 - 2045 are strings of this length
                Pandas    Stata
    32768 - for object    strL
    65526 - for int8      byte
    65527 - for int16     int
    65528 - for int32     long
    65529 - for float32   float
    65530 - for double    double

    If there are dates to convert, then dtype will already have the correct
    type inserted.
    i €  r7   iý  iöÿ  i÷ÿ  iøÿ  iùÿ  iúÿ  zData type %s not supported.N)r/  ri   r±  r   r   rR   r@   r   r•   r“   r’   r   r¬  )rZ   r²  r·  r}  r4   r4   r5   Ú_dtype_to_stata_type_117}	  s&    





r  c             C   s"   t jrt| |ƒS t|  |¡ƒS d S )N)r   r²   r  r³   )rµ   r"   r4   r4   r5   Ú_bytes§	  s    
r  c             C   s(   t | tƒrt| dƒ} | d|t| ƒ   S )zU
    Takes a bytes instance and pads it with null bytes until it's length chars.
    zutf-8r¸   )r©   r   r  r¢   )rœ   rƒ  r4   r4   r5   Ú_pad_bytes_new®	  s    

r  c               @   s:   e Zd ZdZddd„Zdd„ Zdd	„ Zd
d„ Zdd„ ZdS )ÚStataStrLWriteraÈ  
    Converter for Stata StrLs

    Stata StrLs map 8 byte values to strings which are stored using a
    dictionary-like format where strings are keyed to two values.

    Parameters
    ----------
    df : DataFrame
        DataFrame to convert
    columns : list
        List of columns names to convert to StrL
    version : int, optional
        dta version.  Currently supports 117, 118 and 119
    byteorder : str, optional
        Can be ">", "<", "little", or "big". default is `sys.byteorder`

    Notes
    -----
    Supports creation of the StrL block of a dta file for dta versions
    117, 118 and 119.  These differ in how the GSO is stored.  118 and
    119 store the GSO lookup value as a uint32 and a uint64, while 117
    uses two uint32s. 118 and 119 also encode all strings as unicode
    which is required by the format.  117 uses 'latin-1' a fixed width
    encoding that extends the 7-bit ascii table with an additional 128
    characters.
    r5  Nc             C   sž   |dkrt dƒ‚|| _|| _|| _tdƒ| _|d kr:tj}t|ƒ| _	d}d}d| _
|dkrjd}d}d	| _
n|d
krxd}nd}ddd|   | _|| _|| _d S )N)r5  r+  éw   z,Only dta versions 117, 118 and 119 supported))rŠ   )r   r   re  rë   zutf-8r5  rb   zlatin-1r+  rf   rz  re   r›   )rQ   Z_dta_verÚdfr+   r   Ú
_gso_tabler  rÁ   r  rÀ  r´   Ú_o_offetÚ_gso_o_typeÚ_gso_v_type)r®   r  r+   ÚversionrÁ   Z
gso_v_typeZ
gso_o_typeZo_sizer4   r4   r5   r±   Ô	  s,    

zStataStrLWriter.__init__c             C   s   |\}}|| j |  S )N)r  )r®   rš   r|   rÖ  r4   r4   r5   Ú_convert_keyï	  s    zStataStrLWriter._convert_keyc                s  | j }| j}t|jƒ‰ || j }‡ fdd„| jD ƒ}tj|jtjd}xŒt| 	¡ ƒD ]|\}\}}xnt|ƒD ]b\}	\}
}||
 }|dkrŠdn|}| 
|d¡}|dkrº|d |d f}|||< |  |¡|||	f< qjW qTW x*t| jƒD ]\}}
|dd…|f ||
< qàW ||fS )aÿ  
        Generates the GSO lookup table for the DataFRame

        Returns
        -------
        gso_table : OrderedDict
            Ordered dictionary using the string found as keys
            and their lookup position (v,o) as values
        gso_df : DataFrame
            DataFrame where strl columns have been converted to
            (v,o) values

        Notes
        -----
        Modifies the DataFrame in-place.

        The DataFrame returned encodes the (v,o) values as uint64s. The
        encoding depends on teh dta version, and can be expressed as

        enc = v + o * 2 ** (o_size * 8)

        so that v is stored in the lower bits and o is in the upper
        bits. o_size is

          * 117: 4
          * 118: 6
          * 119: 5
        c                s   g | ]}|ˆ   |¡f‘qS r4   )r%   )r<   rŒ   )r+   r4   r5   r?   
  s    z2StataStrLWriter.generate_table.<locals>.<listcomp>)rZ   NrŠ   r7   )r  r  r    r+   ri   ÚemptyrÜ  r‹   rx  ZiterrowsÚgetr  )r®   Ú	gso_tableZgso_dfZselectedZ	col_indexÚkeysrÖ  ÚidxÚrowrš  rŒ   r|   r   rš   r¹   r4   )r+   r5   Úgenerate_tableó	  s$    

zStataStrLWriter.generate_tablec             C   s0   t jr| | j¡S t|tƒr(| | j¡S |S dS )z-
        Python 3 compatibility shim
        N)r   r²   r³   r´   r©   r   )r®   rµ   r4   r4   r5   r¶   '
  s
    
zStataStrLWriter._encodec             C   s   t ƒ }tddƒ}t | jd d¡}t | jd d¡}| j| j }| j| j }| jd }x–| ¡ D ]Š\}	}
|
dkrrq`|
\}}| |¡ | t ||¡¡ | t ||¡¡ | |¡ t|	dƒ}| t |t	|ƒd	 ¡¡ | |¡ | |¡ q`W | 
d¡ | ¡ S )
aî  
        Generates the binary blob of GSOs that is written to the dta file.

        Parameters
        ----------
        gso_table : OrderedDict
            Ordered dictionary (str, vo)

        Returns
        -------
        gso : bytes
            Binary content of dta file to be placed between strl tags

        Notes
        -----
        Output format depends on dta version.  117 uses two uint32s to
        express v and o while 118+ uses a uint32 for v and a uint64 for o.
        r‚  Úasciir€  r  r   re  )r   r   zutf-8r7   )r   r  r€   r¾   rÀ  r  r  rÒ  r½   r¢   rÀ   r/   )r®   r  rÂ   ZgsoZgso_typeZnullZv_typeZo_typeZlen_typeZstrlZvor|   rÖ  Zutf8_stringr4   r4   r5   Úgenerate_blob2
  s*    






zStataStrLWriter.generate_blob)r5  N)	r…   r†   r‡   rÇ   r±   r  r#  r¶   r%  r4   r4   r4   r5   r  ·	  s   
4r  c            	       sÐ   e Zd ZdZdZedddd0‡ fdd	„	ƒZed
d„ ƒZdd„ Z	d1dd„Z
dd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zd d!„ Zd"d#„ Zd$d%„ Zd&d'„ Zd(d)„ Zd*d+„ Zd,d-„ Zd.d/„ Z‡  ZS )2ÚStataWriter117aÌ  
    A class for writing Stata binary dta files in Stata 13 format (117)

    .. versionadded:: 0.23.0

    Parameters
    ----------
    fname : path (string), buffer or path object
        string, path object (pathlib.Path or py._path.local.LocalPath) or
        object implementing a binary write() functions. If using a buffer
        then the buffer will not be automatically closed after the file
        is written.
    data : DataFrame
        Input to save
    convert_dates : dict
        Dictionary mapping columns containing datetime types to stata internal
        format to use when writing the dates. Options are 'tc', 'td', 'tm',
        'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
        Datetime columns that do not have a conversion type specified will be
        converted to 'tc'. Raises NotImplementedError if a datetime column has
        timezone information
    write_index : bool
        Write the index to Stata dataset.
    encoding : str
        Default is latin-1. Only latin-1 and ascii are supported.
    byteorder : str
        Can be ">", "<", "little", or "big". default is `sys.byteorder`
    time_stamp : datetime
        A datetime to use as file creation date.  Default is the current time
    data_label : str
        A label for the data set.  Must be 80 characters or smaller.
    variable_labels : dict
        Dictionary containing columns as keys and variable labels as values.
        Each label must be 80 characters or smaller.
    convert_strl : list
        List of columns names to convert to Stata StrL format.  Columns with
        more than 2045 characters are aautomatically written as StrL.
        Smaller columns can be converted by including the column name.  Using
        StrLs can reduce output file size when strings are longer than 8
        characters, and either frequently repeated or sparse.

    Returns
    -------
    writer : StataWriter117 instance
        The StataWriter117 instance has a write_file method, which will
        write the file to the given `fname`.

    Raises
    ------
    NotImplementedError
        * If datetimes contain timezone information
    ValueError
        * Columns listed in convert_dates are neither datetime64[ns]
          or datetime.datetime
        * Column dtype is not representable in Stata
        * Column listed in convert_dates is not in DataFrame
        * Categorical label contains more than 32,000 characters

    Examples
    --------
    >>> from pandas.io.stata import StataWriter117
    >>> data = pd.DataFrame([[1.0, 1, 'a']], columns=['a', 'b', 'c'])
    >>> writer = StataWriter117('./data_file.dta', data)
    >>> writer.write_file()

    Or with long strings stored in strl format

    >>> data = pd.DataFrame([['A relatively long string'], [''], ['']],
    ...                     columns=['strls'])
    >>> writer = StataWriter117('./data_file_with_long_strings.dta', data,
    ...                         convert_strl=['strls'])
    >>> writer.write_file()
    iý  r"   N)r#   r$   Túlatin-1c          
      sJ   |
d krg n
|
d d … | _ tt| ƒj||||||||	d d | _d | _d S )N)rÁ   rJ  rH  r¢  )Ú_convert_strlr  r&  r±   Ú_mapÚ
_strl_blob)r®   r¨  r3   r'   rÃ  r"   rÁ   rJ  rH  r¢  Zconvert_strl)rØ   r4   r5   r±   À
  s    zStataWriter117.__init__c             C   sB   t | tƒrtjrt| dƒ} td| d dƒ|  td| d dƒ S )zSurround val with <tag></tag>zutf-8r8  r7  z</)r©   rª   r   r²   r  )r   Útagr4   r4   r5   Ú_tagÏ
  s    
zStataWriter117._tagc             C   s   | j  ¡ | j|< dS )z.Update map location for tag with file positionN)rÄ  rt  r)  )r®   r+  r4   r4   r5   Ú_update_map×
  s    zStataWriter117._update_mapc       
      C   s¬  | j }| j tddƒ¡ tƒ }| |  tddƒd¡¡ | |  |dkrJdpLdd¡¡ | jd	k sbt‚| |  t 	|d
 | j¡d¡¡ | |  t 	|d | j
¡d¡¡ |dk	r¶|dd… nd}t 	|d t|ƒ¡}|t|dƒ }| |  |d¡¡ |dkrtj ¡ }nt|tjƒstdƒ‚ddddddddddddg}d d!„ t|ƒD ƒ}| d"¡||j  | d#¡ }	d$t|	d%ƒ }	| |  |	d&¡¡ | d'¡ | j |  | ¡ d(¡¡ dS ))zWrite the file headerz<stata_dta>zutf-8Z117Úreleaser7  ZMSFZLSFrÁ   i   r:  ÚKre  ÚNNrõ  rŠ   r€  r¡  z"time_stamp should be datetime typerö  r÷  rø  rù  rú  rû  rü  rý  rþ  rÿ  r   r  c             S   s   i | ]\}}||d  “qS )r7   r4   )r<   r¹   rD   r4   r4   r5   r  ÷
  s    z0StataWriter117._write_header.<locals>.<dictcomp>z%d z	 %Y %H:%Mó   Úutf8Z	timestampr   Úheader)rÀ  rÄ  r½   r  r   r,  rD  ÚAssertionErrorr€   r¾   rF  r¢   r;   r  r©   rQ   rx  r  rD   rÀ   r/   )
r®   rH  rJ  rÁ   rÂ   r¡  Z	label_lenr  r  r  r4   r4   r5   rÞ  Û
  s2      

zStataWriter117._write_headerc             C   s¤   | j dkr:tdd| j ¡ fdddddd	d
dddddfƒ| _ | j | j d ¡ tƒ }x*| j  ¡ D ]}| t 	| j
d |¡¡ q^W | d¡ | j |  | ¡ d¡¡ dS )zµCalled twice during file write. The first populates the values in
        the map with 0s.  The second call writes the final map locations when
        all blocks have been written.N)Z
stata_datar   Úmap)Úvariable_typesr   )Úvarnamesr   )Úsortlistr   )Úformatsr   )Úvalue_label_namesr   )r¢  r   )Úcharacteristicsr   )r3   r   )Ústrlsr   )r£   r   )Ústata_data_closer   )zend-of-filer   rë   r   )r)  r   rÄ  rt  rÀ   r   rR   r½   r€   r¾   rÀ  r,  r/   )r®   rÂ   r   r4   r4   r5   rß    s*    


zStataWriter117._write_mapc             C   s^   |   d¡ tƒ }x&| jD ]}| t | jd |¡¡ qW | d¡ | j |  	| 
¡ d¡¡ d S )Nr6  r:  r   )r-  r   r3  r½   r€   r¾   rÀ  rÀ   rÄ  r,  r/   )r®   rÂ   r\  r4   r4   r5   rà    s    

z$StataWriter117._write_variable_typesc             C   sn   |   d¡ tƒ }x6| jD ],}|  |d¡}t|d d… dƒ}| |¡ qW | d¡ | j |  | 	¡ d¡¡ d S )Nr7  Trº   r»   r   )
r-  r   rS  r^  r  r½   rÀ   rÄ  r,  r/   )r®   rÂ   rœ   r4   r4   r5   rá  $  s    

zStataWriter117._write_varnamesc             C   s,   |   d¡ | j |  d| jd  d¡¡ d S )Nr8  s     r7   )r-  rÄ  r½   r,  rD  )r®   r4   r4   r5   râ  .  s    
zStataWriter117._write_sortlistc             C   sV   |   d¡ tƒ }x| jD ]}| t|dƒ¡ qW | d¡ | j |  | ¡ d¡¡ d S )Nr9  ra  r   )	r-  r   rV  r½   r  rÀ   rÄ  r,  r/   )r®   rÂ   rh   r4   r4   r5   rã  2  s    

zStataWriter117._write_formatsc             C   sŠ   |   d¡ tƒ }xRt| jƒD ]D}d}| j| r8| j| }|  |d¡}t|d d… dƒ}| |¡ qW | 	d¡ | j
 |  | ¡ d¡¡ d S )Nr:  rŠ   Trº   r»   r   )r-  r   r   rD  rÆ  rS  r^  r  r½   rÀ   rÄ  r,  r/   )r®   rÂ   r¹   rœ   r4   r4   r5   rä  :  s    



z'StataWriter117._write_value_label_namesc             C   s   |   d¡ tƒ }tddƒ}| jd krhxt| jƒD ]}| |¡ q0W | d¡ | j |  	| 
¡ d¡¡ d S xp| jD ]f}|| jkrÌ| j| }t|ƒdkrœtdƒ‚tdd„ |D ƒƒ}|sºtd	ƒ‚| t|dƒ¡ qp| |¡ qpW | d¡ | j |  	| 
¡ d¡¡ d S )
Nr¢  rŠ   rd  r   rõ  z.Variable labels must be 80 characters or fewerc             s   s   | ]}t |ƒd k V  qdS )r  N)rn  )r<   r¼   r4   r4   r5   rq  [  s    z8StataWriter117._write_variable_labels.<locals>.<genexpr>zKVariable labels must contain only characters that can be encoded in Latin-1)r-  r   r  rZ  r   rD  r½   rÀ   rÄ  r,  r/   r3   r¢   rQ   r	  )r®   rÂ   r
  r#  rŒ   r¡  r  r4   r4   r5   rå  H  s*    






z%StataWriter117._write_variable_labelsc             C   s"   |   d¡ | j |  dd¡¡ d S )Nr;  ó    )r-  rÄ  r½   r,  )r®   r4   r4   r5   rç  f  s    
z%StataWriter117._write_characteristicsc             C   s<   |   d¡ | j}| j d¡ | j | ¡ ¡ | j d¡ d S )Nr3   s   <data>s   </data>)r-  r3   rÄ  r½   r  )r®   r3   r4   r4   r5   ré  j  s
    
zStataWriter117._write_datac             C   s6   |   d¡ d}| jd k	r| j}| j |  |d¡¡ d S )Nr<  r>  )r-  r*  rÄ  r½   r,  )r®   r<  r4   r4   r5   rê  q  s
    

zStataWriter117._write_strlsc             C   s   dS )zNo-op in dta 117+Nr4   )r®   r4   r4   r5   ræ  x  s    z&StataWriter117._write_expansion_fieldsc             C   sl   |   d¡ tƒ }x4| jD ]*}| | j| j¡}|  |d¡}| |¡ qW | d¡ | j	 |  | 
¡ d¡¡ d S )Nr£   Zlblr   )r-  r   rÇ  rÆ   rÀ  r´   r,  r½   rÀ   rÄ  r/   )r®   rÂ   r¯   Zlabr4   r4   r5   rë  |  s    

z"StataWriter117._write_value_labelsc             C   s*   |   d¡ | j tddƒ¡ |   d¡ d S )Nr=  z</stata_dta>zutf-8zend-of-file)r-  rÄ  r½   r  )r®   r4   r4   r5   rì  †  s    
z$StataWriter117._write_file_close_tagc             C   s<   x6| j  ¡ D ](\}}|| jkr| j |¡}|| j|< qW dS )ztUpdate column names for conversion to strl if they might have been
        changed to comply with Stata naming rulesN)rÂ  rÒ  r(  r%   )r®   ZorigÚnewr!  r4   r4   r5   rÌ  ‹  s    
z!StataWriter117._update_strl_namesc                sD   ‡ fdd„t |ƒD ƒ}|r@t||ƒ}| ¡ \}}|}| |¡ˆ _|S )zUConvert columns to StrLs if either very large or in the
        convert_strl variablec                s,   g | ]$\}}ˆ j | d ks$|ˆ jkr|‘qS )i €  )r3  r(  )r<   r¹   rŒ   )r®   r4   r5   r?   ˜  s    z1StataWriter117._convert_strls.<locals>.<listcomp>)rx  r  r#  r%  r*  )r®   r3   Zconvert_colsZsswZtabZnew_datar4   )r®   r5   r  ”  s    
zStataWriter117._convert_strlsc             C   sh   g | _ g | _xV| ¡ D ]J\}}|| jk}t||| d|d}| j |¡ | j  t||| |ƒ¡ qW d S )Nr5  )r¶  r·  )r3  rV  rÙ  r(  r¸  r¬   r  )r®   r3   rÚ  rŒ   rZ   r·  rh   r4   r4   r5   rÛ  ¢  s    

z%StataWriter117._set_formats_and_types)NTr'  NNNNN)NN)r…   r†   r‡   rÇ   r  r   r±   Ústaticmethodr,  r-  rÞ  rß  rà  rá  râ  rã  rä  rå  rç  ré  rê  ræ  rë  rì  rÌ  r  rÛ  r¦  r4   r4   )rØ   r5   r&  s
  s2   I
  
&

	r&  )
TTNNFTNTNF)rl  F)`rÇ   Úcollectionsr   r;   rî  r€   r  ro   Zdateutil.relativedeltar   Znumpyri   Zpandas._libs.libr   Zpandas._libs.tslibsr   r   Zpandas._libs.writersr   Zpandas.compatr   r	   r
   r   r   r   r   r   r   Zpandas.util._decoratorsr   r   Zpandas.core.dtypes.commonr   r   r   Zpandasr   r   r   r   r   Zpandas.core.arraysr   Zpandas.core.baser   Zpandas.core.framer   Zpandas.core.seriesr   Zpandas.io.commonr   r    r!   rC  Z_statafile_processing_params1Z_encoding_paramsZ_statafile_processing_params2Z_chunksize_paramsZ_iterator_paramsZ_read_stata_docr¤  r¥  r£  r6   r‹  rn   rt   rƒ   rµ  ÚWarningr„   r”   rˆ   r«   r‰   rÕ  r˜   rq   r™   rÈ   rã   r.   r©  r  r¿   r­  r°  r³  r¸  r¹  r  r  r  r  r&  r4   r4   r4   r5   Ú<module>   s¤   ,#


    %eW|xp      "
% 
7    *	 =