B
    •xŠ\2m  ã               @   s¨   d Z ddlmZ ddlZddlZddlmZ ddlZddlm	Z	 ddl
mZmZ ddlmZ ddlm  m  mZ G dd	„ d	eƒZG d
d„ deƒZG dd„ deƒZdS )a¢  
Read SAS7BDAT files

Based on code written by Jared Hobbs:
  https://bitbucket.org/jaredhobbs/sas7bdat

See also:
  https://github.com/BioStatMatt/sas7bdat

Partial documentation of the file format:
  https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf

Reference for binary data compression:
  http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
é    )ÚdatetimeN)ÚEmptyDataError)Úcompat)ÚBaseIteratorÚget_filepath_or_buffer)ÚParserc               @   s   e Zd ZdS )Ú_subheader_pointerN)Ú__name__Ú
__module__Ú__qualname__© r   r   ú5lib/python3.7/site-packages/pandas/io/sas/sas7bdat.pyr      s   r   c               @   s   e Zd ZdS )Ú_columnN)r	   r
   r   r   r   r   r   r   #   s   r   c               @   sü   e Zd ZdZd>dd„Zdd„ Zdd	„ Zd
d„ Zdd„ Zdd„ Z	dd„ Z
dd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zd d!„ Zd"d#„ Zd$d%„ Zd&d'„ Zd(d)„ Zd*d+„ Zd,d-„ Zd.d/„ Zd0d1„ Zd2d3„ Zd4d5„ Zd6d7„ Zd?d8d9„Zd:d;„ Zd<d=„ Z dS )@ÚSAS7BDATReadera!  
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : boolean, defaults to True
        Attempt to convert dates to Pandas datetime values.  Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : boolean, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return SAS7BDATReader object for iterations, returns chunks
        with given number of lines.
    encoding : string, defaults to None
        String encoding.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, are left as raw
        bytes.
    NTc	       
      C   sÈ   || _ || _|| _|| _|| _|| _|| _d| _d| _g | _	g | _
g | _g | _g | _d | _g | _g | _g | _d| _d| _d| _t|ƒ\| _}	}	}	t| jtjƒr´t| jdƒ| _| j| _|  ¡  |  ¡  d S )Nzlatin-1Ú r   Úrb)ÚindexÚconvert_datesÚblank_missingÚ	chunksizeÚencodingÚconvert_textÚconvert_header_textÚdefault_encodingÚcompressionÚcolumn_names_stringsÚcolumn_namesÚcolumn_formatsÚcolumnsÚ%_current_page_data_subheader_pointersÚ_cached_pageÚ_column_data_lengthsÚ_column_data_offsetsÚ_column_typesÚ_current_row_in_file_indexZ_current_row_on_page_indexr   Ú_path_or_bufÚ
isinstancer   Zstring_typesÚopenÚhandleÚ_get_propertiesÚ_parse_metadata)
ÚselfZpath_or_bufr   r   r   r   r   r   r   Ú_r   r   r   Ú__init__E   s6    zSAS7BDATReader.__init__c             C   s   t j| jt jdS )z5Return a numpy int64 array of the column data lengths)Údtype)ÚnpÚasarrayr!   Úint64)r+   r   r   r   Úcolumn_data_lengthsj   s    z"SAS7BDATReader.column_data_lengthsc             C   s   t j| jt jdS )z0Return a numpy int64 array of the column offsets)r.   )r/   r0   r"   r1   )r+   r   r   r   Úcolumn_data_offsetsn   s    z"SAS7BDATReader.column_data_offsetsc             C   s   t j| jt  d¡dS )zXReturns a numpy character array of the column types:
           s (string) or d (double)ZS1)r.   )r/   r0   r#   r.   )r+   r   r   r   Úcolumn_typesr   s    zSAS7BDATReader.column_typesc             C   s(   y| j  ¡  W n tk
r"   Y nX d S )N)r(   ÚcloseÚAttributeError)r+   r   r   r   r5   w   s    zSAS7BDATReader.closec             C   s
  | j  d¡ | j  d¡| _| jdttjƒ… tjkrD|  ¡  tdƒ‚d\}}|  	tj
tj¡}|tjkrŠtj}d| _d| _tj| _tj| _nd| _tj| _tj| _d| _|  	tjtj¡}|tjkrÆtj}|| }|  	tjtj¡}|d	krîd
| _nd| _|  	tjtj¡d }|tjkr"tj| | _ndj |d| _|  	tj!tj"¡}|dkrRd| _#n|dkrdd| _#nd| _#|  	tj$tj%¡}| &d¡| _'| j(r¦| j' )| j*p | j+¡| _'|  	tj,tj-¡}| &d¡| _.| j(râ| j. )| j*pÜ| j+¡| _.t/dddƒ}|  0tj1| tj2¡}|t3j4|dd | _5|  0tj6| tj7¡}|t3j4|dd | _8|  9tj:| tj;¡| _<| j  | j<d ¡}|  j|7  _t| jƒ| j<kr–|  ¡  tdƒ‚|  9tj=| tj>¡| _?|  9tj@| tjA¡| _B|  	tjC| tjD¡}| &d¡| _E| j(r| jE )| j*pü| j+¡| _E|  	tjF| tjG¡}| &d¡| _H| j(rB| jH )| j*p<| j+¡| _H|  	tjI| tjJ¡}| &d¡| _K| j(r‚| jK )| j*p|| j+¡| _K|  	tjL| tjM¡}| &d¡}t|ƒdkrÆ| )| j*p¾| j+¡| _Nn@|  	tjO| tjP¡}| &d¡| _N| j(r| jN )| j*p | j+¡| _Nd S )Nr   i   z'magic number mismatch (not a SAS file?))r   r   Té   Fé   ó   ú<ú>zunknown (code={name!s}))Únameó   1Zunixó   2ZwindowsÚunknowns     i¨  é   Ús)Úunitz*The SAS7BDAT file appears to be truncated.)Qr%   ÚseekÚreadr    ÚlenÚconstÚmagicr5   Ú
ValueErrorÚ_read_bytesZalign_1_offsetZalign_1_lengthZu64_byte_checker_valueZalign_2_valueÚU64Ú_int_lengthZpage_bit_offset_x64Ú_page_bit_offsetZsubheader_pointer_length_x64Ú_subheader_pointer_lengthZpage_bit_offset_x86Zsubheader_pointer_length_x86Zalign_2_offsetZalign_2_lengthZalign_1_checker_valueZendianness_offsetZendianness_lengthÚ
byte_orderZencoding_offsetZencoding_lengthZencoding_namesÚfile_encodingÚformatZplatform_offsetZplatform_lengthÚplatformZdataset_offsetZdataset_lengthÚrstripr<   r   Údecoder   r   Zfile_type_offsetZfile_type_lengthZ	file_typer   Ú_read_floatZdate_created_offsetZdate_created_lengthÚpdZto_timedeltaZdate_createdZdate_modified_offsetZdate_modified_lengthZdate_modifiedÚ	_read_intZheader_size_offsetZheader_size_lengthZheader_lengthZpage_size_offsetZpage_size_lengthÚ_page_lengthZpage_count_offsetZpage_count_lengthZ_page_countZsas_release_offsetZsas_release_lengthZsas_releaseZsas_server_type_offsetZsas_server_type_lengthZserver_typeZos_version_number_offsetZos_version_number_lengthZ
os_versionZos_name_offsetZos_name_lengthZos_nameZos_maker_offsetZos_maker_length)r+   Zalign1Zalign2ÚbufZtotal_alignZepochÚxr   r   r   r)   }   sº    








zSAS7BDATReader._get_propertiesc             C   s"   | j | jpdd}|d krt‚|S )Nr@   )Únrows)rD   r   ÚStopIteration)r+   Zdar   r   r   Ú__next__÷   s    zSAS7BDATReader.__next__c             C   sJ   |dkr|   ¡  tdƒ‚|  ||¡}|dkr0dnd}t | j| |¡d S )N)r8   r7   zinvalid float widthr8   ÚfÚdr   )r5   rH   rI   ÚstructÚunpackrN   )r+   ÚoffsetÚwidthrX   Úfdr   r   r   rT   þ   s    zSAS7BDATReader._read_floatc             C   sP   |dkr|   ¡  tdƒ‚|  ||¡}dddddœ| }t | j| |¡d }|S )N)r@   é   r8   r7   zinvalid int widthÚbÚhÚlÚqr   )r5   rH   rI   r_   r`   rN   )r+   ra   rb   rX   ÚitZivr   r   r   rV     s    zSAS7BDATReader._read_intc             C   s†   | j d krN| j |¡ | j |¡}t|ƒ|k rJ|  ¡  d}t| ||¡ƒ‚|S || t| j ƒkrp|  ¡  tdƒ‚| j ||| … S d S )Nz2Unable to read {:d} bytes from file position {:d}.zThe cached page is too small.)r    r%   rC   rD   rE   r5   rH   rP   )r+   ra   ÚlengthrX   Úmsgr   r   r   rI     s    
zSAS7BDATReader._read_bytesc             C   sZ   d}xP|sT| j  | j¡| _t| jƒdkr*P t| jƒ| jkrJ|  ¡  tdƒ‚|  ¡ }qW d S )NFr   z2Failed to read a meta data page from the SAS file.)r%   rD   rW   r    rE   r5   rH   Ú_process_page_meta)r+   Zdoner   r   r   r*     s    zSAS7BDATReader._parse_metadatac             C   sV   |   ¡  tjtjgtj }| j|kr,|  ¡  | jtj@ }| jtjk}|pT|pT| jg kS )N)	Ú_read_page_headerrF   Úpage_meta_typeZpage_amd_typeÚpage_mix_typesÚ_current_page_typeÚ_process_page_metadataÚpage_data_typer   )r+   ÚptÚis_data_pageZis_mix_pager   r   r   rl   +  s    
z!SAS7BDATReader._process_page_metac             C   sX   | j }tj| }|  |tj¡| _tj| }|  |tj¡| _tj	| }|  |tj
¡| _d S )N)rL   rF   Zpage_type_offsetrV   Zpage_type_lengthrp   Zblock_count_offsetZblock_count_lengthZ_current_page_block_countZsubheader_count_offsetZsubheader_count_lengthÚ_current_page_subheaders_count)r+   Ú
bit_offsetZtxr   r   r   rm   5  s    


z SAS7BDATReader._read_page_headerc             C   st   | j }xht| jƒD ]Z}|  tj| |¡}|jdkr4q|jtjkrBq|  	|j
¡}|  ||j|j¡}|  ||¡ qW d S )Nr   )rL   Úrangeru   Ú_process_subheader_pointersrF   Zsubheader_pointers_offsetrj   r   Ztruncated_subheader_idÚ_read_subheader_signaturera   Ú_get_subheader_indexÚptypeÚ_process_subheader)r+   rv   ÚiÚpointerÚsubheader_signatureÚsubheader_indexr   r   r   rq   @  s    
z%SAS7BDATReader._process_page_metadatac             C   s`   t j |¡}|d kr\|t jkp$|dk}|t jk}| jdkrL|rL|rLt jj}n|  ¡  t	dƒ‚|S )Nr   r   zUnknown subheader signature)
rF   Zsubheader_signature_to_indexÚgetZcompressed_subheader_idZcompressed_subheader_typer   ÚSASIndexÚdata_subheader_indexr5   rH   )r+   Z	signaturer   r{   r   Úf1Úf2r   r   r   rz   Q  s    


z#SAS7BDATReader._get_subheader_indexc       
      C   s„   | j }|||  }|  || j¡}|| j7 }|  || j¡}|| j7 }|  |d¡}|d7 }|  |d¡}tƒ }	||	_||	_||	_||	_|	S )Nr@   )rM   rV   rK   r   ra   rj   r   r{   )
r+   ra   Zsubheader_pointer_indexZsubheader_pointer_lengthZtotal_offsetZsubheader_offsetZsubheader_lengthZsubheader_compressionZsubheader_typerY   r   r   r   rx   ^  s     


z*SAS7BDATReader._process_subheader_pointersc             C   s   |   || j¡}|S )N)rI   rK   )r+   ra   r   r   r   r   ry   w  s    z(SAS7BDATReader._read_subheader_signaturec             C   sÞ   |j }|j}|tjjkr | j}n°|tjjkr4| j}nœ|tjjkrH| j	}nˆ|tjj
kr\| j}nt|tjjkrp| j}n`|tjjkr„| j}nL|tjjkr˜| j}n8|tjjkr¬| j}n$|tjjkrÈ| j |¡ d S tdƒ‚|||ƒ d S )Nzunknown subheader index)ra   rj   rF   r‚   Zrow_size_indexÚ_process_rowsize_subheaderZcolumn_size_indexÚ_process_columnsize_subheaderZcolumn_text_indexÚ_process_columntext_subheaderZcolumn_name_indexÚ_process_columnname_subheaderZcolumn_attributes_indexÚ#_process_columnattributes_subheaderZformat_and_label_indexÚ_process_format_subheaderZcolumn_list_indexÚ_process_columnlist_subheaderZsubheader_counts_indexÚ_process_subheader_countsrƒ   r   ÚappendrH   )r+   r€   r~   ra   rj   Z	processorr   r   r   r|   {  s.    z!SAS7BDATReader._process_subheaderc             C   sÒ   | j }|}|}| jr&|d7 }|d7 }n|d7 }|d7 }|  |tj|  |¡| _|  |tj|  |¡| _|  |tj|  |¡| _	|  |tj
|  |¡| _tj| }|  || |¡| _|  |d¡| _|  |d¡| _d S )Niª  iÂ  ib  iz  rd   )rK   rJ   rV   rF   Zrow_length_offset_multiplierZ
row_lengthZrow_count_offset_multiplierÚ	row_countZcol_count_p1_multiplierÚcol_count_p1Zcol_count_p2_multiplierÚcol_count_p2Z'row_count_on_mix_page_offset_multiplierZ_mix_page_row_countÚ_lcsÚ_lcp)r+   ra   rj   Úint_lenZ
lcs_offsetZ
lcp_offsetZmxr   r   r   r†   —  s(    

z)SAS7BDATReader._process_rowsize_subheaderc             C   sL   | j }||7 }|  ||¡| _| j| j | jkrHtdj| j| j| jdƒ d S )Nz?Warning: column count mismatch ({p1} + {p2} != {column_count})
)Zp1Zp2Úcolumn_count)rK   rV   r•   r   r‘   ÚprintrP   )r+   ra   rj   r”   r   r   r   r‡   °  s    
z,SAS7BDATReader._process_columnsize_subheaderc             C   s   d S )Nr   )r+   ra   rj   r   r   r   r   ½  s    z(SAS7BDATReader._process_subheader_countsc       
      C   sÎ  || j 7 }|  |tj¡}|  ||¡}|d|…  d¡}|}| jrR| | jpN| j	¡}| j
 |¡ t| j
ƒdkrÊd}xtjD ]}||krz|}qzW || _|| j 8 }|d }	| jr´|	d7 }	|  |	| j¡}| d¡}|dkrd| _|d }	| jrò|	d7 }	|  |	| j¡}|d| j… | _nŒ|tjkrV|d	 }	| jr6|	d7 }	|  |	| j¡}|d| j… | _nH| jdkržd| _|d }	| jr€|	d7 }	|  |	| j¡}|d| j… | _| jrÊt| d
ƒrÊ| j | jpÄ| j	¡| _d S )Nr   s     r@   r   é   r8   ó    é    é(   Úcreator_proc)rK   rV   rF   Ztext_block_size_lengthrI   rR   r   rS   r   r   r   rŽ   rE   Zcompression_literalsr   rJ   r“   r’   r›   Zrle_compressionÚhasattr)
r+   ra   rj   Ztext_block_sizerX   Z	cname_rawZcnameZcompression_literalZclZoffset1r   r   r   rˆ   À  sX    



z,SAS7BDATReader._process_columntext_subheaderc             C   sÌ   | j }||7 }|d|  d d }x¤t|ƒD ]˜}|tj|d   tj }|tj|d   tj }|tj|d   tj }|  |tj¡}	|  |tj	¡}
|  |tj
¡}| j|	 }| j ||
|
| … ¡ q,W d S )Nrd   é   r7   r@   )rK   rw   rF   Zcolumn_name_pointer_lengthZ!column_name_text_subheader_offsetZcolumn_name_offset_offsetZcolumn_name_length_offsetrV   Z!column_name_text_subheader_lengthZcolumn_name_offset_lengthZcolumn_name_length_lengthr   r   rŽ   )r+   ra   rj   r”   Zcolumn_name_pointers_countr}   Ztext_subheaderZcol_name_offsetZcol_name_lengthÚidxÚ
col_offsetZcol_lenZname_strr   r   r   r‰   ó  s    



z,SAS7BDATReader._process_columnname_subheaderc       
      C   sâ   | j }|d|  d |d  }x¾t|ƒD ]²}|| tj ||d   }|d|  tj ||d   }|d|  tj ||d   }|  ||¡}	| j |	¡ |  |tj	¡}	| j
 |	¡ |  |tj¡}	| j |	dkrÔdnd¡ q(W d S )Nrd   r   r7   r@   ó   dó   s)rK   rw   rF   Zcolumn_data_offset_offsetZcolumn_data_length_offsetZcolumn_type_offsetrV   r"   rŽ   Zcolumn_data_length_lengthr!   Zcolumn_type_lengthr#   )
r+   ra   rj   r”   Zcolumn_attributes_vectors_countr}   Zcol_data_offsetZcol_data_lenZ	col_typesrY   r   r   r   rŠ   	  s    z2SAS7BDATReader._process_columnattributes_subheaderc             C   s   d S )Nr   )r+   ra   rj   r   r   r   rŒ      s    z,SAS7BDATReader._process_columnlist_subheaderc             C   s„  | j }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }	|  |tj	¡}
t
|
t| jƒd ƒ}|  |tj¡}|  |tj¡}|  |tj¡}t
|t| jƒd ƒ}|  |tj¡}|  |	tj¡}| j| }|||| … }| j| }|||| … }t| jƒ}tƒ }||_| j| |_||_||_| j| |_| j| |_| j |¡ | j |¡ d S )Né   r@   )rK   rF   Z)column_format_text_subheader_index_offsetZcolumn_format_offset_offsetZcolumn_format_length_offsetZ(column_label_text_subheader_index_offsetZcolumn_label_offset_offsetZcolumn_label_length_offsetrV   Z)column_format_text_subheader_index_lengthÚminrE   r   Zcolumn_format_offset_lengthZcolumn_format_length_lengthZ(column_label_text_subheader_index_lengthZcolumn_label_offset_lengthZcolumn_label_length_lengthr   r   Zcol_idr   r<   ZlabelrP   r#   Zctyper!   rj   r   rŽ   )r+   ra   rj   r”   Ztext_subheader_formatZcol_format_offsetZcol_format_lenZtext_subheader_labelZcol_label_offsetZcol_label_lenrY   Z
format_idxZformat_startZ
format_lenZ	label_idxZlabel_startZ	label_lenZlabel_namesZcolumn_labelZformat_namesZcolumn_formatZcurrent_column_numberÚcolr   r   r   r‹   $  sR    










z(SAS7BDATReader._process_format_subheaderc             C   sð   |d kr| j d k	r| j }n|d kr(| j}t| jƒdkrF|  ¡  tdƒ‚| j| jkrVd S | j| j }||krn|}| j d¡}| j d¡}tj	||ftj
d| _tj|d| ftjd| _d| _t| ƒ}| |¡ |  ¡ }| jd k	rì| | j¡}|S )Nr   zNo columns to parse from filer    r¡   )r.   r7   )r   r   rE   r#   r5   r   r$   Úcountr/   ÚemptyÚobjectÚ_string_chunkZzerosZuint8Ú_byte_chunkÚ_current_row_in_chunk_indexr   rD   Ú_chunk_to_dataframer   Z	set_index)r+   rZ   ÚmZndÚnsÚpÚrsltr   r   r   rD   ]  s.    

zSAS7BDATReader.readc             C   s®   g | _ | j | j¡| _t| jƒdkr(dS t| jƒ| jkr\|  ¡  d}t| t| jƒ| j¡ƒ‚|  	¡  | j
}|tjkr||  ¡  |tj@ }tjgtj }|sª| j
|krª|  ¡ S dS )Nr   Tz@failed to read complete page from file (read {:d} of {:d} bytes)F)r   r%   rD   rW   r    rE   r5   rH   rP   rm   rp   rF   rn   rq   rr   ro   Ú_read_next_page)r+   rk   Z	page_typert   rs   r   r   r   r°     s$    


zSAS7BDATReader._read_next_pagec             C   s¢  | j }| j}t|| |ƒ}tj|d}d\}}xlt| jƒD ]\}| j| }| j| dkrð| j|d d …f j	| j
d d||< tj|| tjd||< | jræd }	| j| tjkr¶d}	n| j| tjkrÊd}	|	rætj|| |	dd||< |d	7 }q<| j| d
kr|| j|d d …f ||< | jrH| jd k	rH|| j | jp@| j¡||< | jrr|| j ¡ dk}
tj|j|
|f< |d	7 }q<|  ¡  tdj | j| dƒ‚q<W |S )N)r   )r   r   r    r^   )r.   rA   z
1960-01-01)rB   Úoriginr@   r¡   r   zunknown column type {type})Útype)!rª   r$   rw   rU   Z	DataFramer•   r   r#   r©   ZviewrN   r/   r0   Zfloat64r   r   rF   Zsas_date_formatsZsas_datetime_formatsZto_datetimer¨   r   r   ÚstrrS   r   r   rE   ÚnanZlocr5   rH   rP   )r+   Únr¬   Zixr¯   ZjsZjbÚjr<   rB   Ziir   r   r   r«   —  sD    



z"SAS7BDATReader._chunk_to_dataframe)NTTNNTT)N)!r	   r
   r   Ú__doc__r-   r2   r3   r4   r5   r)   r\   rT   rV   rI   r*   rl   rm   rq   rz   rx   ry   r|   r†   r‡   r   rˆ   r‰   rŠ   rŒ   r‹   rD   r°   r«   r   r   r   r   r   (   s@     
#z		
39
"r   )r·   r   r_   Znumpyr/   Zpandas.errorsr   ZpandasrU   r   Zpandas.io.commonr   r   Zpandas.io.sas._sasr   Zpandas.io.sas.sas_constantsÚioZsasZsas_constantsrF   r§   r   r   r   r   r   r   r   Ú<module>   s   