
\c        	   @` s  d  d l  m Z m Z m Z d  d l Z d  d l m Z d  d l Z d  d l Z d  d l	 Z	 d  d l
 m Z d  d l Z d  d l Z d d l m Z m Z d d l m Z m Z m Z d d	 l m Z d d
 l m Z d d l m Z m Z d d l m Z d d l m  Z  d d l! m" Z" d d l m# Z# m$ Z$ d d l m% Z% m& Z& d3 Z' d   Z( d   Z) d d d d d d  Z+ d   Z, d d d d  Z- d   Z. d   Z/ d   Z0 d   Z1 d   Z2 d e3 e3 d d d  Z4 d   Z5 d d d d d d   Z6 d!   Z7 d"   Z8 d#   Z9 d$ d% d& d' d( d) d* d+ h Z: d% d( d) h Z; d e3 e3 d d,  Z< d d-  Z= i  Z> d.   Z? d d d d d d/ d d0  Z@ d/ d1 d e3 e3 d d eA d2  ZB e reB jC e jB _C n  d S(4   i    (   t   absolute_importt   divisiont   print_functionN(   t   OrderedDict(   t   LooseVersioni   (   t	   DataFramet   Series(   t   clear_known_categoriest   strip_unknown_categoriest   UNKNOWN_CATEGORIESi   (   t   compress(   t   tokenize(   t   PY3t   string_types(   t   delayed(   t   get_fs_token_paths(   t   infer_storage_options(   t   import_requiredt   natural_sort_keyi   (   t   _get_pyarrow_dtypest   _meta_from_dtypest   read_parquett
   to_parquetc         C` s  g  |  d D]% } t  | t  r* | d n | ^ q } t j d  } g  |  d D]& } | j d | d  | d f ^ qP } g  } xE | D]= \ } } | r | j |  r d } n  | j | | f  q Wg  | D] \ } }	 |	 | k r |	 ^ q }
 |  j d i d d 6g  } g  | D] } | d ^ q} |
 szt |  }
 t	 |  } g  | D] \ } }	 |	 | k rS|	 ^ qS} n+ g  | D] \ } }	 |	 | k r|	 ^ q} t |  } |
 | | | f S(   sP  Get the set of names from the pandas metadata section

    Parameters
    ----------
    pandas_metadata : dict
        Should conform to the pandas parquet metadata spec

    Returns
    -------
    index_names : list
        List of strings indicating the actual index names
    column_names : list
        List of strings indicating the actual column names
    storage_name_mapping : dict
        Pairs of storage names (e.g. the field names for
        PyArrow) and actual names. The storage and field names will
        differ for index names for certain writers (pyarrow > 0.8).
    column_indexes_names : list
        The names for ``df.columns.name`` or ``df.columns.names`` for
        a MultiIndex in the columns

    Notes
    -----
    This should support metadata written by at least

    * fastparquet>=0.1.3
    * pyarrow>=0.7.0
    t   index_columnst   names   __index_level_\d+__t   columnst
   field_namet   column_indexesN(
   t
   isinstancet   dictt   ret   compilet   gett   matcht   Nonet   appendt   listt   set(   t   pandas_metadatat   nt   index_storage_namest   index_name_xprt   xt   pairst   pairs2t   storage_namet	   real_nameR   t   index_namest   column_index_namest   index_storage_names2t   column_namest   storage_name_mapping(    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyt   _parse_pandas_metadata   s.    34	+c   
      C` s  |  d k	 } | d k	 } t } |  d k r9 t |  }  n- t |  t  rZ |  g }  t } n t |   }  | d k r{ | } nF | t k r g  } | | } n' t | t  r | g } n t |  } | r | r | } g  | D] } | | k r | ^ q }	 n | r7| r7|  }	 g  | D] } | |	 k r| ^ q} nK | rv| rv|  }	 | } t |	  j |  rt	 d   qn | }	 | } |	 | | f S(   sm  Normalize user and file-provided column and index names

    Parameters
    ----------
    user_columns : None, str or list of str
    data_columns : list of str
    user_index : None, str, or list of str
    data_index : list of str

    Returns
    -------
    column_names : list of str
    index_names : list of str
    out_type : {pd.Series, pd.DataFrame}
    s3   Specified index and column names must not intersectN(
   R"   R   R$   R   R   R   t   FalseR%   t   intersectiont
   ValueError(
   t   user_columnst   data_columnst
   user_indext
   data_indext   specified_columnst   specified_indext   out_typeR/   R*   R2   (    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyt   _normalize_index_columnsk   s>    			((c      
   ` s|  d d  l  } t | | j j  r* |  n t |  d k r | t k	 rl | j | d  j d  j  q t  | | d | d  d | Snm y4 | j | d  j d d  j d  j  Wn6 t	 k
 r | j | d d  j d  j  n Xt
 j j  j  d	 d k r8| t k r8t d
   n  t  | |  |  \ }	 } }
      g   j D]> } | j j | |  j  ro| j j | |  ro| ^ qo} d t | |   |             f d   t |  D } | s2i |	  d f 6} d }  |  |	 |  S r| t k	 r|	 j j }
 y | j j  |  } Wn# t k
 r| j j   } n X|
 | k r| |
 } | d | d d	 g } q.| t k rt d j d |
    n  d t |  d } n/ | t k rt d   n  d t |  d } t | d t j  rig  | D] } t  j! |  ^ qK} n   |  |	 |  S(   Ni    i   t	   open_witht   sepR   t
   categoriest   indext	   _metadataisz   infer_divisions=True is not supported by the fastparquet engine for datasets that do not contain a global '_metadata' files   read-parquet-c         ` sv   i  |  ]l \ } } t    j |     |  t k   j  j  j  j  t  d  i   f  | f  q S(   t   tz(   t   _read_parquet_row_groupt   row_group_filenameR   t   schemat   catst   dtypest   file_schemet   getattr(   t   .0t   it   rg(   t   all_columnsRB   t   fsR/   R   R>   t   pfR3   (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pys
   <dictcomp>   s   	t   mint   maxsl   Unable to infer divisions for index of '{index_name}' because it is not known to be sorted across partitionst
   index_namesD   Unable to infer divisions for because no index column was discovered(   NN(   N(   N("   t   fastparquetR   t   apit   ParquetFilet   lenR5   t   openRA   t   _read_fp_multifilet	   Exceptiont   ost   patht   splitt   fnt   Truet   NotImplementedErrort   _pf_validationt
   row_groupst   filter_out_statsRH   t   filter_out_catsR   t	   enumerateR"   RC   R   t   sorted_partitioned_columnst	   TypeErrorR7   t   formatt   npt
   datetime64t   pdt	   Timestamp(   RQ   t   fs_tokent   pathsR   t   filtersRB   RC   t   infer_divisionsRV   t   metaRU   RO   t   rgst   dskt	   divisionst   minmaxt   d(    (   RP   RB   RQ   R/   R   R>   RR   R3   s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyt   _read_fastparquet   sd    	$	)+*"
!
	%c         ` s  d d l  m } | |  j |  | |  j t |  j p8 g   |  t | t  ra t |  } n  |  j j r g  |  j j D] } | j	 d k rz | j
 ^ qz } n g  } t |  d k r	|  j   } t | t  s | g } n  |  j t |  j    d     D  n{ t |  d k rxt t j | d   \ }    }	 g  | D] }
 |
 d k	 rG|
 ^ qG}   j |  j  n t d   | d k rg  } n  t |   | |  \   } } | d k r|  j } n' t | t  r| g } n t |  } t    } | j   f d   | D  |  j |  }  f d   | j   D } t | | | d g  } t |  d k rt d	   n t |  d k rd } n  xK | D]C } | | k rt j t j g  d
 t g d | j | | <qqWxu |  j D]j } | | j k r/| | j j |  j |  | | <q| j j  | k r| j j |  j |  | _ qqW| t k rt | j  d k st!  | | j d } n  | | | | | |  f S(   sV  Validate user options against metadata in dataset

     columns, index and categories must be in the list of columns available
     (both data columns and path-based partitioning - subject to possible
     renaming, if pandas metadata is present). The output index will
     be inferred from any available pandas metadata, if not given.
     i    (   t   check_column_namest   pandasc         S` s   i  |  ] } | |  q S(    (    (   RM   t   k(    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pys
   <dictcomp>  s   	 i   s/   File has multiple entries for 'pandas' metadatac         3` s!   |  ] } |   k r | Vq d  S(   N(    (   RM   R*   (   R2   (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pys	   <genexpr>:  s    c         ` s+   i  |  ]! \ } } |   j  | |   q S(    (   R    (   RM   R|   t   v(   R3   (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pys
   <dictcomp>=  s   	 s&   Cannot read DataFrame with MultiIndex.RB   RC   N("   t   fastparquet.utilRz   R   R$   RI   R   t   tuplet   fmdt   key_value_metadatat   keyt   valueRY   t
   _get_indexR4   t   jsont   loadsR"   t   extendR7   R?   RB   R   t   _dtypest   itemsR   Rm   R   t   CategoricalR	   RC   t   catt   set_categoriesR   t   AssertionError(   RR   R   RC   RB   Rq   Rz   R*   t	   pandas_mdR/   R0   R'   R>   RP   RJ   Rs   R   t   catcol(    (   R2   R3   s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyRc     sj    #!%%		$#c      	   ` s>  d d l  m } d d l m } m } m }	 | |  \  }
 g  | D] } |	 |  ^ qE } | |
  } | | d d  j  |  _ t |
 |   _	 t
  | |  g   \ } } }      d t | |                f	 d   t |  D } d	 t |  d }  |  | |  S(
   sB   Read dataset with fastparquet by assuming metadata from first filei    (   RX   (   t   analyse_pathst   get_file_schemet	   join_pathR@   s   read-parquet-c         ` sR   i  |  ]H \ } } t   |      t k   j  j  f  | f  q S(    (   t   _read_pf_simpleR   RI   RK   (   RM   RN   R^   (	   RP   t   baseRB   RQ   R/   R   R>   RR   R3   (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pys
   <dictcomp>j  s   	i   N(   N(   RV   RX   R~   R   R   R   RZ   RK   t   _paths_to_catsRI   Rc   R   Rg   R"   RY   (   RQ   Ro   Rp   R   RB   RC   RX   R   R   R   t   fnst   pt   parsed_pathst   schemeRs   t   _RU   Ru   Rv   (    (	   RP   R   RB   RQ   R/   R   R>   RR   R3   s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyR[   Z  s     	*
$c
         C` sm  d d l  m }
 |
 | d |  j } | j | d  j d  } x. | j D]# } x | j D] } | | _ qZ WqJ W| | _ | | _	 | | _
 | j | | d | } | j j d k r | r|	 j | d | d  | j _ qn4 | rg  | D] } |	 j | |  ^ q | j _ n  g  | D]* } | | p/g  k r|	 j | |  ^ q| _ | re| | j d S| Sd S(	   s9   Read dataset with fastparquet using ParquetFile machineryi    (   RX   R@   t    t   /RC   i   N(   RV   RX   RZ   t   replacet   lstripRd   R   t	   file_pathRK   RI   R`   t	   to_pandasRC   t   nlevelsR    R   t   names(   RQ   R^   R   R/   RP   t	   is_seriesRB   RI   R   R3   RX   RR   t   relpathRO   t   cht   dfR   t   col(    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyR   s  s.    			+
-c         C` su  d d l  m } m } m } t   } t   } x |  D] } | d  } | d k r | j |  }	 x |	 D]J \ }
 } | j |
 t    j | |   | j |
 t    j |  qi Wq5 xo t	 | j
 d  d   D]T \ } } d | }
 | j |
 t    j | |   | j |
 t    j |  q Wq5 Wx.| j   D] \ }
 } | |
 } t |  t |  k rt   } x4 | |
 D]( } | j | |  t    j |  qWg  | j   D]+ } t |  d k r| D] } | ^ qq} t d |   n  | |  } t |  d k r=d d	 l } g  | j   D] } | d ^ q3} | j d
 |  q=q=Wd   | j   D S(   s2   Extract out fields and labels from directory namesi    (   t   ex_from_sept
   val_to_numt   groupby_typesR   t   hiveis   dir%ii   s)   Partition names map to the same value: %sNs<   Partition names coerce to values of different types, e.g. %sc         S` s%   i  |  ] \ } } t  |  |  q S(    (   R$   (   RM   R|   R}   (    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pys
   <dictcomp>  s   	 (   R~   R   R   R   R   t   findallt
   setdefaultR%   t   addRg   R_   R   RY   t   valuesR7   t   warningst   warn(   Rp   R   R   R   R   RI   t   raw_catsR^   t   st
   partitionsR   t   valRN   R}   t   rawt   conflicts_by_valuet   raw_valR|   t   ct	   conflictst   vals_by_typeR   R*   t   examples(    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyR     s@    		"#&
"$
	+#	c         G` sE  d d l  m } d d l m } d   |
 j   D } t | t t f  s] | g } t } n  | r | \ } | | k r | | g } q n  g  | D] } | j	 | |  ^ q } | j	 | |  } | g  | j   D]$ \ } } | | k r | | f ^ q  } | | d |  j
 } |	 | _ xC | j D]8 } x/ | j D]$ } | j | d  j d  | _ q7Wq'W| | _ | j d | d	 | d
 |  } | j j d k r| r|
 j	 | |  | j _ qn4 | rg  | D] } |
 j	 | |  ^ q| j _ n  g  | D]$ } | | k r|
 j	 | |  ^ q| _ | r=| | j d S| Sd S(   s9   Read a single file with fastparquet, to be used in a taski    (   RX   (   R   c         S` s   i  |  ] \ } } | |  q S(    (    (   RM   R|   R}   (    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pys
   <dictcomp>  s   	 R@   R   R   R   RC   RB   i   N(   t   fastparquet.apiRX   t   collectionsR   R   R   R   R$   Ra   R    RZ   RK   Rd   R   R   R   R   R`   R   RC   R   R   R   (   RQ   R   R`   RC   R   t   seriesRB   t   cst   dtR   R3   t   argsRX   R   t   name_storage_mappingR   R|   R}   RR   RO   R   R   R   (    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyt   _read_parquet_file  s@    			%=	&	+
'c         G` s  d d l  m } d d l m } d d l m } d   | j   D } t | t t	 f  sm | g } t
 } n  | r | \ } | | k r | | g } q n  g  | D] } | j | |  ^ q } | j | |  } | g  | j   D]$ \ } } | | k r | | f ^ q  } | | j | | | | |	 |  \ } } | | | | | | | d |  j d | d |
 | j j d	 k r| r| j | |  | j _ qn4 | rg  | D] } | j | |  ^ q| j _ n  g  | D]$ } | | k r| j | |  ^ q| _ | r| | j d S| Sd  S(
   Ni    (   t   _pre_allocate(   t   read_row_group_file(   R   c         S` s   i  |  ] \ } } | |  q S(    (    (   RM   R|   R}   (    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pys
   <dictcomp>  s   	 RZ   t   assignR   i   (   R   R   t   fastparquet.coreR   R   R   R   R   R   R$   Ra   R    t   num_rowsRZ   RC   R   R   R   R   (   RQ   R`   RC   R   RO   R   RB   RH   R   R   R   R3   RE   R   R   R   R   R   R   R|   R}   R   t   viewsR   (    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyRF     s<    			%=+
'c         C` s  d d l  m } m } d d  l }	 t j |  } t |   sF d  }
 n | r t |	 j  d k r | |  | | | | | | j	 | j
  }
 q| |  | | | | | j | | j	 | j
 	 }
 n] t |   | _ | j	 | j j | | g  d  ( } | | |  | j d | d | }
 Wd  QX|
 S(   Ni    (   t   partition_on_columnst   make_part_files   0.1.4t   wbt   compressionR   (   t   fastparquet.writerR   R   RV   t   copyRY   R"   R   t   __version__RZ   t   mkdirsRA   R   t   joinRH   (   R   RQ   R^   t   filenameR   R   t   partition_onR   R   RV   Rt   t   fil(    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyt   _write_partition_fastparquet  s     	'c	      
   K` sp  d d  l  }
 | j |  | j } |	 j d d  } | d k sa t | t  rp d | j   k rp t d   n  |  j } | t	 k s | d  k r |  j r |  j   }  |  j d g } n t	 } g  } | ry% |
 j j | d | j d | } Wqt t f k
 rt } qXn  | r| j d k r7t d   n t | j  t |  j  t |  k szt |  t | j  k rt d j | j t |  j     n} t j | j  j | j |  | j j k j   rt d j t | j j    t |  j j    A   n |  | j | }  | j } |
 j  j! | j"  } | s|
 j j# |  } | | d d d } | d | k  rt d j | | d    qqn0 |
 j  j$ |  j% d | d | d | |	 } d } g  t& |  j'  D] } d | | ^ q} t( t) d t } g  t* | |  j+    D]* \ } } | | | | | | | |  ^ q!} t( t,  | | | | | |  S(   Ni    t   object_encodingt   utf8t   infersM   "infer" not allowed as object encoding, because this required data in memory.R@   RA   R   t   emptyt   flats?   Requested file scheme is hive, but existing file scheme is not.s5   Appended columns not the same.
Previous: {} | New: {}s   Appended dtypes differ.
{}RT   isM   Appended divisions overlapping with the previous ones.
Previous: {} | New: {}t
   index_colst   ignore_columnss   part.%i.parquett   pure(   R   R   R   (-   RV   R   RA   t   popR   R   R   R7   Rv   Ra   R"   t   known_divisionst   reset_indexR   RW   RX   RZ   t   IOErrorR5   RK   R%   RI   Rj   R$   Rm   R   RJ   t   loct   anyR   t	   iteritemsR   t   writert   find_max_partRd   Rh   t   make_metadatat   _metat   ranget   npartitionsR   R   t   zipt
   to_delayedt   _write_metadata(   R   RQ   Ro   R^   t   write_indexR#   t   ignore_divisionsR   R   t   kwargsRV   RA   R   Rv   R   RR   R   t   i_offsetRw   t   old_endRN   t	   filenamest   writeR   t   partt   writes(    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyt   _write_fastparquet.  sb    	-	!%C	2			'Cc         C` s  d d l  } t j |  } x t | |   D]{ \ } } | d k	 r+ t | t  rv xN | D] }	 | j j |	  qY Wq x | j D] }
 | |
 _	 q W| j j |  q+ q+ W| j
 | d g  } | j j | | d | j d t | j
 | d g  } | j j | | d | j d S(   sc    Write Parquet metadata after writing all row groups

    See Also
    --------
    to_parquet
    i    NRD   R@   t   no_row_groupst   _common_metadata(   RV   R   R   R"   R   R$   Rd   R#   R   R   R   R   t   write_common_metadataRZ   R5   (   R   R   R   R^   RQ   RA   RV   R`   RO   t   rt   chunk(    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyR   t  s    c         ` s  d d l  m } d d  l j  t   t  r:   g   n!   d  k rO g    n t      t | t  ry t |  } n   j	 | d |   d |   j
 d  k	 r g   j
 j D] }	 |	 d  k	 r |	 ^ q }
 n g  }
  j j   } | j d  k	 od | j k } | rJt j | j d j d   } t |  \    } n( g   | j  d    D  d  g }  g  |
 D] } |  k r|| ^ q|7 t |  |   \      } g  }   f d	   } x?  j D]4 } | j |  } | j d k r| j |  qqWt |  d
 k r\t  j  d
 k r\t | d d   } n  t   d
 k rg   j   D]" \ } }  d | k r{| ^ q{} | r| d } qd  } n d  } t | | | |  } t |    }  f d   | j   D } t | |  |  } t | d   }  t  k rgt | j!  d
 k sSt"  | | j! d } n  d t# | | |   | r        f d   t$ |  D } n t% |  } i |  d f 6}  |  | |  S(   Ni   (   t   get_pyarrow_filesystemi    t
   filesystemRq   R{   R   c         S` s   i  |  ] } | |  q S(    (    (   RM   R|   (    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pys
   <dictcomp>  s   	 c         ` s    j    j |  d d  S(   Nt   modet   rb(   RX   RZ   (   R`   (   RQ   t   pq(    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyt   <lambda>  R   i   R   c         S` s   t  |  j  S(   N(   R   R^   (   t   piece(    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyR    R   c         ` s+   i  |  ]! \ } } |   j  | |   q S(    (   R    (   RM   R|   R}   (   R3   (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pys
   <dictcomp>  s   	 t   colss   read-parquet-c      
   ` sF   i  |  ]< \ } } t   |    t k  j   f  | f  q S(    (   t   _read_pyarrow_parquet_pieceR   R   (   RM   RN   R  (   RB   R2   t   datasetRQ   R/   R>   t	   task_name(    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pys
   <dictcomp>  s   	(&   t
   bytes.coreR   t   pyarrow.parquett   parquetR   R   R"   R$   R   t   ParquetDatasetR   t   partition_namesRH   t   to_arrow_schemat   metadataR   R   t   decodeR4   R   R?   t   piecest   get_metadatat   num_row_groupsR#   RY   t   sortedR   t   _get_pyarrow_divisionsR   R   R   R   R   R   R   Rg   R   (   RQ   Ro   Rp   R   Rq   RB   RC   Rr   R   R'   R   RH   t   has_pandas_metadataR&   R0   R   RP   t   non_empty_piecest   _openR  RR   R-   R   t   divisions_namest   divisions_nameRv   RJ   Rs   t	   task_plan(    (	   RB   R2   R  RQ   R/   R>   R  R3   R	  s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyt   _read_pyarrow  s|    				)
'			c         C` s{   i t  d  d 6t  d  d 6t  d  d 6d d 6} y | j |  } Wn) t k
 rr t d	 j d
 |    n X|  | S(   s  
    Convert an input time in the specified units to nanoseconds

    Parameters
    ----------
    val: int
        Input time value
    unit : str
        Time units of `val`.
        One of 's', 'ms', 'us', 'ns'

    Returns
    -------
    int
        Time val in nanoseconds
    g    eAR   g    .At   msg     @@t   usi   t   nss   Unsupported time unit '{unit}'t   unit(   t   intR    t   KeyErrorR7   Rj   (   R   R!  t   factorst   factor(    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyt   _to_ns  s    4c         C` sp  d d l  } d d l j } | t k rK | j t d  k  rK t d   n  | t k r| d k	 ou | j |  d k } | t	 k r	| d k r | j t d  k r d g } x, |  D]$ } | j
 | d | j   j  q W| d c d 8<| St d   q	n d } |  rI| rIg  }	 d }
 x |  D] } | j | j  } | j d  } g  t | j  D] } | j |  j ^ q_} y | j |  } Wn t k
 rd }	 Pn X| j |  } | j } | j r|
 d k s|
 | j k  r|	 j
 | j | j f  | j }
 q(d }	 Pq(W|	 rg  |	 D] \ } } | ^ q$|	 d d g } | j |  } | j j | j  r| j j } g  | D] } t | |  ^ q} g  | D] } t j  |  ^ q} n  | j | j!   k rFd	 } g  | D] } | j" |  j#   ^ q} qFql| t k r2t d
 j$ d |    n  d t% |   d } n# |  rfd t% |   d } n d } | S(   s{  
    Compute DataFrame divisions from a list of pyarrow dataset pieces

    Parameters
    ----------
    pa_pieces : list[pyarrow.parquet.ParquetDatasetPiece]
        List of dataset pieces. Each piece corresponds to a single partition in the eventual dask DataFrame
    divisions_name : str|None
        The name of the column to compute divisions for
    pa_schema : pyarrow.lib.Schema
        The pyarrow schema for the dataset
    infer_divisions : bool or None
        If True divisions must be inferred (otherwise an exception is raised). If False or None divisions are not
        inferred
    Returns
    -------
    list
    i    Ns   0.9.0s-   infer_divisions=True requires pyarrow >=0.9.0s   0.13.0ii   sD   Unable to infer divisions for because no index column was discovereds   utf-8sl   Unable to infer divisions for index of '{index_name}' because it is not known to be sorted across partitionsRU   (   N(   N(   NN(&   t   pyarrowR  R  Ra   R   R   Rb   R"   t   get_field_indexR5   R#   R  R   R7   RX   t	   row_groupR   t   num_columnst   columnt   path_in_schemaRC   t
   statisticst   has_min_maxRS   RT   t   field_by_namet   typest   is_timestampt   typeR!  R&  Rm   Rn   t   stringR  t   stripRj   RY   (   t	   pa_piecesR  t	   pa_schemaRr   t   paR  t   divisions_name_in_schemaRv   R  t   min_maxst   last_maxRR   RO   RN   t   rg_pathst   divisions_col_indext   col_metat   statst   mnt   mxt   index_fieldt	   time_unitRx   t   divisions_nsR   t   encoding(    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyR  !  st    !	.	$.%.	c         C` s  d d  l  } |  j | j d d / } | j d | | d | d t d |  }	 Wd  QX| j t d  k  r |	 j   }
 xj | D] } |
 | j d	  |
 | <q} WnB | j t d
  k  r |	 j d |  }
 n |	 j d | d t	  }
 t
 |
 j t j  p|
 j j } | r$| r$|
 j |  }
 n | r|
 j j | k r|
 j d t	  }
 | rf|
 j |  }
 n  t t |
 j  j |   } | r|
 j | d d }
 n  |
 j d | d t	  }
 n  | r|
 |
 j d S|
 | Sd  S(   Ni    R  R  R   R   t   use_pandas_metadatat   files   0.9.0t   categorys   0.11.0RB   t   date_as_objectt   dropt   axisi   R   (   R'  RZ   R^   t   readRa   R   R   R   t   astypeR5   R   RC   Rm   t
   RangeIndexR   t	   set_indexR   R$   R%   R   t
   differenceRI  t   reindex(   RQ   R  R   R   R   R   RB   R7  t   ft   tableR   R   t	   has_indexRI  (    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyR    s6    "t   row_group_sizet   versiont   use_dictionaryR   t   use_deprecated_int96_timestampst   coerce_timestampst   flavort
   chunk_sizec      
   K` sK  | r t  d   n  | r* t  d   n  t |  j t  rq d d t t |  j t   }	 t |	   n  | d  k r |  j r t } n  | j	 |  | j
 j | d g  }
 t t d t } | j   } | j
 j | d g  | d <g  t |  j    D]: \ } } | | | | |
 | | | | r2| n |  ^ q} t |  S(	   Ns/   `append` not implemented for `engine='pyarrow'`s9   `ignore_divisions` not implemented for `engine='pyarrow'`s   Unexpected keyword arguments: s   %rs   part.%i.parquetR   R   t   metadata_path(   Rb   R%   RO  t   _pyarrow_write_table_kwargsR$   Ri   R"   R   Ra   R   RA   R   R   t   _write_partition_pyarrowR5   R   Rg   R   (   R   RQ   Ro   R^   R   R#   R   R   R   t   msgt   templateR   t   first_kwargsRN   R   R   (    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyt   _write_pyarrow  s$     	Pc      
   K` s   d d  l  } d d l  m }	 | j j |  d | }
 | rb |	 j |
 | d | d | d | | n. | j | d   } |	 j |
 | |  Wd  QX| d  k	 r | j | d  3 } d   | j   D } |	 j	 |
 j
 | |  Wd  QXn  d  S(   Ni    (   R  t   preserve_indext   partition_colsR   R   c         S` s+   i  |  ]! \ } } | t  k r | |  q S(    (   t   _pyarrow_write_metadata_kwargs(   RM   R|   R}   (    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pys
   <dictcomp>  s   	 	(   R'  R  t   Tablet   from_pandast   write_to_datasetRZ   t   write_tableR"   R   t   write_metadataRH   (   R   R^   RQ   R   R   R   R[  R   R7  R  t   tR   t   kwargs_meta(    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyR]    s    c         C` s>  |  t  k r t  |  S|  d k rh xd d g D]( } y t |  SWq- t k
 rT q- Xq- Wt d   n |  d k r t d d  i t d 6t d 6t  d <} | S|  d k r!t d d  } t | j  } | d	 k  r t d
   n | d k rt d   n  i t d 6t	 d 6t  d <} | St
 d j |   d   d S(   s8  Get the parquet engine backend implementation.

    Parameters
    ----------
    engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
        Parquet reader library to use. Default is first installed in this list.

    Returns
    -------
    A dict containing a ``'read'`` and ``'write'`` function.
    t   autoRV   R'  s,   Please install either fastparquet or pyarrows   `fastparquet` not installedRK  R   s   `pyarrow` not installeds   0.8.0s!   PyArrow version >= 0.8.0 requireds   0.13.0sC   PyArrow version 0.13.0 isn't supported, please upgrade or downgrades   Unsupported engine: "{0}".s4     Valid choices include "pyarrow" and "fastparquet".N(   t   _ENGINESt
   get_enginet   RuntimeErrorR   Ry   R   R   R   R  Ra  R7   Rj   (   t   enginet   engR7  t
   pa_version(    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyRn    s4    

Rl  c         C` s  t  } y d d l }	 t |  |	 j j  r |  j |	 j j k rr t j	 d |  j
  sr t d d d |  j
   n  | d k s t d	 d
   t } n  Wn t k
 r n X| rt d  d }
 |  j
 j d  r |  j
 t d   } n	 |  j
 } t | d d d | \ } } } |  } ng t |  d }
 t |  d d d | \ } } } t |  t  rt |  d k rt | d t } n  |
 | | | d | d | d | d | d | S(   s  
    Read ParquetFile into a Dask DataFrame

    This reads a directory of Parquet data into a Dask.dataframe, one file per
    partition.  It selects the index among the sorted columns if any exist.

    Parameters
    ----------
    path : string, list or fastparquet.ParquetFile
        Source directory for data, or path(s) to individual parquet files.
        Prefix with a protocol like ``s3://`` to read from alternative
        filesystems. To read from multiple files you can pass a globstring or a
        list of paths, with the caveat that they must all have the same
        protocol.
        Alternatively, also accepts a previously opened
        fastparquet.ParquetFile()
    columns : string, list or None (default)
        Field name(s) to read in as columns in the output. By default all
        non-index fields will be read (as determined by the pandas parquet
        metadata, if present). Provide a single field name instead of a list to
        read in the data as a Series.
    filters : list
        List of filters to apply, like ``[('x', '>', 0), ...]``. This implements
        row-group (partition) -level filtering only, i.e., to prevent the
        loading of some chunks of the data, and only if relevant statistics
        have been included in the metadata.
    index : string, list, False or None (default)
        Field name(s) to use as the output frame index. By default will be
        inferred from the pandas parquet file metadata (if present). Use False
        to read all fields as columns.
    categories : list, dict or None
        For any fields listed here, if the parquet encoding is Dictionary,
        the column will be created with dtype category. Use only if it is
        guaranteed that the column is encoded as dictionary in all row-groups.
        If a list, assumes up to 2**16-1 labels; if a dict, specify the number
        of labels expected; if None, will load categories automatically for
        data written by dask/fastparquet, not otherwise.
    storage_options : dict
        Key/value pairs to be passed on to the file-system backend, if any.
    engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
        Parquet reader library to use. If only one library is installed, it
        will use that one; if both, it will use 'fastparquet'
    infer_divisions : bool or None (default).
        By default, divisions are inferred if the read `engine` supports
        doing so efficiently and the `index` of the underlying dataset is
        sorted across the individual parquet files. Set to ``True`` to
        force divisions to be inferred in all cases. Note that this may
        require reading metadata from each file in the dataset, which may
        be expensive. Set to ``False`` to never infer divisions.

    Examples
    --------
    >>> df = dd.read_parquet('s3://bucket/my-parquet-data')  # doctest: +SKIP

    See Also
    --------
    to_parquet
    i    Ns   .*://s'   ParquetFile: Path must contain protocols3    (e.g., s3://...) when using other than the defaults    LocalFileSystem. Path given: Rl  RV   s2   'engine' should be set to 'auto' or 'fastparquet' s)   when reading from fastparquet.ParquetFileRK  RD   R  R  t   storage_optionsi   R   R   Rq   RB   RC   Rr   (   Rl  RV   (   R5   RV   R   RW   RX   RZ   t   utilt   default_openR   R!   R`   R   Ra   t   ImportErrorRn  t   endswithRY   R   R   R  R   (   R^   R   Rq   RB   RC   Rs  Rp  Rr   t   is_ParquetFileRV   RK  t   urlpathRQ   Ro   Rp   (    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyR   6  s>    <
		!t   defaultc
         K` s   | p	 g  } t  |  t  |  j  r4 t d   n  | d k rM | |
 d <n d t k rf d |
 d <n  t |  d } t | d d d | \ } } } t |  d	 } | |  | | | d
 | d | d | d | |
 } |	 r | j   d S| S(   s  Store Dask.dataframe to Parquet files

    Notes
    -----
    Each partition will be written to a separate file.

    Parameters
    ----------
    df : dask.dataframe.DataFrame
    path : string
        Destination directory for data.  Prepend with protocol like ``s3://``
        or ``hdfs://`` for remote data.
    engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
        Parquet library to use. If only one library is installed, it will use
        that one; if both, it will use 'fastparquet'.
    compression : string or dict, optional
        Either a string like ``"snappy"`` or a dictionary mapping column names
        to compressors like ``{"name": "gzip", "values": "snappy"}``. The
        default is ``"default"``, which uses the default compression for
        whichever engine is selected.
    write_index : boolean, optional
        Whether or not to write the index. Defaults to True *if* divisions are
        known.
    append : bool, optional
        If False (default), construct data-set from scratch. If True, add new
        row-group(s) to an existing data-set. In the latter case, the data-set
        must exist, and the schema must match the input data.
    ignore_divisions : bool, optional
        If False (default) raises error when previous divisions overlap with
        the new appended divisions. Ignored if append=False.
    partition_on : list, optional
        Construct directory-based partitioning by splitting on these fields'
        values. Each dask partition will result in one or more datafiles,
        there will be no global groupby.
    storage_options : dict, optional
        Key/value pairs to be passed on to the file-system backend, if any.
    compute : bool, optional
        If True (default) then the result is computed immediately. If False
        then a ``dask.delayed`` object is returned for future computation.
    **kwargs
        Extra options to be passed on to the specific backend.

    Examples
    --------
    >>> df = dd.read_csv(...)  # doctest: +SKIP
    >>> dd.to_parquet(df, '/path/to/output/', compression='snappy')  # doctest: +SKIP

    See Also
    --------
    read_parquet: Read parquet data to dask.dataframe
    s#   Partitioning on non-existent columnRz  R   t   snappyR   R  R   Rs  R^   R   R#   R   R   N(	   R%   R   R7   R
   Rn  R   R   t   computeR"   (   R   R^   Rp  R   R   R#   R   R   Rs  R|  R   R   RQ   Ro   R   t   out(    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyR     s$    6		
(   R   R   (D   t
   __future__R    R   R   R   R   R   R   R   R]   t   distutils.versionR   t   numpyRk   R{   Rm   t   coreR   R   t   utilsR   R   R	   t   bytes.compressionR
   R   R   t   compatibilityR   R   R   R
  R   t   bytes.utilsR   R   R   R   R   t   __all__R4   R?   R"   Ry   Rc   R[   R   R   R   RF   R   R5   R   R   R  R&  R  R  R\  Rd  Ra  R]  Rm  Rn  R   Ra   R   t   __doc__(    (    (    s8   lib/python2.7/site-packages/dask/dataframe/io/parquet.pyt   <module>   sr   	O	GP	W		-	*	*	D	t		u	)		 	2j		O