
\c           @` s  d  d l  m Z m Z m Z d  d l m Z d  d l m Z m Z m	 Z	 y d  d l
 Z
 Wn e k
 rq d Z
 n Xd  d l Z d  d l Z d  d l m Z m Z m Z m Z m Z d d l m Z m Z d d l m Z m Z d d	 l m Z m  Z  m! Z! m" Z" d d
 l# m# Z# d d l$ m% Z% m& Z& d d l$ m' Z' d d l m( Z( d d e) e* d d  Z+ d   Z, e) e* d d d  Z- d   Z. e
 d k	 re   < e	 d e/  e
 j0   j1 Z2 e
 j3   Z4 e. e2 e4  Z5 Wd QXn d# Z5 e5 e) d d d e* e* d e* d 	 Z6 d Z7 d   Z8 e8 e j9 d d  Z9 e8 e j: d d  Z: e8 e j; d d  Z; d    Z< d d e) d d e* d!  Z= e  rd d" l> m? Z? e= j@ e? j= _@ n  d S($   i    (   t   print_functiont   divisiont   absolute_import(   t   BytesIO(   t   warnt   catch_warningst   simplefilterN(   t   is_integer_dtypet   is_float_dtypet   is_object_dtypet   is_datetime64_any_dtypet   CategoricalDtypei   (   t
   read_bytest
   open_files(   t   seekable_filest   files(   t   PY2t   PY3t   Mappingt   unicode(   t   delayed(   t
   asciitablet   parse_bytesi   (   t   clear_known_categoriesi   (   t   from_delayedc	         C` s'  t    }	 | r5 | j | j    r5 |	 j |  n  |	 j |  |	 j d  |  |	 |  }
 | rt t |
 |  n  | r | r t |
 j  t |  k r t d |
 j |   n | r | |
 _ n  | r#| \ } } } | j	 |  } |
 j
 i t j j t j t |
  |  |  | 6  }
 n  |
 S(   sS   Convert a block of bytes to a Pandas DataFrame

    Parameters
    ----------
    reader : callable
        ``pd.read_csv`` or ``pd.read_table``.
    b : bytestring
        The content to be parsed with ``reader``
    header : bytestring
        An optional header to prepend to ``b``
    kwargs : dict
        A dictionary of keyword arguments to be passed to ``reader``
    dtypes : dict
        DTypes to assign to columns
    path : tuple
        A tuple containing path column name, path to file, and all paths.

    See Also
    --------
    dask.dataframe.csv.read_pandas_from_bytes
    i    s   Columns do not match(   R   t
   startswitht   rstript   writet   seekt   coerce_dtypest   listt   columnst
   ValueErrort   indext   assignt   pdt   Categoricalt
   from_codest   npt   fullt   len(   t   readert   bt   headert   kwargst   dtypesR   t   write_headert   enforcet   patht   biot   dft   colnamet   pathst   code(    (    s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pyt   pandas_read_text   s$    	'	1c      	   C` s  g  } g  } g  } x|  j  D] } | | k r |  j | | | k r |  j | } | | } t |  r t |  r | j | | | f  qt |  r t |  r | j |  qy |  | j | |  |  | <Wqt k
 r} | j | | | f  | j | | f  qXq q W| r| rad j	 d   t
 | d d   D  }	 d |	 }
 d } n d }
 d } t
 | d d   } t d	 d
 d g |  } d d j	 d   | D  } d j d | d |
 d | d |  } n d } | r0| rd n d } d j	 d   | D  } d j d | d |  } n d } | sB| rd d d } d | j	 t d | | g   } t |   n  d S(   s    Coerce dataframe to dtypes safely

    Operates in place

    Parameters
    ----------
    df: Pandas DataFrame
    dtypes: dict like {'x': float}
    s   
c         s` s%   |  ] \ } } d  | | f Vq d S(   s	   - %s
  %rN(    (   t   .0t   ct   e(    (    s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pys	   <genexpr>j   s    t   keyc         S` s   t  |  d  S(   Ni    (   t   str(   t   x(    (    s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pyt   <lambda>k   t    sA   The following columns also raised exceptions on conversion:

%s

R>   sf   

Alternatively, provide `assume_missing=True` to interpret
all unspecified integer columns as floats.c         S` s   t  |  d  S(   Ni    (   R;   (   R<   (    (    s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pyR=   v   R>   t   Columnt   Foundt   Expecteds
   dtype={%s}s	   ,
       c         s` s(   |  ] \ } } } d  | | f Vq d S(   s   %r: '%s'N(    (   R7   t   kt   vt   _(    (    s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pys	   <genexpr>y   s   s   {table}

{exceptions}Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

{dtype_kw}

to the call to `read_csv`/`read_table`.{extra}t   tablet
   exceptionst   dtype_kwt   extras    also t    c         s` s   |  ] } d  | Vq d S(   s   - %sN(    (   R7   R8   (    (    s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pys	   <genexpr>   s    s  The following columns{also}failed to properly parse as dates:

{cols}

This is usually due to an invalid value in that column. To
diagnose and fix it's recommended to drop these columns from the
`parse_dates` keyword, and manually convert them to dates later
using `dd.to_datetime`.t   alsot   colss   

%s

t   -i=   s=   Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

%sN(   R   R-   R   R   t   appendR	   R
   t   astypet	   Exceptiont   joint   sortedR   t   formatt   Nonet   filterR    (   R2   R-   t
   bad_dtypest	   bad_datest   errorsR8   t   actualt   desiredR9   t   exRF   RH   RE   RG   t	   dtype_msgRJ   RK   t   date_msgt   rulet   msg(    (    s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pyR   H   sZ    
#
				c	         C` s  | j  j   }	 | j d d g  j }
 g  } |
 } t | t  r g  |
 D]< } t | j |  t  rI | j |  j d k	 rI | ^ qI } |
 j
 |  } n- t | t  r | j d k r g  } |
 } n  x | D] } d |	 | <q Wt | j  } t t d t } g  } | pd \ } } x t |  D] \ } } | sCq+n  | r_| | | | f } n d } | |  | d | | |	 | d t d | d | } | j |  | j   } | j d d  x@ | d	 D]4 } | j | |  | | | |	 | d | d |  qWq+W| r}| rO| j i t j j t j t |  d
 t |  | 6  } n  t |  rpt | d | } n  t | |  S| Sd S(   s   Convert blocks of bytes to a dask.dataframe or other high-level object

    This accepts a list of lists of values of bytes where each list corresponds
    to one file, and the value of bytes concatenate to comprise the entire
    file, in order.

    Parameters
    ----------
    reader : callable
        ``pd.read_csv`` or ``pd.read_table``.
    block_lists : list of lists of delayed values of bytes
        The lists of bytestrings where each list corresponds to one logical file
    header : bytestring
        The header, found at the front of the first file, to be prepended to
        all blocks
    head : pd.DataFrame
        An example Pandas DataFrame to be used for metadata.
        Can be ``None`` if ``collection==False``
    kwargs : dict
        Keyword arguments to pass down to ``reader``
    collection: boolean, optional (defaults to True)
    path : tuple, optional
        A tuple containing column name for path and list of all paths

    Returns
    -------
    A dask.dataframe or list of delayed values
    t   includet   categoryt   purei    R.   R/   R0   t   skiprowsi   t   dtypeRK   N(   NN(   R-   t   to_dictt   select_dtypesR   t
   isinstanceR   t   getR   t
   categoriesRS   t
   differenceR   R   R6   t   Truet	   enumeratet   FalseRM   t   copyt   popR"   R#   R$   R%   R&   t   zerosR(   t   intR   R   (   R)   t   block_listsR+   t   headR,   t
   collectionR/   t   specified_dtypesR0   R-   t   categoricalst   known_categoricalst   unknown_categoricalsRB   R   t   delayed_pandas_read_textt   dfsR3   R4   t   it   blockst	   path_infoR2   t   rest_kwargsR*   (    (    s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pyt   text_blocks_to_pandas   sX    $				+c         C` s-   d } t  |  | |  } t | t  d   S(   Ni
   g    A(   Rp   t   min(   t   total_memoryt	   cpu_countt   memory_factort	   blocksize(    (    s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pyt   auto_blocksize   s    t   ignorei   i  c          K` s  |  j  } | d  k	 r4 t |  d k r4 | | d <n d } |
 rX t |
 t  rX d }
 n  d | k sp d | k r t d j |    n  x; d d	 g D]- } | | k r t d
 j | |    q q W| j d d   r t d j |    n  t | j d  t  r"| j d  } } } ns | j d  d  k rHd } } } nM t	 | j d   } t
 |  } t t	 t t |  d   t	 |   } t | j d  t  rt d j |    n  t | j d  t  r|
 r| j d  j |
 d   } n d  } t | t t f  r+t |  } n  | rT| t k rTt d |  d  } n  | t k r| t k rt d |   n  | r| r| | k  r| d k rt d  | } n  | j   } t | d | d | d | d | d |
 |	 pi  } |
 rA| \ } } } | r2g  | D] } | |  ^ q} n  |
 | f } n | \ } } d  } t | d t t f  sx| g } n  | t k rt | d  r| d d j   } n  | j d d   } | j d | d  k rd n d   } | d  k rd n d } | j | | |  } | sd n t |  t | d  } | t k	 rs| | | k  rst |  | k rst d   n  | d  k rd n | | | } |  t |  |  } |
 r|
 | j k rt d  |
   n  | j d! i   } | d  k ri  } n  | r\t | t  r\xM | j D]? } t | | j   r| | k r| | j! t"  | | <qqWn  t# |  | | | | d" | d# | d$ | d | S(%   Ni   t   lineterminators   
R0   R!   t	   index_cols]   Keywords 'index' and 'index_col' not supported. Use dd.{0}(...).set_index('my-index') insteadt   iteratort	   chunksizes   {0} not supported for dd.{1}t   nrowss   The 'nrows' keyword is not supported by `dd.{0}`. To achieve the same behavior, it's recommended to use `dd.{0}(...).head(n=nrows)`Rb   i    R+   s,   List of header rows not supported for dd.{0}t
   converterss   Warning %s compression does not support breaking apart files
Please ensure that each individual file can fit in memory and
use the keyword ``blocksize=None to remove this message``
Setting ``blocksize=None``s#   Compression format %s not installeds}   Unexpected behavior can result from passing skiprows when
blocksize is smaller than sample size.
Setting ``sample=blocksize``t	   delimiterR   t   samplet   compressiont   include_patht   namest   inferi   is   Sample is not large enough to include at least one row of data. Please increase the number of bytes in `sample` in the call to `read_csv`/`read_table`R>   s   Files already contain the column name: %s, so the path column cannot use this name. Please set `include_path_column` to a unique name.Rc   Rs   R/   Rt   ($   t   __name__RS   R(   Rf   t   boolR    RR   Rg   Rp   t   sett   maxR   t   rangeR   t	   TypeErrort   dictR;   R   R   R   R   t   cfilest   NotImplementedErrort   encodeR   t   tupleRl   t   computet   splitR   R   R   Rc   RN   t   floatR~   (    R)   t   urlpathR   Rs   R   R   R   R/   t   assume_missingt   storage_optionst   include_path_columnR,   t   reader_namet   kwRb   t   lastskiprowt   firstrowt   path_convertert   b_lineterminatort   b_outt   b_samplet   valuesR4   R0   R   R+   t   needt   partst   npartsRr   Rt   R8   (    (    s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pyt   read_pandas  s    					,		$
	"$'. 	!sQ  
Read {file_type} files into a Dask.DataFrame

This parallelizes the :func:`pandas.{reader}` function in the following ways:

- It supports loading many files at once using globstrings:

    >>> df = dd.{reader}('myfiles.*.csv')  # doctest: +SKIP

- In some cases it can break up large files:

    >>> df = dd.{reader}('largefile.csv', blocksize=25e6)  # 25MB chunks  # doctest: +SKIP

- It can read CSV files from external resources (e.g. S3, HDFS) by
  providing a URL:

    >>> df = dd.{reader}('s3://bucket/myfiles.*.csv')  # doctest: +SKIP
    >>> df = dd.{reader}('hdfs:///myfiles.*.csv')  # doctest: +SKIP
    >>> df = dd.{reader}('hdfs://namenode.example.com/myfiles.*.csv')  # doctest: +SKIP

Internally ``dd.{reader}`` uses :func:`pandas.{reader}` and supports many of the
same keyword arguments with the same performance guarantees. See the docstring
for :func:`pandas.{reader}` for more information on available keyword arguments.

Parameters
----------
urlpath : string or list
    Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
    to read from alternative filesystems. To read from multiple files you
    can pass a globstring or a list of paths, with the caveat that they
    must all have the same protocol.
blocksize : str, int or None, optional
    Number of bytes by which to cut up larger files. Default value is
    computed based on available physical memory and the number of cores.
    If ``None``, use a single block for each file.
    Can be a number like 64000000 or a string like "64MB"
collection : boolean, optional
    Return a dask.dataframe if True or list of dask.delayed objects if False
sample : int, optional
    Number of bytes to use when determining dtypes
assume_missing : bool, optional
    If True, all integer columns that aren't specified in ``dtype`` are assumed
    to contain missing values, and are converted to floats. Default is False.
storage_options : dict, optional
    Extra options that make sense for a particular storage connection, e.g.
    host, port, username, password, etc.
include_path_column : bool or str, optional
    Whether or not to include the path to each particular file. If True a new
    column is added to the dataframe called ``path``. If str, sets new column
    name. Default is False.
**kwargs
    Extra keyword arguments to forward to :func:`pandas.{reader}`.

Notes
-----
Dask dataframe tries to infer the ``dtype`` of each column by reading a sample
from the start of the file (or of the first file if it's a glob). Usually this
works fine, but if the ``dtype`` is different later in the file (or in other
files) this can cause issues. For example, if all the rows in the sample had
integer dtypes, but later on there was a ``NaN``, then this would error at
compute time. To fix this, you have a few options:

- Provide explicit dtypes for the offending columns using the ``dtype``
  keyword. This is the recommended solution.

- Use the ``assume_missing`` keyword to assume that all columns inferred as
  integers contain missing values, and convert them to floats.

- Increase the size of the sample using the ``sample`` keyword.

It should also be noted that this function may fail if a {file_type} file
includes quoted strings that contain the line terminator. To get around this
you can specify ``blocksize=None`` to not split files into multiple partitions,
at the cost of reduced parallelism.
c         ` sR   t  t d  d  d t t d  t   f d 	 } t j d | d |  | _ | | _ | S(   Ni  c
         ` sF   t    |  d | d | d | d | d | d | d | d | d	 |	 |
 	S(
   NR   Rs   R   R   R   R/   R   R   R   (   R   (   R   R   Rs   R   R   R   R/   R   R   R   R,   (   R)   (    s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pyt   read  s    R)   t	   file_type(   t   AUTO_BLOCKSIZERj   RS   Rl   t   READ_DOC_TEMPLATERR   t   __doc__R   (   R)   R   R   R   (    (   R)   s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pyt   make_reader  s    		
	t   read_csvt   CSVt
   read_tablet	   delimitedt   read_fwfs   fixed-widthc         K` s#   |  } |  j  | |  Wd  QXd  S(   N(   t   to_csv(   R2   t   filR,   t   f(    (    s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pyt
   _write_csv  s    	c         K` sM  t  r d }	 d }
 n d }	 d }
 | j d |	  } t | d | d |
 d | d | d |  j | pc i  } t t d	 t } |  j   } | | d
 | d
 |  g } t	 |  d k r| r t | d <n  | j
 g  t | d | d  D] \ } } | | | |  ^ q  n  | rEt |  j d |  g  | D] } | j ^ q2S| Sd S(   s  
    Store Dask DataFrame to CSV files

    One filename per partition will be created. You can specify the
    filenames in a variety of ways.

    Use a globstring::

    >>> df.to_csv('/path/to/data/export-*.csv')  # doctest: +SKIP

    The * will be replaced by the increasing sequence 0, 1, 2, ...

    ::

        /path/to/data/export-0.csv
        /path/to/data/export-1.csv

    Use a globstring and a ``name_function=`` keyword argument.  The
    name_function function should expect an integer and produce a string.
    Strings produced by name_function must preserve the order of their
    respective partition indices.

    >>> from datetime import date, timedelta
    >>> def name(i):
    ...     return str(date(2015, 1, 1) + i * timedelta(days=1))

    >>> name(0)
    '2015-01-01'
    >>> name(15)
    '2015-01-16'

    >>> df.to_csv('/path/to/data/export-*.csv', name_function=name)  # doctest: +SKIP

    ::

        /path/to/data/export-2015-01-01.csv
        /path/to/data/export-2015-01-02.csv
        ...

    You can also provide an explicit list of paths::

    >>> paths = ['/path/to/data/alice.csv', '/path/to/data/bob.csv', ...]  # doctest: +SKIP
    >>> df.to_csv(paths) # doctest: +SKIP

    Parameters
    ----------
    filename : string
        Path glob indicating the naming scheme for the output files
    name_function : callable, default None
        Function accepting an integer (partition index) and producing a
        string to replace the asterisk in the given filename globstring.
        Should preserve the lexicographic order of partitions
    compression : string or None
        String like 'gzip' or 'xz'.  Must support efficient random access.
        Filenames with extensions corresponding to known compression
        algorithms (gz, bz2) will be compressed accordingly automatically
    sep : character, default ','
        Field delimiter for the output file
    na_rep : string, default ''
        Missing data representation
    float_format : string, default None
        Format string for floating point numbers
    columns : sequence, optional
        Columns to write
    header : boolean or list of string, default True
        Write out column names. If a list of string is given it is assumed
        to be aliases for the column names
    header_first_partition_only : boolean, default False
        If set, only write the header row in the first output file
    index : boolean, default True
        Write row names (index)
    index_label : string or sequence, or False, default None
        Column label for index column(s) if desired. If None is given, and
        `header` and `index` are True, then the index names are used. A
        sequence should be given if the DataFrame uses MultiIndex.  If
        False do not print fields for index names. Use index_label=False
        for easier importing in R
    nanRep : None
        deprecated, use na_rep
    mode : str
        Python write mode, default 'w'
    encoding : string, optional
        A string representing the encoding to use in the output file,
        defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
    compression : string, optional
        a string representing the compression to use in the output file,
        allowed values are 'gzip', 'bz2', 'xz',
        only used when the first argument is a filename
    line_terminator : string, default '\n'
        The newline character or character sequence to use in the output
        file
    quoting : optional constant from csv module
        defaults to csv.QUOTE_MINIMAL
    quotechar : string (length 1), default '"'
        character used to quote fields
    doublequote : boolean, default True
        Control quoting of `quotechar` inside a field
    escapechar : string (length 1), default None
        character used to escape `sep` and `quotechar` when appropriate
    chunksize : int or None
        rows to write at a time
    tupleize_cols : boolean, default False
        write multi_index columns as a list of tuples (if True)
        or new (expanded format) if False)
    date_format : string, default None
        Format string for datetime objects
    decimal: string, default '.'
        Character recognized as decimal separator. E.g. use ',' for
        European data
    storage_options: dict
        Parameters passed on to the backend filesystem class.

    Returns
    -------
    The names of the file written if they were computed right away
    If not, the delayed tasks associated to the writing of the files
    t   wbs   utf-8t   wtt   encodingR   t   modet   name_functiont   numRa   i    i   R+   t	   schedulerN(   R   RS   Rg   R   t   npartitionsR   R   Rl   t
   to_delayedR(   t   extendt   zipR   R0   (   R2   t   filenameR   R   R   R   R   t   header_first_partition_onlyR,   t   default_encodingR   R   R   t   to_csv_chunkRy   R   t   dR   (    (    s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pyR     s*    x		=(   t   _Framei   (A   t
   __future__R    R   R   t   ioR   t   warningsR   R   R   t   psutilt   ImportErrorRS   t   numpyR&   t   pandasR#   t   pandas.api.typesR   R   R	   R
   R   t   bytesR   R   t   bytes.compressionR   R   R   t   compatibilityR   R   R   R   R   t   utilsR   R   R   R   Rj   Rl   R6   R   R~   R   t   RuntimeWarningt   virtual_memoryt   totalt	   TOTAL_MEMR   t	   CPU_COUNTR   R   R   R   R   R   R   R   R   t   coreR   R   (    (    (    s4   lib/python2.7/site-packages/dask/dataframe/io/csv.pyt   <module>   sZ   
("+	U\	
					