B
    	\"                 @   s   d Z ddlmZ ddlmZmZ ddlmZ ddlZddl	Z	ddl
ZddlmZ ddlmZ dd	lmZ d
dlmZ ddlmZ ddlmZ ejd dkZeddddZeddddZdZe	eZdejfddZdd Z dd Z!d!dd Z"dS )"a  
=============================
Species distribution dataset
=============================

This dataset represents the geographic distribution of species.
The dataset is provided by Phillips et. al. (2006).

The two species are:

 - `"Bradypus variegatus"
   <http://www.iucnredlist.org/details/3038/0>`_ ,
   the Brown-throated Sloth.

 - `"Microryzomys minutus"
   <http://www.iucnredlist.org/details/13408/0>`_ ,
   also known as the Forest Small Rice Rat, a rodent that lives in Peru,
   Colombia, Ecuador, Peru, and Venezuela.

References
----------

`"Maximum entropy modeling of species geographic distributions"
<http://rob.schapire.net/papers/ecolmod.pdf>`_ S. J. Phillips,
R. P. Anderson, R. E. Schapire - Ecological Modelling, 190:231-259, 2006.

Notes
-----

For an example of using this dataset, see
:ref:`examples/applications/plot_species_distribution_modeling.py
<sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`.
    )BytesIO)makedirsremove)existsN   )get_data_home)_fetch_remote)RemoteFileMetadata   )Bunch)_pkl_filepath)_joblib   zsamples.zipz.https://ndownloader.figshare.com/files/5976075Z@abb07ad284ac50d9e6d20f1c4211e0fd3c098f7f85955e89d321ee8efe37ac28)filenameurlZchecksumzcoverages.zipz.https://ndownloader.figshare.com/files/5976078Z@4d862674d72e79d6cee77e63b98651ec7926043ba7d39dcb31329cf3f6073807zspecies_coverage.pkz   c                sb    fddt |D }dd tfdd|D }tj |d}t|d }|dkr^d||< |S )	zjLoad a coverage file from an open file object.

    This will return a numpy array of the given dtype
    c                s   g | ]}   qS  )readline).0i)Fr   Elib/python3.7/site-packages/sklearn/datasets/species_distributions.py
<listcomp>U   s    z"_load_coverage.<locals>.<listcomp>c             S   s   |   d t|   d fS )Nr   r   )splitfloat)tr   r   r   <lambda>V   s    z _load_coverage.<locals>.<lambda>c                s   g | ]} |qS r   r   )r   line)
make_tupler   r   r   W   s    )dtypes   NODATA_valuei)rangedictnploadtxtint)r   Zheader_lengthr   headerMZnodatar   )r   r   r   _load_coverageP   s    r'   c             C   sN   t r|  d d}n|   d}tj| dddd}||j_|S )zLoad csv file.

    Parameters
    ----------
    F : file object
        CSV file open in byte mode.

    Returns
    -------
    rec : np.ndarray
        record array representing the data
    ascii,r   z	a22,f4,f4)ZskiprowsZ	delimiterr   )	PY3_OR_LATERr   decodestripr   r"   r#   r   names)r   r-   Zrecr   r   r   	_load_csv`   s    r.   c             C   s`   | j | j }|| j| j  }| j| j }|| j| j  }t||| j}t||| j}||fS )a%  Construct the map grid from the batch object

    Parameters
    ----------
    batch : Batch object
        The object returned by :func:`fetch_species_distributions`

    Returns
    -------
    (xgrid, ygrid) : 1-D arrays
        The grid corresponding to the values in batch.coverages
    )x_left_lower_corner	grid_sizeNxy_left_lower_cornerNyr"   Zarange)ZbatchZxminZxmaxZyminZymaxZxgridZygridr   r   r   construct_gridsy   s    r4   Tc          	   C   s  t | } t| st|  tdddddd}tj}t| t}t|s~|sPtdt	
dtj| f  tt| d	}t|B}x:|jD ]0}t|| }d
|krt|}	d|krt|}
qW W dQ R X t| t	
dtj| f  tt| d	}t|T}g }x:|jD ]0}t|| }t	d| |t| qW tj||d}W dQ R X t| tf ||
|	d|}tj||dd n
t|}|S )a	  Loader for species distribution dataset from Phillips et. al. (2006)

    Read more in the :ref:`User Guide <datasets>`.

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : optional, True by default
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    --------
    The data is returned as a Bunch object with the following attributes:

    coverages : array, shape = [14, 1592, 1212]
        These represent the 14 features measured at each point of the map grid.
        The latitude/longitude values for the grid are discussed below.
        Missing data is represented by the value -9999.

    train : record array, shape = (1624,)
        The training points for the data.  Each point has three fields:

        - train['species'] is the species name
        - train['dd long'] is the longitude, in degrees
        - train['dd lat'] is the latitude, in degrees

    test : record array, shape = (620,)
        The test points for the data.  Same format as the training data.

    Nx, Ny : integers
        The number of longitudes (x) and latitudes (y) in the grid

    x_left_lower_corner, y_left_lower_corner : floats
        The (x,y) position of the lower-left corner, in degrees

    grid_size : float
        The spacing between points of the grid, in degrees

    References
    ----------

    * `"Maximum entropy modeling of species geographic distributions"
      <http://rob.schapire.net/papers/ecolmod.pdf>`_
      S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,
      190:231-259, 2006.

    Notes
    -----

    This dataset represents the geographic distribution of species.
    The dataset is provided by Phillips et. al. (2006).

    The two species are:

    - `"Bradypus variegatus"
      <http://www.iucnredlist.org/details/3038/0>`_ ,
      the Brown-throated Sloth.

    - `"Microryzomys minutus"
      <http://www.iucnredlist.org/details/13408/0>`_ ,
      also known as the Forest Small Rice Rat, a rodent that lives in Peru,
      Colombia, Ecuador, Peru, and Venezuela.

    - For an example of using this dataset with scikit-learn, see
      :ref:`examples/applications/plot_species_distribution_modeling.py
      <sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`.
    g33333Wi  gfffffLi8  g?)r/   r1   r2   r3   r0   z1Data not found and `download_if_missing` is Falsez&Downloading species data from %s to %s)dirnametraintestNz'Downloading coverage data from %s to %sz - converting {})r   )	coveragesr7   r6   	   )compress)r   r   r   r!   r"   int16r   DATA_ARCHIVE_NAMEIOErrorloggerinfoSAMPLESr   r   loadfilesr   r.   r   	COVERAGESdebugformatappendr'   Zasarrayr   r   dump)Z	data_homeZdownload_if_missingZextra_paramsr   Zarchive_pathZsamples_pathXfZfhandler6   r7   Zcoverages_pathr8   Zbunchr   r   r   fetch_species_distributions   sT    I


rJ   )NT)#__doc__ior   osr   r   os.pathr   sysZloggingZnumpyr"   baser   r   r	   Zutilsr   Zsklearn.datasets.baser   Zsklearn.utilsr   version_infor*   r@   rC   r<   Z	getLogger__name__r>   r;   r'   r.   r4   rJ   r   r   r   r   <module>!   s8   
 