ó
‡ˆ\c           @   s½   d  d l  m  Z  d  d l m Z d  d l Z d  d l j Z d d l m	 Z	 m
 Z
 d d l m Z d d l m Z d d l m Z m Z d d	 l m Z d
 „  Z d e	 e
 f d „  ƒ  YZ d S(   iÿÿÿÿ(   t   array(   t
   itemgetterNi   (   t   BaseEstimatort   TransformerMixin(   t   six(   t   xrange(   t   check_arrayt
   tosequence(   t   _Mappingc         C   s$   t  |  t ƒ r |  g St |  ƒ Sd S(   s?   Turn X into a sequence or ndarray, avoiding a copy if possible.N(   t
   isinstancet   MappingR   (   t   X(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyt   _tosequence   s    t   DictVectorizerc           B   sq   e  Z d  Z e j d e e d „ Z d
 d „ Z d „  Z	 d
 d „ Z
 e d „ Z d „  Z d „  Z e d	 „ Z RS(   sW
  Transforms lists of feature-value mappings to vectors.

    This transformer turns lists of mappings (dict-like objects) of feature
    names to feature values into Numpy arrays or scipy.sparse matrices for use
    with scikit-learn estimators.

    When feature values are strings, this transformer will do a binary one-hot
    (aka one-of-K) coding: one boolean-valued feature is constructed for each
    of the possible string values that the feature can take on. For instance,
    a feature "f" that can take on the values "ham" and "spam" will become two
    features in the output, one signifying "f=ham", the other "f=spam".

    However, note that this transformer will only do a binary one-hot encoding
    when feature values are of type string. If categorical features are
    represented as numeric values such as int, the DictVectorizer can be
    followed by :class:`sklearn.preprocessing.OneHotEncoder` to complete
    binary one-hot encoding.

    Features that do not occur in a sample (mapping) will have a zero value
    in the resulting array/matrix.

    Read more in the :ref:`User Guide <dict_feature_extraction>`.

    Parameters
    ----------
    dtype : callable, optional
        The type of feature values. Passed to Numpy array/scipy.sparse matrix
        constructors as the dtype argument.
    separator : string, optional
        Separator string used when constructing new features for one-hot
        coding.
    sparse : boolean, optional.
        Whether transform should produce scipy.sparse matrices.
        True by default.
    sort : boolean, optional.
        Whether ``feature_names_`` and ``vocabulary_`` should be sorted when fitting.
        True by default.

    Attributes
    ----------
    vocabulary_ : dict
        A dictionary mapping feature names to feature indices.

    feature_names_ : list
        A list of length n_features containing the feature names (e.g., "f=ham"
        and "f=spam").

    Examples
    --------
    >>> from sklearn.feature_extraction import DictVectorizer
    >>> v = DictVectorizer(sparse=False)
    >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
    >>> X = v.fit_transform(D)
    >>> X
    array([[2., 0., 1.],
           [0., 1., 3.]])
    >>> v.inverse_transform(X) ==         [{'bar': 2.0, 'foo': 1.0}, {'baz': 1.0, 'foo': 3.0}]
    True
    >>> v.transform({'foo': 4, 'unseen_feature': 3})
    array([[0., 0., 4.]])

    See also
    --------
    FeatureHasher : performs vectorization using only a hash function.
    sklearn.preprocessing.OrdinalEncoder : handles nominal/categorical
      features encoded as columns of arbitrary data types.
    t   =c         C   s(   | |  _  | |  _ | |  _ | |  _ d  S(   N(   t   dtypet	   separatort   sparset   sort(   t   selfR   R   R   R   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyt   __init__`   s    			c         C   sÜ   g  } i  } x… | D]} } xt t  j | ƒ D]c \ } } t | t  j ƒ r` d | |  j | f } n  | | k r) | j | ƒ t | ƒ | | <q) q) Wq W|  j rÆ | j ƒ  t d „  t	 | ƒ Dƒ ƒ } n  | |  _
 | |  _ |  S(   sd  Learn a list of feature name -> indices mappings.

        Parameters
        ----------
        X : Mapping or iterable over Mappings
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).
        y : (ignored)

        Returns
        -------
        self
        s   %s%s%sc         s   s!   |  ] \ } } | | f Vq d  S(   N(    (   t   .0t   it   f(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pys	   <genexpr>‚   s    (   R   t	   iteritemsR	   t   string_typesR   t   appendt   lenR   t   dictt	   enumeratet   feature_names_t   vocabulary_(   R   R   t   yt   feature_namest   vocabt   xR   t   v(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyt   fitg   s    	
		c         C   sÜ  t  d ƒ j d k s! t d ƒ ‚ |  j } | r? g  } i  } n |  j } |  j } t | t ƒ ri | g n | } t  d ƒ } t  d d g ƒ } g  } xï | D]ç }	 xË t j	 |	 ƒ D]º \ }
 } t | t j
 ƒ rí d |
 |  j | f }
 d } n  |
 | k r | j | |
 ƒ | j | | ƒ ƒ q° | r° | j |
 ƒ t | ƒ | |
 <| j | |
 ƒ | j | | ƒ ƒ q° q° W| j t | ƒ ƒ qš Wt | ƒ d k r¦t d ƒ ‚ n  t j | d t j ƒ} t j | d t j ƒ} t | ƒ d t | ƒ f } t j | | | f d	 | d | ƒ} | r›|  j r›| j ƒ  t j t | ƒ d t j ƒ} x2 t | ƒ D]$ \ } }
 | |
 | | <| | |
 <qZW| d  d  … | f } n  |  j r±| j ƒ  n | j ƒ  } | rØ| |  _ | |  _ n  | S(
   NR   i   s¯   sizeof(int) != 4 on your platform; please report this at https://github.com/scikit-learn/scikit-learn/issues and include the output from platform.platform() in your bug reporti    s   %s%s%si   s   Sample sequence X is empty.R   t   shape(   R    t   itemsizet   AssertionErrorR   R   R   R	   R
   R   R   R   R   R   R   t
   ValueErrort   npt
   frombuffert   intct   spt
   csr_matrixR   t   emptyt   int32R   R   t   sort_indicest   toarray(   R   R   t   fittingR   R!   R"   t   indicest   indptrt   valuesR#   R   R$   R&   t   result_matrixt	   map_indext   new_val(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyt
   _transform‰   s^    						
		c         C   s   |  j  | d t ƒS(   s  Learn a list of feature name -> indices mappings and transform X.

        Like fit(X) followed by transform(X), but does not require
        materializing X in memory.

        Parameters
        ----------
        X : Mapping or iterable over Mappings
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).
        y : (ignored)

        Returns
        -------
        Xa : {array, sparse matrix}
            Feature vectors; always 2-d.
        R3   (   R:   t   True(   R   R   R    (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyt   fit_transformÕ   s    c         C   s  t  | d d d g ƒ} | j d } |  j } g  t | ƒ D] } | ƒ  ^ q; } t j | ƒ r¡ x° t | j ƒ  Œ  D]( \ } } | | | f | | | | <qr Wnq xn t | ƒ D]` \ } }	 xQ t | | d d … f ƒ D]3 \ } }
 |
 d k r× | | | f |	 | | <q× q× Wq® W| S(   sL  Transform array or sparse matrix X back to feature mappings.

        X must have been produced by this DictVectorizer's transform or
        fit_transform method; it may only have passed through transformers
        that preserve the number of features and their order.

        In the case of one-hot/one-of-K coding, the constructed feature
        names and values are returned rather than the original ones.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Sample matrix.
        dict_type : callable, optional
            Constructor for feature mappings. Must conform to the
            collections.Mapping API.

        Returns
        -------
        D : list of dict_type objects, length = n_samples
            Feature mappings for the samples in X.
        t   accept_sparset   csrt   csci    N(	   R   R&   R   R   R-   t   issparset   zipt   nonzeroR   (   R   R   t	   dict_typet	   n_samplest   namest   _t   dictsR   t   jt   dR$   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyt   inverse_transformé   s    	"#)#c   	      C   s	  |  j  r |  j | d t ƒS|  j } |  j } t | ƒ } t j t | ƒ t | ƒ f d | ƒ} x t	 | ƒ D] \ } } x€ t
 j | ƒ D]o \ } } t | t
 j ƒ rÇ d | |  j | f } d } n  y | | ƒ | | | | f <WqŠ t k
 rø qŠ XqŠ Wqn W| Sd S(   s  Transform feature->value dicts to array or sparse matrix.

        Named features not encountered during fit or fit_transform will be
        silently ignored.

        Parameters
        ----------
        X : Mapping or iterable over Mappings, length = n_samples
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).

        Returns
        -------
        Xa : {array, sparse matrix}
            Feature vectors; always 2-d.
        R3   R   s   %s%s%si   N(   R   R:   t   FalseR   R   R   R*   t   zerosR   R   R   R   R	   R   R   t   KeyError(	   R   R   R   R"   t   XaR   R#   R   R$   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyt	   transform  s     			'	c         C   s   |  j  S(   sÖ   Returns a list of feature names, ordered by their indices.

        If one-of-K coding is applied to categorical features, this will
        include the constructed feature names but not the original ones.
        (   R   (   R   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyt   get_feature_names8  s    c         C   sš   | s t  j | ƒ d } n  |  j } i  } x" | D] } t | ƒ | | | <q2 W| |  _ g  t t j | ƒ d t d ƒ ƒD] \ } } | ^ q{ |  _ |  S(   s<  Restrict the features to those in support using feature selection.

        This function modifies the estimator in-place.

        Parameters
        ----------
        support : array-like
            Boolean mask or list of indices (as returned by the get_support
            member of feature selectors).
        indices : boolean, optional
            Whether support is a list of indices.

        Returns
        -------
        self

        Examples
        --------
        >>> from sklearn.feature_extraction import DictVectorizer
        >>> from sklearn.feature_selection import SelectKBest, chi2
        >>> v = DictVectorizer()
        >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
        >>> X = v.fit_transform(D)
        >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1])
        >>> v.get_feature_names()
        ['bar', 'baz', 'foo']
        >>> v.restrict(support.get_support()) # doctest: +ELLIPSIS
        DictVectorizer(dtype=..., separator='=', sort=True,
                sparse=True)
        >>> v.get_feature_names()
        ['bar', 'foo']
        i    t   keyi   (	   R*   t   whereR   R   R   t   sortedR   R   R   (   R   t   supportR4   RE   t	   new_vocabR   R   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyt   restrict@  s    !		(N(   t   __name__t
   __module__t   __doc__R*   t   float64R;   R   t   NoneR%   R:   R<   R   RJ   RO   RP   RK   RV   (    (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyR      s   D"	L)	&	(   R    t   operatorR   t   numpyR*   t   scipy.sparseR   R-   t   baseR   R   t	   externalsR   t   externals.six.movesR   t   utilsR   R   t   utils.fixesR   R
   R   R   (    (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.pyt   <module>   s   	