
\c           @  s  d  Z  d d l m Z m Z d d l Z d d l m Z d d l Z d d l m	 Z	 d d l
 Z
 d d l Z d d l Z d d l Z d d l j Z d d l m Z m Z d d l m Z d d	 l m Z d d
 l m Z d d l m Z d d l m Z d d l m  Z  m! Z! m" Z" d d l# m$ Z$ d d l# m% Z& d d l' m( Z( d d d d d d d g Z) d   Z* d   Z+ d   Z, d   Z- d e. f d     YZ/ d e e/ e f d      YZ0 d!   Z1 d" e e/ f d#     YZ2 d$   Z3 d% e e f d&     YZ4 d' e2 f d(     YZ5 d S()   uv   
The :mod:`sklearn.feature_extraction.text` submodule gathers utilities to
build feature vectors from text documents.
i(   t   unicode_literalst   divisionN(   t   defaultdict(   t
   itemgetteri   (   t   BaseEstimatort   TransformerMixin(   t   six(   t   xrange(   t	   normalizei   (   t   FeatureHasher(   t   ENGLISH_STOP_WORDS(   t   check_is_fittedt   check_arrayt   FLOAT_DTYPES(   t
   sp_version(   t   _Mapping(   t	   _IS_32BITu   CountVectorizeru   ENGLISH_STOP_WORDSu   TfidfTransformeru   TfidfVectorizeru   strip_accents_asciiu   strip_accents_unicodeu
   strip_tagsc         C  sU   t  j d |   } | |  k r" |  Sd j g  | D] } t  j |  s/ | ^ q/  Sd S(   u  Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    Parameters
    ----------
    s : string
        The string to strip

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    u   NFKDu    N(   t   unicodedataR   t   joint	   combining(   t   st
   normalizedt   c(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   strip_accents_unicode0   s    c         C  s+   t  j d |   } | j d d  j d  S(   ui  Transform accentuated unicode symbols into ascii or nothing

    Warning: this solution is only suited for languages that have a direct
    transliteration to ASCII symbols.

    Parameters
    ----------
    s : string
        The string to strip

    See also
    --------
    strip_accents_unicode
        Remove accentuated char for any unicode symbol.
    u   NFKDu   ASCIIu   ignore(   R   R   t   encodet   decode(   R   t	   nkfd_form(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   strip_accents_asciiI   s    c         C  s"   t  j d d t  j j d |   S(   u   Basic regexp based HTML / XML tag stripper function

    For serious HTML/XML preprocessing you should rather use an external
    library such as lxml or BeautifulSoup.

    Parameters
    ----------
    s : string
        The string to strip
    u	   <([^>]+)>t   flagsu    (   t   ret   compilet   UNICODEt   sub(   R   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt
   strip_tags]   s    c         C  sS   |  d k r t  St |  t j  r5 t d |    n |  d  k rE d  St |   Sd  S(   Nu   englishu   not a built-in stop list: %s(   R
   t
   isinstanceR   t   string_typest
   ValueErrort   Nonet	   frozenset(   t   stop(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   _check_stop_listk   s    t   VectorizerMixinc           B  s   e  Z d  Z e j d  Z d   Z d d  Z d   Z	 d   Z
 d   Z d   Z d   Z d	   Z d
   Z d   Z d   Z d   Z RS(   u?   Provides common code for text vectorizers (tokenization logic).u   \s\s+c         C  s   |  j  d k r6 t | d   } | j   } Wd QXn |  j  d k rT | j   } n  t | t  r~ | j |  j |  j  } n  | t j	 k r t
 d   n  | S(   u   Decode the input into a string of unicode symbols

        The decoding strategy depends on the vectorizer parameters.

        Parameters
        ----------
        doc : string
            The string to decode
        u   filenameu   rbNu   fileu?   np.nan is an invalid document, expected byte or unicode string.(   t   inputt   opent   readR"   t   bytesR   t   encodingt   decode_errort   npt   nanR$   (   t   selft   doct   fh(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR   {   s    
c         C  s
  | d k	 r4 g  | D] } | | k r | ^ q } n  |  j \ } } | d k r| } | d k rz t |  } | d 7} n g  } t |  } | j } d j }	 xe t | t | d | d   D]@ }
 x7 t | |
 d  D]! } | |	 | | | |
 !  q Wq Wn  | S(   uA   Turn tokens into a sequence of n-grams after stop words filteringi   u    N(   R%   t   ngram_ranget   listt   lent   appendR   R   t   min(   R2   t   tokenst
   stop_wordst   wt   min_nt   max_nt   original_tokenst   n_original_tokenst   tokens_appendt
   space_joint   nt   i(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   _word_ngrams   s"    (			&c   	      C  s   |  j  j d |  } t |  } |  j \ } } | d k rU t |  } | d 7} n g  } | j } x\ t | t | d | d   D]: } x1 t | | d  D] } | | | | | ! q Wq W| S(   u;   Tokenize text_document into a sequence of character n-gramsu    i   (   t   _white_spacesR    R7   R5   R6   R8   R   R9   (	   R2   t   text_documentt   text_lenR=   R>   t   ngramst   ngrams_appendRC   RD   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   _char_ngrams   s    	'c   
      C  s   |  j  j d |  } |  j \ } } g  } | j } x | j   D] } d | d } t |  } x| t | | d  D]g } d }	 | | |	 |	 | ! x3 |	 | | k  r |	 d 7}	 | | |	 |	 | ! q W|	 d k rt Pqt qt Wq@ W| S(   u   Whitespace sensitive char-n-gram tokenization.

        Tokenize text_document into a sequence of character n-grams
        operating only inside word boundaries. n-grams at the edges
        of words are padded with space.u    i   i    (   RF   R    R5   R8   t   splitR7   R   (
   R2   RG   R=   R>   RI   RJ   R<   t   w_lenRC   t   offset(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   _char_wb_ngrams   s     	
c           s   |  j  d k	 r |  j  Sd   } |  j s1 |   n^ t |  j  rL |  j   nC |  j d k rd t   n+ |  j d k r| t   n t d |  j   |  j r   f d   S  Sd S(   u<   Return a function to preprocess the text before tokenizationc         S  s   |  S(   N(    (   t   x(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   <lambda>   s    u   asciiu   unicodeu%   Invalid value for "strip_accents": %sc           s     |  j     S(   N(   t   lower(   RP   (   t   strip_accents(    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyRQ     s    N(   t   preprocessorR%   RS   t   callableR   R   R$   t	   lowercase(   R2   t   noop(    (   RS   s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   build_preprocessor   s     						c           s5   |  j  d k	 r |  j  St j |  j      f d   S(   u@   Return a function that splits a string into a sequence of tokensc           s     j  |   S(   N(   t   findall(   R3   (   t   token_pattern(    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyRQ   
  s    N(   t	   tokenizerR%   R   R   RZ   (   R2   (    (   RZ   s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   build_tokenizer  s    c         C  s   t  |  j  S(   u,   Build or fetch the effective stop words list(   R(   R;   (   R2   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   get_stop_words  s    c         C  s   t  |  j  t |  d d  k r% d Sy t   } xY | p= d D]K } t | | |    } x* | D]" } | | k rc | j |  qc qc Wq> Wt  |  j  |  _ | r t j	 d t
 |   n  | SWn$ t k
 r t  |  j  |  _ d SXd S(   u  Check if stop words are consistent

        Returns
        -------
        is_consistent : True if stop words are consistent with the preprocessor
                        and tokenizer, False if they are not, None if the check
                        was previously performed, "error" if it could not be
                        performed (e.g. because of the use of a custom
                        preprocessor / tokenizer)
        u   _stop_words_idu}   Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens %r not in stop_words.u   errorN(    (   t   idR;   t   getattrR%   t   setR6   t   addt   _stop_words_idt   warningst   warnt   sortedt	   Exception(   R2   R;   t
   preprocesst   tokenizet   inconsistentR<   R:   t   token(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   _check_stop_words_consistency  s"    !			c           s   t   j  r  j S j      j d k rA    f d   S j d k r`    f d   S j d k r  j     j     j           f d   St d  j   d S(	   u=   Return a callable that handles preprocessing and tokenizationu   charc           s    j     j |     S(   N(   RK   R   (   R3   (   Rg   R2   (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyRQ   =  s    u   char_wbc           s    j     j |     S(   N(   RO   R   (   R3   (   Rg   R2   (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyRQ   @  s   u   wordc           s%    j      j |       S(   N(   RE   R   (   R3   (   Rg   R2   R;   Rh   (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyRQ   H  s   u.   %s is not a valid tokenization scheme/analyzerN(   RU   t   analyzerRX   R]   R\   Rk   R$   (   R2   (    (   Rg   R2   R;   Rh   s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   build_analyzer5  s    c         C  sf  |  j  } | d  k	 rYt | t  r3 t |  } n  t | t  s i  } xK t |  D]= \ } } | j | |  | k rU d | } t |   qU qU W| } n t t	 j
 |   } t |  t |  k r t d   n  xK t t |   D]7 } | | k r d t |  | f } t |   q q W| s>t d   n  t |  _ t |  |  _ n	 t |  _ d  S(   Nu    Duplicate term in vocabulary: %ru%   Vocabulary contains repeated indices.u/   Vocabulary of size %d doesn't contain index %d.u   empty vocabulary passed to fit(   t
   vocabularyR%   R"   R`   Re   t   Mappingt	   enumeratet
   setdefaultR$   R   t
   itervaluesR7   R   t   Truet   fixed_vocabulary_t   dictt   vocabulary_t   False(   R2   Rn   t   vocabRD   t   tt   msgt   indices(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   _validate_vocabularyO  s0    	
		c         C  sD   d } t  |  d d | f t |  j  d k r@ t d   n  d S(   u4   Check if vocabulary is empty or missing (not fit-ed)u$   %(name)s - Vocabulary wasn't fitted.u   vocabulary_Rz   i    u   Vocabulary is emptyN(   R   R7   Rv   R$   (   R2   Rz   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   _check_vocabularyk  s    c         C  s;   |  j  \ } } | | k r7 t d t |  j     n  d S(   u'   Check validity of ngram_range parameteruO   Invalid value for ngram_range=%s lower boundary larger than the upper boundary.N(   R5   R$   t   str(   R2   R=   t   max_m(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   _validate_paramss  s
    N(   t   __name__t
   __module__t   __doc__R   R   RF   R   R%   RE   RK   RO   RX   R\   R]   Rk   Rm   R|   R}   R   (    (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR)   v   s   							%			t   HashingVectorizerc           B  s   e  Z d  Z d d d d e d d d d d d d e d	 e e e j d
  Z d d  Z	 d d  Z
 d   Z d d  Z d   Z RS(   u  Convert a collection of text documents to a matrix of token occurrences

    It turns a collection of text documents into a scipy.sparse matrix holding
    token occurrence counts (or binary occurrence information), possibly
    normalized as token frequencies if norm='l1' or projected on the euclidean
    unit sphere if norm='l2'.

    This text vectorizer implementation uses the hashing trick to find the
    token string name to feature integer index mapping.

    This strategy has several advantages:

    - it is very low memory scalable to large datasets as there is no need to
      store a vocabulary dictionary in memory

    - it is fast to pickle and un-pickle as it holds no state besides the
      constructor parameters

    - it can be used in a streaming (partial fit) or parallel pipeline as there
      is no state computed during fit.

    There are also a couple of cons (vs using a CountVectorizer with an
    in-memory vocabulary):

    - there is no way to compute the inverse transform (from feature indices to
      string feature names) which can be a problem when trying to introspect
      which features are most important to a model.

    - there can be collisions: distinct tokens can be mapped to the same
      feature index. However in practice this is rarely an issue if n_features
      is large enough (e.g. 2 ** 18 for text classification problems).

    - no IDF weighting as this would render the transformer stateful.

    The hash function employed is the signed 32-bit version of Murmurhash3.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------

    input : string {'filename', 'file', 'content'}
        If 'filename', the sequence passed as an argument to fit is
        expected to be a list of filenames that need reading to fetch
        the raw content to analyze.

        If 'file', the sequence items must have a 'read' method (file-like
        object) that is called to fetch the bytes in memory.

        Otherwise the input is expected to be the sequence strings or
        bytes items are expected to be analyzed directly.

    encoding : string, default='utf-8'
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'}
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. By default, it is
        'strict', meaning that a UnicodeDecodeError will be raised. Other
        values are 'ignore' and 'replace'.

    strip_accents : {'ascii', 'unicode', None}
        Remove accents and perform other character normalization
        during the preprocessing step.
        'ascii' is a fast method that only works on characters that have
        an direct ASCII mapping.
        'unicode' is a slightly slower method that works on any characters.
        None (default) does nothing.

        Both 'ascii' and 'unicode' use NFKD normalization from
        :func:`unicodedata.normalize`.

    lowercase : boolean, default=True
        Convert all characters to lowercase before tokenizing.

    preprocessor : callable or None (default)
        Override the preprocessing (string transformation) stage while
        preserving the tokenizing and n-grams generation steps.

    tokenizer : callable or None (default)
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.
        Only applies if ``analyzer == 'word'``.

    stop_words : string {'english'}, list, or None (default)
        If 'english', a built-in stop word list for English is used.
        There are several known issues with 'english' and you should
        consider an alternative (see :ref:`stop_words`).

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.
        Only applies if ``analyzer == 'word'``.

    token_pattern : string
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    ngram_range : tuple (min_n, max_n), default=(1, 1)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used.

    analyzer : string, {'word', 'char', 'char_wb'} or callable
        Whether the feature should be made of word or character n-grams.
        Option 'char_wb' creates character n-grams only from text inside
        word boundaries; n-grams at the edges of words are padded with space.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

    n_features : integer, default=(2 ** 20)
        The number of features (columns) in the output matrices. Small numbers
        of features are likely to cause hash collisions, but large numbers
        will cause larger coefficient dimensions in linear learners.

    binary : boolean, default=False.
        If True, all non zero counts are set to 1. This is useful for discrete
        probabilistic models that model binary events rather than integer
        counts.

    norm : 'l1', 'l2' or None, optional
        Norm used to normalize term vectors. None for no normalization.

    alternate_sign : boolean, optional, default True
        When True, an alternating sign is added to the features as to
        approximately conserve the inner product in the hashed space even for
        small n_features. This approach is similar to sparse random projection.

        .. versionadded:: 0.19

    non_negative : boolean, optional, default False
        When True, an absolute value is applied to the features matrix prior to
        returning it. When used in conjunction with alternate_sign=True, this
        significantly reduces the inner product preservation property.

        .. deprecated:: 0.19
            This option will be removed in 0.21.
    dtype : type, optional
        Type of the matrix returned by fit_transform() or transform().

    Examples
    --------
    >>> from sklearn.feature_extraction.text import HashingVectorizer
    >>> corpus = [
    ...     'This is the first document.',
    ...     'This document is the second document.',
    ...     'And this is the third one.',
    ...     'Is this the first document?',
    ... ]
    >>> vectorizer = HashingVectorizer(n_features=2**4)
    >>> X = vectorizer.fit_transform(corpus)
    >>> print(X.shape)
    (4, 16)

    See also
    --------
    CountVectorizer, TfidfVectorizer

    u   contentu   utf-8u   strictu   (?u)\b\w\w+\bi   u   wordi   i   u   l2c         C  s   | |  _  | |  _ | |  _ | |  _ | |  _ | |  _ | |  _ | |  _ |	 |  _ | |  _	 | |  _
 |
 |  _ | |  _ | |  _ | |  _ | |  _ | |  _ d  S(   N(   R*   R.   R/   RS   RT   R[   Rl   RV   RZ   R;   t
   n_featuresR5   t   binaryt   normt   alternate_signt   non_negativet   dtype(   R2   R*   R.   R/   RS   RV   RT   R[   R;   RZ   R5   Rl   R   R   R   R   R   R   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   __init__   s"    																c         C  s   |  S(   u  Does nothing: this transformer is stateless.

        This method is just there to mark the fact that this transformer
        can work in a streaming setup.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            Training data.
        (    (   R2   t   Xt   y(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   partial_fit9  s    c         C  sH   t  | t j  r! t d   n  |  j   |  j   j | d | |  S(   u   Does nothing: this transformer is stateless.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            Training data.
        uB   Iterable over raw text documents expected, string object received.R   (   R"   R   R#   R$   R   t   _get_hashert   fit(   R2   R   R   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR   F  s    	
c           s   t  | t j  r! t d   n  |  j   |  j     |  j   j   f d   | D  } |  j rx | j	 j
 d  n  |  j d k	 r t | d |  j d t } n  | S(   u  Transform a sequence of documents to a document-term matrix.

        Parameters
        ----------
        X : iterable over raw text documents, length = n_samples
            Samples. Each sample must be a text document (either bytes or
            unicode strings, file name or file object depending on the
            constructor argument) which will be tokenized and hashed.

        Returns
        -------
        X : scipy.sparse matrix, shape = (n_samples, self.n_features)
            Document-term matrix.
        uB   Iterable over raw text documents expected, string object received.c         3  s   |  ] }   |  Vq d  S(   N(    (   t   .0R3   (   Rl   (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pys	   <genexpr>p  s    i   R   t   copyN(   R"   R   R#   R$   R   Rm   R   t	   transformR   t   datat   fillR   R%   R   Rw   (   R2   R   (    (   Rl   s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR   Y  s    
%	c         C  s   |  j  | |  j |  S(   u  Transform a sequence of documents to a document-term matrix.

        Parameters
        ----------
        X : iterable over raw text documents, length = n_samples
            Samples. Each sample must be a text document (either bytes or
            unicode strings, file name or file object depending on the
            constructor argument) which will be tokenized and hashed.
        y : any
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        X : scipy.sparse matrix, shape = (n_samples, self.n_features)
            Document-term matrix.
        (   R   R   (   R2   R   R   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   fit_transformw  s    c         C  s1   t  d |  j d d d |  j d |  j d |  j  S(   NR   t
   input_typeu   stringR   R   R   (   R	   R   R   R   R   (   R2   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    	N(   i   i   i   (   R   R   R   R%   Rs   Rw   R0   t   float64R   R   R   R   R   R   (    (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR   }  s   				c         C  s@   t  j |   r, t j |  j d |  j d St j |  j  Sd S(   uA   Count the number of non-zero values for each feature in sparse X.t	   minlengthi   N(   t   spt   isspmatrix_csrR0   t   bincountR{   t   shapet   difft   indptr(   R   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   _document_frequency  s    t   CountVectorizerc           B  s   e  Z d  Z d d d d e d d d d d d d d d d e e j d  Z d	   Z	 d d d d
  Z
 d   Z d d  Z d d  Z d   Z d   Z d   Z RS(   u  Convert a collection of text documents to a matrix of token counts

    This implementation produces a sparse representation of the counts using
    scipy.sparse.csr_matrix.

    If you do not provide an a-priori dictionary and you do not use an analyzer
    that does some kind of feature selection then the number of features will
    be equal to the vocabulary size found by analyzing the data.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    input : string {'filename', 'file', 'content'}
        If 'filename', the sequence passed as an argument to fit is
        expected to be a list of filenames that need reading to fetch
        the raw content to analyze.

        If 'file', the sequence items must have a 'read' method (file-like
        object) that is called to fetch the bytes in memory.

        Otherwise the input is expected to be the sequence strings or
        bytes items are expected to be analyzed directly.

    encoding : string, 'utf-8' by default.
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'}
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. By default, it is
        'strict', meaning that a UnicodeDecodeError will be raised. Other
        values are 'ignore' and 'replace'.

    strip_accents : {'ascii', 'unicode', None}
        Remove accents and perform other character normalization
        during the preprocessing step.
        'ascii' is a fast method that only works on characters that have
        an direct ASCII mapping.
        'unicode' is a slightly slower method that works on any characters.
        None (default) does nothing.

        Both 'ascii' and 'unicode' use NFKD normalization from
        :func:`unicodedata.normalize`.

    lowercase : boolean, True by default
        Convert all characters to lowercase before tokenizing.

    preprocessor : callable or None (default)
        Override the preprocessing (string transformation) stage while
        preserving the tokenizing and n-grams generation steps.

    tokenizer : callable or None (default)
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.
        Only applies if ``analyzer == 'word'``.

    stop_words : string {'english'}, list, or None (default)
        If 'english', a built-in stop word list for English is used.
        There are several known issues with 'english' and you should
        consider an alternative (see :ref:`stop_words`).

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.
        Only applies if ``analyzer == 'word'``.

        If None, no stop words will be used. max_df can be set to a value
        in the range [0.7, 1.0) to automatically detect and filter stop
        words based on intra corpus document frequency of terms.

    token_pattern : string
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp select tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    ngram_range : tuple (min_n, max_n)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used.

    analyzer : string, {'word', 'char', 'char_wb'} or callable
        Whether the feature should be made of word or character n-grams.
        Option 'char_wb' creates character n-grams only from text inside
        word boundaries; n-grams at the edges of words are padded with space.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document
        frequency strictly lower than the given threshold. This value is also
        called cut-off in the literature.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    max_features : int or None, default=None
        If not None, build a vocabulary that only consider the top
        max_features ordered by term frequency across the corpus.

        This parameter is ignored if vocabulary is not None.

    vocabulary : Mapping or iterable, optional
        Either a Mapping (e.g., a dict) where keys are terms and values are
        indices in the feature matrix, or an iterable over terms. If not
        given, a vocabulary is determined from the input documents. Indices
        in the mapping should not be repeated and should not have any gap
        between 0 and the largest index.

    binary : boolean, default=False
        If True, all non zero counts are set to 1. This is useful for discrete
        probabilistic models that model binary events rather than integer
        counts.

    dtype : type, optional
        Type of the matrix returned by fit_transform() or transform().

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.

    stop_words_ : set
        Terms that were ignored because they either:

          - occurred in too many documents (`max_df`)
          - occurred in too few documents (`min_df`)
          - were cut off by feature selection (`max_features`).

        This is only available if no vocabulary was given.

    Examples
    --------
    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> corpus = [
    ...     'This is the first document.',
    ...     'This document is the second document.',
    ...     'And this is the third one.',
    ...     'Is this the first document?',
    ... ]
    >>> vectorizer = CountVectorizer()
    >>> X = vectorizer.fit_transform(corpus)
    >>> print(vectorizer.get_feature_names())
    ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
    >>> print(X.toarray())  # doctest: +NORMALIZE_WHITESPACE
    [[0 1 1 1 0 0 1 0 1]
     [0 2 0 1 0 1 1 0 1]
     [1 0 0 1 1 0 1 1 1]
     [0 1 1 1 0 0 1 0 1]]

    See also
    --------
    HashingVectorizer, TfidfVectorizer

    Notes
    -----
    The ``stop_words_`` attribute can get large and increase the model size
    when pickling. This attribute is provided only for introspection and can
    be safely removed using delattr or set to None before pickling.
    u   contentu   utf-8u   strictu   (?u)\b\w\w+\bi   u   wordg      ?c         C  s  | |  _  | |  _ | |  _ | |  _ | |  _ | |  _ | |  _ | |  _ |	 |  _ | |  _	 | |  _
 | |  _ | d k  s | d k  r t d   n  | |  _ | d  k	 r t | t j  s | d k r t d |   q n  |
 |  _ | |  _ | |  _ | |  _ d  S(   Ni    u#   negative value for max_df or min_dfu4   max_features=%r, neither a positive integer nor None(   R*   R.   R/   RS   RT   R[   Rl   RV   RZ   R;   t   max_dft   min_dfR$   t   max_featuresR%   R"   t   numberst   IntegralR5   Rn   R   R   (   R2   R*   R.   R/   RS   RV   RT   R[   R;   RZ   R5   Rl   R   R   R   Rn   R   R   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR   E  s2    																c         C  s   t  t j |   } t j t |  d | j j } x4 t |  D]& \ } \ } } | | | <| | | <qC W| j	 | j d d | _ | S(   ug   Sort features by name

        Returns a reordered matrix and modifies the vocabulary in place
        R   t   modeu   clip(
   Re   R   t	   iteritemsR0   t   emptyR7   R{   R   Rp   t   take(   R2   R   Rn   t   sorted_featurest	   map_indext   new_valt   termt   old_val(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   _sort_featuresf  s    !
c         C  s  | d k r1 | d k r1 | d k r1 | t   f St |  } t j | j d d   j   } t j t |  d t	 } | d k	 r | | | k M} n  | d k	 r | | | k M} n  | d k	 r)| j   | k r)| | j
   |  }	 t j t |  d t	 }
 t |
 t j |  d |	 <|
 } n  t j |  d } t   } xR t t j |   D]; \ } } | | r| | | | <q[| | =| j |  q[Wt j |  d } t |  d k rt d   n  | d d  | f | f S(   u*  Remove too rare or too common features.

        Prune features that are non zero in more samples than high or less
        documents than low, modifying the vocabulary, and restricting it to
        at most the limit most frequent.

        This does not prune samples with zero features.
        t   axisi    R   i   uF   After pruning, no terms remain. Try a lower min_df or a higher max_df.N(   R%   R`   R   R0   t   asarrayt   sumt   ravelt   onesR7   t   boolt   argsortt   zerosRs   t   wheret   cumsumR6   R   R   Ra   R$   (   R2   R   Rn   t   hight   lowt   limitt   dfst   tfst   maskt	   mask_indst   new_maskt   new_indicest   removed_termsR   t	   old_indext   kept_indices(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   _limit_featurest  s2    
$!		"
c         C  s  | r |  j  } n t   } | j | _ |  j   } g  } g  } t   } | j d  x | D] } i  }	 xb | |  D]T }
 y7 | |
 } | |	 k r d |	 | <n |	 | c d 7<Wqu t k
 r qu qu Xqu W| j |	 j	    | j |	 j
    | j t |   q\ W| s4t |  } | s4t d   q4n  | d d k rrt rft d j | d    n  t j } n	 t j } t j | d | } t j | d | } t j | d t j } t j | | | f d t |  d t |  f d |  j } | j   | | f S(	   uM   Create sparse feature matrix, and vocabulary where fixed_vocab=False
        i    i   u?   empty vocabulary; perhaps the documents only contain stop wordsiI       up   sparse CSR array has {} non-zero elements and requires 64 bit indexing, which is unsupported with 32 bit Python.R   R   (   Rv   R   t   __len__t   default_factoryRm   t   _make_int_arrayR8   t   KeyErrort   extendt   keyst   valuesR7   Ru   R$   R   t   formatR0   t   int64t   int32R   t
   frombuffert   intcR   t
   csr_matrixR   t   sort_indices(   R2   t   raw_documentst   fixed_vocabRn   t   analyzet	   j_indicesR   R   R3   t   feature_countert   featuret   feature_idxt   indices_dtypeR   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   _count_vocab  sP    		
		
c         C  s   |  j  |  |  S(   u
  Learn a vocabulary dictionary of all tokens in the raw documents.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        Returns
        -------
        self
        (   R   (   R2   R   R   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    c         C  sB  t  | t j  r! t d   n  |  j   |  j   |  j } |  j } |  j } |  j	 | |  j
  \ } } |  j r | j j d  n  |  j
 s>|  j | |  } | j d } t  | t j  r | n | | }	 t  | t j  r | n | | }
 |	 |
 k  rt d   n  |  j | | |	 |
 |  \ } |  _ | |  _ n  | S(   u  Learn the vocabulary dictionary and return term-document matrix.

        This is equivalent to fit followed by transform, but more efficiently
        implemented.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        Returns
        -------
        X : array, [n_samples, n_features]
            Document-term matrix.
        uB   Iterable over raw text documents expected, string object received.i   i    u-   max_df corresponds to < documents than min_df(   R"   R   R#   R$   R   R|   R   R   R   R   Rt   R   R   R   R   R   R   R   R   t   stop_words_Rv   (   R2   R   R   R   R   R   Rn   R   t   n_doct   max_doc_countt   min_doc_count(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s8    

						

c         C  s   t  | t j  r! t d   n  t |  d  s= |  j   n  |  j   |  j | d t \ } } |  j	 r~ | j
 j d  n  | S(   u  Transform documents to document-term matrix.

        Extract token counts out of raw text documents using the vocabulary
        fitted with fit or the one provided to the constructor.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Document-term matrix.
        uB   Iterable over raw text documents expected, string object received.u   vocabulary_R   i   (   R"   R   R#   R$   t   hasattrR|   R}   R   Rs   R   R   R   (   R2   R   t   _R   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR   "  s    
	c         C  s   |  j    t j |  r( | j   } n t j |  } | j d } t j t |  j	 j
     } t j t |  j	 j     } | t j |  } g  t |  D]0 } | | | d d  f j   d j   ^ q S(   u  Return terms per document with nonzero entries in X.

        Parameters
        ----------
        X : {array, sparse matrix}, shape = [n_samples, n_features]

        Returns
        -------
        X_inv : list of arrays, len = n_samples
            List of arrays of terms.
        i    Ni   (   R}   R   t   issparset   tocsrR0   t   asmatrixR   t   arrayR6   Rv   R   R   R   t   ranget   nonzeroR   (   R2   R   t	   n_samplest   termsR{   t   inverse_vocabularyRD   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   inverse_transformB  s    
c         C  sa   t  |  d  s |  j   n  |  j   g  t t j |  j  d t d  D] \ } } | ^ qK S(   u:   Array mapping from feature integer indices to feature nameu   vocabulary_t   keyi   (   R   R|   R}   Re   R   R   Rv   R   (   R2   Ry   RD   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   get_feature_names`  s
    
N(   i   i   (   R   R   R   R%   Rs   Rw   R0   R   R   R   R   R   R   R   R   R   R   (    (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s"   			(	<:	 	c           C  s   t  j  t d   S(   uE   Construct an array.array of a type suitable for scipy.sparse indices.u   i(   R   R~   (    (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR   k  s    t   TfidfTransformerc           B  s\   e  Z d  Z d e e e d  Z d d  Z e d  Z e	 d    Z
 e
 j d    Z
 RS(   u  Transform a count matrix to a normalized tf or tf-idf representation

    Tf means term-frequency while tf-idf means term-frequency times inverse
    document-frequency. This is a common term weighting scheme in information
    retrieval, that has also found good use in document classification.

    The goal of using tf-idf instead of the raw frequencies of occurrence of a
    token in a given document is to scale down the impact of tokens that occur
    very frequently in a given corpus and that are hence empirically less
    informative than features that occur in a small fraction of the training
    corpus.

    The formula that is used to compute the tf-idf for a term t of a document d
    in a document set is tf-idf(t, d) = tf(t, d) * idf(t), and the idf is
    computed as idf(t) = log [ n / df(t) ] + 1 (if ``smooth_idf=False``), where
    n is the total number of documents in the document set and df(t) is the
    document frequency of t; the document frequency is the number of documents
    in the document set that contain the term t. The effect of adding "1" to
    the idf in the equation above is that terms with zero idf, i.e., terms
    that occur in all documents in a training set, will not be entirely
    ignored.
    (Note that the idf formula above differs from the standard textbook
    notation that defines the idf as
    idf(t) = log [ n / (df(t) + 1) ]).

    If ``smooth_idf=True`` (the default), the constant "1" is added to the
    numerator and denominator of the idf as if an extra document was seen
    containing every term in the collection exactly once, which prevents
    zero divisions: idf(d, t) = log [ (1 + n) / (1 + df(d, t)) ] + 1.

    Furthermore, the formulas used to compute tf and idf depend
    on parameter settings that correspond to the SMART notation used in IR
    as follows:

    Tf is "n" (natural) by default, "l" (logarithmic) when
    ``sublinear_tf=True``.
    Idf is "t" when use_idf is given, "n" (none) otherwise.
    Normalization is "c" (cosine) when ``norm='l2'``, "n" (none)
    when ``norm=None``.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    norm : 'l1', 'l2' or None, optional (default='l2')
        Each output row will have unit norm, either:
        * 'l2': Sum of squares of vector elements is 1. The cosine
        similarity between two vectors is their dot product when l2 norm has
        been applied.
        * 'l1': Sum of absolute values of vector elements is 1.
        See :func:`preprocessing.normalize`

    use_idf : boolean (default=True)
        Enable inverse-document-frequency reweighting.

    smooth_idf : boolean (default=True)
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : boolean (default=False)
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    Attributes
    ----------
    idf_ : array, shape (n_features)
        The inverse document frequency (IDF) vector; only defined
        if  ``use_idf`` is True.

    References
    ----------

    .. [Yates2011] `R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern
                   Information Retrieval. Addison Wesley, pp. 68-74.`

    .. [MRS2008] `C.D. Manning, P. Raghavan and H. Schütze  (2008).
                   Introduction to Information Retrieval. Cambridge University
                   Press, pp. 118-120.`
    u   l2c         C  s(   | |  _  | |  _ | |  _ | |  _ d  S(   N(   R   t   use_idft
   smooth_idft   sublinear_tf(   R2   R   R   R   R   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    			c      
   C  s   t  | d d
 } t j |  s3 t j |  } n  | j t k rK | j n t j } |  j r | j	 \ } } t
 |  j |  } | t |  j  7} | t |  j  7} t j | |  d } t j | d d d | | f d d d	 | |  _ n  |  S(   u   Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        t   accept_sparseu   csru   csci   t   offsetsi    R   R   R   (   u   csru   csc(   R   R   R   R   R   R   R0   R   R   R   R   t   astypet   intR   t   logt   diagst	   _idf_diag(   R2   R   R   R   R   R   t   dft   idf(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    !	c         C  s  t  | d d d t d | } t j |  sH t j | d t j } n  | j \ } } |  j r t j	 | j
 | j
  | j
 d 7_
 n  |  j r t |  d d  |  j j d } | | k r t d	 | | f   n  | |  j } n  |  j rt | d
 |  j d t } n  | S(   u  Transform a count matrix to a tf or tf-idf representation

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts

        copy : boolean, default True
            Whether to copy X and operate on the copy or perform in-place
            operations.

        Returns
        -------
        vectors : sparse matrix, [n_samples, n_features]
        R   u   csrR   R   i   u	   _idf_diagu   idf vector is not fittedi    uK   Input has n_features=%d while the model has been trained with n_features=%dR   (   R   R   R   R   R   R0   R   R   R   R   R   R   R   R   R$   R   R   Rw   (   R2   R   R   R   R   t   expected_n_features(    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s"    			c         C  s   t  j |  j j d d   S(   NR   i    (   R0   R   R   R   (   R2   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   idf_  s    c      
   C  sS   t  j | d t  j } | j d } t j | d d d | d | d d |  _ d  S(   NR   i    R   t   mRC   R   u   csr(   R0   R   R   R   R   t   spdiagsR   (   R2   t   valueR   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    N(   R   R   R   Rs   Rw   R   R%   R   R   t   propertyR   t   setter(    (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR   p  s   O	*t   TfidfVectorizerc           B  s+  e  Z d  Z d d d d e d d d d d d d d d d e e j d e e e d	  Z e	 d
    Z
 e
 j d    Z
 e	 d    Z e j d    Z e	 d    Z e j d    Z e	 d    Z e j d    Z e	 d    Z e j d    Z d   Z d d  Z d d  Z e d  Z RS(   u8  Convert a collection of raw documents to a matrix of TF-IDF features.

    Equivalent to :class:`CountVectorizer` followed by
    :class:`TfidfTransformer`.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    input : string {'filename', 'file', 'content'}
        If 'filename', the sequence passed as an argument to fit is
        expected to be a list of filenames that need reading to fetch
        the raw content to analyze.

        If 'file', the sequence items must have a 'read' method (file-like
        object) that is called to fetch the bytes in memory.

        Otherwise the input is expected to be the sequence strings or
        bytes items are expected to be analyzed directly.

    encoding : string, 'utf-8' by default.
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'} (default='strict')
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. By default, it is
        'strict', meaning that a UnicodeDecodeError will be raised. Other
        values are 'ignore' and 'replace'.

    strip_accents : {'ascii', 'unicode', None} (default=None)
        Remove accents and perform other character normalization
        during the preprocessing step.
        'ascii' is a fast method that only works on characters that have
        an direct ASCII mapping.
        'unicode' is a slightly slower method that works on any characters.
        None (default) does nothing.

        Both 'ascii' and 'unicode' use NFKD normalization from
        :func:`unicodedata.normalize`.

    lowercase : boolean (default=True)
        Convert all characters to lowercase before tokenizing.

    preprocessor : callable or None (default=None)
        Override the preprocessing (string transformation) stage while
        preserving the tokenizing and n-grams generation steps.

    tokenizer : callable or None (default=None)
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.
        Only applies if ``analyzer == 'word'``.

    analyzer : string, {'word', 'char', 'char_wb'} or callable
        Whether the feature should be made of word or character n-grams.
        Option 'char_wb' creates character n-grams only from text inside
        word boundaries; n-grams at the edges of words are padded with space.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

    stop_words : string {'english'}, list, or None (default=None)
        If a string, it is passed to _check_stop_list and the appropriate stop
        list is returned. 'english' is currently the only supported string
        value.
        There are several known issues with 'english' and you should
        consider an alternative (see :ref:`stop_words`).

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.
        Only applies if ``analyzer == 'word'``.

        If None, no stop words will be used. max_df can be set to a value
        in the range [0.7, 1.0) to automatically detect and filter stop
        words based on intra corpus document frequency of terms.

    token_pattern : string
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    ngram_range : tuple (min_n, max_n) (default=(1, 1))
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used.

    max_df : float in range [0.0, 1.0] or int (default=1.0)
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    min_df : float in range [0.0, 1.0] or int (default=1)
        When building the vocabulary ignore terms that have a document
        frequency strictly lower than the given threshold. This value is also
        called cut-off in the literature.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    max_features : int or None (default=None)
        If not None, build a vocabulary that only consider the top
        max_features ordered by term frequency across the corpus.

        This parameter is ignored if vocabulary is not None.

    vocabulary : Mapping or iterable, optional (default=None)
        Either a Mapping (e.g., a dict) where keys are terms and values are
        indices in the feature matrix, or an iterable over terms. If not
        given, a vocabulary is determined from the input documents.

    binary : boolean (default=False)
        If True, all non-zero term counts are set to 1. This does not mean
        outputs will have only 0/1 values, only that the tf term in tf-idf
        is binary. (Set idf and normalization to False to get 0/1 outputs.)

    dtype : type, optional (default=float64)
        Type of the matrix returned by fit_transform() or transform().

    norm : 'l1', 'l2' or None, optional (default='l2')
        Each output row will have unit norm, either:
        * 'l2': Sum of squares of vector elements is 1. The cosine
        similarity between two vectors is their dot product when l2 norm has
        been applied.
        * 'l1': Sum of absolute values of vector elements is 1.
        See :func:`preprocessing.normalize`

    use_idf : boolean (default=True)
        Enable inverse-document-frequency reweighting.

    smooth_idf : boolean (default=True)
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : boolean (default=False)
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.

    idf_ : array, shape (n_features)
        The inverse document frequency (IDF) vector; only defined
        if  ``use_idf`` is True.

    stop_words_ : set
        Terms that were ignored because they either:

          - occurred in too many documents (`max_df`)
          - occurred in too few documents (`min_df`)
          - were cut off by feature selection (`max_features`).

        This is only available if no vocabulary was given.

    Examples
    --------
    >>> from sklearn.feature_extraction.text import TfidfVectorizer
    >>> corpus = [
    ...     'This is the first document.',
    ...     'This document is the second document.',
    ...     'And this is the third one.',
    ...     'Is this the first document?',
    ... ]
    >>> vectorizer = TfidfVectorizer()
    >>> X = vectorizer.fit_transform(corpus)
    >>> print(vectorizer.get_feature_names())
    ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
    >>> print(X.shape)
    (4, 9)

    See also
    --------
    CountVectorizer : Transforms text into a sparse matrix of n-gram counts.

    TfidfTransformer : Performs the TF-IDF transformation from a provided
        matrix of counts.

    Notes
    -----
    The ``stop_words_`` attribute can get large and increase the model size
    when pickling. This attribute is provided only for introspection and can
    be safely removed using delattr or set to None before pickling.
    u   contentu   utf-8u   strictu   wordu   (?u)\b\w\w+\bi   g      ?u   l2c      #   C  s   t  t |   j d | d | d | d | d | d | d | d | d	 |	 d
 |
 d | d | d | d | d | d | d |  t d | d | d | d |  |  _ d  S(   NR*   R.   R/   RS   RV   RT   R[   Rl   R;   RZ   R5   R   R   R   Rn   R   R   R   R   R   R   (   t   superR  R   R   t   _tfidf(   R2   R*   R.   R/   RS   RV   RT   R[   Rl   R;   RZ   R5   R   R   R   Rn   R   R   R   R   R   R   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    	c         C  s
   |  j  j S(   N(   R  R   (   R2   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    c         C  s   | |  j  _ d  S(   N(   R  R   (   R2   R  (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    c         C  s
   |  j  j S(   N(   R  R   (   R2   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    c         C  s   | |  j  _ d  S(   N(   R  R   (   R2   R  (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    c         C  s
   |  j  j S(   N(   R  R   (   R2   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    c         C  s   | |  j  _ d  S(   N(   R  R   (   R2   R  (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR   
  s    c         C  s
   |  j  j S(   N(   R  R   (   R2   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    c         C  s   | |  j  _ d  S(   N(   R  R   (   R2   R  (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    c         C  s
   |  j  j S(   N(   R  R   (   R2   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    c         C  so   |  j    t |  d  r_ t |  j  t |  k r_ t d t |  t |  j  f   q_ n  | |  j _ d  S(   Nu   vocabulary_u5   idf length = %d must be equal to vocabulary size = %d(   R|   R   R7   Rv   R$   Rn   R  R   (   R2   R  (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR     s    
%c         C  s5   |  j  t k r1 t j d j t |  j   t  n  d  S(   NuK   Only {} 'dtype' should be used. {} 'dtype' will be converted to np.float64.(   R   R   Rc   Rd   R   t   UserWarning(   R2   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   _check_params$  s    c         C  s6   |  j    t t |   j |  } |  j j |  |  S(   u  Learn vocabulary and idf from training set.

        Parameters
        ----------
        raw_documents : iterable
            an iterable which yields either str, unicode or file objects

        Returns
        -------
        self : TfidfVectorizer
        (   R	  R  R  R   R  R   (   R2   R   R   R   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR   +  s    
c         C  sH   |  j    t t |   j |  } |  j j |  |  j j | d t S(   u  Learn vocabulary and idf, return term-document matrix.

        This is equivalent to fit followed by transform, but more efficiently
        implemented.

        Parameters
        ----------
        raw_documents : iterable
            an iterable which yields either str, unicode or file objects

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix.
        R   (   R	  R  R  R   R  R   R   Rw   (   R2   R   R   R   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR   <  s    
c         C  s>   t  |  d d  t t |   j |  } |  j j | d t S(   u:  Transform documents to document-term matrix.

        Uses the vocabulary and document frequencies (df) learned by fit (or
        fit_transform).

        Parameters
        ----------
        raw_documents : iterable
            an iterable which yields either str, unicode or file objects

        copy : boolean, default True
            Whether to copy X and operate on the copy or perform in-place
            operations.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix.
        u   _tfidfu   The tfidf vector is not fittedR   (   R   R  R  R   R  Rw   (   R2   R   R   R   (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR   S  s    N(   i   i   (   R   R   R   R%   Rs   Rw   R0   R   R   R  R   R  R   R   R   R   R	  R   R   R   (    (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyR    s.   				
	(6   R   t
   __future__R    R   R   t   collectionsR   R   t   operatorR   R   R   Rc   t   numpyR0   t   scipy.sparset   sparseR   t   baseR   R   t	   externalsR   t   externals.six.movesR   t   preprocessingR   t   hashingR	   R;   R
   t   utils.validationR   R   R   t   utils.fixesR   R   Ro   t   utilsR   t   __all__R   R   R!   R(   t   objectR)   R   R   R   R   R   R  (    (    (    s>   lib/python2.7/site-packages/sklearn/feature_extraction/text.pyt   <module>   sR   					  	 	