
\c           @  sE  d  d l  m Z d  d l Z d  d l Z d  d l Z d  d l m Z d  d l m Z d  d l	 m
 Z
 d  d l	 m Z d  d l	 m Z d  d l	 m Z d  d	 l	 m Z d  d
 l	 m Z d  d l	 m Z d  d l	 m Z d  d l m Z d  d l m Z d  d l m Z d  d l m Z d  d l m Z d  d l m Z d  d l Z d  d l m Z d  d l m  Z  d  d l! m" Z" d  d l# m$ Z$ m% Z% m& Z& m' Z' m( Z( m) Z) m* Z* m+ Z+ m, Z, m- Z- m. Z. m/ Z/ m0 Z0 m1 Z1 m2 Z2 m3 Z3 m4 Z4 d  d l5 m6 Z7 d  d l8 m9 Z9 d  d l: m; Z; d  d l< Z< d  d l= m> Z> dm Z? dn Z@ e? e@ ZA d&   ZB d'   ZC d(   ZD d)   ZE d*   ZF d+   ZG e jH jI d, e e f  d-    ZJ d.   ZK d/   ZL d0   ZM d1   ZN d2   ZO d3   ZP d4   ZQ d5   ZR d6   ZS d7   ZT d8   ZU d9   ZV d:   ZW d;   ZX d<   ZY d=   ZZ d>   Z[ e2 e. d? e\  d@     Z] dA   Z^ e jH jI d, e e f  dB    Z_ dC   Z` dD   Za dE   Zb dF   Zc e2 e. d? e\  dG     Zd e jH jI d, e e f  dH    Ze e jH jf dI  e jH jf dJ  dK     Zg e jH jf dI  e jH jf dJ  dL     Zh dM   Zi e2 e. d? e\  dN     Zj dO   Zk dP   Zl dQ   Zm dR   Zn dS   Zo dT   Zp dU   Zq dV   Zr dW   Zs dX   Zt e2 dY    Zu dZ   Zv d[   Zw d\   Zx e jH jI d, e e e f  d]    Zy e jH jI d^ e jz e j{ g  d_    Z| d`   Z} e jH jI da e j~ e j{ e f e j e j{ e f e jz e jz e f e j{ e j{ e f g  db    Z e jH jI dc e dd do  e dd dp  e dd dq  g  dg    Z dh   Z e2 di    Z e4 dj    Z e2 e jH jI dk e e e g  dl     Z d S(r   i(   t   unicode_literalsN(   t   sparse(   t   PY2(   t
   strip_tags(   t   strip_accents_unicode(   t   strip_accents_ascii(   t   HashingVectorizer(   t   CountVectorizer(   t   TfidfTransformer(   t   TfidfVectorizer(   t   ENGLISH_STOP_WORDS(   t   train_test_split(   t   cross_val_score(   t   GridSearchCV(   t   Pipeline(   t	   LinearSVC(   t   clone(   t   assert_array_almost_equal(   t   assert_array_equal(   t   IS_PYPY(   t   assert_equalt   assert_falset   assert_not_equalt   assert_almost_equalt	   assert_int   assert_lesst   assert_greatert   assert_warns_messaget   assert_raise_messaget   clean_warning_registryt   ignore_warningst   SkipTestt   assert_raisest   assert_no_warningst   fails_if_pypyt   assert_allclose_dense_sparset   skip_if_32bit(   t   _Mapping(   t   defaultdict(   t   partial(   t   StringIOu   the pizza pizza beer copyrightu   the pizza burger beer copyrightu!   the the pizza beer beer copyrightu   the burger beer beer copyrightu   the coke burger coke copyrightu   the coke burger burgeru   the salad celeri copyrightu)   the salad salad sparkling water copyrightu   the the celeri celeri copyrightu   the tomato tomato salad wateru    the tomato salad water copyrightc         C  s   t  |   j   S(   N(   R   t   upper(   t   s(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt	   uppercaseB   s    c         C  s   |  j  d d  S(   Nu   éu   e(   t   replace(   R*   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   strip_eacuteF   s    c         C  s
   |  j    S(   N(   t   split(   R*   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   split_tokenizeJ   s    c         C  s   d g S(   Nu   the_ultimate_feature(    (   R*   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   lazy_analyzeN   s    c          C  s   d }  d } t  t |   |  d }  d } t  t |   |  d }  d } t  t |   |  d }  d } t  t |   |  d  S(	   Nu   àáâãäåçèéêëu   aaaaaaceeeeu   ìíîïñòóôõöùúûüýu   iiiinooooouuuuyu   إu   اu   this is à testu   this is a test(   R   R   (   t   at   expected(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_strip_accentsR   s    c          C  s   d }  d } t  t |   |  d }  d } t  t |   |  d }  d } t  t |   |  d }  d } t  t |   |  d  S(	   Nu   àáâãäåçèéêëu   aaaaaaceeeeu   ìíîïñòóôõöùúûüýu   iiiinooooouuuuyu   إu    u   this is à testu   this is a test(   R   R   (   R1   R2   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_to_asciig   s    u
   Vectorizerc      
   C  s  |  d d  j    } d } d d d d d d	 d
 d d d g
 } t | |  |  d } d d d d d d d g } t | |  |  |  d d  j    } t d  } d d d d d d d g } t | |  |  |  d t  j    } d } d d d  d! d" d# d$ d% d& d' g
 } t | |  |  |  d( t d d  j    } d } d) d d d d d* d+ d d d, g
 } t | |  |  d  S(-   Nt   strip_accentsu   asciiu:   J'ai mangé du kangourou  ce midi, c'était pas três bon.u   aiu   mangeu   duu	   kangourouu   ceu   midiu   etaitu   pasu   tresu   bonu0   This is a test, really.

 I met Harry yesterday.u   thisu   isu   testu   reallyu   metu   harryu	   yesterdayt   inputu   fileu'   This is a test with a file-like object!u   withu   likeu   objectt   preprocessoru;   J'ai mangé du kangourou  ce midi,  c'était pas três bon.u   AIu   MANGEu   DUu	   KANGOUROUu   CEu   MIDIu   ETAITu   PASu   TRESu   BONt	   tokenizeru   j'aiu   midi,u   c'etaitu   bon.(   t   build_analyzerR   R(   R+   R/   (   t
   Vectorizert   wat   textR2   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_word_analyzer_unigrams|   s2    		c          C  s}   t  d d d d d d  j   }  d } d	 d
 d d d d d d d d d d d d d d d d d g } t |  |  |  d  S(   Nt   analyzeru   wordR5   u   unicodet   ngram_rangei   i   u:   J'ai mangé du kangourou  ce midi, c'était pas três bon.u   aiu   mangeu   duu	   kangourouu   ceu   midiu   etaitu   pasu   tresu   bonu   ai mangeu   mange duu   du kangourouu   kangourou ceu   ce midiu
   midi etaitu	   etait pasu   pas tresu   tres bon(   i   i   (   R   R9   R   (   R;   R<   R2   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt'   test_word_analyzer_unigrams_and_bigrams   s    c          C  su   d }  |  j  d  } t d d d d  j   } t t | |  t d d	 d d d d  j   } t t | |  d  S(   Nu:   J'ai mangé du kangourou  ce midi, c'était pas três bon.u   utf-8R?   i   i   t   encodingu   asciiR>   u   chari   i   (   i   i   (   i   i   (   t   encodeR   R9   R    t   UnicodeDecodeError(   R<   t
   text_bytesR;   t   ca(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_unicode_decode_error   s    c          C  s:  t  d d d d d d#  j   }  d } d	 d
 d d d g } t |  |  d  |  d d d d d g } t |  |  d |  d } d d d d d g } t |  |  d  |  d d d d d g } t |  |  d |  t  d  d! d d d d$  j   }  t d"  } d d d d d g } t |  |  d  |  d  S(%   NR>   u   charR5   u   unicodeR?   i   i   u9   J'ai mangé du kangourou  ce midi, c'était pas três bonu   j'au   'aiu   ai u   i mu    mai   u   s tresu    tres u   tres bu   res bou   es boniu1   This 
	is a test, really.

 I met Harry yesterdayu   thiu   hisu   is u   s iu    isu    yesteu   yesteru   esterdu   sterdau   terdayR6   u   fileu'   This is a test with a file-like object!(   i   i   (   i   i   (   R   R9   R   R(   (   t   cngaR<   R2   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_char_ngram_analyzer   s"    c          C  s   t  d d d d d d  j   }  d } d	 d
 d d d g } t |  |  d  |  d d d d d g } t |  |  d |  t  d d d d d d  j   }  t d  } d d d d d d g } t |  |  d  |  d  S(    NR>   u   char_wbR5   u   unicodeR?   i   i   u1   This 
	is a test, really.

 I met Harry yesterdayu    thu   thiu   hisu   is u    thii   u   yesteru   esterdu   sterdau   terdayu   erday iR6   u   fileu   A test with a file-like object!u    a u    teu   tesu   estu   st u    tes(   i   i   (   i   i   (   R   R9   R   R(   (   RG   R<   R2   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_char_wb_ngram_analyzer   s    c          C  s   t  d d d d d d  j   }  d } d	 d
 d g } t |  |  d  |  d d d g } t |  |  d |  t  d d d d d d  j   } t |  } t | |  |  |   d  S(   NR>   u   wordR5   u   unicodeR?   i   i   u1   This 
	is a test, really.

 I met Harry yesterdayu   this is testu   is test reallyu   test really metu   test really met harry yesterdayu   this is test really met harryu"   is test really met harry yesterdayiR6   u   file(   i   i   (   i   i   (   R   R9   R   R(   (   RG   R<   R2   t	   cnga_filet   file(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_word_ngram_analyzer   s    	c          C  s   i d d 6d d 6}  t  |  j    } x t t t t t t  g D] } | |   } t d |  } | j	 t
  t | t  r t | j |   n t t  | j  |  | j t
  } t | j d t |   qB Wd  S(   Ni    u   pizzai   u   beert
   vocabulary(   t   sett   keyst   dictt   listt   iterR'   R&   t   intR   t   fitt   JUNK_FOOD_DOCSt
   isinstancet   MappingR   t   vocabulary_t	   transformt   shapet   len(   t   vocabt   termst   typt   vt   vectt   X(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt&   test_countvectorizer_custom_vocabulary   s    "c          C  s   d d g }  t  d t d |   f d t   f g  } | j t  } t t | j d j  t |    t | j	 d t
 |    d  S(   Nu   pizzau   beeru   countRM   u   tfidfi   (   R   R   R   t   fit_transformt   ALL_FOOD_DOCSR   RN   t   named_stepsRX   RZ   R[   (   t   what_we_liket   pipeRa   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt/   test_countvectorizer_custom_vocabulary_pipeline  s    c          C  sX   i d d 6d d 6}  y t  d |   Wn, t k
 rS } t d t |  j    n Xd  S(   Ni    u   pizzau   beerRM   u$   vocabulary contains repeated indices(   R   t
   ValueErrorR   t   strt   lower(   R\   t   e(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt7   test_countvectorizer_custom_vocabulary_repeated_indices  s
    c          C  sX   i d d 6d d 6}  y t  d |   Wn, t k
 rS } t d t |  j    n Xd  S(   Ni   u   pizzai   u   beerRM   u   doesn't contain index(   R   Ri   R   Rj   Rk   (   R\   Rl   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt0   test_countvectorizer_custom_vocabulary_gap_index  s
    c          C  s   t    }  |  j d d  t |  j   t  |  j d d  t t |  j  |  j d d  t t |  j  d d d g } |  j d |  t |  j   t |   d  S(   Nt
   stop_wordsu   englishu   _bad_str_stop_u   _bad_unicode_stop_u   someu   otheru   words(   R   t
   set_paramsR   t   get_stop_wordsR
   R    Ri   RN   (   t   cvt   stoplist(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_countvectorizer_stop_words'  s    	c          C  s   y5 t  d g   }  |  j d g  t s4 t d   Wn, t k
 rc } t d t |  j    n XyA t  d d d d  } | j d	 d
 d g  t s t d   Wn, t k
 r } t d t |  j    n Xd  S(   NRM   u   foou   we shouldn't get hereu   empty vocabularyt   max_dfg      ?Ro   u   englishu   to be or not to beu
   and me toou   and so do you(   R   RT   t   Falset   AssertionErrorRi   R   Rj   Rk   (   R`   Rl   R_   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt%   test_countvectorizer_empty_vocabulary4  s    c          C  sN   t    }  |  j t d   } |  j t d  } t | j d | j d  d  S(   Ni   i   (   R   Rc   Rd   R   RZ   (   Rr   t   X1t   X2(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_fit_countvectorizer_twiceE  s    	c          C  s  d d d g d d d g d d d g g }  t  d t d d  } | j |   j   } | d k j   sl t  t | d j d d  d d d g  d d d g d d d g d d d g g }  t  d t d d  } | j |   j   } | d k j   s t  d  S(	   Ni   i    t
   smooth_idft   normu   l2i   t   axisg      ?(   R   t   TrueRc   t   toarrayt   allRw   R   t   sum(   Ra   t   trt   tfidf(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_tf_idf_smoothingL  s    &c          C  sV  d d d g d d d g d d d g g }  t  d t d d  } | j |   j   } | d k j   sl t  t | d j d d  d d d g  d d d g d d d g d d d g g }  t  d t d d  } t   t	 j
 d	 t  - } d t j d
 g  t |  d k } Wd  QXd } t t | | j |   j   } | sRt d   n  d  S(   Ni   i    R|   R}   u   l2i   R~   g      ?t   recordg        u   divide by zerou&   Numpy does not provide div 0 warnings.(   R   Rv   Rc   R   R   Rw   R   R   R   t   warningst   catch_warningsR   t   npt   arrayR[   R   t   RuntimeWarningR   (   Ra   R   R   t   wt   numpy_provides_div0_warningt   in_warning_message(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_tfidf_no_smoothing`  s(    &	c          C  s   d g d g d g g }  t  d t d t d d   } | j |   j   } t | d d  t | d | d  t | d | d  t | d d  t | d d  d  S(   Ni   i   i   t   sublinear_tft   use_idfR}   i    (	   R   R   Rv   t   NoneRc   R   R   R   R   (   Ra   R   R   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_sublinear_tf~  s    c          C  sS  t  t d   }  t d g } t t  d } t d d  } | j |   } t | d  ri | j   } n  t | d | j d f d  t d	 | j  } x'| | f D]} | j	 |  } t | d  r | j   } n  | j } t | d | d
 f d  t | d | d f d  t | d | d f d  t
 d | k  t
 d | k  t | d | d f d  t | d | d f d  t | d | d f d  t | d | d f d  q Wt d d  }	 |	 j |  j	 |  j   }
 t t |	 j  t | j   t |
 j | t | j  f  |	 j	 |  j   } t | j t |  t | j  f  t d d d t  } | j |  j	 |  j   } t
 t | d   t d t  } t t | j	 |  d d d g d d d g g } | j |  d d g d d g g } t t | j	 |  t t j | d d d g |  t  t d   }  t d d  } | j | _ | j |   j   } t
 | j  t |
 |  | j	 |  j   } t | |  t d	 d   } t t | j	 |   | j d d d t  t | j   t  | j d d d d   t t | j  d | _ t t | j  d  S(    Nii   Ru   g      ?u   tocsri    u   pizzai   RM   u   saladu   tomatou   wateru   theu	   copyrightu   cokeu   burgeru   beerR}   u   l1R   u   idf_i   i   R~   g      ?R5   u   asciit	   lowercaseu   _gabbledegook_R7   u   _invalid_analyzer_type_(   RR   Rd   R[   R   Rc   t   hasattrt   tocsrR   RX   RY   R   R   RT   R   t   idf_RZ   Rv   R   R    Ri   R   R   R   R	   Ru   t   fixed_vocabulary_R   Rp   t   build_preprocessorR   R9   (   t
   train_datat	   test_datat   n_traint   v1t   counts_traint   v2R_   t   counts_testRM   t   t1R   t
   tfidf_testt   t2t   tft   t3Ra   t	   X_incomptt   tvt   tfidf2t   tfidf_test2t   v3(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_vectorizer  sr    	%	#	c       	   C  s   t  d d d t d t d t  }  d |  _ t |  j j d  t |  _ |  j j sX t  t |  _ |  j j ss t  t |  _	 |  j j	 s t  d  S(   NR}   u   l2R   R|   R   u   l1(
   R	   Rv   R}   R   t   _tfidfR   R   Rw   R|   R   (   R   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_tfidf_vectorizer_setters  s    					t   categoryc          C  s  t    }  |  j t  } | j } t | j t t  |  j f  t | j |  j  t	 j
 | j  d k sq t  t	 j
 | j  d k  s t  t	 j | j  d k s t  t	 j | j  d k  s t  x> t | j d  D]) } t t	 j j | d j d  d  q Wt  d d
 d t d d	  }  |  j t  } t | j t t  |  j f  t | j |  j  | j } | | k st  | d | k  st  t	 j
 | j  d k st  t	 j | j  d k  st  x> t | j d  D]) } t t	 j j | d j d  d  qWd  S(   Nii    i   i   g      ?R?   t   non_negativeR}   u   l1(   i   i   (   R   RY   Rd   t   nnzR   RZ   R[   t
   n_featurest   dtypeR   t   mint   dataRw   t   maxt   rangeR   t   linalgR}   R   (   R_   Ra   t	   token_nnzt   it
   ngrams_nnz(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_hashing_vectorizer  s,    		'	c       
   C  s  t  d d  }  t t |  j  t |  j  |  j t  } | j \ } } t	 t
 |  j  |  |  j   } t	 t
 |  |  t d d d d d d d	 d
 d g	 |  x3 t |  D]% \ } } t	 | |  j j |   q Wd d d d d d d	 d
 d g	 } t  d |  }  |  j   } t d d d d d d d	 d
 d g	 |  |  j sPt  x3 t |  D]% \ } } t	 | |  j j |   q]Wd  S(   NRu   g      ?u   beeru   burgeru   celeriu   cokeu   pizzau   saladu	   sparklingu   tomatou   waterRM   (   R   R    Ri   t   get_feature_namesR   R   Rc   Rd   RZ   R   R[   RX   R   t	   enumeratet   getRw   (   Rr   Ra   t	   n_samplesR   t   feature_namest   idxt   nameR\   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_feature_names%  s,    c         C  s   t  d d d d g  } t  d d d d d	 d
 d g  } |  d d d d  } | j t  t t  | j  |  t | j |  d  S(   Nu   burgeru   beeru   saladu   pizzau   celeriu   tomatou	   copyrightu   cokeu	   sparklingu   wateru   theRu   g333333?t   max_featuresi   (   RN   RT   Rd   R   RX   t   stop_words_(   R:   t   expected_vocabularyt   expected_stop_wordst
   vectorizer(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_vectorizer_max_featuresH  s    c    	      C  s-  t  d d  }  t  d d  } t  d d   } |  j t  j d d  } | j t  j d d  } | j t  j d d  } |  j   } | j   } | j   } t d | j    t d | j    t d | j    t d | t j	 |   t d | t j	 |   t d | t j	 |   d  S(   NR   i   i   R~   i    i   u   the(
   R   R   Rc   RU   R   R   R   R   R   t   argmax(	   t   cv_1t   cv_3t   cv_Nonet   counts_1t   counts_3t   counts_Nonet
   features_1t
   features_3t   features_None(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt"   test_count_vectorizer_max_featuresU  s    c          C  sr  d d d g }  t  d d d d  } | j |   d | j j   k sL t  t t | j j    d	  t t | j  d
  d | _ | j |   d | j j   k s t  t t | j j    d  d | j k s t  t t | j  d  d | _ | j |   d | j j   k s't  t t | j j    d  d | j k sXt  t t | j  d  d  S(   Nu   abcu   deau   eatR>   u   charRu   g      ?u   ai   i    g      ?i   i   i   (	   R   RT   RX   RO   Rw   R   R[   R   Ru   (   R   R`   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_vectorizer_max_dfo  s$    		c          C  sr  d d d g }  t  d d d d  } | j |   d | j j   k sL t  t t | j j    d	  t t | j  d
  d | _ | j |   d | j j   k s t  t t | j j    d  d | j k s t  t t | j  d  d | _ | j |   d | j j   k s't  t t | j j    d  d | j k sXt  t t | j  d  d  S(   Nu   abcu   deau   eatR>   u   chart   min_dfi   u   ai   i    i   u   ci   g?i   (	   R   RT   RX   RO   Rw   R   R[   R   R   (   R   R`   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_vectorizer_min_df  s$    		c       	   C  s4  d d g }  t  d d d d  } | j |   j   } t d d d	 d
 d g | j    t d d d d d g d d d d d g g |  t  d d d d d t  } | j |   j   } t d d d d d g d d d d d g g |  t  d d d d d t d t j  } | j |   } t | j	 t j  d  S(   Nu   aaabcu   abbdeR>   u   charRu   g      ?u   au   bu   cu   du   ei   i   i    i   t   binaryR   (
   R   Rc   R   R   R   R   R   t   float32R   R   (   R   R`   Ra   t   X_sparse(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_count_binary_occurrences  s    "c          C  s5  d d g }  t  d d d t d d   } | j |   } t t j | d d !j  d	  t t j | d d
 !j  d
  t | j t j	  t  d d d t d t d d   } | j |   } t t j | j  d  t | j t j	  t  d d d t d t d d  d t j	  } | j |   } t | j t j	  d  S(   Nu   aaabcu   abbdeR>   u   charR   R}   i    i   i   i   R   R   (
   R   R   R   RY   R   R   R   R   R   t   float64(   R   R`   Ra   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_hashed_binary_occurrences  s     	  	c         C  s   t  } |    } | j |  } | j |  } | j   } x` t | |  D]O \ } } t j t j | |    } t j t j |   } t | |  qI W| j	   } | j |  }	 x< t | |	  D]+ \ } }
 t t j |  t j |
   q Wd  S(   N(
   Rd   Rc   t   inverse_transformR9   t   zipR   t   sortt   uniqueR   R   (   R:   R   R   t   transformed_datat   inversed_datat   analyzet   doct   inversed_termsR]   t   inversed_data2t   terms2(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt!   test_vectorizer_inverse_transform  s    	u    ignore: The default of the `iid`u"   ignore: You should specify a valuec          C  s  t  t }  d g t t   d g t t  } t |  | d d d d \ } } } } t d t   f d t   f g  } i d d g d
 6d d 6} t | | d d } | j | |  j	 |  }	 t
 |	 |  t | j d  | j j d }
 t |
 j d  d  S(   Nii   t	   test_sizeg?t   random_statei    u   vectu   svci   u   vect__ngram_rangeu   hingeu   squared_hingeu	   svc__losst   n_jobsg      ?(   i   i   (   i   i   (   u   hingeu   squared_hinge(   i   i   (   RU   t   NOTJUNK_FOOD_DOCSR[   R   R   R   R   R   RT   t   predictR   R   t   best_score_t   best_estimator_Re   R?   (   R   t   targetR   R   t   target_traint   target_testt   pipelinet
   parameterst   grid_searcht   predt   best_vectorizer(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt-   test_count_vectorizer_pipeline_grid_selection  s    
$$
c          C  s(  t  t }  d g t t   d g t t  } t |  | d d d d \ } } } } t d t   f d t   f g  } i d d g d
 6d d 6d d 6} t | | d d } | j | |  j	 |  }	 t
 |	 |  t | j d  | j j d }
 t |
 j d  t |
 j d  t |
 j  d  S(   Nii   R   g?R   i    u   vectu   svci   u   vect__ngram_rangeu   l1u   l2u
   vect__normu   hingeu   squared_hingeu	   svc__lossR   g      ?(   i   i   (   i   i   (   u   l1u   l2(   u   hingeu   squared_hinge(   i   i   (   RU   R   R[   R   R   R	   R   R   RT   R   R   R   R   R   Re   R?   R}   R   R   (   R   R   R   R   R   R   R   R   R   R   R   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt'   test_vectorizer_pipeline_grid_selection	  s$    
$$
c          C  s   t  t }  d g t t   d g t t  } t d t   f d t   f g  } t | |  | d d } t | d d d g  d  S(   Nii   u   vectu   svcRr   i   g      ?(   RU   R   R[   R   R	   R   R   R   (   R   R   R   t	   cv_scores(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt)   test_vectorizer_pipeline_cross_validation2  s    
$c          C  s   d }  t    } | j |  g  } t | j d  t d d  d t  } | j |  g  } t | j d d	 f  t | j | j  t	 t
 j | j  t
 j | j   d  S(
   Nu  ÐÐ°ÑÐ¸Ð½Ð½Ð¾Ðµ Ð¾Ð±ÑÑÐµÐ½Ð¸Ðµ â Ð¾Ð±ÑÐ¸ÑÐ½ÑÐ¹ Ð¿Ð¾Ð´ÑÐ°Ð·Ð´ÐµÐ» Ð¸ÑÐºÑÑÑÑÐ²ÐµÐ½Ð½Ð¾Ð³Ð¾ Ð¸Ð½ÑÐµÐ»Ð»ÐµÐºÑÐ°, Ð¸Ð·ÑÑÐ°ÑÑÐ¸Ð¹ Ð¼ÐµÑÐ¾Ð´Ñ Ð¿Ð¾ÑÑÑÐ¾ÐµÐ½Ð¸Ñ Ð°Ð»Ð³Ð¾ÑÐ¸ÑÐ¼Ð¾Ð², ÑÐ¿Ð¾ÑÐ¾Ð±Ð½ÑÑ Ð¾Ð±ÑÑÐ°ÑÑÑÑ.i   i   R}   R   i   i   (   i   i   i   (   R   Rc   R   RZ   R   R   R   RY   R   R   R   R   R   (   t   documentR`   t	   X_countedt   X_hashed(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_vectorizer_unicode@  s    	c          C  se   d d g }  t  d |   } | j t  } | j t  } t | j   | j    | j sa t  d  S(   Nu   pizzau   celeriRM   (   R	   Rc   Rd   RY   R   R   R   Rw   (   RM   R`   t   X_1t   X_2(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt+   test_tfidf_vectorizer_with_fixed_vocabularye  s    c          C  sA  t    t  d d  t  d t  t  d d
  t   t d t  t d t  t d t  j t  t d	 t  j t  t   t d t  t   j t  g }  x |  D] } t	 j
 |  } t	 j |  } t t |  | j  t | j   | j    t rt | t   rq q t | j t  j   | j t  j    q Wd  S(   NR}   u   l1R   R?   i   i   R7   R>   R5   (   i   i   (   R   R   R   R   R0   RT   RU   R-   R	   t   picklet   dumpst   loadsR   t   typet	   __class__t
   get_paramsR   RV   R   Rc   R   (   t	   instancest   origR*   t   copy(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_pickling_vectorizero  s,    c       
   C  s   t  j j d  }  t  j d d d d d d d d	 d
 g	  } x t d d  D] } t |  j | d d d t  } t d |  } t	 j
 t	 j |   } | j t  | j t  t | j   | j    qL Wd  S(   Ni    u   beeru   burgeru   celeriu   cokeu   pizzau   saladu	   sparklingu   tomatou   waterid   t   sizei   R,   RM   (   R   t   randomt   RandomStateR   R   RN   t   choiceRv   R   R  R  R  RT   Rd   R   R   (   t   rngt   vocab_wordst   xt	   vocab_setRr   t   unpickled_cv(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt-   test_countvectorizer_vocab_sets_when_pickling  s    !c       
   C  s   t  j j d  }  t  j d d d d d d d d	 d
 g	  } x t d d  D] } t   } |  j | d d d t } x% t d d  D] } | | | | <q Wt d |  } t	 j
 t	 j |   } | j t  | j t  t | j   | j    qL Wd  S(   Ni    u   beeru   burgeru   celeriu   cokeu   pizzau   saladu	   sparklingu   tomatou   waterid   R  i   R,   RM   (   R   R  R  R   R   RP   R  Rv   R   R  R  R  RT   Rd   R   R   (   R  R  R  t
   vocab_dictt   wordst   yRr   R  (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt.   test_countvectorizer_vocab_dicts_when_pickling  s    	c          C  s   t    j t  t d t  j t  t d t  j t  f }  x} |  D]u } | j t  j   } d  | _	 | j t  j   } t
 | d  | j t  j   } t | |  t | |  qF Wd  S(   NR7   R5   u   stop_words_(   R	   RT   RU   R   R   R-   RY   R   R   R   t   delattrR   (   t   fitted_vectorizersR`   t   vect_transformt   stop_None_transformt   stop_del_transform(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_stop_words_removal  s    	c          C  s   t    j t  }  t   j |   } t j |  } t j |  } t t	 |  | j
  t | j |   j   | j |   j    d  S(   N(   R   Rc   RU   R   RT   R  R  R  R   R  R  R   R   (   Ra   R  R*   R  (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_pickling_transformer  s    c          C  sh   t    j t  }  t   j |   } t   } | j | _ t | j |   j   | j |   j    d  S(   N(	   R   Rc   RU   R   RT   R   R   RY   R   (   Ra   R  R  (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_transformer_idf_setter  s    	c          C  so   t  d t  }  |  j t  t  d |  j d t  } |  j | _ t | j t  j   |  j t  j    d  S(   NR   RM   (	   R	   R   RT   RU   RX   R   R   RY   R   (   R  R  (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_tfidf_vectorizer_setter  s    c          C  sn   t  d t  }  |  j t  t  d |  j d t  } t |  j  } d g | d } t t t	 | d |  d  S(   NR   RM   g      ?i   u   idf_(
   R	   R   RT   RU   RX   R[   R   R    Ri   t   setattr(   R`   R  t   expected_idf_lent   invalid_idf(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt%   test_tfidfvectorizer_invalid_idf_attr  s    c          C  s;   d d d d d g }  t  d |   } t t | j g   d  S(   Nu   au   bu   cRM   (   R   R    Ri   RT   (   R\   R`   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_non_unique_vocab  s    c          C  s)   d }  t  } d   } t | |  |  d  S(   Nu?   np.nan is an invalid document, expected byte or unicode string.c          S  s&   t    }  |  j d t j d g  d  S(   Nu   hello worldu   hello hello(   R   Rc   R   t   nan(   t   hv(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   func  s    	(   Ri   R   (   t   messaget	   exceptionR,  (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt"   test_hashingvectorizer_nan_in_docs  s    	c          C  s   t  d t d t d d   }  |  j s* t  |  j d d g  j   } t | j	   d d d d g  |  j
 d d g  j   } t | j	   d d d d g  d  S(   NR   R   R}   u   hello worldu   hello helloi   i    (   R	   R   Rv   R   R   Rw   Rc   R   R   t   ravelRY   (   R_   Ra   Rz   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_tfidfvectorizer_binary  s    c          C  s6   t  d t  }  |  j t  t |  j |  j j  d  S(   NR   (   R	   R   RT   RU   R   R   R   (   R`   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_tfidfvectorizer_export_idf  s    c          C  sO   t  d d g  }  t |   } |  j t  | j t  t | j |  j  d  S(   NRM   u   the(   R	   R   RT   Rd   R   RX   (   t
   vect_vocabt   vect_vocab_clone(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_vectorizer_vocab_clone  s
    c         C  sU   d } |    } t  t | | j d  t  t | | j d  t  t | | j d  d  S(   NuB   Iterable over raw text documents expected, string object received.u   hello world!(   R   Ri   Rc   RT   RY   (   R:   R-  t   vec(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt&   test_vectorizer_string_object_as_input  s    	u   X_dtypec         C  sL   t  j d d d |  d d } t   j |  } | j | j k sH t  d  S(   Ni
   i N  R   R   i*   (   R   t   randR   Rc   R   Rw   (   t   X_dtypeRa   t   X_trans(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_tfidf_transformer_type"  s    c          C  s   t  j d d d t j d d }  t  j |   } t  j |   } t   j |  } t   j |  } t | |  | j	 | j	 k s t
  d  S(   Ni
   i N  R   R   i*   (   R   R8  R   R   t
   csc_matrixt
   csr_matrixR   Rc   R#   t   formatRw   (   Ra   t   X_csct   X_csrt   X_trans_csct   X_trans_csr(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_tfidf_transformer_sparse)  s    !u0   vectorizer_dtype, output_dtype, warning_expectedc         C  s   t  j d d d g  } t d |   } d } t } | r? | n d  } t j | d |  } | j |  }	 Wd  QX| d  k r g  | D] }
 t |
 |  r |
 ^ q } t	 |  d k s t
  n  |	 j | k s t
  d  S(   Nu   numpyu   scipyu   sklearnR   u   'dtype' should be used.t   matchi    (   R   R   R	   t   UserWarningR   t   pytestt   warnsRc   RV   R[   Rw   R   (   t   vectorizer_dtypet   output_dtypet   warning_expectedRa   R   t   warning_msg_matcht   warning_clst   expected_warning_clsR   t   X_idfR   t   relevant_warnings(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   test_tfidf_vectorizer_type4  s    	u   vecR?   i   i   c         C  s   |  j  } d t |  } t |  t  r; t j d d  n  t t | |  j d g  t t | |  j	 d g  t |  t  r t t | |  j
 d g  n  d  S(   NuO   Invalid value for ngram_range=%s lower boundary larger than the upper boundary.t   reasonu'   HashingVectorizer not supported on PyPyu   good news everyone(   R?   Rj   RV   R   RF  t   xfailR   Ri   RT   Rc   RY   (   R6  t   invalid_rangeR-  (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt$   test_vectorizers_invalid_ngram_rangeM  s    	c         C  s7   |  j    } |  j   } |  j   } |  j | | |  S(   N(   Rq   t   build_tokenizerR   t   _check_stop_words_consistency(   t	   estimatorRo   t   tokenizet
   preprocess(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyRV  f  s
    c          C  s   t  r d }  n d }  d |  } xs t   t   t   g D]Y } | j d d d d d g  t t | | j d	 g  | ` t	 |  t
 k s8 t  q8 Wt | j d	 g  t	 |  d  k s t  | j d d d d d
 d g  t t | | j d	 g  d  S(   Nu   [u'and', u'll', u've']u   ['and', 'll', 've']u}   Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens %s not in stop_words.Ro   u   you'veu   youu   you'llu   ANDu   hello worldu   blah(   R   R   R	   R   Rp   R   RE  Rc   t   _stop_words_idRV  Rv   Rw   R!   R   (   t   lstrR-  R6  (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt'   test_vectorizer_stop_words_inconsistentn  s"    		
c          C  s   t  j d
 d t j }  t j } |  j j |  |  _ |  j j |  |  _ i d d 6d d 6d d 6} t   j |  |  } | | j j	 k s t
  d	 S(   u   
    Check that CountVectorizer._sort_features preserves the dtype of its sparse
    feature matrix.

    This test is skipped on 32bit platforms, see:
        https://github.com/scikit-learn/scikit-learn/pull/11295
    for more details.
    i   R   i    u   scikit-learni   u   isi   u   great!N(   i   i   (   R   R=  R   t   int64t   indicest   astypet   indptrR   t   _sort_featuresR   Rw   (   Ra   t   INDICES_DTYPERM   t   Xs(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt7   test_countvectorizer_sort_features_64bit_sparse_indices  s    	
u	   Estimatorc         C  s   i d d 6g } |    } t  |  t k s1 t  |  d d   d d g  } t  |  d k sd t  t  |  d  k s| t  | j |  d |  f d	     Y} | d d g  } t  |  d k s t  |  d
 d   d d g  } t  |  t k s t  d  S(   Nu	   some textu   textR7   c         S  s   |  d S(   Nu   text(    (   R  (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   <lambda>  s    Ro   u   andu   errort   CustomEstimatorc           B  s   e  Z d    Z RS(   c         S  s   d   S(   Nc         S  s   |  d S(   Nu   text(    (   R  (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyRe    s    (    (   t   self(    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyR     s    (   t   __name__t
   __module__R   (    (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyRf    s   R8   c         S  s   t  j d  j |   S(   Nu   \w{1,}(   t   ret   compilet   findall(   R   (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyRe    s   (   RV  R   Rw   R   Rc   (   t	   EstimatorR   R6  Rf  (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt-   test_stop_word_validation_custom_preprocessor  s    	(   u   the pizza pizza beer copyrightu   the pizza burger beer copyrightu!   the the pizza beer beer copyrightu   the burger beer beer copyrightu   the coke burger coke copyrightu   the coke burger burger(   u   the salad celeri copyrightu)   the salad salad sparkling water copyrightu   the the celeri celeri copyrightu   the tomato tomato salad wateru    the tomato salad water copyright(   i   i   (   i   i   (   i   i   (   t
   __future__R    Rj  R   RF  t   scipyR   t   sklearn.externals.sixR   t   sklearn.feature_extraction.textR   R   R   R   R   R   R	   R
   t   sklearn.model_selectionR   R   R   t   sklearn.pipelineR   t   sklearn.svmR   t   sklearn.baseR   t   numpyR   t   numpy.testingR   R   t   sklearn.utilsR   t   sklearn.utils.testingR   R   R   R   R   R   R   R   R   R   R   R   R    R!   R"   R#   R$   t   sklearn.utils.fixesR%   RW   t   collectionsR&   t	   functoolsR'   R  t   ioR(   RU   R   Rd   R+   R-   R/   R0   R3   R4   t   markt   parametrizeR=   R@   RF   RH   RI   RL   Rb   Rh   Rm   Rn   Rt   Rx   R{   R   R   R   R   R   t   DeprecationWarningR   R   R   R   R   R   R   R   R   t   filterwarningsR   R   R   R   R  R  R  R  R!  R"  R#  R$  R(  R)  R/  R1  R2  R5  R7  R   R   R;  RC  t   int32R   R]  Rv   RP  RT  RV  R\  Rd  Rn  (    (    (    sI   lib/python2.7/site-packages/sklearn/feature_extraction/tests/test_text.pyt   <module>   s   p         
						$&																g	'	#$				$%(	$	
						
	
						*		$	