
`]c           @  s   d  Z  d d l m Z d d l Z d d l m Z d d l m Z d d l m	 Z	 m
 Z
 m Z m Z m Z m Z d e j f d     YZ d S(	   uC   
Unit tests for nltk.tokenize.
See also nltk/test/tokenize.doctest
i(   t   unicode_literalsN(   t   SkipTest(   t   assert_equal(   t   punktt   word_tokenizet   TweetTokenizert   StanfordSegmentert   TreebankWordTokenizert   SyllableTokenizert   TestTokenizec           B  sk   e  Z d    Z d   Z d   Z d   Z d   Z d   Z d   Z d   Z	 d   Z
 d	   Z d
   Z RS(   c      
   C  sb   t  d t d t  } d } | j |  } d d d d d d d	 d
 d d g
 } |  j | |  d S(   uW   
        Test TweetTokenizer using words with special and accented characters.
        t   strip_handlest
   reduce_lenuA   @myke: Let's test these words: resumé España München françaisu   :u   Let'su   testu   theseu   wordsu   resuméu   Españau   Münchenu	   françaisN(   R   t   Truet   tokenizet   assertEqual(   t   selft	   tokenizert   s9t   tokenst   expected(    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyt   test_tweet_tokenizer   s    	c         C  s;   t    } | j d  } |  j | d d d d d g  d S(   u3   
        Test SyllableTokenizer tokenizer.
        u   justificationu   jusu   tiu   fiu   cau   tionN(   R   R   R   (   R   R   R   (    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyt+   test_sonority_sequencing_syllable_tokenizer/   s    	c         C  s   yq t    } | j d  d } | j | j    } | j   d d d d d d d	 d
 d d d d g k sp t  Wn% t k
 r } t t |    n Xd S(   uN   
        Test the Stanford Word Segmenter for Arabic (default config)
        u   arun   يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلاتu   يبحثu   علمu   الحاسوبu   استخدامu   الحوسبةu   بu   جميعu
   اشكالu   هاu   لu   حلu   المشكلاتN(   R   t   default_configt   segmentt   splitt   AssertionErrort   LookupErrorR   t   str(   R   t   segt   sentt   segmented_sentt   e(    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyt   test_stanford_segmenter_arabic7   s(    		c         C  s   y_ t    } | j d  d } | j | j    } | j   d d d d d d g k s^ t  Wn% t k
 r } t t |    n Xd	 S(
   uO   
        Test the Stanford Word Segmenter for Chinese (default config)
        u   zhu$   这是斯坦福中文分词器测试u   这u   是u	   斯坦福u   中文u	   分词器u   测试N(   R   R   R   R   R   R   R   R   (   R   R   R   R   R   (    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyt   test_stanford_segmenter_chineseQ   s    	.c         C  sr   t    } d } d g } | j |  } |  j | |  d } d d d d g } | j |  } |  j | |  d S(   uT   
        Test a string that resembles a phone number but contains a newline
        u   (393)  928 -3010u   (393)
928 -3010u   (u   393u   )u	   928 -3010N(   R   R   R   (   R   R   t   test1R   t   resultt   test2(    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyt   test_phone_tokenizer^   s    		c      &   C  s  t  d t  } d } d d d g } | j |  } |  j | |  d } d d d	 d
 d d d d d d d d d d d d d d d d d d d d d d d d d g } | j |  } |  j | |  d  } d! d" d# d" d$ d" d% d" d& d" d' d" d( d" d) d" d* d" d+ d" d, d" d- d" d. d" d/ d" d0 d" d1 d" d2 d" d3 d" d4 d" g& } | j |  } |  j | |  d5 } d. d! d0 d! d1 d! d2 d! d3 d! d4 d! g } | j |  } |  j | |  d6 } d. d" d0 d" d1 d" d2 d" d3 d" d4 d" d" d" d/ d" d" d/ d" d7 d" d8 d" d9 d" g } | j |  } |  j | |  d: }	 d; d< d- d= g } | j |	  } |  j | |  d> }
 d? d@ dA d@ d- d@ dB d@ g } | j |
  } |  j | |  dC S(D   uW   
        Test remove_handle() from casual.py with specially crafted edge cases
        R
   u-   @twitter hello @twi_tter_. hi @12345 @123newsu   hellou   .u   hiu]   @n`@n~@n(@n)@n-@n=@n+@n\@n|@n[@n]@n{@n}@n;@n:@n'@n"@n/@n?@n.@n,@n<@n>@n @n
@n ñ@n.ü@n.ç@n.u   `u   ~u   (u   )u   -u   =u   +u   \u   |u   [u   ]u   {u   }u   ;u   :u   'u   "u   /u   ?u   ,u   <u   >u   ñu   üu   çuK   a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@nu   au   @nu   ju   zu   Au   Lu   Zu   1u   4u   7u   9u   0u   _u   !u   @u   #u   $u   %u   &u   *u   @n!a @n#a @n$a @n%a @n&a @n*auD   @n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@nu   @n_u   @n7u   @njum   @abcdefghijklmnopqrstuvwxyz @abcdefghijklmnopqrst1234 @abcdefghijklmnopqrst_ @abcdefghijklmnopqrstendofhandleu   uvwxyzu   1234u   endofhandleur   @abcdefghijklmnopqrstu@abcde @abcdefghijklmnopqrst@abcde @abcdefghijklmnopqrst_@abcde @abcdefghijklmnopqrst5@abcdeu   uu   @abcdeu   @abcdefghijklmnopqrstu   5N(   R   R   R   R   (   R   R   R"   R   R#   R$   t   test3t   test4t   test5t   test6t   test7(    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyt   test_remove_handleq   s    		*		c         C  so  t    } d } dO dP dQ dR dS dT dU dV dW dX dY dZ d[ d\ d] d^ d_ d` da db dc dd de g } t | j |   } |  j | |  d' } df dg dh di dj dk dl dm dn do dp dq dr ds dt du dv dw dx dy dz d{ g } t | j |   } |  j | |  dC } d| d} d~ d d d d d d d d d d d d d d d d d d d d d g } t | j |   } |  j | |  dN S(   uC   
        Test TreebankWordTokenizer.span_tokenize function
        uN   Good muffins cost $3.88
in New (York).  Please (buy) me
two of them.
(Thanks).i    i   i   i   i   i   i   i   i   i   i   i   i   i   i    i$   i%   i&   i(   i.   i/   i0   i3   i4   i5   i7   i8   i;   i<   i>   i?   iD   iE   iF   iL   iM   iN   um   The DUP is similar to the "religious right" in the United States and takes a hardline stance on social issuesi   i   i   i
   i   i   i   i   i*   i+   i,   i2   i9   i:   i@   iA   iJ   iK   iU   iV   i\   i]   i_   i`   if   ig   im   uq   The DUP is similar to the "religious right" in the United States and takes a ``hardline'' stance on social issuesiO   iW   iY   iZ   ia   ic   id   ij   ik   iq   N(   i    i   (   i   i   (   i   i   (   i   i   (   i   i   (   i   i   (   i   i   (   i   i    (   i    i$   (   i$   i%   (   i%   i&   (   i(   i.   (   i/   i0   (   i0   i3   (   i3   i4   (   i5   i7   (   i8   i;   (   i<   i>   (   i?   iD   (   iE   iF   (   iF   iL   (   iL   iM   (   iM   iN   (   i    i   (   i   i   (   i   i
   (   i   i   (   i   i   (   i   i   (   i   i   (   i   i$   (   i%   i*   (   i*   i+   (   i,   i.   (   i/   i2   (   i3   i9   (   i:   i@   (   iA   iD   (   iE   iJ   (   iK   iL   (   iM   iU   (   iV   i\   (   i]   i_   (   i`   if   (   ig   im   (   i    i   (   i   i   (   i   i
   (   i   i   (   i   i   (   i   i   (   i   i   (   i   i$   (   i%   i*   (   i*   i+   (   i,   i.   (   i/   i2   (   i3   i9   (   i:   i@   (   iA   iD   (   iE   iJ   (   iK   iL   (   iM   iO   (   iO   iW   (   iW   iY   (   iZ   i`   (   ia   ic   (   id   ij   (   ik   iq   (   R   t   listt   span_tokenizeR   (   R   R   R"   R   R#   R$   R&   (    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyt   test_treebank_span_tokenizer
  s    				c         C  s   d } d d d d d d d d d	 d
 d d d d d g } |  j  t |  |  d } d d d d d g } |  j  t |  |  d S(   u-   
        Test word_tokenize function
        u0   The 'v', I've been fooled but I'll seek revenge.u   Theu   'u   vu   ,u   Iu   'veu   beenu   fooledu   butu   'llu   seeku   revengeu   .u   'v' 're'u   'reN(   R   R   (   R   t   sentenceR   (    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyt   test_word_tokenizek  s    c         C  s   d d d	 g f d d
 d d g f d d d d d g f g } xC | D]; \ } } g  t j |  D] } | ^ q_ } t | |  qC Wd  S(   Nu   12u   1u   2u   123u   3u   1234u   4(   u   1u   2(   u   2N(   u   1u   2(   u   2u   3(   u   3N(   u   1u   2(   u   2u   3(   u   3u   4(   u   4N(   t   NoneR   t
   _pair_iterR   (   R   t
   test_casest
   test_inputt   expected_outputt   xt   actual_output(    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyt   test_punkt_pair_itery  s    "c         C  s)   t  g   } t j |  } t |  d  S(   N(   t   iterR   R2   R,   (   R   t   itt   gen(    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyt5   test_punkt_pair_iter_handles_stop_iteration_exception  s    c         C  sB   t  j   } d d d     Y} |   | _ t | j d   d  S(   Nt   TestPunktTokenizeWordsMockc           B  s   e  Z d    Z RS(   c         S  s
   t  g   S(   N(   R9   (   R   t   s(    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyR     s    (   t   __name__t
   __module__R   (    (    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyR=     s   u   test(    (   R   t   PunktBaseClasst
   _lang_varsR,   t   _tokenize_words(   R   t   objR=   (    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyt:   test_punkt_tokenize_words_handles_stop_iteration_exception  s    (   R?   R@   R   R   R    R!   R%   R+   R.   R0   R8   R<   RE   (    (    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyR	      s   							a			(   t   __doc__t
   __future__R    t   unittestt   noseR   t
   nose.toolsR   t   nltk.tokenizeR   R   R   R   R   R   t   TestCaseR	   (    (    (    s;   lib/python2.7/site-packages/nltk/test/unit/test_tokenize.pyt   <module>   s   .
