ó
ù`]c           @   s  d  Z  d d l Z d d l m Z d d l m Z m Z d d l m Z d d l	 m
 Z
 d d l m Z m Z m Z m Z m Z m Z m Z d d l m Z d d	 l m Z m Z d d
 l m Z m Z m Z m Z d d l m Z d d l m  Z  d d l! m" Z" d d l# m$ Z$ m% Z% d d l& m' Z' d d l( m) Z) d d „ Z* e" ƒ  Z+ e j, d e j- ƒ Z. e j, d e j- ƒ Z/ e j, d e j- ƒ Z0 e j, d e j- ƒ Z1 e+ j2 j3 d e. d f ƒ e+ j2 j4 e/ d f ƒ e+ j5 j3 d e0 d f ƒ e+ j6 j3 d e1 d f ƒ d e7 d „ Z8 d S(   s¼  
NLTK Tokenizer Package

Tokenizers divide strings into lists of substrings.  For example,
tokenizers can be used to find the words and punctuation in a string:

    >>> from nltk.tokenize import word_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
    ... two of them.\n\nThanks.'''
    >>> word_tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

This particular tokenizer requires the Punkt sentence tokenization
models to be installed. NLTK also provides a simpler,
regular-expression based tokenizer, which splits text on whitespace
and punctuation:

    >>> from nltk.tokenize import wordpunct_tokenize
    >>> wordpunct_tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

We can also operate at the level of sentences, using the sentence
tokenizer directly as follows:

    >>> from nltk.tokenize import sent_tokenize, word_tokenize
    >>> sent_tokenize(s)
    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
    >>> [word_tokenize(t) for t in sent_tokenize(s)]
    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
    ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]

Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``.

NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers.  (These methods are implemented as generators.)

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> list(WhitespaceTokenizer().span_tokenize(s))
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

There are numerous ways to tokenize text.  If you need more control over
tokenization, see the other methods provided in this package.

For further information, please see Chapter 3 of the NLTK book.
iÿÿÿÿN(   t   load(   t   TweetTokenizert   casual_tokenize(   t   MWETokenizer(   t   PunktSentenceTokenizer(   t   RegexpTokenizert   WhitespaceTokenizert   BlanklineTokenizert   WordPunctTokenizert   wordpunct_tokenizet   regexp_tokenizet   blankline_tokenize(   t   ReppTokenizer(   t   SExprTokenizert   sexpr_tokenize(   t   SpaceTokenizert   TabTokenizert   LineTokenizert   line_tokenize(   t   TextTilingTokenizer(   t   ToktokTokenizer(   t   TreebankWordTokenizer(   t   string_span_tokenizet   regexp_span_tokenize(   t   StanfordSegmenter(   t   SyllableTokenizert   englishc         C   s"   t  d j | ƒ ƒ } | j |  ƒ S(   s  
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
    s   tokenizers/punkt/{0}.pickle(   R    t   formatt   tokenize(   t   textt   languaget	   tokenizer(    (    s5   lib/python2.7/site-packages/nltk/tokenize/__init__.pyt   sent_tokenize_   s    
u   ([Â«â€œâ€˜â€ž]|[`]+)s"   (?i)(\')(?!re|ve|ll|m|t|s|d)(\w)\bu   ([Â»â€â€™])u&   ([^\.])(\.)([\]\)}>"\'Â»â€â€™ ]*)\s*$i    s    \1 s   \1 \2s	   \1 \2 \3 c         C   sK   | r |  g n t  |  | ƒ } g  | D]" } t j | ƒ D] } | ^ q8 q% S(   sõ  
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: An option to keep the preserve the sentence and not sentence tokenize it.
    :type preserve_line: bool
    (   R    t   _treebank_word_tokenizerR   (   R   R   t   preserve_linet	   sentencest   sentt   token(    (    s5   lib/python2.7/site-packages/nltk/tokenize/__init__.pyt   word_tokenize   s    (9   t   __doc__t   ret	   nltk.dataR    t   nltk.tokenize.casualR   R   t   nltk.tokenize.mweR   t   nltk.tokenize.punktR   t   nltk.tokenize.regexpR   R   R   R   R	   R
   R   t   nltk.tokenize.reppR   t   nltk.tokenize.sexprR   R   t   nltk.tokenize.simpleR   R   R   R   t   nltk.tokenize.texttilingR   t   nltk.tokenize.toktokR   t   nltk.tokenize.treebankR   t   nltk.tokenize.utilR   R   t    nltk.tokenize.stanford_segmenterR   t!   nltk.tokenize.sonority_sequencingR   R    R!   t   compilet   Ut   improved_open_quote_regext    improved_open_single_quote_regext   improved_close_quote_regext   improved_punct_regext   STARTING_QUOTESt   insertt   appendt   ENDING_QUOTESt   PUNCTUATIONt   FalseR&   (    (    (    s5   lib/python2.7/site-packages/nltk/tokenize/__init__.pyt   <module>=   s4   4	"		