ó
ù`]c           @  sª   d  Z  d d l m Z d d l m Z m Z d d l m Z m Z d e f d „  ƒ  YZ	 d e f d „  ƒ  YZ
 d	 e f d
 „  ƒ  YZ d e f d „  ƒ  YZ d d „ Z d S(   uq  
Simple Tokenizers

These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.

The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:

    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    >>> s.split()
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    >>> s.split(' ')
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>> s.split('\n')
    ['Good muffins cost $3.88', 'in New York.  Please buy me',
    'two of them.', '', 'Thanks.']

The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer.  For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.

iÿÿÿÿ(   t   unicode_literals(   t
   TokenizerIt   StringTokenizer(   t   string_span_tokenizet   regexp_span_tokenizet   SpaceTokenizerc           B  s   e  Z d  Z d Z RS(   u­  Tokenize a string using the space character as a delimiter,
    which is the same as ``s.split(' ')``.

        >>> from nltk.tokenize import SpaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> SpaceTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    u    (   t   __name__t
   __module__t   __doc__t   _string(    (    (    s3   lib/python2.7/site-packages/nltk/tokenize/simple.pyR   *   s   	t   TabTokenizerc           B  s   e  Z d  Z d Z RS(   uä   Tokenize a string use the tab character as a delimiter,
    the same as ``s.split('\t')``.

        >>> from nltk.tokenize import TabTokenizer
        >>> TabTokenizer().tokenize('a\tb c\n\t d')
        ['a', 'b c\n', ' d']
    u   	(   R   R   R   R	   (    (    (    s3   lib/python2.7/site-packages/nltk/tokenize/simple.pyR
   8   s   t   CharTokenizerc           B  s    e  Z d  Z d „  Z d „  Z RS(   u„   Tokenize a string into individual characters.  If this functionality
    is ever required directly, use ``for char in string``.
    c         C  s
   t  | ƒ S(   N(   t   list(   t   selft   s(    (    s3   lib/python2.7/site-packages/nltk/tokenize/simple.pyt   tokenizeI   s    c         c  s?   x8 t  t d t | ƒ d ƒ ƒ D] \ } } | | f Vq  Wd  S(   Ni   (   t	   enumeratet   ranget   len(   R   R   t   it   j(    (    s3   lib/python2.7/site-packages/nltk/tokenize/simple.pyt   span_tokenizeL   s    ,(   R   R   R   R   R   (    (    (    s3   lib/python2.7/site-packages/nltk/tokenize/simple.pyR   D   s   	t   LineTokenizerc           B  s,   e  Z d  Z d d „ Z d „  Z d „  Z RS(   uV  Tokenize a string into its lines, optionally discarding blank lines.
    This is similar to ``s.split('\n')``.

        >>> from nltk.tokenize import LineTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> LineTokenizer(blanklines='keep').tokenize(s)
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', '', 'Thanks.']
        >>> # same as [l for l in s.split('\n') if l.strip()]:
        >>> LineTokenizer(blanklines='discard').tokenize(s)
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', 'Thanks.']

    :param blanklines: Indicates how blank lines should be handled.  Valid values are:

        - ``discard``: strip blank lines out of the token list before returning it.
           A line is considered blank if it contains only whitespace characters.
        - ``keep``: leave all blank lines in the token list.
        - ``discard-eof``: if the string ends with a newline, then do not generate
           a corresponding token ``''`` after that newline.
    u   discardc         C  s;   d } | | k r. t  d d j | ƒ ƒ ‚ n  | |  _ d  S(   Nu   discardu   keepu   discard-eofu   Blank lines must be one of: %su    (   u   discardu   keepu   discard-eof(   t
   ValueErrort   joint   _blanklines(   R   t
   blanklinest   valid_blanklines(    (    s3   lib/python2.7/site-packages/nltk/tokenize/simple.pyt   __init__h   s
    c         C  s}   | j  ƒ  } |  j d k rC g  | D] } | j ƒ  r" | ^ q" } n6 |  j d k ry | ry | d j ƒ  ry | j ƒ  qy n  | S(   Nu   discardu   discard-eofiÿÿÿÿ(   t
   splitlinesR   t   rstript   stript   pop(   R   R   t   linest   l(    (    s3   lib/python2.7/site-packages/nltk/tokenize/simple.pyR   q   s    (c         c  sT   |  j  d k r1 x> t | d ƒ D] } | Vq Wn x t | d ƒ D] } | VqA Wd  S(   Nu   keepu   \nu
   \n(\s+\n)*(   R   R   R   (   R   R   t   span(    (    s3   lib/python2.7/site-packages/nltk/tokenize/simple.pyR   |   s
    (   R   R   R   R   R   R   (    (    (    s3   lib/python2.7/site-packages/nltk/tokenize/simple.pyR   Q   s   		u   discardc         C  s   t  | ƒ j |  ƒ S(   N(   R   R   (   t   textR   (    (    s3   lib/python2.7/site-packages/nltk/tokenize/simple.pyt   line_tokenize‹   s    N(   R   t
   __future__R    t   nltk.tokenize.apiR   R   t   nltk.tokenize.utilR   R   R   R
   R   R   R%   (    (    (    s3   lib/python2.7/site-packages/nltk/tokenize/simple.pyt   <module>$   s   :