B
    P?ð[5  ã               @   s~   d Z ddlmZ ddlmZmZ ddlmZmZ G dd„ deƒZ	G dd„ deƒZ
G d	d
„ d
eƒZG dd„ deƒZddd„ZdS )aq  
Simple Tokenizers

These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.

The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:

    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    >>> s.split()
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    >>> s.split(' ')
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>> s.split('\n')
    ['Good muffins cost $3.88', 'in New York.  Please buy me',
    'two of them.', '', 'Thanks.']

The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer.  For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.

é    )Úunicode_literals)Ú
TokenizerIÚStringTokenizer)Ústring_span_tokenizeÚregexp_span_tokenizec               @   s   e Zd ZdZdZdS )ÚSpaceTokenizera­  Tokenize a string using the space character as a delimiter,
    which is the same as ``s.split(' ')``.

        >>> from nltk.tokenize import SpaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> SpaceTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    ú N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ú_string© r   r   ú3lib/python3.7/site-packages/nltk/tokenize/simple.pyr   *   s   	r   c               @   s   e Zd ZdZdZdS )ÚTabTokenizerzäTokenize a string use the tab character as a delimiter,
    the same as ``s.split('\t')``.

        >>> from nltk.tokenize import TabTokenizer
        >>> TabTokenizer().tokenize('a\tb c\n\t d')
        ['a', 'b c\n', ' d']
    ú	N)r	   r
   r   r   r   r   r   r   r   r   8   s   r   c               @   s    e Zd ZdZdd„ Zdd„ ZdS )ÚCharTokenizerz„Tokenize a string into individual characters.  If this functionality
    is ever required directly, use ``for char in string``.
    c             C   s   t |ƒS )N)Úlist)ÚselfÚsr   r   r   ÚtokenizeI   s    zCharTokenizer.tokenizec             c   s2   x,t tdt|ƒd ƒƒD ]\}}||fV  qW d S )Né   )Ú	enumerateÚrangeÚlen)r   r   ÚiÚjr   r   r   Úspan_tokenizeL   s     zCharTokenizer.span_tokenizeN)r	   r
   r   r   r   r   r   r   r   r   r   D   s   r   c               @   s*   e Zd ZdZd
dd„Zdd„ Zdd„ Zd	S )ÚLineTokenizeraV  Tokenize a string into its lines, optionally discarding blank lines.
    This is similar to ``s.split('\n')``.

        >>> from nltk.tokenize import LineTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> LineTokenizer(blanklines='keep').tokenize(s)
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', '', 'Thanks.']
        >>> # same as [l for l in s.split('\n') if l.strip()]:
        >>> LineTokenizer(blanklines='discard').tokenize(s)
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', 'Thanks.']

    :param blanklines: Indicates how blank lines should be handled.  Valid values are:

        - ``discard``: strip blank lines out of the token list before returning it.
           A line is considered blank if it contains only whitespace characters.
        - ``keep``: leave all blank lines in the token list.
        - ``discard-eof``: if the string ends with a newline, then do not generate
           a corresponding token ``''`` after that newline.
    Údiscardc             C   s(   d}||krt dd |¡ ƒ‚|| _d S )N)r   Úkeepzdiscard-eofzBlank lines must be one of: %sr   )Ú
ValueErrorÚjoinÚ_blanklines)r   Ú
blanklinesZvalid_blanklinesr   r   r   Ú__init__h   s
    zLineTokenizer.__init__c             C   sH   |  ¡ }| jdkr"dd„ |D ƒ}n"| jdkrD|rD|d  ¡ sD| ¡  |S )Nr   c             S   s   g | ]}|  ¡ r|‘qS r   )Úrstrip)Ú.0Úlr   r   r   ú
<listcomp>u   s    z*LineTokenizer.tokenize.<locals>.<listcomp>zdiscard-eoféÿÿÿÿ)Ú
splitlinesr#   ÚstripÚpop)r   r   Úlinesr   r   r   r   q   s    

zLineTokenizer.tokenizec             c   sD   | j dkr&x4t|dƒD ]
}|V  qW nxt|dƒD ]
}|V  q2W d S )Nr    z\nz
\n(\s+\n)*)r#   r   r   )r   r   Úspanr   r   r   r   |   s
    
zLineTokenizer.span_tokenizeN)r   )r	   r
   r   r   r%   r   r   r   r   r   r   r   Q   s   
	r   r   c             C   s   t |ƒ | ¡S )N)r   r   )Útextr$   r   r   r   Úline_tokenize‹   s    r1   N)r   )r   Z
__future__r   Znltk.tokenize.apir   r   Znltk.tokenize.utilr   r   r   r   r   r   r1   r   r   r   r   Ú<module>$   s   :