B
    P?ð[
=  ã               @   sV   d Z ddlZddlmZ ddlmZ G dd„ dƒZG dd„ deƒZG d	d
„ d
eƒZdS )a	  

Penn Treebank Tokenizer

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert McIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
é    N)Ú
TokenizerI)Úalign_tokensc               @   s4   e Zd ZdZdddddddd	gZd
dgZddgZdS )ÚMacIntyreContractionszI
    List of contractions adapted from Robert MacIntyre's tokenizer.
    z(?i)\b(can)(?#X)(not)\bz(?i)\b(d)(?#X)('ye)\bz(?i)\b(gim)(?#X)(me)\bz(?i)\b(gon)(?#X)(na)\bz(?i)\b(got)(?#X)(ta)\bz(?i)\b(lem)(?#X)(me)\bz(?i)\b(mor)(?#X)('n)\bz(?i)\b(wan)(?#X)(na)\sz(?i) ('t)(?#X)(is)\bz(?i) ('t)(?#X)(was)\bz(?i)\b(whad)(dd)(ya)\bz(?i)\b(wha)(t)(cha)\bN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚCONTRACTIONS2ÚCONTRACTIONS3ZCONTRACTIONS4© r   r   ú5lib/python3.7/site-packages/nltk/tokenize/treebank.pyr      s   r   c            	   @   sh  e Zd ZdZe d¡dfe d¡dfe d¡dfgZe d¡d	fe d
¡dfe d¡dfe d¡dfe d¡dfe d¡dfe d¡dfgZe d¡dfZe d¡dfe d¡dfe d¡dfe d¡dfe d¡dfe d¡d fgZ	e d!¡d"fZ
e d#¡d$fe d%¡d&fe d'¡d&fe d(¡d&fgZeƒ ZeeejejƒƒZeeejejƒƒZd/d*d+„Zd,d-„ Zd.S )0ÚTreebankWordTokenizeraÙ  
    The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
    This is the method that is invoked by ``word_tokenize()``.  It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> TreebankWordTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
        >>> s = "They'll save and invest more."
        >>> TreebankWordTokenizer().tokenize(s)
        ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
        >>> s = "hi, my name can't hello,"
        >>> TreebankWordTokenizer().tokenize(s)
        ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
    z^\"z``z(``)z \1 z([ \(\[{<])(\"|\'{2})z\1 `` z([:,])([^\d])z \1 \2z([:,])$z\.\.\.z ... z[;@#$%&]z \g<0> z([^\.])(\.)([\]\)}>"\']*)\s*$z\1 \2\3 z[?!]z([^'])' z\1 ' z[\]\[\(\)\{\}\<\>]z\(z-LRB-z\)z-RRB-z\[z-LSB-z\]z-RSB-z\{z-LCB-z\}z-RCB-z--z -- ú"z '' z
(\S)(\'\')z\1 \2 z([^' ])('[sS]|'[mM]|'[dD]|') z)([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) Fc             C   s  x| j D ]\}}| ||¡}qW x| jD ]\}}| ||¡}q(W | j\}}| ||¡}|rzx| jD ]\}}| ||¡}qbW | j\}}| ||¡}d| d }x| jD ]\}}| ||¡}q¤W x| jD ]}| d|¡}qÄW x| jD ]}| d|¡}qàW |rü|S | 	¡ S )Nú z \1 \2 )
ÚSTARTING_QUOTESÚsubÚPUNCTUATIONÚPARENS_BRACKETSÚCONVERT_PARENTHESESÚDOUBLE_DASHESÚENDING_QUOTESr	   r
   Úsplit)ÚselfÚtextÚconvert_parenthesesZ
return_strÚregexpÚsubstitutionr   r   r   Útokenizeu   s&    

zTreebankWordTokenizer.tokenizec             #   sf   |   |¡}d|ksd|krDdd„ t d|¡D ƒ‰ ‡ fdd„|D ƒ}n|}xt||ƒD ]
}|V  qTW dS )aÂ  
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

            Additional example
            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
            >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
            ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
            ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
            ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
            ... (82, 83), (83, 84)]
            >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
            ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
            ... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

        r   z''c             S   s   g | ]}|  ¡ ‘qS r   )Úgroup)Ú.0Úmr   r   r   ú
<listcomp>Å   s    z7TreebankWordTokenizer.span_tokenize.<locals>.<listcomp>z
``|'{2}|\"c                s"   g | ]}|d krˆ   d¡n|‘qS ))r   z``z''r   )Úpop)r   Útok)Úmatchedr   r   r!   É   s   N)r   ÚreÚfinditerr   )r   r   Z
raw_tokensÚtokensr#   r   )r$   r   Úspan_tokenizeš   s    #


z#TreebankWordTokenizer.span_tokenizeN)FF)r   r   r   r   r%   Úcompiler   r   r   r   r   r   r   Ú_contractionsÚlistÚmapr	   r
   r   r(   r   r   r   r   r   +   s:   
%r   c               @   s˜  e Zd ZdZeƒ Zdd„ ejD ƒZdd„ ejD ƒZe 	d¡dfe 	d¡dfe 	d¡dfe 	d	¡d
fgZ
e 	d¡dfZe 	d¡dfe 	d¡dfe 	d¡dfe 	d¡dfe 	d¡dfe 	d¡dfgZe 	d¡dfe 	d¡dfe 	d¡dfgZe 	d¡d fe 	d!¡d"fe 	d#¡d$fe 	d%¡dfe 	d&¡dfe 	d'¡d(fe 	d)¡d*fe 	d+¡d,fe 	d-¡d.fg	Ze 	d/¡d0fe 	d1¡d,fe 	d2¡d3fgZd:d5d6„Zd;d7d8„Zd9S )<ÚTreebankWordDetokenizeram  
    The Treebank detokenizer uses the reverse regex operations corresponding to
    the Treebank tokenizer's regexes.

    Note:
    - There're additional assumption mades when undoing the padding of [;@#$%&]
      punctuation symbols that isn't presupposed in the TreebankTokenizer.
    - There're additional regexes added in reversing the parentheses tokenization,
       - the r'([\]\)\}\>])\s([:;,.])' removes the additional right padding added
         to the closing parentheses precedding [:;,.].
    - It's not possible to return the original whitespaces as they were because
      there wasn't explicit records of where '
', '	' or '\s' were removed at
      the text.split() operation.

        >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> d = TreebankWordDetokenizer()
        >>> t = TreebankWordTokenizer()
        >>> toks = t.tokenize(s)
        >>> d.detokenize(toks)
        'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'

    The MXPOST parentheses substitution can be undone using the `convert_parentheses`
    parameter:

    >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
    >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
    ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
    ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
    >>> expected_tokens == t.tokenize(s, convert_parentheses=True)
    True
    >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
    >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
    True

    During tokenization it's safe to add more spaces but during detokenization,
    simply undoing the padding doesn't really help.

    - During tokenization, left and right pad is added to [!?], when
      detokenizing, only left shift the [!?] is needed.
      Thus (re.compile(r'\s([?!])'), r'\g<1>')

    - During tokenization [:,] are left and right padded but when detokenizing,
      only left shift is necessary and we keep right pad after comma/colon
      if the string after is a non-digit.
      Thus (re.compile(r'\s([:,])\s([^\d])'), r' ')

    >>> from nltk.tokenize.treebank import TreebankWordDetokenizer
    >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
    >>> twd = TreebankWordDetokenizer()
    >>> twd.detokenize(toks)
    "hello, i can't feel my feet! Help!!"

    >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
    ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
    >>> twd.detokenize(toks)
    "hello, i can't feel; my feet! Help!! He said: Help, help?!"
    c             C   s   g | ]}t  | d d¡¡‘qS )z(?#X)z\s)r%   r)   Úreplace)r   Úpatternr   r   r   r!     s   z"TreebankWordDetokenizer.<listcomp>c             C   s   g | ]}t  | d d¡¡‘qS )z(?#X)z\s)r%   r)   r.   )r   r/   r   r   r   r!     s   z+([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) z\1\2 z([^' ])\s('[sS]|'[mM]|'[dD]|') z
(\S)(\'\')z '' r   z -- z--z-LRB-ú(z-RRB-ú)z-LSB-ú[z-RSB-ú]z-LCB-Ú{z-RCB-Ú}z\s([\[\(\{\<])\sz \g<1>z\s([\]\)\}\>])\sz\g<1> z([\]\)\}\>])\s([:;,.])z\1\2z([^'])\s'\sz\1' z\s([?!])z\g<1>z([^\.])\s(\.)([\]\)}>"\']*)\s*$z\1\2\3z
\s([#$])\sz
\s([;%])\sz	\s([&])\sz \g<1> z
\s\.\.\.\sz...z\s([:,])\s$z\1z\s([:,])\s([^\d])z\1 \2z([ (\[{<])\s``z\1"z\s(``)\sz^``z\"Fc             C   s  d  |¡}x| jD ]}| d|¡}qW x| jD ]}| d|¡}q.W x| jD ]\}}| ||¡}qJW | ¡ }| j\}}| ||¡}|r¤x| jD ]\}}| ||¡}qŒW x| jD ]\}}| ||¡}q¬W x| j	D ]\}}| ||¡}qÌW x| j
D ]\}}| ||¡}qìW | ¡ S )z¬
        Python port of the Moses detokenizer.

        :param tokens: A list of strings, i.e. tokenized text.
        :type tokens: list(str)
        :return: str
        r   z\1\2)Újoinr
   r   r	   r   Ústripr   r   r   r   r   )r   r'   r   r   r   r   r   r   r   r   R  s(    

z TreebankWordDetokenizer.tokenizec             C   s   |   ||¡S )z' Duck-typing the abstract *tokenize()*.)r   )r   r'   r   r   r   r   Ú
detokenize  s    z"TreebankWordDetokenizer.detokenizeN)F)F)r   r   r   r   r   r*   r	   r
   r%   r)   r   r   r   r   r   r   r   r8   r   r   r   r   r-   Ó   sF   :


-r-   )	r   r%   Znltk.tokenize.apir   Znltk.tokenize.utilr   r   r   r-   r   r   r   r   Ú<module>   s    )