ó
ù`]c           @   su   d  Z  d d l Z d d l m Z d d l m Z d d d „  ƒ  YZ d e f d „  ƒ  YZ d	 e f d
 „  ƒ  YZ d S(   s	  

Penn Treebank Tokenizer

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert McIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
iÿÿÿÿN(   t
   TokenizerI(   t   align_tokenst   MacIntyreContractionsc           B   sD   e  Z d  Z d d d d d d d d g Z d	 d
 g Z d d g Z RS(   sI   
    List of contractions adapted from Robert MacIntyre's tokenizer.
    s   (?i)\b(can)(?#X)(not)\bs   (?i)\b(d)(?#X)('ye)\bs   (?i)\b(gim)(?#X)(me)\bs   (?i)\b(gon)(?#X)(na)\bs   (?i)\b(got)(?#X)(ta)\bs   (?i)\b(lem)(?#X)(me)\bs   (?i)\b(mor)(?#X)('n)\bs   (?i)\b(wan)(?#X)(na)\ss   (?i) ('t)(?#X)(is)\bs   (?i) ('t)(?#X)(was)\bs   (?i)\b(whad)(dd)(ya)\bs   (?i)\b(wha)(t)(cha)\b(   t   __name__t
   __module__t   __doc__t   CONTRACTIONS2t   CONTRACTIONS3t   CONTRACTIONS4(    (    (    s5   lib/python2.7/site-packages/nltk/tokenize/treebank.pyR      s   	t   TreebankWordTokenizerc           B   s  e  Z d  Z e j d ƒ d f e j d ƒ d f e j d ƒ d f g Z e j d ƒ d f e j d	 ƒ d f e j d
 ƒ d f e j d ƒ d f e j d ƒ d f e j d ƒ d f e j d ƒ d f g Z e j d ƒ d f Z e j d ƒ d f e j d ƒ d f e j d ƒ d f e j d ƒ d f e j d ƒ d f e j d ƒ d f g Z e j d  ƒ d! f Z	 e j d" ƒ d# f e j d$ ƒ d% f e j d& ƒ d% f e j d' ƒ d% f g Z
 e ƒ  Z e e e j e j ƒ ƒ Z e e e j e j ƒ ƒ Z e e d( „ Z d) „  Z RS(*   sÙ  
    The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
    This is the method that is invoked by ``word_tokenize()``.  It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> TreebankWordTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
        >>> s = "They'll save and invest more."
        >>> TreebankWordTokenizer().tokenize(s)
        ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
        >>> s = "hi, my name can't hello,"
        >>> TreebankWordTokenizer().tokenize(s)
        ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
    s   ^\"s   ``s   (``)s    \1 s   ([ \(\[{<])(\"|\'{2})s   \1 `` s   ([:,])([^\d])s    \1 \2s   ([:,])$s   \.\.\.s    ... s   [;@#$%&]s    \g<0> s   ([^\.])(\.)([\]\)}>"\']*)\s*$s   \1 \2\3 s   [?!]s   ([^'])' s   \1 ' s   [\]\[\(\)\{\}\<\>]s   \(s   -LRB-s   \)s   -RRB-s   \[s   -LSB-s   \]s   -RSB-s   \{s   -LCB-s   \}s   -RCB-s   --s    -- t   "s    '' s
   (\S)(\'\')s   \1 \2 s   ([^' ])('[sS]|'[mM]|'[dD]|') s)   ([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) c         C   si  x) |  j  D] \ } } | j | | ƒ } q
 Wx) |  j D] \ } } | j | | ƒ } q6 W|  j \ } } | j | | ƒ } | r® x, |  j D] \ } } | j | | ƒ } q‰ Wn  |  j \ } } | j | | ƒ } d | d } x) |  j D] \ } } | j | | ƒ } qç Wx# |  j D] } | j d | ƒ } qWx# |  j D] } | j d | ƒ } q9W| r_| S| j	 ƒ  S(   Nt    s    \1 \2 (
   t   STARTING_QUOTESt   subt   PUNCTUATIONt   PARENS_BRACKETSt   CONVERT_PARENTHESESt   DOUBLE_DASHESt   ENDING_QUOTESR   R   t   split(   t   selft   textt   convert_parenthesest
   return_strt   regexpt   substitution(    (    s5   lib/python2.7/site-packages/nltk/tokenize/treebank.pyt   tokenizeu   s&    c         c   s²   |  j  | ƒ } d | k s' d | k r‰ g  t j d | ƒ D] } | j ƒ  ^ q: } g  | D]' } | d k rz | j d ƒ n | ^ qY } n | } x t | | ƒ D] } | VqŸ Wd S(   sÂ  
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

            Additional example
            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
            >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
            ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
            ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
            ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
            ... (82, 83), (83, 84)]
            >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
            ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
            ... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

        R
   s   ''s
   ``|'{2}|\"s   ``i    N(   R
   s   ``s   ''(   R   t   ret   finditert   groupt   popR   (   R   R   t
   raw_tokenst   mt   matchedt   tokt   tokens(    (    s5   lib/python2.7/site-packages/nltk/tokenize/treebank.pyt   span_tokenizeš   s    #+4(   R   R   R   R   t   compileR   R   R   R   R   R   R   t   _contractionst   listt   mapR   R   t   FalseR   R$   (    (    (    s5   lib/python2.7/site-packages/nltk/tokenize/treebank.pyR	   +   s:   	%t   TreebankWordDetokenizerc        
   B   s†  e  Z d  Z e ƒ  Z g  e j D]! Z e j e j	 d d ƒ ƒ ^ q Z g  e j
 D]! Z e j e j	 d d ƒ ƒ ^ qP Z
 e j d ƒ d f e j d ƒ d f e j d ƒ d f e j d ƒ d f g Z e j d	 ƒ d
 f Z e j d ƒ d f e j d ƒ d f e j d ƒ d f e j d ƒ d f e j d ƒ d f e j d ƒ d f g Z e j d ƒ d f e j d ƒ d f e j d ƒ d f g Z e j d ƒ d f e j d ƒ d  f e j d! ƒ d" f e j d# ƒ d f e j d$ ƒ d f e j d% ƒ d& f e j d' ƒ d( f e j d) ƒ d* f e j d+ ƒ d, f g	 Z e j d- ƒ d. f e j d/ ƒ d* f e j d0 ƒ d1 f g Z e d2 „ Z e d3 „ Z RS(4   sm  
    The Treebank detokenizer uses the reverse regex operations corresponding to
    the Treebank tokenizer's regexes.

    Note:
    - There're additional assumption mades when undoing the padding of [;@#$%&]
      punctuation symbols that isn't presupposed in the TreebankTokenizer.
    - There're additional regexes added in reversing the parentheses tokenization,
       - the r'([\]\)\}\>])\s([:;,.])' removes the additional right padding added
         to the closing parentheses precedding [:;,.].
    - It's not possible to return the original whitespaces as they were because
      there wasn't explicit records of where '
', '	' or '\s' were removed at
      the text.split() operation.

        >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> d = TreebankWordDetokenizer()
        >>> t = TreebankWordTokenizer()
        >>> toks = t.tokenize(s)
        >>> d.detokenize(toks)
        'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'

    The MXPOST parentheses substitution can be undone using the `convert_parentheses`
    parameter:

    >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
    >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
    ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
    ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
    >>> expected_tokens == t.tokenize(s, convert_parentheses=True)
    True
    >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
    >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
    True

    During tokenization it's safe to add more spaces but during detokenization,
    simply undoing the padding doesn't really help.

    - During tokenization, left and right pad is added to [!?], when
      detokenizing, only left shift the [!?] is needed.
      Thus (re.compile(r'\s([?!])'), r'\g<1>')

    - During tokenization [:,] are left and right padded but when detokenizing,
      only left shift is necessary and we keep right pad after comma/colon
      if the string after is a non-digit.
      Thus (re.compile(r'\s([:,])\s([^\d])'), r' ')

    >>> from nltk.tokenize.treebank import TreebankWordDetokenizer
    >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
    >>> twd = TreebankWordDetokenizer()
    >>> twd.detokenize(toks)
    "hello, i can't feel my feet! Help!!"

    >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
    ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
    >>> twd.detokenize(toks)
    "hello, i can't feel; my feet! Help!! He said: Help, help?!"
    s   (?#X)s   \ss+   ([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) s   \1\2 s   ([^' ])\s('[sS]|'[mM]|'[dD]|') s
   (\S)(\'\')s    '' R
   s    -- s   --s   -LRB-t   (s   -RRB-t   )s   -LSB-t   [s   -RSB-t   ]s   -LCB-t   {s   -RCB-t   }s   \s([\[\(\{\<])\ss    \g<1>s   \s([\]\)\}\>])\ss   \g<1> s   ([\]\)\}\>])\s([:;,.])s   \1\2s   ([^'])\s'\ss   \1' s   \s([?!])s   \g<1>s   ([^\.])\s(\.)([\]\)}>"\']*)\s*$s   \1\2\3s
   \s([#$])\ss
   \s([;%])\ss	   \s([&])\ss    \g<1> s
   \s\.\.\.\ss   ...s   \s([:,])\s$s   \1s   \s([:,])\s([^\d])s   \1 \2s   ([ (\[{<])\s``s   \1"s   \s(``)\ss   ^``s   \"c         C   sw  d j  | ƒ } x# |  j D] } | j d | ƒ } q Wx# |  j D] } | j d | ƒ } q? Wx) |  j D] \ } } | j | | ƒ } qe W| j ƒ  } |  j \ } } | j | | ƒ } | ré x, |  j D] \ } } | j | | ƒ } qÄ Wn  x) |  j D] \ } } | j | | ƒ } qó Wx) |  j	 D] \ } } | j | | ƒ } qWx) |  j
 D] \ } } | j | | ƒ } qKW| j ƒ  S(   sí   
        Treebank detokenizer, created by undoing the regexes from 
        the TreebankWordTokenizer.tokenize.

        :param tokens: A list of strings, i.e. tokenized text.
        :type tokens: list(str)
        :return: str
        R   s   \1\2(   t   joinR   R   R   R   t   stripR   R   R   R   R   (   R   R#   R   R   R   R   (    (    s5   lib/python2.7/site-packages/nltk/tokenize/treebank.pyR   R  s(    	c         C   s   |  j  | | ƒ S(   s'    Duck-typing the abstract *tokenize()*.(   R   (   R   R#   R   (    (    s5   lib/python2.7/site-packages/nltk/tokenize/treebank.pyt
   detokenize€  s    (   R   R   R   R   R&   R   t   patternR   R%   t   replaceR   R   R   R   R   R   R   R)   R   R3   (    (    (    s5   lib/python2.7/site-packages/nltk/tokenize/treebank.pyR*   Ó   sF   :	...(    (	   R   R   t   nltk.tokenize.apiR    t   nltk.tokenize.utilR   R   R	   R*   (    (    (    s5   lib/python2.7/site-packages/nltk/tokenize/treebank.pyt   <module>   s   ¨