ó
ù`]c           @   s@   d  Z  d d l m Z d d l m Z d e f d „  ƒ  YZ d S(   s(  
Multi-Word Expression Tokenizer

A ``MWETokenizer`` takes a string which has already been divided into tokens and
retokenizes it, merging multi-word expressions into single tokens, using a lexicon
of MWEs:


    >>> from nltk.tokenize import MWETokenizer

    >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
    >>> tokenizer.add_mwe(('in', 'spite', 'of'))

    >>> tokenizer.tokenize('Testing testing testing one two three'.split())
    ['Testing', 'testing', 'testing', 'one', 'two', 'three']

    >>> tokenizer.tokenize('This is a test in spite'.split())
    ['This', 'is', 'a', 'test', 'in', 'spite']

    >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
    ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']

iÿÿÿÿ(   t   Trie(   t
   TokenizerIt   MWETokenizerc           B   s/   e  Z d  Z d d d „ Z d „  Z d „  Z RS(   sh   A tokenizer that processes tokenized text and merges multi-word expressions
    into single tokens.
    t   _c         C   s+   | s g  } n  t  | ƒ |  _ | |  _ d S(   s¥  Initialize the multi-word tokenizer with a list of expressions and a
        separator

        :type mwes: list(list(str))
        :param mwes: A sequence of multi-word expressions to be merged, where
            each MWE is a sequence of strings.
        :type separator: str
        :param separator: String that should be inserted between words in a multi-word
            expression token. (Default is '_')

        N(   R    t   _mwest
   _separator(   t   selft   mwest	   separator(    (    s0   lib/python2.7/site-packages/nltk/tokenize/mwe.pyt   __init__)   s    	c         C   s   |  j  j | ƒ d S(   s—  Add a multi-word expression to the lexicon (stored as a word trie)

        We use ``util.Trie`` to represent the trie. Its form is a dict of dicts. 
        The key True marks the end of a valid MWE.

        :param mwe: The multi-word expression we're adding into the word trie
        :type mwe: tuple(str) or list(str)

        :Example:

        >>> tokenizer = MWETokenizer()
        >>> tokenizer.add_mwe(('a', 'b'))
        >>> tokenizer.add_mwe(('a', 'b', 'c'))
        >>> tokenizer.add_mwe(('a', 'x'))
        >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
        >>> tokenizer._mwes == expected
        True

        N(   R   t   insert(   R   t   mwe(    (    s0   lib/python2.7/site-packages/nltk/tokenize/mwe.pyt   add_mwe:   s    c         C   sý   d } t  | ƒ } g  } xÞ | | k  rø | | |  j k rÚ | } |  j } x© | | k  rƒ | | | k rƒ | | | } | d } qL Wt j | k r¼ | j |  j j | | | !ƒ ƒ | } qõ | j | | ƒ | d 7} q | j | | ƒ | d 7} q W| S(   s­  

        :param text: A list containing tokenized text
        :type text: list(str)
        :return: A list of the tokenized text with multi-words merged together
        :rtype: list(str)

        :Example:

        >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
        >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
        ['An', "hors+d'oeuvre", 'tonight,', 'sir?']
        
        i    i   (   t   lenR   R    t   LEAFt   appendR   t   join(   R   t   textt   it   nt   resultt   jt   trie(    (    s0   lib/python2.7/site-packages/nltk/tokenize/mwe.pyt   tokenizeP   s$    	 	N(   t   __name__t
   __module__t   __doc__t   NoneR	   R   R   (    (    (    s0   lib/python2.7/site-packages/nltk/tokenize/mwe.pyR   $   s   	N(   R   t	   nltk.utilR    t   nltk.tokenize.apiR   R   (    (    (    s0   lib/python2.7/site-packages/nltk/tokenize/mwe.pyt   <module>   s   