from __future__ import unicode_literals, print_function

import os
import re
import sys
import subprocess
import tempfile

from six import text_type

from nltk.data import ZipFilePathPointer
from nltk.internals import find_dir
from nltk.tokenize.api import TokenizerI


class ReppTokenizer(TokenizerI):
    """
    A class for word tokenization using the REPP parser described in
    Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a
    Long Solved Problem - A Survey, Contrastive Experiment, Recommendations,
    and Toolkit. In ACL. http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406

    >>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.' ,
    ... 'But rule-based tokenizers are hard to maintain and their rules language specific.' ,
    ... 'We evaluated our method on three languages and obtained error rates of 0.27% (English), 0.35% (Dutch) and 0.76% (Italian) for our best models.'
    ... ]
    >>> tokenizer = ReppTokenizer('/home/alvas/repp/') # doctest: +SKIP
    >>> for sent in sents:                             # doctest: +SKIP
    ...     tokenizer.tokenize(sent)                   # doctest: +SKIP
    ...
    (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
    (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
    (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')

    >>> for sent in tokenizer.tokenize_sents(sents): # doctest: +SKIP
    ...     print(sent)                             # doctest: +SKIP
    ...
    (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
    (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
    (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
    >>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True): # doctest: +SKIP
    ...     print(sent)                                                        # doctest: +SKIP
    ...
    [(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
    [(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
    [(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)]
    """

    def __init__(self, repp_dir, encoding='utf8'):
        self.repp_dir = self.find_repptokenizer(repp_dir)
        # Use the system temporary directory for intermediate input files.
        self.working_dir = tempfile.gettempdir()
        self.encoding = encoding

    def tokenize(self, sentence):
        """
        Use Repp to tokenize a single sentence.

        :param sentence: A single sentence string.
        :type sentence: str
        :return: A tuple of tokens.
        :rtype: tuple(str)
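
        An illustrative call (not part of the original docstring), mirroring
        the class-level example above; it needs a local REPP installation, so
        it is skipped:

        >>> tokenizer = ReppTokenizer('/home/alvas/repp/') # doctest: +SKIP
        >>> tokenizer.tokenize('But rule-based tokenizers are hard to maintain and their rules language specific.') # doctest: +SKIP
        (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')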
        """
        return next(self.tokenize_sents([sentence]))

    def tokenize_sents(self, sentences, keep_token_positions=False):
        """
        Tokenize multiple sentences using Repp.

        :param sentences: A list of sentence strings.
        :type sentences: list(str)
        :param keep_token_positions: If True, yield each token together with
            its start and end character offsets in the sentence.
        :type keep_token_positions: bool
        :return: An iterable of tuples of tokens, one per input sentence.
        :rtype: iter(tuple(str))
        """
        # Write the input sentences, one per line, to a temporary file.
        with tempfile.NamedTemporaryFile(prefix='repp_input.',
                                         dir=self.working_dir, mode='w',
                                         delete=False) as input_file:
            for sent in sentences:
                input_file.write(text_type(sent) + '\n')
            input_file.close()
            # Build the REPP command and run it on the temporary file.
            cmd = self.generate_repp_command(input_file.name)
            # Decode the raw stdout and strip the trailing newline.
            repp_output = self._execute(cmd).decode(self.encoding).strip()
            for tokenized_sent in self.parse_repp_outputs(repp_output):
                if not keep_token_positions:
                    # Keep only the token strings, dropping (start, end) offsets.
                    tokenized_sent, starts, ends = zip(*tokenized_sent)
                yield tokenized_sent
    def generate_repp_command(self, inputfilename):
        """
        Generate the REPP command to be run at the terminal.

        :param inputfilename: path to the input file
        :type inputfilename: str
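        :return: the REPP command as a list of strings, suitable for
            ``subprocess.Popen``.

        For illustration (not part of the original docstring), assuming the
        tokenizer was built with ``'/home/alvas/repp/'`` and a hypothetical
        input file ``'input.txt'``, the generated command is::

            ['/home/alvas/repp/src/repp',
             '-c', '/home/alvas/repp/erg/repp.set',
             '--format', 'triple',
             'input.txt']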
        """
        cmd = [self.repp_dir + '/src/repp']
        cmd += ['-c', self.repp_dir + '/erg/repp.set']
        cmd += ['--format', 'triple']
        cmd += [inputfilename]
        return cmd

    @staticmethod
    def _execute(cmd):
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        return stdout

    @staticmethod
    def parse_repp_outputs(repp_output):
        """
        Parse the tri-tuple format that REPP outputs with the
        "--format triple" option and return a generator of tuples of string
        tokens.

        :param repp_output: the decoded output of the REPP command.
        :type repp_output: str
        :return: an iterable of the tokenized sentences as tuples of strings
        :rtype: iter(tuple)
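
        For illustration (not part of the original docstring): in the triple
        format each sentence becomes a block of ``(start, end, token)`` lines,
        and blocks are separated by blank lines, e.g.::

            (0, 12, Tokenization)
            (13, 15, is)
            ...
            (110, 111, .)

        from which this method yields one list of ``(token, start, end)``
        tuples per sentence.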
        """
        line_regex = re.compile(r'^\((\d+), (\d+), (.+)\)$', re.MULTILINE)
        # Sentences are separated by blank lines in the triple output.
        for section in repp_output.split('\n\n'):
            words_with_positions = [(token, int(start), int(end))
                                    for start, end, token
                                    in line_regex.findall(section)]
            words = tuple(t[0] for t in words_with_positions)  # token strings only
            yield words_with_positions

    def find_repptokenizer(self, repp_dirname):
        """
        Find the REPP tokenizer binary and its *repp.set* config file.
        """
        if os.path.exists(repp_dirname):
            # A full path to the REPP directory was given.
            _repp_dir = repp_dirname
        else:
            # Otherwise look it up through the REPP_TOKENIZER environment variable.
            _repp_dir = find_dir(repp_dirname, env_vars=('REPP_TOKENIZER',))
        # Check that the REPP binary and the ERG rule set are where we expect them.
        assert os.path.exists(_repp_dir + '/src/repp')
        assert os.path.exists(_repp_dir + '/erg/repp.set')
        return _repp_dir
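

if __name__ == '__main__':
    # Usage sketch, not part of the original module: it assumes REPP is
    # installed at the placeholder path below (or discoverable through the
    # REPP_TOKENIZER environment variable), as in the doctest examples above.
    demo_sentences = [
        'Tokenization is widely regarded as a solved problem due to the high '
        'accuracy that rulebased tokenizers achieve.',
        'But rule-based tokenizers are hard to maintain and their rules '
        'language specific.',
    ]
    tokenizer = ReppTokenizer('/home/alvas/repp/')  # placeholder path
    for tokens in tokenizer.tokenize_sents(demo_sentences):
        print(tokens)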