# -*- coding: utf-8 -*-
"""
This is an NLTK port of the tokenizer used in the NIST BLEU evaluation script,
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
which was also ported into Python in
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
"""

from __future__ import unicode_literals

import io
import re

from six import text_type

from nltk.corpus import perluniprops
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import xml_unescape


class NISTTokenizer(TokenizerI):
    """
    This NIST tokenizer is sentence-based instead of the original
    paragraph-based tokenization from mteval-v14.pl; the sentence-based
    tokenization is consistent with the other tokenizers available in NLTK.

    >>> from six import text_type
    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()
    >>> s = "Good muffins cost $3.88 in New York."
    >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
    >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
    >>> nist.tokenize(s, lowercase=False) == expected_cased
    True
    >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
    True
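
    Setting return_str=True joins the tokens back into a single string;
    the expected output below is a sketch derived from the tokenization
    rules above rather than a canonical reference:

    >>> nist.tokenize(s, return_str=True) == u'Good muffins cost $ 3.88 in New York .'
    True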

    international_tokenize() is the preferred function when tokenizing
    non-European text, e.g.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()

    # Input strings.
    >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
    >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
    >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'

    # Expected tokens.
    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'阿里巴巴集团控股', u'有限公司', u')']
    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'ˈæ', u'm']
    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'楽天株式会社', u'Rakuten', u'Kabushiki', u'-', u'gaisha']

    >>> nist.international_tokenize(albb)[:10] == expected_albb
    True
    >>> nist.international_tokenize(amz)[:10] == expected_amz
    True
    >>> nist.international_tokenize(rkt)[:10] == expected_rkt
    True

    # Doctest for patching issue #1926
    >>> sent = u'this is a foo☄sentence.'
    >>> expected_sent = [u'this', u'is', u'a', u'foo', u'☄', u'sentence', u'.']
    >>> nist.international_tokenize(sent) == expected_sent
    True
    """
    # Strip "skipped" tags.
    STRIP_SKIP = re.compile('<skipped>'), ''
    # Strip end-of-line hyphenation and join lines.
    STRIP_EOL_HYPHEN = re.compile(u'\u2028'), ' '

    # Tokenize punctuation.
    PUNCT = re.compile('([\{-\~\[-\` -\&\(-\+\:-\@\/])'), ' \\1 '
    # Tokenize period and comma unless preceded by a digit.
    PERIOD_COMMA_PRECEED = re.compile('([^0-9])([\.,])'), '\\1 \\2 '
    # Tokenize period and comma unless followed by a digit.
    PERIOD_COMMA_FOLLOW = re.compile('([\.,])([^0-9])'), ' \\1 \\2'
    # Tokenize dash when preceded by a digit.
    DASH_PRECEED_DIGIT = re.compile('([0-9])(-)'), '\\1 \\2 '

    LANG_DEPENDENT_REGEXES = [PUNCT, PERIOD_COMMA_PRECEED,
                              PERIOD_COMMA_FOLLOW, DASH_PRECEED_DIGIT]

    # Perluniprops characters used in the NIST tokenizer.
    pup_number = text_type(''.join(set(perluniprops.chars('Number'))))  # i.e. \p{N}
    pup_punct = text_type(''.join(set(perluniprops.chars('Punctuation'))))  # i.e. \p{P}
    pup_symbol = text_type(''.join(set(perluniprops.chars('Symbol'))))  # i.e. \p{S}

    # Python regexes need to escape ']', '^', '\' and '-' before the
    # perluniprops characters can be interpolated into the character
    # classes of the patterns below.
    number_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_number)
    punct_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_punct)
    symbol_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_symbol)

    # Pads non-ASCII strings with space.
    NONASCII = re.compile('([\x00-\x7f]+)'), ' \\1 '
    # Tokenize any punctuation unless followed AND preceded by a digit.
    PUNCT_1 = re.compile(u'([{n}])([{p}])'.format(n=number_regex, p=punct_regex)), '\\1 \\2 '
    PUNCT_2 = re.compile(u'([{p}])([{n}])'.format(p=punct_regex, n=number_regex)), ' \\1 \\2'
    # Tokenize symbols.
    SYMBOLS = re.compile(u'([{s}])'.format(s=symbol_regex)), ' \\1 '

    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]

    def lang_independent_sub(self, text):
        """Performs the language independent string substitutions."""
        # Note the unusual order: unescaping happens between the two strip
        # regexes, to stay close to the original NIST implementation.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        return text

    def tokenize(self, text, lowercase=False,
                 western_lang=True, return_str=False):
        text = text_type(text)
        # Language independent regexes.
        text = self.lang_independent_sub(text)
        # Language dependent regexes.
        if western_lang:
            # Pad string with whitespace.
            text = ' ' + text + ' '
            if lowercase:
                text = text.lower()
            for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
                text = regexp.sub(substitution, text)
        # Remove contiguous whitespaces.
        text = ' '.join(text.split())
        # Finally, strip heading/trailing spaces and convert the output
        # string into unicode.
        text = text_type(text.strip())
        return text if return_str else text.split()

    def international_tokenize(self, text, lowercase=False,
                               split_non_ascii=True, return_str=False):
        text = text_type(text)
        # Unlike tokenize(), STRIP_EOL_HYPHEN is applied first,
        # before unescaping.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)

        if lowercase:
            text = text.lower()

        for regexp, substitution in self.INTERNATIONAL_REGEXES:
            text = regexp.sub(substitution, text)

        # Make sure that there's only one space between words and strip
        # leading/trailing spaces.
        text = ' '.join(text.strip().split())
        return text if return_str else text.split()
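

# Minimal usage sketch: exercises both entry points on the docstring
# examples. Assumes the `perluniprops` corpus is installed, e.g. via
# nltk.download('perluniprops'); without it the class attributes above
# cannot be built.
if __name__ == '__main__':
    nist = NISTTokenizer()
    # Western-language tokenization, mteval-v14.pl style.
    print(nist.tokenize(u'Good muffins cost $3.88 in New York.', lowercase=True))
    # Unicode-aware tokenization for non-European text.
    print(nist.international_tokenize(u'this is a foo\u2604sentence.'))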