B
    P?ð[U  ã               @   sd   d Z ddlmZ ddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ G dd	„ d	e
ƒZdS )
a  
This is a NLTK port of the tokenizer used in the NIST BLEU evaluation script,
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
which was also ported into Python in
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
é    )Úunicode_literalsN)Ú	text_type)Úperluniprops)Ú
TokenizerI)Úxml_unescapec               @   s^  e Zd ZdZe d¡dfZe d¡dfZe d¡dfZe d¡d	fZ	e d
¡dfZ
e d¡d	fZee	e
egZed ee d¡ƒ¡ƒZed ee d¡ƒ¡ƒZed ee d¡ƒ¡ƒZe dde¡Ze dde¡Ze dde¡Ze d¡dfZe djeed¡d	fZe djeed¡dfZe djed¡dfZeeeegZdd„ Zd!dd„Z d"dd„Z!d S )#ÚNISTTokenizeruv  
    This NIST tokenizer is sentence-based instead of the original
    paragraph-based tokenization from mteval-14.pl; The sentence-based
    tokenization is consistent with the other tokenizers available in NLTK.

    >>> from six import text_type
    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()
    >>> s = "Good muffins cost $3.88 in New York."
    >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
    >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
    >>> nist.tokenize(s, lowercase=False) == expected_cased
    True
    >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
    True

    The international_tokenize() is the preferred function when tokenizing
    non-european text, e.g.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()

    # Input strings.
    >>> albb = u'Alibaba Group Holding Limited (Chinese: é˜¿é‡Œå·´å·´é›†å›¢æŽ§è‚¡ æœ‰é™å…¬å¸) us a Chinese e-commerce company...'
    >>> amz = u'Amazon.com, Inc. (/ËˆÃ¦mÉ™zÉ’n/) is an American electronic commerce...'
    >>> rkt = u'Rakuten, Inc. (æ¥½å¤©æ ªå¼ä¼šç¤¾ Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'

    # Expected tokens.
    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'é˜¿é‡Œå·´å·´é›†å›¢æŽ§è‚¡', u'æœ‰é™å…¬å¸', u')']
    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'ËˆÃ¦', u'm']
    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'æ¥½å¤©æ ªå¼ä¼šç¤¾', u'Rakuten', u'Kabushiki', u'-', u'gaisha']

    >>> nist.international_tokenize(albb)[:10] == expected_albb
    True
    >>> nist.international_tokenize(amz)[:10] == expected_amz
    True
    >>> nist.international_tokenize(rkt)[:10] == expected_rkt
    True

    # Doctest for patching issue #1926
    >>> sent = u'this is a fooâ˜„sentence.'
    >>> expected_sent = [u'this', u'is', u'a', u'foo', u'â˜„', u'sentence', u'.']
    >>> nist.international_tokenize(sent) == expected_sent
    True
    z	<skipped>Ú u   â€¨ú z([\{-\~\[-\` -\&\(-\+\:-\@\/])z \1 z([^0-9])([\.,])z\1 \2 z([\.,])([^0-9])z \1 \2z
([0-9])(-)ÚNumberZPunctuationZSymbolz[]^\\-]z\\\g<0>z([ -]+)z([{n}])([{p}]))ÚnÚpz([{p}])([{n}])z([{s}]))Úsc             C   s8   | j \}}| ||¡}t|ƒ}| j\}}| ||¡}|S )z9Performs the language independent string substituitions. )Ú
STRIP_SKIPÚsubr   ÚSTRIP_EOL_HYPHEN)ÚselfÚtextÚregexpÚsubstitution© r   ú1lib/python3.7/site-packages/nltk/tokenize/nist.pyÚlang_independent_subƒ   s    

z"NISTTokenizer.lang_independent_subFTc             C   sx   t |ƒ}|  |¡}|rNd| d }|r.| ¡ }x| jD ]\}}| ||¡}q6W d | ¡ ¡}t | ¡ ƒ}|rp|S | ¡ S )Nr	   )r   r   ÚlowerÚLANG_DEPENDENT_REGEXESr   ÚjoinÚsplitÚstrip)r   r   Ú	lowercaseZwestern_langÚ
return_strr   r   r   r   r   Útokenize   s    
zNISTTokenizer.tokenizec             C   sŠ   t |ƒ}| j\}}| ||¡}| j\}}| ||¡}t|ƒ}|rH| ¡ }x| jD ]\}}| ||¡}qPW d | ¡  	¡ ¡}|r‚|S | 	¡ S )Nr	   )
r   r   r   r   r   r   ÚINTERNATIONAL_REGEXESr   r   r   )r   r   r   Zsplit_non_asciir   r   r   r   r   r   Úinternational_tokenize¢   s    

z$NISTTokenizer.international_tokenizeN)FTF)FTF)"Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚreÚcompiler   r   ZPUNCTZPERIOD_COMMA_PRECEEDZPERIOD_COMMA_FOLLOWZDASH_PRECEED_DIGITr   r   r   Úsetr   ÚcharsZ
pup_numberZ	pup_punctZ
pup_symbolr   Znumber_regexZpunct_regexZsymbol_regexZNONASCIIÚformatZPUNCT_1ZPUNCT_2ZSYMBOLSr    r   r   r!   r   r   r   r   r      s6   -
r   )r%   Z
__future__r   Úior&   Zsixr   Znltk.corpusr   Znltk.tokenize.apir   Znltk.tokenize.utilr   r   r   r   r   r   Ú<module>   s   