B
    P?[(                 @   s   d dl mZmZ d dlZd dlZd dlZd dlmZ d dlZd dl	m
Z
 d dlmZmZmZmZ d dlmZ d dlmZ dZG d	d
 d
eZdd ZdS )    )unicode_literalsprint_functionN)PIPE)	text_type)find_jarconfig_javajava_java_options)
TokenizerI)CoreNLPParserz1https://nlp.stanford.edu/software/tokenizer.shtmlc               @   s<   e Zd ZdZdZdddZed	d
 Zdd ZdddZ	dS )StanfordTokenizera$  
    Interface to the Stanford Tokenizer

    >>> from nltk.tokenize.stanford import StanfordTokenizer
    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
    >>> StanfordTokenizer().tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> s = "The colour of the wall is blue."
    >>> StanfordTokenizer(options={"americanize": True}).tokenize(s)
    ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
    zstanford-postagger.jarNutf8F-mx1000mc             C   sf   t jtdtdd t| j|ddt|d| _|| _|| _	|d krDi n|}d
dd	 | D | _d S )
Nzz
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.'   )
stacklevel)ZSTANFORD_POSTAGGER )Zenv_varsZ
searchpathZurlverbose,c             s   s   | ]\}}d  ||V  qdS )z{0}={1}N)format).0keyvalr   r   5lib/python3.7/site-packages/nltk/tokenize/stanford.py	<genexpr>K   s    z-StanfordTokenizer.__init__.<locals>.<genexpr>)warningswarnstrDeprecationWarningr   _JAR_stanford_url_stanford_jar	_encodingjava_optionsjoinitems_options_cmd)selfZpath_to_jarencodingoptionsr   r"   r   r   r   __init__*   s"    	
zStanfordTokenizer.__init__c             C   s   |   S )N)
splitlines)sr   r   r   _parse_tokenized_outputN   s    z)StanfordTokenizer._parse_tokenized_outputc             C   s   dg}|  | ||S )zW
        Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences.
        z%edu.stanford.nlp.process.PTBTokenizer)r,   _execute)r&   r+   cmdr   r   r   tokenizeR   s    zStanfordTokenizer.tokenizec       
   	   C   s   | j }|d|g | j}|r.|d| jg dt}t| j|d tjddd\}t	|t
rn|rn||}|| |  ||j t|| jttd\}}	||}W d Q R X t|j t|dd |S )	Nz-charsetz-options )r(   r   wbF)modedelete)Z	classpathstdoutstderr)r!   extendr%   r#   r	   r   r"   tempfileZNamedTemporaryFile
isinstancer   encodewriteflushappendnamer   r    r   decodeosunlink)
r&   r.   Zinput_r   r'   r%   Zdefault_optionsZ
input_filer4   r5   r   r   r   r-   Y   s&    


zStanfordTokenizer._execute)Nr   NFr   )F)
__name__
__module____qualname____doc__r   r)   staticmethodr,   r/   r-   r   r   r   r   r      s       
r   c             C   s8   ddl m} y
t  W n tk
r2   |dY nX d S )Nr   )SkipTestzadoctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn't exist)ZnoserF   r   LookupError)modulerF   r   r   r   setup_module}   s    
rI   )Z
__future__r   r   r7   r?   Zjson
subprocessr   r   Zsixr   Znltk.internalsr   r   r   r	   Znltk.tokenize.apir
   Znltk.parse.corenlpr   r   r   rI   r   r   r   r   <module>
   s   b