B
    P?[K                 @   s   d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlmZ d dlmZ d d	lmZ d
ZG dd deZG dd deZG dd deZG dd deZeddd ZdS )    )unicode_literalsN)skip)PIPE)	text_type)find_jar_iterconfig_javajava_java_optionsfind_jars_within_path)ParserI)DependencyGraph)Treez2https://nlp.stanford.edu/software/lex-parser.shtmlc               @   sr   e Zd ZdZdZdZdZdZdZdddZ	dd Z
dddZdddZdddZdddZd ddZd!ddZdS )"GenericStanfordParserz Interface to the Stanford Parserz+stanford-parser-(\d+)(\.(\d+))+-models\.jarzstanford-parser\.jarz3edu.stanford.nlp.parser.lexparser.LexicalizedParserFN4edu/stanford/nlp/models/lexparser/englishPCFG.ser.gzutf8-mx4g c          
   C   s   t t| j|ddt|dddd d}t t| j|ddt|ddd	d d}	tj|d
 }
t|	gt	|
 | _
|| _|| _|| _|| _d S )N)ZSTANFORD_PARSERSTANFORD_CORENLP T)Zenv_varsZ
searchpathZurlverboseZis_regexc             S   s   t j| S )N)ospathdirname)
model_pathr   r   2lib/python3.7/site-packages/nltk/parse/stanford.py<lambda>C   s    z0GenericStanfordParser.__init__.<locals>.<lambda>)key)ZSTANFORD_MODELSr   c             S   s   t j| S )N)r   r   r   )r   r   r   r   r   P   s    r   )maxr   _JAR_stanford_url_MODEL_JAR_PATTERNr   r   splittupler
   
_classpathr   	_encodingcorenlp_optionsjava_options)selfZpath_to_jarZpath_to_models_jarr   encodingr   r&   r%   Zstanford_jarZ	model_jarZstanford_dirr   r   r   __init__-   s4    zGenericStanfordParser.__init__c          	   C   s   g }g }g }d}x| dD ]~}|dkr|rD|t| g }d}q| jrj|| d| g }d}q|t| d|g g }q|| d}qW t|S )NFr   
T)
splitlinesappenditer_DOUBLE_SPACED_OUTPUT
_make_treejoin)r'   Zoutput_ZresZ	cur_linesZ	cur_treesZblankliner   r   r   _parse_trees_output^   s&    
z)GenericStanfordParser._parse_trees_outputc          
   C   sB   | j d| jddd| jdddg
}| | |dd	d
 |D |S )a  
        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
        list where each sentence is a list of words.
        Each sentence will be automatically tagged with this StanfordParser instance's
        tagger.
        If whitespaces exists inside a token, then the token will be treated as
        separate tokens.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :rtype: iter(iter(Tree))
        z-modelz
-sentencesnewlinez-outputFormatz
-tokenizedz-escaperz-edu.stanford.nlp.process.PTBEscapingProcessorr*   c             s   s   | ]}d  |V  qdS ) N)r0   ).0sentencer   r   r   	<genexpr>   s    z4GenericStanfordParser.parse_sents.<locals>.<genexpr>)_MAIN_CLASSr   _OUTPUT_FORMATr2   _executer0   )r'   	sentencesr   cmdr   r   r   parse_sentsu   s    z!GenericStanfordParser.parse_sentsc             C   s   t | |g|S )a&  
        Use StanfordParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged by
        the Stanford Parser.

        :param sentence: Input sentence to parse
        :type sentence: str
        :rtype: iter(Tree)
        )nextraw_parse_sents)r'   r6   r   r   r   r   	raw_parse   s    
zGenericStanfordParser.raw_parsec             C   s2   | j d| jddd| jg}| | |d||S )aI  
        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
        list of strings.
        Each sentence will be automatically tokenized and tagged by the Stanford Parser.

        :param sentences: Input sentences to parse
        :type sentences: list(str)
        :rtype: iter(iter(Tree))
        z-modelz
-sentencesr3   z-outputFormatr*   )r8   r   r9   r2   r:   r0   )r'   r;   r   r<   r   r   r   r?      s    z%GenericStanfordParser.raw_parse_sentsc             C   s   t | |g|S )a0  
        Use StanfordParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :rtype: iter(Tree)
        )r>   tagged_parse_sents)r'   r6   r   r   r   r   tagged_parse   s    
z"GenericStanfordParser.tagged_parsec                sR   d | j d| jddd| jdd dd	d
dg}| | |d fdd|D |S )ad  
        Use StanfordParser to parse multiple sentences. Takes multiple sentences
        where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
        :rtype: iter(iter(Tree))
        /z-modelz
-sentencesr3   z-outputFormatz
-tokenizedz-tagSeparatorz-tokenizerFactoryz,edu.stanford.nlp.process.WhitespaceTokenizerz-tokenizerMethodZnewCoreLabelTokenizerFactoryr*   c             3   s&   | ]}d   fdd|D V  qdS )r4   c             3   s   | ]}  |V  qd S )N)r0   )r5   Ztagged)tag_separatorr   r   r7      s    zEGenericStanfordParser.tagged_parse_sents.<locals>.<genexpr>.<genexpr>N)r0   )r5   r6   )rD   r   r   r7      s   z;GenericStanfordParser.tagged_parse_sents.<locals>.<genexpr>)r8   r   r9   r2   r:   r0   )r'   r;   r   r<   r   )rD   r   rA      s,    

z(GenericStanfordParser.tagged_parse_sentsc       	   	   C   s  | j }|d|g | jr&|| j dt}t| j|d tj	ddd}t
|trf|rf||}|| |  | jr|d t|| j|ttd\}}n"||j t|| jttd	\}}|d
d}|dd}||}W d Q R X t|j t|dd |S )Nz	-encodingr4   )Zoptionsr   wbF)modedeleter   )	classpathstdinstdoutstderr)rH   rJ   rK   s        s    )r$   extendr%   r,   r0   r	   r   r&   tempfileZNamedTemporaryFile
isinstancer   encodewriteflush
_USE_STDINseekr   r#   r   namereplacedecoder   unlink)	r'   r<   Zinput_r   r(   Zdefault_optionsZ
input_filerJ   rK   r   r   r   r:      s8    



zGenericStanfordParser._execute)NNr   r   Fr   r   )F)F)F)F)F)F)__name__
__module____qualname____doc__r    r   r8   rS   r.   r)   r2   r=   r@   r?   rB   rA   r:   r   r   r   r   r   #   s(         
)




'r   c                   s,   e Zd ZdZdZ fddZdd Z  ZS )StanfordParsera{  
    >>> parser=StanfordParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... )

    >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
    ...     "the quick brown fox jumps over the lazy dog",
    ...     "the quick grey wolf jumps over the lazy fox"
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
    [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP',
    [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
    Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
    Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
    [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
    Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []),
    Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
    ...     (
    ...         ("The", "DT"),
    ...         ("quick", "JJ"),
    ...         ("brown", "JJ"),
    ...         ("fox", "NN"),
    ...         ("jumped", "VBD"),
    ...         ("over", "IN"),
    ...         ("the", "DT"),
    ...         ("lazy", "JJ"),
    ...         ("dog", "NN"),
    ...         (".", "."),
    ...     ),
    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE
    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
    [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
    Zpennc                s&   t jdtdd tt| j|| d S )NzcThe StanfordParser will be deprecated
Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.   )
stacklevel)warningswarnDeprecationWarningsuperr]   r)   )r'   argskwargs)	__class__r   r   r)   N  s
    zStanfordParser.__init__c             C   s
   t |S )N)r   Z
fromstring)r'   resultr   r   r   r/   X  s    zStanfordParser._make_tree)rY   rZ   r[   r\   r9   r)   r/   __classcell__r   r   )rf   r   r]     s   1
r]   c                   s,   e Zd ZdZdZ fddZdd Z  ZS )StanfordDependencyParseraT
  
    >>> dep_parser=StanfordDependencyParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... )

    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]

    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
    ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
    ...     "The quick brown fox jumps over the lazy dog.",
    ...     "The quick grey wolf jumps over the lazy fox."
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
    Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]

    >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents((
    ...     (
    ...         ("The", "DT"),
    ...         ("quick", "JJ"),
    ...         ("brown", "JJ"),
    ...         ("fox", "NN"),
    ...         ("jumped", "VBD"),
    ...         ("over", "IN"),
    ...         ("the", "DT"),
    ...         ("lazy", "JJ"),
    ...         ("dog", "NN"),
    ...         (".", "."),
    ...     ),
    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE
    [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
    ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]

    Z	conll2007c                s&   t jdtdd tt| j|| d S )NzwThe StanfordDependencyParser will be deprecated
Please use [91mnltk.parse.corenlp.CoreNLPDependencyParser[0m instead.r^   )r_   )r`   ra   rb   rc   ri   r)   )r'   rd   re   )rf   r   r   r)     s
    z!StanfordDependencyParser.__init__c             C   s   t |ddS )Nroot)top_relation_label)r   )r'   rg   r   r   r   r/     s    z#StanfordDependencyParser._make_tree)rY   rZ   r[   r\   r9   r)   r/   rh   r   r   )rf   r   ri   \  s   0
ri   c                   sJ   e Zd ZdZdZdZdZdZdZdZ	 fddZ
dd
dZdd Z  ZS )StanfordNeuralDependencyParsera9  
    >>> from nltk.parse.stanford import StanfordNeuralDependencyParser
    >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')

    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]

    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det',
    (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'),
    u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')),
    ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det',
    (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'),
    u'punct', (u'.', u'.'))]]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
    ...     "The quick brown fox jumps over the lazy dog.",
    ...     "The quick grey wolf jumps over the lazy fox."
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over',
    'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']),
    Tree('fox', ['over', 'the', 'lazy']), '.'])]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
    ['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
    Zconllz)edu.stanford.nlp.pipeline.StanfordCoreNLPz%stanford-corenlp-(\d+)(\.(\d+))+\.jarz,stanford-corenlp-(\d+)(\.(\d+))+-models\.jarTc                s4   t jdtdd tt| j|| |  jd7  _d S )Nz}The StanfordNeuralDependencyParser will be deprecated
Please use [91mnltk.parse.corenlp.CoreNLPDependencyParser[0m instead.r^   )r_   z(-annotators tokenize,ssplit,pos,depparse)r`   ra   rb   rc   rl   r)   r%   )r'   rd   re   )rf   r   r   r)     s    z'StanfordNeuralDependencyParser.__init__Fc             C   s   t ddS )z
        Currently unimplemented because the neural dependency parser (and
        the StanfordCoreNLP pipeline class) doesn't support passing in pre-
        tagged tokens.
        zxtagged_parse[_sents] is not supported by StanfordNeuralDependencyParser; use parse[_sents] or raw_parse[_sents] instead.N)NotImplementedError)r'   r;   r   r   r   r   rA     s    z1StanfordNeuralDependencyParser.tagged_parse_sentsc             C   s   t |ddS )NZROOT)rk   )r   )r'   rg   r   r   r   r/     s    z)StanfordNeuralDependencyParser._make_tree)F)rY   rZ   r[   r\   r9   r8   r   r    rS   r.   r)   rA   r/   rh   r   r   )rf   r   rl     s   
rl   zEdoctests from nltk.parse.stanford are skipped because it's deprecatedc             C   sB   ddl m} ytdd t  W n tk
r<   |dY nX d S )Nr   )SkipTestz4edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz)r   zndoctests from nltk.parse.stanford are skipped because one of the stanford parser or CoreNLP jars doesn't exist)Znosern   r]   rl   LookupError)modulern   r   r   r   setup_module  s    
rq   )Z
__future__r   rN   r   r`   Zunittestr   
subprocessr   Zsixr   Znltk.internalsr   r   r   r	   r
   Znltk.parse.apir   Znltk.parse.dependencygraphr   Z	nltk.treer   r   r   r]   ri   rl   rq   r   r   r   r   <module>
   s"    wCBB