B
    >?ð[xQ  ã            	   @   s  d dl mZmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ dd	„ ZG d
d„ deƒZdd„ Zd0dd„Ze d¡Zd1dd„Zdd„ Zd2dd„Zdd„ Ze dej¡Ze d¡Zd d!„ Zd"d#d$d%d&d'd(d)d*g	dfd+d,„Zd-d.„ Ze d/kreƒ  dS )3é    )Úprint_functionÚunicode_literalsÚdivisionN)ÚTree)Úmap_tag)Ú	str2tuple)Úpython_2_unicode_compatible)Úaccuracyc             C   sF   g }g }x2|D ]*}|   | ¡ ¡}|t|ƒ7 }|t|ƒ7 }qW t||ƒS )a|  
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    )ÚparseZflattenÚtree2conlltagsÚ	_accuracy)ZchunkerZgoldZ	gold_tagsZ	test_tagsZ	gold_treeZ	test_tree© r   ú.lib/python3.7/site-packages/nltk/chunk/util.pyr	      s    
r	   c               @   s‚   e Zd ZdZdd„ Zdd„ Zdd„ Zdd	„ Zd
d„ Zdd„ Z	d dd„Z
dd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ ZdS )!Ú
ChunkScorea;  
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, misssed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()           # doctest: +SKIP
        >>> for correct in correct_sentences:   # doctest: +SKIP
        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
        F Measure: 0.823

    :ivar kwargs: Keyword arguments:

        - max_tp_examples: The maximum number actual examples of true
          positives to record.  This affects the ``correct`` member
          function: ``correct`` will not return more than this number
          of true positive examples.  This does *not* affect any of
          the numerical metrics (precision, recall, or f-measure)

        - max_fp_examples: The maximum number actual examples of false
          positives to record.  This affects the ``incorrect`` member
          function and the ``guessed`` member function: ``incorrect``
          will not return more than this number of examples, and
          ``guessed`` will not return more than this number of true
          positive examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - max_fn_examples: The maximum number actual examples of false
          negatives to record.  This affects the ``missed`` member
          function and the ``correct`` member function: ``missed``
          will not return more than this number of examples, and
          ``correct`` will not return more than this number of true
          negative examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - chunk_label: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).

    :type _tp: list(Token)
    :ivar _tp: List of true positives
    :type _fp: list(Token)
    :ivar _fp: List of false positives
    :type _fn: list(Token)
    :ivar _fn: List of false negatives

    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    c             K   sŽ   t ƒ | _t ƒ | _t ƒ | _t ƒ | _t ƒ | _| dd¡| _| dd¡| _| dd¡| _	| dd¡| _
d| _d| _d| _d| _d| _d| _d	| _d S )
NZmax_tp_exampleséd   Zmax_fp_examplesZmax_fn_examplesÚchunk_labelz.*r   g        F)ÚsetÚ_correctÚ_guessedÚ_tpÚ_fpÚ_fnÚgetZ_max_tpZ_max_fpZ_max_fnÚ_chunk_labelÚ_tp_numÚ_fp_numÚ_fn_numÚ_countÚ_tags_correctÚ_tags_totalÚ_measuresNeedUpdate)ÚselfÚkwargsr   r   r   Ú__init__u   s     zChunkScore.__init__c             C   s^   | j rZ| j| j@ | _| j| j | _| j| j | _t| jƒ| _t| jƒ| _t| jƒ| _	d| _ d S )NF)
r    r   r   r   r   r   Úlenr   r   r   )r!   r   r   r   Ú_updateMeasuresˆ   s    zChunkScore._updateMeasuresc             C   s²   |  j t|| j| jƒO  _ |  jt|| j| jƒO  _|  jd7  _d| _yt|ƒ}t|ƒ}W n tk
rx   d }}Y nX |  jt	|ƒ7  _|  j
tdd„ t||ƒD ƒƒ7  _
dS )aU  
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
        é   Tr   c             s   s   | ]\}}||krd V  qdS )r&   Nr   )Ú.0ÚtÚgr   r   r   ú	<genexpr>¬   s    z#ChunkScore.score.<locals>.<genexpr>N)r   Ú
_chunksetsr   r   r   r    r   Ú
ValueErrorr   r$   r   ÚsumÚzip)r!   ÚcorrectÚguessedZcorrect_tagsZguessed_tagsr   r   r   Úscore’   s    zChunkScore.scorec             C   s   | j dkrdS | j| j  S )zÁ
        Return the overall tag-based accuracy for all text that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.

        :rtype: float
        r   r&   )r   r   )r!   r   r   r   r	   ¯   s    
zChunkScore.accuracyc             C   s.   |   ¡  | j| j }|dkr dS | j| S dS )z‰
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        r   N)r%   r   r   )r!   Údivr   r   r   Ú	precision»   s
    zChunkScore.precisionc             C   s.   |   ¡  | j| j }|dkr dS | j| S dS )z†
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        r   N)r%   r   r   )r!   r2   r   r   r   ÚrecallÉ   s
    zChunkScore.recallç      à?c             C   sD   |   ¡  |  ¡ }|  ¡ }|dks(|dkr,dS d|| d| |   S )a»  
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        r   r&   )r%   r3   r4   )r!   ZalphaÚpÚrr   r   r   Ú	f_measure×   s    zChunkScore.f_measurec             C   s    |   ¡  t| jƒ}dd„ |D ƒS )zÈ
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures, listed in input order.

        :rtype: list of chunks
        c             S   s   g | ]}|d  ‘qS )r&   r   )r'   Úcr   r   r   ú
<listcomp>ô   s    z%ChunkScore.missed.<locals>.<listcomp>)r%   Úlistr   )r!   Úchunksr   r   r   Úmissedê   s    
zChunkScore.missedc             C   s    |   ¡  t| jƒ}dd„ |D ƒS )zÀ
        Return the chunks which were included in the guessed chunk structures,
        but not in the correct chunk structures, listed in input order.

        :rtype: list of chunks
        c             S   s   g | ]}|d  ‘qS )r&   r   )r'   r9   r   r   r   r:   ÿ   s    z(ChunkScore.incorrect.<locals>.<listcomp>)r%   r;   r   )r!   r<   r   r   r   Ú	incorrectö   s    
zChunkScore.incorrectc             C   s   t | jƒ}dd„ |D ƒS )z—
        Return the chunks which were included in the correct
        chunk structures, listed in input order.

        :rtype: list of chunks
        c             S   s   g | ]}|d  ‘qS )r&   r   )r'   r9   r   r   r   r:   	  s    z&ChunkScore.correct.<locals>.<listcomp>)r;   r   )r!   r<   r   r   r   r/     s    
zChunkScore.correctc             C   s   t | jƒ}dd„ |D ƒS )z—
        Return the chunks which were included in the guessed
        chunk structures, listed in input order.

        :rtype: list of chunks
        c             S   s   g | ]}|d  ‘qS )r&   r   )r'   r9   r   r   r   r:     s    z&ChunkScore.guessed.<locals>.<listcomp>)r;   r   )r!   r<   r   r   r   r0     s    
zChunkScore.guessedc             C   s   |   ¡  | j| j S )N)r%   r   r   )r!   r   r   r   Ú__len__  s    zChunkScore.__len__c             C   s   dt t| ƒƒ d S )z`
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        z<ChunkScoring of z chunks>)Úreprr$   )r!   r   r   r   Ú__repr__  s    zChunkScore.__repr__c             C   sL   dd  |  ¡ d ¡ d  |  ¡ d ¡ d  |  ¡ d ¡ d  |  ¡ d ¡ S )a-  
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        zChunkParse score:
z    IOB Accuracy: {:5.1f}%%
r   z    Precision:    {:5.1f}%%
z    Recall:       {:5.1f}%%
z    F-Measure:    {:5.1f}%%)Úformatr	   r3   r4   r8   )r!   r   r   r   Ú__str__!  s    zChunkScore.__str__N)r5   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r#   r%   r1   r	   r3   r4   r8   r=   r>   r/   r0   r?   rA   rC   r   r   r   r   r   6   s   =



r   c             C   sh   d}g }xV| D ]N}t |tƒrTt || ¡ ¡rB| ||f| ¡ f¡ |t| ¡ ƒ7 }q|d7 }qW t	|ƒS )Nr   r&   )
Ú
isinstancer   ÚreÚmatchÚlabelÚappendZfreezer$   Zleavesr   )r(   Úcountr   Úposr<   Úchildr   r   r   r+   5  s    

r+   ÚNPÚSú/c             C   s(  t  d¡}t|g ƒg}xè| | ¡D ]Ú}| ¡ }	|	d dkr|t|ƒdkrXtd | ¡ ¡ƒ‚t|g ƒ}
|d  	|
¡ | 	|
¡ q"|	d dkr°t|ƒdkr¦td	 | ¡ ¡ƒ‚| 
¡  q"|d
krÈ|d  	|	¡ q"t|	|ƒ\}}|rê|rêt|||ƒ}|d  	||f¡ q"W t|ƒdkr td t| ƒ¡ƒ‚|d S )aB  
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    z\[|\]|[^\[\]\s]+r   ú[r&   zUnexpected [ at char {:d}éÿÿÿÿú]é   zUnexpected ] at char {:d}NzExpected ] at char {:d})rI   Úcompiler   ÚfinditerÚgroupr$   r,   rB   ÚstartrL   Úpopr   r   )Úsr   Ú
root_labelÚsepZsource_tagsetZtarget_tagsetZWORD_OR_BRACKETÚstackrJ   ÚtextÚchunkÚwordÚtagr   r   r   Útagstr2treeB  s.    


rd   z(\S+)\s+(\S+)\s+([IOB])-?(\S+)?©rP   ÚPPZVPc             C   sö   t |g ƒg}xàt|  d¡ƒD ]Î\}}| ¡ s.qt |¡}|dkrNtd |¡ƒ‚| ¡ \}}}	}
|dk	rr|
|krrd}	|	dkoˆ|
|d  	¡ k}|	dks–|rªt
|ƒdkrª| ¡  |	d	ks¶|rØt |
g ƒ}|d  |¡ | |¡ |d  ||f¡ qW |d
 S )a*  
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    Ú
NzError on line {:d}ÚOÚIrT   ZBOrV   ÚBr   )r   Ú	enumerateÚsplitÚstripÚ_LINE_RErJ   r,   rB   ÚgroupsrK   r$   r[   rL   )r\   Úchunk_typesr]   r_   ÚlinenoÚlinerJ   rb   rc   ÚstateZ
chunk_typeZ
mismatch_Ira   r   r   r   Úconllstr2treex  s(    


rt   c          	   C   s”   g }xŠ| D ]‚}yP|  ¡ }d}x>|D ]6}t|tƒr8tdƒ‚| |d |d || f¡ d}q"W W q
 tk
rŠ   | |d |d df¡ Y q
X q
W |S )zË
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    zB-z7Tree is too deeply nested to be printed in CoNLL formatr   r&   zI-rh   )rK   rH   r   r,   rL   ÚAttributeError)r(   ZtagsrO   ÚcategoryÚprefixÚcontentsr   r   r   r   ­  s    



"r   Fc             C   s  t |g ƒ}x
| D ] \}}}|dkrD|r4tdƒ‚n| ||f¡ q| d¡rn| t |dd… ||fgƒ¡ q| d¡rìt|ƒdksªt|d t ƒrª|d  ¡ |dd… krØ|r¸tdƒ‚qê| t |dd… ||fgƒ¡ n|d  ||f¡ q|dkr| ||f¡ qtd	 |¡ƒ‚qW |S )
z1
    Convert the CoNLL IOB format to a tree.
    NzBad conll tag sequencezB-rV   zI-r   rT   rh   zBad conll tag {0!r})r   r,   rL   Ú
startswithr$   rH   rK   rB   )Zsentencerp   r]   ÚstrictZtreerb   ZpostagZchunktagr   r   r   Úconlltags2treeÈ  s(    


 

 
r{   c             C   s   dd„ t | ƒD ƒ}d |¡S )zÒ
    Return a multiline string where each line contains a word, tag and IOB tag.
    Convert a tree to the CoNLL IOB string format

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    c             S   s   g | ]}d   |¡‘qS )ú )Újoin)r'   Útokenr   r   r   r:   õ  s    z!tree2conllstr.<locals>.<listcomp>rg   )r   r}   )r(   Úlinesr   r   r   Útree2conllstrì  s    	r€   a   <DOC>\s*(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?<BODY>\s*(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?<TEXT>(?P<text>.*?)</TEXT>\s*</BODY>\s*</DOC>\s*z#<b_\w+\s+[^>]*?type="(?P<type>\w+)"c          
   C   sô   t |g ƒg}| d krg S x¾t d| ¡D ]®}| ¡ }yv| d¡r„t |¡}|d krZtd|ƒ t | d¡g ƒ}|d  |¡ | |¡ n"| d¡r˜| 	¡  n|d  |¡ W q& t
tfk
rÒ   td | ¡ ¡ƒ‚Y q&X q&W t|ƒdkrìtd	ƒ‚|d
 S )Nz<[^>]+>|[^\s<]+z<b_ZXXXXÚtyperT   z<e_z)Bad IEER string (error at character {:d})r&   zBad IEER stringr   )r   rI   rX   rY   ry   Ú_IEER_TYPE_RErJ   ÚprintrL   r[   Ú
IndexErrorr,   rB   rZ   r$   )r\   r]   r_   Zpiece_mZpieceÚmra   r   r   r   Ú_ieer_read_text
  s,    




r†   ZLOCATIONZORGANIZATIONZPERSONZDURATIONZDATEZCARDINALÚPERCENTZMONEYZMEASUREc             C   sV   t  | ¡}|rHt| d¡|ƒ| d¡| d¡| d¡t| d¡|ƒdœS t| |ƒS dS )ap  
    Return a chunk structure containing the chunked tagged text that is
    encoded in the given IEER style string.
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    :rtype: Tree
    r`   ÚdocnoÚdoctypeÚ	date_timeÚheadline)r`   rˆ   r‰   rŠ   r‹   N)Ú_IEER_DOC_RErJ   r†   rY   )r\   rp   r]   r…   r   r   r   Úieerstr2tree*  s    
r   c              C   sd   d} dd l }|jj| dd}| ¡  tƒ  d} t| dd}| ¡  tdƒ t|j |¡ƒ tƒ  d S )	Nzd[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./.r   rP   )r   av  
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
)rP   rf   )rp   zCoNLL output:)Únltkra   rd   Zpprintrƒ   rt   r€   )r\   rŽ   r(   Z
conll_treer   r   r   ÚdemoU  s    r   Ú__main__)rP   rQ   rR   NN)re   rQ   )re   rQ   F)!Z
__future__r   r   r   rI   Z	nltk.treer   Znltk.tag.mappingr   Znltk.tag.utilr   Znltk.compatr   Znltk.metricsr	   r   Úobjectr   r+   rd   rW   rn   rt   r   r{   r€   ÚDOTALLrŒ   r‚   r†   r   r   rD   r   r   r   r   Ú<module>   sF     
2

5
#
#0
