ó
ù`]c        	   @  s†  d  d l  m Z m Z m Z d  d l Z d  d l m Z d  d l m Z d  d l	 m
 Z
 d  d l m Z d  d l m Z d „  Z d	 e f d
 „  ƒ  YZ d „  Z d d d e e d „ Z e j d ƒ Z d d d f d d „ Z d „  Z d d d f d e d „ Z d „  Z e j d e j ƒ Z e j d ƒ Z d „  Z d d d d d d d  d! d" g	 d d# „ Z  d$ „  Z! e" d% k r‚e! ƒ  n  d S(&   iÿÿÿÿ(   t   print_functiont   unicode_literalst   divisionN(   t   Tree(   t   map_tag(   t	   str2tuple(   t   python_2_unicode_compatible(   t   accuracyc         C  s_   g  } g  } xC | D]; } |  j  | j ƒ  ƒ } | t | ƒ 7} | t | ƒ 7} q Wt | | ƒ S(   u|  
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    (   t   parset   flattent   tree2conlltagst	   _accuracy(   t   chunkert   goldt	   gold_tagst	   test_tagst	   gold_treet	   test_tree(    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyR      s    t
   ChunkScorec           B  s   e  Z d  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d d „ Z	 d	 „  Z
 d
 „  Z d „  Z d „  Z d „  Z d „  Z d „  Z RS(   u;  
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, misssed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()           # doctest: +SKIP
        >>> for correct in correct_sentences:   # doctest: +SKIP
        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
        F Measure: 0.823

    :ivar kwargs: Keyword arguments:

        - max_tp_examples: The maximum number actual examples of true
          positives to record.  This affects the ``correct`` member
          function: ``correct`` will not return more than this number
          of true positive examples.  This does *not* affect any of
          the numerical metrics (precision, recall, or f-measure)

        - max_fp_examples: The maximum number actual examples of false
          positives to record.  This affects the ``incorrect`` member
          function and the ``guessed`` member function: ``incorrect``
          will not return more than this number of examples, and
          ``guessed`` will not return more than this number of true
          positive examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - max_fn_examples: The maximum number actual examples of false
          negatives to record.  This affects the ``missed`` member
          function and the ``correct`` member function: ``missed``
          will not return more than this number of examples, and
          ``correct`` will not return more than this number of true
          negative examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - chunk_label: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).

    :type _tp: list(Token)
    :ivar _tp: List of true positives
    :type _fp: list(Token)
    :ivar _fp: List of false positives
    :type _fn: list(Token)
    :ivar _fn: List of false negatives

    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    c         K  sÓ   t  ƒ  |  _ t  ƒ  |  _ t  ƒ  |  _ t  ƒ  |  _ t  ƒ  |  _ | j d d ƒ |  _ | j d d ƒ |  _ | j d d ƒ |  _	 | j d d ƒ |  _
 d |  _ d |  _ d |  _ d |  _ d |  _ d |  _ t |  _ d  S(	   Nu   max_tp_examplesid   u   max_fp_examplesu   max_fn_examplesu   chunk_labelu   .*i    g        (   t   sett   _correctt   _guessedt   _tpt   _fpt   _fnt   gett   _max_tpt   _max_fpt   _max_fnt   _chunk_labelt   _tp_numt   _fp_numt   _fn_numt   _countt   _tags_correctt   _tags_totalt   Falset   _measuresNeedUpdate(   t   selft   kwargs(    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   __init__u   s     						c         C  sˆ   |  j  r„ |  j |  j @|  _ |  j |  j |  _ |  j |  j |  _ t |  j ƒ |  _ t |  j ƒ |  _ t |  j ƒ |  _	 t
 |  _  n  d  S(   N(   R%   R   R   R   R   R   t   lenR   R   R    R$   (   R&   (    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   _updateMeasuresˆ   s    	c         C  sÕ   |  j  t | |  j |  j ƒ O_  |  j t | |  j |  j ƒ O_ |  j d 7_ t |  _ y t | ƒ } t | ƒ } Wn t k
 r“ d } } n X|  j	 t
 | ƒ 7_	 |  j t d „  t | | ƒ Dƒ ƒ 7_ d S(   uU  
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
        i   c         s  s'   |  ] \ } } | | k r d  Vq d S(   i   N(    (   t   .0t   tt   g(    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pys	   <genexpr>¬   s    N(    (   R   t
   _chunksetsR!   R   R   t   TrueR%   R
   t
   ValueErrorR#   R)   R"   t   sumt   zip(   R&   t   correctt   guessedt   correct_tagst   guessed_tags(    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   score’   s    !!	
c         C  s!   |  j  d k r d S|  j |  j  S(   uÁ   
        Return the overall tag-based accuracy for all text that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.

        :rtype: float
        i    i   (   R#   R"   (   R&   (    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyR   ¯   s    c         C  s9   |  j  ƒ  |  j |  j } | d k r* d S|  j | Sd S(   u‰   
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        i    N(   R*   R   R   (   R&   t   div(    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt	   precision»   s
    
c         C  s9   |  j  ƒ  |  j |  j } | d k r* d S|  j | Sd S(   u†   
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        i    N(   R*   R   R    (   R&   R8   (    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   recallÉ   s
    
g      à?c         C  sV   |  j  ƒ  |  j ƒ  } |  j ƒ  } | d k s: | d k r> d Sd | | d | | S(   u»  
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        i    i   (   R*   R9   R:   (   R&   t   alphat   pt   r(    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt	   f_measure×   s    
c         C  s4   |  j  ƒ  t |  j ƒ } g  | D] } | d ^ q  S(   uÈ   
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures, listed in input order.

        :rtype: list of chunks
        i   (   R*   t   listR   (   R&   t   chunkst   c(    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   missedê   s    
c         C  s4   |  j  ƒ  t |  j ƒ } g  | D] } | d ^ q  S(   uÀ   
        Return the chunks which were included in the guessed chunk structures,
        but not in the correct chunk structures, listed in input order.

        :rtype: list of chunks
        i   (   R*   R?   R   (   R&   R@   RA   (    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt	   incorrectö   s    
c         C  s*   t  |  j ƒ } g  | D] } | d ^ q S(   u—   
        Return the chunks which were included in the correct
        chunk structures, listed in input order.

        :rtype: list of chunks
        i   (   R?   R   (   R&   R@   RA   (    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyR3     s    c         C  s*   t  |  j ƒ } g  | D] } | d ^ q S(   u—   
        Return the chunks which were included in the guessed
        chunk structures, listed in input order.

        :rtype: list of chunks
        i   (   R?   R   (   R&   R@   RA   (    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyR4     s    c         C  s   |  j  ƒ  |  j |  j S(   N(   R*   R   R    (   R&   (    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   __len__  s    
c         C  s   d t  t |  ƒ ƒ d S(   u`   
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        u   <ChunkScoring of u    chunks>(   t   reprR)   (   R&   (    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   __repr__  s    c         C  s`   d d j  |  j ƒ  d ƒ d j  |  j ƒ  d ƒ d j  |  j ƒ  d ƒ d j  |  j ƒ  d ƒ S(   u-  
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        u   ChunkParse score:
u       IOB Accuracy: {:5.1f}%%
id   u       Precision:    {:5.1f}%%
u       Recall:       {:5.1f}%%
u       F-Measure:    {:5.1f}%%(   t   formatR   R9   R:   R>   (   R&   (    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   __str__!  s    (   t   __name__t
   __module__t   __doc__R(   R*   R7   R   R9   R:   R>   RB   RC   R3   R4   RD   RF   RH   (    (    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyR   6   s   =		
							
	
		c         C  s“   d } g  } xz |  D]r } t  | t ƒ r{ t j | | j ƒ  ƒ rb | j | | f | j ƒ  f ƒ n  | t | j ƒ  ƒ 7} q | d 7} q Wt	 | ƒ S(   Ni    i   (
   t
   isinstanceR   t   ret   matcht   labelt   appendt   freezeR)   t   leavesR   (   R,   t   countt   chunk_labelt   posR@   t   child(    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyR.   5  s    "u   NPu   Su   /c         C  s©  t  j d ƒ } t | g  ƒ g } xM| j |  ƒ D]<} | j ƒ  }	 |	 d d k r³ t | ƒ d k rƒ t d j | j ƒ  ƒ ƒ ‚ n  t | g  ƒ }
 | d j	 |
 ƒ | j	 |
 ƒ q1 |	 d d k r t | ƒ d k ró t d	 j | j ƒ  ƒ ƒ ‚ n  | j
 ƒ  q1 | d k r | d j	 |	 ƒ q1 t |	 | ƒ \ } } | rV| rVt | | | ƒ } n  | d j	 | | f ƒ q1 Wt | ƒ d k r¡t d
 j t |  ƒ ƒ ƒ ‚ n  | d S(   uB  
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    u   \[|\]|[^\[\]\s]+i    u   [i   u   Unexpected [ at char {:d}iÿÿÿÿu   ]i   u   Unexpected ] at char {:d}u   Expected ] at char {:d}N(   RM   t   compileR   t   finditert   groupR)   R0   RG   t   startRP   t   popt   NoneR   R   (   t   sRT   t
   root_labelt   sept   source_tagsett   target_tagsett   WORD_OR_BRACKETt   stackRN   t   textt   chunkt   wordt   tag(    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   tagstr2treeB  s.    u   (\S+)\s+(\S+)\s+([IOB])-?(\S+)?u   PPu   VPc         C  sm  t  | g  ƒ g } xPt |  j d ƒ ƒ D]9\ } } | j ƒ  sF q( n  t j | ƒ } | d
 k ry t d j | ƒ ƒ ‚ n  | j	 ƒ  \ } } }	 }
 | d
 k	 r² |
 | k r² d }	 n  |	 d k oÑ |
 | d j
 ƒ  k } |	 d k sæ | rt | ƒ d k r| j ƒ  qn  |	 d k s| rJt  |
 g  ƒ } | d j | ƒ | j | ƒ n  | d j | | f ƒ q( W| d	 S(   u*  
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    u   
u   Error on line {:d}u   Ou   Iiÿÿÿÿu   BOi   u   Bi    N(   R   t	   enumeratet   splitt   stript   _LINE_RERN   R\   R0   RG   t   groupsRO   R)   R[   RP   (   R]   t   chunk_typesR^   Rc   t   linenot   lineRN   Rf   Rg   t   statet
   chunk_typet
   mismatch_IRe   (    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   conllstr2treex  s(    "	"c         C  sº   g  } x­ |  D]¥ } ym | j  ƒ  } d } xT | D]L } t | t ƒ rS t d ƒ ‚ n  | j | d | d | | f ƒ d } q/ WWq t k
 r± | j | d | d d f ƒ q Xq W| S(   uË   
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    u   B-u7   Tree is too deeply nested to be printed in CoNLL formati    i   u   I-u   O(   RO   RL   R   R0   RP   t   AttributeError(   R,   t   tagsRV   t   categoryt   prefixt   contents(    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyR
   ­  s    
"&c         C  st  t  | g  ƒ } x^|  D]V\ } } } | d	 k r\ | rF t d ƒ ‚ ql| j | | f ƒ q | j d ƒ r‘ | j t  | d | | f g ƒ ƒ q | j d ƒ r5t | ƒ d k sà t | d t  ƒ sà | d j ƒ  | d k r| rõ t d ƒ ‚ q2| j t  | d | | f g ƒ ƒ ql| d j | | f ƒ q | d k rW| j | | f ƒ q t d j | ƒ ƒ ‚ q W| S(
   u1   
    Convert the CoNLL IOB format to a tree.
    u   Bad conll tag sequenceu   B-i   u   I-i    iÿÿÿÿu   Ou   Bad conll tag {0!r}N(	   R   R\   R0   RP   t
   startswithR)   RL   RO   RG   (   t   sentenceRn   R^   t   strictt   treeRf   t   postagt   chunktag(    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   conlltags2treeÈ  s(    &&c         C  s5   g  t  |  ƒ D] } d j | ƒ ^ q } d j | ƒ S(   uÒ   
    Return a multiline string where each line contains a word, tag and IOB tag.
    Convert a tree to the CoNLL IOB string format

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    u    u   
(   R
   t   join(   R,   t   tokent   lines(    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   tree2conllstrì  s    	(u   <DOC>\s*(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?<BODY>\s*(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?<TEXT>(?P<text>.*?)</TEXT>\s*</BODY>\s*</DOC>\s*u#   <b_\w+\s+[^>]*?type="(?P<type>\w+)"c         C  sM  t  | g  ƒ g } |  d  k r" g  Sxÿ t j d |  ƒ D]ë } | j ƒ  } y¤ | j d ƒ r½ t j | ƒ } | d  k r„ t d | ƒ n  t  | j d ƒ g  ƒ } | d j	 | ƒ | j	 | ƒ n- | j d ƒ rÙ | j
 ƒ  n | d j	 | ƒ Wq5 t t f k
 rt d j | j ƒ  ƒ ƒ ‚ q5 Xq5 Wt | ƒ d k rEt d	 ƒ ‚ n  | d
 S(   Nu   <[^>]+>|[^\s<]+u   <b_u   XXXXu   typeiÿÿÿÿu   <e_u)   Bad IEER string (error at character {:d})i   u   Bad IEER stringi    (   R   R\   RM   RX   RY   Rz   t   _IEER_TYPE_RERN   t   printRP   R[   t
   IndexErrorR0   RG   RZ   R)   (   R]   R^   Rc   t   piece_mt   piecet   mRe   (    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   _ieer_read_text
  s,     u   LOCATIONu   ORGANIZATIONu   PERSONu   DURATIONu   DATEu   CARDINALu   PERCENTu   MONEYu   MEASUREc         C  sŒ   t  j |  ƒ } | r{ i t | j d ƒ | ƒ d 6| j d ƒ d 6| j d ƒ d 6| j d ƒ d 6t | j d ƒ | ƒ d 6St |  | ƒ Sd S(   up  
    Return a chunk structure containing the chunked tagged text that is
    encoded in the given IEER style string.
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    :rtype: Tree
    u   textu   docnou   doctypeu	   date_timeu   headlineN(   t   _IEER_DOC_RERN   R‹   RY   (   R]   Rn   R^   RŠ   (    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   ieerstr2tree*  s    c          C  sˆ   d }  d d  l  } | j j |  d d ƒ} | j ƒ  t ƒ  d }  t |  d d	 ƒ} | j ƒ  t d ƒ t | j j | ƒ ƒ t ƒ  d  S(
   Nud   [ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./.iÿÿÿÿRT   u   NPuv  
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
Rn   u   PPu   CoNLL output:(   u   NPu   PP(   t   nltkRe   Rh   t   pprintR†   Rt   R„   (   R]   RŽ   R,   t
   conll_tree(    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   demoU  s    


u   __main__(#   t
   __future__R    R   R   RM   t	   nltk.treeR   t   nltk.tag.mappingR   t   nltk.tag.utilR   t   nltk.compatR   t   nltk.metricsR   R   t   objectR   R.   R\   Rh   RW   Rl   Rt   R
   R$   R€   R„   t   DOTALLRŒ   R…   R‹   R   R‘   RI   (    (    (    s.   lib/python2.7/site-packages/nltk/chunk/util.pyt   <module>   sB   	ÿ	25	#		#	0