ó
ù`]c           @` sÆ  d  d l  m Z m Z m Z d  d l Z d  d l Z d  d l Z d  d l Z d  d l m	 Z	 d  d l
 m Z m Z d  d l m Z m Z d  d l m Z m Z m Z d „  Z d „  Z d	 „  Z d
 „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d d d d d d d d e! d e! e! d d d d d e! d d „ Z" d „  Z# d d d „ Z$ e d. d/ g ƒ Z% e d0 d1 d2 d3 d4 d5 d6 d7 d8 g	 ƒ Z& d, „  Z' e( d- k rÂe ƒ  n  d S(9   i    (   t   print_functiont   absolute_importt   divisionN(   t   treebank(   t
   error_listt   Template(   t   Wordt   Pos(   t   BrillTaggerTrainert   RegexpTaggert   UnigramTaggerc           C` s   t  ƒ  d S(   s„   
    Run a demo with defaults. See source comments for details,
    or docstrings of any of the more specific demo_* functions.
    N(   t   postag(    (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyt   demo   s    c           C` s   t  d d ƒ d S(   sN   
    Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
    t
   ruleformatt   reprN(   R   (    (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyt   demo_repr_rule_format!   s    c           C` s   t  d d ƒ d S(   sN   
    Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
    R   t   strN(   R   (    (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyt   demo_str_rule_format(   s    c           C` s   t  d d ƒ d S(   s*   
    Exemplify Rule.format("verbose")
    R   t   verboseN(   R   (    (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyt   demo_verbose_rule_format/   s    c           C` s)   t  d t t d d d g ƒ ƒ g ƒ d S(   s¾  
    The feature/s of a template takes a list of positions
    relative to the current word where the feature should be
    looked for, conceptually joined by logical OR. For instance,
    Pos([-1, 1]), given a value V, will hold whenever V is found
    one step to the left and/or one step to the right.

    For contiguous ranges, a 2-arg form giving inclusive end
    points can also be used: Pos(-3, -1) is the same as the arg
    below.
    t	   templatesiýÿÿÿiþÿÿÿiÿÿÿÿN(   R   R   R   (    (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyt   demo_multiposition_feature6   s    c           C` s2   t  d t t d g ƒ t d d g ƒ ƒ g ƒ d S(   s8   
    Templates can have more than a single feature.
    R   i    iþÿÿÿiÿÿÿÿN(   R   R   R   R   (    (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyt   demo_multifeature_templateE   s    c           C` s   t  d t d t ƒ d S(   sh  
    Show aggregate statistics per template. Little used templates are
    candidates for deletion, much used templates may possibly be refined.

    Deleting unused templates is mostly about saving time and/or space:
    training is basically O(T) in the number of templates T
    (also in terms of memory usage, which often will be the limiting factor).
    t   incremental_statst   template_statsN(   R   t   True(    (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyt   demo_template_statisticsL   s    	c          C` s¨   t  j d d d g d d g d t ƒ}  t j d d d d g d d g d t ƒ} t t j |  | g d d ƒƒ } t d	 j t	 | ƒ ƒ ƒ t
 d
 | d t d t ƒ d S(   s	  
    Template.expand and Feature.expand are class methods facilitating
    generating large amounts of templates. See their documentation for
    details.

    Note: training with 500 templates can easily fill all available
    even on relatively small corpora
    iÿÿÿÿi    i   i   t   excludezeroiþÿÿÿt   combinationsi   s9   Generated {0} templates for transformation-based learningR   R   R   N(   i   i   (   R   t   expandt   FalseR   R   t   listR   t   printt   formatt   lenR   (   t   wordtplst   tagtplsR   (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyt   demo_generated_templatesX   s    	'*!c           C` s   t  d t d t d d ƒ d S(   s‚   
    Plot a learning curve -- the contribution on tagging accuracy of
    the individual rules.
    Note: requires matplotlib
    R   t   separate_baseline_datat   learning_curve_outputs   learningcurve.pngN(   R   R   (    (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyt   demo_learning_curvel   s    c           C` s   t  d d ƒ d S(   sW   
    Writes a file with context for each erroneous word after tagging testing data
    t   error_outputs
   errors.txtN(   R   (    (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyt   demo_error_analysisy   s    c           C` s   t  d d ƒ d S(   sm   
    Serializes the learned tagger to a file in pickle format; reloads it
    and validates the process.
    t   serialize_outputs
   tagger.pclN(   R   (    (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyt   demo_serialize_tagger€   s    c           C` s   t  d d d d d d ƒ d S(   s˜   
    Discard rules with low accuracy. This may hurt performance a bit,
    but will often produce rules which are more interesting read to a human.
    t	   num_sentsi¸  t   min_accg¸…ëQ¸î?t	   min_scorei
   N(   R   (    (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyt   demo_high_accuracy_rulesˆ   s    iè  i,  i   gš™™™™™é?R   c   &      C` s-  | p	 t  } |  d k r: d d l m } m } | ƒ  }  n  t | | | | | ƒ \ } } } } | rt j j | ƒ sÆ t	 | d | ƒ} t
 | d ƒ  } t j | | ƒ Wd QXt d j | ƒ ƒ n  t
 | d ƒ ) } t j | ƒ } t d j | ƒ ƒ Wd QXn t	 | d | ƒ} t d	 ƒ | rDt d
 j | j | ƒ ƒ ƒ n  t j ƒ  } t | |  | d |	 ƒ} t d ƒ | j | | | | ƒ } t d j t j ƒ  | ƒ ƒ | rÇt d | j | ƒ ƒ n  | d k r%t d ƒ xE t | j ƒ  d ƒ D]+ \ } } t d j | | j |	 ƒ ƒ ƒ qóWn  |
 rÁt d ƒ | j | | ƒ \ }  }! t d ƒ | sjt d ƒ n  | j ƒ  }" | rŒ| j |! ƒ n  | rít | |! |" d | ƒt d j | ƒ ƒ qín, t d ƒ | j | ƒ }  | rí| j ƒ  n  | d k	 rdt
 | d ƒ D }# |# j d | ƒ |# j d j t | |  ƒ ƒ j d ƒ d ƒ Wd QXt d j | ƒ ƒ n  | d k	 r)| j | ƒ }  t
 | d ƒ  } t j | | ƒ Wd QXt d j | ƒ ƒ t
 | d ƒ  } t j | ƒ }$ Wd QXt d j | ƒ ƒ | j | ƒ }% |  |% k rt d ƒ q)t d ƒ n  d S(    s’
  
    Brill Tagger Demonstration
    :param templates: how many sentences of training and testing data to use
    :type templates: list of Template

    :param tagged_data: maximum number of rule instances to create
    :type tagged_data: C{int}

    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}

    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}

    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}

    :param min_acc: the minimum score for a rule in order for it to be considered
    :type min_acc: C{float}

    :param train: the fraction of the the corpus to be used for training (1=all)
    :type train: C{float}

    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}

    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}

    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}

    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}

    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}

    :param error_output: the file where errors will be saved
    :type error_output: C{string}

    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}

    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}

    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}

    :param baseline_backoff_tagger: the file where rules will be saved
    :type baseline_backoff_tagger: tagger

    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
                                  deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}


    Note on separate_baseline_data: if True, reuse training data both for baseline and rule learner. This
    is fast and fine for a demo, but is likely to generalize worse on unseen data.
    Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
    i    (   t   describe_template_setst   brill24t   backofft   wNs*   Trained baseline tagger, pickled it to {0}t   rs    Reloaded pickled tagger from {0}s   Trained baseline taggers"       Accuracy on test set: {0:0.4f}R   s   Training tbl tagger...s&   Trained tbl tagger in {0:0.2f} secondss       Accuracy on test set: %.4fi   s   
Learned rules: s   {0:4d} {1:s}sJ   Incrementally tagging the test data, collecting individual rule statisticss       Rule statistics collectedsb   WARNING: train_stats asked for separate_baseline_data=True; the baseline will be artificially hight   takes#   Wrote plot of learning curve to {0}s   Tagging the test datas   Errors for Brill Tagger %r

u   
s   utf-8s   
s,   Wrote tagger errors including context to {0}s   Wrote pickled tagger to {0}s4   Reloaded tagger tried on test set, results identicals;   PROBLEM: Reloaded tagger gave different results on test set(   t   REGEXP_TAGGERt   Nonet   nltk.tag.brillR1   R2   t   _demo_prepare_datat   ost   patht   existsR
   t   opent   picklet   dumpR    R!   t   loadt   evaluatet   timeR   t   traint	   enumeratet   rulest   batch_tag_incrementalt   train_statst   print_template_statisticst
   _demo_plott	   tag_sentst   writet   joinR   t   encode(&   R   t   tagged_dataR-   t	   max_rulesR/   R.   RD   t   tracet	   randomizeR   R   R   R)   R+   R'   t   learning_curve_taket   baseline_backoff_taggerR&   t   cache_baseline_taggerR1   R2   t   training_datat   baseline_datat	   gold_datat   testing_datat   baseline_taggert   print_rulest   tbrillt   trainert   brill_taggert   rulenot   rulet
   taggedtestt	   teststatst
   trainstatst   ft   brill_tagger_reloadedt   taggedtest_reloaded(    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyR      s”    X!


"&


,c         C` s  |  d  k r% t d ƒ t j ƒ  }  n  | d  k sC t |  ƒ | k rR t |  ƒ } n  | r{ t j t |  ƒ ƒ t j |  ƒ n  t | | ƒ } |  |  } |  | | !} g  | D]# } g  | D] }	 |	 d ^ q¶ ^ q© }
 | sá | } n% t | ƒ d } | |  | | } } t	 | ƒ \ } } t	 |
 ƒ \ } } t	 | ƒ \ } } t d j
 | | ƒ ƒ t d j
 | | ƒ ƒ t d j
 | | | rƒd n d ƒ ƒ | | | |
 f S(	   Ns%   Loading tagged data from treebank... i    i   s)   Read testing data ({0:d} sents/{1:d} wds)s*   Read training data ({0:d} sents/{1:d} wds)s0   Read baseline data ({0:d} sents/{1:d} wds) {2:s}t    s   [reused the training set](   R8   R    R   t   tagged_sentsR"   t   randomt   seedt   shufflet   intt   corpus_sizeR!   (   RO   RD   R-   RR   R&   t   cutoffRV   RX   t   sentt   tRY   RW   t	   bl_cutofft	   trainseqst   traintokenst   testseqst
   testtokenst   bltrainseqst   bltraintokens(    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyR:   W  s8    

0	c   
      C` s'  | d g } x' | d D] } | j  | d | ƒ q Wg  | |  D] } d | | d ^ qB } | d g } x' | d D] } | j  | d | ƒ qx Wg  | |  D] } d | | d ^ q¢ } d d  l j } t t t | ƒ ƒ ƒ }	 | j |	 | |	 | ƒ | j d  d  d  d g ƒ | j	 |  ƒ d  S(   Nt   initialerrorst
   rulescoresiÿÿÿÿi   t
   tokencounti    g      ð?(
   t   appendt   matplotlib.pyplott   pyplotR   t   rangeR"   t   plott   axisR8   t   savefig(
   R'   Rb   Rc   R6   t	   testcurvet	   rulescoret   xt
   traincurvet   pltR5   (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyRJ     s    ))s   ^-?[0-9]+(.[0-9]+)?$t   CDs   .*t   NNs   (The|the|A|a|An|an)$t   ATs   .*able$t   JJs   .*ness$s   .*ly$t   RBs   .*s$t   NNSs   .*ing$t   VBGs   .*ed$t   VBDc         C` s    t  |  ƒ t d „  |  Dƒ ƒ f S(   Nc         s` s   |  ] } t  | ƒ Vq d  S(   N(   R"   (   t   .0R„   (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pys	   <genexpr>¤  s    (   R"   t   sum(   t   seqs(    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyRm   £  s    t   __main__(   s   ^-?[0-9]+(.[0-9]+)?$R‡   (   s   .*Rˆ   (   s   ^-?[0-9]+(.[0-9]+)?$R‡   (   s   (The|the|A|a|An|an)$R‰   (   s   .*able$RŠ   (   s   .*ness$Rˆ   (   s   .*ly$R‹   (   s   .*s$RŒ   (   s   .*ing$R   (   s   .*ed$RŽ   (   s   .*Rˆ   ()   t
   __future__R    R   R   R;   R?   Ri   RC   t   nltk.corpusR   t   nltk.tblR   R   R9   R   R   t   nltk.tagR   R	   R
   R   R   R   R   R   R   R   R%   R(   R*   R,   R0   R8   R   R   R:   RJ   t   NN_CD_TAGGERR7   Rm   t   __name__(    (    (    s,   lib/python2.7/site-packages/nltk/tbl/demo.pyt   <module>   sn   													´	(	