B
    P?[%;                 @   s2  d dl mZmZmZ d dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZmZ d dlmZmZ d dlmZmZmZ dd Zd	d
 Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd7d%d&Z d'd( Z!d8d)d*Z"ed+d,gZ#ed+d-d.d/d0d1d2d3d,g	Z$d4d5 Z%e&d6kr.e  dS )9    )print_functionabsolute_importdivisionN)treebank)
error_listTemplate)WordPos)BrillTaggerTrainerRegexpTaggerUnigramTaggerc               C   s
   t   dS )z
    Run a demo with defaults. See source comments for details,
    or docstrings of any of the more specific demo_* functions.
    N)postag r   r   ,lib/python3.7/site-packages/nltk/tbl/demo.pydemo   s    r   c               C   s   t dd dS )zN
    Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
    repr)
ruleformatN)r   r   r   r   r   demo_repr_rule_format!   s    r   c               C   s   t dd dS )zN
    Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
    str)r   N)r   r   r   r   r   demo_str_rule_format(   s    r   c               C   s   t dd dS )z*
    Exemplify Rule.format("verbose")
    verbose)r   N)r   r   r   r   r   demo_verbose_rule_format/   s    r   c               C   s   t ttdddggd dS )a  
    The feature/s of a template takes a list of positions
    relative to the current word where the feature should be
    looked for, conceptually joined by logical OR. For instance,
    Pos([-1, 1]), given a value V, will hold whenever V is found
    one step to the left and/or one step to the right.

    For contiguous ranges, a 2-arg form giving inclusive end
    points can also be used: Pos(-3, -1) is the same as the arg
    below.
    )	templatesN)r   r   r	   r   r   r   r   demo_multiposition_feature6   s    r   c               C   s$   t ttdgtddggd dS )z8
    Templates can have more than a single feature.
    r   r   r   )r   N)r   r   r   r	   r   r   r   r   demo_multifeature_templateE   s    r   c               C   s   t ddd dS )ah  
    Show aggregate statistics per template. Little used templates are
    candidates for deletion, much used templates may possibly be refined.

    Deleting unused templates is mostly about saving time and/or space:
    training is basically O(T) in the number of templates T
    (also in terms of memory usage, which often will be the limiting factor).
    T)incremental_statstemplate_statsN)r   r   r   r   r   demo_template_statisticsL   s    	r    c              C   sp   t jdddgddgdd} tjddddgddgdd}ttj| |gd	d
}tdt| t|ddd dS )a	  
    Template.expand and Feature.expand are class methods facilitating
    generating large amounts of templates. See their documentation for
    details.

    Note: training with 500 templates can easily fill all available
    even on relatively small corpora
    r   r         F)Zexcludezeror   T)r!      )combinationsz9Generated {0} templates for transformation-based learning)r   r   r   N)	r   expandr	   listr   printformatlenr   )ZwordtplsZtagtplsr   r   r   r   demo_generated_templatesX   s    	r*   c               C   s   t dddd dS )z
    Plot a learning curve -- the contribution on tagging accuracy of
    the individual rules.
    Note: requires matplotlib
    Tzlearningcurve.png)r   separate_baseline_datalearning_curve_outputN)r   r   r   r   r   demo_learning_curvel   s    r-   c               C   s   t dd dS )zW
    Writes a file with context for each erroneous word after tagging testing data
    z
errors.txt)error_outputN)r   r   r   r   r   demo_error_analysisy   s    r/   c               C   s   t dd dS )zm
    Serializes the learned tagger to a file in pickle format; reloads it
    and validates the process.
    z
tagger.pcl)serialize_outputN)r   r   r   r   r   demo_serialize_tagger   s    r1   c               C   s   t dddd dS )z
    Discard rules with low accuracy. This may hurt performance a bit,
    but will often produce rules which are more interesting read to a human.
    i  gQ?
   )	num_sentsmin_acc	min_scoreN)r   r   r   r   r   demo_high_accuracy_rules   s    r6     ,  r#   皙?Fr   c       &   	   C   s  |pt }| dkr&ddlm}m} | } t|||||\}}}}|rtj|st||d}t	|d}t
|| W dQ R X td| t	|d}t
|}td| W dQ R X nt||d}td	 |rtd
|| t }t|| ||	d}td |||||}tdt |  |rDtd||  |dkrtd x2t| dD ] \}}td|||	 qfW |
rtd |||\} }!td |std | }"|r||! |rt||!|"|d td| n td ||} |r|  |dk	rxt	|d4}#|#d|  |#dt|| dd  W dQ R X td| |dk	r||} t	|d}t
|| W dQ R X td| t	|d}t
|}$W dQ R X td| ||}%| |%krtd ntd dS )a
  
    Brill Tagger Demonstration
    :param templates: how many sentences of training and testing data to use
    :type templates: list of Template

    :param tagged_data: maximum number of rule instances to create
    :type tagged_data: C{int}

    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}

    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}

    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}

    :param min_acc: the minimum score for a rule in order for it to be considered
    :type min_acc: C{float}

    :param train: the fraction of the the corpus to be used for training (1=all)
    :type train: C{float}

    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}

    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}

    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}

    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}

    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}

    :param error_output: the file where errors will be saved
    :type error_output: C{string}

    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}

    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}

    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}

    :param baseline_backoff_tagger: the file where rules will be saved
    :type baseline_backoff_tagger: tagger

    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
                                  deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}


    Note on separate_baseline_data: if True, reuse training data both for baseline and rule learner. This
    is fast and fine for a demo, but is likely to generalize worse on unseen data.
    Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
    Nr   )describe_template_setsbrill24)Zbackoffwz*Trained baseline tagger, pickled it to {0}rz Reloaded pickled tagger from {0}zTrained baseline taggerz"    Accuracy on test set: {0:0.4f})r   zTraining tbl tagger...z&Trained tbl tagger in {0:0.2f} secondsz    Accuracy on test set: %.4fr!   z
Learned rules: z{0:4d} {1:s}zJIncrementally tagging the test data, collecting individual rule statisticsz    Rule statistics collectedzbWARNING: train_stats asked for separate_baseline_data=True; the baseline will be artificially high)takez#Wrote plot of learning curve to {0}zTagging the test datazErrors for Brill Tagger %r


zutf-8z,Wrote tagger errors including context to {0}zWrote pickled tagger to {0}z4Reloaded tagger tried on test set, results identicalz;PROBLEM: Reloaded tagger gave different results on test set)REGEXP_TAGGERnltk.tag.brillr:   r;   _demo_prepare_dataospathexistsr   openpickledumpr'   r(   loadZevaluatetimer
   train	enumerateZrulesZbatch_tag_incrementalZtrain_statsZprint_template_statistics
_demo_plotZ	tag_sentswritejoinr   encode)&r   tagged_datar3   Z	max_rulesr5   r4   rK   Ztrace	randomizer   r   r   r.   r0   r,   Zlearning_curve_takeZbaseline_backoff_taggerr+   Zcache_baseline_taggerr:   r;   training_databaseline_data	gold_datatesting_dataZbaseline_taggerZprint_rulesZtbrillZtrainerZbrill_taggerZrulenoZruleZ
taggedtest	teststats
trainstatsfZbrill_tagger_reloadedZtaggedtest_reloadedr   r   r   r      s    X





&




r   c             C   s  | d krt d t } |d ks,t| |kr4t| }|rPtt|  t|  t|| }| d | }| || }dd |D }|s|}	n&t|d }
|d |
 ||
d   }	}t|\}}t|\}}t|	\}}t d	|| t d	|| t d	|||r
dnd	 ||	||fS )
Nz%Loading tagged data from treebank... c             S   s   g | ]}d d |D qS )c             S   s   g | ]}|d  qS )r   r   ).0tr   r   r   
<listcomp>g  s    z1_demo_prepare_data.<locals>.<listcomp>.<listcomp>r   )rZ   Zsentr   r   r   r\   g  s    z&_demo_prepare_data.<locals>.<listcomp>r#   z)Read testing data ({0:d} sents/{1:d} wds)z*Read training data ({0:d} sents/{1:d} wds)z0Read baseline data ({0:d} sents/{1:d} wds) {2:s} z[reused the training set])
r'   r   Ztagged_sentsr)   randomZseedZshuffleintcorpus_sizer(   )rQ   rK   r3   rR   r+   cutoffrS   rU   rV   rT   Z	bl_cutoffZ	trainseqsZtraintokensZtestseqsZ
testtokensZbltrainseqsZbltraintokensr   r   r   rB   W  s8    

rB   c       	         s    d g}x" d D ]}| |d |  qW  fdd|d | D }d g}x"d D ]}| |d |  q\W fdd|d | D }dd lm} ttt|}||||| |d d d dg ||  d S )	NZinitialerrorsZ
rulescoresr   c                s   g | ]}d | d   qS )r!   
tokencountr   )rZ   x)rW   r   r   r\     s    z_demo_plot.<locals>.<listcomp>c                s   g | ]}d | d   qS )r!   rb   r   )rZ   rc   )rX   r   r   r\     s    r   g      ?)	appendZmatplotlib.pyplotZpyplotr&   ranger)   ZplotZaxisZsavefig)	r,   rW   rX   r>   Z	testcurveZ	rulescoreZ
traincurveZpltr=   r   )rW   rX   r   rM     s    

rM   )z^-?[0-9]+(.[0-9]+)?$ZCD)z.*NN)z(The|the|A|a|An|an)$AT)z.*able$ZJJ)z.*ness$rf   )z.*ly$ZRB)z.*s$ZNNS)z.*ing$ZVBG)z.*ed$ZVBDc             C   s   t | tdd | D fS )Nc             s   s   | ]}t |V  qd S )N)r)   )rZ   rc   r   r   r   	<genexpr>  s    zcorpus_size.<locals>.<genexpr>)r)   sum)Zseqsr   r   r   r`     s    r`   __main__)NNr7   r8   r#   Nr9   r#   Fr   FFNNNr8   NFN)NN)'Z
__future__r   r   r   rC   rG   r^   rJ   Znltk.corpusr   Znltk.tblr   r   rA   r   r	   Znltk.tagr
   r   r   r   r   r   r   r   r   r    r*   r-   r/   r1   r6   r   rB   rM   ZNN_CD_TAGGERr@   r`   __name__r   r   r   r   <module>   sp   	                  
 5(

