B
    >?[                 @   s  d Z ddlmZmZ yddlZW n ek
r4   Y nX ddlZddlZddlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZmZ ddlmZm Z m!Z! dZ"ej#G dd deZ$e$Z%G dd de&Z'G dd de'Z(G dd de'Z)G dd de)Z*G dd de)Z+G dd de'Z,d1ddZ-d d! Z.d"d# Z/d2d$d%Z0d&d' Z1d(d) Z2d3d*d+Z3G d,d- d-e$Z4d.d/ Z5e6d0kre5  dS )4a  
A classifier model based on maximum entropy modeling framework.  This
framework considers all of the probability distributions that are
empirically consistent with the training data; and chooses the
distribution with the highest entropy.  A probability distribution is
"empirically consistent" with a set of training data if its estimated
frequency with which a class and a feature vector value co-occur is
equal to the actual frequency in the data.

Terminology: 'feature'
======================
The term *feature* is usually used to refer to some property of an
unlabeled token.  For example, when performing word sense
disambiguation, we might define a ``'prevword'`` feature whose value is
the word preceding the target word.  However, in the context of
maxent modeling, the term *feature* is typically used to refer to a
property of a "labeled" token.  In order to prevent confusion, we
will introduce two distinct terms to disambiguate these two different
concepts:

  - An "input-feature" is a property of an unlabeled token.
  - A "joint-feature" is a property of a labeled token.

In the rest of the ``nltk.classify`` module, the term "features" is
used to refer to what we will call "input-features" in this module.

In literature that describes and discusses maximum entropy models,
input-features are typically called "contexts", and joint-features
are simply referred to as "features".

Converting Input-Features to Joint-Features
-------------------------------------------
In maximum entropy models, joint-features are required to have numeric
values.  Typically, each input-feature ``input_feat`` is mapped to a
set of joint-features of the form:

|   joint_feat(token, label) = { 1 if input_feat(token) == feat_val
|                              {      and label == some_label
|                              {
|                              { 0 otherwise

For all values of ``feat_val`` and ``some_label``.  This mapping is
performed by classes that implement the ``MaxentFeatureEncodingI``
interface.
    )print_functionunicode_literalsN)defaultdict)integer_types)compat)gzip_open_unicode)OrderedDict)DictionaryProbDist)ClassifierI)CutoffCheckeraccuracylog_likelihood)
call_megamwrite_megam_fileparse_megam_weights)	call_tadmwrite_tadm_fileparse_tadm_weightsz
epytext enc               @   s   e Zd ZdZd#ddZdd Zdd Zd	d
 Zdd Zdd Z	d$ddZ
d%ddZd&ddZdd ZddddgZed'd!d"ZdS )(MaxentClassifiera  
    A maximum entropy classifier (also known as a "conditional
    exponential classifier").  This classifier is parameterized by a
    set of "weights", which are used to combine the joint-features
    that are generated from a featureset by an "encoding".  In
    particular, the encoding maps each ``(featureset, label)`` pair to
    a vector.  The probability of each label is then computed using
    the following equation::

                                dotprod(weights, encode(fs,label))
      prob(fs|label) = ---------------------------------------------------
                       sum(dotprod(weights, encode(fs,l)) for l in labels)

    Where ``dotprod`` is the dot product::

      dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
    Tc             C   s*   || _ || _|| _| t|ks&tdS )a{  
        Construct a new maxent classifier model.  Typically, new
        classifier models are created using the ``train()`` method.

        :type encoding: MaxentFeatureEncodingI
        :param encoding: An encoding that is used to convert the
            featuresets that are given to the ``classify`` method into
            joint-feature vectors, which are used by the maxent
            classifier model.

        :type weights: list of float
        :param weights:  The feature weight vector for this classifier.

        :type logarithmic: bool
        :param logarithmic: If false, then use non-logarithmic weights.
        N)	_encoding_weights_logarithmiclengthlenAssertionError)selfencodingweightsZlogarithmic r   3lib/python3.7/site-packages/nltk/classify/maxent.py__init__h   s    zMaxentClassifier.__init__c             C   s
   | j  S )N)r   labels)r   r   r   r   r!      s    zMaxentClassifier.labelsc             C   s    || _ | j t|kstdS )z
        Set the feature weight vector for this classifier.
        :param new_weights: The new feature weight vector.
        :type new_weights: list of float
        N)r   r   r   r   r   )r   Znew_weightsr   r   r   set_weights   s    zMaxentClassifier.set_weightsc             C   s   | j S )zg
        :return: The feature weight vector for this classifier.
        :rtype: list of float
        )r   )r   r   r   r   r      s    zMaxentClassifier.weightsc             C   s   |  | S )N)prob_classifymax)r   
featuresetr   r   r   classify   s    zMaxentClassifier.classifyc       	      C   s   i }x| j  D ]z}| j ||}| jrZd}x"|D ]\}}|| j| | 7 }q2W |||< qd}x"|D ]\}}|| j| | 9 }qdW |||< qW t|| jddS )Ng        g      ?T)logZ	normalize)r   r!   encoder   r   r	   )	r   r%   Z	prob_dictlabelfeature_vectortotalf_idf_valZprodr   r   r   r#      s    
zMaxentClassifier.prob_classify   c                s  d}dt |d  d }| t   jdd}|d| }td|d	d
d |D   tdd|d dt|     t	t
xt|D ]\}}j||}|jfdddd x|D ]\}	}
jrj|	 |
 }nj|	 |
 }j|	}|dd }|d|
 7 }t|dkr8|dd d }t|||d d |f  |  |7  < qW qW tdd|d dt|     td|d	fdd|D   td|d	 fdd|D   dS )z
        Print a table showing the effect of each of the features in
        the given feature set, and how they combine to determine the
        probabilities of each label for that featureset.
        2   z  %-   zs%s%8.3fT)keyreverseNz	  Feature c             s   s"   | ]}d d| dd  V  qdS )z%8sz%sN   r   ).0lr   r   r   	<genexpr>   s    z+MaxentClassifier.explain.<locals>.<genexpr>z  -   c                s   t  j| d  S )Nr   )absr   )Zfid__)r   r   r   <lambda>   s    z*MaxentClassifier.explain.<locals>.<lambda>z and label is r   z (%s)/   ,   z...    z  TOTAL:c             3   s   | ]}d  |  V  qdS )z%8.3fNr   )r5   r6   )sumsr   r   r7      s    z  PROBS:c             3   s   | ]}d   | V  qdS )z%8.3fN)prob)r5   r6   )pdistr   r   r7      s    )strr#   sortedsamplesrA   printljustjoinr   r   int	enumerater   r(   sortr   r   describesplit)r   r%   columnsZdescr_widthTEMPLATEr!   ir)   r*   r,   r-   ZscoreZdescrr   )rB   r   r@   r   explain   s>    
  $zMaxentClassifier.explain
   c                sP   t  dr jd| S tttt j fdddd _ jd| S dS )zW
        Generates the ranked list of informative features from most to least.
        _most_informative_featuresNc                s   t  j|  S )N)r:   r   )fid)r   r   r   r;      s    z<MaxentClassifier.most_informative_features.<locals>.<lambda>T)r1   r2   )hasattrrS   rD   listranger   r   )r   nr   )r   r   most_informative_features   s    


z*MaxentClassifier.most_informative_featuresallc                sx     d}|dkr& fdd|D }n|dkr@ fdd|D }x2|d| D ]"}td j|  j|f  qNW dS )z
        :param show: all, neg, or pos (for negative-only or positive-only)
        :type show: str
        :param n: The no. of top features
        :type n: int
        Nposc                s   g | ]} j | d kr|qS )r   )r   )r5   rT   )r   r   r   
<listcomp>   s    zCMaxentClassifier.show_most_informative_features.<locals>.<listcomp>negc                s   g | ]} j | d k r|qS )r   )r   )r5   rT   )r   r   r   r\      s    z%8.3f %s)rY   rF   r   r   rL   )r   rX   ZshowZfidsrT   r   )r   r   show_most_informative_features   s    
z/MaxentClassifier.show_most_informative_featuresc             C   s   dt | j | j f S )Nz:<ConditionalExponentialClassifier: %d labels, %d features>)r   r   r!   r   )r   r   r   r   __repr__   s    zMaxentClassifier.__repr__ZGISZIISZMEGAMZTADMN   r   c       
      K   s   |dkrd}x |D ]}|dkrt d| qW | }|dkrPt||||f|S |dkrjt||||f|S |dkrt|||||f|S |dkr|}	||	d< ||	d	< ||	d
< ||	d< tj|f|	S td| dS )a	  
        Train a new maxent classifier based on the given corpus of
        training samples.  This classifier will have its weights
        chosen to maximize entropy while remaining empirically
        consistent with the training corpus.

        :rtype: MaxentClassifier
        :return: The new maxent classifier

        :type train_toks: list
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a featureset,
            and the second of which is a classification label.

        :type algorithm: str
        :param algorithm: A case-insensitive string, specifying which
            algorithm should be used to train the classifier.  The
            following algorithms are currently available.

            - Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``),
              Improved Iterative Scaling (``'IIS'``)
            - External Libraries (requiring megam):
              LM-BFGS algorithm, with training performed by Megam (``'megam'``)

            The default algorithm is ``'IIS'``.

        :type trace: int
        :param trace: The level of diagnostic tracing output to produce.
            Higher values produce more verbose output.
        :type encoding: MaxentFeatureEncodingI
        :param encoding: A feature encoding, used to convert featuresets
            into feature vectors.  If none is specified, then a
            ``BinaryMaxentFeatureEncoding`` will be built based on the
            features that are attested in the training corpus.
        :type labels: list(str)
        :param labels: The set of possible labels.  If none is given, then
            the set of all labels attested in the training data will be
            used instead.
        :param gaussian_prior_sigma: The sigma value for a gaussian
            prior on model weights.  Currently, this is supported by
            ``megam``. For other algorithms, its value is ignored.
        :param cutoffs: Arguments specifying various conditions under
            which the training should be halted.  (Some of the cutoff
            conditions are not supported by some algorithms.)

            - ``max_iter=v``: Terminate after ``v`` iterations.
            - ``min_ll=v``: Terminate after the negative average
              log-likelihood drops under ``v``.
            - ``min_lldelta=v``: Terminate if a single iteration improves
              log likelihood by less than ``v``.
        NZiis)	max_iterZmin_llmin_lldeltaZmax_accZmin_accdeltacount_cutoffZnormexplicit	bernoullizUnexpected keyword arg %rZgisZmegamZtadmtracer   r!   gaussian_prior_sigmazUnknown algorithm %s)	TypeErrorlower train_maxent_classifier_with_iis train_maxent_classifier_with_gis"train_maxent_classifier_with_megamTadmMaxentClassifiertrain
ValueError)
cls
train_toks	algorithmrf   r   r!   rg   cutoffsr1   kwargsr   r   r   rn      s0    >

zMaxentClassifier.train)T)r.   )rR   )rR   rZ   )Nr`   NNr   )__name__
__module____qualname____doc__r    r!   r"   r   r&   r#   rQ   rY   r^   r_   Z
ALGORITHMSclassmethodrn   r   r   r   r   r   T   s$   
	
,

    r   c               @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )MaxentFeatureEncodingIa  
    A mapping that converts a set of input-feature values to a vector
    of joint-feature values, given a label.  This conversion is
    necessary to translate featuresets into a format that can be used
    by maximum entropy models.

    The set of joint-features used by a given encoding is fixed, and
    each index in the generated joint-feature vectors corresponds to a
    single joint-feature.  The length of the generated joint-feature
    vectors is therefore constant (for a given encoding).

    Because the joint-feature vectors generated by
    ``MaxentFeatureEncodingI`` are typically very sparse, they are
    represented as a list of ``(index, value)`` tuples, specifying the
    value of each non-zero joint-feature.

    Feature encodings are generally created using the ``train()``
    method, which generates an appropriate encoding based on the
    input-feature values and labels that are present in a given
    corpus.
    c             C   s
   t  dS )aC  
        Given a (featureset, label) pair, return the corresponding
        vector of joint-feature values.  This vector is represented as
        a list of ``(index, value)`` tuples, specifying the value of
        each non-zero joint-feature.

        :type featureset: dict
        :rtype: list(tuple(int, int))
        N)NotImplementedError)r   r%   r)   r   r   r   r(     s    
zMaxentFeatureEncodingI.encodec             C   s
   t  dS )z
        :return: The size of the fixed-length joint-feature vectors
            that are generated by this encoding.
        :rtype: int
        N)r{   )r   r   r   r   r     s    zMaxentFeatureEncodingI.lengthc             C   s
   t  dS )z
        :return: A list of the "known labels" -- i.e., all labels
            ``l`` such that ``self.encode(fs,l)`` can be a nonzero
            joint-feature vector for some value of ``fs``.
        :rtype: list
        N)r{   )r   r   r   r   r!     s    zMaxentFeatureEncodingI.labelsc             C   s
   t  dS )z
        :return: A string describing the value of the joint-feature
            whose index in the generated feature vectors is ``fid``.
        :rtype: str
        N)r{   )r   rT   r   r   r   rL     s    zMaxentFeatureEncodingI.describec             C   s
   t  dS )ao  
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.
        N)r{   )rp   rq   r   r   r   rn     s    
zMaxentFeatureEncodingI.trainN)	ru   rv   rw   rx   r(   r   r!   rL   rn   r   r   r   r   rz   k  s   	rz   c               @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )#FunctionBackedMaxentFeatureEncodingz
    A feature encoding that calls a user-supplied function to map a
    given featureset/label pair to a sparse joint-feature vector.
    c             C   s   || _ || _|| _dS )ag  
        Construct a new feature encoding based on the given function.

        :type func: (callable)
        :param func: A function that takes two arguments, a featureset
             and a label, and returns the sparse joint feature vector
             that encodes them::

                 func(featureset, label) -> feature_vector

             This sparse joint feature vector (``feature_vector``) is a
             list of ``(index,value)`` tuples.

        :type length: int
        :param length: The size of the fixed-length joint-feature
            vectors that are generated by this encoding.

        :type labels: list
        :param labels: A list of the "known labels" for this
            encoding -- i.e., all labels ``l`` such that
            ``self.encode(fs,l)`` can be a nonzero joint-feature vector
            for some value of ``fs``.
        N)_length_func_labels)r   funcr   r!   r   r   r   r      s    z,FunctionBackedMaxentFeatureEncoding.__init__c             C   s   |  ||S )N)r~   )r   r%   r)   r   r   r   r(     s    z*FunctionBackedMaxentFeatureEncoding.encodec             C   s   | j S )N)r}   )r   r   r   r   r     s    z*FunctionBackedMaxentFeatureEncoding.lengthc             C   s   | j S )N)r   )r   r   r   r   r!     s    z*FunctionBackedMaxentFeatureEncoding.labelsc             C   s   dS )Nzno description availabler   )r   rT   r   r   r   rL     s    z,FunctionBackedMaxentFeatureEncoding.describeN)	ru   rv   rw   rx   r    r(   r   r!   rL   r   r   r   r   r|     s   r|   c               @   sH   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Ze	dddZ
dS )BinaryMaxentFeatureEncodinga  
    A feature encoding that generates vectors containing a binary
    joint-features of the form:

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.  This method will create one
    feature for each combination of ``fname``, ``fval``, and ``label``
    that occurs at least once in the training corpus.

    The ``unseen_features`` parameter can be used to add "unseen-value
    features", which are used whenever an input feature has a value
    that was not encountered in the training corpus.  These features
    have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form::

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    Fc                s   t | t tt|kr$tdt| _| _t| _d _	d _
|r~t fddt|D  _	  jt j	7  _|rt dd |D }t fddt|D  _
  jt|7  _dS )a  
        :param labels: A list of the "known labels" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode(..., fname:fval, ..., label)[id]`` is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
           features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
           features in the generated joint-feature vectors.
        zHMapping values must be exactly the set of integers from 0...len(mapping)Nc             3   s    | ]\}}|| j  fV  qd S )N)r}   )r5   rP   r)   )r   r   r   r7   4  s    z7BinaryMaxentFeatureEncoding.__init__.<locals>.<genexpr>c             s   s   | ]\}}}|V  qd S )Nr   )r5   fnamefvalr)   r   r   r   r7   9  s    c             3   s    | ]\}}|| j  fV  qd S )N)r}   )r5   rP   r   )r   r   r   r7   ;  s    )setvaluesrW   r   ro   rV   r   _mappingr}   	_alwayson_unseendictrJ   )r   r!   mappingunseen_featuresalwayson_featuresfnamesr   )r   r   r      s"    

z$BinaryMaxentFeatureEncoding.__init__c             C   s   g }x|  D ]z\}}|||f| jkrB|| j|||f df q| jrx>| jD ]}|||f| jkrPP qPW || jkr|| j| df qW | jr|| jkr|| j| df |S )Nr?   )itemsr   appendr   r   r   )r   r%   r)   r   r   r   label2r   r   r   r(   ?  s    
z"BinaryMaxentFeatureEncoding.encodec             C   s  t |tstdy
| j W nH tk
rd   dgt| j | _x | j D ]\}}|| j|< qJW Y nX |t| jk r| j| \}}}d|||f S | jr|| j	 krxr| j D ]\}}||krd| S qW nJ| j
r|| j
	 krx0| j
 D ]\}}||krd| S qW ntdd S )Nzdescribe() expected an intz%s==%r and label is %rzlabel is %rz%s is unseenzBad feature id)
isinstancer   rh   _inv_mappingAttributeErrorr   r   r   r   r   r   ro   )r   r,   inforP   r   r   r)   f_id2r   r   r   rL   Z  s(    

z$BinaryMaxentFeatureEncoding.describec             C   s   | j S )N)r   )r   r   r   r   r!   s  s    z"BinaryMaxentFeatureEncoding.labelsc             C   s   | j S )N)r}   )r   r   r   r   r   w  s    z"BinaryMaxentFeatureEncoding.lengthr   Nc             K   s   i }t  }tt}x|D ]\}}	|r8|	|kr8td|	 ||	 xX| D ]L\}
}||
|f  d7  < ||
|f |krL|
||	f|krLt|||
||	f< qLW qW |dkr|}| ||f|S )a  
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``BinaryMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature's value is 1
            fewer than ``count_cutoff`` times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.
        zUnexpected label %sr?   N)r   r   rI   ro   addr   r   )rp   rq   rc   r!   optionsr   seen_labelscounttokr)   r   r   r   r   r   rn   {  s    
z!BinaryMaxentFeatureEncoding.train)FF)r   N)ru   rv   rw   rx   r    r(   rL   r!   r   ry   rn   r   r   r   r   r     s   '
3r   c               @   s>   e Zd ZdZdddZedd Zdd	 Zd
d Zdd Z	dS )GISEncodinga  
    A binary feature encoding which adds one new joint-feature to the
    joint-features defined by ``BinaryMaxentFeatureEncoding``: a
    correction feature, whose value is chosen to ensure that the
    sparse vector always sums to a constant non-negative number.  This
    new feature is used to ensure two preconditions for the GIS
    training algorithm:

      - At least one feature vector index must be nonzero for every
        token.
      - The feature vector must sum to a constant non-negative number
        for every token.
    FNc             C   s>   t | |||| |dkr4ttdd |D d }|| _dS )a	  
        :param C: The correction constant.  The value of the correction
            feature is based on this value.  In particular, its value is
            ``C - sum([v for (f,v) in encoding])``.
        :seealso: ``BinaryMaxentFeatureEncoding.__init__``
        Nc             s   s   | ]\}}}|V  qd S )Nr   )r5   r   r   r)   r   r   r   r7     s    z'GISEncoding.__init__.<locals>.<genexpr>r?   )r   r    r   r   _C)r   r!   r   r   r   Cr   r   r   r      s
    	zGISEncoding.__init__c             C   s   | j S )zOThe non-negative constant that all encoded feature vectors
        will sum to.)r   )r   r   r   r   r     s    zGISEncoding.Cc             C   sT   t | ||}t | }tdd |D }|| jkr<td||| j| f |S )Nc             s   s   | ]\}}|V  qd S )Nr   )r5   fvr   r   r   r7     s    z%GISEncoding.encode.<locals>.<genexpr>z&Correction feature is not high enough!)r   r(   r   sumr   ro   r   )r   r%   r)   r   Zbase_lengthr+   r   r   r   r(     s    

zGISEncoding.encodec             C   s   t | d S )Nr?   )r   r   )r   r   r   r   r     s    zGISEncoding.lengthc             C   s(   |t | krd| j S t | |S d S )NzCorrection feature (%s))r   r   r   rL   )r   r,   r   r   r   rL     s    
zGISEncoding.describe)FFN)
ru   rv   rw   rx   r    propertyr   r(   r   rL   r   r   r   r   r     s   
r   c               @   sD   e Zd ZdddZdd Zdd Zdd	 Zd
d ZedddZ	dS )TadmEventMaxentFeatureEncodingFc             C   s*   t || _t  | _t| || j|| d S )N)r   r   _label_mappingr   r    )r   r!   r   r   r   r   r   r   r      s    
z'TadmEventMaxentFeatureEncoding.__init__c             C   s   g }x|  D ]x\}}||f| jkr8t| j| j||f< || jkrht|ts^t| j| j|< n
|| j|< || j||f | j| f qW |S )N)r   r   r   r   r   rI   r   )r   r%   r)   r   featurevaluer   r   r   r(     s    


z%TadmEventMaxentFeatureEncoding.encodec             C   s   | j S )N)r   )r   r   r   r   r!     s    z%TadmEventMaxentFeatureEncoding.labelsc             C   s2   x,| j D ]"\}}| j ||f |kr||fS qW d S )N)r   )r   rT   r   r)   r   r   r   rL     s    z'TadmEventMaxentFeatureEncoding.describec             C   s
   t | jS )N)r   r   )r   r   r   r   r     s    z%TadmEventMaxentFeatureEncoding.lengthr   Nc       	      K   s   t  }|sg }t|}x"|D ]\}}||kr|| qW xH|D ]@\}}x6|D ].}x(|D ] }||f|krXt||||f< qXW qNW q@W | ||f|S )N)r   rV   r   r   )	rp   rq   rc   r!   r   r   r%   r)   r   r   r   r   rn     s    

z$TadmEventMaxentFeatureEncoding.train)FF)r   N)
ru   rv   rw   r    r(   r!   rL   r   ry   rn   r   r   r   r   r     s   
r   c               @   sH   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Ze	dddZ
dS )TypedMaxentFeatureEncodingaZ  
    A feature encoding that generates vectors containing integer,
    float and binary joint-features of the form:

    Binary (for string and boolean features):

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Value (for integer and float features):

    |  joint_feat(fs, l) = { fval if     (fs[fname] == type(fval))
    |                      {         and (l == label)
    |                      {
    |                      { not encoded otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.

    For string and boolean features [type(fval) not in (int, float)]
    this method will create one feature for each combination of
    ``fname``, ``fval``, and ``label`` that occurs at least once in the
    training corpus.

    For integer and float features [type(fval) in (int, float)] this
    method will create one feature for each combination of ``fname``
    and ``label`` that occurs at least once in the training corpus.

    For binary features the ``unseen_features`` parameter can be used
    to add "unseen-value features", which are used whenever an input
    feature has a value that was not encountered in the training
    corpus.  These features have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form:

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    Fc                s   t | t tt|kr$tdt| _| _t| _d _	d _
|r~t fddt|D  _	  jt j	7  _|rt dd |D }t fddt|D  _
  jt|7  _dS )a  
        :param labels: A list of the "known labels" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode({..., fname:fval, ...``, label)[id]} is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
           features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
           features in the generated joint-feature vectors.
        zHMapping values must be exactly the set of integers from 0...len(mapping)Nc             3   s    | ]\}}|| j  fV  qd S )N)r}   )r5   rP   r)   )r   r   r   r7     s    z6TypedMaxentFeatureEncoding.__init__.<locals>.<genexpr>c             s   s   | ]\}}}|V  qd S )Nr   )r5   r   r   r)   r   r   r   r7     s    c             3   s    | ]\}}|| j  fV  qd S )N)r}   )r5   rP   r   )r   r   r   r7     s    )r   r   rW   r   ro   rV   r   r   r}   r   r   r   rJ   )r   r!   r   r   r   r   r   )r   r   r    ^  s"    

z#TypedMaxentFeatureEncoding.__init__c             C   s   g }x|  D ]\}}t|ttfrX|t||f| jkr|| j|t||f |f q|||f| jkr|| j|||f df q| jrx>| jD ]}|||f| jkrP qW || jkr|| j| df qW | j	r|| j	kr|| j	| df |S )Nr?   )
r   r   r   floattyper   r   r   r   r   )r   r%   r)   r   r   r   r   r   r   r   r(     s      
z!TypedMaxentFeatureEncoding.encodec             C   s  t |tstdy
| j W nH tk
rd   dgt| j | _x | j D ]\}}|| j|< qJW Y nX |t| jk r| j| \}}}d|||f S | jr|| j	 krxr| j D ]\}}||krd| S qW nJ| j
r|| j
	 krx0| j
 D ]\}}||krd| S qW ntdd S )Nzdescribe() expected an intr   z%s==%r and label is %rzlabel is %rz%s is unseenzBad feature id)r   r   rh   r   r   r   r   r   r   r   r   ro   )r   r,   r   rP   r   r   r)   r   r   r   r   rL     s(    

z#TypedMaxentFeatureEncoding.describec             C   s   | j S )N)r   )r   r   r   r   r!     s    z!TypedMaxentFeatureEncoding.labelsc             C   s   | j S )N)r}   )r   r   r   r   r     s    z!TypedMaxentFeatureEncoding.lengthr   Nc             K   s   i }t  }tt}x|D ]\}}	|r8|	|kr8td|	 ||	 xp| D ]d\}
}t|ttfkrlt|}||
|f  d7  < ||
|f |krL|
||	f|krLt|||
||	f< qLW qW |dkr|}| ||f|S )a)  
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``TypedMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        Note: recognized feature values types are (int, float), over
        types are interpreted as regular binary features.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature's value is 1
            fewer than ``count_cutoff`` times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.
        zUnexpected label %sr?   N)	r   r   rI   ro   r   r   r   r   r   )rp   rq   rc   r!   r   r   r   r   r   r)   r   r   r   r   r   rn     s"    
z TypedMaxentFeatureEncoding.train)FF)r   N)ru   rv   rw   rx   r    r(   rL   r!   r   ry   rn   r   r   r   r   r   %  s   7
3 r   r`   c             K   s  | dd t|}|dkr*tj| |d}t|ds<tdd|j }t| |}tt	
|dkd }t	t|d	}	x|D ]}
t	j|	|
< q|W t||	}t	|}~|dkrtd
|d   |dkrt  td td yx|dkr"|jpt|| }|jpt|| }|j}td|||f  t|| |}x|D ]}
||
  d7  < q4W t	|}~| }	|	|| | 7 }	||	 ||| rP qW W n* tk
r   td Y n    Y nX |dkrt|| }t|| }td||f  |S )a  
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Generalized Iterative Scaling
    algorithm.  This ``ConditionalExponentialClassifier`` will encode
    the model that maximizes entropy from all the models that are
    empirically consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    ra   d   N)r!   r   zJThe GIS algorithm requires an encoding that defines C (e.g., GISEncoding).g      ?r   dz  ==> Training (%d iterations)r0   z-      Iteration    Log Likelihood    Accuracyz-      ---------------------------------------z     %9d    %14.5f    %9.3fr?   z*      Training stopped: keyboard interruptz!         Final    %14.5f    %9.3f)
setdefaultr   r   rn   rU   rh   r   calculate_empirical_fcountr   numpynonzerozerosr   NINF ConditionalExponentialClassifierlog2rF   llr   accr   itercalculate_estimated_fcountr   r"   checkKeyboardInterrupt)rq   rf   r   r!   rs   cutoffcheckerZCinvZempirical_fcount
unattestedr   rT   
classifierZlog_empirical_fcountr   r   iternumZestimated_fcountZlog_estimated_fcountr   r   r   rk     sb    














rk   c             C   sP   t | d}x:| D ]2\}}x(|||D ]\}}||  |7  < q,W qW |S )Nr   )r   r   r   r(   )rq   r   fcountr   r)   indexvalr   r   r   r   r  s
    r   c       
      C   sz   t | d}xd|D ]\\}}| |}xH| D ]<}||}x,|||D ]\}}	||  ||	 7  < qNW q2W qW |S )Nr   )r   r   r   r#   rE   rA   r(   )
r   rq   r   r   r   r)   rB   rA   rT   r   r   r   r   r   |  s    

 r   c          	   K   s  | dd t|}|dkr*tj| |d}t| |t|  }t| |}tt	||j
dd}t|t|df}	tt|dkd }
tt|d}x|
D ]}tj||< qW t||}|dkrtd	|d   |d
krt  td td yx|d
kr8|jpt|| }|jpt|| }|j}td|||f  t| ||
||||	|}| }||7 }|| ||| rP qW W n* tk
r   td Y n    Y nX |d
krt|| }t|| }td||f  |S )a  
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Improved Iterative Scaling algorithm.
    This ``ConditionalExponentialClassifier`` will encode the model
    that maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    ra   r   N)r!   )r1   r   r?   r   z  ==> Training (%d iterations)r0   z-      Iteration    Log Likelihood    Accuracyz-      ---------------------------------------z     %9d    %14.5f    %9.3fz*      Training stopped: keyboard interruptz!         Final    %14.5f    %9.3f)r   r   r   rn   r   r   calculate_nfmapr   ZarrayrD   __getitem__Zreshaper   r   r   r   r   rF   r   r   r   r   r   calculate_deltasr   r"   r   r   )rq   rf   r   r!   rs   r   Zempirical_ffreqnfmapnfarraynftransposer   r   rT   r   r   r   r   deltasr   r   r   rj     sb    








rj   c          
   C   s`   t  }xB| D ]:\}}x0| D ]$}|tdd |||D  qW qW tdd t|D S )a  
    Construct a map that can be used to compress ``nf`` (which is
    typically sparse).

    *nf(feature_vector)* is the sum of the feature values for
    *feature_vector*.

    This represents the number of features that are active for a
    given labeled text.  This method finds all values of *nf(t)*
    that are attested for at least one token in the given list of
    training tokens; and constructs a dictionary mapping these
    attested values to a continuous range *0...N*.  For example,
    if the only values of *nf()* that were attested were 3, 5, and
    7, then ``_nfmap`` might return the dictionary ``{3:0, 5:1, 7:2}``.

    :return: A map that can be used to compress ``nf`` to a dense
        vector.
    :rtype: dict(int -> int)
    c             s   s   | ]\}}|V  qd S )Nr   )r5   idr   r   r   r   r7     s    z"calculate_nfmap.<locals>.<genexpr>c             s   s   | ]\}}||fV  qd S )Nr   )r5   rP   nfr   r   r   r7     s    )r   r!   r   r   r(   r   rJ   )rq   r   Znfsetr   _r)   r   r   r   r     s
    (r   c          	   C   sp  d}d}	t | d}
t t|| fd}x~| D ]v\}}||}xb| D ]V}|||}tdd |D }x2|D ]*\}}||| |f  |	|| 7  < qzW qRW q6W |t|  }xt
|	D ]}t ||
}d| }|| }t j|| dd}t j|| dd}x|D ]}||  d	7  < qW |
|| |  8 }
t t|| t t|
 }||k r|
S qW |
S )
a
  
    Calculate the update values for the classifier weights for
    this iteration of IIS.  These update weights are the value of
    ``delta`` that solves the equation::

      ffreq_empirical[i]
             =
      SUM[fs,l] (classifier.prob_classify(fs).prob(l) *
                 feature_vector(fs,l)[i] *
                 exp(delta[i] * nf(feature_vector(fs,l))))

    Where:
        - *(fs,l)* is a (featureset, label) tuple from ``train_toks``
        - *feature_vector(fs,l)* = ``encoding.encode(fs,l)``
        - *nf(vector)* = ``sum([val for (id,val) in vector])``

    This method uses Newton's method to solve this equation for
    *delta[i]*.  In particular, it starts with a guess of
    ``delta[i]`` = 1; and iteratively updates ``delta`` with:

    | delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i])

    until convergence, where *sum1* and *sum2* are defined as:

    |    sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta)
    |    sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l)))
    |    f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) .
    |                        feature_vector(fs,l)[i] .
    |                        exp(delta[i] . nf(feature_vector(fs,l))))

    Note that *sum1* and *sum2* depend on ``delta``; so they need
    to be re-computed each iteration.

    The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are
    used to generate a dense encoding for *nf(ltext)*.  This
    allows ``_deltas`` to calculate *sum1* and *sum2* using
    matrices, which yields a significant performance improvement.

    :param train_toks: The set of training tokens.
    :type train_toks: list(tuple(dict, str))
    :param classifier: The current classifier.
    :type classifier: ClassifierI
    :param ffreq_empirical: An array containing the empirical
        frequency for each feature.  The *i*\ th element of this
        array is the empirical frequency for feature *i*.
    :type ffreq_empirical: sequence of float
    :param unattested: An array that is 1 for features that are
        not attested in the training data; and 0 for features that
        are attested.  In other words, ``unattested[i]==0`` iff
        ``ffreq_empirical[i]==0``.
    :type unattested: sequence of int
    :param nfmap: A map that can be used to compress ``nf`` to a dense
        vector.
    :type nfmap: dict(int -> int)
    :param nfarray: An array that can be used to uncompress ``nf``
        from a dense vector.
    :type nfarray: array(float)
    :param nftranspose: The transpose of ``nfarray``
    :type nftranspose: array(float)
    g-q=i,  r   c             s   s   | ]\}}|V  qd S )Nr   )r5   r   r   r   r   r   r7   `  s    z#calculate_deltas.<locals>.<genexpr>r0   r   )Zaxisr?   )r   Zonesr   r   r   r#   r!   r(   r   rA   rW   Zouterr:   )rq   r   r   Zffreq_empiricalr   r   r   r   ZNEWTON_CONVERGEZ
MAX_NEWTONr   Ar   r)   Zdistr*   r   r   r   ZrangenumZnf_deltaZexp_nf_deltaZnf_exp_nf_deltaZsum1Zsum2rT   Zn_errorr   r   r   r     s2    I
.

 r   c          
   K   s&  d}d}d|kr|d }d|kr(|d }|dkrP| dd}tj| ||dd}n|dk	r`tdyFtjd	d
\}	}
t|
d}t| ||||d W dQ R X t	|	 W n4 t
ttfk
r } ztd| W dd}~X Y nX g }|dddg7 }|r|dg7 }|s|dg7 }|r d|d  }nd}|dd| dg7 }|dk rJ|dg7 }d|krh|dd|d  g7 }d|kr|ddt|d  g7 }t|dr|d g7 }|d!|
g7 }t|}yt|
 W n8 t
tfk
r } ztd"|
|f  W dd}~X Y nX t|| |}|ttj9 }t||S )#a  
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the external ``megam`` library.  This
    ``ConditionalExponentialClassifier`` will encode the model that
    maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    :see: ``nltk.classify.megam``
    Trd   re   Nrc   r   )r!   r   z$Specify encoding or labels, not bothznltk-)prefixw)rd   re   z,Error while creating megam training file: %sz-nobiasz-repeatZ10z	-explicitz-fvalsg      ?r0   z-lambdaz%.2fz-tuner`   z-quietra   z-maxiz%sll_deltaz-dppZcostz-multilabelZ
multiclassz Warning: unable to delete %s: %s)getr   rn   ro   tempfilemkstempopenr   oscloseOSErrorIOErrorr:   rU   r   removerF   r   r   r   r   er   )rq   rf   r   r!   rg   rt   rd   re   rc   fdtrainfile_name	trainfiler   r   Zinv_variancestdoutr   r   r   r   rl     s^    






"rl   c               @   s   e Zd Zedd ZdS )rm   c          	   K   s  | dd}| dd}| dd }| dd }| dd}| d	d}| d
}	| d}
|sptj|||d}tjddd\}}tjdd\}}t|d}t||| |  g }|dg |d|g |r|dd|d  g |	r|dd|	 g |
r|ddt	|
 g |d|g |d|g |dk rN|dg n|dg t
| t|d}t|}W d Q R X t| t| |ttj9 }| ||S ) Nrr   Ztao_lmvmrf   r`   r   r!   rg   r   rc   ra   rb   )r!   znltk-tadm-events-z.gz)r   suffixznltk-tadm-weights-)r   r   z-monitorz-methodz-l2z%.6fr0   z-max_itz%dz-fatolz
-events_inz-params_outz2>&1z-summaryr)r   r   rn   r   r   r   r   r   extendr:   r   r   r   r   r   r   r   r   )rp   rq   rt   rr   rf   r   r!   Zsigmarc   ra   r   Ztrainfile_fdr   Zweightfile_fdZweightfile_namer   r   Z
weightfiler   r   r   r   rn     sL    





zTadmMaxentClassifier.trainN)ru   rv   rw   ry   rn   r   r   r   r   rm     s   rm   c              C   s   ddl m}  | tj}d S )Nr   )
names_demo)nltk.classify.utilr   r   rn   )r   r   r   r   r   demo%  s    r   __main__)r`   NN)r`   NN)r`   NNr   )7rx   Z
__future__r   r   r   ImportErrorr   r   collectionsr   Zsixr   Znltkr   Z	nltk.datar   Z	nltk.utilr   Znltk.probabilityr	   Znltk.classify.apir
   r   r   r   r   Znltk.classify.megamr   r   r   Znltk.classify.tadmr   r   r   Z__docformat__Zpython_2_unicode_compatibler   r   objectrz   r|   r   r   r   r   rk   r   r   rj   r   r   rl   rm   r   ru   r   r   r   r   <module>5   sV     I/ N=8 m
a

[ 
[=
