B
    >?ð[.(  ã               @   sn   d Z ddlmZmZ ddlmZ ddlmZmZm	Z	m
Z
 ddlmZ G dd„ deƒZdd	„ Zed
krjeƒ  dS )aê  
A classifier based on the Naive Bayes algorithm.  In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label):

|                       P(label) * P(features|label)
|  P(label|features) = ------------------------------
|                              P(features)

The algorithm then makes the 'naive' assumption that all features are
independent, given the label:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                                         P(features)

Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                        SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
é    )Úprint_functionÚunicode_literals)Údefaultdict)ÚFreqDistÚDictionaryProbDistÚELEProbDistÚsum_logs)ÚClassifierIc               @   sT   e Zd ZdZdd„ Zdd„ Zdd„ Zdd	„ Zddd„Zddd„Z	e
efdd„ƒZdS )ÚNaiveBayesClassifiera  
    A Naive Bayes classifier.  Naive Bayes classifiers are
    paramaterized by two probability distributions:

      - P(label) gives the probability that an input will receive each
        label, given no information about the input's features.

      - P(fname=fval|label) gives the probability that a given feature
        (fname) will receive a given value (fval), given that the
        label (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    c             C   s   || _ || _t| ¡ ƒ| _dS )a=  
        :param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a ``ProbDistI`` whose
            samples are labels.  I.e., P(label) =
            ``label_probdist.prob(label)``.

        :param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature
            values.  I.e., P(fname=fval|label) =
            ``feature_probdist[label,fname].prob(fval)``.  If a given
            ``(label,fname)`` is not a key in ``feature_probdist``, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
        N)Ú_label_probdistÚ_feature_probdistÚlistÚsamplesÚ_labels)ÚselfÚlabel_probdistÚfeature_probdist© r   ú7lib/python3.7/site-packages/nltk/classify/naivebayes.pyÚ__init__A   s    zNaiveBayesClassifier.__init__c             C   s   | j S )N)r   )r   r   r   r   ÚlabelsV   s    zNaiveBayesClassifier.labelsc             C   s   |   |¡ ¡ S )N)Úprob_classifyÚmax)r   Ú
featuresetr   r   r   ÚclassifyY   s    zNaiveBayesClassifier.classifyc             C   sä   |  ¡ }x:t| ¡ ƒD ]*}x$| jD ]}||f| jkr"P q"W ||= qW i }x| jD ]}| j |¡||< qPW xl| jD ]b}x\| ¡ D ]P\}}||f| jkrº| j||f }||  | |¡7  < q~||  tg ƒ7  < q~W qpW t	|dddS )NT)Z	normalizeÚlog)
Úcopyr   Úkeysr   r   r   ÚlogprobÚitemsr   r   )r   r   ÚfnameÚlabelr   ÚfvalZfeature_probsr   r   r   r   \   s     
z"NaiveBayesClassifier.prob_classifyé
   c          	      sê   | j ‰ tdƒ xÖ|  |¡D ]È\‰‰‡ ‡‡fdd„}t‡ ‡‡fdd„| jD ƒ|d}t|ƒdkr`q|d }|d	 }ˆ |ˆf  ˆ¡dkrŒd
}n(dˆ |ˆf  ˆ¡ˆ |ˆf  ˆ¡  }tdˆˆd| d d… d| d d… |f ƒ qW d S )NzMost Informative Featuresc                s   ˆ | ˆf   ˆ¡S )N)Úprob)Úl)Úcpdistr    r"   r   r   Ú	labelprob„   s    zFNaiveBayesClassifier.show_most_informative_features.<locals>.labelprobc                s$   g | ]}ˆˆ |ˆf   ¡ kr|‘qS r   )r   )Ú.0r%   )r&   r    r"   r   r   ú
<listcomp>ˆ   s    zGNaiveBayesClassifier.show_most_informative_features.<locals>.<listcomp>)Úkeyé   r   éÿÿÿÿZINFz%8.1fz"%24s = %-14r %6s : %-6s = %s : 1.0z%sé   )r   ÚprintÚmost_informative_featuresÚsortedr   Úlenr$   )r   Únr'   r   Zl0Úl1Zratior   )r&   r    r"   r   Úshow_most_informative_features}   s$    &z3NaiveBayesClassifier.show_most_informative_featureséd   c       	         sà   t | dƒr| jd|… S tƒ }tdd„ ƒ‰ tdd„ ƒ‰x‚| j ¡ D ]t\\}}}xf| ¡ D ]Z}||f}| |¡ | |¡}t	|ˆ | ƒˆ |< t
|ˆ| ƒˆ|< ˆ| dkrX| |¡ qXW qBW t|‡ ‡fdd„d| _| jd|… S )	a—  
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature ``(fname,fval)`` is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label:

        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        Ú_most_informative_featuresNc               S   s   dS )Ng        r   r   r   r   r   Ú<lambda>­   s    z@NaiveBayesClassifier.most_informative_features.<locals>.<lambda>c               S   s   dS )Ng      ð?r   r   r   r   r   r7   ®   s    r   c                s   ˆ|  ˆ |   S )Nr   )Zfeature_)ÚmaxprobÚminprobr   r   r7   ½   s    )r*   )Úhasattrr6   Úsetr   r   r   r   Úaddr$   r   ÚminÚdiscardr0   )	r   r2   Zfeaturesr!   r    Úprobdistr"   ZfeatureÚpr   )r8   r9   r   r/   œ   s"    



z.NaiveBayesClassifier.most_informative_featuresc             C   s@  t ƒ }tt ƒ}ttƒ}tƒ }xf|D ]^\}}||  d7  < xD| ¡ D ]8\}	}
|||	f |
  d7  < ||	  |
¡ | |	¡ qDW q"W xh|D ]`}|| }xR|D ]J}	|||	f  ¡ }|| dkrœ|||	f d  || 7  < ||	  d¡ qœW qŠW ||ƒ}i }x:| ¡ D ].\\}}	}||t||	 ƒd}||||	f< qW | ||ƒS )z‹
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        r+   r   N)Zbins)r   r   r;   r   r<   ÚNr1   )ÚclsZlabeled_featuresetsZ	estimatorZlabel_freqdistZfeature_freqdistZfeature_valuesÚfnamesr   r!   r    r"   Znum_samplesÚcountr   r   Zfreqdistr?   r   r   r   ÚtrainÁ   s.    

zNaiveBayesClassifier.trainN)r#   )r5   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   r   r4   r/   Úclassmethodr   rE   r   r   r   r   r
   ,   s   !

%r
   c              C   s"   ddl m}  | tjƒ}| ¡  d S )Nr   )Ú
names_demo)Znltk.classify.utilrK   r
   rE   r4   )rK   Z
classifierr   r   r   Údemoø   s    
rL   Ú__main__N)rI   Z
__future__r   r   Úcollectionsr   Znltk.probabilityr   r   r   r   Znltk.classify.apir	   r
   rL   rF   r   r   r   r   Ú<module>   s    M