ó
ù`]c           @  s—   d  Z  d d l m Z m Z d d l m Z d d l m Z m Z m	 Z	 m
 Z
 d d l m Z d e f d „  ƒ  YZ d „  Z e d	 k r“ e ƒ  n  d
 S(   uê  
A classifier based on the Naive Bayes algorithm.  In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label):

|                       P(label) * P(features|label)
|  P(label|features) = ------------------------------
|                              P(features)

The algorithm then makes the 'naive' assumption that all features are
independent, given the label:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                                         P(features)

Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                        SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
iÿÿÿÿ(   t   print_functiont   unicode_literals(   t   defaultdict(   t   FreqDistt   DictionaryProbDistt   ELEProbDistt   sum_logs(   t   ClassifierIt   NaiveBayesClassifierc           B  s\   e  Z d  Z d „  Z d „  Z d „  Z d „  Z d d „ Z d d „ Z e	 e
 d	 „ ƒ Z RS(
   u  
    A Naive Bayes classifier.  Naive Bayes classifiers are
    paramaterized by two probability distributions:

      - P(label) gives the probability that an input will receive each
        label, given no information about the input's features.

      - P(fname=fval|label) gives the probability that a given feature
        (fname) will receive a given value (fval), given that the
        label (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    c         C  s+   | |  _  | |  _ t | j ƒ  ƒ |  _ d S(   u=  
        :param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a ``ProbDistI`` whose
            samples are labels.  I.e., P(label) =
            ``label_probdist.prob(label)``.

        :param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature
            values.  I.e., P(fname=fval|label) =
            ``feature_probdist[label,fname].prob(fval)``.  If a given
            ``(label,fname)`` is not a key in ``feature_probdist``, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
        N(   t   _label_probdistt   _feature_probdistt   listt   samplest   _labels(   t   selft   label_probdistt   feature_probdist(    (    s7   lib/python2.7/site-packages/nltk/classify/naivebayes.pyt   __init__A   s    		c         C  s   |  j  S(   N(   R   (   R   (    (    s7   lib/python2.7/site-packages/nltk/classify/naivebayes.pyt   labelsV   s    c         C  s   |  j  | ƒ j ƒ  S(   N(   t   prob_classifyt   max(   R   t
   featureset(    (    s7   lib/python2.7/site-packages/nltk/classify/naivebayes.pyt   classifyY   s    c         C  s.  | j  ƒ  } xN t | j ƒ  ƒ D]: } x1 |  j D] } | | f |  j k r/ Pq/ q/ W| | =q Wi  } x' |  j D] } |  j j | ƒ | | <qm Wxˆ |  j D]} } xt | j ƒ  D]f \ } } | | f |  j k rú |  j | | f } | | c | j | ƒ 7<qª | | c t g  ƒ 7<qª Wq— Wt	 | d t
 d t
 ƒS(   Nt	   normalizet   log(   t   copyR   t   keysR   R
   R	   t   logprobt   itemsR   R   t   True(   R   R   t   fnamet   labelR   t   fvalt   feature_probs(    (    s7   lib/python2.7/site-packages/nltk/classify/naivebayes.pyR   \   s     i
   c           sE  |  j  ‰  t d ƒ x+|  j | ƒ D]\ ‰ ‰ ‡  ‡ ‡ f d †  } t g  |  j D]( } ˆ ˆ  | ˆ f j ƒ  k rQ | ^ qQ d | ƒ} t | ƒ d k r  q# n  | d } | d } ˆ  | ˆ f j ˆ ƒ d k rÜ d } n4 d ˆ  | ˆ f j ˆ ƒ ˆ  | ˆ f j ˆ ƒ } t d	 ˆ ˆ d
 | d  d
 | d  | f ƒ q# Wd  S(   Nu   Most Informative Featuresc           s   ˆ  |  ˆ f j  ˆ ƒ S(   N(   t   prob(   t   l(   t   cpdistR   R    (    s7   lib/python2.7/site-packages/nltk/classify/naivebayes.pyt	   labelprob„   s    t   keyi   i    iÿÿÿÿu   INFu   %8.1fu"   %24s = %-14r %6s : %-6s = %s : 1.0u   %si   (   R
   t   printt   most_informative_featurest   sortedR   R   t   lenR"   (   R   t   nR%   R#   R   t   l0t   l1t   ratio(    (   R$   R   R    s7   lib/python2.7/site-packages/nltk/classify/naivebayes.pyt   show_most_informative_features}   s$    	
8	

	1id   c   	        s   t  |  d ƒ r |  j |  St ƒ  } t d „  ƒ ‰  t d „  ƒ ‰ x° |  j j ƒ  D]Ÿ \ \ } } } xŠ | j ƒ  D]| } | | f } | j | ƒ | j | ƒ } t	 | ˆ  | ƒ ˆ  | <t
 | ˆ | ƒ ˆ | <ˆ | d k rp | j | ƒ qp qp WqQ Wt | d ‡  ‡ f d †  ƒ|  _ |  j |  S(   u—  
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature ``(fname,fval)`` is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label:

        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        u   _most_informative_featuresc           S  s   d S(   Ng        (    (    (    (    s7   lib/python2.7/site-packages/nltk/classify/naivebayes.pyt   <lambda>­   t    c           S  s   d S(   Ng      ð?(    (    (    (    s7   lib/python2.7/site-packages/nltk/classify/naivebayes.pyR0   ®   R1   i    R&   c           s   ˆ |  ˆ  |  S(   N(    (   t   feature_(   t   maxprobt   minprob(    s7   lib/python2.7/site-packages/nltk/classify/naivebayes.pyR0   ½   R1   (   t   hasattrt   _most_informative_featurest   setR   R
   R   R   t   addR"   R   t   mint   discardR)   (	   R   R+   t   featuresR   R   t   probdistR    t   featuret   p(    (   R3   R4   s7   lib/python2.7/site-packages/nltk/classify/naivebayes.pyR(   œ   s"    
	"c         C  s˜  t  ƒ  } t t  ƒ } t t ƒ } t ƒ  } xy | D]q \ } } | | c d 7<xR | j ƒ  D]D \ }	 }
 | | |	 f |
 c d 7<| |	 j |
 ƒ | j |	 ƒ qZ Wq1 Wx | D]y } | | } xf | D]^ }	 | | |	 f j ƒ  } | | d k rÄ | | |	 f d c | | 7<| |	 j d ƒ qÄ qÄ Wq­ W| | ƒ } i  } xL | j ƒ  D]> \ \ } }	 } | | d t | |	 ƒ ƒ} | | | |	 f <qIW|  | | ƒ S(   u‹   
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        i   i    t   binsN(   R   R   R7   R   R8   t   Nt   NoneR*   (   t   clst   labeled_featuresetst	   estimatort   label_freqdistt   feature_freqdistt   feature_valuest   fnamesR   R   R   R    t   num_samplest   countR   R   t   freqdistR<   (    (    s7   lib/python2.7/site-packages/nltk/classify/naivebayes.pyt   trainÁ   s.    		
(   t   __name__t
   __module__t   __doc__R   R   R   R   R/   R(   t   classmethodR   RL   (    (    (    s7   lib/python2.7/site-packages/nltk/classify/naivebayes.pyR   ,   s   				!%c          C  s-   d d l  m }  |  t j ƒ } | j ƒ  d  S(   Niÿÿÿÿ(   t
   names_demo(   t   nltk.classify.utilRQ   R   RL   R/   (   RQ   t
   classifier(    (    s7   lib/python2.7/site-packages/nltk/classify/naivebayes.pyt   demoø   s    u   __main__N(   RO   t
   __future__R    R   t   collectionsR   t   nltk.probabilityR   R   R   R   t   nltk.classify.apiR   R   RT   RM   (    (    (    s7   lib/python2.7/site-packages/nltk/classify/naivebayes.pyt   <module>   s   "Ì	