ó
ù`]c           @   s:  d  Z  d d l m Z d d l Z d d l m Z m Z d d l m	 Z	 d d l
 m Z d „  Z e j Z d „  Z d	 Z y d d
 l m Z Wn e k
 r© d „  Z n Xd Z d Z d Z e e ƒ d e f d „  ƒ  Yƒ Z d e f d „  ƒ  YZ d e f d „  ƒ  YZ d e f d „  ƒ  YZ d e f d „  ƒ  YZ d S(   sÌ   
Provides scoring functions for a number of association measures through a
generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
iÿÿÿÿ(   t   divisionN(   t   ABCMetat   abstractmethod(   t   reduce(   t   add_metaclassc         C   s   t  j |  d ƒ S(   Ng       @(   t   _matht   log(   t   x(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   <lambda>   t    c         C   s   t  d „  |  ƒ S(   Nc         S   s   |  | S(   N(    (   R   t   y(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR      R	   (   R   (   t   s(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR      R	   g#B’¡œÇ;(   t   fisher_exactc          O   s
   t  ‚ d  S(   N(   t   NotImplementedError(   t   _argst   _kwargs(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR   !   s    i    iþÿÿÿt   NgramAssocMeasuresc           B   sÅ   e  Z d  Z d Z e e d „  ƒ ƒ Z e e d „  ƒ ƒ Z e d „  ƒ Z	 e d „  ƒ Z
 e d „  ƒ Z e d „  ƒ Z e d „  ƒ Z e d	 „  ƒ Z e d
 „  ƒ Z e d „  ƒ Z e d „  ƒ Z RS(   s¿  
    An abstract class defining a collection of generic association measures.
    Each public method returns a score, taking the following arguments::

        score_fn(count_of_ngram,
                 (count_of_n-1gram_1, ..., count_of_n-1gram_j),
                 (count_of_n-2gram_1, ..., count_of_n-2gram_k),
                 ...,
                 (count_of_1gram_1, ..., count_of_1gram_n),
                 count_of_total_words)

    See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``

    Inheriting classes should define a property _n, and a method _contingency
    which calculates contingency values from marginals in order for all
    association measures defined here to be usable.
    i    c          G   s   t  d ƒ ‚ d S(   s>   Calculates values of a contingency table from marginal values.s?   The contingency table is not availablein the general ngram caseN(   R   (   t	   marginals(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   _contingencyG   s    c          G   s   t  d ƒ ‚ d S(   sA   Calculates values of contingency table marginals from its values.s?   The contingency table is not availablein the general ngram caseN(   R   (   t   contingency(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt
   _marginalsO   s    c         #   sƒ   t  ˆ ƒ } g  t ˆ  j ƒ D] ‰ d ˆ >^ q } xJ t t ˆ ƒ ƒ D]6 ‰ t ‡  ‡ ‡ f d †  | Dƒ ƒ | ˆ  j d VqE Wd S(   s3   Calculates expected values for a contingency table.i   c         3   s>   |  ]4 ‰  t  ‡ ‡ ‡  f d  †  t d ˆ j ƒ Dƒ ƒ Vq d S(   c         3   s-   |  ]# } | ˆ @ˆ ˆ @k r ˆ  | Vq d  S(   N(    (   t   .0R   (   t   contt   it   j(    s7   lib/python2.7/site-packages/nltk/metrics/association.pys	   <genexpr>b   s    i   N(   t   sumt   ranget   _n(   R   (   t   clsR   R   (   R   s7   lib/python2.7/site-packages/nltk/metrics/association.pys	   <genexpr>b   s   N(   R   R   R   t   lent   _product(   R   R   t   n_allt   bits(    (   R   R   R   s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   _expected_valuesW   s    &
c          G   s   |  t  |  t S(   s    Scores ngrams by their frequency(   t   NGRAMt   TOTAL(   R   (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   raw_freqh   s    c         G   s9   | t  t | t ƒ | t |  j d | t  t d S(   s   Scores ngrams using Student's t test with independence hypothesis
        for unigrams, as in Manning and Schutze 5.3.1.
        i   g      à?(   R"   R   t   UNIGRAMSR#   R   t   _SMALL(   R   R   (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt	   student_tm   s    !c         G   s;   |  j  | Œ  } |  j | ƒ } t d „  t | | ƒ Dƒ ƒ S(   sZ   Scores ngrams using Pearson's chi-square as in Manning and Schutze
        5.3.3.
        c         s   s+   |  ]! \ } } | | d  | t  Vq d S(   i   N(   R&   (   R   t   obst   exp(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pys	   <genexpr>~   s    (   R   R!   R   t   zip(   R   R   R   t   exps(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   chi_sqw   s    c          O   s&   |  t  | j d d ƒ t |  t ƒ S(   sÂ   Scores ngrams using a variant of mutual information. The keyword
        argument power sets an exponent (default 3) for the numerator. No
        logarithm of the result is calculated.
        t   poweri   (   R"   t   getR   R%   (   R   t   kwargs(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   mi_like€   s    c         G   s5   t  | t | t |  j d ƒ t  t | t ƒ ƒ S(   s^   Scores ngrams by pointwise mutual information, as in Manning and
        Schutze 5.4.
        i   (   t   _log2R"   R#   R   R   R%   (   R   R   (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   pmiŠ   s    #c         G   s<   |  j  | Œ  } |  j t d „  t | |  j | ƒ ƒ Dƒ ƒ S(   sO   Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.
        c         s   s1   |  ]' \ } } | t  | | t t ƒ Vq d  S(   N(   t   _lnR&   (   R   R(   R)   (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pys	   <genexpr>™   s   (   R   R   R   R*   R!   (   R   R   R   (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   likelihood_ratio“   s    	c         G   sA   t  | t ƒ | t |  j d } | t t | t | ƒ d S(   s1   Scores ngrams using the Poisson-Stirling measure.i   (   R   R%   R#   R   R"   R1   (   R   R   R)   (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   poisson_stirling   s    #c         G   s%   |  j  | Œ  } | d t | d  ƒ S(   s&   Scores ngrams using the Jaccard index.i    iÿÿÿÿ(   R   R   (   R   R   R   (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   jaccard£   s    (   t   __name__t
   __module__t   __doc__R   t   staticmethodR   R   R   t   classmethodR!   R$   R'   R,   R0   R2   R4   R5   R6   (    (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR   1   s   
	
	
t   BigramAssocMeasuresc           B   s}   e  Z d  Z d Z e d „  ƒ Z e d „  ƒ Z e d „  ƒ Z e d „  ƒ Z	 e d „  ƒ Z
 e d „  ƒ Z e d „  ƒ Z RS(	   s€  
    A collection of bigram association measures. Each association measure
    is provided as a function with three arguments::

        bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

        n_ii counts (w1, w2), i.e. the bigram being scored
        n_ix counts (w1, *)
        n_xi counts (*, w2)
        n_xx counts (*, *), i.e. any bigram

    This may be shown with respect to a contingency table::

                w1    ~w1
             ------ ------
         w2 | n_ii | n_oi | = n_xi
             ------ ------
        ~w2 | n_io | n_oo |
             ------ ------
             = n_ix        TOTAL = n_xx
    i   c         C   s<   | \ } } | |  } | |  } |  | | | |  | | f S(   sE   Calculates values of a bigram contingency table from marginal values.(    (   t   n_iit   n_ix_xi_tuplet   n_xxt   n_ixt   n_xit   n_oit   n_io(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR   È   s    

c         C   s'   |  | |  | |  f | | | |  f S(   sA   Calculates values of contingency table marginals from its values.(    (   R=   RB   RC   t   n_oo(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR   Ð   s    c         c   sT   t  |  ƒ } xA t d ƒ D]3 } |  | |  | d A|  | |  | d A| Vq Wd S(   s3   Calculates expected values for a contingency table.i   i   i   N(   R   R   (   R   R?   R   (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR!   Õ   s    c         G   sO   |  j  | Œ  \ } } } } | | | | d | | | | | | | | S(   sd   Scores bigrams using phi-square, the square of the Pearson correlation
        coefficient.
        i   (   R   (   R   R   R=   RC   RB   RD   (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   phi_sqÝ   s    c         C   s)   | \ } } | |  j  | | | f | ƒ S(   sƒ   Scores bigrams using chi-square, i.e. phi-sq multiplied by the number
        of bigrams, as in Manning and Schutze 5.3.3.
        (   RE   (   R   R=   R>   R?   R@   RA   (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR,   è   s    c         G   sI   |  j  | Œ  \ } } } } t | | g | | g g d d ƒ\ } } | S(   sº   Scores bigrams using Fisher's Exact Test (Pedersen 1996).  Less
        sensitive to small counts than PMI or Chi Sq, but also more expensive
        to compute. Requires scipy.
        t   alternativet   less(   R   R   (   R   R   R=   RC   RB   RD   t   oddst   pvalue(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   fisherð   s    *c         C   s   | \ } } d |  | | S(   s(   Scores bigrams using Dice's coefficient.i   (    (   R=   R>   R?   R@   RA   (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   diceü   s    (   R7   R8   R9   R   R:   R   R   R!   R;   RE   R,   RJ   RK   (    (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR<   ª   s   t   TrigramAssocMeasuresc           B   s2   e  Z d  Z d Z e d „  ƒ Z e d „  ƒ Z RS(   sÄ  
    A collection of trigram association measures. Each association measure
    is provided as a function with four arguments::

        trigram_score_fn(n_iii,
                         (n_iix, n_ixi, n_xii),
                         (n_ixx, n_xix, n_xxi),
                         n_xxx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:
    n_iii counts (w1, w2, w3), i.e. the trigram being scored
    n_ixx counts (w1, *, *)
    n_xxx counts (*, *, *), i.e. any trigram
    i   c         C   s°   | \ } } } | \ } } }	 | |  }
 | |  } | |  } |	 |  |
 | } | |  |
 | } | |  | | } | |  |
 | | | | | } |  |
 | | | | | | f S(   sÔ   Calculates values of a trigram contingency table (or cube) from
        marginal values.
        >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
        (1, 0, 0, 0, 0, 72, 0, 1927)
        (    (   t   n_iiit   n_iix_tuplet   n_ixx_tuplet   n_xxxt   n_iixt   n_ixit   n_xiit   n_ixxt   n_xixt   n_xxit   n_oiit   n_ioit   n_iiot   n_ooit   n_oiot   n_ioot   n_ooo(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR     s    


"c    	      G   sv   |  \ } } } } } } } } | | | | | | | f | | | | | | | | | | | | f t  |  ƒ f S(   s»   Calculates values of contingency table marginals from its values.
        >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
        (1, (1, 1, 1), (1, 73, 1), 2000)
        (   R   (	   R   RM   RW   RX   RZ   RY   R[   R\   R]   (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR   +  s    (   R7   R8   R9   R   R:   R   R   (    (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyRL     s   t   QuadgramAssocMeasuresc           B   s2   e  Z d  Z d Z e d „  ƒ Z e d „  ƒ Z RS(   s3  
    A collection of quadgram association measures. Each association measure
    is provided as a function with five arguments::

        trigram_score_fn(n_iiii,
                        (n_iiix, n_iixi, n_ixii, n_xiii),
                        (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
                        (n_ixxx, n_xixx, n_xxix, n_xxxi),
                        n_all)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:
    n_iiii counts (w1, w2, w3, w4), i.e. the quadgram being scored
    n_ixxi counts (w1, *, *, w4)
    n_xxxx counts (*, *, *, *), i.e. any quadgram
    i   c   "      C   sÎ  | \ } } } } | \ }	 }
 } } } } | \ } } } } | |  } | |  } | |  } | |  | | } | |  | | } | |  | | } | |  | | | | | | } | |  } | |  | | } |
 |  | | } | |  | | | | | | } |	 |  | | } | |  | | | | | | } | |  | | | | | | }  | |  | | | | | | | | | | | | | |  }! |  | | | | | | | | | | | | | |  |! f S(   sX   Calculates values of a quadgram contingency table from
        marginal values.
        (    ("   t   n_iiiit   n_iiix_tuplet   n_iixx_tuplet   n_ixxx_tuplet   n_xxxxt   n_iiixt   n_iixit   n_ixiit   n_xiiit   n_iixxt   n_ixixt   n_ixxit   n_xixit   n_xxiit   n_xiixt   n_ixxxt   n_xixxt   n_xxixt   n_xxxit   n_oiiit   n_ioiit   n_iioit   n_ooiit   n_oioit   n_iooit   n_oooit   n_iiiot   n_oiiot   n_ioiot   n_ooiot   n_iioot   n_oioot   n_iooot   n_oooo(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR   T  sD    


"
"""Bc           G   s›  |  \ } } } } } } } } }	 }
 } } } } } } | |	 } | | } | | } | | } | | |	 | } | | |	 | } | | | | } | | | | } | | | | } | | |	 |
 } | | | |	 | | | | } | | | |	 | |
 | | } | | | |	 | | |
 | } | | | | | | | | } t  |  ƒ } | | | | | f | | | | | | f | | | | f | f S(   s  Calculates values of contingency table marginals from its values.
        QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
        (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
        (   R   (    R   R_   Rr   Rs   Ru   Rt   Rv   Rw   Rx   Ry   Rz   R{   R|   R}   R~   R   R€   Rd   Re   Rf   Rg   Rh   Ri   Rj   Rk   Rl   Rm   Rn   Ro   Rp   Rq   R   (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR     s*    6



""""(   R7   R8   R9   R   R:   R   R   (    (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR^   >  s   <t   ContingencyMeasuresc           B   s&   e  Z d  Z d „  Z e d „  ƒ Z RS(   s   Wraps NgramAssocMeasures classes such that the arguments of association
    measures are contingency table values rather than marginals.
    c         C   s‰   d | j  j |  j  _ xl t | ƒ D]^ } | j d ƒ r> q# n  t | | ƒ } | j d ƒ sq |  j | | ƒ } n  t |  | | ƒ q# Wd S(   sA   Constructs a ContingencyMeasures given a NgramAssocMeasures classt   Contingencyt   __t   _N(   t	   __class__R7   t   dirt
   startswitht   getattrt   _make_contingency_fnt   setattr(   t   selft   measurest   kt   v(    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   __init__»  s    c            s.   ‡  ‡ f d †  } ˆ j  | _  ˆ j | _ | S(   s‡   From an association measure function, produces a new function which
        accepts contingency table values as its arguments.
        c             s   ˆ ˆ  j  |  Œ  Œ  S(   N(   R   (   R   (   RŒ   t   old_fn(    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   resÌ  s    (   R9   R7   (   RŒ   R   R‘   (    (   RŒ   R   s7   lib/python2.7/site-packages/nltk/metrics/association.pyR‰   Æ  s    (   R7   R8   R9   R   R:   R‰   (    (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyR   ¶  s   	(   R9   t
   __future__R    t   mathR   t   abcR   R   t	   functoolsR   t   sixR   R1   R   R3   R   R&   t   scipy.statsR   t   ImportErrorR"   R%   R#   t   objectR   R<   RL   R^   R   (    (    (    s7   lib/python2.7/site-packages/nltk/metrics/association.pyt   <module>   s,   			xY;x