B
    >?[]>                 @   s   d Z ddlmZ ddlZddlmZmZ ddlm	Z	 ddl
mZ dd ZejZd	d Zd
ZyddlmZ W n ek
r   dd ZY nX dZdZdZeeG dd deZG dd deZG dd deZG dd deZG dd deZdS )z
Provides scoring functions for a number of association measures through a
generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
    )divisionN)ABCMetaabstractmethod)reduce)add_metaclassc             C   s   t | dS )Ng       @)_mathlog)x r
   7lib/python3.7/site-packages/nltk/metrics/association.py<lambda>   s    r   c             C   s   t dd | S )Nc             S   s   | | S )Nr
   )r	   yr
   r
   r   r      s    z<lambda>.<locals>.<lambda>)r   )sr
   r
   r   r      s    g#B;)fisher_exactc              O   s   t d S )N)NotImplementedError)Z_argsZ_kwargsr
   r
   r   r   !   s    r   c               @   s   e Zd ZdZdZeedd Zeedd Ze	dd Z
ed	d
 Ze	dd Ze	dd Zedd Ze	dd Ze	dd Ze	dd Ze	dd ZdS )NgramAssocMeasuresa  
    An abstract class defining a collection of generic association measures.
    Each public method returns a score, taking the following arguments::

        score_fn(count_of_ngram,
                 (count_of_n-1gram_1, ..., count_of_n-1gram_j),
                 (count_of_n-2gram_1, ..., count_of_n-2gram_k),
                 ...,
                 (count_of_1gram_1, ..., count_of_1gram_n),
                 count_of_total_words)

    See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``

    Inheriting classes should define a property _n, and a method _contingency
    which calculates contingency values from marginals in order for all
    association measures defined here to be usable.
    r   c              G   s   t ddS )z>Calculates values of a contingency table from marginal values.z?The contingency table is not availablein the general ngram caseN)r   )	marginalsr
   r
   r   _contingencyG   s    zNgramAssocMeasures._contingencyc              G   s   t ddS )zACalculates values of contingency table marginals from its values.z?The contingency table is not availablein the general ngram caseN)r   )contingencyr
   r
   r   
_marginalsO   s    zNgramAssocMeasures._marginalsc             #   s`   t }dd t jD }x>ttD ].t fdd|D | jd   V  q*W dS )z3Calculates expected values for a contingency table.c             S   s   g | ]}d |> qS )   r
   ).0ir
   r
   r   
<listcomp>[   s    z7NgramAssocMeasures._expected_values.<locals>.<listcomp>c             3   s2   | ]* t  fd dtdj D V  qdS )c             3   s&   | ]}|@ @ kr | V  qd S )Nr
   )r   r	   )contr   jr
   r   	<genexpr>b   s    z@NgramAssocMeasures._expected_values.<locals>.<genexpr>.<genexpr>   N)sumrange_n)r   )clsr   r   )r   r   r   b   s   z6NgramAssocMeasures._expected_values.<locals>.<genexpr>r   N)r    r!   r"   len_product)r#   r   n_allbitsr
   )r#   r   r   r   _expected_valuesW   s    z#NgramAssocMeasures._expected_valuesc              G   s   | t  | t  S )z Scores ngrams by their frequency)NGRAMTOTAL)r   r
   r
   r   raw_freqh   s    zNgramAssocMeasures.raw_freqc             G   s6   |t  t|t |t | jd    |t  t d  S )zScores ngrams using Student's t test with independence hypothesis
        for unigrams, as in Manning and Schutze 5.3.1.
        r   g      ?)r)   r%   UNIGRAMSr*   r"   _SMALL)r#   r   r
   r
   r   	student_tm   s    zNgramAssocMeasures.student_tc             G   s,   | j | }| |}tdd t||D S )zZScores ngrams using Pearson's chi-square as in Manning and Schutze
        5.3.3.
        c             s   s&   | ]\}}|| d  |t   V  qdS )r   N)r-   )r   obsexpr
   r
   r   r   ~   s    z,NgramAssocMeasures.chi_sq.<locals>.<genexpr>)r   r(   r    zip)r#   r   r   Zexpsr
   r
   r   chi_sqw   s    

zNgramAssocMeasures.chi_sqc              O   s    | t  |dd t| t  S )zScores ngrams using a variant of mutual information. The keyword
        argument power sets an exponent (default 3) for the numerator. No
        logarithm of the result is calculated.
        Zpower   )r)   getr%   r,   )r   kwargsr
   r
   r   mi_like   s    zNgramAssocMeasures.mi_likec             G   s.   t |t |t | jd   t t|t  S )z^Scores ngrams by pointwise mutual information, as in Manning and
        Schutze 5.4.
        r   )_log2r)   r*   r"   r%   r,   )r#   r   r
   r
   r   pmi   s    zNgramAssocMeasures.pmic             G   s.   | j | }| jtdd t|| |D  S )zOScores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.
        c             s   s*   | ]"\}}|t ||t  t  V  qd S )N)_lnr-   )r   r/   r0   r
   r
   r   r      s   z6NgramAssocMeasures.likelihood_ratio.<locals>.<genexpr>)r   r"   r    r1   r(   )r#   r   r   r
   r
   r   likelihood_ratio   s    
z#NgramAssocMeasures.likelihood_ratioc             G   s:   t |t |t | jd   }|t t|t | d  S )z1Scores ngrams using the Poisson-Stirling measure.r   )r%   r,   r*   r"   r)   r7   )r#   r   r0   r
   r
   r   poisson_stirling   s    z#NgramAssocMeasures.poisson_stirlingc             G   s"   | j | }|d t|dd  S )z&Scores ngrams using the Jaccard index.r   Nr   )r   r    )r#   r   r   r
   r
   r   jaccard   s    
zNgramAssocMeasures.jaccardN)__name__
__module____qualname____doc__r"   staticmethodr   r   r   classmethodr(   r+   r.   r2   r6   r8   r:   r;   r<   r
   r
   r
   r   r   1   s   
	
	
r   c               @   sh   e Zd ZdZdZedd Zedd Zedd Ze	d	d
 Z
e	dd Ze	dd Zedd ZdS )BigramAssocMeasuresa  
    A collection of bigram association measures. Each association measure
    is provided as a function with three arguments::

        bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

        n_ii counts (w1, w2), i.e. the bigram being scored
        n_ix counts (w1, *)
        n_xi counts (*, w2)
        n_xx counts (*, *), i.e. any bigram

    This may be shown with respect to a contingency table::

                w1    ~w1
             ------ ------
         w2 | n_ii | n_oi | = n_xi
             ------ ------
        ~w2 | n_io | n_oo |
             ------ ------
             = n_ix        TOTAL = n_xx
    r   c             C   s0   |\}}||  }||  }| ||||  | | fS )zECalculates values of a bigram contingency table from marginal values.r
   )n_iin_ix_xi_tuplen_xxn_ixn_xin_oin_ior
   r
   r   r      s    z BigramAssocMeasures._contingencyc             C   s"   | ||  ||  f|| | |  fS )zACalculates values of contingency table marginals from its values.r
   )rD   rI   rJ   n_oor
   r
   r   r      s    zBigramAssocMeasures._marginalsc             c   sL   t | }x>tdD ]2}| | | |dA   | | | |dA    | V  qW dS )z3Calculates expected values for a contingency table.   r   r   N)r    r!   )r   rF   r   r
   r
   r   r(      s    z$BigramAssocMeasures._expected_valuesc             G   sF   | j | \}}}}|| ||  d || ||  ||  ||   S )zdScores bigrams using phi-square, the square of the Pearson correlation
        coefficient.
        r   )r   )r#   r   rD   rJ   rI   rK   r
   r
   r   phi_sq   s    zBigramAssocMeasures.phi_sqc             C   s   |\}}||  |||f| S )zScores bigrams using chi-square, i.e. phi-sq multiplied by the number
        of bigrams, as in Manning and Schutze 5.3.3.
        )rM   )r#   rD   rE   rF   rG   rH   r
   r
   r   r2      s    zBigramAssocMeasures.chi_sqc             G   s2   | j | \}}}}t||g||ggdd\}}|S )zScores bigrams using Fisher's Exact Test (Pedersen 1996).  Less
        sensitive to small counts than PMI or Chi Sq, but also more expensive
        to compute. Requires scipy.
        Zless)Zalternative)r   r   )r#   r   rD   rJ   rI   rK   ZoddsZpvaluer
   r
   r   fisher   s    zBigramAssocMeasures.fisherc             C   s   |\}}d|  ||  S )z(Scores bigrams using Dice's coefficient.r   r
   )rD   rE   rF   rG   rH   r
   r
   r   dice   s    zBigramAssocMeasures.diceN)r=   r>   r?   r@   r"   rA   r   r   r(   rB   rM   r2   rN   rO   r
   r
   r
   r   rC      s   rC   c               @   s,   e Zd ZdZdZedd Zedd ZdS )TrigramAssocMeasuresa  
    A collection of trigram association measures. Each association measure
    is provided as a function with four arguments::

        trigram_score_fn(n_iii,
                         (n_iix, n_ixi, n_xii),
                         (n_ixx, n_xix, n_xxi),
                         n_xxx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:
    n_iii counts (w1, w2, w3), i.e. the trigram being scored
    n_ixx counts (w1, *, *)
    n_xxx counts (*, *, *), i.e. any trigram
    r3   c             C   s   |\}}}|\}}}	||  }
||  }||  }|	|  |
 | }||  |
 | }||  | | }||  |
 | | | | | }| |
||||||fS )zCalculates values of a trigram contingency table (or cube) from
        marginal values.
        >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
        (1, 0, 0, 0, 0, 72, 0, 1927)
        r
   )n_iiiZn_iix_tupleZn_ixx_tupleZn_xxxZn_iixZn_ixiZn_xiiZn_ixxZn_xixZn_xxin_oiin_ioin_iion_ooin_oion_ioon_ooor
   r
   r   r     s    

 z!TrigramAssocMeasures._contingencyc        	      G   s`   | \}}}}}}}}||| || || f|| | | || | | || | | ft | fS )zCalculates values of contingency table marginals from its values.
        >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
        (1, (1, 1, 1), (1, 73, 1), 2000)
        )r    )	r   rQ   rR   rS   rU   rT   rV   rW   rX   r
   r
   r   r   +  s    zTrigramAssocMeasures._marginalsN)r=   r>   r?   r@   r"   rA   r   r   r
   r
   r
   r   rP     s   rP   c               @   s,   e Zd ZdZdZedd Zedd ZdS )QuadgramAssocMeasuresa3  
    A collection of quadgram association measures. Each association measure
    is provided as a function with five arguments::

        trigram_score_fn(n_iiii,
                        (n_iiix, n_iixi, n_ixii, n_xiii),
                        (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
                        (n_ixxx, n_xixx, n_xxix, n_xxxi),
                        n_all)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:
    n_iiii counts (w1, w2, w3, w4), i.e. the quadgram being scored
    n_ixxi counts (w1, *, *, w4)
    n_xxxx counts (*, *, *, *), i.e. any quadgram
    rL   c       "      C   s  |\}}}}|\}	}
}}}}|\}}}}||  }||  }||  }||  | | }||  | | }||  | | }||  | | | | | | }||  }||  | | }|
|  | | }||  | | | | | | }|	|  | | }||  | | | | | | }||  | | | | | | } ||  | | | | | | | | | | | | | |  }!| |||||||||||||| |!fS )zXCalculates values of a quadgram contingency table from
        marginal values.
        r
   )"n_iiiiZn_iiix_tupleZn_iixx_tupleZn_ixxx_tupleZn_xxxxn_iiixn_iixin_ixiin_xiiin_iixxn_ixixn_ixxin_xixin_xxiin_xiixn_ixxxn_xixxn_xxixn_xxxin_oiiin_ioiin_iioin_ooiin_oioin_iooin_oooin_iiion_oiion_ioion_ooion_iioon_oioon_iooon_oooor
   r
   r   r   T  sD        @z"QuadgramAssocMeasures._contingencyc               G   sV  | \}}}}}}}}}	}
}}}}}}||	 }|| }|| }|| }|| |	 | }|| |	 | }|| | | }|| | | }|| | | }|| |	 |
 }|| | |	 | | | | }|| | |	 | |
 | | }|| | |	 | | |
 | }|| | | | | | | }t | }|||||f||||||f||||f|fS )a  Calculates values of contingency table marginals from its values.
        QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
        (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
        )r    ) r   rZ   ri   rj   rl   rk   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   r&   r
   r
   r   r     s*    $    

z QuadgramAssocMeasures._marginalsN)r=   r>   r?   r@   r"   rA   r   r   r
   r
   r
   r   rY   >  s   <rY   c               @   s$   e Zd ZdZdd Zedd ZdS )ContingencyMeasureszWraps NgramAssocMeasures classes such that the arguments of association
    measures are contingency table values rather than marginals.
    c             C   s^   d|j j | j _xHt|D ]<}|dr*qt||}|dsJ| ||}t| || qW dS )zAConstructs a ContingencyMeasures given a NgramAssocMeasures classZContingency___N)	__class__r=   dir
startswithgetattr_make_contingency_fnsetattr)selfmeasureskvr
   r
   r   __init__  s    


zContingencyMeasures.__init__c                s"    fdd}j |_ j|_|S )zFrom an association measure function, produces a new function which
        accepts contingency table values as its arguments.
        c                 s    j |   S )N)r   )r   )r   old_fnr
   r   res  s    z5ContingencyMeasures._make_contingency_fn.<locals>.res)r@   r=   )r   r   r   r
   )r   r   r   r     s    z(ContingencyMeasures._make_contingency_fnN)r=   r>   r?   r@   r   rA   r   r
   r
   r
   r   rx     s   rx   )r@   Z
__future__r   Zmathr   abcr   r   	functoolsr   Zsixr   r7   r   r9   r%   r-   Zscipy.statsr   ImportErrorr)   r,   r*   objectr   rC   rP   rY   rx   r
   r
   r
   r   <module>   s,   xY;x