ó
ù`]c           @   s²   d  Z  d d l m Z d d l Z d d l m Z d d l m Z d d l m Z d d l m	 Z	 d d l m
 Z
 d d	 l m Z d
 e	 f d „  ƒ  YZ d e f d „  ƒ  YZ d S(   s‘  
Lexical translation model that considers word order.

IBM Model 2 improves on Model 1 by accounting for word order.
An alignment probability is introduced, a(i | j,l,m), which predicts
a source word position, given its aligned target word's position.

The EM algorithm used in Model 2 is:
E step - In the training data, collect counts, weighted by prior
         probabilities.
         (a) count how many times a source language word is translated
             into a target language word
         (b) count how many times a particular position in the source
             sentence is aligned to a particular position in the target
             sentence

M step - Estimate new probabilities based on the counts from the E step


Notations:
i: Position in the source sentence
    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
j: Position in the target sentence
    Valid values are 1, 2, ..., length of target sentence
l: Number of words in the source sentence, excluding NULL
m: Number of words in the target sentence
s: A word in the source language
t: A word in the target language


References:
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
iÿÿÿÿ(   t   divisionN(   t   defaultdict(   t   AlignedSent(   t	   Alignment(   t   IBMModel(   t	   IBMModel1(   t   Countst	   IBMModel2c           B   sb   e  Z d  Z d
 d „ Z d „  Z d „  Z d „  Z d „  Z d „  Z	 d „  Z
 d „  Z d	 „  Z RS(   sY  
    Lexical translation model that considers word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groÃŸ'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm2 = IBMModel2(bitext, 5)

    >>> print(round(ibm2.translation_table['buch']['book'], 3))
    1.0
    >>> print(round(ibm2.translation_table['das']['book'], 3))
    0.0
    >>> print(round(ibm2.translation_table['buch'][None], 3))
    0.0
    >>> print(round(ibm2.translation_table['ja'][None], 3))
    0.0

    >>> print(ibm2.alignment_table[1][1][2][2])
    0.938...
    >>> print(round(ibm2.alignment_table[1][2][2][2], 3))
    0.0
    >>> print(round(ibm2.alignment_table[2][2][4][5], 3))
    1.0

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])

    c         C   s£   t  t |  ƒ j | ƒ | d k rQ t | d | ƒ } | j |  _ |  j | ƒ n | d |  _ | d |  _ x$ t d | ƒ D] } |  j	 | ƒ q{ W|  j
 | ƒ d S(   s™  
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model and an alignment model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, all the following entries must be present:
            ``translation_table``, ``alignment_table``.
            See ``IBMModel`` for the type and purpose of these tables.
        :type probability_tables: dict[str]: object
        i   t   translation_tablet   alignment_tablei    N(   t   superR   t   __init__t   NoneR   R   t   set_uniform_probabilitiesR	   t   ranget   traint	   align_all(   t   selft   sentence_aligned_corpust
   iterationst   probability_tablest   ibm1t   n(    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR   e   s    c   	      C   sô   t  ƒ  } xä | D]Ü } t | j ƒ } t | j ƒ } | | f | k r | j | | f ƒ d | d } | t j k  r” t j d t	 | ƒ d ƒ n  xU t
 d | d ƒ D]= } x4 t
 d | d ƒ D] } | |  j | | | | <qÂ Wq¨ Wq q Wd  S(   Ni   s   A source sentence is too long (s&    words). Results may be less accurate.i    (   t   sett   lent   motst   wordst   addR   t   MIN_PROBt   warningst   warnt   strR   R	   (	   R   R   t   l_m_combinationst   aligned_sentencet   lt   mt   initial_probt   it   j(    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR   Ž   s    	c      	   C   s*  t  ƒ  } x | D]ø } d  g | j } d g | j } t | j ƒ } t | j ƒ } |  j | | ƒ } xŸ t d | d ƒ D]Š }	 | |	 }
 xw t d | d ƒ D]b } | | } |  j | |	 | | ƒ } | | |
 } | j | | |
 ƒ | j	 | | |	 | | ƒ qž Wqz Wq W|  j
 | ƒ |  j | ƒ d  S(   Nt   UNUSEDi   i    (   t   Model2CountsR   R   R   R   t   prob_all_alignmentsR   t   prob_alignment_pointt   update_lexical_translationt   update_alignmentt*   maximize_lexical_translation_probabilitiest    maximize_alignment_probabilities(   R   t   parallel_corpust   countsR!   t   src_sentencet   trg_sentenceR"   R#   t   total_countR&   t   tR%   t   st   countt   normalized_count(    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR   ¢   s"    	

%c         C   sÆ   t  j } x¶ | j j ƒ  D]¥ \ } } x– | j ƒ  D]ˆ \ } } xy | j ƒ  D]k \ } } x\ | D]T }	 | j | | | |	 | j | | |	 }
 t |
 | ƒ |  j | | | |	 <q^ WqK Wq2 Wq Wd  S(   N(   R   R   t	   alignmentt   itemst   alignment_for_any_it   maxR	   (   R   R0   R   R%   t   j_sR&   t   src_sentence_lengthsR"   t   trg_sentence_lengthsR#   t   estimate(    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR.   ¼   s    	c      
   C   s   t  d „  ƒ } xi t d t | ƒ ƒ D]R } | | } x? t d t | ƒ ƒ D]( } | | c |  j | | | | ƒ 7<qK Wq% W| S(   sï  
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        c           S   s   d S(   Ng        (    (    (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyt   <lambda>Ö   t    i   i    (   R   R   R   R*   (   R   R1   R2   t   alignment_prob_for_tR&   R4   R%   (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR)   È   s    
c   	      C   sZ   t  | ƒ d } t  | ƒ d } | | } | | } |  j | | |  j | | | | S(   sz   
        Probability that position j in ``trg_sentence`` is aligned to
        position i in the ``src_sentence``
        i   (   R   R   R	   (	   R   R%   R&   R1   R2   R"   R#   R5   R4   (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR*   ß   s
    

c   	      C   s´   d } t  | j ƒ d } t  | j ƒ d } xu t | j ƒ D]d \ } } | d k rZ q< n  | j | } | j | } | |  j | | |  j | | | | 9} q< Wt | t j	 ƒ S(   sc   
        Probability of target sentence and an alignment given the
        source sentence
        g      ð?i   i    (
   R   R1   R2   t	   enumerateR8   R   R	   R;   R   R   (	   R   t   alignment_infot   probR"   R#   R&   R%   t   trg_wordt   src_word(    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyt   prob_t_a_given_sê   s    c         C   s"   x | D] } |  j  | ƒ q Wd  S(   N(   t   align(   R   R/   t   sentence_pair(    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR   ÿ   s    c         C   s  g  } t  | j ƒ } t  | j ƒ } xß t | j ƒ D]Î \ } } |  j | d |  j d | d | | } t | t j	 ƒ } d } xh t | j ƒ D]W \ }	 }
 |  j | |
 |  j |	 d | d | | } | | k r” | } |	 } q” q” W| j
 | | f ƒ q4 Wt | ƒ | _ d S(   s  
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        i    i   N(   R   R   R   RC   R   R   R	   R;   R   R   t   appendR   R8   (   R   RJ   t   best_alignmentR"   R#   R&   RF   t	   best_probt   best_alignment_pointR%   RG   t
   align_prob(    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyRI     s     "N(   t   __name__t
   __module__t   __doc__R   R   R   R   R.   R)   R*   RH   R   RI   (    (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR   =   s   &)							R(   c           B   s)   e  Z d  Z d „  Z d „  Z d „  Z RS(   so   
    Data object to store counts of various parameters during training.
    Includes counts for alignment.
    c         C   s;   t  t |  ƒ j ƒ  t d „  ƒ |  _ t d „  ƒ |  _ d  S(   Nc           S   s   t  d „  ƒ S(   Nc           S   s   t  d „  ƒ S(   Nc           S   s   t  d „  ƒ S(   Nc           S   s   d S(   Ng        (    (    (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR@   5  RA   (   R   (    (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR@   5  RA   (   R   (    (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR@   5  RA   (   R   (    (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR@   5  RA   c           S   s   t  d „  ƒ S(   Nc           S   s   t  d „  ƒ S(   Nc           S   s   d S(   Ng        (    (    (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR@   8  RA   (   R   (    (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR@   8  RA   (   R   (    (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR@   8  RA   (   R
   R(   R   R   R8   R:   (   R   (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR   2  s
    c         C   s.   |  j  | | c | 7<|  j | c | 7<d  S(   N(   t	   t_given_st   any_t_given_s(   R   R6   R5   R4   (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR+   ;  s    c         C   s>   |  j  | | | | c | 7<|  j | | | c | 7<d  S(   N(   R8   R:   (   R   R6   R%   R&   R"   R#   (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR,   ?  s    (   RP   RQ   RR   R   R+   R,   (    (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyR(   ,  s   			(   RR   t
   __future__R    R   t   collectionsR   t   nltk.translateR   R   R   R   t   nltk.translate.ibm_modelR   R   R(   (    (    (    s2   lib/python2.7/site-packages/nltk/translate/ibm2.pyt   <module>/   s   ï