
`]c           @   s   d  Z  d d l m Z d d l Z y$ d d l m Z d d l m Z Wn, e k
 rt d   Z	 d   Z
 d   Z n Xe j d	  Z d
 e f d     YZ d   Z d   Z e d  Z e d  Z d   Z d   Z d S(   s   

A port of the Gale-Church Aligner.

Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.
http://aclweb.org/anthology/J93-1004.pdf

i(   t   divisionN(   t   norm(   t   logsfc         C   s   t  |   } d d d | } | t j | | d | d | d | d | d | d | d	 | d
 | d | d  } |  d k r | Sd | Sd S(   s   Complementary error function.i   g      ?gś??g5 ?g`yg?gƸ?gꪂIǿg#v?g9)gS?gޅ1Ogv(?g        g       @N(   t   abst   matht   exp(   t   xt   zt   tt   r(    (    s9   lib/python2.7/site-packages/nltk/translate/gale_church.pyt   erfcc   s(    	(c         C   s   d d t  |  t j d   S(   s>   Return the area under the normal distribution from M{-∞..x}.i   g      ?i   (   R
   R   t   sqrt(   R   (    (    s9   lib/python2.7/site-packages/nltk/translate/gale_church.pyt   norm_cdfC   s    c         C   s:   y t  j d t |    SWn t k
 r5 t d  SXd  S(   Ni   s   -inf(   R   t   logR   t
   ValueErrort   float(   R   (    (    s9   lib/python2.7/site-packages/nltk/translate/gale_church.pyt
   norm_logsfG   s    i   t   LanguageIndependentc           B   sD   e  Z i d  d 6d  d	 6d d
 6d d 6d d 6d d 6Z d Z d Z RS(   gׁsF?i   i    g{Gz?gbX9ȶ?i   gI+?g333333@(   i   i    (   i    i   (   i   i   (   i   i   (   i   i   (   i   i   (   t   __name__t
   __module__t   PRIORSt   AVERAGE_CHARACTERSt   VARIANCE_CHARACTERS(    (    (    s9   lib/python2.7/site-packages/nltk/translate/gale_church.pyR   Q   s   
c   	      C   s  g  } t  |  t  |  f } x | d k rt d   | D  ry |  | \ } } Wn0 t k
 r | d d | d d f } q! n XxV t |  D]H } x? t |  D]1 } | j | d | d | d | d f  q Wq W| d | | d | f } q! W| d d d  S(   s  
    Traverse the alignment cost from the tracebacks and retrieves
    appropriate sentence pairs.

    :param backlinks: A dictionary where the key is the alignment points and value is the cost (referencing the LanguageIndependent.PRIORS)
    :type backlinks: dict
    :param source_sents_lens: A list of target sentences' lengths
    :type source_sents_lens: list(int)
    :param target_sents_lens: A list of target sentences' lengths
    :type target_sents_lens: list(int)
    i    c         s   s   |  ] } | d  k Vq d S(   i    N(    (   t   .0t   p(    (    s9   lib/python2.7/site-packages/nltk/translate/gale_church.pys	   <genexpr>q   s    i   Ni(   i    i    (   t   lent   allt	   TypeErrort   ranget   append(	   t	   backlinkst   source_sents_lenst   target_sents_lenst   linkst   positiont   sR   t   it   j(    (    s9   lib/python2.7/site-packages/nltk/translate/gale_church.pyt   tracec   s    %3 c   
         s   t     f d   t | d  D  } t    f d   t | d  D  } y> | | | j d } | | j | t j | | j  }	 Wn t k
 r t d  SXt t	 t
 |	   t j | j |  S(   sP  Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]}
    being aligned with a specific C{alignment}.

    @param i: The offset of the source sentence.
    @param j: The offset of the target sentence.
    @param source_sents: The list of source sentence lengths.
    @param target_sents: The list of target sentence lengths.
    @param alignment: The alignment type, a tuple of two integers.
    @param params: The sentence alignment parameters.

    @returns: The log probability of a specific alignment between the two sentences, given the parameters.
    c         3   s!   |  ] }    | d  Vq d S(   i   N(    (   R   t   offset(   R$   t   source_sents(    s9   lib/python2.7/site-packages/nltk/translate/gale_church.pys	   <genexpr>   s    i    c         3   s!   |  ] }    | d  Vq d S(   i   N(    (   R   R'   (   R%   t   target_sents(    s9   lib/python2.7/site-packages/nltk/translate/gale_church.pys	   <genexpr>   s    i   i   s   -inf(   t   sumR   R   R   R   R   t   ZeroDivisionErrorR   t   LOG2R   R   R   R   (
   R$   R%   R(   R)   t	   alignmentt   paramst   l_st   l_tt   mt   delta(    (   R$   R%   R(   R)   s9   lib/python2.7/site-packages/nltk/translate/gale_church.pyt   align_log_prob   s    ))c         C   s  t  | j j    } g  g } i  } xPt t |   d  D]8} x t t |  d  D] } t d  } d }	 x | D] }
 d |
 d } | |
 d } | t |  k  sw | d k  r qw n  | | | t | | |  | |
 |  } | | k  rw | } |
 }	 qw qw W| t d  k rd } n  |	 | | | f <| d j |  qX Wt |  d k rf| j	 d  n  | j g   q; Wt
 | |  |  S(   s  Return the sentence alignment of two text blocks (usually paragraphs).

        >>> align_blocks([5,5,5], [7,7,7])
        [(0, 0), (1, 1), (2, 2)]
        >>> align_blocks([10,5,5], [12,20])
        [(0, 0), (1, 1), (2, 1)]
        >>> align_blocks([12,20], [10,5,5])
        [(0, 0), (1, 1), (1, 2)]
        >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
        [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]

    @param source_sents_lens: The list of source sentence lengths.
    @param target_sents_lens: The list of target sentence lengths.
    @param params: the sentence alignment parameters.
    @return: The sentence alignments, a list of index pairs.
    i   t   infii    i   N(   t   listR   t   keysR   R   R   t   NoneR3   R   t   popR&   (   R   R    R.   t   alignment_typest   DR   R$   R%   t   min_distt	   min_alignt   at   prev_it   prev_jR   (    (    s9   lib/python2.7/site-packages/nltk/translate/gale_church.pyt   align_blocks   s2    		c         C   sY   t  |   t  |  k r' t d   n  g  t |  |  D] \ } } t | | |  ^ q7 S(   s  Creates the sentence alignment of two texts.

    Texts can consist of several blocks. Block boundaries cannot be crossed by sentence
    alignment links.

    Each block consists of a list that contains the lengths (in characters) of the sentences
    in this block.

    @param source_blocks: The list of blocks in the source text.
    @param target_blocks: The list of blocks in the target text.
    @param params: the sentence alignment parameters.

    @returns: A list of sentence alignment lists
    s>   Source and target texts do not have the same number of blocks.(   R   R   t   zipR@   (   t   source_blockst   target_blocksR.   t   source_blockt   target_block(    (    s9   lib/python2.7/site-packages/nltk/translate/gale_church.pyt   align_texts   s
    c         #   s4      f d   } x t  r/ |   j    Vq Wd S(   s   Splits an iterator C{it} at values of C{split_value}.

    Each instance of C{split_value} is swallowed. The iterator produces
    subiterators which need to be consumed fully before the next subiterator
    can be used.
    c         3   s.   |  } x! |  k r) | V  j    } q	 Wd  S(   N(   t   next(   t   firstt   v(   t   itt   split_value(    s9   lib/python2.7/site-packages/nltk/translate/gale_church.pyt   _chunk_iterator   s    N(   t   TrueRG   (   RJ   RK   RL   (    (   RJ   RK   s9   lib/python2.7/site-packages/nltk/translate/gale_church.pyt   split_at   s    	c         C   sL   g  t  |  |  D]8 } g  t  | |  D] } t d   | D  ^ q& ^ q S(   s   Parses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens)
    and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.
    c         s   s   |  ] } t  |  Vq d  S(   N(   R   (   R   t   token(    (    s9   lib/python2.7/site-packages/nltk/translate/gale_church.pys	   <genexpr>  s    (   RN   R*   (   t   streamt   soft_delimitert   hard_delimitert   block_itt   sentence_it(    (    s9   lib/python2.7/site-packages/nltk/translate/gale_church.pyt   parse_token_stream   s    (   t   __doc__t
   __future__R    R   t   scipy.statsR   R   R   t   ImportErrorR
   R   R   R,   t   objectR   R&   R3   R@   RF   RN   RU   (    (    (    s9   lib/python2.7/site-packages/nltk/translate/gale_church.pyt   <module>   s"   	'			6	