ó
ų`]c           @   s   d  d l  m Z d   Z d S(   iĸĸĸĸ(   t   defaultdictc   	   	      se  g  | j    D]$ } t t t | j  d    ^ q } g  | j    D]$ } t t t | j  d    ^ qD } d	 d
 d d d d d d g  t |  j t |    t |  j t |    t t    x6  D]. \ } }   d j |    d j |  qÕ W       f d   }       f d   } |   | |  | |  t	   S(   s  
    This module symmetrisatizes the source-to-target and target-to-source
    word alignment output and produces, aka. GDFA algorithm (Koehn, 2005).

    Step 1: Find the intersection of the bidirectional alignment.

    Step 2: Search for additional neighbor alignment points to be added, given
            these criteria: (i) neighbor alignments points are not in the
            intersection and (ii) neighbor alignments are in the union.

    Step 3: Add all other alignment points thats not in the intersection, not in
            the neighboring alignments that met the criteria but in the original
            foward/backward alignment outputs.

        >>> forw = ('0-0 2-1 9-2 21-3 10-4 7-5 11-6 9-7 12-8 1-9 3-10 '
        ...         '4-11 17-12 17-13 25-14 13-15 24-16 11-17 28-18')
        >>> back = ('0-0 1-9 2-9 3-10 4-11 5-12 6-6 7-5 8-6 9-7 10-4 '
        ...         '11-6 12-8 13-12 15-12 17-13 18-13 19-12 20-13 '
        ...         '21-3 22-12 23-14 24-17 25-15 26-17 27-18 28-18')
        >>> srctext = ("ããŪ ãã ãŠ ãã­ãž į―čē ãã æ ãŪ ïžŽ éĒæ° "
        ...            "ãŊ ïžŽ ãĻ åą ãŦ äļéĢįķ ãŦ åĒå  ãã ããĻ ã "
        ...            "æåū ã ãã ããĻ ã įĪšã ã ã")
        >>> trgtext = ("Therefore , we expect that the luminosity function "
        ...            "of such halo white dwarfs increases discontinuously "
        ...            "with the luminosity .")
        >>> srclen = len(srctext.split())
        >>> trglen = len(trgtext.split())
        >>>
        >>> gdfa = grow_diag_final_and(srclen, trglen, forw, back)
        >>> gdfa == sorted(set([(28, 18), (6, 6), (24, 17), (2, 1), (15, 12), (13, 12),
        ...         (2, 9), (3, 10), (26, 17), (25, 15), (8, 6), (9, 7), (20,
        ...         13), (18, 13), (0, 0), (10, 4), (13, 15), (23, 14), (7, 5),
        ...         (25, 14), (1, 9), (17, 13), (4, 11), (11, 17), (9, 2), (22,
        ...         12), (27, 18), (24, 16), (21, 3), (19, 12), (17, 12), (5,
        ...         12), (11, 6), (12, 8)]))
        True

    References:
    Koehn, P., A. Axelrod, A. Birch, C. Callison, M. Osborne, and D. Talbot.
    2005. Edinburgh System Description for the 2005 IWSLT Speech
    Translation Evaluation. In MT Eval Workshop.

    :type srclen: int
    :param srclen: the number of tokens in the source language
    :type trglen: int
    :param trglen: the number of tokens in the target language
    :type e2f: str
    :param e2f: the forward word alignment outputs from source-to-target
                language (in pharaoh output format)
    :type f2e: str
    :param f2e: the backward word alignment outputs from target-to-source
                language (in pharaoh output format)
    :rtype: set(tuple(int))
    :return: the symmetrized alignment points from the GDFA algorithm
    t   -iĸĸĸĸi    i   t   et   fc             s(  t    d }  x|  t    k  r#t } xč t   D]Ú } xŅ t   D]Ã } | | f  k rK xĻ  D] } t d   t | | f |  D  } | \ } } |   k rj |   k rj |  k rj  j |    d j |    d j |  |  d 7}  t } qj qj WqK qK Wq8 W| r Pq q Wd S(   sz   
        Search for the neighbor points and them to the intersected alignment
        points if criteria are met.
        i   c         s   s   |  ] \ } } | | Vq d  S(   N(    (   t   .0t   it   j(    (    s2   lib/python2.7/site-packages/nltk/translate/gdfa.pys	   <genexpr>d   s    R   R   N(   t   lent   Truet   ranget   tuplet   zipt   addt   False(   t   prev_lent   no_new_pointsR   R   t   neighbort   e_newt   f_new(   t   alignedt	   alignmentt	   neighborst   srclent   trglent   union(    s2   lib/python2.7/site-packages/nltk/translate/gdfa.pyt	   grow_diagS   s$    %
c            s   x t    D] } xv t    D]h } |   k r  |   k r  | | f  k r   j | | f    d j |    d j |  q  q  Wq Wd S(   sĒ   
        Adds remaining points that are not in the intersection, not in the
        neighboring alignments but in the original *e2f* and *f2e* alignments
        R   R   N(   R	   R   (   t   aR   R   (   R   R   R   R   R   (    s2   lib/python2.7/site-packages/nltk/translate/gdfa.pyt	   final_andt   s    (   iĸĸĸĸi    (   i    iĸĸĸĸ(   i   i    (   i    i   (   iĸĸĸĸiĸĸĸĸ(   iĸĸĸĸi   (   i   iĸĸĸĸ(   i   i   (
   t   splitR
   t   mapt   intt   sett   intersectionR   R    R   t   sorted(	   R   R   t   e2ft   f2eR   R   R   R   R   (    (   R   R   R   R   R   R   s2   lib/python2.7/site-packages/nltk/translate/gdfa.pyt   grow_diag_final_and   s    :77!

N(   t   collectionsR    R$   (    (    (    s2   lib/python2.7/site-packages/nltk/translate/gdfa.pyt   <module>	   s   