B
    >?["                 @   s   d Z ddlmZ ddlZyddlmZ ddlmZ W n, ek
r`   dd Z	dd	 Z
d
d ZY nX edZG dd deZdd Zdd ZefddZefddZdd Zdd ZdS )z

A port of the Gale-Church Aligner.

Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.
http://aclweb.org/anthology/J93-1004.pdf

    )divisionN)norm)logsfc             C   s   t | }ddd|   }|t| | d |d|d|d|d|d|d	|d
|d|d                   }| dkr|S d| S dS )zComplementary error function.   g      ?gś??g5 ?g`yg?gƸ?gꪂIǿg#v?g9)gS?gޅ1Ogv(?g        g       @N)absmathZexp)xztr r   9lib/python3.7/site-packages/nltk/translate/gale_church.pyerfcc   s(    4r   c             C   s   ddt | td   S )u>   Return the area under the normal distribution from M{-∞..x}.r   g      ?   )r   r   sqrt)r   r   r   r   norm_cdfC   s    r   c             C   s0   yt dt|  S  tk
r*   tdS X d S )Nr   z-inf)r   logr   
ValueErrorfloat)r   r   r   r   
norm_logsfG   s    r   r   c               @   s&   e Zd ZdddddddZdZdZdS )	LanguageIndependentgׁsF?g{Gz?gbX9ȶ?gI+?))r   r   )r   r   )r   r   )r   r   )r   r   )r   r   r   g333333@N)__name__
__module____qualname__PRIORSAVERAGE_CHARACTERSVARIANCE_CHARACTERSr   r   r   r   r   Q   s   r   c       	      C   s   g }t |t |f}x|dkrtdd |D ry| | \}}W n. tk
rn   |d d |d d f}wY nX xHt|D ]<}x6t|D ]*}||d | d |d | d f qW qzW |d | |d | f}qW |ddd S )a  
    Traverse the alignment cost from the tracebacks and retrieves
    appropriate sentence pairs.

    :param backlinks: A dictionary where the key is the alignment points and value is the cost (referencing the LanguageIndependent.PRIORS)
    :type backlinks: dict
    :param source_sents_lens: A list of target sentences' lengths
    :type source_sents_lens: list(int)
    :param target_sents_lens: A list of target sentences' lengths
    :type target_sents_lens: list(int)
    )r   r   c             s   s   | ]}|d kV  qdS )r   Nr   ).0pr   r   r   	<genexpr>q   s    ztrace.<locals>.<genexpr>r   r   N)lenall	TypeErrorrangeappend)		backlinkssource_sents_lenstarget_sents_lensZlinksZpositionsr
   ijr   r   r   tracec   s    .r,   c       
         s   t  fddt|d D }t fddt|d D }y4|||j  d }||j | t||j  }	W n tk
r   tdS X tt	t
|	 t|j|   S )aP  Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]}
    being aligned with a specific C{alignment}.

    @param i: The offset of the source sentence.
    @param j: The offset of the target sentence.
    @param source_sents: The list of source sentence lengths.
    @param target_sents: The list of target sentence lengths.
    @param alignment: The alignment type, a tuple of two integers.
    @param params: The sentence alignment parameters.

    @returns: The log probability of a specific alignment between the two sentences, given the parameters.
    c             3   s   | ]} | d   V  qdS )r   Nr   )r   offset)r*   source_sentsr   r   r      s    z!align_log_prob.<locals>.<genexpr>r   c             3   s   | ]} | d   V  qdS )r   Nr   )r   r-   )r+   target_sentsr   r   r      s    r   r   z-inf)sumr$   r   r   r   r   ZeroDivisionErrorr   LOG2r   r   r   r   )
r*   r+   r.   r/   Z	alignmentparamsZl_sZl_tmZdeltar   )r*   r+   r.   r/   r   align_log_prob   s      
r5   c             C   s  t |j }g g}i }xtt| d D ]}xtt|d D ]}td}d}	xj|D ]b}
d|
d  }||
d  }|t| k sV|dk rqV|| | t||| ||
| }||k rV|}|
}	qVW |tdkrd}|	|||f< |d | q@W t|dkr|d |g  q*W t	|| |S )a  Return the sentence alignment of two text blocks (usually paragraphs).

        >>> align_blocks([5,5,5], [7,7,7])
        [(0, 0), (1, 1), (2, 2)]
        >>> align_blocks([10,5,5], [12,20])
        [(0, 0), (1, 1), (2, 1)]
        >>> align_blocks([12,20], [10,5,5])
        [(0, 0), (1, 1), (1, 2)]
        >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
        [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]

    @param source_sents_lens: The list of source sentence lengths.
    @param target_sents_lens: The list of target sentence lengths.
    @param params: the sentence alignment parameters.
    @return: The sentence alignments, a list of index pairs.
    r   infNr    r   r   )
listr   keysr$   r!   r   r5   r%   popr,   )r'   r(   r3   Zalignment_typesDr&   r*   r+   Zmin_distZ	min_alignaZprev_iZprev_jr   r   r   r   align_blocks   s2    

r<   c                s0   t | t |krtd fddt| |D S )a  Creates the sentence alignment of two texts.

    Texts can consist of several blocks. Block boundaries cannot be crossed by sentence
    alignment links.

    Each block consists of a list that contains the lengths (in characters) of the sentences
    in this block.

    @param source_blocks: The list of blocks in the source text.
    @param target_blocks: The list of blocks in the target text.
    @param params: the sentence alignment parameters.

    @returns: A list of sentence alignment lists
    z>Source and target texts do not have the same number of blocks.c                s   g | ]\}}t || qS r   )r<   )r   Zsource_blockZtarget_block)r3   r   r   
<listcomp>   s   zalign_texts.<locals>.<listcomp>)r!   r   zip)Zsource_blocksZtarget_blocksr3   r   )r3   r   align_texts   s
    
r?   c             #   s&    fdd}x|   V  qW dS )zSplits an iterator C{it} at values of C{split_value}.

    Each instance of C{split_value} is swallowed. The iterator produces
    subiterators which need to be consumed fully before the next subiterator
    can be used.
    c             3   s$   | }x|kr|V     }qW d S )N)next)firstv)itsplit_valuer   r   _chunk_iterator   s    
z!split_at.<locals>._chunk_iteratorN)r@   )rC   rD   rE   r   )rC   rD   r   split_at   s    rF   c                s    fddt | |D S )zParses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens)
    and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.
    c                s    g | ]}d d t | D qS )c             S   s   g | ]}t d d |D qS )c             s   s   | ]}t |V  qd S )N)r!   )r   tokenr   r   r   r     s    z;parse_token_stream.<locals>.<listcomp>.<listcomp>.<genexpr>)r0   )r   Zsentence_itr   r   r   r=     s   z1parse_token_stream.<locals>.<listcomp>.<listcomp>)rF   )r   Zblock_it)soft_delimiterr   r   r=     s   z&parse_token_stream.<locals>.<listcomp>)rF   )streamrH   Zhard_delimiterr   )rH   r   parse_token_stream   s    
rJ   )__doc__Z
__future__r   r   Zscipy.statsr   r   r   ImportErrorr   r   r   r2   objectr   r,   r5   r<   r?   rF   rJ   r   r   r   r   <module>   s"   '
6