B
    >?ð[>  ã               @   sl   d Z yddlZW n ek
r$   Y nX ddlmZ ddd„Zdd	„ Zd
d„ Zddd„Z	ddd„Z
dd„ ZdS )a  
Text Segmentation Metrics

1. Windowdiff

Pevzner, L., and Hearst, M., A Critique and Improvement of
  an Evaluation Metric for Text Segmentation,
Computational Linguistics 28, 19-36


2. Generalized Hamming Distance

Bookstein A., Kulyukin V.A., Raita T.
Generalized Hamming Distance
Information Retrieval 5, 2002, pp 353-375

Baseline implementation in C++
http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html

Study describing benefits of Generalized Hamming Distance Versus
WindowDiff for evaluating text segmentation tasks
Begsten, Y.  Quel indice pour mesurer l'efficacite en segmentation de textes ?
TALN 2009


3. Pk text segmentation metric

Beeferman D., Berger A., Lafferty J. (1999)
Statistical Models for Text Segmentation
Machine Learning, 34, 177-210
é    N)ÚrangeÚ1Fc             C   s®   t | ƒt |ƒkrtdƒ‚|t | ƒkr,tdƒ‚d}xhtt | ƒ| d ƒD ]P}t| ||| …  |¡|||| …  |¡ ƒ}|rˆ||7 }qF|td|ƒ7 }qFW |t | ƒ| d  S )aW  
    Compute the windowdiff score for a pair of segmentations.  A
    segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

        >>> s1 = "000100000010"
        >>> s2 = "000010000100"
        >>> s3 = "100000010000"
        >>> '%.2f' % windowdiff(s1, s1, 3)
        '0.00'
        >>> '%.2f' % windowdiff(s1, s2, 3)
        '0.30'
        >>> '%.2f' % windowdiff(s2, s3, 3)
        '0.80'

    :param seg1: a segmentation
    :type seg1: str or list
    :param seg2: a segmentation
    :type seg2: str or list
    :param k: window width
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :param weighted: use the weighted variant of windowdiff
    :type weighted: boolean
    :rtype: float
    z!Segmentations have unequal lengthzCWindow width k should be smaller or equal than segmentation lengthsr   é   g      ð?)ÚlenÚ
ValueErrorr   ÚabsÚcountÚmin)Zseg1Zseg2ÚkÚboundaryZweightedZwdÚiZndiff© r   ú8lib/python3.7/site-packages/nltk/metrics/segmentation.pyÚ
windowdiff3   s    0
r   c             C   sF   t  | |f¡}|t  |¡ |dd d …f< |t  | ¡ |d d …df< |S )Nr   )ÚnpÚemptyZarange)ZnrowsZncolsÚins_costÚdel_costÚmatr   r   r   Ú	_init_matd   s    r   c             C   s®   x¨t |ƒD ]œ\}}x’t |ƒD ]†\}}	|t||	 ƒ | ||f  }
||	krV| ||f }n2||	krt|| ||d f  }n|| |d |f  }t||
ƒ| |d |d f< qW q
W d S )Nr   )Ú	enumerater   r	   )r   ZrowvZcolvr   r   Úshift_cost_coeffr   ZrowiÚjZcoljZ
shift_costZtcostr   r   r   Ú_ghd_auxk   s    r   ç       @ç      ð?c                s°   ‡ fdd„t | ƒD ƒ}‡ fdd„t |ƒD ƒ}t|ƒ}t|ƒ}	|dkrP|	dkrPdS |dkrh|	dkrh|| S |dkr€|	dkr€|	| S t|	d |d ||ƒ}
t|
|||||ƒ |
d S )ab  
    Compute the Generalized Hamming Distance for a reference and a hypothetical
    segmentation, corresponding to the cost related to the transformation
    of the hypothetical segmentation into the reference segmentation
    through boundary insertion, deletion and shift operations.

    A segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

    Recommended parameter values are a shift_cost_coeff of 2.
    Associated with a ins_cost, and del_cost equal to the mean segment
    length in the reference segmentation.

        >>> # Same examples as Kulyukin C++ implementation
        >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
        0.5
        >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
        2.0
        >>> ghd('011', '110', 1.0, 1.0, 0.5)
        1.0
        >>> ghd('1', '0', 1.0, 1.0, 0.5)
        1.0
        >>> ghd('111', '000', 1.0, 1.0, 0.5)
        3.0
        >>> ghd('000', '111', 1.0, 2.0, 0.5)
        6.0

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the hypothetical segmentation
    :type hyp: str or list
    :param ins_cost: insertion cost
    :type ins_cost: float
    :param del_cost: deletion cost
    :type del_cost: float
    :param shift_cost_coeff: constant used to compute the cost of a shift.
    shift cost = shift_cost_coeff * |i - j| where i and j are
    the positions indicating the shift
    :type shift_cost_coeff: float
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    c                s   g | ]\}}|ˆ kr|‘qS r   r   )Ú.0r   Úval)r   r   r   ú
<listcomp>©   s    zghd.<locals>.<listcomp>c                s   g | ]\}}|ˆ kr|‘qS r   r   )r   r   r   )r   r   r   r   ª   s    r   g        r   )éÿÿÿÿr   )r   r   r   r   )ÚrefÚhypr   r   r   r   Zref_idxZhyp_idxZ
nref_boundZ
nhyp_boundr   r   )r   r   Úghd{   s    .r"   c             C   s    |dkr&t tt| ƒ|  |¡d  ƒƒ}d}x`tt| ƒ| d ƒD ]H}| ||| …  |¡dk}|||| …  |¡dk}||kr@|d7 }q@W |t| ƒ| d  S )aù  
    Compute the Pk metric for a pair of segmentations A segmentation
    is any sequence over a vocabulary of two items (e.g. "0", "1"),
    where the specified boundary value is used to mark the edge of a
    segmentation.

    >>> '%.2f' % pk('0100'*100, '1'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
    '0.00'

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the segmentation to evaluate
    :type hyp: str or list
    :param k: window size, if None, set to half of the average reference segment length
    :type boundary: str or int or bool
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    Ng       @r   r   g      ð?)ÚintÚroundr   r   r   )r    r!   r
   r   Úerrr   ÚrÚhr   r   r   Úpk¾   s    r(   c             C   s:   ddl m} ydd l}W n tk
r4   |dƒ‚Y nX d S )Nr   )ÚSkipTestz/numpy is required for nltk.metrics.segmentation)Znoser)   ÚnumpyÚImportError)Úmoduler)   r*   r   r   r   Úsetup_moduleä   s
    r-   )r   F)r   r   r   r   )Nr   )Ú__doc__r*   r   r+   Z	six.movesr   r   r   r   r"   r(   r-   r   r   r   r   Ú<module>)   s   
1
C
&