B
    >?ð[ú*  ã               @   s8  d Z ddlmZ ddlmZ ddlZddlZddlZddlmZ	 ddl
mZmZ yddlmZ W n ek
rt   Y nX ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ G dd„ deƒZG dd„ deƒZdd„ Zdd„ Zdd„ Zd$dd„Zdd„ Z dd„ Z!d%dd „Z"e#d!kr4dd"l$m"Z" e"dƒ e"d#ƒ dS )&z
Named entity chunker
é    )Úprint_function)Úunicode_literalsN)ÚElementTree)ÚClassifierBasedTaggerÚpos_tag)ÚMaxentClassifier)ÚTree)Úword_tokenize)Úfind)ÚChunkParserI)Ú
ChunkScorec               @   s0   e Zd ZdZdd„ Zdd„ Zdd„ Zdd	„ Zd
S )ÚNEChunkParserTaggerz2
    The IOB tagger used by the chunk parser.
    c             C   s   t j| || jd d S )N)ÚtrainZclassifier_builder)r   Ú__init__Ú_classifier_builder)Úselfr   © r   ú6lib/python3.7/site-packages/nltk/chunk/named_entity.pyr   %   s    zNEChunkParserTagger.__init__c             C   s   t j|ddddS )NZmegamé   é   )Ú	algorithmZgaussian_prior_sigmaZtrace)r   r   )r   r   r   r   r   r   *   s    z'NEChunkParserTagger._classifier_builderc             C   sF   y
| j }W n6 tk
r@   ddlm} t| d¡ƒ| _ | j }Y nX |S )Nr   )Úwordszen-basic)Z_en_wordlistÚAttributeErrorZnltk.corpusr   Úset)r   Zwlr   r   r   r   Ú_english_wordlist/   s    
z%NEChunkParserTagger._english_wordlistc             C   s*  || d }t || d ƒ}|dkrBd  }}d  }}	d  }
 }}nÂ|dkr”||d  d  ¡ }d }t ||d  d ƒ}d }	||d  d }d  }
}np||d  d  ¡ }||d  d  ¡ }t ||d  d ƒ}t ||d  d ƒ}	||d  }||d  }t|ƒ}
|t|ƒd kr(d  }}d  }}n”|t|ƒd krl||d  d  ¡ }||d  d  ¡ }d }d }nP||d  d  ¡ }||d  d  ¡ }||d  d  ¡ }||d  d  ¡ }dt|ƒt|ƒ|d d…  ¡ |dd …  ¡ ||||  ¡ k|||||d | ¡ |¡d ||¡d |
|¡dœ}|S )	Nr   r   r   Té   éýÿÿÿz{0}+{1})ZbiasÚshapeZwordlenZprefix3Zsuffix3ÚposÚwordzen-wordlistÚprevtagÚprevposÚnextposÚprevwordÚnextwordzword+nextposzpos+prevtagzshape+prevtag)Úsimplify_posÚlowerr   Úlenr   Úformat)r   ÚtokensÚindexÚhistoryr   r   r#   Zprevprevwordr!   ZprevprevposZ	prevshaper    Zprevprevtagr$   Znextnextwordr"   ZnextnextposZfeaturesr   r   r   Ú_feature_detector9   sb    



z%NEChunkParserTagger._feature_detectorN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   r,   r   r   r   r   r       s
   
r   c               @   s<   e Zd ZdZdd„ Zdd„ Zdd„ Zdd	„ Zed
d„ ƒZ	dS )ÚNEChunkParserz2
    Expected input: list of pos-tagged words
    c             C   s   |   |¡ d S )N)Ú_train)r   r   r   r   r   r   y   s    zNEChunkParser.__init__c             C   s   | j  |¡}|  |¡}|S )z8
        Each token should be a pos-tagged word
        )Ú_taggerÚtagÚ_tagged_to_parse)r   r)   ZtaggedÚtreer   r   r   Úparse|   s    
zNEChunkParser.parsec                s"   ‡ fdd„|D ƒ}t |dˆ _d S )Nc                s   g | ]}ˆ   |¡‘qS r   )Ú_parse_to_tagged)Ú.0Ús)r   r   r   ú
<listcomp>†   s    z(NEChunkParser._train.<locals>.<listcomp>)r   )r   r3   )r   Zcorpusr   )r   r   r2   „   s    zNEChunkParser._trainc             C   s¸   t dg ƒ}x¨|D ] \}}|dkr,| |¡ q| d¡rR| t |dd… |gƒ¡ q| d¡r|r–t|d t ƒr–|d  ¡ |dd… kr–|d  |¡ q| t |dd… |gƒ¡ qW |S )zH
        Convert a list of tagged tokens to a chunk-parse tree.
        ÚSÚOzB-r   NzI-éÿÿÿÿ)r   ÚappendÚ
startswithÚ
isinstanceÚlabel)r   Ztagged_tokensÚsentÚtokr4   r   r   r   r5   Š   s    


*zNEChunkParser._tagged_to_parsec          	   C   s   g }x†| D ]~}t |tƒrzt|ƒdkr.tdƒ q
| |d d | ¡ ¡f¡ x<|dd… D ]}| |d | ¡ ¡f¡ qXW q
| |df¡ q
W |S )zH
        Convert a chunk-parse tree to a list of tagged tokens.
        r   z"Warning -- empty chunk in sentencezB-{0}r   NzI-{0}r=   )rA   r   r'   Úprintr?   r(   rB   )rC   ÚtoksÚchildrD   r   r   r   r8   œ   s    

zNEChunkParser._parse_to_taggedN)
r-   r.   r/   r0   r   r7   r2   r5   Ústaticmethodr8   r   r   r   r   r1   t   s   r1   c             C   s^   t  d| t j¡rdS t  d| t j¡r(dS t  d| t j¡rV|  ¡ rDdS |  ¡ rPdS dS nd	S d S )
Nz![0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$Znumberz\W+$Úpunctz\w+$ZupcaseZdowncaseZ	mixedcaseÚother)ÚreÚmatchÚUNICODEÚistitleÚislower)r   r   r   r   r   ¯   s    r   c             C   s    |   d¡rdS |  d¡d S d S )NÚVú-r   )r@   Úsplit)r:   r   r   r   r%   ¿   s    
r%   c             C   sŒ   |   ¡ }dd„ t|ƒD ƒ}tdg ƒ}xb| D ]Z}t|tƒrr| t| ¡ g ƒ¡ x6|D ]}|d  |t|ƒf¡ qRW q*| |t|ƒf¡ q*W |S )Nc             s   s   | ]\}}|V  qd S )Nr   )r9   r   r   r   r   r   ú	<genexpr>É   s    zpostag_tree.<locals>.<genexpr>r<   r>   )Úleavesr   r   rA   r?   rB   Únext)r6   r   Ztag_iterZnewtreerG   Zsubchildr   r   r   Úpostag_treeÆ   s    



rV   ÚbinaryTc             c   sx   xr| D ]j}xdt  |¡D ]V\}}}| d¡r0|r0qx:|D ]2}| d¡r6x"tt j ||¡|ƒD ]
}|V  qZW q6W qW qW d S )NZbnewsz.sgm)ÚosÚwalkÚendswithÚload_ace_fileÚpathÚjoin)ÚrootsÚfmtZ
skip_bnewsÚrootÚdirsÚfilesÚfrC   r   r   r   Úload_ace_dataÕ   s    


rd   c          	   c   s  t d tj | ¡d ¡ƒ | d }g }t|dƒ}t |¡ ¡ }W d Q R X xv| 	d¡D ]h}| 
d¡j}xV| 	d¡D ]H}| d¡d	kr†qrt| 
d
¡jƒ}	t| 
d¡jƒd }
| |	|
|f¡ qrW qVW t| dƒ}| ¡ }W d Q R X t dd|¡}dd„ }t d||¡}t dd|¡}t dd|¡}t dd|¡}tdd„ |D ƒƒ}|dkrâd}tdg ƒ}xjt|ƒD ]^\}	}
}|	|k rz|}	|
|	krˆqb| t|||	… ƒ¡ | td||	|
…  ¡ ƒ¡ |
}qbW | t||d … ƒ¡ |V  nª|dkr„d}tdg ƒ}xjt|ƒD ]^\}	}
}|	|k r|}	|
|	kr*q| t|||	… ƒ¡ | t|||	|
…  ¡ ƒ¡ |
}qW | t||d … ƒ¡ |V  ntdƒ‚d S )Nz  - {0}r   z.tmx.rdc.xmlÚrzdocument/entityZentity_typeZentity_mentionZTYPEÚNAMEzhead/charseq/startzhead/charseq/endz<(?!/?TEXT)[^>]+>Ú c             S   s   d|   ¡ |  ¡  d  S )Nú é   )ÚendÚstart)Úmr   r   r   Úsubfuncù   s    zload_ace_file.<locals>.subfuncz[\s\S]*<TEXT>z</TEXT>[\s\S]*z``z "z''z" c             s   s   | ]\}}}|V  qd S )Nr   )r9   r:   ÚeÚtypr   r   r   rS     s    z load_ace_file.<locals>.<genexpr>rW   r   r<   ZNEÚ
multiclasszbad fmt value)rE   r(   rX   r\   rR   ÚopenÚETr7   ZgetrootÚfindallr
   ÚtextÚgetÚintr?   ÚreadrK   Úsubr   r   ÚsortedÚextendr	   Ú
ValueError)Ztextfiler_   ZannfileZentitiesZinfileZxmlZentityro   Zmentionr:   rn   rt   rm   Zentity_typesÚirF   r   r   r   r[   à   sb    









r[   c             C   s˜   t  | ¡} t  |¡}d}xzt| |ƒD ]l\\}}\}}||  krHdkrzn n.|std |||¡ƒ td ddd¡ƒ d}q$d}td |||¡ƒ q$W d S )NFr=   z  {:15} {:15} {2}z...T)r1   r8   ÚziprE   r(   )ÚcorrectZguessedZellipsisÚwZctÚgtr   r   r   Ú
cmp_chunks)  s    

r   c          	   C   s  t dƒ tdƒtdƒtdƒtdƒg}t|| ƒ}dd„ |D ƒ}t dƒ t|ƒ}~t d	ƒ td
ƒg}t|| ƒ}dd„ |D ƒ}t dƒ tƒ }x@t|ƒD ]4\}	}
| |
 ¡ ¡}| |
|¡ |	dk rt	|
|ƒ qW t |ƒ d 
| ¡}t d 
|¡ƒ t|dƒ}t ||d¡ W d Q R X |S )NzLoading training data...zcorpora/ace_data/ace.devzcorpora/ace_data/ace.heldoutzcorpora/ace_data/bbn.devzcorpora/ace_data/muc.devc             S   s   g | ]}t |ƒ‘qS r   )rV   )r9   Útr   r   r   r;   A  s    zbuild_model.<locals>.<listcomp>zTraining...zLoading eval data...zcorpora/ace_data/ace.evalc             S   s   g | ]}t |ƒ‘qS r   )rV   )r9   r‚   r   r   r   r;   I  s    zEvaluating...r   z/tmp/ne_chunker_{0}.picklezSaving chunker to {0}...Úwbr>   )rE   r
   rd   r1   r   Ú	enumerater7   rT   Zscorer   r(   rq   ÚpickleÚdump)r_   Ztrain_pathsZtrain_treesZ
train_dataZcpZ
eval_pathsZ
eval_treesZ	eval_dataZ
chunkscorer|   r~   ZguessZoutfilenameZoutfiler   r   r   Úbuild_model8  s6    




r‡   Ú__main__)r‡   rp   )rW   T)rW   )%r0   Z
__future__r   r   rX   rK   r…   Z	xml.etreer   rr   Znltk.tagr   r   Znltk.classifyr   ÚImportErrorZ	nltk.treer   Znltk.tokenizer	   Z	nltk.datar
   Znltk.chunk.apir   Znltk.chunk.utilr   r   r1   r   r%   rV   rd   r[   r   r‡   r-   Znltk.chunk.named_entityr   r   r   r   Ú<module>
   s6   T;
I
%
