B
    >?[                 @   s   d Z ddlmZmZ ddlmZ ddlmZ er>ddlm	Z	 nddlm
Z
 yddlZW n ek
rn   dZY nX G dd	 d	eZd
d Zedkre  dS )a  
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".

The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and text-yet to
be identified-then compares using a distance measure.

Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created seperately to read
those files.

For details regarding the algorithm, see:
http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf

For details about An Crubadan, see:
http://borel.slu.edu/crubadan/index.html
    )print_functionunicode_literals)PY3)trigrams)maxsize)maxintNc               @   sP   e Zd ZdZi ZdZdZi Zdd Zdd Z	dd	 Z
d
d Zdd Zdd ZdS )TextCatN<>c             C   sB   t stdddlm} || _x| j D ]}| j| q*W d S )Nzclassify.textcat requires the regex module that supports unicode. Try '$ pip install regex' and see https://pypi.python.org/pypi/regex for further details.r   )crubadan)reEnvironmentErrornltk.corpusr   _corpuslangs	lang_freq)selfr   lang r   4lib/python3.7/site-packages/nltk/classify/textcat.py__init__@   s    zTextCat.__init__c             C   s   t dd|S )z+ Get rid of punctuation except apostrophes z[^\P{P}\']+ )r   sub)r   textr   r   r   remove_punctuationP   s    zTextCat.remove_punctuationc             C   s   ddl m}m} | |}||}| }x^|D ]V}t| j| | j }dd |D }	x.|	D ]&}
|
|krx||
  d7  < qZd||
< qZW q.W |S )z) Create FreqDist of trigrams within text r   )word_tokenizeFreqDistc             S   s   g | ]}d  |qS )r   )join).0Ztrir   r   r   
<listcomp>^   s    z#TextCat.profile.<locals>.<listcomp>   )Znltkr   r   r   r   _START_CHAR	_END_CHAR)r   r   r   r   Z
clean_texttokensZfingerprinttZtoken_trigram_tuplesZtoken_trigramsZcur_trigramr   r   r   profileT   s    


zTextCat.profilec             C   s\   | j |}d}||krJt| |}t| |}t|| }ntrTt}nt}|S )zm Calculate the "out-of-place" measure between the
            text and language profile for a single trigram r   )	r   r   listkeysindexabsr   r   r   )r   r   trigramZtext_profileZlang_fdZdistZidx_lang_profileZidx_textr   r   r   	calc_disth   s    zTextCat.calc_distc             C   sT   i }|  |}x@| jj D ]0}d}x|D ]}|| |||7 }q*W |||< qW |S )zU Calculate the "out-of-place" measure between
            the text and all languages r   )r%   r   Z_all_lang_freqr'   r+   )r   r   Z	distancesr%   r   Z	lang_distr*   r   r   r   
lang_dists   s    

zTextCat.lang_distsc             C   s   |  || _t| j| jjdS )z_ Find the language with the min distance
            to the text and return its ISO 639-3 code )key)r,   last_distancesminget)r   r   r   r   r   guess_language   s    zTextCat.guess_language)__name__
__module____qualname__r   Zfingerprintsr!   r"   r.   r   r   r%   r+   r,   r1   r   r   r   r   r   7   s   r   c           
   C   s  ddl m}  ddddddd	d
dg	}dddddddddd	}t }x|D ]}| |}t|d }ttt|}d}xJtd|D ]<}	d}
x*td||	 D ]}|
d||	 |  7 }
qW ||
7 }q~W td|dd  d  |	|}td||| f  td qFW d S )Nr   )udhrzKurdish-UTF8zAbkhaz-UTF8zFarsi_Persian-UTF8z
Hindi-UTF8zHawaiian-UTF8zRussian-UTF8zVietnamese-UTF8zSerbian_Srpski-UTF8zEsperanto-UTF8zNorthern KurdishZ	AbkhazianzIranian PersianZHindiZHawaiianZRussianZ
VietnameseZSerbianZ	Esperanto)	ZkmrZabkZpesZhinZhawZrusZvieZsrpZepor    r    zLanguage snippet:    z...zLanguage detection: %s (%s)z############################################################################################################################################)
r   r5   r   Zsentslenr&   maprangeprintr1   )r5   r   ZfriendlyZtcZcur_langZraw_sentencesZrowsZcolsZsampleiZcur_sentjZguessr   r   r   demo   sD    


r>   __main__)__doc__Z
__future__r   r   Znltk.compatr   Z	nltk.utilr   sysr   r   Zregexr   ImportErrorobjectr   r>   r2   r   r   r   r   <module>   s   
d4