ó
ů`]c           @  sČ   d  Z  d d l m Z m Z d d l m Z d d l m Z e rU d d l m	 Z	 n d d l m
 Z
 y d d l Z Wn e k
 r d Z n Xd e f d	     YZ d
   Z e d k rÄ e   n  d S(   u  
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".

The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and text-yet to
be identified-then compares using a distance measure.

Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.

For details regarding the algorithm, see:
http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf

For details about An Crubadan, see:
http://borel.slu.edu/crubadan/index.html
i˙˙˙˙(   t   print_functiont   unicode_literals(   t   PY3(   t   trigrams(   t   maxsize(   t   maxintNt   TextCatc           B  s\   e  Z d Z i  Z d  Z d Z i  Z d   Z d   Z	 d   Z
 d   Z d   Z d   Z RS(	   u   <u   >c         C  s\   t  s t d   n  d d l m } | |  _ x' |  j j   D] } |  j j |  q> Wd  S(   Nu   classify.textcat requires the regex module that supports unicode. Try '$ pip install regex' and see https://pypi.python.org/pypi/regex for further details.i˙˙˙˙(   t   crubadan(   t   ret   EnvironmentErrort   nltk.corpusR   t   _corpust   langst	   lang_freq(   t   selfR   t   lang(    (    s4   lib/python2.7/site-packages/nltk/classify/textcat.pyt   __init__@   s    	c         C  s   t  j d d |  S(   u+    Get rid of punctuation except apostrophes u   [^\P{P}\']+u    (   R   t   sub(   R   t   text(    (    s4   lib/python2.7/site-packages/nltk/classify/textcat.pyt   remove_punctuationP   s    c         C  sĹ   d d l  m } m } |  j |  } | |  } |   } x | D]| } t |  j | |  j  } g  | D] }	 d j |	  ^ qh }
 x7 |
 D]/ } | | k rŻ | | c d 7<q d | | <q WqA W| S(   u)    Create FreqDist of trigrams within text i˙˙˙˙(   t   word_tokenizet   FreqDistu    i   (   t   nltkR   R   R   R   t   _START_CHARt	   _END_CHARt   join(   R   R   R   R   t
   clean_textt   tokenst   fingerprintt   tt   token_trigram_tuplest   trit   token_trigramst   cur_trigram(    (    s4   lib/python2.7/site-packages/nltk/classify/textcat.pyt   profileT   s    	"c         C  s   |  j  j |  } d } | | k rm t | j    j |  } t | j    j |  } t | |  } n t r| t } n t } | S(   um    Calculate the "out-of-place" measure between the
            text and language profile for a single trigram i    (	   R   R   t   listt   keyst   indext   absR   R   R   (   R   R   t   trigramt   text_profilet   lang_fdt   distt   idx_lang_profilet   idx_text(    (    s4   lib/python2.7/site-packages/nltk/classify/textcat.pyt	   calc_disth   s    	c         C  sp   i  } |  j  |  } xT |  j j j   D]@ } d } x' | D] } | |  j | | |  7} q; W| | | <q( W| S(   uU    Calculate the "out-of-place" measure between
            the text and all languages i    (   R"   R   t   _all_lang_freqR$   R-   (   R   R   t	   distancesR"   R   t	   lang_distR'   (    (    s4   lib/python2.7/site-packages/nltk/classify/textcat.pyt
   lang_dists   s    c         C  s+   |  j  |  |  _ t |  j d |  j j S(   u_    Find the language with the min distance
            to the text and return its ISO 639-3 code t   key(   R1   t   last_distancest   mint   get(   R   R   (    (    s4   lib/python2.7/site-packages/nltk/classify/textcat.pyt   guess_language   s    N(   t   __name__t
   __module__t   NoneR   t   fingerprintsR   R   R3   R   R   R"   R-   R1   R6   (    (    (    s4   lib/python2.7/site-packages/nltk/classify/textcat.pyR   7   s   					c       	   C  sz  d d l  m }  d d d d d d d	 d
 d g	 } i	 d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6} t   } xô | D]ě } |  j |  } t |  d } t t t |   } d } x[ t d  |  D]J }	 d }
 x1 t d  | |	  D] } |
 d! | |	 | 7}
 qö W| |
 7} qÖ Wt d" | d  d# !d$  | j	 |  } t d% | | | f  t d& d#  q Wd  S('   Ni˙˙˙˙(   t   udhru   Kurdish-UTF8u   Abkhaz-UTF8u   Farsi_Persian-UTF8u
   Hindi-UTF8u   Hawaiian-UTF8u   Russian-UTF8u   Vietnamese-UTF8u   Serbian_Srpski-UTF8u   Esperanto-UTF8u   Northern Kurdishu   kmru	   Abkhazianu   abku   Iranian Persianu   pesu   Hindiu   hinu   Hawaiianu   hawu   Russianu   rusu
   Vietnameseu   vieu   Serbianu   srpu	   Esperantou   epoi   u    i    u    u   Language snippet: i   u   ...u   Language detection: %s (%s)u   #(
   R
   R;   R   t   sentst   lenR#   t   mapt   ranget   printR6   (   R;   R   t   friendlyt   tct   cur_langt   raw_sentencest   rowst   colst   samplet   it   cur_sentt   jt   guess(    (    s4   lib/python2.7/site-packages/nltk/classify/textcat.pyt   demo   sF    	
	u   __main__(   t   __doc__t
   __future__R    R   t   nltk.compatR   t	   nltk.utilR   t   sysR   R   t   regexR   t   ImportErrorR9   t   objectR   RL   R7   (    (    (    s4   lib/python2.7/site-packages/nltk/classify/textcat.pyt   <module>   s   
d	4