# -*- coding: utf-8 -*-
"""
The tok-tok tokenizer is a simple, general tokenizer, where the input has one
sentence per line; thus only the final period is tokenized.

Tok-tok has been tested on, and gives reasonably good results for English,
Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
The input should be in UTF-8 encoding.

Reference:
Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
"""

import re

from six import text_type

from nltk.tokenize.api import TokenizerI


class ToktokTokenizer(TokenizerI):
    """
    This is a Python port of the tok-tok.pl from
    https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl

    >>> toktok = ToktokTokenizer()
    >>> text = u'Is 9.5 or 525,600 my favorite number?'
    >>> print (toktok.tokenize(text, return_str=True))
    Is 9.5 or 525,600 my favorite number ?
    >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
    >>> print (toktok.tokenize(text, return_str=True))
    The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
    >>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
    >>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
    >>> assert toktok.tokenize(text, return_str=True) == expected
    >>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
    True
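
    A further example (not from the original docstring; the expected output
    follows from the FINAL_PERIOD_1 rule below): only the sentence-final
    period is split off, so abbreviation periods stay attached.

    >>> print (toktok.tokenize(u'Mr. Smith arrived.', return_str=True))
    Mr. Smith arrived .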
    """

    # Replace non-breaking spaces with normal spaces.
    NON_BREAKING = re.compile(u"\u00A0"), " "

    # Pad some funky punctuation.
    FUNKY_PUNCT_1 = re.compile(u'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r' \1 '
    # Pad more funky punctuation.
    FUNKY_PUNCT_2 = re.compile(u'([({\[“‘„‚«‹「『])'), r' \1 '
    # Pad en dash and em dash.
    EN_EM_DASHES = re.compile(u'([–—])'), r' \1 '

    # Replace problematic characters with numeric character references.
    AMPERCENT = re.compile('& '), '&amp; '
    TAB = re.compile('\t'), ' &#9; '
    PIPE = re.compile('\|'), ' &#124; '

    # Pad numbers with commas to keep them from further tokenization.
    COMMA_IN_NUM = re.compile(r'(?<!,)([,،])(?![,\d])'), r' \1 '

    # Just pad problematic (often neurotic) hyphens/single quotes, etc.
    PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r' \1 '
    # Group ` ` stupid quotes ' ' into a single token.
    STUPID_QUOTES_1 = re.compile(r" ` ` "), r" `` "
    STUPID_QUOTES_2 = re.compile(r" ' ' "), r" '' "

    # Don't tokenize a period unless it ends the line and isn't preceded by
    # another period, e.g. "something ..." stays as-is but "something."
    # becomes "something ."
    FINAL_PERIOD_1 = re.compile(r"(?<!\.)\.$"), r" ."
    # Split off a final period even when it is followed by a closing quote,
    # e.g. 'stuff."' becomes 'stuff . "'
    FINAL_PERIOD_2 = re.compile(r"""(?<!\.)\.\s*(["'’»›”]) *$"""), r" . \1"

    # Treat continuous commas as one token.
    MULTI_COMMAS = re.compile(r'(,{2,})'), r' \1 '
    # Treat continuous dashes as one token.
    MULTI_DASHES = re.compile(r'(-{2,})'), r' \1 '
    # Treat multiple periods (e.g. an ellipsis) as one token.
    MULTI_DOTS = re.compile(r'(\.{2,})'), r' \1 '

    # This is the \p{Open_Punctuation} set from Perl's perluniprops.
    OPEN_PUNCT = text_type(u'([{༺༼᚛‚„⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝（［｛｟｢')
    # This is the \p{Close_Punctuation} set from Perl's perluniprops.
    CLOSE_PUNCT = text_type(u')]}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞）］｝｠｣')
    # This is the \p{Currency_Symbol} set from Perl's perluniprops.
    CURRENCY_SYM = text_type(u'$¢£¤¥֏؋৲৳৻૱௹฿៛₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺꠸﷼﹩＄￠￡￥￦')

    # Pad a space after any opening punctuation.
    OPEN_PUNCT_RE = re.compile(u'([{}])'.format(OPEN_PUNCT)), r'\1 '
    # Pad a space after any closing punctuation.
    CLOSE_PUNCT_RE = re.compile(u'([{}])'.format(CLOSE_PUNCT)), r'\1 '
    # Pad a space after any currency symbol.
    CURRENCY_SYM_RE = re.compile(u'([{}])'.format(CURRENCY_SYM)), r'\1 '

    # Use for tokenizing URL-unfriendly characters: [:/?#]
    URL_FOE_1 = re.compile(r':(?!//)'), r' : '   # in Perl: s{:(?!//)}{ : }g;
    URL_FOE_2 = re.compile(r'\?(?!\S)'), r' ? '  # in Perl: s{\?(?!\S)}{ ? }g;
    # in Perl: m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
    URL_FOE_3 = re.compile(r'(:\/\/)[\S+\.\S+\/\S+][\/]'), ' / '
    URL_FOE_4 = re.compile(r' /'), r' / '        # s{ /}{ / }g;

    # Left/right strip, i.e. remove heading/trailing spaces.
    # These strip regexes should NOT be used; use str.lstrip(), str.rstrip()
    # or str.strip() instead. (They are kept here for reference to the
    # original toktok.pl code.)
    LSTRIP = re.compile(r'^ +'), ''
    RSTRIP = re.compile(r'\s+$'), '\n'
    # Merge multiple spaces into one.
    ONE_SPACE = re.compile(r' {2,}'), ' '

    # The substitutions are applied in this order; note that FINAL_PERIOD_2
    # is applied twice.
    TOKTOK_REGEXES = [NON_BREAKING, FUNKY_PUNCT_1, URL_FOE_1, URL_FOE_2,
                      URL_FOE_3, URL_FOE_4, AMPERCENT, TAB, PIPE,
                      OPEN_PUNCT_RE, CLOSE_PUNCT_RE, MULTI_COMMAS,
                      COMMA_IN_NUM, FINAL_PERIOD_2, PROB_SINGLE_QUOTES,
                      STUPID_QUOTES_1, STUPID_QUOTES_2, CURRENCY_SYM_RE,
                      EN_EM_DASHES, MULTI_DASHES, MULTI_DOTS,
                      FINAL_PERIOD_1, FINAL_PERIOD_2, ONE_SPACE]

    def tokenize(self, text, return_str=False):
        # Convert the input to unicode before applying the patterns.
        text = text_type(text)
        for regexp, substitution in self.TOKTOK_REGEXES:
            text = regexp.sub(substitution, text)
        # Finally, strip heading and trailing spaces.
        text = text_type(text.strip())
        return text if return_str else text.split()
    ('   t   __name__t
   __module__t   __doc__t   ret   compilet   NON_BREAKINGt   FUNKY_PUNCT_1t   FUNKY_PUNCT_2t   EN_EM_DASHESt	   AMPERCENTt   TABt   PIPEt   COMMA_IN_NUMt   PROB_SINGLE_QUOTESt   STUPID_QUOTES_1t   STUPID_QUOTES_2t   FINAL_PERIOD_1t   FINAL_PERIOD_2t   MULTI_COMMASt   MULTI_DASHESt
   MULTI_DOTSR    t
   OPEN_PUNCTt   CLOSE_PUNCTt   CURRENCY_SYMt   formatt   OPEN_PUNCT_REt   CLOSE_PUNCT_REt   CURRENCY_SYM_REt	   URL_FOE_1t	   URL_FOE_2t	   URL_FOE_3t	   URL_FOE_4t   LSTRIPt   RSTRIPt	   ONE_SPACER   t   FalseR   (    (    (    s3   lib/python2.7/site-packages/nltk/tokenize/toktok.pyR      st   					(   R   R   t   sixR    t   nltk.tokenize.apiR   R   (    (    (    s3   lib/python2.7/site-packages/nltk/tokenize/toktok.pyt   <module>   s   
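

# A minimal usage sketch (added for illustration; the original module defines
# no __main__ block). It assumes nltk is importable so that the absolute
# import at the top of this file resolves.
if __name__ == '__main__':
    toktok = ToktokTokenizer()
    # With the default return_str=False, tokenize() returns a token list.
    print(toktok.tokenize(u'Is 9.5 or 525,600 my favorite number?'))
    # -> [u'Is', u'9.5', u'or', u'525,600', u'my', u'favorite', u'number', u'?']
    # With return_str=True it returns one space-delimited string instead.
    print(toktok.tokenize(u'Is 9.5 or 525,600 my favorite number?',
                          return_str=True))
    # -> Is 9.5 or 525,600 my favorite number ?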