B
    P?[,              	   @   s   d Z ddlmZ ddlZddlmZmZ ddlmZ dZ	dZ
e
de	d	d
ddddf	Zedde ejejB ejB ZedZee	ejejB ejB ZedZd$ddZd%ddZG dd dZdd Zdd  Zd&d"d#ZdS )'a  
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
domains and tasks. The basic logic is this:

1. The tuple regex_strings defines a list of regular expression
   strings.

2. The regex_strings strings are put, in order, into a compiled
   regular expression object called word_re.

3. The tokenization is done by word_re.findall(s), where s is the
   user-supplied string, inside the tokenize() method of the class
   Tokenizer.

4. When instantiating Tokenizer objects, there is a single option:
   preserve_case.  By default, it is set to True. If it is set to
   False, then the tokenizer will downcase everything except for
   emoticons.

    )unicode_literalsN)int2byteunichr)html_entitiesac  
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )u  			# Capture 1: entire matched URL
  (?:
  https?:				# URL protocol and colon
    (?:
      /{1,3}				# 1-3 slashes
      |					#   or
      [a-z0-9%]				# Single letter or digit or '%'
                                       # (Trying not to match e.g. "URI::Escape")
    )
    |					#   or
                                       # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:					# One or more:
    [^\s()<>{}\[\]]+			# Run of non-space, non-()<>{}[]
    |					#   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
  )+
  (?:					# End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
    |					#   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]	# not a space or one of these punct chars
  )
  |					# OR, the following to match naked domains:
  (?:
  	(?<!@)			        # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)			        # not succeeded by a @,
                            # avoid matching "foo.na" in "foo.na@example.com"
  )
a	  
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )z	<[^>\s]+>z[\-]+>|<[\-]+z(?:@[\w_]+)z(?:\#+[\w_]+[\w\'_\-]*[\w_]+)z#[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]a  
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    z(%s)|z([^a-zA-Z0-9])\1{3,}z&(#?(x?))([^&;\s]+);strictc             C   s&   |d krd}t | tr"| ||S | S )Nzutf-8)
isinstancebytesdecode)textencodingerrors r   3lib/python3.7/site-packages/nltk/tokenize/casual.py_str_to_unicode   s
    
r   r   Tutf-8c                s     fdd}t |t| |S )u  
    Remove entities from text by converting them to their
    corresponding unicode character.

    :param text: a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    :param list keep:  list of entity names which should not be replaced.    This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    :param bool remove_illegal: If `True`, entities that can't be converted are    removed. Otherwise, entities that can't be converted are kept "as
    is".

    :returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

        >>> from nltk.tokenize.casual import _replace_html_entities
        >>> _replace_html_entities(b'Price: &pound;100')
        'Price: \xa3100'
        >>> print(_replace_html_entities(b'Price: &pound;100'))
        Price: £100
        >>>
    c                s   |  d}|  drzyJ|  dr,t|d}n
t|d}d|  krJdkr\n nt|dS W q tk
rv   d }Y qX n| kr|  d	S tj|}|d k	ryt|S  tk
r   Y nX rd
S |  d	S )N            
         cp1252r    )	groupintr   r
   
ValueErrorr   Zname2codepointgetr   )matchZentity_bodyZnumber)keepremove_illegalr   r   _convert_entity   s&    




z/_replace_html_entities.<locals>._convert_entity)ENT_REsubr   )r   r    r!   r   r"   r   )r    r!   r   _replace_html_entities   s    r%   c               @   s"   e Zd ZdZd	ddZdd ZdS )
TweetTokenizera  
    Tokenizer for tweets.

        >>> from nltk.tokenize import TweetTokenizer
        >>> tknzr = TweetTokenizer()
        >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
        >>> tknzr.tokenize(s0)
        ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

    Examples using `strip_handles` and `reduce_len parameters`:

        >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
        >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
        >>> tknzr.tokenize(s1)
        [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    TFc             C   s   || _ || _|| _d S )N)preserve_case
reduce_lenstrip_handles)selfr'   r(   r)   r   r   r   __init__  s    zTweetTokenizer.__init__c             C   sV   t |}| jrt|}| jr$t|}td|}t|}| j	sRt
tdd |}|S )z
        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings; concatenating this list returns        the original string if `preserve_case=False`
        z\1\1\1c             S   s   t | r| S |  S )N)EMOTICON_REsearchlower)xr   r   r   <lambda>/  s    z)TweetTokenizer.tokenize.<locals>.<lambda>)r%   r)   remove_handlesr(   reduce_lengtheningHANG_REr$   WORD_REfindallr'   listmap)r*   r   Z	safe_textZwordsr   r   r   tokenize  s    
zTweetTokenizer.tokenizeN)TFF)__name__
__module____qualname____doc__r+   r8   r   r   r   r   r&     s   
r&   c             C   s   t d}|d| S )ze
    Replace repeated character sequences of length 3 or greater with sequences
    of length 3.
    z	(.)\1{2,}z\1\1\1)recompiler$   )r   patternr   r   r   r2   9  s    
r2   c             C   s   t d}|d| S )z4
    Remove Twitter username handles from text.
    zv(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@) )r=   r>   r$   )r   r?   r   r   r   r1   B  s    r1   Fc             C   s   t |||d| S )z:
    Convenience function for wrapping the tokenizer.
    )r'   r(   r)   )r&   r8   )r   r'   r(   r)   r   r   r   casual_tokenizeR  s    rA   )Nr   )r   Tr   )TFF)r<   Z
__future__r   r=   Zsixr   r   Z	six.movesr   Z	EMOTICONSZURLSZREGEXPSr>   joinVERBOSEIUNICODEr4   r3   r,   r#   r   r%   r&   r2   r1   rA   r   r   r   r   <module>!   s2   .$



?7	