B
    >?[e                 @   s
  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZmZmZ d dlmZmZmZmZmZmZmZ d d	l m!Z!m"Z" d d
l#T d dl$m%Z% dQddZ&dd Z'dRddZ(dSddZ)dTddZ*dd Z+dd Z,dd Z-G dd  d eZ.dUd#d$Z/d%d& Z0e1d'fd(d)Z2d*d+ Z3d,d- Z4d.d/ Z5dVd1d2Z6d3d4 Z7d5d6 Z8d7d8 Z9d9d: Z:dWd;d<Z;dXd=d>Z<d?d@ Z=dAdB Z>dYdDdEZ?dFdG Z@i d'fdHdIZAdZdKdLZBd[dMdNZCdOdP ZDdS )\    )print_functionN)islicechaincombinations)pprint)defaultdictdeque)version_info)class_typesstring_types	text_type)build_openerinstall_opener
getproxiesProxyHandlerProxyBasicAuthHandlerProxyDigestAuthHandlerHTTPPasswordMgrWithDefaultRealm)slice_boundsraise_unorderable_types)*)python_2_unicode_compatibleselfc       
      C   s  t |  t| ts| j} td| j  xtt| 	 D ]\}}|
drNq:t|ddr\q:tjd dkrrtj}ntj}||d d \}}}}|r|d dkr|d kst|t|kr|d	d  }d
||f }t||||}	ttjd||	f ddt|d  d q:W d S )Nz%%s supports the following operations:_Z__deprecated__Fr         r      z%s.%sz%s%sz  -     )Zinitial_indentZsubsequent_indent)str
isinstancer
   	__class__print__name__sortedpydocZ
allmethodsitems
startswithgetattrsysr	   inspectZgetfullargspec
getargspeclenZformatargspectextwrapZfill)
objZselfnamenamemethodr+   argsZvarargsZvarkwdefaultsZargspec r3   (lib/python3.7/site-packages/nltk/util.pyusage-   s0    


r5   c              C   s   ddl } | jjjdkS )a  
    Return True if this function is run within idle.  Tkinter
    programs that are run in idle should never call ``Tk.mainloop``; so
    this function should be used to gate all calls to ``Tk.mainloop``.

    :warning: This function works by checking ``sys.stdin``.  If the
        user has modified ``sys.stdin``, then it may return incorrect
        results.
    :rtype: bool
    r   N)ZPyShellZRPCProxy)r)   stdinr!   r#   )r)   r3   r3   r4   in_idleU   s    r7   c             C   s   t tt| || dS )z
    Pretty print a sequence of data items

    :param data: the data stream to print
    :type data: sequence or iter
    :param start: the start position
    :type start: int
    :param end: the end position
    :type end: int
    N)r   listr   )datastartendr3   r3   r4   prj   s    r<   F   c             C   s   t dtj| |d dS )z
    Pretty print a string, breaking lines on whitespace

    :param s: the string to print, consisting of words and spaces
    :type s: str
    :param width: the display width
    :type width: int
    
)widthN)r"   joinr-   wrap)sr?   r3   r3   r4   print_stringx   s    	rC   r   c             C   s   d tj| | |dS )a#  
    Pretty print a list of text tokens, breaking lines on whitespace

    :param tokens: the tokens to print
    :type tokens: list
    :param separator: the string to use to separate tokens
    :type separator: str
    :param width: the display width (default=70)
    :type width: int
    r>   )r?   )r@   r-   rA   )tokensZ	separatorr?   r3   r3   r4   	tokenwrap   s    rE   c               C   s   t d dkot d dkS )Nr      r   r   )r	   r3   r3   r3   r4   py25   s    rG   c               C   s   t d dkot d dkS )Nr   rF   r      )r	   r3   r3   r3   r4   py26   s    rI   c               C   s   t d dkot d dkS )Nr   rF   r      )r	   r3   r3   r3   r4   py27   s    rK   c               @   s   e Zd Zdd ZdS )Indexc             C   s0   t | t x|D ]\}}| | | qW d S )N)r   __init__r8   append)r   Zpairskeyvaluer3   r3   r4   rM      s    zIndex.__init__N)r#   
__module____qualname__rM   r3   r3   r3   r4   rL      s   rL   {}c             C   s*   t t| tj|d | |  dS )a3  
    Return a string with markers surrounding the matched substrings.
    Search str for substrings matching ``regexp`` and wrap the matches
    with braces.  This is convenient for learning about regular expressions.

    :param regexp: The regular expression.
    :type regexp: str
    :param string: The string being matched.
    :type string: str
    :param left: The left delimiter (printed before the matched substring)
    :type left: str
    :param right: The right delimiter (printed after the matched substring)
    :type right: str
    :rtype: str
    z\g<0>N)r"   recompileMsubrstrip)Zregexpstringleftrightr3   r3   r4   re_show   s    r]   c          	   C   sD   t | dr|  S t| tr8t| d
}| S Q R X ntdd S )Nreadrz2Must be called with a filename or file-like object)hasattrr^   r    r   open
ValueError)fZinfiler3   r3   r4   
filestring   s    

rd   c             #   sl   t | dfg}xX|rf| \} |V   |kry | fdd||D  W q tk
rb   Y qX qW dS )a  Traverse the nodes of a tree in breadth-first order.
    (No need to check for cycles.)
    The first argument should be the tree root;
    children should be a function taking as argument a tree node
    and returning an iterator of the node's children.
    r   c             3   s   | ]}| d  fV  qdS )r   Nr3   ).0c)depthr3   r4   	<genexpr>   s    z breadth_first.<locals>.<genexpr>N)r   popleftextend	TypeError)ZtreeZchildrenZmaxdepthZqueueZnoder3   )rh   r4   breadth_first   s     rm   c          
   C   s  d}dg}y| ttj W n tk
r4   Y nX y| t d  W n ttfk
rd   Y nX y| t d  W n ttfk
r   Y nX | d x@|D ]8}|sqyt| |}|}W n t	t
fk
r   Y qX P qW |st	dddd |D  n||fS dS )	at  
    Given a byte string, attempt to decode it.
    Tries the standard 'UTF8' and 'latin-1' encodings,
    Plus several gathered from locale information.

    The calling program *must* first call::

        locale.setlocale(locale.LC_ALL, '')

    If successful it returns ``(decoded_unicode, successful_encoding)``.
    If unsuccessful it raises a ``UnicodeError``.
    Nzutf-8r   zlatin-1z?Unable to decode input data. Tried the following encodings: %s.z, c             S   s   g | ]}|rt |qS r3   )repr)rf   encr3   r3   r4   
<listcomp>)  s    z"guess_encoding.<locals>.<listcomp>)rN   localenl_langinfoCODESETAttributeErrorZ	getlocale
IndexErrorZgetdefaultlocaler   UnicodeErrorLookupErrorr@   )r9   Zsuccessful_encodingZ	encodingsro   Zdecodedr3   r3   r4   guess_encoding   s:    


rx   c                s   t    fdd| D S )Nc                s"   g | ]}| kr  |s|qS r3   )add)rf   x)seenr3   r4   rp   7  s    zunique_list.<locals>.<listcomp>)set)Zxsr3   )r{   r4   unique_list4  s    r}   c             C   sV   t t}xH| D ]@}t| | drBx,| | D ]}|| | q*W q||| | < qW |S )N__iter__)r   r8   r`   rN   )dZinverted_dictrO   Ztermr3   r3   r4   invert_dict?  s    
r   Fc                s   |rdd  ndd  t fddD }t  fddD }xhD ]`}|| }|| }xJ|r| }|| ||| |O }||| |O }||8 }q^W qHW |S )a  
    Calculate the transitive closure of a directed graph,
    optionally the reflexive transitive closure.

    The algorithm is a slight modification of the "Marking Algorithm" of
    Ioannidis & Ramakrishnan (1998) "Efficient Transitive Closure Algorithms".

    :param graph: the initial graph, represented as a dictionary of sets
    :type graph: dict(set)
    :param reflexive: if set, also make the closure reflexive
    :type reflexive: bool
    :rtype: dict(set)
    c             S   s
   t | gS )N)r|   )kr3   r3   r4   <lambda>_  s    z$transitive_closure.<locals>.<lambda>c             S   s   t  S )N)r|   )r   r3   r3   r4   r   a  s    c             3   s   | ]}| |   fV  qd S )N)copy)rf   r   )graphr3   r4   ri   c  s    z%transitive_closure.<locals>.<genexpr>c             3   s   | ]}| |fV  qd S )Nr3   )rf   r   )base_setr3   r4   ri   e  s    )dictpopry   
setdefaultget)r   Z	reflexiveZagenda_graphZclosure_graphiZagendaZclosurejr3   )r   r   r4   transitive_closureP  s    


r   c             C   s<   i }x2| D ]*}x$| | D ]}| |t | qW q
W |S )z
    Inverts a directed graph.

    :param graph: the graph, represented as a dictionary of sets
    :type graph: dict(set)
    :return: the inverted graph
    :rtype: dict(set)
    )r   r|   ry   )r   invertedrO   rP   r3   r3   r4   invert_graphr  s
    	
r   c             C   s   t dd S )Nz>To remove HTML markup, use BeautifulSoup's get_text() function)NotImplementedError)Zhtmlr3   r3   r4   
clean_html  s    r   c             C   s   t dd S )Nz>To remove HTML markup, use BeautifulSoup's get_text() function)r   )Zurlr3   r3   r4   	clean_url  s    r   c              G   s`   g }xV| D ]N}t |ttfs"|g}x4|D ],}t |ttfrJ|t| q(|| q(W q
W |S )z
    Flatten a list.

        >>> from nltk.util import flatten
        >>> flatten(1, 2, ['b', 'a' , ['c', 'd']], 3)
        [1, 2, 'b', 'a', 'c', 'd', 3]

    :param args: items and lists to be combined into a single list
    :rtype: list
    )r    r8   tuplerk   flattenrN   )r1   rz   litemr3   r3   r4   r     s    

r   c             C   s<   t | } |r t|f|d  | } |r8t| |f|d  } | S )a  
    Returns a padded sequence of items before ngram extraction.

        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        ['<s>', 1, 2, 3, 4, 5, '</s>']
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        ['<s>', 1, 2, 3, 4, 5]
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [1, 2, 3, 4, 5, '</s>']

    :param sequence: the source data to be padded
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether the ngrams should be left-padded
    :type pad_left: bool
    :param pad_right: whether the ngrams should be right-padded
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :rtype: sequence or iter
    r   )iterr   )sequencenpad_left	pad_rightleft_pad_symbolright_pad_symbolr3   r3   r4   pad_sequence  s     r   c       	      c   s   t | |||||} g }x@|dkrVyt| }W n tk
r@   dS X || |d8 }qW x&| D ]}|| t|V  |d= q^W dS )a  
    Return the ngrams generated from a sequence of items, as an iterator.
    For example:

        >>> from nltk.util import ngrams
        >>> list(ngrams([1,2,3,4,5], 3))
        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

    Wrap with list for a list version of this function.  Set pad_left
    or pad_right to true in order to get additional ngrams:

        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]


    :param sequence: the source data to be converted into ngrams
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether the ngrams should be left-padded
    :type pad_left: bool
    :param pad_right: whether the ngrams should be right-padded
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :rtype: sequence or iter
    r   Nr   )r   nextStopIterationrN   r   )	r   r   r   r   r   r   historyZ	next_itemr   r3   r3   r4   ngrams  s    +




r   c             k   s"   xt | df|D ]
}|V  qW dS )a  
    Return the bigrams generated from a sequence of items, as an iterator.
    For example:

        >>> from nltk.util import bigrams
        >>> list(bigrams([1,2,3,4,5]))
        [(1, 2), (2, 3), (3, 4), (4, 5)]

    Use bigrams for a list version of this function.

    :param sequence: the source data to be converted into bigrams
    :type sequence: sequence or iter
    :rtype: iter(tuple)
    rF   N)r   )r   kwargsr   r3   r3   r4   bigrams  s    r   c             k   s"   xt | df|D ]
}|V  qW dS )a  
    Return the trigrams generated from a sequence of items, as an iterator.
    For example:

        >>> from nltk.util import trigrams
        >>> list(trigrams([1,2,3,4,5]))
        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

    Use trigrams for a list version of this function.

    :param sequence: the source data to be converted into trigrams
    :type sequence: sequence or iter
    :rtype: iter(tuple)
    r   N)r   )r   r   r   r3   r3   r4   trigrams3  s    r   r   c             k   sJ   |dkrt | }x4t||d D ]"}xt| |f|D ]
}|V  q4W q W dS )a  
    Returns all possible ngrams generated from a sequence of items, as an iterator.

        >>> sent = 'a b c'.split()
        >>> list(everygrams(sent))
        [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c'), ('a', 'b', 'c')]
        >>> list(everygrams(sent, max_len=2))
        [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c')]

    :param sequence: the source data to be converted into trigrams
    :type sequence: sequence or iter
    :param min_len: minimum length of the ngrams, aka. n-gram order/degree of ngram
    :type  min_len: int
    :param max_len: maximum length of the ngrams (set to length of sequence by default)
    :type  max_len: int
    :rtype: iter(tuple)
    re   r   N)r,   ranger   )r   Zmin_lenZmax_lenr   r   Zngr3   r3   r4   
everygramsG  s
    r   c       	      k   s   d|ksd|krt | |f|} t }xdt| || d|dD ]L}|dd }|dd }x.t||d D ]}|d |krxqf|| V  qfW q:W dS )a  
    Returns all possible skipgrams generated from a sequence of items, as an iterator.
    Skipgrams are ngrams that allows tokens to be skipped.
    Refer to http://homepages.inf.ed.ac.uk/ballison/pdf/lrec_skipgrams.pdf

        >>> sent = "Insurgents killed in ongoing fighting".split()
        >>> list(skipgrams(sent, 2, 2))
        [('Insurgents', 'killed'), ('Insurgents', 'in'), ('Insurgents', 'ongoing'), ('killed', 'in'), ('killed', 'ongoing'), ('killed', 'fighting'), ('in', 'ongoing'), ('in', 'fighting'), ('ongoing', 'fighting')]
        >>> list(skipgrams(sent, 3, 2))
        [('Insurgents', 'killed', 'in'), ('Insurgents', 'killed', 'ongoing'), ('Insurgents', 'killed', 'fighting'), ('Insurgents', 'in', 'ongoing'), ('Insurgents', 'in', 'fighting'), ('Insurgents', 'ongoing', 'fighting'), ('killed', 'in', 'ongoing'), ('killed', 'in', 'fighting'), ('killed', 'ongoing', 'fighting'), ('in', 'ongoing', 'fighting')]

    :param sequence: the source data to be converted into trigrams
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param k: the skip distance
    :type  k: int
    :rtype: iter(tuple)
    r   r   T)r   r   Nr   re   )r   objectr   r   )	r   r   r   r   ZSENTINELZngramheadtailZ	skip_tailr3   r3   r4   	skipgramsa  s    r   c             C   s  |d }t |}d}d}t| dr6t| jjd }n"| dd |  d }| d xT||k r||f}|| d }	||	r||	 \}
}nzd}x^| t	d|	d  |	dkr| 
  |  }
|  }|dkrP ||	 d }	|	|d krdS qW ||k r|
|f||	< |
|kr6||	d ks,td|	d }nZ|d| |krL|S ||krv||	d ksltd|	d }n||k r|
t | d }|d7 }||f}||kr\dS q\W dS )	a  
    Return the line from the file with first word key.
    Searches through a sorted file using the binary search algorithm.

    :type file: file
    :param file: the file to be searched through.
    :type key: str
    :param key: the identifier we are searching for.
    r   r   r/   r   rF    Nzinfinite loop)r,   r`   osstatr/   st_sizeseektellr   maxZdiscard_linereadlineAssertionError)filerO   cacheZ
cacheDepthZkeylenr:   ZcurrentDepthr;   Z	lastStateZmiddleoffsetlineZ	thisStater3   r3   r4   binary_search_file  sV    








r   r   c             C   s   ddl m} | dkr@yt d } W n tk
r>   tdY nX t| | d}t|}|dk	rt }|jd| ||d |	t
| |	t| t| dS )a  
    Set the HTTP proxy for Python to download through.

    If ``proxy`` is None then tries to set proxy from environment or system
    settings.

    :param proxy: The HTTP proxy server to use. For example:
        'http://proxy.example.com:3128/'
    :param user: The username to authenticate with. Use None to disable
        authentication.
    :param password: The password to authenticate with.
    r   )compatNhttpz'Could not detect default proxy settings)Zhttpsr   )ZrealmZuriuserZpasswd)Znltkr   r   KeyErrorrb   r   r   r   Zadd_passwordZadd_handlerr   r   r   )proxyr   Zpasswordr   Zproxy_handleropenerZpassword_managerr3   r3   r4   	set_proxy  s    r   c             C   s   d|d  }t | rb| jr$| j s.|d | _x| D ]} t| |d  q4W | jrZ| j s||| _n|r|| jrv| j s||| _dS )a  
    Recursive function to indent an ElementTree._ElementInterface
    used for pretty printing. Run indent on elem and then output
    in the normal way.

    :param elem: element to be indented. will be modified.
    :type elem: ElementTree._ElementInterface
    :param level: level of indentation for this element
    :type level: nonnegative integer
    :rtype:   ElementTree._ElementInterface
    :return:  Contents of elem indented to reflect its structure
    r>   z  r   N)r,   textstripelementtree_indentr   )elemlevelr   r3   r3   r4   r      s    

r   c             C   sj   d|  kr| krbn nJd\}}x8t dt|| | d D ]}|| 9 }||9 }| d8 } q:W || S dS dS )a9  
    This function is a fast way to calculate binomial coefficients, commonly
    known as nCk, i.e. the number of combinations of n things taken k at a time.
    (https://en.wikipedia.org/wiki/Binomial_coefficient).

    This is the *scipy.special.comb()* with long integer computation but this
    approximation is faster, see https://github.com/nltk/nltk/issues/1181

        >>> choose(4, 2)
        6
        >>> choose(6, 2)
        15

    :param n: The number of things.
    :type n: int
    :param r: The number of times a thing is taken.
    :type r: int
    r   )r   r   r   N)r   min)r   r   ZntokZktoktr3   r3   r4   choose   s    r   )r   )r   N)r=   )r   r=   )rS   rT   )F)FFNN)FFNN)r   re   )Nr   )r   )EZ
__future__r   r)   r*   rq   rU   typesr-   r%   Zbisectr   	itertoolsr   r   r   r   collectionsr   r   r	   Zsixr
   r   r   Zsix.moves.urllib.requestr   r   r   r   r   r   r   Znltk.internalsr   r   Znltk.collectionsZnltk.compatr   r5   r7   r<   rC   rE   rG   rI   rK   rL   r]   rd   r   rm   rx   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r3   r3   r3   r4   <module>   sl   $

(


	
=
"    
(   
9
+J
*
 