B
    >?[a              	   @   sF  d Z ddlmZmZmZmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlZddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZ eddddddddgZG dd de Z!eG dd de Z"G dd de Z#eG dd de Z$G dd de$Z%d d! Z&e'd"kr4e&  dddddgZ(dS )#a  
This module brings together a variety of NLTK functionality for
text analysis, and provides simple, interactive interfaces.
Functionality includes: concordancing, collocation discovery,
regular expression search over tokenized strings, and
distributional similarity.
    )print_functiondivisionunicode_literalsabsolute_import)log)defaultdictCounter
namedtuple)reduceN)	text_type)FreqDist)ConditionalFreqDist)	tokenwrapLazyConcatenation)	f_measureBigramAssocMeasures)BigramCollocationFinder)python_2_unicode_compatibleConcordanceLineleftZqueryrightoffset
left_printright_printlinec               @   sT   e Zd ZdZedd Zdddd fddZd	d
 Zdd ZdddZ	dddZ
dS )ContextIndexa  
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
    c             C   sH   |dkr| |d    nd}|t| d kr<| |d    nd}||fS )z;One left token and one right token, normalized to lowercaser      z*START*z*END*)lowerlen)tokensir   r    r!   (lib/python3.7/site-packages/nltk/text.py_default_context.   s    $zContextIndex._default_contextNc             C   s   | S )Nr!   )xr!   r!   r"   <lambda>5   s    zContextIndex.<lambda>c                sv   |_ _|r|_nj_ r6 fddD tfddtD _tfddtD _d S )Nc                s   g | ]} |r|qS r!   r!   ).0t)filterr!   r"   
<listcomp>=   s    z)ContextIndex.__init__.<locals>.<listcomp>c             3   s(   | ] \}}  | |fV  qd S )N)_key_context_func)r&   r    w)selfr   r!   r"   	<genexpr>?   s    z(ContextIndex.__init__.<locals>.<genexpr>c             3   s(   | ] \}}  | |fV  qd S )N)r+   r*   )r&   r    r,   )r-   r   r!   r"   r.   B   s    )r*   _tokensr+   r#   CFD	enumerate_word_to_contexts_context_to_words)r-   r   Zcontext_funcr(   keyr!   )r(   r-   r   r"   __init__5   s    zContextIndex.__init__c             C   s   | j S )zw
        :rtype: list(str)
        :return: The document that this context index was
            created from.
        )r/   )r-   r!   r!   r"   r   E   s    zContextIndex.tokensc             C   sJ   |  |}t| j| }i }x(| j D ]\}}t|t|||< q(W |S )z
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same
        context.
        )r*   setr2   itemsr   )r-   wordZword_contextsscoresr,   Z
w_contextsr!   r!   r"   word_similarity_dictM   s    
z!ContextIndex.word_similarity_dict   c             C   s~   t t}x\| j| | D ]H}xB| j| D ]4}||kr*||  | j| | | j| |  7  < q*W qW t||jddd | S )NT)r4   reverse)r   intr2   r*   r3   sortedget)r-   r8   nr9   cr,   r!   r!   r"   similar_words\   s    (zContextIndex.similar_wordsFc                s   fddD fddD fddt tD }ttj |rf|rftddn& spt S t fddD }|S d	S )
a  
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        c                s   g | ]}  |qS r!   )r*   )r&   r,   )r-   r!   r"   r)   q   s    z0ContextIndex.common_contexts.<locals>.<listcomp>c                s   g | ]}t  j| qS r!   )r6   r2   )r&   r,   )r-   r!   r"   r)   r   s    c                s   g | ]} | s| qS r!   r!   )r&   r    )contextswordsr!   r"   r)   s   s    z%The following word(s) were not found: c             3   s*   | ]"}j | D ]}| kr|V  qqd S )N)r2   )r&   r,   rA   )commonr-   r!   r"   r.   |   s    z/ContextIndex.common_contexts.<locals>.<genexpr>N)ranger   r
   r6   intersection
ValueErrorjoinr   )r-   rD   Zfail_on_unknownemptyfdr!   )rF   rC   r-   rD   r"   common_contextsf   s    zContextIndex.common_contexts)r;   )F)__name__
__module____qualname____doc__staticmethodr#   r5   r   r:   rB   rM   r!   r!   r!   r"   r   &   s   

r   c               @   sL   e Zd ZdZdd fddZdd Zdd	 Zd
d ZdddZdddZ	dS )ConcordanceIndexzs
    An index that can be used to look up the offset locations at which
    a given word occurs in a document.
    c             C   s   | S )Nr!   )r$   r!   r!   r"   r%      s    zConcordanceIndex.<lambda>c             C   sJ   || _ || _tt| _x.t|D ]"\}}| |}| j| | q W dS )a  
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        N)r/   r*   r   list_offsetsr1   append)r-   r   r4   indexr8   r!   r!   r"   r5      s    

zConcordanceIndex.__init__c             C   s   | j S )z{
        :rtype: list(str)
        :return: The document that this concordance index was
            created from.
        )r/   )r-   r!   r!   r"   r      s    zConcordanceIndex.tokensc             C   s   |  |}| j| S )z
        :rtype: list(int)
        :return: A list of the offset positions at which the given
            word occurs.  If a key function was specified for the
            index, then given word's key will be looked up.
        )r*   rU   )r-   r8   r!   r!   r"   offsets   s    
zConcordanceIndex.offsetsc             C   s   dt | jt | jf S )Nz+<ConcordanceIndex for %d tokens (%d types)>)r   r/   rU   )r-   r!   r!   r"   __repr__   s    zConcordanceIndex.__repr__P   c          	   C   s   |t | d d }|d }g }| |}|rx|D ]}| j| }| jtd|| | }	| j|d ||  }
d|	| d }d|
d| }d|||g}t|	||
||||}|| q4W |S )zB
        Find all concordance lines given the query word.
              r   r   rE   N)r   rX   r/   maxrJ   r   rV   )r-   r8   widthZ
half_widthcontextconcordance_listrX   r    Z
query_wordZleft_contextZright_contextr   r   Z
line_printconcordance_liner!   r!   r"   find_concordance   s,    


z!ConcordanceIndex.find_concordance   c             C   sj   | j ||d}|std nJt|t|}td|t| x&t|d| D ]\}}t|j qPW dS )a  
        Print concordance lines given the query word.
        :param word: The target word
        :type word: str
        :param lines: The number of lines to display (default=25)
        :type lines: int
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param save: The option to save the concordance.
        :type save: bool
        )r^   z
no matcheszDisplaying {} of {} matches:N)rb   printminr   formatr1   r   )r-   r8   r^   linesr`   r    ra   r!   r!   r"   print_concordance   s    
z"ConcordanceIndex.print_concordanceN)rZ   )rZ   rc   )
rN   rO   rP   rQ   r5   r   rX   rY   rb   rh   r!   r!   r!   r"   rS      s   

"rS   c               @   s    e Zd ZdZdd Zdd ZdS )TokenSearchera  
    A class that makes it easier to use regular expressions to search
    over tokenized strings.  The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``.  The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.
    c             C   s   d dd |D | _d S )N c             s   s   | ]}d | d V  qdS )<>Nr!   )r&   r,   r!   r!   r"   r.      s    z)TokenSearcher.__init__.<locals>.<genexpr>)rJ   _raw)r-   r   r!   r!   r"   r5      s    zTokenSearcher.__init__c             C   s   t dd|}t dd|}t dd|}t dd|}t || j}x(|D ] }|dsL|drLtd	qLW d
d |D }|S )a"  
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.text import TokenSearcher
        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        z\srj   rk   z(?:<(?:rl   z)>)z	(?<!\\)\.z[^>]z$Bad regexp for TokenSearcher.findallc             S   s   g | ]}|d d  dqS )r   z><)split)r&   hr!   r!   r"   r)   (  s    z)TokenSearcher.findall.<locals>.<listcomp>)resubfindallrm   
startswithendswithrI   )r-   regexphitsrp   r!   r!   r"   rs     s    
zTokenSearcher.findallN)rN   rO   rP   rQ   r5   rs   r!   r!   r!   r"   ri      s   	ri   c               @   s   e Zd ZdZdZd/ddZdd Zdd	 Zd0ddZd1ddZ	d2ddZ
dd Zdd Zdd Zd3ddZd4ddZdd Zd d! Zd"d# Zd$d% Zd&d' Zed(Zd)d* Zd+d, Zd-d. ZdS )5Texta  
    A wrapper around a sequence of simple (string) tokens, which is
    intended to support initial exploration of texts (via the
    interactive console).  Its methods perform a variety of analyses
    on the text's contexts (e.g., counting, concordancing, collocation
    discovery), and display the results.  If you wish to write a
    program which makes use of these analyses, then you should bypass
    the ``Text`` class, and use the appropriate analysis function or
    class directly instead.

    A ``Text`` is typically initialized from a given document or
    corpus.  E.g.:

    >>> import nltk.corpus
    >>> from nltk.text import Text
    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))

    TNc             C   s   | j rt|}|| _|r || _ndd|dd krb|dd d}ddd |d| D | _n"ddd |dd	 D d
 | _dS )zv
        Create a Text object.

        :param tokens: The source text.
        :type tokens: sequence of str
        ]Nr;   rE   c             s   s   | ]}t |V  qd S )N)r   )r&   tokr!   r!   r"   r.   V  s    z Text.__init__.<locals>.<genexpr>r   c             s   s   | ]}t |V  qd S )N)r   )r&   rz   r!   r!   r"   r.   X  s       z...)_COPY_TOKENSrT   r   namerW   rJ   )r-   r   r}   endr!   r!   r"   r5   G  s     zText.__init__c             C   s
   | j | S )N)r   )r-   r    r!   r!   r"   __getitem__^  s    zText.__getitem__c             C   s
   t | jS )N)r   r   )r-   r!   r!   r"   __len__a  s    zText.__len__O   rc   c             C   s.   d| j krt| jdd d| _| j|||S )a  
        Prints a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        _concordance_indexc             S   s   |   S )N)r   )sr!   r!   r"   r%   x  s    z"Text.concordance.<locals>.<lambda>)r4   )__dict__rS   r   r   rh   )r-   r8   r^   rg   r!   r!   r"   concordanceh  s    
zText.concordancec             C   s4   d| j krt| jdd d| _| j||d| S )a  
        Generate a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        r   c             S   s   |   S )N)r   )r   r!   r!   r"   r%     s    z'Text.concordance_list.<locals>.<lambda>)r4   N)r   rS   r   r   rb   )r-   r8   r^   rg   r!   r!   r"   r`   }  s    
zText.concordance_listr;   r[   c                s   d| j kr| j|kr| j|ks|| _|| _ddlm} |d t| j|}|	d |
 fdd t }||j|| _dd	 | jD }tt|d
d dS )aA  
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        _collocationsr   )	stopwordsZenglishr[   c                s   t | dk p|   kS )N   )r   r   )r,   )ignored_wordsr!   r"   r%     s    z#Text.collocations.<locals>.<lambda>c             S   s   g | ]\}}|d  | qS )rE   r!   )r&   w1w2r!   r!   r"   r)     s    z%Text.collocations.<locals>.<listcomp>z; )Z	separatorN)r   Z_numZ_window_sizenltk.corpusr   rD   r   Z
from_wordsr   Zapply_freq_filterZapply_word_filterr   ZnbestZlikelihood_ratior   rd   r   )r-   numZwindow_sizer   finderZbigram_measuresZcolloc_stringsr!   )r   r"   collocations  s    




zText.collocationsc             C   s   | j |S )zJ
        Count the number of times this word appears in the text.
        )r   count)r-   r8   r!   r!   r"   r     s    z
Text.countc             C   s   | j |S )zQ
        Find the index of the first occurrence of the word in the text.
        )r   rW   )r-   r8   r!   r!   r"   rW     s    z
Text.indexc             C   s   t d S )N)NotImplementedError)r-   methodr!   r!   r"   readability  s    zText.readabilityc                s   d| j kr$t| jdd dd d| _ | jj krt  t fdd D }dd	 |	|D }t
t| nt
d
 dS )a~  
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        _word_context_indexc             S   s   |   S )N)isalpha)r$   r!   r!   r"   r%     s    zText.similar.<locals>.<lambda>c             S   s   |   S )N)r   )r   r!   r!   r"   r%     s    )r(   r4   c             3   s0   | ](}| D ]}| kr|ks|V  qqd S )Nr!   )r&   r,   rA   )rC   wcir8   r!   r"   r.     s   zText.similar.<locals>.<genexpr>c             S   s   g | ]\}}|qS r!   r!   )r&   r,   _r!   r!   r"   r)     s    z Text.similar.<locals>.<listcomp>z
No matchesN)r   r   r   r   r   r2   Z
conditionsr6   r   most_commonrd   r   )r-   r8   r   rL   rD   r!   )rC   r   r8   r"   similar  s    
zText.similarc          
   C   s   d| j krt| jdd d| _yJ| j|d}|s<td n*dd ||D }ttd	d
 |D  W n* tk
r } zt| W dd}~X Y nX dS )aV  
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        r   c             S   s   |   S )N)r   )r   r!   r!   r"   r%     s    z&Text.common_contexts.<locals>.<lambda>)r4   TzNo common contexts were foundc             S   s   g | ]\}}|qS r!   r!   )r&   r,   r   r!   r!   r"   r)     s    z(Text.common_contexts.<locals>.<listcomp>c             s   s   | ]\}}|d  | V  qdS )r   Nr!   )r&   r   r   r!   r!   r"   r.     s    z'Text.common_contexts.<locals>.<genexpr>N)	r   r   r   r   rM   rd   r   r   rI   )r-   rD   r   rL   Zranked_contextser!   r!   r"   rM     s    

zText.common_contextsc             C   s   ddl m} || | dS )z
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        r   )dispersion_plotN)Z	nltk.drawr   )r-   rD   r   r!   r!   r"   r     s    	zText.dispersion_plotc             C   s   ddl }|dt dS )zF
        Issues a reminder to users following the book online
        r   Nz-The generate() method is no longer available.)warningswarnDeprecationWarning)r-   rD   r   r!   r!   r"   generate
  s    zText.generatec             G   s   |   j|  dS )zc
        See documentation for FreqDist.plot()
        :seealso: nltk.prob.FreqDist.plot()
        N)vocabplot)r-   argsr!   r!   r"   r     s    z	Text.plotc             C   s   d| j krt| | _| jS )z.
        :seealso: nltk.prob.FreqDist
        _vocab)r   r   r   )r-   r!   r!   r"   r     s    

z
Text.vocabc             C   s@   d| j krt| | _| j|}dd |D }tt|d dS )a  
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        _token_searcherc             S   s   g | ]}d  |qS )rE   )rJ   )r&   rp   r!   r!   r"   r)   @  s    z Text.findall.<locals>.<listcomp>z; N)r   ri   r   rs   rd   r   )r-   rv   rw   r!   r!   r"   rs   $  s
    

zText.findallz\w+|[\.\!\?]c             C   s   |d }x$|dkr,| j || s,|d8 }q
W |dkr>|| nd}|d }x(|t|k rr| j || sr|d7 }qLW |t|kr|| nd}||fS )z
        One left & one right token, both case-normalized.  Skip over
        non-sentence-final punctuation.  Used by the ``ContextIndex``
        that is created for ``similar()`` and ``common_contexts()``.
        r   r   z*START*z*END*)_CONTEXT_REmatchr   )r-   r   r    jr   r   r!   r!   r"   _contextI  s    zText._contextc             C   s
   d| j  S )Nz
<Text: %s>)r}   )r-   r!   r!   r"   __str__a  s    zText.__str__c             C   s
   d| j  S )Nz
<Text: %s>)r}   )r-   r!   r!   r"   rY   d  s    zText.__repr__)N)r   rc   )r   rc   )r;   r[   )r;   )r;   )rN   rO   rP   rQ   r|   r5   r   r   r   r`   r   r   rW   r   r   rM   r   r   r   r   rs   rq   compiler   r   r   rY   r!   r!   r!   r"   rx   ,  s,   




"

	#
rx   c               @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )TextCollectionaV  A collection of texts, which can be loaded with list of texts, or
    with a corpus consisting of one or more texts, and which supports
    counting, concordancing, collocation discovery, etc.  Initialize a
    TextCollection as follows:

    >>> import nltk.corpus
    >>> from nltk.text import TextCollection
    >>> print('hack'); from nltk.book import text1, text2, text3
    hack...
    >>> gutenberg = TextCollection(nltk.corpus.gutenberg)
    >>> mytexts = TextCollection([text1, text2, text3])

    Iterating over a TextCollection produces all the tokens of all the
    texts in order.
    c                s@   t  dr  fdd  D   | _t| t  i | _d S )NrD   c                s   g | ]}  |qS r!   )rD   )r&   f)sourcer!   r"   r)   |  s    z+TextCollection.__init__.<locals>.<listcomp>)hasattrZfileids_textsrx   r5   r   
_idf_cache)r-   r   r!   )r   r"   r5   z  s
    
zTextCollection.__init__c             C   s   | |t| S )z$ The frequency of the term in text. )r   r   )r-   termtextr!   r!   r"   tf  s    zTextCollection.tfc                sj   | j  }|dkrft fdd| jD }t| jdkrBtd|rXtt| j| nd}|| j  < |S )z The number of texts in the corpus divided by the
        number of texts that the term appears in.
        If a term does not appear in the corpus, 0.0 is returned. Nc                s   g | ]} |krd qS )Tr!   )r&   r   )r   r!   r"   r)     s    z&TextCollection.idf.<locals>.<listcomp>r   z+IDF undefined for empty document collectiong        )r   r?   r   r   rI   r   )r-   r   idfZmatchesr!   )r   r"   r     s    
zTextCollection.idfc             C   s   |  ||| | S )N)r   r   )r-   r   r   r!   r!   r"   tf_idf  s    zTextCollection.tf_idfN)rN   rO   rP   rQ   r5   r   r   r   r!   r!   r!   r"   r   i  s
   r   c              C   s   ddl m}  t| jdd}t| t  td |d t  td |d t  td |  t  td |dd	d
dg t  td |	d t  td td|d  td|dd  td|
 d  d S )Nr   )brownZnews)Z
categorieszConcordance:zDistributionally similar words:zCollocations:zDispersion plot:ZreportZsaidZ	announcedzVocabulary plot:2   z	Indexing:ztext[3]:r   z
text[3:5]:   ztext.vocab()['news']:)r   r   rx   rD   rd   r   r   r   r   r   r   )r   r   r!   r!   r"   demo  s.    


r   __main__))rQ   Z
__future__r   r   r   r   Zmathr   collectionsr   r   r	   	functoolsr
   rq   Zsixr   Znltk.probabilityr   r   r0   Z	nltk.utilr   r   Znltk.metricsr   r   Znltk.collocationsr   Znltk.compatr   r   objectr   rS   ri   rx   r   r   rN   __all__r!   r!   r!   r"   <module>   s@   [q9  >/
