ó
ù`]c        	   @` sþ  d  Z  d d l m Z m Z m Z m Z d d l m Z d d l m	 Z	 m
 Z
 m Z d d l m Z d d l Z d d l Z d d l m Z d d l m Z d d	 l m Z d d
 l m Z d d l m Z d d l m Z m Z d d l m Z m Z d d l  m! Z! d d l" m# Z# d d l$ m% Z% e d d d d d d d d g ƒ Z& d e' f d „  ƒ  YZ( e# d e' f d „  ƒ  Yƒ Z) d e' f d „  ƒ  YZ* e# d e' f d  „  ƒ  Yƒ Z+ d! e+ f d" „  ƒ  YZ, d# „  Z- e. d$ k råe- ƒ  n  d% d& d' d( d) g Z/ d S(*   u  
This module brings together a variety of NLTK functionality for
text analysis, and provides simple, interactive interfaces.
Functionality includes: concordancing, collocation discovery,
regular expression search over tokenized strings, and
distributional similarity.
i    (   t   print_functiont   divisiont   unicode_literalst   absolute_import(   t   log(   t   defaultdictt   Countert
   namedtuple(   t   reduceN(   t	   text_type(   t   MLE(   t   padded_everygram_pipeline(   t   FreqDist(   t   ConditionalFreqDist(   t	   tokenwrapt   LazyConcatenation(   t	   f_measuret   BigramAssocMeasures(   t   BigramCollocationFinder(   t   python_2_unicode_compatible(   t   sent_tokenizeu   ConcordanceLineu   leftu   queryu   rightu   offsetu
   left_printu   right_printu   linet   ContextIndexc           B` s\   e  Z d  Z e d „  ƒ Z d	 d	 d „  d „ Z d „  Z d „  Z d d „ Z	 e
 d „ Z RS(
   u  
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
    c         C` s`   | d k r  |  | d j  ƒ  n d } | t |  ƒ d k rP |  | d j  ƒ  n d } | | f S(   u;   One left token and one right token, normalized to lowercasei    i   u   *START*u   *END*(   t   lowert   len(   t   tokenst   it   leftt   right(    (    s(   lib/python2.7/site-packages/nltk/text.pyt   _default_context2   s    &0c         C` s   |  S(   N(    (   t   x(    (    s(   lib/python2.7/site-packages/nltk/text.pyt   <lambda>9   t    c         ` s²   | ˆ  _  ˆ ˆ  _ | r$ | ˆ  _ n ˆ  j ˆ  _ | r^ g  ˆ D] } | | ƒ r= | ^ q= ‰ n  t ‡  ‡ f d †  t ˆ ƒ Dƒ ƒ ˆ  _ t ‡  ‡ f d †  t ˆ ƒ Dƒ ƒ ˆ  _ d  S(   Nc         3` s6   |  ], \ } } ˆ  j  | ƒ ˆ  j ˆ | ƒ f Vq d  S(   N(   t   _keyt   _context_func(   t   .0R   t   w(   t   selfR   (    s(   lib/python2.7/site-packages/nltk/text.pys	   <genexpr>C   s    c         3` s6   |  ], \ } } ˆ  j  ˆ | ƒ ˆ  j | ƒ f Vq d  S(   N(   R!   R    (   R"   R   R#   (   R$   R   (    s(   lib/python2.7/site-packages/nltk/text.pys	   <genexpr>F   s    (   R    t   _tokensR!   R   t   CFDt	   enumeratet   _word_to_contextst   _context_to_words(   R$   R   t   context_funct   filtert   keyt   t(    (   R$   R   s(   lib/python2.7/site-packages/nltk/text.pyt   __init__9   s    		(%c         C` s   |  j  S(   uw   
        :rtype: list(str)
        :return: The document that this context index was
            created from.
        (   R%   (   R$   (    (    s(   lib/python2.7/site-packages/nltk/text.pyR   I   s    c         C` se   |  j  | ƒ } t |  j | ƒ } i  } x6 |  j j ƒ  D]% \ } } t | t | ƒ ƒ | | <q8 W| S(   u    
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same
        context.
        (   R    t   setR(   t   itemsR   (   R$   t   wordt   word_contextst   scoresR#   t
   w_contexts(    (    s(   lib/python2.7/site-packages/nltk/text.pyt   word_similarity_dictQ   s    i   c         C` s›   t  t ƒ } xo |  j |  j | ƒ D]W } xN |  j | D]? } | | k r7 | | c |  j | | |  j | | 7<q7 q7 Wq# Wt | d | j d t ƒ|  S(   NR,   t   reverse(   R   t   intR(   R    R)   t   sortedt   gett   True(   R$   R1   t   nR3   t   cR#   (    (    s(   lib/python2.7/site-packages/nltk/text.pyt   similar_words`   s    
+c         ` sè   g  | D] } ˆ j  | ƒ ^ q } g  | D] } t ˆ j | ƒ ^ q) } g  t t | ƒ ƒ D] } | | s[ | | ^ q[ } t t j | ƒ ‰  | r´ | r´ t d d j | ƒ ƒ ‚ n0 ˆ  sÁ t	 ƒ  St	 ‡  ‡ f d †  | Dƒ ƒ } | Sd S(   u§  
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        u%   The following word(s) were not found:u    c         3` s5   |  ]+ } ˆ j  | D] } | ˆ  k r | Vq q d  S(   N(   R(   (   R"   R#   R<   (   t   commonR$   (    s(   lib/python2.7/site-packages/nltk/text.pys	   <genexpr>€   s    N(
   R    R/   R(   t   rangeR   R   t   intersectiont
   ValueErrort   joinR   (   R$   t   wordst   fail_on_unknownR#   t   contextsR   t   emptyt   fd(    (   R>   R$   s(   lib/python2.7/site-packages/nltk/text.pyt   common_contextsj   s    "&3N(   t   __name__t
   __module__t   __doc__t   staticmethodR   t   NoneR.   R   R5   R=   t   FalseRH   (    (    (    s(   lib/python2.7/site-packages/nltk/text.pyR   *   s   		
t   ConcordanceIndexc           B` sS   e  Z d  Z d „  d „ Z d „  Z d „  Z d „  Z d d „ Z d d d	 „ Z RS(
   us   
    An index that can be used to look up the offset locations at which
    a given word occurs in a document.
    c         C` s   |  S(   N(    (   R   (    (    s(   lib/python2.7/site-packages/nltk/text.pyR   Œ   R   c         C` se   | |  _  | |  _ t t ƒ |  _ x= t | ƒ D]/ \ } } |  j | ƒ } |  j | j | ƒ q. Wd S(   ué  
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        N(   R%   R    R   t   listt   _offsetsR'   t   append(   R$   R   R,   t   indexR1   (    (    s(   lib/python2.7/site-packages/nltk/text.pyR.   Œ   s    		c         C` s   |  j  S(   u{   
        :rtype: list(str)
        :return: The document that this concordance index was
            created from.
        (   R%   (   R$   (    (    s(   lib/python2.7/site-packages/nltk/text.pyR   ¦   s    c         C` s   |  j  | ƒ } |  j | S(   uä   
        :rtype: list(int)
        :return: A list of the offset positions at which the given
            word occurs.  If a key function was specified for the
            index, then given word's key will be looked up.
        (   R    RQ   (   R$   R1   (    (    s(   lib/python2.7/site-packages/nltk/text.pyt   offsets®   s    c         C` s    d t  |  j ƒ t  |  j ƒ f S(   Nu+   <ConcordanceIndex for %d tokens (%d types)>(   R   R%   RQ   (   R$   (    (    s(   lib/python2.7/site-packages/nltk/text.pyt   __repr__¸   s    iP   c      	   C` s  | t  | ƒ d d } | d } g  } |  j | ƒ } | rý x½ | D]² } |  j | } |  j t d | | ƒ | !}	 |  j | d | | !}
 d j |	 ƒ | } d j |
 ƒ |  } d j | | | g ƒ } t |	 | |
 | | | | ƒ } | j | ƒ qD Wn  | S(   uB   
        Find all concordance lines given the query word.
        i   i   i    i   u    (   R   RT   R%   t   maxRB   t   ConcordanceLineRR   (   R$   R1   t   widtht
   half_widtht   contextt   concordance_listRT   R   t
   query_wordt   left_contextt   right_contextt
   left_printt   right_printt
   line_printt   concordance_line(    (    s(   lib/python2.7/site-packages/nltk/text.pyt   find_concordance¾   s,    
	i   c         C` s‹   |  j  | d | ƒ} | s( t d ƒ n_ t | t | ƒ ƒ } t d j | t | ƒ ƒ ƒ x+ t | |  ƒ D] \ } } t | j ƒ qj Wd S(   u‹  
        Print concordance lines given the query word.
        :param word: The target word
        :type word: str
        :param lines: The number of lines to display (default=25)
        :type lines: int
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param save: The option to save the concordance.
        :type save: bool
        RX   u
   no matchesu   Displaying {} of {} matches:N(   Rc   t   printt   minR   t   formatR'   t   line(   R$   R1   RX   t   linesR[   R   Rb   (    (    s(   lib/python2.7/site-packages/nltk/text.pyt   print_concordanceà   s    (	   RI   RJ   RK   R.   R   RT   RU   Rc   Ri   (    (    (    s(   lib/python2.7/site-packages/nltk/text.pyRO   …   s   		
	"t   TokenSearcherc           B` s    e  Z d  Z d „  Z d „  Z RS(   uâ  
    A class that makes it easier to use regular expressions to search
    over tokenized strings.  The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``.  The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.
    c         C` s    d j  d „  | Dƒ ƒ |  _ d  S(   Nu    c         s` s   |  ] } d  | d Vq d S(   u   <u   >N(    (   R"   R#   (    (    s(   lib/python2.7/site-packages/nltk/text.pys	   <genexpr>  s    (   RB   t   _raw(   R$   R   (    (    s(   lib/python2.7/site-packages/nltk/text.pyR.     s    c         C` sÕ   t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j d d | ƒ } t  j | |  j ƒ } x< | D]4 } | j d ƒ rp | j d ƒ rp t d	 ƒ ‚ qp qp Wg  | D] } | d
 d !j d ƒ ^ q¯ } | S(   u"  
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.text import TokenSearcher
        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        u   \su    u   <u   (?:<(?:u   >u   )>)u	   (?<!\\)\.u   [^>]u$   Bad regexp for TokenSearcher.findalli   iÿÿÿÿu   ><(   t   ret   subt   findallRk   t
   startswitht   endswithRA   t   split(   R$   t   regexpt   hitst   h(    (    s(   lib/python2.7/site-packages/nltk/text.pyRn     s    )(   RI   RJ   RK   R.   Rn   (    (    (    s(   lib/python2.7/site-packages/nltk/text.pyRj   ÷   s   		t   Textc           B` s  e  Z d  Z e Z d d „ Z d „  Z d „  Z d d d „ Z	 d d d „ Z
 d d	 d
 „ Z d d	 d „ Z d „  Z d „  Z d „  Z d d „ Z d d „ Z d „  Z d d „ Z d d d d „ Z d „  Z d „  Z d „  Z e j d ƒ Z d „  Z d „  Z d „  Z RS(   uÛ  
    A wrapper around a sequence of simple (string) tokens, which is
    intended to support initial exploration of texts (via the
    interactive console).  Its methods perform a variety of analyses
    on the text's contexts (e.g., counting, concordancing, collocation
    discovery), and display the results.  If you wish to write a
    program which makes use of these analyses, then you should bypass
    the ``Text`` class, and use the appropriate analysis function or
    class directly instead.

    A ``Text`` is typically initialized from a given document or
    corpus.  E.g.:

    >>> import nltk.corpus
    >>> from nltk.text import Text
    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))

    c         C` s¤   |  j  r t | ƒ } n  | |  _ | r3 | |  _ nm d | d  k r| | d  j d ƒ } d j d „  | d | !Dƒ ƒ |  _ n$ d j d „  | d  Dƒ ƒ d |  _ d	 S(
   uv   
        Create a Text object.

        :param tokens: The source text.
        :type tokens: sequence of str
        u   ]i   u    c         s` s   |  ] } t  | ƒ Vq d  S(   N(   R	   (   R"   t   tok(    (    s(   lib/python2.7/site-packages/nltk/text.pys	   <genexpr>Z  s    i   c         s` s   |  ] } t  | ƒ Vq d  S(   N(   R	   (   R"   Rv   (    (    s(   lib/python2.7/site-packages/nltk/text.pys	   <genexpr>\  s    i   u   ...N(   t   _COPY_TOKENSRP   R   t   nameRS   RB   (   R$   R   Rx   t   end(    (    s(   lib/python2.7/site-packages/nltk/text.pyR.   K  s    		&c         C` s   |  j  | S(   N(   R   (   R$   R   (    (    s(   lib/python2.7/site-packages/nltk/text.pyt   __getitem__b  s    c         C` s   t  |  j ƒ S(   N(   R   R   (   R$   (    (    s(   lib/python2.7/site-packages/nltk/text.pyt   __len__e  s    iO   i   c         C` sC   d |  j  k r- t |  j d d „  ƒ|  _ n  |  j j | | | ƒ S(   u¦  
        Prints a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        u   _concordance_indexR,   c         S` s
   |  j  ƒ  S(   N(   R   (   t   s(    (    s(   lib/python2.7/site-packages/nltk/text.pyR   |  R   (   t   __dict__RO   R   t   _concordance_indexRi   (   R$   R1   RX   Rh   (    (    s(   lib/python2.7/site-packages/nltk/text.pyt   concordancel  s    c         C` sD   d |  j  k r- t |  j d d „  ƒ|  _ n  |  j j | | ƒ |  S(   u¨  
        Generate a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        u   _concordance_indexR,   c         S` s
   |  j  ƒ  S(   N(   R   (   R|   (    (    s(   lib/python2.7/site-packages/nltk/text.pyR   ‘  R   (   R}   RO   R   R~   Rc   (   R$   R1   RX   Rh   (    (    s(   lib/python2.7/site-packages/nltk/text.pyR[     s    i   i   c         ` sâ   d |  j  k o* |  j | k o* |  j | k sº | |  _ | |  _ d d l m } | j d ƒ ‰  t j |  j | ƒ } | j	 d ƒ | j
 ‡  f d †  ƒ t ƒ  } | j | j | ƒ |  _ n  g  |  j D] \ } } | d | ^ qÄ S(   u  
        Return collocations derived from the text, ignoring stopwords.

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        u   _collocationsi    (   t	   stopwordsu   englishi   c         ` s"   t  |  ƒ d k  p! |  j ƒ  ˆ  k S(   Ni   (   R   R   (   R#   (   t   ignored_words(    s(   lib/python2.7/site-packages/nltk/text.pyR   ¬  R   u    (   R}   t   _numt   _window_sizet   nltk.corpusR€   RC   R   t
   from_wordsR   t   apply_freq_filtert   apply_word_filterR   t   nbestt   likelihood_ratiot   _collocations(   R$   t   numt   window_sizeR€   t   findert   bigram_measurest   w1t   w2(    (   R   s(   lib/python2.7/site-packages/nltk/text.pyt   collocation_list•  s    
			c         C` sM   g  |  j  | | ƒ D] \ } } | d | ^ q } t t | d d ƒƒ d S(   u  
        Print collocations derived from the text, ignoring stopwords.

        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        u    t	   separatoru   ; N(   R‘   Rd   R   (   R$   R‹   RŒ   R   R   t   collocation_strings(    (    s(   lib/python2.7/site-packages/nltk/text.pyt   collocations±  s    3c         C` s   |  j  j | ƒ S(   uJ   
        Count the number of times this word appears in the text.
        (   R   t   count(   R$   R1   (    (    s(   lib/python2.7/site-packages/nltk/text.pyR•   À  s    c         C` s   |  j  j | ƒ S(   uQ   
        Find the index of the first occurrence of the word in the text.
        (   R   RS   (   R$   R1   (    (    s(   lib/python2.7/site-packages/nltk/text.pyRS   Æ  s    c         C` s
   t  ‚ d  S(   N(   t   NotImplementedError(   R$   t   method(    (    s(   lib/python2.7/site-packages/nltk/text.pyt   readabilityÌ  s    c         ` sá   d |  j  k r6 t |  j d d „  d d „  ƒ|  _ n  ˆ j ƒ  ‰ |  j j ‰ ˆ ˆ j ƒ  k rÓ t ˆ ˆ ƒ ‰  t ‡  ‡ ‡ f d †  ˆ j ƒ  Dƒ ƒ } g  | j	 | ƒ D] \ } } | ^ q¨ } t
 t | ƒ ƒ n
 t
 d ƒ d S(	   u~  
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        u   _word_context_indexR+   c         S` s
   |  j  ƒ  S(   N(   t   isalpha(   R   (    (    s(   lib/python2.7/site-packages/nltk/text.pyR   Þ  R   R,   c         S` s
   |  j  ƒ  S(   N(   R   (   R|   (    (    s(   lib/python2.7/site-packages/nltk/text.pyR   Þ  R   c         3` s?   |  ]5 } ˆ | D]$ } | ˆ  k r | ˆ k r | Vq q d  S(   N(    (   R"   R#   R<   (   RE   t   wciR1   (    s(   lib/python2.7/site-packages/nltk/text.pys	   <genexpr>è  s   u
   No matchesN(   R}   R   R   t   _word_context_indexR   R(   t
   conditionsR/   R   t   most_commonRd   R   (   R$   R1   R‹   RG   R#   t   _RC   (    (   RE   Rš   R1   s(   lib/python2.7/site-packages/nltk/text.pyt   similarÐ  s    $(c         C` s¿   d |  j  k r- t |  j d d „  ƒ|  _ n  yn |  j j | t ƒ } | sX t d ƒ nB g  | j | ƒ D] \ } } | ^ qh } t t d „  | Dƒ ƒ ƒ Wn t	 k
 rº } t | ƒ n Xd S(   uY  
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words used to seed the similarity search
        :type words: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        u   _word_context_indexR,   c         S` s
   |  j  ƒ  S(   N(   R   (   R|   (    (    s(   lib/python2.7/site-packages/nltk/text.pyR      R   u   No common contexts were foundc         s` s#   |  ] \ } } | d  | Vq d S(   u   _N(    (   R"   R   R   (    (    s(   lib/python2.7/site-packages/nltk/text.pys	   <genexpr>	  s    N(
   R}   R   R   R›   RH   R:   Rd   R   R   RA   (   R$   RC   R‹   RG   R#   Rž   t   ranked_contextst   e(    (    s(   lib/python2.7/site-packages/nltk/text.pyRH   ò  s    (c         C` s!   d d l  m } | |  | ƒ d S(   uü   
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        i    (   t   dispersion_plotN(   t	   nltk.drawR¢   (   R$   RC   R¢   (    (    s(   lib/python2.7/site-packages/nltk/text.pyR¢     s    	i   c         C` s8   t  | | ƒ \ } } t d | ƒ } | j | | ƒ | S(   Nt   order(   R   R
   t   fit(   R$   t   tokenized_sentsR;   t
   train_datat   padded_sentst   model(    (    s(   lib/python2.7/site-packages/nltk/text.pyt   _train_default_ngram_lm  s    id   i*   c   
      C` s]  g  t  d j |  j ƒ ƒ D] } | j d ƒ ^ q |  _ t |  d ƒ sw t d d t j ƒ|  j	 |  j d d ƒ|  _
 n  g  } | d k s• t d ƒ ‚ x„ t | ƒ | k  rxa t |  j
 j | d	 | d
 | ƒƒ D]; \ } } | d k rí qÏ n  | d k rý Pn  | j | ƒ qÏ W| d 7} q˜ W| r5d j | ƒ d n d } | t | |  ƒ }	 t |	 ƒ |	 S(   u  
        Print random text, generated using a trigram language model.
        See also `help(nltk.lm)`.

        :param length: The length of text to generate (default=100)
        :type length: int

        :param text_seed: Generation can be conditioned on preceding context.
        :type text_seed: list(str)

        :param random_seed: A random seed or an instance of `random.Random`. If provided,
        makes the random sampling part of generation reproducible. (default=42)
        :type random_seed: int

        u    u   trigram_modelu   Building ngram index...t   fileR;   i   i    u!   The `length` must be more than 0.t	   text_seedt   random_seedu   <s>u   </s>i   u    (   R   RB   R   Rq   t   _tokenized_sentst   hasattrRd   t   syst   stderrRª   t   _trigram_modelt   AssertionErrorR   R'   t   generateRR   R   (
   R$   t   lengthR¬   R­   t   sentt   generated_tokenst   idxt   tokent   prefixt
   output_str(    (    s(   lib/python2.7/site-packages/nltk/text.pyR´   !  s*    7	"
c         G` s   |  j  ƒ  j | Œ  d S(   uc   
        See documentation for FreqDist.plot()
        :seealso: nltk.prob.FreqDist.plot()
        N(   t   vocabt   plot(   R$   t   args(    (    s(   lib/python2.7/site-packages/nltk/text.pyR½   P  s    c         C` s(   d |  j  k r! t |  ƒ |  _ n  |  j S(   u.   
        :seealso: nltk.prob.FreqDist
        u   _vocab(   R}   R   t   _vocab(   R$   (    (    s(   lib/python2.7/site-packages/nltk/text.pyR¼   W  s    c         C` sl   d |  j  k r! t |  ƒ |  _ n  |  j j | ƒ } g  | D] } d j | ƒ ^ q: } t t | d ƒ ƒ d S(   uò  
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        u   _token_searcheru    u   ; N(   R}   Rj   t   _token_searcherRn   RB   Rd   R   (   R$   Rr   Rs   Rt   (    (    s(   lib/python2.7/site-packages/nltk/text.pyRn   `  s
    "u   \w+|[\.\!\?]c         C` sÊ   | d } x1 | d k r= |  j  j | | ƒ r= | d 8} q W| d k rT | | n d } | d } x7 | t | ƒ k  r |  j  j | | ƒ r | d 7} qg W| t | ƒ k rº | | n d } | | f S(   uÙ   
        One left & one right token, both case-normalized.  Skip over
        non-sentence-final punctuation.  Used by the ``ContextIndex``
        that is created for ``similar()`` and ``common_contexts()``.
        i   i    u   *START*u   *END*(   t   _CONTEXT_REt   matchR   (   R$   R   R   t   jR   R   (    (    s(   lib/python2.7/site-packages/nltk/text.pyt   _context…  s    
&
,"c         C` s   d |  j  S(   Nu
   <Text: %s>(   Rx   (   R$   (    (    s(   lib/python2.7/site-packages/nltk/text.pyt   __str__  s    c         C` s   d |  j  S(   Nu
   <Text: %s>(   Rx   (   R$   (    (    s(   lib/python2.7/site-packages/nltk/text.pyRU      s    N(   RI   RJ   RK   R:   Rw   RM   R.   Rz   R{   R   R[   R‘   R”   R•   RS   R˜   RŸ   RH   R¢   Rª   R´   R½   R¼   Rn   Rl   t   compileRÁ   RÄ   RÅ   RU   (    (    (    s(   lib/python2.7/site-packages/nltk/text.pyRu   0  s0   					"	/				#		t   TextCollectionc           B` s2   e  Z d  Z d „  Z d „  Z d „  Z d „  Z RS(   uV  A collection of texts, which can be loaded with list of texts, or
    with a corpus consisting of one or more texts, and which supports
    counting, concordancing, collocation discovery, etc.  Initialize a
    TextCollection as follows:

    >>> import nltk.corpus
    >>> from nltk.text import TextCollection
    >>> print('hack'); from nltk.book import text1, text2, text3
    hack...
    >>> gutenberg = TextCollection(nltk.corpus.gutenberg)
    >>> mytexts = TextCollection([text1, text2, text3])

    Iterating over a TextCollection produces all the tokens of all the
    texts in order.
    c         C` sf   t  | d ƒ r: g  | j ƒ  D] } | j | ƒ ^ q } n  | |  _ t j |  t | ƒ ƒ i  |  _ d  S(   Nu   words(   R¯   t   fileidsRC   t   _textsRu   R.   R   t
   _idf_cache(   R$   t   sourcet   f(    (    s(   lib/python2.7/site-packages/nltk/text.pyR.   ¶  s
    +	c         C` s   | j  | ƒ t | ƒ S(   u$    The frequency of the term in text. (   R•   R   (   R$   t   termt   text(    (    s(   lib/python2.7/site-packages/nltk/text.pyt   tf¾  s    c         C` s©   |  j  j | ƒ } | d k r¥ t g  |  j D] } | | k r+ t ^ q+ ƒ } t |  j ƒ d k rp t d ƒ ‚ n  | r t t |  j ƒ | ƒ n d } | |  j  | <n  | S(   u¦    The number of texts in the corpus divided by the
        number of texts that the term appears in.
        If a term does not appear in the corpus, 0.0 is returned. i    u+   IDF undefined for empty document collectiong        N(   RÊ   R9   RM   R   RÉ   R:   RA   R   (   R$   RÍ   t   idfRÎ   t   matches(    (    s(   lib/python2.7/site-packages/nltk/text.pyRÐ   Â  s    .%c         C` s   |  j  | | ƒ |  j | ƒ S(   N(   RÏ   RÐ   (   R$   RÍ   RÎ   (    (    s(   lib/python2.7/site-packages/nltk/text.pyt   tf_idfÐ  s    (   RI   RJ   RK   R.   RÏ   RÐ   RÒ   (    (    (    s(   lib/python2.7/site-packages/nltk/text.pyRÇ   ¥  s
   			c          C` s"  d d l  m }  t |  j d d ƒ ƒ } t | ƒ t ƒ  t d ƒ | j d ƒ t ƒ  t d ƒ | j d ƒ t ƒ  t d ƒ | j ƒ  t ƒ  t d ƒ | j d d	 d
 d g ƒ t ƒ  t d ƒ | j	 d ƒ t ƒ  t d ƒ t d | d ƒ t d | d d !ƒ t d | j
 ƒ  d ƒ d  S(   Ni    (   t   brownt
   categoriesu   newsu   Concordance:u   Distributionally similar words:u   Collocations:u   Dispersion plot:u   reportu   saidu	   announcedu   Vocabulary plot:i2   u	   Indexing:u   text[3]:i   u
   text[3:5]:i   u   text.vocab()['news']:(   R„   RÓ   Ru   RC   Rd   R   RŸ   R”   R¢   R½   R¼   (   RÓ   RÎ   (    (    s(   lib/python2.7/site-packages/nltk/text.pyt   demoÔ  s.    







u   __main__u   ContextIndexu   ConcordanceIndexu   TokenSearcheru   Textu   TextCollection(0   RK   t
   __future__R    R   R   R   t   mathR   t   collectionsR   R   R   t	   functoolsR   Rl   R°   t   sixR	   t   nltk.lmR
   t   nltk.lm.preprocessingR   t   nltk.probabilityR   R   R&   t	   nltk.utilR   R   t   nltk.metricsR   R   t   nltk.collocationsR   t   nltk.compatR   t   nltk.tokenizeR   RW   t   objectR   RO   Rj   Ru   RÇ   RÕ   RI   t   __all__(    (    (    s(   lib/python2.7/site-packages/nltk/text.pyt   <module>   sF   "[q9ÿ u/	
