ó
ù`]c           @  s  d  Z  d d l m Z d d l Z d d l m Z m Z d d l m Z d d l	 m
 Z
 y d d l m Z Wn! e k
 r d d l m Z n Xe d „  ƒ Z e j e ƒ d	 „  ƒ Z y e Wn e k
 rØ e Z n Xe j e ƒ d
 „  ƒ Z e
 j d e f d „  ƒ  Yƒ Z d S(   u   Language Model Vocabularyiÿÿÿÿ(   t   unicode_literalsN(   t   Countert   Iterable(   t   chain(   t   compat(   t   singledispatchc         C  s   t  d j t |  ƒ ƒ ƒ ‚ d  S(   Nu2   Unsupported type for looking up in vocabulary: {0}(   t	   TypeErrort   formatt   type(   t   wordst   vocab(    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyt   _dispatched_lookup   s    c           s   t  ‡  f d †  |  Dƒ ƒ S(   uc   Look up a sequence of words in the vocabulary.

    Returns an iterator over looked up words.

    c         3  s   |  ] } t  | ˆ  ƒ Vq d  S(   N(   R   (   t   .0t   w(   R
   (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pys	   <genexpr>(   s    (   t   tuple(   R	   R
   (    (   R
   s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyt   _!   s    c         C  s   |  | k r |  S| j  S(   u$   Looks up one word in the vocabulary.(   t	   unk_label(   t   wordR
   (    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyt   _string_lookup3   s    t
   Vocabularyc           B  s–   e  Z d  Z d d d d „ Z e d „  ƒ Z d „  Z d „  Z d „  Z	 d „  Z
 d	 „  Z d
 „  Z d „  Z e j d d k r‹ d „  Z n  d „  Z RS(   u´
  Stores language model vocabulary.

    Satisfies two common language modeling requirements for a vocabulary:
    - When checking membership and calculating its size, filters items
      by comparing their counts to a cutoff value.
    - Adds a special "unknown" token which unseen words are mapped to.

    >>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd']
    >>> from nltk.lm import Vocabulary
    >>> vocab = Vocabulary(words, unk_cutoff=2)

    Tokens with counts greater than or equal to the cutoff value will
    be considered part of the vocabulary.

    >>> vocab['c']
    3
    >>> 'c' in vocab
    True
    >>> vocab['d']
    2
    >>> 'd' in vocab
    True

    Tokens with frequency counts less than the cutoff value will be considered not
    part of the vocabulary even though their entries in the count dictionary are
    preserved.

    >>> vocab['b']
    1
    >>> 'b' in vocab
    False
    >>> vocab['aliens']
    0
    >>> 'aliens' in vocab
    False

    Keeping the count entries for seen words allows us to change the cutoff value
    without having to recalculate the counts.

    >>> vocab2 = Vocabulary(vocab.counts, unk_cutoff=1)
    >>> "b" in vocab2
    True

    The cutoff value influences not only membership checking but also the result of
    getting the size of the vocabulary using the built-in `len`.
    Note that while the number of keys in the vocabulary's counter stays the same,
    the items in the vocabulary differ depending on the cutoff.
    We use `sorted` to demonstrate because it keeps the order consistent.

    >>> sorted(vocab2.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab2)
    ['-', '<UNK>', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab)
    ['<UNK>', 'a', 'c', 'd']

    In addition to items it gets populated with, the vocabulary stores a special
    token that stands in for so-called "unknown" items. By default it's "<UNK>".

    >>> "<UNK>" in vocab
    True

    We can look up words in a vocabulary using its `lookup` method.
    "Unseen" words (with counts less than cutoff) are looked up as the unknown label.
    If given one word (a string) as an input, this method will return a string.

    >>> vocab.lookup("a")
    'a'
    >>> vocab.lookup("aliens")
    '<UNK>'

    If given a sequence, it will return an tuple of the looked up words.

    >>> vocab.lookup(["p", 'a', 'r', 'd', 'b', 'c'])
    ('<UNK>', 'a', '<UNK>', 'd', '<UNK>', 'c')

    It's possible to update the counts after the vocabulary has been created.
    The interface follows that of `collections.Counter`.

    >>> vocab['b']
    1
    >>> vocab.update(["b", "b", "c"])
    >>> vocab['b']
    3
    i   u   <UNK>c         C  sƒ   t  | t ƒ r | |  _ n. t ƒ  |  _ t  | t ƒ rI |  j j | ƒ n  | |  _ | d k  rv t d j | ƒ ƒ ‚ n  | |  _ d S(   uË  Create a new Vocabulary.

        :param counts: Optional iterable or `collections.Counter` instance to
                       pre-seed the Vocabulary. In case it is iterable, counts
                       are calculated.
        :param int unk_cutoff: Words that occur less frequently than this value
                               are not considered part of the vocabulary.
        :param unk_label: Label for marking words not part of vocabulary.

        i   u,   Cutoff value cannot be less than 1. Got: {0}N(	   t
   isinstanceR   t   countsR   t   updateR   t
   ValueErrorR   t   _cutoff(   t   selfR   t
   unk_cutoffR   (    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyt   __init__“   s    	c         C  s   |  j  S(   ui   Cutoff value.

        Items with count below this value are not considered part of vocabulary.

        (   R   (   R   (    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyt   cutoff«   s    c         O  s   |  j  j | | Ž  d S(   uW   Update vocabulary counts.

        Wraps `collections.Counter.update` method.

        N(   R   R   (   R   t   counter_argst   counter_kwargs(    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyR   ´   s    c         C  s   t  | |  ƒ S(   u  Look up one or more words in the vocabulary.

        If passed one word as a string will return that word or `self.unk_label`.
        Otherwise will assume it was passed a sequence of words, will try to look
        each of them up and return an iterator over the looked up words.

        :param words: Word(s) to look up.
        :type words: Iterable(str) or str
        :rtype: generator(str) or str
        :raises: TypeError for types other than strings or iterables

        >>> from nltk.lm import Vocabulary
        >>> vocab = Vocabulary(["a", "b", "c", "a", "b"], unk_cutoff=2)
        >>> vocab.lookup("a")
        'a'
        >>> vocab.lookup("aliens")
        '<UNK>'
        >>> vocab.lookup(["a", "b", "c", ["x", "b"]])
        ('a', 'b', '<UNK>', ('<UNK>', 'b'))

        (   R   (   R   R	   (    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyt   lookup¼   s    c         C  s!   | |  j  k r |  j S|  j | S(   N(   R   R   R   (   R   t   item(    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyt   __getitem__Ô   s    c         C  s   |  | |  j  k S(   uP   Only consider items with counts GE to cutoff as being in the
        vocabulary.(   R   (   R   R    (    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyt   __contains__×   s    c           s5   t  ‡  f d †  ˆ  j Dƒ ˆ  j r. ˆ  j g n g  ƒ S(   uK   Building on membership check define how to iterate over
        vocabulary.c         3  s!   |  ] } | ˆ  k r | Vq d  S(   N(    (   R   R    (   R   (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pys	   <genexpr>à   s    (   R   R   R   (   R   (    (   R   s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyt   __iter__Ü   s    c         C  s   t  d „  |  Dƒ ƒ S(   u1   Computing size of vocabulary reflects the cutoff.c         s  s   |  ] } d  Vq d S(   i   N(    (   R   R   (    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pys	   <genexpr>æ   s    (   t   sum(   R   (    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyt   __len__ä   s    c         C  s4   |  j  | j  k o3 |  j | j k o3 |  j | j k S(   N(   R   R   R   (   R   t   other(    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyt   __eq__è   s    i    i   c         C  s$   |  j  | ƒ } | t k r | S| S(   N(   R'   t   NotImplemented(   R   R&   t   equal(    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyt   __ne__ñ   s    c         C  s(   d j  |  j j |  j |  j t |  ƒ ƒ S(   Nu3   <{0} with cutoff={1} unk_label='{2}' and {3} items>(   R   t	   __class__t   __name__R   R   t   len(   R   (    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyt   __str__õ   s    N(   R,   t
   __module__t   __doc__t   NoneR   t   propertyR   R   R   R!   R"   R#   R%   R'   t   syst   version_infoR*   R.   (    (    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyR   9   s   X								(   R0   t
   __future__R    R3   t   collectionsR   R   t	   itertoolsR   t   nltkR   t	   functoolsR   t   ImportErrorR   t   registerR   t
   basestringt	   NameErrort   strR   t   python_2_unicode_compatiblet   objectR   (    (    (    s1   lib/python2.7/site-packages/nltk/lm/vocabulary.pyt   <module>   s$   

	