B
    >?ð[²  ã               @   sÞ   d Z ddlmZ ddlZddlmZmZ ddlmZ ddl	m
Z
 yddlmZ W n  ek
rp   ddlmZ Y nX edd	„ ƒZe e¡d
d„ ƒZye W n ek
r°   eZY nX e e¡dd„ ƒZe
jG dd„ deƒƒZdS )zLanguage Model Vocabularyé    )Úunicode_literalsN)ÚCounterÚIterable)Úchain)Úcompat)Úsingledispatchc             C   s   t d t| ƒ¡ƒ‚d S )Nz2Unsupported type for looking up in vocabulary: {0})Ú	TypeErrorÚformatÚtype)ÚwordsÚvocab© r   ú1lib/python3.7/site-packages/nltk/lm/vocabulary.pyÚ_dispatched_lookup   s    r   c                s   t ‡ fdd„| D ƒƒS )zcLook up a sequence of words in the vocabulary.

    Returns an iterator over looked up words.

    c             3   s   | ]}t |ˆ ƒV  qd S )N)r   )Ú.0Úw)r   r   r   ú	<genexpr>(   s    z_.<locals>.<genexpr>)Útuple)r   r   r   )r   r   Ú_!   s    r   c             C   s   | |kr| S |j S )z$Looks up one word in the vocabulary.)Ú	unk_label)Zwordr   r   r   r   Ú_string_lookup3   s    r   c               @   s|   e Zd ZdZddd„Zedd„ ƒZd	d
„ Zdd„ Zdd„ Z	dd„ Z
dd„ Zdd„ Zdd„ Zejd dkrpdd„ Zdd„ ZdS )Ú
Vocabularya´
  Stores language model vocabulary.

    Satisfies two common language modeling requirements for a vocabulary:
    - When checking membership and calculating its size, filters items
      by comparing their counts to a cutoff value.
    - Adds a special "unknown" token which unseen words are mapped to.

    >>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd']
    >>> from nltk.lm import Vocabulary
    >>> vocab = Vocabulary(words, unk_cutoff=2)

    Tokens with counts greater than or equal to the cutoff value will
    be considered part of the vocabulary.

    >>> vocab['c']
    3
    >>> 'c' in vocab
    True
    >>> vocab['d']
    2
    >>> 'd' in vocab
    True

    Tokens with frequency counts less than the cutoff value will be considered not
    part of the vocabulary even though their entries in the count dictionary are
    preserved.

    >>> vocab['b']
    1
    >>> 'b' in vocab
    False
    >>> vocab['aliens']
    0
    >>> 'aliens' in vocab
    False

    Keeping the count entries for seen words allows us to change the cutoff value
    without having to recalculate the counts.

    >>> vocab2 = Vocabulary(vocab.counts, unk_cutoff=1)
    >>> "b" in vocab2
    True

    The cutoff value influences not only membership checking but also the result of
    getting the size of the vocabulary using the built-in `len`.
    Note that while the number of keys in the vocabulary's counter stays the same,
    the items in the vocabulary differ depending on the cutoff.
    We use `sorted` to demonstrate because it keeps the order consistent.

    >>> sorted(vocab2.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab2)
    ['-', '<UNK>', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab)
    ['<UNK>', 'a', 'c', 'd']

    In addition to items it gets populated with, the vocabulary stores a special
    token that stands in for so-called "unknown" items. By default it's "<UNK>".

    >>> "<UNK>" in vocab
    True

    We can look up words in a vocabulary using its `lookup` method.
    "Unseen" words (with counts less than cutoff) are looked up as the unknown label.
    If given one word (a string) as an input, this method will return a string.

    >>> vocab.lookup("a")
    'a'
    >>> vocab.lookup("aliens")
    '<UNK>'

    If given a sequence, it will return an tuple of the looked up words.

    >>> vocab.lookup(["p", 'a', 'r', 'd', 'b', 'c'])
    ('<UNK>', 'a', '<UNK>', 'd', '<UNK>', 'c')

    It's possible to update the counts after the vocabulary has been created.
    The interface follows that of `collections.Counter`.

    >>> vocab['b']
    1
    >>> vocab.update(["b", "b", "c"])
    >>> vocab['b']
    3
    Né   ú<UNK>c             C   sV   t |tƒr|| _ntƒ | _t |tƒr0| j |¡ || _|dk rLtd |¡ƒ‚|| _dS )aË  Create a new Vocabulary.

        :param counts: Optional iterable or `collections.Counter` instance to
                       pre-seed the Vocabulary. In case it is iterable, counts
                       are calculated.
        :param int unk_cutoff: Words that occur less frequently than this value
                               are not considered part of the vocabulary.
        :param unk_label: Label for marking words not part of vocabulary.

        r   z,Cutoff value cannot be less than 1. Got: {0}N)	Ú
isinstancer   Úcountsr   Úupdater   Ú
ValueErrorr	   Ú_cutoff)Úselfr   Z
unk_cutoffr   r   r   r   Ú__init__“   s    

zVocabulary.__init__c             C   s   | j S )ziCutoff value.

        Items with count below this value are not considered part of vocabulary.

        )r   )r   r   r   r   Úcutoff«   s    zVocabulary.cutoffc             O   s   | j j||Ž dS )zWUpdate vocabulary counts.

        Wraps `collections.Counter.update` method.

        N)r   r   )r   Zcounter_argsZcounter_kwargsr   r   r   r   ´   s    zVocabulary.updatec             C   s
   t || ƒS )a  Look up one or more words in the vocabulary.

        If passed one word as a string will return that word or `self.unk_label`.
        Otherwise will assume it was passed a sequence of words, will try to look
        each of them up and return an iterator over the looked up words.

        :param words: Word(s) to look up.
        :type words: Iterable(str) or str
        :rtype: generator(str) or str
        :raises: TypeError for types other than strings or iterables

        >>> from nltk.lm import Vocabulary
        >>> vocab = Vocabulary(["a", "b", "c", "a", "b"], unk_cutoff=2)
        >>> vocab.lookup("a")
        'a'
        >>> vocab.lookup("aliens")
        '<UNK>'
        >>> vocab.lookup(["a", "b", "c", ["x", "b"]])
        ('a', 'b', '<UNK>', ('<UNK>', 'b'))

        )r   )r   r   r   r   r   Úlookup¼   s    zVocabulary.lookupc             C   s   || j kr| jS | j| S )N)r   r   r   )r   Úitemr   r   r   Ú__getitem__Ô   s    zVocabulary.__getitem__c             C   s   | | | j kS )zPOnly consider items with counts GE to cutoff as being in the
        vocabulary.)r!   )r   r#   r   r   r   Ú__contains__×   s    zVocabulary.__contains__c                s(   t ‡ fdd„ˆ jD ƒˆ jr"ˆ jgng ƒS )zKBuilding on membership check define how to iterate over
        vocabulary.c             3   s   | ]}|ˆ kr|V  qd S )Nr   )r   r#   )r   r   r   r   à   s    z&Vocabulary.__iter__.<locals>.<genexpr>)r   r   r   )r   r   )r   r   Ú__iter__Ü   s    zVocabulary.__iter__c             C   s   t dd„ | D ƒƒS )z1Computing size of vocabulary reflects the cutoff.c             s   s   | ]
}d V  qdS )r   Nr   )r   r   r   r   r   r   æ   s    z%Vocabulary.__len__.<locals>.<genexpr>)Úsum)r   r   r   r   Ú__len__ä   s    zVocabulary.__len__c             C   s$   | j |j ko"| j|jko"| j|jkS )N)r   r!   r   )r   Úotherr   r   r   Ú__eq__è   s    zVocabulary.__eq__r   é   c             C   s   |   |¡}|tkr|S | S )N)r*   ÚNotImplemented)r   r)   Zequalr   r   r   Ú__ne__ñ   s    
zVocabulary.__ne__c             C   s   d  | jj| j| jt| ƒ¡S )Nz3<{0} with cutoff={1} unk_label='{2}' and {3} items>)r	   Ú	__class__Ú__name__r!   r   Úlen)r   r   r   r   Ú__str__õ   s    zVocabulary.__str__)Nr   r   )r/   Ú
__module__Ú__qualname__Ú__doc__r    Úpropertyr!   r   r"   r$   r%   r&   r(   r*   ÚsysÚversion_infor-   r1   r   r   r   r   r   9   s   X
	r   )r4   Z
__future__r   r6   Úcollectionsr   r   Ú	itertoolsr   Znltkr   Ú	functoolsr   ÚImportErrorr   Úregisterr   Z
basestringÚ	NameErrorÚstrr   Zpython_2_unicode_compatibleÚobjectr   r   r   r   r   Ú<module>   s$   

