B
    >?[                  @   s   d Z ddlmZmZ ddlZddlmZmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ ydd
lmZ W n* ek
r   ddlZejfddZY nX e
eG dd deZdd Zdd ZdddZe
eG dd deZdS )zLanguage Model Interface.    )divisionunicode_literalsN)ABCMetaabstractmethod)bisect)add_metaclass)NgramCounter)	log_base2)
Vocabulary)
accumulatec             c   sR   t | }yt|}W n tk
r(   dS X |V  x|D ]}|||}|V  q6W dS )zReturn running totalsN)iternextStopIteration)iterablefuncittotalelement r   *lib/python3.7/site-packages/nltk/lm/api.pyr      s    

r   c               @   s0   e Zd ZdZdd Zedd Zedd ZdS )		Smoothinga#  Ngram Smoothing Interface

    Implements Chen & Goodman 1995's idea that all smoothing algorithms have
    certain features in common. This should ideally allow smoothing algoritms to
    work both with Backoff and Interpolation.

    counter represents the number of counts for ngrams
    c             C   s   || _ || _d S )N)vocabcounts)self
vocabularycounterr   r   r   __init__4   s    zSmoothing.__init__c             C   s
   t  d S )N)NotImplementedError)r   wordr   r   r   unigram_score8   s    zSmoothing.unigram_scorec             C   s
   t  d S )N)r   )r   r   contextr   r   r   alpha_gamma<   s    zSmoothing.alpha_gammaN)__name__
__module____qualname____doc__r   r   r   r!   r   r   r   r   r   )   s   	r   c             C   s   t | t|  S )z0Return average (aka mean) for sequence of items.)sumlen)itemsr   r   r   _meanA   s    r)   c             C   s   t | tjr| S t| S )N)
isinstancerandomZRandom)Zseed_or_generatorr   r   r   _random_generatorF   s    r,   c             C   sV   | st dt| t|kr$t dtt|}|d }t| }| t|||  S )z`Like random.choice, but with weights.

    Heavily inspired by python 3.6 `random.choices`.
    z"Can't choose from empty populationz3The number of weights does not match the population)
ValueErrorr'   listr   r,   r+   r   )Z
populationZweightsrandom_seedZcum_weightsr   Z	thresholdr   r   r   _weighted_choiceL   s    r1   c               @   sh   e Zd ZdZdddZdddZdddZedd	d
ZdddZ	dd Z
dd Zdd ZdddZdS )LanguageModelzKABC for Language Models.

    Cannot be directly instantiated itself.

    Nc             C   s2   || _ |dkrt n|| _|dkr(t n|| _dS )a}  Creates new LanguageModel.

        :param vocabulary: If provided, this vocabulary will be used instead
        of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type vocabulary: `nltk.lm.NgramCounter` or None
        :param ngrams_fn: If given, defines how sentences in training text are turned to ngram
                          sequences.
        :type ngrams_fn: function or None
        :param pad_fn: If given, defines how senteces in training text are padded.
        :type pad_fn: function or None

        N)orderr
   r   r   r   )r   r3   r   r   r   r   r   r   c   s    zLanguageModel.__init__c                s@    j s"|dkrtd j |  j fdd|D  dS )zeTrains the model on a text.

        :param text: Training text as a sequence of sentences.

        Nz:Cannot fit without a vocabulary or text to create it from.c             3   s   | ]} j |V  qd S )N)r   lookup).0Zsent)r   r   r   	<genexpr>   s    z$LanguageModel.fit.<locals>.<genexpr>)r   r.   updater   )r   textZvocabulary_textr   )r   r   fitv   s    zLanguageModel.fitc             C   s$   |  | j||r| j|ndS )zMasks out of vocab (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the `unmasked_score`
        method.
        N)unmasked_scorer   r4   )r   r   r    r   r   r   score   s    zLanguageModel.scorec             C   s
   t  dS )a  Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score
        :param tuple(str) context: Context the word is in.
        If `None`, compute unigram score.
        :param context: tuple(str) or None
        :rtype: float

        N)r   )r   r   r    r   r   r   r:      s    zLanguageModel.unmasked_scorec             C   s   t | ||S )zEvaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.

        )r	   r;   )r   r   r    r   r   r   logscore   s    zLanguageModel.logscorec             C   s"   |r| j t|d  | S | j jS )zHelper method for retrieving counts for a given context.

        Assumes context has been checked and oov words in it masked.
        :type context: tuple(str) or None

           )r   r'   Zunigrams)r   r    r   r   r   context_counts   s    zLanguageModel.context_countsc                s   dt  fdd|D  S )zCalculate cross-entropy of model for given evaluation text.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float

        r-   c                s$   g | ]}  |d  |dd  qS )r-   N)r<   )r5   Zngram)r   r   r   
<listcomp>   s    z)LanguageModel.entropy.<locals>.<listcomp>)r)   )r   text_ngramsr   )r   r   entropy   s    zLanguageModel.entropyc             C   s   t d| |S )zCalculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments are the same.

        g       @)powrA   )r   r@   r   r   r   
perplexity   s    zLanguageModel.perplexityr=   c                s   |dkrg nt |}|dkrt|jkr>|j d d n| j }x: r|st dkrv dd ng  j }qVW t|}t|t fdd|D |S g }x*t	|D ]}|
jd|| |d qW |S )a  Generate words from the model.

        :param int num_words: How many words to generate. By default 1.
        :param text_seed: Generation can be conditioned on preceding context.
        :param random_seed: If provided, makes the random sampling part of
        generation reproducible.
        :return: One (str) word or a list of words generated from model.

        Examples:

        >>> from nltk.lm import MLE
        >>> lm = MLE(2)
        >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])
        >>> lm.fit([[("a",), ("b",), ("c",)]])
        >>> lm.generate(random_seed=3)
        'a'
        >>> lm.generate(text_seed=['a'])
        'b'

        Nr=   c             3   s   | ]} | V  qd S )N)r;   )r5   w)r    r   r   r   r6      s    z)LanguageModel.generate.<locals>.<genexpr>)	num_words	text_seedr0   )r/   r'   r3   r>   r   r4   sortedr1   tuplerangeappendgenerate)r   rE   rF   r0   ZsamplesZ	generated_r   )r    r   r   rK      s&    "
zLanguageModel.generate)NN)N)N)N)N)r=   NN)r"   r#   r$   r%   r   r9   r;   r   r:   r<   r>   rA   rC   rK   r   r   r   r   r2   [   s   




r2   )N)r%   Z
__future__r   r   r+   abcr   r   r   Zsixr   Znltk.lm.counterr   Znltk.lm.utilr	   Znltk.lm.vocabularyr
   	itertoolsr   ImportErroroperatoraddobjectr   r)   r,   r1   r2   r   r   r   r   <module>   s(   
