o
    Uݢg                     @  s\   d Z ddlmZ ddlZddlmZ G dd deZ	dd Z
dd	 ZdddZdddZdS )zSimple Good-Turing estimator.

Based on S implementation in::

  William A. Gale & Geoffrey Sampson (1995) Good-turing frequency estimation without tears,
  Journal of Quantitative Linguistics, 2:3, 217-237, DOI: 10.1080/09296179508590051
    )annotationsNc                   @  s   e Zd ZdS )SimpleGoodTuringErrorN)__name__
__module____qualname__ r   r   ]/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/sgt.pyr      s    r   c                 C  sb   t t jdtdt | f}t d|dd  |dd   t j|d ftdf}|t| S )N   )dtypeg      ?r   )npconcatenateonesintdiffarrayfloatastype)rnrddrr   r   r   _averaging_transform   s   6r   c                 C  s   | t dd|   d|  S )Nr	   g      ?)r   power)r   coefr   r   r   _rstest   s   r   xr
np.ndarrayxnrc              	   C  s&  |  t} | t}t| | }t| |}tt| t|\}}}}}|dk r4td| d nd}td t	| |}||  }| t
| dd d tdfk}tt| }	| | d | |  t
|dd tdf|  ||  |	|< tt| }
tt| D ]'}|| rt|d ||  t||d  d||d  ||     |
|< qtt| }d}tt| D ]/}|s|| ||< qt|| |	|  d|  |
|  d	kr|	| ||< qd
}|| ||< qt||  | | }|d|d |   | }|d | }| | |fS )aZ  Make a Simple Good-Turing estimate of the frequencies.

    Args:
      xr (np.array(int)): Non-zero item frequencies
      xnr (np.array(int)): Non-zero frequencies of frequencies
    Returns:
      (rstar (np.array(float)), p0 (float)):
        rstar: The adjusted non-zero frequencies
        p0: The total probability of unobserved items
    r   zThe SGT slope is .zQThe SGT slope {slope} is > -1. The SGT estimator is not applicable to these data.r	   N   Tgffffff?Fr   )r   r   r   sumr   sp_stats
linregresslogprintr   r   zeroslenr   rangesqrtabs)r   r   ZxNZxnrzslope_ZxrstZxrstrelZxrtryZxrstarelZtursdiZ
xrstcmbrelZ	useturingr   Zsumprawp0r   r   r   simple_good_turing"   sB   


"
$:B*r/   frequenciesc                   s   t | dkr
tdt| t | krtdt| }|d dks$J t|}t |dk r7tdt | t||| \}}tt	|| t
|| | }tj fdd| D tt | d}d	| ||  }t|t
| d	svJ ||fS )
a  Use Simple Good-Turing estimate to adjust for unobserved items.

    Args:
      frequencies (np.array(int)): Nonzero frequencies of items

    Returns:
        pstar (np.array[float]): The adjusted non-zero proportions
        p0 (float): The total probability of unobserved items
    r   zInput frequency vector is emptyz%Frequencies must be greater than zero
   z4Too few non-zero frequency items (%d). Aborting SGT.c                 3  s    | ]} | V  qd S )Nr   ).0fZ
rstar_dictr   r   	<genexpr>   s    z"sgt_proportions.<locals>.<genexpr>)r
   countr	   )r'   
ValueErrorr   count_nonzerobincountflatnonzeror   r/   dictzipr!   fromiterr   isclose)r0   Z	freqfreqsZ	use_freqsrstarr.   Z	rstar_sumZrstar_iZpstarr   r4   r   sgt_proportionsc   s$   



"r@   )r   r   r   r   )r0   r   )__doc__
__future__r   numpyr   scipy.statsstatsr"   	Exceptionr   r   r   r/   r@   r   r   r   r   <module>   s   
A