o
    Uݢg)                     @  s   d Z ddlmZ ddlmZ ddlZddlmZ ddl	m
Z ddlmZ ddlmZ ddlmZmZmZmZmZmZmZ dZdZdZd	Zd
ZdZdd Zdd Z G dd deZ!d"ddZ"d#ddZ#eeddd$d d!Z$dS )%z/Functions for calling cell-associated barcodes.    )annotations)
NamedTupleN)adjust_pvalue_bh)CHEMISTRY_DESCRIPTION_FIELDCHEMISTRY_SC3P_LTHT_CHEMISTRIESSC3P_V3_CHEMISTRIESSC3P_V4_CHEMISTRIESSC5P_CHEMISTRIESSC5P_V3_CHEMISTRIESg      Y@i  i 
      c                 C  s   | dd|f }t ||ddf jdd}t |dk}t|t | \}}t|}|dkr;||  }d}	n|| }	t |	t|}
||
t |< t |
 dsXJ |
S )at  Estimate a gene expression profile by Simple Good Turing.

    Args:
      raw_mat (sparse matrix): Sparse matrix of all counts
      barcode_indices (np.array(int)): Barcode indices to use
      nz_feat (np.array(int)): Indices of features that are non-zero at least once

    Returns:
      profile (np.array(float)): Estimated probabilities of length len(nz_feat).
    Nr   )axisr   g      g      ?)	npravelsumflatnonzerocr_sgtZsgt_proportionslenrepeatisclose)matrixZbarcode_indicesZnz_featZprof_matprofileZ	zero_featZ
p_smoothedp0n0Zp0_iZ	profile_p r   f/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/cell_calling.pyestimate_profile_sgt/   s   r   c                 C  s*   t t | d}t| ||}||fS )a  Estimate a gene expression profile on a given subset of barcodes.

    Use Good-Turing to smooth the estimated profile.

    Args:
      matrix (scipy.sparse.csc_matrix): Sparse matrix of all counts
      use_bcs (np.array(int)): Indices of barcodes to use (col indices into matrix)

    Returns:
      profile (use_features, np.array(float)): Estimated probabilities of length use_features.
    r   )r   r   asarrayr   r   )r   Zuse_bcsZ	use_featsZbg_profile_pr   r   r   est_background_profile_sgtT   s   r   c                   @  s>   e Zd ZU ded< ded< ded< ded< ded< ded< d	S )
NonAmbientBarcodeResultz
np.ndarrayeval_bcslog_likelihoodpvaluespvalues_adjis_nonambientintemptydrops_minimum_umisN)__name__
__module____qualname____annotations__r   r   r   r   r    i   s   
 r    chemistry_descriptionstrreturnfloatc                 C  s.   t t t t }dd |D }| |v rdS dS )zCGets the maximum adjusted p-value to call a barcode as non-ambient.c                 S     g | ]}|t  qS r   r   .0Zchemr   r   r   
<listcomp>v   s    z'get_empty_drops_fdr.<locals>.<listcomp>gMbP?g{Gz?)r	   r   r
   r   )r,   ZchemistriesZ
chem_namesr   r   r   get_empty_drops_fdrr   s   r5   num_probe_bcs
int | Nonetuple[int, int]c                 C  sf   | t t kr	d}n$| dd tt t D v r"|du rd}nd| }n|dur+d| }nd}|d |fS )	a#  Gets the range of values to use for empty drops background given a chemistry description.

    Args:
        chemistry_description: A string describing the chemistry
        num_probe_bcs: The number of probe or OCM multiplexing barcodes

    Returns:
        (lower_range, upper_range)
    i(#  c                 S  r0   r   r1   r2   r   r   r   r4      s    z)get_empty_drops_range.<locals>.<listcomp>Ni q i@  i_    )r   r   r   r	   r   )r,   r6   Zn_partitionsr   r   r   get_empty_drops_rangez   s   



r:   multinomial)r'   num_simsmethodNonAmbientBarcodeResult | Nonec             
     s  |dv sJ |   }t|}t||\}	}
t|}td|	 d|
  |ddd |	|
 }|  t|}|  tj||dd}t	|dkr{zt
| j|\}}|d	kr_t| j||}W n' tjyz } ztt| W Y d}~dS d}~ww tjdtjd
}td}t| ttj fdd| jD t	| jtd}| dkrdS tjt| j}tj||< tj|| dd}t|d| }td|  tj|||k < t	||j  }ttj||jdd| }|  t	|dkrdS t t!||rJ t t!||rJ tdt	|  td|| "  d||    tdt	|  tdt	|  | j|ddf dd|f }t	|dkrrt#tj$t	|}t#dt	|}t#tj$t	|}dS |d	krt%||| }tj&|| || |d\}}nt'|t(|}tj)||| |d\}}t*|| |||}t+|}td|  tdt"|  ||k}tdt|  t,||||||dS )a  Call barcodes as being sufficiently distinct from the ambient profile.

    Args:
      matrix (CountMatrix): Full expression matrix.
      orig_cell_bcs (iterable of str): Strings of initially-called cell barcodes.
      chemistry_description: Change ambient RNA estimation for LT chemistry
      num_probe_bcs: The number of probe or OCM multiplexing barcodes
      emptydrops_minimum_umis: Minimum UMI threshold
      num_sims: Number of simulations

    Returns:
      NonAmbientBarcodeResult
    )	dirichletr;   zRange empty barcodes: z - NT)assume_uniquer   r?   )dtypec                 3  s    | ]}| v V  qd S )Nr   )r3   bcZorig_cell_bc_setr   r   	<genexpr>   s    z+find_nonambient_barcodes.<locals>.<genexpr>)countrB   )initialr   zMax background UMIs: )maskzNumber of candidate bcs: zRange candidate bc umis: z, zNumber of empty bcs: zNumber of original cell calls: )r<   zMax adjusted P-value: zMin observed P-value: z+Non-ambient bcs identified by empty drops: )r!   r"   r#   r$   r%   r'   )-get_counts_per_bcr   argsortr:   r5   printsortr   intersect1dr   r   mcr_statsZ!estimate_dirichlet_overdispersionr   ZSimpleGoodTuringErrorr-   zerosint64setfromiterbcsboolr   maarrayarangebcs_dimmaskedmaxrH   masked_arrayanyisinminr   nanZ)eval_dirichlet_multinomial_loglikelihoodsZ-simulate_dirichlet_multinomial_loglikelihoodsZeval_multinomial_loglikelihoodslogZ#simulate_multinomial_loglikelihoodsZcompute_ambient_pvaluesr   r    )r   orig_cell_bcsr,   r6   r'   r<   r=   umis_per_bcZbc_orderlowhighZmax_adj_pvalueZ	empty_bcsZnz_bcsZambient_bcsZeval_featuresZambient_profile_palphaeZ
orig_cellsr!   Zmax_background_umisZn_unmasked_bcsZeval_matZ	obs_loglkr#   Z	sim_loglkZdistinct_nsr$   r%   r   rD   r   find_nonambient_barcodes   s   



$


rh   )r,   r-   r.   r/   )r,   r-   r6   r7   r.   r8   )r.   r>   )%__doc__
__future__r   typingr   numpyr   numpy.marV   Zcellranger.sgtZsgtr   Zcellranger.statsstatsrO   Zcellranger.analysis.diffexpr   cellranger.chemistryr   r   r   r   r	   r
   r   MIN_GLOBAL_UMISMAX_MITO_PCTMIN_UMISNUM_SIMS%TARGETED_CC_MIN_UMIS_ADDITIONAL_CELLS&TARGETED_CC_MIN_UMIS_FROM_TARGET_GENESr   r   r    r5   r:   rh   r   r   r   r   <module>   s0   $%
	
#