o
    UݢgC                     @  s   d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
 d dlZd dlZd dlmZmZ d dlm  mZ d dlmZ d dlmZ d dlmZ e	rWd dlm Z  dZ!	ddddZ"dd Z#dddZ$G dd de
ddZ%G dd dZ&dS )    )annotationsN)reduce)TYPE_CHECKING	TypedDict)ensure_binary
ensure_str)CountMatrix  counts0
np.ndarraycounts1classifications+np.ndarray[int, np.dtype[np.bytes_]] | Nonec              
   C  s  |du r	t | |}| t| | t }||tjk }d||tjk  }d\}}|t|dk|dk  }|t|dk|dk  }	t|dkrt|	dkrz2t	j
jj|ddd\}
}}}t	j
jj|	ddd\}}}}t	j
jtj|
|}t	j
jtj||}W n? t	j
jjy } zt|tjd d\}}W Y d}~n$d}~w t	j
jjy } zt|tjd d\}}W Y d}~nd}~ww t|tjk||k }t|tjkd| |k }t|}t|}t|t|}t|t|}t||t}|tjk}t| |  | | ||   }|tjk}t||  | | ||   }|tjk|tjkB }tt| | ||  | | |  }|||||||||f	S )znCompute fraction of counts in putative single-cell GEMs.

    originating from the non-cell transcriptome
    N   )      ?r   r   )ZflocZfscale)file)classify_gemsastypefloatanalysis_constantsGEM_CLASS_GENOME0GEM_CLASS_GENOME1nplogical_andlenscipystatsbetaZfitZppfZ#COUNT_PURITY_OUTLIER_PROB_THRESHOLDZ_continuous_distnsZFitSolverErrorprintsysstderrZFitDataErrorsumtk_statsrobust_divideZ
logical_orintZmaximum)r
   r   r   Zfrac0purity0purity1Z
threshold0Z
threshold1Zfit_purity0Zfit_purity1Zalpha0Zbeta0_Zalpha1Zbeta1eZoutlier0Zoutlier1Z
n_outlier0Z
n_outlier1Zfrac_outlier0Zfrac_outlier1
is_outlierZgems0Zmean_purity0Zgems1Zmean_purity1Zsingle_cellZmean_overall_purity r*   n/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/analysis/multigenome.pycompute_count_purity   s~   	


r,   c                 C  sb   |dks|dkr
dS dt |t ||   t |t ||   }t | | }t|t | | | S )zGiven a number of observed multiplets and cell counts for two transcriptomes,.

    infer the total number of multiplets (observed + unobserved)
    r      )r   min)Zn_obs_multipletsZn_cells0Zn_cells1Zp_obs_multipletZmler*   r*   r+   infer_multiplets_from_observedq   s   r/   "np.ndarray[int, np.dtype[np.int_]]return$np.ndarray[int, np.dtype[np.bytes_]]c           	      C  s  t jgd \}}t| |kdkr2t|| kdkr2t| | |k t jd }t||| k t jd }t||g}|d |d  }|d dk rW|dkrWt| | t jd  }}t| |k||k}tdt	dd	 t j
D f}t|t jt j|}t j|tt|t jk|| k< |S )
a\  Classify counts by inferred number of distinct transcriptomes present in each GEM (1 or 2).

    Report analysis_constants.GEM_CLASS_GENOME0 for a single cell w/ transcriptome 0,
    report analysis_constants.GEM_CLASS_GENOME1 for a single cell w/ transcriptome 1,
    report analysis_constants.GEM_CLASS_MULTIPLET for multiple transcriptomes.
    r-   r   g      Y@r   2      Sc                 s  s    | ]}t |V  qd S N)r   ).0clsr*   r*   r+   	<genexpr>   s    z classify_gems.<locals>.<genexpr>)r   ZDEFAULT_MULTIPLET_THRESHOLDr!   r   
percentileZMULTIPLET_PROB_THRESHOLDsortedr   dtypemaxZGEM_CLASSESwhereGEM_CLASS_MULTIPLETr   r   r   Zlogical_not)	r
   r   Zthresh0Zthresh1Z
thresholdsZfold_changeZdoubletr<   resultr*   r*   r+   r      s8   
 
r   c                   @  sF   e Zd ZU ded< ded< ded< ded< ded< ded	< ded
< dS )MultiGenomeAnalysisResultzlist[bytes]barcodecallz	list[int]count0count1strgenome0genome1purity_outlierN)__name__
__module____qualname____annotations__r*   r*   r*   r+   rA      s   
 rA   F)totalc                   @  s   e Zd ZdZd&d'ddZdd Zefd(ddZdd Zdd Z	d)ddZ
d*ddZd*ddZdd Zed*dd Zed!d" Zed+d$d%ZdS ),MultiGenomeAnalysisz/Analysis of matrices when >1 genome is present.Nfiltered_matrixCountMatrix | Nonec                 C  s(   || _ i | _i | _d | _d| _d | _d S )N )rP   summaryr@   top_two_txomessuffixn_gems)selfrP   r*   r*   r+   __init__   s   
zMultiGenomeAnalysis.__init__c                 C  s.   | j d u rdS | j jdks| j jdkrdS dS )NTr   F)rP   Zbcs_dimZfeatures_dimrW   r*   r*   r+   is_zero_matrix   s
   
z"MultiGenomeAnalysis.is_zero_matrixr
   r0   r   
bootstrapsr$   c                 K  s   t ||}t|tjk}|dksJ t|t|ksJ tjd t|}t	|D ]7}tj
t|t|}	||	 }
||	 }t |
|}t|tjk}t|tjk}t|tjk}t|||||< q+|||fS )a#  An overridable method to determine the number of multiplets, should return.

        Args:
            counts0:
            counts1:
            bootstraps: Number of bootstrap iterations to run

        Returns:
            int: observed multiplets
            np.ndarray: the number of inferred multiplets,
                either as a single value, or as a list/vector of values if bootstrapping is used.
            np.ndarray: strings with elements of either GEM_CLASS_GENOME0,
                GEM_CLASS_GENOME1 or GEM_CLASS_MULTIPLET
        r   )r   r!   r   r?   r   r   randomseedZzerosrangechoicer   r   r/   )rW   r
   r   r[   kwargsr   n_multiplet_obsn_multiplet_bootiZboot_idxZcounts0_bootZcounts1_bootZgem_cls_bootZn_obs_multiplet_bootZn_cells0_bootZn_cells1_bootr*   r*   r+   _infer_multiplets   s$   




z%MultiGenomeAnalysis._infer_multipletsc              	     s  i }j d us	J j  fddD  dd  D }tt|d d d dd }fdd|D }|_ fdd|D }ttd	d
 |t t}|dkrXd S  fdd|D }t	t
fdd|D }j|dd d f |dd d f jd\}	}
}||d< tt|	|d< tt|
 |d< t|
 t}||d< dt|t |d< |
jdkrtt|
dt|d< tt|
dt|d< t|dd d f |dd d f |\	}}}}}}}}}|||d  d< |||d  d< ||tj d< |||d  d< |||d  d< |||d  d< |||d  d< || |tj d< | |dd d f  |dd d f  |d |d | d_|_  d S )Nc                   s   g | ]} j |qS r*   )rP   Zselect_features_by_genome)r7   grY   r*   r+   
<listcomp>  s    z/MultiGenomeAnalysis.run_all.<locals>.<listcomp>c                 S  s   g | ]}|j  qS r*   )mr!   )r7   Zmatr*   r*   r+   rf         r   r-   c                      g | ]} | qS r*   r*   r7   rc   )genomesr*   r+   rf         c                   s   g | ]} | j qS r*   )Zbcsrk   genome_matsr*   r+   rf   	  rh   c                 S  s   | t |B S r6   )set)axr*   r*   r+   <lambda>  s    z-MultiGenomeAnalysis.run_all.<locals>.<lambda>c                   rj   r*   r*   rk   rn   r*   r+   rf     rm   c                 3  s    | ]
}|   V  qd S r6   )Zselect_barcodes_by_seqZget_counts_per_bc)r7   rg   )use_barcodesr*   r+   r9     s
    
z.MultiGenomeAnalysis.run_all.<locals>.<genexpr>r   )rV   Zfiltered_bcs_observed_allZ filtered_bcs_observed_multipletsZ filtered_bcs_inferred_multipletsZ$filtered_bcs_inferred_multiplet_rater	   Z/filtered_bcs_inferred_normalized_multiplet_rateg      @Z'filtered_bcs_inferred_multiplet_rate_lbg     `X@Z'filtered_bcs_inferred_multiplet_rate_ubZ_filtered_bcs_mean_count_purityZ_filtered_bcs_purity_outliersZ!_filtered_bcs_frac_purity_outlier)rB   rC   rD   rE   rG   rH   rI   )rP   Zget_genomesr;   r   ZargsortrT   r   rp   r   Zvstacktuplerd   rV   r$   roundmeanr"   r#   sizer:   r,   lib_constantsZMULTI_REFS_PREFIXtolistr@   rS   _add_suffix_to_metrics)rW   dZtxome_countsZtop_txome_idxrT   Ztop_txome_cell_bc_seqsZ
n_barcodesZtop_two_filt_matsZtop_txome_reads_per_bcra   rb   Zgem_class_callZmultiplet_rater%   r&   Zoverall_purityZn_purity_outlier0Zn_purity_outlier1Zfrac_purity_outlier0Zfrac_purity_outlier1r)   r'   r*   )ro   rl   rW   rt   r+   run_all   s   
  

	zMultiGenomeAnalysis.run_allc                 C  sr   | j dkrdS t| j }|D ]}| j|| j|| j  < qt| j }|D ]}| j|| j|| j  < q(dS )zBMethod to update the result and suffix to add a suffix if need be.rR   N)rU   listrS   keyspopr@   )rW   Zsum_keyskeyZres_keysr*   r*   r+   r{   R  s   
z*MultiGenomeAnalysis._add_suffix_to_metricscolumnrF   c                 C  s   | j dkr
|| j  }| j| S )z4Get a value from the result with the suffix removed.rR   )rU   r@   )rW   r   r*   r*   r+   _get_wo_suffix^  s   


z"MultiGenomeAnalysis._get_wo_suffixbase_dirc           	   	   C  s  t j|d}t jt j|dd t|dk}tj|t jd}|	d| 
d| 
dd	g ttj}ttj}tt| 
dD ]7}t| 
d	| }||| 
d}||| 
d}|	t| 
d| | 
d
| | 
d| |g qBW d    d S 1 sw   Y  d S )Nzgem_classification.csvTexist_okw)ZlineterminatorrB   rG   rH   rC   rD   rE   )ospathjoinmakedirsdirnameopencsvwriterlinesepZwriterowr   r   r   r   r   r^   r   replace)	rW   r   Zcsv_file_pathfr   rG   rH   rc   rC   r*   r*   r+   save_gem_class_csvd  s8   

"z&MultiGenomeAnalysis.save_gem_class_csvc                 C  sd   t |}tjtj|dd t|d}tj| j	|ddd W d    d S 1 s+w   Y  d S )NTr   r      indent	sort_keys)
rO   	json_pathr   r   r   r   r   tk_safe_json
dump_numpyr@   )rW   r   json_file_pathr   r*   r*   r+   save_gem_class_json  s
   
"z'MultiGenomeAnalysis.save_gem_class_jsonc                 C  sD   t |d}tj| j|ddd W d    d S 1 sw   Y  d S )Nr   r   Tr   )r   r   r   rS   )rW   filenamer   r*   r*   r+   save_summary_json  s   "z%MultiGenomeAnalysis.save_summary_jsonc                 C  s$   t | }tj|rt |S d S r6   )rO   r   r   r   exists	load_json)r   r   r*   r*   r+   load_default_format  s   

z'MultiGenomeAnalysis.load_default_formatc                   s   t  }t| }t||_W d    n1 sw   Y  d|jv rEi  |jd D ]}| vr5t| |< q) fdd|jd D |jd< |S )NrC   c                   rj   r*   r*   )r7   rC   Zbytes_internr*   r+   rf     rm   z1MultiGenomeAnalysis.load_json.<locals>.<listcomp>)rO   r   jsonloadr@   r   )r   analysisr   rC   r*   r   r+   r     s   

zMultiGenomeAnalysis.load_json	base_pathc                 C  s   t j| dS )Nzanalysis.json)r   r   r   )r   r*   r*   r+   r     s   zMultiGenomeAnalysis.json_pathr6   )rP   rQ   )r
   r0   r   r0   r[   r$   )r   rF   )r   rF   )r   rF   )rJ   rK   rL   __doc__rX   rZ   NUM_MULTIPLET_BOOTSTRAP_SAMPLESrd   r}   r{   r   r   r   r   staticmethodr   r   r   r*   r*   r*   r+   rO      s$    	*W



rO   r6   )r
   r   r   r   r   r   )r
   r0   r   r0   r1   r2   )'
__future__r   r   r   r   r   	functoolsr   typingr   r   Znumpyr   Zscipy.statsr   Zsixr   r   Zcellranger.analysis.constantsr   	constantsr   Zcellranger.library_constantsZlibrary_constantsry   Ztenkit.safe_jsonZ	safe_jsonr   Ztenkit.statsr   r"   cellranger.matrixr   r   r,   r/   r   rA   rO   r*   r*   r*   r+   <module>   s.   R
*
