o
    Uݢg$                     @  s  d Z ddlmZ ddlZddlZddlmZmZ ddlZ	ddl
ZddlZddlm  mZ ddlm  mZ ddlm  mZ ddlmZ ddlmZmZ erSddlZdZd$ddZefd%ddZ d%ddZ!dd Z"d&ddZ#	d'd(ddZ$			d)ddZ%d d! Z&d"d# Z'dS )*z9Differential expression analysis for single-cell RNA-seq.    )annotationsN)TYPE_CHECKINGAny)DifferentialExpression)compute_sseq_params_o3sseq_differential_expression_o3gףp=
?xscipy.sparse.csc_matrixreturn%np.ndarray[int, np.dtype[np.float64]]c                 C  s2   t t | jdd}|t jt | }|S )zEstimate size factors (related to cell RNA content and GEM-to-GEM technical variance).

    Args:
      x: Sparse matrix (csc) of counts (feature x cell)

    Returns:
      Array of floats, one per cell.
    r   )axis)npsqueezeasarraysumastypefloat64median)r   Zcounts_per_cellZsize_factors r   j/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/analysis/diffexp.pyestimate_size_factors    s   	r   c                 C  s   | j s|  } t| |S )a  Compute global parameters for the sSeq differential expression method.

    The key parameters are the shrunken feature-wise dispersions.

    This method was published in:
    Yu D, et al. (2013) Shrinkage estimation of dispersion in Negative Binomial models for RNA-seq experiments with small sample size.
    Bioinformatics. 29: 1275-1282. doi: 10.1093/bioinformatics/btt143

    Args:
      x: Sparse matrix (csc) of counts (feature x cell)
      zeta_quantile (float): Quantile of method-of-moments dispersion estimates to
                             use as the shrinkage target zeta.

    Returns:
      A dictionary containing the sSeq parameters and some diagnostic info.
    )has_sorted_indicessorted_indicesr   )r   Zzeta_quantiler   r   r   compute_sseq_params.   s   
r   c                 C  s   t d tj  t||g}| dd|f }|js| }t|}tj	t
t|tt|d}tj	t
t||t|t| d}t||||fS )zCompute locally-distinguishing sseq parameters.

    For perturbation vs control analysis for CRISPR
    and meta sample comparison within an aggregated matrix.
    z*...Computing params for this comparison...N)dtypecount)printsysstdoutflushr   concatenater   r   lenfromiterrangeintr   )r   group_agroup_bZboth_conditionsZmatrix_groupsZnbcZnew_group_aZnew_group_br   r   r   get_local_sseq_paramsE   s   
"r'   c                 C  s\   t | ddd }tt| t t| dd }t dt j|| |  }|t | S )zOMultiple testing correction of p-values using the Benjamini-Hochberg procedure.Nr      )r   argsortfloatr!   arangeminimum
accumulate)p
descendingscaleZqualr   r   r   adjust_pvalue_bhX   s   r2     c                 C  sj   | j s|  } t| ||||}t|d |d |d |d |d |d |d |d |d	 |d
 d
}|S )a  Run sSeq pairwise differential expression test.

    Args:
      x: Sparse matrix (csc) of counts (feature x cell)
      cond_a (np.array(int)): Indices of cells in group A
      cond_b (np.array(int)): Indices of cells in group B
      sseq_params (dict): Precomputed global parameters
      big_count (int): Use asymptotic approximation if both counts > this

    Returns:
      pd.DataFrame: DE results for group A relative to group B.
    Zuse_gZsums_inZsums_outcommon_meancommon_dispersionZnormalized_mean_inZnormalized_mean_outZp_valuesZadjusted_p_valueslog2_fold_change)
ZtestedZsum_aZsum_br4   r5   norm_mean_aZnorm_mean_bp_valueadjusted_p_valuer6   )r   r   r   pd	DataFrame)r   Zcond_aZcond_bsseq_paramsZ	big_countZdiff_exp_results	de_resultr   r   r   sseq_differential_expressiond   s&   
r>   clusters%np.ndarray[Any, np.dtype[np.integer]]r<   dict[str, Any] | Nonec           
      C  s  t |}|du rtd tj  t| j}t | j	d| f}t
dd| D ]V}||k}t |}t t |}td|  tj  t| j|||}	|	d |dddd|d   f< |	d |dddd|d   f< |	d	 |ddd
d|d   f< q(t|S )a
  Compute differential expression for each cluster vs all other cells.

    Args:
        matrix (GeneBCMatrix):  feature expression data
        clusters (np.ndarray[int, int]):  1-based cluster labels
        sseq_params (dict):  params from compute_sseq_params
    NzComputing params...   r)   zComputing DE for cluster %d...r7   r   r6   r9      )r   maxr   r   r   r   r   mzerosfeatures_dimr#   flatnonzerological_notr>   r   )
matrixr?   r<   
n_clustersZall_de_resultsclusterZ
in_clusterr%   r&   r=   r   r   r   run_differential_expression   s"   




  "rM   differential_expressionc                 C  s  |}| durt j|| }t j|dd t j||d }ddg}	|jjd d }
t|
D ]L}|rH|	||  d	||  d
||  dg7 }	q-|du r`|	d|d  d|d  d|d  g7 }	q-|	d||  d	d||  d
d||  dg7 }	q-dd |jjD }t	
||j|	| dS )Write diffexp results to CSV.NTexist_okz.csv
Feature IDFeature Namer)   rB   z, Mean Countsz, Log2 fold changez, Adjusted p valueCluster %d Mean CountsCluster %d Log2 fold changeCluster %d Adjusted p valuezPerturbation c                 S  s   g | ]}|j |jfqS r   )idname).0fr   r   r   
<listcomp>   s    z4save_differential_expression_csv.<locals>.<listcomp>)ospathjoinmakedirsdatashaper#   feature_reffeature_defsanalysis_iosave_matrix_csv)clustering_keyderJ   base_dirZcluster_names	file_nameZ
cell_typesout_dirdiff_expression_fndiff_expression_headerrK   idiff_expression_prefixesr   r   r    save_differential_expression_csv   s:   




ro   c           	      C  s   t j|| }t j|dd t j|d}ddg}|jjd d }t|D ]}|d|d  d	|d  d
|d  g7 }q%t||j|| dS )rO   TrP   zdifferential_expression.csvrR   rS   r)   rB   rT   rU   rV   N)	r\   r]   r^   r_   r`   ra   r#   rd   re   )	rf   rg   rn   rh   rj   rk   rl   rK   rm   r   r   r   .save_differential_expression_csv_from_features   s   



rp   c                 C  s:   |  | jtj}t| ||| t| tjtjt	| dS )z$Write diffexp results to H5File `f`.N)
create_grouprootanalysis_constantsZ)ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUPrd   Zsave_h5cr_clusteringZcreate_legacy_kmeans_nodesZ0ANALYSIS_H5_KMEANS_DIFFERENTIAL_EXPRESSION_GROUPr   )rZ   rf   rg   groupr   r   r   save_differential_expression_h5   s   rv   )r   r	   r
   r   )r   r	   )r3   )N)r?   r@   r<   rA   )NrN   N)(__doc__
__future__r   r\   r   typingr   r   numpyr   pandasr:   scipy.sparsescipyZcellranger.analysis.clusteringanalysisZ
clusteringrt   Zcellranger.analysis.constants	constantsrs   Zcellranger.analysis.ioiord   Z"cellranger.analysis.analysis_typesr   cellranger.fast_utilsr   r   scipy.statsZSSEQ_ZETA_QUANTILEr   r   r'   r2   r>   rM   ro   rp   rv   r   r   r   r   <module>   s:   


(+
-