o
    Uݢg6                     @  s  U d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	Z
d dlZd dlmZ d dlm  mZ d dlm  mZ d dlm  mZ d dlmZ d dlmZmZmZm Z m!Z!m"Z" d dl#m$Z$ ej%ej&ej'dZ(d	Z)d
Z*e)de*diZ+e,Z-de.d< G dd dZ/dS )    )annotationsN)Sequence)	TypeAlias)ensure_binary)LSAPCAPLSATSNEUMAPDifferentialExpression)tablespcalsaplsatsneumapzt-SNEr
   r   
Projectionc                   @  s   e Zd Zeffd-ddZd	d
 Zdd Zdd Zdd Zd.ddZ	dd Z
ejfddZejfddZeeffd/ddZdd Zdd Zd d! Zd"d# Zd$d% Zed&d' Zed(d) Zed0d+d,ZdS )1SingleGenomeAnalysismatrixcr_matrix.CountMatrixmethodstrprojectionsSequence[str]c                 C  s   |sJ d|D ]
}|t tfv sJ q|| _|| _|| _tj| _tj| _	tj
| _g | _t| | _tj| _tj| _tj| _t| | _tj| _tj| _tj| _|j| _|j| _i | _ i | _!i | _"i | _#i | _$t%j&'d dS )a[  Initialize secondary analysis results for a single genome analysis.

        Args:
            matrix (cr_matrix.CountMatrix): count matrix
            method (str): dimensionality reduction method pca|lsa|plsa
            projections (Sequence[str], optional): List of manifold projections
                to load. Defaults to ("tsne",).
        "Must specify projection(s) to loadr   N)(	TSNE_NAME	UMAP_NAMEr   r   r   analysis_constantsRANDOM_STATErandom_stateMIN_N_CLUSTERSZmin_n_clustersMAX_N_CLUSTERS_DEFAULTZmax_n_clusters%n_dimensionality_reduction_components
COMPONENTSZtsne_input_dimsTSNE_N_COMPONENTSZn_tsne_componentsTSNE_DEFAULT_PERPLEXITYZ
perplexity
TSNE_THETAthetaZumap_input_dimsUMAP_N_COMPONENTSZn_umap_componentsUMAP_MIN_DISTZumap_min_distUMAP_DEFAULT_N_NEIGHBORSZumap_n_neighborsbcs_dimdr_bcsfeatures_dimZdr_featuresdimensionality_reduced_matrixclusteringsdifferential_expressionr   r   nprandomseed)selfr   r   r   proj r7   o/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/analysis/singlegenome.py__init__-   s4   

zSingleGenomeAnalysis.__init__c                 C  s   | j jdkp| j jdkS )Nr   )r   r,   r.   r5   r7   r7   r8   is_zero_matrixc   s   z#SingleGenomeAnalysis.is_zero_matrixc                 C  s   dd | j  D S )Nc                 S  s   i | ]
\}}|t |qS r7   )cr_clusteringget_cluster_sizes).0kvr7   r7   r8   
<dictcomp>g   s    z:SingleGenomeAnalysis.get_cluster_sizes.<locals>.<dictcomp>)r0   itemsr:   r7   r7   r8   r=   f   s   z&SingleGenomeAnalysis.get_cluster_sizesc                 C  s  | j || _ t| jt|| _| jdkr=| j D ]!\}}t|t	s%J t	|j
|d d f |j|j|j|j| j|< qnY| jdkrj| j D ]!\}}t|tsRJ t|j|d d f |j|j|j|j| j|< qGn,| jdkr| j D ]!\}}t|tsJ t|j|d d f |j|j|j|j| j|< qt| j D ]\}}t||| j|< q| j D ]\}}	t|	j|d d f |	j|	jd| j|< q| j D ]\}}
t|
j|d d f |
j|
jd| j|< qd S )Nr   r   r   )namekey)r   Zselect_barcodesminr-   lenr   r/   rB   
isinstancer   transformed_pca_matrix
componentsvariance_explained
dispersionfeatures_selectedr   transformed_lsa_matrixr   transformed_plsa_matrixr0   r<   subselect_barcodesr   r	   transformed_tsne_matrixrC   rD   r   r
   transformed_umap_matrix)r5   cell_bc_indicesn_componentsr   r   r   rD   
clusteringrC   r   r   r7   r7   r8   _select_bc_indicesi   sZ   

	
		z'SingleGenomeAnalysis._select_bc_indicesc                 C  s@   || j jkrdS ttjjt| j j|dd}| | dS )z<Subsample barcodes across entire analysis (matrix, DR, etc).NF)sizereplace)r   r,   r2   sortr3   choicearangerU   )r5   num_bcsrR   r7   r7   r8   subsample_bcs   s   z"SingleGenomeAnalysis.subsample_bcsNc                 C  s~   |s:| j tvrtd| j  dt| j  }|| jv r|}nt| jdkr*| jd }ntd| j  d| d| j d| j| S )	Nmethod z
 not found   r   z/Analysis has multiple reduced matrices of type z, but the defaultn_components z is not available: z. Please specify n_components)r   r$   
ValueErrorr#   rF   r/   )r5   rS   Zdefault_n_componentsr7   r7   r8   !get_dimensionality_reduced_matrix   s    




z6SingleGenomeAnalysis.get_dimensionality_reduced_matrixc                 C  s
   | j | S N)r0   )r5   Zcluster_keyr7   r7   r8   get_clustering   s   
z#SingleGenomeAnalysis.get_clusteringc                 C  j   || j v r| j | }|S d| j v rd}| j | }|S tj| j vr.d| j v r.d}| j | }|S t| d)$  Given analysis object, return the tsne with a given key while being compatible with older cellranger versions.

        Starting from CR7.0 and SR2.0, all libraries have their feature type prefixed on the tSNE output.
        Before, Gene Expression libraries only had the _2 prefix.
           2Nz not found in tsne analysis)r   r   TSNE_DEFAULT_KEYKeyError)r5   rD   r   r7   r7   r8   get_tsne      





zSingleGenomeAnalysis.get_tsnec                 C  rc   )rd   re   Nz not found in umap analysis)r   r   rf   rg   )r5   rD   r   r7   r7   r8   get_umap   ri   zSingleGenomeAnalysis.get_umapc                 C  sd  |sJ dt t| d}|tj }tj|}W d    n1 s$w   Y  t|||d}t	
| dr}d }|dkrAtj}n|dkrItj}n|dkrQtj}ntd|jj| }|| |jjtj }|| |jjtj }|| t|v r|jjtj }|| t|v r|jjtj }|| W d    |S W d    |S 1 sw   Y  |S )Nr   r)r   r   r   r   zmethod invalid)h5Filer   r   ANALYSIS_H5_MATRIX_GROUP	cr_matrixCountMatrixloadr   r   	open_fileANALYSIS_H5_PCA_GROUPANALYSIS_H5_LSA_GROUPANALYSIS_H5_PLSA_GROUPr_   root	_v_groups&_load_dimensionality_reduced_matrix_h5ANALYSIS_H5_CLUSTERING_GROUP_load_clustering_h5)ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP _load_differential_expression_h5r   ANALYSIS_H5_TSNE_GROUP_load_tsne_h5r   ANALYSIS_H5_UMAP_GROUP_load_umap_h5)filenamer   r   fgroupr   analysisgrpr7   r7   r8   load_h5   sD   






zSingleGenomeAnalysis.load_h5c           	      C  s   t d}tttd}| j|v rG|| j }t||D ])\}}||}|d u r/t	d|  t
||d}| j| || j|< qd S td| j d)Nz^([^\d]*?)_?(\d+)$r   zkey:    r]   z not allowed)recompiler   r   r   r   analysis_ioload_h5_itermatchprintintr   r#   appendr/   r_   )	r5   r   regexZmethod_to_classZdim_red_classrD   Zdim_redmrS   r7   r7   r8   rx     s   



z;SingleGenomeAnalysis._load_dimensionality_reduced_matrix_h5c                 C  s&   t |tjD ]	\}}|| j|< qd S ra   )r   r   r<   
CLUSTERINGr0   )r5   r   clustering_keyrT   r7   r7   r8   rz   !  s   z(SingleGenomeAnalysis._load_clustering_h5c                 C  s$   t |tD ]	\}}|| j|< qd S ra   )r   r   r   r1   )r5   r   r   der7   r7   r8   r|   %  s   z5SingleGenomeAnalysis._load_differential_expression_h5c                 C  &   t |tD ]
\}}|| j|j< qd S ra   )r   r   r	   r   rD   )r5   r   _r   r7   r7   r8   r~   )     z"SingleGenomeAnalysis._load_tsne_h5c                 C  r   ra   )r   r   r
   r   rD   )r5   r   r   r   r7   r7   r8   r   -  r   z"SingleGenomeAnalysis._load_umap_h5c                 C  sL   t | d}t|jtj}dd |D W  d   S 1 sw   Y  dS )z2Load just the clustering keys from an analysis h5.rk   c                 S  s   g | ]	}|j d d qS )r^   N)_v_name)r>   noder7   r7   r8   
<listcomp>6  s    zESingleGenomeAnalysis.load_clustering_keys_from_h5.<locals>.<listcomp>N)r   rr   getattrrv   r   ry   )r   r   r   r7   r7   r8   load_clustering_keys_from_h51  s   $z1SingleGenomeAnalysis.load_clustering_keys_from_h5c                 C  s|   t | d.}t|jtj}|D ]}|jd| kr)t|t	j
  W  d   S qtd| d|  1 s7w   Y  dS )z-Load a single clustering from an analysis h5.rk   r   NzCould not find clustering key: z in HDF5 file )r   rr   r   rv   r   ry   r   r   load_h5_namedtupler<   r   r_   )r   r   r   r   subgroupr7   r7   r8   load_clustering_from_h58  s   z,SingleGenomeAnalysis.load_clustering_from_h5Sequence[Projection]c                C  s(   t | }tj|rt|||S d S ra   )r   h5_pathospathexistsr   r   )base_dirr   r   Zh5_file_pathr7   r7   r8   load_default_formatD  s   
z(SingleGenomeAnalysis.load_default_format)r   r   r   r   r   r   ra   )r   r   )r   r   )__name__
__module____qualname__r   r9   r;   r=   rU   r\   r`   rb   r   rf   rh   rj   staticmethodr   rx   rz   r|   r~   r   r   r   r   r7   r7   r7   r8   r   ,   s0    61

'

r   )0
__future__r   os.pathr   r   collections.abcr   typingr   Zh5pyrl   numpyr2   sixr   cellranger.analysis.clusteringr   rT   r<   cellranger.analysis.constants	constantsr   cellranger.analysis.ioior   cellranger.matrixr   ro   "cellranger.analysis.analysis_typesr   r   r   r	   r
   r   cellranger.wrapped_tablesr   PCA_N_COMPONENTS_DEFAULTLSA_N_COMPONENTS_DEFAULTPLSA_N_COMPONENTS_DEFAULTr$   r   r   ZPROJECTION_TITLEr   r   __annotations__r   r7   r7   r7   r8   <module>   s.    