o
    Uݢg7                     @  sV  d dl mZ d dlZd dlmZmZ d dlmZ d dlZ	d dl
mZ d dlmZ d dlm  mZ d dlm  mZ d dlm  mZ d dlmZ d dlmZ d dlmZ ercd d	lmZ d d
l m!Z! dZ"G dd de#Z$dd Z%					 d8d9ddZ&d:ddZ'dZ(dZ)dZ*dd  Z+d;d%d&Z,d<d(d)Z-d=d.d/Z.d>d1d2Z/d?d3d4Z0d@d6d7Z1dS )A    )annotationsN)IterableMapping)TYPE_CHECKING)
ensure_str)sparsefuncs)PCA)irlb)
FeatureDef)CountMatrix   c                   @  s   e Zd ZdS )MatrixRankTooSmallExceptionN)__name__
__module____qualname__ r   r   f/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/analysis/pca.pyr   $   s    r   c                   s    fdd|D S )a"  If a matrix is subset down to only have columns indexed by cols_not_removed, and then is further subset to.

    only contain cols_used_after removal, in that order, than this method returns the index of which columns in the old
    matrix correspond the the columns in the new matrix.
    c                   s   g | ]} | qS r   r   ).0xcols_not_removedr   r   
<listcomp>.   s    z-get_original_columns_used.<locals>.<listcomp>r   )r   Zcols_used_after_removalr   r   r   get_original_columns_used(   s   r   matrixr   pca_features
int | Nonepca_bcsn_pca_componentsrandom_statemin_count_thresholdintc           !      C  s0  |du rt j}tjd | |\}}}t|j}	|du r)|j}t|j}	n2||jk r?ttjj	t|j|dd}	n||jkr[d| d|j d}
t
|
 |j}t|j}	|du rc|j}n||jkryd| d|j d	}
t
|
 |j}t|}t|j\}}t| | }tj|d
d| d }|du rt j}t||}||k r|tkrtdt
d| d| d |}|d t|krt
d ||	|}t|\}}}t|dt|   || }t||t| |d\}}}}}t| \}}}t|dt|   t||}|dd|f  ||| dd|f  | }t!|| jf}|j|dd|f< t"|}t#|t"|	d |  }t$dd | j%j&D | }t'| j} tj(| dd< || |< |j)| j|fks|J |j)|| jfksJ |j)|fksJ t*|||| |S )a?  Run a PCA on the matrix using the IRLBA matrix factorization algorithm.

    Prior to the PCA analysis, the
    matrix is modified so that all barcodes/columns have the same counts, and then the counts are transformed
    by a log2(1+X) operation.

    If desired, only a subset of features (e.g. sample rows) can be selected for PCA analysis.  Each feature is ranked
    by its dispersion relative to other features that have a similar mean count.  The top `pca_features` as ranked by
    this method will then be used for the PCA.

    One can also select to subset number of barcodes to use (e.g. sample columns), but in this case they are simply
    randomly sampled.

    Args:
        matrix (CountMatrix): The matrix to perform PCA on.
        pca_features (int): Number of features to subset from matrix and use in PCA. The top pca_features ranked by
                            dispersion are used
        pca_bcs (int): Number of barcodes to randomly sample for the matrix.
        n_pca_components (int): How many PCA components should be used.
        random_state (int): The seed for the RNG
        min_count_threshold (int): The minimum sum of each row/column for that row/column to be passed to PCA
                                   (this filter is prior to any subsetting that occurs).

    Returns:
        A PCA object
    Nr   F)sizereplacezYou requested z: barcodes but the matrix after thresholding only included z&, so the smaller amount is being used.z: features but the matrix after thresholding only included z. features,so the smaller amount is being used.stable)kindzMatrix rank is too smallz.There are fewer nonzero features or barcodes (z!) than requested PCA components (z%); reducing the number of components.g      ?zRequested number of PCA components is large relative to the matrix size, an exact approach to matrix factorization may be faster.      ?)centerr      c                 S     g | ]}|j qS r   idr   fr   r   r   r          zrun_pca.<locals>.<listcomp>)+analysis_constantsRANDOM_STATEnprandomseedZselect_axes_above_thresholdarangeZbcs_dimsortchoiceprintZfeatures_dimanalysis_statsnormalize_by_umisummarize_columnsTZget_normalized_dispersionsqueezeargsortPCA_N_COMPONENTS_DEFAULTminDEFAULT_RUNPCA_THRESHOLDr   floatZselect_barcodesZselect_featuresnormalize_and_transposer   inplace_column_scale
atleast_1dr	   r   dotzeroslensquarearrayfeature_reffeature_defsemptynanshaper   )!r   r   r   r   r   r   Zthresholded_matrix_Zthresholded_featuresZpca_bc_indicesmsgmmuvar
dispersionZpca_feature_indicesZlikely_matrix_rankpca_matZpca_norm_matZ
pca_centerZ	pca_scaledvZfull_norm_matZfull_centerZ
full_scaleZorg_cols_usedZtransformed_irlba_matrixZirlba_componentsZvariance_sumvariance_explainedfeatures_selectedZfull_dispersionr   r   r   run_pca1   s   "








rY   c                 C  s^   |    t| }td|j |_|j}t|\}}d|t|dk< t	|}|||fS )Nr'   r%   g        )
tocscr7   r8   r0   log2datar:   r9   wheresqrt)r   rP   crV   sr   r   r   rA      s   


rA      g     @@c                 C  sb   | t j ||t  d d  |t d  |t d  ||d  d d  t j }ttjt	
|S )a9  An approximate model of the memory consumption of PCA preprocessing plus IRLBA.

    The key factor is analysis_constants.NUM_IRLB_MATRIX_ENTRIES_PER_MEM_GB
    which is set empirically from looking at the memory consumption of many jobs.

    See lib/python/cellranger/test/test_pca.py for details & a test.
       g    eAra   )r.   "NUM_IRLB_MATRIX_ENTRIES_PER_MEM_GBEXTRA_IRLBA_PCSBYTES_PER_FEATUREBYTES_PER_BCIRLB_BASE_MEM_GBmaxh5_constants
MIN_MEM_GBr0   ceil)Znonzero_entriesfeaturesbcsZpcsZirlba_mem_gbr   r   r   get_irlb_mem_gb_from_matrix_dim   s   	

	rn   pca_mapMapping[int, PCA]base_dirstrc                 C  s   t | |j|jj| d S N)save_pca_csv_with_bc_featurerm   rI   rJ   )ro   r   rq   r   r   r   save_pca_csv  s   ru   library_typec           
      C  s   |   D ]@\}}tj|| d| d}tj|dd tj|d}|jjd }||ks/J dgdd	 t|D  }	t	||j|	| qd
S )zJUsed only for saving the results of PCA2 in the batch correction pipeline.rN   _componentsTexist_okprojection.csvr'   Barcodec                 S     g | ]}d |d  qS zPC-%dr'   r   r   ir   r   r   r         z!save_pca2_csv.<locals>.<listcomp>N)
itemsospathjoinmakedirstransformed_pca_matrixrM   rangeanalysis_iosave_matrix_csv)
ro   barcodesrq   rv   n_componentspcan_components_dir	matrix_fn	n_columnsmatrix_headerr   r   r   save_pca2_csv  s   r   r   Iterable[bytes]rl   Iterable[FeatureDef]c              
   C  s  |   D ]\}}tj|| d}tj|dd tj|d}|jjd }||ks,J dgdd t|D  }	t	||j|	| |j
jd	krftj|d
}
dgdd |D  }t	|
|j
|td|d  |jjd	krtj|d}ddg}t	||j|td|d  |jjd	krtj|d}ddg}t	||j|dd |D  |jjd	krtj|d}dg}t	||j|tdt|jd  qdS )zJUsed only for saving the results of python-based PCA in the ATAC pipeline.rw   Trx   rz   r'   r{   c                 S  r|   r}   r   r~   r   r   r   r      r   z0save_pca_csv_with_bc_feature.<locals>.<listcomp>r   zcomponents.csvPCc                 S  s   g | ]}t |jqS r   )r   r*   r+   r   r   r   r   '  s    zvariance.csvzProportion.Variance.Explainedzdispersion.csvZFeaturezNormalized.Dispersionc                 S  r(   r   r)   r+   r   r   r   r   7  r-   zfeatures_selected.csvN)r   r   r   r   r   r   rM   r   r   r   
componentsr!   rW   rS   rX   rF   )ro   r   rl   rq   r   r   r   r   r   r   Zcomponents_fnZcomponents_headerZvariance_fnZvariance_headerZdispersion_fnZdispersion_headerZfeatures_fnZfeatures_headerr   r   r   rt     sJ   rt   fnamec                 C  s   t | |tj d S rs   )r   save_dimension_reduction_h5r.   ANALYSIS_H5_PCA_GROUP)ro   r   r   r   r   save_pca_h5F  s   r   c                 C  s   t | |tj| d S rs   )r    save_pca2_dimension_reduction_h5r.   r   )ro   r   rv   r   r   r   save_pca2_h5L  s   r   filenamec                 C  s   t | tjtS )z+Load just the PCA info from an analysis h5.)r    load_dimension_reduction_from_h5r.   r   r   )r   r   r   r   load_pca_from_h5U  s   r   )NNNNr   )r   r   r   r   r   r   r   r   r   r   r   r    )r   r   )ro   rp   r   r   rq   rr   )ro   rp   rq   rr   rv   rr   )ro   rp   r   r   rl   r   rq   rr   )ro   rp   r   rr   )ro   rp   r   rr   rv   rr   )r   rr   )2
__future__r   r   collections.abcr   r   typingr   numpyr0   sixr   sklearn.utilsr   cellranger.analysis.constantsanalysis	constantsr.   cellranger.analysis.ioior   Zcellranger.analysis.statsstatsr7   cellranger.h5_constantsri   "cellranger.analysis.analysis_typesr   Zcellranger.analysis.irlbr	   Zcellranger.feature_refr
   cellranger.matrixr   r?   	Exceptionr   r   rY   rA   rd   re   rf   rn   ru   r   rt   r   r   r   r   r   r   r   <module>   sJ    
"



4
	