o
    Uݢg                     @  s0  d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
mZmZmZmZmZ d dlmZ d dlmZmZmZ d dlZd dlZd dlmZ d dlmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z& d dl'm(  m)Z* d dl+mZ, d dl-m.Z/ d dl0m1Z2 d dl3m4Z4 d d	l5m6Z6m7Z7m8Z8 d d
l9m:Z: dZ;dZ<dZ=dZ>dZ?dSdTddZ@edejAdZBdUddZCdZDdZEdZFd ZGd!ZHe#jIeFge#jJ ZKG d"d# d#eLZMdVd'd(ZNdWd.d/ZOdXd2d3ZPG d4d5 d5ZQG d6d7 d7ZRdYd;d<ZS	dZd=d>ZTd?d@ ZUdAdB ZVd[dEdFZWdGdH ZXdIdJ ZYd\dLdMZZd]dOdPZ[dQdR Z\dS )^    )annotationsN)OrderedDict)Callable
Collection	ContainerIterableMappingSequence)BufferedIOBase)AnyTypeVaroverload)ensure_binary
ensure_str)MatrixBarcodeIndex)GENOME_FEATURE_TAG
FeatureDefFeatureReference)tablesgzipi8 int32zfeatures.tsv.gz
normalizedaxisintreturn
np.ndarrayc                 C  s*   t | j|d}t |j}||fS )z"Sum a sparse matrix along an axis.r   )npasarraysumprodshapeZreshape)matrixr   Zaxis_sumZmax_dim r#   `/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/matrix.pysum_sparse_matrix/   s   r%   _T)boundarraynp.ndarray[Any, np.dtype[_T]]nIterable[tuple[int, _T]]c                 C  s8   t | | | d }|t | |  }t|| | S )zRetrieve the N largest elements and their positions in a numpy ndarray.

    Args:
       array (numpy.ndarray): Array
       n (int): Number of elements

    Returns:
       list of tuple of (int, x): Tuples are (original index, value).
    N)r   ZargpartitionZargsortzip)r(   r*   indicesr#   r#   r$   top_n9   s   
r.   r"   version   Zsoftware_versionc                   @  s   e Zd ZdS )NullAxisMatrixErrorN)__name__
__module____qualname__r#   r#   r#   r$   r1   U   s    r1   originalsp_sparse.spmatrixsubsetc                 C  s   | j |j kr|| j }|S )a  Scipy has the fantastic habit of subsetting a matrix of one dtype down to a float64 if no elements are selected.

    This function makes the new matrix have the same type as the old matrix, and can be called after subsetting.

    Args:
        original: The original matrix
        subset: The matrix after subsetting

    Returns:
        A type correct version of the matrix
    )dtypeastype)r5   r7   r#   r#   r$   _ensure_types_matchY   s   r:   fileh5.Fileextra_attrs@None | Mapping[str | bytes, str | bytes | Iterable[str | bytes]]Nonec                 C  s,   |r|  D ]\}}t| || qdS dS )z"Set optional top level attributes.N)itemscr_h5Zset_hdf5_attr)r;   r=   kvr#   r#   r$   _save_extra_attrsl   s
   rD   
sw_versionNone | str | bytesc                 C  s   |r	|| j t< d S d S N)attrsSOFTWARE_H5_VERSION_KEY)r;   rE   r#   r#   r$   _save_sw_versionu   s   rJ   c                   @  s0  e Zd ZdZ		dYdZdd	Zed[ddZed[ddZd\ddZd\ddZ	d]d^ddZ
ed_ddZed`ddZdad dZdbd#d$Zdcd'd(Zddd*d+Zded,d-Zdfd.d/Zdgd2d3Zdhd6d7Zdid8d9Zd:d; Zdjd>d?ZdkdBdCZdldEdFZdmdIdJZdndLdMZdodPdQZdpdSdTZdUdV ZdWdX ZdS )qCountMatrixViewzBSupports summing a sliced CountMatrix w/o copying the whole thing.Nr"   CountMatrixfeature_indicesCollection[int] | None
bc_indicesc                 C  s~   t j|jdd| _t j|jdd| _|| _|d ur'| jd d| jt |< |d ur9| jd d| jt |< | 	  d S )Nboolr8   FT)
r   Zonesfeatures_dimfeature_maskbcs_dimbc_maskr"   fillr   _update_feature_ref)selfr"   rM   rO   r#   r#   r$   __init__}   s   zCountMatrixView.__init__r   r   c                 C     t | jS rG   )r   count_nonzerorU   rX   r#   r#   r$   rT         zCountMatrixView.bcs_dimc                 C  rZ   rG   )r   r[   rS   r\   r#   r#   r$   rR      r]   zCountMatrixView.features_dimc                 C  s2   t | j}t| j|_t| j|_|  |S zReturn a copy of this view.)rK   r"   r   copyrU   rS   rW   )rX   viewr#   r#   r$   _copy   s
   
zCountMatrixView._copyc                 C  s   |   S r^   )ra   r\   r#   r#   r$   r`         zCountMatrixView.viewr   
int | Nonec                 C  s   t j| jj| j| j|dS )zSum across an axis.r   )	cr_sparseZ
sum_maskedr"   mrS   rU   )rX   r   r#   r#   r$   r      s   zCountMatrixView.sumr?   	threshold	np.uint64c                 C     d S rG   r#   rX   r   rf   r#   r#   r$   count_ge      zCountMatrixView.count_ge$np.ndarray[int, np.dtype[np.uint64]]c                 C  rh   rG   r#   ri   r#   r#   r$   rj      rk   0np.uint64 | np.ndarray[int, np.dtype[np.uint64]]c                 C  s   t | jj| j| j||S )z+Count number of elements >= X over an axis.)rd   Zcount_ge_maskedr"   re   rS   rU   ri   r#   r#   r$   rj      s   r-   list[int] | np.ndarrayc                 C  s@   |   }tj|t|jdt}d||dk< | j|M  _|S )z\Select a subset of barcodes (by index in the original matrix) and return the resulting view.Z	minlength   )ra   r   bincountlenrU   r9   rP   rX   r-   r`   maskr#   r#   r$   select_barcodes   s
   zCountMatrixView.select_barcodesbarcode_seqslist[bytes]c                 C  s   | j |}| |S rG   )r"   bcs_to_intsru   rX   rv   r-   r#   r#   r$   select_barcodes_by_seq      
z&CountMatrixView.select_barcodes_by_seq	gem_groupc                   s   |   fdd| jjD S )Nc                   s"   g | ]} t |d  kr|qS rp   )cr_utilsZsplit_barcode_seq.0bcr|   r#   r$   
<listcomp>   s   " z@CountMatrixView.select_barcodes_by_gem_group.<locals>.<listcomp>)rz   r"   bcsrX   r|   r#   r   r$   select_barcodes_by_gem_group   s   z,CountMatrixView.select_barcodes_by_gem_groupc                   s:   t  j}t fdd|D  jjj jjjd _dS )z<Make the feature reference consistent with the feature mask.c                   s   g | ]	} j jj| qS r#   )r"   feature_reffeature_defsr   ir\   r#   r$   r          z7CountMatrixView._update_feature_ref.<locals>.<listcomp>)r   all_tag_keystarget_featuresN)r   flatnonzerorS   r   r"   r   r   r   )rX   r-   r#   r\   r$   rW      s   z#CountMatrixView._update_feature_refc                 C  sH   |   }tj|t|jdt}d||dk< | j|M  _|  |S )z:Select a subset of features and return the resulting view.ro   rp   )ra   r   rq   rr   rS   r9   rP   rW   rs   r#   r#   r$   select_features   s   zCountMatrixView.select_featuresgenomestrc                 C  sD   g }| j jjD ]}|jtjkr|jd |kr||j q| 	|S zMSelect the subset of gene-expression features for genes in a specific genome.r   
r"   r   r   feature_typerna_libraryDEFAULT_LIBRARY_TYPEtagsappendindexr   rX   r   r-   featurer#   r#   r$   select_features_by_genome   s   
z)CountMatrixView.select_features_by_genomefeature_typesContainer[str]c                 C  sZ   g }| j jjD ] }|j|v r'd}|jtjkr|jd |krd}|r'||j q| 	|S )zSubset the features by types and genome.

        Select the subset of gene-expression features for genes in a specific genome and matching
        one of the types listed in feature_types.
        Tr   Fr   )rX   r   r   r-   r   Zinclude_featurer#   r#   r$   #select_features_by_genome_and_types   s   

z3CountMatrixView.select_features_by_genome_and_typesc                 C  s4   g }| j jjD ]}|j|v r||j q| |S )zSubset the features by type.

        Select the subset of gene-expression features for genes in a specific genome and matching
        one of the types listed in feature_types.
        )r"   r   r   r   r   r   r   rX   r   r-   r   r#   r#   r$   select_features_by_types   s   

z(CountMatrixView.select_features_by_typesc                 C     |  | |S zVSelect the subset of features with a particular feature type (e.g. "Gene Expression").)r   get_feature_indices_by_typerX   r   r#   r#   r$   select_features_by_type      z'CountMatrixView.select_features_by_typefeature_idsIterable[bytes]c                 C  s   |  | j|S rG   )r   r"   feature_ids_to_intsrX   r   r#   r#   r$   select_features_by_ids     z&CountMatrixView.select_features_by_idsr   	list[int]c                 C  s   | j j|S )zDReturn the list of indices of features corresponding a feature type.)r"   r   Zget_indices_for_typer   r#   r#   r$   r        z+CountMatrixView.get_feature_indices_by_type	list[str]c                 C  rZ   KGet a list of the distinct genomes represented by gene expression features.rL   _get_genomes_from_feature_refr   r\   r#   r#   r$   get_genomes  r]   zCountMatrixView.get_genomesr   list[bytes] | set[bytes]c                 C  s(   t | j| jjkrtd| j|S )NzDCalling bcs_to_ints on a barcode-sliced matrix view is unimplemented)r   r[   rU   r"   rT   NotImplementedErrorrx   )rX   r   r#   r#   r$   rx     s
   zCountMatrixView.bcs_to_intsbc_intsc                   sD   |d u s
t |dkrg S t j}|t| } fdd|D S )Nr   c                   s   g | ]} j j| qS r#   )r"   r   r   r\   r#   r$   r         z/CountMatrixView.ints_to_bcs.<locals>.<listcomp>)rr   r   r   rU   r   )rX   r   Zsliced_bc_intsZorig_bc_intsr#   r\   r$   ints_to_bcs  s
   zCountMatrixView.ints_to_bcsr   bytesc                 C     | j j| jS rG   r   r   idrX   r   r#   r#   r$   int_to_feature_id     z!CountMatrixView.int_to_feature_idtuple[int, int]c                 C  s   t | jt | jfS z&Return the shape of the sliced matrix.)r   r[   rS   rU   r\   r#   r#   r$   	get_shape!  s   zCountMatrixView.get_shapec                 C  s   | j dddS ):Return the number of nonzero entries in the sliced matrix.Nrp   )r   rf   )rj   r\   r#   r#   r$   get_num_nonzero%  r   zCountMatrixView.get_num_nonzeroc                 C  s   | j ddS Nr   r   )r   r\   r#   r#   r$   get_counts_per_bc)  s   z!CountMatrixView.get_counts_per_bcNN)r"   rL   rM   rN   rO   rN   )r   r   )r   rK   rG   )r   rc   )r   r?   rf   r   r   rg   )r   r   rf   r   r   rl   )r   rc   rf   r   r   rm   )r-   rn   )rv   rw   r   rK   )r|   r   r   rK   r   r?   )r-   rn   r   rK   )r   r   )r   r   r   r   )r   r   )r   r   r   rK   )r   r   r   r   )r   r   )r   r   r   r   )r   rn   r   rw   r   r   r   r   )r   r   )r2   r3   r4   __doc__rY   propertyrT   rR   ra   r`   r   r   rj   ru   rz   r   rW   r   r   r   r   r   r   r   r   rx   r   r   r   r   r   r#   r#   r#   r$   rK   z   sF    







	
	
	







rK   c                   @  s  e Zd ZdddZd	d
 Zdd Zdd ZeefdddZ	e
dddZdddZddd Zdd"d#Zdd%d&Zdd'd(Zdd*d+Zddd0d1Zdd3d4Zdd8d9Zddd<d=Zdd>d?ZddAdBZddCdDZ	E	EdddHdIZd dLdMZe
d!dOdPZe
d"dQdRZe
d#dTdUZe
d"dVdWZe
d$d[d\Ze
e j!d,fd%dadbZ"e
e j!d,fd&dcddZ#e
e j!d,fd'dedfZ$e
e j!d,fd(didjZ%e
d)dkdlZ&e
d*dmdnZ'e
d+dodpZ(e
dqdr Z)ed,dsdtZ*ed-dwdxZ+e
d.dydzZ,e
d*d{d|Z-e
d*d}d~Z.e
d*ddZ/e
d*ddZ0e
d/ddZ1d/ddZ2e
d0ddZ3e
d1ddZ4dd Z5dd Z6dd Z7dd Z8	d2d3ddZ9d2d4ddZ:d5ddZ;d6ddZ<d7ddZ=d8ddZ>d9ddZ?d:ddZ@d;ddZAd<ddZBd=ddZCd>ddZDd?ddZEd@ddZFdAddZGdBddZHdCddńZIe
dDddȄZJddʄ ZKe
dEdd̄ZLe
dd΄ ZMdFddфZNdFddӄZO	EdGdHddքZPdFdd؄ZQdFddڄZRdIdd݄ZS	EdJdFddZTdFddZUdKddZV	E	,dLdMddZWe
dNddZXe
dOddZYe
dd ZZe
ddPddZ[e
dd Z\e
dd Z]dQddZ^e
dd Z_dRddZ`dES (S  rL   r   r   r   #Collection[bytes] | Collection[str]r"   r6   c                 C  s   || _ t|j| _dd |jD | _t|tjr$|jj	tj
u r$| }n(t|dkr3tjg ddd}ntdd |D }tj|t|ttj
|fd	}~d|j_|| _| jj\| _t| j | jjj| _|| _| jjd
 t| jksyJ dd S )Nc                 S  s   i | ]}|j |jqS r#   )r   r   r   fr#   r#   r$   
<dictcomp>8      z(CountMatrix.__init__.<locals>.<dictcomp>r   SF)r8   r_   c                 s      | ]}t |V  qd S rG   rr   r   r#   r#   r$   	<genexpr>@      z'CountMatrix.__init__.<locals>.<genexpr>countr8   rp   z(Barcodes must be equal to cols of matrix)r   rr   r   rR   feature_ids_map
isinstancer   Zndarrayr8   typeZbytes_r_   r(   maxfromiterflagsZ	writeabler   r!   rT   r   Zfrom_raw_bytestobytesitemsizebcs_idxre   )rX   r   r   r"   Zbc_arrayZmax_lenr#   r#   r$   rY   /  s$   
 "zCountMatrix.__init__c                 C     | j jS r   )re   r!   r\   r#   r#   r$   r   N  rb   zCountMatrix.get_shapec                 C  r   )r   )re   nnzr\   r#   r#   r$   r   R  rb   zCountMatrix.get_num_nonzeroc                 C  s   t | S )zReturn a view on this matrix.)rK   r\   r#   r#   r$   r`   V  rb   zCountMatrix.viewCollection[bytes]c                 C  s*   t jt|jt|f|d}| |||dS )zCreate an empty matrix.rQ   r   r   r"   )	sp_sparse
lil_matrixrr   r   )clsr   r   r8   r"   r#   r#   r$   emptyZ  s   zCountMatrix.emptyh5_filer<   r   c              
     s  g }g }g }g }g }g }t   t|  D ]\}}| | }	tdd |D }
||	d dd |
  |	d dd }|D ]}| vrIt  |< q=tj fdd|D t|dd}|	d	 dd }t|d
t| ksoJ |dkrt|t	t|d
 sJ t
|}t|t|	d ksJ t||}t|t|	d ksJ t|t|	d ksJ || ||	d dd  ||	d dd  ||	d dd  |t|t|	d  qt|}t|}t|}g }t|||D ]\}}}|tt|||tjt|id qt|tg}t|}t|}t|}t t s0J tj|||fft|t fd}t|  |S )z@Create a CountMatrix from a legacy h5py.File (format version 1).c                 s  r   rG   r   r   xr#   r#   r$   r   q  r   z0CountMatrix.from_legacy_v1_h5.<locals>.<genexpr>r-   Nbarcodesc                 3  s    | ]} | V  qd S rG   r#   r   Zbarcode_mapr#   r$   r     r   uint64r   indptrrp   r   datagenes
gene_names)r   r   namer   r   r!   )r   	enumeratekeysr   r   rr   r   r   Zarray_equalZarangediffrepeatZconcatenater,   r   r   GENE_EXPRESSION_LIBRARY_TYPEr   r   r   r   
csc_matrixrL   )r   Zgenome_arraysZgene_id_arraysZgene_name_arraysZbc_idx_arraysZfeat_idx_arraysZdata_arraysZ
genome_idxr   gZn_genesr   r   Zremapped_col_indsr   Znz_elems_per_bcZbc_idxgenomesZgene_idsr   r   Zgene_idZ	gene_namer   r   jr   r"   r#   r   r$   from_legacy_v1_h5`  sl   




	


"zCountMatrix.from_legacy_v1_h5
feature_idr   r   c                 C  sH   t |tstd| dt| || jvrtd|  | j| S )Nzfeature_id z must be bytes, but was z*Specified feature ID not found in matrix: )r   r   KeyErrorr   r   decode)rX   r  r#   r#   r$   feature_id_to_int  s
   


zCountMatrix.feature_id_to_intr   r   r   c                   s   t  fdd|D S )Nc                 3  s    | ]}  |V  qd S rG   )r  )r   Zfidr\   r#   r$   r     s    z2CountMatrix.feature_ids_to_ints.<locals>.<genexpr>)sortedr   r#   r\   r$   r     s   zCountMatrix.feature_ids_to_intsr   c                 C  s   |  |}| jj| jS rG   )r  r   r   r   )rX   r  idxr#   r#   r$   feature_id_to_name     
zCountMatrix.feature_id_to_namer   c                 C  r   rG   r   r   r#   r#   r$   r     r   zCountMatrix.int_to_feature_idc                 C  r   rG   )r   r   r   r   r#   r#   r$   int_to_feature_name  r   zCountMatrix.int_to_feature_namer   c                 C     | j |S )a!  Get the integer index for a barcode.

        Args:
            bc (bytes): The barcode to search for.

        Raises:
            ValueError: `barcode` was not bytes.
            KeyError: `barcode` was not found in the set.

        Returns:
            int: the barcode index.
        )r   	bc_to_int)rX   r   r#   r#   r$   r    s   zCountMatrix.bc_to_intTSequence[bytes]return_sortedrP   c                 C  s"   t |tr	t|}| jj||dS )N)sort)r   setlistr   rx   )rX   r   r  r#   r#   r$   rx     s   
zCountMatrix.bcs_to_intsr   c                 C  s
   | j | S rG   r   )rX   r   r#   r#   r$   	int_to_bc  s   
zCountMatrix.int_to_bcjjIterable[int]rw   c                   s    fdd|D S )Nc                   s   g | ]}  |qS r#   )r  )r   r   r\   r#   r$   r     r   z+CountMatrix.ints_to_bcs.<locals>.<listcomp>r#   )rX   r  r#   r\   r$   r     r   zCountMatrix.ints_to_bcsrp   r?   c                 C  s0   |  || |}}| j||f  |7  < dS )zAdd a count.Nr  r  re   )rX   r  r   valuer   r   r#   r#   r$   add  s   zCountMatrix.addc                 C  s$   |  || |}}| j||f S rG   r  )rX   r  r   r   r   r#   r#   r$   get  s   zCountMatrix.getotherc                 C  s$   | j |j ksJ |  j|j7  _dS )zmMerge this matrix with another CountMatrix.

        Works by addition, dimensions must be the same.
        N)rR   re   )rX   r  r#   r#   r$   merge  s   zCountMatrix.mergec                 C  s"   |    | jjs| j  d S d S rG   )tocscre   Zhas_sorted_indicessort_indicesr\   r#   r#   r$   r    s   zCountMatrix.sort_indicesNr=   r>   c                 C  st   t t|d(}t|jtj< t|jt< t	|| t
|| |t}| | W d   dS 1 s3w   Y  dS )z=Save this matrix to an HDF5 file, optionally with SW version.wN)h5Filer   MATRIX_H5_FILETYPErH   h5_constantsH5_FILETYPE_KEYMATRIX_H5_VERSIONMATRIX_H5_VERSION_KEYrJ   rD   create_groupMATRIXsave_h5_group)rX   filenamer=   rE   r   groupr#   r#   r$   save_h5_file  s   



"zCountMatrix.save_h5_filer)  h5.Groupc              	   C  s|   |    |tj}| j| tj|tj| j	dd tj
 D ]\}}tjt| j||d}|j||tfdtdd q dS )z^Save this matrix to an HDF5 (h5py) group and converts the matrix to csc format if not already.T)compressionrQ   rG   )r   chunksZmaxshaper,  shuffleN)r  r%  r!  H5_FEATURE_REF_ATTRr   Zto_hdf5rA   Zcreate_hdf5_string_datasetH5_BCS_ATTRr   ZH5_MATRIX_ATTRSr@   r   r(   getattrre   Zcreate_datasetHDF5_CHUNK_SIZEHDF5_COMPRESSION)rX   r)  Zfeature_ref_groupattrr8   Zarrr#   r#   r$   r'    s"   zCountMatrix.save_h5_grouptuple[int, int, int]c                 C  s6   | t j dd \}}t| t j }t|t||fS )z)Load the matrix shape from an HDF5 group.N)r!  H5_MATRIX_SHAPE_ATTRrr   H5_MATRIX_DATA_ATTRr   )r)  rowscolsentriesr#   r#   r$   	load_dims$  s   zCountMatrix.load_dimsc                 C  s   t t| S )(Load the matrix shape from an HDF5 file.)rL   load_dims_from_h5_file_handler   )r(  r#   r#   r$   load_dims_from_h5+     zCountMatrix.load_dims_from_h5r   c                 C  sd   |   }d}d}t }|D ]}| | }|t|d 7 }|t|d 7 }||d  q|t||fS )Nr   r   r   r   )r   r  rr   update)r   r   Znum_nonzero_entriesZnum_gene_idsr   r   r   r#   r#   r$   #_load_dims_from_legacy_v1_h5_handle0  s   z/CountMatrix._load_dims_from_legacy_v1_h5_handlec                 C  >   t t| d}t|W  d   S 1 sw   Y  dS )zALoad the matrix shape from a legacy h5py.File (format version 1).rN)r  r  r   rL   rA  r(  r   r#   r#   r$   _load_dims_from_legacy_v1_h5>     $z(CountMatrix._load_dims_from_legacy_v1_h5r(  str | bytesfloatc                 C  s8   t | \}}}|tj |tj  |tj  tj d S )z8Estimate memory usage of anndata object from the matrix.i   @)rL   r>  r!  Z!MEM_BYTES_PER_MATRIX_FEATURE_H5ADZ!MEM_BYTES_PER_MATRIX_BARCODE_H5ADZMEM_BYTES_PER_MATRIX_NNZ_H5ADZMEM_BYTES_CONSTANT_H5AD)r(  Znum_featuresnum_bcsnonzero_entriesr#   r#   r$   !get_anndata_mem_gb_from_matrix_h5D  s   z-CountMatrix.get_anndata_mem_gb_from_matrix_h5num_barcodesrJ  scaleceilc                 C  s:   t |tj }|t | tj 7 }|r|t| S || S )z*Estimate memory usage of loading a matrix.)rH  r!  ZNUM_MATRIX_ENTRIES_PER_MEM_GBZNUM_MATRIX_BARCODES_PER_MEM_GBr   rN  )rL  rJ  rM  rN  Zmatrix_mem_gbr#   r#   r$   get_mem_gb_from_matrix_dimQ  s   z&CountMatrix.get_mem_gb_from_matrix_dimc                 C      t | \}}}t ||||S )z)Estimate memory usage from an HDF5 group.)rL   r;  rO  )r)  rM  rN  _rI  rJ  r#   r#   r$   get_mem_gb_from_group^     z!CountMatrix.get_mem_gb_from_groupc                 C  s   t t| ||S z(Estimate memory usage from an HDF5 file.)rL   %get_mem_gb_from_matrix_h5_file_handler   )r(  rM  rN  r#   r#   r$   get_mem_gb_from_matrix_h5h  s   
z%CountMatrix.get_mem_gb_from_matrix_h5file_handlestr | bytes | BufferedIOBasec                 C  rP  rT  )rL   r=  rO  )rW  rM  rN  rQ  rI  rJ  r#   r#   r$   rU  s  rS  z1CountMatrix.get_mem_gb_from_matrix_h5_file_handlec                 C  h   t | d$}t|}|dkrt|W  d   S t|t W  d   S 1 s-w   Y  dS )r<  rC  rp   N)r  r  rL   _get_format_version_from_handlerA  r;  r&  rW  r   
h5_versionr#   r#   r$   r=  }  s   
$z)CountMatrix.load_dims_from_h5_file_handlec                 C  rY  )z8Load just the barcode sequences from an HDF5 file hadle.rC  rp   N)r  r  rL   rZ  '_load_bcs_from_legacy_v1_h5_file_handleload_bcs_from_h5_groupr&  r[  r#   r#   r$   load_bcs_from_h5_file_handle  s   
$z(CountMatrix.load_bcs_from_h5_file_handlec                 C  s   | t j d d  S rG   )r!  H5_MATRIX_INDPTR_ATTRr)  r#   r#   r$   _load_indptr_from_matrix_group     z*CountMatrix._load_indptr_from_matrix_groupc                 C  sd   t | \}}|tk rtd| t|d}|t }t |W  d    S 1 s+w   Y  d S )NUMatrix HDF5 file format version (%d) is an older version that is no longer supported.rC  )rL   _validate_h5_filer#  
ValueErrorr  r  r&  rb  )r(  fnr/   r   Z	mat_groupr#   r#   r$   load_indptr_from_file  s   $z!CountMatrix.load_indptr_from_filec           	      C  s   t |}| |}|tj dd }|tj dd }|tj dd }t |}t	t
|dks6J tj|||f|d}| |||dS )zLoad from an HDF5 group.Nr   r   r   )rL   load_feature_ref_from_h5_groupr^  r!  r6  r7  H5_MATRIX_INDICES_ATTRrb  r   allr   r   r   )	r   r)  r   r   r!   r   r-   r   r"   r#   r#   r$   load  s   


zCountMatrix.loadstartendc                 C  s   t |}| |}| |\}}}|dk s||ks||kr+td| d| d| |tj ||d  }	|	d }
|	d }t||| g}|tj	 |
| }|tj
 |
| }|	|	d  }	tt|	dkskJ tj|||	f|d}| |||| |dS )	aB  Load from an HDF5 group only a subset of the columns.

        Very similar to CountMatrix.load().

        Args:
            group: The matrix H5 group
            start: the first column to select
            end: the column to select up to.

        Returns:
            A CountMatrix with the relevant columns
        r   z>The column range you've specified is invalid.  Start Column = z, End = z, Total Columns = rp   r   r   )rL   ri  r^  r;  rf  r!  r`  r   r(   r7  rj  rk  r   r   r   )r   r)  rm  rn  r   r   Zn_rowsZn_colsrQ  r   	col_startcol_endr!   r   r-   r"   r#   r#   r$   load_columns_from_file  s"   

z"CountMatrix.load_columns_from_filec                 C  s*   | t j jdurt| t j dd S g S )z1Load just the barcode sequences from an h5 group.N)r!  r0  r!   r  ra  r#   r#   r$   r^    s   z"CountMatrix.load_bcs_from_h5_groupc                 C  sb   t | } t| }|dkrt| S t| d}t|t W  d   S 1 s*w   Y  dS )z3Load just the barcode sequences from an HDF5 group.rp   rC  N)r   rL   get_format_version_from_h5_load_bcs_from_legacy_v1_h5r  r  r^  r&  )r(  r\  r   r#   r#   r$   load_bcs_from_h5  s   

$zCountMatrix.load_bcs_from_h5c                 C  rB  )KLoad just the barcode sequences from a legacy h5py.File (format version 1).rC  N)r  r  r   rL   r]  rD  r#   r#   r$   rt    rF  z'CountMatrix._load_bcs_from_legacy_v1_h5c                 C  s6   |   }t }|D ]}| | }||d  q	t|S )rv  r   )r   r  r@  r  )rW  r   r   r   r)  r#   r#   r$   r]    s   z3CountMatrix._load_bcs_from_legacy_v1_h5_file_handlec                 C  s   t t| dL}tj|jvs|jtj tkrtdt|jv r&|jt }nd}|t	kr2td| |t	k r<td| d|
 vrFtdt|d W  d    S 1 sWw   Y  d S )NrC  *HDF5 file is not a valid matrix HDF5 file.rp   nMatrix HDF5 file format version (%d) is a newer version that is not supported by this version of the software.rd  r"   >Could not find the "matrix" group inside the matrix HDF5 file.)r  r  r   r!  r"  rH   r   rf  r$  r#  r   rL   r^  r(  r   r/   r#   r#   r$   load_bcs_from_h5_file  s.   
$z!CountMatrix.load_bcs_from_h5_fileset[str]c                 C  s   t t| dA}t|}|tk r3t|\}}}|dkr)tjhW  d   S t	 W  d   S t
|t }dd |jD W  d   S 1 sLw   Y  dS )zCReturn a set of all library types defined in the Feature Reference.rC  r   Nc                 S     h | ]}|j qS r#   r   r   r#   r#   r$   	<setcomp>$      z>CountMatrix.load_library_types_from_h5_file.<locals>.<setcomp>)r  r  r   rL   rZ  r#  rA  r   r   r  ri  r&  r   )r(  r   r/   Z
gene_countrQ  r   r#   r#   r$   load_library_types_from_h5_file  s   

$z+CountMatrix.load_library_types_from_h5_filec                 C  s   dd | j jD S )zGet the list of feature types.c                 S  r}  r#   r~  r   r#   r#   r$   r  (  r  z0CountMatrix.get_library_types.<locals>.<setcomp>)r   r   r\   r#   r#   r$   get_library_types&  rc  zCountMatrix.get_library_typesc                 C  s   | t j }t|S )z,Load just the FeatureRef from an h5py.Group.)r!  r/  r   Z	from_hdf5)r)  Zfeature_groupr#   r#   r$   ri  *  s   

z*CountMatrix.load_feature_ref_from_h5_groupc                 C  s\   t t| d}t|}|tk rtdt|t W  d   S 1 s'w   Y  dS )z1Load just the FeatureRef from a matrix HDF5 file.rC  z<Direct Feature Ref reading not supported for older H5 files.N)	r  r  r   rL   rZ  r#  OSErrorri  r&  rz  r#   r#   r$   load_feature_ref_from_h5_file0  s   
$z)CountMatrix.load_feature_ref_from_h5_filec                 C  $   t | jtjur| j | _d S d S rG   )r   re   r   r   tolilr\   r#   r#   r$   r  :     zCountMatrix.tolilc                 C  r  rG   )r   re   r   
coo_matrixtocoor\   r#   r#   r$   r  >  r  zCountMatrix.tocooc                 C  r  rG   )r   re   r   r   r  r\   r#   r#   r$   r  B  s   zCountMatrix.tocscc                 C  s$   t | jtjur| j | _dS dS )zeConvert to a csr matrix if not already.

        Returns:
            None, mutates in place
        N)r   re   r   Z
csr_matrixtocsrr\   r#   r#   r$   r  G  s   zCountMatrix.tocsrr   rf   *tuple[CountMatrix, np.ndarray, np.ndarray]c                 C  s   t | }t| |k}| jt|kr||}t| |k}|j	t|kr/|
|}t|dks;t|dkr>t |||fS )zSelect axes with sums greater than the threshold value.

        Returns:
            (CountMatrix, np.array of int, np.array of int):
                New count matrix, non-zero bc indices, feat indices
        r   )r_   deepcopyr   r   r   rT   rr   ru   get_counts_per_featurerR   r   r1   )rX   rf   new_matZnonzero_bcsZnonzero_featuresr#   r#   r$   select_axes_above_thresholdQ  s   
	


z'CountMatrix.select_axes_above_thresholdr   c                 C  sX   |dkr|   }t||k}| |S |dkr(|  }t||k}| |S td)a  Select axis given a threshold of features per barcode or total counts per feature.

        Args:
            axis (int): 0 for rows (features) and 1 for columns (barcodes).

        Returns:
            (CountMatrix):
                New count matrix
        r   rp   axis out of ranger  r   r   r   r   ru   rf  )rX   r   rf   countsr-   r#   r#   r$   select_axis_above_thresholdi  s   


z'CountMatrix.select_axis_above_thresholdc                 C  s
   |  dS )zSelect axes with nonzero sums.

        Returns:
            (CountMatrix, np.array of int, np.array of int):
                New count matrix, non-zero bc indices, feat indices
        r   )r  r\   r#   r#   r$   select_nonzero_axes  s   
zCountMatrix.select_nonzero_axestuple[CountMatrix, np.ndarray]c                 C  s`   |dkr|   }t|dk}| ||fS |dkr,|  }t|dk}| ||fS td)a  Select axis with nonzero sums.

        Args:
            axis (int): 0 for rows (features) and 1 for columns (barcodes).

        Returns:
            (CountMatrix, np.array of int, np.array of int):
                New count matrix, selected indices
        r   rp   r  r  )rX   r   r  r-   r#   r#   r$   select_nonzero_axis  s   
zCountMatrix.select_nonzero_axisr-   Sequence[int]c                   s<    j dd|f }t j |}t j fdd|D |dS )zASelect a subset of barcodes and return the resulting CountMatrix.Nc                   s   g | ]} j | qS r#   r  r   r\   r#   r$   r     r   z/CountMatrix.select_barcodes.<locals>.<listcomp>r   )re   r:   rL   r   )rX   r-   submatr#   r\   r$   ru     s   zCountMatrix.select_barcodesrv   c                 C  s   |  |d}| |S )NF)rx   ru   ry   r#   r#   r$   rz     r{   z"CountMatrix.select_barcodes_by_seqr|   c                   s   |   fdd| jD S )Nc                   s   g | ]} t |kr|qS r#   )r~   Zget_gem_group_from_barcoder   r   r#   r$   r     s    z<CountMatrix.select_barcodes_by_gem_group.<locals>.<listcomp>)rz   r   r   r#   r   r$   r     s   z(CountMatrix.select_barcodes_by_gem_groupc                 C  s:   | j |}| j|ddf }t| j|}t|| j|dS )zSelect a subset of features and return the resulting matrix.

        We also update FeatureDefs to keep their indices consistent with their new position.
        Nr   )r   r   re   r:   rL   r   )rX   r-   r   r  r#   r#   r$   r     s   zCountMatrix.select_featuresc                 C  r   rG   )r   r   r   r#   r#   r$   r     s   z"CountMatrix.select_features_by_idsgene_indices_to_keepc                 C  s8   t |}| jjD ]}|jtjkr||j q| |S )ax  Removes all features that are GEX and not on this list, keeping all others.

        Used to subset the matrix down in the targeted assay

        Args:
            gene_indices_to_keep: list of indices of the GEX features to keep

        Returns:
            A copy of this matrix subset to the GEX feature indices requested and all other
            feature types.
        )	r  r   r   r   r   r   r   r   r   )rX   r  r-   r   r#   r#   r$   remove_genes_not_on_list  s   
z$CountMatrix.remove_genes_not_on_listr   c                 C  sB   g }| j jD ]}|jtjkr|jd |kr||j q| |S r   )	r   r   r   r   r   r   r   r   r   r   r#   r#   r$   r     s   
z%CountMatrix.select_features_by_genomer   r   c                 C  s2   g }| j jD ]}|j|v r||j q| |S )zISelect the subset of gene-expression features by genome and feature type.r   r   r   r   r   r   r   r#   r#   r$   r        

z$CountMatrix.select_features_by_typesr   c                 C  s2   g }| j jD ]}|j|kr||j q| |S r   r  )rX   r   r-   r   r#   r#   r$   r     r  z#CountMatrix.select_features_by_typetag_typec                 C  s<   g }| j jD ]}|j|kr||jv r||j q| |S )zpSelect the subset of features with a particular feature type (e.g. "Antibody Capture") and tag (e.g. "Hashtag").)r   r   r   r   r   r   r   )rX   r   r  r-   r   r#   r#   r$   select_features_by_type_and_tag  s   
z+CountMatrix.select_features_by_type_and_tagc                 C  r
  )zSReturn a list of feature ids of a particular feature type (e.g. "Gene Expression").)r   get_feature_ids_by_typer   r#   r#   r$   r    r]   z#CountMatrix.get_feature_ids_by_typec                 C  r
  )zeCount how many features in the matrix are of a given type.

        (e.g. "Gene Expression")
        )r   get_count_of_feature_typer   r#   r#   r$   r    s   z%CountMatrix.get_count_of_feature_typedict[str, int]c                   s    fdd   D S )z"Get count of each feature by type.c                   s   i | ]}|  |qS r#   )r  )r   ftr\   r#   r$   r      r   z:CountMatrix.get_count_of_feature_types.<locals>.<dictcomp>)r  r\   r#   r\   r$   get_count_of_feature_types  s   z&CountMatrix.get_count_of_feature_typesr   c                 C  s   | j tjdS )r   r~  )r   r   r   )r   r#   r#   r$   r     r?  z)CountMatrix._get_genomes_from_feature_refc                 C  rZ   r   r   r\   r#   r#   r$   r     r]   zCountMatrix.get_genomesc                 C  sl   t | } t| }|dkrt| S t| d}t|t }t|W  d   S 1 s/w   Y  dS )z;Get a list of the distinct genomes from a matrix HDF5 file.rp   rC  N)	r   rL   rs  _get_genomes_from_legacy_v1_h5r  r  ri  r&  r   )r(  r\  r   r   r#   r#   r$   get_genomes_from_h5  s   

$zCountMatrix.get_genomes_from_h5c                 C  s@   t t| d}t| W  d   S 1 sw   Y  dS )zNGet a list of the distinct genomes from a legacy h5py.File (format version 1).rC  N)r  r  r   r  r   rD  r#   r#   r$   r    s   
$z*CountMatrix._get_genomes_from_legacy_v1_h5"np.ndarray[int, np.dtype[np.int_]]c                 C  s   | j   | j jddS r   )re   Zeliminate_zerosgetnnzr\   r#   r#   r$   get_numfeatures_per_bc  r  z"CountMatrix.get_numfeatures_per_bcc                 C     t | jddS r   r%   re   r\   r#   r#   r$   r   !  r   zCountMatrix.get_counts_per_bc
str | Nonec                   s   d| j jv s
J d r fddt| j jD }nfddt| j jD }|r7t| |dd}|jdd	S tjg | jj	d
S )zSum the count matrix across feature rows with a given genome tag.

        The feature reference
        must contain a 'genome' tag. If feature_type is not null filter on it as well.
        r   z&feature reference missing 'genome' tagc                   s,   g | ]\}}|j d  kr|j kr|qS r   )r   r   r   r   Zfdefr   r   r#   r$   r   .  s
    zACountMatrix.get_counts_per_barcode_for_genome.<locals>.<listcomp>c                   s"   g | ]\}}|j d   kr|qS r  )r   r  r  r#   r$   r   4  s
    N)rM   rO   r   r   rQ   )
r   r   r   r   rK   r   r   r(   re   r8   )rX   r   r   r-   r`   r#   r  r$   !get_counts_per_barcode_for_genome$  s   


z-CountMatrix.get_counts_per_barcode_for_genomec                 C  r  Nrp   r   r  r\   r#   r#   r$   r  >  r   z"CountMatrix.get_counts_per_featurec                 C  s    t t| j}t| jdd| S r  )r   r%   re   )rX   Z	total_umir#   r#   r$   get_frac_counts_per_featureA  s   z'CountMatrix.get_frac_counts_per_featureStuple[np.ndarray[int, np.dtype[np.float64]], np.ndarray[int, np.dtype[np.float64]]]c                 C  sh   t | jtjs	J | jjdd}| j }|d}|jdd|jd  t	|d }t	
|}||fS )zCalculate the mean and variance on the sparse matrix efficiently.

        :return: a tuple with numpy arrays for mean and var
        rp   r   g       @)r   re   r   r   meanr_   Zpowerr   r!   r   r   )rX   Zmean_per_featureZsecond_momentZvar_per_featurer#   r#   r$   get_mean_and_var_per_featureE  s   


z(CountMatrix.get_mean_and_var_per_featureFc                 C  sj   | }|dur|t jv sJ d||}|dur||}|dur&||}| }|r3td| S |S )zGet counts per barcode, sliced various ways.

        - subset by list of feature IDs
        - subset by list of barcodes
        - subset by library_type
        Nzlibrary_type not recognizedg      ?)r   ZRECOGNIZED_FEATURE_TYPESr   r   rz   r   r   log10)rX   Zlist_feature_idsZlist_barcodesZlog_transformlibrary_typeZsubselect_matrixZcounts_featurer#   r#   r$   get_subselected_countsV  s   	


z"CountMatrix.get_subselected_countsc                 C  s   t | jdkddS )Nr   rp   r   r  r\   r#   r#   r$   get_numbcs_per_features  r   z"CountMatrix.get_numbcs_per_featurecutoffr   c                 C  s@   |   }tdt|j|d }t|dd| }t||kd S )Nr   rp   T)reverse)r   r   minsizer  r   nonzero)rX   r  Zreads_per_bcr   r  r#   r#   r$   get_top_bcsv  s   zCountMatrix.get_top_bcsbase_dirsave_features_func-Callable[[FeatureReference, str, bool], None]metadatadict | Nonecompressc              
   C  s  |    |   tj|dd tj|d}tj|d}|r'|d7 }|d7 }| jjdv r2d}d}n| jjd	v r=d
}d}n	td| jj t	| jt
jjsPJ | jj\}	}
d}d}|p]i }t|d< dt|  }t|d\}|d| d| d| d  |dD ]}|d || |d q|d|	|
| jjf  t| jjd | jjd | jjD ]\}}}|||||f  qW d   n1 sw   Y  || j||d t|d}| jD ]	}||d  qW d   dS 1 sw   Y  dS )a^  Save in Matrix Market Exchange format.

        Note:
            This operation modifies the matrix by
            converting to a coordinate representation by calling scipy.sparse.csc_matrix.tocoo().

        Args:
          base_dir (str): Path to directory to write files in.
          save_features_func (func): Func that takes (FeatureReference, base_dir, compress) and writes
                                     a file describing the features.
          metadata (dict): Optional metadata to encode into the comments as JSON.
          compress: Whether to compress output files with gzip.
        T)exist_okz
matrix.mtxzbarcodes.tsvz.gz)Zuint32r   r   Zint64integers	   %i %i %i
)rH  Zdoublereals   %i %i %15g
z&Unsupported data type for the matrix: Z
coordinateZgeneralZformat_versions   metadata_json: wbz%%MatrixMarket matrix  
   
   %rp   N)r  )r  r  osmakedirspathjoinre   r8   rf  r   r   Zcoor  r!   r#  tk_safe_jsonZsafe_jsonifyencodecr_ioZopen_maybe_gzipwritesplitr   r,   rowcolr   r   r   )rX   r  r  r  r  Zout_matrix_fnZout_barcodes_fnfieldfmtr8  r9  ZrepZsymmetrycommentstreamlinerC  cdr   r   r#   r#   r$   save_mex|  sN   "

*
"zCountMatrix.save_mexofilec                 C  s    t | jv r| jt  }|S d}|S Nrp   )r$  rH   )r  r/   r#   r#   r$   rZ    s
   

z+CountMatrix._get_format_version_from_handlebytes | strc                 C  s>   t t| d}t|W  d    S 1 sw   Y  d S )NrC  )r  r  r   rL   rZ  rD  r#   r#   r$   rs    s   $z&CountMatrix.get_format_version_from_h5c                 C  s   t | tjr	| }nt| }t|dF}tj|jvs$t	|jtj t
kr(tdt|}|tkr7td| |tkrEt| vrOtdW d    ||fS W d    ||fS 1 s\w   Y  ||fS )NrC  rw  rx  ry  )r   pathlib	PosixPathr   r  r  r!  r"  rH   r   r   rf  rL   rZ  r#  r&  r   )r(  rg  r   r/   r#   r#   r$   re    s0   


zCountMatrix._validate_h5_filerp  rc   rq  c                 C  s   t | \}}t|dO}|tk r*|dus|durtdt |W  d   S |dus2|durN|du s:|du r>tdt |t ||W  d   S t 	|t W  d   S 1 s_w   Y  dS )aP  Load a matrix H5 file, optionally subsetting down to a particular range of columns if requests.

        Args:
            filename: The name of the H5 file
            col_start: (Optional) The column to select
            col_end: (Optional) End of column select range

        Returns:
            Instance of a CountMatrix

        rC  NzASubsetting columns when loading legacy H5 files is not supported.z<Both or neither argument col_start/col_end must be provided.)
rL   re  r  r  r#  rf  r   rr  r&  rl  )r(  rp  rq  rg  r/   r   r#   r#   r$   load_h5_file  s    $zCountMatrix.load_h5_filec                 C  s   t | \}}}|S rG   rL   r>  )r(  rQ  r   r#   r#   r$   count_cells_from_h5  s   zCountMatrix.count_cells_from_h5c              	   C  sp   t t| d&}z	|dtj}W n ty   d}Y n	w W d    |S W d    |S 1 s1w   Y  |S )NrC  /Unknown)r   	open_filer   get_node_attrr!  H5_CHEMISTRY_DESC_KEYAttributeError)r(  r   	chemistryr#   r#   r$   load_chemistry_from_h5  s   

z"CountMatrix.load_chemistry_from_h5bcs_per_genomedict[Any, Iterable[bytes]]c                 C  s2   t  }| D ]}||O }qtt|}| |S )zReturn CountMatrix containing only the specified barcodes.

        Args:
            bcs_per_genome (dict of str to list): Maps genome to cell-associated barcodes.

        Returns:
            CountMatrix w/ the specified barcodes.
        )r  valuesr  r  rz   )rX   r  r   r   r#   r#   r$   filter_barcodes  s
   


zCountMatrix.filter_barcodesc                 C  s   t j| ddS )Nhdf5zmatrices.hdf5)r  r  r  )	base_pathr#   r#   r$   h5_path&  r   zCountMatrix.h5_pathr  scale_factorint | floatcontrol_feature_nameslist[str] | list[bytes]c                   sh  d}|   v sJ d| dd      fdd|D }|r.  | d }n  | d }t  |}t	  j
jjj} j|ddf t|| tj}	|	jdk |	j|d kB  }t|	jd|d |	_|	  j
jj j|ddf< t |}
d	d
 |
D }t jjvr jt|d |S  jt| |S )a  Normalise the features of a library type by control features in the matrix.

        scale by the scale_factor.

        The features in the library are all
        1. divided by 1 plus the sum of control features in each barcode
        2. multiplied by scale factor
        3. floored to be an integer

        If the control features provided are not in the matrix, the division is by
        the sum of all features in the library provided.
        The features that are normalized have NORMALIZATION_TAG_IN_H5 set to TRUE.
        If this tag was not in the H5, all other features have the tag set to FALSE.
        If it was present in the H5, its values for other features is unchanged.

        Args:
            library_type (str): string showing library type
            scale_factor (Union[int, float]): factor to scale by
            control_feature_names (list[Union[str, bytes]]): feature names that are used as control

        Returns:
            int:  Number of entries clipped due to overflow
        r   z@Library type to normalise not in matrix. Library type passed in z. zLibrary types in matrix c                   s   g | ]	}| j v r|qS r#   )r   r   r\   r#   r$   r   O  r   zJCountMatrix.normalise_library_type_by_control_features.<locals>.<listcomp>rp   Nc                 S  s   i | ]}|d qS )TRUEr#   r   r#   r#   r$   r   l  r  zJCountMatrix.normalise_library_type_by_control_features.<locals>.<dictcomp>FALSE)r  r`   r   r   r   r  r   r  r   Ziinfor"   re   r8   r   dotr   Zdiagsr9   Zfloat64r   r   Zclipr  NORMALIZATION_TAG_IN_H5r   r   Zadd_tagZ
update_tag)rX   r  r  r  Znumber_of_overflow_entriesZcontrol_features_in_matrixZsize_factorZindices_in_libraryZ	max_valueZtmp_normalised_matrixZfeatures_normalized_setZfeatures_label_dictr#   r\   r$   *normalise_library_type_by_control_features*  s@   
"z6CountMatrix.normalise_library_type_by_control_features)r   r   r   r   r"   r6   )r   r   r   r   )r   r<   r   rL   )r  r   r   r   )r   r   r   r   )r  r   r   r   r   )r   r   r   r   )r   r   r   r   )T)r   r  r  rP   r   r   )r   r   r   r   )r  r  r   rw   r}   )r  r   r   r   r   r?   )r  rL   r   r?   r   r   )r=   r>   )r)  r+  r   r?   )r)  r+  r   r5  )r   r5  )r   r<   r   r5  )r(  rG  r   rH  )
rL  r   rJ  r   rM  rH  rN  rP   r   rH  )r)  r+  rM  rH  rN  rP   r   rH  )r(  rG  rM  rH  rN  rP   r   rH  )rW  rX  rM  rH  rN  rP   r   rH  )rW  rX  r   r5  )r   rw   )r)  r+  )r)  r+  r   rL   )r)  r+  rm  r   rn  r   r   rL   )r)  r+  r   rw   )r   r|  )r)  r+  r   r   )r   r   r   )rf   r   r   r  )r   r   rf   r   r   rL   )r   r  )r   r   r   r  )r-   r  r   rL   )rv   r  r   rL   )r|   r   r   rL   )r-   r  r   rL   )r   r   r   rL   )r  r  r   rL   )r   r   r   rL   )r   r   r   rL   )r   r   r   rL   )r   r   r  r   r   rL   )r   r   r   rw   )r   r   r   r   )r   r  )r   r   r   r   )r(  rG  r   r   )r   r  rG   )r   r   r   r  r   r  )r   r  )NNFN)r  r   r   r   )NT)r  r   r  r  r  r  r  rP   )r  r<   r   r   )r(  r  )rp  rc   rq  rc   )r  r  r   rL   )r  r   r  r  r  r  r   r   )ar2   r3   r4   rY   r   r   r`   classmethodDEFAULT_DATA_DTYPEr   staticmethodr   r  r   r  r   r	  r  rx   r  r   r  r  r  r  r*  r'  r;  r>  rA  rE  rK  r!  ZMATRIX_MEM_GB_MULTIPLIERrO  rR  rV  rU  r=  r_  rb  rh  rl  rr  r^  ru  rt  r]  r{  r  r  ri  r  r  r  r  r  r  r  r  r  ru   rz   r   r   r   r  r   r   r   r  r  r  r  r   r   r  r  r  r   r  r  r  r  r  r  r  r  rZ  rs  re  r  r  r  r  r  r  r#   r#   r#   r$   rL   -  s   U		
		
 
			

	

K


rL   h5_filenamesr   CountMatrix | Nonec                 C  sF   d}| D ]}|du rt |}q|t | q|dur!|  |S )z-Merge multiple matrices into a single matrix.N)rL   r  r  r  )r  r"   Zh5_filenamer#   r#   r$   merge_matricesu  s   r  c                   s  dd | D  t dd  D }t fdd D s J dt fdd D s0J dg }| D ]*}t|}|d	d
 |d
d  }t|}	t|	}
t|	}|||
|f q4|j	dd d d}d}|D ]\}}
}|
|k s~||ks~|
|k rt
d|}|
}qm|d d }t|| tjtjf}t|d}|rt|| |rt|| |t }t|tj}t||d  }|D ]}|| }||f q|d	d
 D ]T\}}}t|}|}
t|jj| }|jj|tj |
|< |jj|tj |
|< |jjtj}||
7 }||d	  ||  }|dks!J d||d
 ||d
< ~q||tj d
d
< W d
   d
S 1 sBw   Y  d
S )a  Merge several h5 files into one larger matrix file.

    An efficient method for doing column concatenation of H5 files.  Assumes the input files all have the same
    features/barcodes in their matrix, and that each only contains a non-overlapping subset of the columns.  Strategy
    is to append the column data directly, rather than merging arbitrary non-distinct columns (As is done in
    merge_matrices).

    Args:
        in_h5_filenames: A list of filenames to column distinct sets of a larger matrix
        out_h5_filename: The desired output filename
        extra_attrs: Similar to the argument to save_h5_file
        sw_version: A version of the software to add into the attributes.

    Returns:
        Nothing
    c                 S  s   g | ]}t |qS r#   r  r   r#   r#   r$   r     r   z8create_merged_matrix_from_col_concat.<locals>.<listcomp>c                 s  s    | ]}|d  V  qdS )r0   Nr#   r   r#   r#   r$   r     r   z7create_merged_matrix_from_col_concat.<locals>.<genexpr>c                 3  s$    | ]}|d   d  d  kV  qdS )r   Nr#   r   Z
dimensionsr#   r$   r         
z$Not all row dimensions were the samec                 3  s$    | ]}|d   d d  kV  qdS )rp   r   Nr#   r   r  r#   r$   r     r  z$Not all col dimensions were the samerp   Nro  c                 S  s   | d S r  r#   )r   r#   r#   r$   <lambda>  s    z6create_merged_matrix_from_col_concat.<locals>.<lambda>keyz;H5 files to be concatenated were not unique sets of columnsr   az4Data was not cleanly separated into distinct columns) r   r   rk  rL   rh  r  r  r   r   r  rf  shutilcopyfiler!  r7  rj  r  r  rJ   rD   r&  rb  r9   r   rr   resizer  re   r   r-   r   r`  )Zin_h5_filenamesZout_h5_filenamer=   rE   Z	total_nnzZfn_start_endsrg  r   Z	col_spansZactive_colsrm  rn  Zlast_endZ
last_startrQ  Z
start_fileZ	to_appendoutfiler"   Zind_ptrZ	dset_nameZdsetZ
small_fileZs_startZs_endr  Z
cur_indptrZnum_entries_beforer#   r  r$   $create_merged_matrix_from_col_concat  sr   








$r  c                 C  s   t | |}||tj< |S rG   )make_library_map_countr!  r  )	sample_id
gem_groupsr  Zmatrix_attrsr#   r#   r$   make_matrix_attrs_count  s   

r  c                 C  s   t | } t| }|dkrt|  S i }t| d/}tjD ]!}|j	
|}|dur@t|r<t|dr<| ||< q|||< q|W  d   S 1 sMw   Y  dS )1Get matrix metadata attributes from an HDF5 file.rp   rC  Nitem)r   rL   rs  "_load_matrix_legacy_v1_h5_metadatar  r  r  r!  H5_METADATA_ATTRSrH   r  r   isscalarhasattrr  r(  r\  rH   r   r  valr#   r#   r$   load_matrix_h5_metadata  s   

$r  r(  r   c                 C  s   i }t | d4}|dj}tjD ]!}t||r3t||}t	|r/t|dr/|
 ||< q|||< qW d    |S 1 s?w   Y  |S )NrC  r  r  )r   r  Zget_nodeZ_v_attrsr!  r  r  r1  r   r  r  )r(  rH   r   Z	all_attrsr  r  r#   r#   r$   r    s   



r  c                 C  sz   t | } t| }|dkri S i }t| d}|j D ]\}}|tvr)|||< q|W  d   S 1 s6w   Y  dS )r  rp   rC  N)r   rL   rs  r  r  rH   r@   MATRIX_H5_BUILTIN_ATTRSr  r#   r#   r$   load_matrix_h5_custom_attrs  s   
$r   c                 C  s>   t t|}tjtj| gt| ddtjtj|tdi}|S )Nr   rQ   )	r  r  r!  H5_LIBRARY_ID_MAPPING_KEYr   r(   rr   H5_ORIG_GEM_GROUP_MAPPING_KEYr   )r  r  Zunique_gem_groupslibrary_mapr#   r#   r$   r    s   r  dict[str, np.ndarray]c                 C  sf   g }g }t |  dd dD ]\}\}}|| || qtjtj|ddtjtj|tdi}|S )Nc                 S  s   t | d S )Nr   )r   )pairr#   r#   r$   r	  (  s    z'make_library_map_aggr.<locals>.<lambda>r
  r   rQ   )	r  r@   r   r!  r!  r   r(   r"  r   )Zgem_group_indexlibrary_idsoriginal_gem_groupsrQ  lidogr#  r#   r#   r$   make_library_map_aggr#  s    
r*  	matrix_h5c              	   C  s   t j| dd)}z|dtj}|dtj}W n ty(   Y W d    d S w W d    n1 s3w   Y  i }tt||ddD ]\}\}}||f||< qC|S )NrC  )moder  rp   )rm  )	r   r  r  r!  r!  r"  r  r   r,   )r+  r   r&  r'  r#  Zngr(  r)  r#   r#   r$   get_gem_group_index2  s   r-  c                 C  s   |   dkst| jd tjtB sJ t| tjsJ t| j	d D ]C}d}t| j
| | j
|d  D ]}|| j| | j|  7 }q3|dkrGq"t|}t| j
| | j
|d  D ]}| j|  |  < qYq"dS )af  Perform in-place column L2-normalization of input matrix X.

    >>> import numpy as np
    >>> import scipy.sparse as sp
    >>> from sklearn.preprocessing import normalize
    >>> a = np.arange(12, dtype='float').reshape((3, 4))
    >>> b = sp.csc_matrix(a)
    >>> inplace_csc_column_normalize_l2(b)
    >>> np.all(normalize(a, axis=0) == b)
    True
    r   rp   g        N)r  r   r   r   Zfloat32rH  r   r   ranger!   r   sqrt)Xr   sr   r#   r#   r$   inplace_csc_column_normalize_l2?  s   &
r2  r   )r   r   r   r   )r(   r)   r*   r   r   r+   )r5   r6   r7   r6   r   r6   )r;   r<   r=   r>   r   r?   )r;   r<   rE   rF   r   r?   )r  r   r   r  r   )r(  r   )r   r$  )r+  r   )]
__future__r   r_   Zos.pathr  r  r  collectionsr   collections.abcr   r   r   r   r   r	   ior
   typingr   r   r   Zh5pyr  Znumpyr   Zscipy.sparseZsparser   Zsixr   r   Zcellranger.cr_ior  Zcellranger.h5_constantsr!  Zcellranger.hdf5r  rA   Zcellranger.rna.libraryZrnaZlibraryr   Zcellranger.sparserd   Zcellranger.utilsutilsr~   Ztenkit.safe_jsonZ	safe_jsonr  Zcellranger.fast_utilsr   Zcellranger.feature_refr   r   r   Zcellranger.wrapped_tablesr   r3  r2  r  ZFEATURES_TSV_GZr  r%   Zgenericr&   r.   r   r&  r$  r#  rI   r"  r  r  	Exceptionr1   r:   rD   rJ   rK   rL   r  r  r  r  r  r   r  r*  r-  r2  r#   r#   r#   r$   <module>   s    



	 4        
P
U


