o
    Uݢgy                     @  s  d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	Z
ddlZddlZddlmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% g dZ&dZ'dZ(dZ)dZ*dZ+dZ,dZ-dZ.dZ/dZ0ee,e'egZ1dZ2dd Z3	dedfddZ4d d! Z5dgdhd%d&Z6			didjd/d0Z7dkd7d8Z8dld?d@Z9	"	"	A	"	A	A	A	"	"					"dmdndEdFZ:dodpdGdHZ;dqdKdLZ<	drdsdOdPZ=dhdQdRZ>dhdSdTZ?dhdUdVZ@dde2dfdtd[d\ZAdudadbZBdde2dfdvdcddZCdS )wzXUtils for summarizing and computing statistics on RNA reads and UMIs from molecule_info.    )annotations)Iterable)deepcopy)AnyN)ensure_binary
ensure_str)molecule_counter_extensions)GENOME_FEATURE_TAGFeatureReference)
BARCODE_IDX_COL_NAMECOUNT_COL_NAMEFEATURE_IDX_COL_NAMEGEM_GROUP_COL_NAMELIBRARY_IDX_COL_NAMEMOLECULE_INFO_COLUMNSUMI_COL_NAMEUMI_TYPE_COL_NAMEBarcodeInfoMoleculeCounter)feature_typeidnameindexis_cellz{}_cellsZ	num_readsZnum_umisZdup_fracbarcodenum_barcodesnum_featuresZ
feature_idZfeature_namei -1c                 C  s   t jt jt jt jg}| j|vr0tt| }t| t j	r"|t| j }t
d|  tj| ddS |d|| j }|D ]}t |}|j}t | |k }|rV| |  S q<| S )a  A roll our own version of `pd.to_numeric(...,downcast="unsigned")`.

    Profiling showed
    that the pandas version was incredibly memory and CPU intensive due to some inefficient code
    leading to an expensive call to `np.allclose(new_result, result, rtol=0)`
    z=Downcasting function expected a uint64 or smaller, type was: unsigneddowncastN)npuint8uint16uint32uint64dtypestrtype
isinstancendarrayprintpd
to_numericr   iinfomaxallastype)xlevels	type_nameZto_checkZsmaller_typeiimax_valbelow r7   f/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/pandas_utils.py_downcast_large_uint_vector<   s    

r9   r   r1   ,float | int | tuple | np.ndarray | pd.Seriesr   
str | Nonec                 C  sL   |dkrt | dkr| dS t| S |dv rtj| |dS td| d)a  Wrapper around pandas.to_numeric to avoid bugs and slow performance.

    Args:
        x: value to be converted
        downcast: one of (None, 'unsigned', 'float')

    Returns:
        (depends on input type): input value as numeric type, possibly downcast
    r   r   r!   )Nfloatr   z1to_numeric wrapper not implemented for downcast='')lenr0   r9   r+   r,   NotImplementedError)r1   r   r7   r7   r8   to_numeric_safeU   s   
r@   c                 C  s   t | tr	|  S | S N)r(   bytesdecodevr7   r7   r8   _conv_bytesq   s   
rF   Fdfpd.DataFramec                 C  s  |s| j dd} | jjtkr| jtddd n| jjtkr%| jtddd | jtddd | jD ]T}| | jtkrC| | 	t| |< q0| | jtkrT| | 	t| |< q0t
| | jtjr| | jjjtkrq| | jt| |< q0| | jjjtkr| | jt| |< q0| S )z*Convert any bytes in the DataFrame to str.F)deepr   Taxisinplacecolumns)copyr   r%   rB   renamer   objectrF   rM   applyr(   r+   CategoricalDtypecat
categoriesrename_categories)rG   rL   colr7   r7   r8   sanitize_dataframex   s&   
rW   mcr   filter_library_idxint | list[int] | Nonefilter_feature_idxfilter_barcode_idxreturn#np.ndarray[int, np.dtype[np.bool_]]c                 C  sv   |d u rt j|  td}n	t | t|}|d ur&|t | t|M }|d ur9|t	
| tgdd |D M }|S )Nr%   c                 S  s   g | ]}|fqS r7   r7   .0r1   r7   r7   r8   
<listcomp>   s    z _get_idx_mol.<locals>.<listcomp>)r    onesnrowsboolisin
get_columnr   get_column_lazyr   cr_mceget_indices_for_valuesr   )rX   rY   r[   r\   idx_molr7   r7   r8   _get_idx_mol   s   rl   rk   exclude_cellsre   exclude_noncellswith_cell_call*np.ndarray[int, np.dtype[np.bool_]] | Nonec           
      C  s   |r|rt d|s|s|r{|  j}tjt|td}t|ddtj	f D ].}||ddtj	f |kdf }t
| tg|fg| }	|	t
| tg|| M }	||	O }~	q(|ri||dk  | M  < ||  S |ry||dk  |M  < || S |S dS )z#Get vector of cell calls if needed.z&Can't exclude both cells and non-cellsr_   Nr      )
ValueErrorget_barcode_infopass_filterr    zerossumre   setr   PASS_FILTER_LIBRARY_IDXri   rj   r   r   )
rX   rk   rm   rn   ro   rt   r   library_idxZwhich_barcodesZis_bc_and_libr7   r7   r8   _get_is_cell   s:   

rz   with_umiwith_gem_groupwith_library_idxwith_umi_type
downsamplefloat | Nonec                 C  s@  t | ||	|
}t| ||||}i }|d urUd|  kr dk s#J  J | t|dk }tj||}|d ur?|r?||dk }||dk  |dkM  < ||dk tj}|r[||t	< ~t
t}|se|t= |sj|t= |so|t= |st|t= |D ]#}|tkr|d ur|}~n| || }t|}|tkrt}|||< qv~t|S )Nr   rq   )rl   rz   rg   r   r    randombinomialr0   r#   MOL_INFO_CELL_COLr   r   r   r   r   r   r@   FEATURE_DF_COUNT_COLr+   	DataFrame)rX   rn   rm   r{   r|   r}   ro   r~   rY   r[   r\   r   rk   r   Z
dict_constZnew_read_countsZmol_info_colsr1   valr7   r7   r8   _mol_info_df_from_h5   sD   

r   Twith_barcode_seqwith_feature_info
genome_colc                 C  s   t | tr
d}| }nd}t| d}t||||||||	|
|||}|r6| |t  |t< |t d|t< |rNt| |}|j	ddidd |j
|ddd	}|sT|  |S )
av  Load molecule_info.h5 file as a pandas DataFrame.

    Returns a data frame containing the per-molecule columns from the molecule_info
    file (barcode_idx, count, feature_idx, gem_group, library_idx, umi) along with
    a column 'is_cell'. Numeric columns are automatically downcast to the
    smallest appropriate type.

    Args:
        mol_info (str or object): path to a cellranger molecule_info.h5 file or instance of
            MoleculeCounter object
        exclude_noncells (bool): exclude molecules from non-cell-associated barcodes
        exclude_cells (bool): exclude molecules from cell-associated barcodes
        with_umi (bool): include a column 'umi' containing the 2-bit encoded UMI
        with_barcode_seq (bool): include a column 'barcode' containing the full
            barcode sequences
        with_gem_group (bool): include a column gem_group
        with_library_idx (bool): include `library_idx` column
        with_cell_call (bool): include a column 'is_cell' indicating whether
            the barcode associated with each molecule was called as a cell
        with_feature_info (bool): merge the entire result with the feature reference
            (not memory-efficient, but useful for small analysis tasks)
        with_umi_type (bool): include a column `umi_type` indicating whether the UMI is
            transcriptomic or not [default: False]
        filter_library_idx: library indices (i.e., into
            the library_info dataset) to include, or None to include all of them
        filter_barcode_idx: barcode indices (i.e., into
            the barcodes dataset) to include, or None to include all of them. Note
            that exclude_noncells and exclude_cells will still be applied if True
        filter_feature_idx: feature indices (i.e., into
            the features dataset) to include, or None to include all of them
        downsample: between 0 and 1 exclusive, or None.  The rate at which to
            downsample reads in the molecule info.
        genome_col: If with_feature_info is set, will add a genome column to the dataframe.

    Returns:
        pd.DataFrame: molecule_info data (see above)
    TFrcategoryr   feature_idxrM   rL   lefthowon)r(   r   openr   get_barcodesr   FEATURE_DF_BARCODE_SEQ_COLr0   feature_ref_from_h5rO   mergeclose)Zmol_inforn   rm   r{   r   r|   r}   ro   r   r~   rY   r[   r\   r   r   Z
obj_passedrX   rG   feature_refr7   r7   r8   mol_info_from_h5  s:   
7
r   c                 C  s<  t | tr
|  }n"tt| d}d| v r|d d }n|d }t|}|	  |j
}tt}dd | D }|rZt| D ]\}}	|| |	jtd qD|t tj||d}
~~|durq|
j|
d | }
|
jd	d
d |
jd
d
d |
jD ]}|dvr|
| d|
|< q|
jttdd
d |
S )a?  Load feature reference from h5 file as a pandas DataFrame.

    Args:
        fname_or_mol_info (str or object): path to a cellranger molecule_info.h5 file or instance of
            MoleculeCounter object, or a path to a matrix h5 file
        genome_col (bool): Do we want a column with the "Genome" in it? This is to preserve
        older behaviour used by puppy.panda_utils
        filter_feature_types (list or None): list of feature types to select from feature reference,
            all are kept if None

    Returns:
        pd.DataFrame: the feature reference
    r   matrixfeaturesc                   s   g | ]  fd dt D qS )c                   s   g | ]}t  |qS r7   )getattr)ra   rV   rD   r7   r8   rb   |      z2feature_ref_from_h5.<locals>.<listcomp>.<listcomp>)FEATURE_REF_COLS)ra   r7   rD   r8   rb   |  s    z'feature_ref_from_h5.<locals>.<listcomp>N)rM   r   r   T)rL   )droprL   )r   r   r   )r   r   r   )r(   r   get_feature_refh5Filer   keysr
   Z	from_hdf5r   Zid_mapr   r   values	enumerateappendtagsgetr	   r+   r   locrf   sort_valuesreset_indexrM   r0   rO   FEATURE_ID_COLFEATURE_NAME_COL)Zfname_or_mol_infor   Zfilter_feature_typesr   fgrouprM   valsir   rG   rV   r7   r7   r8   r   b  s>   




r   feature_ref_dfmol_info_dfc                 C  sl   t |}t |t}dd |jD |_tj||gdd}|j}| j|ddtd}|| jdd	d
||< |S )ar  Compute a per-feature summary from a molecule_info DataFrame.

    For each feature, these metrics are computed:
        num_umis - # UMIs associated with the feature
        num_reads - # reads associated with the feature
        num_barcodes - # distinct barcodes containing at least one read/molecule
            mapped to the feature
        dup_frac - proportion of reads corresponding to already observed molecules,
            i.e., sequencing saturation. This is (1 - num_umis/num_reads),
            and 0 in case there are no reads.

    Args:
        feature_ref_df (pandas.DataFrame)
        mol_info_df (pandas.DataFrame)

    Returns:
        pandas.DataFrame: per-feature summary statistics from mol_info_df
                            with the following columns
                            * feature_type category
                            * feature_id   object
                            * feature_name category
                            * index        int64
                            * num_umis     int64
                            * num_reads    int64
                            * num_barcodes int64
                            * dup_frac     float64
                            * num_umis_cells int64
                            * num_reads_cells int64
                            * num_barcodes_cells int64
                            * dup_frac_cells float64
    c                 S  s   g | ]}t |qS r7   )IS_CELL_FORMAT_STRINGformatr`   r7   r7   r8   rb     r   z(summarize_by_feature.<locals>.<listcomp>rq   rK   r   r   )r   left_onright_onr   inferr   )	_feature_summaryqueryr   rM   r+   concatr   r   fillna)r   r   rawfilteredcombinedZcombined_colsresultr7   r7   r8   summarize_by_feature  s    r   feature_idx_subsetsdict[str, list[int]] | Nonec           
      C  s   t | }|rfd|d< dd |jD }|| }| D ]5\}}t| j|}t | | }|j|ddd |j|dddd	}|jd
ddd ||d< t	
||g}q|jd|_dd |jD }	|dg|	  }|S )a  Compute a per-barcode summary from a molecule_info DataFrame.

    For each feature, these metrics are computed::

        num_umis - # UMIs associated with the feature
        num_reads - # reads associated with the feature
        num_features - # distinct features containing at least one read/molecule
            assigned to this barcode
        dup_frac - proportion of reads corresponding to already observed molecules,
            i.e., sequencing saturation. This is (1 - num_umis/num_reads),
            and 0 in case there are no reads.

    All features are currently treated the same (regardless of feature_type or genome).
    The subset_feature_idx argument can be used to produce separate metrics on a
    defined subset of features.
    Any metadata columns will be propagated (barcode_idx, barcode sequence,
    and is_cell, as defined by BARCODE_METADATA)

    Args:
        mol_info_df (pandas.DataFrame): TODO
        feature_idx_subsets (Dict[str, List[int]]): produce
            separate metrics for certain named subsets of features, specified
            by their indices within the feature reference (feature_idx column
            in MoleculeCounter). There will be an additional output column
            called 'feature_subset'; the rows marked 'all_features' contain the
            metrics over all features.

    Returns:
        pandas.DataFrame: per-feature summary statistics from mol_info_df
    Zall_featuresfeature_subsetc                 S     g | ]}|t v r|qS r7   BARCODE_METADATAr`   r7   r7   r8   rb         z(summarize_by_barcode.<locals>.<listcomp>rq   TrJ   r   )r   
left_indexright_indexr   r   )r   rL   r   c                 S  s   g | ]}|d kr|qS )r   r7   r`   r7   r7   r8   rb      r   )_barcode_summaryrM   itemsr    rf   r   r   r   r   r+   r   r   r0   )
r   r   r   Zmetadata_colsmetadataZsubset_namesubsetZ
idx_subsetZsummary_subsetZ	main_colsr7   r7   r8   summarize_by_barcode  s&   !r   c                 C  s$   d| | j dd| | j dd  S )zCalculate the fraction of duplication/sequencing saturation.

    Args:
        df: pandas dataframe
        umi_col_name: column with umi counts
        read_col_name: column with read counts

    Returns:
        A vector of duplicate fractions
    rq   lower)clip)rG   Zumi_col_nameZread_col_namer7   r7   r8   calculate_duplicate_fraction  s   $r   c              
   C  sX   |  t}ttt|t  tt|t  t	t|t
  i}t|tt|t< |S )z%Aggregation for summarize_by_feature.)groupbyr   r+   r   FEATURE_DF_UMI_COLr@   r   countrv   FEATURE_DF_BARCODE_COUNT_COLr   nuniquer   FEATURE_DF_DUP_COL)rG   groupedr   r7   r7   r8   r     s   
	r   c              
     s   |  t dd | jD }t fdd|D }ttt t  tt t 	 t
t t  i}d|t jdd|t jdd  |t< tj||gddS )z%Aggregation for summarize_by_barcode.c                 S  r   r7   r   r`   r7   r7   r8   rb   ,  r   z$_barcode_summary.<locals>.<listcomp>c                   s   i | ]	}| |   qS r7   )firstr`   r   r7   r8   
<dictcomp>-  s    z$_barcode_summary.<locals>.<dictcomp>rq   r   r   )r   r   rM   r+   r   r   r@   r   r   rv   FEATURE_DF_FEATURE_COUNT_COLr   r   r   r   r   )rG   Zmetadata_presentr   r   r7   r   r8   r   &  s   
r   list[int] | Nonebarcode_genometgt_chunk_lenintc                   s    }|j |dur|j|}  ddtjf |k  |du r+dd  D }  }t	j
|t	jdt	j
|t	jdt	j
|t	jdt	j
|t	jdd fd
d}j|ddD ]
\}	}
||	|
| qetttttt dtt dtttt	jd i}dD ]}d|t|  jdd|t|  jdd  |t| < qtdd}|jdtidd |j|dtd}|S )au  Get read and UMI counts per feature by aggregating counts over mol_info.

    in chunks.

    Args:
        mc (MoleculeCounter): instance of MoleculeCounter object
        filter_library_idx (list of ints or None): specifies whether to restrict counts to only certain libraries
        barcode_genome (str | None): Use only barcodes from this genome. If None, use all barcodes.
        tgt_chunk_len (int): number of rows by which to chunk mol_info
        downsample (float): downsample fraction

    Returns:
        pandas.DataFrame containing per-feature summary statistics (UMIs, reads, dup_rate, and feature metadata)
    Nc                 S  s   g | ]}t |d  qS )
library_id)r   )ra   libr7   r7   r8   rb   [  r   z+collapse_feature_counts.<locals>.<listcomp>r_   chunk_startr   	chunk_lenrY   	list[int]c           
        sr  | | }t j|dd}|r|t t| | |M }t j|jdd}|D ] }  d d tjf |kdf }|t t	| | |O }q&|| }t
| | | }t| | | }	~d urdk rt j|	}	||	dk }||	dk }|	|	dk }	t j|t |jd  t j||	 || }|	| }	t j|t |jd  t j||	 d S )Nre   r_   r         ?)r    rc   rf   rh   r   ru   shaper   rx   r   r   r   r   r   addat)
r   r   rY   
chunk_stoprk   r   ry   Zcell_barcode_indicesZfeature_indicesread_countsZbarcode_passing_filterr   rX   read_counts_by_indexZread_counts_in_cells_by_indexumi_counts_by_indexZumi_counts_in_cells_by_indexr7   r8   collapse_chunkc  s<   z/collapse_feature_counts.<locals>.collapse_chunkTZpreserve_boundaries_cellsr   ) r   rq   r   )r   r   r   r   r   )r   r   r   r   rY   r   )rs   rt   genomesr   r   ZPASS_FILTER_GENOME_IDXget_library_infor   Zget_num_featuresr    ru   r$   
get_chunksr+   r   r   r@   r   r   aranger   r   r   r   rO   r   )rX   rY   r   r   r   Zbarcode_infoZ
genome_idxr   r   r   r   r   suffixr   r7   r   r8   collapse_feature_counts>  sF   '	r   library_infoIterable[dict[str, Any]]bcs_passing_filter
np.ndarrayc           	      C  sp   |D ]3}t |d }|d ur|d |vrq|t }||d d df |kdf }t||d | }d| |< q| S )Nr   rq   r   T)r   r   r    r   )	r   r   r   rY   r   r   Zlib_idxggZcell_indicesr7   r7   r8   _barcode_is_cell  s   
r  c                   s0  t tt }t t |}tj	t|tj
d}tj	t|tj
d}d fdd	}	j|d
dD ]\}
}|	|
||| q>ttt|tt|i}~~||t< ~ttj|jd dtd  j|t< ||t dk|t B  }d|t jdd|t jdd  |t< |S )a6  Get read and UMI counts per feature by aggregating counts over mol_info in chunks.

    Args:
        mc (MoleculeCounter): instance of MoleculeCounter object
        filter_library_idx (list of ints or None): specifies whether to restrict counts to only certain libraries
        filter_feature_idx: TODO: document
        tgt_chunk_len (int): number of rows by which to chunk mol_info
        downsample (float): downsample fraction

    Returns:
        pandas.DataFrame containing per-barcode summary statistics (UMIs, reads, dup_rate, and barcode metadata)
    r_   r   r   r   r   $np.ndarray[int, np.dtype[np.uint64]]r   c           
        s4  | | }t j|td}r|t t| | M }r-|t t| | M }t| | | }t| | | }t	| | | } d uro dk rot j
| }||dk }||dk }||dk }~t t |d}	t j|t ||	t |jd  t j|t ||	| d S )Nr_   r   r   rq   )r    rc   re   rf   rh   r   r   r   r   r   r   r   multiplysubtractr   r   r   )
r   r   r   r   r   rk   barcode_indicesZ
gg_indicesr   Zbc_idx_offsetr   r[   rY   rX   r   r7   r8   r     s<   

z/collapse_barcode_counts.<locals>.collapse_chunkTr   r   F)r   
fill_valuer%   rq   r   N)r   r   r   r   r   r  r   r  )sortedlistrw   Zget_gem_groupsr>   r   cr_utilsZformat_barcode_seqsr    ru   r$   r   r+   r   r   r@   r   r   r  fullr   re   r   rs   rt   r   r   r   )rX   rY   r[   r   r   Z	gem_wellsZordered_barcodesr   r   r   r   r   r   r7   r  r8   collapse_barcode_counts  s8   ,	r  )r   )r1   r:   r   r;   )F)rG   rH   )NNN)
rX   r   rY   rZ   r[   rZ   r\   rZ   r]   r^   )rX   r   rk   r^   rm   re   rn   re   ro   re   r]   rp   )rX   r   rn   re   rm   re   r{   re   r|   re   r}   re   ro   re   r~   re   rY   rZ   r[   rZ   r\   rZ   r   r   )FFTFTTTFFNNNNF)rn   re   rm   re   r{   re   r   re   r|   re   r}   re   ro   re   r   re   r~   re   rY   rZ   r[   rZ   r\   rZ   r   r   r   re   r]   rH   )FN)r   re   )r   rH   r   rH   rA   )r   rH   r   r   )
rX   r   rY   r   r   r;   r   r   r   r   )r   r   r   r   rY   r   )rX   r   rY   r   r   r   r   r   )D__doc__
__future__r   collections.abcr   rN   r   typingr   Zh5pyr   numpyr    pandasr+   sixr   r   Zcellranger.utilsutilsr  
cellrangerr   ri   Zcellranger.feature_refr	   r
   Zcellranger.molecule_counterr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Z
CHUNK_SIZEr9   r@   rF   rW   rl   rz   r   r   r   r   r   r   r   r   r   r  r  r7   r7   r7   r8   <module>   s   0

$>Z
:0
:


e