o
    Uݢg                     @  s  d dl mZ d dlZd dlmZmZ d dlmZ d dlmZm	Z	 d dl
Zd dlZd dlmZ d dlmZ d dlm  mZ d dlm  mZ d dlmZ d dlmZ d dl m!Z!m"Z" d dl#m$Z$ erhd d	lm%Z% g d
Z&dd dD Z'g dZ(dZ)dZ*dZ+dZ,dZ-e*e+gZ.e"e!dgZ/dtddZ0dd Z1duddZ2dvd%d&Z3G d'd( d(e	Z4dwd7d8Z5dxd>d?Z6	@	AdydzdEdFZ7G dGdH dHe	Z8d{dKdLZ9d|dQdRZ:d}dedfZ;dgdh Z<didj Z=dkdl Z>dmdn Z?d~ddpdqZ@	dddrdsZAdS )    )annotationsN)
CollectionIterable)copy)TYPE_CHECKING	TypedDict)ensure_binary)OFF_TARGET_SUBSAMPLEON_TARGET_SUBSAMPLE)FilteredBarcodes)MoleculeCounter)d      i    i	      '  i:   N  0u  i@  P  c                 C  s   g | ]}t |qS  )int).0_xr   r   c/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/subsample.py
<listcomp>3   s    r   )g     @g     j@g     j@g    Ag    Ag    .Ag    CAg    SAg    8\Ag    cAg    ׇAg    חAg    eA)r   r   r   r   r   r   r   
   Zraw_rpcZ$conf_mapped_barcoded_filtered_bc_rpcZraw_barcoded_filtered_bc_rpcZ	raw_readsmolecule_infor   return	list[str]c                 C  sB   | j d usJ dd | j  D }|  jD ]}d||< qt|S )Nc                 S  s   i | ]}|d qS )r   r   r   genomer   r   r   
<dictcomp>Y   s    zget_genomes.<locals>.<dictcomp>r   )feature_referenceget_genomesget_barcode_infogenomessorted)r   r&   r!   r   r   r   r$   W   s
   
r$   c                   s(   t |  t fdd| D }|S )a  Get the number of cell-associated (i.e., filtered) barcodes per library.

    Note:
        We assume cell barcodes are assigned per GEM group, not per library.

    Args:
        library_info (dict): library_info metadata, probably from a MoleculeCounter
        filtered_barcodes_csv (str): path to filtered_barcodes.csv file

    Returns:
        np.array of int: number of cell-associated barcodes per library
    c                   s   g | ]
}  |d  dqS )	gem_groupr   )getr   libZnum_cells_per_ggr   r   r   p       z-get_num_cells_per_library.<locals>.<listcomp>)r   cells_per_gem_groupnparray)library_infofiltered_barcodes_csvnum_cells_per_libr   r,   r   get_num_cells_per_library_   s
   r4   r&   Iterable[str]c                   s(    fdd| D }t j|  |d< |S )a!  Get cell-associated barcodes by genome.

    Args:
        genomes (list of str): Genome names.
        filtered_barcodes_csv (str): Path to CSV file.

    Returns:
        dict of (str, set): Map genome to list of cell-assoc barcodes.
            Empty-string key is for all genomes.
    c                   s   i | ]}|t  t|qS r   )cr_utilsget_cell_associated_barcode_setr   r    r2   r   r   r"      s    z0get_cell_associated_barcodes.<locals>.<dictcomp> )setunionvalues)r&   r2   cell_bcsr   r8   r   get_cell_associated_barcodesu   s
   
r>   
max_targetfloatnum_targetsr   "np.ndarray[int, np.dtype[np.int_]]c                 C  s(   t t jd| |d td}||dk S )a  Construct a list of sorted, unique, integer-valued subsampling depths.

    Generally corresponding to target read pairs per cell.

    Args:
        max_target (float): the largest target depth
        num_targets (int): desired number of targets, including max_target. There
            will be fewer than this many targets in case num_targets > max_target.

    Returns:
        numpy array of int: target subsampling depths (sorted, distinct, nonzero)
    r      )startstopnumdtype)r/   uniquelinspacer   )r?   rA   Zdistinct_targetsr   r   r   compute_target_depths   s   rJ   c                   @  s2   e Zd ZU dZded< ded< ded< ded< d	S )
SubsamplingDefz9The dictionary of metadata returned by make_subsamplings.strlibrary_typesubsample_typer   target_read_pairs_per_cellzlist[float]library_subsample_ratesN__name__
__module____qualname____doc____annotations__r   r   r   r   rK      s   
 rK   target_depthrN   rL   rM   r3   %np.ndarray[int, np.dtype[np.float64]]usable_frac_per_libmax_computed_depth
int | Nonelib_indices#np.ndarray[int, np.dtype[np.int32]]raw_reads_per_libusable_reads_per_liblibrary_countc
                 C  s   |t krtj| |jtd}
n|tkr| | }
n| | | }
tj|	td}|t kr+|}n|}|D ]}|| dkrA|
| ||  ||< q/| |krSt|}|dkrS|| }d||dk< ||t| t	|dS )N)
fill_valueshaperG   rG           g      ?)rM   rN   rO   rP   )
BULK_SUBSAMPLE_TYPEr/   fullrb   r@   MAPPED_SUBSAMPLE_TYPEzerosmaxr   list)rW   rN   rM   r3   rY   rZ   r\   r^   r_   r`   Ztarget_usable_reads_per_libZsubsample_ratesdenominatorindexZmax_rater   r   r   _subsampling_for_depth   s2   

rm   
np.ndarrayfixed_depths	list[int]num_additional_depthslist[SubsamplingDef]c              	     s   t jfddtD t jd tt }t }	 tkr8t   }
nt tt	fv rB|n|	  }
t
|
|}t|dkrWt |ndt j||gtd}|  t |} f	dd|D S )a  Create metadata for subsampling jobs of a specified subsampling type.

    (raw or usable reads per cell) and for a specified library type.

    Args:
        subsample_type (str): subsample based on raw, usable, or bulk reads? (raw
            corresponds to raw rpc, usable to confidently mapped transcriptomic rpc,
            and bulk to bulk transcriptomic reads)
        library_info (dict): per-library metadata from MoleculeCounter
        library_type (str): library type to use for subsampling
        num_cells_per_lib (np.array of int): number of filtered barcodes per library
        raw_reads_per_lib (np.array of int): number of raw reads per library
        usable_reads_per_lib (np.array of int): number of usable reads per library
        fixed_depths (list of int): fixed subsampling depths (reads per cell)
            to include by default
        num_additional_depths (int): number of subsampling depths to use,
            in addition to the defaults
    Returns:
        list of dict: list of subsampling metadata, each of which is:
            {'library_type': <str>,
             'subsample_type': <str>,
             'target_read_pairs_per_cell': <int>,
             'library_subsample_rates': <np.array of floats>}
    c                   s    g | ]\}}|d   kr|qS rM   r   )r   ir+   rs   r   r   r          z%make_subsamplings.<locals>.<listcomp>rc   r   Nc                   s*   g | ]}t | t
qS r   )rm   len)r   rW   	r\   r1   rM   rZ   r3   r^   rN   rY   r_   r   r   r   &  s    )r/   r0   	enumerateint32astyper@   re   minRAW_SUBSAMPLE_TYPERAW_CELLS_SUBSAMPLE_TYPErJ   rv   ri   concatenater   sortrH   )rN   r1   rM   r3   r^   r_   ro   rq   Zraw_rppc_per_libZusable_rppc_per_libZmax_target_depthZcomputed_depthsZtarget_depthsr   rw   r   make_subsamplings   s<   "




r   FTis_targetedboolinclude_bulkc                   s  g }t j| d}|j}t||}|st| }nt| }t|	 }	|
 dkr9|W  d   S |dus?J t|D ]}  fdd|D }
 tjko\tdd |
D }|rdtt}ntt}|ro|tg7 }zt| }|tg7 }W n	 ty   Y nw |D ]7}|tkrt| }nt| }|rt}|}n|tkrt}|	}nt}|}|t|| ||||t qqDW d   |S 1 sw   Y  |S )a  Construct subsampling metadata for a range of target read depths,.

    both raw reads per cell and usable reads per cell.

    Args:
        molecule_info_h5 (str): path to molecule_info.h5 file
        filtered_barcodes_csv (str): path to filtered_barcodes.csv file
        is_targeted (bool, optional): when subsampling to usable reads per cell,
            also restrict to on-target reads. Defaults to False.
        include_bulk (bool, optional): include subsampling based on bulk read counts,
            ignoring the number of cells. Does not apply to targeted GEX libraries.
            Defaults to True.

    Returns:
        list of dict: metadata for subsampling job, produced by make_subsamplings
            and consumed by run_subsampling
    rr   Nc                   s   g | ]
}|d   kr|qS rs   r   )r   lrs   r   r   r   `  r-   z.construct_all_subsamplings.<locals>.<listcomp>c                 s  s    | ]}t |V  qd S )N)rna_libraryhas_target_setr*   r   r   r   	<genexpr>a  s    

z-construct_all_subsamplings.<locals>.<genexpr>)cr_mcr   openr1   r4   r/   r0   !get_usable_read_pairs_per_library+get_on_target_usable_read_pairs_per_library)get_transcriptomic_read_pairs_per_librarysumr   sorted_library_typesGENE_EXPRESSION_LIBRARY_TYPEanyr   ALL_SUBSAMPLE_TYPESre   /get_read_pairs_in_filtered_barcodes_per_libraryr}   
ValueErrorget_raw_read_pairs_per_librarySUBSAMPLE_TARGETED_FIXED_DEPTHSSUBSAMPLE_BULK_FIXED_DEPTHSSUBSAMPLE_FIXED_DEPTHSextendr   SUBSAMPLE_NUM_ADDITIONAL_DEPTHS)molecule_info_h5r2   r   r   subsamplingsmcr1   r3   r_   Ztranscriptomic_reads_per_lib	librariesZis_targeted_gexsubsample_types_rN   r^   ro   Z subsampling_usable_reads_per_libr   rs   r   construct_all_subsamplings7  sx   



AAr   c                   @  sJ   e Zd ZU dZded< ded< ded< ded< ded< d	ed
< ded< dS )SubsampleDataDictz+The dictionary returned by run_subsampling.z4np.ndarray[tuple[int, int, int], np.dtype[np.int64]]umis_per_bcfeatures_det_per_bcread_pairs_per_bc/np.ndarray[tuple[int, int], np.dtype[np.int64]]
read_pairsumisz/np.ndarray[tuple[int, int], np.dtype[np.bool_]]lib_type_genome_any_readstotal_features_detNrQ   r   r   r   r   r     s   
 r   mol_gem_groupmol_barcode_idxc                 C  s0   | rt j|j|jdt j|j|jdfS ||fS )z!Compute tallies for each barcode.rc   )r/   rh   rb   rG   )is_bulk_subsamplingr   r   r   r   r   _make_group_keys  s
   r   subsample_infoCollection[SubsamplingDef]feature_indicesCollection[int] | Nonec           %        s&  t j| dx}t|}tt|t|t| }|d| }	|d| }
|d| }|d| }|d| }|d}|durtj	|tj
d	}|  t||}|	dd | }	|
dd | }
|dd | }|dd | }|dd | }t|}d
d t|D  |jdusJ tj fdd|jjD td	}|| }|jdusJ t|j}dd t|D tjfdd|jD tj
d	}tjt|t|ftd	}tt|	|
dk ||
dk D ]\}}|| }d|||f< qt|}t|}| }|jd }t|||ftj
}t|||ftj
}tj|||ftj
d	}tj||ftj
d	} tj||ftj
d	}!tj|||ftj
d	}"t|D ],\}#}$td|$  t|$||||	|||
|||||# ||# ||# |"|# | |# |!|#  qLW d   n	1 sw   Y  |||| |!||"dS )a  Runs a subsampling chunk.

    Args:
        molecule_info_h5: Path to a MoleculeCounter file.
        subsample_info: A subsampling produced by make_subsamplings
        filtered_barcodes_csv: A CSV of filtered (cell) barcodes
        feature_indices: indices of filtered features
        chunk_start: integer chunk start
        chunk_len: integer chunk len

    Returns:
        dict: data dictionary
    r   library_idxcountr(   barcode_idxfeature_idxbarcodesNrc   c                 S     i | ]\}}||qS r   r   )r   rt   gr   r   r   r"         z#run_subsampling.<locals>.<dictcomp>c                 3  s$    | ]} |j tjd  V  qdS )r9   N)tagsr)   rna_feature_refGENOME_FEATURE_TAG)r   f)genome_to_intr   r   r     s
    
z"run_subsampling.<locals>.<genexpr>c                 S  r   r   r   )r   rt   r   r   r   r   r"     r   c                 3  s    | ]	} |d   V  qdS )rM   Nr   r*   )lib_type_to_intr   r   r     s    r   Tzsubsampling task: )r   r   r   r   r   r   r   )r   r   r   r$   slicer   get_column_lazyget_ref_columnr/   r0   int64r   isinr   rx   r#   fromiterfeature_defsr1   r   r   rh   rv   r   r:   zip	num_cellsrb   print_run_subsample_task)%r   r   r2   r   chunk_start	chunk_lenr   r&   chunkmol_library_idxmol_read_pairsr   r   mol_feature_idxr   maskfiltered_barcodesZfeature_int_to_genome_intmol_genome_idx	lib_typesZlib_idx_to_lib_type_idxr   lib_idx
genome_idxlib_type_idxZn_tasksZ	n_genomesZn_cells
n_featuresr   r   r   read_pairs_per_taskumis_per_taskfeatures_det_per_tasktask_idxtaskr   )r   r   r   run_subsampling  s   




nr   r   r   r   $np.ndarray[int, np.dtype[np.bytes_]]r   r   r   r   r   r   r   r   r   r   r   r   #np.ndarray[int, np.dtype[np.int64]]r   c           !   	   C  s  t jd t j| d t jd}| d tk}| d tk}t |dkr%d S || }t |	 r2d S t 	|dk s@t 	|dkrDt
dtj||t j||ft|||	dD ]\\}}\}}}t|| |}t|D ]\}}|
j||d}|s||r|qlt |dk||k@ }t |||k }|rt|||d d f< |||d d f< d||d d f< t j|| |d	||d d f< n6|r|
|} t|||| f< |||| f< t t || ||| f< ||d d f  t j|| |d	7  < ||  |7  < ||  t|7  < qlqWd S )
NrC   rP   rc   rN   r   z.subsampling probabilities cannot be < 0 or > 1)r<   keys)barcoder!   )	minlength)r/   randomseedr0   float64re   r}   count_nonzeroisnanr   r   r6   numpy_groupbybinomialr   format_barcode_seqrx   containsflatnonzeror   rv   bincountindex_of_barcode)!r   r   r   r&   r   r   r   r   r   r   r   r   r   r   r   r   r   Zrates_per_libraryr   Zis_raw_cells_subsamplingZmol_rateggZbc_idxr   r   r   r   Zthis_genome_idxr!   Zis_cell_barcoder   Zthis_genome_read_pairsZcell_idxr   r   r   r   :  s\   


r   c              	   C  sL   t |}|tv sJ |d u rdnd| }| | d| d| d|  | 	S )Nr9   r   )r   get_library_type_metric_prefixSUBSAMPLE_TARGET_MODES)namerM   r!   ss_typess_depthtarget_modeZ	lt_prefixZtarget_suffixr   r   r   make_metric_name  s   
"r   c                 C  s   | dkrt | | | S dS )Nr   rd   )tk_statsrobust_divide)r   r   r   r   r   compute_dup_frac  s   r   c              	   C  s   t | dkri S t| d d}t|}W d   n1 sw   Y  | dd D ]-}t|d}t|}| D ]\}}||  |7  < q;W d   n1 sRw   Y  q*|S )zJoin together a list of metric dicts (each value is a numpy vector).

    :param metrics: list of metrics
    :return: joined dictionary
    r   rbNrC   )rv   r   pickleloaditems)metricsr   datamZ
chunk_datakvr   r   r   join_metrics  s   
r  c                  C  s&  i }t j|d.}t|}|jdusJ t|j}dd t|D }	|jdus+J |j	 }
W d   n1 s:w   Y  t
|}t|D ]H\}}|d }|	| }|d }|d }t|rmtt| d jd	 }nd
g}|D ]}| d ||f s}qr|| }||}t| d |||f }||td|||||< t| d |||f }||td|||||< t| d |||f }||td|||||< t| d |||f }||td|||||< |tkrt| d ||f }|}nt| d |||f }t| d |||f }||td|||||< ||td|||||< t| d ||f | d ||f }||td|||||< |durd|tkrdt| d ||f }t|t|
|td|||||< qrt| d |ddf }t| d |ddf }t||}||td|tj|||< qG|S )a  Calculate subsampling metrics (summary) from a joined data structure from run_subsampling.

    :param data: A merged dictionary of data from run_subsampling
    :param molecule_info_h5: path to a MoleculeInfo file
    :param filtered_barcodes_csv: path to a list of cell barcodes
    :param subsample_info: subsampling info produced by construct_all_subsamplings
    :param target_mode: String of target mode for metrics suffix.
        Must be one of constants.SUBSAMPLE_TARGET_MODES
    :return: dict (JSON) metrics
    r   Nc                 S  r   r   r   )r   idxltr   r   r   r"     r   z1calculate_subsampling_metrics.<locals>.<dictcomp>rM   rN   rO   r   rC   r   r   r   (subsampled_filtered_bcs_mean_read_counts*subsampled_filtered_bcs_median_read_counts%subsampled_filtered_bcs_median_counts#subsampled_filtered_bcs_mean_countsr   r   4subsampled_filtered_bcs_median_unique_genes_detected2subsampled_filtered_bcs_mean_unique_genes_detectedr   r   subsampled_duplication_fracZ'subsampled_frac_targeted_genes_detected)r   r   r   r$   r1   r   r   rx   r#   count_target_feature_indicesr   has_genomesrj   rangerb   sorted_barcode_indicesr/   meanr   medianre   r   r   r
   divider@   r   MULTI_REFS_PREFIX) r  r   r2   r   r   summaryr   r&   r   Zlib_type_mapZnum_target_featuresr   rt   r   lib_typer   r   r   Zgenome_intsr   r!   Z	cell_indsZmean_reads_per_cellZmedian_reads_per_cellZmedian_umis_per_cellZmean_umis_per_cellZmedian_features_per_cellZmean_features_per_cellZdup_fracZnum_target_features_detectedZall_read_pairsZall_umisr   r   r   calculate_subsampling_metrics  s  






"




r  pd.DataFramec                   sL  g }g d}|r| ddg g d}|r| ddg dur)fdd	|D }t||D ]@\}  fd
d	 D }durJfdd	|D } fdd	|D }	fdd	|D }
t|	|
D ]\}}||||g qaq.tj|g dd}|jddddt}||dkj	dd }|j
dkrtj|d}d|j_dur|d< |S )a  Convenience function.

    Args:
        summary (dict): loaded JSON, as produced by calculate_subsampling_metrics
        suffix (str, optional): subsampling metric suffix
        prefix (sre, optional): subsampling metric prefix
        include_mean (bool, optional): Include mean UMIs and genes per barcode. Defaults to True.

    Returns:
        pd.DataFrame: subsampling metrics
    )zMean reads per cellzMedian reads per cellzMedian UMIs per cellzMedian genes per cellzMean UMIs per cellzMean genes per cell)r  r	  r
  r  r  r  Nc                   s   g | ]	}| d   qS )r   r   r   x)suffixr   r   r   |      z+parse_subsample_summary.<locals>.<listcomp>c                   s    g | ]} |v rt |v r|qS r   )rg   r  metricr   r   r   ~  ru   c                   s   g | ]	}|  r|qS r   )
startswithr  )prefixr   r   r     r  c                   s*   g | ]}t | d  ddd qS )r   r      rC   )r   splitrsplitr  r  r   r   r     s   * c                   s   g | ]} | qS r   r   r  )r  r   r   r     s    )targetvalr   )columnsr&  r   r'  )rl   r(  r<   r   rC   )axis)r   r   zTarget mean reads per cellr!   )r   r   r   appendpdZ	DataFramepivotrz   r   r   rb   rl   r   )r  r  r"  Zinclude_meanZmunged_datanamesr   r   r   Z	ss_targetr  r&  r'  dfr   )r   r"  r  r  r   parse_subsample_summaryY  s>   
r/  c                 C  sf   g }t | D ]*\}}|dur|d |vrq|dur |d |vr q|dur+|d |vr+q|| q|S )a!  Parse through subsample info to extract task indices of interest that match filters.

    Args:
        ss_tasks (list): list of subsamplings, as generated by construct_all_subsamplings
        filter_library_type (list): list of library types to filter by (or None)
        filter_subsample_type (list): list of subsample types to filter by (or None)
        filter_target_read_pairs (list): list of subsample depths to filter by (or None)

    Returns:
        task_indices (list): list of indices that correspond to the tasks of interest
    NrM   rN   rO   )rx   r*  )Zss_tasksZfilter_library_typeZfilter_subsample_typeZfilter_target_read_pairsZtask_indicesr   Ztask_ss_infor   r   r   get_selected_task_indices  s   r0  )r   r   r   r   )r&   r5   )r?   r@   rA   r   r   rB   )rW   r   rN   rL   rM   rL   r3   rX   rY   rX   rZ   r[   r\   r]   r^   rX   r_   rX   r`   r   r   rK   )rN   rL   rM   rL   r3   rn   r^   rn   r_   rn   ro   rp   rq   r   r   rr   )FT)r   r   r   r   r   rr   )r   rn   r   rn   )r   r   r   r   r   r   )"r   rK   r   r   r   r   r&   r   r   rB   r   rB   r   rB   r   rB   r   rB   r   rB   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )NNT)r   r  )NNN)r   rp   )B
__future__r   r   collections.abcr   r   r   typingr   r   numpyr/   Zpandasr+  sixr   cellranger.molecule_countermolecule_counterr   Zcellranger.rna.feature_refrnafeature_refr   cellranger.rna.librarylibraryr   cellranger.utilsutilsr6   tenkit.statsstatsr   cellranger.constantsr	   r
   cellranger.fast_utilsr   r   r   r   r   r   r|   rg   r}   re   r   r   r$   r4   r>   rJ   rK   rm   r   r   r   r   r   r   r   r   r  r  r/  r0  r   r   r   r   <module>   sh   




	
4\]

 
U ':