o
    Uݢg<                     @  s  d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dl	Z	d dl
Zd dlZd dlmZmZ d dlm  mZ d dlm  mZ d dlmZ d dlm  mZ d dlm  m   m!Z" ergd dlm#Z# d dl$m%Z%m&Z&m'Z'm(Z( dZ)e*d	g d
Z+d*d+ddZ,	d,d+ddZ-G dd dZ.d+ddZ/d+ddZ0dd Z1dd Z2dd Z3dd Z4d d! Z5d-d$d%Z6d&d' Z7G d(d) d)Z8dS ).    )annotationsN)Sequence)TYPE_CHECKING)ensure_binary
ensure_str)
Projection)AggrCountSamplePropertiesCountSamplePropertiesExtendedCountSamplePropertiesSampleProperties filtered_bcs_transcriptome_unionBarcodeRankPlotSegmentstartendcell_densitylegendFcell_barcodes
set[bytes]c                   sN   || ksJ t  fddt| |D }t|t||   }t| |||dS )z(Helper function to build a plot segment.c                 3  s     | ]}|  v rd V  qdS )   N ).0ir   	sorted_bcr   f/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/webshim/data.py	<genexpr>/   s    z#get_plot_segment.<locals>.<genexpr>r   )sumrangefloatr   )start_index	end_indexr   r   r   	num_cellsdensityr   r   r   get_plot_segment,   s   r$   c                 C  s   |r&t t |d dd |}|| dd | }|d dd | }n|| dd }|d dd }t||| }|| }|| }	~t||	| }
|	|
fS )a  A helper function to generate the data required to generate the barcode rank.

    plot for RNA counter. The barcode rank plot consists of
    multiple plot segments

    If the barcodes are ordered by UMI counts
        - All the barcodes until the first non cell barcodes go into
        the `Cell` segment.
        - All the barcodes beyond the last cell barcode goes into the
        `Background` segment.
        - The remaining barcodes are further divided into multiple plot
        segments
    Input:
        - cell_barcodes: A set of bytes with the cell barcodes
        - barcode_summary: The barcode summary from the h5.
        - key: Specifies the entry to look up in barcode summary
            for getting the UMI counts
        - restrict_barcodes: Optional list of cell barcodes to restrict to
    Output:
        - sorted_counts: UMI counts sorted in descending order
        - plot_segments: List of BarcodeRankPlotSegment
    bc_sequenceN)npnonzeroisincompute_sort_ordercompute_plot_segments)r   barcode_summarykeyrestrict_barcodesZrestrict_indicescounts_per_bcZbarcode_sequences	srt_orderr   sorted_countsplot_segmentsr   r   r   'generate_counter_barcode_rank_plot_data6   s   r2   c                   @  s>   e Zd ZdddZdd Zdd Zd	d
 Zdd Zdd ZdS )
SampleDataprojectionsSequence[Projection]c                 C  s  |r	t |ts	J t|j| _| jd| _t|j|||\| _	| _
t|j| _t|do1|jd u| _| jd| _t |trB|jnd| _t |trM|jnd | _|jr[tt|jnd | _|jrvtj|jdkrvtjt|jdtidnd | _|j rt!"|j nd | _#|j$r| j#d u sJ dd	 t%&|j$D | _#t'|j(| _)t*|j+| _,t-|j.| _/t*|j0| _1t*|j2| _3t*|j4| _5t*|j6| _7d S )
Nr   
target_settargeting_methodFr   barcode)
convertersc                 S  s   h | ]}t |qS r   )r   )r   bcr   r   r   	<setcomp>   s    z&SampleData.__init__.<locals>.<setcomp>)8
isinstancer   load_metrics_summarysummary_pathsummarygetr"   load_analysesanalysis_pathanalysesoriginal_cluster_sizesload_barcode_summarybarcode_summary_pathr+   hasattrr6   has_target_setr7   r
   Zis_visium_hdfilter_probesZvdj_clonotype_summary_pathpdread_csvr   vdj_clonotype_summaryZvdj_barcode_support_pathospathgetsizer   vdj_barcode_supportfiltered_barcodes_pathcr_utilsget_cell_associated_barcode_setr   Zvdj_cell_barcodes_path	vdj_utilsload_cell_barcodes_jsonload_feature_countsfeature_metrics_pathZfeature_metricsload_antibody_dataantibody_histograms_pathantibody_histogramsload_treemap_dataantibody_treemap_pathantibody_treemapantigen_histograms_pathantigen_histogramsZraw_normalized_heatmap_pathZraw_normalized_heatmapZisotype_scatter_pathZisotype_scatterZ gex_fbc_correlation_heatmap_pathZgex_fbc_correlation_heatmap)selfsample_propertiessample_data_pathsplot_preprocess_funcr4   r   r   r   __init__b   sl   

zSampleData.__init__c                   sV   | j d u rd S  fdd| j D }t|dkst|dksJ t|dkr'd S |d S )Nc                   s   g | ]	}t | r|qS r   )r<   )r   aanalysis_typer   r   
<listcomp>   s    z+SampleData.get_analysis.<locals>.<listcomp>r   r   )rC   len)r`   rg   rr   rf   r   get_analysis   s   
zSampleData.get_analysisc                 C  s   | j dusJ t| j | j|S )a!  Generate the data required to generate the barcode rank.

        plot for RNA counter. The barcode rank plot consists of
        multiple plot segments

        If the barcodes are ordered by UMI counts
            - All the barcodes until the first non cell barcodes go into
              the `Cell` segment.
            - All the barcodes beyond the last cell barcode goes into the
              `Background` segment.
            - The remaining barcodes are further divided into multiple plot
              segments
        Input:
            - key: Specifies the entry to look up in barcode summary
                   for getting the UMI counts
        Output:
            - sorted_counts: UMI counts sorted in descending order
            - plot_segments: List of BarcodeRankPlotSegment
        N)r   r2   r+   )r`   r,   r   r   r   counter_barcode_rank_plot_data   s   
z)SampleData.counter_barcode_rank_plot_datac                 C  sb   | j dusJ | jd  }t|| jd | j }| jd  | }|| }~t||| j }||fS )zEGenerate the data required to generate the barcode rank plot for VDJ.Ncountr8   )r   rP   to_numpyr)   r*   )r`   r.   r/   r   r0   r1   r   r   r   vdj_barcode_rank_plot_data   s   z%SampleData.vdj_barcode_rank_plot_datac                 C     | j S N)rH   r`   r   r   r   is_targeted      zSampleData.is_targetedc                 C  s   | j duo	t| j vS )zReturns true if data is antibody only, currently determined by the presence of a metric.

        Returns:
            bool: Whether or not the data is antibody-only.
        N)r?    FILTERED_BCS_TRANSCRIPTOME_UNIONrr   r   r   r   is_antibody_only   s   zSampleData.is_antibody_onlyNr4   r5   )	__name__
__module____qualname__rd   rk   rl   ro   rs   rv   r   r   r   r   r3   a   s    
P
r3   c                   sf   t  t |ks
J tt  dt|D ]
\}}||v |< qttt   fdddd}|S )NFc                   s    |  |  fS rq   r   )xr.   is_cellr   r   <lambda>   s    z$compute_sort_order.<locals>.<lambda>T)r,   reverse)ri   r&   full	enumeratesortedr   )r.   Zbc_sequencesr   r   r:   r/   r   r|   r   r)      s   r)   c           
   
   C  s   t | }t| D ]\}}||vr|} nqd}ttt | D ]}| | |v r+|} nqd||d t | g}g }|td|d ddd |t|d |d ddd t||d |d }	tt |	d D ]}|t|	| |	|d  | |d	d
 qf|S )Nr   r   g      ?Tr                 F)r   )ri   r   reversedr   appendr   segment_log_plot_by_lengthr$   )
r   r0   r   Zfirst_non_cellr   r:   Z	last_cellrangesr1   Zmixed_segmentsr   r   r   r*      s8   r*   c                 C  s   ||krg S d}d}t t| }t t| }d}|g}t||D ]I}	|	dkr)q"t||	d }
t |	t |
 | }t | |	 t | |
  | }|t j||g7 }||krk|	|d | krk||	d  d}q"|d |krw|| |S )zGiven the extends of the mixed region [x_start, x_end), compute.

    the x-indices that would divide the plot into segments of a
    prescribed length (in pixel coordinates) with a minimum number
    of barcodes in each segment
    g{Gz?   r   r   r   )r&   logri   maxr   linalgnormr   )Zy_dataZx_startZx_endZSEGMENT_NORMALIZED_MAX_LENZ
MIN_X_SPANZ	log_max_xZ	log_max_yZthis_segment_lenZsegment_idxr   Zlast_idxdyr   r   r   r     s,    
r   c                 C  B   | d u rd S t | }t|W  d    S 1 sw   Y  d S rq   openjsonload)r>   fr   r   r   r=   C  
   
$r=   c                 C  s   | d u rd S t | S rq   )SummaryFeatureCounts	from_file)rW   r   r   r   rV   J  s   
rV   c                 C  r   rq   r   )rY   r   r   r   r   rX   P  r   rX   c                 C  r   rq   r   )r\   r   r   r   r   r[   W  r   r[   r4   r5   c                 C  s   | du rdS t |tr,t|jdkrtjj| d|dg}ntjj| d|dtj| g}|d du r4dS d}|d }|duret |tjre|	 }t |toP|j
 }|jjtjkre|s_t |tre|tj |||fS )z2Returns (analysis_object, original_cluster_sizes).N)NNr   pca)methodr4   r   )r<   r	   ri   genomescr_sg_analysisSingleGenomeAnalysisload_default_formatcr_mg_analysisMultiGenomeAnalysisget_cluster_sizesZ
is_spatialmatrixbcs_dimgex_constantsMAX_WEBSHIM_BCS_DIMr   subsample_bcs)base_dirrc   ra   r4   rC   rD   analysisZis_single_cellr   r   r   rA   ^  s:   


rA   c                 C  s   | d u rd S t | dS )Nrj   )h5pyFile)h5_pathr   r   r   rE     s   rE   c                   @  sD   e Zd ZdZddddZeddd	Zd
d ZdddZdd Z	dS )r   zJStores and queries a dataframe that summarizes relevant features per gene.Ndfpd.DataFramec                 C  s$   || _ |djdd| _|| _dS )a  Stores and queries a dataframe that summarizes relevant features per gene.

        There should be one row per gene and any number of columns.

        Args:
            df (pandas dataframe): Dataframe with n_genes rows and columns with features per genes.
        
feature_idindex)orientN)r   	set_indexto_dictfeature_dictfilepath)r`   r   r   r   r   r   rd     s   
zSummaryFeatureCounts.__init__returnc                 C  s,   t |}tj|sJ | tt ||S )z#Loads the features from a csv file.)r   rM   rN   isfilerJ   rK   )clsrN   r   r   r   r     s   zSummaryFeatureCounts.from_filec                 C  s   t |}| jj|dd d S )NF)r   )r   r   to_csv)r`   rN   r   r   r   r     s   zSummaryFeatureCounts.to_csvc                 C  rp   rq   )r   rr   r   r   r   get_df  rt   zSummaryFeatureCounts.get_dfc                 C  s.   || j vrdS || j | vrdS | j | | S )z(Get the given key for the given feature.N)r   )r`   r   r,   r   r   r   get_value_for_feature  s
   
z*SummaryFeatureCounts.get_value_for_featurerq   )r   r   )r   r   )r   r   )
rx   ry   rz   __doc__rd   classmethodr   r   r   r   r   r   r   r   r     s    
r   )F)r   r   rq   rw   )9
__future__r   collectionsr   rM   collections.abcr   typingr   r   numpyr&   pandasrJ   sixr   r   cellranger.analysis.multigenomer   multigenomer    cellranger.analysis.singlegenomesinglegenomer   cellranger.utilsutilsrR   cellranger.vdj.utilsvdjrT    cellranger.webshim.constants.gexwebshim	constantsgexr   r   'cellranger.websummary.sample_propertiesr   r	   r
   r   ru   
namedtupler   r$   r2   r3   r)   r*   r   r=   rV   rX   r[   rA   rE   r   r   r   r   r   <module>   sJ   + 

%$
/