o
    Uݢg<                  	   @  s  d dl mZ d dlZd dlZd dlZd dlmZmZ d dlm	Z	m
Z
 d dlmZmZmZ d dlZd dlZd dlmZ d dlm  mZ d dlmZ d dlmZ d dlm  mZ  d dl!mZ" d dl#m$Z$ d dl%m&Z& d d	lm'Z' d d
l(m)Z) d dl*m+Z+ edZ,dZ-dZ.dddddej/e.gZ0dZ1dZ2dZ3dZ4dZ5dZ6dZ7dZ8dZ9dZ:dZ;dZ<d Z=d!Z>e?d"Z@e=e@iZAee;ejBfe6ejCfe7ej?fe8ejBfe<ej?fe:ej?fe=ej?fgZDe5e9gZEd!ZFeFd#gZGde6gZHd$ZId%ZJd&ZKd'ZLd(ZMd)ZNdZOd*ZPd+ZQd,ZRd-ZSd.ZTd/ZUd0ZVd1ZWd2ZXd3ZYd4ZZd5Z[d6Z\d7Z]d8Z^d9Z_d:Z`G d;d< d<eZad=d>d?ZbdidBdCZcdjdEdFZddkdldJdKZeG dLdM dMejfZgdmdPdQZhdRdS ZidndVdWZjdodZd[Zkdpd\d]Zldqdrd`daZmdbdc Znddde ZoG dfdA dAZpG dgdh dheqZrdS )s    )annotationsN)OrderedDictdefaultdict)Callable	Generator)Any
NamedTupleTypeVar)ensure_binary)concatenate_molecule_infos)FeatureReferenceLIBRARY_TYPE)TARGETING_METHOD_TL)tables_T1Zmoleculefile_versionTITLECLASSVERSIONFILTERSPYTABLES_FORMAT_VERSION   metrics_jsonmetricsbarcode_infoZprobesbarcode_idxZfeature_idxlibrary_idxZ	probe_idxcount	gem_groupZumiZumi_typebarcodes   library_info
gem_groups	librariesis_aggregatedtargeting_methodZmolecule_info_typerawanalysis_parametersZraw_read_pairsZ#raw_read_pairs_in_filtered_barcodesZdownsampled_readsZusable_read_pairsZfeature_read_pairsZdownsampled_feature_read_pairsZon_target_usable_read_pairsrecovered_cellsforce_cellsZinclude_intronsFfilter_probesTZno_probe_filtergzipi   c                   @  s*   e Zd ZU dZdZdZded< ded< dS )	BarcodeInfor   r!      
np.ndarraypass_filter	list[str]genomesN)__name__
__module____qualname__PASS_FILTER_BARCODE_IDXPASS_FILTER_LIBRARY_IDXPASS_FILTER_GENOME_IDX__annotations__ r:   r:   j/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/molecule_counter.pyr-      s   
 r-   uint64strr0   r2   mcMoleculeCounterc                 C  s   t |tjr|jjtju r| jjt|d dS t	|dkr0tj
g ddd}| jjt|td dS tdd |D }td	| }| jjttj||t	|d
td dS )zWrites the barcodes to the molecule info.

    Args:
        mc: MoleculeCounter instance
        barcodes: a numpy array of strings or iterable of strings
    )datar   SF)dtypecopyrA   compressionc                 s      | ]}t |V  qd S Nlen.0xr:   r:   r;   	<genexpr>       z"_write_barcodes.<locals>.<genexpr>S%dr   N)
isinstancenpndarrayrC   typebytes_h5create_datasetBARCODE_DS_NAMErJ   arrayHDF5_COMPRESSIONmaxfromiter)r?   r    Zbc_arraymax_barcode_lenbarcode_dtyper:   r:   r;   _write_barcodes   s    

r`   namec                 C  s,   t | }| jj|dd|ttfd| j|< d S )N)r   rH   )maxshaperC   rF   chunks)MOLECULE_INFO_COLUMNSrW   rX   r[   HDF5_CHUNK_SIZEcolumns)r?   ra   col_typer:   r:   r;   rX      s   rX   逄 returnr/   c                 C  s\   |   }|jddtjf }| j|ddD ]\}}t|| t|||  }qt	|S )zAGet barcode indices which have nonzero counts or pass the filter.NFpreserve_boundaries)
get_barcode_infor0   r-   r6   
get_chunksrS   union1dget_column_lazyBARCODE_IDX_COL_NAMErZ   )r?   Ztgt_chunk_lenr   Zunique_bc_idxchunk_start	chunk_lenr:   r:   r;   get_barcode_index_to_retain   s   
rs   c                   @  s   e Zd ZdZdd ZdS )NumpyEncoderz$Custom encoder for numpy data types.c                 C  s   t |tjtjB tjB tjB tjB tjB tjB tj	B tj
B tjB tjB r(t|S t |tjtjB tjB r8t|S t |tjtjB rH|j|jdS t |tjrR| S t |tjr\t|S t |tjrddS tj| |S )zEncode the object.

        Args:
            o: The object to encode.

        Returns:
            A transformed version of the object to encode.
        )realimagN)rR   rS   int_intcintpint8int16int32int64uint8uint16uint32r<   intfloat16float32float64float	complex64
complex128ru   rv   rT   tolistbool_boolvoidjsonJSONEncoderdefault)selfor:   r:   r;   r      sH   		
zNumpyEncoder.defaultN)r3   r4   r5   __doc__r   r:   r:   r:   r;   rt      s    rt   versionr   c                 C  s   || _ t| jt| dS )zqSet the file version.

    Args:
        mc: an moleculecounter instance
        version: The version to set
    N)r   cr_h5set_hdf5_attrrW   FILE_VERSION_KEY)r?   r   r:   r:   r;   set_file_version  s   r   c                   s.   t | dd}|j  fdd  D S )Nrz/metricsc                   s   i | ]}| | qS r:   r:   )rL   kattrsetr:   r;   
<dictcomp>      z"get_v2_metrics.<locals>.<dictcomp>)r   	open_fileget_node_v_attrs_f_list)Zh5_filegroupr:   r   r;   get_v2_metrics  s   r   v2_mc_in	h5py.Filec              	   C  sf   ddd}t j| t tt d}g }||}|D ]\}}}}|tt|dt|ttj	i q||fS )	a  Method to generate library_info information for molecule info files with versions prior to this field.

     being introduced.

    Args:
        hdf_file: an open V2 or earlier molecule info file

    Returns:
        A tuple containing a list of library infos, as well as a corresponding list of chunks, one for each gem group
        with a tuple defining it as (gem_group, chunk_start, chunk_length, lib_idx)
    gem_group_arrnp.arrayc           	        s   t t  dksJ  jd }t  } fdd}t|||}g }tt||D ]\}\}}|	||d |d |f q,|S )z'Return exactly one chunk per gem group.r   c                   s    |  S rH   r:   )ir   r:   r;   gg_key*  s   zOget_v2_library_info_and_chunks.<locals>.get_chunks_by_gem_group.<locals>.gg_keyr!   )
rS   alldiffshapeuniquer@    get_chunks_from_partition_static	enumeratezipappend)	r   num_rowsZ
unique_ggsr   
chunk_iterrc   lib_idxggchunkr:   r   r;   get_chunks_by_gem_group#  s   

z?get_v2_library_info_and_chunks.<locals>.get_chunks_by_gem_grouprC   
library_idN)r   r   )
rS   asarrayGEM_GROUP_COL_NAMErd   r   r   r=   r   rna_libraryGENE_EXPRESSION_LIBRARY_TYPE)r   r   Zv2_gem_groupsr"   rc   r   _r   r:   r:   r;   get_v2_library_info_and_chunks  s    
r   h5flist[dict[str, Any]]c                 C  s   t | d d S )z}Get the library info.

    Args:
        mc: MoleculeCounter instance

    Returns:
        Dictionary of molecule infos
    r"   r   )r   loads)r   r:   r:   r;   _get_library_infoF  s   	r   c                 C  s(   t | \}}|dk rt|d S t|S )av  Takes a molecule info filename and loads the library info  This method.

    allows one to either read the `library_info` from new files, or generate one on the fly from
    older molecule info files without running into version compatibility errors
    Args:
        mol_info_fname: file name of a molecule info file

    Returns:
        Dictionaries of library infos
       r   )get_h5py_file_and_versionr   r   )mol_info_fnamer?   r   r:   r:   r;   get_library_infoR  s   r   r   tuple[h5py.File, int]c              
   C  s   z	t j| |d}W n ty } z	td|  d|d}~ww t|s+td|  dt|jv r;t|jt }||fS d}||fS )zOpens a molecule info h5py.File.

    Args:
        mol_info_fname: Path to a molecule info file to open

    Returns:
        A tuple with objects (h5py.File, file_version_number)
    modezThe molecule info HDF5 file (z/) is invalid. Please provide a valid HDF5 file.Nz#The input molecule info HDF5 file (zH) does not appear to be properly formatted. Please provide a valid file.r!   )h5pyFileOSErroris_valid_mol_info_h5
ValueErrorr   attrsr   )r   r   Zmc_h5exr   r:   r:   r;   r   d  s&   	


r   c                 C  s   t dd | jD S )ad  Checks whether a file is a valid molecule info file.

    The molecule info HDF5 file is deemed valid if and only if all of its attribute keys exist in VALID_MOL_INFO_H5_ATTRIBUTE_KEYS

    Args:
        mol_info: Open h5py.File for the molecule info HDF5 file to be validated

    Returns:
        A boolean with True for valid files, False otherwise
    c                 s  s    | ]}|t v V  qd S rH   ) VALID_MOL_INFO_H5_ATTRIBUTE_KEYS)rL   ar:   r:   r;   rN     rO   z'is_valid_mol_info_h5.<locals>.<genexpr>)r   r   )Zmol_infor:   r:   r;   r   ~  s   r   c                 C  sl   d}|| v r
| | S t | d tr| d d S | d d }d}|d |kr4|d }t |tr2|d S |S d S )NZchemistry_barcode_whitelistchemistry_barcode	whitelistr   Zgel_beadkindra   )rR   dict)r   Zold_wl_metric_keybarcode_defZGB_BARCODE_KINDr   r:   r:   r;   barcode_whitelist_from_metrics  s   
r   c                   @  s  e Zd ZdZdd Zdd Zdd Zdd
dZdddZdddZ	dddZ
dddZedddZedddZeddd#d$Zedd*d+Ze	,	,ddd0d1Zedd5d6Zedd7d8Zdd9d:Ze	,	,	,	,ddd?d@ZddAdBZddCdDZddGdHZdIdJ ZdddNdOZdPdQ ZddSdTZddWdXZddZd[Zdd^d_Z ddadbZ!ddcddZ"ddedfZ#ddgdhZ$ddjdkZ%ddldmZ&dndo Z'	pdddsdtZ(ddvdwZ)ddydzZ*dd{d|Z+dd~dZ,dd Z-dd Z.dd Z/dd Z0edddZ1eddddZ2dddZ3dddZ4edddZ5dddZ6edddZ7	ddddZ8edddZ9edddZ:edddZ;edddZ<ddddZ=	,ddddZ>	,ddddZ?dddZ@dddZAdddĄZBdddƄZCedddɄZDedd˄ ZEed dd̈́ZFedddτZGd,S (  r@   z@Streams a list of tuples w/named elements to or from an h5 file.c                 C  s,   d | _ d | _t | _t | _d | _d | _d S rH   )r   rW   r   rf   ref_columnsr"   feature_referencer   r:   r:   r;   __init__  s   
zMoleculeCounter.__init__c                 C  s   t |  S rH   )r   get_all_metricsr   r:   r:   r;   get_gb_barcode_whitelist  s   z(MoleculeCounter.get_gb_barcode_whitelistc                 C  sH   |   }|dd }|r"t|tr"|d d }t|tr"|dd S d S )Nr   r   r   Zslide)r   getrR   listr   )r   r   r   r   r:   r:   r;   get_visium_hd_slide_name  s   
z(MoleculeCounter.get_visium_hd_slide_nameri   	list[int]c                 C  s   dd |  t D S )Nc                 S  s   g | ]}t |qS r:   )r   rK   r:   r:   r;   
<listcomp>      z2MoleculeCounter.get_gem_groups.<locals>.<listcomp>)
get_metricGEM_GROUPS_METRICkeysr   r:   r:   r;   get_gem_groups     zMoleculeCounter.get_gem_groupsr1   c                 C  s   | j jtjdS )N)feature_type)r   get_genomesr   DEFAULT_LIBRARY_TYPEr   r:   r:   r;   r        zMoleculeCounter.get_genomesr=   c                 C  s   |   ttS rH   )r   r   MOLECULE_INFO_TYPE_METRICMOLECULE_INFO_TYPE_COUNTr   r:   r:   r;   get_molecule_info_type  r   z&MoleculeCounter.get_molecule_info_typer   c                 C  s   |  tjd S Nr   )get_ref_column_lazyBARCODESr   r   r:   r:   r;   get_barcode_list_size  r   z%MoleculeCounter.get_barcode_list_sizer   c                 C  s   |  t}|d ur|S dS )NF)r   IS_AGGREGATED_METRIC)r   retr:   r:   r;   r%     s   
zMoleculeCounter.is_aggregatednp.dtypec                 C  s   t t|  S rH   )rS   rC   rd   )r   r:   r:   r;   get_column_dtype     z MoleculeCounter.get_column_dtypec                   C  s   t dd t D S )Nc                 s  s    | ]	}t |jV  qd S rH   )rS   rC   itemsizerK   r:   r:   r;   rN         z3MoleculeCounter.get_record_bytes.<locals>.<genexpr>)sumrd   valuesr:   r:   r:   r;   get_record_bytes  s   z MoleculeCounter.get_record_bytes      ?Trr   scaler   capc                 C  s6   t dt  }t||  | }|rttj|S |S )zOEstimate memory usage in GB (not GiB) of this object given a number of records.g    eA)r   r@   r   mathceilr\   h5_constants
MIN_MEM_GB)rr   r  r  Zmol_entries_per_gbmem_gbr:   r:   r;   estimate_mem_gb  s
   zMoleculeCounter.estimate_mem_gbfiltered_barcodes_by_genomedict[bytes, list[bytes]]r    list[bytes]r-   c                 C  sH  dd t t|  D }tt}t |D ]\}}||d  | qdd t |D }g }|  D ])\}	}
||	 }|
D ]}t|\}}|| }|| }|D ]
}||||f qOq<q2t	|dkrlt
j|td d}n	t
jdtd d}|jd t	|ksJ |jd	 d
ksJ |t
|dddf ddf }t|t|  dS )aw  Generate numpy arrays for per-barcode info.

        Args:
          filtered_barcodes_by_genome (dict[str,list[str]]): Keys are genomes, values are lists of filtered barcode strings.
          library_info (list[dict]): Per-library metadata.
          barcodes (list[bytes]): All barcode sequences (e.g. ['ACGT', ...]

        Returns:
          BarcodeInfo: object
        c                 S     i | ]\}}||qS r:   r:   )rL   r   gr:   r:   r;   r     r   z6MoleculeCounter.build_barcode_info.<locals>.<dictcomp>r   c                 S  r  r:   r:   )rL   r   bcr:   r:   r;   r     r   r   r0   r   )r   r   r!   r   N)r2   )r   sortedr   r   r   r   itemscr_utilssplit_barcode_seqrJ   rS   rZ   BARCODE_INFO_DTYPESzerosr   argsortr-   )r	  r"   r    Zgenome_to_idxZlibraries_for_gem_groupr   libZbc_seq_to_idxZ	pf_tuplesgenomebcs
genome_idxZbc_strseqr   r   library_indsr   r0   r:   r:   r;   build_barcode_info  s4   "
z"MoleculeCounter.build_barcode_infoNr   r  
int | Nonec                   s  | j }|ddtjf }|ddtjf }|ddtjf }tj|jd td}	|dur2|	||kM }	durPtj	fddt
|D td d}
|	t||
M }	t|	}tj	dd |D dd}|||  }tjt||| fdd	 fd
dtjd D S )a  Get a list of filtered barcode strings e.g. ['ACGT-1',...].

        Args:
          barcode_info (BarcodeInfo): Barcode info object.
          library_info (list of dict): Library info.
          barcodes (np.array): Barcode sequences.
          genome_idx (int): Restrict passing definition to this genome. None for no restriction.
          library_type (str): Restrict passing definition to this library type. None for no restriction.

        Returns:
          list of str
        Nr   r   c                   s    g | ]\}}|t   kr|qS r:   r   )rL   r   r  )library_typer:   r;   r   2       z9MoleculeCounter.get_filtered_barcodes.<locals>.<listcomp>r   c                 S  s   g | ]}|d  qS )r   r:   )rL   r  r:   r:   r;   r   8  r   r<   axisc                   s,   g | ]}t  |d f  |df qS )r!   r   )r  format_barcode_seq)rL   r   )r    gg_bcsr:   r;   r   @  s    )r0   r-   r6   r7   r8   rS   onesr   r   rZ   r   rd   isinflatnonzeror   column_stackrange)r   r"   r    r  r  r0   Zpf_barcode_idxZpf_library_idxZpf_genome_idxmaskr  indsZ	lib_to_ggZpf_gem_groupr:   )r    r#  r  r;   get_filtered_barcodes  s(   
z%MoleculeCounter.get_filtered_barcodesbc_infor   
h5py.Groupc                 C  s<   |j d| jd| jjd ftdd tj|d| jtdd dS )zSave barcode info to HDF5.

        Args:
          barcode_info (BarcodeInfo): Data.
          group (h5py.Group): Output group.
        r0   Nr!   T)rA   rb   rF   shuffler2   )rF   r.  )rX   r0   r   r[   r   create_hdf5_string_datasetr2   )r,  r   r:   r:   r;   save_barcode_infoE  s   
z!MoleculeCounter.save_barcode_infoc                 C  s"   t | d dd t| d dS )zLoad barcode info from an HDF5 group.

        Args:
          group (h5py.Group): Input group.

        Returns:
          BarcodeInfo: object
        r0   Nr2   r>   )r-   r   read_hdf5_string_dataset)r   r:   r:   r;   load_barcode_infoX  s   
z!MoleculeCounter.load_barcode_infoc                 C  s   t | jt S rH   )r@   r2  rW   BARCODE_INFO_GROUP_NAMEr   r:   r:   r;   rl   g  r   z MoleculeCounter.get_barcode_infor   feature_refFeatureReference | NoneBarcodeInfo | Nonec                 C  s  |dv sJ |  }|dkr||du rt d|du rt d|du r%t d|du r-t dt|d|_t|t t|jtj	t
 |jtj}|| t|| tj|dd	d
}	t|jd|	g |jt}
t||
 tD ]}t|| qr|S t||d\|_|_t|jtsJ |jtkrt d| d|j d|jdk rt d| d|j d|j D ]B}|tv r|j| |j|< q|tv r|j| |j|< q|tjkrt |j| |_!q|t"ks|tks|t#ks|t$v rqt%d| t&t'|jd d |_(|S )a
  Open a molecule info object.

        Args:
          filename (str): Filename to open or create
          mode (str): 'r' for reading, 'w' for writing.
          feature_ref (FeatureReference): Required when mode is 'w'.
          barcodes (list of str): All possible barcode sequences. Required when mode is 'w'.
          library_info (list of dict): Library metadata. Required when mode is 'w'.
          barcode_info (BarcodeInfo): Per-barcode metadata.

        Returns:
          MoleculeInfo: A new object
        )r   zr+wr7  NzSFeature reference must be specified when opening a molecule info object for writingzJBarcodes must be specified when opening a molecule info object for writingzNLibrary info must be specified when opening a molecule info object for writingzNBarcode info must be specified when opening a molecule info object for writing   T)indent	sort_keysr"   r   z#The molecule info HDF5 file (file: z, format version zO) was produced by a newer software version. Reading these files is unsupported.r   zP) was produced by an older software version. Reading these files is unsupported.zUnrecognized dataset key: r   ))r   r   r   rW   r   CURR_FILE_VERSIONr   r   r  H5_FILETYPE_KEYMOLECULE_H5_FILETYPEcreate_groupH5_FEATURE_REF_ATTRto_hdf5r`   r   dumpsr/  r3  r@   r0  rd   rX   r   r   rR   r   r   rf   MOLECULE_REF_COLUMNSr   r   	from_hdf5r   V3_METRICS_GROUP_NAMEMETRICS_JSON_DATASET_NAMEUNIMPLEMENTED_PROBE_KEYSAttributeErrorr   r1  r"   )clsfilenamer   r4  r    r"   r   r?   Z
fref_groupZlib_info_jsonr  ra   keyr:   r:   r;   openj  s   


%


zMoleculeCounter.openc                 C  s   t |  p|  duS )zUsing the chemistry whitelist, determine if this is Spatial (Visium) data.

        Returns:
            bool: True if the known barcode list is for Spatial data.
        N)bc_utilsZis_whitelist_spatialr   r   r   r:   r:   r;   is_spatial_data  s   
zMoleculeCounter.is_spatial_datac                 C  s   |  tttjd S r   )ro   nextiterrd   r   r   r:   r:   r;   nrows  r   zMoleculeCounter.nrowsidxtuple[Any, Any]c                   s   t  fddtD S )Nc                 3  s    | ]
} |  V  qd S rH   )ro   )rL   colrQ  r   r:   r;   rN     s    z0MoleculeCounter.get_chunk_key.<locals>.<genexpr>)tupleCHUNK_COLUMNS)r   rQ  r:   rT  r;   get_chunk_key  r   zMoleculeCounter.get_chunk_keyc                 C  s   |   }|||< | | dS )z4Set a metric.

        Serialize to Pickle.
        N)r   set_all_metrics)r   rJ  valuer   r:   r:   r;   
set_metric  s   zMoleculeCounter.set_metricrJ  r   r   c                 C  s   |   ||S )zGet a metric.)r   r   )r   rJ  r   r:   r:   r;   r     s   zMoleculeCounter.get_metricc                 C  s    t j|ddtd}|| jt< d S )NT),:)r:  
separatorsrH  )r   rA  rt   rW   rE  )r   r   r   r:   r:   r;   rX    s   zMoleculeCounter.set_all_metricsdict[str, Any]c                 C  s4   t | jv rt| jt  d S t| jv r|  S i S )zReturn a dictionary of metrics.r:   )rE  rW   r   r   rD  _legacy_get_all_metricsr   r:   r:   r;   r     s
   

zMoleculeCounter.get_all_metricsr   r   c                 C  s   dd | j t j D S )Nc                 S  s    i | ]\}}|t t|qS r:   )pickler   r
   rL   r   vr:   r:   r;   r     s    z;MoleculeCounter._legacy_get_all_metrics.<locals>.<dictcomp>)rW   rD  r   r  r   r:   r:   r;   r_    s   z'MoleculeCounter._legacy_get_all_metricsra   c                 C  sD   | j | }t|}|t| }||f ||||< | j  dS )z&Append an array of values to a column.N)rf   rJ   resizerW   flush)r   ra   r   dsstartendr:   r:   r;   append_column  s   
zMoleculeCounter.append_columncol_nameh5py.Datasetc                 C  
   | j | S )zRetrieve column.

        Does not handle missing columns.

        Depending on how the file was opened,
        this may only be a file view instead of a full array.
        )rf   r   ri  r:   r:   r;   ro     s   
zMoleculeCounter.get_column_lazyr/   c                 C  s8   || j v r| |dd S tj|  t| t| dS )zULoad an entire column of data into memory.

        Handles missing columns.
        Nr   )rf   ro   rS   fullrP  MOLECULE_INFO_DEFAULT_VALUESrd   rl  r:   r:   r;   
get_column  s   
zMoleculeCounter.get_columnc                 C  s2   |t v sJ | jj| jj|t|d| j|< d S )N)obj)rB  rW   create_carrayrootrS   rZ   r   )r   ri  r   r:   r:   r;   set_ref_column&  s   zMoleculeCounter.set_ref_columnc                 C  s   |  |dd S )z4Load a reference array into memory as a numpy array.N)r   rl  r:   r:   r;   get_ref_column-     zMoleculeCounter.get_ref_columnc                 C  rk  )z-Get a reference array as a lazy h5py Dataset.)r   rl  r:   r:   r;   r   1  s   
z#MoleculeCounter.get_ref_column_lazyr   c                 C  s   t | jtj S )z(Returns the feature reference from HDF5.)r   rC  rW   r  r?  r   r:   r:   r;   get_feature_ref5  ru  zMoleculeCounter.get_feature_refc                 C  s   | j t dd S )z!Returns fully loaded barcode set.N)rW   rY   r   r:   r:   r;   get_barcodes9  ru  zMoleculeCounter.get_barcodesc                   s  |   }|  }|j}t| }|| }tt }tjt||d t	|D ]\}}| |< q$| j
dddD ]\}	}
| jt |	|	|
  } | | jt |	|	|
 < q4 fdd|ddtjf D |ddtjf< t||jd}| jt= t| | | jt= | jt}t|| |S )	zChanges the data in the dataset so that only barcodes with have > 0 counts and/or were pass filtered are.

        stored in the file.  Updates the barcode_idx file to indicate the new indices
        Returns:
        r   rh   Frj   c                   s   g | ]} | qS r:   r:   rK   Znew_positionsr:   r;   r   T  s    z1MoleculeCounter.trim_barcodes.<locals>.<listcomp>Nr>   )rw  rl   r0   rs   rd   rp   rS   r  rJ   r   rm   rW   r-   r6   r2   rY   r`   r3  r>  r@   r0  )r   Zold_barcodesZold_barcode_infoZnew_pass_filterZbc_idx_to_retainZnew_barcodesZ	col_dtypenr   Z
chnk_startZchnk_lenZfutureZnew_barcode_infor  r:   rx  r;   trim_barcodes=  s6   


zMoleculeCounter.trim_barcodes   idxs
chunk_sizec                 C  s   |  |}|  }|j|fkrT|jtkrTtj| f|jd}d}td||D ]*}t	|| |}	||||	   }
||
kr?q'|||	 |||	  |||
< |
}q'|S |jd dkrctjg |jdS t
d)z5Get the column values at row indices (boolean array).r   r   z$index value arrays not yet supported)ro   rP  r   rC   r   rS   emptyr   r(  minNotImplementedError)r   ri  r|  r}  rS  ZnmolvalsZlwrbeginrg  Zuprr:   r:   r;   get_column_with_indicesd  s    
 z'MoleculeCounter.get_column_with_indicesr   c                 C  sJ   | j t d dd }t|dddf |k}||df }tt|S )zCount the number of barcodes passing filter for a library.

        Args:
          library_idx (int): Index of library to count.

        Returns:
          int: Number of filtered barcodes for this library.
        r0   Nr!   r   )rW   r3  rS   r&  rJ   r   )r   r   r0   Zthis_libZbarcode_indsr:   r:   r;   %get_num_filtered_barcodes_for_library|  s   	z5MoleculeCounter.get_num_filtered_barcodes_for_librarylibrary_idxsc                 C  sF   | j t d dd }|jdksJ tj|ddddf |kddS )a  Count the number of barcodes passing filter for a range of libraries.

        This is the vectorized equivalent to

        .. code-block:: python

            np.array(
                [
                    mc.get_num_filtered_barcodes_for_library(lib_idx)
                    for lib_idx in library_idxs
                ]
            )

        Args:
          library_idxs (np.ndarray[int]): 1d array of indicies to count.

        Returns:
          np.ndarray[int]: Number of filtered barcodes for each library.
        r0   Nr!   r.   r   r   )rW   r3  ndimrS   count_nonzero)r   r  r0   r:   r:   r;   'get_num_filtered_barcodes_for_libraries  s   "z7MoleculeCounter.get_num_filtered_barcodes_for_librariesc                 C  s0   | j dusJ t| j t d dddf jS )z'Return the number of filtered barcodes.Nr0   r   )rW   rS   r   r3  sizer   r:   r:   r;   get_num_filtered_barcodes  s   "z)MoleculeCounter.get_num_filtered_barcodesr   c                 C  s
   t | jS rH   )r   rW   r   r:   r:   r;   r     s   
z MoleculeCounter.get_library_infoc                 C  s   | S rH   r:   r   r:   r:   r;   	__enter__  s   zMoleculeCounter.__enter__c                 C  s   |    d S rH   )close)r   rU   rY  	tracebackr:   r:   r;   __exit__     zMoleculeCounter.__exit__c                 C     | j   d S rH   rW   r  r   r:   r:   r;   r       zMoleculeCounter.closec                 C  r  rH   r  r   r:   r:   r;   save  r  zMoleculeCounter.savebc_infoslist[BarcodeInfo]c                 C  s   t | dksJ | d j}g }| D ]}|jjd dksJ |j|ks$J ||j qtj|dd}|jd dkr@tj|dd}t||dS )zMerge a BarcodeInfo into another BarcodeInfo.

        Args:
          bc_infos (list of BarcodeInfo): Input BarcodeInfos.

        Returns:
          BarcodeInfo:
        r   r!   r   r   r>   )	rJ   r2   r0   r   r   rS   concatenater   r-   )r  r2   Zpfsr,  Znew_pfr:   r:   r;   merge_barcode_infos  s   

z#MoleculeCounter.merge_barcode_infosr   dict | NoneNonec              	   C  s  t |d d}| }| }| }dd |jD }g }|D ]}	t |	d}
||
  W d   n1 s:w   Y  q t |}t j| d||||d}d}|D ]I}	t j|	dd8}| |ksfJ t	
| |spJ | }d	d |jD |ksJ |du r| }|| 7 }W d   n1 sw   Y  qT|| |  t| | t | d}
||
 ksJ d
W d   dS 1 sw   Y  dS )zConcatenate MoleculeCounter HDF5 files.

        Args:
          out_filename (str): Output HDF5 filename
          in_filenames (list of str): Input HDF5 filenames
          metrics (dict): Metrics to write
        r   r   c                 S     g | ]}|j qS r:   idrL   fr:   r:   r;   r         z/MoleculeCounter.concatenate.<locals>.<listcomp>Nr7  )r   r4  r    r"   r   r   c                 S  r  r:   r  r  r:   r:   r;   r     r  z/Concatenation did not produce expected results.)r@   rK  rv  rw  r   feature_defsr   rl   r  rS   array_equalr   rP  rX  r  r   )out_filenamein_filenamesr   Zfirst_mcr4  r    r"   Zfeature_idsr  rI  r?   Zmerged_bc_infoZout_mcZ
total_rowsZin_mcZfrefr:   r:   r;   r    sL   




"zMoleculeCounter.concatenatefrom_rowc                 C  sH   |   }| |}t||D ]}| |}||kr|d   S q|d S )Nr!   )rP  rW  r(  )r   r  r   Zinitial_chunk_keyr   Z	chunk_keyr:   r:   r;   !find_last_occurrence_of_chunk_key  s   

z1MoleculeCounter.find_last_occurrence_of_chunk_keyqueryr   key_funcCallable[[int], _T1]c                 C     t |  ||S rH   )r@   bisect_staticrP  )r   r  r  r:   r:   r;   bisect  s   zMoleculeCounter.bisectr   c           	      C  s   d}| }d}	 || d }||}||krn|| dkrd}n
||k r&|}n|}q|rBt |ddD ]}||}||krA|d   S q1dS )zPerforms a binary search to find the leftmost insertion point of query.

        Args:
            key_func: A function, where `key_func(i)` is the value to compare to at index i.
        r   Tr.   r!   F)r(  )	r   r  r  lohiexistsr   currjr:   r:   r;   r    s,   zMoleculeCounter.bisect_staticr   	list[_T1]&Generator[tuple[int, int], None, None]c                 C  r  rH   )r@   r   rP  )r   r   r  r:   r:   r;   get_chunks_from_partition>  s   z)MoleculeCounter.get_chunks_from_partitionc                 #  sp    dg fdd|dd D  }t |}t|D ]}|| }|d |k r,||d  n}||| fV  qdS )z3Get chunks by partitioning on the specified values.r   c                   s   g | ]	}t | qS r:   )r@   r  )rL   valr  r   r:   r;   r   H  s    zDMoleculeCounter.get_chunks_from_partition_static.<locals>.<listcomp>r!   N)rJ   r(  )r   r   r  startsry  r   rq   	chunk_endr:   r  r;   r   C  s   
z0MoleculeCounter.get_chunks_from_partition_statictarget_chunk_lenrk   c                 c  sx    |   }d\}}||d k r:t|d || d }|r!| |n|}d| | }||fV  d| }||d k sdS dS )zGet chunks, optionally preserving boundaries defined by get_chunk_key().

        Yields:
            (int, int): (chunk_start, chunk_len) which are closed intervals
        )r   r   r!   N)rP  r  r  )r   r  rk   r   rq   r  Ztarget_chunk_endrr   r:   r:   r;   rm   Q  s   
zMoleculeCounter.get_chunks	np.uint16c                 C  s   t t | S rH   )rd   r   )rM   r:   r:   r;   compress_gem_groupf  r  z"MoleculeCounter.compress_gem_grouprM   bytesumi_bitsc                 C  s   t | |S rH   )r  compress_seq)rM   r  r:   r:   r;   compress_umi_seqj  r  z MoleculeCounter.compress_umi_seqmetrics_list
list[dict]c                   s   d }i }i }g  | D ]@}|d u rdd |  D } dd |  D  |t }|t}|d ur>|D ]	}|| | q4|| ||t  q
|d usQJ ||t< ||t< |td  t fdd D rq| d  |S )Nc                 S  s    i | ]\}}| d s||qS target
startswithra  r:   r:   r;   r   v  s
    zAMoleculeCounter.naive_concatenate_metric_list.<locals>.<dictcomp>c                 S  s    i | ]\}}| d r||qS r  r  ra  r:   r:   r;   r   |  r  c                 3  s    | ]	}| d  kV  qdS )r   Nr:   rK   Ztargeted_metricsr:   r;   rN     r   z@MoleculeCounter.naive_concatenate_metric_list.<locals>.<genexpr>r   )	r  r   r   r   ANALYSIS_PARAMETERS_METRICupdateLIBRARIES_METRICpopr   )r  combined_metricsZ
gg_metricsZlib_metricssingle_metricsZsingle_gg_metricsr(   r   r:   r  r;   naive_concatenate_metric_listn  s4   

z-MoleculeCounter.naive_concatenate_metric_listmol_h5_listc              	   C  s\   g }| D ]"}t j|dd}| }|| W d    n1 s!w   Y  qt |}|S )Nr   r   )r@   rK  r   r   r  )r  r   mol_h5counterr  r  r:   r:   r;   "naive_concatenate_metrics_from_h5s  s   
z2MoleculeCounter.naive_concatenate_metrics_from_h5sall_metricsc              
     l   z$| j dusJ |du r| tn|t  fddtt| j D W S  ty5 } ztd|d}~ww )zGet raw read pairs per library.

        Returns:
          list of int: Order is by library index

        Raises:
            ValueError if expected metrics are missing.
        Nc                      g | ]
} t | t qS r:   )r=   TOTAL_READS_METRICrL   limetricr:   r;   r         zBMoleculeCounter.get_raw_read_pairs_per_library.<locals>.<listcomp>#Missing metrics in molecule counterr"   r   r  r   r(  rJ   KeyErrorr   r   r  excr:   r  r;   get_raw_read_pairs_per_library  s   	

z.MoleculeCounter.get_raw_read_pairs_per_libraryc                   s`   z$| j dusJ |du r| tn|t  fddtt| j D W S  ty/   tdw )zGet read pairs in filtered barcodes per library.

        Returns:
          list of int: Order is by library index

        Raises:
            ValueError if expected metrics are missing.
        Nc                   r  r:   )r=   'TOTAL_READS_IN_FILTERED_BARCODES_METRICr  r  r:   r;   r         zSMoleculeCounter.get_read_pairs_in_filtered_barcodes_per_library.<locals>.<listcomp>r  r  )r   r  r:   r  r;   /get_read_pairs_in_filtered_barcodes_per_library  s   

z?MoleculeCounter.get_read_pairs_in_filtered_barcodes_per_libraryc              
     r  )zGet transcriptomic read pairs per library (cell + non-cell).

        Returns:
            list of int: Order is by library index

        Raises:
            ValueError if expected metrics are missing.
        Nc                   r  r:   )r=   FEATURE_READS_METRICr  r  r:   r;   r     r  zMMoleculeCounter.get_transcriptomic_read_pairs_per_library.<locals>.<listcomp>r  r  r  r:   r  r;   )get_transcriptomic_read_pairs_per_library  s   


z9MoleculeCounter.get_transcriptomic_read_pairs_per_libraryc              
     Z   z| j dusJ | t  fddtt| j D W S  ty, } ztd|d}~ww )zGet usable read pairs per library.

        Returns:
          list of int: Order is by library index

        Raises:
            ValueError if expected metrics are missing.
        Nc                   r  r:   )r=   USABLE_READS_METRICr  r  r:   r;   r     r  zEMoleculeCounter.get_usable_read_pairs_per_library.<locals>.<listcomp>r  r"   r   r  r(  rJ   r  r   r   r  r:   r  r;   !get_usable_read_pairs_per_library  s   	

z1MoleculeCounter.get_usable_read_pairs_per_libraryc              
     r  )zGet usable on-target read pairs per library.

        Returns:
          list of int: Order is by library index

        Raises:
            ValueError if expected metrics are missing.
        Nc                   r  r:   )r=   ON_TARGET_USABLE_READS_METRICr  r  r:   r;   r     r  zOMoleculeCounter.get_on_target_usable_read_pairs_per_library.<locals>.<listcomp>r  r  r  r:   r  r;   +get_on_target_usable_read_pairs_per_library  s   	


z;MoleculeCounter.get_on_target_usable_read_pairs_per_librarydict[str, list[int]]c                 C  s8   |   }tt}t|D ]\}}||t  | q|S )z0Get indices of libraries that are GEX libraries.)r   r   r   r   r   r   )r   r$   Zlibrary_indices_by_typer   r  r:   r:   r;   get_library_indices_by_type  s
   z+MoleculeCounter.get_library_indices_by_typec                 C  s   |  ttkS )zCReturn True if metrics_json/targeting_method is templated_ligation.)r   TARGETING_METHOD_METRICr   r   r:   r:   r;   is_templated_ligation  r   z%MoleculeCounter.is_templated_ligationmetric_typec              	   C  s   |t u s
|tu s
J tt}| D ]/}tj|dd}|| D ]\}}||  || 7  < q!W d   n1 s:w   Y  q|S )zNCombine a library- or gemgroup- level integer metric across multiple h5 files.r   r   N)r  r   r   r   r@   rK  r   r  )r  metric_namer  combinedr  r  rJ  r   r:   r:   r;   _sum_metric  s   zMoleculeCounter._sum_metricc                 C  s   t | |tS rH   )r@   r  r  )r  r  r:   r:   r;   sum_library_metric  r   z"MoleculeCounter.sum_library_metricc           
      C  s   d}t | d||-}t|d|d|dD ]\}}}	|	dk r$q||f|vr+q||	7 }qW d    n1 s:w   Y  || d S )Nr   r   barcoder   readsr!   )r@   rK  r   ro  put)
rI  Zfiltered_bcs_setrf  lengthqueueZtotal_mapped_readsr?   r  r   r  r:   r:   r;   6get_total_conf_mapped_reads_in_filtered_barcodes_chunk#  s   
	zFMoleculeCounter.get_total_conf_mapped_reads_in_filtered_barcodes_chunkc                 C  s   | t j t jkot | S rH   )r   r   r   has_target_set)libraryr:   r:   r;   is_targeted_library3  s   z#MoleculeCounter.is_targeted_library)ri   r   )ri   r1   )ri   r=   )ri   r   )ri   r   )ri   r   )r   T)rr   r   r  r   r  r   ri   r   )r	  r
  r    r  ri   r-   )NN)r   r-   r  r  )r,  r-   r   r-  )r   r-  ri   r-   )ri   r-   )NNNN)r   r=   r4  r5  r   r6  ri   r@   )rQ  r   ri   rR  rH   )rJ  r=   r   r   )ri   r^  )r   r@   ri   r   )ra   r=   )ri  r=   ri   rj  )ri  r=   ri   r/   )ri  r=   )ri   r   )ri   r/   )r{  )r|  r/   r}  r   ri   r/   )r   r   ri   r   )r  r/   ri   r/   ri   r   )r  r  ri   r-   )r   r  ri   r  )r  r   ri   r   )r  r   r  r  ri   r   )r   r   r  r   r  r  ri   r   )r   r  r  r  ri   r  )r   r   r   r  r  r  ri   r  )T)r  r   rk   r   ri   r  )ri   r  )rM   r  r  r   ri   r   )r  r  )r  r1   )r  r  ri   r   )ri   r  )r  r=   )ri   r  )Hr3   r4   r5   r   r   r   r   r   r   r   r   r%   staticmethodr   r   r  r  r+  r0  r2  rl   classmethodrK  rM  rP  rW  rZ  r   rX  r   r_  rh  ro   ro  rs  rt  r   rv  rw  rz  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r   rm   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r:   r:   r:   r;   r@     s    
	



	45

h

	

	

	






(




:
	
%	





c                   @  s$   e Zd ZdZdd Zedd ZdS )MergedBarcodesz!Class to hold a list of barcodes.c                 C  sz   t dd | D }td| }t|d}|jttj| |t| ddd |	  W d   dS 1 s6w   Y  dS )	zWrite to disk.c                 s  rG   rH   rI   rK   r:   r:   r;   rN   ?  rO   z/MergedBarcodes.write_to_disk.<locals>.<genexpr>rP   r7  rQ   r,   rE   N)
r\   rS   rC   r   r   rX   rY   r]   rJ   r  )r   rI  r^   r_   rW   r:   r:   r;   write_to_disk=  s   
"zMergedBarcodes.write_to_diskc                 C  sX   t | d}t }||t dd  |  |W  d   S 1 s%w   Y  dS )zIDeserialize the data.

        :param filename:
        :return:
        r   N)r   r   r  extendrY   r  )rI  rW   resultr:   r:   r;   load_from_diskI  s   $zMergedBarcodes.load_from_diskN)r3   r4   r5   r   r  r   r  r:   r:   r:   r;   r  :  s
    r  )r?   r@   )r?   r@   ra   r=   )rh   )r?   r@   ri   r/   )r?   r@   r   r   )r   r   )r   r   ri   r   r  )r   )ri   r   )s
__future__r   r   r  r`  collectionsr   r   collections.abcr   r   typingr   r   r	   r   numpyrS   sixr
   Zcellranger.barcodes.utilsr    utilsrL  cellranger.h5_constantsr  cellranger.hdf5hdf5r   cellranger.rna.libraryrnar  r   cellranger.utilsr  cellranger.fast_utilsr   cellranger.feature_refr   r   &cellranger.targeted.targeted_constantsr   cellranger.wrapped_tablesr   r   r=  r   r<  r   r;  rE  rD  r3  ZPROBE_GROUP_NAMErp   ZFEATURE_IDX_COL_NAMEZLIBRARY_IDX_COL_NAMEZPROBE_IDX_COL_NAMEZCOUNT_COL_NAMEr   ZUMI_COL_NAMEZUMI_TYPE_COL_NAMErY   r   ZUMI_TYPE_TXOMICrn  r   r<   rd   rF  r   rB  rV  r   r  r   r  r   ZMOLECULE_INFO_TYPE_RAWr   r  r  r  ZDOWNSAMPLED_READS_METRICr  r  Z DOWNSAMPLED_FEATURE_READS_METRICr  ZGG_RECOVERED_CELLS_METRICZGG_FORCE_CELLS_METRICZINTRON_MODE_PARAMZINTRON_MODE_HISTORIC_DEFAULTZFILTER_PROBESZFILTER_PROBES_PARAM_DEFAULTZNO_PROBE_FILTERr[   re   r-   r  r`   rX   rs   r   rt   r   r   r   r   r   r   r   r   r@   r   r  r:   r:   r:   r;   <module>   s   



.

0
       "