o
    UݢgK                      @  sP  d Z ddlmZ ddlmZ ddlZddlmZ	 ddl
mZ ddlmZ er,ddlmZ dHd
dZdIddZdIddZdIddZdIddZdIddZdIddZdIddZdJddZdIddZdIdd ZdId!d"Zd#Zd$Zd%Zd&Z dKd(d)Z!dKd*d+Z"dKd,d-Z#dKd.d/Z$	0dLdMd5d6Z%dNd7d8Z&dNd9d:Z'd;Z(dOd>d?Z)dPdCdDZ*dQdFdGZ+dS )Rz4Utility functions for working with aligned segments.    )annotations)TYPE_CHECKINGN)AlignedSegmentreadr   returntuplec                 C  s   | j | jfS N)tidposr    r   a/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/segment.pypos_sort_key      r   c                 C     t | tjS r   )_get_read_tagcr_constantsPROCESSED_BARCODE_TAGr   r   r   r   get_read_barcode   r   r   c                 C  r   r   )r   r   RAW_BARCODE_TAGr   r   r   r   get_read_raw_barcode   r   r   c                 C  r   r   )r   r   RAW_BARCODE_QUAL_TAGr   r   r   r   get_read_barcode_qual   r   r   c                 C  r   r   )r   r   UMI_QUAL_TAGr   r   r   r   get_read_umi_qual#   r   r   c                 C  r   r   )r   r   PROCESSED_UMI_TAGr   r   r   r   get_read_umi'   r   r   c                 C  r   r   )r   r   RAW_UMI_TAGr   r   r   r   get_read_raw_umi+   r   r   c                 C  s4   t | tj}|du rdS t|tsJ t|dS )zs_summary_.

    Args:
        read (AlignedSegment): _description_

    Returns:
        _type_: _description_
    N;)r   r   FEATURE_IDS_TAG
isinstancestrr   split)r   Zid_strr   r   r   get_read_gene_ids/   s
   	r$   Nonec                 C  s   | j }|| || _ d S r   )tagsextend)r   r&   Z	read_tagsr   r   r   set_read_tags?   s   

r(   c                 C  s.   z|  |}|s
d }|W S  ty   Y d S w r   )optKeyError)r   tagrr   r   r   r   E   s   
r   c                 C  s   t | tjpdS Nr   )r   r   EXTRA_FLAGS_TAGr   r   r   r   get_read_extra_flagsO      r/   c                 C  s@   t |dksJ | jrdS t |dkr|d S t|| j |S )z_summary_.

    Args:
        read (_type_): _description_
        chroms (_type_): _description_
        genomes (_type_): _description_

    Returns:
        _type_: _description_
    r   N   )lenis_unmappedcr_utilsget_genome_from_strr	   )r   chromsgenomesr   r   r   get_genome_from_readS   s   r8                boolc                 C     t | t@ dkS r-   )r/   EXTRA_FLAGS_LOW_SUPPORT_UMIr   r   r   r   is_read_low_support_umiq   r0   r@   c                 C  r>   r-   )r/   EXTRA_FLAGS_FILTERED_TARGET_UMIr   r   r   r   is_read_filtered_target_umiu   r0   rB   c                 C  r>   r-   )r/   EXTRA_FLAGS_UMI_COUNTr   r   r   r   is_read_umi_county   r0   rD   c                 C  r>   r-   )r/   EXTRA_FLAGS_CONF_MAPPED_FEATUREr   r   r   r   is_read_conf_mapped_to_feature}   r0   rF   Thigh_conf_mapqintuse_corrected_umiuse_umisc                 C  s^   |rt | }nt| }| j o.|dup| o.t| duo.t|  o.t|  o.t| |p.t| S )z_summary_.

    Args:
        read: _description_
        high_conf_mapq: _description_
        use_corrected_umi: _description_. Defaults to True.
        use_umis: _description_. Defaults to True.

    Returns:
        bool: _description_
    N)r   r   Zis_secondaryr   r@   rB   $is_read_conf_mapped_to_transcriptomerF   )r   rG   rI   rJ   Zumir   r   r   is_read_dupe_candidate   s   


rL   c                 C  s   | j rdS | j|k rdS dS )_summary_.

    Args:
        read (AlignedSegment): _description_
        high_conf_mapq (int): _description_

    Returns:
        bool: _description_
    FT)r3   Zmapq)r   rG   r   r   r   is_read_conf_mapped   s
   

rN   c                 C  s*   t | |rt| }|duot|dkS dS )rM   Nr1   F)rN   r$   r2   )r   rG   gene_idsr   r   r   rK      s   

rK   s	   MIDNSHP=Xir"   c                 C  s   t t|  S )z-Pysam numeric codes to meaningful categories.)chr__CIGAR_CATEGORIES)rP   r   r   r   _cigar_numeric_to_category_map   s   rS   strandbytesdict[str, int]c                 C  s   i }| j }t|D ]d\}\}}t|}t|}|dkr<|tjkr(| jr%dnd}n| jr-dnd}|dkr8|||< nd||< ||d| ||< |t|krm|tjkrY| j	rVdnd}n| j	r^dnd}|dkri|||< q	d||< q	|S )a+  Get number of mismatches, insertions, deletions, ref skip, soft clip, hard clip bases from a read.

    Returns a dictionary by the element's CIGAR designation. Adds additional
    fields to distinguish between three and five prime soft-clipping for `R1`
    and `R2`: `R1_S_three_prime` and `R1_S_five_prime`, etc. to account for
    soft-clipped local alignments.

    Args:
        read (pysam.AlignedRead): aligned read object
        strand (string): + or - to indicate library orientation (MRO argument strand, for example)

    Returns:
        dict of str,int: Key of base type to base counts for metrics. Adds
                         additional fields to distinguish between three and
                         five prime soft-clipping: `S_three_prime` and
                         `S_five_prime`.
    r   R1_S_five_primeR2_S_three_primeR2_S_five_primeR1_S_three_primeS)
cigar	enumeraterS   rH   r   REVERSE_STRANDis_read1getr2   Zis_read2)r   rT   
statisticsZcigar_tuplesrP   categorycountmetricr   r   r   get_cigar_summary_stats   s,   



re   .tuple[int, np.ndarray[int, np.dtype[np.byte]]]c                 C  s   t j| jt jdtj }d}| jD ]0\}}t|}|dkr(t ||dg| }n|dv r9t 	|t j
|||  }|dvrA||7 }q||fS )aa  Returns base quality scores for the full read alignment.

    Inserts zeroes for deletions and removing inserted and soft-clipped bases.
    Therefore, only returns quality for truly aligned sequenced bases.

    Args:
        read (pysam.AlignedSegment): read to get quality scores for

    Returns:
        np.array: numpy array of quality scores
    )dtyper   D)Ir[   )np
fromstringZqualbytetk_constantsZILLUMINA_QUAL_OFFSETr\   rS   insertdeletes_)r   quality_scoresZ	start_pos	operationlengthr   r   r   &get_full_alignment_base_quality_scores  s   rt   )r   r   r   r   )r   r   )r   r   r   r%   )r   r   r   r=   )TT)rG   rH   rI   r=   rJ   r=   r   r=   )r   r   rG   rH   r   r=   )rP   rH   r   r"   )r   r   rT   rU   r   rV   )r   r   r   rf   ),__doc__
__future__r   typingr   numpyrj   cellranger.constants	constantsr   cellranger.utilsutilsr4   Ztenkit.constantsrm   Zpysamr   r   r   r   r   r   r   r   r$   r(   r   r/   r8   r?   rC   rE   rA   r@   rB   rD   rF   rL   rN   rK   rR   rS   re   rt   r   r   r   r   <module>   sJ   

















 


8