o
    Uݢg                    @  sn  d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlmZmZmZmZmZ ddlZddlmZ ddlmZ ddlm  mZ ddlm  mZ ddl m!Z" ddl#m$Z% ddl&m'Z' dd	l(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z; dd
l<m=Z> erddl?m@Z@mAZA dZBdhddZCe.fdiddZDe.e/fdjddZEdkd&d'ZFdld,d-ZGdmd3d4ZHdnd5d6ZIedod9d:ZJedpd<d:ZJdqd?d:ZJG d@dA dAeZKG dBdC dCZLG dDdE dEeZMG dFd/ d/ZNG dGdH dHZOdIdJ ZPdrdsdNdOZQdtdRdSZRdrdtdTdUZSdVdW ZTdXdY ZUdZd[ ZVdud_d`ZWdadb ZXdvdddeZYdfdg ZZdS )wzZUtilities for annotated contigs with gene/chain information, defining clonotypes and more.    annotationsN)defaultdict)CallableIterableSequence)TYPE_CHECKINGAnyTextIO	TypedDictoverload
ensure_str)MULTI_REFS_PREFIX)AMBIGUOUS_AA_CODECODON_TO_AASTART_CODONSSTOP_CODONSVDJ_5U_FEATURE_TYPESVDJ_ANNOTATION_MATCH_SCORE!VDJ_ANNOTATION_MIN_V_OVERLAP_FRACVDJ_C_FEATURE_TYPESVDJ_CDR3_ALL_END_MOTIFSVDJ_CDR3_COMMON_END_MOTIFSVDJ_CLONOTYPE_TYPESVDJ_D_FEATURE_TYPESVDJ_GENE_PAIRSVDJ_J_FEATURE_TYPESVDJ_MAX_CDR3_LENVDJ_MIN_CDR3_LENVDJ_ORDERED_REGIONSVDJ_QUAL_OFFSETVDJ_V_FEATURE_TYPESseq)SSWAlignmentResultSSWMultiAligner   codonstr | bytesreturnintc                 C  s^   t | dksJ t| tr| d} tdd | D sJ t| t}t |dks+J t|S )zReturn amino acid corresponding to a codon.

    If the codon is not in the translation table, a default
    AA is returned (see AMBIGUOUS_AA_CODE in vdj.constants)
       asciic                 s  s    | ]}|d v V  qdS )ZNACGTN .0cr.   r.   i/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/vdj/annotations.py	<genexpr>A       zcodon_to_aa.<locals>.<genexpr>r'   )	len
isinstancebytesdecodeallr   getr   ord)r(   coder.   r.   r2   codon_to_aa8   s   

r=   alignment_resultr%   score_ratiofloat	word_sizematch_scoreboolc                 C  sZ   | j }|dus	J t|j|j d }|| }t|j||k r"dS t||k r+dS dS )a  Returns True for a passing alignment and False otherwise.

    Args:
        alignment_result (SSWAlignmentResult): alignment result to filter
        score_ratio: minimum (score / max_possible_score)
        word_size: minimum stretch of contiguous matches/mismatches in the alignment
        match_score: match score used in the alignment

    Returns:
        True if alignment passed filters
    Nr'   FT)		alignmentr@   	query_endquery_begintk_statsrobust_dividescorecr_cigarZget_max_word_length)r>   r?   rA   rB   rD   alignment_length	max_scorer.   r.   r2   filter_alignmentG   s   rM   c           
      C  s   t | jjd j}| j}|dusJ t|j|j d }|| }t	|j
||k r+dS t|j|j d }|| }	||	k r?dS dS )a  Returns True for a passing alignment and False otherwise.

    Args:
        alignment_result (SSWAlignmentResult): alignment result to filter
        score_ratio: minimum (score / max_possible_score)
        match_score: match score used in the alignment
        v_overlap_frac: Minimum required V gene overlap (alignment length/V gene length)

    Returns:
        True if alignment passed filters
    featureNr'   FT)r5   	referencemetadatasequencerD   r@   rE   rF   rG   rH   rI   ref_end	ref_begin)
r>   r?   rB   Zv_overlap_fracZv_gene_lengthrD   rK   rL   Zref_alignment_lengthZmin_alignment_lengthr.   r.   r2   filter_v_alignmentg   s   rT   alignerr&   ref_seqr7   r$   filter_func$Callable[[SSWAlignmentResult], bool]list[Annotation]c                 C  sv   t |t |ks
J t|tsJ t|tsJ | |}g }|D ]}|jdu s,||s-q!t||}|| q!|S )a  Align a sequence against an SSWMultiAligner.

    Return a list of Annotation objects
    for the alignments that passed filter_func.

    Args:
        seq (bytes): the sequence to align.
        ref_seq (bytes): the sequence used to create the returned Annotation objects.
            ref_seq and seq will often be the same but seq could be a partially
            masked version of ref_seq. The two must have the same length.

    Returns:
        list: Annotation objects.
    N)r5   r6   r7   alignrD   
Annotationfrom_alignment_resultappend)rU   rV   r$   rW   alignment_resultsr   r>   
annotationr.   r.   r2   collect_annotations   s   
r`   j_amino_acidsallowed_end_motifs	list[str]
int | Nonec                   s`   |D ]+}|t v s
J ttt| d D ] t fddt|D r,     S qqdS )aF  Search for the CDR3 end motif in J amino acid sequence.

    Args:
        j_amino_acids: the AA of the J region
        allowed_end_motifs: prioritized list of allowed end motifs. See VDJ_CDR3_ALL_END_MOTIFS
            for the exhaustive list

    Returns:
        end_motif_pos: Index of the end motif in j_amino_acids
    r'   c                 3  s0    | ]\}} |  t |kp|d kV  qdS XNr;   r0   iaaidxra   r.   r2   r3      s   . z&find_cdr3_end_motif.<locals>.<genexpr>N)r   ranger5   r9   	enumerate)ra   rb   motifr.   rk   r2   find_cdr3_end_motif   s   rp   v_regionr[   j_regionv_frame"tuple[tuple[int, int] | None, str]c                   s:  d}d}|dv s
J |j dusJ | j dusJ | j | j}tfddt|td dD }|d}|d	k r@d
}||fS || |d  }|j  |j}	d|	| | d  }
|
dkr^d	}
tt fddt|
t|j d dD }d}t|t}|dur|	|
 |d  d }|sd}|r||k r||f|fS d|fS )a  Search for the CDR3 signature in a sequence.

    Args:
      v_region: Annotation object for the V-Region
      j_region: Annotation object for the J-Region
      v_frame: reading frame (0, 1, 2)

    Returns:
       (a,b): the starting and ending (base) positions of the CDR3
       status_flag: a string indicating what happened within seq or None if no
                    CDR3 could be found.
     Nr   r'      c                 3  $    | ]}t  ||d   V  qdS r,   Nr=   r0   ri   )v_seqr.   r2   r3          
z(search_cdr3_signature.<locals>.<genexpr>rw   r,      Cr   ZGUIDED_NO_C_IN_Vc                 3  rx   ry   rz   r{   )j_seqr.   r2   r3     r}   ZGUIDED_NO_FGXG)	rQ   contig_match_startr7   rm   r5   rfind	bytearrayrp   r   )rq   rr   rs   flagposv_startZv_amino_acidsZ
last_c_idxZ
last_c_posj_startZj_framera   Zend_motif_posZend_motif_idxr.   )r   r|   r2   search_cdr3_signature   sD   

r   c                   sl  t d }d}d}tdD ]}ttfddt|td dD  d}t|t d D ]tD ]}t fddt|D }q2|r}||d  }	|rZ|	|jksY|	|j	k rZq.d}
t|| dd	D ]+}  t
d
kr}
||
d  }|r||j	kr||jk r nqdt  tv r nqd|
r|	| tk r||	d  } |
|d  }||||	f    S q.qdS )a	  Search the CDR3 signature in a sequence without guides from annotations.

    This could lead to more false positive signature hits than the guided version.

    Return value:
    A tuple (CDR3 DNA seq, CDR3 amino-acid seq, start position in seq, end position)
    r,   Fr   c                 3  rx   ry   rz   r{   r#   r.   r2   r3   $  s   " z.search_cdr3_signature_no_vj.<locals>.<genexpr>rw   Nc                 3  s0    | ]\}} |  t |kp|d kV  qdS re   rg   rh   )amino_acidsrl   r.   r2   r3   ,  s     
r~   r'   )r   rm   r7   r   r5   r   r9   rn   contig_match_endr   r;   chrr   r   )r$   rq   rr   Zmin_cdr3_aasZ	valid_endcys_posframeZfgxg_idxro   fgxg_posZcys_idx_cdr3_seqcdr3_aasr.   )r   rl   r$   r2   search_cdr3_signature_no_vj  sP   $


'r   	maybe_strNonec                 C     d S Nr.   r   r.   r.   r2   _bytes_or_noneS     r   bytes | strc                 C  r   r   r.   r   r.   r.   r2   r   W  r   bytes | str | Nonebytes | Nonec                 C  s"   | d u rd S t | tr| S |  S r   )r6   r7   encoder   r.   r.   r2   r   [  s
   
c                   @  sV   e Zd ZU ded< ded< ded< ded< ded	< d
ed< ded< ded< ded< dS )AnnotatedContigDictr   barcoder	   contig_namestrrQ   zIterable[dict[str, Any]]r   	clonotypebool | Noneis_cellrd   
read_count	umi_countdictinfoN__name__
__module____qualname____annotations__r.   r.   r.   r2   r   c  s   
 r   c                   @  sL  e Zd ZdZg dZg g dddddg di ddfdlddZedmddZdnddZdd  Z	dod#d$Z
d%d& Zd'd( Zd)d* Zdpd,d-Zd.d/ Zd0d1 Zd2d3 Zdqd5d6Zd7d8 Zd9d: Zd;d< Zd=d> Zd?d@ ZdrdGdHZdsdKdLZdtdQdRZdudUdVZdWdX ZdYdZ Zd[d\ Zd]d^ Zd_d` Z dvdbdcZ!ddde Z"dfdg Z#dhdi Z$djdk Z%dS )wAnnotatedContigz,A named sequence with a list of Annotations.)r   rQ   qualsr   primer_annotationsr   r   r   r   unannotated_intervalsfiltered
cdr3_start	cdr3_stopr   cdr3	cdr3_flagstart_codon_posstop_codon_posr   aa_sequence
productive	info_dictr   high_confidenceNTFrQ   r7   r   rY   r   r   r   r   r   r   rd   r   r   rC   c                 C  s   || _ | | _|| _|| _|| _|| _|| _|| _|	| _	|
| _
|| _d| _d| _d| _d| _d| _d| _d| _d| _d| _d| _|| _|| _|| _dS )z\The info_dict can contain additional features that are not included in the objects features.N)r   upperrQ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )selfnamerQ   r   r   r   r   r   r   r   r   r   r   r   r   r.   r.   r2   __init__  s0   

zAnnotatedContig.__init__annotation_dictr   rO   vdj_reference.VdjReferencec                   s  |  d   ddt d  fdd d D  fdd  d	g D t d
  d  d  d  d  d d}  dd|_  dd|_  dd|_  dd|_  dd|_  dd|_t  dd|_t  dd|_	  dd|_
  dd|_|S )a  Create an AnnotatedContig object from a dict.

        Does not perform any checks on the self-consistency of the dict features.
        (Eg. do the annotation sequences agree with the contig's sequence,
        do the CDR3 positions make sense etc).
        r   r   NrQ   c                       g | ]}t j| d  dqS rQ   )
contig_seqr[   	from_dictr0   Zhitr   rO   r.   r2   
<listcomp>      z-AnnotatedContig.from_dict.<locals>.<listcomp>r   c                   r   r   r   r   r   r.   r2   r     r   r   r   r   r   r   r   r   )r   r   rQ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r:   r   r   r   r   r   r   r   r   r   r   r   )clsr   rO   new_objr.   r   r2   r     s8   



zAnnotatedContig.from_dictr*   c                 C  s   t di d| jd| jd| jddd | jD ddd | jD d	| jd
| jd| jd| j	d| j
d| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jS )Nr   rQ   r   r   c                 S     g | ]}|  qS r.   to_dictr0   ar.   r.   r2   r         z+AnnotatedContig.to_dict.<locals>.<listcomp>r   c                 S  r   r.   r   r   r.   r.   r2   r     r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r.   )r   r   rQ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r.   r.   r2   r     sZ   	
zAnnotatedContig.to_dictc                 C  
   t | jS )z,Iterator over the annotations of the contig.)iterr   r   r.   r.   r2   hits  s   
zAnnotatedContig.hitsregion_typesSequence[str]c                   sJ   | j D ]}t|jjtsJ q D ]	}t|tsJ q fdd| j D S )Nc                   s    g | ]}|j j  v r|qS r.   )rN   region_typer8   r0   r_   r   r.   r2   r     s
    z3AnnotatedContig.get_region_hits.<locals>.<listcomp>)r   r6   rN   r   r7   r   )r   r   r_   rr.   r   r2   get_region_hits  s   

zAnnotatedContig.get_region_hitsc                 C  s0   |   }t|dkrdS |  rtt|S dS )zReturns 'Multi' if ambiguous.r   Ns   Multi)contig_chainsr5   is_single_chainnextr   )r   chainsr.   r.   r2   get_single_chain  s   z AnnotatedContig.get_single_chainc                 C  sB   |D ]	}t |tsJ q| |}t|dkrdS t|d jjS )zReturns first if ambiguous.r   N)r6   r   r   r5   r   rN   display_name)r   r   r   r   r.   r.   r2   get_single_gene_display_name  s   
z,AnnotatedContig.get_single_gene_display_namec                 C  s   t |  dkS Nr'   )r5   r   r   r.   r.   r2   r   '     zAnnotatedContig.is_single_chain
set[bytes]c                 C  s   dd | j D S )Nc                 S  s   h | ]}|j jqS r.   rN   chainr   r.   r.   r2   	<setcomp>+  r   z0AnnotatedContig.contig_chains.<locals>.<setcomp>r   r   r.   r.   r2   r   *  r   zAnnotatedContig.contig_chainsc                 C  s   | j d uot| j dkS Nr   )r   r5   r   r.   r.   r2   has_cdr-  s   zAnnotatedContig.has_cdrc                 C     |   r| jS dS )zDNA sequence of CDR3.N)r   r   r   r.   r.   r2   get_cdr_seq0     zAnnotatedContig.get_cdr_seqc                 C  r   )zAmino acid sequence of CDR3.N)r   r   r   r.   r.   r2   get_cdr_aa_seq6  r   zAnnotatedContig.get_cdr_aa_seq'tuple[None, None] | tuple[bytes, bytes]c                 C  sp   |   sdS |  sdS tt|  }t|tsJ t||  }t|ts.J t||d | }||fS )zClonotype-defining sequence.

        Returns a tuple (chain, <chain_name>_<CDR3_sequence>). If this contig
        does not match a single chain, or does not have a CDR3, returns (None, None).
        NN   _)	r   r   r   r   r   r6   r7   typer   )r   r   r$   cdr_seqr.   r.   r2   clonotype_seq<  s   zAnnotatedContig.clonotype_seqc                 C  s0   t dd | jD }t dd | jD }|o|S )Nc                 s  s     | ]}t |jjtv V  qd S r   )r   rN   r   r"   r   r.   r.   r2   r3   N  
    
z9AnnotatedContig.has_full_length_vj_hit.<locals>.<genexpr>c                 s  s0    | ]}t |jjtv o|j|jd  kV  qdS ry   r   rN   r   r   annotation_match_endannotation_lengthr   r.   r.   r2   r3   T  s    
anyr   )r   Zhas_full_len_v_hitZhas_full_len_j_hitr.   r.   r2   has_full_length_vj_hitM  s   z&AnnotatedContig.has_full_length_vj_hitc                 C  s   t dd | jD S )Nc                 s  s*    | ]}t |jjtv o|jd kV  qdS )r   Nr   rN   r   r"   annotation_match_startr   r.   r.   r2   r3   \  s    
z0AnnotatedContig.spans_v_start.<locals>.<genexpr>r   r   r.   r.   r2   spans_v_start[  s   zAnnotatedContig.spans_v_startc                 C  st   | j d u rd S dd | jD }|sd S |d }dd | jD }|s#d S |d }tdd | j |j|j D }|S )Nc                 S  s*   g | ]}t |jjtv r|jd kr|qS r   r   r   r.   r.   r2   r   e  s    
z0AnnotatedContig.get_vj_quals.<locals>.<listcomp>r   c                 S  s,   g | ]}t |jjtv r|j|jkr|qS r.   r   r   r.   r.   r2   r   o  s    c                 S     g | ]}t |t qS r.   r;   r!   r0   qr.   r.   r2   r   z  s    
)r   r   nparrayr   r   )r   	v_regionsrq   	j_regionsrr   r   r.   r.   r2   get_vj_qualsb  s(   
zAnnotatedContig.get_vj_qualsc                 C  s$   | j d urtdd | j D S d S )Nc                 S  r  r.   r  r  r.   r.   r2   r         z-AnnotatedContig.get_quals.<locals>.<listcomp>)r   r  r  r   r.   r.   r2   	get_quals  s   
zAnnotatedContig.get_qualsc                 C  sh   |  t}|  t}|r|sdS g }g }tD ]}|  |}|r/||d jj ||d  q||fS )aw  Return a concatenated reference sequence.

        Return value:
        - (None,None) if this contig isn't annotated with a V and a J segment.
        Otherwise a tuple (seqs, annos) where annos is a list of Annotation objects
        in the order that they should appear in a VDJ sequence and seqs is a list
        of corresponding sequences from the input fasta.
        r   r   )r   r"   r   r    r]   rN   rQ   )r   rq   rr   ZseqsZordered_annosZregion_defsregionsr.   r.   r2   get_concat_reference_sequence  s   
	

z-AnnotatedContig.get_concat_reference_sequencefeature_typesIterable[str]ssw_multi_alignersIterable[SSWMultiAligner]ssw_filter_funcs.Iterable[Callable[[SSWAlignmentResult], bool]]c                 C  s   | j }g }g }t|||D ]T\}}}	t|tsJ |tt t vr"qt|| j ||	}
|
ra|tt v r;t|
dd d}nt|
dd d}|	| |tv sP|tv rU|	| t
| j |jt| j }q||fS )Nc                 S  s   | j dk| jfS r   )r   rI   xr.   r.   r2   <lambda>  s    z9AnnotatedContig._get_masked_annotations.<locals>.<lambda>keyc                 S     | j S r   rI   r  r.   r.   r2   r        )rQ   zipr6   r   r"   r   r   r`   maxr]   tk_seqmaskr   r5   )r   r  r  r  Zmasked_sequencemasked_annotationsvj_hitsr   rU   Zalign_filterr   bestr.   r.   r2   _get_masked_annotations  s,   

z'AnnotatedContig._get_masked_annotationsSequence[SSWMultiAligner].Sequence[Callable[[SSWAlignmentResult], bool]]c                   s  |  |||\}||td  }||td  }|r'tdd |D nt| j}t|| jt	| jd||}|rot
|dd d rjtd jjtv rjd jj fdd|D }	|	ri||	d  n|  tdkrtd jjtv sJ td	 jjtv sJ d j}
d	 j}d jjd	 jjkr||td  }||td  }t|| jt	| j|
||}fd
d|D }|r|t
|dd d |S )aF  Add sequence annotations by comparing against a list of reference segments.

        Args:
            feature_types, ssw_multi_aligners, ssw_filter_funcs: Matched aligners and filters
                for a list of feature types (see setup_feature_aligners)

        Returns:
            A list of Annotation objects.
        r   c                 S     g | ]}|j qS r.   r   r0   annor.   r.   r2   r         z5AnnotatedContig.annotate_features.<locals>.<listcomp>c                 S  r  r   r  r  r.   r.   r2   r    r  z3AnnotatedContig.annotate_features.<locals>.<lambda>r  c                   s(   g | ]}|j  j kr|jjkr|qS r.   )rI   rN   r   r*  )best_utrv_gener.   r2   r     s
    rw   r'   c                   s$   g | ]}|j j d  j jkr|qS r  r   r*  )r#  r.   r2   r   	  s    c                 S  r  r   r  r  r.   r.   r2   r    r  )r%  indexr   r  minr5   rQ   r`   r   r!  r  r   rN   r   r"   r   r]   r   r   r   r   r   )r   r  r  r  r"  Zutr_alignerZ
utr_filterZ	min_startr   Zequally_goodZv_endr   Z	d_alignerZd_filterr.   )r-  r.  r#  r2   annotate_features  sP   



z!AnnotatedContig.annotate_featuresr>   r%   alignment_filterrX   c                 C  sb   |j }|d u r	d S ||sd S |jjd }t||j|jt|j|j|j	d |j
|jd g | jd
S NrN   r'   
rN   cigarrI   r   r   r   r   r   
mismatchesr   )rD   rO   rP   r[   cigar_stringrI   r5   rQ   rS   rR   rF   rE   )r   r>   r2  rD   rN   r.   r.   r2   _annotation_from_alignment  s$   z*AnnotatedContig._annotation_from_alignmentssw_multi_alignerr&   c                 C  s   | | j}g }|D ]}| || }dur|| q
g }|r?d	dd}tt||d|D ]\}	}
|t|
dd d q/|S )
aI  Similar to annotate_features but uses a joint SSWMultiAligner object.

        ssw_multi_aligner can contain references of different types. The sequence
        will be aligned against all of these and then the best alignment by type
        will be returned. The same filter function will be used for all feature types.
        Nr  r[   c                 S  s   | j jS r   )rN   r   r  r.   r.   r2   
_by_regionF  s   z>AnnotatedContig.annotate_features_by_group.<locals>._by_regionr  c                 S  r  r   r  r  r.   r.   r2   r  J  r  z<AnnotatedContig.annotate_features_by_group.<locals>.<lambda>)r  r[   )rZ   rQ   r8  r]   	itertoolsgroupbysortedr  )r   r9  r2  r^   r   r>   r_   Ztop_annotationsr:  r   groupr.   r.   r2   annotate_features_by_group,  s   

z*AnnotatedContig.annotate_features_by_groupc           	      C  s   g }t t| j}| jD ]
}d||j|j< qd}t|dd D ]0\}}t	|}|sLt
jdddd}|t|ddt|dt|||t| g d	 |t|7 }q"|S )	zWReturn a list of Annotation objects corresponding to unannotated regions on the contig.r'   r   c                 S  s   | dkS r   r.   r  r.   r.   r2   r  X      z;AnnotatedContig.get_unannotated_intervals.<locals>.<lambda>s   UNANNOTATEDN)r   r   rQ   )	rN   r5  rI   r   r   r   r   r   r6  )r  zerosr5   rQ   r   r   r   r;  r<  listvdj_referenceZcreate_dummy_featurer]   r[   )	r   r   Zannotation_indicatorr_   Zinterval_startZ	annotatedZregion_iterregionrN   r.   r.   r2   get_unannotated_intervalsN  s4   

z)AnnotatedContig.get_unannotated_intervalsc                 c  sb    | j | jfD ]'}|D ]"}dt| jt|jt|jt|jj	 dt|jj
 gV  qqdS )zConvert this contig's annotations to a list of BED entries.

        Yields:
            str: each element is a BED file line (no newline)
        	r   N)r   r   joinr   r   r   r   r   rN   r   r   )r   Z	anno_listr+  r.   r.   r2   get_annotations_bedq  s   z#AnnotatedContig.get_annotations_bedc                   s
    t}  t}d  _d  _d  _d  _d  _d  _d  _	d  _
d  _d  _g }t|dkrFt|dkrF|d }|d }|jdk}|j}|j} j}	d }
|rrt|dd tv rrt||d\}
}|| |d | _	nL|rt|dt  d| dt  dD ]/}|dkr|d t|	krt|	||d  tv rt||d\}
}| _	|| |d q j	d u r|d |rň j	d u rdD ]}t|||\}
}|
r||  nq|
d u r|d |
r.|
d |
d  tk r.|
d  _|
d  _|	|
d |
d   _tt fd	d
tdt jd dD  _|
d |
d  d dks-J n|
d u r9|d n|d|
d |
d     js|rQ|d nd }|rZ|d nd }|d urt j||}|d ur|\}}}}| _| _| _|d  _|d  jrZ j	d u rt jddD ] }t |tv r| _	 j	rt |tv r nq j	r|d  j	d urt j	t jdD ]}t |tv r| _
 nqш  r@ j	d u}|o  jd uo  j	d  jd k}|ot fdd
t j	|jdD }|o|o| _|s&|d |r7 jd ur7|s7|d |s?|d nt  fdd
t j jdD rZ|d d _ j	d urxtt fdd
t j	t jd dD  _d!dd |D  _d S )Nr'   r   r,   ZFULL_V_HAS_STARTZFULL_V_ALT_STARTZFULL_V_NO_STARTrv   ZFAILED_UNGUIDED_SEARCHc                 3  s&    | ]}t  j||d   V  qdS ry   )r=   r   r{   r   r.   r2   r3     s
    
z0AnnotatedContig.annotate_cdr3.<locals>.<genexpr>rw   ZNO_CDR3zCDR3_TOO_LONG:%dZFOUND_CDR3_UNGUIDEDZFOUND_DE_NOVO_V_STARTc                 3  s"    | ]}t  |tvV  qd S r   r   r(   r   r{   r   r.   r2   r3   	  
    
ZNO_STARTZCDR3_OUT_OF_FRAMEZVJ_STOPc                 3  s"    | ]}t  |tv V  qd S r   rJ  r{   r   r.   r2   r3     rK  Z	CDR3_STOPFc                 3  s    | ]
}t  |V  qd S r   )r=   r(   r{   r   r.   r2   r3   "  s
    
|c                 S  s   g | ]}|r|qS r.   r.   )r0   fr.   r.   r2   r   (  r   z1AnnotatedContig.annotate_cdr3.<locals>.<listcomp>)"r   r"   r   r   r   r   r   r   r   r   r   r   r   r5   r   r   rQ   r   r   r   r]   rm   START_CODON_SLOPr   r7   r   r   r(   r   r   r9   r   r   rG  )r   r	  r
  flagsrq   rr   Zhas_v_startr   r|   r$   Zcdr_posr   ri   r   resr   r   r   r   Z	has_startZcdr3_in_frameZ	vj_nostopr.   r   r2   annotate_cdr3  s   












 











zAnnotatedContig.annotate_cdr3c                 C  s(   |d t | jkr| j||d  S dS )Nr,       r5   rQ   )r   ri   r.   r.   r2   r(   *  s   (zAnnotatedContig.codonc                   s0   |  D ] t fdd| jD s dS qdS )zTrue if this contig contains all the annotations of the other_contig.

        We only check the gene_name to test if the two annotations are the same.
        c                 3  s     | ]}|j j j jkV  qd S r   )rN   	gene_namer   Zother_annotationr.   r2   r3   3  r   z7AnnotatedContig.contains_annotations.<locals>.<genexpr>FT)r   r   r   r   other_contigr.   rU  r2   contains_annotations-  s   z$AnnotatedContig.contains_annotationsrW  c                 C  sh   |  t}|  t}|r|sdS tdd |D }tdd |D }t| j|| |jr2dS dS )zITrue if this contig's VJ region is exactly contained in the other_contig.Fc                 S  r(  r.   r)  )r0   rq   r.   r.   r2   r   B  r,  z3AnnotatedContig.is_exact_vj_hit.<locals>.<listcomp>c                 S  r(  r.   )r   )r0   rr   r.   r.   r2   r   C  r,  T)	r   r"   r   r  r0  r  researchrQ   )r   rW  r	  r
  startstopr.   r.   r2   is_exact_vj_hit:  s   

zAnnotatedContig.is_exact_vj_hitc                 C  r   r   rS  r   r.   r.   r2   __len__H     
zAnnotatedContig.__len__c                 C  sz   t t t t }g }| jD ]}t|jj|v r*||j	|j
|j|j	 |j|jjf qdd t|dd dD }d|S )Nc                 S  s   g | ]}|d   qS )   r.   r0   r  r.   r.   r2   r   \  s    z2AnnotatedContig.annotation_str.<locals>.<listcomp>c                 S     | d S r   r.   r  r.   r.   r2   r  \  r@  z0AnnotatedContig.annotation_str.<locals>.<lambda>r  ;)r"   r   r   r   r   r   rN   r   r]   r   r   r   r   r   r=  rG  )r   Zfeatures_to_outputZannotation_strsr+  r.   r.   r2   annotation_strK  s    

	
zAnnotatedContig.annotation_strc                 C  s(   d dd | jD }d| j d| S )Nrc  c                 S     g | ]}t |qS r.   r   r*  r.   r.   r2   r   `  r   z+AnnotatedContig.__str__.<locals>.<listcomp>zContig z: )rG  r   r   )r   rd  r.   r.   r2   __str___  s   zAnnotatedContig.__str__c                 C     |    S r   r   __repr__r   r.   r.   r2   rj  c     zAnnotatedContig.__repr__)rQ   r7   r   rY   r   rY   r   r   r   r   r   rd   r   rd   r   rC   )r   r   rO   r   )r*   r   )r   r   r*   rY   )r*   r   )r*   r   )r  r  r  r  r  r  )r  r   r  r&  r  r'  )r>   r%   r2  rX   )r9  r&   r2  rX   r*   rY   rW  r   r*   rC   )&r   r   r   __doc__	__slots__r   classmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r%  r1  r8  r?  rE  rH  rQ  r(   rX  r]  r^  rd  rg  rj  r.   r.   r.   r2   r   o  sb    /
,






,
F
"# )
r   c                   @  sF   e Zd ZU ded< ded< ded< ded< ded< ded	< ded
< dS )AnnotationDictz&vdj_reference.VdjAnnotationFeatureDictrN   r   r5  r+   r   r   r   r   r   Nr   r.   r.   r.   r2   rp  g  s   
 rp  c                   @  s   e Zd ZdZg dZ	d2d3ddZed4ddZedd Z	dd Z
e	d2d5d"d#Zd6d$d%Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 ZdS )7r[   zBAnnotation of a sequence against a reference of segment sequences.)
rN   r5  rI   r   r   r   r   r   r6  rQ   NrN   "vdj_reference.VdjAnnotationFeaturer5  r   rI   r@   r   r+   r   r   r   r   r   c                 C  s^   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
du r$d| _	dS |
|| 
 | _	dS )zeConstructor.

        Args:
            feature (vdj.reference.VdjAnnotationFeature): object
        N)rN   r5  rI   r   r   r   r   r   r6  rQ   r   )r   rN   r5  rI   r   r   r   r   r   r6  r   r.   r.   r2   r     s   
zAnnotation.__init__r>   r%   rQ   c                 C  sX   |j }|d us	J |jjd }| ||j|jt|j|j|jd |j	|j
d g |d
}|S r3  )rD   rO   rP   r7  rI   r5   rQ   rS   rR   rF   rE   )r   r>   rQ   rD   rN   r_   r.   r.   r2   r\     s    z Annotation.from_alignment_resultc                   C  s   ddgS )N
chain_typerQ   r.   r.   r.   r.   r2   _inferred_values  s   zAnnotation._inferred_valuesc              
   C  s\  g }t | j}d}g }g d}|D ]E\}}	t|}|||	g  |	dkr<tt||	||d g}
||
d< ||
 n|	dkrO|tt||	||| g |	dkrW||7 }qd}| j}|D ]L}	|	dkrj|d7 }q_|	dkr|||d  	 |||d  	 kr|tt|d	||d g |d7 }|d7 }q_|	dkr|d7 }q_|	dkr|d7 }q_|S )
a1  Mark mismatches between the contig and the reference.

        Args:
            contig_sequence (str): contig sequence
            feature_sequence (str): the feature that the annotation derives from

        Returns:
            list of dict: list of annotations for mismatches (each as a dict)
        r   )r   r   r      Dr'   Zdeletion_length   I   S   MZMISMATCH)
rJ   Zget_cigar_tuplesr5  r+   extendr   r  r]   r   r   )r   Zcontig_sequenceZfeature_sequencer6  cigar_tuplesZstart_positionZall_operationsZmismatch_annotation_json_keyslengthcategoryr   Zcontig_positionZannotation_positionr.   r.   r2   annotate_mismatches  sl   
	


zAnnotation.annotate_mismatchesr   rp  rO   r   str | bytes | Noner*   c                 C  sV   | t |d ||d |dtd|d |d |d |d |d	 |d
t|d
S )NrN   r5  rI   nanr   r   r   r   r   r6  r4  )rC  Zconvert_dict_to_vdj_featurer:   r@   r   )r   r   rO   r   r.   r.   r2   r     s   zAnnotation.from_dictc                   sB    fdd j D }t j|d<   D ]}||d  q|S )Nc                   s   i | ]}|t  |qS r.   getattr)r0   Zslotr   r.   r2   
<dictcomp>'  r  z&Annotation.to_dict.<locals>.<dictcomp>rN   )rn  rC  Zconvert_vdj_feature_to_dictrN   rs  pop)r   Zout_dictvalr.   r   r2   r   &  s
   zAnnotation.to_dictc                 C  s   |  |  | jS r   )r   r   rQ   r   r.   r.   r2   copy-  s   zAnnotation.copyc                 C  sJ   t || jsdS | jdur| j|jkrdS | jj|jjko$| jj|jjkS )zTwo annotations are the same if they have the same sequence.

        If the sequence is missing, rely on the annotated gene name.
        FNT)r6   	__class__rQ   rN   rT  allele_namer   otherr.   r.   r2   __eq__0  s   zAnnotation.__eq__c                 C  s(   | j dur
t| j S t| jj| jjfS )z2Hash is based on either the sequence or gene name.N)rQ   hashrN   rT  r  r   r.   r.   r2   __hash__>  s   

zAnnotation.__hash__c                 C  s   |  | S r   )r  r  r.   r.   r2   __ne__D  rk  zAnnotation.__ne__c                 C  s   | j j d| j dS )Nz(cigar:))rN   r   r5  r   r.   r.   r2   rg  G  s   zAnnotation.__str__c                 C  rh  r   ri  r   r.   r.   r2   rj  J  rk  zAnnotation.__repr__r   )rN   rq  r5  r   rI   r@   r   r+   r   r+   r   r+   r   r+   r   r+   r   r   )r>   r%   rQ   r   )r   rp  rO   r   r   r}  r*   r[   )r*   rp  )r   r   r   rm  rn  r   ro  r\   staticmethodrs  r|  r   r   r  r  r  r  rg  rj  r.   r.   r.   r2   r[   q  s(    !
R
c                   @  s   e Zd ZdZi fd$ddZedd Zdd	 Zd%ddZ	
	
	
d%d&ddZ	d'ddZ
dd Zdd Zdd Zdd  Zd!d" Zd#S )(CellContigszaA list of AnnotatedContigs.

    (eg. representing a clonotype or all the contigs in a cell)
    contigsIterable[AnnotatedContig]c                 C  s   || _ t|| _|| _d S r   )r   rB  r   r   )r   r   r  r   r.   r.   r2   r   T  s   

zCellContigs.__init__c                 C  s   | |g i S r   r.   )r   r   r.   r.   r2   emptyY  s   zCellContigs.emptyc                 C  r   r   )r   r   r   r.   r.   r2   r  ]  r_  zCellContigs.contigsTc           	      C  sR   dd t D }t }| j|||d}|D ]}t|dd }|| q||v S )z:True if this cell has contigs matching some VDJ gene pair.c                 S  s   g | ]	}t |d qS r   )setsplit)r0   pr.   r.   r2   r   b  s    z)CellContigs.is_paired.<locals>.<listcomp>)require_full_lenrequire_productiverequire_high_confr   r   )r   r  clonotype_tupler   r  add)	r   r  r  r  Z
good_pairsr   clZcdrr   r.   r.   r2   	is_paired`  s   zCellContigs.is_pairedr  rC   r  r  r*   tuple[bytes, ...]c                   s$    fdd| j D }tt|S )a\  Tuple of unique CDR3s across all productive contigs.

        Args:
            require_productive: Contigs (aka 'chains') need to be productive (productive=True)
            require_full_len: Contigs (aka 'chains') need to be full length (productive or not)
            require_high_conf: Contigs (aka 'chains') need to be high confidence
        c                   sH   h | ] }|j s	s| s s|jss| r| r| d  qS )r'   )r   r   r   r   r   r   r/   r  r  r  r.   r2   r   {  s(    
z.CellContigs.clonotype_tuple.<locals>.<setcomp>)r   tupler=  )r   r  r  r  Zcdrsr.   r  r2   r  n  s   zCellContigs.clonotype_tuplerW  r   c                   s   t  fdd| jD S )zTrue if it has a chain that contains all the annotations of the given other contig.

        Args:
            contig (AnnotatedContig):
        c                 3  s    | ]}|  V  qd S r   )rX  r0   contigrW  r.   r2   r3     s    z.CellContigs.contains_contig.<locals>.<genexpr>)r   r   rV  r.   r  r2   contains_contig  s   zCellContigs.contains_contigc                 C  s,   |du rdS | j D ]
}||r dS q	dS )zReturns True if this cell has an exact hit in the other contig.

        True if this cell has some contig whose V(D)J region is perfectly
        contained in the sequence of the other_contig.
        NFT)r   r]  )r   rW  r  r.   r.   r2   has_exact_vj_hit  s   

zCellContigs.has_exact_vj_hitc                 C  sJ   g }| j D ]}| }|jt| j | }| j|d< || q|S )zReturn as a list of AnnotatedContig dicts.

        Information from this object's info_dict is passed to the contigs.
        r   )r   r  r   updater   r   r   r]   )r   Z	out_dictsr  
new_contigZ
contig_outr.   r.   r2   to_dict_list  s   

zCellContigs.to_dict_listc                 C  s    t | jdd | jD t| jS )Nc                 S  r   r.   )r  r  r.   r.   r2   r     r   z$CellContigs.copy.<locals>.<listcomp>)r  r   r   r   r   r   r.   r.   r2   r    s   zCellContigs.copyc                 C  s    d | jddd | jD S )NzCell {}:
{}
c                 S  re  r.   rf  )r0   r   r.   r.   r2   r     r   z'CellContigs.__str__.<locals>.<listcomp>)formatr   rG  r   r   r.   r.   r2   rg    s    zCellContigs.__str__c                 C  rh  r   )r  rj  r   r.   r.   r2   rj    rk  zCellContigs.__repr__Nr  r  )TTT)r  rC   r  rC   r  rC   r*   r  rl  )r   r   r   rm  r   ro  r  r  r  r  r  r  r  r  rg  rj  r.   r.   r.   r2   r  N  s"    


r  c                   s"   t |  fddt| D S )zAReturns a list of AnnotatedContig objects from an open json file.c                   s   g | ]}t | qS r.   )r   r   ra  rO   r.   r2   r     r  z.load_contig_list_from_json.<locals>.<listcomp>)rC  VdjReferencejsonload)	json_filereference_pathr.   r  r2   load_contig_list_from_json  s   
r  T	group_keyr   c                   s    dv sJ t | }t||}W d   n1 sw   Y  g } fdd}tjt||d|d}|D ](\}	}
g }|
D ]}|jdurM|jsH|sM|| q<t|dkr\|t	|	| q4|S )aD  Returns a list of CellContig objects based on annotations in a json.

    The json is assumed to contain a list of AnnotatedContigs (in dict form).
    The contigs are sorted and grouped by group_key and each such group is put
    into a CellContig object.

    Args:
        group_key: must be 'barcode' or 'clonotype'
    )r   r   Nc                   s
   t |  S r   r  r  r  r.   r2   key_func  r_  z-load_cell_contigs_from_json.<locals>.key_funcr  Fr   )
openr  r;  r<  r=  r   r   r]   r5   r  )r  r  r  r  Zjson_file_objr   cell_contigsr  Z	anno_iterclonotype_nameZcontig_annotationsr  r  r.   r  r2   load_cell_contigs_from_json  s*   



r  r  r  c                 C  s   t jdd |D | dd d S )Nc                 s  s    | ]}|  V  qd S r   r   r/   r.   r.   r2   r3     r4   z,save_annotation_list_json.<locals>.<genexpr>T)pretty)tk_safe_json
dump_numpy)out_filer  r.   r.   r2   save_annotation_list_json  s   r  c                 C  s*   g d}|r| ddg t| || dS )z"Write contigs to an open csv file.)r   r   	contig_idr   rz  r   r.  d_genej_genec_genefull_lengthr   r   cdr3_ntreadsumisraw_clonotype_idraw_consensus_idinferred_clonotype_idinferred_consensus_idN)rx  save_annotation_list_csv)csvr  Zwrite_inferredcolumnsr.   r.   r2   save_contig_list_csv  s   r  c                 C  s   g d}t | || dS )z,Write consensus contigs to an open csv file.)clonotype_idconsensus_idrz  r   r.  r  r  r  r  r   r   r  r  r  N)r  )r  r  r  r.   r.   r2   save_consensus_list_csv  s   r  c                 C  s   | d u r| S t | S r   r   r  r.   r.   r2   ensure_str_or_none  s   r  c                   s\  t |}t||  t|dd dD ]}|jri dt|jd|jdt|jd|j	dt
|d	t| d
|td|td|td|td| d|jdt|jdt|jd|jd|jd|jd|jd|jd|jd|j|j|jd|jdd |  sJ t fdd|D |  qdS )z+Write AnnotatedContigs to an open csv file.c                 S  s   | j  | j| jfS r   )r   r   r   r  r.   r.   r2   r  $  r,  z*save_annotation_list_csv.<locals>.<lambda>r  r   r   r  r   rz  r   r.  r  r  r  r  r   r   r  r  r  r  r  r  r  clonotype_freqclonotype_prop)r  r  r  r  r  Zclonotype_frequencyZclonotype_proportionc                      g | ]} | qS r.   r.   r0   krowr.   r2   r   A  r   z,save_annotation_list_csv.<locals>.<listcomp>N)r  	vdj_utilswrite_csv_rowr=  r   r   r   r   r   r   r5   r  r   r   r"   r   r   r   r   r   r   r   r   r   r   r:   r   issubsetkeys)r  r  r  col_setr  r.   r  r2   r    sf   




	








r  r  r
   consensus_contigsc                   s  t t}|D ]v}| }|du rq|j}||vrEt||| d< t|jd || d< |jd || d< |jd || d< t || d	< n)|| d t|jd ksTJ || d |jd ksaJ || d |jd ksnJ || d	 ||j|j	f qd
d }|
 D ]}||d	 \}}	||d< |	|d< qt|
 dd dd}g d}
t|
}t|
|  |D ] |t sJ t fdd|
D |  qdS )zWrite a CSV containing clonotype info to an open csv file.

    Takes a list of AnnotatedContigs corresponding to consensuses.
    Nr  cellsmembersr  	frequencyr  
proportioncdr3sc                 S  sX   g }g }t | D ]\}}}|| d|  || d|  qd|d|fS )N:rc  )r=  r]   rG  )Zchain_cdr3scdr3s_ntcdr3s_aar   ntrj   r.   r.   r2   get_cdr3_list_string`  s   z5save_clonotype_info_csv.<locals>.get_cdr3_list_stringr  r  c                 S  rb  )Nr  r.   )r1   r.   r.   r2   r  n  r@  z)save_clonotype_info_csv.<locals>.<lambda>Tr  reverse)r  r  r  r  r  c                   r  r.   r.   r  r  r.   r2   r   w  r   z+save_clonotype_info_csv.<locals>.<listcomp>)r   r   r   r   r   r  r   r  r   r   valuesr=  r  r  r  )r  r  
clonotypesr  r   r  r  r   r  Zcdr3_aar  r  r.   r  r2   save_clonotype_info_csvD  s:   
r  c                   s*    fdd|D }t t|}||dS )z~Get the VDJ gene pair associated with the clonotype.

    If the gene pair is not in vdj_gene_pairs, it will return None.
    c                   s   h | ]} |  d d qS )r   r   )r  )r0   r$   sequence_idsr.   r2   r     s    z)get_clonotype_vdj_pair.<locals>.<setcomp>N)r  r=  r:   )r  r  vdj_gene_pairsZgenesr.   r  r2   get_clonotype_vdj_pairz  s   r  prefixc           -        s  t   }| d|}| d|}	tjt| fddd fddd}
tdd |
D d	d d
d}
| d|}g }dd tD }i }d}|
D ]M\}}t|dkrUqIdd |D }|	| |du rjd}d}nt|}||| ksvJ t
| |}|du}|tv sJ tjt||dkd}|}t|t|}||| |	|| fdd| D }i }tt}|D ]!}| D ]}| \}}||vrq|| |j |j||< qq||||d}i } t| D ].\}!\}}"t|"dksJ | d|!d  }#|dd || |dd |"d}$|$| |#< q| |d< |||< |jt||du d | jD ]d}%| d|%|}&| d|%|}'| d |%|}(| d!|%|})|dur|&jd|o`|%|tfv d |'jt||op|%|tfv d |%|tfv r| d"|%|}*|*d |(|| |r|)|| q2qI|t|}+| jD ]}%| d|%|}'|'jt|+dd q|jt|+d
d | d#|},|,t  fd$d  D  |,t j!t|+ft"d% | d&|},t|dkr|,#t $|t| |S |,#dt| |S )'a  Group barcodes into clonotypes.

    Args:
        reporter (VdjReporter): object for reporting metrics
        prefix: metrics prefix (must be in VDJ_CLONOTYPE_TYPES)
        cell_barcodes: set of cell barcodes
        clonotype_ids (dict): Mapping from clonotype id to tuple of CDR ids
        sequence_ids (dict): Mapping from CDR id to sequence
        barcode_contigs (list): CellContigs objects
        bc_clonotype_assignments (dict): Mapping from barcode to clonotype id

    Returns:
        dict: clonotype_id -> clonotype_info.
          Clonotype info is itself a dict with the following information::
          - clonotype_id
          - barcodes: barcodes belonging to the clonotype
          - freq: number of cells in the clonotype
          - prop: relative frequency of barcodes in the clonotype
          - consensuses: A dict seq_id -> seq_info with info about the sequences of the
              consensus. seq_info has the following information:
                  - cdr3: AA sequence of CDR3
                  - cdr3_seq: nucleotide sequence of CDR3
                  - chain: chain
                  - cell_contigs: contigs (of the clonotype barcodes) that correspond to this
                  consensus sequence
    vdj_clonotype_freqvdj_clonotype_propc                        | jd S r   r:   r   r  bc_clonotype_assignmentsr.   r2   r        z#report_clonotypes.<locals>.<lambda>r  c                   r  r   r  r  r  r.   r2   r    r  c                 s  s     | ]\}}|t |fV  qd S r   )rB  )r0   r  br.   r.   r2   r3     s    z$report_clonotypes.<locals>.<genexpr>c                 S  s   t | d S r   r5   r  r.   r.   r2   r    s    Tr   vdj_unassigned_clonotype_bc_fracc                 S  s    i | ]}t t|d |qS r  )r  r=  r  )r0   pairr.   r.   r2   r    s     z%report_clonotypes.<locals>.<dictcomp>r   c                 S  r(  r.   )r   )r0   bcr.   r.   r2   r     r,  z%report_clonotypes.<locals>.<listcomp>NFinferred)r   c                   r  r.   r.   )r0   Zcl_seqr  r.   r2   r     r   )r  freqpropbarcodesZ_consensus_r'   r   )r   r   r   r  consensuses)filtervdj_paired_clonotype_fracvdj_paired_clonotype_bc_fracvdj_clonotype_diversityvdj_paired_clonotype_diversityvdj_clonotype_countcdrs_per_bc_histogramc                   s   g | ]}t  | qS r.   r  r/   )clonotype_idsr.   r2   r   2  r  )dtypemajor_clonotype_bc_frac)%r  bincountr  _get_metric_attrr;  r<  r=  r   r5   rx  r  r   r  Zformat_clonotype_idrG   rH   r  r   rB  r  r   r]   r   r   rn   itemsr  canonical_vdj_gene_pairsr   
differencer  add_manyr  rA  r+   	set_valuer  )-reporterr  cell_barcodesr  r  Zbarcode_contigsr  Zclonotype_countsZfreq_metricZprop_metricZgrouped_barcodesZunassigned_metricZobserved_barcodesr  Zout_clonotypesZclonotype_countr  Zbc_contig_listsZbarcode_namesZvdj_gene_pairZis_cl_pairedr  r  r  Zclonotype_chainsZ!clonotype_chain_cdr3_translationsZout_contigsr  r  r   r   Zout_clonotyper  rl   r  consensus_nameconsensus_infoZgpZpaired_cls_metricZbcs_in_paired_cls_metricZdiversity_metricZpaired_diversity_metricZnum_clonotypes_metricZremaining_barcodesmetricr.   )r  r  r  r2   report_clonotypes  s   #






!
"r  c                 C  s   |t v sJ dd |D }i }|  D ]-\}}|d D ]}|d ||< q|d  D ]\}}	|	d D ]}
|||
 j| d< q2q*q|D ]}|j|v rU||j |j| d< qCd	S )
a  Adds clonotype and consensus info to a list of AnnotatedContig objects.

    Args:
        clonotypes: dict like the one output by report_clonotypes.
        contigs: list of AnnotatedContig objects
        prefix: prefix for clonotype ids (must be in VDJ_CLONOTYPE_TYPES)
    c                 S  s   i | ]}|j |qS r.   )r   r  r.   r.   r2   r  G  r   z0label_contigs_with_consensus.<locals>.<dictcomp>r  r  r  r  Z_consensus_idZ_clonotype_idN)r   r  r   r   )r  r  r  Zcontig_dictZbc_assignmentsr   Zclonotype_infor  r  r  memr  r.   r.   r2   label_contigs_with_consensus>  s    
r  )r(   r)   r*   r+   )
r>   r%   r?   r@   rA   r+   rB   r@   r*   rC   )r>   r%   )
rU   r&   rV   r7   r$   r7   rW   rX   r*   rY   )ra   r7   rb   rc   r*   rd   )rq   r[   rr   r[   rs   r+   r*   rt   r   )r   r   r*   r   )r   r   r*   r7   )r   r   r*   r   )T)r  r   r  )r  r
   r  r  )r  r   )[rm  
__future__r   r;  r  rY  collectionsr   collections.abcr   r   r   typingr   r	   r
   r   r   numpyr  sixr   Zcellranger.cigarr5  rJ   cellranger.vdj.referencevdjrO   rC  cellranger.vdj.utilsutilsr  tenkit.safe_json	safe_jsonr  tenkit.statsstatsrG   cellranger.library_constantsr   cellranger.vdj.constantsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   tenkitr$   r   Zcellranger.alignr%   r&   rN  r=   rM   rT   r`   rp   r   r   r   r   r   rp  r[   r  r  r  r  r  r  r  r  r  r  r  r  r.   r.   r.   r2   <module>   sv   T
#
%
&

Q=
     }
 ^k
#
&6
 :