o
    Uݢg3                     @  s  d dl mZ d dlZd dlmZmZ d dlmZmZ d dl	m
Z d dlmZ d dlm  mZ d dlm  m
Z d dlmZ d dlmZ G dd deZG dd	 d	eZG d
d deZddgZg dZdZdZ dZ!h dZ"dd Z#dd Z$dCddZ%dd Z&dd  Z'd!d" Z(d#d$ Z)d%d& Z*dDd(d)Z+d*d+ Z,dEd/d0Z-d1d2 Z.d3d4 Z/G d5d6 d6eZ0dFd8d9Z1dGd=d>Z2d?d@ Z3e3dAdAdZ4G dBd< d<Z5dS )H    )annotationsN)
NamedTuple	TypedDict)ensure_binary
ensure_str)compute_hash_of_filec                   @  s   e Zd ZdZdS )VDJReferenceConstructionErrorz?Raise for customer-facing errors in VDJ reference construction.N)__name__
__module____qualname____doc__ r   r   g/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/vdj/reference.pyr      s    r   c                   @  s^   e Zd ZU ded< ded< ded< ded< ded< ded	< ded
< ded< ded< ded< dS )VdjAnnotationFeatureint
feature_idbytes	record_iddisplay_name	gene_nameregion_typebytes | None
chain_typechainisotypeallele_namesequenceNr	   r
   r   __annotations__r   r   r   r   r         
 r   c                   @  s^   e Zd ZU ded< ded< ded< ded< ded< ded< ded< ded	< ded
< ded< dS )GtfEntrystrchromsourcefeaturestartendscorestrandframeZattributes_strzdict[str, str | int]
attributesNr   r   r   r   r   r    %   r   r    r   r   )r   r   r   r   r   r   r   Zfive_prime_utrZCDSZ
transcript>   Z	TR_J_geneZ	IG_V_geneZ	TR_C_geneZ	TR_D_geneZ	IG_C_geneZ	TR_V_geneZ	IG_D_geneZ	IG_J_genec                 C  s   | d urt j| tjS dS )Nz	/dev/null)ospathjoinvdj_constantsREFERENCE_FASTA_PATH)reference_pathr   r   r   get_vdj_reference_fastaO   s   r1   c                 C  sV   | t krdS | tkr)|dd }|dkrdS |dkrdS |dkr#d	S |d
kr)dS d S )Nz5'UTR_   VzL-REGION+V-REGIONDzD-REGIONJzJ-REGIONCzC-REGION)ENSEMBL_FIVE_PRIME_UTR_FEATUREENSEMBL_CDS_FEATUREsplit)r$   Zbiotypecharr   r   r   infer_ensembl_vdj_feature_typeV   s   r<   r   r!   return
str | Nonec                 C  s:   | d u r| S |   } | drd| d  | dd   } | S )N)zTCRA-zTCRB-zTCRG-zTCRD-TR      )upper
startswithr   r   r   r   standardize_ensembl_gene_namej   s   
rE   c                 C  s   t | dd dS )z0Infer e.g., TR or IG from the ensembl gene name.r      zutf-8)r   rD   r   r   r   infer_ensembl_vdj_chain_typev   s   rG   c                 C  s   | dd S )z2Infer e.g., TRA or IGH from the ensembl gene name.r   r@   r   rD   r   r   r   infer_ensembl_vdj_chain{   s   rH   c                 C  s   t | dkrdS | dd S )zInfer e.g., E from IGHE.r@   N)lenrD   r   r   r   infer_ensembl_isotype   s   rJ   c                 C  s   | j | j| j| j| j| jfS N)r   r   r   r   r   r   )fr   r   r   get_duplicate_feature_key   s   rM   c                 C  s6  t  }t  }g }td t| d}t|D ]\}	}
t|	|
}|j|v r,td|j d|jv r:td|j dd|j	v rHtd|j	 dd|j
v rVtd|j
 dt|}||v rptd	|j d
|j d|j
 d q|j}d|v rtd|j|j
|jfd |d}t|dkrtd|j|j
|jfd q|jtjvrtdtt|jt|j
t|jfttdd tjD  q||j || | }|d|i td!i |}|| qW d   n1 sw   Y  td t|dkrtdtd ttjt | tt |d}|D ]}|!t"|d  q W d   n	1 s8w   Y  td td t#| }td td t$j%|t$j&|t$j'dt$j(tj)| t$j*dt$j+|t$j,|t$j-t.j/i}ttj0|t$j1d}t2j3||ddd  W d   n	1 sw   Y  td dS )"z\Create cellranger-compatible vdj reference files from a.

    V(D)J segment FASTA file.
    zChecking FASTA entries...rbz.Duplicate feature ID found in input FASTA: %d.    z$Spaces not allowed in region type: ""z"Spaces not allowed in gene name: "z"Spaces not allowed in record ID: "z&Warning: Skipping duplicate entry for z (, z).   NzWarning: Feature z& contains Ns. Stripping from the ends.r   z is all Ns. Skipping.zIWarning: Unknown chain type for: {}. Expected name to be in {}. Skipping.c                 s  s    | ]}t |V  qd S rK   )r   ).0tr   r   r   	<genexpr>   s    z3build_reference_fasta_from_fasta.<locals>.<genexpr>r   Nz	...done.
zAn empty constant regions file was generated/detected for your custom species. Please check if there are hits to the IMGT database or run cellranger vdj in denovo mode without reference.zWriting sequences...w
z%Computing hash of input FASTA file...z3Writing metadata JSON file into reference folder...T   )	sort_keysindentr   )4setprintopencr_utilsget_fasta_iterparse_fasta_entryr   r   r   r   r   rM   r   r   striprI   r   chain_typesZVDJ_CHAIN_TYPESformatr!   r   tupleadd_asdictupdater   appendr+   makedirsr,   dirnamer1   write"convert_vdj_feature_to_fasta_entryr   cr_constantsREFERENCE_GENOMES_KEYREFERENCE_FASTA_HASH_KEYREFERENCE_GTF_HASH_KEYREFERENCE_INPUT_FASTA_KEYbasenameREFERENCE_INPUT_GTF_KEYREFERENCE_VERSION_KEYREFERENCE_MKREF_VERSION_KEYREFERENCE_TYPE_KEYr.   REFERENCE_TYPEr-   REFERENCE_METADATA_FILEtk_safe_json
dump_numpy)Z
fasta_pathr0   Zreference_nameZref_versionmkref_versionZseen_featuresZseen_idsfeaturesrL   headerr   ZfeatkeyseqZ	feat_dictZnew_featZ	out_fasta
fasta_hashmetadata	json_filer   r   r    build_reference_fasta_from_fasta   s   






D
r   r   c                 C  s   |du r| S | d | S )z3Make a combined gene/allele name, e.g., TRAV1-1*01.N*r   )r   r   r   r   r   make_display_name  s   r   c                   sf    fddt D } fddtD }ddd |D d ddd |D  }d| d	t j S )
z3Generate a fasta entry from a VdjAnnotationFeature.c                      g | ]}t  |qS r   getattrrS   rL   r$   r   r   
<listcomp>      z6convert_vdj_feature_to_fasta_entry.<locals>.<listcomp>c                   r   r   r   r   r   r   r   r     r   |c                 S  &   g | ]}t |d r| nt|qS decodehasattrr   r!   rS   xr   r   r   r        &  c                 S  r   r   r   r   r   r   r   r     r   >rW   )REF_FASTA_FIELDSREF_FASTA_AUX_FIELDSr-   r   r   )r$   Zfasta_fieldsZ
aux_fieldshdrr   r   r   rl   
  s   rl   r}   r   r   c              	   C  sD  |  d}t|dkrtd|  d|d  d}t|ttkr8tdttdtt|td	|f |d
  d}t|ttkr]tdttdtt|td	|f i }|tt	t| |tt	t| |
d}zt|}|d
k rt W n ty   td|dw ||d< tdd|i|S )z7Parse a FASTA entry into a VdjAnnotationFeature object.rO   rF   zBExpected two strings separated by a space in FASTA header. Found "rP   r      |zzFirst string in FASTA header (record ID) must consist of the following %d fields separated by "|": %s. Found %d values: %srQ   s   , r3   z}Second string in FASTA header (description) must consist of the following %d fields separated by "|": %s. Found %d values: %sr   z:The feature ID must be an integer greater than 0. Found: "r   Nr   )r:   rI   r   r   r-   r   r   rg   dictzippopr   
ValueErrorr   )r}   r   wordsZvalues1Zvalues2fieldsr   r   r   r   r`     sV   



r`   c                 C  s.   |  d}ttt|d  d}t|d S )z1Parse an aligned ref name (i.e. from a BAM file).r   r   r   r   )r:   r   r   r   r   )ref_namer   r   r   r   r   $get_feature_id_from_aligned_ref_nameN  s   
r   c                 c  sb    | du rdS t t| d}t|D ]
\}}t||V  qW d   dS 1 s*w   Y  dS )z3Yield vdj features from a vdj reference fasta file.NrN   )r]   r1   r^   r_   r`   )r0   Zreference_filer}   r   r   r   r   get_vdj_feature_iterV  s   "r   c                   @  s6   e Zd ZU ded< ded< ded< ded< ded< d	S )
VdjAnnotationFeatureDictr   r   r   r   r   r   r   r   Nr   r   r   r   r   r   `  s   
 r   r$   c                 C  s   | j | j| j| j| jdS )zYield a dict.r   r   r   r   r   r   r   r   r   r   convert_vdj_feature_to_dicth  s   r   d	referenceVdjReferencec                 C  s   | | d S )z)Convert a dict to a VdjAnnotationFeature.r   )get_feature_by_id)r   r   r   r   r   convert_dict_to_vdj_features  s   r   c                 C  s,   t | } t |}tdd| | |dddd|d
S )zKCreate a "feature" that does not correspond to an actual reference segment.r       N)
r   r   r   r   r   r   r   r   r   r   )r   r   )r   r   r   r   r   r   create_dummy_featurez  s   r   s   UNANNOTATEDc                   @  s"   e Zd ZdZdd Zddd	Zd
S )r   z.Represents a set of V(D)J reference sequences.c                 C  s$   i | _ t|D ]}|| j |j< qd S rK   )r|   r   r   )selfr0   r$   r   r   r   __init__  s   zVdjReference.__init__r   r   r=   r   c                 C  s   |dkrt S | j| S )Nr   )unannotated_featurer|   )r   r   r   r   r   r     s   
zVdjReference.get_feature_by_idN)r   r   r=   r   )r	   r
   r   r   r   r   r   r   r   r   r     s    )r   r!   r=   r>   )r   r>   r   r>   r=   r>   )r}   r   r   r   r=   r   )r$   r   r=   r   )r   r   r   r   r=   r   )6
__future__r   r+   typingr   r   sixr   r   cellranger.constants	constantsrm   cellranger.utilsutilsr^   Zcellranger.vdj.chain_typesvdjrb   cellranger.vdj.constantsr.   tenkit.safe_json	safe_jsonry   Zcellranger.reference_hashr   	Exceptionr   r   r    r   r   r8   r9   ZENSEMBL_TRANSCRIPT_FEATUREZENSEMBL_VDJ_BIOTYPESr1   r<   rE   rG   rH   rJ   rM   r   r   rl   r`   r   r   r   r   r   r   r   r   r   r   r   r   <module>   sL   

w
7


