o
    Uݢgko                     @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
mZ d dlZd dlmZmZ d dlmZ d dlmZ d dlmZ G dd deZG dd deZG d	d
 d
eZG dd dZG dd deZG dd deZG dd deZ G dd dZ!G dd de!Z"ej#$ej#%ej#%ej#%ej#&e'( ddZ)G dd dZ*G dd dZ+G dd dZ,G dd  d e,Z-G d!d" d"e,Z.G d#d$ d$e,Z/d.d(d)Z0G d*d+ d+eZ1G d,d- d-e!Z2dS )/    )annotationsN)Any
NamedTuple)ensure_binary
ensure_strc                   @  s6   e Zd ZU ded< ded< ded< ded< ded< d	S )
Intervalstr | byteschromintstartendlengthz
str | NonestrandN__name__
__module____qualname____annotations__ r   r   c/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/reference.pyr         
 r   c                   @  s6   e Zd ZU ded< ded< ded< ded< d	ed
< dS )Gener   idstrname
int | Noner   float | None
gc_contentzlist[Interval] | None	intervalsNr   r   r   r   r   r   !   r   r   c                   @  s.   e Zd ZU ded< ded< ded< ded< d	S )

Transcriptr   gener   r   r   r   zlist[Interval]r   Nr   r   r   r   r   r   )   s
   
 r   c                   @  s   e Zd ZdZdddZdS )	GtfParseraa  This is an old-style GtfParser class.

    It is used so that we can produce backwards compatible references that work
    with older releases of CR.

    For use within CR, use classes derived from `NewGtfParser`, below.

    In order for the `genes.pickle` file to unpickle in older CR builds, it must be an old-style class
    named `GtfParser`.
    Fc                 C  s   |st dd S )Nz_GtfParser is deprecated & is only present for backwards compatibility reasons. Use NewGtfParser)	Exception)selfZskip_exceptionr   r   r   __init__<   s
   zGtfParser.__init__N)F)r   r   r   __doc__r$   r   r   r   r   r!   0   s    r!   c                      s    e Zd ZdZ fddZ  ZS )GtfParseErrorz4Exception class for errors while parsing a GTF file.c                   s   t  d| d| d d S )NzError while parsing GTF file 
z&

Please fix your GTF and start again.)superr$   )r#   filenamemsg	__class__r   r   r$   G   s   zGtfParseError.__init__)r   r   r   r%   r$   __classcell__r   r   r+   r   r&   D   s    r&   c                   @     e Zd ZdZdS )BaseReferenceErrorz1Base class for all reference construction errors.Nr   r   r   r%   r   r   r   r   r/   M       r/   c                   @  r.   )GexReferenceErrorz>Exception class for errors while constructing a GEX reference.Nr0   r   r   r   r   r2   Q   r1   r2   c                   @  s   e Zd ZedZ		d*d+d
dZedd Zed,ddZ		d-d.ddZ
e	d/d0d d!Zed/d"d#Zed1d&d'Zed2d(d)ZdS )3NewGtfParserz$(\S+?)\s+(".*?"|[^";\n\r]+)\s*;{0,1}NFr)   os.PathLike | str | bytescontig_lengthsdict[str, int] | Noneno_transcript_failboolc                 c  s   d}t |d[}tj|dd}d}t|D ]C\}}	t|	dkr$q|	d dr2|	ddfV  qt|	d	krIt|d
t|	|d d|	f t	|	d }
t	|	d }|
|krlt|d|d  d|	 d|
 d| |r|	d |vrt
t| }t|d|d  d|	d  d| ||	d  }||krt|d|d  d|	 d| d|	d  d| 
|
dk rt|d|d  d|	 d|
 d|	d  }|tjvrt|d|d d|	f |	d }| j|	d |d |dd}|dkr	| ||||	 | ||||	 || }nM|dkr?| ||||	 | ||||	 || }|r>||kr>t|d |	 d!|d  d"| d#| d$	n|d%krV||}|durV|d&krV||= |	d'|fV  qW d   dS 1 siw   Y  dS )(a[  Return an iteator over the rows of a GTF.

        The iterator consists of
        (row, is_comment, properties) tuples of type (str, bool, dict).
        In case an entry's properties consist of multiple key-value pairs with
        the same key, the duplicate keys are made unique. To properly translate
        the properties dict back into GTF format, use format_properties_dict.

        Args:
            contig_lengths: dictionary of {contig: length}. If specified we check
                that start/end of gtf are consistent with contig lengths
            no_transcript_fail: should we accept a gtf that has no 'transcript' row, but
                has 'exon' rows that reference a transcript? Default is False
                because of backwards compatibility with cellranger. For
                ARC we exit since this is not valid gtf.
        transcript_idr	)	delimiterNr   #T	   z;Invalid number of columns (%d, expect 9) in GTF line %d: %s         zInvalid GTF annotation on line z:
z
Start position of feature = z > End position of feature = z,Invalid contig name encountered on GTF line z: z. The FASTA file has contigs:
z
End position of feature = z
 > contig z
 length = z < 1   z!Invalid strand in GTF line %d: %s      )uniquify_keys
transcriptexonz"Supplied GTF is invalid. This row
z	
on line z1 specifies an 'exon' annotation for a transcript z2, but there is no 'transcript' row in the GTF for z that immediately precedes it.r     F)cr_ioZopen_maybe_gzipcsvreader	enumeratelen
startswithr&   joinr
   sortedlistkeysencodecr_constantsSTRANDSget_properties_dictvalidate_transcript_idvalidate_gene_idget)r#   r)   r5   r7   r9   frK   Ztranscript_row_idirowr   r   ZcontigsZmax_lenr   
annotation
propertiestidr   r   r   gtf_reader_iterY   s   





$zNewGtfParser.gtf_reader_iterc                 C  s   d|vrt | d|d d|f |d dkr&t | d|d d|d|d v r:t | d|d d|f td	|d d urRt | d
|d d|f d S )Nr9   z5Property 'transcript_id' not found in GTF line %d: %sr?   r;   rH   z6Property 'transcript_id' is empty in GTF line {:d}: {};zEProperty 'transcript_id' has invalid character ';' in GTF line %d: %sz\szLProperty 'transcript_id' has invalid whitespace character in GTF line %d: %s)r&   rO   formatresearchr)   r^   r[   r\   r   r   r   rW      s6   z#NewGtfParser.validate_transcript_idr^   dict[str, str | int]c                 C  sx   d|vrt | d|d d|f |d dkr&t | d|d d|d|d v r:t | d|d d|f d S )	Ngene_idz/Property 'gene_id' not found in GTF line %d: %sr?   r;   rH   z0Property 'gene_id' is empty in GTF line {:d}: {}ra   z?Property 'gene_id' has invalid character ';' in GTF line %d: %s)r&   rO   rb   re   r   r   r   rX      s$   zNewGtfParser.validate_gene_id	in_gtf_fnfasta_parserFastaParser | Nonereturn(tuple[dict[str, Transcript], list[Gene]]c              
     s  i }t  }| |D ]t\}}}|rq|\	}}	}
}}}	}}	}	|
dkr#qt|d }t|}t|| }|d }|d }|d|}t||d d d }||vrVt|d d g ||< ||vr_t ||< || j	|kshJ || j
tt||||| || | qi i  | D ]\}}tdd |j
D |< |d ur|| |< qg }| D ]\}}tfdd	|D }|d urt fd
d	|D ntd}g }|D ]	}||| j
7 }q|jdd d g }t|dd D ])\}}t|}tdd |D }tdd |D }t||||| d }|| qt|j|j|||}|| |D ]}t||  |td|| j
||< q)q||fS )NrG   r?   r9   rg   	gene_namec                 s      | ]}|j V  qd S Nr   .0intervalr   r   r   	<genexpr>  s    
z(NewGtfParser.load_gtf.<locals>.<genexpr>c                      g | ]} | qS r   r   rr   r9   )transcript_lengthsr   r   
<listcomp>'      z)NewGtfParser.load_gtf.<locals>.<listcomp>c                   ru   r   r   rv   )transcript_gc_contentsr   r   rx   +  ry   nanc                 S     | j S ro   r	   rs   r   r   r   <lambda>4      z'NewGtfParser.load_gtf.<locals>.<lambda>keyc                 S  r|   ro   r}   r~   r   r   r   r   8  r   c                 s  rn   ro   )r   rq   r   r   r   rt   ;      c                 s  rn   ro   )r   rq   r   r   r   rt   <  r   )collectionsOrderedDictr`   r
   absrY   r   r   setr    r   appendr   r   additemssumget_transcript_gc_contentnpmedianfloatsort	itertoolsgroupbyrQ   minmaxr   r   )r#   rh   ri   transcriptsZgene_to_transcriptsr\   
is_commentr^   r	   _r]   r   r   r   r   r9   rg   rm   r    rF   genesZtranscript_idsr   Ztranscript_intervalsr   Zchrom_intervals_iterZchrom_intervalsrs   r   )rz   rw   r   load_gtf   s   


zNewGtfParser.load_gtfTproperties_strr   line_numberr
   rE   dict[str, int | str]c                 C  s"  t | tr| S t }|rd}tj| D ]y}|d}|d}|r%|s&q|d dk}	|d dk}
|	r=|
r=|dd }n0|	rHt|d||f |
rSt|d||f |	 r\t
|}nd|v sdd|v rmt|d||f d|v rzt|d	||f |r||v r|d7 }t||}|||< q|S )
a.  Parse the properties present in the 9th column of a gtf entry into a dictionary.

        If there are multiple properties with the same key, those keys will be
        made unique unless uniquify_keys is False, in which case only the final
        instance of a key will appear in the result.
        r   r?   rC   "zError parsing GTF at line %d.  Parsed attribute began, but did not end with a quote.  Please ensure attributes that start with quotes end with them.
 Bad Attribute = %szError parsing GTF at line %d.  Parsed attribute ended, but did not begin with a quote.  Please ensure attributes that end with quotes end start with them.
 Bad Attribute = %szError parsing GTF at line %d.  Parsed attribute had a quote in the middle of a value.  Please ensure quotes are only used to encapsulate attribute values.
 Bad Attribute Value = %szError parsing GTF at line %d.  Parsed attribute had a quote in the middle of a key.  Please ensure quotes are only used to encapsulate attribute values.
 Bad Attribute Key = %s)
isinstancedictr   r   r3   _attribute_patternfinditergroupr&   isdigitr
   _make_key_unique)r   r   r)   rE   r^   Z
repeat_nummr   valueZstart_quoteZ	end_quoter   r   r   rV   M  sb   





z NewGtfParser.get_properties_dictc                 C  sf   g }|   D ]%\}}t|trt|}nd| d}|r!t|}|| d|  qd|d S )a  Translate a properties dict into a GTF-formatted string.

        Note:
            Keys beginning with "GtfParser_" are assumed to come from
            `_make_key_unique` and will be translated. This behavior can be
            disabled by setting `uniquify_keys=False`.
        r    z; ra   )r   r   r
   r   r3   _translate_unique_keyr   rO   )r^   rE   r   r   r   r   r   r   format_properties_dict  s   	


z#NewGtfParser.format_properties_dictr   numc                 C  s   d|  d| S )N
GtfParser_r   r   )r   r   r   r   r   r     s   zNewGtfParser._make_key_uniquec                 C  s    |  dr| d| d S | S )Nr   
   r   )rN   rindexr   r   r   r   r     s   
z"NewGtfParser._translate_unique_key)NF)r)   r4   r5   r6   r7   r8   )r^   rf   ro   )rh   r4   ri   rj   rk   rl   )T)
r   r   r   r
   r)   r   rE   r8   rk   r   )r   r   r   r
   rk   r   )r   r   rk   r   )r   r   r   rc   compiler   r`   staticmethodrW   rX   r   rV   r   r   r   r   r   r   r   r3   U   s(    
m
YFr3   c                   @  s   e Zd ZdddZdd ZdS )
GtfBuilderr   c                 C  s   || _ || _|p	i | _d S ro   )rh   
out_gtf_fn
attributes)r#   rh   r   r   r   r   r   r$     s   zGtfBuilder.__init__c           	      C  s   t d t| jdM}tj|dtjddd}| | jD ]3\}}}|r)|| qd}|d us1J |	 D ]\}}|| j
v rG|| j
| vrGd}q5|sO|| qW d    n1 sZw   Y  t d	 d S )
NzLWriting new genes GTF file (may take 10 minutes for a 1GB input GTF file)...wr;   rH   r'   )r<   quoting	quotecharlineterminatorFTz...done
)printopenr   rJ   writer
QUOTE_NONEr`   rh   writerowr   r   )	r#   rZ   r   r\   r   r^   remover   r   r   r   r   	build_gtf  s*   

zGtfBuilder.build_gtfN)r   )r   r   r   r$   r   r   r   r   r   r     s    
r   s   bins   gtf_to_gene_indexc                   @  sV   e Zd ZdddZeddd	ZedddZdddZej	fd ddZ
d!ddZdS )"FastaParserin_fasta_fnr4   c                 C  s   |  || _d S ro   )
load_fastachroms)r#   r   r   r   r   r$        zFastaParser.__init__fasta_headerbytesrk   c                 C  s   t d| }|dd S )Ns   >(\S+)r?   r   )rc   rd   groups)r   matchr   r   r   _get_chrom_name  s   zFastaParser._get_chrom_namedict[bytes, bytes]c                 C  s   i }d\}}t | d4}|D ])}| }|dr2|d ur*|d us#J d|||< t|}g }q|| qW d    n1 sBw   Y  |d urX|d usQJ d|||< |S )NNNrb   >    )r   striprN   rO   r   r   r   )r   r   Zcurrent_chromZcurrent_seqrZ   liner   r   r   r     s$   

zFastaParser.load_fastar	   r   r
   r   c                 C  s2   t |tsJ || jv o|dko|t| j| kS )zSDetermine whether the half-open interval [start,end) is within the bounds of chrom.r   )r   r   r   rM   )r#   r	   r   r   r   r   r   is_valid_interval  s   $zFastaParser.is_valid_intervalr   c                 C  sP   t |tsJ | j| || }|tjkr|S |tjkr!t|S td| )z<Get genomic sequence for the half-open interval [start,end).zInvalid strand: )	r   r   r   rT   FORWARD_STRANDREVERSE_STRANDtk_seqZget_rev_compr"   )r#   r	   r   r   r   seqr   r   r   get_sequence   s   


zFastaParser.get_sequencetranscript_objr   r   c                 C  s   t d}d\}}|jD ],}t|jtsJ |j| jvrq| j|j |j|j }|t	t 
||7 }||j7 }q|dkrEt|t| S dS )Ns   [cCgG])r   r   r   )rc   r   r   r   r	   r   r   r   r   rM   findallr   r   )r#   r   patterngcr   rs   r   r   r   r   r     s   

z%FastaParser.get_transcript_gc_contentN)r   r4   )r   r   rk   r   )r   r4   rk   r   )r	   r   r   r
   r   r
   )r	   r   r   r
   r   r
   r   r   )r   r   rk   r   )r   r   r   r$   r   r   r   r   rT   r   r   r   r   r   r   r   r     s    

r   c                   @     e Zd ZdS )IntervalTreeNr   r   r   r   r   r   r   r   !      r   c                   @  r   )RegionNr   r   r   r   r   r   %  r   r   c                   @  r   )IntergenicRegionNr   r   r   r   r   r   )  r   r   c                   @  r   )IntronicRegionNr   r   r   r   r   r   -  r   r   c                   @  r   )ExonicRegionNr   r   r   r   r   r   1  r   r   moduler   r   c                 C  sT   | dkr
|dkr
t S | dkr|dkrtS t| dgd}td|  d| t||S )a  Choose the appropriate type for loading a given file type.

    When unpickling an object with type named `cellranger.reference.GeneIndex` in a
    pickle file, actually open it using `cellranger.reference.NewGeneIndex`.

    Otherwise dynamically load the requested module.
    zcellranger.reference	GeneIndexr!   r   )fromlistzmodule: z, mod: )NewGeneIndexr3   
__import__r   getattr)r   r   modr   r   r   map_path5  s   
r   c                   @  s   e Zd Zdd ZdS )r   c                 C  s.   || _ || _|| _|| _|| _t| d dS )zdConstructor.

        This should not be called directly!! Use NewGeneIndex for new usages.
        TN)rh   r   r   r   gene_ids_mapr!   r$   )r#   rh   r   r   r   r   r   r   r   r$   I  s   zGeneIndex.__init__N)r   r   r   r$   r   r   r   r   r   H  s    r   c                   @  s   e Zd Z		d-d.ddZed/d	d
Zed0ddZd1ddZd1ddZd1ddZ	d2ddZ
dd Zd3d d!Zd4d#d$Zd%d& Zd'd( Zd)d* Zd+d, ZdS )5r   Nrh    os.PathLike | str | bytes | Noner   c                 C  st   |du r
|du r
dS |dusJ |dusJ || _ || _t| j}| j| j |d\| _| _dd t| jD | _dS )zConstructor.

        in_gtf_fn and in_fasta_fn are required, but are defaulted for backwards
        compatibility with pickled data.
        N)ri   c                 S     i | ]\}}|j |qS r   r   rr   r[   r    r   r   r   
<dictcomp>l      z)NewGeneIndex.__init__.<locals>.<dictcomp>)rh   r   r   r   r   r   rL   r   )r#   rh   r   ri   r   r   r   r$   W  s   

zNewGeneIndex.__init__reference_pathr   c              
   C  s   d}t t| |g}z
tj|tjd W n tjy* } z
tdt|j |d}~ww zt	
|W tj|r=t| S S tj|rJt| w w )Load the GeneIndex directly from the reference path.

        Directly parses the GTF and FASTA files using helper Rust tool.
        Does not require the gene.pickle file.
        s   gene_index.json)stderrzcouldn't load reference info: N)_GTF_TO_GENE_INDEXr   
subprocesscheck_outputSTDOUTCalledProcessErrorRuntimeErrorr   outputr   load_from_jsonospathexistsunlink)r   gene_index_fncmdexcr   r   r   load_from_referenceo  s   
z NewGeneIndex.load_from_referencer   str | bytes | os.PathLikerk   c                   s   dfdd dd	d
d fdddfdd}t | }t|}W d   n1 s/w   Y  t }|d |_|d |_||d |_ fdd|d D |_dd t|jD |_	|S )r   objdict[str, Any]rk   r   c                   sB    fdd| d D }t t| d  t| d | d | d |S )Nc                      g | ]} |qS r   r   rr   xconv_intervalr   r   rx     ry   zBNewGeneIndex.load_from_json.<locals>.conv_gene.<locals>.<listcomp>r   r   r   r   r   )r   r   rS   )r  r   r
  r   r   	conv_gene  s   
z.NewGeneIndex.load_from_json.<locals>.conv_genedict[str, str | int | None]r   c                 S  sB   | d rt | d }nd }tt | d  | d | d | d |S )Nr   r	   r   r   r   )r   r   rS   )r  sr   r   r   r    s   (z2NewGeneIndex.load_from_json.<locals>.conv_intervalr   c                   sN    | d }fdd| d D }|d j dkr|  t|| d | d |S )	Nr    c                   r  r   r   r  r
  r   r   rx     ry   zHNewGeneIndex.load_from_json.<locals>.conv_transcript.<locals>.<listcomp>r   r   -r   r   )r   reverser   )r  r    r   )r  r  r   r   conv_transcript  s
   z4NewGeneIndex.load_from_json.<locals>.conv_transcriptdict[str, dict[str, Any]]dict[str, Transcript]c                   s.   i }|   D ]\}} |}||t|< q|S ro   )r   r   )r  restx_idtxZnew_tx)r  r   r   conv_transcripts  s
   z5NewGeneIndex.load_from_json.<locals>.conv_transcriptsNrh   r   r   c                   r  r   r   r  )r  r   r   rx     ry   z/NewGeneIndex.load_from_json.<locals>.<listcomp>r   c                 S  r   r   r   r   r   r   r   r     r   z/NewGeneIndex.load_from_json.<locals>.<dictcomp>)r  r  rk   r   )r  r  rk   r   )r  r  rk   r   )r  r  rk   r  )
r   jsonloadr   rh   r   r   r   rL   r   )r   r  rZ   datar  r   )r  r  r  r   r     s   




zNewGeneIndex.load_from_jsonrF   r   c                 C  (   t |tsJ || jv r| j| jS d S ro   )r   r   r   r   r#   rF   r   r   r   get_transcript_length     
z"NewGeneIndex.get_transcript_lengthc                 C  r  ro   )r   r   r   r   r  r   r   r   r     r  z&NewGeneIndex.get_transcript_gc_contentc                 C  r  ro   )r   r   r   r    r  r   r   r   get_gene_from_transcript  r  z%NewGeneIndex.get_gene_from_transcriptrg   r   r   c                 C  s&   t |tsJ || jv r| j| S dS )z?Return the integer index of gene_id if found or None otherwise.N)r   r   r   )r#   rg   r   r   r   gene_id_to_int  s   

zNewGeneIndex.gene_id_to_intc                 C  r|   ro   r   r#   r   r   r   	get_genes  s   zNewGeneIndex.get_genesGene | Nonec                 C  s<   t |tsJ | |}|dur|t| jk r| j| S dS )zBReturn the Gene object whose gene ID is gene_id or None otherwise.N)r   r   r   rM   r   )r#   rg   indexr   r   r   get_gene  s   
$zNewGeneIndex.get_gene	list[int]c                 C     dd | j D S )Nc                 S     g | ]}|j qS r   rp   rr   r    r   r   r   rx         z1NewGeneIndex.get_gene_lengths.<locals>.<listcomp>r!  r"  r   r   r   get_gene_lengths  r   zNewGeneIndex.get_gene_lengthsc                 C  r(  )Nc                 S  r)  r   )r   r*  r   r   r   rx     r+  z5NewGeneIndex.get_gene_gc_contents.<locals>.<listcomp>r!  r"  r   r   r   get_gene_gc_contents  r   z!NewGeneIndex.get_gene_gc_contentsc                 C  r(  )Nc                 S  r)  r   )r   r  r   r   r   rx     r+  z/NewGeneIndex.get_gene_names.<locals>.<listcomp>r!  r"  r   r   r   get_gene_names  r   zNewGeneIndex.get_gene_namesc                 C  r(  )Nc                 S  r)  r   r   r  r   r   r   rx     r+  z-NewGeneIndex.get_gene_ids.<locals>.<listcomp>r!  r"  r   r   r   get_gene_ids  r   zNewGeneIndex.get_gene_idsc                   sN    fdd| j D | _  fdd| j D }|| _dd t| j D | _dS )z<Reduce this GeneIndex to only genes in the set target_genes.c                   s   g | ]	}|j  v r|qS r   r   )rr   gtarget_genesr   r   rx     s    z-NewGeneIndex.subset_index.<locals>.<listcomp>c                   s"   i | ]\}}|j j v r||qS r   )r    r   )rr   r  r  r1  r   r   r     s   " z-NewGeneIndex.subset_index.<locals>.<dictcomp>c                 S  r   r   r   r   r   r   r   r     r   N)r   r   r   rL   r   )r#   r2  Ztxsr   r1  r   subset_index  s   zNewGeneIndex.subset_indexr   )rh   r   r   r   )r   r   )r   r  rk   r   )rF   r   )rg   r   rk   r   )rg   r   rk   r$  )rk   r'  )r   r   r   r$   r   r  r   r  r   r  r   r#  r&  r,  r-  r.  r/  r3  r   r   r   r   r   V  s&    
3




r   )r   r   r   r   )3
__future__r   r   rJ   r   r  r   rc   r   typingr   r   numpyr   sixr   r   cellranger.constants	constantsrT   Zcellranger.cr_iorI   Z
tenkit.seqr   r   r   r   r   r!   r"   r&   r/   r2   r3   r   r   rO   dirnameabspath__file__rS   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   sN   	  c&F
