o
    UݢgI                  	   @  s   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ ejejejejedZeje dZ G dd	 d	e!Z"G d
d deZ#G dd dZ$dS )    )annotationsN)
ensure_str)GexReferenceErrorGtfParseErrorNewGtfParser)compute_hash_of_filebins   gtf_to_gene_indexc                   @  s   e Zd ZdZdS )DuplicateContigNameExceptionzHUsed to indicate the input file has multiple contigs with the same name.N)__name__
__module____qualname____doc__ r   r   k/oak/stanford/groups/akundaje/marinovg/programs/cellranger-9.0.1/lib/python/cellranger/reference_builder.pyr	   $   s    r	   c                   @  s   e Zd Z			d-d.ddZdd Z	d/d0ddZdd Zd1ddZdd Zdd Z	d2d3dd Z
d4d"d#Zd5d&d'Z		d/d6d)d*Zd+d, ZdS )7ReferenceBuilder      Fgenomes	list[str]out_dirstr | os.PathLike | Nonenum_threadsintmem_gbc
                 C  s   || _ t| j D ]"\}
}t| j D ]\}}|
|kr)||r)td| d| dqq|| _|| _|| _|| _|| _|| _	|| _
d | _d | _d | _| |	 d S )NzSupplied genome name 'z' is a prefix of genome name '')r   	enumerate
startswithr   in_fasta_fns
in_gtf_fnsr   r   r   ref_versionmkref_version
fasta_pathgtf_pathgenome_prefixesformat_genome_prefixes)selfr   r   r   r   r   r    r   r   prefixes_as_genomesiZg1jZg2r   r   r   __init__)   s(   zReferenceBuilder.__init__c              	   C  s   t d tj| jtj| _ttj	| j | 
| j t d t d tjtj	tj	tj	tdd}tj|d| jgtjd}| }d|v rVd	| }t|t d d
S )z1Copy FASTA to fasta/genome.fa and samtools index.z2Writing genome FASTA file into reference folder......done
zIndexing genome FASTA file...r   samtoolsfaidx)stderrzIgnoring duplicate sequencezmkref failed - your fasta file has duplicated sequence names in it.Please give every contig a unique name.
samtools faidx output is: N)printospathjoinr   cr_constantsZREFERENCE_FASTA_PATHr!   mkdirdirnamewrite_genome_fasta__file__
tk_subproccheck_output
subprocessSTDOUTdecoder	   )r%   r+   stdoutoutputZ	error_msgr   r   r   process_fastaM   s(   zReferenceBuilder.process_fastaNcontig_lengthsdict[str, int] | Noneno_transcript_failboolc                 C  s   | j sJ dtj| j sJ dtd | jdusJ tjt| jtj	}|| _
ttj| | j|||d}td |S )z+Parse and write GTF and compute gtf pickle.zFASTA path must be providedz)FASTA file not present in fasta/genome.faz/Writing genes GTF file into reference folder...Nr?   rA   r*   )r!   r/   r0   existsr.   r   r1   r   r2   ZREFERENCE_GENES_GTF_PATHr"   r3   r4   write_genome_gtf)r%   r?   rA   r"   has_warningr   r   r   process_gtfi   s   zReferenceBuilder.process_gtfc                 C  s8   t jt jtjd}t|| jg | jd | _d S )Nbgzip.gz)	r/   r0   r1   r4   sys
executabler9   
check_callr"   )r%   rH   r   r   r   gzip_gtf~   s   zReferenceBuilder.gzip_gtfc                 C  s  t d t d t| j}t d t d t| j}t d tj| jdtt	t
| jd tj| jtj|tjd |tjdd	 | jD tjd
d	 | jD tj| jtj| ji	}|r]|| tj| jtj}t|d}tj||ddd W d   n1 sw   Y  t d dS )z7Compute hash of FASTA and GTF and write reference.json.z:Writing genome metadata JSON file into reference folder...z&Computing hash of genome FASTA file...r*   z#Computing hash of genes GTF file...threadsg       @rI   c                 S     g | ]}t j|qS r   r/   r0   basename.0xr   r   r   
<listcomp>   s    z5ReferenceBuilder.compute_metadata.<locals>.<listcomp>c                 S  rO   r   rP   rR   r   r   r   rU      s    wTr   )	sort_keysindentN) r.   r   r!   r"   r2   ZREFERENCE_GENOMES_KEYr   r   mathceilfloatr   ZREFERENCE_MEM_GB_KEYZREFERENCE_FASTA_HASH_KEYZREFERENCE_GTF_HASH_KEYZREFERENCE_INPUT_FASTA_KEYr   ZREFERENCE_INPUT_GTF_KEYr   ZREFERENCE_VERSION_KEYr   ZREFERENCE_MKREF_VERSION_KEYr    updater/   r0   r1   r   ZREFERENCE_METADATA_FILEopentk_safe_jsonZ
dump_numpy)r%   Zextra_data_dictZ
fasta_hashZgtf_hashmetadataZnew_metadata_jsonfr   r   r   compute_metadata   s4   




z!ReferenceBuilder.compute_metadatac                 C  sH   t d tj| jtj}t|}|j| j	| j
| j| jd t d dS )z/Generate STAR index for use with GEX pipelines.zMGenerating STAR genome index (may take over 8 core hours for a 3Gb genome)...)r   r   z	...done.
N)r.   r/   r0   r1   r   r2   ZREFERENCE_STAR_PATHSTARindex_reference_with_mem_gbr!   r"   r   r   )r%   Znew_star_pathZstarr   r   r   make_star_index   s   z ReferenceBuilder.make_star_indexc                 C  s   | j sJ | j d }tj|sJ i }t|%}|D ]}| d}|s'q|d }t|d }|||< qW d   n1 s@w   Y  |sStd| j  d| j	 |S )zMDetermine contigs in reference and their lengths from the genome.fa.fai file.z.fai	r   r   Nz*The samtools-constructed FASTA index file z7 is empty. The supplied FASTA file(s) have no contigs: )
r!   r/   r0   rD   r]   rstripsplitr   r   r   )r%   r,   r?   finlinefieldschromlengthr   r   r   get_contig_lengths   s*   



	
z#ReferenceBuilder.get_contig_lengthsout_dir_existsc              
   C  s   |st d| j  t| j t d |   |  }| j|dd |   z|   W n t	j
yI } ztd|jd  d| d|d	}~ww |   |   d	S )
z8Construct a cellranger/spaceranger-compatible reference.z!Creating new reference folder at r*   F)rA   zFailed to make genome index with STAR.  This can occasionally be caused by setting the argument `memgb` too low.
Error was from running command 'r   z'
z/

Check stdout and stderr for more information.N)r.   r   r/   r3   r>   rm   rG   validate_gtfrd   r9   CalledProcessErrorr   cmdrM   ra   )r%   rn   r?   errr   r   r   build_gex_reference   s0   z$ReferenceBuilder.build_gex_referencer&   c                 C  s   |r| j | _d S t| j dkrCtdd | j D }g | _| j D ]!}|}t||k r3|d|t|  7 }|| jvs:J | j| qd S | j | _d S )Nr   c                 s  s    | ]}t |V  qd S N)len)rS   gr   r   r   	<genexpr>   s    z:ReferenceBuilder.format_genome_prefixes.<locals>.<genexpr>_)r   r#   ru   maxappend)r%   r&   Z
max_lengthgenomegenome_prefixr   r   r   r$      s   
z'ReferenceBuilder.format_genome_prefixesout_fasta_fnos.PathLike | str | bytesc           
   
   C  sL  | j D ]7}t|d(}|d}|dkrtd| d|dkr+td| d|dW d    n1 s5w   Y  qt| jdkrt|d	J}t| j| j D ]9\}}t|)}|D ]}	|	 }	|		d
rqd
| d |	dd   }	|
|	d  qZW d    n1 sw   Y  qOW d    d S 1 sw   Y  d S t| j d | d S )Nrbr       zInput FASTA file z	 is empty   >z is invalid. The first byte = zP but it must be '>'. Note that gzipped FASTA files cannot be processed by mkref.rV   >rx   
r   )r   r]   readr   ru   r   zipr#   stripr   writeshutilcopy)
r%   r}   fnrh   Zbyte1r`   r|   in_fasta_fnrv   ri   r   r   r   r5      s6   




"	z#ReferenceBuilder.write_genome_fasta
out_gtf_fnc                   s
  d}| j d us	J t|d}tj|dtjddd}t| j | jD ]\}}t| jdkrL|d t fddd}	|rI fdd|	 D nd }
nddd}	|}
i }t
 }d}| j||
|dD ]k\}}}|ro|| qb|	|d }||d< d|v r|	|d |d< |d }||v r|| |kr|| qb|||< d|v r|	|d |d< d|v r|	|d |d< | j|dd|d< |t|d dk7 }|| qb|dkrt|dt|dkrd}td tdt|d  td q!W d    |S 1 sw   Y  |S )NFrV   re    r   )Z	delimiterZquotingZ	quotecharZlineterminatorr   rx   sstrc                 S  s   ||  S rt   r   )r   pr   r   r   prefix_func  s   z6ReferenceBuilder.write_genome_gtf.<locals>.prefix_funcc                   s(   i | ]\}}| r| d  |qS rt   )r   )rS   Zcontigrl   Zlen_pfxZpfxr   r   
<dictcomp>   s    z5ReferenceBuilder.write_genome_gtf.<locals>.<dictcomp>c                 S  s   | S rt   r   )r   r   r   r   r   *  s   r   rC   Ztranscript_idZgene_idZ	gene_nameT)Zuniquify_keys      Zexonz8The supplied GTF file does not contain any exon featureszMWARNING: The following transcripts appear on multiple chromosomes in the GTF:ziThis can indicate a problem with the reference or annotations. Only the first chromosome will be counted.)r   r   )r#   r]   csvwriterZ
QUOTE_NONEr   r   ru   r   itemssetZgtf_reader_iterZwriterowaddZformat_properties_dictr   r   r.   r1   list)r%   r   r?   rA   rF   r`   r   r|   	in_gtf_fnr   Zstripped_contig_lengthsZtranscript_to_chromZcross_chrom_transcriptsZnum_rowsrowZ
is_commentZ
propertiesrk   Zcurr_txr   r   r   rE     s   	



KKz!ReferenceBuilder.write_genome_gtfc                 C  s   t j| jdd2}zt| j|jg}tj|ddd W n tjy1 } z
td|j	
  |d}~ww W d   dS 1 s=w   Y  dS )zVerifies that the GTF file of the reference can be loaded.

        Uses the Rust GTF indexing code path, and makes sure it succeeds.
        z.json)dirsuffixT)checkcapture_outputzError detected in GTF file: N)tempfileNamedTemporaryFiler   _GTF_TO_GENE_INDEXnamer9   runrp   r   r-   r;   )r%   Zout_filerq   excr   r   r   ro   `  s   "zReferenceBuilder.validate_gtf)r   r   F)r   r   r   r   r   r   r   r   )NF)r?   r@   rA   rB   rt   )F)rn   rB   )r&   rB   )r}   r~   )r   r~   r?   r@   rA   rB   )r
   r   r   r)   r>   rG   rM   ra   rd   rm   rs   r$   r5   rE   ro   r   r   r   r   r   (   s&    	$
!


Ur   c                   @  s2   e Zd Zdd Zd	ddZ					d
ddZdS )rb   c                 C  s
   || _ d S rt   )reference_star_path)r%   r   r   r   r   r)   q  s   
zSTAR.__init__r   Nc              	   C  s\  t tj|}t |t d }d}t|}|D ]}	|	 dr&|d7 }qW d    n1 s1w   Y  tdtt	
|dd d }
tdtt	
|| d}|d u r[d }d }nEt dd	|
  t d }|}t|| d
 }||k r}td||f |d }td|d }t d| t || |  }tdtt	|}| j|||||
||d d S )Ni ʚ;r   r   r      r      r   r      zqSTAR requires at least %d GB of memory when aligning reads to your reference.
Please start again with --memgb=%d.i   @)r   sa_sparse_dsa_index_n_baseschr_bin_n_bits	limit_ram)r[   r/   r0   getsizer]   r   r   minr   rY   logr   ry   rZ   index_reference)r%   r   r   r   r   Zgenome_size_bZgenome_size_gbZgenome_num_chrsr`   ri   r   r   r   r   Zsa_index_mem_gbZgenome_mem_gbZ
min_mem_gbr   r   r   rc   t  sN   


z STAR.index_reference_with_mem_gbc           
      C  s   t j| jrtd| j dt | j t jtdddd| jdt|d|d	|g}|d ur7|d
t|g7 }|d urC|dt|g7 }|d urO|dt|g7 }|d ur[|dt|g7 }zt	
| W d S  tjy{ }	 z|	jdkrutd|	d }	~	ww )NzSTAR reference path z already existsrb   z	--runModeZgenomeGeneratez--genomeDirz--runThreadNz--genomeFastaFilesz--sjdbGTFfilez--limitGenomeGenerateRAMz--genomeSAsparseDz--genomeSAindexNbasesz--genomeChrBinNbitszmkref has failed because it is running on a computer that does not support some required instructions (AVX).  Please contact support@10xgenomics.com to learn how to work around this issue.)r/   r0   rD   r   	Exceptionr3   r1   _LIB_BINr   r7   rL   r9   rp   
returncodeRuntimeError)
r%   r   r   r   r   r   r   r   argsrr   r   r   r   r     sB   

zSTAR.index_reference)r   N)r   NNNN)r
   r   r   r)   rc   r   r   r   r   r   rb   p  s    
:rb   )%
__future__r   r   rY   r/   r   r9   rJ   r   Zsixr   Zcellranger.constants	constantsr2   Ztenkit.log_subprocessZlog_subprocessr7   Ztenkit.safe_jsonZ	safe_jsonr^   Zcellranger.referencer   r   r   Zcellranger.reference_hashr   r0   r1   r4   r6   r   encoder   r   r	   r   rb   r   r   r   r   <module>   s0   &  J