o
    fI>                  
   @   sb  d dl Z d dlZd dlmZ d dlZd dlZd dlZd dlZ	d dl
Z
d dlZd dlmZmZ g dZe	je	je	je	je	je	je	je	je	je	jg
Zdd Zejg ddd	Zejfd
dZdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd6ddZ dd gZ!d!d" Z"d7d$d%Z#d7d&d'Z$d(d) Z%d*d+ Z&d,d- Z'd.d/ Z(d0d1 Z)d2d3 Z*d4d5 Z+dS )8    N)	ExitStack)tqdmtrange)
chr
peak_startZpeak_end	peak_nameZ
peak_scoreZpeak_strandZpeak_signalZ	peak_pvalZ	peak_qvalpeak_summitc           
         s  t j| dtdd tdjt dt dt d | t ddjd	d
 }g }|d urUt|}|D ]}|	d
dd }|| q4W d    n1 sPw   Y  t|  fdd|djddD }|| dd t|D }	|t d|	d}|S )NF	)
has_headerZnew_columns	separator
quote_charZdtypesr   r   r   r   )r   peak_region_startr   peak_idname
r   c                    s   g | ]}| vr|qS  r   .0iZchrom_order_setr   ;/users/shouvikm/ChromBPNet/finemo_gpu/src/finemo/data_io.py
<listcomp>(   s    zload_peaks.<locals>.<listcomp>T)maintain_orderc                 S   s   i | ]\}}||qS r   r   )r   indvalr   r   r   
<dictcomp>*       zload_peaks.<locals>.<dictcomp>chr_id)plscan_csvNARROWPEAK_SCHEMANARROWPEAK_DTYPESselectcolwith_row_countcollectopenrstripsplitappendset
get_columnuniqueextend	enumeratewith_columnsZmap_dictalias)

peaks_pathchrom_order_path
half_widthpeakschrom_orderflinechromZchrom_order_peaksZchrom_ind_mapr   r   r   
load_peaks   s8   

 
r:   )ACGTS1dtypec                 C   sF   |   } tj| ddd}|d d d f td d d f k|}|S )NzUTF-8r?   r@   )uppernpZ
frombufferencodeSEQ_ALPHABETastype)sequencerA   Zseq_chararrayZone_hotr   r   r   one_hot_encode5   s   &rH   c              
   C   s  | j }tj|d|d ftjd}tj||d ftjd}tj|dd}dd |D }tjt||d ftjd}	ztt	| j
dd	d d
|dD ]d\}
}|d }|d }|d|  }|| || }|j}|j}|j}|| }|| }||krt|||
d d ||f< t	|D ]\}}t|j|||dd|	|d d f< qtj|	dd||
||f< qGW |D ]}|  q||fS |D ]}|  qw )N      r@   F)Zone_based_attributesc                 S   s   g | ]}t |qS r   )pyBigWigr'   r   r   r   r   r   F   r   z(load_regions_from_bw.<locals>.<listcomp>T)Znamedregions)ZdisableZunittotalr   r   )numpyr   axis)heightrC   Zzerosint8float16pyfaidxZFastalenr   r/   Z	iter_rowsseqstartendrH   
nan_to_numvaluesmeanclose)r5   fa_pathbw_pathsr4   Z	num_peaks	sequencescontribsZgenomeZbwsZcontrib_bufferr   Zrowr9   rW   rX   Zsequence_datarG   Z	start_adjZend_adjabjZbwr   r   r   load_regions_from_bw>   s<   $&

rd   c                    s   t  Mfdd| D }|d d jd d | d|   |d d d d d d  f tj}tj fdd|D dtjd}W d    ||fS 1 sSw   Y  ||fS )	Nc                       g | ]
}  t|qS r   Zenter_contexth5pyFiler   stackr   r   r   g       z3load_regions_from_chrombpnet_h5.<locals>.<listcomp>r   zraw/seqrJ   c              	      s0   g | ]}t |d  dddd f qS )zshap/seqN)rC   rY   r   r7   rX   rW   r   r   r   m      0 rP   rA   )r   shaperF   rC   rR   r[   rS   h5_pathsr4   h5sr_   r`   r   rX   rj   rW   r   load_regions_from_chrombpnet_h5e   s   *$
		rv   c                    s   t  Qfdd| D }|d d jd d | d|   |d d d d  d d f ddtj}tj fdd|D dtjd	}W d    ||fS 1 sWw   Y  ||fS )
Nc                    re   r   rf   r   ri   r   r   r   t   rk   z.load_regions_from_bpnet_h5.<locals>.<listcomp>r   Z
input_seqsrJ      c              	      s8   g | ]}t |d  dd ddf ddqS )Z
hyp_scoresNrx   rJ   )rC   rY   swapaxesrm   rn   r   r   r   z   s   8 rp   )r   rq   ry   rF   rC   rR   r[   rS   rr   r   ru   r   load_regions_from_bpnet_h5r   s   2$
		rz   c                 C   s*   t | }t|t jr|}|S |d }|S )NZarr_0)rC   load
isinstanceZndarray)pathr7   Zarrr   r   r   load_npy_or_npz   s   
r~   c                    sv   t |}|jd d | d|   |d d d d  f tj} fdd| D }tj|dtjd}||fS )Nrl   rJ   c              	      s0   g | ]}t t|d d d d  f qS )N)rC   rY   r~   )r   prn   r   r   r      ro   z1load_regions_from_modisco_fmt.<locals>.<listcomp>r   rp   )r~   rq   rF   rC   rR   r[   rS   )shaps_pathsohe_pathr4   Zsequences_rawr_   Zshapsr`   r   rn   r   load_regions_from_modisco_fmt   s   "r   c                 C   s   t | }|d |d fS )Nr_   contributions)rC   r{   )Znpz_pathdatar   r   r   load_regions_npz   s   
r   c                 C   s   t j|| |d d S )N)r_   r   )rC   Zsavez_compressed)r_   r   out_pathr   r   r   write_regions_npz   s   r   c                 C   s`   t jt | dd}t || }t ||k}tt |d}tt |d t|}||fS )z
    Adapted from https://github.com/jmschrei/tfmodisco-lite/blob/570535ee5ccf43d670e898d92d63af43d68c38c5/modiscolite/report.py#L213-L236
    r   rO   rx   )rC   sumabsmaxZnonzerominrU   )cwmtrim_thresholdscoreZtrim_threshZ	pass_indsrW   rX   r   r   r   
trim_motif   s   r   d   c                 C   s6   | t j| ddd }t || }|t j|ddd S )Nrx   TrP   Zkeepdimsr   )rC   r[   expr   )xZtempZnorm_xr   r   r   r   softmax   s   r   Zpos_patternsZneg_patternsc              	   C   s  g g g g g d}g }t | d*}tD ]}|| vrq|| }dd }tt| |dD ]\}	\}
}| d|
 }|d dd j}t	|d	 
 }|| }|ddd
ddd
f }t||\}}t||\}}|dkry|}|}|}nl|dkr|d dd j}t	|d	 
 }|| }|ddd
ddd
f }nE|dkr|d dd j}d}|tj
|ddd }|ddd
ddd
f }n|dkr|d dd j}d}t|}|ddd
ddd
f }|d | |d d |d | |d | |d | |d | |d d |d | |d | |d | |||g q0qW d   n	1 s?w   Y  t|jdd}tj|tjdd}||fS )z
    Adapted from https://github.com/jmschrei/tfmodisco-lite/blob/570535ee5ccf43d670e898d92d63af43d68c38c5/modiscolite/report.py#L252-L272
    )
motif_namemotif_strandmotif_start	motif_endmotif_scalerc                 S      t | d dd S Nr   _rl   intr)   r   r   r   r   <lambda>   r   z%load_modisco_motifs.<locals>.<lambda>key.Zcontrib_scoresNrJ   rl   r   hcwmZhypothetical_contribsZpfmrG   rx   r   Tr   Zpfm_softmaxr   r   +r   r   r   -motif_idr   )rA   rP   )rg   rh   MODISCO_PATTERN_GROUPSkeysr/   sorteditemsr>   rC   Zsqrtr   r   r   r*   r.   r   	DataFramer%   rj   rS   )modisco_h5_pathr   
motif_typeZmotif_data_lstsZ	motif_lstmodisco_resultsr   metaclusterr   r   pattern_namepatternpattern_tagZcwm_rawZcwm_normZcwm_fwdZcwm_revZ	start_fwdZend_fwdZ	start_revZend_revZ	motif_fwdZ	motif_revZ
motif_normZ	motif_raw	motifs_dfcwmsr   r   r   load_modisco_motifs   sl   
"<r   Fc                 C   s2   t j| dd dt dd}|r|S | S )Nr	   )r   r   rx   count)r   r    r0   litr1   r&   )	hits_pathlazyhits_dfr   r   r   	load_hits   s   r   c              
      s  g }g }g }g }g }	t | d}
tD ]w}||
 vrq|
| }dd }tt| |dD ][\}\}}| d|  |d d d  }|d d d  }|d d d  }|d	 d d  tj	}t
|d
 d }|| || || || |	 fddt|D  q.qW d    n1 sw   Y  t|t|t|t||	d}|| }t|j| dddjtdtdtd | tdtd | tdtdtdtddjg dd}|r|}|S | }|S )Nr   c                 S   r   r   r   r   r   r   r   r     r   z&load_modisco_seqlets.<locals>.<lambda>r   r   zseqlets/startzseqlets/endzseqlets/is_revcompzseqlets/example_idxzseqlets/n_seqletsr   c                    s   g | ]} qS r   r   )r   r   r   r   r   r   %  s    z(load_modisco_seqlets.<locals>.<listcomp>)seqlet_start
seqlet_end
is_revcompr   r   r   innerZonZhowr   r   r   r   r   r   )r   start_untrimmedend_untrimmedr   r   r   r   )r   r   r   r   )subset)rg   rh   r   r   r/   r   r   rF   rC   Zuint32r   r*   r.   rangeZconcatenater   Z	LazyFramejoinr   r#   r$   r-   r&   )r   peaks_dfr4   modisco_half_widthr   Z	start_lstZend_lstZis_revcomp_lstZpeak_id_lstZpattern_tagsr   r   r   r   r   r   r   ZstartsZendsZis_revcompsZpeak_idsZ	n_seqletsZdf_dataoffset
seqlets_dfr   r   r   load_modisco_seqlets  sh    



r   c                 C   s  t j|dd t j|d}t j|d}t j|d}|  j| dddj| dddj| d	ddjtd
tdtdtd td tdtd td tdtd tdtd | tdtdtdtdtd tdtdtddd
dg	d
}	|	j
g ddd}
|
jtdtdtdtdtdd tdd}|	 j|dd  |
 j|dd  | j|d!dd" d S )#NTexist_okhits.tsvhits_unique.tsvzhits.bedr   r   r   r   r   r   r   	hit_startr   r   r   hit_coefficienthit_correlationhit_importanceglobal_scaler   r   )r   r   rW   rX   r   r   r   r   r   r   strandr   r   rW   )r   rW   r   r   r   r   rX   i  r   )r   rW   rX   r   r   r   r	   r   F)r
   r   )osmakedirsr}   r   r   r#   r   r$   sortdropr-   r&   	write_csv)r   r   r   qc_dfout_dirmotif_widthout_path_tsvout_path_tsv_uniqueZout_path_beddata_alldata_uniqueZdata_bedr   r   r   
write_hitsE  s^   

r   c           	      C   s  t j|dd t j|d}t j|d}|  j| dddj| dddjtd	td
td td
td td
td
| tdtdtdtdtd tdtd	tdd	ddg}|j
g ddd}| j|dd | j|dd d S )NTr   r   r   r   r   r   r   ZNAr   r   r   r   r   r   r   r   r   )r   rW   rX   r   r   r   r   r   r   r   r   r   rW   )r   rW   r   r   r   r	   r   )r   r   r}   r   r   r#   r   r   r$   r   r-   r&   r   )	r   r   r   r   r   r   r   r   r   r   r   r   write_hits_no_peaks{  s>   

r   c                 C   s>   |   j|  dddddgd }|j|dd d S )Nr   r   r   r   r   r	   r   )r   r   r   r   r&   r   )r   r   r   dfr   r   r   write_qc  s   
r   c                 C   s$   |   d }|j|dd d S )Nr   r	   r   )r   r   r&   r   )r   r   r   r   r   r   write_qc_no_peaks  s   r   c                 C   s@   t |d}tj| |dd W d    d S 1 sw   Y  d S )NwrI   )Zindent)r'   jsondump)paramsr   r7   r   r   r   write_params  s   "r   c                 C   s   | j |dd d S )Nr	   r   )r   )occ_dfr   r   r   r   write_occ_df  s   r   c           	   	   C   s   t j|d}t j|dd | D ])\}}t j||}t j|dd | D ]\}}tt j|| d| q(q| jt j|ddd d S )NCWMsTr   z.txtzseqlet_recall.tsvr	   r   )r   r}   r   r   r   rC   Zsavetxtr   )		recall_dfr   r   Zcwms_dirmvZ	motif_dirZcwm_typer   r   r   r   write_recall_data  s   r   )r   )F),r   r   Z
contextlibr   rN   rC   rg   Z
hdf5pluginpolarsr   rK   rT   r   r   r!   ZUtf8ZUInt32ZFloat32r"   r:   ZarrayrE   rR   rH   rd   rv   rz   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   sJ     	'	

J
	=6$
