o
    f?                     @   s`   d dl mZ ddlZddlZddlZdd Zdd Zdd	 Zd
d Z	dd Z
dd Zdd ZdS )   )data_io    Nc           	      C   s<   |d }t | d |}t ||||\}}t ||| d S N   )r   
load_peaksZload_regions_from_bwwrite_regions_npz)	
peaks_pathZfa_pathZbw_pathsout_pathregion_width
half_widthpeaks_df	sequencescontribs r   8/users/shouvikm/ChromBPNet/finemo_gpu/src/finemo/main.pyextract_regions_bw	   s   r   c                 C   *   |d }t | |\}}t ||| d S r   )r   Zload_regions_from_chrombpnet_h5r   Zh5_pathsr	   r
   r   r   r   r   r   r   extract_regions_chrombpnet_h5      r   c                 C   r   r   )r   Zload_regions_from_bpnet_h5r   r   r   r   r   extract_regions_bpnet_h5   r   r   c                 C   s,   |d }t | ||\}}t ||| d S r   )r   Zload_regions_from_modisco_fmtr   )Zshaps_pathsZohe_pathr	   r
   r   r   r   r   r   r   extract_regions_modisco_fmt"   s   r   c           $      C   s  t  }ddlm} t| \}}|jd }|d dkr#td| d|d }|d urSt|||}|j}||jd ksC||jd krRtd|j d|j d	| n|jd }|d
krad}d}n|dkrjd}d}n|dkrsd}d}n|dkr{d}d}t	|||\}}|jd }|jd }|
||||||||	|
||| |\}}t|} t|jdd}!tj|dd tj|d}"|d urt| |||!|| t|!||" nt| ||!|| t|!|" |||||dO }tj|d}#t||# d S )Nr   )	hitcallerr   r   zRegion width of z is not divisible by 2.zInput sequences of shape z% and/or input contributions of shape z) are not compatible with region count of ppcwmFphThpZhcwmhhZpeak_id)nameexist_okzpeaks_qc.tsv)r
   num_regionsZuntrimmed_motif_width
num_motifszparameters.json)locals r   r   load_regions_npzshape
ValueErrorr   Zheightload_modisco_motifsZfit_contribsplZ	DataFrameZwith_row_countosmakedirspathjoinZ
write_hitsZwrite_qcZwrite_hits_no_peaksZwrite_qc_no_peaksZwrite_params)$regions_pathr   modisco_h5_pathZchrom_order_pathout_dircwm_trim_thresholdalphastep_size_maxstep_size_minconvergence_tol	max_steps
batch_sizestep_adjustdevicemodeno_post_filterparamsr   r   r   r
   r   r   r!   Z
motif_typeZuse_hypothetical_contribs	motifs_dfcwmsr"   motif_widthhitsZqchits_dfZqc_dfZout_path_qcZout_path_paramsr   r   r   	call_hits*   sj   




rB   c              	   C   s  ddl m} t| \}}t|jdkr|| }	nt|jdkr-|d d d d d f | }	|	jd d }
|d }t|d |
}tj|dd}tj|||
|dd}t	|dd\}}|
td	d
kd }|jd }|||\}}||	||||||\}}}tj|dd tj|d}t|| t||| |||| tj|d}|||| tj|d}||| tj|d}||| tj|d}|||| d S )Nr   )
evaluation   r   T)Zlazyr   r   Zmotif_strand+Z
motif_namer   zmotif_occurrences.tsvzmotif_cooocurrence.pngZCWMszhit_vs_seqlet_counts.pngzreport.html)r$   rC   r   r%   lenr&   r   Z	load_hitsZload_modisco_seqletsr(   filterr)   ZcolZ
get_columnZto_numpyZget_motif_occurencesZseqlet_recallr*   r+   r,   r-   Zwrite_occ_dfZwrite_recall_dataZplot_hit_distributionsZ!plot_peak_motif_indicator_heatmapZ	plot_cwmsZplot_hit_vs_seqlet_countsZwrite_report)r.   Z	hits_pathr/   r   r0   modisco_region_widthrC   r   r   regionsr   Zmodisco_half_widthr   rA   Z
seqlets_dfr=   Zcwms_modiscoZmotif_namesr?   Zocc_dfZcooocZrecall_dataZ	recall_dfr>   Zocc_pathZ
coooc_pathZplot_dirZ	plot_pathZreport_pathr   r   r   reportm   s>   


rJ   c            
      C   s  t  } | jddd}|jdt jdd}|jddtd	h d
dd |jddtddd |jddtddd |jddtd dd |jddtd dd |jddtddd |jddtd d!d |jd"d#td$d%d |jd&d'd(d)d* |jd+d,td-d.d |jd/d0td1d2d |jd3d4td5d6d |jd7d8td9d:d |jd;d<td=d>d |jd?d@tdAdBd |jdCdDtdEdFd |jdGt jdHd}|jddtddId |jd&dJtddKd |jd?dLtddMdNdO |jddPtddQd |jdRdStdTdUd |jdVt jdWd}|jd7dXtddMdYdO |jddPtddQd |jdRdStdTdUd |jdZt jd[d}|jd7dXtddMdYdO |jddPtddQd |jdRdStdTdUd |jd\t jd]d}|jd7dXtddMdYdO |jddPtddQd |jdRdStdTdUd |jd^t jd_d}|jd+d`tddad |jd"dbtddMdcdO |jddPtddQd |jdRdStdTdUd |jddt jded}|jddtddfd |jdgdhtddid |jddtddjd |jddtddkd |jddtddd |jdldmtdndod | 	 }	|	j
dkrt|	j|	j|	j|	j|	j|	j|	j|	j|	j|	j|	j|	j|	j|	j|	j|	j d S |	j
dGkr%t|	j|	j|	j|	j|	j  d S |	j
dVkr6t!|	j"|	j|	j  d S |	j
dZkrKt#dp t!|	j"|	j|	j  d S |	j
d\kr\t$|	j"|	j|	j  d S |	j
d^krot%|	j&|	j'|	j|	j  d S |	j
ddkrt(|	j|	j)|	j|	j|	j|	j* d S d S )qNTcmd)requiredZdestz	call-hitsz@Call hits on provided sequences, contributions, and motif CWM's.)Zformatter_classhelpz-Mz--moder   >   r   r   r   r   zThe type of attributions to use for CWM's and input contribution scores, respectively. 'h' for hypothetical and 'p' for projected.)typedefaultZchoicesrM   z-rz	--regionszpA .npz file of input sequences and contributions. Can be generated using `finemo extract-regions-*` subcommands.)rN   rL   rM   z-mz--modisco-h5z2A tfmodisco-lite output H5 file of motif patterns.z-pz--peakszA peak regions file in ENCODE NarrowPeak format, exactly matching the regions specified in `--regions`. If omitted, outputs will lack absolute genomic coordinates.)rN   rO   rM   z-Cz--chrom-orderzA tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.z-oz	--out-dirz!The path to the output directory.z-tz--cwm-trim-thresholdg333333?zNThe threshold to determine motif start and end positions within the full CWMs.z-az--alphag333333?zThe L1 regularization weight.z-fz--no-post-filterZ
store_truezDo not perform post-hit-calling filtering. By default, hits are filtered based on a minimum correlation of `alpha` with the input contributions.)actionrM   z-sz--step-size-maxg      @z The maximum optimizer step size.z-iz--step-size-ming{Gz?z The minimum optimizer step size.z-Az--step-adjustgffffff?zThe optimizer step size adjustment factor. If the optimizer diverges, the step size is multiplicatively adjusted by this factorz-cz--convergence-tolgMb@?zoThe tolerance for determining convergence. The optimizer exits when the duality gap is less than the tolerance.z-Sz--max-stepsi'  z)The maximum number of optimization steps.z-bz--batch-sizei  z%The batch size used for optimization.z-dz--deviceZcudazBThe pytorch device name to use. Set to `cpu` to run without a GPU.zextract-regions-bwz@Extract sequences and contributions from FASTA and bigwig files.z0A peak regions file in ENCODE NarrowPeak format.z--fastazcA genome FASTA file. If an .fai index file doesn't exist in the same directory, it will be created.z	--bigwigsrE   zvOne or more bigwig files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.)rN   rL   ZnargsrM   z
--out-pathz!The path to the output .npz file.z-wz--region-widthi  z?The width of the input region centered around each peak summit.zextract-regions-chrombpnet-h5zKExtract sequences and contributions from ChromBPNet contributions H5 files.z--h5szrOne or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.zextract-regions-h5zExtract sequences and contributions from ChromBPNet contributions H5 files. DEPRECATED: Use `extract-regions-chrombpnet-h5` instead.zextract-regions-bpnet-h5zFExtract sequences and contributions from BPNet contributions H5 files.zextract-regions-modisco-fmtzDExtract sequences and contributions from tfmodisco-lite input files.z--sequencesz9A .npy or .npz file containing one-hot encoded sequences.z--attributionszOne or more .npy or .npz files of hypothetical contribution scores, with paths delimited by whitespace. Scores are averaged across files.rJ   z<Generate QC outputs from hits and tfmodisco-lite motif data.z}A .npz file containing input sequences and contributions. Must be the same as those used for motif discovery and hit calling.z-Hz--hitszGThe `hits.tsv` output file generated by the `finemo call-hits` command.zjA file of peak regions in ENCODE NarrowPeak format, exactly matching the regions specified in `--regions`.z4The tfmodisco-lite output H5 file of motif patterns.z-Wz--modisco-region-widthi  zGThe width of the region around each peak summit used by tfmodisco-lite.zeWARNING: The `extract-regions-h5` command is deprecated. Use `extract-regions-chrombpnet-h5` instead.)+argparseZArgumentParserZadd_subparsersZ
add_parserZArgumentDefaultsHelpFormatterZadd_argumentstrfloatintZ
parse_argsrK   rB   rI   ZpeaksZ
modisco_h5Zchrom_orderr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r   ZfastaZbigwigsr	   r
   r   Zh5sprintr   r   Zattributionsr   rJ   r@   rH   )
ZparserZ
subparsersZcall_hits_parserZextract_regions_bw_parserZ$extract_chrombpnet_regions_h5_parserZextract_regions_h5_parserZextract_bpnet_regions_h5_parserZ"extract_regions_modisco_fmt_parserZreport_parserargsr   r   r   cli   sL  







rW   )r$   r   r*   rQ   Zpolarsr)   r   r   r   r   rB   rJ   rW   r   r   r   r   <module>   s    	C.