o
    1wiV                     @   s4  d Z ddlZddlZddlZddlZddlmZ	 ddl
mZ ddlmZ ddlZddlZddlZddlZddlZddlmZ ddlZdejd< ejdeeejd  ddlmZmZm Z m!Z!m"Z"m#Z# d d
dZ$d!ddZ%d ddZ&d"ddZ'd#ddZ(			d$ddZ)d%ddZ*dd Z+e,dkre+  dS dS )&z
Generate combined figure with FIMO scans and ChromBPNet predictions.

Usage:
    python regenerate_plot_svg.py <variant_id> <model_name> <output_format>
    python regenerate_plot_svg.py variant_1 thyroid_gland__ENCSR474XFV svg
    N)GridSpec)Pathnonezsvg.fonttypevarbook)_extract_variant_sequences_plot_profile
_plot_shap_plotter_shap_resolve_model_metadata _process_single_model_with_folds,  c                 C   sZ   t |}t| } |d }|d }|| }	|| }
t||  |	|
 j }|}||	|fS )ap  
    Extract sequence around variant for FIMO scanning.
    
    Parameters:
    -----------
    chrom : str
        Chromosome
    pos : int
        1-based variant position
    ref : str
        Reference allele
    alt : str
        Alternate allele
    genome_fa : str
        Path to genome FASTA
    window_size : int
        Window size around variant
    
    Returns:
    --------
    sequence : str
        DNA sequence around variant (reference allele)
    start_pos : int
        Genomic start position of sequence (0-based)
    variant_pos_in_seq : int
        Position of variant within sequence (0-based)
          )pyfaidxFastastrsequpper)chromposrefalt	genome_fawindow_sizegenome
pos_0basedhalf_window	seq_startseq_endsequencevariant_pos_in_seq r!   regenerate_plot_svg.pyextract_sequence_for_fimo,   s   

r#   {Gz?c                    s  t j|d}t|d}|d| d|  d W d   n1 s#w   Y  t j|d}ddt|d	|d
d||g	}tdd|  tj|ddd}	|	j	dkr[t
d|	j t j|dd}
t j|
spt j|d}
t j|
s}td|
 tj|
dddddddddddddddd j fd d!jD d"g d#}fd$d%|D }|rtd&| td't d( S ))ao  
    Run FIMO scan on sequence using JASPAR database.
    
    Parameters:
    -----------
    sequence : str
        DNA sequence to scan
    sequence_name : str
        Name for sequence (used in FIMO output)
    jaspar_meme_file : str
        Path to JASPAR CORE database in MEME format
    output_dir : str
        Directory for FIMO output
    pvalue_threshold : float
        P-value threshold for FIMO (default: 1e-2)
    
    Returns:
    --------
    fimo_df : pd.DataFrame
        FIMO results with columns: pattern_name, sequence_name, start, stop, 
        strand, score, p-value, q-value, matched_sequence
    zsequence.faw>
Nzfimo.txtfimoz--threshz--ocz--ozRunning FIMO:  T)capture_outputtextr   zFIMO failed: zfimo.tsvz#FIMO output not found. Expected at 	#)sepcommentpattern_namesequence_namestartstopstrandscorep-valueq-valuematched_sequence)zpattern namezsequence namer2   r3   endr4   r5   r6   pvaluer7   qvaluezmatched sequencec                    s   i | ]	}|  ||qS r!   )get.0col)column_mappingr!   r"   
<dictcomp>       z!run_fimo_scan.<locals>.<dictcomp>columns)r0   r2   r3   r6   c                    s   g | ]	}| j vr|qS r!   rC   r=   )fimo_dfr!   r"   
<listcomp>   rB   z!run_fimo_scan.<locals>.<listcomp>z&FIMO output missing required columns: zFound z
 FIMO hits)ospathjoinopenwriter   print
subprocessrun
returncodeRuntimeErrorstderrexistsFileNotFoundErrorpdread_csvrenamerD   
ValueErrorlen)r   r1   jaspar_meme_file
output_dirpvalue_threshold
fasta_fileffimo_outputcmdresultfimo_tsvrequired_colsmissing_colsr!   )r@   rE   r"   run_fimo_scanW   sT   	
rd   c                 C   s   || d }| d d | | d< | d d | | d< dd }| d  || d	< d
d }| d  || d< |d }| | d | k| d |k@   } | S )a  
    Process FIMO results and convert to plot coordinates.
    
    Parameters:
    -----------
    fimo_df : pd.DataFrame
        FIMO results
    variant_pos_genomic : int
        Genomic position of variant (1-based)
    seq_start_genomic : int
        Genomic start position of sequence (0-based)
    window_size : int
        Window size
    
    Returns:
    --------
    fimo_df : pd.DataFrame
        FIMO DataFrame with added columns:
        - start_rel: Start position relative to variant (variant at 0)
        - end_rel: End position relative to variant
        - color: Color based on p-value
        - tf_name: Cleaned TF name
    r   r2   	start_relr3   end_relc                 S   sB   d| v r|  d}t|dkr|d drd|dd  S | S )N_r   r   MA)splitrX   
startswithrI   )r0   partsr!   r!   r"   clean_tf_name   s
   
z+process_fimo_results.<locals>.clean_tf_namer0   tf_namec                 S   s4   | dk rdS | dk rdS | dk rdS | dk rdS d	S )
Ngh㈵>navyg-C6?
mediumbluegMbP?	lightbluer$   	lightgray	gainsboror!   )pvalr!   r!   r"   assign_color   s   z*process_fimo_results.<locals>.assign_colorr6   colorr   )applycopy)rE   variant_pos_genomicseq_start_genomicr   r    rl   rt   r   r!   r!   r"   process_fimo_results   s    
rz      c                 C   s  |d }|  | | | g d | jddddd | j|d |d dd	dd
 | j|d |d dddd
 |ddg}i }d}d}	| D ]t\}
}|d }|d }|d }d}tdD ]3}||	 }d}| D ]\}\}}t|| dk r||k s||ksd} nqj|s|}||f||<  nq^|du rqH|| }| j	|||d|d dddd || d }| j
|||ddddd qH|r| dt| d  n| dd | jdddd d! | g  | jd" d | jd# d | jd$ d dS )%a  
    Plot FIMO scan tracks.
    
    Parameters:
    -----------
    ax : matplotlib.axes.Axes
        Axis for FIMO section
    fimo_df : pd.DataFrame
        FIMO DataFrame with start_rel, end_rel, tf_name, color
    window_size : int
        Window size
    protein_footprint : tuple
        (start, end) for protein footprint region
    variant_region : tuple
        (start, end) for variant region
    r   ir   2   r   black--r   ru   	linestyle	linewidth皙?rp   alpharu   zorder	peachpuffre   rm   g      ?rf   Nd   F      ?Tg333333?ru   )leftheightru   	edgecolorr   r   center      )havafontsizer         zPredicted TF-binding
ElementsZ   
      rotationlabelpadr   bottomtopright)set_xlim
set_xticksaxvlineaxvspansort_valuesiterrowsrangeitemsabsbarhr+   set_ylimmaxkeys
set_ylabelset_xticklabelsspinesset_visible)axrE   r   protein_footprintvariant_regionr   fimo_df_sortedy_positions	y_current	y_spacingidxrowrm   re   rf   y_posyy_valoverlaps
existing_yexisting_startexisting_endwidthmid_xr!   r!   r"   plot_fimo_scans   sf   
r   c                 C   sF   | D ]}|j |d |d dddd |j |d |d dddd qdS )z@
    Add shaded regions to all panels for visual alignment.
    r   r   r   rp   r   r   N)r   )	axes_listr   r   r   r!   r!   r"   add_shaded_regionsM  s   r   B  cudac           :      C   sb  t d t d|   t d|  t d t d tj|dd}d|jvr*td||d | k }t|d	krBtd
|  d| |jd	 }|d }t|d }d|jv rdd|jv rd|d }|d }nd|jv rwd|jv rw|d }|d }ntdt d| d| d| d|  t d t	||||||d\}}}t dt| d t d| d| d|t|   t d t
 (}t|| d| |||d}t||||d}t dt| d W d    n1 sw   Y  t d! t||}|d" }t d# t||||||d$\}}}}t d%| d& t d' t|||||d(|d)\}}} }!}"}#}$t d*|$d+ t d, tjd-d.d/}%td0d1|%g d2d3d4}&t d5 |%|&d	 }'t|'|||	|
d6 t d7 |%|&d1 }(|dkrrt|nd	})|dkr}t|nd	}*|dkr|nd8}+|dkr|nd8},t|||)|*|+|,||(|$d d9
 |(jd:d;d<d=d> |(jd:d=d? t|(|d t|(g|	|
 |%|&d@ }-|"d	   j}.|#d	   j}/dA}0|0d@ }1|d@ }2|.|1|2 |1|2 |)  }3tj|3g dBdC}4|4 j|2 7  _tj |4|-dD |-j!dEdFdGd1dH |-j!|)dI dFdGd1dH dJt"#t"$|3d	 }5dJt"%t"&|3d	 }6|-j'|6|5dK tj(dLdMdN|+ dOdPdQ|-j)d=dRt*dSdTdUdVdW	 |-jdXd;d<d=d> |-jdXd=d? t|-|d t|-g|	|
 |%|&dY }7|/|1|2 |1|2 |*  }8tj|8g dBdC}9|9 j|2 7  _tj |9|7dD |7j!dEdFdGd1dH |7j!|*dI dFdGd1dH dJt#t"#t"$|3d	t"#t"$|8d	 }5dJt%t"%t"&|3d	t"%t"&|8d	 }6|7j'|6|5dK tj(dLdMdZ|, dOdPdQ|7j)d=dRt*dSdTdUdVdW	 |7jd[d;d<d=d> |7jd[d=d? |7j+d\d=d? t|7|d t|7g|	|
 t d]| d^ |%j,||-d_d` dad.db t dc |%S )dzN
    Generate combined figure with FIMO scans and ChromBPNet predictions.
    zP================================================================================z(Generating combined figure for variant: zModel: z"
1. Loading variant information...r,   )r.   
variant_idz-variants_tsv must contain 'variant_id' columnr   zVariant z not found in chrr   r   r   allele1allele2zKVariants TSV must contain either 'ref'/'alt' or 'allele1'/'allele2' columnsz   Variant: :u   →z(
2. Extracting sequence for FIMO scan...)r   z   Sequence length: z bpz   Genomic region: -z
3. Running FIMO scan...)r[   z	   Found z FIMO hits in windowNz
4. Resolving model metadata...model_foldsz*
5. Extracting sequences for ChromBPNet...)	input_lenz   Extracted zbp sequencesz9
6. Generating ChromBPNet predictions and attributions...   )
n_shufflesdevicez
   LogFC: z.3fz
7. Creating figure layout...)r      i  )figsizedpi   r   )g      ?r   r   r   g333333?)figureheight_ratioshspacez
8. Plotting FIMO scans...)r   r   r   z&
9. Plotting ChromBPNet predictions... )logfc
logfc_pvalzPredicted Profilesr   r   r   r   )r   r   r   )ACGTrC   )r   r   kr   r   r   g?)r   r   gV-?gL7A`?zref ()r   r   r   roundwhite	lightgrey)boxstyle	facecolorr   )verticalalignmenthorizontalalignment	transformsizeru   bboxzContrib. Scores (STRwt)r   zalt (zContrib. Scores (STRmut)zRelative genomic position (bp)z
8. Saving figure to z....tight)formatbbox_inchesr   u   ✓ Figure saved successfully).rL   rT   rU   rD   rW   rX   ilocintindexr#   tempfileTemporaryDirectoryrd   rz   r
   r   r   pltr   r   add_subplotr   r   r   	set_titleconfigure_shared_xaxisr   cpunumpyr   	DataFrame	logomakerLogor   npr   maximumminminimumr   r+   	transAxesdict
set_xlabelsavefigri   ):r   
model_namevariants_tsvmodel_paths_tsvr   rY   output_filer   r   r   r   r[   r   variants_dfvariant_rowr   r   r   r   r   ry   r    temp_dirrE   metadatar   ref_seqalt_seqref_seq_stralt_seq_strref_profilealt_profile	ref_count	alt_countref_attralt_attrr   figgsax_fimoax1
ref_length
alt_length	ref_label	alt_labelax2ref_attr_npalt_attr_nptotal_lengthr   Fref_shap_plotdf_refymaxyminax3alt_shap_plotdf_altr!   r!   r"   generate_combined_figure\  s   



 "



**r0  c                 C   s:   |d }|  | | | g d | j|dddd dS )z'Configure shared x-axis for all panels.r   r   r   r   r   r   N)r   r   r   )r   variant_pos_relr   r   r!   r!   r"   r     s   r   c                  C   sd  t jdd} | jddd | jddd | jdg d	d
dd | jdddd | jdddd | jdddd | jdddd | jddd | jdtddd | jdd td!d"gd#d$ | jd%d td&d'gd(d$ | jd)td*d+d | jd,d-d-d.gd/d0 |  }|js|j d1|j d2|j	 |_t
|j|j|j|j|j|j|j|jt|jt|j|j|jd3 d S )4NzCGenerate combined figure with FIMO scans and ChromBPNet predictions)descriptionr   zVariant ID (e.g., 'variant_1'))helpr
  z/Model name (e.g., 'thyroid_gland__ENCSR474XFV')output_format)svgpngpdfr5  zOutput format (default: svg))choicesdefaultr3  z--variants-tsvTzPath to variants TSV file)requiredr3  z--model-paths-tsvzPath to model paths TSV filez--genome-fazb/oak/stanford/groups/akundaje/airanman/refs/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fastazPath to reference genome FASTA)r9  r3  z--jaspar-memez+Path to JASPAR CORE database in MEME formatz--output-filez*Output file path (default: auto-generated)z--window-sizer   z)Window size around variant (default: 300))typer9  r3  z--protein-footprintr   r|   r}   z+Protein footprint region (default: -40 -10))nargsr;  r9  r3  z--variant-regionr   r   zVariant region (default: -5 5)z--pvalue-thresholdr$   z&FIMO p-value threshold (default: 1e-2)z--devicer   r   z%Device for ChromBPNet (default: cuda))r9  r8  r3  rg   r   )r   r
  r  r  r   rY   r  r   r   r   r[   r   )argparseArgumentParseradd_argumentr   float
parse_argsr  r   r
  r4  r0  r  r  r   jaspar_memer   tupler   r   r[   r   )parserargsr!   r!   r"   main"  sp   


rF  __main__)r   )r$   )r   r{   r~   )r{   r~   )r   r   r{   r~   r$   r   )r   r   )-__doc__r=  r   r  pandasrT   
matplotlibmatplotlib.pyplotpyplotr   matplotlib.patchespatchesmpatchesmatplotlib.gridspecr   r   r   rM   r   rG   pathlibr   sysrcParamsrH   insertr   __file__parentvarbook.plot.variant.profilesr   r   r   r	   r
   r   r#   rd   rz   r   r   r0  r   rF  __name__r!   r!   r!   r"   <module>   sB   
 

+
U
K
V
 
;6
