
    h׹              1          d Z ddlmZ ddlZddlZddlZddlZddlmZm	Z	 ddl
Z	 dDdedee         d	ed
e	e         dededdfdZ	 dDdee         dee         de	e         dededdfdZ	 dDdee         dee         de	e         dededdfdZ	 dDdee         dee         de	e         dedededdfdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dEdedee         dedee         d ee         d!ee         d"ee         d#ed$ee         d%ee         d&ed'ed(ed)ed*ed+ed,ed-ed.ed/ee         d0ed1ed2eddf0d3Z	 	 	 	 dFded6edee         dee         d ee         d!ee         d#ed7ed8ed9ed:eddfd;ZdGd=eded>eddfd?ZdHd@e	e         dedAeddfdBZdIdCZdS )Ja  Main CLI module for the Fi-NeMo motif instance calling pipeline.

This module provides the command-line interface for all Fi-NeMo operations:
- Data preprocessing from various genomic formats
- Motif hit calling using the Fi-NeMo algorithm
- Report generation and result visualization
- Post-processing operations (hit collapsing, intersection)

The CLI supports multiple input formats including bigWig, HDF5 (ChromBPNet/BPNet),
and TF-MoDISco format.
   )data_io    N)OptionalList  
peaks_pathchrom_order_pathfa_pathbw_pathsout_pathregion_widthreturnc                     |dz  }t          j        | ||          }t          j        ||||          \  }}	t          j        ||	||           dS )ax  Extract genomic regions and contribution scores from bigWig and FASTA files.

    Parameters
    ----------
    peaks_path : str
        Path to ENCODE NarrowPeak format file.
    chrom_order_path : str, optional
        Path to chromosome ordering file.
    fa_path : str
        Path to genome FASTA file.
    bw_paths : List[str]
        List of bigWig file paths containing contribution scores.
    out_path : str
        Output path for NPZ file.
    region_width : int, default 1000
        Width of regions to extract around peak summits.

    Notes
    -----
    BigWig files only provide projected contribution scores.
       peaks_dfN)r   
load_peaksload_regions_from_bwwrite_regions_npz)
r   r	   r
   r   r   r   
half_widthr   	sequencescontribss
             2/srv/www/kundaje/kobbad/Fi-NeMo/src/finemo/main.pyextract_regions_bwr      sg    : "J !*.>
KKH!6'8Z Ix
 i8hOOOOOO    h5_pathsc                     |dz  }| t          j        | ||          }nd}t          j        ||          \  }}t          j        ||||           dS )a	  Extract genomic regions and contribution scores from ChromBPNet HDF5 files.

    Parameters
    ----------
    peaks_path : str, optional
        Path to ENCODE NarrowPeak format file. If None, lacks absolute coordinates.
    chrom_order_path : str, optional
        Path to chromosome ordering file.
    h5_paths : List[str]
        List of ChromBPNet HDF5 file paths.
    out_path : str
        Output path for NPZ file.
    region_width : int, default 1000
        Width of regions to extract around peak summits.
    r   Nr   )r   r   load_regions_from_chrombpnet_h5r   	r   r	   r   r   r   r   r   r   r   s	            r   extract_regions_chrombpnet_h5r    A   sg    , "J%j2BJOO!A(JWWIxi8hOOOOOOr   c                     |dz  }| t          j        | ||          }nd}t          j        ||          \  }}t          j        ||||           dS )a  Extract genomic regions and contribution scores from BPNet HDF5 files.

    Parameters
    ----------
    peaks_path : str, optional
        Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
    chrom_order_path : str, optional
        Path to chromosome ordering file.
    h5_paths : List[str]
        List of BPNet HDF5 file paths.
    out_path : str
        Output path for NPZ file.
    region_width : int, default 1000
        Width of regions to extract around peak summits.
    r   Nr   )r   r   load_regions_from_bpnet_h5r   r   s	            r   extract_regions_bpnet_h5r#   c   sg    , "J%j2BJOO!<XzRRIxi8hOOOOOOr   shaps_pathsohe_pathc                     |dz  }| t          j        | ||          }nd}t          j        |||          \  }}	t          j        ||	||           dS )a  Extract genomic regions and contribution scores from TF-MoDISco format files.

    Parameters
    ----------
    peaks_path : str, optional
        Path to ENCODE NarrowPeak format file. If None, output lacks absolute coordinates.
    chrom_order_path : str, optional
        Path to chromosome ordering file.
    shaps_paths : List[str]
        List of paths to .npy/.npz files containing SHAP/attribution scores.
    ohe_path : str
        Path to .npy/.npz file containing one-hot encoded sequences.
    out_path : str
        Output path for NPZ file.
    region_width : int, default 1000
        Width of regions to extract around peak summits.
    r   Nr   )r   r   load_regions_from_modisco_fmtr   )
r   r	   r$   r%   r   r   r   r   r   r   s
             r   extract_regions_modisco_fmtr(      sp    2 "J%j2BJOO!?Xz Ix i8hOOOOOOr   333333?ffffff?      @{Gz?FMb@?'    ppregions_pathmodisco_h5_pathmotifs_include_pathmotif_names_pathmotif_lambdas_pathout_dircwm_trim_coords_pathcwm_trim_thresholds_pathcwm_trim_threshold_defaultlambda_defaultstep_size_maxstep_size_minsqrt_transformconvergence_tol	max_steps
batch_sizestep_adjustdevicemodeno_post_filtercompile_optimizerc                    t                      }ddl}ddlm} |t	          j        d           t          j        |           \  }}}}|j        d         }|dz  dk    rt          d| d          |dz  }|j        d         } |,t	          j        d	           t          j
        |||          }d
}|st	          j        d           |dk    rd}!d}"n4|dk    rd}!d
}"n)|dk    rd}!d}"n|dk    rd}!d
}"nt          d| d          |t          j        |          }#nd}#|t          j        |t                    }$nd}$|t          j        |t                    }%nd}%|t          j        |t                     }&nd}&|	t          j        |	t                    }'nd}'t          j        ||&|'|
|!|#|$|%|d

  
        \  }(})}*}+|)j        d         },|)j        d         }-|(                    d                              d
          }.||                    |          nd}/|                    |)|||*|"|.|||||||| |/|          \  }0}1t-          j        |d
           t,          j                            |d          }2t          j        |0||(|1||-           t          j        |1||2           t,          j                            |d          }3t          j        |(|3           t,          j                            |d          }4t          j        |)|4           ||| |-|,dz  }t,          j                            |d          }5t          j        ||5           dS )u  Call motif hits using the Fi-NeMo algorithm on preprocessed genomic regions.

    This function implements the core Fi-NeMo hit calling pipeline, which identifies
    motif instances by solving a sparse reconstruction problem using proximal gradient
    descent. The algorithm represents contribution scores as weighted combinations of
    motif CWMs at specific positions.

    Parameters
    ----------
    regions_path : str
        Path to NPZ file containing preprocessed regions (sequences, contributions,
        and optional peak coordinates).
    peaks_path : str, optional
        DEPRECATED. Path to ENCODE NarrowPeak format file. Peak data should be
        included during preprocessing instead.
    modisco_h5_path : str
        Path to TF-MoDISco H5 file containing motif CWMs.
    chrom_order_path : str, optional
        DEPRECATED. Path to chromosome ordering file.
    motifs_include_path : str, optional
        Path to file listing motif names to include in analysis.
    motif_names_path : str, optional
        Path to file mapping motif IDs to custom names.
    motif_lambdas_path : str, optional
        Path to file specifying per-motif lambda values.
    out_dir : str
        Output directory for results.
    cwm_trim_coords_path : str, optional
        Path to file specifying custom motif trimming coordinates.
    cwm_trim_thresholds_path : str, optional
        Path to file specifying custom motif trimming thresholds.
    cwm_trim_threshold_default : float, default 0.3
        Default threshold for motif trimming.
    lambda_default : float, default 0.7
        Default L1 regularization weight.
    step_size_max : float, default 3.0
        Maximum optimization step size.
    step_size_min : float, default 0.08
        Minimum optimization step size.
    sqrt_transform : bool, default False
        Whether to apply signed square root transform to contributions.
    convergence_tol : float, default 0.0005
        Convergence tolerance for duality gap.
    max_steps : int, default 10000
        Maximum number of optimization steps.
    batch_size : int, default 2000
        Batch size for GPU processing.
    step_adjust : float, default 0.7
        Step size adjustment factor on divergence.
    device : str, optional
        DEPRECATED. Use CUDA_VISIBLE_DEVICES environment variable instead.
    mode : str, default "pp"
        Contribution type mode ('pp', 'ph', 'hp', 'hh') where 'p'=projected, 'h'=hypothetical.
    no_post_filter : bool, default False
        If True, skip post-hit-calling similarity filtering.
    compile_optimizer : bool, default False
        Whether to JIT-compile the optimizer for speed.

    Notes
    -----
    The Fi-NeMo algorithm solves the optimization problem:
    minimize_c: ||contribs - reconstruction(c)||²₂ + λ||c||₁
    subject to: c ≥ 0

    where c represents motif hit coefficients and reconstruction uses convolution
    with motif CWMs.
    r   Nr   )	hitcallerzThe `--device` flag is deprecated and will be removed in a future version. Please use the `CUDA_VISIBLE_DEVICES` environment variable to specify the GPU device.r   zRegion width of z is not divisible by 2.zProviding a peaks file to `call-hits` is deprecated, and this option will be removed in a future version. Peaks should instead be provided in the preprocessing step to be included in `regions.npz`.TzQNo peak region data provided. Output hits will lack absolute genomic coordinates.r0   cwmFphhphcwmhhzInvalid mode: z(. Must be one of 'pp', 'ph', 'hp', 'hh'.lambda)writableexist_okzpeaks_qc.tsvmotif_data.tsvmotif_cwms.npy)r   num_regionsuntrimmed_motif_width
num_motifsparameters.json)localstorch rG   warningswarnr   load_regions_npzshape
ValueErrorr   load_txtload_mappingstrfloatload_mapping_tupleintload_modisco_motifs
get_columnto_numpyrB   fit_contribsosmakedirspathjoin
write_hitswrite_qcwrite_motifs_dfwrite_motif_cwmswrite_params)6r1   r   r2   r	   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   paramsrX   rG   r   r   r   	has_peaksr   r   rS   
motif_typeuse_hypothetical_contribsmotifs_includemotif_name_mapmotif_lambdastrim_coordstrim_thresholds	motifs_dfcwms
trim_masks_rU   motif_widthlambdas
device_objhits_dfqc_dfout_path_qcout_path_motif_dfout_path_motif_cwmsout_path_paramss6                                                         r   	call_hitsr      s   z XXFLLL o	
 	
 	
 07/G/U/U,Ix9?1%La1QLQQQRRR"J.#K T	
 	
 	
 %j2BJOO	 
_	
 	
 	
 t||
$)!!	
$(!!	
$)!!	
$(!!KTKKK
 
 	
 & )*=>># -.>DD%,-?GG'01EsKK+!./GOO%,%@"& &"ItZ AJ*Q-K""8,,55t5DDG)/);f%%%J++!! NGU& K$''''',,w77Kw)UG[QQQUHk222W.>??I'8999',,w0@AAT#6777
$"!, 	  F gll7,=>>O11111r     Thits_dirmodisco_region_widthcwm_trim_thresholdcompute_recalluse_seqletsc                 8
   ddl m}m} t          j        |           \  }}}}t          |j                  dk    r||z  }nBt          |j                  dk    r|dddddf         |z  }nt          d|j                   |j        d         dz  }|dz  }|*t          j	        d           t          j
        |d|          }|                    d          rt          j	        d	           |}t          j        |d
          }|t          j        |          }nd}|t          j        |t                    }nd}|#t          j        |dd|d||ddd

  
        \  }}}}nt          d          t"          j                            |d          }t          j        |d
          }t"          j                            |d          }t          j        |          \  }}t"          j                            |d          }t          j        |          }t"          j                            |d          }t          j        |          }|d         }|
st          j	        d           d} n!|d}	d} nt          j        |||||d
          } |j        d         }!t1          |t2          j                  r|}"n|                                }"t9          |          }#|                    |"|#          \  }$}%|                    ||||| |||#||!|	          \  }&}'}(})| |                    || ||#|!          \  }*}+nd\  }*}+t#          j         |d
           t"          j                            |d          },t          j!        |$|,           t          j"        |'|(|           |#                    |"|#|           |$                    |$|#|           |%                    |%|#|           t"          j                            |d          }-|&                    |(|)|-           | t1          | t2          j                  r| '                                n| }.t"          j                            |d          }/t          j(        |.|/           |*d|+bt"          j                            |d          }0t          j)        |*|0           |*                    |&|           |+                    |+|#|           t"          j                            |d          }1|,                    |'|#|1|	| du           dS )a  Generate comprehensive HTML report with statistics and visualizations.

    This function creates detailed analysis reports comparing Fi-NeMo hit calling
    results with TF-MoDISco seqlets, including performance metrics, distribution
    plots, and motif visualization. The report provides insights into hit calling
    quality and motif discovery accuracy.

    Parameters
    ----------
    regions_path : str
        Path to NPZ file containing the same regions used for hit calling.
    hits_dir : str
        Path to directory containing Fi-NeMo hit calling outputs.
    modisco_h5_path : str, optional
        Path to TF-MoDISco H5 file. If None, seqlet comparisons are skipped.
    peaks_path : str, optional
        DEPRECATED. Peak coordinates should be included in regions file.
    motifs_include_path : str, optional
        DEPRECATED. This information is inferred from hit calling outputs.
    motif_names_path : str, optional
        DEPRECATED. This information is inferred from hit calling outputs.
    out_dir : str
        Output directory for report files.
    modisco_region_width : int, default 400
        Width of regions used by TF-MoDISco (needed for coordinate conversion).
    cwm_trim_threshold : float, default 0.3
        DEPRECATED. This information is inferred from hit calling outputs.
    compute_recall : bool, default True
        Whether to compute recall metrics against TF-MoDISco seqlets.
    use_seqlets : bool, default True
        Whether to include seqlet-based comparisons in the report.

    Notes
    -----
    The generated report includes:
    - Hit vs seqlet count comparisons
    - Motif CWM visualizations
    - Hit statistic distributions
    - Co-occurrence heatmaps
    - Confusion matrices for overlapping motifs
    r   )
evaluationvisualization   r   NzUnexpected contribs shape: zProviding a peaks file to `report` is deprecated, and this option will be removed in a future version. Peaks should instead be provided in the preprocessing step to be included in `regions.npz`.z.tsvz|Passing a hits.tsv file to `finemo report` is deprecated. Please provide the directory containing the hits.tsv file instead.TlazyrH   g      ?zCmodisco_h5_path is required when providing a hits.tsv file directlyzhits.tsvrQ   rR   rV   r9   zUsage of the `--no-seqlets` flag is deprecated and will be removed in a future version. Please omit the `--modisco-h5` argument instead.F)NNrO   zmotif_occurrences.tsvmotifszseqlets.tsvzseqlet_confusion.tsvzreport.html)-rY   r   r   r   r\   lenr]   r^   rZ   r[   r   endswith	load_hitsr_   r`   ra   re   ri   rk   rl   load_motifs_dfload_motif_cwmsload_paramsload_modisco_seqlets
isinstancepl	LazyFramer   listget_motif_occurencestfmodisco_comparisonseqlet_confusionrj   write_occ_dfwrite_report_dataplot_hit_stat_distributionsplot_hit_peak_distributions!plot_peak_motif_indicator_heatmapplot_motifscollectwrite_modisco_seqletswrite_seqlet_confusion_dfplot_hit_vs_seqlet_countsplot_seqlet_confusion_heatmapwrite_report)2r1   r   r2   r   r3   r4   r6   r   r   r   r   r   r   r   r   r   r~   regionsr   modisco_half_width	hits_pathr   rv   rw   r{   cwms_modiscomotif_nameshits_df_pathmotifs_df_pathcwms_modisco_pathparams_pathrr   
seqlets_dfr   hits_df_lazymotif_names_listocc_dfcooocreport_data	report_dfr   trim_boundsconfusion_dfconfusion_matocc_pathplot_dirseqlets_collectedseqlets_pathseqlet_confusion_pathreport_paths2                                                     r   reportr     s   l ,+++++++'.'?'M'M$Ix1
8>aY&	X^			!	!111dAAA:&2Gx~GGHHHq!Q&J-2 Q	
 	
 	
 %j$
CC   3B K	
 	
 	
 	#ID999*$-.ABBNN!N'$12BCHHNN!N&6=6Q"7 73I|Q U  
 w||Hj99#Lt<<<h0@AA!(!7!G!G	;GLL3CDD./@AAgll8->??$[11#$@A 
 W	
 	
 	
 

		 

1
 
 

 $Q'K '2<(( 4%,%,\\^^"&{"3"333LBRSSMFE2<2Q2Q3 3/KFK &0&A&AZ+;['
 '
#mm '1#mK$''''w||G%<==H***i999--l<LgVVV--f6FPPP33E;KWUUUw||GX..Hfk8<<<$.z2<$H$HXJ   j 	 w||G];;%&7FFF#(A$&GLL:P$Q$Q!-l<QRRR33KIII77/   ',,w66K#[.*TXBX    r   皙?r   overlap_fracc                     ddl m} t          j        | d          }|                    ||          }t          j        ||t          j                   dS )a  Collapse overlapping hits by selecting the best hit per overlapping group.

    This function processes a set of motif hits and identifies overlapping hits,
    keeping only the hit with the highest similarity score within each overlapping
    group. This reduces redundancy in hit calls while preserving the most confident
    predictions.

    Parameters
    ----------
    hits_path : str
        Path to input TSV file containing hit data (hits.tsv or hits_unique.tsv).
    out_path : str
        Path to output TSV file with additional 'is_primary' column.
    overlap_frac : float, default 0.2
        Minimum fractional overlap for considering hits as overlapping.
        For hits of lengths x and y, minimum overlap = overlap_frac * (x + y) / 2.

    Notes
    -----
    The algorithm uses a sweep line approach with a heap data structure to
    efficiently identify overlapping intervals and select the best hit based
    on similarity scores.
    r   postprocessingFr   schemaN)rY   r   r   r   collapse_hitswrite_hits_processedHITS_COLLAPSED_DTYPES)r   r   r   r   r   hits_collapsed_dfs         r   r   r   `  so    0 !     	666G&44WlKK 8G,I     r   
hits_pathsrelaxedc                     ddl m} d | D             }|                    ||          }t          j        ||d           dS )a  Find intersection of hits across multiple Fi-NeMo runs.

    This function identifies motif hits that are consistently called across
    multiple independent runs, providing a way to assess reproducibility and
    identify high-confidence hits.

    Parameters
    ----------
    hits_paths : List[str]
        List of paths to input TSV files from different runs.
    out_path : str
        Path to output TSV file containing intersection results.
        Duplicate columns are suffixed with run index.
    relaxed : bool, default False
        If True, uses relaxed intersection criteria based only on motif names
        and untrimmed coordinates. If False, assumes consistent region definitions
        and motif trimming across runs.

    Notes
    -----
    The strict intersection mode requires consistent input regions and motif
    processing parameters across all runs. The relaxed mode is more permissive
    but may not be suitable when genomic coordinates are unavailable.
    r   r   c                 :    g | ]}t          j        |d           S )Fr   )r   r   ).0r   s     r   
<listcomp>z"intersect_hits.<locals>.<listcomp>  s(    UUUY!)%888UUUr   Nr   )rY   r   intersect_hitsr   r   )r   r   r   r   hits_dfsr   s         r   r   r     s\    2 !     UU*UUUH++Hg>>G (4@@@@@@r   c                  \   t          j                    } |                     dd          }|                    dt           j        d          }|                    ddt          dd	
           |                    ddt          dd           |                    ddt          dd
           |                    ddt          ddd           |                    ddt          dd
           |                    ddt          t          j	        t                    j        d         j        d           |                    dt           j        d           }|                    ddt          dd!           |                    ddt          dd           |                    d"d#t          ddd$           |                    ddt          dd
           |                    ddt          t          j	        t                    j        d         j        d           |                    d%t           j        d&          }|                    ddt          dd!           |                    ddt          dd           |                    d"d#t          ddd$           |                    ddt          dd
           |                    ddt          t          j	        t                    j        d         j        d           |                    d't           j        d(          }|                    ddt          dd!           |                    ddt          dd           |                    d"d#t          ddd$           |                    ddt          dd
           |                    ddt          t          j	        t                    j        d         j        d           |                    d)t           j        d*          }|                    ddt          dd!           |                    ddt          dd           |                    d+d,t          dd-
           |                    d.d/t          ddd0           |                    ddt          dd
           |                    ddt          t          j	        t                    j        d         j        d           |                    d1t           j        d2          }|                    d3d4t          t          j	        t                     j        d5         j        h d6d78           |                    d9d:t          dd;
           |                    d<d=t          dd>
           |                    ddt          dd?           |                    ddt          dd?           |                    d@dAt          ddB           |                    dCdDt          ddE           |                    ddFt          ddG
           |                    dHdIt"          t          j	        t                     j        dJ         j        dK           |                    dLdMt          ddN           |                    dOdPt          ddQ           |                    dRdSt"          t          j	        t                     j        dT         j        dU           |                    dVdWt          ddX           |                    d.dYt"          ddZ           |                    d[d\t          dd]           |                    dd^d_d`a           |                    dbdcd_dda           |                    d+det"          t          j	        t                     j        df         j        dg           |                    dhdit"          t          j	        t                     j        dj         j        dk           |                    dldmt"          t          j	        t                     j        dn         j        do           |                    d"dpt"          t          j	        t                     j        dq         j        dr           |                    dsdtt          t          j	        t                     j        du         j        dv           |                    ddwt          t          j	        t                     j        dx         j        dy           |                    dzd{t          dd|           |                    d}d~d_da           |                    dt           j        d          }|                    d9d:t          dd
           |                    ddt          dd
           |                    ddt          dd?           |                    d<d=t          dd           |                    d@dAt          dd           |                    dCdDt          dd           |                    ddFt          dd
           |                    ddt          t          j	        t$                    j        d         j        d           |                    dHdIt"          t          j	        t$                    j        d         j        d           |                    ddd_da           |                    d+dd_da           |                    dt           j        d          }	|	                    dhdt          dd
           |	                    ddt          dd
           |	                    ddt"          t          j	        t&                    j        d         j        d           |                    dt           j        d          }
|
                    dhdt          ddd           |
                    ddt          dd
           |
                    d9dd_da           |                                 }|j        dk    r4t          |j        |j        |j        |j        |j        |j                   dS |j        dk    r.t          |j        |j        |j        |j        |j                   dS |j        d%k    r=t;          d           t          |j        |j        |j        |j        |j                   dS |j        d'k    r.t          |j        |j        |j        |j        |j                   dS |j        d)k    r4t          |j        |j        |j        |j        |j        |j                   dS |j        d1k    r|j          tC          j"        d           |j         |_#        |j$         tC          j"        d           |j$        |_%        t!          |j&        |j        |j'        |j        |j(        |j)        |j%        |j*        |j+        |j,        |j-        |j#        |j.        |j/        |j0        |j1        |j2        |j3        |j4        |j5        |j6        |j7        |j8                   dS |j        dk    rTt%          |j&        |j9        |j'        |j        |j(        |j)        |j*        |j:        |j-        |j;         |j<                    dS |j        dk    r"t'          |j9        |j        |j=                   dS |j        dk    r"t}          |j9        |j        |j?                   dS dS )a  Command-line interface for the Fi-NeMo motif instance calling pipeline.

    This function provides the main entry point for all Fi-NeMo operations including:
    - Data preprocessing from various formats (bigWig, HDF5, TF-MoDISco)
    - Motif hit calling using the Fi-NeMo algorithm
    - Report generation and visualization
    - Post-processing operations (hit collapsing, intersection)
    Tcmd)requireddestzextract-regions-bwz@Extract sequences and contributions from FASTA and bigwig files.)formatter_classhelpz-pz--peaksz0A peak regions file in ENCODE NarrowPeak format.)typer   r   z-Cz--chrom-orderNzA tab-delimited file with chromosome names in the first column to define sort order of chromosomes. Missing chromosomes are ordered as they appear in -p/--peaks.)r   defaultr   z-fz--fastazcA genome FASTA file. If an .fai index file doesn't exist in the same directory, it will be created.z-bz	--bigwigs+zvOne or more bigwig files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.)r   r   nargsr   z-oz
--out-pathz!The path to the output .npz file.z-wz--region-widthr   z?The width of the input region centered around each peak summit.zextract-regions-chrombpnet-h5zKExtract sequences and contributions from ChromBPNet contributions H5 files.zwA peak regions file in ENCODE NarrowPeak format. If omitted, downstream outputs will lack absolute genomic coordinates.z-cz--h5szrOne or more H5 files of contribution scores, with paths delimited by whitespace. Scores are averaged across files.zextract-regions-h5zExtract sequences and contributions from ChromBPNet contributions H5 files. DEPRECATED: Use `extract-regions-chrombpnet-h5` instead.zextract-regions-bpnet-h5zFExtract sequences and contributions from BPNet contributions H5 files.zextract-regions-modisco-fmtzDExtract sequences and contributions from tfmodisco-lite input files.z-sz--sequencesz9A .npy or .npz file containing one-hot encoded sequences.z-az--attributionszOne or more .npy or .npz files of hypothetical contribution scores, with paths delimited by whitespace. Scores are averaged across files.z	call-hitsz@Call hits on provided sequences, contributions, and motif CWM's.z-Mz--moderC   >   rL   rJ   rI   r0   zThe type of attributions to use for CWM's and input contribution scores, respectively. 'h' for hypothetical and 'p' for projected.)r   r   choicesr   z-rz	--regionsz~A .npz file of input sequences, contributions, and coordinates. Can be generated using `finemo extract-regions-*` subcommands.z-mz--modisco-h5z2A tfmodisco-lite output H5 file of motif patterns.zfDEPRECATED: Please provide this file to a preprocessing `finemo extract-regions-*` subcommand instead.z-Iz--motifs-includezA tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column to include in hit calling. If omitted, all motifs in the modisco H5 file are used.z-Nz--motif-nameszA tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom names in the second column. Omitted motifs default to tfmodisco names.z	--out-dirz!The path to the output directory.z-tz--cwm-trim-thresholdr9   zVThe default threshold to determine motif start and end positions within the full CWMs.z-Tz--cwm-trim-thresholdszA tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom trim thresholds in the second column. Omitted motifs default to the `--cwm-trim-threshold` value.z-Rz--cwm-trim-coordszA tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and custom trim start and end coordinates in the second and third columns, respectively. Omitted motifs default to `--cwm-trim-thresholds` values.z-lz--global-lambdar:   zFThe default L1 regularization weight determining the sparsity of hits.z-Lz--motif-lambdaszA tab-delimited file with tfmodisco motif names (e.g pos_patterns.pattern_0) in the first column and motif-specific lambdas in the second column. Omitted motifs default to the `--global-lambda` value.z--alphaz7DEPRECATED: Please use the `--lambda` argument instead.z-Az--motif-alphasz>DEPRECATED: Please use the `--motif-lambdas` argument instead.z--no-post-filter
store_truezDo not perform post-hit-calling filtering. By default, hits are filtered based on a minimum cosine similarity of `lambda` with the input contributions.)actionr   z-qz--sqrt-transformz\Apply a signed square root transform to the input contributions and CWMs before hit calling.z--step-size-maxr;   z The maximum optimizer step size.z-iz--step-size-minr<   z The minimum optimizer step size.z-jz--step-adjustrA   zThe optimizer step size adjustment factor. If the optimizer diverges, the step size is multiplicatively adjusted by this factorz--convergence-tolr>   zoThe tolerance for determining convergence. The optimizer exits when the duality gap is less than the tolerance.z-Sz--max-stepsr?   z)The maximum number of optimization steps.z--batch-sizer@   z%The batch size used for optimization.z-dz--devicezaDEPRECATED: Please use the `CUDA_VISIBLE_DEVICES` environment variable to specify the GPU device.z-Jz	--compilezZJIT-compile the optimizer for faster performance. This may not be supported on older GPUs.r   zOGenerate statistics and visualizations from hits and tfmodisco-lite motif data.z}A .npz file containing input sequences, contributions, and coordinates. Must be the same as that used for `finemo call-hits`.z-Hz--hitsziThe output directory generated by the `finemo call-hits` command on the regions specified in `--regions`.zThe tfmodisco-lite output H5 file of motif patterns. Must be the same as that used for hit calling unless `--no-recall` is set. If omitted, seqlet-derived metrics will not be computed.zTDEPRECATED: This information is now inferred from the outputs of `finemo call-hits`.z(The path to the report output directory.z-Wz--modisco-region-widthr   zGThe width of the region around each peak summit used by tfmodisco-lite.r   z-nz--no-recallz$Do not compute motif recall metrics.z--no-seqletsz<DEPRECATED: Please omit the `--modisco-h5` argument instead.zcollapse-hitszEIdentify best hit by motif similarity among sets of overlapping hits.z:The `hits.tsv` or `hits_unique.tsv` file from `call-hits`.zHThe path to the output .tsv file with an additional "is_primary" column.z-Oz--overlap-fracr   a  The threshold for determining overlapping hits. For two hits with lengths x and y, the minimum overlap is defined as `overlap_frac * (x + y) / 2`. The default value of 0.2 means that two hits must overlap by at least 20% of their average lengths to be considered overlapping.zintersect-hitsz$Intersect hits across multiple runs.zROne or more hits.tsv or hits_unique.tsv files, with paths delimited by whitespace.zmThe path to the output .tsv file. Duplicate columns are suffixed with the positional index of the input file.z	--relaxedzUse relaxed intersection criteria, using only motif names and untrimmed coordinates. By default, the intersection assumes consistent region definitions and motif trimming. This option is not recommended if genomic coordinates are unavailable.zeWARNING: The `extract-regions-h5` command is deprecated. Use `extract-regions-chrombpnet-h5` instead.zxThe `--alpha` flag is deprecated and will be removed in a future version. Please use the `--global-lambda` flag instead.zThe `--motif-alphas` flag is deprecated and will be removed in a future version. Please use the `--motif-lambdas` flag instead.)@argparseArgumentParseradd_subparsers
add_parserArgumentDefaultsHelpFormatteradd_argumentra   rd   inspect	signaturer   
parametersr   r    r#   r(   r   rb   r   r   
parse_argsr   peakschrom_orderfastabigwigsr   r   h5sprintattributionsr   alpharZ   r[   global_lambdamotif_alphasrx   r   
modisco_h5rv   r   r6   cwm_trim_coordscwm_trim_thresholdsr   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   compilehitsr   	no_recall
no_seqletsr   r   r   )parser
subparsersextract_regions_bw_parser$extract_chrombpnet_regions_h5_parserextract_regions_h5_parserextract_bpnet_regions_h5_parser"extract_regions_modisco_fmt_parsercall_hits_parserreport_parsercollapse_hits_parserintersect_hits_parserargss               r   clir    sJ    $&&F&&5&AAJ * 5 5 >O !6 ! ! **? +    ** q +    **r +    ** F +    **0 +    **!"455	N$	N +    ,6+@+@' >Z ,A , ,( )55 G 6    )55 q 6    )55 B 6    )550 6    )55!"?@@	N$	N 6    !+ 5 5 > T !6 ! ! ** G +    ** q +    ** B +    **0 +    **!"?@@	N$	N +    '1&;&;" >U '< ' '# $00 G 1    $00 q 1    $00 B 1    $000 1    $00!":;;	N$	N 1    *4)>)>% >S *? * *& '33 G 4    '33 q 4    '33H 4    '33 Y 4    '330 4    '33!"=>>	N$	N 4    ",, >O -   !!!),,7?G((( R "    !! N "    !!A "    !!u "    !!u "    !! C "    !! B "    !!0 "    !!!),,	02	e "    !! ] "    !! C "    !!!),,78HIQU "    !! X "    !!F "    !!M "    !! g	 "    !!k	 "    !!!),,7HP/ "    !!!),,7HP/ "    !!!),,7FN O "    !!!),,78IJR~ "    !!!),,7DL8 "    !!!),,7EM4 "    !!p "    !!i	 "    )) >^ *  M  M     x     u      H     c     c     7      !&))45KLTV     !&))45IJRc     3	     K	     &00 >T 1   %%I &    %%W &    %%!-00;NKS c &    '11 >3 2   &&a '    &&| '    && B	 '    Dx'''JJLM	
 	
 	
 	
 	
 
4	4	4%J($(DM4CT	
 	
 	
 	
 	
 
)	)	)s	
 	
 	
 	&J($(DM4CT	
 	
 	
 	
 	
 
/	/	/ J($(DM4CT	
 	
 	
 	
 	
 
2	2	2#JNM	
 	
 	
 	
 	
 
[	 	 :!M K   "&D(M R   "&!2DLJOL $# NOKIL/	
 	
 	
 	
 	
4 
X		LIOJL%#	
 	
 	
 	
 	
 
_	$	$di0ABBBBB	%	%	%ty$->>>>> 
&	%r   )r   )NNr)   r*   r+   r,   Fr-   r.   r/   r*   Nr0   FF)r   r)   TT)r   )F)r   N)__doc__rY   r   ri   r   rZ   r   typingr   r   polarsr   ra   rd   r   r    r#   r(   rb   boolr   r   r   r   r   r   r   <module>r     s  
 
       				    ! ! ! ! ! ! ! !     &P &P&Psm&P &P 3i	&P
 &P &P 
&P &P &P &P\ P PPsmP 3iP 	P
 P 
P P P PN P PPsmP 3iP 	P
 P 
P P P PP $P $P$Psm$P c$P 	$P
 $P $P 
$P $P $P $P` +/.2(+ #  #/\2 \2\2\2 \2 sm	\2
 "#\2 sm\2 !\2 \2 #3-\2 'sm\2 !&\2 \2 \2 \2 \2  !\2" #\2$ %\2& '\2( SM)\2* +\2, -\2. /\20 
1\2 \2 \2 \2N !$ #R RRR c]R 	R
 "#R smR R R R R R 
R R R Rj S C u t    DA AtCy AC A$ ASW A A A ABt
? t
? t
? t
? t
? t
?r   