
    hfP              "          d Z ddlZddlmZmZmZmZmZ ddlZ	ddlm
Z
 ddlZddlmZmZ dej        dee         deej        ee
d	f         f         fd
Zdee
df         dej        dedee
df         fdZdee
df         dee
df         deej        ej        f         dej        deej        ej        df         dej        dee
df         dee         dedededeeeeeef         f         ej        eeeeee
df         f         f         eeeeeeef         f         f         f         fdZdeej        ej        f         deej        ej        f         dej        dee         dedeej        ee
d	f         f         fdZdS )ao  Evaluation module for assessing Fi-NeMo motif discovery and hit calling performance.

This module provides functions for:
- Computing motif occurrence statistics and co-occurrence patterns
- Evaluating motif discovery quality against TF-MoDISco results
- Analyzing hit calling performance and recall metrics
- Generating confusion matrices for seqlet-hit comparisons
    N)ListTupleDictAnyUnion)ndarray)FloatInthits_dfmotif_namesreturnzM Mc                 4   |                                                      t          j        d                              d                                        dddd                              d          }t          |          t          |j                  z
  }|                    d |D                                           t          j	        | 	          
                    dg          }|j        }t          |          }t          j        ||ft          j        
          }t!          |          D ]3\  }}|                    |                                          |dd|f<   4|dk                        t          j                  }	|	j        |	z  }
||
fS )a  Compute motif occurrence statistics and co-occurrence matrix.

    This function analyzes motif occurrence patterns across peaks by creating
    a pivot table of hit counts and computing pairwise co-occurrence statistics.

    Parameters
    ----------
    hits_df : pl.LazyFrame
        Lazy DataFrame containing hit data with required columns:
        - peak_id : Peak identifier
        - motif_name : Name of the motif
        Additional columns are ignored.
    motif_names : List[str]
        List of motif names to include in analysis. Missing motifs
        will be added as columns with zero counts.

    Returns
    -------
    occ_df : pl.DataFrame
        DataFrame with motif occurrence counts per peak. Contains:
        - peak_id column
        - One column per motif with hit counts
        - 'total' column summing all motif counts per peak
    coocc : Int[ndarray, "M M"]
        Co-occurrence matrix where M = len(motif_names).
        Entry (i,j) indicates number of peaks containing both motif i and motif j.
        Diagonal entries show total peaks containing each motif.

    Notes
    -----
    The co-occurrence matrix is computed using binary occurrence indicators,
    so multiple hits of the same motif in a peak are treated as a single occurrence.
       count
motif_namepeak_idsum)onindexvaluesaggregate_functionr   c                 \    g | ])}t          j        d                               |          *S )r   )pllitalias).0ms     8/srv/www/kundaje/kobbad/Fi-NeMo/src/finemo/evaluation.py
<listcomp>z(get_motif_occurences.<locals>.<listcomp>B   s,    FFFARVAYY__Q//FFF    )totaldtypeN)collectwith_columnsr   r   r   pivot	fill_nullsetcolumnssum_horizontalsortheightlennpzerosint16	enumerate
get_columnto_numpyastypeint32T)r   r   occ_dfmissing_cols	num_peaks
num_motifsocc_matir   occ_bincooccs              r   get_motif_occurencesr?      st   J 		bfQiioog..	/	/	9WQV 
 

 

 
1  {##c&.&9&99LFFFFFGG	B-{;	<	<	yk		  I[!!Jh	:.bh???G+&& 8 81))!,,55771{""28,,GIE5=r    regionszN 4 Lpositions_dfmotif_widthzH 4 Wc           	         |                     t          j        d          t          j        d          t          j        d          z
  t          j        d                    }|                    d                                          }|                    d                                          }|                    d                                                              t                    }|dk    ||z   | j        d	         k    z  }||         }||         }||         }|d
d
d
d
f         }|d
d
d
d
f         t          j	        dd|ft                    z   }	|	| d
d
d
d
fxx         t          j        |          d
d
d
d
f         z  cc<   |	|d
d
d
d
fxx         t          j        |          d
d
d
d
df         z  cc<   t          j	        |j        d         ddft                    }
|
| d
d
d
d
fxx         t          j        d          d
d
d
d
f         z  cc<   |
|d
d
d
d
fxx         t          j        d          d
d
d
dd
f         z  cc<   | ||
|	f         }t          j                    5  t          j        dd           t          j        dd           |                    d          }d
d
d
           n# 1 swxY w Y   |S )a  Extract contribution weight matrices from regions based on hit positions.

    This function extracts motif-sized windows from contribution score regions
    at positions specified by hit coordinates. It handles both forward and
    reverse complement orientations and filters out invalid positions.

    Parameters
    ----------
    regions : Float[ndarray, "N 4 L"]
        Input contribution score regions multiplied by one-hot sequences,
        OR Input one-hot encoded sequences.
        Shape: (n_peaks, 4, region_width) where 4 represents DNA bases (A,C,G,T).
    positions_df : pl.DataFrame
        DataFrame containing hit positions with required columns:
        - peak_id : int, Peak index (0-based)
        - start_untrimmed : int, Start position in genomic coordinates
        - peak_region_start : int, Peak region start coordinate
        - is_revcomp : bool, Whether hit is on reverse complement strand
    motif_width : int
        Width of motifs to extract. Must be positive.

    Returns
    -------
    motifs : Float[ndarray, "H 4 W"]
        Extracted motif matrices for valid hits.
        Shape: (n_valid_hits, 4, motif_width)
        Invalid hits (outside region boundaries) are filtered out.

    Notes
    -----
    - Start positions are converted from genomic to region-relative coordinates
    - Reverse complement hits have their sequence order reversed
    - Hits extending beyond region boundaries are excluded
    - The mean is computed across all valid hits, with warnings suppressed
      for empty slices or invalid operations

    Raises
    ------
    ValueError
        If motif_width is non-positive or positions_df lacks required columns.
    r   start_untrimmedpeak_region_start
is_revcomp)peak_idx	start_idxrF   rG   rH   r      Nr   r"      ignorez#invalid value encountered in divide)actionmessagezMean of empty slice)axis)selectr   colr2   r3   r4   boolshaper.   r/   intarangewarningscatch_warningsfilterwarningsmean)r@   rA   rB   idx_dfrG   rH   rF   
valid_maskrow_idxpos_idxnuc_idxseqsmotifss                r   
get_motifsra   T   sF   X   	""&*++bf5H.I.II6,'' !  F
   ,,5577H!!+..7799I""<0099;;BB4HHJ q.Y%<a@P%PQJ
#H*%IJ'Jqqq$}%G4&1a2ES)Q)Q)QQGZKAAA")K"8"8tQQQ"GGJ111;!7!7dDDbD8H!IIhq)1a0<<<GZKAAA")A,,tQQQ}"==J1111dDDbD$.>!??7GW,-D		 	"	" # #%J	
 	
 	
 	
 	x9NOOOO""# # # # # # # # # # # # # # # Ms   8AKKK	sequencespeaks_df
seqlets_df	motifs_dfcwms_modiscozM 4 Wmodisco_half_widthcompute_recallz4 Wc                 H   t          |t          j                  r|                                }|                    t          j        d                              t          j                                                |                                dd          	                    t          j        d          t          j        d          t          j        d          t          j        d          dk    t          j        d	          t          j        d
          t          j        d                    }|
                    g d          }| j        d         }|dz  }|                    t          j        d          t          j        d
          z
  ||z
  k    t          j        d          t          j        d
          z
  ||z   k    z            
                    g d          }|                                                    d	d          }|                                                    d	d          }|d}d}nGt          |t          j                  r|                                }|}n|}|                                }||                    d	d          }ni }|
r||                    |g dd                                          }|                    |g dd                                          }|                    |g dd                                          }|                    d	d          }|                    d	d          }|                    d	d          }ni }i }i }i }i }i }|                                                                }|D ]A}|                    |f|          }|                    |f|          }|} |}!|}"|}#||                    |f|          } |
rG|E|                    |f|          }!|                    |f|          }"|                    |f|          }#|j        |j        d||<   || j        ||         d<   |
rR|P||xx         |!j        |"j        |#j        | j        dk    r!t'          j        |!j                  | j        z  nddz  cc<   |                    t          j        d	          |k    t          j        d          dk    z  d          }$|                    t          j        d	          |k    t          j        d          dk    z  d          }%t-          | ||	          ||$d                  ||%d                  d||<   ||         d         ddddddf         ||         d<   t-          |||	          ||         d <   ||         d          ddddddf         ||         d!<   |
r3|1||xx         t-          | |"|	          t-          | |#|	          d"z  cc<   |$d#         |$d$         f}&|%d#         |%d$         f}'|&|&|'|'|&|'d%||<   |
r|||xx         |&|&d"z  cc<   ||         d         }(||         d&         })t'          j        |(dz                                            }*t'          j        |)dz                                            }+|(|)z                                  |*|+z  z  },|,||         d'<   Cd( |                                D             }-t          j        |-          }.||.||fS ))a!  Compare Fi-NeMo hits with TF-MoDISco seqlets and compute evaluation metrics.

    This function performs comprehensive comparison between Fi-NeMo hit calls
    and TF-MoDISco seqlets, computing recall metrics, CWM similarities,
    and extracting contribution weight matrices for visualization.

    Parameters
    ----------
    regions : Float[ndarray, "N 4 L"]
        Contribution score regions multiplied by one-hot sequences.
        Shape: (n_peaks, 4, region_length)
    sequences : Int[ndarray, "N 4 L"]
        One-hot encoded sequences corresponding to regions.
        Shape: (n_peaks, 4, region_length)
    hits_df : Union[pl.DataFrame, pl.LazyFrame]
        Fi-NeMo hit calls with required columns:
        - peak_id, start_untrimmed, end_untrimmed, strand, motif_name
    peaks_df : pl.DataFrame
        Peak metadata with columns:
        - peak_id, chr_id, peak_region_start
    seqlets_df : Optional[pl.DataFrame]
        TF-MoDISco seqlets with columns:
        - chr_id, start_untrimmed, is_revcomp, motif_name
        If None, only basic hit statistics are computed.
    motifs_df : pl.DataFrame
        Motif metadata with columns:
        - motif_name, strand, motif_id, motif_start, motif_end
    cwms_modisco : Float[ndarray, "M 4 W"]
        TF-MoDISco contribution weight matrices.
        Shape: (n_modisco_motifs, 4, motif_width)
    motif_names : List[str]
        Names of motifs to analyze.
    modisco_half_width : int
        Half-width for restricting hits to central region for fair comparison.
    motif_width : int
        Width of motifs for CWM extraction.
    compute_recall : bool
        Whether to compute recall metrics requiring seqlets_df.

    Returns
    -------
    report_data : Dict[str, Dict[str, Any]]
        Per-motif evaluation metrics including:
        - num_hits_total, num_hits_restricted, num_seqlets
        - num_overlaps, seqlet_recall, cwm_similarity
    report_df : pl.DataFrame
        Tabular format of report_data for easy analysis.
    cwms : Dict[str, Dict[str, Float[ndarray, "4 W"]]]
        Extracted CWMs for each motif and condition:
        - hits_fc, hits_rc: Forward/reverse complement hits
        - modisco_fc, modisco_rc: TF-MoDISco forward/reverse
        - seqlets_only, hits_restricted_only: Non-overlapping instances
    cwm_trim_bounds : Dict[str, Dict[str, Tuple[int, int]]]
        Trimming boundaries for each CWM type and motif.

    Notes
    -----
    - Hits are filtered to central region defined by modisco_half_width
    - CWM similarity is computed as normalized dot product between hit and TF-MoDISco CWMs
    - Recall metrics require both hits_df and seqlets_df to be non-empty
    - Missing motifs are handled gracefully with empty DataFrames

    Raises
    ------
    ValueError
        If required columns are missing from input DataFrames.
    r   innerr   howchr_idrD   end_untrimmedstrand-r   rE   )rm   rD   rn   rF   r   rE   r   rm   rD   r   rF   subsetrI   T)as_dictN)rm   rD   rF   r   anti)num_hits_totalnum_hits_restrictednum_seqletsr   g        )num_overlapsnum_seqlets_onlynum_hits_restricted_onlyseqlet_recall+)by_predicatenamedmotif_id)hits_fc
modisco_fc
modisco_rcr   rJ   hits_rchits_ppm_fchits_ppm_rc)seqlets_onlyhits_restricted_onlymotif_start	motif_end)r   r   r   r   r   r   r   cwm_similarityc                 "    g | ]\  }}d |i|z  S )r    )r   kvs      r   r   z(tfmodisco_comparison.<locals>.<listcomp>  s&    EEEAa 1$EEEr    )
isinstancer   	DataFramelazyr%   rQ   castUInt32joinrP   uniquerS   filterr$   partition_by	LazyFramecleargetr,   r.   float64rowra   sqrtr   items
from_dicts)/r@   rb   r   rc   rd   re   rf   r   rg   rB   rh   hits_unique
region_lencenterhits_filteredhits_by_motifhits_filtered_by_motifseqlets_collectedseqlets_lazyseqlets_by_motifoverlaps_dfseqlets_only_dfhits_only_filtered_dfoverlaps_by_motifseqlets_only_by_motifhits_only_filtered_by_motifreport_datar`   cwm_trim_boundsdummy_dfr   hitsseqletsoverlapsr   hits_only_filteredmotif_data_fcmotif_data_rc	bounds_fc	bounds_rchits_cwmmodisco_cwmhnormsnormcwm_simrecords	report_dfs/                                                  r   tfmodisco_comparisonr      s   n '2<(( !,,.. 	RVI..33BI>>??	hmmoo)	9	9	6(##F#455&11vh''3.vl++ f%899F9%% 
 

 

  ..HHH !  K q!J!^FNNV%&&0C)D)DD++- VO$$rv.A'B'BB++-	
	 	 fMMMfNN   ''))66|T6RRM*2244AAd B    	J	-	- )&..00!&!(($,99,PT9UU ),2#((HHH ) 
 
 '))	 	 '++HHH , 
 
 '))	 	 !. 2 2HHH !3 !
 !
 '))	 	 (44\44PP / < <\SW < X X&;&H&H$ 'I '
 '
##  "&(#KFO}}&&((H V3 V3  !x00.22A4BB %!&**A4::G 	Qj4(,,aT8<<H044aT8DDL!<!@!@!x!P!P #k#0#7
 
A
 !,3NKN=) 	j4NNN ($0$7,>,E>A%% "$HO!<!<w~!M!M  NNN "&..!3x8H8HC8OP & 
 
 "&..!3x8H8HC8OP & 
 
 "'4==&}Z'@A&}Z'@A
 
q	
  &ay3DDbD$$B$J?q	)#-i{#K#Kq	- #)!9]#;DDbD$$B$J#Gq	-  	j41III *7L+ N N(2/) )  III #=1=3MN	"=1=3MN	 !## $$
 
  	j4A )(1# # 
 !9Y'Qi-1))++,,a,,..//k)..00EEMB+2A'((EE1B1B1D1DEEEGg&&I	6?::r    c                    |}t          | t          j                  r|                                 } |                     t          j        d                              t          j                  t          j        d          dk                                  |                                dd          	                    g d          
                    t          j        d	          t          j        d
          |z  t          j        d          |z  t          j        d                    }|                                }|
                    t          j        d	          t          j        d
          |z  t          j        d          |z  t          j        d                    }|                    |g ddd          }	|                    d                              d                                          }
|	                    ddg                              d                                          }t          |          }t          j        ||ft          j                  }d t#          |          D             }|                    |
dd          
                    t          j        d          t          j        d          t          j        d          t          j        d          z            }|
                    t          j        d                              |          t          j        d                              |          t          j        d                    }|d                                         }|d                                         }|d                                         }||||f<   ||fS )a  Compute confusion matrix between TF-MoDISco seqlets and Fi-NeMo hits.

    This function creates a confusion matrix showing the overlap between
    TF-MoDISco seqlets (ground truth) and Fi-NeMo hits across different motifs.
    Overlap frequencies are estimated using binned genomic coordinates.

    Parameters
    ----------
    hits_df : Union[pl.DataFrame, pl.LazyFrame]
        Fi-NeMo hit calls with required columns:
        - peak_id, start_untrimmed, end_untrimmed, strand, motif_name
    seqlets_df : pl.DataFrame
        TF-MoDISco seqlets with required columns:
        - chr_id, start_untrimmed, end_untrimmed, motif_name
    peaks_df : pl.DataFrame
        Peak metadata for joining coordinates:
        - peak_id, chr_id
    motif_names : List[str]
        Names of motifs to include in confusion matrix.
        Determines matrix dimensions.
    motif_width : int
        Width used for binning genomic coordinates.
        Positions are binned to motif_width resolution.

    Returns
    -------
    confusion_df : pl.DataFrame
        Detailed confusion matrix in tabular format with columns:
        - motif_name_seqlets : Seqlet motif labels (rows)
        - motif_name_hits : Hit motif labels (columns)
        - frac_overlap : Fraction of seqlets overlapping with hits
    confusion_mat : Float[ndarray, "M M"]
        Confusion matrix where M = len(motif_names).
        Entry (i,j) = fraction of motif i seqlets overlapping with motif j hits.
        Rows represent seqlet motifs, columns represent hit motifs.

    Notes
    -----
    - Genomic coordinates are binned to motif_width resolution for overlap detection
    - Only exact bin overlaps are considered (same chr_id, start_bin, end_bin)
    - Fractions are computed as: overlaps / total_seqlets_per_motif
    - Missing motif combinations result in zero entries in the confusion matrix

    Raises
    ------
    ValueError
        If required columns are missing from input DataFrames.
    KeyError
        If motif names in data don't match those in motif_names list.
    r   ro   rp   )r   rF   rj   rk   rq   rr   rm   rD   rn   r   )rm   	start_binend_binr   )rm   r   r   _hits)r   rl   suffixrx   )namemotif_name_hitsry   r"   c                     i | ]\  }}||	S r   r   )r   r<   r   s      r   
<dictcomp>z$seqlet_confusion.<locals>.<dictcomp>  s    ;;;DAq1a;;;r    )motif_name_seqletsr   frac_overlapr   r   )r\   col_idxr   r\   r   )r   r   r   r   r%   rQ   r   r   r   r   rP   group_byr-   r$   r.   r/   float32r1   replace_strictr3   )r   rd   rc   r   rB   bin_sizehits_binnedr   seqlets_binnedr   seqlet_countsoverlap_countsr:   confusion_matname_to_idxconfusion_dfconfusion_idx_dfr\   r   r   s                       r   seqlet_confusionr     sw   r H '2<(( !,,.. 	F9%%**2955vh''3. 	 	
 	
 
hmmoo)	9	9	PPP	Q	Q	6(##f.//8;F?++x7vl++	 
 

 

  ??$$L!((vh&*++x7''836,''	 )  N !%%:::PW &  K
 	--11}1EEMMOO  	l,=>??	.	!	!	  [!!JHj*5RZHHHM;;Ik$:$:;;;K!&&,G '  f6,//011VN++bf].C.CC     $**+,,;;KHH())88EEVN++ +   y)2244Gy)2244G#N3<<>>L&2M'7"#&&r    )__doc__rV   typingr   r   r   r   r   numpyr.   r   polarsr   	jaxtypingr	   r
   r   strr   r?   rT   ra   rR   r   r   r   r    r   <module>r      s     0 0 0 0 0 0 0 0 0 0 0 0 0 0                              >\>(,S	>
2<We^,,-> > > >BL7G#$L46LLORL
7GL L L L^J;7G#$J;7G#$J; 2<-.J; l	J;
 blBL$67J; |J; ()J; cJ; J; J; J; d38n	Ld3gun--.	./d3c3h'(	()+J; J; J; J;Z{'2<-.{'blBL01{' l{' c	{'
 {' 2<w~../{' {' {' {' {' {'r    