
    h+                     $   d Z ddlZddlmZmZ ddlZddlmZ ddlZ	ddl
mZ ddlmZmZmZmZ ddlmZmZ  e edd          eedd	d
           eedd	d
           eedd	d
           eedd	d
                    d
          deedf         deedf         deedf         deedf         deedf         f
d            Zdee	j        e	j        f         dede	j        fdZdeee	j        e	j        f                  dede	j        fdZdS )aP  Post-processing utilities for Fi-NeMo hit calling results.

This module provides functions for:
- Collapsing overlapping hits based on similarity scores
- Intersecting hit sets across multiple runs
- Quality control and filtering operations

The main operations are optimized using Numba for efficient processing
of large hit datasets.
    N)ListUnion)ndarray)njit)Arrayuint32int32float32)FloatInt   CT)readonly)cache	chrom_idsz Nstartsendssimilaritiesreturnc                    | j         d         }t          j        |t          j                  }d t	          d          D             }t	          |          D ]}| |         }||         }	||         }
||         }|r4|d         ||	dfk     r%t          j        |           |r|d         ||	dfk     %|D ]3\  }}}|||         k    }||xx         |z  cc<   ||xx         | z  cc<   4t          j        |||
|f           |S )a  Identify primary hits among overlapping hits using a sweep line algorithm.

    This function uses a heap-based sweep line algorithm to efficiently identify
    the best hit (highest similarity) among sets of overlapping hits within each
    chromosome. Only one hit per overlapping group is marked as primary.

    Parameters
    ----------
    chrom_ids : Int[ndarray, "N"]
        Chromosome identifiers for each hit, where N is the number of hits.
        Dtype should be uint32 for Numba compatibility.
    starts : Int[ndarray, "N"]
        Start positions of hits (adjusted for overlap computation).
        Dtype should be int32 for Numba compatibility.
    ends : Int[ndarray, "N"]
        End positions of hits (adjusted for overlap computation).
        Dtype should be int32 for Numba compatibility.
    similarities : Float[ndarray, "N"]
        Similarity scores used for selecting the best hit.
        Dtype should be float32 for Numba compatibility.

    Returns
    -------
    Int[ndarray, "N"]
        Binary array where 1 indicates the hit is primary, 0 otherwise.
        Returns uint32 array for consistency with input types.

    Notes
    -----
    This function is JIT-compiled with Numba for performance on large datasets.
    The algorithm maintains active intervals in a heap and resolves overlaps
    by keeping only the hit with the highest similarity score.

    The sweep line algorithm processes hits in order and maintains a heap of
    currently active intervals. When a new interval is encountered, it is
    compared against all overlapping intervals in the heap, and only the
    interval with the highest similarity score remains marked as primary.
    r   dtypec                 `    g | ]+}t          j        d           t          j        d           df,S )r   )npr   r	   ).0_s     </srv/www/kundaje/kobbad/Fi-NeMo/src/finemo/postprocessing.py
<listcomp>z"_collapse_hits.<locals>.<listcomp>N   s/    >>>RYq\\28A;;+>>>    r   )shaper   onesr   rangeheapqheappopheappush)r   r   r   r   noutheapi	chrom_new	start_newend_newsim_newr   idxcmps                  r   _collapse_hitsr1      s?   j 	A
'!29
%
%
%C>>U1XX>>>D1XX 6 6aL	1I	q'q/  	 tAw)Y!;;;M$  	 tAw)Y!;;;  	 	IAq#L--CHHHOHHHFFF#gFFFF 	ti!45555Jr    hits_dfoverlap_fracc           	         t          | t          j                  r|                                 } | d                             d          }|                                s4d t          |          D             }|                     t          j        d          	                    |t          j
                  t          j        d          dz  t          j        d          t          j        d          z
  |z                      t          j                  z   t          j        d          dz  t          j        d          t          j        d          z
  |z                      t          j                  z
  t          j        d	          
          }n|                     t          j        d          t          j        d          dz  t          j        d          t          j        d          z
  |z                      t          j                  z   t          j        d          dz  t          j        d          t          j        d          z
  |z                      t          j                  z
  t          j        d	          
          }|                                }|d                             d          }|d                             d          }|d                             d          }|d                             d          }t          ||||          }	|                     t          j        |	t          j
                            }
|
S )a   Collapse overlapping hits by selecting the best hit per overlapping group.

    This function identifies overlapping hits and marks only the highest-similarity
    hit as primary in each overlapping group. Overlap is determined by a fractional
    threshold based on the average length of the two hits being compared.

    Parameters
    ----------
    hits_df : Union[pl.DataFrame, pl.LazyFrame]
        Hit data containing required columns: chr (or peak_id if no chr), start, end,
        hit_similarity. Will be collected to DataFrame if passed as LazyFrame.
    overlap_frac : float
        Overlap fraction threshold for considering hits as overlapping.
        For two hits with lengths x and y, minimum overlap = overlap_frac * (x + y) / 2.
        Must be between 0 and 1, where 0 means any overlap and 1 means complete overlap.

    Returns
    -------
    pl.DataFrame
        Original hit data with an additional 'is_primary' column (1 for primary hits, 0 otherwise).
        All original columns are preserved, with the new column added at the end.

    Raises
    ------
    KeyError
        If required columns (chr/peak_id, start, end, hit_similarity) are missing.

    Notes
    -----
    The algorithm transforms coordinates by scaling by 2 and adjusting by the overlap
    fraction to create effective overlap regions for efficient processing. This allows
    using a sweep line algorithm to identify overlaps in a single pass.

    The transformation works as follows:
    - Original coordinates: [start, end]
    - Length = end - start
    - Adjusted start = start * 2 + length * overlap_frac
    - Adjusted end = end * 2 - length * overlap_frac

    This creates regions that overlap only when the original regions have sufficient
    overlap according to the specified fraction.

    Examples
    --------
    >>> hits_collapsed = collapse_hits(hits_df, overlap_frac=0.2)
    >>> primary_hits = hits_collapsed.filter(pl.col("is_primary") == 1)
    >>> print(f"Kept {primary_hits.height}/{hits_df.height} hits as primary")
    chrT)maintain_orderc                     i | ]\  }}||	S  r8   )r   r*   chroms      r   
<dictcomp>z!collapse_hits.<locals>.<dictcomp>   s    BBBHAuuaBBBr    )return_dtypestart   endhit_similarity)chrom_id
start_trimend_trim
similaritypeak_idr@   F)
allow_copyrA   rB   rC   r   )
is_primary)
isinstancepl	LazyFramecollectuniqueis_empty	enumerateselectcolreplace_strictUInt32castInt32rechunkto_numpyr1   with_columnsSeries)r2   r3   chromschrom_to_iddfr   r   r   r   rF   df_outs              r   collapse_hitsr\   f   s   h '2<(( $//##U^""$"77F?? 
BB	&0A0ABBB ^^VE]]11+BI1VVvg*uw/<?EEbhOOPVE]]Q&uw/<?EEbhOOPv.//  
 
 ^^VI&&vg*uw/<?EEbhOOPVE]]Q&uw/<?EEbhOOPv.//  
 
 
B:''5'99I&&%&88Fj>""e"44Dl#,,,>>L  	64FFJ !!RYz-S-S-S!TTFMr    hits_dfsrelaxedc           	         |rg d}ng d}t          |           dk     rt          d          g }| D ]Y}t          |t          j                  r(|                    |                                           D|                    |           Z|d         }t          dt          |                    D ]&}|                    ||         |dd| dd	          }'|S )
a	  Intersect hit datasets across multiple runs to find common hits.

    This function finds hits that appear consistently across multiple Fi-NeMo
    runs, which can be useful for identifying robust motif instances that are
    not sensitive to parameter variations or random initialization.

    Parameters
    ----------
    hits_dfs : List[Union[pl.DataFrame, pl.LazyFrame]]
        List of hit DataFrames from different Fi-NeMo runs. Each DataFrame must
        contain the columns specified by the intersection criteria. LazyFrames
        will be collected before processing.
    relaxed : bool
        If True, uses relaxed intersection criteria with only motif names and
        untrimmed coordinates. If False, uses strict criteria including all
        coordinate and metadata columns.

    Returns
    -------
    pl.DataFrame
        DataFrame containing hits that appear in all input datasets.
        Columns from later datasets are suffixed with their index (e.g., '_1', '_2').
        The first dataset's columns retain their original names.

    Raises
    ------
    ValueError
        If fewer than one hits DataFrame is provided.
    KeyError
        If required columns for the specified intersection criteria are missing
        from any of the input DataFrames.

    Notes
    -----
    Relaxed intersection is useful when comparing results across different
    region definitions or motif trimming parameters, but may produce less
    precise matches. Strict intersection requires identical region definitions
    and is recommended for most use cases.

    The intersection columns used are:
    - Relaxed: ["chr", "start_untrimmed", "end_untrimmed", "motif_name", "strand"]
    - Strict: ["chr", "start", "end", "start_untrimmed", "end_untrimmed",
               "motif_name", "strand", "peak_name", "peak_id"]

    The function performs successive inner joins starting with the first DataFrame,
    so the final result contains only hits present in all input datasets.

    Examples
    --------
    >>> common_hits = intersect_hits([hits_df1, hits_df2], relaxed=False)
    >>> print(f"Found {common_hits.height} hits common to both runs")
    >>>
    >>> # Compare relaxed vs strict intersection
    >>> relaxed_hits = intersect_hits([hits_df1, hits_df2], relaxed=True)
    >>> strict_hits = intersect_hits([hits_df1, hits_df2], relaxed=False)
    >>> print(f"Relaxed: {relaxed_hits.height}, Strict: {strict_hits.height}")
    )r5   start_untrimmedend_untrimmed
motif_namestrand)	r5   r<   r>   r`   ra   rb   rc   	peak_namerD   r   z$At least one hits dataframe requiredr   innerr   T)onhowsuffix
join_nullscoalesce)	len
ValueErrorrG   rH   rI   appendrJ   r#   join)r]   r^   	join_colscollected_dfsrZ   r2   r*   s          r   intersect_hitsrq      s   x  
WWW		

 

 

	 8}}q?@@@ M % %b",'' 	%  ....  $$$$ AG1c-(()) 
 
,,!q77  
 
 Nr    )__doc__r$   typingr   r   numpyr   r   polarsrH   numbar   numba.typesr   r   r	   r
   	jaxtypingr   r   r1   	DataFramerI   floatr\   boolrq   r8   r    r   <module>r|      sO  	 	                              5 5 5 5 5 5 5 5 5 5 5 5                 F111Ifat,,,eQd+++eQd+++gq#---	    C7D=!CC gtm
C &	C
 	$C C C CL]2<-.]>C]\] ] ] ]@d5r|345d@Dd\d d d d d dr    