
    h>                        U d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZmZmZ ddlZddlmZ ddlZddlZddlZddlZddlZddlmZmZ ddlmZ ded	ee         fd
Zdedeegef         d	eeef         fdZdedeegef         d	eee	edf         f         fdZg dZee         e d<   ej!        ej"        ej"        ej!        ej#        ej!        ej$        ej$        ej$        ej"        g
Z%ee         e d<   dede
e         de&d	ej'        fdZ( ej)        g dd          Z*ej        e d<   ej+        fdeded	eedf         fdZ,dej'        ded ee         de&d	e	eed!f         eed"f         f         f
d#Z-d$ee         de&d	e	eed!f         eed!f         f         fd%Z.d$ee         de&d	e	eed!f         eed!f         f         fd&Z/ded	efd'Z0d(ee         d)ede&d	e	eed!f         eed!f         f         fd*Z1d+ed	e	eed!f         eeed!f         eed"f         f         ej'        e2f         fd,Z3	 dqd-eed!f         d.eeed!f         eed"f         f         d/ed0e
ej'                 d	df
d1Z4d2eed3f         d4e5d	e	e&e&f         fd5Z6drd7eed3f         d8e5d	eed3f         fd9Z7d:e	eef         d	ee	e&e&f         e	e&ef         f         fd;Z8d<d=gZ9d>ed?e
eee	e&e&f         f                  d@e
eee5f                  dAe5dBedCe
ee                  dDe
eeef                  dEe
eee5f                  dFe5dGe2d	e	ej'        eedHf         eedIf         ef         fdJZ:	 dsd>ed0ej'        dLej'        de&dMe&dNe2d	eej'        ej;        f         fdOZ<dPeej'        ej;        f         d/ed	dfdQZ=ej!        ej"        ej"        ej"        ej"        ej!        ej$        ej$        ej$        ej$        ej$        ej$        ej!        ej!        ej#        dRZ>e>dSej#        iz  Z?dKe>fdTedNe2dUeeef         d	eej'        ej;        f         fdVZ@e>fdWeej'        ej;        f         d/edUe
eeef                  d	dfdXZAdWeej'        ej;        f         d0ej'        dLej'        dYej'        dZed[e&d	dfd\ZBdYej'        d0ej'        d/ed	dfd]ZCdLej'        d/ed	dfd^ZDej#        ej!        ej!        ej!        ej#        ej#        ej$        ej$        d_ZEd`ed	e	ej'        ef         fdaZFdbeedHf         d/ed	dfdcZGdded	eedHf         fdeZHdfeeef         d/ed	dfdgZIdhed	eeef         fdiZJdjej'        d/ed	dfdkZKdlej'        d/ed	dfdmZLdnej'        doeeeeef         f         dZed	dfdpZMdS )ta  Data input/output module for the Fi-NeMo motif instance calling pipeline.

This module handles loading and processing of various genomic data formats including:
- Peak region files (ENCODE NarrowPeak format)
- Genome sequences (FASTA format)
- Contribution scores (bigWig, HDF5 formats)
- Neural network model outputs
- Motif data from TF-MoDISco
- Hit calling results

The module supports multiple input formats used for contribution scores
and provides utilities for data conversion and quality control.
    N)	ExitStack)ListDictTupleOptionalAnyUnionCallable)ndarray)FloatInt)tqdmpathreturnc                     g }t          |           5 }|D ]E}|                    d                              d          d         }|                    |           F	 ddd           n# 1 swxY w Y   |S )zLoad a text file containing one item per line.

    Parameters
    ----------
    path : str
        Path to the text file.

    Returns
    -------
    List[str]
        List of strings, one per line (first column if tab-delimited).
    
	r   N)openrstripsplitappend)r   entriesflineitems        5/srv/www/kundaje/kobbad/Fi-NeMo/src/finemo/data_io.pyload_txtr   !   s     G	d !q 	! 	!D;;t$$**4003DNN4    	!! ! ! ! ! ! ! ! ! ! ! ! ! ! !
 Ns   A	A((A,/A,
value_typec                     i }t          |           5 }|D ];}|                    d                              d          \  }} ||          ||<   <	 ddd           n# 1 swxY w Y   |S )ar  Load a two-column tab-delimited mapping file.

    Parameters
    ----------
    path : str
        Path to the mapping file. Must be tab-delimited with exactly two columns.
    value_type : Callable[[str], Any]
        Type constructor to apply to values (e.g., int, float, str).
        Must accept a string and return the converted value.

    Returns
    -------
    Dict[str, Any]
        Dictionary mapping keys to values of the specified type.

    Raises
    ------
    ValueError
        If lines don't contain exactly two tab-separated values.
    FileNotFoundError
        If the specified file does not exist.
    r   r   N)r   r   r   )r   r   mappingr   r   keyvals          r   load_mappingr#   7   s    . G	d +q 	+ 	+D{{4((..t44HC%:c??GCLL	++ + + + + + + + + + + + + + +
 Ns   ?AA"%A".c                    i }t          |           5 }|D ]Z}|                    d                              d          }|d         }|dd         }t          fd|D                       ||<   [	 ddd           n# 1 swxY w Y   |S )a  Load a mapping file where values are tuples from multiple columns.

    Parameters
    ----------
    path : str
        Path to the mapping file. Must be tab-delimited with multiple columns.
    value_type : Callable[[str], Any]
        Type constructor to apply to each value element.
        Must accept a string and return the converted value.

    Returns
    -------
    Dict[str, Tuple[Any, ...]]
        Dictionary mapping keys to tuples of values of the specified type.
        The first column is used as the key, remaining columns as tuple values.

    Raises
    ------
    ValueError
        If lines don't contain at least two tab-separated values.
    FileNotFoundError
        If the specified file does not exist.
    r   r   r      Nc              3   .   K   | ]} |          V  d S N ).0ir   s     r   	<genexpr>z%load_mapping_tuple.<locals>.<genexpr>w   s+       < <1A < < < < < <    )r   r   r   tuple)r   r   r    r   r   r   r!   r"   s    `      r   load_mapping_tupler.   W   s    4 G	d =q 	= 	=Dkk$''--d33G!*C!""+C  < < < < < < <<<GCLL		== = = = = = = = = = = = = = = Ns   AA>>BB)
chr
peak_startpeak_end	peak_name
peak_scorepeak_strandpeak_signal	peak_pval	peak_qvalpeak_summitNARROWPEAK_SCHEMANARROWPEAK_DTYPES
peaks_pathchrom_order_path
half_widthc           
         t          j        | dt          ddt          g d                              t          j        d          t          j        d          t          j        d          z   |z
  t          j        d	          
                              d                                          }|t          |          }ng }t          |          fd|
                    d                              d          D             }|                    |           d t          |          D             }|                    t          j        d                              |                              d                    }|S )a  Load peak region data from ENCODE NarrowPeak format file.

    Parameters
    ----------
    peaks_path : str
        Path to the NarrowPeak format file.
    chrom_order_path : str, optional
        Path to file defining chromosome ordering. If None, uses order from peaks file.
    half_width : int
        Half-width of regions around peak summits.

    Returns
    -------
    pl.DataFrame
        DataFrame containing peak information with columns:
        - chr: Chromosome name
        - peak_region_start: Start coordinate of centered region
        - peak_name: Peak identifier
        - peak_id: Sequential peak index
        - chr_id: Numeric chromosome identifier
    Fr   N).NAnullNaN)
has_headernew_columns	separator
quote_charschema_overridesnull_valuesr/   r0   r8   r2   )r/   peak_region_startr2   peak_idnamec                     g | ]}|v|	S r(   r(   )r)   r*   chrom_order_sets     r   
<listcomp>zload_peaks.<locals>.<listcomp>   s.       O## 	
###r,   T)maintain_orderc                     i | ]\  }}||	S r(   r(   )r)   indr"   s      r   
<dictcomp>zload_peaks.<locals>.<dictcomp>   s    EEE(#sS#EEEr,   chr_id)plscan_csvr9   r:   selectcolwith_row_indexcollectr   set
get_columnuniqueextend	enumeratewith_columnsreplace_strictalias)r;   r<   r=   peakschrom_orderchrom_order_peakschrom_ind_maprN   s          @r   
load_peaksrg      s   2 	).222	
 	
 	
 
u f\22RVM5J5JJZWf[)) 
 

 


 
Y	'	'	! 
& #/00+&&O   !!%((//t/DD  
 ()))EEi.D.DEEEM
u$$]3399(CC E Lr,   )ACGTS1dtypeSEQ_ALPHABETsequencern   z4 Lc                     |                                  } t          j        |                     d          d          }|dddf         t          dddf         k                        |          }|S )ad  Convert DNA sequence string to one-hot encoded matrix.

    Parameters
    ----------
    sequence : str
        DNA sequence string containing A, C, G, T characters.
    dtype : np.dtype, default np.int8
        Data type for the output array.

    Returns
    -------
    Int[ndarray, "4 L"]
        One-hot encoded sequence where rows correspond to A, C, G, T and
        L is the sequence length.

    Notes
    -----
    The output array has shape (4, len(sequence)) with rows corresponding to
    nucleotides A, C, G, T in that order. Non-standard nucleotides (N, etc.)
    result in all-zero columns.
    zUTF-8rl   rm   N)uppernp
frombufferencodero   astype)rp   rn   seq_chararrayone_hots       r   one_hot_encodery      sj    , ~~HM(//'":":$GGGMT111W%aaag)>>FFuMMGNr,   rc   fa_pathbw_pathszN 4 LzN Lc           
         | j         }|dz  }t          j        |d|ft          j                  }t          j        ||ft          j                  }t          j        |d          }d |D             }	t          j        t          |          |dz  ft          j                  }
	 t          t          | 
                    d                    d	d
|          D ]\  }}|d         }|d         }|d|z  z   }||         ||         }|j        }|j        }|j        }||z
  }||z
  }||k    r}t          |          ||d	d	||f<   t          |	          D ]7\  }}t          j        |                    |||d                    |
|d	d	f<   8t          j        |
d          ||||f<   	 |	D ]}|                                 n# |	D ]}|                                 w xY w||fS )a  Load genomic sequences and contribution scores from FASTA and bigWig files.

    Parameters
    ----------
    peaks : pl.DataFrame
        Peak regions DataFrame from load_peaks() containing columns:
        'chr', 'peak_region_start'.
    fa_path : str
        Path to genome FASTA file (.fa or .fasta format).
    bw_paths : List[str]
        List of paths to bigWig files containing contribution scores.
        Must be non-empty.
    half_width : int
        Half-width of regions to extract around peak centers.
        Total region width will be 2 * half_width.

    Returns
    -------
    sequences : Int[ndarray, "N 4 L"]
        One-hot encoded DNA sequences where N is the number of peaks,
        4 represents A,C,G,T nucleotides, and L is the region length (2 * half_width).
    contribs : Float[ndarray, "N L"]
        Contribution scores averaged across input bigWig files.
        Shape is (N peaks, L region_length).

    Notes
    -----
    BigWig files only provide projected contribution scores, not hypothetical scores.
    Regions extending beyond chromosome boundaries are zero-padded.
    Missing values in bigWig files are converted to zero.
          rm   F)one_based_attributesc                 6    g | ]}t          j        |          S r(   )pyBigWigr   r)   r*   s     r   rO   z(load_regions_from_bw.<locals>.<listcomp>$  s"    
.
.
.8=
.
.
.r,   T)namedNregions)disableunittotalr/   rI   )numpyr   axis)heightrs   zerosint8float16pyfaidxFastalenr   r_   	iter_rowsseqstartendry   
nan_to_numvaluesmeanclose)rc   rz   r{   r=   	num_peaksregion_width	sequencescontribsgenomebwscontrib_bufferrR   rowchromr   r   sequence_datarp   	start_adjend_adjabjbws                           r   load_regions_from_bwr      sC   D I>L)Q5RWEEEIxL1DDDH ]7???F
.
.X
.
.
.CXs8}}j1n=RZPPPNeooDo1122	
 
 
 	E 	EHC JE+,E!j.(C17uSy1IM)-H*0I(,GE!A%A1uu)7)A)A	#qqq!A#+&&s^^  EAr+-=		%G4	HH, ,N1aaa4(( &(W^!%D%D%Dac"3	E8  	 	BHHJJJJ	# 	 	BHHJJJJ	 hs   'DG G!h5_pathsc                    t                      5 fd| D             }|d         d         j        d         dz  |z
  d|z  z   |d         d         ddddf                             t          j                  }t          j        fd|D             dt          j                  }ddd           n# 1 swxY w Y   ||fS )	a  Load genomic sequences and contribution scores from ChromBPNet HDF5 files.

    Parameters
    ----------
    h5_paths : List[str]
        List of paths to ChromBPNet HDF5 files containing sequences and SHAP scores.
        Must be non-empty and contain compatible data shapes.
    half_width : int
        Half-width of regions to extract around the center.
        Total region width will be 2 * half_width.

    Returns
    -------
    sequences : Int[ndarray, "N 4 L"]
        One-hot encoded DNA sequences where N is the number of regions,
        4 represents A,C,G,T nucleotides, and L is the region length (2 * half_width).
    contribs : Float[ndarray, "N 4 L"]
        SHAP contribution scores averaged across input files.
        Shape is (N regions, 4 nucleotides, L region_length).

    Notes
    -----
    ChromBPNet files store sequences in 'raw/seq' and SHAP scores in 'shap/seq'.
    All input files must have the same dimensions and number of regions.
    Missing values in contribution scores are converted to zero.
    c                 ^    g | ])}                     t          j        |                    *S r(   enter_contexth5pyFiler)   r*   stacks     r   rO   z3load_regions_from_chrombpnet_h5.<locals>.<listcomp>h  /    CCCQu""49Q<<00CCCr,   r   zraw/seqr}   Nc           	      b    g | ]+}t          j        |d          ddddf                   ,S )zshap/seqN)rs   r   r)   r   r   r   s     r   rO   z3load_regions_from_chrombpnet_h5.<locals>.<listcomp>o  s>    HHHqR]1Z=AAAuSy9::HHHr,   r   rn   )r   shaperv   rs   r   r   r   r   r=   h5sr   r   r   r   r   s        @@@r   load_regions_from_chrombpnet_h5r   J  s$   : 
 
CCCC(CCCAy!'+q0:=a*n$F9%aaaE#Io6==bgFF	7HHHHHCHHH*
 
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 hs   BB<<C C c                    t                      5 fd| D             }|d         d         j        d         dz  |z
  d|z  z   |d         d         ddddf                             dd                              t          j                  }t	          j        fd|D             dt          j        	          }ddd           n# 1 swxY w Y   ||fS )
az  Load genomic sequences and contribution scores from BPNet HDF5 files.

    Parameters
    ----------
    h5_paths : List[str]
        List of paths to BPNet HDF5 files containing sequences and contribution scores.
        Must be non-empty and contain compatible data shapes.
    half_width : int
        Half-width of regions to extract around the center.
        Total region width will be 2 * half_width.

    Returns
    -------
    sequences : Int[ndarray, "N 4 L"]
        One-hot encoded DNA sequences where N is the number of regions,
        4 represents A,C,G,T nucleotides, and L is the region length (2 * half_width).
    contribs : Float[ndarray, "N 4 L"]
        Hypothetical contribution scores averaged across input files.
        Shape is (N regions, 4 nucleotides, L region_length).

    Notes
    -----
    BPNet files store sequences in 'input_seqs' and hypothetical scores in 'hyp_scores'.
    The data requires axis swapping to convert from (n, length, 4) to (n, 4, length) format.
    All input files must have the same dimensions and number of regions.
    Missing values in contribution scores are converted to zero.
    c                 ^    g | ])}                     t          j        |                    *S r(   r   r   s     r   rO   z.load_regions_from_bpnet_h5.<locals>.<listcomp>  r   r,   r   
input_seqsr}   Nr%   c           	          g | ]?}t          j        |d          ddddf                             dd                    @S )
hyp_scoresNr%   r}   )rs   r   swapaxesr   s     r   rO   z.load_regions_from_bpnet_h5.<locals>.<listcomp>  s^        aoaaasAAAo>GG1MMNN  r,   r   )r   r   r   rv   rs   r   r   r   r   s        @@@r   load_regions_from_bpnet_h5r   w  sI   < 
 
CCCC(CCCA|$*2.!3j@a*n$F<(E#Iqqq9BB1aHHOOPRPWXX	7       *
 
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
  hs   B2CCCc                 x    t          j        |           }t          |t           j                  r|}n|d         }|S )a  Load array data from .npy or .npz file.

    Parameters
    ----------
    path : str
        Path to .npy or .npz file. File must exist and contain valid NumPy data.

    Returns
    -------
    ndarray
        Loaded array data. For .npz files, returns the first array ('arr_0').
        For .npy files, returns the array directly.

    Raises
    ------
    FileNotFoundError
        If the specified file does not exist.
    KeyError
        If .npz file does not contain 'arr_0' key.
    arr_0)rs   load
isinstancer   )r   r   arrs      r   load_npy_or_npzr     s:    * 	A!RZ   jJr,   shaps_pathsohe_pathc                    t          |          }|j        d         dz  |z
  d|z  z   |ddddf                             t          j                  }fd| D             }t          j        |dt          j                  }||fS )a  Load genomic sequences and contribution scores from TF-MoDISco format files.

    Parameters
    ----------
    shaps_paths : List[str]
        List of paths to .npy/.npz files containing SHAP/attribution scores.
        Must be non-empty and all files must have compatible shapes.
    ohe_path : str
        Path to .npy/.npz file containing one-hot encoded sequences.
        Must have shape (n_regions, 4, sequence_length).
    half_width : int
        Half-width of regions to extract around the center.
        Total region width will be 2 * half_width.

    Returns
    -------
    sequences : Int[ndarray, "N 4 L"]
        One-hot encoded DNA sequences where N is the number of regions,
        4 represents A,C,G,T nucleotides, and L is the region length (2 * half_width).
    contribs : Float[ndarray, "N 4 L"]
        SHAP contribution scores averaged across input files.
        Shape is (N regions, 4 nucleotides, L region_length).

    Notes
    -----
    All SHAP files must have the same shape as the sequence file.
    Missing values in contribution scores are converted to zero.
    The center of the input sequences is used as the reference point for extraction.
    r   r}   Nc           	      p    g | ]2}t          j        t          |          d d d d f                   3S r'   )rs   r   r   )r)   pr   r   s     r   rO   z1load_regions_from_modisco_fmt.<locals>.<listcomp>  sB    UUUAR]?1--aaaE#Io>??UUUr,   r   r   )r   r   rv   rs   r   r   r   )	r   r   r=   sequences_rawr   shapsr   r   r   s	          @@r   load_regions_from_modisco_fmtr     s    @ $H--M#q(:5E
!j.
 CaaaE#Io.55bg>>IUUUUUUUUEwu1BJ777Hhr,   npz_pathc                 j   t          j        |           }d|                                vrt          j        d           d}|d         j        d         }t          j        dg|z  d          t          j        |t           j                  t          j	        |t           j
                  t          j        |t           j                  t          j        dg|z  d          d	}n(d
}|d         |d         |d         |d         |d         d	}t          j        |          }|d         |d         ||fS )aF  Load preprocessed genomic regions from NPZ file.

    Parameters
    ----------
    npz_path : str
        Path to NPZ file containing sequences, contributions, and optional coordinates.
        Must contain 'sequences' and 'contributions' arrays at minimum.

    Returns
    -------
    sequences : Int[ndarray, "N 4 L"]
        One-hot encoded DNA sequences where N is the number of regions,
        4 represents A,C,G,T nucleotides, and L is the region length.
    contributions : Union[Float[ndarray, "N 4 L"], Float[ndarray, "N L"]]
        Contribution scores in either hypothetical format (N, 4, L) or
        projected format (N, L). Shape depends on the input data format.
    peaks_df : pl.DataFrame
        DataFrame containing peak region information with columns:
        'chr', 'chr_id', 'peak_region_start', 'peak_id', 'peak_name'.
    has_peaks : bool
        Whether the file contains genomic coordinate information.
        If False, placeholder coordinate data is used.

    Notes
    -----
    If genomic coordinates are not present in the NPZ file, creates placeholder
    coordinate data and issues a warning. The placeholder data uses 'NA' for
    chromosome names and sequential indices for peak IDs.

    Raises
    ------
    KeyError
        If required arrays 'sequences' or 'contributions' are missing from the file.
    r/   zaNo genome coordinates present in the input .npz file. Returning sequences and contributions only.Fr   r   r@   Urm   )r/   rT   rI   rJ   r2   TrT   r   rJ   r2   contributions)rs   r   keyswarningswarnr   arrayarangeuint32r   int32rU   	DataFrame)r   data	has_peaksnum_regions	peak_datapeaks_dfs         r   load_regions_npzr     s4   T 78DDIIKKo	
 	
 	
 	;'-a08TF[0<<<i29===!#+RX!F!F!FyBI>>>4&;"6cBBB
 
		 	;8n!%gIk*
 
	 |I&&Hd?3XyHHr,   r   r   out_pathr   c           
         |-t          j        d           t          j        || |           dS |j        }|| j        d         k    s||j        d         k    r"t          d| j         d|j         d|           |                    d                                          	                    d	          }|                    d
                                          }|                    d                                          }|                    d                                          }|                    d                                          	                    d	          }	t          j        || ||||||	           dS )a  Write genomic regions and contribution scores to compressed NPZ file.

    Parameters
    ----------
    sequences : Int[ndarray, "N 4 L"]
        One-hot encoded DNA sequences where N is the number of regions,
        4 represents A,C,G,T nucleotides, and L is the region length.
    contributions : Union[Float[ndarray, "N 4 L"], Float[ndarray, "N L"]]
        Contribution scores in either hypothetical format (N, 4, L) or
        projected format (N, L).
    out_path : str
        Output path for the NPZ file. Parent directory must exist.
    peaks_df : Optional[pl.DataFrame]
        DataFrame containing peak region information with columns:
        'chr', 'chr_id', 'peak_region_start', 'peak_id', 'peak_name'.
        If None, only sequences and contributions are saved.

    Raises
    ------
    ValueError
        If the number of regions in sequences/contributions doesn't match peaks_df.
    FileNotFoundError
        If the parent directory of out_path does not exist.

    Notes
    -----
    The output file is compressed using NumPy's savez_compressed format.
    If peaks_df is provided, genomic coordinate information is included
    in the output file for downstream analysis.
    NzINo genome coordinates provided. Writing sequences and contributions only.)r   r   r   zInput sequences of shape z% and/or input contributions of shape z. are not compatible with peak region count of r/   r   rT   rI   rJ   r2   )r   r   r/   rT   r   rJ   r2   )
r   r   rs   savez_compressedr   r   
ValueErrorr\   to_numpyrv   )
r   r   r   r   r   chr_arr
chr_id_arr	start_arrpeak_id_arrpeak_name_arrs
             r   write_regions_npzr   <  s   H W	
 	
 	
 	H	WWWWWW o9?1---=.q111NIO N N0=0CN N@KN N   %%e,,5577>>sCC((22;;==
''(;<<EEGG	)))44==?? ++K88AACCJJ3OO
'#		
 		
 		
 		
 		
 		
r,   cwmz4 Wtrim_thresholdc                    t          j        t          j        |           d          }t          j        |          |z  }t          j        ||k              }t          t          t          j        |                    d          }t          t          t          j        |                    dz   t          |                    }||fS )ar  Determine trimmed start and end positions for a motif based on contribution magnitude.

    This function identifies the core region of a motif by finding positions where
    the total absolute contribution exceeds a threshold relative to the maximum.

    Parameters
    ----------
    cwm : Float[ndarray, "4 W"]
        Contribution weight matrix for the motif where 4 represents A,C,G,T
        nucleotides and W is the motif width.
    trim_threshold : float
        Fraction of maximum score to use as trimming threshold (0.0 to 1.0).
        Higher values result in more aggressive trimming.

    Returns
    -------
    start : int
        Start position of the trimmed motif (inclusive).
    end : int
        End position of the trimmed motif (exclusive).

    Notes
    -----
    The trimming is based on the sum of absolute contributions across all nucleotides
    at each position. Positions with contributions below trim_threshold * max_score
    are removed from the motif edges.

    Adapted from https://github.com/jmschrei/tfmodisco-lite/blob/570535ee5ccf43d670e898d92d63af43d68c38c5/modiscolite/report.py#L213-L236
    r   r   r%   )rs   sumabsmaxnonzerointminr   )r   r   scoretrim_thresh	pass_indsr   r   s          r   
trim_motifr     s    < F26#;;Q'''E&--.0K
5K/00IBF9%%&&**E
c"&##$$q(#e**
5
5C#:r,   d   xtempc                     | t          j        | dd          z
  }t          j        ||z            }|t          j        |dd          z  S )a  Apply softmax transformation with temperature scaling.

    Parameters
    ----------
    x : Float[ndarray, "4 W"]
        Input array to transform where 4 represents A,C,G,T nucleotides
        and W is the motif width.
    temp : float, default 100
        Temperature parameter for softmax scaling. Higher values create
        sharper probability distributions.

    Returns
    -------
    Float[ndarray, "4 W"]
        Softmax-transformed array with same shape as input. Each column
        sums to 1.0, representing nucleotide probabilities at each position.

    Notes
    -----
    The softmax is applied along the nucleotide axis (axis=0), normalizing
    each position to have probabilities that sum to 1. The temperature
    parameter controls the sharpness of the distribution.
    r%   Tr   keepdimsr   )rs   r   expr   )r   r   norm_xr  s       r   softmaxr    sN    0 T2222F
&

C!d33333r,   r   c                 &   | d         }	 dt          |                    d          d                   fS # t          t          f$ rK 	 dt          |                    d          d                   fcY S # t          t          f$ r	 d|fcY cY S w xY ww xY w)a  Generate sort key for TF-MoDISco motif names.

    This function creates a sort key that orders motifs by pattern number,
    with non-standard patterns sorted to the end.

    Parameters
    ----------
    data : Tuple[str, Any]
        Tuple containing motif name as first element and additional data.
        The motif name should follow the format 'pattern_N' or 'pattern#N' where N is an integer.

    Returns
    -------
    Union[Tuple[int, int], Tuple[int, str]]
        Sort key tuple for ordering motifs. Standard pattern names return
        (0, pattern_number) while non-standard names return (1, name).

    Notes
    -----
    This function is used internally by load_modisco_motifs to ensure
    consistent motif ordering across runs.
    r   _r   #r%   )r   r   r   
IndexError)r   pattern_names     r   _motif_name_sort_keyr    s    . 7L%3|))#..r23344
# % % %	%s<--c2226778888J' 	% 	% 	%|$$$$$$	%%s-   )4 B)A2/B2BBBBpos_patternsneg_patternsmodisco_h5_pathtrim_coordstrim_thresholdstrim_threshold_default
motif_typemotifs_includemotif_name_mapmotif_lambdasmotif_lambda_default
include_rczM 4 WzM Wc
           	         g g g g g g g d}
g }g }|t          |          }nd}|i }|i }|i }|i }t          |                                          t          t          |                                                    k    rt          d          t	          j        | d          5 }t          D ]z}||                                vr||         }t          t          |
                                t                              D ]!\  }\  }}| d| }|||vr|                    ||          }|}|                    ||          }|d         dd         j        }t          j        |dz                                            }||z  }|ddd	ddd	f         }||v r||         \  }}n)|                    ||          }t#          ||          \  }}|j        d
         }||z
  ||z
  } }t          j        |j        d
         t          j                  }!d
|!||<   t          j        |j        d
         t          j                  }"d
|"|| <   |dk    r|}#|}$|}%n|dk    rT|d         dd         j        }&t          j        |&dz                                            }%|&|%z  }#|#ddd	ddd	f         }$n|dk    rB|d         dd         j        }&d
}%|&t          j        |&dd          z  }#|#ddd	ddd	f         }$nP|dk    r7|d         dd         j        }&d
}%t+          |&          }#|#ddd	ddd	f         }$nt          d| d          |
d                             |           |
d                             |           |
d                             d           |
d                             |           |
d                             |           |
d                             |%           |
d                             |           |	r|
d                             |           |
d                             |           |
d                             d           |
d                             |           |
d                             |            |
d                             |%           |
d                             |           |                    |#|$g           |                    |!|"g           |                    |#           |                    |!           #|	 ddd           n# 1 swxY w Y   t1          j        |
                              d !          }'t          j        |t          j        d"          }(t          j        |t          j        d"          })|'                    t1          j        d          dk                                  d                                           }*|'|(|)|*fS )#aG  Load motif data from TF-MoDISco HDF5 file with customizable processing options.

    This function extracts contribution weight matrices and associated metadata from
    TF-MoDISco results, with support for custom naming, trimming, and regularization
    parameters.

    Parameters
    ----------
    modisco_h5_path : str
        Path to TF-MoDISco HDF5 results file containing pattern groups.
    trim_coords : Optional[Dict[str, Tuple[int, int]]]
        Manual trim coordinates for specific motifs {motif_name: (start, end)}.
        Takes precedence over automatic trimming based on thresholds.
    trim_thresholds : Optional[Dict[str, float]]
        Custom trim thresholds for specific motifs {motif_name: threshold}.
        Values should be between 0.0 and 1.0.
    trim_threshold_default : float
        Default trim threshold for motifs not in trim_thresholds.
        Fraction of maximum contribution used for trimming.
    motif_type : str
        Type of motif to extract. Must be one of:
        - 'cwm': Contribution weight matrix (normalized)
        - 'hcwm': Hypothetical contribution weight matrix
        - 'pfm': Position frequency matrix
        - 'pfm_softmax': Softmax-transformed position frequency matrix
    motifs_include : Optional[List[str]]
        List of motif names to include. If None, includes all motifs found.
        Names should follow format 'pos_patterns.pattern_N' or 'neg_patterns.pattern_N'.
    motif_name_map : Optional[Dict[str, str]]
        Mapping from original to custom motif names {orig_name: new_name}.
        New names must be unique across all motifs.
    motif_lambdas : Optional[Dict[str, float]]
        Custom lambda regularization values for specific motifs {motif_name: lambda}.
        Higher values increase sparsity penalty for the corresponding motif.
    motif_lambda_default : float
        Default lambda value for motifs not specified in motif_lambdas.
    include_rc : bool
        Whether to include reverse complement motifs in addition to forward motifs.
        If True, doubles the number of motifs returned.

    Returns
    -------
    motifs_df : pl.DataFrame
        DataFrame containing motif metadata with columns: motif_id, motif_name,
        motif_name_orig, strand, motif_start, motif_end, motif_scale, lambda.
    cwms : Float[ndarray, "M 4 W"]
        Contribution weight matrices for all motifs where M is the number of motifs,
        4 represents A,C,G,T nucleotides, and W is the motif width.
    trim_masks : Int[ndarray, "M W"]
        Binary masks indicating core motif regions (1) vs trimmed regions (0).
        Shape is (M motifs, W motif_width).
    names : ndarray
        Array of unique motif names (forward strand only).

    Raises
    ------
    ValueError
        If motif_type is not one of the supported types, or if motif names
        in motif_name_map are not unique.
    FileNotFoundError
        If the specified HDF5 file does not exist.
    KeyError
        If required datasets are missing from the HDF5 file.

    Notes
    -----
    Motif trimming removes low-contribution positions from the edges based on
    the position-wise sum of absolute contributions across nucleotides. The trimming
    helps focus on the core binding site.

    Adapted from https://github.com/jmschrei/tfmodisco-lite/blob/570535ee5ccf43d670e898d92d63af43d68c38c5/modiscolite/report.py#L252-L272
    )
motif_namemotif_name_origstrandmotif_start	motif_endmotif_scalelambdaNz$Specified motif names are not uniquerr!   r?   contrib_scoresr}   r   r%   rm   r   hcwmhypothetical_contribspfmrp   r   Tr  pfm_softmaxzInvalid motif_type: z5. Must be one of 'cwm', 'hcwm', 'pfm', 'pfm_softmax'.r  r  r  +r  r  r  r  -motif_idrK   )rn   r   )!r[   r   r   r   r   r   MODISCO_PATTERN_GROUPSr   r_   sorteditemsr  getrk   rs   sqrtr   r   r   r   r   r  r   r^   rU   r   rY   r   r   filterrX   r\   r   )+r  r  r  r  r  r  r  r  r  r  motif_data_lsts	motif_lsttrim_mask_lstmotifs_include_setmodisco_resultsrL   metaclusterr  r
  patternpattern_tagmotif_lambdapattern_tag_origcwm_rawcwm_normcwm_fwdcwm_rev	start_fwdend_fwdr   cwm_len	start_revend_revtrim_mask_fwdtrim_mask_rev	motif_fwd	motif_rev
motif_norm	motif_raw	motifs_dfcwms
trim_masksnamess+                                              r   load_modisco_motifsrM    s   j  O IM! 00!
>  ""##s3~/D/D/F/F+G+G'H'HHH?@@@	?C	(	( `8O* _	8 _	8D?//1111)$/K.7{((**0DEEE/ / Z8 Z8**L' "&6666 '2#+===,00>RSS#. ,00kJJ!"23AAA687GQJ#3#3#5#566!H,!$$B$"*-+--)4[)A&Iww%4%8%8#%;& &N *4G^)L)L&Iw!-*%,w%6)8K7	 "q)9 I I I34i/0 "q)9 I I I34i/0&& 'I 'I!)JJ6)) '(? @ C EI!#)Q,););)=)=!>!>J )J 6I )$$B$"* 5II5(( '
 3AAA 6 8I!"J )BF91t,T,T,T TI )$$B$"* 5II=00 '
 3AAA 6 8I!"J '	 2 2I )$$B$"* 5II %pzppp    -44[AAA 1299:JKKK)00555.55i@@@,33G<<<.55jAAA)00>>> 8#L188EEE#$56==>NOOO#H-44S999#M299)DDD#K077@@@#M299*EEE#H-44\BBB$$i%;<<<!((-)GHHHH $$Y///!((7777uZ8_	8`8 `8 `8 `8 `8 `8 `8 `8 `8 `8 `8 `8 `8 `8 `8D _--<<*<MMI8IRZa888D-rwQ???J))S011<<\JJSSUU 
 dJ--s   (RT99T= T=FrI  modisco_half_widthlazyc                    g }g }g }g }	g }
g }t          j        | d          5 }t          D ]}||                                vr||         }t          }t          t          |                                |                    D ]}\  }\  }}| d| |d         dd                             t          j
                  }|d         dd                             t          j
                  }|d         dd                             t                    }d |D             }|d	         dd                             t          j                  }t          |d
         d                   }|                    |           |                    |           |                    |           |	                    |           |
                    |           |                    fdt!          |          D                        	 ddd           n# 1 swxY w Y   t          j        |          t          j        |          t          j        |          |	t          j        |
          |d}||z
  }t%          j        |                              |                                dd                              |                                dd                              t%          j        d          t%          j        d          t%          j        d          t%          j        d          z   t%          j        d          z   |z   t%          j        d          t%          j        d          z   t%          j        d          z   |z   t%          j        d          t%          j        d          z   |z   t%          j        d          t%          j        d          z   |z   t%          j        d          t%          j        d          t%          j        d          t%          j        d          t%          j        d                                        g d          }|r|n|                                }|S )aE  Load seqlet data from TF-MoDISco HDF5 file and convert to genomic coordinates.

    This function extracts seqlet instances from TF-MoDISco results and converts
    their relative positions to absolute genomic coordinates using peak region
    information.

    Parameters
    ----------
    modisco_h5_path : str
        Path to TF-MoDISco HDF5 results file containing seqlet data.
    peaks_df : pl.DataFrame
        DataFrame containing peak region information with columns:
        'peak_id', 'chr', 'chr_id', 'peak_region_start'.
    motifs_df : pl.DataFrame
        DataFrame containing motif metadata with columns:
        'motif_name_orig', 'strand', 'motif_name', 'motif_start', 'motif_end'.
    half_width : int
        Half-width of the current analysis regions.
    modisco_half_width : int
        Half-width of the regions used in the original TF-MoDISco analysis.
        Used to calculate coordinate offsets.
    lazy : bool, default False
        If True, returns a LazyFrame for efficient chaining of operations.
        If False, collects the result into a DataFrame.

    Returns
    -------
    Union[pl.DataFrame, pl.LazyFrame]
        Seqlets with genomic coordinates containing columns:
        - chr: Chromosome name
        - chr_id: Numeric chromosome identifier
        - start: Start coordinate of trimmed motif instance
        - end: End coordinate of trimmed motif instance
        - start_untrimmed: Start coordinate of full motif instance
        - end_untrimmed: End coordinate of full motif instance
        - is_revcomp: Whether the motif is reverse complemented
        - strand: Motif strand ('+' or '-')
        - motif_name: Motif name (may be remapped)
        - peak_id: Peak identifier
        - peak_region_start: Peak region start coordinate

    Notes
    -----
    Seqlets are deduplicated based on chromosome ID, start position (untrimmed),
    motif name, and reverse complement status to avoid redundant instances.

    The coordinate transformation accounts for differences in region sizes
    between the original TF-MoDISco analysis and the current analysis.
    r   r!  r?   zseqlets/startNzseqlets/endzseqlets/is_revcompc                     g | ]}|sd nd	S )r'  r(  r(   r   s     r   rO   z(load_modisco_seqlets.<locals>.<listcomp>  s!    FFFQa033SFFFr,   zseqlets/example_idxzseqlets/n_seqletsr   c                     g | ]}S r(   r(   )r)   r  r7  s     r   rO   z(load_modisco_seqlets.<locals>.<listcomp>$  s    $K$K$KQ[$K$K$Kr,   )seqlet_start
seqlet_end
is_revcompr  rJ   r  )r  r  inneronhowrJ   r/   rT   rI   rS  r  r  rT  rU  r  r  )r/   rT   r   r   start_untrimmedend_untrimmedrU  r  r  rJ   rI   )rT   rZ  r  rU  )subset)r   r   r*  r   r  r_   r+  r,  rv   rs   r   boolr   r   r   r^   rangeconcatenaterU   	LazyFramejoinrO  rW   rX   r]   rZ   )r  r   rI  r=   rN  rO  	start_lstend_lstis_revcomp_lst
strand_lstpeak_id_lstpattern_tagsr4  rL   r5  r!   r  r
  r6  startsendsis_revcompsstrandspeak_ids	n_seqletsdf_dataoffset
seqlets_dfr7  s                               @r   load_modisco_seqletsrq    s   t IGNJKL	?C	(	( MO* 	M 	MD?//1111)$/K&C.7{((**444/ / M M**L' "&6666 1!!!4;;BHEE}-aaa077AA%&:;AAA>EEdKKFF+FFF"#89!!!<CCBINN(; <Q ?@@	  (((t$$$%%k222!!'***""8,,,##$K$K$K$K%	:J:J$K$K$KLLLL%M	MM M M M M M M M M M M M M M M: y11nW--n^44>+..' G ,,F 	W	inn#@g	N	N	hmmoo)	9	9	u6(##&,--f^$$%f]##$  *++f^$$%f[!!"  F#677f^$$% &!455|8L8LLvUvl++6(##vl++F9%% f%899' 
 

 

* 
PPP	Q	Q3 8  $=););)=)=Js   G"HHHrp  c                     |                      ddg          } t          | t          j                  r|                                 } |                     |d           dS )a  Write TF-MoDISco seqlets to TSV file.

    Parameters
    ----------
    seqlets_df : Union[pl.DataFrame, pl.LazyFrame]
        Seqlets DataFrame with genomic coordinates. Must contain columns
        that are safe to drop: 'chr_id', 'is_revcomp'.
    out_path : str
        Output TSV file path.

    Notes
    -----
    Removes internal columns 'chr_id' and 'is_revcomp' before writing
    to create a clean output format suitable for downstream analysis.
    rT   rU  r   rE   N)dropr   rU   r`  rZ   	write_csv)rp  r   s     r   write_modisco_seqletsrv  R  s^    $ (L!9::J*bl++ *''))
T22222r,   )r/   r   r   rZ  r[  r  hit_coefficienthit_coefficient_globalhit_similarityhit_correlationhit_importancehit_importance_sqr  r2   rJ   
is_primary	hits_pathschemac                     t          j        | dd|                              t          j        d                              d                    }|r|n|                                S )a3  Load motif hit data from TSV file.

    Parameters
    ----------
    hits_path : str
        Path to TSV file containing motif hit results.
    lazy : bool, default False
        If True, returns a LazyFrame for efficient chaining operations.
        If False, collects the result into a DataFrame.
    schema : Dict[str, Any], default HITS_DTYPES
        Schema defining column names and data types for the hit data.

    Returns
    -------
    Union[pl.DataFrame, pl.LazyFrame]
        Hit data with an additional 'count' column set to 1 for aggregation.
    r   N)rE   rF   r  r%   count)rU   rV   r`   litrb   rZ   )r~  rO  r  hits_dfs       r   	load_hitsr  ~  sb    ( kTd6  l26!99??7++,,  177 1 11r,   r  c                     |'|                      |                                          } t          | t          j                  r|                                 } |                     |d           dS )a  Write processed hit data to TSV file with optional column filtering.

    Parameters
    ----------
    hits_df : Union[pl.DataFrame, pl.LazyFrame]
        Hit data to write to file.
    out_path : str
        Output path for the TSV file.
    schema : Optional[Dict[str, Any]], default HITS_DTYPES
        Schema defining which columns to include in output.
        If None, all columns are written.
    Nr   rs  )rW   r   r   rU   r`  rZ   ru  )r  r   r  s      r   write_hits_processedr    se    " ..//'2<(( $//##h$/////r,   qc_dfout_dirmotif_widthc                 f	   t          j        |d           t           j                            |d          }t           j                            |d          }t           j                            |d          }|                                                     |                                dd                              |                                dd                              |                                d	d                              t          j        d
          t          j        d          t          j        d          t          j        d          z   t          j        d          z   t          j        d          t          j        d          z   t          j        d          z   t          j        d          t          j        d          z   t          j        d          t          j        d          z   |z   t          j        d          t          j        d          t          j        d          t          j        d          dz  z  t          j        d          t          j        d          t          j        d          t          j        d          z  t          j        d          t          j        d          dz  z  t          j        d          t          j        d          t          j        d          t          j        d                                        d
dg                              t          
                                          }	|	                    t          j        d                                                                        g dd          }
|
                    t          j        d          t          j        d          t          j        d          t          j        d          t          j        d          t          j        d                     }|	                                                    |d!"           |
                                                    |d!"           |                                                    |d#d!$           d%S )&a  Write comprehensive hit results to multiple output files.

    This function combines hit data with peak, motif, and quality control information
    to generate complete output files including genomic coordinates and scores.

    Parameters
    ----------
    hits_df : Union[pl.DataFrame, pl.LazyFrame]
        Hit data containing motif instance information.
    peaks_df : pl.DataFrame
        Peak region information for coordinate conversion.
    motifs_df : pl.DataFrame
        Motif metadata for annotation and trimming information.
    qc_df : pl.DataFrame
        Quality control data for normalization factors.
    out_dir : str
        Output directory for results files. Will be created if it doesn't exist.
    motif_width : int
        Width of motif instances for coordinate calculations.

    Notes
    -----
    Creates three output files:
    - hits.tsv: Complete hit data with all instances
    - hits_unique.tsv: Deduplicated hits by genomic position and motif (excludes rows with NA chromosome coordinates)
    - hits.bed: BED format file for genome browser visualization
    
    Rows where the chromosome field is NA are filtered out during deduplication
    to ensure that data_unique only contains well-defined genomic coordinates.
    Texist_okzhits.tsvzhits_unique.tsvzhits.bedrJ   rV  rW  r)  rT   r/   rI   	hit_startr  r  r  rw  global_scaler}   ry  r{  r|  r  r2   r  )rT   r/   r   r   rZ  r[  r  rw  rx  ry  rz  r{  r|  r  r2   rJ   r8  r   )r/   r   r  r  )r\  rP   r   r   )r/   r   r   r  r   r  r   rs  F)include_headerrE   N)osmakedirsr   ra  rO  rW   rU   rX   sortHITS_DTYPESr   r/  is_not_nullr]   r  rZ   ru  )r  r   rI  r  r  r  out_path_tsvout_path_tsv_uniqueout_path_beddata_alldata_uniquedata_beds               r   
write_hitsr    s   L K$''''7<<44L',,w0ABB7<<44L 		hmmoo)	9	9	ejjllyg	6	6	inn:7	;	;	6(##u&,--f[!!"f]##$ *++bf[.A.AABF;DWDWWF#677"&:M:MM&!455f[!!" vl++F#455#%6*;#<#<vn%%*$,6"233F#3446"233bf^6L6LL f%899vn%%*,6(##f[))F9%%))/ 
 

 

2 
x!	"	"	  ""	#	#? D //"&--";";"="=>>EE777 F  K !!F5MMfWooF5MM6,''fQiivh "  H    >>>##$74#HHH  et TTTTTr,   c                     |                                                      |                                 dd                              ddg                              d                                          }|                    |d           dS )	a@  Write quality control data with peak information to TSV file.

    Parameters
    ----------
    qc_df : pl.DataFrame
        Quality control metrics for each peak region.
    peaks_df : pl.DataFrame
        Peak region information for coordinate annotation.
    out_path : str
        Output path for the TSV file.
    rJ   rV  rW  rT   rI   r   rs  N)rO  ra  r  rt  rZ   ru  )r  r   r   dfs       r   write_qcr    st     	

	hmmoo)	9	9	x,-	.	.	h	  LLTL*****r,   c                 4    |                      |d           dS )zWrite motif metadata to TSV file.

    Parameters
    ----------
    motifs_df : pl.DataFrame
        Motif metadata DataFrame.
    out_path : str
        Output path for the TSV file.
    r   rs  Nru  )rI  r   s     r   write_motifs_dfr  (  s#     D11111r,   )r)  r  r  r  r  r  r  r  motifs_pathc                     t          j        | dt                    }|                    t          j        d          dk                                  d                                          }||fS )a8  Load motif metadata from TSV file.

    Parameters
    ----------
    motifs_path : str
        Path to motif metadata TSV file.

    Returns
    -------
    motifs_df : pl.DataFrame
        Motif metadata with predefined schema.
    motif_names : ndarray
        Array of unique forward-strand motif names.
    r   )rE   r  r  r'  r  )rU   read_csvMOTIF_DTYPESr/  rX   r\   r   )r  rI  motif_namess      r   load_motifs_dfr  A  sf     K4MMMI))S011<<\JJSSUU  k!!r,   rJ  c                 0    t          j        ||            dS )a  Write motif contribution weight matrices to .npy file.

    Parameters
    ----------
    cwms : Float[ndarray, "M 4 W"]
        Contribution weight matrices for M motifs, 4 nucleotides, W width.
    out_path : str
        Output path for the .npy file.
    N)rs   save)rJ  r   s     r   write_motif_cwmsr  X  s     GHdr,   	cwms_pathc                 *    t          j        |           S )zLoad motif contribution weight matrices from .npy file.

    Parameters
    ----------
    cwms_path : str
        Path to .npy file containing CWMs.

    Returns
    -------
    Float[ndarray, "M 4 W"]
        Loaded contribution weight matrices.
    )rs   r   )r  s    r   load_motif_cwmsr  e  s     79r,   paramsc                     t          |d          5 }t          j        | |d           ddd           dS # 1 swxY w Y   dS )zWrite parameter dictionary to JSON file.

    Parameters
    ----------
    params : Dict[str, Any]
        Parameter dictionary to serialize.
    out_path : str
        Output path for the JSON file.
    wr~   )indentN)r   jsondump)r  r   r   s      r   write_paramsr  u  s     
h		 '	&!A&&&&' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' 's   6::params_pathc                 |    t          |           5 }t          j        |          }ddd           n# 1 swxY w Y   |S )zLoad parameter dictionary from JSON file.

    Parameters
    ----------
    params_path : str
        Path to JSON file containing parameters.

    Returns
    -------
    Dict[str, Any]
        Loaded parameter dictionary.
    N)r   r  r   )r  r   r  s      r   load_paramsr    sx     
k		 a1               Ms   155occ_dfc                 4    |                      |d           dS )zWrite occurrence data to TSV file.

    Parameters
    ----------
    occ_df : pl.DataFrame
        Occurrence data DataFrame.
    out_path : str
        Output path for the TSV file.
    r   rs  Nr  )r  r   s     r   write_occ_dfr    s#     X.....r,   seqlet_confusion_dfc                 4    |                      |d           dS )zWrite seqlet confusion matrix data to TSV file.

    Parameters
    ----------
    seqlet_confusion_df : pl.DataFrame
        Seqlet confusion matrix DataFrame.
    out_path : str
        Output path for the TSV file.
    r   rs  Nr  )r  r   s     r   write_seqlet_confusion_dfr    s#     !!(d!;;;;;r,   	report_dfmotifsc           	         t           j                            |d          }t          j        |d           |                                D ]\  }}t           j                            ||          }t          j        |d           |                                D ];\  }}t          j        t           j                            || d          |           <|                     t           j                            |d          d           dS )	aQ  Write comprehensive motif report data including CWMs and metadata.

    Parameters
    ----------
    report_df : pl.DataFrame
        Report metadata DataFrame.
    motifs : Dict[str, Dict[str, ndarray]]
        Nested dictionary of motif names to motif types to arrays.
    out_dir : str
        Output directory for report files.
    r  Tr  z.txtzmotif_report.tsvr   rs  N)r  r   ra  r  r,  rs   savetxtru  )	r  r  r  
motifs_dirmv	motif_dirr  motifs	            r   write_report_datar    s      gx00JK
T**** L L1GLLQ//	
I----!" 	L 	LJJrw||I*/B/B/BCCUKKKK	L W.@AATRRRRRr,   r'   )r   )F)N__doc__r  r  r   
contextlibr   typingr   r   r   r   r   r	   r
   r   rs   r   r   
hdf5pluginpolarsrU   r   r   	jaxtypingr   r   r   strr   r#   r.   r9   __annotations__StringInt32UInt32Float32r:   r   r   rg   r   ro   r   ry   r   r   r   r   r   r]  r   r   floatr   r  r  r*  rM  r`  rq  rv  r  HITS_COLLAPSED_DTYPESr  r  r  r  r  r  r  r  r  r  r  r  r  r  r(   r,   r   <module>r     s      				              D D D D D D D D D D D D D D D D D D                                           3 49    ,s #(< c3h    @"
"#SE3J/"	#uS#X
" " " "L      49    IHHIIIJJJH  49   =='/}=BE=\= = = =B $28$8$8$8EEEbj E E E 02w  S  3w~;N    <N<N"%N15cNHKN
3w %"778N N N Nb*3i*%(*
3w %(8"99:* * * *Z.3i.%(.
3w %(8"99:. . . .b# '    <*c*&)*7:*
3w %(8"99:* * * *ZFIFI
	% 
!5%#8
89L
FI FI FI FIZ (,	C
 C
7G#$C
w/0%2GGHC
 C
 r|$	C

 
C
 C
 C
 C
L$E'5.) $5 $U3PS8_ $ $ $ $N4 4uWe^$ 4E 4E'5.<Q 4 4 4 4:%uS#X %5sCx%PSUXPX/9Y3Z % % % %B ).9 \.\.$sE#s(O345\. d3:./\. "	\.
 \. T#Y'\. T#s(^,\. De,-\.  \. \. 2<w/0#gun2EwNO\. \. \. \.J F FFlF |F 	F
 F F 2<%&F F F FR3blBL013=@3	3 3 3 32 9X8xX)z jjzjiy " $|RY&??  "'2 22204S#X2
2<%&2 2 2 2< (30 02<-.00 T#s(^$0 
	0 0 0 04\U2<-.\Ul\U |\U <	\U
 \U \U 
\U \U \U \U~+BL +BL +C +D + + + +,
2r| 
2s 
2t 
2 
2 
2 
2 	)yi9:j	 	" "blG.C(D " " " ".
5'!12 
c 
d 
 
 
 
s uWg-='>     'c3h '3 '4 ' ' ' 'S T#s(^    &
/ 
/ 
/ 
/ 
/ 
/ 
/
<2< 
<3 
<SW 
< 
< 
< 
<S|Sd3<(()S S 
	S S S S S Sr,   