import pandas as pd
import numpy as np
import os
import argparse
import scipy.stats
from scipy.spatial.distance import jensenshannon
from tensorflow.keras.utils import get_custom_objects
from tensorflow.keras.models import load_model
import tensorflow as tf
import h5py
import math
from generators.variant_generator import VariantGenerator
from generators.peak_generator import PeakGenerator
from utils import argmanager, losses
from utils.helpers import *
from utils.io import *
import shap
from utils.shap_utils import *
import deepdish as dd
tf.compat.v1.disable_v2_behavior()


def main():
    args = argmanager.fetch_shap_args()
    print(args)

    out_dir = os.path.sep.join(args.out_prefix.split(os.path.sep)[:-1])
    print()
    print('out_dir:', out_dir)
    print()
    if not os.path.exists(out_dir):
        raise OSError("Output directory does not exist")

    model = load_model_wrapper(args.model)
    variants_table = load_variant_table(args.list, args.schema)
    variants_table = variants_table.fillna('-')

    chrom_sizes = pd.read_csv(args.chrom_sizes, header=None, sep='\t', names=['chrom', 'size'])
    chrom_sizes_dict = chrom_sizes.set_index('chrom')['size'].to_dict()

    if args.debug_mode:
        variants_table = variants_table.sample(10)
        print(variants_table.head())

    # infer input length
    if args.lite:
        input_len = model.input_shape[0][1]
    else:
        input_len = model.input_shape[1]
    print("input length inferred from the model: ", input_len)

    print(variants_table.shape)
    variants_table = variants_table.loc[variants_table.apply(lambda x: get_valid_variants(x.chr, x.pos, x.allele1, x.allele2, input_len, chrom_sizes_dict), axis=1)]
    variants_table.reset_index(drop=True, inplace=True)
    print(variants_table.shape)
    
    for shap_type in args.shap_type:
        # fetch model prediction for variants
        batch_size=args.batch_size
        ### set the batch size to the length of variant table in case variant table is small to avoid error
        batch_size=min(batch_size,len(variants_table))
        # output_file=h5py.File(''.join([args.out_prefix, ".variant_shap.%s.h5"%shap_type]), 'w')
        # observed = output_file.create_group('observed')
        # allele1_write = observed.create_dataset('allele1_shap', (len(variants_table),2114,4), chunks=(batch_size,2114,4), dtype=np.float16, compression='gzip', compression_opts=9)
        # allele2_write = observed.create_dataset('allele2_shap', (len(variants_table),2114,4), chunks=(batch_size,2114,4), dtype=np.float16, compression='gzip', compression_opts=9)
        # variant_ids_write = observed.create_dataset('variant_ids', (len(variants_table),), chunks=(batch_size,), dtype='S100', compression='gzip', compression_opts=9)

        allele1_seqs = []
        allele2_seqs = []
        allele1_scores = []
        allele2_scores = []
        variant_ids = []

        num_batches=len(variants_table)//batch_size
        for i in range(num_batches):
            sub_table=variants_table[i*batch_size:(i+1)*batch_size]
            var_ids, allele1_inputs, allele2_inputs, \
            allele1_shap, allele2_shap = fetch_shap(model,
                                                    sub_table,
                                                    input_len,
                                                    args.genome,
                                                    args.batch_size,
                                                    debug_mode=args.debug_mode,
                                                    lite=args.lite,
                                                    bias=None,
                                                    shuf=False,
                                                    shap_type=shap_type)
            
            # allele1_write[i*batch_size:(i+1)*batch_size] = allele1_shap
            # allele2_write[i*batch_size:(i+1)*batch_size] = allele2_shap
            # variant_ids_write[i*batch_size:(i+1)*batch_size] = [s.encode("utf-8") for s in var_ids]

            if len(variant_ids) == 0:
                allele1_seqs = allele1_inputs
                allele2_seqs = allele2_inputs
                allele1_scores = allele1_shap
                allele2_scores = allele2_shap
                variant_ids = var_ids
            else:
                allele1_seqs = np.concatenate((allele1_seqs, allele1_inputs))
                allele2_seqs = np.concatenate((allele2_seqs, allele2_inputs))
                allele1_scores = np.concatenate((allele1_scores, allele1_shap))
                allele2_scores = np.concatenate((allele2_scores, allele2_shap))
                variant_ids = np.concatenate((variant_ids, var_ids))

        if len(variants_table)%batch_size != 0:
            sub_table=variants_table[num_batches*batch_size:len(variants_table)]
            var_ids, allele1_inputs, allele2_inputs, \
            allele1_shap, allele2_shap = fetch_shap(model,
                                                                    sub_table,
                                                                    input_len,
                                                                    args.genome,
                                                                    args.batch_size,
                                                                    debug_mode=args.debug_mode,
                                                                    lite=args.lite,
                                                                    bias=None,
                                                                    shuf=False,
                                                                    shap_type=shap_type)
            
            # allele1_write[num_batches*batch_size:len(variants_table)] = allele1_shap
            # allele2_write[num_batches*batch_size:len(variants_table)] = allele2_shap
            # variant_ids_write[num_batches*batch_size:len(variants_table)] = [s.encode("utf-8") for s in var_ids]

            if len(variant_ids) == 0:
                allele1_seqs = allele1_inputs
                allele2_seqs = allele2_inputs
                allele1_scores = allele1_shap
                allele2_scores = allele2_shap
                variant_ids = var_ids
            else:
                allele1_seqs = np.concatenate((allele1_seqs, allele1_inputs))
                allele2_seqs = np.concatenate((allele2_seqs, allele2_inputs))
                allele1_scores = np.concatenate((allele1_scores, allele1_shap))
                allele2_scores = np.concatenate((allele2_scores, allele2_shap))
                variant_ids = np.concatenate((variant_ids, var_ids))

        # # store shap at variants
        # with h5py.File(''.join([args.out_prefix, ".variant_shap.%s.h5"%shap_type]), 'w') as f:
        #     observed = f.create_group('observed')
        #     observed.create_dataset('allele1_shap', data=allele1_shap, compression='gzip', compression_opts=9)
        #     observed.create_dataset('allele2_shap', data=allele2_shap, compression='gzip', compression_opts=9)
            
        assert(allele1_seqs.shape==allele1_scores.shape)
        assert(allele2_seqs.shape==allele2_scores.shape)
        assert(allele1_seqs.shape==allele2_seqs.shape)
        assert(allele1_scores.shape==allele2_scores.shape)
        assert(allele1_seqs.shape[2]==4)
        assert(len(allele1_seqs==len(variant_ids)))
        
        shap_dict = {
            'raw': {'seq': np.concatenate((np.transpose(allele1_seqs, (0, 2, 1)).astype(np.int8),
                                           np.transpose(allele2_seqs, (0, 2, 1)).astype(np.int8)))},
            'shap': {'seq': np.concatenate((np.transpose(allele1_scores, (0, 2, 1)).astype(np.float16),
                                            np.transpose(allele2_scores, (0, 2, 1)).astype(np.float16)))},
            'projected_shap': {'seq': np.concatenate((np.transpose(allele1_seqs * allele1_scores, (0, 2, 1)).astype(np.float16),
                                                      np.transpose(allele2_seqs * allele2_scores, (0, 2, 1)).astype(np.float16)))},
            'variant_ids': np.concatenate((np.array(variant_ids), np.array(variant_ids))),
            'alleles': np.concatenate((np.array([0] * len(variant_ids)),
                                       np.array([1] * len(variant_ids))))}

        dd.io.save(''.join([args.out_prefix, ".variant_shap.%s.h5"%shap_type]),
                   shap_dict,
                   compression='blosc')

    print("DONE")


if __name__ == "__main__":
    main()

---

from tensorflow.keras.utils import Sequence
import pandas as pd
import numpy as np
import math
import pyfaidx
from utils import one_hot
from deeplift.dinuc_shuffle import dinuc_shuffle


class VariantGenerator(Sequence):
    def __init__(self,
                 variants_table,
                 input_len,
                 genome_fasta,
                 batch_size=512,
                 debug_mode=False,
                 shuf=False):

        self.variants_table = variants_table
        self.num_variants = self.variants_table.shape[0]
        self.input_len = input_len
        self.genome = pyfaidx.Fasta(genome_fasta)
        self.debug_mode = debug_mode
        self.flank_size = self.input_len // 2
        self.shuf = shuf
        self.batch_size = batch_size

    def __get_allele_seq__(self, chrom, pos, allele1, allele2, seed=-1):
        chrom = str(chrom)
        pos = int(pos)
        allele1 = str(allele1)
        allele2 = str(allele2)
        
        if allele1 == "-":
            allele1 = ""
        if allele2 == "-":
            allele2 = ""
        ### 1 - indexed position 
        pos = pos - 1
        
        if len(allele1) == len(allele2):
            flank  = str(self.genome[chrom][pos-self.flank_size:pos+self.flank_size])
            if self.shuf:
                assert seed != -1
                flank = dinuc_shuffle(flank, rng=np.random.RandomState(seed))
            allele1_seq = flank[:self.flank_size] + allele1 + flank[self.flank_size+len(allele1):]
            allele2_seq = flank[:self.flank_size] + allele2 + flank[self.flank_size+len(allele2):]

        ### handle INDELS (allele1 must be the reference allele)
        # Here, we adjust the flanks to account for the INDEL and insure that
        # the allele1 and allele2 sequences are the same length
        else:
            ### hg19 has lower case
            assert len(allele1) != len(allele2)
            assert self.genome[chrom][pos:pos+len(allele1)].seq.upper() == allele1
            mismatch_length = len(allele1) - len(allele2)
            if mismatch_length > 0:
                flank = str(self.genome[chrom][pos-self.flank_size:pos+self.flank_size+mismatch_length])
            else:
                flank = str(self.genome[chrom][pos-self.flank_size:pos+self.flank_size])

            if self.shuf:
                assert seed != -1
                flank = dinuc_shuffle(flank, rng=np.random.RandomState(seed))

            left_flank=flank[:self.flank_size]

            allele1_right_flank = flank[self.flank_size+len(allele1):self.flank_size*2]
            allele2_right_flank = flank[self.flank_size+len(allele1):self.flank_size*2+mismatch_length]
            
            allele1_seq = left_flank + allele1 + allele1_right_flank
            allele2_seq = left_flank + allele2 + allele2_right_flank

        assert len(allele1_seq) == self.flank_size * 2
        assert len(allele2_seq) == self.flank_size * 2
        return allele1_seq, allele2_seq

    def __getitem__(self, idx):
        cur_entries = self.variants_table.iloc[idx*self.batch_size:min([self.num_variants,(idx+1)*self.batch_size])]
        variant_ids = cur_entries['variant_id'].tolist()

        if self.shuf:
            allele1_seqs, allele2_seqs = zip(*[self.__get_allele_seq__(v, w, x, y, z) for v,w,x,y,z in
                                             zip(cur_entries.chr, cur_entries.pos,
                                                 cur_entries.allele1, cur_entries.allele2, cur_entries.random_seed)])
        else:
            allele1_seqs, allele2_seqs = zip(*[self.__get_allele_seq__(w, x, y, z) for w,x,y,z in
                                             zip(cur_entries.chr, cur_entries.pos, cur_entries.allele1, cur_entries.allele2)])

        if self.debug_mode:
            return variant_ids, list(allele1_seqs),list(allele2_seqs)
        else:
            return variant_ids, one_hot.dna_to_one_hot(list(allele1_seqs)), one_hot.dna_to_one_hot(list(allele2_seqs))
    
    def __len__(self):
        return math.ceil(self.num_variants/self.batch_size)

---


import marimo

__generated_with = "0.16.2"
app = marimo.App(width="medium", auto_download=["html"])


@app.cell
def _():
	import os.path
	import os
	import ast
	import sys
	import subprocess
	import time
	import hashlib
	from collections import defaultdict
	import numpy as np
	import pandas as pd
	import deepdish
	import h5py
	import multiprocessing


	import matplotlib as mpl
	mpl.use('Agg')
	mpl.rcParams["svg.fonttype"] = "none"
	import matplotlib.pyplot as plt
	import logomaker

	def load_peaks_4(peaks_loc):
		peak_df = pd.read_csv(peaks_loc, sep="\t", names=["chro", "start", "stop", "name"])
		peak_finder = dict()
		for chro in set(peak_df["chro"]):
			peak_finder[chro] = peak_df[peak_df["chro"] == chro]
		return peak_finder


	def load_peaks_10(peaks_loc):
		peak_df = pd.read_csv(peaks_loc, sep="\t", names=["chro", "start", "stop", "4", "5", "6", "7", "8", "9", "summit"])
		peak_df["name"] = peak_df["chro"] + ":" + peak_df["start"].astype(str) + ":" + peak_df["stop"].astype(str)
		peak_finder = dict()
		for chro in set(peak_df["chro"]):
			peak_finder[chro] = peak_df[peak_df["chro"] == chro]
		return peak_finder


	def assign_peaks(variants_df, peaks_loc, peaks_name, num_cols):
		print(f"starting {peaks_name}")
		if num_cols == 4:
			peak_finder = load_peaks_4(peaks_loc)
		elif num_cols == 10:
			peak_finder = load_peaks_10(peaks_loc)
		else:
			assert(False)
		in_peaks = []
		for index, row in variants_df.iterrows():
			chro, pos, ref = row["chr"], row["pos"], row["ref"]
			pos_start = pos
			pos_end = pos + len(ref) - 1
			if chro not in peak_finder:
				in_peaks.append("out_of_peak")
				continue
			else:
				peak_chro = peak_finder[chro]
			# Option 1 - variant start is within region
			match_1 = peak_chro[(peak_chro["start"] <= pos_start) & (peak_chro["stop"] >= pos_start)]
			# Option 2 - variant stop is within region
			match_2 = peak_chro[(peak_chro["start"] <= pos_end) & (peak_chro["stop"] >= pos_end)]
			# Option 3 - variant region encapsulates a whole region
			match_3 = peak_chro[(peak_chro["start"] >= pos_start) & (peak_chro["stop"] <= pos_end)]
			# Combine all matches
			all_matches_df = pd.concat([match_1, match_2, match_3], axis=0, ignore_index=True)
			all_matches_df.drop_duplicates(inplace=True)
			# In peak or not
			in_peaks.append(" & ".join(list(all_matches_df["name"])) if len(all_matches_df) > 0 else "out_of_peak")
		return {peaks_name: in_peaks}


	def assign_peaks_parallel(variants_df, peaks_dict, num_cols=10):
		inputs = []
		for peaks_name, peaks_loc in peaks_dict.items():
			inputs.append((variants_df, peaks_loc, peaks_name, num_cols))
		with multiprocessing.Pool() as pool:
			results = pool.starmap(assign_peaks, inputs)
			pool.close()
			pool.join()
		for x in results:
			for pn, ip in x.items():		
				variants_df[pn] = ip
		return variants_df

	def _plotter_profile(allele1_profiles, allele2_profiles, vlines, ax0, allele1_label, allele2_label):
		xmins, xmaxs = [], []
		shortened_allele1_label = allele1_label if len(allele1_label) < 15 else f"{allele1_label[:15]}... (len {len(allele1_label)})"
		for i, (x, allele1_profile) in enumerate(allele1_profiles):
			ax0.plot(x, allele1_profile, label=(f"ref ({shortened_allele1_label})" if i==0 else ""), color='C0')
			xmins.append(min(x)); xmaxs.append(max(x))
		shortened_allele2_label = allele2_label if len(allele2_label) < 15 else f"{allele2_label[:15]}... (len {len(allele2_label)})"
		for i, (x, allele2_profile) in enumerate(allele2_profiles):
			ax0.plot(x, allele2_profile, label=(f"alt ({shortened_allele2_label})" if i==0 else ""), color='C1')
			xmins.append(min(x)); xmaxs.append(max(x))
		for v in vlines:
			ax0.axvline(v, color='black', ls='--', linewidth=1)
		xmin, xmax = min(xmins), max(xmaxs)
		ax0.set_xlim(xmin, xmax)
		ax0.set_xticks(np.arange(xmin + (50 - xmin%50)%50, xmax+1, 50))
		legend = ax0.legend(prop={'size': 12}, loc='upper right')


	def _plotter_shap(allele1_shap, allele2_shap, vlines, xmin, ax1, ax2, allele1_label, allele2_label):
		active_allele = "ref" if np.sum(allele1_shap) > np.sum(allele2_shap) else "alt"
		df1 = pd.DataFrame(allele1_shap, columns=["A", "C", "G", "T"])
		df1.index +=  xmin
		df2 = pd.DataFrame(allele2_shap, columns=["A", "C", "G", "T"])
		df2.index += xmin
		logomaker.Logo(df1, ax=ax1)
		logomaker.Logo(df2, ax=ax2)
		for v in vlines:
			ax1.axvline(v, color='k', linestyle='--', linewidth=1)
			ax2.axvline(v, color='k', linestyle='--', linewidth=1)
		ymax = 1.1*max(np.max(np.maximum(allele1_shap, 0)), np.max(np.maximum(allele2_shap, 0)))
		ymin = 1.1*min(np.min(np.minimum(allele1_shap, 0)), np.min(np.minimum(allele2_shap, 0)))
		ax1.set_ylim(bottom=ymin, top=ymax)
		ax2.set_ylim(bottom=ymin, top=ymax)
		shortened_allele_1_label = allele1_label if len(allele1_label) < 15 else f"{allele1_label[:15]}... (len {len(allele1_label)})"
		plt.text(0.988, 0.903, f"ref ({shortened_allele_1_label})", 
								 verticalalignment='top', horizontalalignment='right',
								 transform=ax1.transAxes, size=12, color='black',
								 bbox=dict(boxstyle='round', facecolor='white', edgecolor='lightgrey'))
		shortened_allele2_label = allele2_label if len(allele2_label) < 15 else f"{allele2_label[:15]}... (len {len(allele2_label)})"
		plt.text(0.988, 0.903, f"alt ({shortened_allele2_label})", 
								 verticalalignment='top', horizontalalignment='right',
								 transform=ax2.transAxes, size=12, color='black',
								 bbox=dict(boxstyle='round', facecolor='white', edgecolor='lightgrey'))


	def _add_logfc_stats_box(ax, logfc, logfc_pval):
		"""Add a box with logFC and p-value statistics to the plot"""
		# Format the p-value in scientific notation if it's very small
		if logfc_pval < 0.001:
			pval_text = f"{logfc_pval:.2e}"
		else:
			pval_text = f"{logfc_pval:.3f}"

		# Create the text for the stats box
		stats_text = f"LogFC: {logfc:.3f}\nP-value: {pval_text}"

		# Get the legend position
		legend = ax.get_legend()
		if legend:
			# Get legend bounding box in axes coordinates
			bbox = legend.get_window_extent(ax.figure.canvas.get_renderer())
			bbox_axes = bbox.transformed(ax.transAxes.inverted())
			legend_top = bbox_axes.y1  # Top of legend
			legend_bottom = bbox_axes.y0  # Bottom of legend

			# Use the legend's top position for alignment
			y_pos = legend_top
		else:
			# Fallback if no legend exists
			y_pos = 1.0

		# Add text box on the left side at the same height as the legend
		plt.text(0.008, y_pos-0.04, stats_text,
				 verticalalignment='top', horizontalalignment='left',
				 transform=ax.transAxes, size=12, color='black',
				 bbox=dict(boxstyle='round', facecolor='white', edgecolor='lightgrey'))


	def _plot_profile(allele1_pred, allele2_pred, allele1_length, allele2_length, allele1_label, allele2_label, window_size, ax0, logfc=None, logfc_pval=None):
		total_length = 1000
		C = total_length//2
		F = window_size//2
		if allele1_length < allele2_length:
			# INSERTION
			allele1_pred_plots = []
			allele1_pred_plots.append((list(range(-F, allele1_length)), allele1_pred[C-F:C+allele1_length]))
			allele1_pred_plots.append((list(range(allele2_length, F+allele2_length)),
									  allele1_pred[C+allele1_length:C+F+allele1_length]))
			allele2_pred_plots = [(list(range(-F, F+allele2_length)), allele2_pred[C-F:C+F+allele2_length])]
			vlines = [-0.5, allele2_length-0.5]
		elif allele1_length > allele2_length:
			# DELETION
			allele1_pred_plots = [(list(range(-F, F+allele1_length)), allele1_pred[C-F:C+F+allele1_length])]
			allele2_pred_plots = []
			allele2_pred_plots.append((list(range(-F, allele2_length)), allele2_pred[C-F:C+allele2_length]))
			allele2_pred_plots.append((list(range(allele1_length, F+allele1_length)),
									  allele2_pred[C+allele2_length:C+F+allele2_length]))
			vlines = [-0.5, allele1_length-0.5]
		else:
			# SUBSTITUTION
			allele1_pred_plots = [(list(range(-F, F+allele1_length)), allele1_pred[C-F:C+F+allele1_length])]
			allele2_pred_plots = [(list(range(-F, F+allele2_length)), allele2_pred[C-F:C+F+allele2_length])]
			vlines = [-0.5, +allele1_length-0.5]

		_plotter_profile(allele1_pred_plots, allele2_pred_plots, vlines, ax0, allele1_label, allele2_label)

		# Add logFC stats box if values are provided
		if logfc is not None and logfc_pval is not None:
			_add_logfc_stats_box(ax0, logfc, logfc_pval)


	def _plot_shap(allele1_shap, allele2_shap, allele1_length, allele2_length, allele1_label, allele2_label, window_size, ax1, ax2):
		total_length = 2114
		C = total_length//2
		F = window_size//2
		if allele1_length < allele2_length:
			# INSERTION
			allele1_shap_plot = np.concatenate([allele1_shap[C-F:C+allele1_length],
											   np.zeros((allele2_length-allele1_length, 4)),
											   allele1_shap[C+allele1_length:C+F+allele1_length]])
			allele2_shap_plot = allele2_shap[C-F:C+F+allele2_length]
			vlines = [-0.5, allele2_length-0.5]
		elif allele1_length > allele2_length:
			# DELETION
			allele1_shap_plot = allele1_shap[C-F:C+F+allele1_length]
			allele2_shap_plot = np.concatenate([allele2_shap[C-F:C+allele2_length],
											   np.zeros((allele1_length-allele2_length, 4)),
											   allele2_shap[C+allele2_length:C+F+allele2_length]])
			vlines = [-0.5, allele1_length-0.5]
		else:
			# SUBSTITUTION
			allele1_shap_plot = allele1_shap[C-F:C+F+allele1_length]
			allele2_shap_plot = allele2_shap[C-F:C+F+allele2_length]
			vlines = [-0.5, allele1_length-0.5]
		assert(allele1_shap_plot.shape == allele2_shap_plot.shape)
		_plotter_shap(allele1_shap_plot, allele2_shap_plot, vlines, -F, ax1, ax2, allele1_label, allele2_label)


	def plot_variant(allele1_pred, allele2_pred, allele1_shap, allele2_shap, allele1_length, allele2_length, 
					allele1_label, allele2_label, window_size, title, save_loc, include_shap=True, 
					logfc=None, logfc_pval=None):
		"""
		Plot variant with optional SHAP visualization and logFC statistics

		Parameters:
		- include_shap: If True, creates 3-panel plot with profile + 2 SHAP plots. If False, creates 1-panel plot with just profile.
		- logfc: LogFC value to display in stats box
		- logfc_pval: LogFC p-value to display in stats box
		"""
		if include_shap:
			fig, axs = plt.subplots(3, 1, figsize=(20, 8), dpi=400)
			# PLOT PROFILE
			_plot_profile(allele1_pred, allele2_pred, allele1_length, allele2_length, 
						 allele1_label, allele2_label, window_size, axs[0], logfc, logfc_pval)
			# PLOT SHAP
			_plot_shap(allele1_shap, allele2_shap, allele1_length, allele2_length, 
					  allele1_label, allele2_label, window_size, axs[1], axs[2])
			plt.subplots_adjust(hspace=0.3, top=0.785)
		else:
			fig, ax = plt.subplots(1, 1, figsize=(20, 4), dpi=400)
			# PLOT PROFILE ONLY
			_plot_profile(allele1_pred, allele2_pred, allele1_length, allele2_length, 
						 allele1_label, allele2_label, window_size, ax, logfc, logfc_pval)
			plt.subplots_adjust(top=0.85)

		# PLOT
		plt.suptitle(title, fontsize=24)
		fig.set_facecolor('white')
		plt.savefig(save_loc, bbox_inches='tight', dpi=300)
		plt.close()
		print(f"Saved to {save_loc}")


	def load_scores(prediction_files, expected_num_variants=None):
		counts1, counts2, profile1, profile2 = {}, {}, {}, {}
		chrs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 'X', 'Y']

		def softmax(x, temp=1):
			norm_x = x - np.mean(x, axis=1, keepdims=True)
			return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True)

		fold_preds1 = []
		fold_preds2 = []
		existing_folds = []
		for i, file in enumerate(prediction_files):
			if not os.path.exists(file):
				continue
			existing_folds.append(i)
			print(f"	Loading {file}")
			with h5py.File(file, 'r') as h5file:
				# If expected_num_variants is provided, check if the number of variants matches
				if expected_num_variants is not None:
					num_variants = h5file["observed"]["allele1_pred_counts"].shape[0]
					if num_variants != expected_num_variants:
						print(f"Warning: Fold {i} has {num_variants} variants, expected {expected_num_variants}. Skipping this fold.")
						continue

				fold_preds1.append(h5file["observed"]["allele1_pred_counts"][:] * softmax(h5file["observed"]["allele1_pred_profiles"][:]))
				fold_preds2.append(h5file["observed"]["allele2_pred_counts"][:] * softmax(h5file["observed"]["allele2_pred_profiles"][:]))
		avg_preds1 = np.mean(np.array([fold_preds1[fold] for fold in existing_folds]), axis=0)
		avg_preds2 = np.mean(np.array([fold_preds2[fold] for fold in existing_folds]), axis=0)
		return avg_preds1, avg_preds2


	def load_shaps(shap_files):
		shaps1 = None
		shaps2 = None
		existing_folds = []
		n = len(shap_files)
		for i, file in enumerate(shap_files):
			print(f"	Loading {file}")
			if not os.path.exists(file):
				continue
			existing_folds.append(i)
			fold_shaps = deepdish.io.load(file)
			shaps_f = fold_shaps["projected_shap"]["seq"]
			N = shaps_f.shape[0]//2
			if shaps1 is None:
				shaps1 = shaps_f[:N, :, :]
				shaps2 = shaps_f[N:, :, :]
			else:
				shaps1 += shaps_f[:N, :, :]
				shaps2 += shaps_f[N:, :, :]
		shaps1 = np.transpose(shaps1/n, (0, 2, 1))
		shaps2 = np.transpose(shaps2/n, (0, 2, 1))
		assert(shaps1.shape == shaps2.shape)
		return shaps1, shaps2


	def load_list(list_loc):
		with open(list_loc, 'r') as f:
			return [line.strip() for line in f]

	def parse_models_completed(cell):
		try:
			return ast.literal_eval(cell)
		except (ValueError, SyntaxError):
			return [cell] if cell != '[]' else []

	# Parse command line arguments for parallel processing
	process_id = 0
	total_process_count = 1
	
	if len(sys.argv) >= 3:
		process_id = int(sys.argv[1])
		total_process_count = int(sys.argv[2])
		print(f"Running as process {process_id} of {total_process_count}")
	else:
		print("No parallel processing arguments provided. Running all models.")
	all_models = sorted(os.listdir("/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/pvc/outputs/outputs/chd-091625"))
	
	# Split models across processes
	models_to_process = [model for idx, model in enumerate(all_models) if idx % total_process_count == process_id]
	
	print(f"Process {process_id} will handle {len(models_to_process)} models out of {len(all_models)} total models")


	# variants_df = pd.read_csv("/users/airanman/nautilus-projects/chd-091625/variants_to_plot.tsv", sep='\t')
	# shap_prio_variants_df = pd.read_csv("/users/airanman/nautilus-projects/chd-091625/variants_to_plot.tsv", sep='\t')
	predictions_variants_df = pd.read_csv("/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/denovo_variants_chd_and_non_chd_ncDNVs.with_tbx5_variants.tsv", sep='\t', header=None, names=['chr','pos','ref','alt','variant_id'])
	shap_prio_variants_df = pd.read_csv("/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/denovo_variants_chd_and_non_chd_ncDNVs.shap-prio.variant_scorer.tsv", sep='\t', header=None, names=['chr','pos','ref','alt','variant_id'])
	is_done_df = pd.read_csv("/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/plot.is_done.tsv", sep='\t')
	i = 0
	plotting_shap = True  # Set this to True if you want to include SHAP plots

	print(f'Skipping the following finished models: {is_done_df["id"].tolist()}')

	# Get models names from directory /oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/pvc/outputs/outputs/chd-091625
	for model in models_to_process:
	# for model in os.listdir("/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/pvc/outputs/outputs/chd-091625"):
		print(f"Model: {model}")
		if model in is_done_df['id'].values:
			print("	Skipping")
			continue

		annots_df = pd.read_csv(f"/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/pvc/outputs/annotate-summ/{model}/annotations.tsv", sep='\t')

		# Load scores once for all variants in this model
		h5_path_prefix = f"/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/pvc/outputs/outputs/chd-091625/{model}/"
		prediction_files = [f"{h5_path_prefix}/fold_{f}/.variant_predictions.h5" for f in range(5)]

		# Check if they all exist
		if not all([os.path.exists(f) for f in prediction_files]):
			print(f"Model {model} has missing fold files. Skipping")
			continue

		# Removed the one invalid variant
		counts_profile_1, counts_profile_2 = load_scores(prediction_files, expected_num_variants=len(annots_df))

		if not isinstance(counts_profile_1, np.ndarray) or not isinstance(counts_profile_2, np.ndarray):
			print(f"Model {model} has invalid arrays. Skipping")
			continue

		# Load SHAP data if needed
		shaps1, shaps2 = None, None
		if plotting_shap:
			shap_type = 'counts'
			try:
				shap_path_prefix = f"/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/pvc/outputs/shap/chd-091625/{model}/"
				shap_files = [f"{shap_path_prefix}/fold_{f}/shap-prio.variant_shap.{shap_type}.h5" for f in range(5) for shap_type in ['counts']]
				shaps1, shaps2 = load_shaps(shap_files)
				if not isinstance(shaps1, np.ndarray) or not isinstance(shaps2, np.ndarray):
					print(f"	Model {model} has invalid SHAP arrays. Skipping SHAP plotting for this model")
					plotting_shap = False
			except Exception as e:
				print(f"	Error loading SHAP data for model {model}: {e}. Skipping SHAP plotting for this model")
				plotting_shap = False

		for i, row in shap_prio_variants_df.iterrows():
			# if i != len(shap_prio_variants_df)-1:
			# 	continue
			print(f"	Variant {i+1}/{len(shap_prio_variants_df)}")
			variant_idx_list = predictions_variants_df.index[predictions_variants_df['variant_id'] == row['variant_id']].tolist()
			print(row['variant_id'])

			if variant_idx_list:
				variant_idx = variant_idx_list[0]  # Get first occurrence (0-indexed)
				print('variant_idx', variant_idx)
			else:
				print("No match found")
			variant_id_internal = row['variant_id']
			variant_name = row['variant_id'] 
			# variant_id_external = row['id']

			ref_profile = counts_profile_1[variant_idx, :]
			alt_profile = counts_profile_2[variant_idx, :]
			ref_length = len(row["ref"])
			alt_length = len(row["alt"])
			ref_label = row["ref"]
			alt_label = row["alt"]

			# Get the logfc from df
			variant_row = annots_df[annots_df['variant_id'] == row['variant_id']]
			if variant_row.empty:
				print(f"Warning: No matching variant found in annotations for variant_id {row['variant_id']}. Skipping.")
				continue
			logfc = variant_row['logfc.mean'].values[0]
			logfc_pval = variant_row['logfc.mean.pval'].values[0]

			output_path = f"/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/plot/{variant_name}/{model}.svg"

			# Create output directory if it doesn't exist
			os.makedirs(os.path.dirname(output_path), exist_ok=True)

			if plotting_shap and shaps1 is not None and shaps2 is not None:
				# Get shap_idx from their row num in variant_prio_variants_df on 'variant_id'
				# shap_idx = shap_prio_variants_df.index[shap_prio_variants_df['variant_id'] == variant_id_internal].tolist()[0]
				shap_idx = i
				print('shap_idx', shap_idx)
				# Plot with SHAP and logFC stats
				ref_shap = shaps1[shap_idx, :, :]
				alt_shap = shaps2[shap_idx, :, :]
				plot_variant(ref_profile, alt_profile, ref_shap, alt_shap, ref_length, alt_length, 
							ref_label, alt_label, 300, f'{variant_name} in {model}', 
							output_path, include_shap=True, 
							logfc=logfc, logfc_pval=logfc_pval)
			else:
				# Plot without SHAP but with logFC stats
				plot_variant(ref_profile, alt_profile, None, None, ref_length, alt_length, 
							ref_label, alt_label, 300, f'{variant_name} in {model}', 
							output_path, include_shap=False,
							logfc=logfc, logfc_pval=logfc_pval)
			if os.environ['USER'] == 'airanman':
				# Get mitra URL
				mitra_cmd = f'bash /oak/stanford/groups/akundaje/airanman/projects/lab/lab-utils-shared/scripts/mitra-utils url "{output_path}"'
				result = subprocess.run(mitra_cmd, shell=True, executable="/bin/bash", capture_output=True, text=True)
				print(result.stdout)
				if result.stderr:
					print("Error:", result.stderr)
			htmler_cmd = f'cd /oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/plot/{variant_name} && bash /oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/htmler.sh .'
			htmler_result = subprocess.run(htmler_cmd, shell=True, executable="/bin/bash", capture_output=True, text=True)
			print(htmler_result.stdout)
			if htmler_result.stderr:
				print("Error:", htmler_result.stderr)

		# Add the model to the is_done_df
		new_row = pd.DataFrame([{'id': model}])
		# Use pd.concat to add the new row
		is_done_df = pd.read_csv("/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/plot.is_done.tsv", sep='\t')
		is_done_df = pd.concat([is_done_df, new_row], ignore_index=True)
		is_done_df.to_csv("/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/plot.is_done.tsv", sep='\t', index=False)
	return


@app.cell
def _():
	return


if __name__ == "__main__":
	app.run()

---

from tensorflow.keras.utils import get_custom_objects
from tensorflow.keras.models import load_model
import tensorflow as tf
from scipy.spatial.distance import jensenshannon
import pandas as pd
import numpy as np
from tqdm import tqdm
import sys
sys.path.append('..')
from generators.variant_generator import VariantGenerator
from generators.peak_generator import PeakGenerator
from utils import losses
from utils.io import get_variant_schema, get_peak_schema


def get_valid_peaks(chrom, pos, summit, input_len, chrom_sizes_dict):
	valid_chrom = chrom in chrom_sizes_dict
	if valid_chrom:
		flank = input_len // 2
		lower_check = ((pos + summit) - flank > 0)
		upper_check = ((pos + summit) + flank <= chrom_sizes_dict[chrom])
		in_bounds = lower_check and upper_check
		valid_peak = valid_chrom and in_bounds
		return valid_peak
	else:
		return False

def get_valid_variants(chrom, pos, allele1, allele2, input_len, chrom_sizes_dict):
	valid_chrom = chrom in chrom_sizes_dict
	if valid_chrom:
		flank = input_len // 2
		lower_check = (pos - flank > 0)
		upper_check = (pos + flank <= chrom_sizes_dict[chrom])
		in_bounds = lower_check and upper_check
		# no_allele1_indel = (len(allele1) == 1)
		# no_allele2_indel = (len(allele2) == 1)
		# no_indel = no_allele1_indel and no_allele2_indel
		# valid_variant = valid_chrom and in_bounds and no_indel
		valid_variant = valid_chrom and in_bounds
		return valid_variant
	else:
		return False

def softmax(x, temp=1):
	norm_x = x - np.mean(x, axis=1, keepdims=True)
	return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True)

def load_model_wrapper(model_file):
	# read .h5 model
	custom_objects = {"multinomial_nll": losses.multinomial_nll, "tf": tf}
	get_custom_objects().update(custom_objects)
	model = load_model(model_file, compile=False)
	print("model loaded succesfully")
	return model

def fetch_peak_predictions(model, peaks, input_len, genome_fasta, batch_size, debug_mode=False, lite=False,forward_only=False):
	peak_ids = []
	pred_counts = []
	pred_profiles = []
	if not forward_only:
		revcomp_counts = []
		revcomp_profiles = []

	# peak sequence generator
	peak_gen = PeakGenerator(peaks=peaks,
							 input_len=input_len,
							 genome_fasta=genome_fasta,
							 batch_size=batch_size,
							 debug_mode=debug_mode)

	for i in tqdm(range(len(peak_gen))):
		batch_peak_ids, seqs = peak_gen[i]
		revcomp_seq = seqs[:, ::-1, ::-1]

		if lite:
			batch_preds = model.predict([seqs,
										 np.zeros((len(seqs), model.output_shape[0][1])),
										 np.zeros((len(seqs), ))],
										verbose=False)

			if not forward_only:
				revcomp_batch_preds = model.predict([revcomp_seq,
											 np.zeros((len(revcomp_seq), model.output_shape[0][1])),
											 np.zeros((len(revcomp_seq), ))],
											verbose=False)
		else:
			batch_preds = model.predict(seqs, verbose=False)
			if not forward_only:
				revcomp_batch_preds = model.predict(revcomp_seq, verbose=False)

		batch_preds[1] = np.array([batch_preds[1][i] for i in range(len(batch_preds[1]))])
		pred_counts.extend(np.exp(batch_preds[1]))
		pred_profiles.extend(np.array(batch_preds[0]))   # np.squeeze(softmax()) to get probability profile

		if not forward_only:
			revcomp_batch_preds[1] = np.array([revcomp_batch_preds[1][i] for i in range(len(revcomp_batch_preds[1]))])
			revcomp_counts.extend(np.exp(revcomp_batch_preds[1]))
			revcomp_profiles.extend(np.array(revcomp_batch_preds[0]))    # np.squeeze(softmax()) to get probability profile

		peak_ids.extend(batch_peak_ids)

	peak_ids = np.array(peak_ids)
	pred_counts = np.array(pred_counts)
	pred_profiles = np.array(pred_profiles)

	if not forward_only:
		revcomp_counts = np.array(revcomp_counts)
		revcomp_profiles = np.array(revcomp_profiles)
		average_counts = np.average([pred_counts,revcomp_counts],axis=0)
		average_profiles = np.average([pred_profiles,revcomp_profiles[:,::-1]],axis=0)
		return peak_ids,average_counts,average_profiles
	else:
		return peak_ids,pred_counts,pred_profiles

def fetch_variant_predictions(model, variants_table, input_len, genome_fasta, batch_size, debug_mode=False, lite=False, shuf=False, forward_only=False):
	variant_ids = []
	allele1_pred_counts = []
	allele2_pred_counts = []
	allele1_pred_profiles = []
	allele2_pred_profiles = []
	if not forward_only:
		revcomp_allele1_pred_counts = []
		revcomp_allele2_pred_counts = []
		revcomp_allele1_pred_profiles = []
		revcomp_allele2_pred_profiles = []

	# variant sequence generator
	var_gen = VariantGenerator(variants_table=variants_table,
						   input_len=input_len,
						   genome_fasta=genome_fasta,
						   batch_size=batch_size,
						   debug_mode=False,
						   shuf=shuf)

	for i in tqdm(range(len(var_gen))):

		batch_variant_ids, allele1_seqs, allele2_seqs = var_gen[i]
		revcomp_allele1_seqs = allele1_seqs[:, ::-1, ::-1]
		revcomp_allele2_seqs = allele2_seqs[:, ::-1, ::-1]

		if lite:
			allele1_batch_preds = model.predict([allele1_seqs,
												 np.zeros((len(allele1_seqs), model.output_shape[0][1])),
												 np.zeros((len(allele1_seqs), ))],
												verbose=False)
			allele2_batch_preds = model.predict([allele2_seqs,
												 np.zeros((len(allele2_seqs), model.output_shape[0][1])),
												 np.zeros((len(allele2_seqs), ))],
												verbose=False)

			if not forward_only:
				revcomp_allele1_batch_preds = model.predict([revcomp_allele1_seqs,
													 np.zeros((len(revcomp_allele1_seqs), model.output_shape[0][1])),
													 np.zeros((len(revcomp_allele1_seqs), ))],
													verbose=False)
				revcomp_allele2_batch_preds = model.predict([revcomp_allele2_seqs,
										 np.zeros((len(revcomp_allele2_seqs), model.output_shape[0][1])),
										 np.zeros((len(revcomp_allele2_seqs), ))],
										verbose=False)
		else:
			allele1_batch_preds = model.predict(allele1_seqs, verbose=False)
			allele2_batch_preds = model.predict(allele2_seqs, verbose=False)
			if not forward_only:
				revcomp_allele1_batch_preds = model.predict(revcomp_allele1_seqs, verbose=False)
				revcomp_allele2_batch_preds = model.predict(revcomp_allele2_seqs, verbose=False)

		allele1_batch_preds[1] = np.array([allele1_batch_preds[1][i] for i in range(len(allele1_batch_preds[1]))])
		allele2_batch_preds[1] = np.array([allele2_batch_preds[1][i] for i in range(len(allele2_batch_preds[1]))])
		allele1_pred_counts.extend(np.exp(allele1_batch_preds[1]))
		allele2_pred_counts.extend(np.exp(allele2_batch_preds[1]))
		allele1_pred_profiles.extend(np.array(allele1_batch_preds[0]))   # np.squeeze(softmax()) to get probability profile
		allele2_pred_profiles.extend(np.array(allele2_batch_preds[0]))

		if not forward_only:
			revcomp_allele1_batch_preds[1] = np.array([revcomp_allele1_batch_preds[1][i] for i in range(len(revcomp_allele1_batch_preds[1]))])
			revcomp_allele2_batch_preds[1] = np.array([revcomp_allele2_batch_preds[1][i] for i in range(len(revcomp_allele2_batch_preds[1]))])
			revcomp_allele1_pred_counts.extend(np.exp(revcomp_allele1_batch_preds[1]))
			revcomp_allele2_pred_counts.extend(np.exp(revcomp_allele2_batch_preds[1]))
			revcomp_allele1_pred_profiles.extend(np.array(revcomp_allele1_batch_preds[0]))   # np.squeeze(softmax()) to get probability profile
			revcomp_allele2_pred_profiles.extend(np.array(revcomp_allele2_batch_preds[0]))

		variant_ids.extend(batch_variant_ids)

	variant_ids = np.array(variant_ids)
	allele1_pred_counts = np.array(allele1_pred_counts)
	allele2_pred_counts = np.array(allele2_pred_counts)
	allele1_pred_profiles = np.array(allele1_pred_profiles)
	allele2_pred_profiles = np.array(allele2_pred_profiles)

	if not forward_only:
		revcomp_allele1_pred_counts = np.array(revcomp_allele1_pred_counts)
		revcomp_allele2_pred_counts = np.array(revcomp_allele2_pred_counts)
		revcomp_allele1_pred_profiles = np.array(revcomp_allele1_pred_profiles)
		revcomp_allele2_pred_profiles = np.array(revcomp_allele2_pred_profiles)
		average_allele1_pred_counts = np.average([allele1_pred_counts,revcomp_allele1_pred_counts],axis=0)
		average_allele1_pred_profiles = np.average([allele1_pred_profiles,revcomp_allele1_pred_profiles[:,::-1]],axis=0)
		average_allele2_pred_counts = np.average([allele2_pred_counts,revcomp_allele2_pred_counts],axis=0)
		average_allele2_pred_profiles = np.average([allele2_pred_profiles,revcomp_allele2_pred_profiles[:,::-1]],axis=0)
		return variant_ids, average_allele1_pred_counts, average_allele2_pred_counts, \
			   average_allele1_pred_profiles, average_allele2_pred_profiles
	else:
		return variant_ids, allele1_pred_counts, allele2_pred_counts, \
			   allele1_pred_profiles, allele2_pred_profiles

def get_variant_scores_with_peaks(allele1_pred_counts, allele2_pred_counts,
					   allele1_pred_profiles, allele2_pred_profiles, pred_counts):
	# logfc = np.log2(allele2_pred_counts / allele1_pred_counts)
	# jsd = np.array([jensenshannon(x,y,base=2.0) for x,y in zip(allele2_pred_profiles, allele1_pred_profiles)])

	logfc, jsd = get_variant_scores(allele1_pred_counts, allele2_pred_counts,
									allele1_pred_profiles, allele2_pred_profiles)
	allele1_quantile = np.array([np.max([np.mean(pred_counts < x), (1/len(pred_counts))]) for x in allele1_pred_counts])
	allele2_quantile = np.array([np.max([np.mean(pred_counts < x), (1/len(pred_counts))]) for x in allele2_pred_counts])

	return logfc, jsd, allele1_quantile, allele2_quantile

def get_variant_scores(allele1_pred_counts, allele2_pred_counts,
					   allele1_pred_profiles, allele2_pred_profiles):

	print('allele1_pred_counts shape:', allele1_pred_counts.shape)
	print('allele2_pred_counts shape:', allele2_pred_counts.shape)
	print('allele1_pred_profiles shape:', allele1_pred_profiles.shape)
	print('allele2_pred_profiles shape:', allele2_pred_profiles.shape)

	logfc = np.squeeze(np.log2(allele2_pred_counts / allele1_pred_counts))
	jsd = np.squeeze([jensenshannon(x, y, base=2.0)
					 for x,y in zip(softmax(allele2_pred_profiles),
									softmax(allele1_pred_profiles))])

	print('logfc shape:', logfc.shape)
	print('jsd shape:', jsd.shape)

	return logfc, jsd

def adjust_indel_jsd(variants_table,allele1_pred_profiles,allele2_pred_profiles,original_jsd):
	allele1_pred_profiles = softmax(allele1_pred_profiles)
	allele2_pred_profiles = softmax(allele2_pred_profiles)
	indel_idx = []
	for i, row in variants_table.iterrows():
		allele1, allele2 = row[['allele1','allele2']]
		if allele1 == "-":
			allele1 = ""
		if allele2 == "-":
			allele2 = ""
		if len(allele1) != len(allele2):
			indel_idx += [i]

	adjusted_jsd = []
	for i in indel_idx:
		row = variants_table.iloc[i]
		allele1, allele2 = row[['allele1','allele2']]
		if allele1 == "-":
			allele1 = ""
		if allele2 == "-":
			allele2 = ""

		allele1_length = len(allele1)
		allele2_length = len(allele2)

		allele1_p = allele1_pred_profiles[i]
		allele2_p = allele2_pred_profiles[i]
		assert len(allele1_p) == len(allele2_p)
		assert allele1_length != allele2_length
		flank_size = len(allele1_p)//2
		allele1_left_flank = allele1_p[:flank_size]
		allele2_left_flank = allele2_p[:flank_size]

		if allele1_length > allele2_length:
			allele1_right_flank = np.concatenate([allele1_p[flank_size:flank_size+allele2_length],allele1_p[flank_size+allele1_length:]])
			allele2_right_flank = allele2_p[flank_size:allele2_length-allele1_length]
		else:
			allele1_right_flank = allele1_p[flank_size:allele1_length-allele2_length]
			allele2_right_flank = np.concatenate([allele2_p[flank_size:flank_size+allele1_length], allele2_p[flank_size+allele2_length:]])


		adjusted_allele1_p = np.concatenate([allele1_left_flank,allele1_right_flank])
		adjusted_allele2_p = np.concatenate([allele2_left_flank,allele2_right_flank])
		adjusted_allele1_p = adjusted_allele1_p/np.sum(adjusted_allele1_p)
		adjusted_allele2_p = adjusted_allele2_p/np.sum(adjusted_allele2_p)
		assert len(adjusted_allele1_p) == len(adjusted_allele2_p)
		adjusted_j = jensenshannon(adjusted_allele1_p,adjusted_allele2_p,base=2.0)
		adjusted_jsd += [adjusted_j]

	adjusted_jsd_list = original_jsd.copy()
	if len(indel_idx) > 0:
		for i in range(len(indel_idx)):
			idx = indel_idx[i]
			adjusted_jsd_list[idx] = adjusted_jsd[i]

	return indel_idx, adjusted_jsd_list


def create_shuffle_table(variants_table, random_seed=None, total_shuf=None, num_shuf=None):
	if total_shuf != None:
		if len(variants_table) > total_shuf:
			shuf_variants_table = variants_table.sample(total_shuf,
														random_state=random_seed,
														ignore_index=True,
														replace=False)
		else:
			shuf_variants_table = variants_table.sample(total_shuf,
														random_state=random_seed,
														ignore_index=True,
														replace=True)
		shuf_variants_table['random_seed'] = np.random.permutation(len(shuf_variants_table))
	else:
		if num_shuf != None:
			total_shuf = len(variants_table) * num_shuf
			shuf_variants_table = variants_table.sample(total_shuf,
														random_state=random_seed,
														ignore_index=True,
														replace=True)
			shuf_variants_table['random_seed'] = np.random.permutation(len(shuf_variants_table))
		else:
			## empty dataframe
			shuf_variants_table = pd.DataFrame()
	return shuf_variants_table

def get_pvals(obs, bg, tail):
	sorted_bg = np.sort(bg)
	if tail == 'right' or tail == 'both':
		rank_right = len(sorted_bg) - np.searchsorted(sorted_bg, obs, side='left')
		pval_right = (rank_right + 1) / (len(sorted_bg) + 1)
		if tail == 'right':
			return pval_right
	if tail == 'left' or tail == 'both':
		rank_left = np.searchsorted(sorted_bg, obs, side='right')
		pval_left = (rank_left + 1) / (len(sorted_bg) + 1)
		if tail == 'left':
			return pval_left
	assert tail == 'both'
	min_pval = np.minimum(pval_left, pval_right)
	pval_both = min_pval * 2

	return pval_both