import pandas as pd
import numpy as np
import os
import argparse
import scipy.stats
from scipy.spatial.distance import jensenshannon
from tensorflow.keras.utils import get_custom_objects
from tensorflow.keras.models import load_model
import tensorflow as tf
import h5py
import math
from generators.variant_generator import VariantGenerator
from generators.peak_generator import PeakGenerator
from utils import argmanager, losses
from utils.helpers import *
from utils.io import *
import shap
from utils.shap_utils import *
import deepdish as dd

tf.compat.v1.disable_v2_behavior()


def main():
    """Compute per-allele SHAP contribution scores for a table of variants.

    Reads CLI options via ``argmanager.fetch_shap_args`` (model path, variant
    list, genome fasta, chrom sizes, shap types, ...), filters out variants
    whose input window would fall off the chromosome, runs ``fetch_shap``
    batch by batch, and writes one deepdish HDF5 file per requested shap_type
    to ``<out_prefix>.variant_shap.<shap_type>.h5``.

    Raises:
        OSError: if the directory of ``args.out_prefix`` does not exist.
        ValueError: if no valid variants remain after filtering.
    """
    args = argmanager.fetch_shap_args()
    print(args)

    out_dir = os.path.sep.join(args.out_prefix.split(os.path.sep)[:-1])
    print()
    print('out_dir:', out_dir)
    print()
    if not os.path.exists(out_dir):
        raise OSError("Output directory does not exist")

    model = load_model_wrapper(args.model)
    variants_table = load_variant_table(args.list, args.schema)
    # '-' is the downstream sentinel for a missing allele (indels).
    variants_table = variants_table.fillna('-')

    chrom_sizes = pd.read_csv(args.chrom_sizes, header=None, sep='\t', names=['chrom', 'size'])
    chrom_sizes_dict = chrom_sizes.set_index('chrom')['size'].to_dict()

    if args.debug_mode:
        variants_table = variants_table.sample(10)
        print(variants_table.head())

    # Infer input length from the model; "lite" models take a list of inputs,
    # so the sequence input is the first element of input_shape.
    if args.lite:
        input_len = model.input_shape[0][1]
    else:
        input_len = model.input_shape[1]
    print("input length inferred from the model: ", input_len)

    print(variants_table.shape)
    # Drop variants whose flanking window would run past a chromosome edge.
    variants_table = variants_table.loc[variants_table.apply(
        lambda x: get_valid_variants(x.chr, x.pos, x.allele1, x.allele2,
                                     input_len, chrom_sizes_dict),
        axis=1)]
    variants_table.reset_index(drop=True, inplace=True)
    print(variants_table.shape)

    for shap_type in args.shap_type:
        # Clamp the batch size so a variant table smaller than args.batch_size
        # does not produce empty/over-long slices.
        batch_size = min(args.batch_size, len(variants_table))
        if batch_size == 0:
            raise ValueError("No valid variants to score")

        allele1_seq_chunks, allele2_seq_chunks = [], []
        allele1_score_chunks, allele2_score_chunks = [], []
        variant_id_chunks = []

        # Single pass over the table in batch_size slices (the final slice may
        # be shorter) — equivalent to the old full-batches + remainder logic,
        # without the duplicated accumulation code.
        for start in range(0, len(variants_table), batch_size):
            sub_table = variants_table[start:start + batch_size]
            var_ids, allele1_inputs, allele2_inputs, \
            allele1_shap, allele2_shap = fetch_shap(model,
                                                    sub_table,
                                                    input_len,
                                                    args.genome,
                                                    args.batch_size,
                                                    debug_mode=args.debug_mode,
                                                    lite=args.lite,
                                                    bias=None,
                                                    shuf=False,
                                                    shap_type=shap_type)
            allele1_seq_chunks.append(allele1_inputs)
            allele2_seq_chunks.append(allele2_inputs)
            allele1_score_chunks.append(allele1_shap)
            allele2_score_chunks.append(allele2_shap)
            variant_id_chunks.append(np.asarray(var_ids))

        allele1_seqs = np.concatenate(allele1_seq_chunks)
        allele2_seqs = np.concatenate(allele2_seq_chunks)
        allele1_scores = np.concatenate(allele1_score_chunks)
        allele2_scores = np.concatenate(allele2_score_chunks)
        variant_ids = np.concatenate(variant_id_chunks)

        assert allele1_seqs.shape == allele1_scores.shape
        assert allele2_seqs.shape == allele2_scores.shape
        assert allele1_seqs.shape == allele2_seqs.shape
        assert allele1_scores.shape == allele2_scores.shape
        assert allele1_seqs.shape[2] == 4
        # BUG FIX: the original asserted len(allele1_seqs == len(variant_ids)),
        # i.e. the length of a broadcast comparison array, which is always
        # truthy. The intent was to compare the two lengths.
        assert len(allele1_seqs) == len(variant_ids)

        # Both alleles are stacked along axis 0: allele1 rows first, then
        # allele2 rows; 'alleles' records which half each row came from
        # (0 = allele1, 1 = allele2).
        shap_dict = {
            'raw': {'seq': np.concatenate((np.transpose(allele1_seqs, (0, 2, 1)).astype(np.int8),
                                           np.transpose(allele2_seqs, (0, 2, 1)).astype(np.int8)))},
            'shap': {'seq': np.concatenate((np.transpose(allele1_scores, (0, 2, 1)).astype(np.float16),
                                            np.transpose(allele2_scores, (0, 2, 1)).astype(np.float16)))},
            'projected_shap': {'seq': np.concatenate((np.transpose(allele1_seqs * allele1_scores, (0, 2, 1)).astype(np.float16),
                                                      np.transpose(allele2_seqs * allele2_scores, (0, 2, 1)).astype(np.float16)))},
            'variant_ids': np.concatenate((np.array(variant_ids), np.array(variant_ids))),
            'alleles': np.concatenate((np.array([0] * len(variant_ids)),
                                       np.array([1] * len(variant_ids))))}

        dd.io.save(''.join([args.out_prefix, ".variant_shap.%s.h5" % shap_type]),
                   shap_dict,
                   compression='blosc')

    print("DONE")


if __name__ == "__main__":
    main()

# ---

from tensorflow.keras.utils import Sequence
import pandas as pd
import numpy as np
import math
import pyfaidx
from utils import one_hot
from deeplift.dinuc_shuffle import dinuc_shuffle


class VariantGenerator(Sequence):
    """Keras Sequence yielding (variant_ids, allele1 seqs, allele2 seqs) batches.

    For each variant the genome is queried for a window of ``input_len``
    centred on the (1-indexed) position, and two equal-length sequences are
    built — one per allele. In debug mode raw strings are returned; otherwise
    sequences are one-hot encoded.
    """

    def __init__(self, variants_table, input_len, genome_fasta, batch_size=512, debug_mode=False, shuf=False):
        self.variants_table = variants_table
        self.num_variants = self.variants_table.shape[0]
        self.input_len = input_len
        self.genome = pyfaidx.Fasta(genome_fasta)
        self.debug_mode = debug_mode
        # Half-window on each side of the variant position.
        self.flank_size = self.input_len // 2
        self.shuf = shuf
        self.batch_size = batch_size

    def __get_allele_seq__(self, chrom, pos, allele1, allele2, seed=-1):
        """Return (allele1_seq, allele2_seq), both exactly 2*flank_size long.

        '-' alleles are treated as empty strings (indel notation). When
        ``self.shuf`` is set, the flank is dinucleotide-shuffled with the
        per-variant ``seed`` (required, hence the assert).
        """
        chrom = str(chrom)
        pos = int(pos)
        allele1 = str(allele1)
        allele2 = str(allele2)
        if allele1 == "-":
            allele1 = ""
        if allele2 == "-":
            allele2 = ""

        # Input positions are 1-indexed; pyfaidx slicing is 0-indexed.
        pos = pos - 1

        if len(allele1) == len(allele2):
            # Substitution: both alleles drop into the same-sized slot.
            flank = str(self.genome[chrom][pos-self.flank_size:pos+self.flank_size])
            if self.shuf:
                assert seed != -1
                flank = dinuc_shuffle(flank, rng=np.random.RandomState(seed))
            allele1_seq = flank[:self.flank_size] + allele1 + flank[self.flank_size+len(allele1):]
            allele2_seq = flank[:self.flank_size] + allele2 + flank[self.flank_size+len(allele2):]
        # Handle INDELs (allele1 must be the reference allele). The flanks are
        # adjusted for the length mismatch so that the allele1 and allele2
        # sequences come out the same length.
        else:
            # hg19 can contain lower-case bases, hence the .upper() here.
            assert len(allele1) != len(allele2)
            assert self.genome[chrom][pos:pos+len(allele1)].seq.upper() == allele1
            mismatch_length = len(allele1) - len(allele2)
            if mismatch_length > 0:
                # Deletion: fetch extra reference sequence to pad allele2.
                flank = str(self.genome[chrom][pos-self.flank_size:pos+self.flank_size+mismatch_length])
            else:
                flank = str(self.genome[chrom][pos-self.flank_size:pos+self.flank_size])
            if self.shuf:
                assert seed != -1
                flank = dinuc_shuffle(flank, rng=np.random.RandomState(seed))
            left_flank = flank[:self.flank_size]
            allele1_right_flank = flank[self.flank_size+len(allele1):self.flank_size*2]
            allele2_right_flank = flank[self.flank_size+len(allele1):self.flank_size*2+mismatch_length]
            allele1_seq = left_flank + allele1 + allele1_right_flank
            allele2_seq = left_flank + allele2 + allele2_right_flank

        assert len(allele1_seq) == self.flank_size * 2
        assert len(allele2_seq) == self.flank_size * 2
        return allele1_seq, allele2_seq

    def __getitem__(self, idx):
        """Return one batch; requires a ``random_seed`` column when shuf=True."""
        cur_entries = self.variants_table.iloc[idx*self.batch_size:min([self.num_variants, (idx+1)*self.batch_size])]
        variant_ids = cur_entries['variant_id'].tolist()
        if self.shuf:
            allele1_seqs, allele2_seqs = zip(*[self.__get_allele_seq__(v, w, x, y, z)
                                               for v, w, x, y, z in zip(cur_entries.chr,
                                                                        cur_entries.pos,
                                                                        cur_entries.allele1,
                                                                        cur_entries.allele2,
                                                                        cur_entries.random_seed)])
        else:
            allele1_seqs, allele2_seqs = zip(*[self.__get_allele_seq__(w, x, y, z)
                                               for w, x, y, z in zip(cur_entries.chr,
                                                                     cur_entries.pos,
                                                                     cur_entries.allele1,
                                                                     cur_entries.allele2)])
        if self.debug_mode:
            return variant_ids, list(allele1_seqs), list(allele2_seqs)
        else:
            return variant_ids, one_hot.dna_to_one_hot(list(allele1_seqs)), one_hot.dna_to_one_hot(list(allele2_seqs))

    def __len__(self):
        return math.ceil(self.num_variants/self.batch_size)

# ---

import marimo

__generated_with = "0.16.2"
app = marimo.App(width="medium", auto_download=["html"])


@app.cell
def _():
    import os.path
    import os
    import ast
    import sys
    import subprocess
    import time
    import hashlib
    from collections import defaultdict
    import numpy as np
    import pandas as pd
    import deepdish
    import h5py
    import multiprocessing
    import matplotlib as mpl
    mpl.use('Agg')
    mpl.rcParams["svg.fonttype"] = "none"
    import matplotlib.pyplot as plt
    import logomaker

    def load_peaks_4(peaks_loc):
        """Load a 4-column BED-like peak file; return {chrom: sub-DataFrame}."""
        peak_df = pd.read_csv(peaks_loc, sep="\t", names=["chro", "start", "stop", "name"])
        peak_finder = dict()
        for chro in set(peak_df["chro"]):
            peak_finder[chro] = peak_df[peak_df["chro"] == chro]
        return peak_finder
"stop", "name"]) peak_finder = dict() for chro in set(peak_df["chro"]): peak_finder[chro] = peak_df[peak_df["chro"] == chro] return peak_finder def load_peaks_10(peaks_loc): peak_df = pd.read_csv(peaks_loc, sep="\t", names=["chro", "start", "stop", "4", "5", "6", "7", "8", "9", "summit"]) peak_df["name"] = peak_df["chro"] + ":" + peak_df["start"].astype(str) + ":" + peak_df["stop"].astype(str) peak_finder = dict() for chro in set(peak_df["chro"]): peak_finder[chro] = peak_df[peak_df["chro"] == chro] return peak_finder def assign_peaks(variants_df, peaks_loc, peaks_name, num_cols): print(f"starting {peaks_name}") if num_cols == 4: peak_finder = load_peaks_4(peaks_loc) elif num_cols == 10: peak_finder = load_peaks_10(peaks_loc) else: assert(False) in_peaks = [] for index, row in variants_df.iterrows(): chro, pos, ref = row["chr"], row["pos"], row["ref"] pos_start = pos pos_end = pos + len(ref) - 1 if chro not in peak_finder: in_peaks.append("out_of_peak") continue else: peak_chro = peak_finder[chro] # Option 1 - variant start is within region match_1 = peak_chro[(peak_chro["start"] <= pos_start) & (peak_chro["stop"] >= pos_start)] # Option 2 - variant stop is within region match_2 = peak_chro[(peak_chro["start"] <= pos_end) & (peak_chro["stop"] >= pos_end)] # Option 3 - variant region encapsulates a whole region match_3 = peak_chro[(peak_chro["start"] >= pos_start) & (peak_chro["stop"] <= pos_end)] # Combine all matches all_matches_df = pd.concat([match_1, match_2, match_3], axis=0, ignore_index=True) all_matches_df.drop_duplicates(inplace=True) # In peak or not in_peaks.append(" & ".join(list(all_matches_df["name"])) if len(all_matches_df) > 0 else "out_of_peak") return {peaks_name: in_peaks} def assign_peaks_parallel(variants_df, peaks_dict, num_cols=10): inputs = [] for peaks_name, peaks_loc in peaks_dict.items(): inputs.append((variants_df, peaks_loc, peaks_name, num_cols)) with multiprocessing.Pool() as pool: results = pool.starmap(assign_peaks, inputs) 
pool.close() pool.join() for x in results: for pn, ip in x.items(): variants_df[pn] = ip return variants_df def _plotter_profile(allele1_profiles, allele2_profiles, vlines, ax0, allele1_label, allele2_label): xmins, xmaxs = [], [] shortened_allele1_label = allele1_label if len(allele1_label) < 15 else f"{allele1_label[:15]}... (len {len(allele1_label)})" for i, (x, allele1_profile) in enumerate(allele1_profiles): ax0.plot(x, allele1_profile, label=(f"ref ({shortened_allele1_label})" if i==0 else ""), color='C0') xmins.append(min(x)); xmaxs.append(max(x)) shortened_allele2_label = allele2_label if len(allele2_label) < 15 else f"{allele2_label[:15]}... (len {len(allele2_label)})" for i, (x, allele2_profile) in enumerate(allele2_profiles): ax0.plot(x, allele2_profile, label=(f"alt ({shortened_allele2_label})" if i==0 else ""), color='C1') xmins.append(min(x)); xmaxs.append(max(x)) for v in vlines: ax0.axvline(v, color='black', ls='--', linewidth=1) xmin, xmax = min(xmins), max(xmaxs) ax0.set_xlim(xmin, xmax) ax0.set_xticks(np.arange(xmin + (50 - xmin%50)%50, xmax+1, 50)) legend = ax0.legend(prop={'size': 12}, loc='upper right') def _plotter_shap(allele1_shap, allele2_shap, vlines, xmin, ax1, ax2, allele1_label, allele2_label): active_allele = "ref" if np.sum(allele1_shap) > np.sum(allele2_shap) else "alt" df1 = pd.DataFrame(allele1_shap, columns=["A", "C", "G", "T"]) df1.index += xmin df2 = pd.DataFrame(allele2_shap, columns=["A", "C", "G", "T"]) df2.index += xmin logomaker.Logo(df1, ax=ax1) logomaker.Logo(df2, ax=ax2) for v in vlines: ax1.axvline(v, color='k', linestyle='--', linewidth=1) ax2.axvline(v, color='k', linestyle='--', linewidth=1) ymax = 1.1*max(np.max(np.maximum(allele1_shap, 0)), np.max(np.maximum(allele2_shap, 0))) ymin = 1.1*min(np.min(np.minimum(allele1_shap, 0)), np.min(np.minimum(allele2_shap, 0))) ax1.set_ylim(bottom=ymin, top=ymax) ax2.set_ylim(bottom=ymin, top=ymax) shortened_allele_1_label = allele1_label if len(allele1_label) < 15 else 
f"{allele1_label[:15]}... (len {len(allele1_label)})" plt.text(0.988, 0.903, f"ref ({shortened_allele_1_label})", verticalalignment='top', horizontalalignment='right', transform=ax1.transAxes, size=12, color='black', bbox=dict(boxstyle='round', facecolor='white', edgecolor='lightgrey')) shortened_allele2_label = allele2_label if len(allele2_label) < 15 else f"{allele2_label[:15]}... (len {len(allele2_label)})" plt.text(0.988, 0.903, f"alt ({shortened_allele2_label})", verticalalignment='top', horizontalalignment='right', transform=ax2.transAxes, size=12, color='black', bbox=dict(boxstyle='round', facecolor='white', edgecolor='lightgrey')) def _add_logfc_stats_box(ax, logfc, logfc_pval): """Add a box with logFC and p-value statistics to the plot""" # Format the p-value in scientific notation if it's very small if logfc_pval < 0.001: pval_text = f"{logfc_pval:.2e}" else: pval_text = f"{logfc_pval:.3f}" # Create the text for the stats box stats_text = f"LogFC: {logfc:.3f}\nP-value: {pval_text}" # Get the legend position legend = ax.get_legend() if legend: # Get legend bounding box in axes coordinates bbox = legend.get_window_extent(ax.figure.canvas.get_renderer()) bbox_axes = bbox.transformed(ax.transAxes.inverted()) legend_top = bbox_axes.y1 # Top of legend legend_bottom = bbox_axes.y0 # Bottom of legend # Use the legend's top position for alignment y_pos = legend_top else: # Fallback if no legend exists y_pos = 1.0 # Add text box on the left side at the same height as the legend plt.text(0.008, y_pos-0.04, stats_text, verticalalignment='top', horizontalalignment='left', transform=ax.transAxes, size=12, color='black', bbox=dict(boxstyle='round', facecolor='white', edgecolor='lightgrey')) def _plot_profile(allele1_pred, allele2_pred, allele1_length, allele2_length, allele1_label, allele2_label, window_size, ax0, logfc=None, logfc_pval=None): total_length = 1000 C = total_length//2 F = window_size//2 if allele1_length < allele2_length: # INSERTION allele1_pred_plots = 
[] allele1_pred_plots.append((list(range(-F, allele1_length)), allele1_pred[C-F:C+allele1_length])) allele1_pred_plots.append((list(range(allele2_length, F+allele2_length)), allele1_pred[C+allele1_length:C+F+allele1_length])) allele2_pred_plots = [(list(range(-F, F+allele2_length)), allele2_pred[C-F:C+F+allele2_length])] vlines = [-0.5, allele2_length-0.5] elif allele1_length > allele2_length: # DELETION allele1_pred_plots = [(list(range(-F, F+allele1_length)), allele1_pred[C-F:C+F+allele1_length])] allele2_pred_plots = [] allele2_pred_plots.append((list(range(-F, allele2_length)), allele2_pred[C-F:C+allele2_length])) allele2_pred_plots.append((list(range(allele1_length, F+allele1_length)), allele2_pred[C+allele2_length:C+F+allele2_length])) vlines = [-0.5, allele1_length-0.5] else: # SUBSTITUTION allele1_pred_plots = [(list(range(-F, F+allele1_length)), allele1_pred[C-F:C+F+allele1_length])] allele2_pred_plots = [(list(range(-F, F+allele2_length)), allele2_pred[C-F:C+F+allele2_length])] vlines = [-0.5, +allele1_length-0.5] _plotter_profile(allele1_pred_plots, allele2_pred_plots, vlines, ax0, allele1_label, allele2_label) # Add logFC stats box if values are provided if logfc is not None and logfc_pval is not None: _add_logfc_stats_box(ax0, logfc, logfc_pval) def _plot_shap(allele1_shap, allele2_shap, allele1_length, allele2_length, allele1_label, allele2_label, window_size, ax1, ax2): total_length = 2114 C = total_length//2 F = window_size//2 if allele1_length < allele2_length: # INSERTION allele1_shap_plot = np.concatenate([allele1_shap[C-F:C+allele1_length], np.zeros((allele2_length-allele1_length, 4)), allele1_shap[C+allele1_length:C+F+allele1_length]]) allele2_shap_plot = allele2_shap[C-F:C+F+allele2_length] vlines = [-0.5, allele2_length-0.5] elif allele1_length > allele2_length: # DELETION allele1_shap_plot = allele1_shap[C-F:C+F+allele1_length] allele2_shap_plot = np.concatenate([allele2_shap[C-F:C+allele2_length], np.zeros((allele1_length-allele2_length, 
4)), allele2_shap[C+allele2_length:C+F+allele2_length]]) vlines = [-0.5, allele1_length-0.5] else: # SUBSTITUTION allele1_shap_plot = allele1_shap[C-F:C+F+allele1_length] allele2_shap_plot = allele2_shap[C-F:C+F+allele2_length] vlines = [-0.5, allele1_length-0.5] assert(allele1_shap_plot.shape == allele2_shap_plot.shape) _plotter_shap(allele1_shap_plot, allele2_shap_plot, vlines, -F, ax1, ax2, allele1_label, allele2_label) def plot_variant(allele1_pred, allele2_pred, allele1_shap, allele2_shap, allele1_length, allele2_length, allele1_label, allele2_label, window_size, title, save_loc, include_shap=True, logfc=None, logfc_pval=None): """ Plot variant with optional SHAP visualization and logFC statistics Parameters: - include_shap: If True, creates 3-panel plot with profile + 2 SHAP plots. If False, creates 1-panel plot with just profile. - logfc: LogFC value to display in stats box - logfc_pval: LogFC p-value to display in stats box """ if include_shap: fig, axs = plt.subplots(3, 1, figsize=(20, 8), dpi=400) # PLOT PROFILE _plot_profile(allele1_pred, allele2_pred, allele1_length, allele2_length, allele1_label, allele2_label, window_size, axs[0], logfc, logfc_pval) # PLOT SHAP _plot_shap(allele1_shap, allele2_shap, allele1_length, allele2_length, allele1_label, allele2_label, window_size, axs[1], axs[2]) plt.subplots_adjust(hspace=0.3, top=0.785) else: fig, ax = plt.subplots(1, 1, figsize=(20, 4), dpi=400) # PLOT PROFILE ONLY _plot_profile(allele1_pred, allele2_pred, allele1_length, allele2_length, allele1_label, allele2_label, window_size, ax, logfc, logfc_pval) plt.subplots_adjust(top=0.85) # PLOT plt.suptitle(title, fontsize=24) fig.set_facecolor('white') plt.savefig(save_loc, bbox_inches='tight', dpi=300) plt.close() print(f"Saved to {save_loc}") def load_scores(prediction_files, expected_num_variants=None): counts1, counts2, profile1, profile2 = {}, {}, {}, {} chrs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 'X', 'Y'] def 
softmax(x, temp=1): norm_x = x - np.mean(x, axis=1, keepdims=True) return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True) fold_preds1 = [] fold_preds2 = [] existing_folds = [] for i, file in enumerate(prediction_files): if not os.path.exists(file): continue existing_folds.append(i) print(f" Loading {file}") with h5py.File(file, 'r') as h5file: # If expected_num_variants is provided, check if the number of variants matches if expected_num_variants is not None: num_variants = h5file["observed"]["allele1_pred_counts"].shape[0] if num_variants != expected_num_variants: print(f"Warning: Fold {i} has {num_variants} variants, expected {expected_num_variants}. Skipping this fold.") continue fold_preds1.append(h5file["observed"]["allele1_pred_counts"][:] * softmax(h5file["observed"]["allele1_pred_profiles"][:])) fold_preds2.append(h5file["observed"]["allele2_pred_counts"][:] * softmax(h5file["observed"]["allele2_pred_profiles"][:])) avg_preds1 = np.mean(np.array([fold_preds1[fold] for fold in existing_folds]), axis=0) avg_preds2 = np.mean(np.array([fold_preds2[fold] for fold in existing_folds]), axis=0) return avg_preds1, avg_preds2 def load_shaps(shap_files): shaps1 = None shaps2 = None existing_folds = [] n = len(shap_files) for i, file in enumerate(shap_files): print(f" Loading {file}") if not os.path.exists(file): continue existing_folds.append(i) fold_shaps = deepdish.io.load(file) shaps_f = fold_shaps["projected_shap"]["seq"] N = shaps_f.shape[0]//2 if shaps1 is None: shaps1 = shaps_f[:N, :, :] shaps2 = shaps_f[N:, :, :] else: shaps1 += shaps_f[:N, :, :] shaps2 += shaps_f[N:, :, :] shaps1 = np.transpose(shaps1/n, (0, 2, 1)) shaps2 = np.transpose(shaps2/n, (0, 2, 1)) assert(shaps1.shape == shaps2.shape) return shaps1, shaps2 def load_list(list_loc): with open(list_loc, 'r') as f: return [line.strip() for line in f] def parse_models_completed(cell): try: return ast.literal_eval(cell) except (ValueError, SyntaxError): return [cell] if cell != '[]' 
else [] # Parse command line arguments for parallel processing process_id = 0 total_process_count = 1 if len(sys.argv) >= 3: process_id = int(sys.argv[1]) total_process_count = int(sys.argv[2]) print(f"Running as process {process_id} of {total_process_count}") else: print("No parallel processing arguments provided. Running all models.") all_models = sorted(os.listdir("/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/pvc/outputs/outputs/chd-091625")) # Split models across processes models_to_process = [model for idx, model in enumerate(all_models) if idx % total_process_count == process_id] print(f"Process {process_id} will handle {len(models_to_process)} models out of {len(all_models)} total models") # variants_df = pd.read_csv("/users/airanman/nautilus-projects/chd-091625/variants_to_plot.tsv", sep='\t') # shap_prio_variants_df = pd.read_csv("/users/airanman/nautilus-projects/chd-091625/variants_to_plot.tsv", sep='\t') predictions_variants_df = pd.read_csv("/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/denovo_variants_chd_and_non_chd_ncDNVs.with_tbx5_variants.tsv", sep='\t', header=None, names=['chr','pos','ref','alt','variant_id']) shap_prio_variants_df = pd.read_csv("/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/denovo_variants_chd_and_non_chd_ncDNVs.shap-prio.variant_scorer.tsv", sep='\t', header=None, names=['chr','pos','ref','alt','variant_id']) is_done_df = pd.read_csv("/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/plot.is_done.tsv", sep='\t') i = 0 plotting_shap = True # Set this to True if you want to include SHAP plots print(f'Skipping the following finished models: {is_done_df["id"].tolist()}') # Get models names from directory /oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/pvc/outputs/outputs/chd-091625 for model in models_to_process: # for model in os.listdir("/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/pvc/outputs/outputs/chd-091625"): 
print(f"Model: {model}") if model in is_done_df['id'].values: print(" Skipping") continue annots_df = pd.read_csv(f"/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/pvc/outputs/annotate-summ/{model}/annotations.tsv", sep='\t') # Load scores once for all variants in this model h5_path_prefix = f"/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/pvc/outputs/outputs/chd-091625/{model}/" prediction_files = [f"{h5_path_prefix}/fold_{f}/.variant_predictions.h5" for f in range(5)] # Check if they all exist if not all([os.path.exists(f) for f in prediction_files]): print(f"Model {model} has missing fold files. Skipping") continue # Removed the one invalid variant counts_profile_1, counts_profile_2 = load_scores(prediction_files, expected_num_variants=len(annots_df)) if not isinstance(counts_profile_1, np.ndarray) or not isinstance(counts_profile_2, np.ndarray): print(f"Model {model} has invalid arrays. Skipping") continue # Load SHAP data if needed shaps1, shaps2 = None, None if plotting_shap: shap_type = 'counts' try: shap_path_prefix = f"/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/pvc/outputs/shap/chd-091625/{model}/" shap_files = [f"{shap_path_prefix}/fold_{f}/shap-prio.variant_shap.{shap_type}.h5" for f in range(5) for shap_type in ['counts']] shaps1, shaps2 = load_shaps(shap_files) if not isinstance(shaps1, np.ndarray) or not isinstance(shaps2, np.ndarray): print(f" Model {model} has invalid SHAP arrays. Skipping SHAP plotting for this model") plotting_shap = False except Exception as e: print(f" Error loading SHAP data for model {model}: {e}. 
Skipping SHAP plotting for this model") plotting_shap = False for i, row in shap_prio_variants_df.iterrows(): # if i != len(shap_prio_variants_df)-1: # continue print(f" Variant {i+1}/{len(shap_prio_variants_df)}") variant_idx_list = predictions_variants_df.index[predictions_variants_df['variant_id'] == row['variant_id']].tolist() print(row['variant_id']) if variant_idx_list: variant_idx = variant_idx_list[0] # Get first occurrence (0-indexed) print('variant_idx', variant_idx) else: print("No match found") variant_id_internal = row['variant_id'] variant_name = row['variant_id'] # variant_id_external = row['id'] ref_profile = counts_profile_1[variant_idx, :] alt_profile = counts_profile_2[variant_idx, :] ref_length = len(row["ref"]) alt_length = len(row["alt"]) ref_label = row["ref"] alt_label = row["alt"] # Get the logfc from df variant_row = annots_df[annots_df['variant_id'] == row['variant_id']] if variant_row.empty: print(f"Warning: No matching variant found in annotations for variant_id {row['variant_id']}. 
Skipping.") continue logfc = variant_row['logfc.mean'].values[0] logfc_pval = variant_row['logfc.mean.pval'].values[0] output_path = f"/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/plot/{variant_name}/{model}.svg" # Create output directory if it doesn't exist os.makedirs(os.path.dirname(output_path), exist_ok=True) if plotting_shap and shaps1 is not None and shaps2 is not None: # Get shap_idx from their row num in variant_prio_variants_df on 'variant_id' # shap_idx = shap_prio_variants_df.index[shap_prio_variants_df['variant_id'] == variant_id_internal].tolist()[0] shap_idx = i print('shap_idx', shap_idx) # Plot with SHAP and logFC stats ref_shap = shaps1[shap_idx, :, :] alt_shap = shaps2[shap_idx, :, :] plot_variant(ref_profile, alt_profile, ref_shap, alt_shap, ref_length, alt_length, ref_label, alt_label, 300, f'{variant_name} in {model}', output_path, include_shap=True, logfc=logfc, logfc_pval=logfc_pval) else: # Plot without SHAP but with logFC stats plot_variant(ref_profile, alt_profile, None, None, ref_length, alt_length, ref_label, alt_label, 300, f'{variant_name} in {model}', output_path, include_shap=False, logfc=logfc, logfc_pval=logfc_pval) if os.environ['USER'] == 'airanman': # Get mitra URL mitra_cmd = f'bash /oak/stanford/groups/akundaje/airanman/projects/lab/lab-utils-shared/scripts/mitra-utils url "{output_path}"' result = subprocess.run(mitra_cmd, shell=True, executable="/bin/bash", capture_output=True, text=True) print(result.stdout) if result.stderr: print("Error:", result.stderr) htmler_cmd = f'cd /oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/plot/{variant_name} && bash /oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/htmler.sh .' 
                htmler_result = subprocess.run(htmler_cmd, shell=True, executable="/bin/bash", capture_output=True, text=True)
                print(htmler_result.stdout)
                if htmler_result.stderr:
                    print("Error:", htmler_result.stderr)

        # Add the model to the is_done_df
        new_row = pd.DataFrame([{'id': model}])
        # Use pd.concat to add the new row
        # NOTE(review): the file is re-read here, presumably so that progress
        # written by concurrent processes is not lost — but read/concat/write
        # is not atomic, so two processes finishing at once can still race.
        is_done_df = pd.read_csv("/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/plot.is_done.tsv", sep='\t')
        is_done_df = pd.concat([is_done_df, new_row], ignore_index=True)
        is_done_df.to_csv("/oak/stanford/groups/akundaje/airanman/nautilus-sync/chd-091625/plot.is_done.tsv", sep='\t', index=False)
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()

---

from tensorflow.keras.utils import get_custom_objects
from tensorflow.keras.models import load_model
import tensorflow as tf
from scipy.spatial.distance import jensenshannon
import pandas as pd
import numpy as np
from tqdm import tqdm
import sys
sys.path.append('..')
from generators.variant_generator import VariantGenerator
from generators.peak_generator import PeakGenerator
from utils import losses
from utils.io import get_variant_schema, get_peak_schema


def get_valid_peaks(chrom, pos, summit, input_len, chrom_sizes_dict):
    # True iff the input window centred on pos+summit fits inside the chromosome.
    valid_chrom = chrom in chrom_sizes_dict
    if valid_chrom:
        flank = input_len // 2
        lower_check = ((pos + summit) - flank > 0)
        upper_check = ((pos + summit) + flank <= chrom_sizes_dict[chrom])
        in_bounds = lower_check and upper_check
        valid_peak = valid_chrom and in_bounds
        return valid_peak
    else:
        return False


def get_valid_variants(chrom, pos, allele1, allele2, input_len, chrom_sizes_dict):
    # True iff the input window centred on pos fits inside the chromosome.
    valid_chrom = chrom in chrom_sizes_dict
    if valid_chrom:
        flank = input_len // 2
        lower_check = (pos - flank > 0)
        upper_check = (pos + flank <= chrom_sizes_dict[chrom])
        in_bounds = lower_check and upper_check
        # no_allele1_indel = (len(allele1) == 1)
        # no_allele2_indel = (len(allele2) == 1)
        # no_indel = no_allele1_indel and no_allele2_indel
        # valid_variant = valid_chrom and in_bounds and no_indel
        valid_variant = valid_chrom and in_bounds
        return valid_variant
    else:
        return False


def softmax(x, temp=1):
    # Row-wise softmax with optional temperature; mean-centering the rows
    # first improves numerical stability of the exponentials.
    norm_x = x - np.mean(x, axis=1, keepdims=True)
    return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True)


def load_model_wrapper(model_file):
    # read .h5 model
    # Registers the custom loss (and tf itself) so Keras can deserialize
    # models that reference them; compile=False since only inference is needed.
    custom_objects = {"multinomial_nll": losses.multinomial_nll, "tf": tf}
    get_custom_objects().update(custom_objects)
    model = load_model(model_file, compile=False)
    print("model loaded succesfully")
    return model


def fetch_peak_predictions(model, peaks, input_len, genome_fasta, batch_size,
                           debug_mode=False, lite=False, forward_only=False):
    # Predict counts/profiles for peak sequences; unless forward_only, also
    # predicts on reverse complements and returns the forward/revcomp average.
    peak_ids = []
    pred_counts = []
    pred_profiles = []
    if not forward_only:
        revcomp_counts = []
        revcomp_profiles = []
    # peak sequence generator
    peak_gen = PeakGenerator(peaks=peaks,
                             input_len=input_len,
                             genome_fasta=genome_fasta,
                             batch_size=batch_size,
                             debug_mode=debug_mode)
    for i in tqdm(range(len(peak_gen))):
        batch_peak_ids, seqs = peak_gen[i]
        # Reverse both the sequence axis and the base axis -> reverse complement
        # of a one-hot encoded batch.
        revcomp_seq = seqs[:, ::-1, ::-1]
        if lite:
            # "lite" models take [sequence, bias-profile, bias-count] inputs;
            # zeros are passed for the bias terms here.
            batch_preds = model.predict([seqs,
                                         np.zeros((len(seqs), model.output_shape[0][1])),
                                         np.zeros((len(seqs), ))],
                                        verbose=False)
            if not forward_only:
                revcomp_batch_preds = model.predict([revcomp_seq,
                                                     np.zeros((len(revcomp_seq), model.output_shape[0][1])),
                                                     np.zeros((len(revcomp_seq), ))],
                                                    verbose=False)
        else:
            batch_preds = model.predict(seqs, verbose=False)
            if not forward_only:
                revcomp_batch_preds = model.predict(revcomp_seq, verbose=False)
        # Output [1] is log counts; exponentiate to get counts.
        batch_preds[1] = np.array([batch_preds[1][i] for i in range(len(batch_preds[1]))])
        pred_counts.extend(np.exp(batch_preds[1]))
        pred_profiles.extend(np.array(batch_preds[0]))  # np.squeeze(softmax()) to get probability profile
        if not forward_only:
            revcomp_batch_preds[1] = np.array([revcomp_batch_preds[1][i] for i in range(len(revcomp_batch_preds[1]))])
            revcomp_counts.extend(np.exp(revcomp_batch_preds[1]))
            revcomp_profiles.extend(np.array(revcomp_batch_preds[0]))  # np.squeeze(softmax()) to get probability profile
        peak_ids.extend(batch_peak_ids)
    peak_ids = np.array(peak_ids)
    pred_counts = np.array(pred_counts)
    pred_profiles = np.array(pred_profiles)
    if not forward_only:
        revcomp_counts = np.array(revcomp_counts)
        revcomp_profiles = np.array(revcomp_profiles)
        average_counts = np.average([pred_counts, revcomp_counts], axis=0)
        # Flip the revcomp profile back before averaging with the forward one.
        average_profiles = np.average([pred_profiles, revcomp_profiles[:, ::-1]], axis=0)
        return peak_ids, average_counts, average_profiles
    else:
        return peak_ids, pred_counts, pred_profiles


def fetch_variant_predictions(model, variants_table, input_len, genome_fasta, batch_size,
                              debug_mode=False, lite=False, shuf=False, forward_only=False):
    # Predict counts/profiles for both alleles of each variant; unless
    # forward_only, also on reverse complements.
    # NOTE(review): this function continues beyond the end of this excerpt.
    variant_ids = []
    allele1_pred_counts = []
    allele2_pred_counts = []
    allele1_pred_profiles = []
    allele2_pred_profiles = []
    if not forward_only:
        revcomp_allele1_pred_counts = []
        revcomp_allele2_pred_counts = []
        revcomp_allele1_pred_profiles = []
        revcomp_allele2_pred_profiles = []
    # variant sequence generator
    var_gen = VariantGenerator(variants_table=variants_table,
                               input_len=input_len,
                               genome_fasta=genome_fasta,
                               batch_size=batch_size,
                               debug_mode=False,
                               shuf=shuf)
    for i in tqdm(range(len(var_gen))):
        batch_variant_ids, allele1_seqs, allele2_seqs = var_gen[i]
        # Reverse complement of the one-hot batches (flip both axes).
        revcomp_allele1_seqs = allele1_seqs[:, ::-1, ::-1]
        revcomp_allele2_seqs = allele2_seqs[:, ::-1, ::-1]
        if lite:
            # "lite" models take [sequence, bias-profile, bias-count] inputs.
            allele1_batch_preds = model.predict([allele1_seqs,
                                                 np.zeros((len(allele1_seqs), model.output_shape[0][1])),
                                                 np.zeros((len(allele1_seqs), ))],
                                                verbose=False)
            allele2_batch_preds = model.predict([allele2_seqs,
                                                 np.zeros((len(allele2_seqs), model.output_shape[0][1])),
                                                 np.zeros((len(allele2_seqs), ))],
                                                verbose=False)
            if not forward_only:
                revcomp_allele1_batch_preds = model.predict([revcomp_allele1_seqs,
                                                             np.zeros((len(revcomp_allele1_seqs), model.output_shape[0][1])),
                                                             np.zeros((len(revcomp_allele1_seqs), ))],
                                                            verbose=False)
                revcomp_allele2_batch_preds = model.predict([revcomp_allele2_seqs,
                                                             np.zeros((len(revcomp_allele2_seqs), model.output_shape[0][1])),
                                                             np.zeros((len(revcomp_allele2_seqs), ))],
                                                            verbose=False)
        else:
            allele1_batch_preds = model.predict(allele1_seqs, verbose=False)
            allele2_batch_preds = model.predict(allele2_seqs, verbose=False)
            if not forward_only:
                revcomp_allele1_batch_preds = model.predict(revcomp_allele1_seqs, verbose=False)
                revcomp_allele2_batch_preds = model.predict(revcomp_allele2_seqs, verbose=False)
        # Output [1] is log counts; exponentiate to get counts.
        allele1_batch_preds[1] = np.array([allele1_batch_preds[1][i] for i in range(len(allele1_batch_preds[1]))])
        allele2_batch_preds[1] = np.array([allele2_batch_preds[1][i] for i in range(len(allele2_batch_preds[1]))])
        allele1_pred_counts.extend(np.exp(allele1_batch_preds[1]))
        allele2_pred_counts.extend(np.exp(allele2_batch_preds[1]))
        allele1_pred_profiles.extend(np.array(allele1_batch_preds[0]))  # np.squeeze(softmax()) to get probability profile
        allele2_pred_profiles.extend(np.array(allele2_batch_preds[0]))
        if not forward_only:
            revcomp_allele1_batch_preds[1] = np.array([revcomp_allele1_batch_preds[1][i] for i in range(len(revcomp_allele1_batch_preds[1]))])
            revcomp_allele2_batch_preds[1] = np.array([revcomp_allele2_batch_preds[1][i] for i in range(len(revcomp_allele2_batch_preds[1]))])
            revcomp_allele1_pred_counts.extend(np.exp(revcomp_allele1_batch_preds[1]))
            revcomp_allele2_pred_counts.extend(np.exp(revcomp_allele2_batch_preds[1]))
            revcomp_allele1_pred_profiles.extend(np.array(revcomp_allele1_batch_preds[0]))  # np.squeeze(softmax()) to get probability profile
            revcomp_allele2_pred_profiles.extend(np.array(revcomp_allele2_batch_preds[0]))
        variant_ids.extend(batch_variant_ids)
    variant_ids = np.array(variant_ids)
    allele1_pred_counts = np.array(allele1_pred_counts)
    allele2_pred_counts = np.array(allele2_pred_counts)
    allele1_pred_profiles = np.array(allele1_pred_profiles)
    allele2_pred_profiles = np.array(allele2_pred_profiles)
    if not forward_only:
        revcomp_allele1_pred_counts = np.array(revcomp_allele1_pred_counts)
        revcomp_allele2_pred_counts = np.array(revcomp_allele2_pred_counts)
revcomp_allele1_pred_profiles = np.array(revcomp_allele1_pred_profiles) revcomp_allele2_pred_profiles = np.array(revcomp_allele2_pred_profiles) average_allele1_pred_counts = np.average([allele1_pred_counts,revcomp_allele1_pred_counts],axis=0) average_allele1_pred_profiles = np.average([allele1_pred_profiles,revcomp_allele1_pred_profiles[:,::-1]],axis=0) average_allele2_pred_counts = np.average([allele2_pred_counts,revcomp_allele2_pred_counts],axis=0) average_allele2_pred_profiles = np.average([allele2_pred_profiles,revcomp_allele2_pred_profiles[:,::-1]],axis=0) return variant_ids, average_allele1_pred_counts, average_allele2_pred_counts, \ average_allele1_pred_profiles, average_allele2_pred_profiles else: return variant_ids, allele1_pred_counts, allele2_pred_counts, \ allele1_pred_profiles, allele2_pred_profiles def get_variant_scores_with_peaks(allele1_pred_counts, allele2_pred_counts, allele1_pred_profiles, allele2_pred_profiles, pred_counts): # logfc = np.log2(allele2_pred_counts / allele1_pred_counts) # jsd = np.array([jensenshannon(x,y,base=2.0) for x,y in zip(allele2_pred_profiles, allele1_pred_profiles)]) logfc, jsd = get_variant_scores(allele1_pred_counts, allele2_pred_counts, allele1_pred_profiles, allele2_pred_profiles) allele1_quantile = np.array([np.max([np.mean(pred_counts < x), (1/len(pred_counts))]) for x in allele1_pred_counts]) allele2_quantile = np.array([np.max([np.mean(pred_counts < x), (1/len(pred_counts))]) for x in allele2_pred_counts]) return logfc, jsd, allele1_quantile, allele2_quantile def get_variant_scores(allele1_pred_counts, allele2_pred_counts, allele1_pred_profiles, allele2_pred_profiles): print('allele1_pred_counts shape:', allele1_pred_counts.shape) print('allele2_pred_counts shape:', allele2_pred_counts.shape) print('allele1_pred_profiles shape:', allele1_pred_profiles.shape) print('allele2_pred_profiles shape:', allele2_pred_profiles.shape) logfc = np.squeeze(np.log2(allele2_pred_counts / allele1_pred_counts)) jsd = 
np.squeeze([jensenshannon(x, y, base=2.0) for x,y in zip(softmax(allele2_pred_profiles), softmax(allele1_pred_profiles))]) print('logfc shape:', logfc.shape) print('jsd shape:', jsd.shape) return logfc, jsd def adjust_indel_jsd(variants_table,allele1_pred_profiles,allele2_pred_profiles,original_jsd): allele1_pred_profiles = softmax(allele1_pred_profiles) allele2_pred_profiles = softmax(allele2_pred_profiles) indel_idx = [] for i, row in variants_table.iterrows(): allele1, allele2 = row[['allele1','allele2']] if allele1 == "-": allele1 = "" if allele2 == "-": allele2 = "" if len(allele1) != len(allele2): indel_idx += [i] adjusted_jsd = [] for i in indel_idx: row = variants_table.iloc[i] allele1, allele2 = row[['allele1','allele2']] if allele1 == "-": allele1 = "" if allele2 == "-": allele2 = "" allele1_length = len(allele1) allele2_length = len(allele2) allele1_p = allele1_pred_profiles[i] allele2_p = allele2_pred_profiles[i] assert len(allele1_p) == len(allele2_p) assert allele1_length != allele2_length flank_size = len(allele1_p)//2 allele1_left_flank = allele1_p[:flank_size] allele2_left_flank = allele2_p[:flank_size] if allele1_length > allele2_length: allele1_right_flank = np.concatenate([allele1_p[flank_size:flank_size+allele2_length],allele1_p[flank_size+allele1_length:]]) allele2_right_flank = allele2_p[flank_size:allele2_length-allele1_length] else: allele1_right_flank = allele1_p[flank_size:allele1_length-allele2_length] allele2_right_flank = np.concatenate([allele2_p[flank_size:flank_size+allele1_length], allele2_p[flank_size+allele2_length:]]) adjusted_allele1_p = np.concatenate([allele1_left_flank,allele1_right_flank]) adjusted_allele2_p = np.concatenate([allele2_left_flank,allele2_right_flank]) adjusted_allele1_p = adjusted_allele1_p/np.sum(adjusted_allele1_p) adjusted_allele2_p = adjusted_allele2_p/np.sum(adjusted_allele2_p) assert len(adjusted_allele1_p) == len(adjusted_allele2_p) adjusted_j = 
jensenshannon(adjusted_allele1_p,adjusted_allele2_p,base=2.0) adjusted_jsd += [adjusted_j] adjusted_jsd_list = original_jsd.copy() if len(indel_idx) > 0: for i in range(len(indel_idx)): idx = indel_idx[i] adjusted_jsd_list[idx] = adjusted_jsd[i] return indel_idx, adjusted_jsd_list def create_shuffle_table(variants_table, random_seed=None, total_shuf=None, num_shuf=None): if total_shuf != None: if len(variants_table) > total_shuf: shuf_variants_table = variants_table.sample(total_shuf, random_state=random_seed, ignore_index=True, replace=False) else: shuf_variants_table = variants_table.sample(total_shuf, random_state=random_seed, ignore_index=True, replace=True) shuf_variants_table['random_seed'] = np.random.permutation(len(shuf_variants_table)) else: if num_shuf != None: total_shuf = len(variants_table) * num_shuf shuf_variants_table = variants_table.sample(total_shuf, random_state=random_seed, ignore_index=True, replace=True) shuf_variants_table['random_seed'] = np.random.permutation(len(shuf_variants_table)) else: ## empty dataframe shuf_variants_table = pd.DataFrame() return shuf_variants_table def get_pvals(obs, bg, tail): sorted_bg = np.sort(bg) if tail == 'right' or tail == 'both': rank_right = len(sorted_bg) - np.searchsorted(sorted_bg, obs, side='left') pval_right = (rank_right + 1) / (len(sorted_bg) + 1) if tail == 'right': return pval_right if tail == 'left' or tail == 'both': rank_left = np.searchsorted(sorted_bg, obs, side='right') pval_left = (rank_left + 1) / (len(sorted_bg) + 1) if tail == 'left': return pval_left assert tail == 'both' min_pval = np.minimum(pval_left, pval_right) pval_both = min_pval * 2 return pval_both