# Filepaths and Hard-coded Defaults
proj_root = "/home/users/kcochran/oak/kcochran/procap_models/"
sequence_path = proj_root + "genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
chrom_sizes = proj_root + "genomes/hg38.chrom.sizes.withrRNA"
in_window = 2114
out_window = 1000
# stuff to get from config file
with open("2022-07-19_13-55-49_run1_modisco_config_K562_counts.txt") as config_f:
config_dict = {line.split()[0] : line.strip().split()[1] for line in config_f}
modisco_out_path = config_dict["modisco_out_path"]
scoring_type = config_dict["scoring_type"]
score_center_size = int(config_dict["score_center_size"])
profile_display_center_size = int(config_dict["profile_display_center_size"])
train_val_type = config_dict["train_val_type"]
# digest what's in config file
assay_type, model_type, cell, accession, modisco_dir_base = modisco_out_path.split("/")[-6:-1]
ts_part1, ts_part2, run_str, _ = modisco_dir_base.split("_")
timestamp = ts_part1 + "_" + ts_part2
run = int(run_str.replace("run", ""))
print(modisco_out_path)
print("cell_type:", cell, accession)
print("timestamp:", timestamp)
print("run:", run)
print("scoring_type:", scoring_type)
print("score_center_size:", score_center_size)
print("profile_display_center_size:", profile_display_center_size)
/home/users/kcochran/oak/kcochran/procap_models/modisco_out/procap_bias/bpnetlite_basic_v2/K562/ENCSR261KBX/2022-07-19_13-55-49_run1_modisco/ cell_type: K562 ENCSR261KBX timestamp: 2022-07-19_13-55-49 run: 1 scoring_type: counts score_center_size: 1000 profile_display_center_size: 400
data_dir = proj_root + "/data/procap/processed/" + cell + "/" + accession + "/"
plus_bw_path = data_dir + "final.5prime.pos.bigWig"
minus_bw_path = data_dir + "final.5prime.neg.bigWig"
val_peak_path = data_dir + "peaks_uni_and_bi_" + train_val_type + ".bed.gz"
val_save_dir = proj_root + "model_out/" + assay_type + "/" + model_type + "/" + cell + "/" + accession + "/"
val_save_path = val_save_dir + timestamp + "_run" + str(run) + "_" + train_val_type
attr_save_path = val_save_dir.replace("model_out", "deepshap_out") + timestamp + "_run" + str(run) + "_deepshap"
if not modisco_out_path.endswith("/"):
modisco_out_path = modisco_out_path + "/"
# task-specific filepaths
import os
assert scoring_type in ["profile", "counts"], scoring_type
if scoring_type == "profile":
scores_path = attr_save_path + "_prof.npy"
onehot_scores_path = attr_save_path + "_prof_onehot.npy"
modisco_obj_path = modisco_out_path + "results_allChroms_prof_slice" + str(score_center_size) + ".hdf5"
seqlet_path = modisco_out_path + "seqlets_prof.txt"
else:
scores_path = attr_save_path + "_count.npy"
onehot_scores_path = attr_save_path + "_count_onehot.npy"
modisco_obj_path = modisco_out_path + "results_allChroms_count_slice" + str(score_center_size) + ".hdf5"
seqlet_path = modisco_out_path + "seqlets_count.txt"
assert(os.path.exists(scores_path)), scores_path
assert(os.path.exists(onehot_scores_path)), onehot_scores_path
# Imports, Plotting Defaults
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 22,
"axes.labelsize": 20,
"legend.fontsize": 18,
"xtick.labelsize": 16,
"ytick.labelsize": 16,
"font.weight": "bold"
}
plt.rcParams.update(plot_params)
from IPython.display import display
import tqdm
tqdm.tqdm_notebook()
import numpy as np
from view_modisco_results_utils import *
from tomtom_utils import *
/home/users/kcochran/miniconda3/envs/procap/lib/python3.7/site-packages/ipykernel_launcher.py:19: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
# Load in True Profiles and Sequences
import sys
sys.path.append('../1_train_models')
from data_loading import extract_peaks
one_hot_seqs, true_profs = extract_peaks(sequence_path,
plus_bw_path, minus_bw_path, val_peak_path, in_window, out_window,
max_jitter=0, verbose=True)
one_hot_seqs = one_hot_seqs.swapaxes(1,2)
one_hot_seqs = one_hot_seqs[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
Reading FASTA: 100%|██████████| 24/24 [00:37<00:00, 1.56s/it] Loading Peaks: 3834it [00:03, 1028.31it/s]
# Load in Coordinates of Examples
coords = load_coords(val_peak_path, in_window)
# Import SHAP scores, predicted profiles
hyp_scores = np.load(scores_path).swapaxes(1,2)
hyp_scores = hyp_scores[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
pred_profs = np.exp(np.load(val_save_path + ".profs.npy"))
# Load modisco results object
tfm_obj = import_tfmodisco_results(modisco_obj_path, hyp_scores, one_hot_seqs)
motif_pfms, motif_hcwms, motif_cwms, \
motif_pfms_short, num_seqlets, \
motif_seqlets, num_metaclusters = plot_all_metaclusters(tfm_obj, one_hot_seqs, hyp_scores,
true_profs, pred_profs, coords,
in_window, out_window,
score_center_size,
profile_display_center_size)
3618 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
3128 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
1821 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
1053 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
609 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
582 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
543 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
500 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
253 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
85 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
83 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
83 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
51 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
47 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
45 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
44 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
30 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
26 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
24 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
21 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
run_and_plot_tomtom(modisco_out_path, motif_pfms, motif_hcwms, motif_pfms_short, num_metaclusters)
Motif ID | q-val | PWM |
---|---|---|
MA0500.2 | 0.00877176 | |
MA1721.1 | 0.0150378 | |
MA2022.1 | 0.0612133 | |
MA1631.1 | 0.101773 | |
MA1257.1 | 0.147447 |
Motif ID | q-val | PWM |
---|---|---|
MA1631.1 | 0.140583 | |
MA1599.1 | 0.140583 | |
MA0162.4 | 0.2534 | |
MA0499.2 | 0.2534 | |
MA0146.2 | 0.2534 |
Motif ID | q-val | PWM |
---|---|---|
MA0506.2 | 0.107972 | |
MA1615.1 | 0.107972 | |
MA1712.1 | 0.303694 | |
MA1599.1 | 0.352439 | |
MA0162.4 | 0.418173 |
Motif ID | q-val | PWM |
---|---|---|
MA1631.1 | 0.0337125 | |
MA0830.2 | 0.120528 | |
MA1513.1 | 0.120528 | |
MA0146.2 | 0.139673 | |
MA1599.1 | 0.139673 |
Motif ID | q-val | PWM |
---|---|---|
MA0146.2 | 0.0307679 | |
MA1513.1 | 0.0307679 | |
MA1893.1 | 0.0307679 | |
MA0162.4 | 0.0307679 | |
MA1892.1 | 0.0307679 |
Motif ID | q-val | PWM |
---|---|---|
MA1973.1 | 0.0819396 | |
MA0597.2 | 0.168112 | |
MA1715.1 | 0.168112 | |
MA0599.1 | 0.168112 | |
MA1615.1 | 0.168112 |
No TOMTOM matches passing threshold
Motif ID | q-val | PWM |
---|---|---|
MA1513.1 | 0.0327563 | |
MA1615.1 | 0.0327563 | |
MA0146.2 | 0.0327563 | |
MA1880.1 | 0.0399818 | |
MA0599.1 | 0.0876468 |
No TOMTOM matches passing threshold
No TOMTOM matches passing threshold
Motif ID | q-val | PWM |
---|---|---|
MA1513.1 | 0.000283519 | |
MA1892.1 | 0.000283519 | |
MA1890.1 | 0.000283519 | |
MA1893.1 | 0.000317495 | |
MA1959.1 | 0.00129206 |
Motif ID | q-val | PWM |
---|---|---|
MA1865.1 | 0.0438543 |
Motif ID | q-val | PWM |
---|---|---|
MA1403.1 | 3.172e-06 | |
MA1404.1 | 4.55076e-05 | |
MA1402.1 | 0.000262554 | |
MA1723.1 | 0.0012729 | |
MA0149.1 | 0.0012844 |
Motif ID | q-val | PWM |
---|---|---|
MA1267.1 | 2.16072e-09 | |
MA1268.1 | 1.3022e-08 | |
MA1281.1 | 1.01312e-06 | |
MA1274.1 | 1.01312e-06 | |
MA1871.1 | 4.07211e-06 |
Motif ID | q-val | PWM |
---|---|---|
MA1403.1 | 4.68759e-19 | |
MA1404.1 | 4.99683e-16 | |
MA1402.1 | 9.12575e-13 | |
MA1416.1 | 1.19911e-08 | |
MA0543.1 | 0.00090038 |
Motif ID | q-val | PWM |
---|---|---|
MA1890.1 | 8.87058e-07 | |
MA1892.1 | 9.1291e-05 | |
MA1893.1 | 9.1291e-05 | |
MA1653.1 | 0.00129368 | |
MA0073.1 | 0.00212696 |
Motif ID | q-val | PWM |
---|---|---|
MA1890.1 | 1.11067e-05 | |
MA1653.1 | 0.000138182 | |
MA1630.2 | 0.000177783 | |
MA1892.1 | 0.000375965 | |
MA1893.1 | 0.000478261 |
Motif ID | q-val | PWM |
---|---|---|
MA1267.1 | 1.82593e-06 | |
MA1403.1 | 1.41929e-05 | |
MA1268.1 | 3.03288e-05 | |
MA1404.1 | 0.000315981 | |
MA1281.1 | 0.000417519 |
Motif ID | q-val | PWM |
---|---|---|
MA1890.1 | 1.14652e-07 | |
MA1893.1 | 4.91599e-05 | |
MA1892.1 | 5.85483e-05 | |
MA1653.1 | 0.000251517 | |
MA0073.1 | 0.000562923 |
Motif ID | q-val | PWM |
---|---|---|
MA1864.1 | 0.00707574 | |
MA1860.1 | 0.0928782 | |
MA0015.1 | 0.0928782 | |
MA0379.1 | 0.0928782 | |
MA1861.1 | 0.094724 |