In [1]:
# Filepaths and Hard-coded Defaults

proj_root = "/home/users/kcochran/oak/kcochran/procap_models/"
sequence_path = proj_root + "genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
chrom_sizes = proj_root + "genomes/hg38.chrom.sizes.withrRNA"

in_window = 2114
out_window = 1000
In [2]:
# stuff to get from config file

with open("2022-07-19_13-55-49_run1_modisco_config_K562_profile.txt") as config_f:
    config_dict = {line.split()[0] : line.strip().split()[1] for line in config_f}

modisco_out_path = config_dict["modisco_out_path"]
scoring_type = config_dict["scoring_type"]
score_center_size = int(config_dict["score_center_size"])
profile_display_center_size = int(config_dict["profile_display_center_size"])
train_val_type = config_dict["train_val_type"]

# digest what's in config file

assay_type, model_type, cell, accession, modisco_dir_base = modisco_out_path.split("/")[-6:-1]
ts_part1, ts_part2, run_str, _ = modisco_dir_base.split("_")
timestamp = ts_part1 + "_" + ts_part2
run = int(run_str.replace("run", ""))
In [3]:
print(modisco_out_path)
print("cell_type:", cell, accession)
print("timestamp:", timestamp)
print("run:", run)
print("scoring_type:", scoring_type)
print("score_center_size:", score_center_size)
print("profile_display_center_size:", profile_display_center_size)
/home/users/kcochran/oak/kcochran/procap_models/modisco_out/procap_bias/bpnetlite_basic_v2/K562/ENCSR261KBX/2022-07-19_13-55-49_run1_modisco/
cell_type: K562 ENCSR261KBX
timestamp: 2022-07-19_13-55-49
run: 1
scoring_type: profile
score_center_size: 1000
profile_display_center_size: 400
In [4]:
data_dir = proj_root + "/data/procap/processed/" + cell + "/" + accession + "/"
plus_bw_path = data_dir + "final.5prime.pos.bigWig"
minus_bw_path = data_dir + "final.5prime.neg.bigWig"
val_peak_path = data_dir + "peaks_uni_and_bi_" + train_val_type + ".bed.gz"

val_save_dir = proj_root + "model_out/" + assay_type + "/" + model_type + "/" + cell + "/" + accession + "/"
val_save_path = val_save_dir + timestamp + "_run" + str(run) + "_" + train_val_type

attr_save_path = val_save_dir.replace("model_out", "deepshap_out") + timestamp + "_run" + str(run) + "_deepshap"

if not modisco_out_path.endswith("/"):
    modisco_out_path = modisco_out_path + "/"
In [5]:
# task-specific filepaths

import os

assert scoring_type in ["profile", "counts"], scoring_type

if scoring_type == "profile":
    scores_path = attr_save_path + "_prof.npy"
    onehot_scores_path = attr_save_path + "_prof_onehot.npy"
    modisco_obj_path = modisco_out_path + "results_allChroms_prof_slice" + str(score_center_size) + ".hdf5"
    seqlet_path = modisco_out_path + "seqlets_prof.txt"
else:
    scores_path = attr_save_path + "_count.npy"
    onehot_scores_path = attr_save_path + "_count_onehot.npy"
    modisco_obj_path = modisco_out_path + "results_allChroms_count_slice" + str(score_center_size) + ".hdf5"
    seqlet_path = modisco_out_path + "seqlets_count.txt"
    
assert(os.path.exists(scores_path)), scores_path
assert(os.path.exists(onehot_scores_path)), onehot_scores_path
In [6]:
# Imports, Plotting Defaults

import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager

plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

from IPython.display import display
import tqdm
tqdm.tqdm_notebook()

import numpy as np

from view_modisco_results_utils import *
from tomtom_utils import *  
/home/users/kcochran/miniconda3/envs/procap/lib/python3.7/site-packages/ipykernel_launcher.py:19: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
In [7]:
# Load in True Profiles and Sequences

import sys
sys.path.append('../1_train_models')

from data_loading import extract_peaks

one_hot_seqs, true_profs = extract_peaks(sequence_path, 
    plus_bw_path, minus_bw_path, val_peak_path, in_window, out_window,
    max_jitter=0, verbose=True)

one_hot_seqs = one_hot_seqs.swapaxes(1,2)
one_hot_seqs = one_hot_seqs[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
Reading FASTA: 100%|██████████| 24/24 [00:19<00:00,  1.21it/s]
Loading Peaks: 3834it [00:04, 860.93it/s]
In [8]:
# Load in Coordinates of Examples
    
coords = load_coords(val_peak_path, in_window)
In [9]:
# Import SHAP scores, predicted profiles

hyp_scores = np.load(scores_path).swapaxes(1,2)
hyp_scores = hyp_scores[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
pred_profs = np.exp(np.load(val_save_path + ".profs.npy"))
In [10]:
# Load modisco results object
    
tfm_obj = import_tfmodisco_results(modisco_obj_path, hyp_scores, one_hot_seqs)
In [11]:
motif_pfms, motif_hcwms, motif_cwms, \
    motif_pfms_short, num_seqlets, \
    motif_seqlets, num_metaclusters = plot_all_metaclusters(tfm_obj, one_hot_seqs, hyp_scores,
                                                            true_profs, pred_profs, coords,
                                                            in_window, out_window,
                                                            score_center_size,
                                                            profile_display_center_size)

Metacluster 1/1

Pattern 1/27

901 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/27

599 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/27

513 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/27

340 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/27

339 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/27

328 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/27

318 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/27

317 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/27

306 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/27

300 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/27

294 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/27

262 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/27

256 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/27

223 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 15/27

222 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 16/27

219 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 17/27

218 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 18/27

196 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 19/27

167 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 20/27

155 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 21/27

120 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 22/27

116 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 23/27

97 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 24/27

66 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 25/27

60 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 26/27

34 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 27/27

27 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)
In [12]:
run_and_plot_tomtom(modisco_out_path, motif_pfms, motif_hcwms, motif_pfms_short, num_metaclusters)

Metacluster 1/1

Motif 1/27

Motif IDq-valPWM
MA1833.12.12985e-07
MA1832.17.15333e-07
MA1821.17.15333e-07
MA2022.12.22189e-06
MA1817.14.20632e-06

Motif 2/27

Motif IDq-valPWM
MA1273.10.187395
MA0452.20.397347
MA0657.10.397347
MA1199.10.397347
MA0984.10.428144

Motif 3/27

Motif IDq-valPWM
MA1597.10.0372614
MA0095.30.295904
MA1978.10.295904
MA1524.20.295904

Motif 4/27

Motif IDq-valPWM
MA1820.10.00110687
MA1832.10.00110687
MA1818.10.00110687
MA1821.10.00110687
MA1817.10.00202057

Motif 5/27

Motif IDq-valPWM
MA1257.11.09495e-08
MA1833.11.09495e-08
MA1262.11.4364e-08
MA2022.11.4364e-08
MA1239.11.4364e-08

Motif 6/27

Motif IDq-valPWM
MA1513.10.0873541
MA0079.50.0873541
MA1410.10.0873541
MA1961.10.0873541
MA0516.30.0873541

Motif 7/27

Motif IDq-valPWM
MA1880.10.00033407
MA1833.10.000451504
MA1817.10.000451504
MA1713.10.0011094
MA1820.10.00177586

Motif 8/27

No TOMTOM matches passing threshold

Motif 9/27

Motif IDq-valPWM
MA1268.10.00589477
MA1279.10.00932267
MA1125.10.00932267
MA1274.10.00932267
MA1278.10.0106531

Motif 10/27

No TOMTOM matches passing threshold

Motif 11/27

Motif IDq-valPWM
MA1893.16.24852e-05
MA1892.16.24852e-05
MA1513.18.63265e-05
MA1890.18.63265e-05
MA1961.10.000267134

Motif 12/27

Motif IDq-valPWM
MA1713.10.00074842
MA1650.10.0143772
MA0146.20.017512
MA1833.10.017512
MA1053.10.017512

Motif 13/27

Motif IDq-valPWM
MA1890.16.31498e-07
MA1892.12.88754e-06
MA1893.12.88754e-06
MA1513.12.88754e-06
MA1961.14.90549e-05

Motif 14/27

Motif IDq-valPWM
MA1880.13.40769e-05
MA1713.10.00614492
MA1817.10.00614492
MA0375.10.00614492
MA1820.10.0134169

Motif 15/27

Motif IDq-valPWM
MA1650.10.00257734
MA1833.10.00830702
MA1834.10.00830702
MA0375.10.00830702
MA0162.40.00830702

Motif 16/27

No TOMTOM matches passing threshold

Motif 17/27

No TOMTOM matches passing threshold

Motif 18/27

No TOMTOM matches passing threshold

Motif 19/27

Motif IDq-valPWM
MA1880.10.000220274
MA1817.10.000887042
MA1833.10.000887042
MA1820.10.00110508
MA1650.10.00115837

Motif 20/27

Motif IDq-valPWM
MA1833.17.56904e-07
MA1819.17.56904e-07
MA1832.12.40763e-06
MA2022.12.40763e-06
MA1821.13.07472e-06

Motif 21/27

Motif IDq-valPWM
MA1820.18.56718e-06
MA1833.18.56718e-06
MA1819.11.39441e-05
MA1817.16.01939e-05
MA2022.16.01939e-05

Motif 22/27

Motif IDq-valPWM
MA1961.10.0772313
MA1880.10.0772313
MA0597.20.0772313
MA1513.10.0772313
MA1982.10.0772313

Motif 23/27

Motif IDq-valPWM
MA1833.12.95511e-08
MA1820.19.9475e-07
MA1819.12.2087e-06
MA1817.13.75551e-06
MA2022.14.33283e-06

Motif 24/27

Motif IDq-valPWM
MA1713.10.0144964
MA1880.10.0254212
MA0374.10.0330874
MA1833.10.0330874
MA1051.10.0393872

Motif 25/27

Motif IDq-valPWM
MA1713.10.0100641
MA1893.10.0145171
MA1833.10.0145171
MA1880.10.0145171
MA1892.10.0145171

Motif 26/27

Motif IDq-valPWM
MA0506.20.0050068
MA1833.10.0824026
MA1266.10.0824026
MA1713.10.0824026
MA1712.10.0824026

Motif 27/27

Motif IDq-valPWM
MA2022.12.83956e-06
MA1820.12.83956e-06
MA1257.12.83956e-06
MA1228.12.83956e-06
MA1262.12.83956e-06