In [1]:
# Filepaths and Hard-coded Defaults

proj_root = "/home/users/kcochran/oak/kcochran/procap_models/"
sequence_path = proj_root + "genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
chrom_sizes = proj_root + "genomes/hg38.chrom.sizes.withrRNA"

in_window = 2114
out_window = 1000
In [2]:
# stuff to get from config file

with open("2022-07-24_12-31-29_run1_modisco_config_K562_profile.txt") as config_f:
    config_dict = {line.split()[0] : line.strip().split()[1] for line in config_f}

modisco_out_path = config_dict["modisco_out_path"]
scoring_type = config_dict["scoring_type"]
score_center_size = int(config_dict["score_center_size"])
profile_display_center_size = int(config_dict["profile_display_center_size"])
train_val_type = config_dict["train_val_type"]

# digest what's in config file

assay_type, model_type, cell, accession, modisco_dir_base = modisco_out_path.split("/")[-6:-1]
ts_part1, ts_part2, run_str, _ = modisco_dir_base.split("_")
timestamp = ts_part1 + "_" + ts_part2
run = int(run_str.replace("run", ""))
In [3]:
print(modisco_out_path)
print("cell_type:", cell, accession)
print("timestamp:", timestamp)
print("run:", run)
print("scoring_type:", scoring_type)
print("score_center_size:", score_center_size)
print("profile_display_center_size:", profile_display_center_size)
/home/users/kcochran/oak/kcochran/procap_models/modisco_out/procap/bpnetlite_basic_v2/K562/ENCSR261KBX/2022-07-24_12-31-29_run1_modisco/
cell_type: K562 ENCSR261KBX
timestamp: 2022-07-24_12-31-29
run: 1
scoring_type: profile
score_center_size: 1000
profile_display_center_size: 400
In [4]:
data_dir = proj_root + "/data/procap/processed/" + cell + "/" + accession + "/"
plus_bw_path = data_dir + "final.5prime.pos.bigWig"
minus_bw_path = data_dir + "final.5prime.neg.bigWig"
val_peak_path = data_dir + "peaks_uni_and_bi_" + train_val_type + ".bed.gz"

val_save_dir = proj_root + "model_out/" + assay_type + "/" + model_type + "/" + cell + "/" + accession + "/"
val_save_path = val_save_dir + timestamp + "_run" + str(run) + "_" + train_val_type

attr_save_path = val_save_dir.replace("model_out", "deepshap_out") + timestamp + "_run" + str(run) + "_deepshap"

if not modisco_out_path.endswith("/"):
    modisco_out_path = modisco_out_path + "/"
In [5]:
# task-specific filepaths

import os

assert scoring_type in ["profile", "counts"], scoring_type

if scoring_type == "profile":
    scores_path = attr_save_path + "_prof.npy"
    onehot_scores_path = attr_save_path + "_prof_onehot.npy"
    modisco_obj_path = modisco_out_path + "results_allChroms_prof_slice" + str(score_center_size) + ".hdf5"
    seqlet_path = modisco_out_path + "seqlets_prof.txt"
else:
    scores_path = attr_save_path + "_count.npy"
    onehot_scores_path = attr_save_path + "_count_onehot.npy"
    modisco_obj_path = modisco_out_path + "results_allChroms_count_slice" + str(score_center_size) + ".hdf5"
    seqlet_path = modisco_out_path + "seqlets_count.txt"
    
assert(os.path.exists(scores_path)), scores_path
assert(os.path.exists(onehot_scores_path)), onehot_scores_path
In [6]:
# Imports, Plotting Defaults

import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager

plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

from IPython.display import display
import tqdm
tqdm.tqdm_notebook()

import numpy as np

from view_modisco_results_utils import *
from tomtom_utils import *  
/home/users/kcochran/miniconda3/envs/procap/lib/python3.7/site-packages/ipykernel_launcher.py:19: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
In [7]:
# Load in True Profiles and Sequences

import sys
sys.path.append('../1_train_models')

from data_loading import extract_peaks

one_hot_seqs, true_profs = extract_peaks(sequence_path, 
    plus_bw_path, minus_bw_path, val_peak_path, in_window, out_window,
    max_jitter=0, verbose=True)

one_hot_seqs = one_hot_seqs.swapaxes(1,2)
one_hot_seqs = one_hot_seqs[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
Reading FASTA: 100%|██████████| 24/24 [00:10<00:00,  2.26it/s]
Loading Peaks: 27000it [00:23, 1131.39it/s]
In [8]:
# Load in Coordinates of Examples
    
coords = load_coords(val_peak_path, in_window)
In [9]:
# Import SHAP scores, predicted profiles

hyp_scores = np.load(scores_path).swapaxes(1,2)
hyp_scores = hyp_scores[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
pred_profs = np.exp(np.load(val_save_path + ".profs.npy"))
In [10]:
# Load modisco results object
    
tfm_obj = import_tfmodisco_results(modisco_obj_path, hyp_scores, one_hot_seqs)
In [11]:
motif_pfms, motif_hcwms, motif_cwms, \
    motif_pfms_short, num_seqlets, \
    motif_seqlets, num_metaclusters = plot_all_metaclusters(tfm_obj, one_hot_seqs, hyp_scores,
                                                            true_profs, pred_profs, coords,
                                                            in_window, out_window,
                                                            score_center_size,
                                                            profile_display_center_size)

Metacluster 1/2

Pattern 1/42

12329 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/42

10782 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/42

4928 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/42

4167 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/42

3325 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/42

3303 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/42

2705 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/42

1616 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/42

1606 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/42

1493 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/42

1345 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/42

1032 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/42

915 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/42

834 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 15/42

755 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 16/42

604 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 17/42

604 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 18/42

435 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 19/42

429 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 20/42

391 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 21/42

344 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 22/42

325 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 23/42

284 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 24/42

165 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 25/42

128 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 26/42

124 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 27/42

122 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 28/42

91 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 29/42

85 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 30/42

74 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 31/42

67 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 32/42

63 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 33/42

63 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 34/42

57 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 35/42

44 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 36/42

44 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 37/42

32 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 38/42

29 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 39/42

27 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 40/42

25 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 41/42

24 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 42/42

22 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/4

62 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/4

30 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/4

24 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/4

23 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)
In [12]:
run_and_plot_tomtom(modisco_out_path, motif_pfms, motif_hcwms, motif_pfms_short, num_metaclusters)

Metacluster 1/2

Motif 1/42

No TOMTOM matches passing threshold

Motif 2/42

Motif IDq-valPWM
MA0516.30.000114923
MA0742.20.000130081
MA0685.20.000167693
MA0079.50.000170741
MA1961.10.000170741

Motif 3/42

Motif IDq-valPWM
MA0314.26.49542e-06
MA0060.36.49542e-06
MA1644.11.60722e-05
MA0502.20.011712
MA0316.10.0127761

Motif 4/42

Motif IDq-valPWM
MA0076.29.77514e-07
MA0750.29.77514e-07
MA0764.33.94958e-05
MA0156.35.92437e-05
MA0763.10.000116853

Motif 5/42

Motif IDq-valPWM
MA1820.10.000977921
MA1817.10.000977921
MA1049.10.00110805
MA1818.10.00128521
MA1832.10.00128521

Motif 6/42

Motif IDq-valPWM
MA0506.26.87591e-07
MA1826.10.139178
MA0103.30.169361
MA1834.10.169361
MA0522.30.169361

Motif 7/42

Motif IDq-valPWM
MA0609.20.000109852
MA1145.10.000155738
MA1475.10.00629298
MA1899.10.00629298
MA0967.10.0092637

Motif 8/42

No TOMTOM matches passing threshold

Motif 9/42

No TOMTOM matches passing threshold

Motif 10/42

Motif IDq-valPWM
MA1257.12.25591e-09
MA1262.14.36941e-09
MA1833.14.36941e-09
MA1239.18.66923e-09
MA2022.18.79324e-09

Motif 11/42

Motif IDq-valPWM
MA0489.20.000518996
MA0591.10.000518996
MA0501.10.000518996
MA0150.20.000518996
MA1141.10.000518996

Motif 12/42

Motif IDq-valPWM
MA1573.22.85065e-09
MA0088.20.0243914
MA1716.10.0302586
MA1625.10.266622
MA0519.10.47702

Motif 13/42

Motif IDq-valPWM
MA0748.24.02353e-05
MA0975.14.9859e-05
MA0998.10.000349785
MA0997.10.00119263
MA1244.10.00215836

Motif 14/42

Motif IDq-valPWM
MA0139.11.73652e-07
MA1929.12.1535e-07
MA1930.12.67266e-06
MA1102.27.30355e-06
MA0531.18.67421e-05

Motif 15/42

Motif IDq-valPWM
MA1820.10.00016575
MA1819.10.000548911
MA1817.10.00155117
MA1833.10.00228667
MA1821.10.00228667

Motif 16/42

Motif IDq-valPWM
MA0080.60.217397
MA1268.10.217397
MA0452.20.217397
MA1279.10.217397
MA1823.10.217397

Motif 17/42

No TOMTOM matches passing threshold

Motif 18/42

Motif IDq-valPWM
MA0527.16.92661e-06

Motif 19/42

Motif IDq-valPWM
MA1833.10.000103603
MA1817.10.000103603
MA0535.10.000134933
MA2022.10.000138324
MA1818.10.000138324

Motif 20/42

Motif IDq-valPWM
MA2022.10.00446026
MA1817.10.00557395
MA1820.10.00557395
MA1228.10.00557395
MA1264.10.0146121

Motif 21/42

Motif IDq-valPWM
MA0599.10.000424675
MA0039.40.000424675
MA1513.10.000457268
MA1107.20.00125194
MA0493.20.00125194

Motif 22/42

Motif IDq-valPWM
MA1833.15.44003e-07
MA1257.12.36847e-06
MA1820.12.36847e-06
MA2022.12.83176e-06
MA1817.18.1102e-06

Motif 23/42

Motif IDq-valPWM
MA1596.10.0270208
MA0587.10.210511
MA1513.10.232968
MA1961.10.232968
MA1712.10.410875

Motif 24/42

Motif IDq-valPWM
MA1817.10.00652875
MA1833.10.00652875
MA1713.10.00652875
MA1832.10.00652875
MA1821.10.00652875

Motif 25/42

Motif IDq-valPWM
MA1880.18.92839e-05
MA0506.20.000457509
MA1833.10.00135399
MA1650.10.00586157
MA0375.10.00756002

Motif 26/42

Motif IDq-valPWM
MA1596.10.00207162
MA1817.10.00886397
MA1820.10.00886397
MA0131.20.0289714
MA2013.10.0332991

Motif 27/42

No TOMTOM matches passing threshold

Motif 28/42

Motif IDq-valPWM
MA0527.10.010958

Motif 29/42

Motif IDq-valPWM
MA1833.15.40527e-07
MA1820.15.40527e-07
MA1817.15.40527e-07
MA1819.16.19317e-07
MA1228.18.20316e-06

Motif 30/42

No TOMTOM matches passing threshold

Motif 31/42

Motif IDq-valPWM
MA0609.20.229455

Motif 32/42

Motif IDq-valPWM
MA1730.10.2509

Motif 33/42

Motif IDq-valPWM
MA0506.20.00169329
MA1880.10.00217315
MA1892.10.0125828
MA1826.10.0125828
MA1893.10.0125828

Motif 34/42

Motif IDq-valPWM
MA0299.10.434865

Motif 35/42

No TOMTOM matches passing threshold

Motif 36/42

Motif IDq-valPWM
MA1820.10.00178998
MA1819.10.00178998
MA1833.10.00178998
MA1880.10.00178998
MA1713.10.00178998

Motif 37/42

No TOMTOM matches passing threshold

Motif 38/42

Motif IDq-valPWM
MA1066.10.204762
MA1050.10.204762
MA1097.10.300788
MA1095.10.300788
MA1098.10.300788

Motif 39/42

Motif IDq-valPWM
MA0609.20.384514
MA0595.10.392174
MA0354.10.392174
MA0450.10.392174
MA1145.10.435865

Motif 40/42

No TOMTOM matches passing threshold

Motif 41/42

Motif IDq-valPWM
MA1464.10.125312
MA0526.40.125312
MA0831.30.129495
MA1816.10.129495
MA0399.10.129495

Motif 42/42

No TOMTOM matches passing threshold

Metacluster 2/2

Motif 1/4

Motif IDq-valPWM
MA1513.11.44586e-05
MA1893.12.81395e-05
MA1890.17.91672e-05
MA0685.28.88832e-05
MA0742.28.88832e-05

Motif 2/4

Motif IDq-valPWM
MA0506.20.00154604
MA1880.10.015799
MA1650.10.0214383
MA0522.30.0502543
MA1412.10.0524631

Motif 3/4

Motif IDq-valPWM
MA0060.38.62694e-05
MA1644.18.62694e-05
MA0314.20.00297962
MA0316.10.00488603
MA0502.20.0472531

Motif 4/4

Motif IDq-valPWM
MA0156.30.00013354
MA0763.10.000283267
MA0759.20.000283267
MA0598.30.000283267
MA0028.20.000300729