In [1]:
# Filepaths and Hard-coded Defaults

proj_root = "/home/users/kcochran/oak/kcochran/procap_models/"
sequence_path = proj_root + "genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
chrom_sizes = proj_root + "genomes/hg38.chrom.sizes.withrRNA"

in_window = 2114
out_window = 1000
In [2]:
# stuff to get from config file

with open("2022-07-26_15-18-44_run1_modisco_config_K562_counts.txt") as config_f:
    config_dict = {line.split()[0] : line.strip().split()[1] for line in config_f}

modisco_out_path = config_dict["modisco_out_path"]
scoring_type = config_dict["scoring_type"]
score_center_size = int(config_dict["score_center_size"])
profile_display_center_size = int(config_dict["profile_display_center_size"])
train_val_type = config_dict["train_val_type"]

# digest what's in config file

if not modisco_out_path.endswith("/"):
    modisco_out_path = modisco_out_path + "/"

assay_type, model_type, cell, accession, modisco_dir_base = modisco_out_path.split("/")[-6:-1]
ts_part1, ts_part2, run_str, _ = modisco_dir_base.split("_")
timestamp = ts_part1 + "_" + ts_part2
run = int(run_str.replace("run", ""))
In [3]:
print(modisco_out_path)
print("cell_type:", cell, accession)
print("timestamp:", timestamp)
print("run:", run)
print("scoring_type:", scoring_type)
print("score_center_size:", score_center_size)
print("profile_display_center_size:", profile_display_center_size)
/home/users/kcochran/oak/kcochran/procap_models/modisco_out/procap/bpnetlite_basic_v2_umap/K562/ENCSR261KBX/2022-07-26_15-18-44_run1_modisco/
cell_type: K562 ENCSR261KBX
timestamp: 2022-07-26_15-18-44
run: 1
scoring_type: counts
score_center_size: 1000
profile_display_center_size: 400
In [4]:
data_dir = proj_root + "/data/procap/processed/" + cell + "/" + accession + "/"
plus_bw_path = data_dir + "final.5prime.pos.bigWig"
minus_bw_path = data_dir + "final.5prime.neg.bigWig"
val_peak_path = data_dir + "peaks_uni_and_bi_" + train_val_type + ".bed.gz"

val_save_dir = proj_root + "model_out/" + assay_type + "/" + model_type + "/" + cell + "/" + accession + "/"
val_save_path = val_save_dir + timestamp + "_run" + str(run) + "_" + train_val_type

attr_save_path = val_save_dir.replace("model_out", "deepshap_out") + timestamp + "_run" + str(run) + "_deepshap"
In [5]:
# task-specific filepaths

import os

assert scoring_type in ["profile", "counts"], scoring_type

if scoring_type == "profile":
    score_type_short = "prof"
else:
    score_type_short = "count"

scores_path = attr_save_path + "_" + score_type_short + ".npy"
onehot_scores_path = attr_save_path + "_" + score_type_short + "_onehot.npy"
modisco_obj_path = modisco_out_path + "results_allChroms_" + score_type_short + "_slice" + str(score_center_size) + ".hdf5"
seqlet_path = modisco_out_path + "seqlets_" + score_type_short + ".txt"
    
tomtom_dir = modisco_out_path + "tomtom_" + score_type_short
    
assert(os.path.exists(scores_path)), scores_path
assert(os.path.exists(onehot_scores_path)), onehot_scores_path
In [6]:
# Imports, Plotting Defaults

import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager

plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

from IPython.display import display
import tqdm
tqdm.tqdm_notebook()

import numpy as np

from view_modisco_results_utils import *
from tomtom_utils import *  
/home/users/kcochran/miniconda3/envs/procap/lib/python3.7/site-packages/ipykernel_launcher.py:19: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
In [7]:
# Load in True Profiles and Sequences

import sys
sys.path.append('../1_train_models')

from data_loading import extract_peaks

one_hot_seqs, true_profs = extract_peaks(sequence_path, 
    plus_bw_path, minus_bw_path, val_peak_path, in_window, out_window,
    max_jitter=0, verbose=True)

one_hot_seqs = one_hot_seqs.swapaxes(1,2)
one_hot_seqs = one_hot_seqs[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
Reading FASTA: 100%|██████████| 24/24 [00:09<00:00,  2.43it/s]
Loading Peaks: 27000it [00:17, 1577.42it/s]
In [8]:
# Load in Coordinates of Examples
    
coords = load_coords(val_peak_path, in_window)
In [9]:
# Import SHAP scores, predicted profiles

hyp_scores = np.load(scores_path).swapaxes(1,2)
hyp_scores = hyp_scores[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
pred_profs = np.exp(np.load(val_save_path + ".profs.npy"))
In [10]:
# Load modisco results object
    
tfm_obj = import_tfmodisco_results(modisco_obj_path, hyp_scores, one_hot_seqs)
In [11]:
motif_pfms, motif_hcwms, motif_cwms, \
    motif_pfms_short, num_seqlets, \
    motif_seqlets, num_metaclusters = plot_all_metaclusters(tfm_obj, one_hot_seqs, hyp_scores,
                                                            true_profs, pred_profs, coords,
                                                            in_window, out_window,
                                                            score_center_size,
                                                            profile_display_center_size)

Metacluster 1/2

Pattern 1/24

9434 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/24

7826 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/24

6364 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/24

5823 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/24

3199 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/24

1645 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/24

1311 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/24

1282 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/24

1063 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/24

986 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/24

904 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/24

888 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/24

825 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/24

810 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 15/24

434 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 16/24

376 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 17/24

154 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 18/24

77 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 19/24

51 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 20/24

45 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 21/24

38 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 22/24

36 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 23/24

30 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 24/24

27 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/14

245 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/14

217 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/14

207 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/14

190 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/14

179 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/14

167 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/14

148 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/14

140 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/14

114 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/14

68 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/14

62 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/14

39 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/14

39 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/14

22 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)
In [12]:
run_and_plot_tomtom(modisco_out_path, motif_pfms, motif_hcwms, motif_pfms_short, num_metaclusters, tomtom_dir)

Metacluster 1/2

Motif 1/24

Motif IDq-valPWM
MA0764.30.000162441
MA1854.10.000162441
MA0916.10.000162441
MA0759.20.000162441
MA0156.30.000294336

Motif 2/24

Motif IDq-valPWM
MA1513.17.05562e-05
MA0742.20.000241713
MA0741.10.000266099
MA0685.20.000266099
MA0079.50.000266099

Motif 3/24

Motif IDq-valPWM
MA0314.26.63569e-05
MA0060.36.99574e-05
MA1644.16.99574e-05
MA0502.20.00822663
MA0316.10.0200282

Motif 4/24

Motif IDq-valPWM
MA0506.20.00226446
MA1826.10.280905
MA0963.10.483915
MA1411.10.483915
MA0566.10.483915

Motif 5/24

Motif IDq-valPWM
MA0967.10.00539641
MA1899.10.0204083
MA1127.10.0226253
MA1129.10.0226253
MA1131.10.0226253

Motif 6/24

Motif IDq-valPWM
MA1573.22.18943e-09
MA0088.20.0188641
MA1716.10.0222196
MA1625.10.258082
MA0519.10.424395

Motif 7/24

Motif IDq-valPWM
MA0748.20.000291442
MA0998.10.000291442
MA0975.10.000682161
MA0997.10.00121529
MA1004.10.00124277

Motif 8/24

Motif IDq-valPWM
MA0062.31.22126e-06
MA0598.33.51474e-06
MA1992.13.51474e-06
MA0473.36.69464e-06
MA0474.36.69464e-06

Motif 9/24

Motif IDq-valPWM
MA0139.19.40082e-08
MA1929.19.03979e-06
MA1930.14.08527e-05
MA1102.24.08527e-05
MA0531.14.08527e-05

Motif 10/24

Motif IDq-valPWM
MA0501.10.000563504
MA0591.10.000563504
MA0150.20.000661069
MA0089.20.000924499
MA1448.10.00102698

Motif 11/24

Motif IDq-valPWM
MA1818.10.00155592
MA1832.10.00155592
MA1819.10.00155592
MA1821.10.0017066
MA1004.10.00309407

Motif 12/24

Motif IDq-valPWM
MA0527.10.000205718

Motif 13/24

Motif IDq-valPWM
MA0361.10.202231
MA1713.10.395047

Motif 14/24

Motif IDq-valPWM
MA1053.12.36025e-05
MA1051.10.000610616
MA0531.10.00132933
MA1102.20.00188639
MA0975.10.00337625

Motif 15/24

Motif IDq-valPWM
MA0531.16.49531e-05
MA0139.10.000503732
MA1929.10.0014458
MA1930.10.00159874
MA1102.20.00177191

Motif 16/24

Motif IDq-valPWM
MA0429.10.00440271
MA0361.10.017081
MA0544.10.246562

Motif 17/24

Motif IDq-valPWM
MA1833.10.103082
MA1820.10.103082
MA1257.10.103082
MA1053.10.103082
MA1228.10.103082

Motif 18/24

Motif IDq-valPWM
MA0640.20.28707
MA0060.30.28707
MA1950.10.28707
MA1946.10.28707
MA0750.20.28707

Motif 19/24

Motif IDq-valPWM
MA1651.10.0302719
MA1650.10.0302719
MA1460.10.0800821
MA1832.10.0800821
MA1513.10.0800821

Motif 20/24

Motif IDq-valPWM
MA0986.10.00213016
MA1475.10.00864253
MA1007.10.00932555
MA1023.10.00932555
MA1951.10.0488629

Motif 21/24

Motif IDq-valPWM
MA1625.10.200861
MA0532.10.377664
MA0137.30.489665
MA0519.10.489665
MA0107.10.489665

Motif 22/24

Motif IDq-valPWM
MA0443.10.489425
MA0197.20.489425
MA2002.10.489425
MA0740.20.489425
MA1596.10.489425

Motif 23/24

Motif IDq-valPWM
MA0527.10.0875108

Motif 24/24

Motif IDq-valPWM
MA1066.10.180416
MA1050.10.180416
MA1097.10.295633
MA1095.10.295633
MA1098.10.295633

Metacluster 2/2

Motif 1/14

Motif IDq-valPWM
MA1820.10.000823024
MA1819.10.0012159
MA1833.10.00151157
MA1513.10.00174568
MA0146.20.00255733

Motif 2/14

Motif IDq-valPWM
MA1890.14.82112e-05
MA1893.10.000145121
MA1892.10.000145121
MA1833.10.000295031
MA1961.10.000541838

Motif 3/14

Motif IDq-valPWM
MA1890.16.94638e-07
MA1892.17.96622e-06
MA1893.11.78525e-05
MA1961.13.57157e-05
MA1713.10.000161049

Motif 4/14

Motif IDq-valPWM
MA1268.10.0212701
MA1274.10.0212701
MA1823.10.0212701
MA1277.10.0212701
MA1279.10.0212701

Motif 5/14

Motif IDq-valPWM
MA0538.10.000257144
MA1107.20.311119
MA1865.10.311119

Motif 6/14

Motif IDq-valPWM
MA1281.16.88853e-07
MA1267.16.88853e-07
MA1274.16.88853e-07
MA1268.11.67476e-06
MA1871.12.75635e-06

Motif 7/14

Motif IDq-valPWM
MA1890.11.18747e-05
MA1892.10.000145535
MA1893.10.000165245
MA1880.10.000221964
MA1833.10.000378314

Motif 8/14

Motif IDq-valPWM
MA1961.10.000168842
MA1890.10.000168842
MA1650.10.000168842
MA1893.10.000229531
MA1892.10.000259187

Motif 9/14

No TOMTOM matches passing threshold

Motif 10/14

Motif IDq-valPWM
MA1890.12.9359e-06
MA1513.11.28468e-05
MA1961.16.7422e-05
MA1893.16.7422e-05
MA1892.16.7422e-05

Motif 11/14

Motif IDq-valPWM
MA1890.15.88833e-07
MA1892.12.57685e-05
MA1961.12.57685e-05
MA1893.12.57685e-05
MA1513.10.000191278

Motif 12/14

Motif IDq-valPWM
MA0502.20.00113474
MA0316.10.0113704
MA0314.20.0113704
MA0060.30.0275581
MA1644.10.0275581

Motif 13/14

Motif IDq-valPWM
MA1115.10.465146
MA0963.10.465146
MA0957.10.465146
MA0507.20.465146
MA0322.10.465146

Motif 14/14

Motif IDq-valPWM
MA1403.12.21772e-15
MA1404.16.64357e-13
MA1402.11.01881e-10
MA1416.16.38357e-07
MA0205.20.000149371