In [1]:
# Filepaths and Hard-coded Defaults

proj_root = "/home/users/kcochran/oak/kcochran/procap_models/"
sequence_path = proj_root + "genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
chrom_sizes = proj_root + "genomes/hg38.chrom.sizes.withrRNA"

in_window = 2114
out_window = 1000
In [2]:
# stuff to get from config file

with open("2022-07-26_15-18-44_run1_modisco_config_K562_profile.txt") as config_f:
    config_dict = {line.split()[0] : line.strip().split()[1] for line in config_f}

modisco_out_path = config_dict["modisco_out_path"]
scoring_type = config_dict["scoring_type"]
score_center_size = int(config_dict["score_center_size"])
profile_display_center_size = int(config_dict["profile_display_center_size"])
train_val_type = config_dict["train_val_type"]

# digest what's in config file

if not modisco_out_path.endswith("/"):
    modisco_out_path = modisco_out_path + "/"

assay_type, model_type, cell, accession, modisco_dir_base = modisco_out_path.split("/")[-6:-1]
ts_part1, ts_part2, run_str, _ = modisco_dir_base.split("_")
timestamp = ts_part1 + "_" + ts_part2
run = int(run_str.replace("run", ""))
In [3]:
print(modisco_out_path)
print("cell_type:", cell, accession)
print("timestamp:", timestamp)
print("run:", run)
print("scoring_type:", scoring_type)
print("score_center_size:", score_center_size)
print("profile_display_center_size:", profile_display_center_size)
/home/users/kcochran/oak/kcochran/procap_models/modisco_out/procap/bpnetlite_basic_v2_umap/K562/ENCSR261KBX/2022-07-26_15-18-44_run1_modisco/
cell_type: K562 ENCSR261KBX
timestamp: 2022-07-26_15-18-44
run: 1
scoring_type: profile
score_center_size: 1000
profile_display_center_size: 400
In [4]:
data_dir = proj_root + "/data/procap/processed/" + cell + "/" + accession + "/"
plus_bw_path = data_dir + "final.5prime.pos.bigWig"
minus_bw_path = data_dir + "final.5prime.neg.bigWig"
val_peak_path = data_dir + "peaks_uni_and_bi_" + train_val_type + ".bed.gz"

val_save_dir = proj_root + "model_out/" + assay_type + "/" + model_type + "/" + cell + "/" + accession + "/"
val_save_path = val_save_dir + timestamp + "_run" + str(run) + "_" + train_val_type

attr_save_path = val_save_dir.replace("model_out", "deepshap_out") + timestamp + "_run" + str(run) + "_deepshap"
In [5]:
# task-specific filepaths

import os

assert scoring_type in ["profile", "counts"], scoring_type

if scoring_type == "profile":
    score_type_short = "prof"
else:
    score_type_short = "count"

scores_path = attr_save_path + "_" + score_type_short + ".npy"
onehot_scores_path = attr_save_path + "_" + score_type_short + "_onehot.npy"
modisco_obj_path = modisco_out_path + "results_allChroms_" + score_type_short + "_slice" + str(score_center_size) + ".hdf5"
seqlet_path = modisco_out_path + "seqlets_" + score_type_short + ".txt"
    
tomtom_dir = modisco_out_path + "tomtom_" + score_type_short
    
assert(os.path.exists(scores_path)), scores_path
assert(os.path.exists(onehot_scores_path)), onehot_scores_path
In [6]:
# Imports, Plotting Defaults

import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager

plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

from IPython.display import display
import tqdm
tqdm.tqdm_notebook()

import numpy as np

from view_modisco_results_utils import *
from tomtom_utils import *  
/home/users/kcochran/miniconda3/envs/procap/lib/python3.7/site-packages/ipykernel_launcher.py:19: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
In [7]:
# Load in True Profiles and Sequences

import sys
sys.path.append('../1_train_models')

from data_loading import extract_peaks

one_hot_seqs, true_profs = extract_peaks(sequence_path, 
    plus_bw_path, minus_bw_path, val_peak_path, in_window, out_window,
    max_jitter=0, verbose=True)

one_hot_seqs = one_hot_seqs.swapaxes(1,2)
one_hot_seqs = one_hot_seqs[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
Reading FASTA: 100%|██████████| 24/24 [00:14<00:00,  1.65it/s]
Loading Peaks: 27000it [00:30, 896.94it/s]
In [8]:
# Load in Coordinates of Examples
    
coords = load_coords(val_peak_path, in_window)
In [9]:
# Import SHAP scores, predicted profiles

hyp_scores = np.load(scores_path).swapaxes(1,2)
hyp_scores = hyp_scores[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
pred_profs = np.exp(np.load(val_save_path + ".profs.npy"))
In [10]:
# Load modisco results object
    
tfm_obj = import_tfmodisco_results(modisco_obj_path, hyp_scores, one_hot_seqs)
In [11]:
motif_pfms, motif_hcwms, motif_cwms, \
    motif_pfms_short, num_seqlets, \
    motif_seqlets, num_metaclusters = plot_all_metaclusters(tfm_obj, one_hot_seqs, hyp_scores,
                                                            true_profs, pred_profs, coords,
                                                            in_window, out_window,
                                                            score_center_size,
                                                            profile_display_center_size)

Metacluster 1/2

Pattern 1/35

13142 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/35

11079 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/35

6013 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/35

4916 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/35

3543 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/35

2625 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/35

2197 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/35

1321 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/35

1277 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/35

1177 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/35

1114 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/35

912 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/35

798 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/35

775 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 15/35

731 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 16/35

712 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 17/35

469 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 18/35

456 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 19/35

383 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 20/35

369 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 21/35

346 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 22/35

292 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 23/35

213 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 24/35

185 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 25/35

125 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 26/35

123 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 27/35

98 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 28/35

81 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 29/35

80 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 30/35

71 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 31/35

62 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 32/35

34 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 33/35

31 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 34/35

22 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 35/35

22 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/4

57 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/4

45 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/4

26 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/4

24 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)
In [12]:
run_and_plot_tomtom(modisco_out_path, motif_pfms, motif_hcwms, motif_pfms_short, num_metaclusters, tomtom_dir)

Metacluster 1/2

Motif 1/35

No TOMTOM matches passing threshold

Motif 2/35

Motif IDq-valPWM
MA0742.20.00021035
MA0516.30.00021035
MA0685.20.00021035
MA1961.10.00021035
MA0741.10.00021035

Motif 3/35

Motif IDq-valPWM
MA0314.22.48986e-05
MA0060.32.89337e-05
MA1644.12.89337e-05
MA0502.20.00938454
MA0316.10.0143206

Motif 4/35

Motif IDq-valPWM
MA0076.21.8465e-06
MA0750.21.77665e-05
MA0759.29.46198e-05
MA0156.39.46198e-05
MA0764.39.89699e-05

Motif 5/35

Motif IDq-valPWM
MA0506.23.63899e-06
MA1826.10.10563
MA1834.10.18026
MA1412.10.18026
MA0103.30.216734

Motif 6/35

Motif IDq-valPWM
MA1899.10.0120988
MA0967.10.0231571
MA1145.10.0261796
MA1933.10.0316458
MA1140.20.0316458

Motif 7/35

No TOMTOM matches passing threshold

Motif 8/35

No TOMTOM matches passing threshold

Motif 9/35

Motif IDq-valPWM
MA0478.10.000530903
MA1448.10.00167406
MA1988.10.00167406
MA0489.20.00265684
MA0591.10.00265684

Motif 10/35

Motif IDq-valPWM
MA1573.21.57455e-08
MA0088.20.0371589
MA1716.10.0443068
MA1625.10.416128
MA0525.20.484907

Motif 11/35

Motif IDq-valPWM
MA1820.17.64263e-08
MA1819.17.64263e-08
MA1817.18.54992e-08
MA1228.11.08862e-07
MA1833.11.83796e-07

Motif 12/35

Motif IDq-valPWM
MA0748.20.000173731
MA1818.10.000529731
MA0998.10.000529731
MA0975.10.000591737
MA1244.10.00127819

Motif 13/35

Motif IDq-valPWM
MA1833.10.0158297
MA1820.10.0158297
MA1713.10.0158297
MA1817.10.0158297
MA1819.10.0158297

Motif 14/35

No TOMTOM matches passing threshold

Motif 15/35

Motif IDq-valPWM
MA1257.11.5136e-06
MA1228.13.55625e-06
MA1262.11.01025e-05
MA1240.11.76848e-05
MA1820.12.27939e-05

Motif 16/35

Motif IDq-valPWM
MA0139.11.38779e-06
MA1929.19.05582e-06
MA1102.26.83092e-05
MA1930.17.63621e-05
MA0531.17.72704e-05

Motif 17/35

Motif IDq-valPWM
MA0527.16.02947e-06

Motif 18/35

Motif IDq-valPWM
MA1273.10.0627575
MA1268.10.0627575
MA1278.10.0627575
MA1279.10.0942126
MA0442.20.0942126

Motif 19/35

Motif IDq-valPWM
MA1927.10.137998
MA0998.10.137998
MA1001.30.137998
MA0976.20.137998
MA1819.10.137998

Motif 20/35

Motif IDq-valPWM
MA0506.20.000202055
MA1833.10.00921625
MA1819.10.00921971
MA1880.10.00921971
MA1650.10.00921971

Motif 21/35

Motif IDq-valPWM
MA1819.10.000898193
MA0975.10.000904884
MA1820.10.00132517
MA1832.10.00132517
MA1821.10.00132517

Motif 22/35

Motif IDq-valPWM
MA1832.17.67509e-06
MA1821.17.67509e-06
MA1833.11.04831e-05
MA1819.11.24131e-05
MA1817.11.72835e-05

Motif 23/35

Motif IDq-valPWM
MA1716.10.00103195
MA0088.20.00103195
MA1573.20.0104497

Motif 24/35

Motif IDq-valPWM
MA1596.10.0262465
MA0587.10.260434
MA1513.10.423274
MA1961.10.423274
MA1712.10.423274

Motif 25/35

Motif IDq-valPWM
MA0506.20.0056098
MA1961.10.014001
MA1817.10.0148954
MA1650.10.0198773
MA1820.10.0198773

Motif 26/35

No TOMTOM matches passing threshold

Motif 27/35

Motif IDq-valPWM
MA0748.20.287129
MA0998.10.375951
MA0975.10.481599
MA0997.10.481599
MA1748.10.481599

Motif 28/35

No TOMTOM matches passing threshold

Motif 29/35

Motif IDq-valPWM
MA1548.10.325533
MA1986.10.325533
MA1428.10.325533
MA1410.10.325533
MA1019.10.325533

Motif 30/35

Motif IDq-valPWM
MA1980.10.0719932
MA1615.10.0719932
MA1095.10.0719932
MA1098.10.0719932
MA0599.10.0719932

Motif 31/35

Motif IDq-valPWM
MA1596.16.95891e-06
MA1587.10.472156
MA1262.10.472156
MA1820.10.472156
MA0380.10.472156

Motif 32/35

No TOMTOM matches passing threshold

Motif 33/35

No TOMTOM matches passing threshold

Motif 34/35

Motif IDq-valPWM
MA0528.20.0629652
MA1522.10.175019
MA2029.10.175019
MA1961.10.175019
MA1965.10.175019

Motif 35/35

No TOMTOM matches passing threshold

Metacluster 2/2

Motif 1/4

Motif IDq-valPWM
MA1890.10.000582974
MA0742.20.000582974
MA0685.20.000582974
MA0516.30.000582974
MA0740.20.000582974

Motif 2/4

Motif IDq-valPWM
MA0060.30.00316378
MA1644.10.00413198
MA0316.10.00462462
MA0314.20.00491072
MA0502.20.0573061

Motif 3/4

Motif IDq-valPWM
MA0076.29.61333e-06
MA0750.20.000176732
MA0759.20.000176732
MA0156.30.000207048
MA0598.30.00036351

Motif 4/4

Motif IDq-valPWM
MA1890.15.26228e-05
MA1892.15.26228e-05
MA1961.15.26228e-05
MA1893.16.28185e-05
MA0516.39.59547e-05