In [1]:
# Filepaths and Hard-coded Defaults

proj_root = "/home/users/kcochran/oak/kcochran/procap_models/"
sequence_path = proj_root + "genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
chrom_sizes = proj_root + "genomes/hg38.chrom.sizes.withrRNA"

in_window = 2114
out_window = 1000
In [2]:
# stuff to get from config file

with open("2022-07-24_12-31-29_run1_modisco_config_K562_counts.txt") as config_f:
    config_dict = {line.split()[0] : line.strip().split()[1] for line in config_f}

modisco_out_path = config_dict["modisco_out_path"]
scoring_type = config_dict["scoring_type"]
score_center_size = int(config_dict["score_center_size"])
profile_display_center_size = int(config_dict["profile_display_center_size"])
train_val_type = config_dict["train_val_type"]

# digest what's in config file

if not modisco_out_path.endswith("/"):
    modisco_out_path = modisco_out_path + "/"

assay_type, model_type, cell, accession, modisco_dir_base = modisco_out_path.split("/")[-6:-1]
ts_part1, ts_part2, run_str, _ = modisco_dir_base.split("_")
timestamp = ts_part1 + "_" + ts_part2
run = int(run_str.replace("run", ""))
In [3]:
print(modisco_out_path)
print("cell_type:", cell, accession)
print("timestamp:", timestamp)
print("run:", run)
print("scoring_type:", scoring_type)
print("score_center_size:", score_center_size)
print("profile_display_center_size:", profile_display_center_size)
/home/users/kcochran/oak/kcochran/procap_models/modisco_out/procap/bpnetlite_basic_v2/K562/ENCSR261KBX/2022-07-24_12-31-29_run1_modisco/
cell_type: K562 ENCSR261KBX
timestamp: 2022-07-24_12-31-29
run: 1
scoring_type: counts
score_center_size: 1000
profile_display_center_size: 400
In [4]:
data_dir = proj_root + "/data/procap/processed/" + cell + "/" + accession + "/"
plus_bw_path = data_dir + "final.5prime.pos.bigWig"
minus_bw_path = data_dir + "final.5prime.neg.bigWig"
val_peak_path = data_dir + "peaks_uni_and_bi_" + train_val_type + ".bed.gz"

val_save_dir = proj_root + "model_out/" + assay_type + "/" + model_type + "/" + cell + "/" + accession + "/"
val_save_path = val_save_dir + timestamp + "_run" + str(run) + "_" + train_val_type

attr_save_path = val_save_dir.replace("model_out", "deepshap_out") + timestamp + "_run" + str(run) + "_deepshap"
In [5]:
# task-specific filepaths

import os

assert scoring_type in ["profile", "counts"], scoring_type

if scoring_type == "profile":
    score_type_short = "prof"
else:
    score_type_short = "count"

scores_path = attr_save_path + "_" + score_type_short + ".npy"
onehot_scores_path = attr_save_path + "_" + score_type_short + "_onehot.npy"
modisco_obj_path = modisco_out_path + "results_allChroms_" + score_type_short + "_slice" + str(score_center_size) + ".hdf5"
seqlet_path = modisco_out_path + "seqlets_" + score_type_short + ".txt"
    
tomtom_dir = modisco_out_path + "tomtom_" + score_type_short
    
assert(os.path.exists(scores_path)), scores_path
assert(os.path.exists(onehot_scores_path)), onehot_scores_path
In [6]:
# Imports, Plotting Defaults

import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager

plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

from IPython.display import display
import tqdm
tqdm.tqdm_notebook()

import numpy as np

from view_modisco_results_utils import *
from tomtom_utils import *  
/home/users/kcochran/miniconda3/envs/procap/lib/python3.7/site-packages/ipykernel_launcher.py:19: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
In [7]:
# Load in True Profiles and Sequences

import sys
sys.path.append('../1_train_models')

from data_loading import extract_peaks

one_hot_seqs, true_profs = extract_peaks(sequence_path, 
    plus_bw_path, minus_bw_path, val_peak_path, in_window, out_window,
    max_jitter=0, verbose=True)

one_hot_seqs = one_hot_seqs.swapaxes(1,2)
one_hot_seqs = one_hot_seqs[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
Reading FASTA: 100%|██████████| 24/24 [00:35<00:00,  1.47s/it]
Loading Peaks: 27000it [00:37, 726.09it/s]
In [8]:
# Load in Coordinates of Examples
    
coords = load_coords(val_peak_path, in_window)
In [9]:
# Import SHAP scores, predicted profiles

hyp_scores = np.load(scores_path).swapaxes(1,2)
hyp_scores = hyp_scores[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
pred_profs = np.exp(np.load(val_save_path + ".profs.npy"))
In [10]:
# Load modisco results object
    
tfm_obj = import_tfmodisco_results(modisco_obj_path, hyp_scores, one_hot_seqs)
In [11]:
motif_pfms, motif_hcwms, motif_cwms, \
    motif_pfms_short, num_seqlets, \
    motif_seqlets, num_metaclusters = plot_all_metaclusters(tfm_obj, one_hot_seqs, hyp_scores,
                                                            true_profs, pred_profs, coords,
                                                            in_window, out_window,
                                                            score_center_size,
                                                            profile_display_center_size)

Metacluster 1/2

Pattern 1/20

9372 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/20

8027 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/20

5603 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/20

5579 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/20

3722 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/20

1480 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/20

1090 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/20

1075 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/20

1054 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/20

1006 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/20

826 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/20

822 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/20

680 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/20

338 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 15/20

289 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 16/20

107 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 17/20

76 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 18/20

40 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 19/20

30 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 20/20

24 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/8

137 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/8

98 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/8

70 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/8

64 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/8

63 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/8

47 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/8

36 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/8

22 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)
In [12]:
run_and_plot_tomtom(modisco_out_path, motif_pfms, motif_hcwms, motif_pfms_short, num_metaclusters, tomtom_dir)

Metacluster 1/2

Motif 1/20

Motif IDq-valPWM
MA0516.30.000151491
MA0742.20.000153393
MA0079.50.000161258
MA0685.20.000161258
MA1511.20.000161258

Motif 2/20

Motif IDq-valPWM
MA0076.25.70795e-05
MA0764.30.000125721
MA0759.20.000125721
MA0916.10.000141856
MA0765.30.000271855

Motif 3/20

Motif IDq-valPWM
MA0060.34.28622e-05
MA0316.14.28622e-05
MA1644.14.28622e-05
MA0314.27.41335e-05
MA0502.20.0275738

Motif 4/20

Motif IDq-valPWM
MA0506.21.03877e-05
MA1826.10.287145
MA0048.20.395958
MA1106.10.395958
MA1061.10.395958

Motif 5/20

Motif IDq-valPWM
MA1475.10.000192117
MA0609.20.000278576
MA1346.10.0013707
MA1438.10.0013707
MA1133.10.00139121

Motif 6/20

Motif IDq-valPWM
MA1573.21.66769e-08
MA0088.20.0189088
MA1716.10.0210958
MA1625.10.345819
MA0519.10.438711

Motif 7/20

Motif IDq-valPWM
MA0748.20.000237995
MA0998.10.00127309
MA1004.10.00352335
MA0975.10.00365308
MA1651.10.00365308

Motif 8/20

Motif IDq-valPWM
MA0139.11.0599e-06
MA1929.11.7404e-05
MA1102.22.27925e-05
MA1930.13.91608e-05
MA0531.16.25181e-05

Motif 9/20

Motif IDq-valPWM
MA1053.10.00239086
MA1051.10.00269004
MA0997.10.00492725
MA0975.10.00492725
MA0998.10.015186

Motif 10/20

Motif IDq-valPWM
MA0062.31.05054e-06
MA0473.32.34168e-06
MA0598.32.34168e-06
MA0474.35.35136e-06
MA1992.15.35136e-06

Motif 11/20

Motif IDq-valPWM
MA0527.11.7272e-06

Motif 12/20

Motif IDq-valPWM
MA0501.10.000865897
MA0591.10.000865897
MA0150.20.000865897
MA0489.20.000865897
MA1448.10.000865897

Motif 13/20

Motif IDq-valPWM
MA1053.10.000146374
MA1051.10.00190959
MA1102.20.00269784
MA0975.10.00632135
MA0531.10.0104988

Motif 14/20

Motif IDq-valPWM
MA0361.10.155987
MA1961.10.155987
MA0979.10.155987
MA1713.10.155987
MA1006.10.155987

Motif 15/20

Motif IDq-valPWM
MA1102.20.00264256
MA1053.10.00396152
MA1051.10.00498648
MA0139.10.00514376
MA0531.10.00767921

Motif 16/20

Motif IDq-valPWM
MA1004.10.0777813
MA1460.10.0777813
MA0748.20.0777813
MA0544.10.0777813
MA0998.10.103456

Motif 17/20

Motif IDq-valPWM
MA1819.10.154935
MA1820.10.257593
MA1880.10.257593
MA1832.10.268619
MA1821.10.268619

Motif 18/20

Motif IDq-valPWM
MA0527.10.126239

Motif 19/20

Motif IDq-valPWM
MA1053.10.203181
MA0597.20.203181
MA1051.10.203181
MA0456.10.203181
MA1730.10.224849

Motif 20/20

Motif IDq-valPWM
MA1050.10.0873503
MA1066.10.0873503
MA1097.10.192887
MA1095.10.20955
MA1098.10.20955

Metacluster 2/2

Motif 1/8

Motif IDq-valPWM
MA1267.10.0266873
MA1380.10.0280915
MA1268.10.0373389
MA1863.10.0380333
MA1871.10.0380333

Motif 2/8

Motif IDq-valPWM
MA1267.12.1566e-07
MA1268.16.98031e-07
MA1281.16.98031e-07
MA1274.11.31759e-06
MA1871.13.12224e-06

Motif 3/8

Motif IDq-valPWM
MA1826.10.030388
MA1412.10.0340818
MA1822.10.0586347
MA0375.10.0586347
MA1560.10.0586347

Motif 4/8

Motif IDq-valPWM
MA1125.10.0151513
MA1380.10.0151513
MA1205.10.0230788
MA1823.10.0230788
MA1379.10.0230788

Motif 5/8

Motif IDq-valPWM
MA0538.10.000641539
MA1865.10.292076
MA1107.20.373096

Motif 6/8

Motif IDq-valPWM
MA0538.10.000319051
MA1890.10.0141953
MA1627.10.020239
MA0073.10.0606604
MA1893.10.0724721

Motif 7/8

Motif IDq-valPWM
MA1281.12.41325e-07
MA1274.12.17542e-06
MA1268.13.74801e-06
MA1278.15.31535e-06
MA1277.11.83689e-05

Motif 8/8

Motif IDq-valPWM
MA1864.10.231954
MA1862.10.308981
MA1366.10.308981
MA1917.10.308981
MA1870.10.308981