In [1]:
# Filepaths and Hard-coded Defaults

proj_root = "/users/kcochran/projects/procap_models/"
sequence_path = proj_root + "genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
chrom_sizes = proj_root + "genomes/hg38.chrom.sizes.withrRNA"

in_window = 2114
out_window = 1000
In [2]:
# stuff to get from config file

with open("2022-07-04_06-07-54_run1_modisco_config_CALU3_profile.txt") as config_f:
    config_dict = {line.split()[0] : line.strip().split()[1] for line in config_f}

modisco_out_path = config_dict["modisco_out_path"]
scoring_type = config_dict["scoring_type"]
score_center_size = int(config_dict["score_center_size"])
profile_display_center_size = int(config_dict["profile_display_center_size"])

# digest what's in config file

assay_type, model_type, cell, accession, modisco_dir_base = modisco_out_path.split("/")[-5:]
ts_part1, ts_part2, run_str, _ = modisco_dir_base.split("_")
timestamp = ts_part1 + "_" + ts_part2
run = int(run_str.replace("run", ""))
In [3]:
print(modisco_out_path)
print("cell_type:", cell, accession)
print("timestamp:", timestamp)
print("run:", run)
print("scoring_type:", scoring_type)
print("score_center_size:", score_center_size)
print("profile_display_center_size:", profile_display_center_size)
/users/kcochran//projects/procap_models/modisco_out/procap/bpnetlite_basic_v2/CALU3/ENCSR935RNW/2022-07-04_06-07-54_run1_modisco
cell_type: CALU3 ENCSR935RNW
timestamp: 2022-07-04_06-07-54
run: 1
scoring_type: profile
score_center_size: 1000
profile_display_center_size: 400
In [4]:
data_dir = proj_root + "/data/procap/processed/" + cell + "/" + accession + "/"
plus_bw_path = data_dir + "final.5prime.pos.bigWig"
minus_bw_path = data_dir + "final.5prime.neg.bigWig"
val_peak_path = data_dir + "peaks_uni_and_bi_train_and_val.bed.gz"

val_save_dir = proj_root + "model_out/" + assay_type + "/" + model_type + "/" + cell + "/" + accession + "/"
val_save_path = val_save_dir + timestamp + "_run" + str(run) + "_train_and_val"

attr_save_path = val_save_dir.replace("model_out", "deepshap_out") + timestamp + "_run" + str(run) + "_deepshap"

if not modisco_out_path.endswith("/"):
    modisco_out_path = modisco_out_path + "/"
In [5]:
# task-specific filepaths

import os

assert scoring_type in ["profile", "counts"], scoring_type

if scoring_type == "profile":
    scores_path = attr_save_path + "_prof.npy"
    onehot_scores_path = attr_save_path + "_prof_onehot.npy"
    modisco_obj_path = modisco_out_path + "results_allChroms_prof_slice" + str(score_center_size) + ".hdf5"
    seqlet_path = modisco_out_path + "seqlets_prof.txt"
else:
    scores_path = attr_save_path + "_count.npy"
    onehot_scores_path = attr_save_path + "_count_onehot.npy"
    modisco_obj_path = modisco_out_path + "results_allChroms_count_slice" + str(score_center_size) + ".hdf5"
    seqlet_path = modisco_out_path + "seqlets_count.txt"
    
assert(os.path.exists(scores_path)), scores_path
assert(os.path.exists(onehot_scores_path)), onehot_scores_path
In [6]:
# Imports, Plotting Defaults

import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager

plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

from IPython.display import display
import tqdm
tqdm.tqdm_notebook()

import numpy as np

from view_modisco_results_utils import *
/users/kcochran/miniconda3/envs/procap/lib/python3.7/site-packages/ipykernel_launcher.py:19: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
In [7]:
# Load in True Profiles and Sequences

import sys
sys.path.append('../1_train_models')

from data_loading import extract_peaks

one_hot_seqs, true_profs = extract_peaks(sequence_path, 
    plus_bw_path, minus_bw_path, val_peak_path, in_window, out_window,
    max_jitter=0, verbose=True)

one_hot_seqs = one_hot_seqs.swapaxes(1,2)
one_hot_seqs = one_hot_seqs[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
Reading FASTA: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:18<00:00,  1.27it/s]
Loading Peaks: 42440it [01:25, 498.51it/s]
In [8]:
# Load in Coordinates of Examples
    
coords = load_coords(val_peak_path, in_window)
In [9]:
# Import SHAP scores, predicted profiles

hyp_scores = np.load(scores_path).swapaxes(1,2)
hyp_scores = hyp_scores[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
pred_profs = np.exp(np.load(val_save_path + ".profs.npy"))
In [10]:
# Load modisco results object
    
tfm_obj = import_tfmodisco_results(modisco_obj_path, hyp_scores, one_hot_seqs)
In [11]:
motif_pfms, motif_hcwms, motif_cwms, \
    motif_pfms_short, num_seqlets, \
    motif_seqlets, num_metaclusters = plot_all_metaclusters(tfm_obj, one_hot_seqs, hyp_scores,
                                                            true_profs, pred_profs, coords,
                                                            in_window, out_window,
                                                            score_center_size,
                                                            profile_display_center_size)

Metacluster 1/2

Pattern 1/27

15356 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/27

9294 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/27

7635 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/27

5747 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/27

5081 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/27

4155 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/27

3508 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/27

2248 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/27

1998 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/27

1654 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/27

1567 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/27

1394 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/27

1341 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/27

750 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 15/27

747 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 16/27

626 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 17/27

575 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 18/27

461 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 19/27

319 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 20/27

179 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 21/27

173 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 22/27

155 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 23/27

128 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 24/27

51 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 25/27

36 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 26/27

21 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 27/27

21 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/5

91 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/5

34 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/5

26 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/5

24 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/5

21 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)
In [12]:
run_and_plot_tomtom(modisco_out_path, motif_pfms, motif_hcwms, motif_pfms_short, num_metaclusters)

Metacluster 1/2

The output directory '/users/kcochran//projects/procap_models/modisco_out/procap/bpnetlite_basic_v2/CALU3/ENCSR935RNW/2022-07-04_06-07-54_run1_modisco/tomtom' already exists.
Its contents will be overwritten.
Processing query 1 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.974985
Processing query 2 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.982119
Processing query 3 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.969821
Processing query 4 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.986014
Processing query 5 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 6 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.984986
Processing query 7 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.95846
Processing query 8 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.994092
Processing query 9 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.95285
Processing query 10 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.99435
Processing query 11 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.979116
Processing query 12 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.969283
Processing query 13 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 14 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.957667
Processing query 15 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 16 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.96545
Processing query 17 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.966464
Processing query 18 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 19 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 20 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.954502
Processing query 21 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.998346
Processing query 22 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.981903
Processing query 23 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 24 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.962674
Processing query 25 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.974239
Processing query 26 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 27 out of 27 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.997061
Can't locate HtmlMonolithWr.pm:   /root/lib/perl/HtmlMonolithWr.pm: Permission denied at /software/meme/4.11.2/bin/tomtom_xml_to_html line 48.
Warning: tomtom_xml_to_html exited abnormally and may have failed to create HTML output.

Motif 1/27

Motif IDq-valPWM
MA0079.5-SP10.0155889
MA0516.3-SP20.0155889
MA0742.2-KLF120.0155889
MA1511.2-KLF100.0155889
MA1818.1-Zm00001d0522290.0202029

Motif 2/27

Motif IDq-valPWM
MA1708.1-ETV70.263892
MA1221.1-RAP2-60.263892
MA1247.1-ERF0870.263892
MA1225.1-ERF50.263892
MA1958.1-HOXD12::ELK10.263892

Motif 3/27

Motif IDq-valPWM
MA1513.1-KLF155.50158e-05
MA0746.2-SP30.0001767
MA0599.1-KLF50.0001767
MA1564.1-SP90.0001767
MA1961.1-PATZ10.0001767

Motif 4/27

Motif IDq-valPWM
MA0076.2-ELK44.6323e-05
MA0750.2-ZBTB7A4.6323e-05
MA0026.1-Eip74EF4.6323e-05
MA0764.3-ETV40.000110984
MA0759.2-ELK30.00013318

Motif 5/27

Motif IDq-valPWM
MA0314.2-HAP33.31232e-05
MA0060.3-NFYA3.31232e-05
MA1644.1-NFYC3.31232e-05
MA0502.2-NFYB0.00708111
MA0316.1-HAP50.0113831

Motif 6/27

Motif IDq-valPWM
MA1988.1-Atf37.24464e-05
MA0489.2-Jun0.000130246
MA1928.1-BNC20.000193325
MA1448.1-fos-10.000219012
MA0099.3-FOS::JUN0.000302937

Motif 7/27

Motif IDq-valPWM
MA1475.1-CREB3L40.000730204
MA1438.1-atf-70.000915829
MA1346.1-TGA100.00175037
MA0609.2-CREM0.00175037
MA1348.1-TGA90.00175037

Motif 8/27

Motif IDq-valPWM
MA0506.2-Nrf11.95985e-05
MA1412.1-TSAR20.406194
MA1560.1-SOHLH20.406194
MA1826.1-bHLH1450.406194

Motif 9/27

Motif IDq-valPWM
MA0535.1-Mad0.000690477
MA1818.1-Zm00001d0522290.00129126
MA1820.1-Zm00001d0243240.00129126
MA1819.1-Zm00001d0058920.00129126
MA1833.1-Zm00001d0493640.00129126

Motif 10/27

No TOMTOM matches passing threshold

Motif 11/27

Motif IDq-valPWM
MA1513.1-KLF150.00735434
MA0742.2-KLF120.00735434
MA1511.2-KLF100.00735434
MA0079.5-SP10.00735434
MA0516.3-SP20.00735434

Motif 12/27

Motif IDq-valPWM
MA0748.2-YY21.40496e-05
MA1819.1-Zm00001d0058924.04086e-05
MA1833.1-Zm00001d0493644.04086e-05
MA1821.1-Zm00001d0205957.32796e-05
MA1832.1-Zm00001d0023647.32796e-05

Motif 13/27

Motif IDq-valPWM
MA0108.2-TBP0.000851541

Motif 14/27

Motif IDq-valPWM
MA0139.1-CTCF9.94794e-08
MA1929.1-CTCF2.71582e-06
MA1930.1-CTCF2.71582e-06
MA1102.2-CTCFL6.68715e-06
MA0531.1-CTCF6.49032e-05

Motif 15/27

Motif IDq-valPWM
MA1573.2-Thap112.30879e-08
MA0088.2-ZNF1430.0200412
MA1716.1-ZNF760.0300004
MA1625.1-Stat5b0.314796
MA0525.2-TP630.376267

Motif 16/27

Motif IDq-valPWM
MA0153.2-HNF1B2.49254e-05
MA0046.2-HNF1A2.49254e-05
MA0186.1-Dfd0.00219425
MA1198.1-HAT20.0855313
MA0203.1-Scr0.0877376

Motif 17/27

Motif IDq-valPWM
MA1833.1-Zm00001d0493640.000498525
MA1961.1-PATZ10.000852339
MA1820.1-Zm00001d0243240.000852339
MA1713.1-ZNF6100.000852339
MA1513.1-KLF150.000852339

Motif 18/27

Motif IDq-valPWM
MA0527.1-ZBTB332.68474e-07

Motif 19/27

Motif IDq-valPWM
MA0379.1-MOT20.367171

Motif 20/27

Motif IDq-valPWM
MA1833.1-Zm00001d0493640.000237566
MA1819.1-Zm00001d0058920.000354848
MA1820.1-Zm00001d0243240.00036996
MA1817.1-Zm00001d0202670.00036996
MA1832.1-Zm00001d0023640.000454617

Motif 21/27

Motif IDq-valPWM
MA0600.2-RFX23.02498e-07
MA0509.3-RFX13.02498e-07
MA0510.2-RFX53.02498e-07
MA0798.3-RFX33.02498e-07
MA1554.1-RFX70.0326337

Motif 22/27

Motif IDq-valPWM
MA0018.4-CREB10.000560801
MA0488.1-JUN0.000937485
MA0492.1-JUND0.00175052
MA1475.1-CREB3L40.00175052
MA1069.2-TGA60.00175052

Motif 23/27

No TOMTOM matches passing threshold

Motif 24/27

Motif IDq-valPWM
MA0506.2-Nrf10.000919615
MA1880.1-Hey0.000919615
MA1976.1-ZNF3200.00132244
MA1650.1-ZBTB140.00663839
MA1877.1-Hes-b0.0143771

Motif 25/27

Motif IDq-valPWM
MA1833.1-Zm00001d0493648.15337e-07
MA2022.1-LOB2.8127e-06
MA1239.1-ERF1042.8127e-06
MA1262.1-ERF26.60049e-06
MA1257.1-ERF98.3076e-06

Motif 26/27

Motif IDq-valPWM
MA1684.1-Foxn10.34168
MA1508.1-IKZF10.34168

Motif 27/27

Motif IDq-valPWM
MA1573.2-Thap110.0291774
MA1716.1-ZNF760.0291774
MA0399.1-SUT10.0365294
MA0518.1-Stat40.0468401
MA1285.1-TCP90.0602642

Metacluster 2/2

The output directory '/users/kcochran//projects/procap_models/modisco_out/procap/bpnetlite_basic_v2/CALU3/ENCSR935RNW/2022-07-04_06-07-54_run1_modisco/tomtom' already exists.
Its contents will be overwritten.
Processing query 1 out of 5 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.965142
Processing query 2 out of 5 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.989611
Processing query 3 out of 5 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.986862
Processing query 4 out of 5 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.968501
Processing query 5 out of 5 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.942118
Can't locate HtmlMonolithWr.pm:   /root/lib/perl/HtmlMonolithWr.pm: Permission denied at /software/meme/4.11.2/bin/tomtom_xml_to_html line 48.
Warning: tomtom_xml_to_html exited abnormally and may have failed to create HTML output.

Motif 1/5

Motif IDq-valPWM
MA1892.1-Klf5-like0.000154338
MA0516.3-SP20.000154338
MA0685.2-SP40.000154338
MA0742.2-KLF120.000171891
MA0079.5-SP10.000195002

Motif 2/5

Motif IDq-valPWM
MA1141.1-FOS::JUND0.00613298
MA1448.1-fos-10.00613298
MA1130.1-FOSL2::JUN0.00984781
MA0591.1-Bach1::Mafk0.0134008
MA1137.1-FOSL1::JUNB0.0134008

Motif 3/5

Motif IDq-valPWM
MA0916.1-Ets21C0.000287125
MA0156.3-FEV0.00191169
MA0641.1-ELF40.00269273
MA0759.2-ELK30.00335445
MA1483.2-ELF20.00351154

Motif 4/5

Motif IDq-valPWM
MA0506.2-Nrf10.000374622
MA1880.1-Hey0.00407735
MA1650.1-ZBTB140.0142805
MA1893.1-Klf6-7-like0.0635705
MA1713.1-ZNF6100.0635705

Motif 5/5

Motif IDq-valPWM
MA0609.2-CREM0.000879226
MA1348.1-TGA90.000879226
MA1820.1-Zm00001d0243240.00455812
MA1817.1-Zm00001d0202670.00455812
MA0967.1-BZIP600.00455812