In [1]:
# Filepaths and Hard-coded Defaults

proj_root = "/users/kcochran/projects/procap_models/"
sequence_path = proj_root + "genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
chrom_sizes = proj_root + "genomes/hg38.chrom.sizes.withrRNA"

in_window = 2114
out_window = 1000
In [2]:
# stuff to get from config file

with open("2022-07-04_06-07-54_run1_modisco_config.txt") as config_f:
    config_dict = {line.split()[0] : line.strip().split()[1] for line in config_f}

modisco_out_path = config_dict["modisco_out_path"]
scoring_type = config_dict["scoring_type"]
score_center_size = int(config_dict["score_center_size"])
profile_display_center_size = int(config_dict["profile_display_center_size"])

# digest what's in config file

assay_type, model_type, cell, accession, modisco_dir_base = modisco_out_path.split("/")[-5:]
ts_part1, ts_part2, run_str, _ = modisco_dir_base.split("_")
timestamp = ts_part1 + "_" + ts_part2
run = int(run_str.replace("run", ""))
In [3]:
print(modisco_out_path)
print("cell_type:", cell, accession)
print("timestamp:", timestamp)
print("run:", run)
print("scoring_type:", scoring_type)
print("score_center_size:", score_center_size)
print("profile_display_center_size:", profile_display_center_size)
/users/kcochran//projects/procap_models/modisco_out/procap/bpnetlite_basic_v2/CALU3/ENCSR935RNW/2022-07-04_06-07-54_run1_modisco
cell_type: CALU3 ENCSR935RNW
timestamp: 2022-07-04_06-07-54
run: 1
scoring_type: counts
score_center_size: 1000
profile_display_center_size: 400
In [4]:
data_dir = proj_root + "/data/procap/processed/" + cell + "/" + accession + "/"
plus_bw_path = data_dir + "final.5prime.pos.bigWig"
minus_bw_path = data_dir + "final.5prime.neg.bigWig"
val_peak_path = data_dir + "peaks_uni_and_bi_train_and_val.bed.gz"

val_save_dir = proj_root + "model_out/" + assay_type + "/" + model_type + "/" + cell + "/" + accession + "/"
val_save_path = val_save_dir + timestamp + "_run" + str(run) + "_train_and_val"

attr_save_path = val_save_dir.replace("model_out", "deepshap_out") + timestamp + "_run" + str(run) + "_deepshap"

if not modisco_out_path.endswith("/"):
    modisco_out_path = modisco_out_path + "/"
In [5]:
# task-specific filepaths

import os

assert scoring_type in ["profile", "counts"], scoring_type

if scoring_type == "profile":
    scores_path = attr_save_path + "_prof.npy"
    onehot_scores_path = attr_save_path + "_prof_onehot.npy"
    modisco_obj_path = modisco_out_path + "results_allChroms_prof_slice500.hdf5"
    seqlet_path = modisco_out_path + "seqlets_prof.txt"
else:
    scores_path = attr_save_path + "_count.npy"
    onehot_scores_path = attr_save_path + "_count_onehot.npy"
    modisco_obj_path = modisco_out_path + "results_allChroms_count_slice500.hdf5"
    seqlet_path = modisco_out_path + "seqlets_count.txt"
    
assert(os.path.exists(scores_path)), scores_path
assert(os.path.exists(onehot_scores_path)), onehot_scores_path
In [6]:
# Imports, Plotting Defaults

import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager

plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

from IPython.display import display
import tqdm
tqdm.tqdm_notebook()

import numpy as np

from view_modisco_results_utils import *
/users/kcochran/miniconda3/envs/procap/lib/python3.7/site-packages/ipykernel_launcher.py:19: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
In [7]:
# Load in True Profiles and Sequences

import sys
sys.path.append('../1_train_models')

from data_loading import extract_peaks

one_hot_seqs, true_profs = extract_peaks(sequence_path, 
    plus_bw_path, minus_bw_path, val_peak_path, in_window, out_window,
    max_jitter=0, verbose=True)

one_hot_seqs = one_hot_seqs.swapaxes(1,2)
one_hot_seqs = one_hot_seqs[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
Reading FASTA: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:15<00:00,  1.50it/s]
Loading Peaks: 42440it [01:27, 484.73it/s]
In [8]:
# Load in Coordinates of Examples
    
coords = load_coords(val_peak_path, in_window)
In [9]:
# Import SHAP scores, predicted profiles

hyp_scores = np.load(scores_path).swapaxes(1,2)
hyp_scores = hyp_scores[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
pred_profs = np.exp(np.load(val_save_path + ".profs.npy"))
In [10]:
# Load modisco results object
    
tfm_obj = import_tfmodisco_results(modisco_obj_path, hyp_scores, one_hot_seqs)
In [11]:
motif_pfms, motif_hcwms, motif_cwms, \
    motif_pfms_short, num_seqlets, \
    motif_seqlets, num_metaclusters = plot_all_metaclusters(tfm_obj, one_hot_seqs, hyp_scores,
                                                            true_profs, pred_profs, coords,
                                                            in_window, out_window,
                                                            score_center_size,
                                                            profile_display_center_size)

Metacluster 1/2

Pattern 1/21

16466 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/21

12273 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/21

7315 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/21

6561 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/21

5651 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/21

4446 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/21

2981 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/21

1194 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/21

1095 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/21

1018 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/21

1008 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/21

830 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/21

709 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/21

552 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 15/21

384 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 16/21

338 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 17/21

332 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 18/21

314 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 19/21

168 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 20/21

72 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 21/21

22 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/13

104 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/13

102 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/13

87 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/13

76 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/13

57 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/13

55 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/13

50 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/13

48 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/13

46 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/13

46 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/13

36 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/13

33 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/13

23 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)
In [12]:
run_and_plot_tomtom(modisco_out_path, motif_pfms, motif_hcwms, motif_pfms_short, num_metaclusters)

Metacluster 1/2

The output directory '/users/kcochran//projects/procap_models/modisco_out/procap/bpnetlite_basic_v2/CALU3/ENCSR935RNW/2022-07-04_06-07-54_run1_modisco/tomtom' already exists.
Its contents will be overwritten.
Processing query 1 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.9855
Processing query 2 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.971112
Processing query 3 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 4 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.957097
Processing query 5 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.9855
Processing query 6 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.99552
Processing query 7 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.978413
Processing query 8 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 9 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 10 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.97137
Processing query 11 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.969304
Processing query 12 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.964105
Processing query 13 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.959048
Processing query 14 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.989354
Processing query 15 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.951546
Processing query 16 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 17 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.998346
Processing query 18 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.980105
Processing query 19 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.990608
Processing query 20 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.963325
Processing query 21 out of 21 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.998346
Can't locate HtmlMonolithWr.pm:   /root/lib/perl/HtmlMonolithWr.pm: Permission denied at /software/meme/4.11.2/bin/tomtom_xml_to_html line 48.
Warning: tomtom_xml_to_html exited abnormally and may have failed to create HTML output.

Motif 1/21

Motif IDq-valPWM
MA0026.1-Eip74EF2.88216e-05
MA0916.1-Ets21C4.67609e-05
MA1483.2-ELF24.67609e-05
MA0750.2-ZBTB7A4.67609e-05
MA0764.3-ETV46.30128e-05

Motif 2/21

Motif IDq-valPWM
MA1513.1-KLF154.38475e-05
MA0516.3-SP20.000162585
MA0685.2-SP40.000162585
MA0742.2-KLF120.000162585
MA1511.2-KLF100.000183661

Motif 3/21

Motif IDq-valPWM
MA0314.2-HAP33.56007e-05
MA0060.3-NFYA3.56007e-05
MA1644.1-NFYC3.56007e-05
MA0502.2-NFYB0.00844247
MA0316.1-HAP50.0137046

Motif 4/21

Motif IDq-valPWM
MA1475.1-CREB3L40.00397176
MA1140.2-JUNB0.00538285
MA0831.3-TFE30.00538285
MA1131.1-FOSL2::JUN0.00538285
MA1336.1-TGA30.00538285

Motif 5/21

Motif IDq-valPWM
MA1988.1-Atf36.32078e-05
MA1928.1-BNC26.32078e-05
MA0478.1-FOSL26.32078e-05
MA0099.3-FOS::JUN0.00011464
MA1130.1-FOSL2::JUN0.00011464

Motif 6/21

Motif IDq-valPWM
MA0506.2-Nrf10.000654039
MA1412.1-TSAR20.24566
MA1560.1-SOHLH20.339911
MA1826.1-bHLH1450.339911

Motif 7/21

Motif IDq-valPWM
MA0748.2-YY27.45027e-05
MA0998.1-ERF0960.00019906
MA1818.1-Zm00001d0522290.000216057
MA1651.1-ZFP420.000530575
MA0997.1-ERF0690.000991683

Motif 8/21

Motif IDq-valPWM
MA1573.2-Thap113.32932e-08
MA0088.2-ZNF1430.0221146
MA1716.1-ZNF760.0431596
MA0525.2-TP630.345592
MA1625.1-Stat5b0.345592

Motif 9/21

Motif IDq-valPWM
MA0527.1-ZBTB334.48318e-07

Motif 10/21

Motif IDq-valPWM
MA0139.1-CTCF4.84684e-08
MA1929.1-CTCF2.743e-06
MA1930.1-CTCF1.86695e-05
MA0531.1-CTCF2.5466e-05
MA1102.2-CTCFL4.12234e-05

Motif 11/21

Motif IDq-valPWM
MA1053.1-ERF1090.00968733
MA0975.1-CRF20.00968733
MA0997.1-ERF0690.00968733
MA1820.1-Zm00001d0243240.00968733
MA1817.1-Zm00001d0202670.00968733

Motif 12/21

Motif IDq-valPWM
MA0046.2-HNF1A2.60366e-05
MA0153.2-HNF1B2.60366e-05
MA0186.1-Dfd0.0134684
MA0203.1-Scr0.0771914
MA1198.1-HAT20.0771914

Motif 13/21

Motif IDq-valPWM
MA0375.1-RSC300.0253466
MA1650.1-ZBTB140.0461101
MA1834.1-Zm00001d0342980.0530144
MA0506.2-Nrf10.0574591
MA1817.1-Zm00001d0202670.0607138

Motif 14/21

Motif IDq-valPWM
MA1933.1-ELK1::SREBF20.000186821
MA1939.1-ERF::SREBF20.00127576
MA0765.3-ETV50.0224442
MA0026.1-Eip74EF0.0224442
MA0759.2-ELK30.0224442

Motif 15/21

Motif IDq-valPWM
MA1817.1-Zm00001d0202677.11434e-06
MA1820.1-Zm00001d0243241.26099e-05
MA1833.1-Zm00001d0493641.26099e-05
MA1819.1-Zm00001d0058921.31765e-05
MA1832.1-Zm00001d0023642.29215e-05

Motif 16/21

Motif IDq-valPWM
MA1573.2-Thap115.60403e-08
MA0088.2-ZNF1430.00148189
MA1716.1-ZNF760.0020091
MA1625.1-Stat5b0.170224
MA0519.1-Stat5a::Stat5b0.326336

Motif 17/21

Motif IDq-valPWM
MA0509.3-RFX12.78342e-07
MA0600.2-RFX22.78342e-07
MA0798.3-RFX32.78342e-07
MA0510.2-RFX52.78342e-07
MA1554.1-RFX70.0326001

Motif 18/21

Motif IDq-valPWM
MA1475.1-CREB3L40.000375454
MA0018.4-CREB10.000375454
MA0605.2-ATF30.000533525
MA1069.2-TGA60.000890161
MA0656.1-JDP20.000890161

Motif 19/21

Motif IDq-valPWM
MA0139.1-CTCF8.14271e-09
MA1929.1-CTCF1.26092e-07
MA1930.1-CTCF6.01176e-06
MA1102.2-CTCFL0.00216035
MA0531.1-CTCF0.0150477

Motif 20/21

Motif IDq-valPWM
MA1513.1-KLF150.000228979
MA0742.2-KLF120.000484619
MA0740.2-KLF140.000484619
MA0516.3-SP20.000484619
MA1511.2-KLF100.000484619

Motif 21/21

Motif IDq-valPWM
MA0506.2-Nrf10.0544674
MA0632.2-TCFL50.28589
MA1826.1-bHLH1450.28589
MA1411.1-TSAR10.495726
MA1412.1-TSAR20.495726

Metacluster 2/2

The output directory '/users/kcochran//projects/procap_models/modisco_out/procap/bpnetlite_basic_v2/CALU3/ENCSR935RNW/2022-07-04_06-07-54_run1_modisco/tomtom' already exists.
Its contents will be overwritten.
Processing query 1 out of 13 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.981182
Processing query 2 out of 13 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.990477
Processing query 3 out of 13 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.982731
Processing query 4 out of 13 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.960509
Processing query 5 out of 13 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.999116
Processing query 6 out of 13 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.998089
Processing query 7 out of 13 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 8 out of 13 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 9 out of 13 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.9794
Processing query 10 out of 13 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.996933
Processing query 11 out of 13 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.987812
Processing query 12 out of 13 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 13 out of 13 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Can't locate HtmlMonolithWr.pm:   /root/lib/perl/HtmlMonolithWr.pm: Permission denied at /software/meme/4.11.2/bin/tomtom_xml_to_html line 48.
Warning: tomtom_xml_to_html exited abnormally and may have failed to create HTML output.

Motif 1/13

Motif IDq-valPWM
MA1268.1-CDF50.0378524
MA1884.1-Hox30.0408994
MA1871.1-FoxM0.0408994
MA1917.1-SoxC0.0408994
MA1281.1-DOF5.10.0408994

Motif 2/13

Motif IDq-valPWM
MA1873.1-FoxQ0.0421883
MA1867.1-FoxI-b0.429214
MA0040.1-Foxq10.437413
MA1898.1-Meox0.437413
MA1882.1-Hnf10.437413

Motif 3/13

Motif IDq-valPWM
MA1862.1-FoxD-b0.0249801
MA1873.1-FoxQ0.0249801
MA1864.1-FoxF0.0249801
MA1870.1-FoxL20.0249801
MA1871.1-FoxM0.0249801

Motif 4/13

Motif IDq-valPWM
MA1344.1-BZIP280.236193
MA0495.3-MAFF0.236193
MA1143.1-FOSL1::JUND0.236193
MA1172.1-MYB3R50.236193
MA0492.1-JUND0.236193

Motif 5/13

Motif IDq-valPWM
MA1864.1-FoxF0.0585988
MA0386.1-SPT150.136018
MA1978.1-ZNF354A0.136018
MA0379.1-MOT20.17691
MA1861.1-FoxB0.17691

Motif 6/13

No TOMTOM matches passing threshold

Motif 7/13

Motif IDq-valPWM
MA0538.1-daf-120.000469024

Motif 8/13

Motif IDq-valPWM
MA1810.1-GLYMA-08G3576000.342494
MA0926.1-unc-860.394998

Motif 9/13

Motif IDq-valPWM
MA1884.1-Hox30.123548
MA0681.2-PHOX2B0.123548
MA1873.1-FoxQ0.123548
MA1900.1-Mnx0.128351
MA1924.1-Tlx0.147439

Motif 10/13

Motif IDq-valPWM
MA1864.1-FoxF0.161776

Motif 11/13

Motif IDq-valPWM
MA1871.1-FoxM8.85061e-08
MA1866.1-FoxI-a2.60964e-05
MA1281.1-DOF5.12.77899e-05
MA1863.1-FoxE9.05754e-05
MA1869.1-FoxK9.67521e-05

Motif 12/13

Motif IDq-valPWM
MA0926.1-unc-860.19531
MA0386.1-SPT150.326284

Motif 13/13

Motif IDq-valPWM
MA0249.2-twi0.301133