In [1]:
# Filepaths and Hard-coded Defaults

proj_root = "/users/kcochran/projects/procap_models/"
sequence_path = proj_root + "genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
chrom_sizes = proj_root + "genomes/hg38.chrom.sizes.withrRNA"

in_window = 2114
out_window = 1000
In [2]:
# stuff to get from config file

with open("2022-07-03_11-35-25_run1_modisco_config.txt") as config_f:
    config_dict = {line.split()[0] : line.strip().split()[1] for line in config_f}

modisco_out_path = config_dict["modisco_out_path"]
scoring_type = config_dict["scoring_type"]
score_center_size = int(config_dict["score_center_size"])
profile_display_center_size = int(config_dict["profile_display_center_size"])

# digest what's in config file

assay_type, model_type, cell, accession, modisco_dir_base = modisco_out_path.split("/")[-5:]
ts_part1, ts_part2, run_str, _ = modisco_dir_base.split("_")
timestamp = ts_part1 + "_" + ts_part2
run = int(run_str.replace("run", ""))
In [3]:
print(modisco_out_path)
print("cell_type:", cell, accession)
print("timestamp:", timestamp)
print("run:", run)
print("scoring_type:", scoring_type)
print("score_center_size:", score_center_size)
print("profile_display_center_size:", profile_display_center_size)
/users/kcochran//projects/procap_models/modisco_out/procap/bpnetlite_basic_v2/CACO2/ENCSR100LIJ/2022-07-03_11-35-25_run1_modisco
cell_type: CACO2 ENCSR100LIJ
timestamp: 2022-07-03_11-35-25
run: 1
scoring_type: counts
score_center_size: 1000
profile_display_center_size: 400
In [4]:
data_dir = proj_root + "/data/procap/processed/" + cell + "/" + accession + "/"
plus_bw_path = data_dir + "final.5prime.pos.bigWig"
minus_bw_path = data_dir + "final.5prime.neg.bigWig"
val_peak_path = data_dir + "peaks_uni_and_bi_train_and_val.bed.gz"

val_save_dir = proj_root + "model_out/" + assay_type + "/" + model_type + "/" + cell + "/" + accession + "/"
val_save_path = val_save_dir + timestamp + "_run" + str(run) + "_train_and_val"

attr_save_path = val_save_dir.replace("model_out", "deepshap_out") + timestamp + "_run" + str(run) + "_deepshap"

if not modisco_out_path.endswith("/"):
    modisco_out_path = modisco_out_path + "/"
In [5]:
# task-specific filepaths

import os

assert scoring_type in ["profile", "counts"], scoring_type

if scoring_type == "profile":
    scores_path = attr_save_path + "_prof.npy"
    onehot_scores_path = attr_save_path + "_prof_onehot.npy"
    modisco_obj_path = modisco_out_path + "results_allChroms_prof_slice500.hdf5"
    seqlet_path = modisco_out_path + "seqlets_prof.txt"
else:
    scores_path = attr_save_path + "_count.npy"
    onehot_scores_path = attr_save_path + "_count_onehot.npy"
    modisco_obj_path = modisco_out_path + "results_allChroms_count_slice500.hdf5"
    seqlet_path = modisco_out_path + "seqlets_count.txt"
    
assert(os.path.exists(scores_path)), scores_path
assert(os.path.exists(onehot_scores_path)), onehot_scores_path
In [6]:
# Imports, Plotting Defaults

import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager

plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

from IPython.display import display
import tqdm
tqdm.tqdm_notebook()

import numpy as np

from view_modisco_results_utils import *
/users/kcochran/miniconda3/envs/procap/lib/python3.7/site-packages/ipykernel_launcher.py:19: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
In [7]:
# Load in True Profiles and Sequences

import sys
sys.path.append('../1_train_models')

from data_loading import extract_peaks

one_hot_seqs, true_profs = extract_peaks(sequence_path, 
    plus_bw_path, minus_bw_path, val_peak_path, in_window, out_window,
    max_jitter=0, verbose=True)

one_hot_seqs = one_hot_seqs.swapaxes(1,2)
one_hot_seqs = one_hot_seqs[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
Reading FASTA: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:16<00:00,  1.47it/s]
Loading Peaks: 42149it [01:04, 654.64it/s]
In [8]:
# Load in Coordinates of Examples
    
coords = load_coords(val_peak_path, in_window)
In [9]:
# Import SHAP scores, predicted profiles

hyp_scores = np.load(scores_path).swapaxes(1,2)
hyp_scores = hyp_scores[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
pred_profs = np.exp(np.load(val_save_path + ".profs.npy"))
In [10]:
# Load modisco results object
    
tfm_obj = import_tfmodisco_results(modisco_obj_path, hyp_scores, one_hot_seqs)
In [11]:
motif_pfms, motif_hcwms, motif_cwms, \
    motif_pfms_short, num_seqlets, \
    motif_seqlets, num_metaclusters = plot_all_metaclusters(tfm_obj, one_hot_seqs, hyp_scores,
                                                            true_profs, pred_profs, coords,
                                                            in_window, out_window,
                                                            score_center_size,
                                                            profile_display_center_size)

Metacluster 1/1

Pattern 1/30

12005 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/30

8340 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/30

7834 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/30

7565 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/30

5575 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/30

3564 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/30

2865 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/30

1971 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/30

1931 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/30

1630 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/30

1355 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/30

1332 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/30

1190 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/30

1171 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 15/30

1113 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 16/30

615 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 17/30

602 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 18/30

584 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 19/30

473 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 20/30

349 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 21/30

344 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 22/30

270 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 23/30

138 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 24/30

87 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 25/30

45 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 26/30

32 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 27/30

30 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 28/30

23 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 29/30

22 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 30/30

20 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)
In [12]:
run_and_plot_tomtom(modisco_out_path, motif_pfms, motif_hcwms, motif_pfms_short, num_metaclusters)

Metacluster 1/1

The output directory '/users/kcochran//projects/procap_models/modisco_out/procap/bpnetlite_basic_v2/CACO2/ENCSR100LIJ/2022-07-03_11-35-25_run1_modisco/tomtom' already exists.
Its contents will be overwritten.
Processing query 1 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.972408
Processing query 2 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 3 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.945544
Processing query 4 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.985757
Processing query 5 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.984217
Processing query 6 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 7 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.985757
Processing query 8 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.995263
Processing query 9 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.984986
Processing query 10 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.970575
Processing query 11 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.979582
Processing query 12 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.961768
Processing query 13 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 14 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 15 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.973718
Processing query 16 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.987555
Processing query 17 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.980105
Processing query 18 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.990895
Processing query 19 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.99265
Processing query 20 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 21 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 22 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.983959
Processing query 23 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.997575
Processing query 24 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.966439
Processing query 25 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 26 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 27 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.980149
Processing query 28 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.971892
Processing query 29 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.990315
Processing query 30 out of 30 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.99629
Can't locate HtmlMonolithWr.pm:   /root/lib/perl/HtmlMonolithWr.pm: Permission denied at /software/meme/4.11.2/bin/tomtom_xml_to_html line 48.
Warning: tomtom_xml_to_html exited abnormally and may have failed to create HTML output.

Motif 1/30

Motif IDq-valPWM
MA0516.3-SP29.93986e-05
MA0742.2-KLF129.93986e-05
MA0685.2-SP40.000121574
MA1511.2-KLF100.000151052
MA0079.5-SP10.00016145

Motif 2/30

Motif IDq-valPWM
MA0060.3-NFYA1.35364e-05
MA1644.1-NFYC1.35364e-05
MA0314.2-HAP36.65651e-05
MA0316.1-HAP50.00362119
MA0502.2-NFYB0.00981159

Motif 3/30

Motif IDq-valPWM
MA1474.1-CREB3L40.00368771
MA1672.1-GBF20.0138326
MA0692.1-TFEB0.0138326
MA0871.2-TFEC0.0138326
MA1466.1-ATF60.0138326

Motif 4/30

Motif IDq-valPWM
MA0076.2-ELK46.6659e-06
MA0764.3-ETV43.5158e-05
MA0759.2-ELK33.5158e-05
MA0916.1-Ets21C3.5158e-05
MA0750.2-ZBTB7A3.5158e-05

Motif 5/30

Motif IDq-valPWM
MA0506.2-Nrf15.32753e-06
MA1826.1-bHLH1450.263084
MA0103.3-ZEB10.341382
MA1106.1-HIF1A0.386558
MA1061.1-SPT0.386558

Motif 6/30

Motif IDq-valPWM
MA0808.1-TEAD30.00439319
MA1121.1-TEAD20.0131795
MA0809.2-TEAD40.0579623
MA0090.3-TEAD10.0579623
MA0243.1-sd0.0663711

Motif 7/30

Motif IDq-valPWM
MA0748.2-YY27.28328e-05
MA0998.1-ERF0960.000822085
MA1004.1-ERF130.00248407
MA0975.1-CRF20.00248407
MA1651.1-ZFP420.00248407

Motif 8/30

Motif IDq-valPWM
MA0677.1-Nr2f60.000308755
MA0856.1-RXRG0.000308755
MA0512.2-Rxra0.000308755
MA1574.1-THRB0.000308755
MA1550.1-PPARD0.000308755

Motif 9/30

Motif IDq-valPWM
MA1448.1-fos-10.00256055
MA0489.2-Jun0.00256055
MA1988.1-Atf30.00256055
MA1928.1-BNC20.00256055
MA0099.3-FOS::JUN0.00256055

Motif 10/30

Motif IDq-valPWM
MA0139.1-CTCF2.41762e-09
MA1929.1-CTCF8.43097e-07
MA1930.1-CTCF3.84831e-06
MA1102.2-CTCFL4.04804e-06
MA0531.1-CTCF2.40312e-05

Motif 11/30

Motif IDq-valPWM
MA0153.2-HNF1B9.21234e-08
MA0046.2-HNF1A8.02455e-07
MA1212.1-ATHB-130.169343
MA1198.1-HAT20.169343
MA0951.1-ATHB-160.191625

Motif 12/30

Motif IDq-valPWM
MA0535.1-Mad0.00298608
MA1818.1-Zm00001d0522290.00298608
MA1832.1-Zm00001d0023640.00298608
MA1821.1-Zm00001d0205950.00298608
MA1819.1-Zm00001d0058920.00298608

Motif 13/30

Motif IDq-valPWM
MA0527.1-ZBTB332.08256e-12

Motif 14/30

Motif IDq-valPWM
MA1573.2-Thap111.08358e-08
MA0088.2-ZNF1430.036864
MA1716.1-ZNF760.0487746
MA0525.2-TP630.410317
MA1625.1-Stat5b0.410317

Motif 15/30

Motif IDq-valPWM
MA0374.1-RSC30.155357
MA0375.1-RSC300.155357
MA1099.2-HES10.155357
MA1513.1-KLF150.163495
MA1650.1-ZBTB140.163495

Motif 16/30

Motif IDq-valPWM
MA0833.2-ATF40.00027217
MA1636.1-CEBPG0.000870914
MA0488.1-JUN0.0263704
MA0025.2-NFIL30.0263704
MA1702.1-Pdp10.0311271

Motif 17/30

Motif IDq-valPWM
MA0018.4-CREB10.00083782
MA1475.1-CREB3L40.00439627
MA0492.1-JUND0.00489131
MA1131.1-FOSL2::JUN0.00489131
MA0605.2-ATF30.00489131

Motif 18/30

Motif IDq-valPWM
MA0446.1-fkh0.000795262
MA0546.1-pha-40.000795262
MA0846.1-FOXC20.000795262
MA1683.1-FOXA30.000795262
MA0032.2-FOXC10.00143797

Motif 19/30

Motif IDq-valPWM
MA1861.1-FoxB1.95854e-05
MA1860.1-FoxA-b2.01108e-05
MA0845.1-FOXB10.0375173
MA0032.2-FOXC10.0416862
MA0148.4-FOXA10.116902

Motif 20/30

Motif IDq-valPWM
MA1573.2-Thap112.28677e-06
MA0088.2-ZNF1430.00421798
MA1716.1-ZNF760.00455823
MA1625.1-Stat5b0.0714219
MA0519.1-Stat5a::Stat5b0.0799368

Motif 21/30

No TOMTOM matches passing threshold

Motif 22/30

Motif IDq-valPWM
MA0108.2-TBP0.000320225
MA1818.1-Zm00001d0522290.00116415
MA0375.1-RSC300.00685389
MA0123.1-abi40.0147538
MA1832.1-Zm00001d0023640.0274804

Motif 23/30

Motif IDq-valPWM
MA0647.1-GRHL10.00836536
MA1968.1-TFCP20.0848132
MA0765.3-ETV50.106193
MA0760.1-ERF0.106193
MA0028.2-ELK10.106193

Motif 24/30

Motif IDq-valPWM
MA0108.2-TBP0.00330394
MA1817.1-Zm00001d0202670.00330394
MA1819.1-Zm00001d0058920.00330394
MA1961.1-PATZ10.00330394
MA1513.1-KLF150.00424555

Motif 25/30

Motif IDq-valPWM
MA0326.1-MAC10.17111

Motif 26/30

Motif IDq-valPWM
MA1716.1-ZNF760.000297526
MA0088.2-ZNF1430.00147016
MA0731.1-BCL6B0.431764
MA0463.2-BCL60.468369

Motif 27/30

Motif IDq-valPWM
MA1899.1-Mitf0.00457826
MA0829.2-SREBF10.00457826
MA0664.1-MLXIPL0.00665676
MA0663.1-MLX0.00665676
MA0409.1-TYE70.00729904

Motif 28/30

Motif IDq-valPWM
MA0840.1-Creb50.0417631
MA1695.1-ARF360.0417631
MA1348.1-TGA90.0417631
MA1145.1-FOSL2::JUND0.0417631
MA0605.2-ATF30.0417631

Motif 29/30

Motif IDq-valPWM
MA1880.1-Hey0.201306
MA0399.1-SUT10.269686
MA1695.1-ARF360.383016
MA1686.1-ARF130.383016
MA0538.1-daf-120.46114

Motif 30/30

Motif IDq-valPWM
MA0155.1-INSM10.00435869
MA1102.2-CTCFL0.0876067
MA1548.1-PLAGL20.117608
MA1433.1-msn-10.127532
MA0342.1-MSN40.191525