In [1]:
# Filepaths and Hard-coded Defaults

proj_root = "/users/kcochran/projects/procap_models/"
sequence_path = proj_root + "genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
chrom_sizes = proj_root + "genomes/hg38.chrom.sizes.withrRNA"

in_window = 2114
out_window = 1000
In [2]:
# stuff to get from config file

with open("2022-07-02_16-51-15_run1_modisco_config.txt") as config_f:
    config_dict = {line.split()[0] : line.strip().split()[1] for line in config_f}

modisco_out_path = config_dict["modisco_out_path"]
scoring_type = config_dict["scoring_type"]
score_center_size = int(config_dict["score_center_size"])
profile_display_center_size = int(config_dict["profile_display_center_size"])

# digest what's in config file

assay_type, model_type, cell, accession, modisco_dir_base = modisco_out_path.split("/")[-5:]
ts_part1, ts_part2, run_str, _ = modisco_dir_base.split("_")
timestamp = ts_part1 + "_" + ts_part2
run = int(run_str.replace("run", ""))
In [3]:
print(modisco_out_path)
print("cell_type:", cell, accession)
print("timestamp:", timestamp)
print("run:", run)
print("scoring_type:", scoring_type)
print("score_center_size:", score_center_size)
print("profile_display_center_size:", profile_display_center_size)
/users/kcochran//projects/procap_models/modisco_out/procap/bpnetlite_basic_v2/A673/ENCSR046BCI/2022-07-02_16-51-15_run1_modisco
cell_type: A673 ENCSR046BCI
timestamp: 2022-07-02_16-51-15
run: 1
scoring_type: counts
score_center_size: 1000
profile_display_center_size: 400
In [4]:
data_dir = proj_root + "/data/procap/processed/" + cell + "/" + accession + "/"
plus_bw_path = data_dir + "final.5prime.pos.bigWig"
minus_bw_path = data_dir + "final.5prime.neg.bigWig"
val_peak_path = data_dir + "peaks_uni_and_bi_train_and_val.bed.gz"

val_save_dir = proj_root + "model_out/" + assay_type + "/" + model_type + "/" + cell + "/" + accession + "/"
val_save_path = val_save_dir + timestamp + "_run" + str(run) + "_train_and_val"

attr_save_path = val_save_dir.replace("model_out", "deepshap_out") + timestamp + "_run" + str(run) + "_deepshap"

if not modisco_out_path.endswith("/"):
    modisco_out_path = modisco_out_path + "/"
In [5]:
# task-specific filepaths

import os

assert scoring_type in ["profile", "counts"], scoring_type

if scoring_type == "profile":
    scores_path = attr_save_path + "_prof.npy"
    onehot_scores_path = attr_save_path + "_prof_onehot.npy"
    modisco_obj_path = modisco_out_path + "results_allChroms_prof_slice500.hdf5"
    seqlet_path = modisco_out_path + "seqlets_prof.txt"
else:
    scores_path = attr_save_path + "_count.npy"
    onehot_scores_path = attr_save_path + "_count_onehot.npy"
    modisco_obj_path = modisco_out_path + "results_allChroms_count_slice500.hdf5"
    seqlet_path = modisco_out_path + "seqlets_count.txt"
    
assert(os.path.exists(scores_path)), scores_path
assert(os.path.exists(onehot_scores_path)), onehot_scores_path
In [6]:
# Imports, Plotting Defaults

import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager

plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

from IPython.display import display
import tqdm
tqdm.tqdm_notebook()

import numpy as np

from view_modisco_results_utils import *
/users/kcochran/miniconda3/envs/procap/lib/python3.7/site-packages/ipykernel_launcher.py:19: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
In [7]:
# Load in True Profiles and Sequences

import sys
sys.path.append('../1_train_models')

from data_loading import extract_peaks

one_hot_seqs, true_profs = extract_peaks(sequence_path, 
    plus_bw_path, minus_bw_path, val_peak_path, in_window, out_window,
    max_jitter=0, verbose=True)

one_hot_seqs = one_hot_seqs.swapaxes(1,2)
one_hot_seqs = one_hot_seqs[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
Reading FASTA: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:17<00:00,  1.39it/s]
Loading Peaks: 44667it [01:37, 458.93it/s]
In [8]:
# Load in Coordinates of Examples
    
coords = load_coords(val_peak_path, in_window)
In [9]:
# Import SHAP scores, predicted profiles

hyp_scores = np.load(scores_path).swapaxes(1,2)
hyp_scores = hyp_scores[:, (in_window // 2 - score_center_size // 2):(in_window // 2 + score_center_size // 2), :]
pred_profs = np.exp(np.load(val_save_path + ".profs.npy"))
In [10]:
# Load modisco results object
    
tfm_obj = import_tfmodisco_results(modisco_obj_path, hyp_scores, one_hot_seqs)
In [11]:
motif_pfms, motif_hcwms, motif_cwms, \
    motif_pfms_short, num_seqlets, \
    motif_seqlets, num_metaclusters = plot_all_metaclusters(tfm_obj, one_hot_seqs, hyp_scores,
                                                            true_profs, pred_profs, coords,
                                                            in_window, out_window,
                                                            score_center_size,
                                                            profile_display_center_size)

Metacluster 1/2

Pattern 1/20

11554 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/20

10125 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/20

7962 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/20

7566 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/20

4257 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/20

4145 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/20

4078 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/20

4051 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/20

2843 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/20

2209 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/20

2153 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/20

1938 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/20

1431 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/20

1346 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 15/20

1045 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 16/20

690 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 17/20

354 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 18/20

94 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 19/20

58 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 20/20

32 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/5

181 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/5

83 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/5

38 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/5

33 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/5

25 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)
In [12]:
run_and_plot_tomtom(modisco_out_path, motif_pfms, motif_hcwms, motif_pfms_short, num_metaclusters)

Metacluster 1/2

The output directory '/users/kcochran//projects/procap_models/modisco_out/procap/bpnetlite_basic_v2/A673/ENCSR046BCI/2022-07-02_16-51-15_run1_modisco/tomtom' already exists.
Its contents will be overwritten.
Processing query 1 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.984216
Processing query 2 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.972927
Processing query 3 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 4 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.954329
Processing query 5 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.990124
Processing query 6 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.996547
Processing query 7 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.970079
Processing query 8 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.998346
Processing query 9 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.9855
Processing query 10 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.9855
Processing query 11 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 12 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 13 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.990509
Processing query 14 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=1
Processing query 15 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.973178
Processing query 16 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.986785
Processing query 17 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.979456
Processing query 18 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.954591
Processing query 19 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.978637
Processing query 20 out of 20 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.994235
Can't locate HtmlMonolithWr.pm:   /root/lib/perl/HtmlMonolithWr.pm: Permission denied at /software/meme/4.11.2/bin/tomtom_xml_to_html line 48.
Warning: tomtom_xml_to_html exited abnormally and may have failed to create HTML output.

Motif 1/20

Motif IDq-valPWM
MA0916.1-Ets21C4.38346e-05
MA0076.2-ELK44.38346e-05
MA1484.1-ETS29.51595e-05
MA0764.3-ETV40.000298849
MA1940.1-ETV2::DRGX0.000298849

Motif 2/20

Motif IDq-valPWM
MA1513.1-KLF155.16776e-05
MA0516.3-SP20.00014776
MA0685.2-SP40.00014776
MA0742.2-KLF120.00014776
MA0079.5-SP10.000163251

Motif 3/20

Motif IDq-valPWM
MA0314.2-HAP32.10161e-05
MA1644.1-NFYC3.11483e-05
MA0060.3-NFYA3.11483e-05
MA0502.2-NFYB0.0162431
MA0316.1-HAP50.0168161

Motif 4/20

Motif IDq-valPWM
MA1475.1-CREB3L40.00638728
MA0286.1-CST60.00638728
MA1449.1-hlh-300.00638728
MA0605.2-ATF30.00638728
MA1336.1-TGA30.00638728

Motif 5/20

Motif IDq-valPWM
MA0474.3-Erg0.000772132
MA0598.3-EHF0.000772132
MA0062.3-GABPA0.000772132
MA0761.2-ETV10.000772132
MA1992.1-Ikzf30.000772132

Motif 6/20

Motif IDq-valPWM
MA1651.1-ZFP420.00286028
MA0748.2-YY20.00501397
MA0095.3-Yy10.0399096
MA1927.1-YY1-20.0399096
MA1460.1-pho0.0652459

Motif 7/20

Motif IDq-valPWM
MA0535.1-Mad0.0206967
MA1818.1-Zm00001d0522290.0206967
MA1821.1-Zm00001d0205950.0206967
MA1832.1-Zm00001d0023640.0206967
MA1819.1-Zm00001d0058920.0206967

Motif 8/20

Motif IDq-valPWM
MA0506.2-Nrf10.000292231
MA1412.1-TSAR20.387092

Motif 9/20

Motif IDq-valPWM
MA1988.1-Atf31.24713e-05
MA0478.1-FOSL27.43694e-05
MA1128.1-FOSL1::JUN0.00013221
MA0491.2-JUND0.00013221
MA0490.2-JUNB0.00013221

Motif 10/20

Motif IDq-valPWM
MA0833.2-ATF48.75194e-05
MA1636.1-CEBPG0.000602773
MA0488.1-JUN0.0121305
MA0025.2-NFIL30.0334403
MA1702.1-Pdp10.0379851

Motif 11/20

Motif IDq-valPWM
MA1573.2-Thap119.94946e-09
MA0088.2-ZNF1430.0146411
MA1716.1-ZNF760.026512
MA1625.1-Stat5b0.396635
MA0525.2-TP630.453823

Motif 12/20

Motif IDq-valPWM
MA0527.1-ZBTB334.19642e-05

Motif 13/20

Motif IDq-valPWM
MA1929.1-CTCF2.79368e-11
MA0139.1-CTCF3.56202e-11
MA1930.1-CTCF4.30118e-10
MA1102.2-CTCFL6.62009e-06
MA0531.1-CTCF1.94341e-05

Motif 14/20

Motif IDq-valPWM
MA0149.1-EWSR1-FLI16.79976e-12

Motif 15/20

Motif IDq-valPWM
MA0108.2-TBP7.23214e-06
MA1961.1-PATZ10.000140317
MA1820.1-Zm00001d0243240.000140317
MA1819.1-Zm00001d0058920.000280103
MA1817.1-Zm00001d0202670.000313799

Motif 16/20

Motif IDq-valPWM
MA1933.1-ELK1::SREBF20.000430502
MA0076.2-ELK40.00262661
MA1939.1-ERF::SREBF20.00262661
MA0475.2-FLI10.0041689
MA0156.3-FEV0.0041689

Motif 17/20

Motif IDq-valPWM
MA0108.2-TBP0.021844
MA1819.1-Zm00001d0058920.021844
MA1832.1-Zm00001d0023640.0548352
MA1821.1-Zm00001d0205950.0553189
MA1961.1-PATZ10.0637054

Motif 18/20

Motif IDq-valPWM
MA1833.1-Zm00001d0493643.24576e-06
MA2022.1-LOB3.55384e-05
MA1817.1-Zm00001d0202673.80009e-05
MA1819.1-Zm00001d0058926.2985e-05
MA1820.1-Zm00001d0243246.84977e-05

Motif 19/20

Motif IDq-valPWM
MA1513.1-KLF150.00820478
MA0516.3-SP20.00820478
MA0742.2-KLF120.00820478
MA1511.2-KLF100.00820478
MA0747.1-SP80.00820478

Motif 20/20

Motif IDq-valPWM
MA0591.1-Bach1::Mafk0.0557221
MA1448.1-fos-10.0557221
MA0150.2-Nfe2l20.0684371
MA0478.1-FOSL20.0684371
MA1461.1-sv0.0950865

Metacluster 2/2

The output directory '/users/kcochran//projects/procap_models/modisco_out/procap/bpnetlite_basic_v2/A673/ENCSR046BCI/2022-07-02_16-51-15_run1_modisco/tomtom' already exists.
Its contents will be overwritten.
Processing query 1 out of 5 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.971101
Processing query 2 out of 5 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.929491
Processing query 3 out of 5 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.936018
Processing query 4 out of 5 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.955741
Processing query 5 out of 5 
Estimating pi_0 from all 3912 observed p-values.
Estimating pi_0.
Estimated pi_0=0.94647
Can't locate HtmlMonolithWr.pm:   /root/lib/perl/HtmlMonolithWr.pm: Permission denied at /software/meme/4.11.2/bin/tomtom_xml_to_html line 48.
Warning: tomtom_xml_to_html exited abnormally and may have failed to create HTML output.

Motif 1/5

Motif IDq-valPWM
MA0829.2-SREBF10.305851
MA0830.2-TCF40.305851
MA0828.2-SREBF20.305851
MA0004.1-Arnt0.305851
MA0499.2-MYOD10.305851

Motif 2/5

Motif IDq-valPWM
MA1053.1-ERF1090.0174079
MA0997.1-ERF0690.0174079
MA2022.1-LOB0.0174079
MA1102.2-CTCFL0.0174079
MA0975.1-CRF20.0174079

Motif 3/5

Motif IDq-valPWM
MA1880.1-Hey3.524e-05
MA1819.1-Zm00001d0058920.0033764
MA1820.1-Zm00001d0243240.00864863
MA1877.1-Hes-b0.00864863
MA1818.1-Zm00001d0522290.0185633

Motif 4/5

Motif IDq-valPWM
MA1819.1-Zm00001d0058920.00140206
MA1832.1-Zm00001d0023640.00242122
MA1821.1-Zm00001d0205950.00261271
MA1820.1-Zm00001d0243240.00541994
MA2022.1-LOB0.00627875

Motif 5/5

Motif IDq-valPWM
MA0375.1-RSC300.0356011
MA0374.1-RSC30.0356011
MA1930.1-CTCF0.0356011
MA0449.1-h0.0647462
MA1650.1-ZBTB140.0763549