# Path to the JSON config describing the modisco run this report is built from.
config_file = "/users/kcochran/projects/new_procap_models/modisco_out/procap/K562/strand_merged_umap/2022-10-05_03-39-32_profile_in/config.json"
# Parameters
# NOTE(review): this repeats the assignment above with the same value; it looks
# like a parameter-injection cell (e.g. papermill) overriding the default — confirm.
config_file = "/users/kcochran/projects/new_procap_models/modisco_out/procap/K562/strand_merged_umap/2022-10-05_03-39-32_profile_in/config.json"
import os
import numpy as np
import sys
sys.path.append("../2_train_models")
from utils import load_json
from report_utils import load_coords, load_modisco_results, report_motifs, plot_all_metaclusters
# Read the run configuration and unpack the settings the rest of the notebook uses.
config = load_json(config_file)

# Project location and the identifiers of the run.
proj_dir, cell_type, model_type, timestamp, data_type = (
    config[key]
    for key in ("proj_dir", "cell_type", "model_type", "timestamp", "data_type")
)

# Genome inputs (later handed to load_sequences).
genome_path, chrom_sizes = (config[key] for key in ("genome_path", "chrom_sizes"))

# Window settings; the config stores the slice length under the key "slice".
in_window, out_window, slice_len = (
    config[key] for key in ("in_window", "out_window", "slice")
)

# File paths: peaks, scores, and the saved modisco results.
peak_path, scores_path, modisco_results_path = (
    config[key]
    for key in ("train_val_peak_path", "scores_path", "results_save_path")
)
from modiscolite_utils import load_sequences, load_scores, load_observed_profiles
# Load the peak coordinates first, then the one-hot sequences and the scores.
coords = load_coords(peak_path, in_window=in_window)

# The sequence and score loaders receive the same slice/window keyword settings.
window_kwargs = {"slice_len": slice_len, "in_window": in_window}
onehot_seqs = load_sequences(genome_path, chrom_sizes, peak_path, **window_kwargs)
scores = load_scores(scores_path, **window_kwargs)
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead. /users/kcochran/miniconda3/envs/procap_A100/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
Loading genome sequence from /mnt/lab_data2/kcochran/new_procap_models/genomes/hg38.withrDNA.fasta == In Extract Sequences == Peak filepath: /mnt/lab_data2/kcochran/new_procap_models/deepshap_out/procap/K562/strand_merged_umap/2022-10-05_03-39-32_in/peaks_uni_and_bi_train_and_val.bed.gz Sequence length: 2114 Num. Examples: 27000
# Load the saved modisco results from modisco_results_path; the object is
# reused below for the motif report and the per-pattern plots.
modisco_results = load_modisco_results(modisco_results_path)
from file_configs import ValFilesConfig, TrainFilesConfig
# TODO: assert that we use the same peak file across all of these
# Both file-config objects are built from the same run identifiers.
run_identifiers = (cell_type, model_type, timestamp, data_type)
val_config = ValFilesConfig(*run_identifiers)
train_config = TrainFilesConfig(*run_identifiers)

# Observed profiles come from the plus/minus bigwig paths on the train config.
true_profs = load_observed_profiles(
    train_config.plus_bw_path,
    train_config.minus_bw_path,
    peak_path,
    slice_len=slice_len,
    out_window=out_window,
)

# The saved predictions are exponentiated on load
# (presumably they were stored in log space — confirm against the writer).
pred_profs = np.exp(np.load(val_config.pred_profiles_train_val_path))
Timestamp: 2022-10-05_03-39-32 Timestamp: 2022-10-05_03-39-32 == In Extract Profiles == Peak filepath: /mnt/lab_data2/kcochran/new_procap_models/deepshap_out/procap/K562/strand_merged_umap/2022-10-05_03-39-32_in/peaks_uni_and_bi_train_and_val.bed.gz Profile length: 1000 Num. Examples: 27000
# Sanity check: display the number of coords next to the shape of every loaded
# array so a mismatch in example count or length is visible at a glance.
len(coords), onehot_seqs.shape, scores.shape, true_profs.shape, pred_profs.shape
(27000, (27000, 1000, 4), (27000, 1000, 4), (27000, 2, 1000), (27000, 2, 1000))
from IPython.display import HTML
# Build the motif report, passing the directory that contains the modisco
# results file, then show the returned HTML inline as the cell's result.
modisco_out_dir = os.path.dirname(modisco_results_path)
report_html = report_motifs(modisco_results, proj_dir, modisco_out_dir)
HTML(report_html)
findfont: Font family ['Arial Rounded'] not found. Falling back to DejaVu Sans.
pattern | num_seqlets | modisco_cwm_fwd | modisco_cwm_rev | match0 | qval0 | match0_logo | match1 | qval1 | match1_logo | match2 | qval2 | match2_logo |
---|---|---|---|---|---|---|---|---|---|---|---|---|
pos_patterns.pattern_0 | 8650 | KLF12_HUMAN.H11MO.0.C | 1.044900e-04 | SP1_HUMAN.H11MO.0.A | 2.438110e-04 | SP3_HUMAN.H11MO.0.B | 5.190530e-04 | |||||
pos_patterns.pattern_1 | 8063 | SIX2_MA1119.1 | 1.000000e+00 | ZNF85_HUMAN.H11MO.0.C | 1.000000e+00 | NaN | NaN | |||||
pos_patterns.pattern_2 | 5862 | ELK4_MA0076.2 | 7.799890e-07 | ETV1_HUMAN.H11MO.0.A | 7.799890e-07 | ZBTB7A_MA0750.2 | 7.799890e-07 | |||||
pos_patterns.pattern_3 | 4465 | NFYA_MA0060.3 | 4.920390e-01 | FOXI1_HUMAN.H11MO.0.B | 4.920390e-01 | FOXI1_MOUSE.H11MO.0.B | 4.920390e-01 | |||||
pos_patterns.pattern_4 | 3488 | NRF1_MA0506.1 | 3.133850e-07 | NRF1_MOUSE.H11MO.0.A | 8.887360e-04 | NRF1_NRF_1 | 8.887360e-04 | |||||
pos_patterns.pattern_5 | 2249 | ATF3_HUMAN.H11MO.0.A | 2.742350e-03 | ATF1_HUMAN.H11MO.0.B | 2.742350e-03 | ATF1_MOUSE.H11MO.0.B | 2.742350e-03 | |||||
pos_patterns.pattern_6 | 1560 | SP2_HUMAN.H11MO.0.A | 7.015650e-05 | SP2_MOUSE.H11MO.0.B | 7.015650e-05 | ZFX_MOUSE.H11MO.0.B | 9.922790e-04 | |||||
pos_patterns.pattern_7 | 1050 | THAP1_HUMAN.H11MO.0.C | 1.638730e-08 | TYY1_HUMAN.H11MO.0.A | 2.263770e-06 | TYY1_MOUSE.H11MO.0.A | 7.323700e-06 | |||||
pos_patterns.pattern_8 | 1011 | NaN | NaN | NaN | NaN | NaN | NaN | |||||
pos_patterns.pattern_9 | 879 | TBP_HUMAN.H11MO.0.A | 3.506050e-03 | TBP_MA0108.2 | 2.308300e-01 | TBP_MOUSE.H11MO.0.A | 5.221300e-01 | |||||
pos_patterns.pattern_10 | 838 | ZNF76_HUMAN.H11MO.0.C | 5.612400e-20 | ZN143_HUMAN.H11MO.0.A | 6.739790e-10 | ZN143_MOUSE.H11MO.0.A | 6.739790e-10 | |||||
pos_patterns.pattern_11 | 833 | SP2_HUMAN.H11MO.0.A | 9.367220e-04 | SP2_MOUSE.H11MO.0.B | 9.367220e-04 | SP1_MOUSE.H11MO.0.A | 1.123880e-03 | |||||
pos_patterns.pattern_12 | 760 | THAP1_HUMAN.H11MO.0.C | 1.575010e-02 | SP2_HUMAN.H11MO.0.A | 1.575010e-02 | SP2_MOUSE.H11MO.0.B | 1.575010e-02 | |||||
pos_patterns.pattern_13 | 747 | ATF3_MOUSE.H11MO.0.A | 5.598190e-03 | JUNB_HUMAN.H11MO.0.A | 5.598190e-03 | JUND_HUMAN.H11MO.0.A | 5.598190e-03 | |||||
pos_patterns.pattern_14 | 535 | CTCF_MOUSE.H11MO.0.A | 1.074500e-11 | CTCF_HUMAN.H11MO.0.A | 1.214560e-09 | CTCF_MA0139.1 | 5.822330e-08 | |||||
pos_patterns.pattern_15 | 410 | ZBTB33_MA0527.1 | 1.269240e-04 | KAISO_HUMAN.H11MO.0.A | 1.269240e-04 | KAISO_MOUSE.H11MO.0.B | 1.269240e-04 | |||||
pos_patterns.pattern_16 | 401 | NRF1_MOUSE.H11MO.0.A | 1.628130e-08 | NRF1_HUMAN.H11MO.0.A | 1.275810e-07 | NRF1_MA0506.1 | 5.178260e-03 | |||||
pos_patterns.pattern_17 | 219 | MYBL1_MYB_1 | 1.000000e+00 | Arid3a_MA0151.1 | 1.000000e+00 | NR4A2_nuclearreceptor_2 | 1.000000e+00 | |||||
pos_patterns.pattern_18 | 198 | CPEB1_RRM_1 | 8.892290e-02 | HOXC12_homeodomain_1 | 8.892290e-02 | HOXD12_homeodomain_1 | 8.892290e-02 | |||||
pos_patterns.pattern_19 | 190 | CTCFL_HUMAN.H11MO.0.A | 1.061910e-01 | ZFX_MOUSE.H11MO.0.B | 1.061910e-01 | CTCFL_MOUSE.H11MO.0.A | 1.061910e-01 | |||||
pos_patterns.pattern_20 | 163 | ZN770_HUMAN.H11MO.0.C | 4.923090e-07 | MAF_MOUSE.H11MO.0.A | 1.532720e-01 | ZBT17_HUMAN.H11MO.0.A | 1.532720e-01 | |||||
pos_patterns.pattern_21 | 153 | ELF2_MOUSE.H11MO.0.C | 1.848560e-01 | KLF4_MA0039.3 | 1.848560e-01 | ETV5_HUMAN.H11MO.0.C | 2.691610e-01 | |||||
pos_patterns.pattern_22 | 148 | ZFX_MOUSE.H11MO.0.B | 1.536910e-02 | MBD2_HUMAN.H11MO.0.B | 7.136900e-02 | MBD2_MOUSE.H11MO.0.B | 7.136900e-02 | |||||
pos_patterns.pattern_23 | 135 | ZN816_HUMAN.H11MO.0.C | 4.183810e-01 | SOX4_HUMAN.H11MO.0.B | 4.183810e-01 | SOX4_MOUSE.H11MO.0.A | 4.183810e-01 | |||||
pos_patterns.pattern_24 | 130 | GATA5_GATA_1 | 1.000000e+00 | GATA5_MA0766.1 | 1.000000e+00 | GATA4_GATA_1 | 1.000000e+00 | |||||
pos_patterns.pattern_25 | 113 | ZBTB33_MA0527.1 | 2.018620e-03 | KAISO_HUMAN.H11MO.0.A | 2.018620e-03 | KAISO_MOUSE.H11MO.0.B | 2.018620e-03 | |||||
pos_patterns.pattern_26 | 107 | ZNF76_HUMAN.H11MO.0.C | 5.675680e-06 | THA11_HUMAN.H11MO.0.B | 4.829700e-04 | THA11_MOUSE.H11MO.0.B | 4.829700e-04 | |||||
pos_patterns.pattern_27 | 105 | SP1_MOUSE.H11MO.0.A | 5.859470e-02 | SP2_HUMAN.H11MO.0.A | 5.859470e-02 | SP2_MOUSE.H11MO.0.B | 5.859470e-02 | |||||
pos_patterns.pattern_28 | 105 | ZN770_HUMAN.H11MO.0.C | 1.847810e-06 | ZSC22_HUMAN.H11MO.0.C | 1.259890e-01 | MAF_MOUSE.H11MO.0.A | 1.278720e-01 | |||||
pos_patterns.pattern_29 | 82 | THAP1_HUMAN.H11MO.0.C | 5.641130e-04 | TYY1_HUMAN.H11MO.0.A | 3.824210e-02 | TYY1_MOUSE.H11MO.0.A | 3.824210e-02 | |||||
pos_patterns.pattern_30 | 67 | SREBF1_MA0595.1 | 4.472250e-01 | ATF3_HUMAN.H11MO.0.A | 4.472250e-01 | SREBF2_MA0596.1 | 4.472250e-01 | |||||
pos_patterns.pattern_31 | 63 | RFX3_MOUSE.H11MO.0.C | 4.912410e-02 | RFX2_HUMAN.H11MO.0.A | 4.912410e-02 | Rfx1_MA0509.1 | 4.912410e-02 | |||||
pos_patterns.pattern_32 | 54 | THAP1_HUMAN.H11MO.0.C | 1.725180e-03 | YY1_MA0095.2 | 9.069740e-03 | TYY1_HUMAN.H11MO.0.A | 9.069740e-03 | |||||
pos_patterns.pattern_33 | 49 | RUNX2_RUNX_1 | 1.000000e+00 | RUNX3_RUNX_1 | 1.000000e+00 | RUNX2_MA0511.2 | 1.000000e+00 | |||||
pos_patterns.pattern_34 | 41 | TEAD1_HUMAN.H11MO.0.A | 1.000000e+00 | TEAD2_MA1121.1 | 1.000000e+00 | CEBPE_HUMAN.H11MO.0.A | 1.000000e+00 | |||||
pos_patterns.pattern_35 | 40 | PRDM1_MA0508.2 | 9.103330e-01 | MLXIPL_MA0664.1 | 9.103330e-01 | MLXIPL_bHLH_1 | 9.103330e-01 | |||||
pos_patterns.pattern_36 | 39 | NaN | NaN | NaN | NaN | NaN | NaN | |||||
pos_patterns.pattern_37 | 34 | ATF1_HUMAN.H11MO.0.B | 3.300850e-01 | CREB1_MA0018.3 | 3.300850e-01 | CREM_HUMAN.H11MO.0.C | 3.300850e-01 | |||||
pos_patterns.pattern_38 | 24 | NaN | NaN | NaN | NaN | NaN | NaN | |||||
neg_patterns.pattern_0 | 87 | KLF12_HUMAN.H11MO.0.C | 1.138970e-04 | SP1_MOUSE.H11MO.0.A | 1.489410e-04 | SP1_MA0079.3 | 1.489410e-04 | |||||
neg_patterns.pattern_1 | 61 | Gabpa_MA0062.2 | 3.137950e-07 | ELF1_MOUSE.H11MO.0.A | 3.137950e-07 | ETV1_HUMAN.H11MO.0.A | 1.958080e-06 | |||||
neg_patterns.pattern_2 | 42 | JUND_MA0491.1 | 1.275760e-01 | NFE2_MA0841.1 | 1.275760e-01 | NFE2_bZIP_1 | 1.275760e-01 | |||||
neg_patterns.pattern_3 | 25 | NRF1_MOUSE.H11MO.0.A | 5.703030e-06 | NRF1_HUMAN.H11MO.0.A | 3.134280e-05 | NRF1_MA0506.1 | 1.283760e-04 |
%matplotlib inline
# Render the per-pattern plots from the modisco results, using the sequences,
# scores, observed/predicted profiles and coordinates loaded above.
# NOTE(review): the meaning of the trailing positional 400 is not visible in
# this file — confirm against plot_all_metaclusters in report_utils.
plot_all_metaclusters(modisco_results, onehot_seqs, scores, true_profs, pred_profs, coords,
in_window, out_window, slice_len, 400)
8650 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
8063 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
5862 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
4465 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
3488 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
2249 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
1560 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
1050 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
1011 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
879 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
838 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
833 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
760 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
747 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
535 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
410 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
401 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
219 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
198 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
190 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
163 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
153 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
148 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
135 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
130 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
113 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
107 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
105 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
105 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
82 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
67 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
63 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
54 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
49 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
41 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
40 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
39 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
34 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
24 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
87 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
61 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
42 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |
25 seqlets
Sequence (PFM) | |
Hypothetical contributions (hCWM) | |
Actual contributions (CWM) |