In [1]:
# Path to the JSON config that drives every later cell (it is read with load_json below).
# NOTE(review): this is a hard-coded absolute path, and the next cell assigns
# config_file again with the same value.
config_file = "/users/kcochran/projects/new_procap_models/modisco_out/procap/K562/strand_merged_umap/2022-10-05_03-39-32_profile_in/config.json"
In [2]:
# Parameters
# NOTE(review): this repeats the previous cell's assignment verbatim; it looks like a
# parameters cell injected by a notebook-parameterization tool (e.g. papermill) — confirm.
config_file = "/users/kcochran/projects/new_procap_models/modisco_out/procap/K562/strand_merged_umap/2022-10-05_03-39-32_profile_in/config.json"
In [3]:
import os
import numpy as np
import sys
sys.path.append("../2_train_models")
from utils import load_json
from report_utils import load_coords, load_modisco_results, report_motifs, plot_all_metaclusters

# Load the run configuration from the JSON file named by `config_file`;
# the values unpacked below are what the remaining cells read.
config = load_json(config_file)

# Project directory (later handed to report_motifs).
proj_dir = config["proj_dir"]

# Run identifiers (later handed to the ValFilesConfig / TrainFilesConfig constructors).
cell_type, model_type, timestamp, data_type = (
    config[key] for key in ("cell_type", "model_type", "timestamp", "data_type")
)

# Genome inputs (later handed to load_sequences).
genome_path, chrom_sizes = (config[key] for key in ("genome_path", "chrom_sizes"))

# Window settings; note that `slice_len` is stored under the config key "slice".
in_window, out_window, slice_len = (
    config[key] for key in ("in_window", "out_window", "slice")
)

# File locations: peak file, contribution scores, and modisco results.
peak_path = config["train_val_peak_path"]
scores_path = config["scores_path"]
modisco_results_path = config["results_save_path"]
In [4]:
from modiscolite_utils import load_sequences, load_scores, load_observed_profiles

# Peak coordinates, read from the peak file using the input window from the config.
coords = load_coords(peak_path, in_window=in_window)

# load_sequences and load_scores take the same two window keywords, so build them once.
window_kwargs = {"slice_len": slice_len, "in_window": in_window}

# Sequences for the peaks, extracted from the genome at `genome_path`.
onehot_seqs = load_sequences(genome_path, chrom_sizes, peak_path, **window_kwargs)

# Contribution scores loaded from `scores_path` with the same window settings.
scores = load_scores(scores_path, **window_kwargs)
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
/users/kcochran/miniconda3/envs/procap_A100/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Loading genome sequence from /mnt/lab_data2/kcochran/new_procap_models/genomes/hg38.withrDNA.fasta
== In Extract Sequences ==
Peak filepath: /mnt/lab_data2/kcochran/new_procap_models/deepshap_out/procap/K562/strand_merged_umap/2022-10-05_03-39-32_in/peaks_uni_and_bi_train_and_val.bed.gz
Sequence length: 2114
Num. Examples: 27000
In [5]:
# Load the modisco results from the path stored under config["results_save_path"];
# the result is consumed by report_motifs and plot_all_metaclusters below.
modisco_results = load_modisco_results(modisco_results_path)
In [6]:
from file_configs import ValFilesConfig, TrainFilesConfig

# TODO: assert that the same peak file is used across all of these inputs

# Both file-config objects are built from the same four run identifiers.
run_id = (cell_type, model_type, timestamp, data_type)
val_config = ValFilesConfig(*run_id)
train_config = TrainFilesConfig(*run_id)

# Observed profiles, read from the plus/minus `*_bw_path` files of the train
# config at the peaks in `peak_path`.
profile_kwargs = {"slice_len": slice_len, "out_window": out_window}
true_profs = load_observed_profiles(train_config.plus_bw_path,
                                    train_config.minus_bw_path,
                                    peak_path,
                                    **profile_kwargs)

# Saved predictions are exponentiated on load
# (presumably they are stored in log space — TODO confirm).
saved_pred_path = val_config.pred_profiles_train_val_path
pred_profs = np.exp(np.load(saved_pred_path))
Timestamp: 2022-10-05_03-39-32
Timestamp: 2022-10-05_03-39-32
== In Extract Profiles ==
Peak filepath: /mnt/lab_data2/kcochran/new_procap_models/deepshap_out/procap/K562/strand_merged_umap/2022-10-05_03-39-32_in/peaks_uni_and_bi_train_and_val.bed.gz
Profile length: 1000
Num. Examples: 27000
In [7]:
# Sanity check: the coordinates, sequences, scores, observed profiles and saved
# predictions are loaded by separate calls (the predictions from a separately
# saved file), and the cells below use them together. Fail loudly if their
# example counts (first axis) disagree, rather than continuing with
# mismatched inputs.
n_examples = len(coords)
for _name, _arr in (("onehot_seqs", onehot_seqs),
                    ("scores", scores),
                    ("true_profs", true_profs),
                    ("pred_profs", pred_profs)):
    assert _arr.shape[0] == n_examples, (
        f"{_name} has {_arr.shape[0]} examples but coords has {n_examples}")

# Display the shapes for the record (the last expression is the cell output).
len(coords), onehot_seqs.shape, scores.shape, true_profs.shape, pred_profs.shape
Out[7]:
(27000, (27000, 1000, 4), (27000, 1000, 4), (27000, 2, 1000), (27000, 2, 1000))
In [8]:
from IPython.display import HTML

# Directory containing the modisco results file; passed to report_motifs as its third argument.
modisco_out_dir = os.path.dirname(modisco_results_path)
report_html = report_motifs(modisco_results, proj_dir, modisco_out_dir)

# Render the report inline as the cell output.
HTML(report_html)
findfont: Font family ['Arial Rounded'] not found. Falling back to DejaVu Sans.
Out[8]:
pattern num_seqlets modisco_cwm_fwd modisco_cwm_rev match0 qval0 match0_logo match1 qval1 match1_logo match2 qval2 match2_logo
pos_patterns.pattern_0 8650 KLF12_HUMAN.H11MO.0.C 1.044900e-04 SP1_HUMAN.H11MO.0.A 2.438110e-04 SP3_HUMAN.H11MO.0.B 5.190530e-04
pos_patterns.pattern_1 8063 SIX2_MA1119.1 1.000000e+00 ZNF85_HUMAN.H11MO.0.C 1.000000e+00 NaN NaN
pos_patterns.pattern_2 5862 ELK4_MA0076.2 7.799890e-07 ETV1_HUMAN.H11MO.0.A 7.799890e-07 ZBTB7A_MA0750.2 7.799890e-07
pos_patterns.pattern_3 4465 NFYA_MA0060.3 4.920390e-01 FOXI1_HUMAN.H11MO.0.B 4.920390e-01 FOXI1_MOUSE.H11MO.0.B 4.920390e-01
pos_patterns.pattern_4 3488 NRF1_MA0506.1 3.133850e-07 NRF1_MOUSE.H11MO.0.A 8.887360e-04 NRF1_NRF_1 8.887360e-04
pos_patterns.pattern_5 2249 ATF3_HUMAN.H11MO.0.A 2.742350e-03 ATF1_HUMAN.H11MO.0.B 2.742350e-03 ATF1_MOUSE.H11MO.0.B 2.742350e-03
pos_patterns.pattern_6 1560 SP2_HUMAN.H11MO.0.A 7.015650e-05 SP2_MOUSE.H11MO.0.B 7.015650e-05 ZFX_MOUSE.H11MO.0.B 9.922790e-04
pos_patterns.pattern_7 1050 THAP1_HUMAN.H11MO.0.C 1.638730e-08 TYY1_HUMAN.H11MO.0.A 2.263770e-06 TYY1_MOUSE.H11MO.0.A 7.323700e-06
pos_patterns.pattern_8 1011 NaN NaN NaN NaN NaN NaN
pos_patterns.pattern_9 879 TBP_HUMAN.H11MO.0.A 3.506050e-03 TBP_MA0108.2 2.308300e-01 TBP_MOUSE.H11MO.0.A 5.221300e-01
pos_patterns.pattern_10 838 ZNF76_HUMAN.H11MO.0.C 5.612400e-20 ZN143_HUMAN.H11MO.0.A 6.739790e-10 ZN143_MOUSE.H11MO.0.A 6.739790e-10
pos_patterns.pattern_11 833 SP2_HUMAN.H11MO.0.A 9.367220e-04 SP2_MOUSE.H11MO.0.B 9.367220e-04 SP1_MOUSE.H11MO.0.A 1.123880e-03
pos_patterns.pattern_12 760 THAP1_HUMAN.H11MO.0.C 1.575010e-02 SP2_HUMAN.H11MO.0.A 1.575010e-02 SP2_MOUSE.H11MO.0.B 1.575010e-02
pos_patterns.pattern_13 747 ATF3_MOUSE.H11MO.0.A 5.598190e-03 JUNB_HUMAN.H11MO.0.A 5.598190e-03 JUND_HUMAN.H11MO.0.A 5.598190e-03
pos_patterns.pattern_14 535 CTCF_MOUSE.H11MO.0.A 1.074500e-11 CTCF_HUMAN.H11MO.0.A 1.214560e-09 CTCF_MA0139.1 5.822330e-08
pos_patterns.pattern_15 410 ZBTB33_MA0527.1 1.269240e-04 KAISO_HUMAN.H11MO.0.A 1.269240e-04 KAISO_MOUSE.H11MO.0.B 1.269240e-04
pos_patterns.pattern_16 401 NRF1_MOUSE.H11MO.0.A 1.628130e-08 NRF1_HUMAN.H11MO.0.A 1.275810e-07 NRF1_MA0506.1 5.178260e-03
pos_patterns.pattern_17 219 MYBL1_MYB_1 1.000000e+00 Arid3a_MA0151.1 1.000000e+00 NR4A2_nuclearreceptor_2 1.000000e+00
pos_patterns.pattern_18 198 CPEB1_RRM_1 8.892290e-02 HOXC12_homeodomain_1 8.892290e-02 HOXD12_homeodomain_1 8.892290e-02
pos_patterns.pattern_19 190 CTCFL_HUMAN.H11MO.0.A 1.061910e-01 ZFX_MOUSE.H11MO.0.B 1.061910e-01 CTCFL_MOUSE.H11MO.0.A 1.061910e-01
pos_patterns.pattern_20 163 ZN770_HUMAN.H11MO.0.C 4.923090e-07 MAF_MOUSE.H11MO.0.A 1.532720e-01 ZBT17_HUMAN.H11MO.0.A 1.532720e-01
pos_patterns.pattern_21 153 ELF2_MOUSE.H11MO.0.C 1.848560e-01 KLF4_MA0039.3 1.848560e-01 ETV5_HUMAN.H11MO.0.C 2.691610e-01
pos_patterns.pattern_22 148 ZFX_MOUSE.H11MO.0.B 1.536910e-02 MBD2_HUMAN.H11MO.0.B 7.136900e-02 MBD2_MOUSE.H11MO.0.B 7.136900e-02
pos_patterns.pattern_23 135 ZN816_HUMAN.H11MO.0.C 4.183810e-01 SOX4_HUMAN.H11MO.0.B 4.183810e-01 SOX4_MOUSE.H11MO.0.A 4.183810e-01
pos_patterns.pattern_24 130 GATA5_GATA_1 1.000000e+00 GATA5_MA0766.1 1.000000e+00 GATA4_GATA_1 1.000000e+00
pos_patterns.pattern_25 113 ZBTB33_MA0527.1 2.018620e-03 KAISO_HUMAN.H11MO.0.A 2.018620e-03 KAISO_MOUSE.H11MO.0.B 2.018620e-03
pos_patterns.pattern_26 107 ZNF76_HUMAN.H11MO.0.C 5.675680e-06 THA11_HUMAN.H11MO.0.B 4.829700e-04 THA11_MOUSE.H11MO.0.B 4.829700e-04
pos_patterns.pattern_27 105 SP1_MOUSE.H11MO.0.A 5.859470e-02 SP2_HUMAN.H11MO.0.A 5.859470e-02 SP2_MOUSE.H11MO.0.B 5.859470e-02
pos_patterns.pattern_28 105 ZN770_HUMAN.H11MO.0.C 1.847810e-06 ZSC22_HUMAN.H11MO.0.C 1.259890e-01 MAF_MOUSE.H11MO.0.A 1.278720e-01
pos_patterns.pattern_29 82 THAP1_HUMAN.H11MO.0.C 5.641130e-04 TYY1_HUMAN.H11MO.0.A 3.824210e-02 TYY1_MOUSE.H11MO.0.A 3.824210e-02
pos_patterns.pattern_30 67 SREBF1_MA0595.1 4.472250e-01 ATF3_HUMAN.H11MO.0.A 4.472250e-01 SREBF2_MA0596.1 4.472250e-01
pos_patterns.pattern_31 63 RFX3_MOUSE.H11MO.0.C 4.912410e-02 RFX2_HUMAN.H11MO.0.A 4.912410e-02 Rfx1_MA0509.1 4.912410e-02
pos_patterns.pattern_32 54 THAP1_HUMAN.H11MO.0.C 1.725180e-03 YY1_MA0095.2 9.069740e-03 TYY1_HUMAN.H11MO.0.A 9.069740e-03
pos_patterns.pattern_33 49 RUNX2_RUNX_1 1.000000e+00 RUNX3_RUNX_1 1.000000e+00 RUNX2_MA0511.2 1.000000e+00
pos_patterns.pattern_34 41 TEAD1_HUMAN.H11MO.0.A 1.000000e+00 TEAD2_MA1121.1 1.000000e+00 CEBPE_HUMAN.H11MO.0.A 1.000000e+00
pos_patterns.pattern_35 40 PRDM1_MA0508.2 9.103330e-01 MLXIPL_MA0664.1 9.103330e-01 MLXIPL_bHLH_1 9.103330e-01
pos_patterns.pattern_36 39 NaN NaN NaN NaN NaN NaN
pos_patterns.pattern_37 34 ATF1_HUMAN.H11MO.0.B 3.300850e-01 CREB1_MA0018.3 3.300850e-01 CREM_HUMAN.H11MO.0.C 3.300850e-01
pos_patterns.pattern_38 24 NaN NaN NaN NaN NaN NaN
neg_patterns.pattern_0 87 KLF12_HUMAN.H11MO.0.C 1.138970e-04 SP1_MOUSE.H11MO.0.A 1.489410e-04 SP1_MA0079.3 1.489410e-04
neg_patterns.pattern_1 61 Gabpa_MA0062.2 3.137950e-07 ELF1_MOUSE.H11MO.0.A 3.137950e-07 ETV1_HUMAN.H11MO.0.A 1.958080e-06
neg_patterns.pattern_2 42 JUND_MA0491.1 1.275760e-01 NFE2_MA0841.1 1.275760e-01 NFE2_bZIP_1 1.275760e-01
neg_patterns.pattern_3 25 NRF1_MOUSE.H11MO.0.A 5.703030e-06 NRF1_HUMAN.H11MO.0.A 3.134280e-05 NRF1_MA0506.1 1.283760e-04
In [9]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Plot the patterns found in the modisco results, passing the sequences, contribution
# scores, observed/predicted profiles and coordinates loaded in the cells above.
# NOTE(review): the trailing positional argument 400 is a magic number whose meaning is
# defined by plot_all_metaclusters (not visible here) — confirm, and consider naming it
# in the config cell or passing it by keyword.
plot_all_metaclusters(modisco_results, onehot_seqs, scores, true_profs, pred_profs, coords,
                in_window, out_window, slice_len, 400)

Pattern 0/39

8650 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 1/39

8063 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/39

5862 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/39

4465 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/39

3488 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/39

2249 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/39

1560 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/39

1050 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/39

1011 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/39

879 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/39

838 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/39

833 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/39

760 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/39

747 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/39

535 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 15/39

410 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 16/39

401 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 17/39

219 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 18/39

198 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 19/39

190 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 20/39

163 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 21/39

153 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 22/39

148 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 23/39

135 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 24/39

130 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 25/39

113 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 26/39

107 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 27/39

105 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 28/39

105 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 29/39

82 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 30/39

67 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 31/39

63 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 32/39

54 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 33/39

49 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 34/39

41 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 35/39

40 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 36/39

39 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 37/39

34 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 38/39

24 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 0/4

87 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 1/4

61 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/4

42 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/4

25 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)
In [ ]: