In [1]:
import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, pfm_to_pwm, trim_motif_by_ic
from motif.match_motifs import match_motifs_to_database
from util import figure_to_vdom_image
import plot.viz_sequence as viz_sequence
import numpy as np
import h5py
import matplotlib.pyplot as plt
import vdom.helpers as vdomh
from IPython.display import display

Define constants and paths

In [2]:
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_MOTIF_CACHE" in os.environ:
    tfm_motifs_cache_dir = os.environ["TFM_MOTIF_CACHE"]
else:
    tfm_motifs_cache_dir = None

print("TF name: %s" % tf_name)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved TF-MoDISco-derived motifs cache: %s" % tfm_motifs_cache_dir)
TF name: GABPA
DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/multitask_profile/GABPA_multitask_profile_fold4/GABPA_multitask_profile_fold4_imp_scores.h5
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/multitask_profile/GABPA_multitask_profile_fold4/GABPA_multitask_profile_fold4_profile_tfm.h5
Importance score key: profile_hyp_scores
Saved TF-MoDISco-derived motifs cache: /users/amtseng/tfmodisco/results/reports/tfmodisco_results//cache/multitask_profile/GABPA_multitask_profile_fold4/GABPA_multitask_profile_fold4_profile
In [3]:
# Define paths and constants
input_length = 2114
shap_score_center_size = 400
In [4]:
if tfm_motifs_cache_dir:
    os.makedirs(tfm_motifs_cache_dir, exist_ok=True)

Import SHAP scores and TF-MoDISco results

In [5]:
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 104/104 [01:18<00:00,  1.32it/s]
In [6]:
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

Plot some SHAP score tracks

Plot the central region of some randomly selected actual importance scores

In [7]:
plot_slice = slice(int(shap_score_center_size / 4), int(3 * shap_score_center_size / 4))
for index in np.random.choice(hyp_scores.shape[0], size=5, replace=False):
    viz_sequence.plot_weights((hyp_scores[index] * one_hot_seqs[index])[plot_slice], subticks_frequency=100)

Plot TF-MoDISco results

Plot all motifs by metacluster

In [8]:
motif_pfms, motif_hcwms, motif_cwms = [], [], []  # Save the trimmed PFMs, hCWMs, and CWMs
motif_pfms_short = []  # PFMs that are even more trimmed (for TOMTOM)
num_seqlets = []  # Number of seqlets for each motif
motif_seqlets = []  # Save seqlets of each motif
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
if tfm_motifs_cache_dir:
    motif_hdf5 = h5py.File(os.path.join(tfm_motifs_cache_dir, "all_motifs.h5"), "w")
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    motif_pfms.append([])
    motif_hcwms.append([])
    motif_cwms.append([])
    motif_pfms_short.append([])
    num_seqlets.append([])
    motif_seqlets.append([])
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
        display(vdomh.p("%d seqlets" % len(seqlets)))
        
        pfm = pattern["sequence"].fwd
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        cwm = pattern["task0_contrib_scores"].fwd
        
        pfm_fig = viz_sequence.plot_weights(pfm, subticks_frequency=10, return_fig=True)
        hcwm_fig = viz_sequence.plot_weights(hcwm, subticks_frequency=10, return_fig=True)
        cwm_fig = viz_sequence.plot_weights(cwm, subticks_frequency=10, return_fig=True)
        pfm_fig.tight_layout()
        hcwm_fig.tight_layout()
        cwm_fig.tight_layout()
        
        motif_table = vdomh.table(
            vdomh.tr(
                vdomh.td("Sequence (PFM)"),
                vdomh.td(figure_to_vdom_image(pfm_fig))
            ),
            vdomh.tr(
                vdomh.td("Hypothetical contributions (hCWM)"),
                vdomh.td(figure_to_vdom_image(hcwm_fig))
            ),
            vdomh.tr(
                vdomh.td("Actual contributions (CWM)"),
                vdomh.td(figure_to_vdom_image(cwm_fig))
            )
        )
        display(motif_table)
        plt.close("all")  # Remove all standing figures
        
        # Trim motif based on information content
        short_trimmed_pfm = trim_motif_by_ic(pfm, pfm)
        motif_pfms_short[-1].append(short_trimmed_pfm)
        
        # Expand trimming to +/- 4bp on either side
        trimmed_pfm = trim_motif_by_ic(pfm, pfm, pad=4)
        trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
        trimmed_cwm = trim_motif_by_ic(pfm, cwm, pad=4)
        
        motif_pfms[-1].append(trimmed_pfm)
        motif_hcwms[-1].append(trimmed_hcwm)
        motif_cwms[-1].append(trimmed_cwm)
        
        num_seqlets[-1].append(len(seqlets))
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (metacluster_i, pattern_i)
            pfm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_pfm_full.png"))
            hcwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_full.png"))
            cwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_cwm_full.png"))
            motif_dset = motif_hdf5.create_group(motif_id)
            motif_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
            motif_dset.create_dataset("pfm_trimmed", data=trimmed_pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_trimmed", data=trimmed_cwm, compression="gzip")
            motif_dset.create_dataset("pfm_short_trimmed", data=short_trimmed_pfm, compression="gzip")
if tfm_motifs_cache_dir:
    motif_hdf5.close()

Metacluster 1/2

Pattern 1/8

5243 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/8

705 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/8

485 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/8

207 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/8

188 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/8

93 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/8

40 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/8

34 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/7

559 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/7

168 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/7

129 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/7

111 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/7

91 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/7

87 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/7

87 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Summary of motifs

Motifs are trimmed based on information content, and presented in descending order by number of supporting seqlets. The motifs are separated by metacluster. The motifs are presented as hCWMs. The forward orientation is defined as the orientation that is richer in purines.

In [9]:
colgroup = vdomh.colgroup(
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "45%"}),
    vdomh.col(style={"width": "45%"})
)
header = vdomh.thead(
    vdomh.tr(
        vdomh.th("#", style={"text-align": "center"}),
        vdomh.th("Seqlets", style={"text-align": "center"}),
        vdomh.th("Forward", style={"text-align": "center"}),
        vdomh.th("Reverse", style={"text-align": "center"})
    )
)

for i in range(len(motif_hcwms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    body = []
    for j in range(len(motif_hcwms[i])):
        motif = motif_hcwms[i][j]
        if np.sum(motif[:, [0, 2]]) > 0.5 * np.sum(motif):
            # Forward is purine-rich, reverse-complement is pyrimidine-rich
            f, rc = motif, np.flip(motif, axis=(0, 1))
        else:
            f, rc = np.flip(motif, axis=(0, 1)), motif
            
        f_fig = viz_sequence.plot_weights(f, figsize=(20, 4), return_fig=True)
        f_fig.tight_layout()
        rc_fig = viz_sequence.plot_weights(rc, figsize=(20, 4), return_fig=True)
        rc_fig.tight_layout()
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (i, j)
            f_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_fwd.png"))
            rc_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_rev.png"))

        body.append(
            vdomh.tr(
                vdomh.td(str(j + 1)),
                vdomh.td(str(num_seqlets[i][j])),
                vdomh.td(figure_to_vdom_image(f_fig)),
                vdomh.td(figure_to_vdom_image(rc_fig))
            )
        )
    display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
    plt.close("all")

Metacluster 1/2

#SeqletsForwardReverse
15243
2705
3485
4207
5188
693
740
834

Metacluster 2/2

#SeqletsForwardReverse
1559
2168
3129
4111
591
687
787

Top TOMTOM matches for each motif

Here, the TF-MoDISco motifs are plotted as hCWMs, but the TOMTOM matches are shown as PWMs.

In [10]:
num_matches_to_keep = 10
num_matches_to_show = 5

header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Motif ID", style={"text-align": "center"}),
        vdomh.th("q-val", style={"text-align": "center"}),
        vdomh.th("PWM", style={"text-align": "center"})
    )
)

for i in range(len(motif_pfms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    
    # Compute TOMTOM matches for all motifs in the metacluster at once
    out_dir = os.path.join(tfm_motifs_cache_dir, "tomtom", "metacluster_%d" % i) if tfm_motifs_cache_dir else None
    tomtom_matches = match_motifs_to_database(motif_pfms_short[i], top_k=num_matches_to_keep, temp_dir=out_dir)
    
    for j in range(len(motif_pfms[i])):
        display(vdomh.h4("Motif %d/%d" % (j + 1, len(motif_pfms[i]))))
        viz_sequence.plot_weights(motif_hcwms[i][j])
    
        body = []
        for k, (match_name, match_pfm, match_qval) in enumerate(tomtom_matches[j]):
            fig = viz_sequence.plot_weights(pfm_to_pwm(match_pfm), return_fig=True)
            fig.tight_layout()
            if k < num_matches_to_show:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td(figure_to_vdom_image(fig))
                    )
                )
                if tfm_motifs_cache_dir:
                    # Save results and figures
                    motif_id = "%d_%d" % (i, j)
                    fig.savefig(os.path.join(out_dir, motif_id + ("_hit-%d.png" % (k + 1))))
            else:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td("Not shown")
                    )
                )
        if not body:
            display(vdomh.p("No TOMTOM matches passing threshold"))
        else:
            display(vdomh.table(header, vdomh.tbody(*body)))
        plt.close("all")

Metacluster 1/2

Motif 1/8

Motif IDq-valPWM
ETV1_HUMAN.H11MO.0.A1.5729e-07
MA0750.2_ZBTB7A1.5729e-07
MA0076.2_ELK45.40167e-07
ELK4_HUMAN.H11MO.0.A5.40167e-07
ELF2_HUMAN.H11MO.0.C5.40167e-07
ELF1_HUMAN.H11MO.0.A5.40167e-07Not shown
GABPA_HUMAN.H11MO.0.A2.41817e-06Not shown
ELK1_HUMAN.H11MO.0.B3.9906599999999995e-06Not shown
MA0765.2_ETV51.03674e-05Not shown
MA0763.1_ETV35.2818900000000003e-05Not shown

Motif 2/8

Motif IDq-valPWM
GABPA_HUMAN.H11MO.0.A0.0938858
MA0076.2_ELK40.10315
MA0772.1_IRF70.10315
ELK4_HUMAN.H11MO.0.A0.141651
ELK1_HUMAN.H11MO.0.B0.141651
MA1508.1_IKZF10.141651Not shown
MA0765.2_ETV50.141651Not shown
MA0640.2_ELF30.141651Not shown
MA1484.1_ETS20.141651Not shown
MA1420.1_IRF50.147954Not shown

Motif 3/8

Motif IDq-valPWM
ZN143_HUMAN.H11MO.0.A3.7525599999999994e-22
ZNF76_HUMAN.H11MO.0.C8.292000000000001e-22
THA11_HUMAN.H11MO.0.B1.04699e-18
MA1573.1_THAP112.31186e-09
MA0088.2_ZNF1430.014522499999999999
STAT3_HUMAN.H11MO.0.A0.0647063Not shown
P63_HUMAN.H11MO.0.A0.08595789999999999Not shown
MA1625.1_Stat5b0.108482Not shown
MA0519.1_Stat5a::Stat5b0.138139Not shown
MA0525.2_TP630.195553Not shown

Motif 4/8

Motif IDq-valPWM
ETS1_HUMAN.H11MO.0.A0.00026409599999999996
MA0136.2_ELF50.006714399999999999
MA0761.2_ETV10.006714399999999999
FLI1_HUMAN.H11MO.1.A0.006714399999999999
EHF_HUMAN.H11MO.0.B0.006714399999999999
ELK3_HUMAN.H11MO.0.D0.006714399999999999Not shown
MA0473.3_ELF10.006714399999999999Not shown
MA0062.3_GABPA0.006714399999999999Not shown
ETV7_HUMAN.H11MO.0.D0.00705548Not shown
MA0764.2_ETV40.00705548Not shown

Motif 5/8

Motif IDq-valPWM
MA0028.2_ELK10.0283661
MA0645.1_ETV60.0283661
MA1483.1_ELF20.0283661
ELF2_HUMAN.H11MO.0.C0.0283661
GABPA_HUMAN.H11MO.0.A0.0283661
ELK1_HUMAN.H11MO.0.B0.0283661Not shown
ETV1_HUMAN.H11MO.0.A0.0283661Not shown
MA0076.2_ELK40.0283661Not shown
MA0765.2_ETV50.0283661Not shown
ELK4_HUMAN.H11MO.0.A0.0283661Not shown

Motif 6/8

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A0.000226983
ETV1_HUMAN.H11MO.0.A0.00264932
SP3_HUMAN.H11MO.0.B0.0035902
MA1122.1_TFDP10.00493222
MA0750.2_ZBTB7A0.00501106
SP1_HUMAN.H11MO.0.A0.00540905Not shown
MA0471.2_E2F60.00540905Not shown
MA0765.2_ETV50.00674776Not shown
THAP1_HUMAN.H11MO.0.C0.00674776Not shown
ELK4_HUMAN.H11MO.0.A0.00674776Not shown

Motif 7/8

Motif IDq-valPWM
ELF5_HUMAN.H11MO.0.A0.0844674
MA0028.2_ELK10.0844674
FEV_HUMAN.H11MO.0.B0.0844674
MXI1_HUMAN.H11MO.0.A0.0844674
MA0146.2_Zfx0.0844674
ELF1_HUMAN.H11MO.0.A0.0844674Not shown
MA0136.2_ELF50.0844674Not shown
MBD2_HUMAN.H11MO.0.B0.0844674Not shown
RFX1_HUMAN.H11MO.0.B0.0844674Not shown
ELF2_HUMAN.H11MO.0.C0.0844674Not shown

Motif 8/8

Motif IDq-valPWM
GABPA_HUMAN.H11MO.0.A0.017355000000000002
ELK1_HUMAN.H11MO.0.B0.017355000000000002
MA0076.2_ELK40.017355000000000002
MA0645.1_ETV60.017355000000000002
MA0765.2_ETV50.017355000000000002
MA0028.2_ELK10.017355000000000002Not shown
MA0156.2_FEV0.018147Not shown
SP2_HUMAN.H11MO.0.A0.018147Not shown
ELK4_HUMAN.H11MO.0.A0.018147Not shown
MA0763.1_ETV30.018147Not shown

Metacluster 2/2

Motif 1/7

Motif IDq-valPWM
MA0765.2_ETV50.00020058
ELF2_HUMAN.H11MO.0.C0.00020058
GABPA_HUMAN.H11MO.0.A0.00020058
MA0076.2_ELK40.00029253
ETV1_HUMAN.H11MO.0.A0.00029253
MA0641.1_ELF40.00029253Not shown
MA1483.1_ELF20.00029253Not shown
MA0750.2_ZBTB7A0.000298625Not shown
ELF1_HUMAN.H11MO.0.A0.000303365Not shown
MA0759.1_ELK30.00041758400000000003Not shown

Motif 2/7

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A9.8973e-07
SP1_HUMAN.H11MO.0.A3.3704699999999996e-05
SP3_HUMAN.H11MO.0.B6.12675e-05
THAP1_HUMAN.H11MO.0.C0.00132083
ZFX_HUMAN.H11MO.1.A0.00496369
SP1_HUMAN.H11MO.1.A0.00769596Not shown
MA0146.2_Zfx0.00773136Not shown
KLF3_HUMAN.H11MO.0.B0.00773136Not shown
MA1513.1_KLF150.00848221Not shown
RFX1_HUMAN.H11MO.0.B0.0102626Not shown

Motif 3/7

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A5.4174299999999996e-09
SP3_HUMAN.H11MO.0.B1.3549100000000002e-06
SP1_HUMAN.H11MO.0.A8.37545e-06
MA1513.1_KLF150.000110241
MA1650.1_ZBTB140.000619465
SP1_HUMAN.H11MO.1.A0.0008272010000000001Not shown
KLF16_HUMAN.H11MO.0.D0.00171024Not shown
KLF3_HUMAN.H11MO.0.B0.00198429Not shown
PATZ1_HUMAN.H11MO.0.C0.00269449Not shown
MA0146.2_Zfx0.00498374Not shown

Motif 4/7

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A2.57388e-05
SP3_HUMAN.H11MO.0.B0.000360012
SP1_HUMAN.H11MO.0.A0.000364251
MXI1_HUMAN.H11MO.0.A0.005400899999999999
KLF3_HUMAN.H11MO.0.B0.0140029
MA0146.2_Zfx0.030868Not shown
KLF9_HUMAN.H11MO.0.C0.036893Not shown
AP2D_HUMAN.H11MO.0.D0.036893Not shown
MBD2_HUMAN.H11MO.0.B0.036893Not shown
MA1650.1_ZBTB140.036893Not shown

Motif 5/7

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A2.40658e-06
SP1_HUMAN.H11MO.0.A2.2619899999999997e-05
SP3_HUMAN.H11MO.0.B2.6123499999999998e-05
MA0146.2_Zfx0.000317617
KLF3_HUMAN.H11MO.0.B0.00346532
SP1_HUMAN.H11MO.1.A0.00347268Not shown
THAP1_HUMAN.H11MO.0.C0.00731849Not shown
MA1513.1_KLF150.00846355Not shown
ZFX_HUMAN.H11MO.1.A0.00986998Not shown
KLF16_HUMAN.H11MO.0.D0.00986998Not shown

Motif 6/7

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A4.89559e-07
SP1_HUMAN.H11MO.0.A4.9900099999999995e-06
SP3_HUMAN.H11MO.0.B6.38097e-05
SP1_HUMAN.H11MO.1.A0.00250342
THAP1_HUMAN.H11MO.0.C0.00250342
KLF3_HUMAN.H11MO.0.B0.00263229Not shown
KLF16_HUMAN.H11MO.0.D0.00782294Not shown
MA1513.1_KLF150.011598899999999999Not shown
MA0146.2_Zfx0.011598899999999999Not shown
USF2_HUMAN.H11MO.0.A0.012933600000000002Not shown

Motif 7/7

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A2.02652e-06
SP1_HUMAN.H11MO.0.A3.81275e-05
SP3_HUMAN.H11MO.0.B4.7114700000000005e-05
MA1650.1_ZBTB140.00132245
SP1_HUMAN.H11MO.1.A0.00183665
THAP1_HUMAN.H11MO.0.C0.00196493Not shown
KLF16_HUMAN.H11MO.0.D0.00364682Not shown
PATZ1_HUMAN.H11MO.0.C0.00374873Not shown
MA1513.1_KLF150.00381459Not shown
USF2_HUMAN.H11MO.0.A0.00624133Not shown