In [1]:
import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, pfm_to_pwm, trim_motif_by_ic
from motif.match_motifs import match_motifs_to_database
from util import figure_to_vdom_image
import plot.viz_sequence as viz_sequence
import numpy as np
import h5py
import matplotlib.pyplot as plt
import vdom.helpers as vdomh
from IPython.display import display

Define constants and paths

In [2]:
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_MOTIF_CACHE" in os.environ:
    tfm_motifs_cache_dir = os.environ["TFM_MOTIF_CACHE"]
else:
    tfm_motifs_cache_dir = None

print("TF name: %s" % tf_name)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved TF-MoDISco-derived motifs cache: %s" % tfm_motifs_cache_dir)
TF name: JUND
DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/multitask_profile/JUND_multitask_profile_fold3/JUND_multitask_profile_fold3_imp_scores.h5
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/multitask_profile/JUND_multitask_profile_fold3/JUND_multitask_profile_fold3_profile_tfm.h5
Importance score key: profile_hyp_scores
Saved TF-MoDISco-derived motifs cache: /users/amtseng/tfmodisco/results/reports/tfmodisco_results//cache/multitask_profile/JUND_multitask_profile_fold3/JUND_multitask_profile_fold3_profile
In [3]:
# Define paths and constants
input_length = 2114
shap_score_center_size = 400
In [4]:
if tfm_motifs_cache_dir:
    os.makedirs(tfm_motifs_cache_dir, exist_ok=True)

Import SHAP scores and TF-MoDISco results

In [5]:
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 350/350 [04:02<00:00,  1.44it/s]
In [6]:
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

Plot some SHAP score tracks

Plot the central region of some randomly selected actual importance scores

In [7]:
plot_slice = slice(int(shap_score_center_size / 4), int(3 * shap_score_center_size / 4))
for index in np.random.choice(hyp_scores.shape[0], size=5, replace=False):
    viz_sequence.plot_weights((hyp_scores[index] * one_hot_seqs[index])[plot_slice], subticks_frequency=100)

Plot TF-MoDISco results

Plot all motifs by metacluster

In [8]:
motif_pfms, motif_hcwms, motif_cwms = [], [], []  # Save the trimmed PFMs, hCWMs, and CWMs
motif_pfms_short = []  # PFMs that are even more trimmed (for TOMTOM)
num_seqlets = []  # Number of seqlets for each motif
motif_seqlets = []  # Save seqlets of each motif
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
if tfm_motifs_cache_dir:
    motif_hdf5 = h5py.File(os.path.join(tfm_motifs_cache_dir, "all_motifs.h5"), "w")
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    motif_pfms.append([])
    motif_hcwms.append([])
    motif_cwms.append([])
    motif_pfms_short.append([])
    num_seqlets.append([])
    motif_seqlets.append([])
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
        display(vdomh.p("%d seqlets" % len(seqlets)))
        
        pfm = pattern["sequence"].fwd
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        cwm = pattern["task0_contrib_scores"].fwd
        
        pfm_fig = viz_sequence.plot_weights(pfm, subticks_frequency=10, return_fig=True)
        hcwm_fig = viz_sequence.plot_weights(hcwm, subticks_frequency=10, return_fig=True)
        cwm_fig = viz_sequence.plot_weights(cwm, subticks_frequency=10, return_fig=True)
        pfm_fig.tight_layout()
        hcwm_fig.tight_layout()
        cwm_fig.tight_layout()
        
        motif_table = vdomh.table(
            vdomh.tr(
                vdomh.td("Sequence (PFM)"),
                vdomh.td(figure_to_vdom_image(pfm_fig))
            ),
            vdomh.tr(
                vdomh.td("Hypothetical contributions (hCWM)"),
                vdomh.td(figure_to_vdom_image(hcwm_fig))
            ),
            vdomh.tr(
                vdomh.td("Actual contributions (CWM)"),
                vdomh.td(figure_to_vdom_image(cwm_fig))
            )
        )
        display(motif_table)
        plt.close("all")  # Remove all standing figures
        
        # Trim motif based on information content
        short_trimmed_pfm = trim_motif_by_ic(pfm, pfm)
        motif_pfms_short[-1].append(short_trimmed_pfm)
        
        # Expand trimming to +/- 4bp on either side
        trimmed_pfm = trim_motif_by_ic(pfm, pfm, pad=4)
        trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
        trimmed_cwm = trim_motif_by_ic(pfm, cwm, pad=4)
        
        motif_pfms[-1].append(trimmed_pfm)
        motif_hcwms[-1].append(trimmed_hcwm)
        motif_cwms[-1].append(trimmed_cwm)
        
        num_seqlets[-1].append(len(seqlets))
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (metacluster_i, pattern_i)
            pfm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_pfm_full.png"))
            hcwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_full.png"))
            cwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_cwm_full.png"))
            motif_dset = motif_hdf5.create_group(motif_id)
            motif_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
            motif_dset.create_dataset("pfm_trimmed", data=trimmed_pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_trimmed", data=trimmed_cwm, compression="gzip")
            motif_dset.create_dataset("pfm_short_trimmed", data=short_trimmed_pfm, compression="gzip")
if tfm_motifs_cache_dir:
    motif_hdf5.close()

Metacluster 1/2

Pattern 1/4

6591 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/4

2312 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/4

43 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/4

39 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/14

94 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/14

83 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/14

77 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/14

76 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/14

50 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/14

47 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/14

43 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/14

43 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/14

39 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/14

36 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/14

34 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/14

33 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/14

32 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/14

31 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Summary of motifs

Motifs are trimmed based on information content, and presented in descending order by number of supporting seqlets. The motifs are separated by metacluster. The motifs are presented as hCWMs. The forward orientation is defined as the orientation that is richer in purines.

In [9]:
colgroup = vdomh.colgroup(
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "45%"}),
    vdomh.col(style={"width": "45%"})
)
header = vdomh.thead(
    vdomh.tr(
        vdomh.th("#", style={"text-align": "center"}),
        vdomh.th("Seqlets", style={"text-align": "center"}),
        vdomh.th("Forward", style={"text-align": "center"}),
        vdomh.th("Reverse", style={"text-align": "center"})
    )
)

for i in range(len(motif_hcwms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    body = []
    for j in range(len(motif_hcwms[i])):
        motif = motif_hcwms[i][j]
        if np.sum(motif[:, [0, 2]]) > 0.5 * np.sum(motif):
            # Forward is purine-rich, reverse-complement is pyrimidine-rich
            f, rc = motif, np.flip(motif, axis=(0, 1))
        else:
            f, rc = np.flip(motif, axis=(0, 1)), motif
            
        f_fig = viz_sequence.plot_weights(f, figsize=(20, 4), return_fig=True)
        f_fig.tight_layout()
        rc_fig = viz_sequence.plot_weights(rc, figsize=(20, 4), return_fig=True)
        rc_fig.tight_layout()
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (i, j)
            f_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_fwd.png"))
            rc_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_rev.png"))

        body.append(
            vdomh.tr(
                vdomh.td(str(j + 1)),
                vdomh.td(str(num_seqlets[i][j])),
                vdomh.td(figure_to_vdom_image(f_fig)),
                vdomh.td(figure_to_vdom_image(rc_fig))
            )
        )
    display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
    plt.close("all")

Metacluster 1/2

#SeqletsForwardReverse
16591
22312
343
439

Metacluster 2/2

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)
#SeqletsForwardReverse
194
283
377
476
550
647
743
843
939
1036
1134
1233
1332
1431

Top TOMTOM matches for each motif

Here, the TF-MoDISco motifs are plotted as hCWMs, but the TOMTOM matches are shown as PWMs.

In [10]:
num_matches_to_keep = 10
num_matches_to_show = 5

header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Motif ID", style={"text-align": "center"}),
        vdomh.th("q-val", style={"text-align": "center"}),
        vdomh.th("PWM", style={"text-align": "center"})
    )
)

for i in range(len(motif_pfms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    
    # Compute TOMTOM matches for all motifs in the metacluster at once
    out_dir = os.path.join(tfm_motifs_cache_dir, "tomtom", "metacluster_%d" % i) if tfm_motifs_cache_dir else None
    tomtom_matches = match_motifs_to_database(motif_pfms_short[i], top_k=num_matches_to_keep, temp_dir=out_dir)
    
    for j in range(len(motif_pfms[i])):
        display(vdomh.h4("Motif %d/%d" % (j + 1, len(motif_pfms[i]))))
        viz_sequence.plot_weights(motif_hcwms[i][j])
    
        body = []
        for k, (match_name, match_pfm, match_qval) in enumerate(tomtom_matches[j]):
            fig = viz_sequence.plot_weights(pfm_to_pwm(match_pfm), return_fig=True)
            fig.tight_layout()
            if k < num_matches_to_show:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td(figure_to_vdom_image(fig))
                    )
                )
                if tfm_motifs_cache_dir:
                    # Save results and figures
                    motif_id = "%d_%d" % (i, j)
                    fig.savefig(os.path.join(out_dir, motif_id + ("_hit-%d.png" % (k + 1))))
            else:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td("Not shown")
                    )
                )
        if not body:
            display(vdomh.p("No TOMTOM matches passing threshold"))
        else:
            display(vdomh.table(header, vdomh.tbody(*body)))
        plt.close("all")

Metacluster 1/2

Motif 1/4

Motif IDq-valPWM
JUND_HUMAN.H11MO.0.A5.54718e-07
JUN_HUMAN.H11MO.0.A5.54718e-07
FOSB_HUMAN.H11MO.0.A5.54718e-07
FOSL2_HUMAN.H11MO.0.A5.54718e-07
FOSL1_HUMAN.H11MO.0.A1.6588299999999999e-06
MA1130.1_FOSL2::JUN3.6926300000000004e-06Not shown
MA0099.3_FOS::JUN3.6926300000000004e-06Not shown
MA1144.1_FOSL2::JUND7.842739999999998e-06Not shown
MA1138.1_FOSL2::JUNB7.842739999999998e-06Not shown
MA1622.1_Smad2::Smad37.842739999999998e-06Not shown

Motif 2/4

Motif IDq-valPWM
ATF2_HUMAN.H11MO.0.B1.2844799999999998e-06
MA1136.1_FOSB::JUNB(var.2)1.41915e-06
MA1145.1_FOSL2::JUND(var.2)1.85513e-06
MA1129.1_FOSL1::JUN(var.2)1.85513e-06
MA0605.2_ATF32.6445799999999995e-06
MA1139.1_FOSL2::JUNB(var.2)2.81543e-06Not shown
MA1475.1_CREB3L4(var.2)2.81543e-06Not shown
MA1131.1_FOSL2::JUN(var.2)2.81543e-06Not shown
MA1126.1_FOS::JUN(var.2)3.6198400000000003e-06Not shown
MA1127.1_FOSB::JUN4.20562e-06Not shown

Motif 3/4

Motif IDq-valPWM
CPEB1_HUMAN.H11MO.0.D4.63704e-05
MA1125.1_ZNF3840.0355644
PRDM6_HUMAN.H11MO.0.C0.0355644
FOXL1_HUMAN.H11MO.0.D0.0380219
FOXG1_HUMAN.H11MO.0.D0.0726831
MA0679.2_ONECUT10.142593Not shown
ANDR_HUMAN.H11MO.0.A0.142593Not shown
FOXJ3_HUMAN.H11MO.0.A0.164129Not shown
ONEC2_HUMAN.H11MO.0.D0.180876Not shown
HXC10_HUMAN.H11MO.0.D0.224021Not shown

Motif 4/4

Motif IDq-valPWM
CPEB1_HUMAN.H11MO.0.D0.000194831
PRDM6_HUMAN.H11MO.0.C0.011452200000000001
MA1125.1_ZNF3840.0671361
FOXL1_HUMAN.H11MO.0.D0.0671581
FOXG1_HUMAN.H11MO.0.D0.134135
ANDR_HUMAN.H11MO.0.A0.134135Not shown
FUBP1_HUMAN.H11MO.0.D0.13883399999999999Not shown
FOXJ3_HUMAN.H11MO.0.A0.13883399999999999Not shown
IRF1_HUMAN.H11MO.0.A0.165873Not shown
STAT1_HUMAN.H11MO.1.A0.194618Not shown

Metacluster 2/2

Motif 1/14

Motif IDq-valPWM
ZN528_HUMAN.H11MO.0.C0.264914
MA1597.1_ZNF5280.264914
COE1_HUMAN.H11MO.0.A0.264914
MA0154.4_EBF10.264914

Motif 2/14

No TOMTOM matches passing threshold

Motif 3/14

No TOMTOM matches passing threshold

Motif 4/14

Motif IDq-valPWM
COE1_HUMAN.H11MO.0.A0.128439
MA0154.4_EBF10.128439

Motif 5/14

Motif IDq-valPWM
MA0478.1_FOSL20.00956041
MA0489.1_JUN(var.2)0.012112799999999998
FOSL1_HUMAN.H11MO.0.A0.0179794
FOSL2_HUMAN.H11MO.0.A0.0179794
MA1137.1_FOSL1::JUNB0.028482999999999998
MA1141.1_FOS::JUND0.028482999999999998Not shown
MA1135.1_FOSB::JUNB0.028482999999999998Not shown
FOSB_HUMAN.H11MO.0.A0.028482999999999998Not shown
MA0491.2_JUND0.028482999999999998Not shown
JUNB_HUMAN.H11MO.0.A0.028482999999999998Not shown

Motif 6/14

Motif IDq-valPWM
MA0146.2_Zfx0.00026868599999999997
CTCFL_HUMAN.H11MO.0.A0.052076599999999994
ZN770_HUMAN.H11MO.0.C0.052076599999999994
ZFX_HUMAN.H11MO.1.A0.19047999999999998
SP2_HUMAN.H11MO.0.A0.202506
AP2B_HUMAN.H11MO.0.B0.202506Not shown
ZN770_HUMAN.H11MO.1.C0.31450900000000004Not shown
PLAL1_HUMAN.H11MO.0.D0.31450900000000004Not shown
MXI1_HUMAN.H11MO.0.A0.316535Not shown
MA1102.2_CTCFL0.31684Not shown

Motif 7/14

Motif IDq-valPWM
JUNB_HUMAN.H11MO.0.A0.234177
MAFB_HUMAN.H11MO.0.B0.234177
MA1633.1_BACH10.234177
MA0478.1_FOSL20.234177
JUND_HUMAN.H11MO.0.A0.234177
MAFK_HUMAN.H11MO.1.A0.234177Not shown
NRL_HUMAN.H11MO.0.D0.23568499999999998Not shown
FOS_HUMAN.H11MO.0.A0.23568499999999998Not shown
BACH2_HUMAN.H11MO.0.A0.23568499999999998Not shown
MA0146.2_Zfx0.23568499999999998Not shown

Motif 8/14

Motif IDq-valPWM
ASCL1_HUMAN.H11MO.0.A0.0762961
ZN341_HUMAN.H11MO.0.C0.10272200000000001
MA1631.1_ASCL1(var.2)0.10272200000000001
MA0830.2_TCF40.10272200000000001
SP2_HUMAN.H11MO.0.A0.10272200000000001
MA0014.3_PAX50.10272200000000001Not shown
KLF3_HUMAN.H11MO.0.B0.20013599999999998Not shown
ZBTB6_HUMAN.H11MO.0.C0.30896399999999996Not shown
SP4_HUMAN.H11MO.0.A0.321482Not shown
SP3_HUMAN.H11MO.0.B0.321482Not shown

Motif 9/14

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A0.0028798
SP3_HUMAN.H11MO.0.B0.0028798
SP1_HUMAN.H11MO.0.A0.00351925
MA1653.1_ZNF1480.00549487
TBX15_HUMAN.H11MO.0.D0.00908877
SP1_HUMAN.H11MO.1.A0.0114006Not shown
MA0528.2_ZNF2630.013161700000000002Not shown
SP4_HUMAN.H11MO.1.A0.0138145Not shown
MA1522.1_MAZ0.017971200000000003Not shown
VEZF1_HUMAN.H11MO.1.C0.022273599999999998Not shown

Motif 10/14

Motif IDq-valPWM
SP1_HUMAN.H11MO.1.A0.41601000000000005
SP2_HUMAN.H11MO.0.A0.41601000000000005
SP4_HUMAN.H11MO.1.A0.41601000000000005
NKX21_HUMAN.H11MO.0.A0.41601000000000005
SP1_HUMAN.H11MO.0.A0.41601000000000005
KLF3_HUMAN.H11MO.0.B0.41601000000000005Not shown
TBX1_HUMAN.H11MO.0.D0.41601000000000005Not shown
SP3_HUMAN.H11MO.0.B0.41601000000000005Not shown
NKX22_HUMAN.H11MO.0.D0.41601000000000005Not shown
MA0503.1_Nkx2-5(var.2)0.41601000000000005Not shown

Motif 11/14

No TOMTOM matches passing threshold

Motif 12/14

Motif IDq-valPWM
MA1596.1_ZNF4600.0081329
NRF1_HUMAN.H11MO.0.A0.013411099999999999
ZN770_HUMAN.H11MO.1.C0.295871

Motif 13/14

No TOMTOM matches passing threshold

Motif 14/14

No TOMTOM matches passing threshold