In [1]:
import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, pfm_to_pwm, trim_motif_by_ic
from motif.match_motifs import match_motifs_to_database
from util import figure_to_vdom_image
import plot.viz_sequence as viz_sequence
import numpy as np
import h5py
import matplotlib.pyplot as plt
import vdom.helpers as vdomh
from IPython.display import display

Define constants and paths

In [2]:
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_MOTIF_CACHE" in os.environ:
    tfm_motifs_cache_dir = os.environ["TFM_MOTIF_CACHE"]
else:
    tfm_motifs_cache_dir = None

print("TF name: %s" % tf_name)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved TF-MoDISco-derived motifs cache: %s" % tfm_motifs_cache_dir)
TF name: JUND
DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/multitask_profile/JUND_multitask_profile_fold9/JUND_multitask_profile_fold9_imp_scores.h5
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/multitask_profile/JUND_multitask_profile_fold9/JUND_multitask_profile_fold9_profile_tfm.h5
Importance score key: profile_hyp_scores
Saved TF-MoDISco-derived motifs cache: /users/amtseng/tfmodisco/results/reports/tfmodisco_results//cache/multitask_profile/JUND_multitask_profile_fold9/JUND_multitask_profile_fold9_profile
In [3]:
# Define paths and constants
input_length = 2114
shap_score_center_size = 400
In [4]:
if tfm_motifs_cache_dir:
    os.makedirs(tfm_motifs_cache_dir, exist_ok=True)

Import SHAP scores and TF-MoDISco results

In [5]:
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 350/350 [04:33<00:00,  1.28it/s]
In [6]:
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

Plot some SHAP score tracks

Plot the central region of some randomly selected actual importance scores

In [7]:
plot_slice = slice(int(shap_score_center_size / 4), int(3 * shap_score_center_size / 4))
for index in np.random.choice(hyp_scores.shape[0], size=5, replace=False):
    viz_sequence.plot_weights((hyp_scores[index] * one_hot_seqs[index])[plot_slice], subticks_frequency=100)

Plot TF-MoDISco results

Plot all motifs by metacluster

In [8]:
motif_pfms, motif_hcwms, motif_cwms = [], [], []  # Save the trimmed PFMs, hCWMs, and CWMs
motif_pfms_short = []  # PFMs that are even more trimmed (for TOMTOM)
num_seqlets = []  # Number of seqlets for each motif
motif_seqlets = []  # Save seqlets of each motif
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
if tfm_motifs_cache_dir:
    motif_hdf5 = h5py.File(os.path.join(tfm_motifs_cache_dir, "all_motifs.h5"), "w")
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    motif_pfms.append([])
    motif_hcwms.append([])
    motif_cwms.append([])
    motif_pfms_short.append([])
    num_seqlets.append([])
    motif_seqlets.append([])
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
        display(vdomh.p("%d seqlets" % len(seqlets)))
        
        pfm = pattern["sequence"].fwd
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        cwm = pattern["task0_contrib_scores"].fwd
        
        pfm_fig = viz_sequence.plot_weights(pfm, subticks_frequency=10, return_fig=True)
        hcwm_fig = viz_sequence.plot_weights(hcwm, subticks_frequency=10, return_fig=True)
        cwm_fig = viz_sequence.plot_weights(cwm, subticks_frequency=10, return_fig=True)
        pfm_fig.tight_layout()
        hcwm_fig.tight_layout()
        cwm_fig.tight_layout()
        
        motif_table = vdomh.table(
            vdomh.tr(
                vdomh.td("Sequence (PFM)"),
                vdomh.td(figure_to_vdom_image(pfm_fig))
            ),
            vdomh.tr(
                vdomh.td("Hypothetical contributions (hCWM)"),
                vdomh.td(figure_to_vdom_image(hcwm_fig))
            ),
            vdomh.tr(
                vdomh.td("Actual contributions (CWM)"),
                vdomh.td(figure_to_vdom_image(cwm_fig))
            )
        )
        display(motif_table)
        plt.close("all")  # Remove all standing figures
        
        # Trim motif based on information content
        short_trimmed_pfm = trim_motif_by_ic(pfm, pfm)
        motif_pfms_short[-1].append(short_trimmed_pfm)
        
        # Expand trimming to +/- 4bp on either side
        trimmed_pfm = trim_motif_by_ic(pfm, pfm, pad=4)
        trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
        trimmed_cwm = trim_motif_by_ic(pfm, cwm, pad=4)
        
        motif_pfms[-1].append(trimmed_pfm)
        motif_hcwms[-1].append(trimmed_hcwm)
        motif_cwms[-1].append(trimmed_cwm)
        
        num_seqlets[-1].append(len(seqlets))
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (metacluster_i, pattern_i)
            pfm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_pfm_full.png"))
            hcwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_full.png"))
            cwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_cwm_full.png"))
            motif_dset = motif_hdf5.create_group(motif_id)
            motif_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
            motif_dset.create_dataset("pfm_trimmed", data=trimmed_pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_trimmed", data=trimmed_cwm, compression="gzip")
            motif_dset.create_dataset("pfm_short_trimmed", data=short_trimmed_pfm, compression="gzip")
if tfm_motifs_cache_dir:
    motif_hdf5.close()

Metacluster 1/2

Pattern 1/9

5470 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/9

1583 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/9

816 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/9

731 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/9

613 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/9

109 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/9

88 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/9

73 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/9

38 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/10

194 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/10

179 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/10

161 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/10

159 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/10

154 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/10

128 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/10

110 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/10

77 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/10

58 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/10

48 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Summary of motifs

Motifs are trimmed based on information content, and presented in descending order by number of supporting seqlets. The motifs are separated by metacluster. The motifs are presented as hCWMs. The forward orientation is defined as the orientation that is richer in purines.

In [9]:
colgroup = vdomh.colgroup(
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "45%"}),
    vdomh.col(style={"width": "45%"})
)
header = vdomh.thead(
    vdomh.tr(
        vdomh.th("#", style={"text-align": "center"}),
        vdomh.th("Seqlets", style={"text-align": "center"}),
        vdomh.th("Forward", style={"text-align": "center"}),
        vdomh.th("Reverse", style={"text-align": "center"})
    )
)

for i in range(len(motif_hcwms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    body = []
    for j in range(len(motif_hcwms[i])):
        motif = motif_hcwms[i][j]
        if np.sum(motif[:, [0, 2]]) > 0.5 * np.sum(motif):
            # Forward is purine-rich, reverse-complement is pyrimidine-rich
            f, rc = motif, np.flip(motif, axis=(0, 1))
        else:
            f, rc = np.flip(motif, axis=(0, 1)), motif
            
        f_fig = viz_sequence.plot_weights(f, figsize=(20, 4), return_fig=True)
        f_fig.tight_layout()
        rc_fig = viz_sequence.plot_weights(rc, figsize=(20, 4), return_fig=True)
        rc_fig.tight_layout()
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (i, j)
            f_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_fwd.png"))
            rc_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_rev.png"))

        body.append(
            vdomh.tr(
                vdomh.td(str(j + 1)),
                vdomh.td(str(num_seqlets[i][j])),
                vdomh.td(figure_to_vdom_image(f_fig)),
                vdomh.td(figure_to_vdom_image(rc_fig))
            )
        )
    display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
    plt.close("all")

Metacluster 1/2

#SeqletsForwardReverse
15470
21583
3816
4731
5613
6109
788
873
938

Metacluster 2/2

#SeqletsForwardReverse
1194
2179
3161
4159
5154
6128
7110
877
958
1048

Top TOMTOM matches for each motif

Here, the TF-MoDISco motifs are plotted as hCWMs, but the TOMTOM matches are shown as PWMs.

In [10]:
num_matches_to_keep = 10
num_matches_to_show = 5

header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Motif ID", style={"text-align": "center"}),
        vdomh.th("q-val", style={"text-align": "center"}),
        vdomh.th("PWM", style={"text-align": "center"})
    )
)

for i in range(len(motif_pfms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    
    # Compute TOMTOM matches for all motifs in the metacluster at once
    out_dir = os.path.join(tfm_motifs_cache_dir, "tomtom", "metacluster_%d" % i) if tfm_motifs_cache_dir else None
    tomtom_matches = match_motifs_to_database(motif_pfms_short[i], top_k=num_matches_to_keep, temp_dir=out_dir)
    
    for j in range(len(motif_pfms[i])):
        display(vdomh.h4("Motif %d/%d" % (j + 1, len(motif_pfms[i]))))
        viz_sequence.plot_weights(motif_hcwms[i][j])
    
        body = []
        for k, (match_name, match_pfm, match_qval) in enumerate(tomtom_matches[j]):
            fig = viz_sequence.plot_weights(pfm_to_pwm(match_pfm), return_fig=True)
            fig.tight_layout()
            if k < num_matches_to_show:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td(figure_to_vdom_image(fig))
                    )
                )
                if tfm_motifs_cache_dir:
                    # Save results and figures
                    motif_id = "%d_%d" % (i, j)
                    fig.savefig(os.path.join(out_dir, motif_id + ("_hit-%d.png" % (k + 1))))
            else:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td("Not shown")
                    )
                )
        if not body:
            display(vdomh.p("No TOMTOM matches passing threshold"))
        else:
            display(vdomh.table(header, vdomh.tbody(*body)))
        plt.close("all")

Metacluster 1/2

Motif 1/9

Motif IDq-valPWM
FOSL2_HUMAN.H11MO.0.A7.887489999999999e-08
JUN_HUMAN.H11MO.0.A2.98982e-07
MA1141.1_FOS::JUND1.12485e-06
MA0099.3_FOS::JUN1.12485e-06
FOSB_HUMAN.H11MO.0.A1.12485e-06
MA1130.1_FOSL2::JUN1.59381e-06Not shown
FOSL1_HUMAN.H11MO.0.A3.21387e-06Not shown
JUND_HUMAN.H11MO.0.A3.4132599999999997e-06Not shown
MA1622.1_Smad2::Smad33.4132599999999997e-06Not shown
MA1128.1_FOSL1::JUN3.4132599999999997e-06Not shown

Motif 2/9

Motif IDq-valPWM
ATF2_HUMAN.H11MO.0.B3.8629e-06
MA1133.1_JUN::JUNB(var.2)3.8629e-06
MA1136.1_FOSB::JUNB(var.2)7.1241999999999995e-06
JDP2_HUMAN.H11MO.0.D7.1241999999999995e-06
MA0605.2_ATF37.1241999999999995e-06
MA0656.1_JDP2(var.2)7.1241999999999995e-06Not shown
MA0840.1_Creb59.82135e-06Not shown
MA1126.1_FOS::JUN(var.2)9.82135e-06Not shown
MA1129.1_FOSL1::JUN(var.2)9.82135e-06Not shown
MA1475.1_CREB3L4(var.2)9.82135e-06Not shown

Motif 3/9

Motif IDq-valPWM
MA1583.1_ZFP570.21591300000000002

Motif 4/9

Motif IDq-valPWM
ATF4_HUMAN.H11MO.0.A0.0006832580000000001
MA0833.2_ATF40.0006832580000000001
MA1636.1_CEBPG(var.2)0.0006832580000000001
BATF_HUMAN.H11MO.1.A0.0006832580000000001
CEBPG_HUMAN.H11MO.0.B0.0006832580000000001
DDIT3_HUMAN.H11MO.0.D0.00103328Not shown
MA0837.1_CEBPE0.00256871Not shown
MA1143.1_FOSL1::JUND(var.2)0.00256871Not shown
MA0466.2_CEBPB0.00258782Not shown
CEBPD_HUMAN.H11MO.0.C0.00337831Not shown

Motif 5/9

Motif IDq-valPWM
NRL_HUMAN.H11MO.0.D0.14790999999999999
FOSL2_HUMAN.H11MO.0.A0.14790999999999999
FOSL1_HUMAN.H11MO.0.A0.14790999999999999
MA1138.1_FOSL2::JUNB0.14790999999999999
MA0841.1_NFE20.14790999999999999
MA1144.1_FOSL2::JUND0.14790999999999999Not shown
MA1128.1_FOSL1::JUN0.14790999999999999Not shown
MA1141.1_FOS::JUND0.14790999999999999Not shown
JUND_HUMAN.H11MO.0.A0.14790999999999999Not shown
MA1634.1_BATF0.14790999999999999Not shown

Motif 6/9

Motif IDq-valPWM
CPEB1_HUMAN.H11MO.0.D3.09672e-05
PRDM6_HUMAN.H11MO.0.C0.026898000000000002
MA1125.1_ZNF3840.026898000000000002
FOXL1_HUMAN.H11MO.0.D0.031428
FOXG1_HUMAN.H11MO.0.D0.06277909999999999
ANDR_HUMAN.H11MO.0.A0.12376600000000001Not shown
MA0679.2_ONECUT10.12376600000000001Not shown
FOXJ3_HUMAN.H11MO.0.A0.12376600000000001Not shown
ONEC2_HUMAN.H11MO.0.D0.15349300000000002Not shown
HXC10_HUMAN.H11MO.0.D0.156381Not shown

Motif 7/9

Motif IDq-valPWM
MA0478.1_FOSL20.00144856
MA0655.1_JDP20.0019519000000000001
MA0841.1_NFE20.0019519000000000001
MA1142.1_FOSL1::JUND0.0019519000000000001
FOSB_HUMAN.H11MO.0.A0.0019519000000000001
MA0476.1_FOS0.0019519000000000001Not shown
MA0591.1_Bach1::Mafk0.00195733Not shown
NFE2_HUMAN.H11MO.0.A0.00305984Not shown
BACH1_HUMAN.H11MO.0.A0.00305984Not shown
MA1134.1_FOS::JUNB0.00348985Not shown

Motif 8/9

Motif IDq-valPWM
USF2_HUMAN.H11MO.0.A0.000624165
SP2_HUMAN.H11MO.0.A0.006150899999999999
SP1_HUMAN.H11MO.0.A0.0453375
MA1596.1_ZNF4600.0453375
SP3_HUMAN.H11MO.0.B0.0453375
MA1583.1_ZFP570.0453375Not shown
MA0814.2_TFAP2C(var.2)0.0453375Not shown
MA0146.2_Zfx0.07436210000000001Not shown
THAP1_HUMAN.H11MO.0.C0.07436210000000001Not shown
MA1615.1_Plagl10.07436210000000001Not shown

Motif 9/9

Motif IDq-valPWM
MA0052.4_MEF2A0.0935192
MEF2B_HUMAN.H11MO.0.A0.230875
MA0773.1_MEF2D0.230875
MEF2D_HUMAN.H11MO.0.A0.230875
MA0679.2_ONECUT10.230875
HXD13_HUMAN.H11MO.0.D0.230875Not shown
HMGA1_HUMAN.H11MO.0.D0.24065599999999998Not shown
PRDM6_HUMAN.H11MO.0.C0.24065599999999998Not shown
FOXJ3_HUMAN.H11MO.0.A0.278904Not shown
MEF2A_HUMAN.H11MO.0.A0.278904Not shown

Metacluster 2/2

Motif 1/10

Motif IDq-valPWM
ZFP82_HUMAN.H11MO.0.C0.41952399999999995

Motif 2/10

No TOMTOM matches passing threshold

Motif 3/10

No TOMTOM matches passing threshold

Motif 4/10

No TOMTOM matches passing threshold

Motif 5/10

No TOMTOM matches passing threshold

Motif 6/10

No TOMTOM matches passing threshold

Motif 7/10

Motif IDq-valPWM
RFX5_HUMAN.H11MO.1.A0.205765
MA0744.2_SCRT20.205765
SCRT2_HUMAN.H11MO.0.D0.250596
PTF1A_HUMAN.H11MO.1.B0.250596
ITF2_HUMAN.H11MO.0.C0.250596
RFX5_HUMAN.H11MO.0.A0.250596Not shown
HTF4_HUMAN.H11MO.0.A0.256019Not shown
ASCL2_HUMAN.H11MO.0.D0.263938Not shown
ZIM3_HUMAN.H11MO.0.C0.263938Not shown
MYF6_HUMAN.H11MO.0.C0.273402Not shown

Motif 8/10

Motif IDq-valPWM
MA0591.1_Bach1::Mafk7.49995e-05
MA0501.1_MAF::NFE27.49995e-05
NF2L2_HUMAN.H11MO.0.A7.49995e-05
MA1622.1_Smad2::Smad37.49995e-05
MA1141.1_FOS::JUND7.49995e-05
MA0150.2_Nfe2l27.49995e-05Not shown
MA1130.1_FOSL2::JUN7.49995e-05Not shown
FOSL2_HUMAN.H11MO.0.A7.49995e-05Not shown
JUN_HUMAN.H11MO.0.A7.49995e-05Not shown
MA0099.3_FOS::JUN7.49995e-05Not shown

Motif 9/10

No TOMTOM matches passing threshold

Motif 10/10

Motif IDq-valPWM
RFX1_HUMAN.H11MO.0.B0.00140421
SP2_HUMAN.H11MO.0.A0.00912286
MA1650.1_ZBTB140.00912286
SP1_HUMAN.H11MO.0.A0.0144494
USF2_HUMAN.H11MO.0.A0.0144494
SP3_HUMAN.H11MO.0.B0.019199200000000003Not shown
ATF6A_HUMAN.H11MO.0.B0.046436Not shown
ZBT7B_HUMAN.H11MO.0.D0.046436Not shown
KLF3_HUMAN.H11MO.0.B0.05209640000000001Not shown
MA1615.1_Plagl10.05209640000000001Not shown