In [1]:
import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, pfm_to_pwm, trim_motif_by_ic
from motif.match_motifs import match_motifs_to_database
from util import figure_to_vdom_image
import plot.viz_sequence as viz_sequence
import numpy as np
import h5py
import matplotlib.pyplot as plt
import vdom.helpers as vdomh
from IPython.display import display

Define constants and paths

In [2]:
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_MOTIF_CACHE" in os.environ:
    tfm_motifs_cache_dir = os.environ["TFM_MOTIF_CACHE"]
else:
    tfm_motifs_cache_dir = None

print("TF name: %s" % tf_name)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved TF-MoDISco-derived motifs cache: %s" % tfm_motifs_cache_dir)
TF name: E2F6
DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/multitask_profile/E2F6_multitask_profile_fold7/E2F6_multitask_profile_fold7_imp_scores.h5
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/multitask_profile/E2F6_multitask_profile_fold7/E2F6_multitask_profile_fold7_profile_tfm.h5
Importance score key: profile_hyp_scores
Saved TF-MoDISco-derived motifs cache: /users/amtseng/tfmodisco/results/reports/tfmodisco_results//cache/multitask_profile/E2F6_multitask_profile_fold7/E2F6_multitask_profile_fold7_profile
In [3]:
# Define paths and constants
input_length = 2114
shap_score_center_size = 400
In [4]:
if tfm_motifs_cache_dir:
    os.makedirs(tfm_motifs_cache_dir, exist_ok=True)

Import SHAP scores and TF-MoDISco results

In [5]:
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 52/52 [00:25<00:00,  2.01it/s]
In [6]:
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

Plot some SHAP score tracks

Plot the central region of some randomly selected actual importance scores

In [7]:
plot_slice = slice(int(shap_score_center_size / 4), int(3 * shap_score_center_size / 4))
for index in np.random.choice(hyp_scores.shape[0], size=5, replace=False):
    viz_sequence.plot_weights((hyp_scores[index] * one_hot_seqs[index])[plot_slice], subticks_frequency=100)

Plot TF-MoDISco results

Plot all motifs by metacluster

In [8]:
motif_pfms, motif_hcwms, motif_cwms = [], [], []  # Save the trimmed PFMs, hCWMs, and CWMs
motif_pfms_short = []  # PFMs that are even more trimmed (for TOMTOM)
num_seqlets = []  # Number of seqlets for each motif
motif_seqlets = []  # Save seqlets of each motif
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
if tfm_motifs_cache_dir:
    motif_hdf5 = h5py.File(os.path.join(tfm_motifs_cache_dir, "all_motifs.h5"), "w")
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    motif_pfms.append([])
    motif_hcwms.append([])
    motif_cwms.append([])
    motif_pfms_short.append([])
    num_seqlets.append([])
    motif_seqlets.append([])
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
        display(vdomh.p("%d seqlets" % len(seqlets)))
        
        pfm = pattern["sequence"].fwd
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        cwm = pattern["task0_contrib_scores"].fwd
        
        pfm_fig = viz_sequence.plot_weights(pfm, subticks_frequency=10, return_fig=True)
        hcwm_fig = viz_sequence.plot_weights(hcwm, subticks_frequency=10, return_fig=True)
        cwm_fig = viz_sequence.plot_weights(cwm, subticks_frequency=10, return_fig=True)
        pfm_fig.tight_layout()
        hcwm_fig.tight_layout()
        cwm_fig.tight_layout()
        
        motif_table = vdomh.table(
            vdomh.tr(
                vdomh.td("Sequence (PFM)"),
                vdomh.td(figure_to_vdom_image(pfm_fig))
            ),
            vdomh.tr(
                vdomh.td("Hypothetical contributions (hCWM)"),
                vdomh.td(figure_to_vdom_image(hcwm_fig))
            ),
            vdomh.tr(
                vdomh.td("Actual contributions (CWM)"),
                vdomh.td(figure_to_vdom_image(cwm_fig))
            )
        )
        display(motif_table)
        plt.close("all")  # Remove all standing figures
        
        # Trim motif based on information content
        short_trimmed_pfm = trim_motif_by_ic(pfm, pfm)
        motif_pfms_short[-1].append(short_trimmed_pfm)
        
        # Expand trimming to +/- 4bp on either side
        trimmed_pfm = trim_motif_by_ic(pfm, pfm, pad=4)
        trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
        trimmed_cwm = trim_motif_by_ic(pfm, cwm, pad=4)
        
        motif_pfms[-1].append(trimmed_pfm)
        motif_hcwms[-1].append(trimmed_hcwm)
        motif_cwms[-1].append(trimmed_cwm)
        
        num_seqlets[-1].append(len(seqlets))
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (metacluster_i, pattern_i)
            pfm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_pfm_full.png"))
            hcwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_full.png"))
            cwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_cwm_full.png"))
            motif_dset = motif_hdf5.create_group(motif_id)
            motif_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
            motif_dset.create_dataset("pfm_trimmed", data=trimmed_pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_trimmed", data=trimmed_cwm, compression="gzip")
            motif_dset.create_dataset("pfm_short_trimmed", data=short_trimmed_pfm, compression="gzip")

Metacluster 1/2

Pattern 1/11

4840 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/11

3012 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/11

2107 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/11

1005 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/11

644 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/11

304 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/11

221 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/11

83 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/11

42 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/11

32 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/11

30 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/1

41 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Summary of motifs

Motifs are trimmed based on information content, and presented in descending order by number of supporting seqlets. The motifs are separated by metacluster. The motifs are presented as hCWMs. The forward orientation is defined as the orientation that is richer in purines.

In [9]:
colgroup = vdomh.colgroup(
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "45%"}),
    vdomh.col(style={"width": "45%"})
)
header = vdomh.thead(
    vdomh.tr(
        vdomh.th("#", style={"text-align": "center"}),
        vdomh.th("Seqlets", style={"text-align": "center"}),
        vdomh.th("Forward", style={"text-align": "center"}),
        vdomh.th("Reverse", style={"text-align": "center"})
    )
)

for i in range(len(motif_hcwms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    body = []
    for j in range(len(motif_hcwms[i])):
        motif = motif_hcwms[i][j]
        if np.sum(motif[:, [0, 2]]) > 0.5 * np.sum(motif):
            # Forward is purine-rich, reverse-complement is pyrimidine-rich
            f, rc = motif, np.flip(motif, axis=(0, 1))
        else:
            f, rc = np.flip(motif, axis=(0, 1)), motif
            
        f_fig = viz_sequence.plot_weights(f, figsize=(20, 4), return_fig=True)
        f_fig.tight_layout()
        rc_fig = viz_sequence.plot_weights(rc, figsize=(20, 4), return_fig=True)
        rc_fig.tight_layout()
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (i, j)
            f_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_fwd.png"))
            rc_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_rev.png"))

        body.append(
            vdomh.tr(
                vdomh.td(str(j + 1)),
                vdomh.td(str(num_seqlets[i][j])),
                vdomh.td(figure_to_vdom_image(f_fig)),
                vdomh.td(figure_to_vdom_image(rc_fig))
            )
        )
    display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
    plt.close("all")

Metacluster 1/2

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)
#SeqletsForwardReverse
14840
23012
32107
41005
5644
6304
7221
883
942
1032
1130

Metacluster 2/2

#SeqletsForwardReverse
141

Top TOMTOM matches for each motif

Here, the TF-MoDISco motifs are plotted as hCWMs, but the TOMTOM matches are shown as PWMs.

In [10]:
num_matches_to_keep = 10
num_matches_to_show = 5

header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Motif ID", style={"text-align": "center"}),
        vdomh.th("q-val", style={"text-align": "center"}),
        vdomh.th("PWM", style={"text-align": "center"})
    )
)

for i in range(len(motif_pfms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    
    # Compute TOMTOM matches for all motifs in the metacluster at once
    out_dir = os.path.join(tfm_motifs_cache_dir, "tomtom", "metacluster_%d" % i) if tfm_motifs_cache_dir else None
    tomtom_matches = match_motifs_to_database(motif_pfms_short[i], top_k=num_matches_to_keep, temp_dir=out_dir)
    
    for j in range(len(motif_pfms[i])):
        display(vdomh.h4("Motif %d/%d" % (j + 1, len(motif_pfms[i]))))
        viz_sequence.plot_weights(motif_hcwms[i][j])
    
        body = []
        for k, (match_name, match_pfm, match_qval) in enumerate(tomtom_matches[j]):
            fig = viz_sequence.plot_weights(pfm_to_pwm(match_pfm), return_fig=True)
            fig.tight_layout()
            if k < num_matches_to_show:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td(figure_to_vdom_image(fig))
                    )
                )
                if tfm_motifs_cache_dir:
                    # Save results and figures
                    motif_id = "%d_%d" % (i, j)
                    fig.savefig(os.path.join(out_dir, motif_id + ("_hit-%d.png" % (k + 1))))
            else:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td("Not shown")
                    )
                )
        if not body:
            display(vdomh.p("No TOMTOM matches passing threshold"))
        else:
            display(vdomh.table(header, vdomh.tbody(*body)))
        plt.close("all")

Metacluster 1/2

Motif 1/11

Motif IDq-valPWM
BMAL1_HUMAN.H11MO.0.A0.0007496089999999999
MYCN_HUMAN.H11MO.0.A0.00155068
MA0059.1_MAX::MYC0.00233009
MA0871.2_TFEC0.00259921
MA0626.1_Npas20.0032025
MXI1_HUMAN.H11MO.0.A0.00439662Not shown
MYC_HUMAN.H11MO.0.A0.00515132Not shown
MA0104.4_MYCN0.00517552Not shown
MA0147.3_MYC0.00517552Not shown
MA0825.1_MNT0.00520133Not shown

Motif 2/11

Motif IDq-valPWM
MA0471.2_E2F64.5234e-06
E2F1_HUMAN.H11MO.0.A4.5234e-06
TFDP1_HUMAN.H11MO.0.C9.86428e-06
E2F4_HUMAN.H11MO.0.A1.64326e-05
E2F3_HUMAN.H11MO.0.A1.79876e-05
E2F6_HUMAN.H11MO.0.A2.49827e-05Not shown
E2F7_HUMAN.H11MO.0.B0.00013114Not shown
MA1122.1_TFDP10.00111357Not shown
E2F4_HUMAN.H11MO.1.A0.017175700000000002Not shown
MA0865.1_E2F80.0210164Not shown

Motif 3/11

Motif IDq-valPWM
MAX_HUMAN.H11MO.0.A0.19447
MXI1_HUMAN.H11MO.1.A0.19447
MA0058.3_MAX0.19447
MA0147.3_MYC0.19447
MYC_HUMAN.H11MO.0.A0.19447
MA0059.1_MAX::MYC0.20777600000000002Not shown
MA0825.1_MNT0.241932Not shown
MXI1_HUMAN.H11MO.0.A0.253798Not shown
MYCN_HUMAN.H11MO.0.A0.288329Not shown
HEY2_HUMAN.H11MO.0.D0.288329Not shown

Motif 4/11

No TOMTOM matches passing threshold

Motif 5/11

No TOMTOM matches passing threshold

Motif 6/11

Motif IDq-valPWM
CTCF_HUMAN.H11MO.0.A3.01238e-08
CTCFL_HUMAN.H11MO.0.A8.18849e-08
MA0139.1_CTCF8.18849e-08
MA1102.2_CTCFL7.73426e-06
SNAI1_HUMAN.H11MO.0.C0.050740600000000004
MA0830.2_TCF40.0623899Not shown
MA1648.1_TCF12(var.2)0.0623899Not shown
MA0147.3_MYC0.0952242Not shown
PLAL1_HUMAN.H11MO.0.D0.09756519999999999Not shown
MYC_HUMAN.H11MO.0.A0.107848Not shown

Motif 7/11

Motif IDq-valPWM
USF2_HUMAN.H11MO.0.A0.0101859
SP2_HUMAN.H11MO.0.A0.011563200000000001
MXI1_HUMAN.H11MO.0.A0.014845500000000001
MA1650.1_ZBTB140.014845500000000001
SP3_HUMAN.H11MO.0.B0.024984299999999997
SP1_HUMAN.H11MO.0.A0.027075400000000003Not shown
KLF3_HUMAN.H11MO.0.B0.0654722Not shown
AP2D_HUMAN.H11MO.0.D0.0654722Not shown
MA0697.1_ZIC30.113449Not shown
SP1_HUMAN.H11MO.1.A0.135812Not shown

Motif 8/11

Motif IDq-valPWM
CPEB1_HUMAN.H11MO.0.D2.87748e-05
MA1125.1_ZNF3840.0251107
PRDM6_HUMAN.H11MO.0.C0.0251107
FOXL1_HUMAN.H11MO.0.D0.051308900000000005
FOXG1_HUMAN.H11MO.0.D0.0666732
HXC10_HUMAN.H11MO.0.D0.122494Not shown
MA0679.2_ONECUT10.12269000000000001Not shown
FOXJ3_HUMAN.H11MO.0.A0.128753Not shown
ANDR_HUMAN.H11MO.0.A0.128753Not shown
ZFP28_HUMAN.H11MO.0.C0.151837Not shown

Motif 9/11

Motif IDq-valPWM
MA1122.1_TFDP10.0106468
E2F3_HUMAN.H11MO.0.A0.0106468
E2F1_HUMAN.H11MO.0.A0.04851880000000001
E2F4_HUMAN.H11MO.0.A0.04851880000000001
SP3_HUMAN.H11MO.0.B0.04851880000000001
SP1_HUMAN.H11MO.0.A0.04851880000000001Not shown
MA0471.2_E2F60.04851880000000001Not shown
MA0865.1_E2F80.04851880000000001Not shown
E2F2_HUMAN.H11MO.0.B0.04851880000000001Not shown
E2F6_HUMAN.H11MO.0.A0.0566269Not shown

Motif 10/11

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A0.0927346
SP1_HUMAN.H11MO.1.A0.0927346
SP3_HUMAN.H11MO.0.B0.100454
AP2D_HUMAN.H11MO.0.D0.269883
SP1_HUMAN.H11MO.0.A0.31449699999999997
MA1513.1_KLF150.31449699999999997Not shown
MA1102.2_CTCFL0.31449699999999997Not shown
MXI1_HUMAN.H11MO.0.A0.31449699999999997Not shown
ZFX_HUMAN.H11MO.1.A0.31449699999999997Not shown
MA1584.1_ZIC50.31590300000000004Not shown

Motif 11/11

Motif IDq-valPWM
MA0052.4_MEF2A0.104447
HXD13_HUMAN.H11MO.0.D0.22539499999999998
MEF2B_HUMAN.H11MO.0.A0.22539499999999998
MA0679.2_ONECUT10.22539499999999998
MA0773.1_MEF2D0.229519
MEF2D_HUMAN.H11MO.0.A0.229519Not shown
PRDM6_HUMAN.H11MO.0.C0.229519Not shown
FOXJ3_HUMAN.H11MO.0.A0.24708400000000003Not shown
HMGA1_HUMAN.H11MO.0.D0.259774Not shown
CPEB1_HUMAN.H11MO.0.D0.306438Not shown

Metacluster 2/2

Motif 1/1

Motif IDq-valPWM
MA0147.3_MYC0.0024031999999999994
MA0104.4_MYCN0.0024031999999999994
MA0616.2_HES20.00328682
MA0823.1_HEY10.00342112
MA1099.2_HES10.00342112
MA0004.1_Arnt0.00342112Not shown
MA0649.1_HEY20.00347819Not shown
MA0830.2_TCF40.00413472Not shown
CR3L1_HUMAN.H11MO.0.D0.00413472Not shown
HES7_HUMAN.H11MO.0.D0.00485996Not shown