In [1]:
import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, pfm_to_pwm, trim_motif_by_ic
from motif.match_motifs import match_motifs_to_database
from util import figure_to_vdom_image
import plot.viz_sequence as viz_sequence
import numpy as np
import h5py
import matplotlib.pyplot as plt
import vdom.helpers as vdomh
from IPython.display import display

Define constants and paths

In [2]:
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_MOTIF_CACHE" in os.environ:
    tfm_motifs_cache_dir = os.environ["TFM_MOTIF_CACHE"]
else:
    tfm_motifs_cache_dir = None

print("TF name: %s" % tf_name)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved TF-MoDISco-derived motifs cache: %s" % tfm_motifs_cache_dir)
TF name: MAX
DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/multitask_profile/MAX_multitask_profile_fold10/MAX_multitask_profile_fold10_imp_scores.h5
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/multitask_profile/MAX_multitask_profile_fold10/MAX_multitask_profile_fold10_profile_tfm.h5
Importance score key: profile_hyp_scores
Saved TF-MoDISco-derived motifs cache: /users/amtseng/tfmodisco/results/reports/tfmodisco_results//cache/multitask_profile/MAX_multitask_profile_fold10/MAX_multitask_profile_fold10_profile
In [3]:
# Define paths and constants
input_length = 2114
shap_score_center_size = 400
In [4]:
if tfm_motifs_cache_dir:
    os.makedirs(tfm_motifs_cache_dir, exist_ok=True)

Import SHAP scores and TF-MoDISco results

In [5]:
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 204/204 [01:56<00:00,  1.75it/s]
In [6]:
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

Plot some SHAP score tracks

Plot the central region of some randomly selected actual importance scores

In [7]:
plot_slice = slice(int(shap_score_center_size / 4), int(3 * shap_score_center_size / 4))
for index in np.random.choice(hyp_scores.shape[0], size=5, replace=False):
    viz_sequence.plot_weights((hyp_scores[index] * one_hot_seqs[index])[plot_slice], subticks_frequency=100)

Plot TF-MoDISco results

Plot all motifs by metacluster

In [8]:
motif_pfms, motif_hcwms, motif_cwms = [], [], []  # Save the trimmed PFMs, hCWMs, and CWMs
motif_pfms_short = []  # PFMs that are even more trimmed (for TOMTOM)
num_seqlets = []  # Number of seqlets for each motif
motif_seqlets = []  # Save seqlets of each motif
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
if tfm_motifs_cache_dir:
    motif_hdf5 = h5py.File(os.path.join(tfm_motifs_cache_dir, "all_motifs.h5"), "w")
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    motif_pfms.append([])
    motif_hcwms.append([])
    motif_cwms.append([])
    motif_pfms_short.append([])
    num_seqlets.append([])
    motif_seqlets.append([])
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
        display(vdomh.p("%d seqlets" % len(seqlets)))
        
        pfm = pattern["sequence"].fwd
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        cwm = pattern["task0_contrib_scores"].fwd
        
        pfm_fig = viz_sequence.plot_weights(pfm, subticks_frequency=10, return_fig=True)
        hcwm_fig = viz_sequence.plot_weights(hcwm, subticks_frequency=10, return_fig=True)
        cwm_fig = viz_sequence.plot_weights(cwm, subticks_frequency=10, return_fig=True)
        pfm_fig.tight_layout()
        hcwm_fig.tight_layout()
        cwm_fig.tight_layout()
        
        motif_table = vdomh.table(
            vdomh.tr(
                vdomh.td("Sequence (PFM)"),
                vdomh.td(figure_to_vdom_image(pfm_fig))
            ),
            vdomh.tr(
                vdomh.td("Hypothetical contributions (hCWM)"),
                vdomh.td(figure_to_vdom_image(hcwm_fig))
            ),
            vdomh.tr(
                vdomh.td("Actual contributions (CWM)"),
                vdomh.td(figure_to_vdom_image(cwm_fig))
            )
        )
        display(motif_table)
        plt.close("all")  # Remove all standing figures
        
        # Trim motif based on information content
        short_trimmed_pfm = trim_motif_by_ic(pfm, pfm)
        motif_pfms_short[-1].append(short_trimmed_pfm)
        
        # Expand trimming to +/- 4bp on either side
        trimmed_pfm = trim_motif_by_ic(pfm, pfm, pad=4)
        trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
        trimmed_cwm = trim_motif_by_ic(pfm, cwm, pad=4)
        
        motif_pfms[-1].append(trimmed_pfm)
        motif_hcwms[-1].append(trimmed_hcwm)
        motif_cwms[-1].append(trimmed_cwm)
        
        num_seqlets[-1].append(len(seqlets))
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (metacluster_i, pattern_i)
            pfm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_pfm_full.png"))
            hcwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_full.png"))
            cwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_cwm_full.png"))
            motif_dset = motif_hdf5.create_group(motif_id)
            motif_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
            motif_dset.create_dataset("pfm_trimmed", data=trimmed_pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_trimmed", data=trimmed_cwm, compression="gzip")
            motif_dset.create_dataset("pfm_short_trimmed", data=short_trimmed_pfm, compression="gzip")

Metacluster 1/2

Pattern 1/9

7868 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/9

2378 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/9

1721 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/9

339 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/9

265 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/9

208 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/9

50 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/9

49 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/9

42 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Summary of motifs

Motifs are trimmed based on information content, and presented in descending order by number of supporting seqlets. The motifs are separated by metacluster. The motifs are presented as hCWMs. The forward orientation is defined as the orientation that is richer in purines.

In [9]:
colgroup = vdomh.colgroup(
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "45%"}),
    vdomh.col(style={"width": "45%"})
)
header = vdomh.thead(
    vdomh.tr(
        vdomh.th("#", style={"text-align": "center"}),
        vdomh.th("Seqlets", style={"text-align": "center"}),
        vdomh.th("Forward", style={"text-align": "center"}),
        vdomh.th("Reverse", style={"text-align": "center"})
    )
)

for i in range(len(motif_hcwms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    body = []
    for j in range(len(motif_hcwms[i])):
        motif = motif_hcwms[i][j]
        if np.sum(motif[:, [0, 2]]) > 0.5 * np.sum(motif):
            # Forward is purine-rich, reverse-complement is pyrimidine-rich
            f, rc = motif, np.flip(motif, axis=(0, 1))
        else:
            f, rc = np.flip(motif, axis=(0, 1)), motif
            
        f_fig = viz_sequence.plot_weights(f, figsize=(20, 4), return_fig=True)
        f_fig.tight_layout()
        rc_fig = viz_sequence.plot_weights(rc, figsize=(20, 4), return_fig=True)
        rc_fig.tight_layout()
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (i, j)
            f_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_fwd.png"))
            rc_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_rev.png"))

        body.append(
            vdomh.tr(
                vdomh.td(str(j + 1)),
                vdomh.td(str(num_seqlets[i][j])),
                vdomh.td(figure_to_vdom_image(f_fig)),
                vdomh.td(figure_to_vdom_image(rc_fig))
            )
        )
    display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
    plt.close("all")

Metacluster 1/2

#SeqletsForwardReverse
17868
22378
31721
4339
5265
6208
750
849
942

Top TOMTOM matches for each motif

Here, the TF-MoDISco motifs are plotted as hCWMs, but the TOMTOM matches are shown as PWMs.

In [10]:
num_matches_to_keep = 10
num_matches_to_show = 5

header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Motif ID", style={"text-align": "center"}),
        vdomh.th("q-val", style={"text-align": "center"}),
        vdomh.th("PWM", style={"text-align": "center"})
    )
)

for i in range(len(motif_pfms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    
    # Compute TOMTOM matches for all motifs in the metacluster at once
    out_dir = os.path.join(tfm_motifs_cache_dir, "tomtom", "metacluster_%d" % i) if tfm_motifs_cache_dir else None
    tomtom_matches = match_motifs_to_database(motif_pfms_short[i], top_k=num_matches_to_keep, temp_dir=out_dir)
    
    for j in range(len(motif_pfms[i])):
        display(vdomh.h4("Motif %d/%d" % (j + 1, len(motif_pfms[i]))))
        viz_sequence.plot_weights(motif_hcwms[i][j])
    
        body = []
        for k, (match_name, match_pfm, match_qval) in enumerate(tomtom_matches[j]):
            fig = viz_sequence.plot_weights(pfm_to_pwm(match_pfm), return_fig=True)
            fig.tight_layout()
            if k < num_matches_to_show:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td(figure_to_vdom_image(fig))
                    )
                )
                if tfm_motifs_cache_dir:
                    # Save results and figures
                    motif_id = "%d_%d" % (i, j)
                    fig.savefig(os.path.join(out_dir, motif_id + ("_hit-%d.png" % (k + 1))))
            else:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td("Not shown")
                    )
                )
        if not body:
            display(vdomh.p("No TOMTOM matches passing threshold"))
        else:
            display(vdomh.table(header, vdomh.tbody(*body)))
        plt.close("all")

Metacluster 1/2

Motif 1/9

Motif IDq-valPWM
MYC_HUMAN.H11MO.0.A0.0123172
MA0664.1_MLXIPL0.0123172
MA0825.1_MNT0.0123172
MA1493.1_HES60.0123172
TFEB_HUMAN.H11MO.0.C0.0123172
MA0620.3_MITF0.0123172Not shown
MA0871.2_TFEC0.0123172Not shown
MA0464.2_BHLHE400.0123172Not shown
MA0636.1_BHLHE410.0123172Not shown
MA0649.1_HEY20.0123172Not shown

Motif 2/9

Motif IDq-valPWM
MA0058.3_MAX0.188876
MAX_HUMAN.H11MO.0.A0.188876
MA1099.2_HES10.188876
MXI1_HUMAN.H11MO.1.A0.188876
MA0825.1_MNT0.188876
MA0059.1_MAX::MYC0.188876Not shown
MA0147.3_MYC0.188876Not shown
MA0632.2_TCFL50.188876Not shown
MYC_HUMAN.H11MO.0.A0.188876Not shown
MA0506.1_NRF10.193178Not shown

Motif 3/9

Motif IDq-valPWM
CTCF_HUMAN.H11MO.0.A1.10382e-09
CTCFL_HUMAN.H11MO.0.A1.4139e-08
MA0139.1_CTCF1.06996e-07
MA1102.2_CTCFL2.05783e-06
AP2B_HUMAN.H11MO.0.B0.130163
PLAL1_HUMAN.H11MO.0.D0.130163Not shown
KLF8_HUMAN.H11MO.0.C0.130163Not shown
MA1568.1_TCF21(var.2)0.130163Not shown
MA0155.1_INSM10.130163Not shown
MA0830.2_TCF40.130163Not shown

Motif 4/9

Motif IDq-valPWM
CTCF_HUMAN.H11MO.0.A4.90624e-06
MA0139.1_CTCF4.90624e-06
CTCFL_HUMAN.H11MO.0.A1.99251e-05
MA1102.2_CTCFL0.00111335
SNAI1_HUMAN.H11MO.0.C0.010386
MA1648.1_TCF12(var.2)0.0365168Not shown
BHA15_HUMAN.H11MO.0.B0.0437118Not shown
MA0830.2_TCF40.0444948Not shown
MYC_HUMAN.H11MO.0.A0.0444948Not shown
ASCL2_HUMAN.H11MO.0.D0.0662036Not shown

Motif 5/9

Motif IDq-valPWM
MA0139.1_CTCF1.58035e-06
CTCF_HUMAN.H11MO.0.A2.20732e-06
CTCFL_HUMAN.H11MO.0.A8.78965e-05
SNAI1_HUMAN.H11MO.0.C0.058390199999999996
MA1568.1_TCF21(var.2)0.058390199999999996
MA1638.1_HAND20.058390199999999996Not shown
MA1102.2_CTCFL0.058390199999999996Not shown
MYC_HUMAN.H11MO.0.A0.420523Not shown
MA1648.1_TCF12(var.2)0.420523Not shown
MA0830.2_TCF40.439561Not shown

Motif 6/9

Motif IDq-valPWM
CTCF_HUMAN.H11MO.0.A0.00938614
MA0139.1_CTCF0.0139708
CTCFL_HUMAN.H11MO.0.A0.028885900000000003
MXI1_HUMAN.H11MO.0.A0.0467911
MA0104.4_MYCN0.0467911
MYCN_HUMAN.H11MO.0.A0.0467911Not shown
MYC_HUMAN.H11MO.0.A0.0467911Not shown
SNAI1_HUMAN.H11MO.0.C0.0467911Not shown
MA0058.3_MAX0.0467911Not shown
MA1568.1_TCF21(var.2)0.0467911Not shown

Motif 7/9

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A1.0377599999999999e-07
SP1_HUMAN.H11MO.0.A1.1140200000000001e-07
SP3_HUMAN.H11MO.0.B1.1140200000000001e-07
TBX15_HUMAN.H11MO.0.D1.1140200000000001e-07
PATZ1_HUMAN.H11MO.0.C1.1140200000000001e-07
KLF16_HUMAN.H11MO.0.D2.50399e-07Not shown
MAZ_HUMAN.H11MO.0.A1.05792e-06Not shown
ZN467_HUMAN.H11MO.0.C1.61237e-06Not shown
WT1_HUMAN.H11MO.0.C1.61237e-06Not shown
VEZF1_HUMAN.H11MO.0.C5.99192e-06Not shown

Motif 8/9

Motif IDq-valPWM
ZN770_HUMAN.H11MO.0.C8.26636e-10
MA1596.1_ZNF4601.17545e-07
ZN770_HUMAN.H11MO.1.C0.000146462
MA1587.1_ZNF1350.0191885
ZSC22_HUMAN.H11MO.0.C0.05834640000000001
ZFX_HUMAN.H11MO.1.A0.155917Not shown
PITX2_HUMAN.H11MO.0.D0.17566199999999998Not shown
ZBT17_HUMAN.H11MO.0.A0.18099400000000002Not shown
MA0812.1_TFAP2B(var.2)0.19186Not shown
ZN263_HUMAN.H11MO.0.A0.201295Not shown

Motif 9/9

Motif IDq-valPWM
CTCFL_HUMAN.H11MO.0.A1.1743e-05
CTCF_HUMAN.H11MO.0.A0.000281984
MA1102.2_CTCFL0.000336518
MA0139.1_CTCF0.000871424
MA1548.1_PLAGL20.121695
SP2_HUMAN.H11MO.1.B0.14948Not shown
MA0830.2_TCF40.14948Not shown
SP2_HUMAN.H11MO.0.A0.14948Not shown
SP1_HUMAN.H11MO.1.A0.14948Not shown
BHA15_HUMAN.H11MO.0.B0.14948Not shown