In [1]:
import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, pfm_to_pwm, trim_motif_by_ic
from motif.match_motifs import match_motifs_to_database
from util import figure_to_vdom_image
import plot.viz_sequence as viz_sequence
import numpy as np
import h5py
import matplotlib.pyplot as plt
import vdom.helpers as vdomh
from IPython.display import display

Define constants and paths

In [2]:
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_MOTIF_CACHE" in os.environ:
    tfm_motifs_cache_dir = os.environ["TFM_MOTIF_CACHE"]
else:
    tfm_motifs_cache_dir = None

print("TF name: %s" % tf_name)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved TF-MoDISco-derived motifs cache: %s" % tfm_motifs_cache_dir)
TF name: REST
DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/multitask_profile/REST_multitask_profile_fold1/REST_multitask_profile_fold1_imp_scores.h5
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/multitask_profile/REST_multitask_profile_fold1/REST_multitask_profile_fold1_count_tfm.h5
Importance score key: count_hyp_scores
Saved TF-MoDISco-derived motifs cache: /users/amtseng/tfmodisco/results/reports/tfmodisco_results//cache/multitask_profile/REST_multitask_profile_fold1/REST_multitask_profile_fold1_count
In [3]:
# Define paths and constants
input_length = 2114
shap_score_center_size = 400
In [4]:
if tfm_motifs_cache_dir:
    os.makedirs(tfm_motifs_cache_dir, exist_ok=True)

Import SHAP scores and TF-MoDISco results

In [5]:
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 287/287 [03:08<00:00,  1.52it/s]
In [6]:
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

Plot some SHAP score tracks

Plot the central region of some randomly selected actual importance scores

In [7]:
plot_slice = slice(int(shap_score_center_size / 4), int(3 * shap_score_center_size / 4))
for index in np.random.choice(hyp_scores.shape[0], size=5, replace=False):
    viz_sequence.plot_weights((hyp_scores[index] * one_hot_seqs[index])[plot_slice], subticks_frequency=100)

Plot TF-MoDISco results

Plot all motifs by metacluster

In [8]:
motif_pfms, motif_hcwms, motif_cwms = [], [], []  # Save the trimmed PFMs, hCWMs, and CWMs
motif_pfms_short = []  # PFMs that are even more trimmed (for TOMTOM)
num_seqlets = []  # Number of seqlets for each motif
motif_seqlets = []  # Save seqlets of each motif
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
if tfm_motifs_cache_dir:
    motif_hdf5 = h5py.File(os.path.join(tfm_motifs_cache_dir, "all_motifs.h5"), "w")
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    motif_pfms.append([])
    motif_hcwms.append([])
    motif_cwms.append([])
    motif_pfms_short.append([])
    num_seqlets.append([])
    motif_seqlets.append([])
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
        display(vdomh.p("%d seqlets" % len(seqlets)))
        
        pfm = pattern["sequence"].fwd
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        cwm = pattern["task0_contrib_scores"].fwd
        
        pfm_fig = viz_sequence.plot_weights(pfm, subticks_frequency=10, return_fig=True)
        hcwm_fig = viz_sequence.plot_weights(hcwm, subticks_frequency=10, return_fig=True)
        cwm_fig = viz_sequence.plot_weights(cwm, subticks_frequency=10, return_fig=True)
        pfm_fig.tight_layout()
        hcwm_fig.tight_layout()
        cwm_fig.tight_layout()
        
        motif_table = vdomh.table(
            vdomh.tr(
                vdomh.td("Sequence (PFM)"),
                vdomh.td(figure_to_vdom_image(pfm_fig))
            ),
            vdomh.tr(
                vdomh.td("Hypothetical contributions (hCWM)"),
                vdomh.td(figure_to_vdom_image(hcwm_fig))
            ),
            vdomh.tr(
                vdomh.td("Actual contributions (CWM)"),
                vdomh.td(figure_to_vdom_image(cwm_fig))
            )
        )
        display(motif_table)
        plt.close("all")  # Remove all standing figures
        
        # Trim motif based on information content
        short_trimmed_pfm = trim_motif_by_ic(pfm, pfm)
        motif_pfms_short[-1].append(short_trimmed_pfm)
        
        # Expand trimming to +/- 4bp on either side
        trimmed_pfm = trim_motif_by_ic(pfm, pfm, pad=4)
        trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
        trimmed_cwm = trim_motif_by_ic(pfm, cwm, pad=4)
        
        motif_pfms[-1].append(trimmed_pfm)
        motif_hcwms[-1].append(trimmed_hcwm)
        motif_cwms[-1].append(trimmed_cwm)
        
        num_seqlets[-1].append(len(seqlets))
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (metacluster_i, pattern_i)
            pfm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_pfm_full.png"))
            hcwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_full.png"))
            cwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_cwm_full.png"))
            motif_dset = motif_hdf5.create_group(motif_id)
            motif_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
            motif_dset.create_dataset("pfm_trimmed", data=trimmed_pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_trimmed", data=trimmed_cwm, compression="gzip")
            motif_dset.create_dataset("pfm_short_trimmed", data=short_trimmed_pfm, compression="gzip")
if tfm_motifs_cache_dir:
    motif_hdf5.close()

Metacluster 1/2

Pattern 1/16

7252 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/16

3038 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/16

752 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/16

662 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/16

605 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/16

470 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/16

428 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/16

368 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/16

224 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/16

203 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/16

183 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/16

122 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/16

78 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/16

73 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 15/16

36 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 16/16

30 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Summary of motifs

Motifs are trimmed based on information content, and presented in descending order by number of supporting seqlets. The motifs are separated by metacluster. The motifs are presented as hCWMs. The forward orientation is defined as the orientation that is richer in purines.

In [9]:
colgroup = vdomh.colgroup(
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "45%"}),
    vdomh.col(style={"width": "45%"})
)
header = vdomh.thead(
    vdomh.tr(
        vdomh.th("#", style={"text-align": "center"}),
        vdomh.th("Seqlets", style={"text-align": "center"}),
        vdomh.th("Forward", style={"text-align": "center"}),
        vdomh.th("Reverse", style={"text-align": "center"})
    )
)

for i in range(len(motif_hcwms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    body = []
    for j in range(len(motif_hcwms[i])):
        motif = motif_hcwms[i][j]
        if np.sum(motif[:, [0, 2]]) > 0.5 * np.sum(motif):
            # Forward is purine-rich, reverse-complement is pyrimidine-rich
            f, rc = motif, np.flip(motif, axis=(0, 1))
        else:
            f, rc = np.flip(motif, axis=(0, 1)), motif
            
        f_fig = viz_sequence.plot_weights(f, figsize=(20, 4), return_fig=True)
        f_fig.tight_layout()
        rc_fig = viz_sequence.plot_weights(rc, figsize=(20, 4), return_fig=True)
        rc_fig.tight_layout()
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (i, j)
            f_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_fwd.png"))
            rc_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_rev.png"))

        body.append(
            vdomh.tr(
                vdomh.td(str(j + 1)),
                vdomh.td(str(num_seqlets[i][j])),
                vdomh.td(figure_to_vdom_image(f_fig)),
                vdomh.td(figure_to_vdom_image(rc_fig))
            )
        )
    display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
    plt.close("all")

Metacluster 1/2

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)
#SeqletsForwardReverse
17252
23038
3752
4662
5605
6470
7428
8368
9224
10203
11183
12122
1378
1473
1536
1630

Top TOMTOM matches for each motif

Here, the TF-MoDISco motifs are plotted as hCWMs, but the TOMTOM matches are shown as PWMs.

In [10]:
num_matches_to_keep = 10
num_matches_to_show = 5

header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Motif ID", style={"text-align": "center"}),
        vdomh.th("q-val", style={"text-align": "center"}),
        vdomh.th("PWM", style={"text-align": "center"})
    )
)

for i in range(len(motif_pfms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    
    # Compute TOMTOM matches for all motifs in the metacluster at once
    out_dir = os.path.join(tfm_motifs_cache_dir, "tomtom", "metacluster_%d" % i) if tfm_motifs_cache_dir else None
    tomtom_matches = match_motifs_to_database(motif_pfms_short[i], top_k=num_matches_to_keep, temp_dir=out_dir)
    
    for j in range(len(motif_pfms[i])):
        display(vdomh.h4("Motif %d/%d" % (j + 1, len(motif_pfms[i]))))
        viz_sequence.plot_weights(motif_hcwms[i][j])
    
        body = []
        for k, (match_name, match_pfm, match_qval) in enumerate(tomtom_matches[j]):
            fig = viz_sequence.plot_weights(pfm_to_pwm(match_pfm), return_fig=True)
            fig.tight_layout()
            if k < num_matches_to_show:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td(figure_to_vdom_image(fig))
                    )
                )
                if tfm_motifs_cache_dir:
                    # Save results and figures
                    motif_id = "%d_%d" % (i, j)
                    fig.savefig(os.path.join(out_dir, motif_id + ("_hit-%d.png" % (k + 1))))
            else:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td("Not shown")
                    )
                )
        if not body:
            display(vdomh.p("No TOMTOM matches passing threshold"))
        else:
            display(vdomh.table(header, vdomh.tbody(*body)))
        plt.close("all")

Metacluster 1/2

Motif 1/16

Motif IDq-valPWM
MA0138.2_REST4.90593e-18
REST_HUMAN.H11MO.0.A4.96239e-15

Motif 2/16

Motif IDq-valPWM
MA0138.2_REST0.0660309
REST_HUMAN.H11MO.0.A0.0660309

Motif 3/16

Motif IDq-valPWM
REST_HUMAN.H11MO.0.A0.000329731
MA0138.2_REST0.00045711199999999997

Motif 4/16

Motif IDq-valPWM
MA0139.1_CTCF3.1451e-10
CTCFL_HUMAN.H11MO.0.A3.1451e-10
CTCF_HUMAN.H11MO.0.A3.9744199999999994e-10
MA1102.2_CTCFL0.00019057700000000002
MA1568.1_TCF21(var.2)0.297188
MA1638.1_HAND20.297188Not shown
SNAI1_HUMAN.H11MO.0.C0.487809Not shown

Motif 5/16

Motif IDq-valPWM
MA0138.2_REST0.20241800000000001
REST_HUMAN.H11MO.0.A0.228129

Motif 6/16

Motif IDq-valPWM
ZN549_HUMAN.H11MO.0.C0.0796341

Motif 7/16

Motif IDq-valPWM
REST_HUMAN.H11MO.0.A0.127105
MA0138.2_REST0.23725300000000002

Motif 8/16

No TOMTOM matches passing threshold

Motif 9/16

No TOMTOM matches passing threshold

Motif 10/16

Motif IDq-valPWM
MA1631.1_ASCL1(var.2)0.0754958
MA0138.2_REST0.0754958
REST_HUMAN.H11MO.0.A0.10119500000000001
MA0830.2_TCF40.10119500000000001

Motif 11/16

Motif IDq-valPWM
CTCFL_HUMAN.H11MO.0.A1.29549e-06
CTCF_HUMAN.H11MO.0.A9.874810000000001e-06
MA1102.2_CTCFL3.37229e-05
MA0139.1_CTCF4.59712e-05
SP2_HUMAN.H11MO.0.A0.024753
MA0830.2_TCF40.05793380000000001Not shown
SP3_HUMAN.H11MO.0.B0.09970169999999999Not shown
PATZ1_HUMAN.H11MO.0.C0.09970169999999999Not shown
MXI1_HUMAN.H11MO.0.A0.10450899999999999Not shown
SP4_HUMAN.H11MO.0.A0.10528499999999999Not shown

Motif 12/16

Motif IDq-valPWM
TBX20_HUMAN.H11MO.0.D0.29388600000000004
MA0138.2_REST0.43510600000000005
REST_HUMAN.H11MO.0.A0.43510600000000005
ZSC16_HUMAN.H11MO.0.D0.43510600000000005
ZN350_HUMAN.H11MO.0.C0.48836599999999997
MYOD1_HUMAN.H11MO.0.A0.48836599999999997Not shown

Motif 13/16

Motif IDq-valPWM
ZN549_HUMAN.H11MO.0.C0.0699298

Motif 14/16

Motif IDq-valPWM
TAF1_HUMAN.H11MO.0.A1.24978e-05
TYY1_HUMAN.H11MO.0.A1.24978e-05
MA0095.2_YY17.21388e-05
PATZ1_HUMAN.H11MO.0.C0.0005171780000000001
SP1_HUMAN.H11MO.0.A0.000830408
THAP1_HUMAN.H11MO.0.C0.000830408Not shown
KLF16_HUMAN.H11MO.0.D0.00101909Not shown
MA0748.2_YY20.0016852999999999998Not shown
SP2_HUMAN.H11MO.0.A0.00194291Not shown
SP3_HUMAN.H11MO.0.B0.00493211Not shown

Motif 15/16

Motif IDq-valPWM
MA0527.1_ZBTB330.00872677
REST_HUMAN.H11MO.0.A0.00872677
MA0138.2_REST0.00872677
KAISO_HUMAN.H11MO.0.A0.0208857
KAISO_HUMAN.H11MO.1.A0.125097
SP2_HUMAN.H11MO.0.A0.166828Not shown
MA0830.2_TCF40.420911Not shown
EGR4_HUMAN.H11MO.0.D0.482379Not shown
SRBP2_HUMAN.H11MO.0.B0.482379Not shown
ZN148_HUMAN.H11MO.0.D0.482379Not shown

Motif 16/16

Motif IDq-valPWM
MA0138.2_REST0.133287
REST_HUMAN.H11MO.0.A0.260067