In [1]:
import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, pfm_to_pwm, trim_motif_by_ic
from motif.match_motifs import match_motifs_to_database
from util import figure_to_vdom_image
import plot.viz_sequence as viz_sequence
import numpy as np
import h5py
import matplotlib.pyplot as plt
import vdom.helpers as vdomh
from IPython.display import display

Define constants and paths

In [2]:
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_MOTIF_CACHE" in os.environ:
    tfm_motifs_cache_dir = os.environ["TFM_MOTIF_CACHE"]
else:
    tfm_motifs_cache_dir = None

print("TF name: %s" % tf_name)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved TF-MoDISco-derived motifs cache: %s" % tfm_motifs_cache_dir)
TF name: GABPA
DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/multitask_profile/GABPA_multitask_profile_fold9/GABPA_multitask_profile_fold9_imp_scores.h5
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/multitask_profile/GABPA_multitask_profile_fold9/GABPA_multitask_profile_fold9_count_tfm.h5
Importance score key: count_hyp_scores
Saved TF-MoDISco-derived motifs cache: /users/amtseng/tfmodisco/results/reports/tfmodisco_results//cache/multitask_profile/GABPA_multitask_profile_fold9/GABPA_multitask_profile_fold9_count
In [3]:
# Define paths and constants
input_length = 2114
shap_score_center_size = 400
In [4]:
if tfm_motifs_cache_dir:
    os.makedirs(tfm_motifs_cache_dir, exist_ok=True)

Import SHAP scores and TF-MoDISco results

In [5]:
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 104/104 [01:38<00:00,  1.06it/s]
In [6]:
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

Plot some SHAP score tracks

Plot the central region of some randomly selected actual importance scores

In [7]:
plot_slice = slice(int(shap_score_center_size / 4), int(3 * shap_score_center_size / 4))
for index in np.random.choice(hyp_scores.shape[0], size=5, replace=False):
    viz_sequence.plot_weights((hyp_scores[index] * one_hot_seqs[index])[plot_slice], subticks_frequency=100)

Plot TF-MoDISco results

Plot all motifs by metacluster

In [8]:
motif_pfms, motif_hcwms, motif_cwms = [], [], []  # Save the trimmed PFMs, hCWMs, and CWMs
motif_pfms_short = []  # PFMs that are even more trimmed (for TOMTOM)
num_seqlets = []  # Number of seqlets for each motif
motif_seqlets = []  # Save seqlets of each motif
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
if tfm_motifs_cache_dir:
    motif_hdf5 = h5py.File(os.path.join(tfm_motifs_cache_dir, "all_motifs.h5"), "w")
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    motif_pfms.append([])
    motif_hcwms.append([])
    motif_cwms.append([])
    motif_pfms_short.append([])
    num_seqlets.append([])
    motif_seqlets.append([])
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
        display(vdomh.p("%d seqlets" % len(seqlets)))
        
        pfm = pattern["sequence"].fwd
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        cwm = pattern["task0_contrib_scores"].fwd
        
        pfm_fig = viz_sequence.plot_weights(pfm, subticks_frequency=10, return_fig=True)
        hcwm_fig = viz_sequence.plot_weights(hcwm, subticks_frequency=10, return_fig=True)
        cwm_fig = viz_sequence.plot_weights(cwm, subticks_frequency=10, return_fig=True)
        pfm_fig.tight_layout()
        hcwm_fig.tight_layout()
        cwm_fig.tight_layout()
        
        motif_table = vdomh.table(
            vdomh.tr(
                vdomh.td("Sequence (PFM)"),
                vdomh.td(figure_to_vdom_image(pfm_fig))
            ),
            vdomh.tr(
                vdomh.td("Hypothetical contributions (hCWM)"),
                vdomh.td(figure_to_vdom_image(hcwm_fig))
            ),
            vdomh.tr(
                vdomh.td("Actual contributions (CWM)"),
                vdomh.td(figure_to_vdom_image(cwm_fig))
            )
        )
        display(motif_table)
        plt.close("all")  # Remove all standing figures
        
        # Trim motif based on information content
        short_trimmed_pfm = trim_motif_by_ic(pfm, pfm)
        motif_pfms_short[-1].append(short_trimmed_pfm)
        
        # Expand trimming to +/- 4bp on either side
        trimmed_pfm = trim_motif_by_ic(pfm, pfm, pad=4)
        trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
        trimmed_cwm = trim_motif_by_ic(pfm, cwm, pad=4)
        
        motif_pfms[-1].append(trimmed_pfm)
        motif_hcwms[-1].append(trimmed_hcwm)
        motif_cwms[-1].append(trimmed_cwm)
        
        num_seqlets[-1].append(len(seqlets))
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (metacluster_i, pattern_i)
            pfm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_pfm_full.png"))
            hcwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_full.png"))
            cwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_cwm_full.png"))
            motif_dset = motif_hdf5.create_group(motif_id)
            motif_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
            motif_dset.create_dataset("pfm_trimmed", data=trimmed_pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_trimmed", data=trimmed_cwm, compression="gzip")
            motif_dset.create_dataset("pfm_short_trimmed", data=short_trimmed_pfm, compression="gzip")
if tfm_motifs_cache_dir:
    motif_hdf5.close()

Metacluster 1/1

Pattern 1/14

7370 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/14

937 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/14

625 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/14

234 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/14

194 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/14

134 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/14

53 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/14

50 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/14

42 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/14

38 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/14

36 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/14

34 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/14

30 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/14

30 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Summary of motifs

Motifs are trimmed based on information content, and presented in descending order by number of supporting seqlets. The motifs are separated by metacluster. The motifs are presented as hCWMs. The forward orientation is defined as the orientation that is richer in purines.

In [9]:
colgroup = vdomh.colgroup(
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "45%"}),
    vdomh.col(style={"width": "45%"})
)
header = vdomh.thead(
    vdomh.tr(
        vdomh.th("#", style={"text-align": "center"}),
        vdomh.th("Seqlets", style={"text-align": "center"}),
        vdomh.th("Forward", style={"text-align": "center"}),
        vdomh.th("Reverse", style={"text-align": "center"})
    )
)

for i in range(len(motif_hcwms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    body = []
    for j in range(len(motif_hcwms[i])):
        motif = motif_hcwms[i][j]
        if np.sum(motif[:, [0, 2]]) > 0.5 * np.sum(motif):
            # Forward is purine-rich, reverse-complement is pyrimidine-rich
            f, rc = motif, np.flip(motif, axis=(0, 1))
        else:
            f, rc = np.flip(motif, axis=(0, 1)), motif
            
        f_fig = viz_sequence.plot_weights(f, figsize=(20, 4), return_fig=True)
        f_fig.tight_layout()
        rc_fig = viz_sequence.plot_weights(rc, figsize=(20, 4), return_fig=True)
        rc_fig.tight_layout()
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (i, j)
            f_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_fwd.png"))
            rc_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_rev.png"))

        body.append(
            vdomh.tr(
                vdomh.td(str(j + 1)),
                vdomh.td(str(num_seqlets[i][j])),
                vdomh.td(figure_to_vdom_image(f_fig)),
                vdomh.td(figure_to_vdom_image(rc_fig))
            )
        )
    display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
    plt.close("all")

Metacluster 1/1

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)
#SeqletsForwardReverse
17370
2937
3625
4234
5194
6134
753
850
942
1038
1136
1234
1330
1430

Top TOMTOM matches for each motif

Here, the TF-MoDISco motifs are plotted as hCWMs, but the TOMTOM matches are shown as PWMs.

In [10]:
num_matches_to_keep = 10
num_matches_to_show = 5

header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Motif ID", style={"text-align": "center"}),
        vdomh.th("q-val", style={"text-align": "center"}),
        vdomh.th("PWM", style={"text-align": "center"})
    )
)

for i in range(len(motif_pfms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    
    # Compute TOMTOM matches for all motifs in the metacluster at once
    out_dir = os.path.join(tfm_motifs_cache_dir, "tomtom", "metacluster_%d" % i) if tfm_motifs_cache_dir else None
    tomtom_matches = match_motifs_to_database(motif_pfms_short[i], top_k=num_matches_to_keep, temp_dir=out_dir)
    
    for j in range(len(motif_pfms[i])):
        display(vdomh.h4("Motif %d/%d" % (j + 1, len(motif_pfms[i]))))
        viz_sequence.plot_weights(motif_hcwms[i][j])
    
        body = []
        for k, (match_name, match_pfm, match_qval) in enumerate(tomtom_matches[j]):
            fig = viz_sequence.plot_weights(pfm_to_pwm(match_pfm), return_fig=True)
            fig.tight_layout()
            if k < num_matches_to_show:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td(figure_to_vdom_image(fig))
                    )
                )
                if tfm_motifs_cache_dir:
                    # Save results and figures
                    motif_id = "%d_%d" % (i, j)
                    fig.savefig(os.path.join(out_dir, motif_id + ("_hit-%d.png" % (k + 1))))
            else:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td("Not shown")
                    )
                )
        if not body:
            display(vdomh.p("No TOMTOM matches passing threshold"))
        else:
            display(vdomh.table(header, vdomh.tbody(*body)))
        plt.close("all")

Metacluster 1/1

Motif 1/14

Motif IDq-valPWM
MA0076.2_ELK44.70321e-06
ETV1_HUMAN.H11MO.0.A4.70321e-06
MA0750.2_ZBTB7A4.70321e-06
GABPA_HUMAN.H11MO.0.A4.70321e-06
MA0759.1_ELK31.16393e-05
ELK1_HUMAN.H11MO.0.B1.29325e-05Not shown
ELK4_HUMAN.H11MO.0.A1.3856300000000001e-05Not shown
ELF2_HUMAN.H11MO.0.C1.4549100000000001e-05Not shown
ELF1_HUMAN.H11MO.0.A1.5088e-05Not shown
MA1483.1_ELF24.07288e-05Not shown

Motif 2/14

Motif IDq-valPWM
GABPA_HUMAN.H11MO.0.A5.06709e-05
ELF2_HUMAN.H11MO.0.C0.00169802
ELF1_HUMAN.H11MO.0.A0.00268568
MA0076.2_ELK40.00289996
ELK1_HUMAN.H11MO.0.B0.00290528
MA0645.1_ETV60.00366212Not shown
MA0474.2_ERG0.00367083Not shown
MA0475.2_FLI10.00367083Not shown
ERG_HUMAN.H11MO.0.A0.00380398Not shown
MA0098.3_ETS10.00380398Not shown

Motif 3/14

Motif IDq-valPWM
ZNF76_HUMAN.H11MO.0.C1.2553799999999999e-22
ZN143_HUMAN.H11MO.0.A3.2794500000000004e-19
THA11_HUMAN.H11MO.0.B1.4542299999999998e-16
MA1573.1_THAP113.98401e-09
MA0088.2_ZNF1430.0131934
STAT3_HUMAN.H11MO.0.A0.054890999999999995Not shown
P63_HUMAN.H11MO.0.A0.054890999999999995Not shown
MA1625.1_Stat5b0.103029Not shown
MA0519.1_Stat5a::Stat5b0.115972Not shown
MA0525.2_TP630.115972Not shown

Motif 4/14

Motif IDq-valPWM
MA0765.2_ETV50.0211862
ETV1_HUMAN.H11MO.0.A0.0664545
GABPA_HUMAN.H11MO.0.A0.0664545
MA0076.2_ELK40.0664545
ELK4_HUMAN.H11MO.0.A0.102151
ELF2_HUMAN.H11MO.0.C0.177847Not shown
ELK1_HUMAN.H11MO.0.B0.24916300000000002Not shown
FEV_HUMAN.H11MO.0.B0.24916300000000002Not shown
MA0645.1_ETV60.24916300000000002Not shown
MA0750.2_ZBTB7A0.261821Not shown

Motif 5/14

Motif IDq-valPWM
GABPA_HUMAN.H11MO.0.A0.017471599999999997
MA0076.2_ELK40.017471599999999997
ETV1_HUMAN.H11MO.0.A0.017471599999999997
ELK4_HUMAN.H11MO.0.A0.017471599999999997
MA0765.2_ETV50.022119
SP3_HUMAN.H11MO.0.B0.0294163Not shown
WT1_HUMAN.H11MO.0.C0.030874400000000003Not shown
FEV_HUMAN.H11MO.0.B0.030874400000000003Not shown
SP1_HUMAN.H11MO.0.A0.0410364Not shown
MBD2_HUMAN.H11MO.0.B0.0435024Not shown

Motif 6/14

Motif IDq-valPWM
MA1475.1_CREB3L4(var.2)0.000231127
CTCF_HUMAN.H11MO.0.A0.000231127
MA0605.2_ATF30.000280076
MA0139.1_CTCF0.000619113
MA0656.1_JDP2(var.2)0.000619113
MA1140.2_JUNB(var.2)0.000639859Not shown
CTCFL_HUMAN.H11MO.0.A0.000917227Not shown
MA1139.1_FOSL2::JUNB(var.2)0.00130424Not shown
MA1131.1_FOSL2::JUN(var.2)0.00158454Not shown
ATF1_HUMAN.H11MO.0.B0.0018647000000000002Not shown

Motif 7/14

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A0.0119264
SP1_HUMAN.H11MO.0.A0.01728
SP3_HUMAN.H11MO.0.B0.0414722
MXI1_HUMAN.H11MO.0.A0.056551800000000006
MA1102.2_CTCFL0.08132389999999999
GABPA_HUMAN.H11MO.0.A0.0949561Not shown
ETV1_HUMAN.H11MO.0.A0.106651Not shown
WT1_HUMAN.H11MO.0.C0.106651Not shown
ELK4_HUMAN.H11MO.0.A0.106651Not shown
PTF1A_HUMAN.H11MO.0.B0.106651Not shown

Motif 8/14

Motif IDq-valPWM
ZNF76_HUMAN.H11MO.0.C8.38425e-16
ZN143_HUMAN.H11MO.0.A1.10524e-14
THA11_HUMAN.H11MO.0.B5.958469999999999e-12
MA1573.1_THAP114.20135e-11
STAT3_HUMAN.H11MO.0.A0.019069
MA0519.1_Stat5a::Stat5b0.037313900000000004Not shown
MA0088.2_ZNF1430.037313900000000004Not shown
MA1625.1_Stat5b0.0860465Not shown
MA0144.2_STAT30.162795Not shown
MA1513.1_KLF150.172593Not shown

Motif 9/14

Motif IDq-valPWM
THAP1_HUMAN.H11MO.0.C0.0174012
SP2_HUMAN.H11MO.0.A0.10565
CTCFL_HUMAN.H11MO.0.A0.24291100000000002
AP2B_HUMAN.H11MO.0.B0.24291100000000002
ELK4_HUMAN.H11MO.0.A0.248687
ETV1_HUMAN.H11MO.0.A0.248687Not shown
MA0830.2_TCF40.248687Not shown
SP3_HUMAN.H11MO.0.B0.248687Not shown
MA0765.2_ETV50.248687Not shown
MA0076.2_ELK40.248687Not shown

Motif 10/14

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A0.00385924
SP1_HUMAN.H11MO.0.A0.0124039
KLF3_HUMAN.H11MO.0.B0.205125
MA0814.2_TFAP2C(var.2)0.25159000000000004
CTCFL_HUMAN.H11MO.0.A0.25159000000000004
CTCF_HUMAN.H11MO.0.A0.259793Not shown
AP2D_HUMAN.H11MO.0.D0.259793Not shown
MA1587.1_ZNF1350.259793Not shown
SP3_HUMAN.H11MO.0.B0.285808Not shown
KLF1_HUMAN.H11MO.0.A0.287256Not shown

Motif 11/14

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A0.00156829
ETV2_HUMAN.H11MO.0.B0.00156829
SP1_HUMAN.H11MO.0.A0.00156829
GABPA_HUMAN.H11MO.0.A0.00190978
ELF1_HUMAN.H11MO.0.A0.00260616
ELF2_HUMAN.H11MO.0.C0.00260616Not shown
MA0598.3_EHF0.0030298Not shown
MA1418.1_IRF30.00308483Not shown
SP3_HUMAN.H11MO.0.B0.00367551Not shown
MA0076.2_ELK40.0039186Not shown

Motif 12/14

Motif IDq-valPWM
ETS1_HUMAN.H11MO.0.A0.00045743300000000004
ETV2_HUMAN.H11MO.0.B0.00045743300000000004
MA0764.2_ETV40.00045743300000000004
ERG_HUMAN.H11MO.0.A0.000480412
GABPA_HUMAN.H11MO.0.A0.000562197
FLI1_HUMAN.H11MO.1.A0.00118108Not shown
ETV4_HUMAN.H11MO.0.B0.00141476Not shown
ELK4_HUMAN.H11MO.0.A0.00141476Not shown
SP2_HUMAN.H11MO.0.A0.00141476Not shown
MA0076.2_ELK40.00141476Not shown

Motif 13/14

Motif IDq-valPWM
ELK4_HUMAN.H11MO.0.A0.00679839
MA0076.2_ELK40.00679839
ETV1_HUMAN.H11MO.0.A0.007603699999999999
MA0765.2_ETV50.007603699999999999
MA0750.2_ZBTB7A0.007603699999999999
SP1_HUMAN.H11MO.0.A0.007603699999999999Not shown
GABPA_HUMAN.H11MO.0.A0.00935601Not shown
TBX15_HUMAN.H11MO.0.D0.00935601Not shown
FEV_HUMAN.H11MO.0.B0.0107137Not shown
ELK1_HUMAN.H11MO.0.B0.0143726Not shown

Motif 14/14

Motif IDq-valPWM
ETS1_HUMAN.H11MO.0.A1.0649200000000001e-05
MA0473.3_ELF11.0649200000000001e-05
MA0062.3_GABPA1.0649200000000001e-05
ERG_HUMAN.H11MO.0.A1.29689e-05
FLI1_HUMAN.H11MO.1.A3.55676e-05
ETV5_HUMAN.H11MO.0.C3.71162e-05Not shown
MA0598.3_EHF3.71162e-05Not shown
MA0761.2_ETV13.71162e-05Not shown
ETV2_HUMAN.H11MO.0.B4.3306599999999996e-05Not shown
MA0764.2_ETV40.000140207Not shown