In [1]:
import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, pfm_to_pwm, trim_motif_by_ic
from motif.match_motifs import match_motifs_to_database
from util import figure_to_vdom_image
import plot.viz_sequence as viz_sequence
import numpy as np
import h5py
import matplotlib.pyplot as plt
import vdom.helpers as vdomh
from IPython.display import display

Define constants and paths

In [2]:
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_MOTIF_CACHE" in os.environ:
    tfm_motifs_cache_dir = os.environ["TFM_MOTIF_CACHE"]
else:
    tfm_motifs_cache_dir = None

print("TF name: %s" % tf_name)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved TF-MoDISco-derived motifs cache: %s" % tfm_motifs_cache_dir)
TF name: GABPA
DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/multitask_profile/GABPA_multitask_profile_fold8/GABPA_multitask_profile_fold8_imp_scores.h5
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/multitask_profile/GABPA_multitask_profile_fold8/GABPA_multitask_profile_fold8_count_tfm.h5
Importance score key: count_hyp_scores
Saved TF-MoDISco-derived motifs cache: /users/amtseng/tfmodisco/results/reports/tfmodisco_results//cache/multitask_profile/GABPA_multitask_profile_fold8/GABPA_multitask_profile_fold8_count
In [3]:
# Define paths and constants
input_length = 2114
shap_score_center_size = 400
In [4]:
if tfm_motifs_cache_dir:
    os.makedirs(tfm_motifs_cache_dir, exist_ok=True)

Import SHAP scores and TF-MoDISco results

In [5]:
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 104/104 [01:12<00:00,  1.43it/s]
In [6]:
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

Plot some SHAP score tracks

Plot the central region of some randomly selected actual importance scores

In [7]:
plot_slice = slice(int(shap_score_center_size / 4), int(3 * shap_score_center_size / 4))
for index in np.random.choice(hyp_scores.shape[0], size=5, replace=False):
    viz_sequence.plot_weights((hyp_scores[index] * one_hot_seqs[index])[plot_slice], subticks_frequency=100)

Plot TF-MoDISco results

Plot all motifs by metacluster

In [8]:
motif_pfms, motif_hcwms, motif_cwms = [], [], []  # Save the trimmed PFMs, hCWMs, and CWMs
motif_pfms_short = []  # PFMs that are even more trimmed (for TOMTOM)
num_seqlets = []  # Number of seqlets for each motif
motif_seqlets = []  # Save seqlets of each motif
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
if tfm_motifs_cache_dir:
    motif_hdf5 = h5py.File(os.path.join(tfm_motifs_cache_dir, "all_motifs.h5"), "w")
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    motif_pfms.append([])
    motif_hcwms.append([])
    motif_cwms.append([])
    motif_pfms_short.append([])
    num_seqlets.append([])
    motif_seqlets.append([])
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
        display(vdomh.p("%d seqlets" % len(seqlets)))
        
        pfm = pattern["sequence"].fwd
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        cwm = pattern["task0_contrib_scores"].fwd
        
        pfm_fig = viz_sequence.plot_weights(pfm, subticks_frequency=10, return_fig=True)
        hcwm_fig = viz_sequence.plot_weights(hcwm, subticks_frequency=10, return_fig=True)
        cwm_fig = viz_sequence.plot_weights(cwm, subticks_frequency=10, return_fig=True)
        pfm_fig.tight_layout()
        hcwm_fig.tight_layout()
        cwm_fig.tight_layout()
        
        motif_table = vdomh.table(
            vdomh.tr(
                vdomh.td("Sequence (PFM)"),
                vdomh.td(figure_to_vdom_image(pfm_fig))
            ),
            vdomh.tr(
                vdomh.td("Hypothetical contributions (hCWM)"),
                vdomh.td(figure_to_vdom_image(hcwm_fig))
            ),
            vdomh.tr(
                vdomh.td("Actual contributions (CWM)"),
                vdomh.td(figure_to_vdom_image(cwm_fig))
            )
        )
        display(motif_table)
        plt.close("all")  # Remove all standing figures
        
        # Trim motif based on information content
        short_trimmed_pfm = trim_motif_by_ic(pfm, pfm)
        motif_pfms_short[-1].append(short_trimmed_pfm)
        
        # Expand trimming to +/- 4bp on either side
        trimmed_pfm = trim_motif_by_ic(pfm, pfm, pad=4)
        trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
        trimmed_cwm = trim_motif_by_ic(pfm, cwm, pad=4)
        
        motif_pfms[-1].append(trimmed_pfm)
        motif_hcwms[-1].append(trimmed_hcwm)
        motif_cwms[-1].append(trimmed_cwm)
        
        num_seqlets[-1].append(len(seqlets))
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (metacluster_i, pattern_i)
            pfm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_pfm_full.png"))
            hcwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_full.png"))
            cwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_cwm_full.png"))
            motif_dset = motif_hdf5.create_group(motif_id)
            motif_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
            motif_dset.create_dataset("pfm_trimmed", data=trimmed_pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_trimmed", data=trimmed_cwm, compression="gzip")
            motif_dset.create_dataset("pfm_short_trimmed", data=short_trimmed_pfm, compression="gzip")
if tfm_motifs_cache_dir:
    motif_hdf5.close()

Metacluster 1/2

Pattern 1/13

6440 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/13

1156 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/13

539 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/13

249 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/13

209 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/13

170 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/13

98 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/13

97 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/13

89 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/13

59 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/13

54 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/13

44 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/13

30 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Summary of motifs

Motifs are trimmed based on information content, and presented in descending order by number of supporting seqlets. The motifs are separated by metacluster. The motifs are presented as hCWMs. The forward orientation is defined as the orientation that is richer in purines.

In [9]:
colgroup = vdomh.colgroup(
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "45%"}),
    vdomh.col(style={"width": "45%"})
)
header = vdomh.thead(
    vdomh.tr(
        vdomh.th("#", style={"text-align": "center"}),
        vdomh.th("Seqlets", style={"text-align": "center"}),
        vdomh.th("Forward", style={"text-align": "center"}),
        vdomh.th("Reverse", style={"text-align": "center"})
    )
)

for i in range(len(motif_hcwms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    body = []
    for j in range(len(motif_hcwms[i])):
        motif = motif_hcwms[i][j]
        if np.sum(motif[:, [0, 2]]) > 0.5 * np.sum(motif):
            # Forward is purine-rich, reverse-complement is pyrimidine-rich
            f, rc = motif, np.flip(motif, axis=(0, 1))
        else:
            f, rc = np.flip(motif, axis=(0, 1)), motif
            
        f_fig = viz_sequence.plot_weights(f, figsize=(20, 4), return_fig=True)
        f_fig.tight_layout()
        rc_fig = viz_sequence.plot_weights(rc, figsize=(20, 4), return_fig=True)
        rc_fig.tight_layout()
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (i, j)
            f_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_fwd.png"))
            rc_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_rev.png"))

        body.append(
            vdomh.tr(
                vdomh.td(str(j + 1)),
                vdomh.td(str(num_seqlets[i][j])),
                vdomh.td(figure_to_vdom_image(f_fig)),
                vdomh.td(figure_to_vdom_image(rc_fig))
            )
        )
    display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
    plt.close("all")

Metacluster 1/2

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)
#SeqletsForwardReverse
16440
21156
3539
4249
5209
6170
798
897
989
1059
1154
1244
1330

Top TOMTOM matches for each motif

Here, the TF-MoDISco motifs are plotted as hCWMs, but the TOMTOM matches are shown as PWMs.

In [10]:
num_matches_to_keep = 10
num_matches_to_show = 5

header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Motif ID", style={"text-align": "center"}),
        vdomh.th("q-val", style={"text-align": "center"}),
        vdomh.th("PWM", style={"text-align": "center"})
    )
)

for i in range(len(motif_pfms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    
    # Compute TOMTOM matches for all motifs in the metacluster at once
    out_dir = os.path.join(tfm_motifs_cache_dir, "tomtom", "metacluster_%d" % i) if tfm_motifs_cache_dir else None
    tomtom_matches = match_motifs_to_database(motif_pfms_short[i], top_k=num_matches_to_keep, temp_dir=out_dir)
    
    for j in range(len(motif_pfms[i])):
        display(vdomh.h4("Motif %d/%d" % (j + 1, len(motif_pfms[i]))))
        viz_sequence.plot_weights(motif_hcwms[i][j])
    
        body = []
        for k, (match_name, match_pfm, match_qval) in enumerate(tomtom_matches[j]):
            fig = viz_sequence.plot_weights(pfm_to_pwm(match_pfm), return_fig=True)
            fig.tight_layout()
            if k < num_matches_to_show:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td(figure_to_vdom_image(fig))
                    )
                )
                if tfm_motifs_cache_dir:
                    # Save results and figures
                    motif_id = "%d_%d" % (i, j)
                    fig.savefig(os.path.join(out_dir, motif_id + ("_hit-%d.png" % (k + 1))))
            else:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td("Not shown")
                    )
                )
        if not body:
            display(vdomh.p("No TOMTOM matches passing threshold"))
        else:
            display(vdomh.table(header, vdomh.tbody(*body)))
        plt.close("all")

Metacluster 1/2

Motif 1/13

Motif IDq-valPWM
MA0076.2_ELK43.35517e-06
GABPA_HUMAN.H11MO.0.A3.35517e-06
ELK1_HUMAN.H11MO.0.B1.28322e-05
ETV1_HUMAN.H11MO.0.A1.28322e-05
ELF2_HUMAN.H11MO.0.C1.28322e-05
MA0750.2_ZBTB7A1.28322e-05Not shown
ELF1_HUMAN.H11MO.0.A1.28322e-05Not shown
MA0759.1_ELK32.85745e-05Not shown
MA0765.2_ETV53.3866100000000004e-05Not shown
ELK4_HUMAN.H11MO.0.A3.4635700000000005e-05Not shown

Motif 2/13

Motif IDq-valPWM
GABPA_HUMAN.H11MO.0.A0.0712192
MA0772.1_IRF70.0712192
MA0076.2_ELK40.0712192
MA0640.2_ELF30.134845
MA0765.2_ETV50.134845
ELK1_HUMAN.H11MO.0.B0.134845Not shown
ELK4_HUMAN.H11MO.0.A0.134845Not shown
ELF1_HUMAN.H11MO.0.A0.138967Not shown
MA1419.1_IRF40.138967Not shown
MA0098.3_ETS10.138967Not shown

Motif 3/13

Motif IDq-valPWM
ZNF76_HUMAN.H11MO.0.C1.9559900000000003e-20
ZN143_HUMAN.H11MO.0.A1.34042e-19
THA11_HUMAN.H11MO.0.B7.12211e-16
MA1573.1_THAP113.65505e-09
MA0088.2_ZNF1430.0116113
STAT3_HUMAN.H11MO.0.A0.0361416Not shown
MA1625.1_Stat5b0.0736129Not shown
MA0519.1_Stat5a::Stat5b0.0741564Not shown
P63_HUMAN.H11MO.0.A0.0875971Not shown
HSF1_HUMAN.H11MO.1.A0.151426Not shown

Motif 4/13

Motif IDq-valPWM
MA1107.2_KLF90.325306
MA1621.1_Rbpjl0.325306
MZF1_HUMAN.H11MO.0.B0.398677

Motif 5/13

Motif IDq-valPWM
CTCF_HUMAN.H11MO.0.A0.00166366
CTCFL_HUMAN.H11MO.0.A0.00166366
MA0139.1_CTCF0.00166366
MA1475.1_CREB3L4(var.2)0.00166366
MA0605.2_ATF30.00166366
MA0656.1_JDP2(var.2)0.00285574Not shown
MA1140.2_JUNB(var.2)0.00285574Not shown
MA1102.2_CTCFL0.00321044Not shown
MA1139.1_FOSL2::JUNB(var.2)0.0037216999999999997Not shown
ATF1_HUMAN.H11MO.0.B0.00745886Not shown

Motif 6/13

Motif IDq-valPWM
MA0163.1_PLAG10.48358599999999996

Motif 7/13

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A0.0113853
SP3_HUMAN.H11MO.0.B0.0113853
SP1_HUMAN.H11MO.0.A0.0113853
KLF3_HUMAN.H11MO.0.B0.0705939
MXI1_HUMAN.H11MO.0.A0.0705939
PATZ1_HUMAN.H11MO.0.C0.0732154Not shown
MA1122.1_TFDP10.0895638Not shown
MA1650.1_ZBTB140.0895638Not shown
WT1_HUMAN.H11MO.0.C0.0895638Not shown
ZFX_HUMAN.H11MO.1.A0.0895638Not shown

Motif 8/13

Motif IDq-valPWM
MA0750.2_ZBTB7A0.0005672380000000001
ZFX_HUMAN.H11MO.1.A0.00117706
SP3_HUMAN.H11MO.0.B0.00117706
SP2_HUMAN.H11MO.0.A0.00117706
ELK4_HUMAN.H11MO.0.A0.00156406
MA0076.2_ELK40.00190421Not shown
SP1_HUMAN.H11MO.0.A0.00190421Not shown
MA0765.2_ETV50.00195996Not shown
ETV1_HUMAN.H11MO.0.A0.0021715Not shown
FEV_HUMAN.H11MO.0.B0.00845586Not shown

Motif 9/13

Motif IDq-valPWM
MA0076.2_ELK40.00254041
SP2_HUMAN.H11MO.0.A0.00622742
MA0765.2_ETV50.00622742
ELK4_HUMAN.H11MO.0.A0.00622742
SP3_HUMAN.H11MO.0.B0.00756529
ELK1_HUMAN.H11MO.0.B0.00774091Not shown
SP1_HUMAN.H11MO.0.A0.010023Not shown
ETV1_HUMAN.H11MO.0.A0.010023Not shown
FEV_HUMAN.H11MO.0.B0.010023Not shown
THAP1_HUMAN.H11MO.0.C0.010023Not shown

Motif 10/13

Motif IDq-valPWM
GABPA_HUMAN.H11MO.0.A0.000237895
MA0760.1_ERF0.00115294
MA0098.3_ETS10.00115294
ELK4_HUMAN.H11MO.0.A0.00115294
MA0076.2_ELK40.00115294
MA0474.2_ERG0.00115294Not shown
MA0763.1_ETV30.00115294Not shown
MA0156.2_FEV0.00115294Not shown
MA0475.2_FLI10.00115294Not shown
ELK1_HUMAN.H11MO.0.B0.00115294Not shown

Motif 11/13

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A0.0312828
THAP1_HUMAN.H11MO.0.C0.055405600000000006
USF2_HUMAN.H11MO.0.A0.055405600000000006
SP3_HUMAN.H11MO.0.B0.055405600000000006
KLF3_HUMAN.H11MO.0.B0.135735
CTCFL_HUMAN.H11MO.0.A0.16622699999999999Not shown
KLF6_HUMAN.H11MO.0.A0.234463Not shown
SP1_HUMAN.H11MO.1.A0.234463Not shown
PATZ1_HUMAN.H11MO.0.C0.234463Not shown
SP1_HUMAN.H11MO.0.A0.234463Not shown

Motif 12/13

Motif IDq-valPWM
ZN143_HUMAN.H11MO.0.A1.5775899999999998e-13
ZNF76_HUMAN.H11MO.0.C2.87075e-11
THA11_HUMAN.H11MO.0.B2.87321e-11
MA1573.1_THAP111.66595e-10
STAT3_HUMAN.H11MO.0.A0.00740099
MA0519.1_Stat5a::Stat5b0.009319299999999999Not shown
MA0088.2_ZNF1430.0115341Not shown
MA1625.1_Stat5b0.0150425Not shown
HSF1_HUMAN.H11MO.1.A0.0490282Not shown
MA0144.2_STAT30.0490282Not shown

Motif 13/13

Motif IDq-valPWM
ZN143_HUMAN.H11MO.0.A7.83932e-13
THA11_HUMAN.H11MO.0.B1.82164e-11
ZNF76_HUMAN.H11MO.0.C1.87469e-11
MA1573.1_THAP114.8906300000000004e-08
MA0088.2_ZNF1430.014453299999999999
STAT3_HUMAN.H11MO.0.A0.015411099999999999Not shown
MA0519.1_Stat5a::Stat5b0.048144599999999996Not shown
MA1625.1_Stat5b0.058044399999999996Not shown
HSF1_HUMAN.H11MO.1.A0.154077Not shown
MA0144.2_STAT30.160111Not shown