In [1]:
import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, pfm_to_pwm, trim_motif_by_ic
from motif.match_motifs import match_motifs_to_database
from util import figure_to_vdom_image
import plot.viz_sequence as viz_sequence
import numpy as np
import h5py
import matplotlib.pyplot as plt
import vdom.helpers as vdomh
from IPython.display import display

Define constants and paths

In [2]:
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_MOTIF_CACHE" in os.environ:
    tfm_motifs_cache_dir = os.environ["TFM_MOTIF_CACHE"]
else:
    tfm_motifs_cache_dir = None

print("TF name: %s" % tf_name)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved TF-MoDISco-derived motifs cache: %s" % tfm_motifs_cache_dir)
TF name: GABPA
DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/multitask_profile/GABPA_multitask_profile_fold10/GABPA_multitask_profile_fold10_imp_scores.h5
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/multitask_profile/GABPA_multitask_profile_fold10/GABPA_multitask_profile_fold10_profile_tfm.h5
Importance score key: profile_hyp_scores
Saved TF-MoDISco-derived motifs cache: /users/amtseng/tfmodisco/results/reports/tfmodisco_results//cache/multitask_profile/GABPA_multitask_profile_fold10/GABPA_multitask_profile_fold10_profile
In [3]:
# Define paths and constants
input_length = 2114
shap_score_center_size = 400
In [4]:
if tfm_motifs_cache_dir:
    os.makedirs(tfm_motifs_cache_dir, exist_ok=True)

Import SHAP scores and TF-MoDISco results

In [5]:
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 104/104 [01:51<00:00,  1.07s/it]
In [6]:
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

Plot some SHAP score tracks

Plot the central region of some randomly selected actual importance scores

In [7]:
plot_slice = slice(int(shap_score_center_size / 4), int(3 * shap_score_center_size / 4))
for index in np.random.choice(hyp_scores.shape[0], size=5, replace=False):
    viz_sequence.plot_weights((hyp_scores[index] * one_hot_seqs[index])[plot_slice], subticks_frequency=100)

Plot TF-MoDISco results

Plot all motifs by metacluster

In [8]:
motif_pfms, motif_hcwms, motif_cwms = [], [], []  # Save the trimmed PFMs, hCWMs, and CWMs
motif_pfms_short = []  # PFMs that are even more trimmed (for TOMTOM)
num_seqlets = []  # Number of seqlets for each motif
motif_seqlets = []  # Save seqlets of each motif
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
if tfm_motifs_cache_dir:
    motif_hdf5 = h5py.File(os.path.join(tfm_motifs_cache_dir, "all_motifs.h5"), "w")
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    motif_pfms.append([])
    motif_hcwms.append([])
    motif_cwms.append([])
    motif_pfms_short.append([])
    num_seqlets.append([])
    motif_seqlets.append([])
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
        display(vdomh.p("%d seqlets" % len(seqlets)))
        
        pfm = pattern["sequence"].fwd
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        cwm = pattern["task0_contrib_scores"].fwd
        
        pfm_fig = viz_sequence.plot_weights(pfm, subticks_frequency=10, return_fig=True)
        hcwm_fig = viz_sequence.plot_weights(hcwm, subticks_frequency=10, return_fig=True)
        cwm_fig = viz_sequence.plot_weights(cwm, subticks_frequency=10, return_fig=True)
        pfm_fig.tight_layout()
        hcwm_fig.tight_layout()
        cwm_fig.tight_layout()
        
        motif_table = vdomh.table(
            vdomh.tr(
                vdomh.td("Sequence (PFM)"),
                vdomh.td(figure_to_vdom_image(pfm_fig))
            ),
            vdomh.tr(
                vdomh.td("Hypothetical contributions (hCWM)"),
                vdomh.td(figure_to_vdom_image(hcwm_fig))
            ),
            vdomh.tr(
                vdomh.td("Actual contributions (CWM)"),
                vdomh.td(figure_to_vdom_image(cwm_fig))
            )
        )
        display(motif_table)
        plt.close("all")  # Remove all standing figures
        
        # Trim motif based on information content
        short_trimmed_pfm = trim_motif_by_ic(pfm, pfm)
        motif_pfms_short[-1].append(short_trimmed_pfm)
        
        # Expand trimming to +/- 4bp on either side
        trimmed_pfm = trim_motif_by_ic(pfm, pfm, pad=4)
        trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
        trimmed_cwm = trim_motif_by_ic(pfm, cwm, pad=4)
        
        motif_pfms[-1].append(trimmed_pfm)
        motif_hcwms[-1].append(trimmed_hcwm)
        motif_cwms[-1].append(trimmed_cwm)
        
        num_seqlets[-1].append(len(seqlets))
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (metacluster_i, pattern_i)
            pfm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_pfm_full.png"))
            hcwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_full.png"))
            cwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_cwm_full.png"))
            motif_dset = motif_hdf5.create_group(motif_id)
            motif_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
            motif_dset.create_dataset("pfm_trimmed", data=trimmed_pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_trimmed", data=trimmed_cwm, compression="gzip")
            motif_dset.create_dataset("pfm_short_trimmed", data=short_trimmed_pfm, compression="gzip")
if tfm_motifs_cache_dir:
    motif_hdf5.close()

Metacluster 1/2

Pattern 1/11

4981 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/11

531 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/11

414 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/11

328 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/11

276 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/11

254 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/11

232 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/11

143 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/11

57 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/11

51 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/11

38 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/5

531 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/5

355 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/5

115 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/5

95 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/5

85 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Summary of motifs

Motifs are trimmed based on information content, and presented in descending order by number of supporting seqlets. The motifs are separated by metacluster. The motifs are presented as hCWMs. The forward orientation is defined as the orientation that is richer in purines.

In [9]:
colgroup = vdomh.colgroup(
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "45%"}),
    vdomh.col(style={"width": "45%"})
)
header = vdomh.thead(
    vdomh.tr(
        vdomh.th("#", style={"text-align": "center"}),
        vdomh.th("Seqlets", style={"text-align": "center"}),
        vdomh.th("Forward", style={"text-align": "center"}),
        vdomh.th("Reverse", style={"text-align": "center"})
    )
)

for i in range(len(motif_hcwms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    body = []
    for j in range(len(motif_hcwms[i])):
        motif = motif_hcwms[i][j]
        if np.sum(motif[:, [0, 2]]) > 0.5 * np.sum(motif):
            # Forward is purine-rich, reverse-complement is pyrimidine-rich
            f, rc = motif, np.flip(motif, axis=(0, 1))
        else:
            f, rc = np.flip(motif, axis=(0, 1)), motif
            
        f_fig = viz_sequence.plot_weights(f, figsize=(20, 4), return_fig=True)
        f_fig.tight_layout()
        rc_fig = viz_sequence.plot_weights(rc, figsize=(20, 4), return_fig=True)
        rc_fig.tight_layout()
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (i, j)
            f_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_fwd.png"))
            rc_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_rev.png"))

        body.append(
            vdomh.tr(
                vdomh.td(str(j + 1)),
                vdomh.td(str(num_seqlets[i][j])),
                vdomh.td(figure_to_vdom_image(f_fig)),
                vdomh.td(figure_to_vdom_image(rc_fig))
            )
        )
    display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
    plt.close("all")

Metacluster 1/2

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)
#SeqletsForwardReverse
14981
2531
3414
4328
5276
6254
7232
8143
957
1051
1138

Metacluster 2/2

#SeqletsForwardReverse
1531
2355
3115
495
585

Top TOMTOM matches for each motif

Here, the TF-MoDISco motifs are plotted as hCWMs, but the TOMTOM matches are shown as PWMs.

In [10]:
num_matches_to_keep = 10
num_matches_to_show = 5

header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Motif ID", style={"text-align": "center"}),
        vdomh.th("q-val", style={"text-align": "center"}),
        vdomh.th("PWM", style={"text-align": "center"})
    )
)

for i in range(len(motif_pfms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    
    # Compute TOMTOM matches for all motifs in the metacluster at once
    out_dir = os.path.join(tfm_motifs_cache_dir, "tomtom", "metacluster_%d" % i) if tfm_motifs_cache_dir else None
    tomtom_matches = match_motifs_to_database(motif_pfms_short[i], top_k=num_matches_to_keep, temp_dir=out_dir)
    
    for j in range(len(motif_pfms[i])):
        display(vdomh.h4("Motif %d/%d" % (j + 1, len(motif_pfms[i]))))
        viz_sequence.plot_weights(motif_hcwms[i][j])
    
        body = []
        for k, (match_name, match_pfm, match_qval) in enumerate(tomtom_matches[j]):
            fig = viz_sequence.plot_weights(pfm_to_pwm(match_pfm), return_fig=True)
            fig.tight_layout()
            if k < num_matches_to_show:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td(figure_to_vdom_image(fig))
                    )
                )
                if tfm_motifs_cache_dir:
                    # Save results and figures
                    motif_id = "%d_%d" % (i, j)
                    fig.savefig(os.path.join(out_dir, motif_id + ("_hit-%d.png" % (k + 1))))
            else:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td("Not shown")
                    )
                )
        if not body:
            display(vdomh.p("No TOMTOM matches passing threshold"))
        else:
            display(vdomh.table(header, vdomh.tbody(*body)))
        plt.close("all")

Metacluster 1/2

Motif 1/11

Motif IDq-valPWM
ELF2_HUMAN.H11MO.0.C2.39305e-06
MA0750.2_ZBTB7A2.39305e-06
ELF1_HUMAN.H11MO.0.A2.39305e-06
GABPA_HUMAN.H11MO.0.A2.39305e-06
ELK1_HUMAN.H11MO.0.B6.3896100000000005e-06
ETV1_HUMAN.H11MO.0.A6.3896100000000005e-06Not shown
MA0076.2_ELK46.3896100000000005e-06Not shown
MA0765.2_ETV56.3896100000000005e-06Not shown
ELK4_HUMAN.H11MO.0.A6.3896100000000005e-06Not shown
MA1483.1_ELF26.3896100000000005e-06Not shown

Motif 2/11

No TOMTOM matches passing threshold

Motif 3/11

Motif IDq-valPWM
MA0076.2_ELK40.0152365
ELK4_HUMAN.H11MO.0.A0.0342796
MA0765.2_ETV50.0342796
ELK1_HUMAN.H11MO.0.B0.0342796
MA0763.1_ETV30.045269199999999996
MA0760.1_ERF0.045269199999999996Not shown
MA0475.2_FLI10.045269199999999996Not shown
MA0098.3_ETS10.045269199999999996Not shown
ELF2_HUMAN.H11MO.0.C0.045269199999999996Not shown
GABPA_HUMAN.H11MO.0.A0.045269199999999996Not shown

Motif 4/11

Motif IDq-valPWM
ZN143_HUMAN.H11MO.0.A5.466080000000001e-22
ZNF76_HUMAN.H11MO.0.C3.6820000000000006e-21
THA11_HUMAN.H11MO.0.B3.1781e-17
MA1573.1_THAP112.55992e-09
MA0088.2_ZNF1430.00387518
STAT3_HUMAN.H11MO.0.A0.014699700000000001Not shown
MA1625.1_Stat5b0.0277894Not shown
MA0519.1_Stat5a::Stat5b0.0277894Not shown
HSF1_HUMAN.H11MO.1.A0.0787263Not shown
P63_HUMAN.H11MO.0.A0.0805287Not shown

Motif 5/11

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A3.5446e-05
SP1_HUMAN.H11MO.0.A0.000315928
USF2_HUMAN.H11MO.0.A0.000855802
SP3_HUMAN.H11MO.0.B0.000855802
CTCFL_HUMAN.H11MO.0.A0.00267372
CTCF_HUMAN.H11MO.0.A0.0043582000000000004Not shown
MXI1_HUMAN.H11MO.0.A0.00841177Not shown
KLF3_HUMAN.H11MO.0.B0.0105174Not shown
RFX1_HUMAN.H11MO.0.B0.016252000000000003Not shown
THAP1_HUMAN.H11MO.0.C0.0264638Not shown

Motif 6/11

No TOMTOM matches passing threshold

Motif 7/11

Motif IDq-valPWM
MA0759.1_ELK30.00405844
GABPA_HUMAN.H11MO.0.A0.00405844
MA0750.2_ZBTB7A0.00449037
MA1483.1_ELF20.00449037
MA0641.1_ELF40.00449037
ETV1_HUMAN.H11MO.0.A0.00449037Not shown
ELK1_HUMAN.H11MO.0.B0.00449037Not shown
ELF2_HUMAN.H11MO.0.C0.00449037Not shown
MA0076.2_ELK40.00449037Not shown
MA0760.1_ERF0.00449037Not shown

Motif 8/11

Motif IDq-valPWM
THA11_HUMAN.H11MO.0.B2.08925e-13
ZNF76_HUMAN.H11MO.0.C3.3564699999999997e-12
ZN143_HUMAN.H11MO.0.A7.62987e-12
MA1573.1_THAP111.3749700000000001e-08
P63_HUMAN.H11MO.0.A0.044465
SP2_HUMAN.H11MO.0.A0.111194Not shown
SP1_HUMAN.H11MO.0.A0.111194Not shown
USF2_HUMAN.H11MO.0.A0.111194Not shown
MA1513.1_KLF150.177369Not shown
MA0525.2_TP630.177369Not shown

Motif 9/11

Motif IDq-valPWM
MA0765.2_ETV50.00385858
MA0076.2_ELK40.00385858
MA0750.2_ZBTB7A0.00550935
ELK4_HUMAN.H11MO.0.A0.00618399
ETV1_HUMAN.H11MO.0.A0.00948098
ELK1_HUMAN.H11MO.0.B0.023752900000000004Not shown
FEV_HUMAN.H11MO.0.B0.0304768Not shown
SP2_HUMAN.H11MO.0.A0.050596800000000004Not shown
MA0474.2_ERG0.05173919999999999Not shown
MA0475.2_FLI10.054994600000000005Not shown

Motif 10/11

Motif IDq-valPWM
SP3_HUMAN.H11MO.0.B4.3854300000000005e-05
SP2_HUMAN.H11MO.0.A6.23735e-05
SP1_HUMAN.H11MO.0.A0.000291655
PATZ1_HUMAN.H11MO.0.C0.00347398
MAZ_HUMAN.H11MO.0.A0.00347398
KLF16_HUMAN.H11MO.0.D0.00347398Not shown
VEZF1_HUMAN.H11MO.0.C0.00363204Not shown
TBX15_HUMAN.H11MO.0.D0.006030399999999999Not shown
ZFX_HUMAN.H11MO.1.A0.006030399999999999Not shown
KLF3_HUMAN.H11MO.0.B0.00738735Not shown

Motif 11/11

Motif IDq-valPWM
ETV2_HUMAN.H11MO.0.B0.00026581
MA0598.3_EHF0.000537236
GABPA_HUMAN.H11MO.0.A0.000537236
MA0473.3_ELF10.000703665
ELF2_HUMAN.H11MO.0.C0.000703665
ELF1_HUMAN.H11MO.0.A0.000703665Not shown
MA1418.1_IRF30.000703665Not shown
MA0076.2_ELK40.00113114Not shown
ELF5_HUMAN.H11MO.0.A0.00127348Not shown
MA1508.1_IKZF10.00158386Not shown

Metacluster 2/2

Motif 1/5

Motif IDq-valPWM
GABPA_HUMAN.H11MO.0.A3.15977e-05
ETV1_HUMAN.H11MO.0.A7.02097e-05
MA0076.2_ELK47.02097e-05
ELF2_HUMAN.H11MO.0.C7.02097e-05
ELF1_HUMAN.H11MO.0.A7.02097e-05
MA0765.2_ETV50.000199327Not shown
ELK4_HUMAN.H11MO.0.A0.000199327Not shown
MA1483.1_ELF20.000199327Not shown
MA0750.2_ZBTB7A0.00020670900000000002Not shown
MA0028.2_ELK10.000440016Not shown

Motif 2/5

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A4.2248999999999997e-08
SP1_HUMAN.H11MO.0.A4.62969e-06
SP3_HUMAN.H11MO.0.B5.156689999999999e-06
SP1_HUMAN.H11MO.1.A0.000242033
KLF3_HUMAN.H11MO.0.B0.0006429809999999999
ZFX_HUMAN.H11MO.1.A0.00343044Not shown
USF2_HUMAN.H11MO.0.A0.00343044Not shown
MA0146.2_Zfx0.00343044Not shown
MA1650.1_ZBTB140.00381165Not shown
MA1513.1_KLF150.00571231Not shown

Motif 3/5

Motif IDq-valPWM
ELK4_HUMAN.H11MO.0.A9.05233e-06
SP2_HUMAN.H11MO.0.A9.05233e-06
MA0765.2_ETV52.4765500000000002e-05
MA0076.2_ELK42.4765500000000002e-05
ETV1_HUMAN.H11MO.0.A2.55446e-05
SP1_HUMAN.H11MO.0.A3.5880700000000004e-05Not shown
SP3_HUMAN.H11MO.0.B3.5880700000000004e-05Not shown
MA0750.2_ZBTB7A3.9479e-05Not shown
GABPA_HUMAN.H11MO.0.A0.000221311Not shown
ELK1_HUMAN.H11MO.0.B0.0008300139999999999Not shown

Motif 4/5

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A3.55901e-06
SP3_HUMAN.H11MO.0.B4.6508800000000004e-05
SP1_HUMAN.H11MO.0.A6.56198e-05
SP1_HUMAN.H11MO.1.A0.00210817
KLF3_HUMAN.H11MO.0.B0.00256937
USF2_HUMAN.H11MO.0.A0.00435215Not shown
THAP1_HUMAN.H11MO.0.C0.0117738Not shown
RFX1_HUMAN.H11MO.0.B0.0117738Not shown
KLF16_HUMAN.H11MO.0.D0.0118104Not shown
MA1513.1_KLF150.0118104Not shown

Motif 5/5

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A2.21299e-07
SP1_HUMAN.H11MO.0.A6.780360000000001e-06
SP3_HUMAN.H11MO.0.B1.0825099999999999e-05
KLF16_HUMAN.H11MO.0.D0.00137998
MA1513.1_KLF150.00144176
TBX15_HUMAN.H11MO.0.D0.00161223Not shown
USF2_HUMAN.H11MO.0.A0.00164557Not shown
SP1_HUMAN.H11MO.1.A0.00164557Not shown
THAP1_HUMAN.H11MO.0.C0.00164557Not shown
KLF3_HUMAN.H11MO.0.B0.00164557Not shown