In [1]:
import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, pfm_to_pwm, trim_motif_by_ic
from motif.match_motifs import match_motifs_to_database
from util import figure_to_vdom_image
import plot.viz_sequence as viz_sequence
import numpy as np
import h5py
import matplotlib.pyplot as plt
import vdom.helpers as vdomh
from IPython.display import display

Define constants and paths

In [2]:
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_MOTIF_CACHE" in os.environ:
    tfm_motifs_cache_dir = os.environ["TFM_MOTIF_CACHE"]
else:
    tfm_motifs_cache_dir = None

print("TF name: %s" % tf_name)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved TF-MoDISco-derived motifs cache: %s" % tfm_motifs_cache_dir)
TF name: REST
DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/multitask_profile/REST_multitask_profile_fold4/REST_multitask_profile_fold4_imp_scores.h5
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/multitask_profile/REST_multitask_profile_fold4/REST_multitask_profile_fold4_profile_tfm.h5
Importance score key: profile_hyp_scores
Saved TF-MoDISco-derived motifs cache: /users/amtseng/tfmodisco/results/reports/tfmodisco_results//cache/multitask_profile/REST_multitask_profile_fold4/REST_multitask_profile_fold4_profile
In [3]:
# Define paths and constants
input_length = 2114
shap_score_center_size = 400
In [4]:
if tfm_motifs_cache_dir:
    os.makedirs(tfm_motifs_cache_dir, exist_ok=True)

Import SHAP scores and TF-MoDISco results

In [5]:
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 287/287 [08:30<00:00,  1.78s/it]
In [6]:
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

Plot some SHAP score tracks

Plot the central region of some randomly selected actual importance scores

In [7]:
plot_slice = slice(int(shap_score_center_size / 4), int(3 * shap_score_center_size / 4))
for index in np.random.choice(hyp_scores.shape[0], size=5, replace=False):
    viz_sequence.plot_weights((hyp_scores[index] * one_hot_seqs[index])[plot_slice], subticks_frequency=100)

Plot TF-MoDISco results

Plot all motifs by metacluster

In [8]:
motif_pfms, motif_hcwms, motif_cwms = [], [], []  # Save the trimmed PFMs, hCWMs, and CWMs
motif_pfms_short = []  # PFMs that are even more trimmed (for TOMTOM)
num_seqlets = []  # Number of seqlets for each motif
motif_seqlets = []  # Save seqlets of each motif
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
if tfm_motifs_cache_dir:
    motif_hdf5 = h5py.File(os.path.join(tfm_motifs_cache_dir, "all_motifs.h5"), "w")
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    motif_pfms.append([])
    motif_hcwms.append([])
    motif_cwms.append([])
    motif_pfms_short.append([])
    num_seqlets.append([])
    motif_seqlets.append([])
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
        display(vdomh.p("%d seqlets" % len(seqlets)))
        
        pfm = pattern["sequence"].fwd
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        cwm = pattern["task0_contrib_scores"].fwd
        
        pfm_fig = viz_sequence.plot_weights(pfm, subticks_frequency=10, return_fig=True)
        hcwm_fig = viz_sequence.plot_weights(hcwm, subticks_frequency=10, return_fig=True)
        cwm_fig = viz_sequence.plot_weights(cwm, subticks_frequency=10, return_fig=True)
        pfm_fig.tight_layout()
        hcwm_fig.tight_layout()
        cwm_fig.tight_layout()
        
        motif_table = vdomh.table(
            vdomh.tr(
                vdomh.td("Sequence (PFM)"),
                vdomh.td(figure_to_vdom_image(pfm_fig))
            ),
            vdomh.tr(
                vdomh.td("Hypothetical contributions (hCWM)"),
                vdomh.td(figure_to_vdom_image(hcwm_fig))
            ),
            vdomh.tr(
                vdomh.td("Actual contributions (CWM)"),
                vdomh.td(figure_to_vdom_image(cwm_fig))
            )
        )
        display(motif_table)
        plt.close("all")  # Remove all standing figures
        
        # Trim motif based on information content
        short_trimmed_pfm = trim_motif_by_ic(pfm, pfm)
        motif_pfms_short[-1].append(short_trimmed_pfm)
        
        # Expand trimming to +/- 4bp on either side
        trimmed_pfm = trim_motif_by_ic(pfm, pfm, pad=4)
        trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
        trimmed_cwm = trim_motif_by_ic(pfm, cwm, pad=4)
        
        motif_pfms[-1].append(trimmed_pfm)
        motif_hcwms[-1].append(trimmed_hcwm)
        motif_cwms[-1].append(trimmed_cwm)
        
        num_seqlets[-1].append(len(seqlets))
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (metacluster_i, pattern_i)
            pfm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_pfm_full.png"))
            hcwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_full.png"))
            cwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_cwm_full.png"))
            motif_dset = motif_hdf5.create_group(motif_id)
            motif_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
            motif_dset.create_dataset("pfm_trimmed", data=trimmed_pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_trimmed", data=trimmed_cwm, compression="gzip")
            motif_dset.create_dataset("pfm_short_trimmed", data=short_trimmed_pfm, compression="gzip")
if tfm_motifs_cache_dir:
    motif_hdf5.close()

Metacluster 1/2

Pattern 1/14

4556 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/14

1955 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/14

1092 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/14

538 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/14

318 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/14

250 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/14

232 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/14

175 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/14

156 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 10/14

114 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 11/14

110 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 12/14

69 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 13/14

66 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 14/14

51 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/6

40 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/6

40 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/6

38 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/6

38 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/6

38 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/6

33 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Summary of motifs

Motifs are trimmed based on information content, and presented in descending order by number of supporting seqlets. The motifs are separated by metacluster. The motifs are presented as hCWMs. The forward orientation is defined as the orientation that is richer in purines.

In [9]:
colgroup = vdomh.colgroup(
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "45%"}),
    vdomh.col(style={"width": "45%"})
)
header = vdomh.thead(
    vdomh.tr(
        vdomh.th("#", style={"text-align": "center"}),
        vdomh.th("Seqlets", style={"text-align": "center"}),
        vdomh.th("Forward", style={"text-align": "center"}),
        vdomh.th("Reverse", style={"text-align": "center"})
    )
)

for i in range(len(motif_hcwms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    body = []
    for j in range(len(motif_hcwms[i])):
        motif = motif_hcwms[i][j]
        if np.sum(motif[:, [0, 2]]) > 0.5 * np.sum(motif):
            # Forward is purine-rich, reverse-complement is pyrimidine-rich
            f, rc = motif, np.flip(motif, axis=(0, 1))
        else:
            f, rc = np.flip(motif, axis=(0, 1)), motif
            
        f_fig = viz_sequence.plot_weights(f, figsize=(20, 4), return_fig=True)
        f_fig.tight_layout()
        rc_fig = viz_sequence.plot_weights(rc, figsize=(20, 4), return_fig=True)
        rc_fig.tight_layout()
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (i, j)
            f_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_fwd.png"))
            rc_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_rev.png"))

        body.append(
            vdomh.tr(
                vdomh.td(str(j + 1)),
                vdomh.td(str(num_seqlets[i][j])),
                vdomh.td(figure_to_vdom_image(f_fig)),
                vdomh.td(figure_to_vdom_image(rc_fig))
            )
        )
    display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
    plt.close("all")

Metacluster 1/2

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)
#SeqletsForwardReverse
14556
21955
31092
4538
5318
6250
7232
8175
9156
10114
11110
1269
1366
1451

Metacluster 2/2

#SeqletsForwardReverse
140
240
338
438
538
633

Top TOMTOM matches for each motif

Here, the TF-MoDISco motifs are plotted as hCWMs, but the TOMTOM matches are shown as PWMs.

In [10]:
num_matches_to_keep = 10
num_matches_to_show = 5

header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Motif ID", style={"text-align": "center"}),
        vdomh.th("q-val", style={"text-align": "center"}),
        vdomh.th("PWM", style={"text-align": "center"})
    )
)

for i in range(len(motif_pfms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    
    # Compute TOMTOM matches for all motifs in the metacluster at once
    out_dir = os.path.join(tfm_motifs_cache_dir, "tomtom", "metacluster_%d" % i) if tfm_motifs_cache_dir else None
    tomtom_matches = match_motifs_to_database(motif_pfms_short[i], top_k=num_matches_to_keep, temp_dir=out_dir)
    
    for j in range(len(motif_pfms[i])):
        display(vdomh.h4("Motif %d/%d" % (j + 1, len(motif_pfms[i]))))
        viz_sequence.plot_weights(motif_hcwms[i][j])
    
        body = []
        for k, (match_name, match_pfm, match_qval) in enumerate(tomtom_matches[j]):
            fig = viz_sequence.plot_weights(pfm_to_pwm(match_pfm), return_fig=True)
            fig.tight_layout()
            if k < num_matches_to_show:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td(figure_to_vdom_image(fig))
                    )
                )
                if tfm_motifs_cache_dir:
                    # Save results and figures
                    motif_id = "%d_%d" % (i, j)
                    fig.savefig(os.path.join(out_dir, motif_id + ("_hit-%d.png" % (k + 1))))
            else:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td("Not shown")
                    )
                )
        if not body:
            display(vdomh.p("No TOMTOM matches passing threshold"))
        else:
            display(vdomh.table(header, vdomh.tbody(*body)))
        plt.close("all")

Metacluster 1/2

Motif 1/14

Motif IDq-valPWM
MA0138.2_REST4.66321e-18
REST_HUMAN.H11MO.0.A7.921689999999999e-16

Motif 2/14

Motif IDq-valPWM
MA0138.2_REST0.050937699999999995
REST_HUMAN.H11MO.0.A0.058728499999999996

Motif 3/14

No TOMTOM matches passing threshold

Motif 4/14

Motif IDq-valPWM
CTCFL_HUMAN.H11MO.0.A1.72028e-11
CTCF_HUMAN.H11MO.0.A1.20228e-10
MA0139.1_CTCF1.2932e-10
MA1102.2_CTCFL4.167350000000001e-05
MA1568.1_TCF21(var.2)0.16713599999999998
SNAI1_HUMAN.H11MO.0.C0.16713599999999998Not shown
MA1638.1_HAND20.169498Not shown
MA1628.1_Zic1::Zic20.381894Not shown
ZIC3_HUMAN.H11MO.0.B0.381894Not shown
MA0155.1_INSM10.381894Not shown

Motif 5/14

Motif IDq-valPWM
MA0138.2_REST0.289509
REST_HUMAN.H11MO.0.A0.302087

Motif 6/14

Motif IDq-valPWM
REST_HUMAN.H11MO.0.A0.004888399999999999
MA0138.2_REST0.004888399999999999
TBX20_HUMAN.H11MO.0.D0.220797

Motif 7/14

Motif IDq-valPWM
REST_HUMAN.H11MO.0.A0.08602439999999999
MA0138.2_REST0.161618

Motif 8/14

Motif IDq-valPWM
MA0138.2_REST0.16902
ZN335_HUMAN.H11MO.0.A0.16902
MA1572.1_TGIF2LY0.16902
MA0774.1_MEIS20.16902
MA1467.1_ATOH1(var.2)0.16902
MA1571.1_TGIF2LX0.16902Not shown
MA0797.1_TGIF20.16902Not shown
MA0782.2_PKNOX10.16902Not shown
TGIF2_HUMAN.H11MO.0.D0.16902Not shown
REST_HUMAN.H11MO.0.A0.16902Not shown

Motif 9/14

No TOMTOM matches passing threshold

Motif 10/14

No TOMTOM matches passing threshold

Motif 11/14

Motif IDq-valPWM
MA1631.1_ASCL1(var.2)0.06211269999999999
MA0830.2_TCF40.20520700000000003
MYOD1_HUMAN.H11MO.0.A0.38345799999999997

Motif 12/14

Motif IDq-valPWM
CPEB1_HUMAN.H11MO.0.D4.1519899999999995e-05
MA1125.1_ZNF3840.026507599999999996
PRDM6_HUMAN.H11MO.0.C0.026507599999999996
FOXL1_HUMAN.H11MO.0.D0.0478382
FOXG1_HUMAN.H11MO.0.D0.0773826
MA0679.2_ONECUT10.11909700000000001Not shown
HXC10_HUMAN.H11MO.0.D0.11909700000000001Not shown
ANDR_HUMAN.H11MO.0.A0.119174Not shown
FOXJ3_HUMAN.H11MO.0.A0.133784Not shown
ONEC2_HUMAN.H11MO.0.D0.171216Not shown

Motif 13/14

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A9.092450000000001e-10
SP3_HUMAN.H11MO.0.B2.13577e-09
SP1_HUMAN.H11MO.0.A4.71127e-09
KLF16_HUMAN.H11MO.0.D1.00565e-07
TBX15_HUMAN.H11MO.0.D1.69285e-06
PATZ1_HUMAN.H11MO.0.C1.69285e-06Not shown
MAZ_HUMAN.H11MO.0.A1.2334e-05Not shown
WT1_HUMAN.H11MO.0.C1.63821e-05Not shown
ZN467_HUMAN.H11MO.0.C1.63821e-05Not shown
KLF3_HUMAN.H11MO.0.B2.3741e-05Not shown

Motif 14/14

Motif IDq-valPWM
TBX20_HUMAN.H11MO.0.D0.287746

Metacluster 2/2

Motif 1/6

Motif IDq-valPWM
AP2B_HUMAN.H11MO.0.B0.011088899999999999
SP2_HUMAN.H11MO.0.A0.011682600000000001
SP1_HUMAN.H11MO.0.A0.047226199999999996
RFX1_HUMAN.H11MO.0.B0.047226199999999996
MA0146.2_Zfx0.047226199999999996
ZF64A_HUMAN.H11MO.0.D0.0551099Not shown
ZFX_HUMAN.H11MO.1.A0.0551099Not shown
SP3_HUMAN.H11MO.0.B0.0637789Not shown
PATZ1_HUMAN.H11MO.0.C0.108839Not shown
AP2D_HUMAN.H11MO.0.D0.14011300000000002Not shown

Motif 2/6

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A0.00206717
SP3_HUMAN.H11MO.0.B0.00255864
SP1_HUMAN.H11MO.0.A0.00255864
MA1615.1_Plagl10.0113958
MXI1_HUMAN.H11MO.0.A0.019131799999999997
ZFX_HUMAN.H11MO.1.A0.0214546Not shown
NR1H4_HUMAN.H11MO.0.B0.028169599999999996Not shown
RFX1_HUMAN.H11MO.0.B0.043024400000000004Not shown
MBD2_HUMAN.H11MO.0.B0.050305199999999994Not shown
MA1650.1_ZBTB140.05214149999999999Not shown

Motif 3/6

Motif IDq-valPWM
SP1_HUMAN.H11MO.0.A0.09055260000000001
SP1_HUMAN.H11MO.1.A0.101775
MA0750.2_ZBTB7A0.101775
NR1H4_HUMAN.H11MO.0.B0.101775
MA0146.2_Zfx0.1579
AP2B_HUMAN.H11MO.0.B0.183572Not shown
AP2D_HUMAN.H11MO.0.D0.185339Not shown
SP2_HUMAN.H11MO.0.A0.199657Not shown
MA0814.2_TFAP2C(var.2)0.23289400000000002Not shown
ELK4_HUMAN.H11MO.0.A0.3797Not shown

Motif 4/6

Motif IDq-valPWM
USF2_HUMAN.H11MO.0.A0.0379429
MA1529.1_NHLH20.0379429
HEN1_HUMAN.H11MO.0.C0.0379429
MA0522.3_TCF30.06795110000000001
SP2_HUMAN.H11MO.0.A0.06795110000000001
MA0697.1_ZIC30.06795110000000001Not shown
MA0103.3_ZEB10.06795110000000001Not shown
ITF2_HUMAN.H11MO.0.C0.06795110000000001Not shown
MA0830.2_TCF40.0711289Not shown
SP3_HUMAN.H11MO.0.B0.0711289Not shown

Motif 5/6

Motif IDq-valPWM
MA0830.2_TCF40.060185199999999994
MYOD1_HUMAN.H11MO.0.A0.114619
MA1631.1_ASCL1(var.2)0.114619
MA1100.2_ASCL10.186649
CTCFL_HUMAN.H11MO.0.A0.186649
ASCL1_HUMAN.H11MO.0.A0.186649Not shown
ZFX_HUMAN.H11MO.1.A0.186649Not shown
BHA15_HUMAN.H11MO.0.B0.186649Not shown
MA0500.2_MYOG0.186649Not shown
SP2_HUMAN.H11MO.0.A0.186649Not shown

Motif 6/6

Motif IDq-valPWM
SP2_HUMAN.H11MO.0.A0.025970799999999995
SP1_HUMAN.H11MO.0.A0.0328596
SP3_HUMAN.H11MO.0.B0.0449228
ZF64A_HUMAN.H11MO.0.D0.060804899999999995
THAP1_HUMAN.H11MO.0.C0.060804899999999995
KLF3_HUMAN.H11MO.0.B0.060804899999999995Not shown
MA1650.1_ZBTB140.0797125Not shown
KLF16_HUMAN.H11MO.0.D0.100597Not shown
USF2_HUMAN.H11MO.0.A0.104272Not shown
ZFX_HUMAN.H11MO.1.A0.10768399999999999Not shown