In [1]:
import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, pfm_to_pwm, trim_motif_by_ic
from motif.match_motifs import match_motifs_to_database
from util import figure_to_vdom_image
import plot.viz_sequence as viz_sequence
import numpy as np
import h5py
import matplotlib.pyplot as plt
import vdom.helpers as vdomh
from IPython.display import display

Define constants and paths

In [2]:
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_MOTIF_CACHE" in os.environ:
    tfm_motifs_cache_dir = os.environ["TFM_MOTIF_CACHE"]
else:
    tfm_motifs_cache_dir = None

print("TF name: %s" % tf_name)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved TF-MoDISco-derived motifs cache: %s" % tfm_motifs_cache_dir)
TF name: SPI1
DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/multitask_profile/SPI1_multitask_profile_fold4/SPI1_multitask_profile_fold4_imp_scores.h5
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/multitask_profile/SPI1_multitask_profile_fold4/SPI1_multitask_profile_fold4_count_tfm.h5
Importance score key: count_hyp_scores
Saved TF-MoDISco-derived motifs cache: /users/amtseng/tfmodisco/results/reports/tfmodisco_results//cache/multitask_profile/SPI1_multitask_profile_fold4/SPI1_multitask_profile_fold4_count
In [3]:
# Define paths and constants
input_length = 2114
shap_score_center_size = 400
In [4]:
if tfm_motifs_cache_dir:
    os.makedirs(tfm_motifs_cache_dir, exist_ok=True)

Import SHAP scores and TF-MoDISco results

In [5]:
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 194/194 [04:44<00:00,  1.47s/it]
In [6]:
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

Plot some SHAP score tracks

Plot the central region of some randomly selected actual importance scores

In [7]:
plot_slice = slice(int(shap_score_center_size / 4), int(3 * shap_score_center_size / 4))
for index in np.random.choice(hyp_scores.shape[0], size=5, replace=False):
    viz_sequence.plot_weights((hyp_scores[index] * one_hot_seqs[index])[plot_slice], subticks_frequency=100)

Plot TF-MoDISco results

Plot all motifs by metacluster

In [8]:
motif_pfms, motif_hcwms, motif_cwms = [], [], []  # Save the trimmed PFMs, hCWMs, and CWMs
motif_pfms_short = []  # PFMs that are even more trimmed (for TOMTOM)
num_seqlets = []  # Number of seqlets for each motif
motif_seqlets = []  # Save seqlets of each motif
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
if tfm_motifs_cache_dir:
    motif_hdf5 = h5py.File(os.path.join(tfm_motifs_cache_dir, "all_motifs.h5"), "w")
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        break
    motif_pfms.append([])
    motif_hcwms.append([])
    motif_cwms.append([])
    motif_pfms_short.append([])
    num_seqlets.append([])
    motif_seqlets.append([])
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
        display(vdomh.p("%d seqlets" % len(seqlets)))
        
        pfm = pattern["sequence"].fwd
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        cwm = pattern["task0_contrib_scores"].fwd
        
        pfm_fig = viz_sequence.plot_weights(pfm, subticks_frequency=10, return_fig=True)
        hcwm_fig = viz_sequence.plot_weights(hcwm, subticks_frequency=10, return_fig=True)
        cwm_fig = viz_sequence.plot_weights(cwm, subticks_frequency=10, return_fig=True)
        pfm_fig.tight_layout()
        hcwm_fig.tight_layout()
        cwm_fig.tight_layout()
        
        motif_table = vdomh.table(
            vdomh.tr(
                vdomh.td("Sequence (PFM)"),
                vdomh.td(figure_to_vdom_image(pfm_fig))
            ),
            vdomh.tr(
                vdomh.td("Hypothetical contributions (hCWM)"),
                vdomh.td(figure_to_vdom_image(hcwm_fig))
            ),
            vdomh.tr(
                vdomh.td("Actual contributions (CWM)"),
                vdomh.td(figure_to_vdom_image(cwm_fig))
            )
        )
        display(motif_table)
        plt.close("all")  # Remove all standing figures
        
        # Trim motif based on information content
        short_trimmed_pfm = trim_motif_by_ic(pfm, pfm)
        motif_pfms_short[-1].append(short_trimmed_pfm)
        
        # Expand trimming to +/- 4bp on either side
        trimmed_pfm = trim_motif_by_ic(pfm, pfm, pad=4)
        trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
        trimmed_cwm = trim_motif_by_ic(pfm, cwm, pad=4)
        
        motif_pfms[-1].append(trimmed_pfm)
        motif_hcwms[-1].append(trimmed_hcwm)
        motif_cwms[-1].append(trimmed_cwm)
        
        num_seqlets[-1].append(len(seqlets))
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (metacluster_i, pattern_i)
            pfm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_pfm_full.png"))
            hcwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_full.png"))
            cwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_cwm_full.png"))
            motif_dset = motif_hdf5.create_group(motif_id)
            motif_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
            motif_dset.create_dataset("pfm_trimmed", data=trimmed_pfm, compression="gzip")
            motif_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
            motif_dset.create_dataset("cwm_trimmed", data=trimmed_cwm, compression="gzip")
            motif_dset.create_dataset("pfm_short_trimmed", data=short_trimmed_pfm, compression="gzip")
if tfm_motifs_cache_dir:
    motif_hdf5.close()

Metacluster 1/2

Pattern 1/8

14384 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/8

245 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/8

149 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/8

147 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/8

68 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/8

57 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/8

50 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/8

38 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/8

79 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/8

71 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/8

64 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/8

63 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/8

61 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/8

48 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/8

45 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/8

31 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Summary of motifs

Motifs are trimmed based on information content, and presented in descending order by number of supporting seqlets. The motifs are separated by metacluster. The motifs are presented as hCWMs. The forward orientation is defined as the orientation that is richer in purines.

In [9]:
colgroup = vdomh.colgroup(
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "5%"}),
    vdomh.col(style={"width": "45%"}),
    vdomh.col(style={"width": "45%"})
)
header = vdomh.thead(
    vdomh.tr(
        vdomh.th("#", style={"text-align": "center"}),
        vdomh.th("Seqlets", style={"text-align": "center"}),
        vdomh.th("Forward", style={"text-align": "center"}),
        vdomh.th("Reverse", style={"text-align": "center"})
    )
)

for i in range(len(motif_hcwms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    body = []
    for j in range(len(motif_hcwms[i])):
        motif = motif_hcwms[i][j]
        if np.sum(motif[:, [0, 2]]) > 0.5 * np.sum(motif):
            # Forward is purine-rich, reverse-complement is pyrimidine-rich
            f, rc = motif, np.flip(motif, axis=(0, 1))
        else:
            f, rc = np.flip(motif, axis=(0, 1)), motif
            
        f_fig = viz_sequence.plot_weights(f, figsize=(20, 4), return_fig=True)
        f_fig.tight_layout()
        rc_fig = viz_sequence.plot_weights(rc, figsize=(20, 4), return_fig=True)
        rc_fig.tight_layout()
        
        if tfm_motifs_cache_dir:
            # Save results and figures
            motif_id = "%d_%d" % (i, j)
            f_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_fwd.png"))
            rc_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_trimmed_rev.png"))

        body.append(
            vdomh.tr(
                vdomh.td(str(j + 1)),
                vdomh.td(str(num_seqlets[i][j])),
                vdomh.td(figure_to_vdom_image(f_fig)),
                vdomh.td(figure_to_vdom_image(rc_fig))
            )
        )
    display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
    plt.close("all")

Metacluster 1/2

#SeqletsForwardReverse
114384
2245
3149
4147
568
657
750
838

Metacluster 2/2

#SeqletsForwardReverse
179
271
364
463
561
648
745
831

Top TOMTOM matches for each motif

Here, the TF-MoDISco motifs are plotted as hCWMs, but the TOMTOM matches are shown as PWMs.

In [10]:
num_matches_to_keep = 10
num_matches_to_show = 5

header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Motif ID", style={"text-align": "center"}),
        vdomh.th("q-val", style={"text-align": "center"}),
        vdomh.th("PWM", style={"text-align": "center"})
    )
)

for i in range(len(motif_pfms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))
    
    # Compute TOMTOM matches for all motifs in the metacluster at once
    out_dir = os.path.join(tfm_motifs_cache_dir, "tomtom", "metacluster_%d" % i) if tfm_motifs_cache_dir else None
    tomtom_matches = match_motifs_to_database(motif_pfms_short[i], top_k=num_matches_to_keep, temp_dir=out_dir)
    
    for j in range(len(motif_pfms[i])):
        display(vdomh.h4("Motif %d/%d" % (j + 1, len(motif_pfms[i]))))
        viz_sequence.plot_weights(motif_hcwms[i][j])
    
        body = []
        for k, (match_name, match_pfm, match_qval) in enumerate(tomtom_matches[j]):
            fig = viz_sequence.plot_weights(pfm_to_pwm(match_pfm), return_fig=True)
            fig.tight_layout()
            if k < num_matches_to_show:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td(figure_to_vdom_image(fig))
                    )
                )
                if tfm_motifs_cache_dir:
                    # Save results and figures
                    motif_id = "%d_%d" % (i, j)
                    fig.savefig(os.path.join(out_dir, motif_id + ("_hit-%d.png" % (k + 1))))
            else:
                body.append(
                    vdomh.tr(
                        vdomh.td(match_name),
                        vdomh.td(str(match_qval)),
                        vdomh.td("Not shown")
                    )
                )
        if not body:
            display(vdomh.p("No TOMTOM matches passing threshold"))
        else:
            display(vdomh.table(header, vdomh.tbody(*body)))
        plt.close("all")

Metacluster 1/2

Motif 1/8

Motif IDq-valPWM
SPIB_HUMAN.H11MO.0.A1.4273899999999999e-16
MA0081.2_SPIB1.46925e-14
SPI1_HUMAN.H11MO.0.A8.8885e-13
BC11A_HUMAN.H11MO.0.A1.44339e-10
IRF4_HUMAN.H11MO.0.A9.1468e-10
IRF8_HUMAN.H11MO.0.B9.800139999999998e-10Not shown
MA0080.5_SPI19.800139999999998e-10Not shown
MA0761.2_ETV16.26379e-05Not shown
MA0062.3_GABPA0.00011908100000000001Not shown
MA0598.3_EHF0.00055591Not shown

Motif 2/8

No TOMTOM matches passing threshold

Motif 3/8

Motif IDq-valPWM
HXC10_HUMAN.H11MO.0.D0.011982099999999999
PO3F3_HUMAN.H11MO.0.D0.0918779
MA0679.2_ONECUT10.0918779
MA0681.2_PHOX2B0.111902
LMX1B_HUMAN.H11MO.0.D0.17591800000000002
CPEB1_HUMAN.H11MO.0.D0.17591800000000002Not shown
MA0645.1_ETV60.17591800000000002Not shown
MA0866.1_SOX210.17591800000000002Not shown
MA0500.2_MYOG0.17591800000000002Not shown
LMX1A_HUMAN.H11MO.0.D0.17591800000000002Not shown

Motif 4/8

Motif IDq-valPWM
MA1655.1_ZNF3410.0784616
MA1623.1_Stat20.09972
MYB_HUMAN.H11MO.0.A0.09972
ZIM3_HUMAN.H11MO.0.C0.131401
MA1420.1_IRF50.21723699999999999
MA0653.1_IRF90.21723699999999999Not shown
MA1419.1_IRF40.21723699999999999Not shown
MA0652.1_IRF80.21723699999999999Not shown
MA1119.1_SIX20.21723699999999999Not shown
MYBA_HUMAN.H11MO.0.D0.21723699999999999Not shown

Motif 5/8

Motif IDq-valPWM
MA0080.5_SPI10.27450399999999997
PO3F2_HUMAN.H11MO.0.A0.39520500000000003
ZFP82_HUMAN.H11MO.0.C0.39520500000000003
VEZF1_HUMAN.H11MO.0.C0.39520500000000003
ETS2_HUMAN.H11MO.0.B0.397805
MZF1_HUMAN.H11MO.0.B0.397805Not shown
IRF3_HUMAN.H11MO.0.B0.397805Not shown
MA1104.2_GATA60.397805Not shown
SOX2_HUMAN.H11MO.0.A0.397805Not shown
ZN467_HUMAN.H11MO.0.C0.47650200000000004Not shown

Motif 6/8

Motif IDq-valPWM
ELF1_HUMAN.H11MO.0.A0.10688299999999999
MA0641.1_ELF40.10688299999999999
MA1483.1_ELF20.10688299999999999
ETV5_HUMAN.H11MO.0.C0.140926
MA0136.2_ELF50.187203
ELK3_HUMAN.H11MO.0.D0.187203Not shown
ZN467_HUMAN.H11MO.0.C0.187203Not shown
MA0081.2_SPIB0.187203Not shown
ELF2_HUMAN.H11MO.0.C0.187203Not shown
ZFP82_HUMAN.H11MO.0.C0.187203Not shown

Motif 7/8

Motif IDq-valPWM
MA0080.5_SPI12.0094999999999998e-07
SPI1_HUMAN.H11MO.0.A2.6171e-06
SPIB_HUMAN.H11MO.0.A4.74964e-06
MA0081.2_SPIB2.78697e-05
BC11A_HUMAN.H11MO.0.A2.78697e-05
IRF3_HUMAN.H11MO.0.B0.00040093400000000003Not shown
MA0687.1_SPIC0.00105816Not shown
IRF8_HUMAN.H11MO.0.B0.00521961Not shown
ETV5_HUMAN.H11MO.0.C0.00650231Not shown
ETS2_HUMAN.H11MO.0.B0.00650231Not shown

Motif 8/8

Motif IDq-valPWM
IRF8_HUMAN.H11MO.0.B0.0008843089999999999
SPIB_HUMAN.H11MO.0.A0.0008843089999999999
SPI1_HUMAN.H11MO.0.A0.0008843089999999999
BC11A_HUMAN.H11MO.0.A0.000906488
IRF4_HUMAN.H11MO.0.A0.000906488
ZN467_HUMAN.H11MO.0.C0.00138696Not shown
MA0598.3_EHF0.00138696Not shown
ELF5_HUMAN.H11MO.0.A0.00138696Not shown
ERG_HUMAN.H11MO.0.A0.00138696Not shown
MA0081.2_SPIB0.00138696Not shown

Metacluster 2/2

Motif 1/8

Motif IDq-valPWM
MA0783.1_PKNOX20.27071999999999996
MA0090.3_TEAD10.27071999999999996
MA0499.2_MYOD10.27071999999999996
MA0820.1_FIGLA0.36772
FIGLA_HUMAN.H11MO.0.D0.36772
ASCL1_HUMAN.H11MO.0.A0.379973Not shown
MA0775.1_MEIS30.379973Not shown
MA1105.2_GRHL20.398831Not shown
MEIS3_HUMAN.H11MO.0.D0.398831Not shown
MA1620.1_Ptf1a(var.3)0.398831Not shown

Motif 2/8

No TOMTOM matches passing threshold

Motif 3/8

No TOMTOM matches passing threshold

Motif 4/8

No TOMTOM matches passing threshold

Motif 5/8

No TOMTOM matches passing threshold

Motif 6/8

Motif IDq-valPWM
MA1513.1_KLF150.24267199999999997
MA0516.2_SP20.24267199999999997
SP2_HUMAN.H11MO.0.A0.266301
SP2_HUMAN.H11MO.1.B0.266301
SP4_HUMAN.H11MO.1.A0.325081
KAISO_HUMAN.H11MO.0.A0.325081Not shown
TFDP1_HUMAN.H11MO.0.C0.325081Not shown
E2F3_HUMAN.H11MO.0.A0.325081Not shown
SP1_HUMAN.H11MO.0.A0.325081Not shown
KLF3_HUMAN.H11MO.0.B0.325081Not shown

Motif 7/8

Motif IDq-valPWM
ETV7_HUMAN.H11MO.0.D0.00348789
ETV5_HUMAN.H11MO.0.C0.00365179
ELF5_HUMAN.H11MO.0.A0.00460621
FLI1_HUMAN.H11MO.1.A0.030788799999999998
ERG_HUMAN.H11MO.0.A0.030788799999999998
ETV4_HUMAN.H11MO.0.B0.030788799999999998Not shown
ETS2_HUMAN.H11MO.0.B0.0394126Not shown
MA0687.1_SPIC0.0487145Not shown
VEZF1_HUMAN.H11MO.1.C0.0487145Not shown
FEV_HUMAN.H11MO.0.B0.0487145Not shown

Motif 8/8

Motif IDq-valPWM
HAND1_HUMAN.H11MO.1.D0.102043
ZNF41_HUMAN.H11MO.0.C0.48257