In [1]:
import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, pfm_to_pwm, trim_motif_by_ic
from motif.match_motifs import match_motifs_to_database
from util import figure_to_vdom_image
import plot.viz_sequence as viz_sequence
import numpy as np
import h5py
import matplotlib.pyplot as plt
import vdom.helpers as vdomh
from IPython.display import display

Define constants and paths

In [2]:
# Fetch run parameters from the environment.
# The four variables below are mandatory: a missing one raises KeyError.
tf_name, shap_scores_path, tfm_results_path, hyp_score_key = (
    os.environ[env_key]
    for env_key in ("TFM_TF_NAME", "TFM_SHAP_PATH", "TFM_TFM_PATH", "TFM_HYP_SCORE_KEY")
)
# The motif cache directory is optional; it is None when the variable is unset
tfm_motifs_cache_dir = os.environ.get("TFM_MOTIF_CACHE")

# Echo the resolved settings so they are recorded in the rendered notebook
print("TF name: %s" % tf_name)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved TF-MoDISco-derived motifs cache: %s" % tfm_motifs_cache_dir)
TF name: CEBPB
DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/multitask_profile/CEBPB_multitask_profile_fold1/CEBPB_multitask_profile_fold1_imp_scores.h5
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/multitask_profile/CEBPB_multitask_profile_fold1/CEBPB_multitask_profile_fold1_count_tfm.h5
Importance score key: count_hyp_scores
Saved TF-MoDISco-derived motifs cache: /users/amtseng/tfmodisco/results/reports/tfmodisco_results//cache/multitask_profile/CEBPB_multitask_profile_fold1/CEBPB_multitask_profile_fold1_count
In [3]:
# Constants: the input length, and the center size that is handed to the
# SHAP-score and TF-MoDISco loaders further down
input_length, shap_score_center_size = 2114, 400
In [4]:
# Make sure the motif cache directory exists before anything is written to it.
# Skipped when no cache directory was configured; exist_ok avoids an error on re-runs.
if tfm_motifs_cache_dir:
    os.makedirs(tfm_motifs_cache_dir, exist_ok=True)

Import SHAP scores and TF-MoDISco results

In [5]:
# Load the hypothetical scores, one-hot sequences, and SHAP coordinates.
# The sequences/scores are cut (via center_cut_size) to match what TF-MoDISco saw,
# whereas the coordinates are left uncut. The second returned value is not used here.
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(
    shap_scores_path,
    hyp_score_key,
    center_cut_size=shap_score_center_size,
)
Importing SHAP scores: 100%|██████████| 273/273 [06:40<00:00,  1.47s/it]
In [6]:
# Import the TF-MoDISco results object
# The loader is given the results path together with the scores, one-hot sequences,
# and center size defined in the previous cells
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

Plot some SHAP score tracks

Plot the central region of some randomly selected actual importance scores

In [7]:
# Plot the middle half of the score window for a handful of randomly chosen examples
num_tracks_to_plot = 5
plot_slice = slice(shap_score_center_size // 4, (3 * shap_score_center_size) // 4)

# Sampling without replacement raises ValueError if more items are requested than
# exist, so never ask for more tracks than there are examples
num_examples = hyp_scores.shape[0]
num_to_sample = min(num_tracks_to_plot, num_examples)
for index in np.random.choice(num_examples, size=num_to_sample, replace=False):
    # Hypothetical scores times the one-hot sequence gives the actual importance scores
    viz_sequence.plot_weights((hyp_scores[index] * one_hot_seqs[index])[plot_slice], subticks_frequency=100)

Plot TF-MoDISco results

Plot all motifs by metacluster

In [8]:
# For every metacluster and pattern: display the PFM/hCWM/CWM, trim them by
# information content, collect the results in per-metacluster lists, and
# (when a cache directory is configured) save figures and arrays to disk.
motif_pfms, motif_hcwms, motif_cwms = [], [], []  # Save the trimmed PFMs, hCWMs, and CWMs
motif_pfms_short = []  # PFMs that are even more trimmed (for TOMTOM)
num_seqlets = []  # Number of seqlets for each motif
motif_seqlets = []  # Save seqlets of each motif
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())

# The HDF5 cache is opened only when caching is enabled. The try/finally makes
# sure it is closed even if plotting or trimming fails part-way through.
motif_hdf5 = None
if tfm_motifs_cache_dir:
    motif_hdf5 = h5py.File(os.path.join(tfm_motifs_cache_dir, "all_motifs.h5"), "w")
try:
    for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
        metacluster = metaclusters[metacluster_key]
        display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
        patterns = metacluster.seqlets_to_patterns_result.patterns

        # Keep exactly one (possibly empty) entry per metacluster so that list
        # indices always line up with metacluster_i, which names the cached files
        for per_metacluster_list in (
            motif_pfms, motif_hcwms, motif_cwms, motif_pfms_short, num_seqlets, motif_seqlets
        ):
            per_metacluster_list.append([])

        if not patterns:
            # An empty metacluster must not stop later metaclusters from being processed
            display(vdomh.p("No motifs found"))
            continue

        num_patterns = len(patterns)
        for pattern_i, pattern in enumerate(patterns):
            seqlets = pattern.seqlets
            display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
            display(vdomh.p("%d seqlets" % len(seqlets)))

            pfm = pattern["sequence"].fwd
            hcwm = pattern["task0_hypothetical_contribs"].fwd
            cwm = pattern["task0_contrib_scores"].fwd

            pfm_fig = viz_sequence.plot_weights(pfm, subticks_frequency=10, return_fig=True)
            hcwm_fig = viz_sequence.plot_weights(hcwm, subticks_frequency=10, return_fig=True)
            cwm_fig = viz_sequence.plot_weights(cwm, subticks_frequency=10, return_fig=True)
            pfm_fig.tight_layout()
            hcwm_fig.tight_layout()
            cwm_fig.tight_layout()

            motif_table = vdomh.table(
                vdomh.tr(
                    vdomh.td("Sequence (PFM)"),
                    vdomh.td(figure_to_vdom_image(pfm_fig))
                ),
                vdomh.tr(
                    vdomh.td("Hypothetical contributions (hCWM)"),
                    vdomh.td(figure_to_vdom_image(hcwm_fig))
                ),
                vdomh.tr(
                    vdomh.td("Actual contributions (CWM)"),
                    vdomh.td(figure_to_vdom_image(cwm_fig))
                )
            )
            display(motif_table)

            # Trim motif based on information content
            short_trimmed_pfm = trim_motif_by_ic(pfm, pfm)
            motif_pfms_short[-1].append(short_trimmed_pfm)

            # Expand trimming to +/- 4bp on either side; the PFM is always the
            # first argument, so all three tracks are trimmed based on the PFM
            trimmed_pfm = trim_motif_by_ic(pfm, pfm, pad=4)
            trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
            trimmed_cwm = trim_motif_by_ic(pfm, cwm, pad=4)

            motif_pfms[-1].append(trimmed_pfm)
            motif_hcwms[-1].append(trimmed_hcwm)
            motif_cwms[-1].append(trimmed_cwm)

            num_seqlets[-1].append(len(seqlets))
            motif_seqlets[-1].append(seqlets)

            if motif_hdf5 is not None:
                # Save results and figures
                motif_id = "%d_%d" % (metacluster_i, pattern_i)
                pfm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_pfm_full.png"))
                hcwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_hcwm_full.png"))
                cwm_fig.savefig(os.path.join(tfm_motifs_cache_dir, motif_id + "_cwm_full.png"))
                motif_dset = motif_hdf5.create_group(motif_id)
                motif_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
                motif_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
                motif_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
                motif_dset.create_dataset("pfm_trimmed", data=trimmed_pfm, compression="gzip")
                motif_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
                motif_dset.create_dataset("cwm_trimmed", data=trimmed_cwm, compression="gzip")
                motif_dset.create_dataset("pfm_short_trimmed", data=short_trimmed_pfm, compression="gzip")

            # Release the figures only once they have been displayed and saved
            plt.close("all")
finally:
    if motif_hdf5 is not None:
        motif_hdf5.close()

Metacluster 1/2

Pattern 1/9

11798 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/9

866 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/9

715 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/9

323 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/9

253 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/9

133 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 7/9

128 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 8/9

83 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 9/9

83 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Metacluster 2/2

Pattern 1/6

329 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 2/6

313 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 3/6

297 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 4/6

254 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 5/6

238 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Pattern 6/6

42 seqlets

Sequence (PFM)
Hypothetical contributions (hCWM)
Actual contributions (CWM)

Summary of motifs

Motifs are trimmed based on information content, and presented in descending order by number of supporting seqlets. The motifs are separated by metacluster. The motifs are presented as hCWMs. The forward orientation is defined as the orientation that is richer in purines.

In [9]:
# Column layout shared by every summary table: two narrow columns for the
# index and seqlet count, two wide columns for the motif logos
colgroup = vdomh.colgroup(
    *[vdomh.col(style={"width": width}) for width in ("5%", "5%", "45%", "45%")]
)
header = vdomh.thead(
    vdomh.tr(
        *[
            vdomh.th(title, style={"text-align": "center"})
            for title in ("#", "Seqlets", "Forward", "Reverse")
        ]
    )
)

# One table per metacluster, one row per trimmed hCWM
for metacluster_index, metacluster_hcwms in enumerate(motif_hcwms):
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_index + 1, num_metaclusters)))
    rows = []
    for motif_index, hcwm in enumerate(metacluster_hcwms):
        # Flipping both axes gives the other orientation of the motif
        flipped = np.flip(hcwm, axis=(0, 1))
        # Columns 0 and 2 are taken as the purines; the orientation in which they
        # carry more than half of the total becomes "forward"
        purine_rich = np.sum(hcwm[:, [0, 2]]) > 0.5 * np.sum(hcwm)
        fwd, rev = (hcwm, flipped) if purine_rich else (flipped, hcwm)

        fwd_fig = viz_sequence.plot_weights(fwd, figsize=(20, 4), return_fig=True)
        fwd_fig.tight_layout()
        rev_fig = viz_sequence.plot_weights(rev, figsize=(20, 4), return_fig=True)
        rev_fig.tight_layout()

        if tfm_motifs_cache_dir:
            # Save both orientations next to the other cached motif files
            file_prefix = os.path.join(tfm_motifs_cache_dir, "%d_%d" % (metacluster_index, motif_index))
            fwd_fig.savefig(file_prefix + "_hcwm_trimmed_fwd.png")
            rev_fig.savefig(file_prefix + "_hcwm_trimmed_rev.png")

        row_cells = (
            vdomh.td(str(motif_index + 1)),
            vdomh.td(str(num_seqlets[metacluster_index][motif_index])),
            vdomh.td(figure_to_vdom_image(fwd_fig)),
            vdomh.td(figure_to_vdom_image(rev_fig)),
        )
        rows.append(vdomh.tr(*row_cells))
    display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
    plt.close("all")  # Drop this metacluster's figures before moving on

Metacluster 1/2

#SeqletsForwardReverse
111798
2866
3715
4323
5253
6133
7128
883
983

Metacluster 2/2

#SeqletsForwardReverse
1329
2313
3297
4254
5238
642

Top TOMTOM matches for each motif

Here, the TF-MoDISco motifs are plotted as hCWMs, but the TOMTOM matches are shown as PWMs.

In [10]:
# For each motif, query TOMTOM with the short-trimmed PFM and list the top matches
num_matches_to_keep = 10  # Matches requested per motif (passed as top_k)
num_matches_to_show = 5  # Matches whose PWM is rendered; the rest are listed as "Not shown"

header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Motif ID", style={"text-align": "center"}),
        vdomh.th("q-val", style={"text-align": "center"}),
        vdomh.th("PWM", style={"text-align": "center"})
    )
)

for i in range(len(motif_pfms)):
    display(vdomh.h3("Metacluster %d/%d" % (i + 1, num_metaclusters)))

    if not motif_pfms_short[i]:
        # Nothing to match for a metacluster without motifs
        display(vdomh.p("No motifs in this metacluster"))
        continue

    # Compute TOMTOM matches for all motifs in the metacluster at once
    out_dir = os.path.join(tfm_motifs_cache_dir, "tomtom", "metacluster_%d" % i) if tfm_motifs_cache_dir else None
    tomtom_matches = match_motifs_to_database(motif_pfms_short[i], top_k=num_matches_to_keep, temp_dir=out_dir)
    if out_dir:
        # Hit figures are saved into out_dir below. Whether match_motifs_to_database
        # creates the directory is not visible here, so make sure it exists.
        os.makedirs(out_dir, exist_ok=True)

    num_motifs = len(motif_pfms[i])
    for j in range(num_motifs):
        display(vdomh.h4("Motif %d/%d" % (j + 1, num_motifs)))
        viz_sequence.plot_weights(motif_hcwms[i][j])

        body = []
        for k, (match_name, match_pfm, match_qval) in enumerate(tomtom_matches[j]):
            if k < num_matches_to_show:
                # Only render a PWM for matches that are actually displayed;
                # figures for the remaining matches would just be thrown away
                fig = viz_sequence.plot_weights(pfm_to_pwm(match_pfm), return_fig=True)
                fig.tight_layout()
                pwm_cell = vdomh.td(figure_to_vdom_image(fig))
                if out_dir:
                    # Save results and figures
                    motif_id = "%d_%d" % (i, j)
                    fig.savefig(os.path.join(out_dir, motif_id + ("_hit-%d.png" % (k + 1))))
            else:
                pwm_cell = vdomh.td("Not shown")
            body.append(
                vdomh.tr(
                    vdomh.td(match_name),
                    vdomh.td(str(match_qval)),
                    pwm_cell
                )
            )
        if not body:
            display(vdomh.p("No TOMTOM matches passing threshold"))
        else:
            display(vdomh.table(header, vdomh.tbody(*body)))
        plt.close("all")

Metacluster 1/2

Motif 1/9

Motif IDq-valPWM
CEBPB_HUMAN.H11MO.0.A3.00973e-10
CEBPD_HUMAN.H11MO.0.C1.53045e-08
CEBPA_HUMAN.H11MO.0.A1.81421e-06
MA0836.2_CEBPD2.63503e-05
MA0102.4_CEBPA0.000141305
MA0837.1_CEBPE0.00026479Not shown
MA0466.2_CEBPB0.00039843800000000004Not shown
MA0838.1_CEBPG0.000669142Not shown
MA0025.2_NFIL30.00111457Not shown
DBP_HUMAN.H11MO.0.B0.00225009Not shown

Motif 2/9

Motif IDq-valPWM
FOSL1_HUMAN.H11MO.0.A7.402769999999999e-07
FOSB_HUMAN.H11MO.0.A2.14694e-06
JUN_HUMAN.H11MO.0.A2.14694e-06
JUND_HUMAN.H11MO.0.A3.5576400000000003e-06
FOSL2_HUMAN.H11MO.0.A3.7948199999999998e-06
FOS_HUMAN.H11MO.0.A4.39957e-06Not shown
MA0099.3_FOS::JUN7.542130000000001e-06Not shown
MA1128.1_FOSL1::JUN1.3177000000000001e-05Not shown
MA1141.1_FOS::JUND1.3177000000000001e-05Not shown
MA1138.1_FOSL2::JUNB1.3177000000000001e-05Not shown

Motif 3/9

Motif IDq-valPWM
MA0139.1_CTCF1.7162099999999998e-16
CTCF_HUMAN.H11MO.0.A1.6955e-13
CTCFL_HUMAN.H11MO.0.A3.63686e-07
MA1102.2_CTCFL0.00011842100000000001
MA1568.1_TCF21(var.2)0.111633
MA1638.1_HAND20.12629400000000002Not shown
SNAI1_HUMAN.H11MO.0.C0.259473Not shown
ZIC3_HUMAN.H11MO.0.B0.259473Not shown
ZIC2_HUMAN.H11MO.0.D0.431487Not shown
MA0155.1_INSM10.43628Not shown

Motif 4/9

Motif IDq-valPWM
FOXA1_HUMAN.H11MO.0.A1.2091200000000001e-06
FOXM1_HUMAN.H11MO.0.A1.2091200000000001e-06
FOXA2_HUMAN.H11MO.0.A1.8338499999999998e-06
FOXF2_HUMAN.H11MO.0.D1.8338499999999998e-06
FOXA3_HUMAN.H11MO.0.B6.811619999999999e-06
FOXD3_HUMAN.H11MO.0.D1.8368299999999998e-05Not shown
MA0846.1_FOXC22.2280599999999998e-05Not shown
MA0847.2_FOXD26.52357e-05Not shown
FOXC1_HUMAN.H11MO.0.C8.69809e-05Not shown
FOXD1_HUMAN.H11MO.0.D0.000598831Not shown

Motif 5/9

Motif IDq-valPWM
TAL1_HUMAN.H11MO.0.A8.911180000000001e-05
GATA2_HUMAN.H11MO.0.A0.000210807
GATA1_HUMAN.H11MO.1.A0.000210807
GATA2_HUMAN.H11MO.1.A0.000670985
MA0482.2_GATA40.000670985
GATA1_HUMAN.H11MO.0.A0.000692382Not shown
MA0036.3_GATA20.00165568Not shown
MA0037.3_GATA30.00165568Not shown
GATA4_HUMAN.H11MO.0.A0.00670578Not shown
MA0766.2_GATA50.00841139Not shown

Motif 6/9

Motif IDq-valPWM
HNF4A_HUMAN.H11MO.0.A6.922160000000001e-08
HNF4G_HUMAN.H11MO.0.B9.35577e-08
MA0677.1_Nr2f60.000650251
MA0856.1_RXRG0.000650251
MA1574.1_THRB0.000650251
MA0512.2_Rxra0.000650251Not shown
MA1550.1_PPARD0.000650251Not shown
MA1537.1_NR2F1(var.2)0.0006655610000000001Not shown
MA1148.1_PPARA::RXRA0.000745392Not shown
MA0855.1_RXRB0.0007547610000000001Not shown

Motif 7/9

Motif IDq-valPWM
TBX15_HUMAN.H11MO.0.D0.000309132
SP2_HUMAN.H11MO.0.A0.000309132
KLF16_HUMAN.H11MO.0.D0.000309132
SP3_HUMAN.H11MO.0.B0.000309132
MAZ_HUMAN.H11MO.0.A0.00031299400000000003
SP1_HUMAN.H11MO.0.A0.00031299400000000003Not shown
KLF15_HUMAN.H11MO.0.A0.00031299400000000003Not shown
WT1_HUMAN.H11MO.0.C0.00031299400000000003Not shown
PATZ1_HUMAN.H11MO.0.C0.00047973Not shown
ZN467_HUMAN.H11MO.0.C0.0006767030000000001Not shown

Motif 8/9

Motif IDq-valPWM
NFIL3_HUMAN.H11MO.0.D0.00844515
DDIT3_HUMAN.H11MO.0.D0.010732799999999999
DBP_HUMAN.H11MO.0.B0.010732799999999999
MA0025.2_NFIL30.0195863
CEBPD_HUMAN.H11MO.0.C0.0243327
CEBPA_HUMAN.H11MO.0.A0.0243327Not shown
CEBPB_HUMAN.H11MO.0.A0.0243327Not shown
CEBPG_HUMAN.H11MO.0.B0.0243327Not shown
MA0043.3_HLF0.0246578Not shown
MA1636.1_CEBPG(var.2)0.0253622Not shown

Motif 9/9

Motif IDq-valPWM
FOXD2_HUMAN.H11MO.0.D0.0009089110000000001
FOXB1_HUMAN.H11MO.0.D0.00448033
MA0845.1_FOXB10.0657728
MA0032.2_FOXC10.0657728
MA0148.4_FOXA10.195385
MA0846.1_FOXC20.195385Not shown
PO4F3_HUMAN.H11MO.0.D0.376874Not shown
MA0481.3_FOXP10.376874Not shown
MA1683.1_FOXA30.376874Not shown
MA0047.3_FOXA20.376874Not shown

Metacluster 2/2

Motif 1/6

No TOMTOM matches passing threshold

Motif 2/6

Motif IDq-valPWM
CEBPB_HUMAN.H11MO.0.A2.62548e-05
CEBPD_HUMAN.H11MO.0.C8.070649999999999e-05
MA1636.1_CEBPG(var.2)8.18124e-05
ATF4_HUMAN.H11MO.0.A0.000111225
MA0833.2_ATF40.000111225
DDIT3_HUMAN.H11MO.0.D0.000116099Not shown
CEBPG_HUMAN.H11MO.0.B0.000116099Not shown
MA0025.2_NFIL30.000135449Not shown
NFIL3_HUMAN.H11MO.0.D0.000267578Not shown
CEBPA_HUMAN.H11MO.0.A0.000288733Not shown

Motif 3/6

No TOMTOM matches passing threshold

Motif 4/6

No TOMTOM matches passing threshold

Motif 5/6

Motif IDq-valPWM
P53_HUMAN.H11MO.1.A0.264488
P53_HUMAN.H11MO.0.A0.264488
P73_HUMAN.H11MO.0.A0.293565
P63_HUMAN.H11MO.0.A0.293565
MA0525.2_TP630.293565
P73_HUMAN.H11MO.1.A0.341597Not shown
MA0861.1_TP730.341597Not shown
MA0106.3_TP530.397421Not shown
MA1573.1_THAP110.42516400000000004Not shown
REL_HUMAN.H11MO.0.B0.471607Not shown

Motif 6/6

Motif IDq-valPWM
HXC10_HUMAN.H11MO.0.D0.0381983
LMX1A_HUMAN.H11MO.0.D0.08867380000000001
ARI3A_HUMAN.H11MO.0.D0.08867380000000001
PO3F3_HUMAN.H11MO.0.D0.08867380000000001
MA0679.2_ONECUT10.08867380000000001
SHOX_HUMAN.H11MO.0.D0.139416Not shown
MA0606.1_NFAT50.140647Not shown
DMBX1_HUMAN.H11MO.0.D0.15182400000000001Not shown
MNX1_HUMAN.H11MO.0.D0.19914Not shown
LHX9_HUMAN.H11MO.0.D0.19914Not shown