Link to results¶

DiChIPMunk: on peaks and on multi-task seqlets and on single-task seqlets

HOMER: on peaks and on multi-task seqlets and on single-task seqlets

MEME: on peaks and on multitask seqlets and on single-task seqlets

import sys
import os
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
import numpy as np
import h5py
import matplotlib.pyplot as plt
import motif.read_motifs as read_motifs
import motif.match_motifs as match_motifs
from util import figure_to_vdom_image
import plot.viz_sequence as viz_sequence
import vdom.helpers as vdomh
from IPython.display import display

# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
multitask_fold = int(os.environ["TFM_MULTITASK_FOLD"])
head = os.environ["TFM_HEAD"]
assert head in ("profile", "count")
if "TFM_TASK_INDEX" in os.environ:
    task_index = int(os.environ["TFM_TASK_INDEX"])
    singletask_fold = int(os.environ["TFM_SINGLETASK_FOLD"])
else:
    task_index = None
    singletask_fold = None
motif_file = os.environ["TFM_MOTIF_FILE"]  # TF-MoDISco motifs
    
print("TF name: %s" % tf_name)
print("Multi-task fold: %s" % multitask_fold)
print("Head: %s" % head)
print("Task index: %s" % task_index)
print("Single-task fold: %s" % singletask_fold)
print("TF-MoDISco motif file: %s" % motif_file)

TF name: JUND
Multi-task fold: 7
Head: profile
Task index: 3
Single-task fold: 7
TF-MoDISco motif file: /users/amtseng/tfmodisco/results/reports/tfmodisco_results/cache/singletask_profile_finetune/JUND_singletask_profile_finetune_fold7/task_3/JUND_singletask_profile_finetune_task3_fold7_profile/all_motifs.h5

# Define paths and constants
base_path = "/users/amtseng/tfmodisco/results/classic_motifs/"

multitask_seqlets_dir = os.path.join(
    base_path, "seqlets", "multitask_profile_finetune",
    "%s_multitask_profile_finetune_fold%s" % (tf_name, multitask_fold)
)

if task_index is None:
    peaks_path = os.path.join(base_path, "peaks", tf_name, "%s_peaks_taskall" % tf_name)
    multitask_seqlets_path = os.path.join(
        multitask_seqlets_dir,
        "%s_seqlets_%s_taskall" % (tf_name, head)
    )
else:
    peaks_path = os.path.join(base_path, "peaks", tf_name, "%s_peaks_task%d" % (tf_name, task_index))
    multitask_seqlets_path = os.path.join(
        multitask_seqlets_dir,
        "%s_seqlets_%s_task%d" % (tf_name, head, task_index)
    )
    
    singletask_seqlets_dir = os.path.join(
        base_path, "seqlets", "singletask_profile_finetune",
        "%s_singletask_profile_finetune_fold%s" % (tf_name, singletask_fold),
        "task_%d" % task_index
    )
    singletask_seqlets_path = os.path.join(
        singletask_seqlets_dir,
        "%s_seqlets_%s_task%d" % (tf_name, head, task_index)
    )

Helper functions¶

def import_tfmodisco_motifs(motif_file):
    """
    Imports a set of motifs from the saved HDF5.
    Returns a list of motifs as L x 4 arrays, and a parallel list of
    motif names.
    """
    motifs, motif_names = [], []
    with h5py.File(motif_file, "r") as f:
        for key in f.keys():
            motif_names.append(key)
            motifs.append(f[key]["pfm_trimmed"][:])
    return motifs, motif_names

def pfm_to_forward_pwm(pfm):
    """
    Converts PFM to PWM and flips to purine-rich orientation.
    """
    pwm = read_motifs.pfm_to_pwm(pfm)
    if np.sum(pwm[:, [0, 2]]) < 0.5 * np.sum(pwm):
        # Flip to purine-rich version
        pwm = np.flip(pwm, axis=(0, 1))
    return pwm

def show_matched_motifs_table(
    tfm_motifs, tfm_names, bench_type, bench_motifs, bench_scores, matches
):
    """
    Shows a table of motifs from the given matched results.
    `bench_type` is either `dichipmunk`, `homer`, or `meme`.
    `matches` is NumPy array parallel to `bench_motifs`, where
    each entry is the index of the match in `tfm_motifs`, and -1
    if no match was found.
    """
    assert len(matches) == len(bench_motifs)
    assert bench_type in ("dichipmunk", "homer", "meme")
    if bench_type == "dichipmunk":
        score_name = "Supporting sequences"
        bench_name = "DiChIPMunk"
    elif bench_type == "homer":
        score_name = "Log enrichment"
        bench_name = "HOMER"
    else:
        score_name = "E-value"
        bench_name = "MEME"
        
    colgroup = vdomh.colgroup(
        vdomh.col(style={"width": "5%"}),
        vdomh.col(style={"width": "45%"}),
        vdomh.col(style={"width": "5%"}),
        vdomh.col(style={"width": "45%"})
    )
    header = vdomh.thead(
        vdomh.tr(
            vdomh.th("TF-MoDISco motif ID", style={"text-align": "center"}),
            vdomh.th("TF-MoDISco PWM", style={"text-align": "center"}),
            vdomh.th(score_name, style={"text-align": "center"}),
            vdomh.th("%s PWM" % bench_name, style={"text-align": "center"})
        )
    )
    
    # First, show a table of matches
    display(vdomh.p("Matched motifs"))

    body = []
    for bench_index, tfm_index in enumerate(matches):
        if tfm_index == -1:
            continue
        tfm_fig = viz_sequence.plot_weights(
            pfm_to_forward_pwm(tfm_motifs[tfm_index]), figsize=(20, 4), return_fig=True
        )
        tfm_fig.tight_layout()
        bench_fig = viz_sequence.plot_weights(
            pfm_to_forward_pwm(bench_motifs[bench_index]), figsize=(20, 4), return_fig=True
        )
        bench_fig.tight_layout()
        
        body.append(
            vdomh.tr(
                vdomh.td(str(tfm_names[tfm_index])),
                vdomh.td(figure_to_vdom_image(tfm_fig)),
                vdomh.td(str(bench_scores[bench_index])),
                vdomh.td(figure_to_vdom_image(bench_fig))
            )
        )

    display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
    plt.close("all")
    
    # Now, show a table of leftovers
    display(vdomh.p("Motifs without matches"))
    
    tfm_lone_indices = np.setdiff1d(np.arange(len(tfm_motifs)), matches)
    bench_lone_indices = np.where(matches == -1)[0]

    rows = []
    for i in range(max(len(tfm_lone_indices), len(bench_lone_indices))):
        row = []
        if i < len(tfm_lone_indices):
            tfm_fig = viz_sequence.plot_weights(
                pfm_to_forward_pwm(tfm_motifs[tfm_lone_indices[i]]), figsize=(20, 4), return_fig=True
            )
            tfm_fig.tight_layout()
            row.extend([
                vdomh.td(str(tfm_names[tfm_lone_indices[i]])),
                vdomh.td(figure_to_vdom_image(tfm_fig))
            ])
        else:
            row.extend([vdomh.td(), vdomh.td()])
        if i < len(bench_lone_indices):
            bench_fig = viz_sequence.plot_weights(
                pfm_to_forward_pwm(bench_motifs[bench_lone_indices[i]]), figsize=(20, 4), return_fig=True
            )
            bench_fig.tight_layout()
            row.extend([
                vdomh.td(str(bench_scores[bench_lone_indices[i]])),
                vdomh.td(figure_to_vdom_image(bench_fig))
            ])
        else:
            row.extend([vdomh.td(), vdomh.td()])
        rows.append(row)
    body = [vdomh.tr(*row) for row in rows]

    display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
    plt.close("all")

Import motifs¶

tfm_pfms, tfm_names = import_tfmodisco_motifs(motif_file)

dichipmunk_peak_pfms, dichipmunk_peak_num_seqs = read_motifs.import_dichipmunk_pfms(
    os.path.join(peaks_path, "dichipmunk")
)
dichipmunk_multitask_seqlet_pfms, dichipmunk_multitask_seqlet_num_seqs = read_motifs.import_dichipmunk_pfms(
    os.path.join(multitask_seqlets_path, "dichipmunk")
)

homer_peak_pfms, homer_peak_enrichments = read_motifs.import_homer_pfms(
    os.path.join(peaks_path, "homer")
)
homer_multitask_seqlet_pfms, homer_multitask_seqlet_enrichments = read_motifs.import_homer_pfms(
    os.path.join(multitask_seqlets_path, "homer")
)

meme_peak_pfms, meme_peak_evalues = read_motifs.import_meme_pfms(
    os.path.join(peaks_path, "meme")
)
meme_multitask_seqlet_pfms, meme_multitask_seqlet_evalues = read_motifs.import_meme_pfms(
    os.path.join(multitask_seqlets_path, "meme")
)

if task_index is not None:
    dichipmunk_singletask_seqlet_pfms, dichipmunk_singletask_seqlet_num_seqs = read_motifs.import_dichipmunk_pfms(
        os.path.join(singletask_seqlets_path, "dichipmunk")
    )
    homer_singletask_seqlet_pfms, homer_singletask_seqlet_enrichments = read_motifs.import_homer_pfms(
        os.path.join(singletask_seqlets_path, "homer")
    )
    meme_singletask_seqlet_pfms, meme_singletask_seqlet_evalues = read_motifs.import_meme_pfms(
        os.path.join(singletask_seqlets_path, "meme")
    )

Match benchmarked motifs to TF-MoDISco motifs¶

DiChIPMunk on peaks

dichipmunk_peak_matches = match_motifs.match_motifs_to_targets(dichipmunk_peak_pfms, tfm_pfms)
show_matched_motifs_table(
    tfm_pfms, tfm_names, "dichipmunk", dichipmunk_peak_pfms, dichipmunk_peak_num_seqs,
    dichipmunk_peak_matches
)

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)

DiChIPMunk on multi-task seqlets

dichipmunk_multitask_seqlet_matches = match_motifs.match_motifs_to_targets(dichipmunk_multitask_seqlet_pfms, tfm_pfms)
show_matched_motifs_table(
    tfm_pfms, tfm_names, "dichipmunk", dichipmunk_multitask_seqlet_pfms, dichipmunk_multitask_seqlet_num_seqs,
    dichipmunk_multitask_seqlet_matches
)

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)

DiChIPMunk on single-task seqlets

if task_index is not None:
    dichipmunk_singletask_seqlet_matches = match_motifs.match_motifs_to_targets(dichipmunk_singletask_seqlet_pfms, tfm_pfms)
    show_matched_motifs_table(
        tfm_pfms, tfm_names, "dichipmunk", dichipmunk_singletask_seqlet_pfms, dichipmunk_singletask_seqlet_num_seqs,
        dichipmunk_singletask_seqlet_matches
    )

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)

HOMER on peaks

homer_peak_matches = match_motifs.match_motifs_to_targets(homer_peak_pfms, tfm_pfms)
show_matched_motifs_table(
    tfm_pfms, tfm_names, "homer", homer_peak_pfms, homer_peak_enrichments,
    homer_peak_matches
)

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)

HOMER on multi-task seqlets

homer_multitask_seqlet_matches = match_motifs.match_motifs_to_targets(homer_multitask_seqlet_pfms, tfm_pfms)
show_matched_motifs_table(
    tfm_pfms, tfm_names, "homer", homer_multitask_seqlet_pfms, homer_multitask_seqlet_enrichments,
    homer_multitask_seqlet_matches
)

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)

HOMER on single-task seqlets

if task_index is not None:
    homer_singletask_seqlet_matches = match_motifs.match_motifs_to_targets(homer_singletask_seqlet_pfms, tfm_pfms)
    show_matched_motifs_table(
        tfm_pfms, tfm_names, "homer", homer_singletask_seqlet_pfms, homer_singletask_seqlet_enrichments,
        homer_singletask_seqlet_matches
    )

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)

MEME on peaks

meme_peak_matches = match_motifs.match_motifs_to_targets(meme_peak_pfms, tfm_pfms)
show_matched_motifs_table(
    tfm_pfms, tfm_names, "meme", meme_peak_pfms, meme_peak_evalues,
    meme_peak_matches
)

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)

MEME on multi-task seqlets

meme_multitask_seqlet_matches = match_motifs.match_motifs_to_targets(meme_multitask_seqlet_pfms, tfm_pfms)
show_matched_motifs_table(
    tfm_pfms, tfm_names, "meme", meme_multitask_seqlet_pfms, meme_multitask_seqlet_evalues,
    meme_multitask_seqlet_matches
)

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)

MEME on single-task seqlets

if task_index is not None:
    meme_singletask_seqlet_matches = match_motifs.match_motifs_to_targets(meme_singletask_seqlet_pfms, tfm_pfms)
    show_matched_motifs_table(
        tfm_pfms, tfm_names, "meme", meme_singletask_seqlet_pfms, meme_singletask_seqlet_evalues,
        meme_singletask_seqlet_matches
    )

/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = plt.figure(figsize=figsize)

TF-MoDISco motif ID	TF-MoDISco PWM	Supporting sequences	DiChIPMunk PWM
0_0		813
0_0		149
0_5		12

TF-MoDISco motif ID	TF-MoDISco PWM	Supporting sequences	DiChIPMunk PWM
0_1		17
0_10		1
0_11
0_12
0_13
0_2
0_3
0_4
0_6
0_7
0_8
0_9
1_0
1_1
1_10
1_11
1_2
1_3
1_4
1_5
1_6
1_7
1_8
1_9

TF-MoDISco motif ID	TF-MoDISco PWM	Supporting sequences	DiChIPMunk PWM
0_0		15249
0_4		1973
0_3		876

TF-MoDISco motif ID	TF-MoDISco PWM	Supporting sequences	DiChIPMunk PWM
0_1
0_10
0_11
0_12
0_13
0_2
0_5
0_6
0_7
0_8
0_9
1_0
1_1
1_10
1_11
1_2
1_3
1_4
1_5
1_6
1_7
1_8
1_9

TF-MoDISco motif ID	TF-MoDISco PWM	Supporting sequences	DiChIPMunk PWM
0_0		16525
0_3		890
0_4		413

TF-MoDISco motif ID	TF-MoDISco PWM	Log enrichment	HOMER PWM
0_0		-2752.010864
0_3		-185.40079
1_3		-120.754546
1_4		-113.559571
1_8		-84.23512
1_10		-69.51246
1_4		-64.864242
0_8		-64.661651
0_0		-54.165681
1_0		-34.154715
1_10		-32.239558
0_7		-26.110734

TF-MoDISco motif ID	TF-MoDISco PWM	Log enrichment	HOMER PWM
0_1		-114.912889
0_10		-98.155884
0_11		-84.23512
0_12		-66.294454
0_13		-66.294454
0_2		-60.844548
0_4		-49.687157
0_5		-29.084489
0_6
0_9
1_1
1_11
1_2
1_5
1_6
1_7
1_9

TF-MoDISco motif ID	TF-MoDISco PWM	Log enrichment	HOMER PWM
1_1		-40207.748201
0_0		-1595.04051
0_4		-1019.701547
1_4		-630.3491
0_6		-329.971283
1_3		-186.066875
1_0		-160.031236
1_7		-131.661023
0_3		-117.392488
0_9		-94.262702

TF-MoDISco motif ID	TF-MoDISco PWM	Log enrichment	HOMER PWM
0_1		-1005.213916
0_10		-293.215544
0_11		-156.169914
0_12		-116.642653
0_13		-89.744826
0_2		-76.433149
0_5		-55.168214
0_7
0_8
1_10
1_11
1_2
1_5
1_6
1_8
1_9

TF-MoDISco motif ID	TF-MoDISco PWM	Log enrichment	HOMER PWM
1_1		-36274.194071
0_3		-1334.826157
0_1		-494.686175
0_11		-480.662227
0_4		-403.301165
1_4		-210.010832
1_1		-186.444471
0_4		-184.47794
1_1		-89.784146
1_7		-46.350615
1_0		-28.243803

TF-MoDISco motif ID	TF-MoDISco PWM	Log enrichment	HOMER PWM
0_0		-902.532045
0_10		-269.952124
0_12		-98.860839
0_13		-94.303612
0_2		-72.727783
0_5		-47.06605
0_6
0_7
0_8
0_9
1_10
1_11
1_2
1_3
1_5
1_6
1_8
1_9

TF-MoDISco motif ID	TF-MoDISco PWM	E-value	MEME PWM
0_0		5.6e-677
0_0		4.9e-150
1_4		2.2e-039
0_12		2.4e-016
0_3		3.6e-008
1_0		7.7e-008
0_1		3.3e-003
1_3		3.4e-003

TF-MoDISco motif ID	TF-MoDISco PWM	E-value	MEME PWM
0_10		3.9e-017
0_11		5.8e+000
0_13
0_2
0_4
0_5
0_6
0_7
0_8
0_9
1_1
1_10
1_11
1_2
1_5
1_6
1_7
1_8
1_9

TF-MoDISco motif ID	TF-MoDISco PWM	E-value	MEME PWM
0_0		1.7e-1189
0_7		7.1e-024
1_4		1.3e-003
0_6		7.5e+002
1_5		7.2e+003
0_1		2.9e+004
0_5		4.7e+005
0_5		1.0e+006

TF-MoDISco motif ID	TF-MoDISco PWM	E-value	MEME PWM
0_10		1.3e+001
0_11		1.5e+004
0_12
0_13
0_2
0_3
0_4
0_8
0_9
1_0
1_1
1_10
1_11
1_2
1_3
1_6
1_7
1_8
1_9

TF-MoDISco motif ID	TF-MoDISco PWM	E-value	MEME PWM
1_1		1.0e-1022
0_1		5.1e-010
0_3		8.6e-001
0_5		2.5e+000
0_1		1.6e+000
0_7		5.3e+004

TF-MoDISco motif ID	TF-MoDISco PWM	E-value	MEME PWM
0_0		5.9e-016
0_10		1.7e+001
0_11		4.0e+002
0_12		7.8e+003
0_13
0_2
0_4
0_6
0_8
0_9
1_0
1_10
1_11
1_2
1_3
1_4
1_5
1_6
1_7
1_8
1_9