DiChIPMunk: on peaks and on multi-task seqlets and on single-task seqlets
HOMER: on peaks and on multi-task seqlets and on single-task seqlets
MEME: on peaks and on multitask seqlets and on single-task seqlets
import sys
import os
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from util import figure_to_vdom_image
import motif.read_motifs as read_motifs
from motif.read_motifs import pfm_to_pwm
import plot.viz_sequence as viz_sequence
import numpy as np
import matplotlib.pyplot as plt
import vdom.helpers as vdomh
from IPython.display import display
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
multitask_fold = int(os.environ["TFM_MULTITASK_FOLD"])
if "TFM_TASK_INDEX" in os.environ:
task_index = int(os.environ["TFM_TASK_INDEX"])
singletask_fold = int(os.environ["TFM_SINGLETASK_FOLD"])
else:
task_index = None
singletask_fold = None
print("TF name: %s" % tf_name)
print("Multi-task fold: %s" % multitask_fold)
print("Task index: %s" % task_index)
print("Single-task fold: %s" % singletask_fold)
TF name: MAX Multi-task fold: 1 Task index: 1 Single-task fold: 1
# Define paths and constants
base_path = "/users/amtseng/tfmodisco/results/classic_motifs/"
multitask_seqlets_dir = os.path.join(
base_path, "seqlets", "multitask_profile_finetune",
"%s_multitask_profile_finetune_fold%s" % (tf_name, multitask_fold)
)
if task_index is None:
peaks_path = os.path.join(base_path, "peaks", tf_name, "%s_peaks_taskall" % tf_name)
multitask_profile_seqlets_path = os.path.join(
multitask_seqlets_dir,
"%s_seqlets_profile_taskall" % tf_name
)
multitask_count_seqlets_path = os.path.join(
multitask_seqlets_dir,
"%s_seqlets_count_taskall" % tf_name
)
else:
peaks_path = os.path.join(base_path, "peaks", tf_name, "%s_peaks_task%d" % (tf_name, task_index))
multitask_profile_seqlets_path = os.path.join(
multitask_seqlets_dir,
"%s_seqlets_profile_task%d" % (tf_name, task_index)
)
multitask_count_seqlets_path = os.path.join(
multitask_seqlets_dir,
"%s_seqlets_count_task%d" % (tf_name, task_index)
)
singletask_seqlets_dir = os.path.join(
base_path, "seqlets", "singletask_profile_finetune",
"%s_singletask_profile_finetune_fold%s" % (tf_name, singletask_fold),
"task_%d" % task_index
)
singletask_profile_seqlets_path = os.path.join(
singletask_seqlets_dir,
"%s_seqlets_profile_task%d" % (tf_name, task_index)
)
singletask_count_seqlets_path = os.path.join(
singletask_seqlets_dir,
"%s_seqlets_count_task%d" % (tf_name, task_index)
)
def show_peaks_motif_table(results_path, mode):
"""
Shows a table of motifs from the given results path.
`mode` is either `dichipmunk`, `homer`, `meme`, or `memechip`.
"""
assert mode in ("dichipmunk", "homer", "meme", "memechip")
if mode == "dichipmunk":
score_name = "Supporting sequences"
pfms, score_vals = read_motifs.import_dichipmunk_pfms(results_path)
elif mode == "homer":
score_name = "Log enrichment"
pfms, score_vals = read_motifs.import_homer_pfms(results_path)
elif mode == "meme":
score_name = "E-value"
pfms, score_vals = read_motifs.import_meme_pfms(results_path)
else:
score_name = "E-value"
pfms, score_vals = read_motifs.import_meme_pfms(
os.path.join(results_path, "meme_out")
)
colgroup = vdomh.colgroup(
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "40%"})
)
header = vdomh.thead(
vdomh.tr(
vdomh.th("Motif", style={"text-align": "center"}),
vdomh.th(score_name, style={"text-align": "center"}),
vdomh.th("PWM", style={"text-align": "center"})
)
)
body = []
for i, pfm in enumerate(pfms):
pwm = pfm_to_pwm(pfm)
if np.sum(pwm[:, [0, 2]]) < 0.5 * np.sum(pwm):
# Flip to purine-rich version
pwm = np.flip(pwm, axis=(0, 1))
fig = viz_sequence.plot_weights(pwm, figsize=(20, 4), return_fig=True)
fig.tight_layout()
body.append(
vdomh.tr(
vdomh.td(str(i + 1)),
vdomh.td(str(score_vals[i])),
vdomh.td(figure_to_vdom_image(fig))
)
)
display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
plt.close("all")
def show_seqlets_motif_table(profile_results_path, count_results_path, mode):
"""
Shows a table of motifs from the given results path.
`mode` is either `dichipmunk`, `homer`, `meme`, or `memechip`
"""
assert mode in ("dichipmunk", "homer", "meme", "memechip")
if mode == "dichipmunk":
score_name = "Supporting sequences"
p_pfms, p_score_vals = read_motifs.import_dichipmunk_pfms(profile_results_path)
c_pfms, c_score_vals = read_motifs.import_dichipmunk_pfms(count_results_path)
elif mode == "homer":
score_name = "Log enrichment"
p_pfms, p_score_vals = read_motifs.import_homer_pfms(profile_results_path)
c_pfms, c_score_vals = read_motifs.import_homer_pfms(count_results_path)
elif mode == "meme":
score_name = "E-value"
p_pfms, p_score_vals = read_motifs.import_meme_pfms(profile_results_path)
c_pfms, c_score_vals = read_motifs.import_meme_pfms(count_results_path)
else:
score_name = "E-value"
p_pfms, p_score_vals = read_motifs.import_meme_pfms(
os.path.join(profile_results_path, "meme_out")
)
c_pfms, c_score_vals = read_motifs.import_meme_pfms(
os.path.join(count_results_path, "meme_out")
)
colgroup = vdomh.colgroup(
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "40%"}),
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "40%"})
)
header = vdomh.thead(
vdomh.tr(
vdomh.th("Motif", style={"text-align": "center"}),
vdomh.th(score_name + " (profile)", style={"text-align": "center"}),
vdomh.th("PWM (profile)", style={"text-align": "center"}),
vdomh.th(score_name + " (count)", style={"text-align": "center"}),
vdomh.th("PWM (count)", style={"text-align": "center"})
)
)
body = []
for i in range(max(len(p_pfms), len(c_pfms))):
rows = [vdomh.td(str(i + 1))]
if i < len(p_pfms):
pwm = pfm_to_pwm(p_pfms[i])
if np.sum(pwm[:, [0, 2]]) < 0.5 * np.sum(pwm):
# Flip to purine-rich version
pwm = np.flip(pwm, axis=(0, 1))
fig = viz_sequence.plot_weights(pwm, figsize=(20, 4), return_fig=True)
fig.tight_layout()
rows.extend([
vdomh.td(str(p_score_vals[i])),
vdomh.td(figure_to_vdom_image(fig))
])
else:
rows.extend([vdomh.td(), vdomh.td()])
if i < len(c_pfms):
pwm = pfm_to_pwm(c_pfms[i])
if np.sum(pwm[:, [0, 2]]) < 0.5 * np.sum(pwm):
# Flip to purine-rich version
pwm = np.flip(pwm, axis=(0, 1))
fig = viz_sequence.plot_weights(pwm, figsize=(20, 4), return_fig=True)
fig.tight_layout()
rows.extend([
vdomh.td(str(c_score_vals[i])),
vdomh.td(figure_to_vdom_image(fig))
])
else:
rows.extend([vdomh.td(), vdomh.td()])
body.append(vdomh.tr(*rows))
display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
plt.close("all")
show_peaks_motif_table(os.path.join(peaks_path, "dichipmunk"), "dichipmunk")
Motif | Supporting sequences | PWM |
---|---|---|
1 | 1997 | |
2 | 2000 | |
3 | 1998 | |
4 | 1712 | |
5 | 934 | |
6 | 415 | |
7 | 192 | |
8 | 159 | |
9 | 63 | |
10 | 63 |
show_seqlets_motif_table(
os.path.join(multitask_profile_seqlets_path, "dichipmunk"),
os.path.join(multitask_count_seqlets_path, "dichipmunk"),
"dichipmunk"
)
Motif | Supporting sequences (profile) | PWM (profile) | Supporting sequences (count) | PWM (count) |
---|---|---|---|---|
1 | 11102 | 8077 | ||
2 | 1582 | 2858 | ||
3 | 597 | 189 | ||
4 | 41 |
if task_index is not None:
show_seqlets_motif_table(
os.path.join(singletask_profile_seqlets_path, "dichipmunk"),
os.path.join(singletask_count_seqlets_path, "dichipmunk"),
"dichipmunk"
)
Motif | Supporting sequences (profile) | PWM (profile) | Supporting sequences (count) | PWM (count) |
---|---|---|---|---|
1 | 9907 | 5635 | ||
2 | 4670 | 2525 | ||
3 | 33 | 800 | ||
4 | 5 | 231 | ||
5 | 51 | |||
6 | 16 |
show_peaks_motif_table(os.path.join(peaks_path, "homer"), "homer")
/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig = plt.figure(figsize=figsize)
Motif | Log enrichment | PWM |
---|---|---|
1 | -2506.827597 | |
2 | -2073.259008 | |
3 | -1717.958485 | |
4 | -1598.9711 | |
5 | -1462.091416 | |
6 | -1255.879349 | |
7 | -1171.646217 | |
8 | -1129.993354 | |
9 | -942.683541 | |
10 | -938.982683 | |
11 | -869.258945 | |
12 | -785.656157 | |
13 | -736.990426 | |
14 | -678.447248 | |
15 | -597.530083 | |
16 | -573.895153 | |
17 | -311.056492 | |
18 | -232.982635 | |
19 | -175.933827 | |
20 | -130.666474 | |
21 | -106.185461 | |
22 | -51.912349 |
show_seqlets_motif_table(
os.path.join(multitask_profile_seqlets_path, "homer"),
os.path.join(multitask_count_seqlets_path, "homer"),
"homer"
)
Motif | Log enrichment (profile) | PWM (profile) | Log enrichment (count) | PWM (count) |
---|---|---|---|---|
1 | -9533.348254 | -9764.467368 | ||
2 | -1779.172526 | -1859.267136 | ||
3 | -1225.539903 | -1334.401975 | ||
4 | -821.71859 | -962.822217 | ||
5 | -771.7383 | -642.805945 | ||
6 | -684.111862 | -328.463578 | ||
7 | -655.291735 | -275.369878 | ||
8 | -486.970828 | -259.39964 | ||
9 | -382.834675 | -164.188356 | ||
10 | -343.724612 | -105.145037 | ||
11 | -336.754494 | -85.898285 | ||
12 | -319.701086 | -67.117456 | ||
13 | -263.844137 | -49.870912 | ||
14 | -228.695449 | -36.225715 | ||
15 | -228.695449 | |||
16 | -215.037171 | |||
17 | -207.868309 | |||
18 | -173.02801 | |||
19 | -167.467817 | |||
20 | -106.510547 | |||
21 | -36.193041 | |||
22 | -22.274561 |
if task_index is not None:
show_seqlets_motif_table(
os.path.join(singletask_profile_seqlets_path, "homer"),
os.path.join(singletask_count_seqlets_path, "homer"),
"homer"
)
Motif | Log enrichment (profile) | PWM (profile) | Log enrichment (count) | PWM (count) |
---|---|---|---|---|
1 | -8894.666971 | -10338.355895 | ||
2 | -3067.895493 | -3534.123961 | ||
3 | -1383.557213 | -626.466309 | ||
4 | -866.368198 | -581.507186 | ||
5 | -859.680208 | -487.73266 | ||
6 | -795.238658 | -369.927393 | ||
7 | -762.825934 | -219.23709 | ||
8 | -754.175896 | -94.15917 | ||
9 | -694.355519 | -89.461858 | ||
10 | -636.787719 | -49.647916 | ||
11 | -587.131917 | -32.346825 | ||
12 | -525.865668 | |||
13 | -501.626333 | |||
14 | -448.2773 | |||
15 | -441.750488 | |||
16 | -440.627199 | |||
17 | -406.634102 | |||
18 | -401.344522 | |||
19 | -297.98123 | |||
20 | -212.352487 | |||
21 | -200.150618 | |||
22 | -121.509174 |
show_peaks_motif_table(os.path.join(peaks_path, "memechip"), "memechip")
Motif | E-value | PWM |
---|---|---|
1 | 4e-78 | |
2 | 6.7e-56 | |
3 | 2.6e-33 | |
4 | 7e-13 | |
5 | 6.6e-15 | |
6 | 6.2e-13 | |
7 | 2.4e-09 | |
8 | 1.5e-07 | |
9 | 4.8e-08 | |
10 | 1300.0 |
show_seqlets_motif_table(
os.path.join(multitask_profile_seqlets_path, "meme"),
os.path.join(multitask_count_seqlets_path, "meme"),
"meme"
)
Motif | E-value (profile) | PWM (profile) | E-value (count) | PWM (count) |
---|---|---|---|---|
1 | 3.6e-232 | 0.0 | ||
2 | 6.9e-101 | 9e-40 | ||
3 | 2.7e-59 | 2.1e-16 | ||
4 | 6.5e-39 | 0.0049 | ||
5 | 5.4e-36 | 26.0 | ||
6 | 1.4e-16 | 58000.0 | ||
7 | 2e-13 | 81000.0 | ||
8 | 1.6e-09 | 440000.0 | ||
9 | 0.00068 | 520000.0 | ||
10 | 0.013 | 550000.0 |
if task_index is not None:
show_seqlets_motif_table(
os.path.join(singletask_profile_seqlets_path, "meme"),
os.path.join(singletask_count_seqlets_path, "meme"),
"meme"
)
Motif | E-value (profile) | PWM (profile) | E-value (count) | PWM (count) |
---|---|---|---|---|
1 | 4.9e-209 | 0.0 | ||
2 | 1.4e-142 | 9.4e-131 | ||
3 | 4.9e-72 | 1.5e-09 | ||
4 | 3e-54 | 0.015 | ||
5 | 7e-43 | 14000.0 | ||
6 | 9.2e-38 | 160000.0 | ||
7 | 8.7e-37 | 220000.0 | ||
8 | 4.1e-12 | 240000.0 | ||
9 | 1.7e-08 | 280000.0 | ||
10 | 3.7e-08 | 460000.0 |