DiChIPMunk: on peaks and on multi-task seqlets and on single-task seqlets
HOMER: on peaks and on multi-task seqlets and on single-task seqlets
MEME: on peaks and on multitask seqlets and on single-task seqlets
import sys
import os
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from util import figure_to_vdom_image
import motif.read_motifs as read_motifs
from motif.read_motifs import pfm_to_pwm
import plot.viz_sequence as viz_sequence
import numpy as np
import matplotlib.pyplot as plt
import vdom.helpers as vdomh
from IPython.display import display
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
multitask_fold = int(os.environ["TFM_MULTITASK_FOLD"])
if "TFM_TASK_INDEX" in os.environ:
task_index = int(os.environ["TFM_TASK_INDEX"])
singletask_fold = int(os.environ["TFM_SINGLETASK_FOLD"])
else:
task_index = None
singletask_fold = None
print("TF name: %s" % tf_name)
print("Multi-task fold: %s" % multitask_fold)
print("Task index: %s" % task_index)
print("Single-task fold: %s" % singletask_fold)
TF name: CEBPB Multi-task fold: 7 Task index: 1 Single-task fold: 7
# Define paths and constants
base_path = "/users/amtseng/tfmodisco/results/classic_motifs/"
multitask_seqlets_dir = os.path.join(
base_path, "seqlets", "multitask_profile_finetune",
"%s_multitask_profile_finetune_fold%s" % (tf_name, multitask_fold)
)
if task_index is None:
peaks_path = os.path.join(base_path, "peaks", tf_name, "%s_peaks_taskall" % tf_name)
multitask_profile_seqlets_path = os.path.join(
multitask_seqlets_dir,
"%s_seqlets_profile_taskall" % tf_name
)
multitask_count_seqlets_path = os.path.join(
multitask_seqlets_dir,
"%s_seqlets_count_taskall" % tf_name
)
else:
peaks_path = os.path.join(base_path, "peaks", tf_name, "%s_peaks_task%d" % (tf_name, task_index))
multitask_profile_seqlets_path = os.path.join(
multitask_seqlets_dir,
"%s_seqlets_profile_task%d" % (tf_name, task_index)
)
multitask_count_seqlets_path = os.path.join(
multitask_seqlets_dir,
"%s_seqlets_count_task%d" % (tf_name, task_index)
)
singletask_seqlets_dir = os.path.join(
base_path, "seqlets", "singletask_profile_finetune",
"%s_singletask_profile_finetune_fold%s" % (tf_name, singletask_fold),
"task_%d" % task_index
)
singletask_profile_seqlets_path = os.path.join(
singletask_seqlets_dir,
"%s_seqlets_profile_task%d" % (tf_name, task_index)
)
singletask_count_seqlets_path = os.path.join(
singletask_seqlets_dir,
"%s_seqlets_count_task%d" % (tf_name, task_index)
)
def show_peaks_motif_table(results_path, mode):
"""
Shows a table of motifs from the given results path.
`mode` is either `dichipmunk`, `homer`, `meme`, or `memechip`.
"""
assert mode in ("dichipmunk", "homer", "meme", "memechip")
if mode == "dichipmunk":
score_name = "Supporting sequences"
pfms, score_vals = read_motifs.import_dichipmunk_pfms(results_path)
elif mode == "homer":
score_name = "Log enrichment"
pfms, score_vals = read_motifs.import_homer_pfms(results_path)
elif mode == "meme":
score_name = "E-value"
pfms, score_vals = read_motifs.import_meme_pfms(results_path)
else:
score_name = "E-value"
pfms, score_vals = read_motifs.import_meme_pfms(
os.path.join(results_path, "meme_out")
)
colgroup = vdomh.colgroup(
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "40%"})
)
header = vdomh.thead(
vdomh.tr(
vdomh.th("Motif", style={"text-align": "center"}),
vdomh.th(score_name, style={"text-align": "center"}),
vdomh.th("PWM", style={"text-align": "center"})
)
)
body = []
for i, pfm in enumerate(pfms):
pwm = pfm_to_pwm(pfm)
if np.sum(pwm[:, [0, 2]]) < 0.5 * np.sum(pwm):
# Flip to purine-rich version
pwm = np.flip(pwm, axis=(0, 1))
fig = viz_sequence.plot_weights(pwm, figsize=(20, 4), return_fig=True)
fig.tight_layout()
body.append(
vdomh.tr(
vdomh.td(str(i + 1)),
vdomh.td(str(score_vals[i])),
vdomh.td(figure_to_vdom_image(fig))
)
)
display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
plt.close("all")
def show_seqlets_motif_table(profile_results_path, count_results_path, mode):
"""
Shows a table of motifs from the given results path.
`mode` is either `dichipmunk`, `homer`, `meme`, or `memechip`
"""
assert mode in ("dichipmunk", "homer", "meme", "memechip")
if mode == "dichipmunk":
score_name = "Supporting sequences"
p_pfms, p_score_vals = read_motifs.import_dichipmunk_pfms(profile_results_path)
c_pfms, c_score_vals = read_motifs.import_dichipmunk_pfms(count_results_path)
elif mode == "homer":
score_name = "Log enrichment"
p_pfms, p_score_vals = read_motifs.import_homer_pfms(profile_results_path)
c_pfms, c_score_vals = read_motifs.import_homer_pfms(count_results_path)
elif mode == "meme":
score_name = "E-value"
p_pfms, p_score_vals = read_motifs.import_meme_pfms(profile_results_path)
c_pfms, c_score_vals = read_motifs.import_meme_pfms(count_results_path)
else:
score_name = "E-value"
p_pfms, p_score_vals = read_motifs.import_meme_pfms(
os.path.join(profile_results_path, "meme_out")
)
c_pfms, c_score_vals = read_motifs.import_meme_pfms(
os.path.join(count_results_path, "meme_out")
)
colgroup = vdomh.colgroup(
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "40%"}),
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "40%"})
)
header = vdomh.thead(
vdomh.tr(
vdomh.th("Motif", style={"text-align": "center"}),
vdomh.th(score_name + " (profile)", style={"text-align": "center"}),
vdomh.th("PWM (profile)", style={"text-align": "center"}),
vdomh.th(score_name + " (count)", style={"text-align": "center"}),
vdomh.th("PWM (count)", style={"text-align": "center"})
)
)
body = []
for i in range(max(len(p_pfms), len(c_pfms))):
rows = [vdomh.td(str(i + 1))]
if i < len(p_pfms):
pwm = pfm_to_pwm(p_pfms[i])
if np.sum(pwm[:, [0, 2]]) < 0.5 * np.sum(pwm):
# Flip to purine-rich version
pwm = np.flip(pwm, axis=(0, 1))
fig = viz_sequence.plot_weights(pwm, figsize=(20, 4), return_fig=True)
fig.tight_layout()
rows.extend([
vdomh.td(str(p_score_vals[i])),
vdomh.td(figure_to_vdom_image(fig))
])
else:
rows.extend([vdomh.td(), vdomh.td()])
if i < len(c_pfms):
pwm = pfm_to_pwm(c_pfms[i])
if np.sum(pwm[:, [0, 2]]) < 0.5 * np.sum(pwm):
# Flip to purine-rich version
pwm = np.flip(pwm, axis=(0, 1))
fig = viz_sequence.plot_weights(pwm, figsize=(20, 4), return_fig=True)
fig.tight_layout()
rows.extend([
vdomh.td(str(c_score_vals[i])),
vdomh.td(figure_to_vdom_image(fig))
])
else:
rows.extend([vdomh.td(), vdomh.td()])
body.append(vdomh.tr(*rows))
display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
plt.close("all")
show_peaks_motif_table(os.path.join(peaks_path, "dichipmunk"), "dichipmunk")
| Motif | Supporting sequences | PWM |
|---|---|---|
| 1 | 1395 | |
| 2 | 2000 | |
| 3 | 2000 | |
| 4 | 2000 | |
| 5 | 1917 | |
| 6 | 1202 | |
| 7 | 738 | |
| 8 | 471 | |
| 9 | 226 | |
| 10 | 145 |
show_seqlets_motif_table(
os.path.join(multitask_profile_seqlets_path, "dichipmunk"),
os.path.join(multitask_count_seqlets_path, "dichipmunk"),
"dichipmunk"
)
| Motif | Supporting sequences (profile) | PWM (profile) | Supporting sequences (count) | PWM (count) |
|---|---|---|---|---|
| 1 | 11473 | 14254 | ||
| 2 | 4287 | 1680 | ||
| 3 | 175 | 942 | ||
| 4 | 355 | 461 | ||
| 5 | 125 | |||
| 6 | 58 | |||
| 7 | 5 |
if task_index is not None:
show_seqlets_motif_table(
os.path.join(singletask_profile_seqlets_path, "dichipmunk"),
os.path.join(singletask_count_seqlets_path, "dichipmunk"),
"dichipmunk"
)
| Motif | Supporting sequences (profile) | PWM (profile) | Supporting sequences (count) | PWM (count) |
|---|---|---|---|---|
| 1 | 13141 | 13524 | ||
| 2 | 1921 | 2666 | ||
| 3 | 1835 | 639 | ||
| 4 | 350 | |||
| 5 | 25 | |||
| 6 | 5 |
show_peaks_motif_table(os.path.join(peaks_path, "homer"), "homer")
| Motif | Log enrichment | PWM |
|---|---|---|
| 1 | -18945.014176 | |
| 2 | -4961.534705 | |
| 3 | -4086.839961 | |
| 4 | -3298.971219 | |
| 5 | -3162.869139 | |
| 6 | -2979.384357 | |
| 7 | -2559.498025 | |
| 8 | -2484.596978 | |
| 9 | -2286.79765 | |
| 10 | -1918.874467 | |
| 11 | -1801.437202 | |
| 12 | -1799.706339 | |
| 13 | -1430.432981 | |
| 14 | -1339.218653 | |
| 15 | -1313.957315 | |
| 16 | -1057.977153 | |
| 17 | -616.585709 | |
| 18 | -317.138945 |
show_seqlets_motif_table(
os.path.join(multitask_profile_seqlets_path, "homer"),
os.path.join(multitask_count_seqlets_path, "homer"),
"homer"
)
/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig = plt.figure(figsize=figsize)
| Motif | Log enrichment (profile) | PWM (profile) | Log enrichment (count) | PWM (count) |
|---|---|---|---|---|
| 1 | -19284.331087 | -30313.730254 | ||
| 2 | -4534.977418 | -3222.791269 | ||
| 3 | -4350.971249 | -2757.385169 | ||
| 4 | -3235.564156 | -2712.710637 | ||
| 5 | -3167.177654 | -1529.635453 | ||
| 6 | -2343.000424 | -1434.866734 | ||
| 7 | -2226.680527 | -747.85174 | ||
| 8 | -2077.301321 | -332.432931 | ||
| 9 | -2039.715386 | -315.338333 | ||
| 10 | -1713.391695 | -260.319209 | ||
| 11 | -1622.059136 | -137.869463 | ||
| 12 | -1485.477183 | -135.476864 | ||
| 13 | -1428.848188 | -67.700947 | ||
| 14 | -936.927971 | -52.856141 | ||
| 15 | -902.302967 | |||
| 16 | -789.816863 | |||
| 17 | -485.046701 | |||
| 18 | -86.096691 | |||
| 19 | -57.339414 | |||
| 20 | -48.954794 |
if task_index is not None:
show_seqlets_motif_table(
os.path.join(singletask_profile_seqlets_path, "homer"),
os.path.join(singletask_count_seqlets_path, "homer"),
"homer"
)
| Motif | Log enrichment (profile) | PWM (profile) | Log enrichment (count) | PWM (count) |
|---|---|---|---|---|
| 1 | -20575.833835 | -22736.221712 | ||
| 2 | -3237.255538 | -4023.67547 | ||
| 3 | -2906.774252 | -2967.389489 | ||
| 4 | -2543.036951 | -2716.250024 | ||
| 5 | -2335.879657 | -1598.11829 | ||
| 6 | -1994.326347 | -942.262931 | ||
| 7 | -1770.678791 | -227.93911 | ||
| 8 | -1543.947058 | -169.663683 | ||
| 9 | -1412.25393 | -111.560757 | ||
| 10 | -825.00352 | -98.931624 | ||
| 11 | -711.648074 | -79.098645 | ||
| 12 | -544.723705 | -43.040025 | ||
| 13 | -529.923597 | -30.807531 | ||
| 14 | -526.569705 | -11.134274 | ||
| 15 | -431.276093 | |||
| 16 | -333.240831 | |||
| 17 | -304.799059 | |||
| 18 | -283.322544 | |||
| 19 | -171.00832 | |||
| 20 | -126.070474 | |||
| 21 | -21.889836 |
show_peaks_motif_table(os.path.join(peaks_path, "memechip"), "memechip")
| Motif | E-value | PWM |
|---|---|---|
| 1 | 0.0 | |
| 2 | 1.6e-100 | |
| 3 | 4.1e-77 | |
| 4 | 8.4e-59 | |
| 5 | 2.8e-48 | |
| 6 | 3.7e-25 | |
| 7 | 7.1e-19 | |
| 8 | 4.2e-14 | |
| 9 | 2.2e-09 | |
| 10 | 1.3e-08 |
show_seqlets_motif_table(
os.path.join(multitask_profile_seqlets_path, "meme"),
os.path.join(multitask_count_seqlets_path, "meme"),
"meme"
)
| Motif | E-value (profile) | PWM (profile) | E-value (count) | PWM (count) |
|---|---|---|---|---|
| 1 | 0.0 | 0.0 | ||
| 2 | 1.7e-128 | 1.1e-75 | ||
| 3 | 1.1e-124 | 7.2e-50 | ||
| 4 | 9.3e-74 | 6.3e-41 | ||
| 5 | 3.3e-70 | 1.4e-36 | ||
| 6 | 4.8e-59 | 8.2e-31 | ||
| 7 | 6e-40 | 6.2e-07 | ||
| 8 | 6.6e-38 | 440.0 | ||
| 9 | 3.2e-27 | 8300.0 | ||
| 10 | 2e-21 | 150000.0 |
if task_index is not None:
show_seqlets_motif_table(
os.path.join(singletask_profile_seqlets_path, "meme"),
os.path.join(singletask_count_seqlets_path, "meme"),
"meme"
)
| Motif | E-value (profile) | PWM (profile) | E-value (count) | PWM (count) |
|---|---|---|---|---|
| 1 | 0.0 | 0.0 | ||
| 2 | 5e-58 | 2e-76 | ||
| 3 | 9.5e-47 | 9.5e-70 | ||
| 4 | 3.2e-44 | 2e-62 | ||
| 5 | 1.1e-45 | 3.6e-45 | ||
| 6 | 6.6e-27 | 6.7e-07 | ||
| 7 | 7.2e-23 | 12.0 | ||
| 8 | 2.7e-19 | 50000.0 | ||
| 9 | 1.4e-06 | 51000.0 | ||
| 10 | 0.13 | 73000.0 |