DiChIPMunk: on peaks and on multi-task seqlets and on single-task seqlets
HOMER: on peaks and on multi-task seqlets and on single-task seqlets
MEME: on peaks and on multitask seqlets and on single-task seqlets
import sys
import os
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
import numpy as np
import h5py
import matplotlib.pyplot as plt
import motif.read_motifs as read_motifs
import motif.match_motifs as match_motifs
from util import figure_to_vdom_image
import plot.viz_sequence as viz_sequence
import vdom.helpers as vdomh
from IPython.display import display
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
multitask_fold = int(os.environ["TFM_MULTITASK_FOLD"])
head = os.environ["TFM_HEAD"]
assert head in ("profile", "count")
if "TFM_TASK_INDEX" in os.environ:
task_index = int(os.environ["TFM_TASK_INDEX"])
singletask_fold = int(os.environ["TFM_SINGLETASK_FOLD"])
else:
task_index = None
singletask_fold = None
motif_file = os.environ["TFM_MOTIF_FILE"] # TF-MoDISco motifs
print("TF name: %s" % tf_name)
print("Multi-task fold: %s" % multitask_fold)
print("Head: %s" % head)
print("Task index: %s" % task_index)
print("Single-task fold: %s" % singletask_fold)
print("TF-MoDISco motif file: %s" % motif_file)
# Define paths and constants
base_path = "/users/amtseng/tfmodisco/results/classic_motifs/"
multitask_seqlets_dir = os.path.join(
base_path, "seqlets", "multitask_profile_finetune",
"%s_multitask_profile_finetune_fold%s" % (tf_name, multitask_fold)
)
if task_index is None:
peaks_path = os.path.join(base_path, "peaks", tf_name, "%s_peaks_taskall" % tf_name)
multitask_seqlets_path = os.path.join(
multitask_seqlets_dir,
"%s_seqlets_%s_taskall" % (tf_name, head)
)
else:
peaks_path = os.path.join(base_path, "peaks", tf_name, "%s_peaks_task%d" % (tf_name, task_index))
multitask_seqlets_path = os.path.join(
multitask_seqlets_dir,
"%s_seqlets_%s_task%d" % (tf_name, head, task_index)
)
singletask_seqlets_dir = os.path.join(
base_path, "seqlets", "singletask_profile_finetune",
"%s_singletask_profile_finetune_fold%s" % (tf_name, singletask_fold),
"task_%d" % task_index
)
singletask_seqlets_path = os.path.join(
singletask_seqlets_dir,
"%s_seqlets_%s_task%d" % (tf_name, head, task_index)
)
def import_tfmodisco_motifs(motif_file):
"""
Imports a set of motifs from the saved HDF5.
Returns a list of motifs as L x 4 arrays, and a parallel list of
motif names.
"""
motifs, motif_names = [], []
with h5py.File(motif_file, "r") as f:
for key in f.keys():
motif_names.append(key)
motifs.append(f[key]["pfm_trimmed"][:])
return motifs, motif_names
def pfm_to_forward_pwm(pfm):
"""
Converts PFM to PWM and flips to purine-rich orientation.
"""
pwm = read_motifs.pfm_to_pwm(pfm)
if np.sum(pwm[:, [0, 2]]) < 0.5 * np.sum(pwm):
# Flip to purine-rich version
pwm = np.flip(pwm, axis=(0, 1))
return pwm
def show_matched_motifs_table(
tfm_motifs, tfm_names, bench_type, bench_motifs, bench_scores, matches
):
"""
Shows a table of motifs from the given matched results.
`bench_type` is either `dichipmunk`, `homer`, or `meme`.
`matches` is NumPy array parallel to `bench_motifs`, where
each entry is the index of the match in `tfm_motifs`, and -1
if no match was found.
"""
assert len(matches) == len(bench_motifs)
assert bench_type in ("dichipmunk", "homer", "meme")
if bench_type == "dichipmunk":
score_name = "Supporting sequences"
bench_name = "DiChIPMunk"
elif bench_type == "homer":
score_name = "Log enrichment"
bench_name = "HOMER"
else:
score_name = "E-value"
bench_name = "MEME"
colgroup = vdomh.colgroup(
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "45%"}),
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "45%"})
)
header = vdomh.thead(
vdomh.tr(
vdomh.th("TF-MoDISco motif ID", style={"text-align": "center"}),
vdomh.th("TF-MoDISco PWM", style={"text-align": "center"}),
vdomh.th(score_name, style={"text-align": "center"}),
vdomh.th("%s PWM" % bench_name, style={"text-align": "center"})
)
)
# First, show a table of matches
display(vdomh.p("Matched motifs"))
body = []
for bench_index, tfm_index in enumerate(matches):
if tfm_index == -1:
continue
tfm_fig = viz_sequence.plot_weights(
pfm_to_forward_pwm(tfm_motifs[tfm_index]), figsize=(20, 4), return_fig=True
)
tfm_fig.tight_layout()
bench_fig = viz_sequence.plot_weights(
pfm_to_forward_pwm(bench_motifs[bench_index]), figsize=(20, 4), return_fig=True
)
bench_fig.tight_layout()
body.append(
vdomh.tr(
vdomh.td(str(tfm_names[tfm_index])),
vdomh.td(figure_to_vdom_image(tfm_fig)),
vdomh.td(str(bench_scores[bench_index])),
vdomh.td(figure_to_vdom_image(bench_fig))
)
)
display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
plt.close("all")
# Now, show a table of leftovers
display(vdomh.p("Motifs without matches"))
tfm_lone_indices = np.setdiff1d(np.arange(len(tfm_motifs)), matches)
bench_lone_indices = np.where(matches == -1)[0]
rows = []
for i in range(max(len(tfm_lone_indices), len(bench_lone_indices))):
row = []
if i < len(tfm_lone_indices):
tfm_fig = viz_sequence.plot_weights(
pfm_to_forward_pwm(tfm_motifs[tfm_lone_indices[i]]), figsize=(20, 4), return_fig=True
)
tfm_fig.tight_layout()
row.extend([
vdomh.td(str(tfm_names[tfm_lone_indices[i]])),
vdomh.td(figure_to_vdom_image(tfm_fig))
])
else:
row.extend([vdomh.td(), vdomh.td()])
if i < len(bench_lone_indices):
bench_fig = viz_sequence.plot_weights(
pfm_to_forward_pwm(bench_motifs[bench_lone_indices[i]]), figsize=(20, 4), return_fig=True
)
bench_fig.tight_layout()
row.extend([
vdomh.td(str(bench_scores[bench_lone_indices[i]])),
vdomh.td(figure_to_vdom_image(bench_fig))
])
else:
row.extend([vdomh.td(), vdomh.td()])
rows.append(row)
body = [vdomh.tr(*row) for row in rows]
display(vdomh.table(colgroup, header, vdomh.tbody(*body)))
plt.close("all")
tfm_pfms, tfm_names = import_tfmodisco_motifs(motif_file)
dichipmunk_peak_pfms, dichipmunk_peak_num_seqs = read_motifs.import_dichipmunk_pfms(
os.path.join(peaks_path, "dichipmunk")
)
dichipmunk_multitask_seqlet_pfms, dichipmunk_multitask_seqlet_num_seqs = read_motifs.import_dichipmunk_pfms(
os.path.join(multitask_seqlets_path, "dichipmunk")
)
homer_peak_pfms, homer_peak_enrichments = read_motifs.import_homer_pfms(
os.path.join(peaks_path, "homer")
)
homer_multitask_seqlet_pfms, homer_multitask_seqlet_enrichments = read_motifs.import_homer_pfms(
os.path.join(multitask_seqlets_path, "homer")
)
meme_peak_pfms, meme_peak_evalues = read_motifs.import_meme_pfms(
os.path.join(peaks_path, "meme")
)
meme_multitask_seqlet_pfms, meme_multitask_seqlet_evalues = read_motifs.import_meme_pfms(
os.path.join(multitask_seqlets_path, "meme")
)
if task_index is not None:
dichipmunk_singletask_seqlet_pfms, dichipmunk_singletask_seqlet_num_seqs = read_motifs.import_dichipmunk_pfms(
os.path.join(singletask_seqlets_path, "dichipmunk")
)
homer_singletask_seqlet_pfms, homer_singletask_seqlet_enrichments = read_motifs.import_homer_pfms(
os.path.join(singletask_seqlets_path, "homer")
)
meme_singletask_seqlet_pfms, meme_singletask_seqlet_evalues = read_motifs.import_meme_pfms(
os.path.join(singletask_seqlets_path, "meme")
)
dichipmunk_peak_matches = match_motifs.match_motifs_to_targets(dichipmunk_peak_pfms, tfm_pfms)
show_matched_motifs_table(
tfm_pfms, tfm_names, "dichipmunk", dichipmunk_peak_pfms, dichipmunk_peak_num_seqs,
dichipmunk_peak_matches
)
dichipmunk_multitask_seqlet_matches = match_motifs.match_motifs_to_targets(dichipmunk_multitask_seqlet_pfms, tfm_pfms)
show_matched_motifs_table(
tfm_pfms, tfm_names, "dichipmunk", dichipmunk_multitask_seqlet_pfms, dichipmunk_multitask_seqlet_num_seqs,
dichipmunk_multitask_seqlet_matches
)
if task_index is not None:
dichipmunk_singletask_seqlet_matches = match_motifs.match_motifs_to_targets(dichipmunk_singletask_seqlet_pfms, tfm_pfms)
show_matched_motifs_table(
tfm_pfms, tfm_names, "dichipmunk", dichipmunk_singletask_seqlet_pfms, dichipmunk_singletask_seqlet_num_seqs,
dichipmunk_singletask_seqlet_matches
)
homer_peak_matches = match_motifs.match_motifs_to_targets(homer_peak_pfms, tfm_pfms)
show_matched_motifs_table(
tfm_pfms, tfm_names, "homer", homer_peak_pfms, homer_peak_enrichments,
homer_peak_matches
)
homer_multitask_seqlet_matches = match_motifs.match_motifs_to_targets(homer_multitask_seqlet_pfms, tfm_pfms)
show_matched_motifs_table(
tfm_pfms, tfm_names, "homer", homer_multitask_seqlet_pfms, homer_multitask_seqlet_enrichments,
homer_multitask_seqlet_matches
)
if task_index is not None:
homer_singletask_seqlet_matches = match_motifs.match_motifs_to_targets(homer_singletask_seqlet_pfms, tfm_pfms)
show_matched_motifs_table(
tfm_pfms, tfm_names, "homer", homer_singletask_seqlet_pfms, homer_singletask_seqlet_enrichments,
homer_singletask_seqlet_matches
)
meme_peak_matches = match_motifs.match_motifs_to_targets(meme_peak_pfms, tfm_pfms)
show_matched_motifs_table(
tfm_pfms, tfm_names, "meme", meme_peak_pfms, meme_peak_evalues,
meme_peak_matches
)
meme_multitask_seqlet_matches = match_motifs.match_motifs_to_targets(meme_multitask_seqlet_pfms, tfm_pfms)
show_matched_motifs_table(
tfm_pfms, tfm_names, "meme", meme_multitask_seqlet_pfms, meme_multitask_seqlet_evalues,
meme_multitask_seqlet_matches
)
if task_index is not None:
meme_singletask_seqlet_matches = match_motifs.match_motifs_to_targets(meme_singletask_seqlet_pfms, tfm_pfms)
show_matched_motifs_table(
tfm_pfms, tfm_names, "meme", meme_singletask_seqlet_pfms, meme_singletask_seqlet_evalues,
meme_singletask_seqlet_matches
)