import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/notebooks/reports/"))
import util
import motif.read_motifs as read_motifs
import motif.moods as moods
import plot.viz_sequence as viz_sequence
from feature.util import one_hot_to_seq
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import matplotlib.patches as patches
import tempfile
import tqdm
tqdm.tqdm_notebook(range(1))
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:18: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
<tqdm.notebook.tqdm_notebook at 0x7fa2ab6cf890>
# Plotting defaults
font_manager.fontManager.ttflist.extend(
font_manager.createFontList(
font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
)
)
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 22,
"axes.labelsize": 20,
"legend.fontsize": 18,
"xtick.labelsize": 16,
"ytick.labelsize": 16,
"font.family": "Roboto",
"font.weight": "bold",
"svg.fonttype": "none"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
out_path = "/users/amtseng/tfmodisco/figures/ZNF248_pwm_vs_cwm"
os.makedirs(out_path, exist_ok=True)
tfm_motifs_path = os.path.join(
"/users/amtseng/tfmodisco/results/reports/tfmodisco_results/cache/",
"ZNF248",
"ZNF248_ChIPexo",
"ZNF248_ChIPexo_count",
"all_motifs.h5"
)
homer25_motifs_path = "/users/amtseng/tfmodisco/results/classic_motifs/peaks/ZNF248/ZNF248_ChIPExo/homer_25"
homer50_motifs_path = "/users/amtseng/tfmodisco/results/classic_motifs/peaks/ZNF248/ZNF248_ChIPExo/homer_50"
dichipmunk25_motifs_path = "/users/amtseng/tfmodisco/results/classic_motifs/peaks/ZNF248/ZNF248_ChIPExo/dichipmunk_25"
dichipmunk50_motifs_path = "/users/amtseng/tfmodisco/results/classic_motifs/peaks/ZNF248/ZNF248_ChIPExo/dichipmunk_50"
meme_motifs_path = "/users/amtseng/tfmodisco/results/classic_motifs/peaks/ZNF248/ZNF248_ChIPExo/memechip/meme_out"
b1h_motif_path = "/users/amtseng/tfmodisco/data/processed/B1H/ZNF248_B1Hcode.txt"
moods_hits_dir = "/users/amtseng/tfmodisco/results/moods/ZNF248/"
reference_fasta_path = "/users/amtseng/genomes/hg38.fasta"
peaks_bed_path = "/users/amtseng/tfmodisco/data/processed/ZNF248/labels/ChIPexo/ZNF248_Exo_HEK239T_all_peakints.bed.gz"
true_profiles_hdf5_path = "/users/amtseng/tfmodisco/data/processed/ZNF248/labels/ChIPexo/ZNF248_profiles.h5"
pred_profiles_hdf5_path = "/users/amtseng/tfmodisco/results/peak_predictions/ZNF248/ZNF248_ChIPexo_pred.h5"
imp_scores_hdf5_path = "/users/amtseng/tfmodisco/results/importance_scores/ZNF248/ZNF248_ChIPexo_shap_scores.h5"
hyp_score_key = "count_hyp_scores"
repeats_bed_path = "/users/amtseng/tfmodisco/data/processed/ZNF248/ZNF248_ChIPexo_peak_repeats.bed.gz"
# Import and show TF-MoDISco motifs
with h5py.File(tfm_motifs_path, "r") as f:
tfm_pfms, tfm_cwms = {}, {}
tfm_motif_keys = []
for key in f.keys():
tfm_pfms[key] = f[key]["pfm_trimmed"][:]
tfm_cwms[key] = f[key]["cwm_trimmed"][:]
if not np.all(tfm_pfms[key] == util.purine_rich_motif(tfm_pfms[key])):
tfm_pfms[key] = util.purine_rich_motif(tfm_pfms[key])
tfm_cwms[key] = util.purine_rich_motif(tfm_cwms[key])
tfm_motif_keys.append(key)
for key in tfm_motif_keys:
viz_sequence.plot_weights(read_motifs.pfm_to_pwm(tfm_pfms[key]))
# Import and show HOMER 25 motifs
homer25_pfms = [util.purine_rich_motif(pfm) for pfm in read_motifs.import_homer_pfms(homer25_motifs_path)[0]]
for pfm in homer25_pfms:
viz_sequence.plot_weights(read_motifs.pfm_to_pwm(pfm))
# Import and show HOMER 50 motifs
homer50_pfms = [util.purine_rich_motif(pfm) for pfm in read_motifs.import_homer_pfms(homer50_motifs_path)[0]]
for pfm in homer50_pfms:
viz_sequence.plot_weights(read_motifs.pfm_to_pwm(pfm))
# Import and show MEME motifs
meme_pfms = [util.purine_rich_motif(pfm) for pfm in read_motifs.import_meme_pfms(meme_motifs_path)[0]]
for pfm in meme_pfms:
viz_sequence.plot_weights(read_motifs.pfm_to_pwm(pfm))
# Import and show DiChIPMunk 25 motifs
dichipmunk25_pfms = [util.purine_rich_motif(pfm) for pfm in read_motifs.import_dichipmunk_pfms(dichipmunk25_motifs_path)[0]]
for pfm in dichipmunk25_pfms:
viz_sequence.plot_weights(read_motifs.pfm_to_pwm(pfm))
# Import and show DiChIPMunk 50 motifs
dichipmunk50_pfms = [util.purine_rich_motif(pfm) for pfm in read_motifs.import_dichipmunk_pfms(dichipmunk50_motifs_path)[0]]
for pfm in dichipmunk50_pfms:
viz_sequence.plot_weights(read_motifs.pfm_to_pwm(pfm))
# Import B1H recognition code
with open(b1h_motif_path, "r") as f:
line = next(f)
while not line.startswith("Pos"):
line = next(f)
rows = []
for line in f:
tokens = line.strip().split("\t")[1:]
if len(tokens) != 4:
continue
rows.append(np.array([float(x) for x in tokens]))
b1h_code = np.stack(rows)
# Align the B1H recognition to the TF-MoDISco motifs
b1h_code_aligned = np.zeros((50, 4))
b1h_code_aligned[20 : 20 + 21] = b1h_code
# Align the benchmark motifs to the TF-MoDISco motifs
homer25_aligned = np.zeros((50, 4))
homer25_aligned[22:22 + 25] = homer25_pfms[0]
homer50_aligned = np.zeros((50, 4))
homer50_aligned[12:] = homer50_pfms[0][:50 - 12]
meme_aligned = np.zeros((50, 4))
meme_aligned[18:18 + len(meme_pfms[0])] = meme_pfms[0]
dichipmunk25_aligned = np.zeros((50, 4))
dichipmunk25_aligned[13:13 + 26] = dichipmunk25_pfms[0]
dichipmunk50_aligned = np.zeros((50, 4))
dichipmunk50_aligned[6:] = dichipmunk50_pfms[0][:50 - 6]
viz_sequence.plot_weights(read_motifs.pfm_to_pwm(homer25_aligned), subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
os.path.join(out_path, "ZNF248_homer25_pwm.svg"),
format="svg"
)
viz_sequence.plot_weights(read_motifs.pfm_to_pwm(homer50_aligned), subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
os.path.join(out_path, "ZNF248_homer50_pwm.svg"),
format="svg"
)
viz_sequence.plot_weights(read_motifs.pfm_to_pwm(meme_aligned), subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
os.path.join(out_path, "ZNF248_meme_pwm.svg"),
format="svg"
)
viz_sequence.plot_weights(read_motifs.pfm_to_pwm(dichipmunk25_aligned), subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
os.path.join(out_path, "ZNF248_dichipmunk25_pwm.svg"),
format="svg"
)
viz_sequence.plot_weights(read_motifs.pfm_to_pwm(dichipmunk50_aligned), subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
os.path.join(out_path, "ZNF248_dichipmunk50_pwm.svg"),
format="svg"
)
viz_sequence.plot_weights(read_motifs.pfm_to_pwm(tfm_pfms["0_0"]), subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
os.path.join(out_path, "ZNF248_tfm_pwm.svg"),
format="svg"
)
viz_sequence.plot_weights(tfm_cwms["0_0"], subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
os.path.join(out_path, "ZNF248_tfm_cwm.svg"),
format="svg"
)
viz_sequence.plot_weights(read_motifs.pfm_to_pwm(b1h_code_aligned), subticks_frequency=100, figsize=(20, 4), return_fig=True)
plt.savefig(
os.path.join(out_path, "ZNF248_B1H.svg"),
format="svg"
)
# Construct and export motifs in MOODS-ready format
full_pfm = tfm_pfms["0_0"]
core_pfm = full_pfm[21:41]
viz_sequence.plot_weights(full_pfm)
viz_sequence.plot_weights(core_pfm)
moods.export_motifs({"core": core_pfm, "full": full_pfm}, moods_hits_dir)
hits_bed_path = os.path.join(moods_hits_dir, "moods_out_filtered.bed")
if not os.path.exists(hits_bed_path):
moods.run_moods(moods_hits_dir, reference_fasta_path)
moods.moods_hits_to_bed(
os.path.join(moods_hits_dir, "moods_out.csv"), os.path.join(moods_hits_dir, "moods_out.bed")
)
# Create peak bed expanded to 400 base pairs
center_cut_size = 400
peaks_table = pd.read_csv(
peaks_bed_path, sep="\t", header=None, index_col=False,
usecols=[0, 1, 2, 9],
names=["chrom", "start", "end", "summit_offset"]
)
peaks_table["start"] = \
(peaks_table["start"] + peaks_table["summit_offset"]) - \
(center_cut_size // 2)
peaks_table["end"] = peaks_table["start"] + center_cut_size
# Make sure nothing is negative
peaks_table["min"] = 0
peaks_table["start"] = peaks_table[["start", "min"]].max(axis=1)
del peaks_table["min"]
peaks_table[["chrom", "start", "end"]].to_csv(
os.path.join(moods_hits_dir, "peaks_cut.bed"), sep="\t",
header=False, index=False
)
filter_peak_bed_path = os.path.join(moods_hits_dir, "peaks_cut.bed")
# Filter for peaks
moods.filter_hits_for_peaks(
os.path.join(moods_hits_dir, "moods_out.bed"), hits_bed_path,
filter_peak_bed_path
)
def import_hit_table(path):
return pd.read_csv(
path, sep="\t", header=None, index_col=False,
names=["chrom", "start", "end", "key", "strand", "score", "peak_index"]
)
# Import hits table
hit_table = import_hit_table(hits_bed_path)
core_hits_table = hit_table[hit_table["key"] == "core"]
full_hits_table = hit_table[hit_table["key"] == "full"]
min_score = 12
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(core_hits_table["score"], bins=100, color="blue", label="core", density=True, alpha=0.3)
ax.hist(full_hits_table["score"], bins=100, color="red", label="full", density=True, alpha=0.3)
ax.axvline(min_score, color="gray")
ax.set_title("Histogram of motif instance match score")
ax.set_xlabel("MOODS match score")
plt.legend()
plt.show()
# Limit to high-scoring motif hits
core_hits_table = core_hits_table[core_hits_table["score"] >= min_score]
full_hits_table = full_hits_table[full_hits_table["score"] >= min_score]
core_peaks, full_peaks = np.unique(core_hits_table["peak_index"]), np.unique(full_hits_table["peak_index"])
core_shared_peaks = np.intersect1d(core_peaks, full_peaks)
core_unique_peaks, full_unique_peaks = np.setdiff1d(core_peaks, full_peaks), np.setdiff1d(full_peaks, core_peaks)
print("Number of core peaks: %d" % len(core_peaks))
print("Number of full peaks: %d" % len(full_peaks))
print("Number of shared peaks: %d" % len(core_shared_peaks))
print("Number of unique core peaks: %d" % len(core_unique_peaks))
print("Number of unique full peaks: %d" % len(full_unique_peaks))
Number of core peaks: 4689 Number of full peaks: 4687 Number of shared peaks: 4532 Number of unique core peaks: 157 Number of unique full peaks: 155
# Get set of core and full hits that overlap, unique core, and unique full hits
core_f = tempfile.NamedTemporaryFile("w")
full_f = tempfile.NamedTemporaryFile("w")
core_shared_f = tempfile.NamedTemporaryFile("w")
core_unique_f = tempfile.NamedTemporaryFile("w")
full_unique_f = tempfile.NamedTemporaryFile("w")
core_hits_table.to_csv(core_f, header=False, sep="\t", index=False)
full_hits_table.to_csv(full_f, header=False, sep="\t", index=False)
!bedtools intersect -a {core_f.name} -b {full_f.name} -u > {core_shared_f.name}
!bedtools intersect -a {core_f.name} -b {full_f.name} -v > {core_unique_f.name}
!bedtools intersect -b {core_f.name} -a {full_f.name} -v > {full_unique_f.name}
core_shared_hits_table = import_hit_table(core_shared_f.name)
core_unique_hits_table = import_hit_table(core_unique_f.name)
full_unique_hits_table = import_hit_table(full_unique_f.name)
for f in (core_f, full_f, core_shared_f, core_unique_f, full_unique_f):
f.close()
print("Number of core hits: %d" % len(core_hits_table))
print("Number of full hits: %d" % len(full_hits_table))
print("Number of shared hits: %d" % len(core_shared_hits_table))
print("Number of unique core hits: %d" % len(core_unique_hits_table))
print("Number of unique full hits: %d" % len(full_unique_hits_table))
/bin/bash: bedtools: command not found /bin/bash: bedtools: command not found /bin/bash: bedtools: command not found Number of core hits: 4717 Number of full hits: 4735 Number of shared hits: 0 Number of unique core hits: 0 Number of unique full hits: 0
bar_labels = ["Peaks with motif instances", "Motif instances"]
segment_labels = ["Unique core motifs", "Shared motifs", "Unique full motifs"]
core_unique = np.array([len(core_unique_peaks), len(core_unique_hits_table)])
core_shared = np.array([len(core_shared_peaks), len(core_shared_hits_table)])
full_unique = np.array([len(full_unique_peaks), len(full_unique_hits_table)])
total = core_unique + core_shared + full_unique
core_unique = core_unique / total
core_shared = core_shared / total
full_unique = full_unique / total
fig, ax = plt.subplots(figsize=(4, 8))
ax.bar(bar_labels, core_unique, label=segment_labels[0], width=0.25)
ax.bar(bar_labels, core_shared, bottom=core_unique, label=segment_labels[1], width=0.25)
ax.bar(bar_labels, full_unique, bottom=(core_unique + core_shared), label=segment_labels[2], width=0.25)
ax.set_title("Prevalence of core and full motifs")
ax.set_ylabel("Proportion")
ax.legend()
plt.savefig(
os.path.join(out_path, "ZNF248_core_full_prevalence.svg"), format="svg"
)
plt.show()
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:8: RuntimeWarning: invalid value encountered in true_divide /users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:9: RuntimeWarning: invalid value encountered in true_divide if __name__ == '__main__': /users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:10: RuntimeWarning: invalid value encountered in true_divide # Remove the CWD from sys.path while we load stuff.
# Import peaks
peaks_table = pd.read_csv(
peaks_bed_path, sep="\t", header=None, index_col=False,
usecols=[0, 1, 2, 9],
names=["chrom", "start", "end", "summit_offset"]
)
peaks_table["summit"] = peaks_table["start"] + peaks_table["summit_offset"]
# Import repeats in the peaks
repeats_table = pd.read_csv(
repeats_bed_path, sep="\t", index_col=False,
names=[
"genoName", "genoStart", "genoEnd", "genoLeft", "strand",
"repName", "repClass", "repFamily", "repStart", "repEnd",
"repLeft", "id"
]
)
# Compute match coordinates
repeats_table["match_start"] = 0
repeats_table["match_end"] = 0
pos_mask = repeats_table["strand"] == "+"
neg_mask = ~pos_mask
repeats_table["match_start"].loc[pos_mask] = repeats_table["genoStart"][pos_mask] + repeats_table["repStart"][pos_mask]
repeats_table["match_end"].loc[pos_mask] = repeats_table["genoStart"][pos_mask] + repeats_table["repEnd"][pos_mask]
repeats_table["match_start"].loc[neg_mask] = repeats_table["genoStart"][neg_mask] + repeats_table["repLeft"][neg_mask]
repeats_table["match_end"].loc[neg_mask] = repeats_table["genoStart"][neg_mask] + repeats_table["repEnd"][neg_mask]
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy iloc._setitem_with_indexer(indexer, value)
# Peaks that overlap repeats
rep_inds = []
for _, row in tqdm.notebook.tqdm(peaks_table.iterrows(), total=len(peaks_table)):
matches = repeats_table[
(repeats_table["genoName"] == row["chrom"]) &
(repeats_table["genoStart"] <= row["summit"]) &
(repeats_table["genoEnd"] > row["summit"])
]
if matches.empty:
rep_inds.append(-1)
else:
rep_inds.append(matches.index[0])
peaks_table["repeat_index"] = np.array(rep_inds)
# Obtain distribution of repeat classes
repeat_class_names = ["LINE", "SINE", "LTR", "DNA", "Other", "None"]
repeat_class_counts = {name : 0 for name in repeat_class_names}
rep_inds = peaks_table["repeat_index"]
repeat_class_counts["None"] = np.sum(rep_inds == -1)
matches = repeats_table.loc[rep_inds[rep_inds != -1]]
matched_classes = matches["repClass"]
for c in matched_classes:
if c in repeat_class_counts:
repeat_class_counts[c] += 1
else:
repeat_class_counts["Other"] += 1
all_line_subclasses = np.array(matches[matches["repClass"] == "LINE"]["repName"])
unique_subclasses, subclass_counts = np.unique(all_line_subclasses, return_counts=True)
inds = np.flip(np.argsort(subclass_counts))
unique_subclasses, subclass_counts = unique_subclasses[inds], subclass_counts[inds]
line_subclasses = np.concatenate([unique_subclasses[:10], ["Other LINE"]])
line_subclass_counts = np.concatenate([subclass_counts[:10], [np.sum(subclass_counts[5:])]])
# Plot distribution of repeat classes
repeat_class_labels = [
"LINE", "SINE", "LTR", "DNA repeat element", "Other", "None"
]
# Plot pie chart
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
[repeat_class_counts[name] for name in repeat_class_names],
labels=repeat_class_labels, autopct='%1.1f%%', shadow=False, startangle=90
)
ax.axis("equal") # Equal aspect ratio ensures that pie is drawn as a circle.
if out_path:
plt.savefig(
os.path.join(out_path, "ZNF248_repeat_prevalence.svg"), format="svg"
)
plt.show()
# Plot bar chart to highlight LINEs
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(repeat_class_labels, [repeat_class_counts[name] for name in repeat_class_names])
bottom = 0
for subclass, count in zip(line_subclasses, line_subclass_counts):
ax.bar(["LINE"], count, bottom=bottom, label=subclass, color=("gray" if subclass == "Other LINE" else None))
bottom += count
ax.set_xticklabels(repeat_class_labels, rotation=45)
ax.legend()
ax.set_ylabel("Number of peaks")
ax.set_title("Repeat elements underlying ZNF248 peaks")
if out_path:
plt.savefig(
os.path.join(out_path, "ZNF248_repeat_prevalence_bar.svg"), format="svg"
)
plt.show()
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:27: UserWarning: FixedFormatter should only be used together with FixedLocator
def plot_example_hit(chrom, start, end, prof_center_size=1000, score_center_size=50, save_path=None):
"""
For a given putative motif hit, plots the ChIP-nexus and ChIP-seq
profiles, as well as the importance scores for the task around the
hit.
"""
mid = (start + end) // 2
prof_start = mid - (prof_center_size // 2)
prof_end = prof_start + prof_center_size
score_start = mid - (score_center_size // 2)
score_end = score_start + score_center_size
with h5py.File(true_profiles_hdf5_path, "r") as f:
true_profs = f[chrom][prof_start:prof_end][:, 0]
with h5py.File(pred_profiles_hdf5_path, "r") as f:
# Need to use the coordinates of the profiles themselves
prof_len = f["predictions"]["log_pred_profs"].shape[2]
prof_coords_chrom = f["coords"]["coords_chrom"][:].astype(str)
prof_coords_start = f["coords"]["coords_start"][:]
prof_coords_end = f["coords"]["coords_end"][:]
mid = (prof_coords_start + prof_coords_end) // 2
prof_coords_start = mid - (prof_len // 2)
prof_coords_end = prof_coords_start + prof_len
match_inds = np.where(
(prof_coords_chrom == chrom) &
(prof_coords_start <= prof_start) &
(prof_coords_end >= prof_end)
)[0]
if not match_inds.size:
print("Warning: did not find sufficiently large prediction track for %s:%d-%s" % (chrom, prof_start, prof_end))
return
match_ind = match_inds[0]
coord_start, coord_end = prof_coords_start[match_ind], prof_coords_end[match_ind]
cut_start = prof_start - coord_start
cut_end = cut_start + prof_center_size
pred_profs = np.exp(f["predictions"]["log_pred_profs"][match_ind][0][cut_start:cut_end])
with h5py.File(imp_scores_hdf5_path, "r") as f:
match_inds = np.where(
(f["coords_chrom"][:].astype(str) == chrom) &
(f["coords_start"][:] <= score_start) &
(f["coords_end"][:] >= score_end)
)[0]
if not match_inds.size:
print("Warning: did not find sufficiently large importance score track for %s:%d-%s" % (chrom, score_start, score_end))
return
match_ind = match_inds[0]
coord_start, coord_end = f["coords_start"][match_ind], f["coords_end"][match_ind]
hyp_scores = f[hyp_score_key][match_ind]
one_hot_seq = f["input_seqs"][match_ind]
cut_start = score_start - coord_start
cut_end = cut_start + score_center_size
hyp_scores = hyp_scores[cut_start:cut_end]
one_hot_seq = one_hot_seq[cut_start:cut_end]
repeats_matches = repeats_table[
(repeats_table["genoName"] == chrom) &
(repeats_table["genoStart"] < prof_end) &
(repeats_table["genoEnd"] > prof_start)
]
prof_fig, ax = plt.subplots(nrows=3, sharex=True, figsize=(20, 10))
# Draw profiles
ax[0].plot(true_profs[:, 0], color="darkslateblue")
ax[0].plot(-true_profs[:, 1], color="darkorange")
ax[0].set_title("True ChIP-exo profiles")
ax[1].plot(pred_profs[:, 0], color="darkslateblue")
ax[1].plot(-pred_profs[:, 1], color="darkorange")
ax[1].set_title("Predicted ChIP-exo profiles")
# Draw repeat annotation(s)
for _, row in repeats_matches.iterrows():
start_pos = max(row["genoStart"] - prof_start, 0)
end_pos = min(row["genoEnd"] - prof_start, prof_center_size)
ax[2].add_patch(patches.Rectangle(
xy=(start_pos, 0), width=(end_pos - start_pos), height=1, color="red", fill=False
))
ax[2].text(
x=((end_pos + start_pos) // 2), y=0.5, s=row["repName"]
)
# Draw vertical lines that denote the portion with importance scores
for i in range(3):
ax[i].axvline(score_start - prof_start, color="gray")
ax[i].axvline(score_end - prof_start, color="gray")
if save_path:
plt.savefig(
os.path.join(save_path + "_profiles.svg"), format="svg"
)
plt.show()
score_fig = viz_sequence.plot_weights(
hyp_scores * one_hot_seq, figsize=(20, 4), subticks_frequency=score_center_size, return_fig=True
)
score_fig.tight_layout()
if save_path:
plt.savefig(
os.path.join(save_path + "_impscores.svg"), format="svg"
)
plt.show()
print(one_hot_to_seq(one_hot_seq))
num_to_take = 10
hits = full_hits_table.sort_values(by="score", ascending=False).head(10)
for index, row in hits.iterrows():
chrom, start, end = row["chrom"], row["start"], row["end"]
print("%s:%d-%d (index %d)" % (chrom, start, end, index))
if index == 1615:
save_path = os.path.join(out_path, "ZNF248_hit_example_%d" % index)
plot_example_hit(chrom, start, end, save_path=save_path)
else:
plot_example_hit(chrom, start, end)
print("")
chr1:231143722-231143772 (index 1615)
AGACACAAACAAATGGAAAGATATCCCATGTTCATGGATTGGAAGAATTA chr8:81567152-81567202 (index 8571) Warning: did not find sufficiently large prediction track for chr8:81566677-81567677 chr20:13145487-13145537 (index 15136) Warning: did not find sufficiently large prediction track for chr20:13145012-13146012 chr2:183198893-183198943 (index 2845) Warning: did not find sufficiently large prediction track for chr2:183198418-183199418 chr10:90950927-90950977 (index 10146) Warning: did not find sufficiently large prediction track for chr10:90950452-90951452 chrX:129398031-129398081 (index 16549) Warning: did not find sufficiently large prediction track for chrX:129397556-129398556 chr2:22724066-22724116 (index 2370) Warning: did not find sufficiently large prediction track for chr2:22723591-22724591 chr3:22515758-22515808 (index 3628)
AGACACCAAAAAATGGAAAGATATCCCATGTTCATGGATTGGAAGAATCA chr6:32258710-32258760 (index 6617) Warning: did not find sufficiently large prediction track for chr6:32258235-32259235 chr14:35839483-35839533 (index 12643) Warning: did not find sufficiently large prediction track for chr14:35839008-35840008