import sys
import os
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from motif.moods import import_moods_hits
from motif.tfmodisco_hit_scoring import import_tfmodisco_hits
from feature.util import one_hot_to_seq
import plot.viz_sequence as viz_sequence
import h5py
import numpy as np
import pandas as pd
import sklearn.metrics
import scipy.stats
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import tqdm
tqdm.tqdm_notebook() # It is necessary to call this before the tqdm.notebook submodule is available
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:16: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook` app.launch_new_instance()
<tqdm.notebook.tqdm_notebook at 0x7fad857d9510>
# Plotting defaults
font_manager.fontManager.ttflist.extend(
font_manager.createFontList(
font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
)
)
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 22,
"axes.labelsize": 20,
"legend.fontsize": 18,
"font.size": 13,
"xtick.labelsize": 16,
"ytick.labelsize": 16,
"font.family": "Roboto",
"font.weight": "bold",
"svg.fonttype": "none"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
# Define which motifs to look at
# Motif specifications: motif name, TF/task/model, motif ID for that run
motif_specs = [
("Oct4-Sox2", "Oct4", ["0_0"]),
("Sox2", "Sox2", ["0_1"]),
("Nanog", "Nanog", ["0_1"])
]
head = "profile"
out_path = "/users/amtseng/tfmodisco/figures/BPNet_motif_hits_ChIPnexus"
os.makedirs(out_path, exist_ok=True)
# Paths to footprints, profiles, and motif hits
tf_names = ["Nanog", "Oct4", "Sox2"]
true_chipnexus_profile_paths = {
tf_name : os.path.join(
"/users/amtseng/tfmodisco/data/processed/BPNet/ChIPnexus",
"BPNet_%s_ChIPnexus_profiles.h5" % tf_name
) for tf_name in tf_names
}
true_chipseq_profile_paths = {
tf_name : os.path.join(
"/users/amtseng/tfmodisco/data/processed/BPNet/ChIPseq",
"BPNet_%s_ChIPseq_profiles.h5" % tf_name
) for tf_name in tf_names
}
pred_chipseq_profile_path = os.path.join(
"/users/amtseng/tfmodisco/results/peak_predictions/BPNet",
"BPNet_ChIPseq_preds.h5"
)
imp_score_paths = {
tf_name : os.path.join(
"/users/amtseng/tfmodisco/results/importance_scores/BPNet",
"BPNet_%s_ChIPseq_imp_scores.h5" % tf_name
) for tf_name in tf_names
}
hyp_score_key = "%s_hyp_scores" % head
hits_cache_path = "/users/amtseng/tfmodisco/results/reports/motif_hits/cache/"
moods_coll_filt_hit_paths = {
tf_name : os.path.join(
hits_cache_path,
"moods/BPNet",
"BPNet_%s_ChIPseq" % tf_name,
"BPNet_%s_ChIPseq_%s" % (tf_name, head),
"filtered_hits.tsv"
) for tf_name in tf_names
}
tfm_coll_filt_hit_paths = {
tf_name : os.path.join(
hits_cache_path,
"tfm/BPNet",
"BPNet_%s_ChIPseq" % tf_name,
"BPNet_%s_ChIPseq_%s" % (tf_name, head),
"filtered_hits.tsv"
) for tf_name in tf_names
}
results_path = "/users/amtseng/tfmodisco/results/"
moods_coll_hit_paths = {
tf_name : os.path.join(
results_path,
"moods/BPNet",
"BPNet_%s_ChIPseq" % tf_name,
"BPNet_%s_ChIPseq_%s" % (tf_name, head),
"moods_filtered_scored_collapsed-all.bed"
)
for tf_name in tf_names
}
moods_hit_paths = {
tf_name : os.path.join(
results_path,
"moods/BPNet",
"BPNet_%s_ChIPseq" % tf_name,
"BPNet_%s_ChIPseq_%s" % (tf_name, head),
"moods_filtered_scored.bed"
) for tf_name in tf_names
}
tfm_coll_hit_paths = {
tf_name : os.path.join(
results_path,
"tfmodisco_hit_scoring/BPNet",
"BPNet_%s_ChIPseq" % tf_name,
"BPNet_%s_ChIPseq_%s" % (tf_name, head),
"tfm_matches_collapsed-all.bed"
) for tf_name in tf_names
}
tfm_hit_paths = {
tf_name : os.path.join(
results_path,
"tfmodisco_hit_scoring/BPNet",
"BPNet_%s_ChIPseq" % tf_name,
"BPNet_%s_ChIPseq_%s" % (tf_name, head),
"tfm_matches.bed"
) for tf_name in tf_names
}
stem = "/users/amtseng/tfmodisco/data/processed/BPNet/BPNet_motif_hits"
orig_hit_paths = {
"Oct4": os.path.join(stem, "Oct4-Sox2_hits_peak_limited.tsv"),
"Sox2": os.path.join(stem, "Sox2_hits_peak_limited.tsv"),
"Nanog": os.path.join(stem, "Nanog_hits_peak_limited.tsv")
}
orig_coll_hit_paths = {
"Oct4": os.path.join(stem, "Oct4-Sox2_hits_peak_limited_collapsed.tsv"),
"Sox2": os.path.join(stem, "Sox2_hits_peak_limited_collapsed.tsv"),
"Nanog": os.path.join(stem, "Nanog_hits_peak_limited_collapsed.tsv")
}
hit_types = [
"Original",
"Original collapsed",
"TF-MoDISco",
"TF-MoDISco collapsed",
"TF-MoDISco collapsed filtered",
"MOODS",
"MOODS collapsed",
"MOODS collapsed filtered"
]
hit_paths = {
"Original": orig_hit_paths,
"Original collapsed": orig_coll_hit_paths,
"TF-MoDISco": tfm_hit_paths,
"TF-MoDISco collapsed": tfm_coll_hit_paths,
"TF-MoDISco collapsed filtered": tfm_coll_filt_hit_paths,
"MOODS": moods_hit_paths,
"MOODS collapsed": moods_coll_hit_paths,
"MOODS collapsed filtered": moods_coll_filt_hit_paths
}
hit_type_colors = [
"#059917",
"#89ff7b",
"#c6c2f8",
"#8782c3",
"#585584",
"#db6207",
"#ea9e64",
"#f9d6bc",
]
def import_motif_hits(motif_hits_path):
"""
Imports the motif hits, which may be an output of MOODS scanning
or TF-MoDISco hit scanning, and may be filtered hits. Depending on
the number of columns, the hits are imported appropriately.
"""
with open(motif_hits_path, "r") as f:
cols = next(f).split("\t")
if motif_hits_path.endswith(".tsv"):
hit_table = pd.read_csv(motif_hits_path, sep="\t", header=0, index_col=None)
if "index" in hit_table:
hit_table = hit_table.drop("index", axis=1)
elif len(cols) == 10:
# MOODS hits
hit_table = import_moods_hits(motif_hits_path)
elif len(cols) == 16:
# TF-MoDISco hits
hit_table = import_tfmodisco_hits(motif_hits_path)
# Sort by aggregate similarity and drop duplicates (by strand)
hit_table = hit_table.sort_values("agg_sim")
hit_table = hit_table.drop_duplicates(["chrom", "start", "end", "peak_index"], keep="last")
else:
raise ValueError("Motif hits file of unknown format/source: %s" % motif_hits_path)
return hit_table
# Import all motif hits
all_hit_tables = {
hit_type : {
tf_name : import_motif_hits(hit_paths[hit_type][tf_name]) for tf_name in tf_names
} for hit_type in hit_types
} # hit type : TF name : hits
For each motif of interest, obtain the edges of the footprints (relative to the center of the motif instances) for each strand, and compute the footprint scores for each motif
def get_motif_instance_profiles(motif_hit_table, tf_name, profile_size=100):
"""
Given the hit table of a specific motif, obtains the set of ChIP-nexus
profiles of length `profile_size` centered around each motif hit's center.
Returns an N x P x 2 array of profiles for the two strands. The first
strand is the strand the motif is on, the second strand is the opposite.
"""
profile_path = true_chipnexus_profile_paths[tf_name]
# Get the coordinates of the profiles
prof_coords = motif_hit_table[["chrom", "start", "end"]].values
mids = (prof_coords[:, 1] + prof_coords[:, 2]) // 2
prof_coords[:, 1] = mids - (profile_size // 2)
prof_coords[:, 2] = prof_coords[:, 1] + profile_size
strands = motif_hit_table["strand"].values
profiles = np.empty((len(motif_hit_table), profile_size, 2))
with h5py.File(profile_path, "r") as f:
for i in range(len(prof_coords)):
chrom, start, end = prof_coords[i]
profs = f[chrom][start:end][:, 0]
if strands[i] == "-":
profs = np.flip(profs)
profiles[i] = profs
return profiles
def compute_footprint_scores(
profiles, strand_0_pos, strand_1_pos, divide_by_valley=False, expand=2
):
"""
For each profile in the N x P x 2 array `profiles`, computes a footprint
score. For each profile, computes the sum of the profile at positions
given by `strand_0_pos` and `strand_1_pos` (indices out of P). If
`divide_by_valley` is provided, then this sum is divided by the position
between the two given positions. If `expand` is nonzero, then instead of
summing just single positions, this will sum a window by extending each
position left and right by this amount. Returns an N-array.
"""
assert expand >= 0
strand_0_slice = slice(
strand_0_pos - expand, strand_0_pos + 1 + expand
)
strand_1_slice = slice(
strand_1_pos - expand, strand_1_pos + 1 + expand
)
strand_0_sum = np.sum(profiles[:, strand_0_slice, 0], axis=1)
strand_1_sum = np.sum(profiles[:, strand_1_slice, 1], axis=1)
if divide_by_valley:
valley_pos = (strand_0_pos + strand_1_pos) // 2
valley_slice = slice(
valley_pos - expand, valley_pos + 1 + expand
)
valley_0_sum = np.sum(profiles[:, valley_slice, 0], axis=1)
valley_1_sum = np.sum(profiles[:, valley_slice, 1], axis=1)
return (strand_0_sum / valley_0_sum) + (strand_1_sum / valley_1_sum)
else:
return strand_0_sum + strand_1_sum
# Extract the motif hit tables
motif_hit_tables = [] # List of dictionaries, one for each motif; dictionary maps hit type to table
for motif_spec in motif_specs:
motif_name, tf_name, motif_ids = motif_spec
motif_hit_tables.append({})
for hit_type in hit_types:
hit_table = all_hit_tables[hit_type][tf_name]
if hit_type.startswith("Original"):
motif_hit_table = hit_table
else:
motif_hit_table = hit_table[hit_table["key"].isin(motif_ids)].copy()
motif_hit_tables[-1][hit_type] = motif_hit_table
# Compute the footprint scores and add them as a column
for motif_i, motif_spec in enumerate(motif_specs):
motif_name, tf_name, motif_ids = motif_spec
fig, ax = plt.subplots(nrows=len(hit_types), ncols=2, figsize=(20, 4 * len(hit_types)))
for hit_type_i, hit_type in enumerate(hit_types):
motif_hit_table = motif_hit_tables[motif_i][hit_type]
all_footprint_scores = np.empty(len(motif_hit_table))
for motif_id in [None] if hit_type.startswith("Original") else motif_ids:
# For each motif ID of this desired motif group, add in the footprint score column,
# separately for each motif ID
# For "original" hits, there is only one motif in the whole file
if hit_type.startswith("Original"):
mask = np.ones(len(motif_hit_table)).astype(bool) # All True
else:
mask = motif_hit_table["key"] == motif_id
if not np.sum(mask):
continue
profiles = get_motif_instance_profiles(motif_hit_table[mask], tf_name)
# Get summit of each strand from average profile
avg_prof = np.mean(profiles, axis=0)
strand_0_summit_pos = np.argmax(avg_prof[:, 0])
strand_1_summit_pos = np.argmax(avg_prof[:, 1])
# Compute footprint scores
footprint_scores = compute_footprint_scores(
profiles, strand_0_summit_pos, strand_1_summit_pos
)
all_footprint_scores[mask] = footprint_scores
ax[hit_type_i, 0].plot(avg_prof[:, 0], color="royalblue")
ax[hit_type_i, 0].plot(-avg_prof[:, 1], color="darkorange")
ax[hit_type_i, 0].axvline(len(avg_prof) // 2, color="black")
ax[hit_type_i, 0].axvline(strand_0_summit_pos, color="gray")
ax[hit_type_i, 0].axvline(strand_1_summit_pos, color="gray")
ax[hit_type_i, 0].set_title(hit_type)
ax[hit_type_i, 1].hist(footprint_scores, bins=min(100, int(np.ceil(len(footprint_scores) / 10))), color="royalblue")
ax[hit_type_i, 1].set_xlabel("Footprint score")
ax[hit_type_i, 1].set_title(hit_type)
motif_hit_table["footprint_score"] = all_footprint_scores
fig.suptitle(motif_name)
fig.tight_layout()
plt.show()
def get_thresh(motif_hit_table, perc=0.9, score_col="footprint_score"):
"""
Gets the threshold of the given score column, at the given percentile.
"""
score_vals = motif_hit_table[score_col].values
return np.percentile(score_vals, perc * 100)
def get_ranked_mask(
motif_hit_table, pos_thresh, neg_thresh=None, score_col="footprint_score",
sort_col="imp_total_score"
):
"""
For a table of N motif hits, orders the hits and returns a boolean mask
of which hits are positives and which are negatives. The hits are ordered
in descending order of `sort_col`. A hit is considered positive if its
`score_col` value is at least `pos_thresh`, and a negative if the value is
`neg_thresh` (by default `neg_thresh = pos_thresh`).
All other hits will be ignored.
"""
if neg_thresh is None:
neg_thresh = pos_thresh
assert pos_thresh >= neg_thresh
table = motif_hit_table.sort_values(sort_col, ascending=False)
score_vals = table[score_col].values
pos_mask = score_vals >= pos_thresh
neg_mask = score_vals < neg_thresh
# Remove anything that's not a positive or negative, maintaining order
score_vals = score_vals[pos_mask | neg_mask]
return score_vals >= pos_thresh
# Show motif hit coverage
limit_x = 2000 # Adjust this to limit x-axis
for motif_i, motif_spec in enumerate(motif_specs):
motif_name, tf_name, _ = motif_spec
fig, ax = plt.subplots(ncols=2, figsize=(20, 8))
# First, get the threshold footprint score for a positive
pos_threshes = []
for hit_type_i, hit_type in enumerate(hit_types):
motif_hit_table = motif_hit_tables[motif_i][hit_type]
pos_threshes.append(get_thresh(motif_hit_table, perc=0.9))
pos_thresh = np.max(pos_threshes)
print("Thresholds: %s" % {hit_types[i] : pos_threshes[i] for i in range(len(pos_threshes))})
print("Final threshold: %s" % pos_thresh)
for hit_type_i, hit_type in enumerate(hit_types):
motif_hit_table = motif_hit_tables[motif_i][hit_type]
if hit_type.startswith("MOODS"):
sort_col = "score"
elif hit_type.startswith("TF-MoDISco"):
sort_col = "imp_ic_avg_score"
else:
sort_col = "imp"
ranked_mask = get_ranked_mask(motif_hit_table, pos_thresh, sort_col=sort_col)
ax[0].plot(np.cumsum(ranked_mask), label=hit_type, color=hit_type_colors[hit_type_i])
ax[1].plot(np.cumsum(ranked_mask[:limit_x]), label=hit_type, color=hit_type_colors[hit_type_i])
for i in (0, 1):
ax[i].set_xlabel("Top # motif instances")
ax[i].set_ylabel("# motif instances above coverage threshold")
xmax, ymax = ax[i].get_xlim()[1], ax[i].get_ylim()[1]
ax[i].plot([0, min(xmax, ymax)], [0, min(xmax, ymax)], linestyle="--", color="gray")
ax[i].legend()
fig.suptitle(motif_name)
fig.tight_layout()
plt.show()
Thresholds: {'Original': 49.0, 'Original collapsed': 49.0, 'TF-MoDISco': 33.0, 'TF-MoDISco collapsed': 35.0, 'TF-MoDISco collapsed filtered': 38.0, 'MOODS': 41.0, 'MOODS collapsed': 34.0, 'MOODS collapsed filtered': 35.0} Final threshold: 49.0
Thresholds: {'Original': 34.40000000000009, 'Original collapsed': 35.0, 'TF-MoDISco': 24.0, 'TF-MoDISco collapsed': 24.0, 'TF-MoDISco collapsed filtered': 25.0, 'MOODS': 32.0, 'MOODS collapsed': 29.0, 'MOODS collapsed filtered': 37.0} Final threshold: 37.0
Thresholds: {'Original': 62.0, 'Original collapsed': 61.0, 'TF-MoDISco': 73.0, 'TF-MoDISco collapsed': 73.0, 'TF-MoDISco collapsed filtered': 75.0, 'MOODS': 92.0, 'MOODS collapsed': 79.59999999999991, 'MOODS collapsed filtered': 93.0} Final threshold: 93.0
# Show motif hit coverage without thresholding
limit_x = 2000 # Adjust this to limit x-axis
for motif_i, motif_spec in enumerate(motif_specs):
motif_name, tf_name, _ = motif_spec
fig, ax = plt.subplots(ncols=2, figsize=(20, 8))
for hit_type_i, hit_type in enumerate(hit_types):
motif_hit_table = motif_hit_tables[motif_i][hit_type]
if hit_type.startswith("MOODS"):
sort_col = "score"
elif hit_type.startswith("TF-MoDISco"):
sort_col = "imp_total_score" # imp_ic_avg_score imp_total_score imp_total_signed_score
else:
sort_col = "imp"
motif_hit_table_sorted = motif_hit_table.sort_values(sort_col, ascending=False)
footprint_scores = motif_hit_table_sorted["footprint_score"]
ax[0].plot(
np.arange(len(footprint_scores)) / 1e3,
np.cumsum(footprint_scores) / 1e5,
label=hit_type, color=hit_type_colors[hit_type_i]
)
footprint_scores_limited = footprint_scores[:limit_x]
ax[1].plot(
np.arange(len(footprint_scores_limited)) / 1e3,
np.cumsum(footprint_scores_limited) / 1e5,
label=hit_type, color=hit_type_colors[hit_type_i]
)
ax[0].legend()
for i in (0, 1):
ax[i].set_xlabel("Top # motif instances ($\mathregular{x 10^{3}}$)")
ax[i].set_ylabel("Running sum of footprint score ($\mathregular{x 10^{5}}$)")
ax[i].grid()
fig.suptitle(motif_name)
fig.tight_layout()
plt.savefig(
os.path.join(out_path, "BPNet_motif_hit_enrichment_%s.svg" % motif_name),
format="svg"
)
plt.show()
# Show motif hit precision-recall
for motif_i, motif_spec in enumerate(motif_specs):
motif_name, tf_name, _ = motif_spec
fig, ax = plt.subplots(figsize=(10, 8))
# First, get the threshold footprint scores for positives/negatives
pos_threshes, neg_threshes = [], []
for hit_type_i, hit_type in enumerate(hit_types):
motif_hit_table = motif_hit_tables[motif_i][hit_type]
pos_threshes.append(get_thresh(motif_hit_table, perc=0.9))
neg_threshes.append(get_thresh(motif_hit_table, perc=0.5))
pos_thresh = np.max(pos_threshes)
neg_thresh = np.min(neg_threshes)
for hit_type_i, hit_type in enumerate(hit_types):
motif_hit_table = motif_hit_tables[motif_i][hit_type]
score_names = {"score": "MOODS score", "imp_frac_score": "importance", "imp_total_score": "importance", "imp_ic_avg_score": "importance", "agg_sim": "similarity", "imp": "importance"}
if hit_type.startswith("MOODS"):
sort_cols = ["score"] # ["score", "imp_frac_score"]
elif hit_type.startswith("TF-MoDISco"):
sort_cols = ["imp_ic_avg_score"] # ["agg_sim", "imp_frac_score"]
else:
sort_cols = ["imp"]
for sort_col in sort_cols:
ranked_mask = get_ranked_mask(
motif_hit_table, pos_thresh, neg_thresh=neg_thresh, sort_col=sort_col
)
precis, recall, _ = sklearn.metrics.precision_recall_curve(
ranked_mask.astype(int),
np.flip(np.arange(len(ranked_mask))) / len(ranked_mask)
)
auc = sklearn.metrics.auc(recall[:-1], precis[:-1])
ax.plot(recall[:-1], precis[:-1], label="%s ranked by %s (AUC = %.4f)" % (hit_type, score_names[sort_col], auc), color=hit_type_colors[hit_type_i])
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.legend()
fig.suptitle(motif_name)
fig.tight_layout()
plt.show()
def plot_profiles(all_profs, task_labels):
"""
Plots the given profiles with a heatmap. `all_profs` is a list of N x P x 2
arrays. `labels` is parallel a list of N labels.
Returns the figure.
"""
num_tasks = len(task_labels)
max_num_profs = max([len(profs) for profs in all_profs])
# Create a figure with the right dimensions
heatmap_height = min(max_num_profs * 0.004, 8)
agg_height = 4
fig_height = agg_height + heatmap_height
fig, ax = plt.subplots(
nrows=2, ncols=num_tasks, figsize=(16, fig_height),
gridspec_kw={
"width_ratios": [1] * num_tasks,
"height_ratios": [heatmap_height / fig_height, agg_height / fig_height]
}
)
if num_tasks == 1:
ax = ax[:, None] # Add back second dimension
# Compute profile means
all_prof_means = [np.mean(profs, axis=0) for profs in all_profs]
# Compute maximum profile height in average and in sum
max_mean_height = max([np.max(profs) for profs in all_prof_means])
for i in range(num_tasks):
profs_mean = all_prof_means[i]
profs = all_profs[i]
profs_tot_sum = np.sum(profs, axis=(1, 2))
# Order the profiles by total sum
profs = profs[np.flip(np.argsort(profs_tot_sum))]
# Sum strands
prof_strand_sum = np.sum(profs, axis=2)
# Z-score normalize, then min-max normalize
profs_z_norm = (prof_strand_sum - np.mean(prof_strand_sum)) / np.std(prof_strand_sum)
profs_norm = (profs_z_norm - np.min(profs_z_norm)) / np.max(profs_z_norm)
# Plot the heatmap
ax[0, i].imshow(profs_norm, interpolation="nearest", aspect="auto", cmap="Purples")
# Plot the average profile
ax[1, i].plot(profs_mean[:, 0], color="darkslateblue")
ax[1, i].plot(-profs_mean[:, 1], color="darkorange")
# Set axes on heatmap
width = profs_mean.shape[1]
delta = 50
num_deltas = (width // 2) // delta
labels = list(range(max(-width // 2, -num_deltas * delta), min(width // 2, num_deltas * delta) + 1, delta))
tick_locs = [label + max(width // 2, num_deltas * delta) for label in labels]
ax[0, i].set_xticks(tick_locs)
ax[0, i].set_xticklabels(labels)
ax[0, i].set_xlabel("Distance from motif center (bp)")
ax[0, i].set_title(task_labels[i])
# Set axes on average profile
ylim = max_mean_height * 1.05 # Make 5% higher
for i in range(num_tasks):
ax[1, i].set_ylim(-ylim, ylim)
return fig
for motif_i, motif_spec in enumerate(motif_specs):
motif_name, tf_name, _ = motif_spec
all_profiles = [
get_motif_instance_profiles(motif_hit_tables[motif_i][hit_type], tf_name)
for hit_type in hit_types
]
fig = plot_profiles(all_profiles, hit_types)
fig.suptitle(motif_name)
fig.tight_layout()
plt.show()
def plot_example_hit(chrom, start, end, tf_name="Nanog", prof_center_size=600, score_center_size=50, ylim=None, save_path=None):
"""
For a given putative motif hit, plots the ChIP-nexus and ChIP-seq
profiles, as well as the importance scores for the task around the
hit.
"""
mid = (start + end) // 2
prof_start = mid - (prof_center_size // 2)
prof_end = prof_start + prof_center_size
score_start = mid - (score_center_size // 2)
score_end = score_start + score_center_size
with h5py.File(true_chipnexus_profile_paths[tf_name], "r") as f:
true_chipnexus_profs = f[chrom][prof_start:prof_end][:, 0]
with h5py.File(true_chipseq_profile_paths[tf_name], "r") as f:
true_chipseq_profs = f[chrom][prof_start:prof_end][:, 0]
with h5py.File(pred_chipseq_profile_path, "r") as f:
match_inds = np.where(
(f["coords"]["coords_chrom"][:].astype(str) == chrom) &
(f["coords"]["coords_start"][:] < prof_start) &
(f["coords"]["coords_end"][:] > prof_end)
)[0]
if not match_inds.size:
print("Warning: did not find sufficiently large prediction track for %s:%d-%s" % (chrom, prof_start, prof_end))
return
match_ind = match_inds[0]
coord_start, coord_end = f["coords"]["coords_start"][match_ind], f["coords"]["coords_end"][match_ind]
task_index = (["Oct4", "Sox2", "Nanog"]).index(tf_name)
pred_chipseq_profs = np.exp(f["predictions"]["log_pred_profs"][match_ind][task_index])
cut_start = prof_start - coord_start
cut_end = cut_start + prof_center_size
pred_chipseq_profs = pred_chipseq_profs[cut_start:cut_end]
with h5py.File(imp_score_paths[tf_name], "r") as f:
match_inds = np.where(
(f["coords_chrom"][:].astype(str) == chrom) &
(f["coords_start"][:] < score_start) &
(f["coords_end"][:] > score_end)
)[0]
if not match_inds.size:
print("Warning: did not find sufficiently large importance score track for %s:%d-%s" % (chrom, score_start, score_end))
return
match_ind = match_inds[0]
coord_start, coord_end = f["coords_start"][match_ind], f["coords_end"][match_ind]
hyp_scores = f[hyp_score_key][match_ind]
one_hot_seq = f["input_seqs"][match_ind]
cut_start = score_start - coord_start
cut_end = cut_start + score_center_size
hyp_scores = hyp_scores[cut_start:cut_end]
one_hot_seq = one_hot_seq[cut_start:cut_end]
prof_fig, ax = plt.subplots(nrows=3, figsize=(20, 8))
ax[0].plot(true_chipnexus_profs[:, 0], color="darkslateblue")
ax[0].plot(-true_chipnexus_profs[:, 1], color="darkorange")
ax[0].set_title("True ChIP-nexus profiles")
ax[1].plot(true_chipseq_profs[:, 0], color="darkslateblue")
ax[1].plot(-true_chipseq_profs[:, 1], color="darkorange")
ax[1].set_title("True ChIP-seq profiles")
ax[2].plot(pred_chipseq_profs[:, 0], color="darkslateblue")
ax[2].plot(-pred_chipseq_profs[:, 1], color="darkorange")
ax[2].set_title("Predicted ChIP-seq profiles")
# Draw vertical lines that denote the portion with importance scores
for i in range(3):
ax[i].axvline(score_start - prof_start, color="gray")
ax[i].axvline(score_end - prof_start, color="gray")
if save_path:
plt.savefig(
os.path.join(save_path + "_profiles.svg"), format="svg"
)
plt.show()
score_fig = viz_sequence.plot_weights(
hyp_scores * one_hot_seq, figsize=(20, 4), ylim=ylim, subticks_frequency=score_center_size, return_fig=True
)
score_fig.tight_layout()
if save_path:
plt.savefig(
os.path.join(save_path + "_impscores.svg"), format="svg"
)
plt.show()
print(one_hot_to_seq(one_hot_seq))
# Pick some Nanog hits
num_to_take = 10
tfm_table = motif_hit_tables[2]["TF-MoDISco collapsed filtered"].sort_values("footprint_score", ascending=False)
moods_table = motif_hit_tables[2]["MOODS"].sort_values("footprint_score", ascending=True)
orig_table = motif_hit_tables[2]["Original"].sort_values("footprint_score", ascending=True)
# Pick true positives (strong TF-MoDISco hits missed by MOODS)
true_hits = []
for index, row in tfm_table.iterrows():
if len(true_hits) == num_to_take:
break
if not np.sum(
(moods_table["chrom"] == row["chrom"]) &
(moods_table["start"] < row["end"]) &
(row["start"] < moods_table["end"])
):
true_hits.append(index)
# Pick false positives (weak MOODS hits missed by TF-MoDISco)
false_hits = []
for index, row in moods_table.iterrows():
if len(false_hits) == num_to_take:
break
if not np.sum(
(tfm_table["chrom"] == row["chrom"]) &
(tfm_table["start"] < row["end"]) &
(row["start"] < tfm_table["end"])
):
false_hits.append(index)
# Pick weak original hits missed by TF-MoDISco
orig_unique_hits = []
for index, row in orig_table.iterrows():
if len(orig_unique_hits) == num_to_take:
break
if not np.sum(
(tfm_table["chrom"] == row["chrom"]) &
(tfm_table["start"] < row["end"]) &
(row["start"] < tfm_table["end"])
):
orig_unique_hits.append(index)
for index in true_hits:
row = tfm_table.loc[index]
chrom, start, end = row["chrom"], row["start"], row["end"]
print("%s:%d-%d (index %d)" % (chrom, start, end, index))
if index == 43876:
save_path = os.path.join(out_path, "BPNet_true_hit_example_%d" % index)
plot_example_hit(chrom, start, end, ylim=(-0.1, 0.3), save_path=save_path)
# else:
# plot_example_hit(chrom, start, end, ylim=(-0.1, 0.3))
print("")
chr4:147128421-147128430 (index 52823) chr1:134535218-134535227 (index 75487) chr4:99636179-99636188 (index 25838) chr8:86394055-86394064 (index 32350) chr11:117082476-117082485 (index 67285) chr11:117082467-117082476 (index 8287) chr8:33647166-33647175 (index 36496) chr3:30724629-30724638 (index 5516) chr8:86394047-86394056 (index 27501) chr1:134535194-134535203 (index 43066)
for index in false_hits:
row = moods_table.loc[index]
chrom, start, end = row["chrom"], row["start"], row["end"]
print("%s:%d-%d (index %d)" % (chrom, start, end, index))
if index == 151709:
save_path = os.path.join(out_path, "BPNet_false_hit_example_%d" % index)
plot_example_hit(chrom, start, end, ylim=(-0.1, 0.3), save_path=save_path)
# else:
# plot_example_hit(chrom, start, end, ylim=(-0.1, 0.3))
print("")
chr11:89045564-89045578 (index 31289) chr12:105118254-105118268 (index 35998) chr1:194034245-194034259 (index 8035) chr3:120982577-120982591 (index 104882) chr3:145962006-145962020 (index 104429) chr12:76899169-76899183 (index 36613) chr12:80370796-80370810 (index 36643) chr8:118004510-118004524 (index 151709)
TGGTTAAAAGCAGCTTTGCTAATTGCTCTGGTAAGGGCCTGGGAGAAAGC chr3:105018190-105018204 (index 103477) chr12:99561156-99561170 (index 37512)
for index in orig_unique_hits:
row = orig_table.loc[index]
chrom, start, end = row["chrom"], row["start"], row["end"]
print("%s:%d-%d (index %d)" % (chrom, start, end, index))
plot_example_hit(chrom, start, end, score_center_size=400)
print("")
chr10:73509150-73509159 (index 41273)
CCTGGAATTACCATGCAAGCAAATGACAAAGCTGAGCTCTGACCCCAGTAGTCCAACTTCAGAACCTCCCTCTTCTGTGCAATCATTTCACACATGAGTAACTCTTATATTTTCCCAAGCTCAGCCAAGTCTGGCTCCGTGGATCTCCATTCTCTTTCAAATCTTCCTACTCATCTCTTTTAAGAGCTTTGCATTGGTGAATTAACTGACCTTGATCCCTCCTTCTCTACTTGAGCCTTGCTACTGACACCAGTTCCTTTTAGTTGACCCTTCAATGTCATATTGAATTAATCTCCTATCTTAACAAGGCCCTCTCCAACCTCCTTTATTTCTACCCATCCATGTCACTTCCCAACTTCCTCTAACCCATTCAACTTACCGCTTAATCATCAACACCACA chr9:20456950-20456959 (index 13265) Warning: did not find sufficiently large prediction track for chr9:20456654-20457254 chr11:10078449-10078458 (index 54957)
ATCAATGGAAGAGAATTGAAGACCCAGAAATGAACCCACACACCTATGGTCTCTTTATCTTTGACAAAGGATCTAAAACCATCAAATGGAAAAAAGACAGCATTTTCAACAAATGATGCTGGCACAACTGGCAGTTATCACGTAGAAGAATGCAAATTGACCCATTCTTATGCCCTTGTACAAAGCTTAAGTCCAAGTAGATTAAAGAACTCCACATAAAACCAGAGACTCTGAAGCTTATAGAGGAGAAAGTGGGGGAATGCTTTGAAAATATGGTCATAGGGGAAAAATTCCTGAACAGAACAGCAATGGCTTGTGTTGTAAGATCAAGAATCGACAAATGGGACCTCATAAAATTGCAAAGCTTCTGTAAGGCAAAAGACACTGTCAATAAGACAAA chr15:73173187-73173196 (index 54960)
GACACGAGGTTTTGAAAAAGCTAGGGATTGATGGGAGCTCCGCTACACAGCTCAGCAGGTTCTCTTGGTGGGGTTCACACGGCATCATCTTCAGGTTCTGTGCCCCACAGCCCAAGTCAGCCACCACTTCACCAGCACCCAGACCACAATTGGCATTCCTCAGAAAAGCCATGACACCCCTTCCCACTCTCACAACCGCAATTAAGGATGTGTGTAGCACAGCACACCTCCACCCAGGACGCTGCAAAGTCATTCACACTTGCTCTACACATAACAATACGACCAGTGGGACACTTATTAACCATGGGCAGTTACAGCAATCAGTCCGTGACTTGATATGACAGCTACATATTACTAACTGCCACAGACAAAGAACAATGCACGGTACTAATTATTCTGA chr12:117266309-117266318 (index 53037)
GCTCTTAGCATTTCCTGGCACCTGTCCATCCCTCACAAAGGATTTTATGCAGAAGTGTGTCTATGCACTTCAAATAACTAATTACTCTTAATTCTGGCATTCAAGAAGACATCAGCTTTCCATACAGGGCCATCGATGTCACTTAATAAATCCATTACTTACTAATTTTAGTTCTCTGTACTTTTCCATCATTAATTTAATCTCCTGCACTATTCAGCTTTGGCTTCAAGCTGAGGAGAATTCTGAGACAGCTAGGTATTACTTGGTATTTTAAACACTTTGTCTAGAGGTGGGCTCATGCATCTTCCCATACCCTTTATTCAGAATTTGTAGTCTGAAGTAACATCCTAGAGTTACACAATAAGGCTTCAACCAAAGCGTTTGCCATCAGTAGTGACTA chr6:125066082-125066091 (index 57416) Warning: did not find sufficiently large prediction track for chr6:125065786-125066386 chr3:135466552-135466561 (index 36147)
GAAACGTGTATATGGAGTTTAAGAAATTTATTTTCCAGGTGCATCCCCTTCTAACTAAAATTTTCTTCATCTTGTACACTTAACAGCTGAAAAAGATATATATATATATAAAACTTGGGAGTAATAATGGGTCAAAATTTACAAAATAAAGTACTGTTTTGGTGTGGGAGTTGTCATGAGGCTGTGTTGAAGTGACTTAACTGTGGGATATTGACTATCCATTGAGATGGATTTGTTCAGCCATTTACATTAATGAGCATTTAAATGCAACAGATCATTTCGGGTGACTTAACATGAATGAATAAAAAAGTCAATGCTATTGGATTGTTTGACAAGTGCTATCTGTGCCACTAAATCTTATGTAGTAATAAGGGCATTACCATTTTTCACCCAAGTTCTG chr19:60168223-60168232 (index 40529) Warning: did not find sufficiently large prediction track for chr19:60167927-60168527 chr19:32577948-32577957 (index 45857)
GAGAGGTAGTGAGAGAGGTAGAGAGACAACTAGGAATAGTGAAGACTGGGTGACAGGGAGTCTGTCTTTTTGTGACCTAATCTTGCCATATGACCATCCTTCAAAAGCCAGCCATCAATGCAGCCCATTCCCAAGGTGACACAGCAGGAAATGAGAACAATATGCTGTCTCCTTTATTCACAACTTTGTGTCCTTTCTGATCTCTGGCACTGGTCCCAGAAAAGAGCCCCCAGGGTTAAAAGTTCCTGGCAGCAGTCGCCTCCTCACAAAAATTAAAGACAGCTGACAATGGCCTATAATGCACCATTCTTAGACTGTTCTGTGCTTGGCATGTTAAGTCAGCAACTTAACCCATGGCTAGAGGTTTTTTGTTTGTTTGTTTCAAGAAAGGTTTTGTTGG chr18:48598150-48598159 (index 40523) Warning: did not find sufficiently large prediction track for chr18:48597854-48598454