Shall we just run modisco for each peak-set separately?
calculate entropy for each region and the final aggregated distribution
threshold all the computations w.r.t. the total importance score in the region
[ ] compare the PWM, and the aggregated profile across all pattern locations (not just those identified by clustering with modisco)
[ ] scatterplot the spread or bi-modality of the distribution within the sequence vs. seqlet_score
from tqdm import tqdm
from basepair.config import get_data_dir
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from basepair.modisco import ModiscoResult, append_pattern_loc
from basepair.cli.schemas import DataSpec
from basepair.utils import read_pkl
from kipoi.readers import HDF5Reader
# Resolve the project data directory and the trained-model / modisco output paths
ddir = Path(get_data_dir())
model_dir = ddir / "processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/"
modisco_dir = model_dir / "modisco/valid"
!ls {model_dir}
!ls {modisco_dir} # modisco dir
# Modisco
# Load the TF-MoDISco result, the data spec and the per-example importance
# scores (HDF5), then map each example index to the peak set it came from
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
tasks = mr.tasks()
ds = DataSpec.load(model_dir / "dataspec.yaml")
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()
# example_idx -> peak_id (which task's peak file the interval originated from)
id_hash = pd.DataFrame({"peak_id": d.f['/metadata/interval_from_task'][:], "example_idx": np.arange(d.f['/metadata/interval_from_task'].shape[0])})
# load the instances data frame
df = pd.read_csv(modisco_dir / "all.instances.tsv", sep='\t')
# "metacluster_<i>/pattern_<j>" identifier per seqlet instance
df['pattern_id'] = "metacluster_" + df.metacluster.astype(str) + "/pattern_" + df.pattern.astype(str)
df['seqlet_center'] = (df.seqlet_start + df.seqlet_end) / 2
patterns = df.pattern_id.unique().tolist()
pattern_pssms = {pattern: mr.get_pssm(*pattern.split("/")) for pattern in patterns}
# adds trimmed pattern_start / pattern_end columns to df in place
append_pattern_loc(df, pattern_pssms, trim_frac=0.08)
df['pattern_center'] = (df.pattern_start + df.pattern_end) / 2
df = df.merge(id_hash, on="example_idx")
# Use thresholded values from now on
df_unfiltered = df
df = df[df.percnormed_score > .2]
# NOTE(review): the next line is missing the leading `!` for a shell command
# and `outdir` is only defined further below -- this likely errors; verify
ls {outdir}
seqlets = read_pkl(modisco_dir / "all.seqlets.pkl")
# NOTE(review): df was filtered just above -- this assumes the pickled seqlets
# were filtered identically; confirm against the pipeline that wrote the pkl
assert len(df) == len(seqlets)
# total number of mapped seqlets
len(seqlets)
n_examples = d.f['/metadata/range/chr'].shape[0]
# total number of peaks / examples
n_examples
# sanity check the raw peak file
for task, ts in ds.task_specs.items():
    n_peaks = !zcat {ts.peaks} | wc -l
    print(f"{task}: {n_peaks[0]}")
df.head()
# almost every peak has a seqlet
len(df.example_id.unique()) / n_examples
d.f['/metadata/interval_from_task'][:4]
# output directory for the per-peak-set masks
outdir = model_dir / "clustering/peak-set/"
outdir.mkdir(parents=True, exist_ok=True)
# Save one boolean mask per task marking which examples belong to its peak set
metadata_task = d.f['/metadata/interval_from_task'][:]
for task in tasks:
    # NOTE(review): HDF5 string datasets often load as bytes; if so, comparing
    # against a str `task` yields an all-False mask -- confirm the dtype
    mask = metadata_task == task
    np.save(outdir / f"{task}.npy", mask)
from plotnine import *
import plotnine
plotnine.options.figure_size = (10, 1)
# distribution of the number of seqlets per example
ggplot(aes(x="size"), df.groupby("example_id").pattern.agg(['size']).reset_index()) + \
geom_bar() + theme_bw() + \
xlab("Number of seqlets per example")
# 30 most frequent patterns
fig = plt.figure(figsize=(3, 8))
df.pattern_id.value_counts().head(30).iloc[::-1].plot(kind='barh');
len(df.pattern_id.value_counts())
from basepair.vdom_viz import fig2vdom
from vdom.helpers import *
def plot_n_per_example_ax(dfs, ax):
    """Plot how many examples contain 1, 2, 3, ... seqlets.

    Args:
      dfs: pd.DataFrame with an `example_id` column (one row per seqlet)
      ax: matplotlib Axes to draw on
    """
    # Pass `kind` explicitly: `Series.plot("barh")` relies on the legacy
    # positional-`kind` compatibility shim of the pandas plot accessor
    dfs.groupby("example_id").size().value_counts().plot(kind="barh", ax=ax)
    ax.set_title("# per example")
def plot_col_hist_ax(dfs, ax, col='score', xlim=None):
    """Draw a 100-bin histogram of `dfs[col]` onto `ax`.

    Args:
      dfs: pd.DataFrame containing column `col`
      ax: matplotlib Axes to draw on
      col: column name to plot (also used as the panel title)
      xlim: optional (min, max) to clip the x-axis
    """
    values = dfs[col]
    values.plot.hist(bins=100, ax=ax)
    if xlim is not None:
        ax.set_xlim(xlim)
    ax.set_title(col)
def plot_all_dist(df, pattern_id, figsize=(12,2), add_title="", **kwargs):
    """Merge all plots into a single one

    Panels (left to right): seqlet_center positions, seqlet count per example,
    and histograms of percnormed_score, score and seqlet_score.
    """
    sub = df[df.pattern_id == pattern_id]
    fig, axes = plt.subplots(1, 5, figsize=figsize)
    plot_col_hist_ax(sub, axes[0], "seqlet_center", xlim=[0, 1000])
    axes[0].set_title(f"seqlet_center {add_title}")
    plot_n_per_example_ax(sub, axes[1])
    # the three score distributions share the same plotting helper
    for axis, column in zip(axes[2:], ["percnormed_score", "score", "seqlet_score"]):
        plot_col_hist_ax(sub, axis, column)
    plt.tight_layout()
    return fig2vdom(fig, **kwargs)
import matplotlib.pyplot as plt
from concise.utils.plot import seqlogo_fig, seqlogo
from basepair.math import mean
from basepair.plots import draw_box
# helper functions for plotting the results
def get_scores(d, example_idx, task, grad_type="weighted"):
    """Average the two contribution-score tracks of `task` for one example.

    Args:
      d: open HDF5Reader exposing `/grads/<task>/<grad_type>/{0,1}` datasets
      example_idx: which example to fetch
      task: task name
      grad_type: which gradient variant to use (default: "weighted")
    """
    tracks = [d.f[f'/grads/{task}/{grad_type}/{strand}'][example_idx]
              for strand in (0, 1)]
    return mean(tracks)
def get_sequence(d, example_idx):
    """Return the one-hot encoded input sequence for `example_idx` from the HDF5 file."""
    return d.f['/inputs'][example_idx]
def relevant_patterns(df, example_idx, xlim):
    """List (pattern_id, revcomp) tuples for the patterns of one example.

    Only patterns that fall entirely inside the plotting window `xlim`
    are returned, ordered by their start coordinate.

    Args:
      df: instances data frame with example_idx / pattern_start /
          pattern_end / pattern_id / revcomp columns
      example_idx: which example to select
      xlim: [start, end] plotting window within the example
    """
    window = xlim[1] - xlim[0]
    rows = df[df.example_idx == example_idx].sort_values("pattern_start")
    found = []
    for _, row in rows.iterrows():
        left = row.pattern_start - xlim[0] + 0.5
        right = row.pattern_end - xlim[0] + 0.5
        # keep only patterns fully contained in the window
        if left >= 0 and right <= window:
            found.append((row.pattern_id, row.revcomp))
    return found
def draw_patterns(df, example_idx, xlim, ax, add_label=True):
    """Overlay boxes (and optional labels) for the trimmed pattern locations
    of one example that fall fully inside the window `xlim`.

    Args:
      df: instances data frame with pattern locations
      example_idx: which example's patterns to draw
      xlim: [start, end] plotting window
      ax: matplotlib Axes to annotate
      add_label: also write "mc_<i>/p_<j>" above each box ("-" prefix = revcomp)
    """
    window = xlim[1] - xlim[0]
    for _, row in df[df.example_idx == example_idx].iterrows():
        xmin = row.pattern_start - xlim[0] + 0.5
        xmax = row.pattern_end - xlim[0] + 0.5
        # skip patterns sticking out of the plotting window
        if xmin < 0 or xmax > window:
            continue
        draw_box(xmin, xmax, ax, 'g') # trimmed pattern location
        if add_label:
            ymin, ymax = ax.get_ylim()
            # place the label slightly above the top of the axis
            y = ymax + (ymax - ymin) * 0.1
            prefix = "-" if row.revcomp else ""
            ax.text(xmin, y,
                    s=prefix + f"mc_{row.metacluster}/p_{row.pattern}")
def plot_region(df, d, tasks, example_idx, xlim=(400, 600)):
    """Plot per-task contribution-score sequence logos for one example region,
    with trimmed pattern locations overlaid.

    Args:
      df: instances data frame (used for pattern boxes/labels)
      d: open HDF5Reader with `/inputs` and `/grads/...` datasets
      tasks: task names; one subplot row per task
      example_idx: which example to plot
      xlim: (start, end) window within the example; default (400, 600).
          A tuple default avoids the shared-mutable-default pitfall.
    """
    fig, axes = plt.subplots(len(tasks), 1, figsize=(20, 2 * len(tasks)), sharex=True)
    # plt.subplots returns a bare Axes (not an array) when len(tasks) == 1
    axes = np.atleast_1d(axes)
    seq = get_sequence(d, example_idx)
    for i, task in enumerate(tasks):
        ax = axes[i]
        seqlogo((get_scores(d, example_idx, task) * seq)[xlim[0]:xlim[1]], ax=ax)
        # only label the pattern boxes on the top-most track
        draw_patterns(df, example_idx, xlim, ax, add_label=i==0)
        ax.set_ylabel(task)
        ax.set_xticks(list(range(0, xlim[1] - xlim[0] + 1, 5)));
def rc_prefix(rc):
    """Return "-" for a reverse-complement match, "" otherwise."""
    return "-" if rc else ""
def vdom_patterns(mr, patterns_rc_vec, trim_frac=0.08):
    """Get vdom patterns in a single row
    Args:
        mr: ModiscoResults object
        patterns_rc_vec: list of tuples (pattern_id, rc). Example: ("metacluster_0/pattern_1", True)
        trim_frac: how much to trim the pssm
    """
    logos = []
    for pattern_idx, rc in patterns_rc_vec:
        fig = mr.plot_pssm(*pattern_idx.split("/"), rc=rc, trim_frac=trim_frac,
                           title=rc_prefix(rc) + pattern_idx,
                           letter_width=0.35, height=1.7)
        logos.append(fig2vdom(fig, height=75))
    return p(logos)
def dist_pairwise_hist(df, a, b, ax=None):
    """Histogram of center-to-center distances between patterns `a` and `b`.

    For a != b: restrict to examples containing exactly one instance of each
    pattern and plot center(a) - center(b).
    For a == b: restrict to examples containing exactly two instances of the
    pattern and plot the distance between the two occurrences.
    """
    if a != b:
        dfc_ab = df[df.pattern_id.isin([a,b])]
        # per-example counts of each of the two patterns
        counts = dfc_ab.groupby("example_idx").pattern_id.value_counts().unstack(fill_value=0)
        # keep only examples where both patterns occur exactly once
        relevant_idx = counts.index[(counts==1).all(axis=1)]
        distances = dfc_ab[dfc_ab.example_idx.isin(relevant_idx)].set_index(['example_idx', 'pattern_id']).pattern_center.unstack()
        diff = distances[a] - distances[b]
        title = f"{a} - {b}"
    if a == b:
        # examples with exactly two occurrences of the same pattern
        examples = df[df.pattern_id == a].groupby("example_idx").filter(lambda x: len(x) == 2)
        diff = examples.groupby("example_idx").apply(lambda x: x.pattern_center.iloc[1] - x.pattern_center.iloc[0])
        title = f"{a}"
    if ax is None:
        fig,ax = plt.subplots(figsize=(15,2))
    ax.hist(diff, bins=100);
    ax.set_title(title);
# preview the filtered instances table
df.head()
The plot below shows the following:
## fig_height = 100
# For every pattern, ordered by frequency: show its logo followed by the
# per-task positional / score distribution panels (plot_all_dist)
div(*[div(span(str(i), ". ", row['index'] + f" (n={row.pattern_id})",
fig2vdom(mr.plot_pssm(*row['index'].split("/"), height=0.5, letter_width=0.15, trim_frac=0.08))),
br(),
*[plot_all_dist(df[df.peak_id == task], row['index'], add_title=task,
figsize=(16,1.8), height=100) for task in tasks],
)
for i, row in df.pattern_id.value_counts().reset_index().iterrows()])
Compute the co-occurrence stats.
from upsetplot import plot as upsetplt
all_patterns = mr.patterns()
# restrict to the central 500bp
df_center = df[(df.pattern_center > 250) & (df.pattern_center < 750)]
len(df)
len(df_center)
# one boolean row per example: which patterns occur in it at least once
dfp = pd.DataFrame.from_dict([dict(pd.Series(all_patterns,
index=all_patterns).
isin(group.pattern_id.unique()))
for name, group in tqdm(df_center.groupby("example_id"))])
# make the upsetplot
data_dict = dfp.groupby(all_patterns).size()
dfc = data_dict.reset_index().rename(columns={0:"n"})
dfcp = dfc.sort_values("n", ascending=False).head(40) # take only 40 most frequent combinations
# drop pattern columns that never occur among the kept combinations
col_incl = dfcp.iloc[:, :-1].max()
dfcp_plot = dfcp.iloc[:, col_incl.tolist() + [True]]
dfcp_plot = dfcp_plot.set_index(list(dfcp_plot.columns[:-1]))
r = upsetplt(dfcp_plot.n)
# note interesting pairs to plot pairwise distances
# (hand-picked from the upset plot above)
pairs = [
["metacluster_2/pattern_0", "metacluster_1/pattern_0"],
["metacluster_2/pattern_0", "metacluster_0/pattern_0"],
["metacluster_2/pattern_0", "metacluster_28/pattern_0"],
["metacluster_1/pattern_0", "metacluster_2/pattern_3"],
["metacluster_0/pattern_0", "metacluster_0/pattern_1"],
["metacluster_0/pattern_0", "metacluster_0/pattern_2"],
["metacluster_0/pattern_0", "metacluster_3/pattern_2"],
["metacluster_1/pattern_0", "metacluster_1/pattern_1"],
["metacluster_1/pattern_2", "metacluster_0/pattern_0"],
["metacluster_1/pattern_2", "metacluster_0/pattern_1"],
]
# Visualize motifs for most frequent patterns from above
dfcp_plot_melt = pd.melt(dfcp_plot.reset_index().assign(idx=np.arange(len(dfcp_plot))), id_vars=["n","idx"])
patterns = dfcp_plot_melt.query('value').groupby("variable").n.sum().sort_values(ascending=False).index.tolist()
display(vdom_patterns(mr, zip(patterns, [False]*len(patterns)), trim_frac=0.08))
# self-distance histograms: each pattern against itself
nrow = int(np.ceil(len(patterns)/2))
fig, axes = plt.subplots(nrow,2, figsize=(20, nrow * 1.5), sharex=True)
for pattern, ax in tqdm(zip(patterns, axes.flat)):
    dist_pairwise_hist(df[~df[['example_id', 'pattern_center']].duplicated()], pattern, pattern, ax)
    ax.set_xlim([-1000, 1000])
plt.tight_layout()
## TODO: stratify those plots by the peak_id
# pairwise distance histograms for the hand-picked pattern pairs
nrow = int(np.ceil(len(pairs)/2))
fig, axes = plt.subplots(nrow, 2, figsize=(20, nrow * 1.5), sharex=True)
for pair, ax in tqdm(zip(pairs, axes.flat)):
    dist_pairwise_hist(df[~df[['example_id', 'pattern_center']].duplicated()], pair[0], pair[1], ax)
    ax.set_xlim([-1000, 1000])
plt.tight_layout()
# Inspect example regions: first the logos of the patterns inside the plotting
# window, then the per-task contribution tracks with pattern boxes overlaid
example_idx = 12
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, 12, xlim)
example_idx = 29934
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
# randomly sampled example
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
# another randomly sampled example
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
# This example doesn't make any sense to me
df[df.example_idx == 5472]
# inspect the raw seqlet's contribution-score snippet directly
sl = seqlets[638799]
s = sl.track_name_to_snippet['Sox2_contrib_scores']
seqlogo_fig(s.fwd, figsize=(10,2));
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)
# override the random draw with the puzzling example
example_idx = 5472
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
seqlogo_fig(s.fwd, figsize=(10,2));
mr.plot_pssm("metacluster_9","pattern_0");
# Six more randomly sampled regions (notebook-style repeated cells)
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
# genomic coordinates of example 36709
df[df.example_idx == 36709].iloc[0][['example_chr', 'example_start', 'example_end']]
# Load an older multi-task BPNet model to compare predictions on the same locus
from basepair.BPNet import BPNetPredictor
from keras.models import load_model
from basepair.config import create_tf_session
from pybedtools import Interval
create_tf_session(0)
model = load_model("/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/exp/models/p-multi-task/seq_mutlitask_filters=32,conv1_kernel_size=21,tconv_kernel_size=25,n_dil_layers=6,use_profile=True,use_counts=True,c_task_weight=10,lr=0.004.5.h5")
preproc = read_pkl("/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/exp/models/p-multi-task/preproc.pkl")
bp = BPNetPredictor(model,
"/mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta",
tasks = ["Sox2", "Oct4"],
preproc=preproc)
# 5' count bigwigs per task and strand, used as measured-profile tracks
bws = {task: {s: f"/users/avsec/workspace/basepair-workflow/data/{task}/5prime.counts.{s}.bw" for s in ['pos', 'neg']} for task in ["Sox2", "Oct4"]}
genome_file = "/mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes"
# instances overlapping the locus of interest on chr17
df[(df.example_chr == "chr17") & (df.example_start < 35503946) & (df.example_end >35503946+201 )]
# Filtered seqlets (percnormed_score > 0.2)
example_idx = 2
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
# Unfiltered seqlets -- same region using the pre-threshold data frame
example_idx = 2
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df_unfiltered, example_idx, xlim), trim_frac=0.08))
plot_region(df_unfiltered, d, tasks, example_idx, xlim)
# old bpnet prediction
bp.plot_predict_grad([Interval("chr17", 35503946, 35503946+201)], bws=bws,
profile_grad='weighted',
figsize=(20, 10))