# Example to start with:
#   pattern1 = "metacluster_1/pattern_2", pattern2 = "metacluster_0/pattern_1"
# TODO:
#   [ ] entropy vs count distribution
#   [ ] calculate entropy for each region and the final aggregated distribution
#   [ ] percnormed_score > 0.9
#   [ ] plot_predict_grad with motif instance annotation
# common imports
from basepair.imports import *
# Modisco pattern identifiers of interest ("metacluster_<i>/pattern_<j>")
pattern1 = "metacluster_1/pattern_2"
pattern2 = "metacluster_0/pattern_1"
# Paths: data dir -> trained model dir -> modisco output on the validation set
ddir = Path(get_data_dir())
model_dir = ddir / "processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/"
modisco_dir = model_dir / "modisco/valid"
# Modisco
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
tasks = mr.tasks()
ds = DataSpec.load(model_dir / "dataspec.yaml")
# Gradient/importance scores computed over ALL peaks
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()
# Map peak_id -> example_idx (row order within grad.all.h5)
id_hash = pd.DataFrame({"peak_id": d.f['/metadata/interval_from_task'][:], "example_idx": np.arange(d.f['/metadata/interval_from_task'].shape[0])})
d_valid = HDF5Reader(model_dir / "grad.valid.h5")
d_valid.open()
d_test = HDF5Reader(model_dir / "grad.test.h5")
d_test.open()
# Materialize the valid/test HDF5 files into in-memory dicts
d_valid = d_valid.load_all()
d_test = d_test.load_all()
# load the instances data frame
df = pd.read_csv(modisco_dir / "all.instances.tsv", sep='\t')
df['pattern_id'] = "metacluster_" + df.metacluster.astype(str) + "/pattern_" + df.pattern.astype(str)
df['seqlet_center'] = (df.seqlet_start + df.seqlet_end) / 2
patterns = df.pattern_id.unique().tolist()
pattern_pssms = {pattern: mr.get_pssm(*pattern.split("/")) for pattern in patterns}
# Annotate df with trimmed pattern locations (modifies df in place)
append_pattern_loc(df, pattern_pssms, trim_frac=0.08)
df['pattern_center'] = (df.pattern_start + df.pattern_end) / 2
# Join peak_id onto the instances table — assumes df already carries an
# example_idx column from the tsv (TODO confirm)
df = df.merge(id_hash, on="example_idx")
# Use thresholded values from now on
df_unfiltered = df
df = df[df.percnormed_score > .2]
# Quick sanity checks on the reverse-complement flag
df.seqlet_is_revcomp.max()
df.seqlet_is_revcomp.mean()
grad_type='weighted'
# Work with the validation-set scores from here on
d = d_valid
# Boolean/index mask of examples passing the strand-distance filter
included_samples = HDF5Reader.load(f"{modisco_dir}/strand_distances.h5")['included_samples']
one_hot = d['inputs']
grad_type="count,weighted" # always plot both importance scores
# Hypothetical contribution scores per "<task>/<grad_type>", subset to included samples
thr_hypothetical_contribs = OrderedDict([(f"{task}/{gt}", mean(d['grads'][task][gt])[included_samples])
for task in tasks
for gt in grad_type.split(",")])
thr_one_hot = one_hot[included_samples]
# Actual contribution scores = hypothetical scores * one-hot encoded sequence
thr_contrib_scores = OrderedDict([(f"{task}/{gt}", thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot )
for task in tasks
for gt in grad_type.split(",")])
# importance_scores = {"grad": hyp_scores,
# "grad*input": scores}
# -------------------------------------------------
pattern = "metacluster_0/pattern_1"
# Get the range
from basepair.plot.profiles import *
# NOTE(review): `tracks` is not defined anywhere above this call in this file
# — presumably created in a notebook cell that is not part of this export;
# confirm before running (cf. the tracks dict built later from d['targets']).
plot_profiles(mr.seqlets(),
thr_one_hot,
tracks=tracks,#{},
importance_scores=thr_contrib_scores, #thr_hypothetical_contribs,
legend=False,
flip_neg=True,
rotate_y=0, #75,
seq_height=.5,
patterns=['metacluster_2/pattern_0'] + mr.patterns()[:10],#[pattern1, pattern2],
n_bootstrap=100,
figsize=(14, 14))
# Use grad.all.h5 to plot the aggregated profile.
# Q: Is the profile also as sharp as before?
d = HDF5Reader.load(model_dir / "grad.all.h5")
# TODO - get seqlets
df.head()
df.columns
# pd.DataFrame -> seqlets
# Build fixed 50 bp seqlets around every motif instance:
# [pattern_start - 20, pattern_start + 30), dropping instances too close to
# the example edges (assumes 1000 bp examples — TODO confirm against d['inputs'].shape)
seqlets = {pattern:
[Seqlet(row.example_idx,
row.pattern_start - 20,
row.pattern_start + 30,
name=pattern,
strand="-" if row.revcomp else "+")
for i, row in df[(df.pattern_id == pattern) & \
(df.pattern_start > 20) & \
(df.pattern_start + 30 < 1000)].iterrows() ]
for pattern in tqdm(df.pattern_id.unique())}
all_patterns = df.pattern_id.unique().tolist()
one_hot = d['inputs']
grad_type="count,weighted" # always plot both importance scores
# No strand-distance filter here — grad.all.h5 covers all examples
thr_hypothetical_contribs = OrderedDict([(f"{task}/{gt}", mean(d['grads'][task][gt]))
for task in tasks
for gt in grad_type.split(",")])
thr_one_hot = one_hot
thr_contrib_scores = OrderedDict([(f"{task}/{gt}", thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot )
for task in tasks
for gt in grad_type.split(",")])
df.pattern_end.max()
# TODO - is it possible to get different seqlet locations?
pattern = 'metacluster_0/pattern_0'
plot_profiles(seqlets,
thr_one_hot,
tracks=d['targets']['profile'],
importance_scores=thr_contrib_scores, #thr_hypothetical_contribs,
legend=False,
flip_neg=True,
rotate_y=0, #75,
seq_height=.5,
patterns=mr.patterns()[:1], #['metacluster_2/pattern_0'] + mr.patterns()[:10],#[pattern1, pattern2],
n_bootstrap=100,
figsize=(14, 14))
df.columns
df.head()
# Score distribution for the current pattern
df[df.pattern_id == pattern].percnormed_score.plot.hist()
# pd.DataFrame -> seqlets
# Same 50 bp seqlet construction, restricted to a single pattern and to
# high-confidence instances (percnormed_score > 0.9)
seqlets = {pattern:
[Seqlet(row.example_idx,
row.pattern_start - 20,
row.pattern_start + 30,
name=pattern,
strand="-" if row.revcomp else "+")
for i, row in df[(df.pattern_id == pattern) & \
(df.percnormed_score > 0.9) & \
(df.pattern_start > 20) & \
(df.pattern_start + 30 < 1000)].iterrows() ]}
plot_profiles(seqlets,
thr_one_hot,
tracks=d['targets']['profile'],
importance_scores=thr_contrib_scores, #thr_hypothetical_contribs,
legend=False,
flip_neg=True,
rotate_y=0, #75,
seq_height=.5,
patterns=[pattern],
n_bootstrap=100,
figsize=(14, 14))
# pd.DataFrame -> seqlets
# Same construction with a lenient threshold (percnormed_score > 0.2) —
# compare the resulting aggregate profile against the strict > 0.9 plot above
seqlets = {pattern:
[Seqlet(row.example_idx,
row.pattern_start - 20,
row.pattern_start + 30,
name=pattern,
strand="-" if row.revcomp else "+")
for i, row in df[(df.pattern_id == pattern) & \
(df.percnormed_score > 0.2) & \
(df.pattern_start > 20) & \
(df.pattern_start + 30 < 1000)].iterrows() ]}
plot_profiles(seqlets,
thr_one_hot,
tracks=d['targets']['profile'],
importance_scores=thr_contrib_scores, #thr_hypothetical_contribs,
legend=False,
flip_neg=True,
rotate_y=0, #75,
seq_height=.5,
patterns=[pattern],
n_bootstrap=100,
figsize=(14, 14))
# pd.DataFrame -> seqlets
# Low-scoring instances only (percnormed_score < 0.5) — negative control
# for the high-score plots above
seqlets = {pattern:
[Seqlet(row.example_idx,
row.pattern_start - 20,
row.pattern_start + 30,
name=pattern,
strand="-" if row.revcomp else "+")
for i, row in df[(df.pattern_id == pattern) & \
(df.percnormed_score < 0.5) & \
(df.pattern_start > 20) & \
(df.pattern_start + 30 < 1000)].iterrows() ]}
plot_profiles(seqlets,
thr_one_hot,
tracks=d['targets']['profile'],
importance_scores=thr_contrib_scores, #thr_hypothetical_contribs,
legend=False,
flip_neg=True,
rotate_y=0, #75,
seq_height=.5,
patterns=[pattern],
n_bootstrap=100,
figsize=(14, 14))
# Distribution of seqlet scores, and seqlet_score vs overall score
df.seqlet_score.plot.hist(100);
plt.plot(df.seqlet_score, df.score, ".", alpha=0.1)
# pd.DataFrame -> seqlets
# Combined filter: high percnormed_score AND high seqlet_score
seqlets = {pattern:
[Seqlet(row.example_idx,
row.pattern_start - 20,
row.pattern_start + 30,
name=pattern,
strand="-" if row.revcomp else "+")
for i, row in df[(df.pattern_id == pattern) & \
(df.percnormed_score > 0.9) & \
(df.seqlet_score > 0.5) & \
(df.pattern_start > 20) & \
(df.pattern_start + 30 < 1000)].iterrows() ]}
plot_profiles(seqlets,
thr_one_hot,
tracks=d['targets']['profile'],
importance_scores=thr_contrib_scores, #thr_hypothetical_contribs,
legend=False,
flip_neg=True,
rotate_y=0, #75,
seq_height=.5,
patterns=[pattern],
n_bootstrap=100,
figsize=(14, 14))
# TODO:
#   [ ] entropy vs count distribution
#   [ ] calculate entropy for each region and the final aggregated distribution
# Reload validation-set scores and rebuild the thresholded signal dicts
d = HDF5Reader.load(model_dir / "grad.valid.h5")
included_samples = HDF5Reader.load(f"{modisco_dir}/strand_distances.h5")['included_samples']
one_hot = d['inputs']
grad_type="count,weighted" # always plot both importance scores
thr_hypothetical_contribs = OrderedDict([(f"{task}/{gt}", mean(d['grads'][task][gt])[included_samples])
for task in tasks
for gt in grad_type.split(",")])
thr_one_hot = one_hot[included_samples]
# Actual contribution scores = hypothetical scores * one-hot sequence
thr_contrib_scores = OrderedDict([(f"{task}/{gt}", thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot )
for task in tasks
for gt in grad_type.split(",")])
# Per-task ChIP-nexus profile tracks for the included samples
tracks = {task: d['targets']['profile'][task][included_samples] for task in tasks}
# Shape sanity checks
thr_one_hot.shape
thr_hypothetical_contribs['Klf4/count'].shape
def filter_seqlets(seqlets, only_idx):
    """Keep only the seqlets located on example `only_idx` (compared via seqname)."""
    kept = []
    for seqlet in seqlets:
        if seqlet.seqname == only_idx:
            kept.append(seqlet)
    return kept
def df2seqlets(df):
    """Convert a motif-instance DataFrame into a list of Seqlet objects.

    Each seqlet spans [pattern_start - 20, pattern_start + 30), is named
    after the row's pattern_id, and its strand comes from the revcomp flag.
    """
    seqlets = []
    for _, row in df.iterrows():
        strand = "-" if row.revcomp else "+"
        seqlets.append(Seqlet(row.example_idx,
                              row.pattern_start - 20,
                              row.pattern_start + 30,
                              name=row.pattern_id,
                              strand=strand))
    return seqlets
df2seqlets(df[df.example_idx == 333])
from basepair.plot.utils import draw_box
# randomly sample a few examples
pattern = 'metacluster_0/pattern_0'
seqlets = mr._get_seqlets(pattern)
mr.plot_pssm(*pattern.split("/"));
# Plot 10 randomly chosen seqlets individually (200 bp windows)
for sid in pd.Series(np.arange(len(seqlets))).sample(10):
#sid = 0
plot_profiles_single(seqlets[sid].resize(width=200),
thr_one_hot,
tracks=tracks,
importance_scores=thr_contrib_scores, #thr_hypothetical_contribs,
legend=False,
flip_neg=False, #True,
rotate_y=0, #75,
seq_height=1,#.5,
figsize=(14, 14))
from basepair.BPNet import BPNetPredictor
from basepair.config import create_tf_session
create_tf_session(1)
# NOTE(review): only BPNetPredictor is imported above — the bare name
# `BPNet` would raise NameError. Notebook residue; confirm intent.
bp = BPNet
plot_predict_grad
# randomly sample a few examples
pattern = 'metacluster_0/pattern_0'
seqlets = mr._get_seqlets(pattern)
#for sid in pd.Series(np.arange(len(seqlets))).sample(10):
sid = 0
# Single-seqlet profile plot (no bootstrap aggregation)
plot_profiles({pattern:[seqlets[sid]]},
thr_one_hot,
tracks=d['targets']['profile'],
importance_scores=thr_contrib_scores, #thr_hypothetical_contribs,
legend=False,
flip_neg=True,
rotate_y=0, #75,
seq_height=.5,
patterns=[pattern],
n_bootstrap=None,
only_idx=0,
figsize=(14, 14))
# Demo: overlaying two imshow heatmaps with alpha blending
m = np.ones(20)[np.newaxis] * np.arange(20)[:, np.newaxis]
fig, ax = plt.subplots(1, 3, sharey=True)
ax[0].imshow(m, alpha=1, cmap=plt.cm.Reds)
ax[1].imshow(m.T, alpha=1, cmap=plt.cm.Blues)
ax[2].imshow(m, alpha=1, cmap=plt.cm.Reds)
ax[2].imshow(m.T, alpha=.5, cmap=plt.cm.Blues);
fig.subplots_adjust(wspace=0) # no space between plots
plt.setp([a.get_yaxis() for a in fig.axes[1:]], visible=False); # no ticks
pattern = 'metacluster_0/pattern_2'
seqlets = mr._get_seqlets(pattern)
h2("Results for: ", pattern)
# Load required data (fresh handles for the heatmap helpers defined below)
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
tasks = mr.tasks()
d = HDF5Reader.load(model_dir / "grad.valid.h5")
included_samples = HDF5Reader.load(f"{modisco_dir}/strand_distances.h5")['included_samples']
from basepair.plot.vdom import fig2vdom
from vdom.helpers import *
from basepair.plot.heatmaps import multiple_heatmap_stranded_profile, multiple_heatmap_importance_profile, heatmap_sequence
def get_signal(seqlets, d, included_samples, tasks, resize_width=200):
    """Extract per-seqlet signal, contribution scores and sequence.

    Args:
      seqlets: list of seqlet objects (need .center() and .resize())
      d: loaded importance-score HDF5 dict ('inputs', 'targets', 'grads')
      included_samples: index/boolean mask of examples to keep
      tasks: list of task names
      resize_width: width to which each seqlet window is resized

    Returns:
      (ex_signal, ex_contrib_profile, ex_contrib_counts, ex_seq, sort_idx)
      where sort_idx orders seqlets by decreasing total ChIP-nexus counts.
    """
    thr_one_hot = d['inputs'][included_samples]
    seq_len = thr_one_hot.shape[1]
    half = resize_width / 2
    # Keep only seqlets whose resized window fits inside the example.
    # NOTE(review): the `< seq_len + 1` upper bound mirrors the original
    # code; an off-by-one at the right edge is possible — confirm.
    valid_seqlets = []
    for s in seqlets:
        center = s.center()
        if center > half and center + half < seq_len + 1:
            valid_seqlets.append(s.resize(resize_width))
    # Raw ChIP-nexus profiles and contribution scores per task
    ex_signal = {}
    ex_contrib_profile = {}
    ex_contrib_counts = {}
    for task in tasks:
        ex_signal[task] = extract_signal(
            d['targets']['profile'][task][included_samples], valid_seqlets)
        # contribution = hypothetical contribs * one-hot, summed over bases
        ex_contrib_profile[task] = extract_signal(
            mean(d['grads'][task]["weighted"])[included_samples] * thr_one_hot,
            valid_seqlets).sum(axis=-1)
        ex_contrib_counts[task] = extract_signal(
            mean(d['grads'][task]["count"])[included_samples] * thr_one_hot,
            valid_seqlets).sum(axis=-1)
    ex_seq = extract_signal(thr_one_hot, valid_seqlets)
    # Order seqlets by total counts across all tasks (descending)
    total_counts = sum(x.sum(axis=-1).sum(axis=-1) for x in ex_signal.values())
    sort_idx = np.argsort(-total_counts)
    return ex_signal, ex_contrib_profile, ex_contrib_counts, ex_seq, sort_idx
#plot_profiles_single(seqlet, x, tracks, importance_scores={}, figsize=(20, 2), legend=True,
# rotate_y=90,seq_height=1,flip_neg=False)
def get_signal_agg(d, included_samples, tasks):
    """Prepare the aggregate signal for plot_profiles_single.

    Args:
      d: loaded importance-score HDF5 dict ('inputs', 'targets', 'grads')
      included_samples: index/boolean mask of examples to keep
      tasks: list of task names

    Returns:
      (thr_one_hot, tracks, thr_hypothetical_contribs, thr_contrib_scores)
    """
    grad_types = "count,weighted".split(",")  # always plot both importance scores
    thr_one_hot = d['inputs'][included_samples]
    thr_hypothetical_contribs = OrderedDict()
    thr_contrib_scores = OrderedDict()
    for task in tasks:
        for gt in grad_types:
            key = f"{task}/{gt}"
            thr_hypothetical_contribs[key] = mean(d['grads'][task][gt])[included_samples]
            # actual contribution = hypothetical * one-hot sequence
            thr_contrib_scores[key] = thr_hypothetical_contribs[key] * thr_one_hot
    tracks = {task: d['targets']['profile'][task][included_samples] for task in tasks}
    return thr_one_hot, tracks, thr_hypothetical_contribs, thr_contrib_scores
def vdm_heatmaps(seqlets, tasks, d, included_samples, pattern, pssm_fig=None, opened=False):
# Build a vdom <div> of collapsible <details> sections (sequence heatmap,
# ChIP-nexus counts, profile/count importance scores) for one pattern.
# NOTE(review): the `pattern` parameter is accepted but never used in the
# body — confirm whether it was meant to title the output.
# NOTE(review): the call at the bottom of this file passes
# (mr, d, included_samples, "") which does not match this signature.
ex_signal, ex_contrib_profile, ex_contrib_counts, ex_seq, sort_idx = get_signal(seqlets, d, included_samples, tasks)
return div(details(summary("Sequence:"),
pssm_fig,
br(),
fig2vdom(heatmap_sequence(ex_seq, sort_idx=sort_idx, figsize_tmpl=(10, 15), aspect='auto')),
open=opened
),
details(summary("ChIP-nexus counts:"),
fig2vdom(multiple_plot_stranded_profile(ex_signal, figsize_tmpl=(20/len(ex_signal), 3))),
fig2vdom(multiple_heatmap_stranded_profile(ex_signal, pmin=50, pmax=99, sort_idx=sort_idx, figsize=(20, 20))),
open=opened
),
details(summary("Importance scores (profile)"),
fig2vdom(multiple_heatmap_importance_profile(ex_contrib_profile, sort_idx=sort_idx, figsize=(20, 20))),
open=opened
),
details(summary("Importance scores (counts)"),
fig2vdom(multiple_heatmap_importance_profile(ex_contrib_counts, sort_idx=sort_idx, figsize=(20, 20))),
open=opened
)
)
def write_pattern_pngs(seqlets, d, included_samples, tasks, pattern, output_dir):
    """Render and save all per-pattern figures as PNGs.

    Writes <output_dir>/<figure-name>.png for each generated figure
    (sequence heatmap, aggregated profile, profile heatmap, profile/count
    importance heatmaps).

    Args:
      seqlets: seqlets of the pattern to plot
      d: loaded importance-score HDF5 dict
      included_samples: index/boolean mask of examples to keep
      tasks: list of task names
      pattern: pattern identifier (unused here; kept for interface compatibility)
      output_dir: directory to write the PNG files into
    """
    # get the data
    ex_signal, ex_contrib_profile, ex_contrib_counts, ex_seq, sort_idx = get_signal(seqlets, d, included_samples, tasks)
    # get the plots
    figs = dict(
        heatmap_seq=heatmap_sequence(ex_seq, sort_idx=sort_idx, figsize_tmpl=(10, 15), aspect='auto'),
        profile_aggregated=multiple_plot_stranded_profile(ex_signal, figsize_tmpl=(20 / len(ex_signal), 3)),
        profile_heatmap=multiple_heatmap_stranded_profile(ex_signal, pmin=50, pmax=99, sort_idx=sort_idx, figsize=(20, 20)),
        contrib_profile=multiple_heatmap_importance_profile(ex_contrib_profile, sort_idx=sort_idx, figsize=(20, 20)),
        contrib_counts=multiple_heatmap_importance_profile(ex_contrib_counts, sort_idx=sort_idx, figsize=(20, 20)),
    )
    # make sure the target directory exists before writing
    os.makedirs(output_dir, exist_ok=True)
    # write the figures
    for name, fig in figs.items():
        # BUG FIX: the original used the undefined name `f` when building the
        # filename (NameError); the intended name is the dict key.
        fig.savefig(os.path.join(output_dir, name + ".png"), bbox_inches='tight')
# BUG FIX: get_signal_agg is defined as get_signal_agg(d, included_samples, tasks);
# the original call passed (tasks, d, included_samples) in the wrong order.
thr_one_hot, tracks, thr_hypothetical_contribs, thr_contrib_scores = get_signal_agg(d, included_samples, tasks)
# TODO - plot profile
pattern
seqlets = mr._get_seqlets(pattern)
pattern
# Aggregate footprint + contribution-score plot for the current pattern
figs = plot_profiles({pattern: seqlets},
thr_one_hot,
tracks=tracks,#{},
importance_scores=thr_contrib_scores, #thr_hypothetical_contribs,
legend=False,
flip_neg=True,
rotate_y=0, #75,
seq_height=.5,
patterns=[pattern],
n_bootstrap=100,
figsize=(10, 10))
# Hypothetical contribution scores only (no tracks)
# NOTE(review): the trailing comma after this call wraps its result in a
# 1-tuple — harmless since the value is discarded, but likely accidental.
plot_profiles({pattern: seqlets},
thr_one_hot,
tracks={},
importance_scores=thr_hypothetical_contribs,
legend=False,
flip_neg=True,
rotate_y=0, #75,
seq_height=1,
patterns=[pattern],
n_bootstrap=100,
figsize=(10, 10)),
def modisco_plot(modisco_dir,
                 output_dir,
                 use_importance_scores=False,
                 figsize=(10, 10),
                 gpu=None):
    """Plot the results of a modisco run.

    Args:
      modisco_dir: modisco directory (contains modisco.h5, kwargs.json,
        strand_distances.h5)
      output_dir: output directory for writing the results
      use_importance_scores: currently unused; kept for interface compatibility
      figsize: output figure size
      gpu: GPU id for the TF session; None disables GPU usage
    """
    if gpu is not None:
        create_tf_session(gpu)
    else:
        # Don't use any GPU's
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    output_dir = Path(output_dir)
    output_dir.parent.mkdir(parents=True, exist_ok=True)
    # load modisco
    mr = ModiscoResult(f"{modisco_dir}/modisco.h5")
    # BUG FIX: the HDF5 handle was never opened; every other use of
    # ModiscoResult in this file calls .open() before .seqlets()/.patterns()
    mr.open()
    modisco_kwargs = read_json(f"{modisco_dir}/kwargs.json")
    grad_type = modisco_kwargs['grad_type']
    # load used strand distance filter
    included_samples = HDF5Reader.load(f"{modisco_dir}/strand_distances.h5")['included_samples']
    # load importance scores
    d = HDF5Reader.load(modisco_kwargs['imp_scores'])
    tasks = list(d['targets']['profile'])
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']
    thr_hypothetical_contribs = {f"{task}/{gt}": mean(d['grads'][task][gt])[included_samples]
                                 for task in tasks
                                 for gt in grad_type.split(",")}
    thr_one_hot = one_hot[included_samples]
    # actual contribution = hypothetical contribs * one-hot sequence
    thr_contrib_scores = {f"{task}/{gt}": thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot
                          for task in tasks
                          for gt in grad_type.split(",")}
    tracks = {task: d['targets']['profile'][task][included_samples] for task in tasks}
    # -------------------------------------------------
    all_seqlets = mr.seqlets()
    all_patterns = mr.patterns()
    tasks = mr.tasks()
    # 1. Plots with tracks and contrib scores
    print("Writing results for contribution scores")
    plot_profiles(all_seqlets,
                  thr_one_hot,
                  tracks=tracks,
                  importance_scores=thr_contrib_scores,
                  legend=False,
                  flip_neg=True,
                  rotate_y=0,
                  seq_height=.5,
                  patterns=all_patterns,
                  n_bootstrap=100,
                  fpath_template=str(output_dir / "{pattern}/agg_profile_contribcores"),
                  figsize=figsize)
    # 2. Plots only with hypothetical contrib scores
    print("Writing results for hypothetical contribution scores")
    plot_profiles(all_seqlets,
                  thr_one_hot,
                  tracks={},
                  importance_scores=thr_hypothetical_contribs,
                  legend=False,
                  flip_neg=True,
                  rotate_y=0,
                  seq_height=1,
                  # BUG FIX: the original used `patterns=[pattern]`, but no
                  # `pattern` name exists at this point in the function —
                  # plot all patterns, matching the contrib-score pass above
                  patterns=all_patterns,
                  n_bootstrap=100,
                  fpath_template=str(output_dir / "{pattern}/agg_profile_hypcontribscores"),
                  figsize=figsize)
    # Plot others
    print("Plotting heatmaps")
    for pattern in tqdm(all_patterns):
        # BUG FIX: the original had a stray `):` after this call (SyntaxError)
        write_pattern_pngs(all_seqlets[pattern],
                           d,
                           included_samples,
                           tasks,
                           pattern,
                           output_dir=str(output_dir / pattern))
    mr.close()
# os.makedirs?  # (IPython help lookup left over from the notebook export)
# NOTE(review): trailing comma below wraps the result in a 1-tuple — likely accidental
agg_profile_ig = plot_profiles({pattern: seqlets},
thr_one_hot,
tracks=tracks,
importance_scores=thr_contrib_scores,
legend=False,
flip_neg=True,
rotate_y=0, #75,
seq_height=.5,
patterns=[pattern],
n_bootstrap=100,
figsize=(10, 10))[0],
# NOTE(review): get_signal is defined as (seqlets, d, included_samples, tasks, ...)
# — this call passes `mr` and a pattern string instead and would fail; confirm intent
ex_signal, ex_contrib_profile, ex_contrib_counts, ex_seq, sort_idx = get_signal(mr, d, included_samples, "metacluster_0/pattern_0")
multiple_heatmap_importance_profile(ex_contrib_counts, sort_idx=sort_idx, figsize=(20, 20))
# NOTE(review): vdm_heatmaps is defined as (seqlets, tasks, d, included_samples,
# pattern, ...) — this call does not match that signature; confirm intent
vdm_heatmaps(mr, d, included_samples, "", opened=True)