pattern table hows that many motifs are very similar to each other
Although the sequence motif is the same, the context changes hence the way this motif influences the binding are differnet.
from basepair.imports import *
from pybedtools import BedTool
create_tf_session(1)
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")
# load
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
pattern_table = pd.read_csv(output_dir / "pattern_table.csv")
df_clusters = pd.read_csv(output_dir / 'motif_clustering/short-motif.seq-cluster.csv') # cluster table
# read only the non-TE patterns
patterns = read_pkl(output_dir / 'patterns.pkl')
patterns = [p for p in patterns if p.name in df_clusters.pattern.tolist()]
tasks = patterns[0].tasks()
len(patterns)
df_clusters.head()
p = patterns[0]
p.plot("profile", rotate_y=0)
from basepair.preproc import resize_interval
from basepair.plot.tracks import plot_tracks, filter_tracks
bpnet = BPNetPredictor.from_mdir(model_dir)
intervals = [resize_interval(x, bpnet.input_seqlen()) for x in BedTool(f"{output_dir}/modisco_seqlets/{p.name}.bed")]
preds = bpnet.predict(intervals, compute_grads=False, use_strand=True)
scaled_preds = {t: np.stack([x['preds']['scaled_profile'][t] for x in preds]).mean(0) for t in tasks}
plot_tracks(filter_tracks(scaled_preds, [400, 600]), fig_height_per_track=0.8, fig_width=15)
orig_intervals = list(BedTool(f"{output_dir}/modisco_seqlets/{p.name}.bed"))
widths = np.array([i.end - i.start for i in orig_intervals])
assert widths.std() == 0
motif_width = widths[0]
intervals = [resize_interval(x, bpnet.input_seqlen()) for x in orig_intervals]
input_seq = bpnet.get_seq(intervals, use_strand=True)
# figure out the right start and end in the whole array
center = bpnet.input_seqlen() // 2
start = center - motif_width // 2
end = center + motif_width // 2 + motif_width % 2
assert end - start == motif_width
input_seq.shape
# one-hot -> strings
from concise.preprocessing.sequence import one_hot2string, DNA, encodeDNA
up_seqs = one_hot2string(input_seq[:,:start], DNA)
down_seqs = one_hot2string(input_seq[:,end:], DNA)
center_seqs = one_hot2string(input_seq[:,start:end], DNA)
assert len(up_seqs[0]) == start
assert len(down_seqs[0]) == bpnet.input_seqlen() - end
# di-nucleotide shuffle the sequences
from deeplift.dinuc_shuffle import dinuc_shuffle
shuffled_up_seqs = [dinuc_shuffle(s) for s in up_seqs]
shuffled_down_seqs = [dinuc_shuffle(s) for s in down_seqs]
shuffled_seqs = encodeDNA([shuffled_up_seqs[i] + center_seqs[i] + shuffled_down_seqs[i]
for i in range(len(up_seqs))])
# shape still the same
assert shuffled_seqs.shape == input_seq.shape
assert not np.allclose(shuffled_seqs[0], input_seq[0])
# Make predictions
sh_preds = bpnet.seq_predict(shuffled_seqs, compute_grads=False)
sh_scaled_preds = {t: np.stack([x['preds']['scaled_profile'][t] for x in preds]).mean(0) for t in tasks}
plot_tracks(filter_tracks(sh_scaled_preds, [400, 600]), fig_height_per_track=0.8, fig_width=15)
from pybedtools import BedTool
import numpy as np
from basepair.preproc import resize_interval
from concise.preprocessing.sequence import one_hot2string, DNA, encodeDNA
from deeplift.dinuc_shuffle import dinuc_shuffle
from basepair.plot.tracks import filter_tracks
def predict_pattern_footprint_random_ctx(p, bpnet, output_dir):
# load the original intervals and get the motif width
orig_intervals = list(BedTool(f"{output_dir}/modisco_seqlets/{p.name}.bed"))
widths = np.array([i.end - i.start for i in orig_intervals])
assert widths.std() == 0
motif_width = widths[0]
# get sequences
intervals = [resize_interval(x, bpnet.input_seqlen()) for x in orig_intervals]
input_seq = bpnet.get_seq(intervals, use_strand=True)
# figure out the right start and end in the whole array
center = bpnet.input_seqlen() // 2
start = center - motif_width // 2
end = center + motif_width // 2 + motif_width % 2
assert end - start == motif_width
# one-hot -> strings
up_seqs = one_hot2string(input_seq[:, :start], DNA)
down_seqs = one_hot2string(input_seq[:, end:], DNA)
center_seqs = one_hot2string(input_seq[:, start:end], DNA)
assert len(up_seqs[0]) == start
assert len(down_seqs[0]) == bpnet.input_seqlen() - end
# di-nucleotide shuffle the sequences
shuffled_up_seqs = [dinuc_shuffle(s) for s in up_seqs]
shuffled_down_seqs = [dinuc_shuffle(s) for s in down_seqs]
shuffled_seqs = encodeDNA([shuffled_up_seqs[i] + center_seqs[i] + shuffled_down_seqs[i]
for i in range(len(up_seqs))])
# shape still the same
assert shuffled_seqs.shape == input_seq.shape
assert not np.allclose(shuffled_seqs[0], input_seq[0])
# Make predictions
sh_preds = bpnet.seq_predict(shuffled_seqs, compute_grads=False)
sh_scaled_preds = {t: np.stack([x['preds']['scaled_profile'][t] for x in sh_preds]).mean(0) for t in p.tasks()}
return sh_scaled_preds
def predict_pattern_footprint(p, bpnet, output_dir):
intervals = [resize_interval(x, bpnet.input_seqlen()) for x in BedTool(f"{output_dir}/modisco_seqlets/{p.name}.bed")]
preds = bpnet.predict(intervals, compute_grads=False, use_strand=True)
return {t: np.stack([x['preds']['scaled_profile'][t] for x in preds]).mean(0) for t in p.tasks()}
def footprint_context_pred(p, bpnet, output_dir):
return {"measured": p.profile,
"predicted": filter_tracks(predict_pattern_footprint(p, bpnet, output_dir), [400, 600]),
"predicted_random_ctx": filter_tracks(predict_pattern_footprint_random_ctx(p, bpnet, output_dir), [400, 600])}
out = footprint_context_pred(p, bpnet, output_dir)
# FIX - matplotlib not showing
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
for k,v in out.items():
plot_tracks(v, fig_height_per_track=0.8, fig_width=15, title=k)
Choose cluster 3 (Nanog).
idx = df_clusters[df_clusters.cluster==8].cluster_order.values
results_nanog = OrderedDict([(row.pattern, footprint_context_pred(patterns[row.cluster_order], bpnet, output_dir) )
for i,row in tqdm(df_clusters[df_clusters.cluster==3].iterrows())])
patterns_d = {p.name:p for p in patterns}
# FIX - matplotlib not showing
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pname = 'metacluster_0/pattern_3'
patterns_d[pname].plot(['seq', 'contrib'], rotate_y=0)
for k,v in results_nanog[pname].items():
plot_tracks(v, fig_height_per_track=0.8, fig_width=15, title=k)
pname = 'metacluster_4/pattern_1'
patterns_d[pname].plot(['seq', 'contrib'], rotate_y=0)
for k,v in results_nanog[pname].items():
plot_tracks(v, fig_height_per_track=0.8, fig_width=15, title=k)
pname = 'metacluster_1/pattern_2'
patterns_d[pname].plot(['seq', 'contrib'], rotate_y=0)
for k,v in results_nanog[pname].items():
plot_tracks(v, fig_height_per_track=0.8, fig_width=15, title=k)
results_klf4 = OrderedDict([(row.pattern, footprint_context_pred(patterns[row.cluster_order], bpnet, output_dir) )
for i,row in tqdm(df_clusters[df_clusters.cluster==8].iterrows())])
pname = 'metacluster_2/pattern_0'
patterns_d[pname].plot(['seq', 'contrib'], rotate_y=0)
for k,v in results_klf4[pname].items():
plot_tracks(v, fig_height_per_track=0.8, fig_width=15, title=k)
pname = 'metacluster_6/pattern_3'
patterns_d[pname].plot(['seq', 'contrib'], rotate_y=0)
for k,v in results_klf4[pname].items():
plot_tracks(v, fig_height_per_track=0.8, fig_width=15, title=k)
pname = 'metacluster_2/pattern_3'
patterns_d[pname].plot(['seq', 'contrib'], rotate_y=0)
for k,v in results_klf4[pname].items():
plot_tracks(v, fig_height_per_track=0.8, fig_width=15, title=k)