# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
import pybedtools
from basepair.utils import flatten_list
paper_config()
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
create_tf_session(0)
bpnet = BPNet.from_mdir(model_dir)
interval = pybedtools.create_interval_from_list(['chr17', 35503550, 35504550])
pred = bpnet.predict_intervals([interval], imp_method='deeplift')[0]
def plot_region(interval, variants=None, ):
pred = bpnet.predict_intervals([interval], variants=variants, imp_method='deeplift')[0]
viz_dict = OrderedDict(flatten_list([[
(f"{task} Pred", pred['pred'][task]),
(f"{task} Imp profile", pred['imp_score'][f"{task}/weighted"] * pred['seq']),
# (f"{task} Imp counts", sum(pred['grads'][task_idx]['counts'].values()) / 2 * seq),
] for task_idx, task in enumerate(bpnet.tasks)]))
viz_dict = filter_tracks(viz_dict, [420, 575])
# Hard-code the range
fmax = {'Imp profile': 0.265742, 'Pred': 34.442215}
fmin = {'Imp profile': -0.16751188, 'Pred': 0.07202636}
#fmax = {feature: max([viz_dict[f"{task} {feature}"].max() for task in bpnet.tasks])
# for feature in ['Imp profile', 'Pred']}
#fmin = {feature: min([viz_dict[f"{task} {feature}"].min() for task in bpnet.tasks])
# for feature in ['Imp profile', 'Pred']}
ylim = []
for k in viz_dict:
f = k.split(" ", 1)[1]
if "Imp" in f:
ylim.append((fmin[f], fmax[f]))
else:
ylim.append((0, fmax[f]))
fig = plot_tracks(viz_dict,
#seqlets=shifted_seqlets,
title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
fig_height_per_track=0.5,
rotate_y=0,
fig_width=get_figsize(frac=1)[0],
ylim=ylim,
legend=False)
return fig
plot_region(interval);
from basepair.exp.paper.config import motifs
mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix, align_instance_center
dfi = load_instances(modisco_dir / 'instances.parq', motifs=motifs, dedup=True)
dfi.head()
chr17, 35503550
# get all instances in this region
dfi_subset = dfi.query('example_chrom == "chr17" & pattern_start_abs > 35503550 + 420 & pattern_end_abs < 35503550 + 575')
dfi_subset
dfi_subset['rel_center'] = dfi_subset['pattern_center'] + dfi_subset['example_start']- (35503550 +420)
dfi_subset['pattern_width'] = dfi_subset['pattern_end'] - dfi_subset['pattern_start']
dfi_subset[['pattern_name', 'strand', 'rel_center', 'pattern_width']]
oct_sox_row = dfi_subset.loc[26]
oct_sox_row
from basepair.extractors import Variant, extract_seq
trimmed_interval = pybedtools.create_interval_from_list(['chr17', 35503550 +420, 35503550 +575])
seq = extract_seq(trimmed_interval, None, bpnet.fasta_file)
# Oct4-Sox2
seq[80:91]
# Nanog
seq[98:102]
import random
random_os = ''.join(random.choices("ACGT", k=11))
random_nanog = ''.join(random.choices("ACGT", k=4))
random_os
random_nanog
plot_region(interval);
plot_region(interval, [Variant('chr17', 35503550 +420+80 + 1, 'ATGCATAACAA', 'GTTCGCTCGTG')]);
plot_region(interval, [Variant('chr17', 35503550 +420+98 + 1, 'TGAT', 'ATGT')]);
def plot_region_diff(interval, variants):
ref_pred = bpnet.predict_intervals([interval], variants=None, imp_method='deeplift')[0]
pred = bpnet.predict_intervals([interval], variants=variants, imp_method='deeplift')[0]
viz_dict = OrderedDict(flatten_list([[
(f"{task} Pred", pred['pred'][task]),
(f"{task} ref - alt pred", ref_pred['pred'][task] - pred['pred'][task]),
(f"{task} Imp profile", pred['imp_score'][f"{task}/weighted"] * pred['seq']),
(f"{task} ref - alt Imp", ref_pred['imp_score'][f"{task}/weighted"] * ref_pred['seq'] - pred['imp_score'][f"{task}/weighted"] * pred['seq']),
# (f"{task} Imp counts", sum(pred['grads'][task_idx]['counts'].values()) / 2 * seq),
] for task_idx, task in enumerate(bpnet.tasks)]))
viz_dict = filter_tracks(viz_dict, [420, 575])
# Hard-code the range
fmax = {'Imp profile': 0.265742, 'Pred': 34.442215, }
fmin = {'Imp profile': -0.16751188, 'Pred': 0.07202636}
#fmax = {feature: max([viz_dict[f"{task} {feature}"].max() for task in bpnet.tasks])
# for feature in ['Imp profile', 'Pred']}
#fmin = {feature: min([viz_dict[f"{task} {feature}"].min() for task in bpnet.tasks])
# for feature in ['Imp profile', 'Pred']}
ylim = []
for k in viz_dict:
f = k.split(" ", 1)[1]
if "Imp" in f:
ylim.append((fmin['Imp profile'], fmax['Imp profile']))
else:
ylim.append((-fmax['Pred'], fmax['Pred']))
fig = plot_tracks(viz_dict,
#seqlets=shifted_seqlets,
title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
fig_height_per_track=0.5,
rotate_y=0,
fig_width=get_figsize(frac=1)[0],
ylim=ylim,
legend=False)
return fig
plot_region_diff(interval, [Variant('chr17', 35503550 +420+80 + 1, 'ATGCATAACAA', 'GTTCGCTCGTG')]);
plot_region_diff(interval, [Variant('chr17', 35503550 +420+98 + 1, 'TGAT', 'ATGT')]);