From Melanie
The site we are interested in is a distal enhancer associated with Oct4 (POU5F). The region of interest to us within this enhancer is: chr17:35,504,982-35,505,247. If you look, in this region there are three seqlets being called (two instances of m_0_p_2 and one of m_1_p_4). We expect that there would be a seqlet between the ones already being reported based on our understanding of the enhancer.
%matplotlib inline
from basepair.utils import flatten_list
from basepair.BPNet import BPNetPredictor
from basepair.cli.schemas import DataSpec
from keras.models import load_model
from basepair.config import create_tf_session
from pybedtools import Interval
from basepair.preproc import resize_interval
from pybedtools import BedTool, Interval
from basepair.modisco.results import ModiscoResult
from kipoi.readers import HDF5Reader
from basepair.config import get_data_dir
from collections import OrderedDict
from pathlib import Path
ddir = get_data_dir()
from basepair.cli.modisco import load_included_samples, load_ranges
import pandas as pd
from basepair.plot.tracks import plot_tracks, filter_tracks
import numpy as np
from basepair.modisco.results import Seqlet
interval = Interval("chr17", 35504982, 35505247)
interval.stop - interval.start
interval1kb = resize_interval(interval, 1000)
assert (interval1kb.stop - interval1kb.start) == 1000
create_tf_session(0)
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir/ "modisco/by_peak_tasks/weighted/Oct4"
bpnet = BPNetPredictor.from_mdir(model_dir)
ds = DataSpec.load(model_dir / "dataspec.yaml")
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()
bpnet.plot_predict_grad([interval1kb], ds=ds,
rotate_y=0,
profile_grad='weighted',
xlim=[400-30, 600+30],
fig_height_per_track=1)
There are no seqlets from that region
ranges = load_ranges(modisco_dir)
ranges['name'] = ranges.index
# get modisco instances
dfs = mr.seqlet_df_instances(trim_frac=0.08)
dfs_untrimmed = mr.seqlet_df_instances()
dfs.name = dfs.pattern
dfs_untrimmed.name = dfs_untrimmed.pattern
ranges.head()
interval
ranges.query("chrom=='chr17'").query("start < 35504982").query("end > 35505247")
bt = BedTool.from_dataframe(ranges[['chrom', 'start', 'end', 'name', 'start', 'strand']])
bt.head()
bt_intersect = bt.intersect(BedTool([interval1kb]), wa=True)
len(bt_intersect)
interval.stop - interval.start
# No seqlets were found in that region
np.sum(dfs.seqname == 8962)
dfs.head()
example_idx = int(bt_intersect[0].name)
example_range = ranges.iloc[example_idx]
example_idx
# Problem -> there are no seqlets from that region!
np.sum(dfs.seqname == example_idx)
def df2seqlets(df):
return [Seqlet(row.seqname, row.start, row.end, shorten_pattern(row.pattern), row.strand)
for i,row in df.iterrows()]
def relevant_patterns(df, example_idx, xlim):
patterns = []
for i,row in df[df.seqname == example_idx].sort_values("start").iterrows():
xmin = row.start - xlim[0] + 0.5
xmax = row.end - xlim[0]+ 0.5
if xmin < 0:
continue
if xmax > xlim[1] - xlim[0]:
continue
patterns.append((row.pattern, row.strand))
return patterns
def vdom_patterns(mr, patterns_rc_vec, trim_frac=0.08):
"""Get vdom patterns in a single row
Args:
mr: ModiscoResults object
patterns_rc_vec: list of tuples (pattern_id, rc). Example: ("metacluster_0/pattern_1", True)
trim_frac: how much to trim the pssm
"""
from basepair.plot.vdom import fig2vdom
from vdom.helpers import p
return p([fig2vdom(mr.plot_pssm(*pattern_idx.split("/"), rc=rc=="-", trim_frac=trim_frac, title=rc + pattern_idx,
letter_width=0.35, height=1.7), height=75)
for pattern_idx, rc in patterns_rc_vec])
example_idx = 5170
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
dfs[dfs.seqname == example_idx]
from basepair.modisco.table import shorten_pattern
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
seqlets
import matplotlib.pyplot as plt
from basepair.plot.tracks import draw_box, plot_track
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle
mr.plot_pssm(*pattern.split("/"), trim_frac=0.08);
dfs.head()
example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval],
seqlets=seqlets,
ds=ds,
rotate_y=0,
profile_grad='weighted',
xlim=xlim,
add_title=False,
fig_height_per_track=1);
example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval],
seqlets=seqlets,
ds=ds,
rotate_y=0,
profile_grad='weighted',
xlim=xlim,
add_title=False,
fig_height_per_track=1);
example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval],
seqlets=seqlets,
ds=ds,
rotate_y=0,
profile_grad='weighted',
xlim=xlim,
add_title=False,
fig_height_per_track=1);
example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval],
seqlets=seqlets,
ds=ds,
rotate_y=0,
profile_grad='weighted',
xlim=xlim,
add_title=False,
fig_height_per_track=1);
example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval],
seqlets=seqlets,
ds=ds,
rotate_y=0,
profile_grad='weighted',
xlim=xlim,
add_title=False,
fig_height_per_track=1);
seqlets = df2seqlets(dfs_untrimmed[dfs_untrimmed.seqname == example_idx])
bpnet.plot_predict_grad([interval],
seqlets=seqlets,
ds=ds,
rotate_y=0,
profile_grad='weighted',
xlim=xlim,
add_title=False,
fig_height_per_track=1);
example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval],
seqlets=seqlets,
ds=ds,
rotate_y=0,
profile_grad='weighted',
xlim=xlim,
add_title=False,
fig_height_per_track=1);
example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval],
seqlets=seqlets,
ds=ds,
rotate_y=0,
profile_grad='weighted',
xlim=xlim,
add_title=False,
fig_height_per_track=1);
example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval],
seqlets=seqlets,
ds=ds,
rotate_y=0,
profile_grad='weighted',
xlim=xlim,
add_title=False,
fig_height_per_track=1);
example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval],
seqlets=seqlets,
ds=ds,
rotate_y=0,
profile_grad='weighted',
xlim=xlim,
add_title=False,
fig_height_per_track=1);