Goal¶

implement and test exporting the bed files from the 'navtive' modisco instances

Tasks¶

[x] visualize the failed region
- to which motif would you assign it by eye?
  - there is no motif
[x] overlay the original plot with the original, wide seqlet
[x] implement the seqlet trimming and re-visualize
[x] eyeball a few more
[x] modify the global function for exporting bed files (it should also trim the patterns)
- can you also get the importance scores attached?
- write a unit-test or a notebook for the functionality
[x] update the CLI
[x] export and upload to the server [[http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers%3D9/modisco/by_peak_tasks/weighted/][link]]
- pattern_locations
  - scored_regions.bed
  - metacluster_/
    
    pattern_.bed

Conclusions¶

in the displayed enhancer, no motif was found

potential issues

when scanning for high important regions, the central region should be very short and we should not exclude the seqlets from each other

visualize the failed region¶

to which motif would you assign it by eye?

From Melanie

The site we are interested in is a distal enhancer associated with Oct4 (POU5F). The region of interest to us within this enhancer is: chr17:35,504,982-35,505,247. If you look, in this region there are three seqlets being called (two instances of m_0_p_2 and one of m_1_p_4). We expect that there would be a seqlet between the ones already being reported based on our understanding of the enhancer.

%matplotlib inline

from basepair.utils import flatten_list
from basepair.BPNet import BPNetPredictor
from basepair.cli.schemas import DataSpec
from keras.models import load_model
from basepair.config import create_tf_session
from pybedtools import Interval
from basepair.preproc import resize_interval
from pybedtools import BedTool, Interval
from basepair.modisco.results import ModiscoResult
from kipoi.readers import HDF5Reader
from basepair.config import get_data_dir
from collections import OrderedDict
from pathlib import Path
ddir = get_data_dir()

from basepair.cli.modisco import load_included_samples, load_ranges
import pandas as pd

from basepair.plot.tracks import plot_tracks, filter_tracks
import numpy as np

from basepair.modisco.results import Seqlet

interval = Interval("chr17", 35504982, 35505247)

interval.stop - interval.start

265

interval1kb = resize_interval(interval, 1000)

assert (interval1kb.stop - interval1kb.start) == 1000

Setup the model¶

create_tf_session(0)

<tensorflow.python.client.session.Session at 0x7fbc2aff37f0>

model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")

modisco_dir = model_dir/ "modisco/by_peak_tasks/weighted/Oct4"

bpnet = BPNetPredictor.from_mdir(model_dir)
ds = DataSpec.load(model_dir / "dataspec.yaml")

mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()

d = HDF5Reader(model_dir / "grad.all.h5")
d.open()

bpnet.plot_predict_grad([interval1kb], ds=ds, 
                        rotate_y=0,
                        profile_grad='weighted', 
                        xlim=[400-30, 600+30], 
                        fig_height_per_track=1)

[<Figure size 1440x1152 with 16 Axes>]

overlay the original plot with the original, wide seqlet¶

There are no seqlets from that region

ranges = load_ranges(modisco_dir)
ranges['name'] = ranges.index

# get modisco instances
dfs = mr.seqlet_df_instances(trim_frac=0.08)
dfs_untrimmed = mr.seqlet_df_instances()
dfs.name = dfs.pattern
dfs_untrimmed.name = dfs_untrimmed.pattern

ranges.head()

interval

Interval(chr17:35504982-35505247)

ranges.query("chrom=='chr17'").query("start < 35504982").query("end > 35505247")

bt = BedTool.from_dataframe(ranges[['chrom', 'start', 'end', 'name', 'start', 'strand']])

bt.head()

chrX	143482544	143483544	0	143482544	*
 chr3	122145063	122146063	1	122145063	*
 chr17	35503551	35504551	2	35503551	*
 chr2	52071751	52072751	3	52071751	*
 chr5	28209541	28210541	4	28209541	*
 chr14	86395729	86396729	5	86395729	*
 chr3	105427372	105428372	6	105427372	*
 chr18	34778275	34779275	7	34778275	*
 chr7	80746115	80747115	8	80746115	*
 chr15	55267750	55268750	9	55267750	*

bt_intersect = bt.intersect(BedTool([interval1kb]), wa=True)

len(bt_intersect)

1

interval.stop - interval.start

265

# No seqlets were found in that region
np.sum(dfs.seqname == 8962)

0

dfs.head()

example_idx = int(bt_intersect[0].name)

example_range = ranges.iloc[example_idx]

example_idx

8962

# Problem -> there are no seqlets from that region!
np.sum(dfs.seqname == example_idx)

0

implement the seqlet trimming and re-visualize¶

Functions¶

def df2seqlets(df):
    return [Seqlet(row.seqname, row.start, row.end, shorten_pattern(row.pattern), row.strand)
            for i,row in df.iterrows()]

def relevant_patterns(df, example_idx, xlim):
    patterns = []
    for i,row in df[df.seqname == example_idx].sort_values("start").iterrows():
        xmin = row.start - xlim[0] + 0.5
        xmax = row.end - xlim[0]+ 0.5
        if xmin < 0:
            continue
        if xmax > xlim[1] - xlim[0]:
            continue
        patterns.append((row.pattern, row.strand))
    return patterns


def vdom_patterns(mr, patterns_rc_vec, trim_frac=0.08):
    """Get vdom patterns in a single row
    
    Args:
      mr: ModiscoResults object
      patterns_rc_vec: list of tuples (pattern_id, rc). Example: ("metacluster_0/pattern_1", True)
      trim_frac: how much to trim the pssm
    """
    from basepair.plot.vdom import fig2vdom
    from vdom.helpers import p
    return p([fig2vdom(mr.plot_pssm(*pattern_idx.split("/"), rc=rc=="-", trim_frac=trim_frac, title=rc + pattern_idx, 
                                    letter_width=0.35, height=1.7), height=75)
              for pattern_idx, rc in patterns_rc_vec])

example_idx = 5170

interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)

dfs[dfs.seqname == example_idx]

from basepair.modisco.table import shorten_pattern

seqlets = df2seqlets(dfs[dfs.seqname == example_idx])

seqlets

[Seqlet(seqname=5170, start=501, end=517, name='m0_p0', strand='+')]

import matplotlib.pyplot as plt

from basepair.plot.tracks import draw_box, plot_track
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle

mr.plot_pssm(*pattern.split("/"), trim_frac=0.08);

dfs.head()

example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval], 
                       seqlets=seqlets,
                       ds=ds, 
                       rotate_y=0,
                       profile_grad='weighted', 
                       xlim=xlim,
                       add_title=False,
                       fig_height_per_track=1);

Example_idx: 5620

eyeball a few more¶

example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval], 
                       seqlets=seqlets,
                       ds=ds, 
                       rotate_y=0,
                       profile_grad='weighted', 
                       xlim=xlim,
                       add_title=False,
                       fig_height_per_track=1);

Example_idx: 2153

example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval], 
                       seqlets=seqlets,
                       ds=ds, 
                       rotate_y=0,
                       profile_grad='weighted', 
                       xlim=xlim,
                       add_title=False,
                       fig_height_per_track=1);

Example_idx: 4192

example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval], 
                       seqlets=seqlets,
                       ds=ds, 
                       rotate_y=0,
                       profile_grad='weighted', 
                       xlim=xlim,
                       add_title=False,
                       fig_height_per_track=1);

Example_idx: 2427

Problem¶

if two seqlets are too close together, only one will get chosen. E.g. in this case the Sox2 motif got ignore since the Nanog motif got picked first

example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval], 
                       seqlets=seqlets,
                       ds=ds, 
                       rotate_y=0,
                       profile_grad='weighted', 
                       xlim=xlim,
                       add_title=False,
                       fig_height_per_track=1);

Example_idx: 13422

Example how seqlets exclude the others¶

seqlets = df2seqlets(dfs_untrimmed[dfs_untrimmed.seqname == example_idx])
bpnet.plot_predict_grad([interval], 
                       seqlets=seqlets,
                       ds=ds, 
                       rotate_y=0,
                       profile_grad='weighted', 
                       xlim=xlim,
                       add_title=False,
                       fig_height_per_track=1);

example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval], 
                       seqlets=seqlets,
                       ds=ds, 
                       rotate_y=0,
                       profile_grad='weighted', 
                       xlim=xlim,
                       add_title=False,
                       fig_height_per_track=1);

Example_idx: 5798

example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval], 
                       seqlets=seqlets,
                       ds=ds, 
                       rotate_y=0,
                       profile_grad='weighted', 
                       xlim=xlim,
                       add_title=False,
                       fig_height_per_track=1);

Example_idx: 5368

example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval], 
                       seqlets=seqlets,
                       ds=ds, 
                       rotate_y=0,
                       profile_grad='weighted', 
                       xlim=xlim,
                       add_title=False,
                       fig_height_per_track=1);

Example_idx: 17510

example_idx = dfs.seqname.sample(1).iloc[0]
print(f"Example_idx: {example_idx}")
xlim = [400, 600]
display(vdom_patterns(mr, relevant_patterns(dfs, example_idx, xlim), trim_frac=0.08))
seqlets = df2seqlets(dfs[dfs.seqname == example_idx])
interval_row = ranges.iloc[example_idx]
interval = Interval(interval_row.chrom, interval_row.start, interval_row.end)
bpnet.plot_predict_grad([interval], 
                       seqlets=seqlets,
                       ds=ds, 
                       rotate_y=0,
                       profile_grad='weighted', 
                       xlim=xlim,
                       add_title=False,
                       fig_height_per_track=1);

Example_idx: 19410

	seqname	start	end	name	strand	pattern	center
0	5170	501	517	metacluster_0/pattern_0	+	metacluster_0/pattern_0	509.0
1	571	499	515	metacluster_0/pattern_0	+	metacluster_0/pattern_0	507.0
2	1341	485	501	metacluster_0/pattern_0	+	metacluster_0/pattern_0	493.0
3	17234	496	512	metacluster_0/pattern_0	+	metacluster_0/pattern_0	504.0
4	10707	504	520	metacluster_0/pattern_0	+	metacluster_0/pattern_0	512.0

	seqname	start	end	name	strand	pattern	center
0	5170	501	517	metacluster_0/pattern_0	+	metacluster_0/pattern_0	509.0
1	571	499	515	metacluster_0/pattern_0	+	metacluster_0/pattern_0	507.0
2	1341	485	501	metacluster_0/pattern_0	+	metacluster_0/pattern_0	493.0
3	17234	496	512	metacluster_0/pattern_0	+	metacluster_0/pattern_0	504.0
4	10707	504	520	metacluster_0/pattern_0	+	metacluster_0/pattern_0	512.0

	chrom	start	end	strand	interval_from_task	name
0	chrX	143482544	143483544	*	Oct4	0
1	chr3	122145063	122146063	*	Oct4	1
2	chr17	35503551	35504551	*	Oct4	2
3	chr2	52071751	52072751	*	Oct4	3
4	chr5	28209541	28210541	*	Oct4	4