Goal¶

visualize the importance scores + motif instances for known enhancers

Tasks¶

[x] get the enhancer regions
- strided and unstrided
[x] paste the screenshots from last time
[x] run importance scores in BPNet and visualize all the regions
[x] implement instance finding in bpnet region view
- pre-specify which patterns to use
[x] plot for all

get the enhancer regions¶

strided and unstrided

import basepair
from basepair.config import get_data_dir, create_tf_session
from keras.models import load_model
from basepair.datasets import *
from basepair import datasets
from basepair.preproc import AppendTotalCounts, transform_data, resize_interval
from basepair.plots import regression_eval
from basepair.BPNet import BPNetPredictor
from basepair.utils import write_pkl
from basepair.imports import *

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
2018-10-14 03:32:10,384 [WARNING] doc empty for the `info:` field

# from basepair.preproc import resize_interval

from kipoiseq.transforms.functional import resize_interval

create_tf_session(0)

<tensorflow.python.client.session.Session at 0x7f6027ae9240>

tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']

model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")

genome_file = "/mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes"
klf4_bed = "/users/avsec/workspace/basepair-workflow/data/klf4_sites_mm10.bed"
oct4_bed = "/users/avsec/workspace/basepair-workflow/data/oct4_sites_mm10.bed"
klf4_oct4_windowed_bed = "/users/avsec/workspace/basepair-workflow/data/klf4_oct4_windowed_sites_mm10.1kb.bed"

ds = DataSpec.load(model_dir / 'dataspec.yaml')

cat {klf4_bed} {oct4_bed}

chr4	55477346	55477760	KLF4-Downstream_Enhancer_E1_mm10	0	.
chr4	55475208	55475900	KLF4-Downstream_Enhancer_E2_mm10	0	.
chr17	35503943	35506057	Distal and Proximal Oct4 Enhancers, Combined	0	.
chr17	35503912	35504118	SOX2/OCT4 Binding Site on Distal Enhancer	0	.
chr17	35504981	35505186	SOX2/OCT4 Binding Site on Proximal Enhancer	0	.
chr17	35506031	35510777	OCT4(POU5F1) Gene Region	0	.

# make windowed regions
!cat {klf4_bed} {oct4_bed} | bedtools makewindows -w 1000  -b stdin -i srcwinnum >  {klf4_oct4_windowed_bed}

Regions from Khyati. Google spreadsheet link

regions_from_khyati = """
chr6	122,707,340	122,707,540	Esrrb,Oct4,Sox2->Nanog
chr1	180,933,774	180,933,974	Esrrb,Oct4,Sox2->Lefty
chr5	77262224	77,262,424	Esrrb,Oct4,Sox2->REST
chr6	122707331	122,707,531	Klf4,Pbx1,Oct4,Sox2->Nanog
chr4	55,475,492	55,475,692	Esrrb,Oct4,Sox2,Stat3->Klf4
chr3	34,756,830	34,757,030	Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)
chr3	34,758,000	34,758,200	Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)
chr3	34,761,355	34,761,555	Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)
chr3	34,654,000	34,654,200	Oct4,Sox2,Nanog,P300,Smad1->Sox2(prox)
"""

from io import StringIO

df = pd.read_csv(StringIO(regions_from_khyati), sep='\t', header=None)
df.columns = ['chrom', 'start', 'stop', 'name']
df['start'] = df['start'].str.replace(",", "").astype(int)
df['stop'] = df['stop'].str.replace(",", "").astype(int)
df['score'] = 0
df['strand'] = "."

df["stop"] - df["start"]

0    200
1    200
2    200
3    200
4    200
5    200
6    200
7    200
8    200
dtype: int64

new_intervals_from_khyati = list(BedTool.from_dataframe(df))

# TODO get the interesting regions from the genome browser

bt = BedTool(klf4_bed).cat(BedTool(oct4_bed), postmerge=False)
intervals = list(bt)
bt_windowed = BedTool(klf4_oct4_windowed_bed)
intervals_windowed = [resize_interval(interval, 1000) for interval in bt_windowed]
resized_intervals = [resize_interval(interval, 1000) for interval in intervals]

# motif widths
for i in intervals:
    print(i.stop - i.start, i.name)

414 KLF4-Downstream_Enhancer_E1_mm10
692 KLF4-Downstream_Enhancer_E2_mm10
2114 Distal and Proximal Oct4 Enhancers, Combined
206 SOX2/OCT4 Binding Site on Distal Enhancer
205 SOX2/OCT4 Binding Site on Proximal Enhancer
4746 OCT4(POU5F1) Gene Region

bpnet = BPNetPredictor.from_mdir(model_dir)

WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead

2018-10-14 03:34:29,502 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead

WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

2018-10-14 03:34:38,379 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

run importance scores in BPNet and visualize all the regions¶

KLF4-Downstream_Enhancer_E1_mm10¶

bpnet.plot_predict_grad(resized_intervals[:1], ds, xlim=[350, 650])

[<Figure size 1440x1728 with 12 Axes>]

Screenshots from last time¶

KLF4-Downstream_Enhancer_E1_mm10¶

interval1

KLF4-Downstream_Enhancer_E2_mm10¶

KLF4-Downstream_Enhancer_E2_mm10

Distal and Proximal Oct4 Enhancers, Combined¶

Distal and Proximal Oct4 Enhancers, Combined

chr17	35503943	35506057	Distal and Proximal Oct4 Enhancers, Combined	0

implement instance finding in bpnet region view¶

pre-specify which patterns to use

oct4_enhancer = [Interval('chr17', 35504050-500, 35504050+500), Interval('chr17', 35505100-500, 35505100+500)]

use_intervals = intervals + oct4_enhancer + new_intervals_from_khyati

len(intervals + oct4_enhancer)

8

len(new_intervals_from_khyati)

9

# examples from Khyati start from 8 onwards

resized_intervals = [resize_interval(interval, 1000) for interval in use_intervals]

len(resized_intervals)

17

preds = bpnet.predict(resized_intervals)

pattern_names = [
    ("Oct4-Sox2", "metacluster_0/pattern_0"),
    ("Errb", "metacluster_0/pattern_1"),
    ("Sox2", "metacluster_0/pattern_2"),
    ("Nanog", "metacluster_0/pattern_3"),
    ("Klf4", "metacluster_2/pattern_0"),
    #("Klf4-1", "metacluster_2/pattern_2"),
    #("Klf4-2", "metacluster_2/pattern_3"),
]

modisco_dir = model_dir / "modisco/by_peak_tasks/weighted/Oct4"

mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()

seq = np.stack([preds[i]['seq'] for i in range(len(preds))])
contrib = {t: np.stack([mean(preds[i]['grads'][ti]['profile'].values()) for i in range(len(preds))]) * seq
           for ti, t in enumerate(bpnet.tasks)}

contrib['Oct4'].shape

(17, 1000, 4)

dfm_norm = pd.read_csv(modisco_dir / "centroid_seqlet_matches.csv")

dfm_norm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24133 entries, 0 to 24132
Data columns (total 29 columns):
Unnamed: 0            24133 non-null int64
pattern               24133 non-null object
example_idx           24133 non-null int64
pattern_start         24133 non-null int64
pattern_end           24133 non-null int64
strand                24133 non-null object
pattern_len           24133 non-null int64
pattern_center        24133 non-null int64
match_weighted        24133 non-null float64
match_weighted_p      0 non-null float64
match_weighted_cat    0 non-null float64
match_max             24133 non-null float64
match_max_task        24133 non-null object
imp_weighted          24133 non-null float64
imp_weighted_p        0 non-null float64
imp_weighted_cat      0 non-null float64
imp_max               24133 non-null float64
imp_max_task          24133 non-null object
seq_match             24133 non-null float64
seq_match_p           0 non-null float64
seq_match_cat         0 non-null float64
match/Klf4            24133 non-null float64
match/Nanog           24133 non-null float64
match/Oct4            24133 non-null float64
match/Sox2            24133 non-null float64
imp/Klf4              24133 non-null float64
imp/Nanog             24133 non-null float64
imp/Oct4              24133 non-null float64
imp/Sox2              24133 non-null float64
dtypes: float64(19), int64(6), object(4)
memory usage: 5.3+ MB

trim_frac = 0.08
n_jobs = 1
dfl = []
for tf, pattern_name in tqdm(pattern_names):
    pattern = mr.get_pattern(pattern_name).trim_seq_ic(trim_frac)
    match, importance = pattern.scan_importance(contrib, seq, tasks,
                                                n_jobs=n_jobs, verbose=False)
    seq_match = pattern.scan_seq(seq, n_jobs=n_jobs, verbose=False)
    dfm = pattern.get_instances(tasks, match, importance, seq_match, 
                                norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
                                verbose=False, plot=False)
    dfm['tf'] = tf
    dfl.append(dfm)
dfp = pd.concat(dfl)    
dfp = dfp[dfp.seq_match > 0]

100%|██████████| 5/5 [00:03<00:00,  1.28it/s]

from basepair.modisco.core import dfi2seqlets

def dfi2seqlets(dfi, short_name=False):
    """Convert the data-frame produced by pattern.get_instances()
    to a list of Seqlets

    Args:
      dfi: pd.DataFrame returned by pattern.get_instances()
      short_name: if True, short pattern name will be used for the seqlet name

    Returns:
      Seqlet list
    """
    def extract_name(row):
        if short_name:
            return shorten_pattern(row.pattern)
        else:
            return row.pattern

    return [Seqlet(row.example_idx,
                   row.pattern_start,
                   row.pattern_end,
                   row.tf,
                   row.strand)
            for i, row in dfi.iterrows()]

KLF4-Downstream_Enhancer_E1_mm10¶

example_idx = 0

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 150)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 650], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

KLF4-Downstream_Enhancer_E2_mm10¶

example_idx = 1

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 150)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 650], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

SOX2/OCT4 Binding Site on Distal Enhancer¶

example_idx = 3

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 150)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 650], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

SOX2/OCT4 Binding Site on Proximal Enhancer¶

example_idx = 4

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

Distal Oct4 Enhancer¶

example_idx = len(preds) - 2

example_idx

6

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

Proximal Oct4 Enhancer¶

example_idx = len(preds) - 1

example_idx

7

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

New regions from Khyati¶

Esrrb,Oct4,Sox2->Nanog¶

example_idx = 8

resized_intervals[example_idx]

Interval(chr6:122706940-122707940)

resized_intervals[example_idx].name

'Esrrb,Oct4,Sox2->Nanog'

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

Esrrb,Oct4,Sox2->Lefty¶

example_idx = 9

resized_intervals[example_idx]

Interval(chr1:180933374-180934374)

resized_intervals[example_idx].name

'Esrrb,Oct4,Sox2->Lefty'

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

Esrrb,Oct4,Sox2->REST¶

example_idx = 10

resized_intervals[example_idx]

Interval(chr5:77261824-77262824)

resized_intervals[example_idx].name

'Esrrb,Oct4,Sox2->REST'

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

Klf4,Pbx1,Oct4,Sox2->Nanog¶

example_idx = 11

resized_intervals[example_idx]

Interval(chr6:122706931-122707931)

resized_intervals[example_idx].name

'Klf4,Pbx1,Oct4,Sox2->Nanog'

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

Esrrb,Oct4,Sox2,Stat3->Klf4¶

example_idx = 12

resized_intervals[example_idx]

Interval(chr4:55475092-55476092)

resized_intervals[example_idx].name

'Esrrb,Oct4,Sox2,Stat3->Klf4'

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

Oct4,Sox2,Nanog,Klf4,NR5A2,Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)¶

example_idx = 13

resized_intervals[example_idx]

Interval(chr3:34756430-34757430)

resized_intervals[example_idx].name

'Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)'

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)¶

example_idx = 14

resized_intervals[example_idx]

Interval(chr3:34757600-34758600)

resized_intervals[example_idx].name

'Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)'

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)¶

example_idx = 15

resized_intervals[example_idx]

Interval(chr3:34760955-34761955)

resized_intervals[example_idx].name

'Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)'

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

Oct4,Sox2,Nanog,P300,Smad1->Sox2(prox)¶

example_idx = 16

resized_intervals[example_idx]

Interval(chr3:34653600-34654600)

resized_intervals[example_idx].name

'Oct4,Sox2,Nanog,P300,Smad1->Sox2(prox)'

query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]

seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)

[<Figure size 2160x1728 with 12 Axes>]

Other¶

bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[300, 700], fig_width=30, same_ylim=True)

pattern_name = 'metacluster_2/pattern_0'

pattern = mr.get_pattern(pattern_name).trim_seq_ic(trim_frac)
match, importance = pattern.scan_importance(contrib, seq, tasks,
                                            n_jobs=n_jobs, verbose=False)
seq_match = pattern.scan_seq(seq, n_jobs=n_jobs, verbose=False)
dfm = pattern.get_instances(tasks, match, importance, seq_match, 
                            norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
                            verbose=False, plot=False)
dfm['tf'] = tf

from kipoi.data_utils import get_dataset_item

from basepair.plot.tracks import filter_tracks, plot_tracks

def prefix_dict(d, prefix):
    return {prefix + d: v for d,v in d.items()}

tasks

['Oct4', 'Sox2', 'Nanog', 'Klf4']

plot_dict = {**{"m/" + t: match[0,:,ti].max(axis=-1) for ti, t in enumerate(tasks)}, **get_dataset_item(contrib, 0)}

plot_tracks(filter_tracks(plot_dict, [300, 700]));

	tf	pattern_center	match_weighted_p	match_weighted_cat	imp_weighted	imp_weighted_p	imp_weighted_cat	seq_match
5	Oct4-Sox2	494	0.454653	medium	1.049908	0.650254	medium	11.140099
17	Klf4	450	0.649865	medium	0.355373	0.414005	medium	7.205196
22	Klf4	630	0.279723	low	0.427455	0.545210	medium	4.420834

	tf	pattern_center	match_weighted_p	match_weighted_cat	imp_weighted	imp_weighted_p	imp_weighted_cat	seq_match
55	Oct4-Sox2	541	0.825381	high	1.708725	0.979357	high	10.428925
76	Nanog	515	0.399494	medium	0.420173	0.135272	low	3.137255
79	Nanog	556	0.883692	high	1.205132	0.935525	high	4.346340
82	Klf4	468	0.756445	high	0.551992	0.711427	high	8.679243
92	Klf4	592	0.302809	low	0.446627	0.571758	medium	7.278441

	tf	pattern_center	match_weighted_p	match_weighted_cat	imp_weighted	imp_weighted_p	imp_weighted_cat	seq_match
1775	Sox2	591	0.320000	low	0.431432	0.650566	medium	8.980809
97	Nanog	508	0.584071	medium	1.012207	0.845765	high	4.158890
133	Klf4	615	0.435937	medium	0.309316	0.313967	low	5.888764
138	Klf4	686	0.323971	low	0.326105	0.348211	medium	5.971005

	tf	pattern_center	match_weighted_p	match_weighted_cat	imp_weighted	imp_weighted_p	imp_weighted_cat	seq_match
91	Oct4-Sox2	506	0.804738	high	1.712109	0.979695	high	10.428925
141	Nanog	480	0.380531	medium	0.428680	0.141593	low	3.137255
144	Nanog	521	0.887484	high	1.250795	0.953224	high	4.346340
174	Klf4	433	0.768372	high	0.545754	0.701424	high	8.679243
184	Klf4	557	0.318969	low	0.460556	0.591381	medium	7.278441

	tf	pattern_center	match_weighted_p	match_weighted_cat	imp_weighted	imp_weighted_p	imp_weighted_cat	seq_match
2932	Sox2	574	0.346415	medium	0.441608	0.673208	high	8.980809
162	Nanog	491	0.589128	medium	1.029103	0.854614	high	4.158890
226	Klf4	598	0.440554	medium	0.314149	0.322432	low	5.888764
230	Klf4	669	0.335514	medium	0.340552	0.383609	medium	5.971005

	tf	pattern_center	match_weighted_p	match_weighted_cat	imp_weighted	imp_weighted_p	imp_weighted_cat	seq_match
29	Oct4-Sox2	521	0.249577	low	1.046371	0.647208	medium	7.115862
149	Errb	593	0.270833	low	0.240337	0.092593	low	7.765749

	tf	pattern_center	match_weighted_p	match_weighted_cat	imp_weighted	imp_weighted_p	imp_weighted_cat	seq_match
117	Oct4-Sox2	480	1.000000	high	1.594411	0.962944	high	11.306958
249	Klf4	542	0.517891	medium	0.494865	0.643324	medium	8.402104

	tf	pattern_center	match_weighted_p	match_weighted_cat	imp_weighted	imp_weighted_p	imp_weighted_cat	seq_match
128	Oct4-Sox2	502	0.673096	high	0.990582	0.582910	medium	12.995998
133	Oct4-Sox2	634	0.675973	high	0.362217	0.045685	low	7.886088

	tf	pattern_center	match_weighted_p	match_weighted_cat	imp_weighted	imp_weighted_p	imp_weighted_cat	seq_match
147	Oct4-Sox2	521	0.763621	high	1.153189	0.742470	high	12.113540
4084	Sox2	524	0.320000	low	0.643107	0.912453	high	3.451947
215	Nanog	571	0.312263	low	0.202250	0.008850	low	4.800385

	tf	pattern_center	match_weighted_p	match_weighted_cat	imp_weighted	imp_weighted_p	imp_weighted_cat	seq_match
179	Oct4-Sox2	483	0.247885	low	1.030580	0.625719	medium	7.115862
968	Errb	419	0.220679	low	0.165665	0.006173	low	7.988962
990	Errb	555	0.261574	low	0.273758	0.212963	low	7.765749

	tf	pattern_center	match_weighted_p	match_weighted_cat	imp_weighted	imp_weighted_p	imp_weighted_cat	seq_match
276	Nanog	473	0.591656	medium	0.602352	0.407080	medium	6.329269
281	Nanog	547	0.441214	medium	0.254193	0.025284	low	6.368482

	tf	pattern_center	match_weighted_p	match_weighted_cat	imp_weighted	imp_weighted_p	imp_weighted_cat	seq_match
243	Oct4-Sox2	455	0.624365	medium	1.269355	0.833164	high	11.758594
6342	Sox2	451	0.391698	medium	0.718507	0.950189	high	6.271067