Goal

  • visualize the importance scores + motif instances for known enhancers

Tasks

  • [x] get the enhancer regions
    • strided and unstrided
  • [x] paste the screenshots from last time
  • [x] run importance scores in BPNet and visualize all the regions
  • [x] implement instance finding in bpnet region view
    • pre-specify which patterns to use
  • [x] plot for all

get the enhancer regions

  • strided and unstrided
In [3]:
import basepair
from basepair.config import get_data_dir, create_tf_session
from keras.models import load_model
from basepair.datasets import *
from basepair import datasets
from basepair.preproc import AppendTotalCounts, transform_data, resize_interval
from basepair.plots import regression_eval
from basepair.BPNet import BPNetPredictor
from basepair.utils import write_pkl
from basepair.imports import *
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
2018-10-14 03:32:10,384 [WARNING] doc empty for the `info:` field
In [4]:
# from basepair.preproc import resize_interval
In [96]:
from kipoiseq.transforms.functional import resize_interval
In [5]:
create_tf_session(0)
Out[5]:
<tensorflow.python.client.session.Session at 0x7f6027ae9240>
In [6]:
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
In [7]:
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
In [30]:
genome_file = "/mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes"
klf4_bed = "/users/avsec/workspace/basepair-workflow/data/klf4_sites_mm10.bed"
oct4_bed = "/users/avsec/workspace/basepair-workflow/data/oct4_sites_mm10.bed"
klf4_oct4_windowed_bed = "/users/avsec/workspace/basepair-workflow/data/klf4_oct4_windowed_sites_mm10.1kb.bed"
In [31]:
ds = DataSpec.load(model_dir / 'dataspec.yaml')
In [32]:
cat {klf4_bed} {oct4_bed} 
chr4	55477346	55477760	KLF4-Downstream_Enhancer_E1_mm10	0	.
chr4	55475208	55475900	KLF4-Downstream_Enhancer_E2_mm10	0	.
chr17	35503943	35506057	Distal and Proximal Oct4 Enhancers, Combined	0	.
chr17	35503912	35504118	SOX2/OCT4 Binding Site on Distal Enhancer	0	.
chr17	35504981	35505186	SOX2/OCT4 Binding Site on Proximal Enhancer	0	.
chr17	35506031	35510777	OCT4(POU5F1) Gene Region	0	.
In [33]:
# make windowed regions
!cat {klf4_bed} {oct4_bed} | bedtools makewindows -w 1000  -b stdin -i srcwinnum >  {klf4_oct4_windowed_bed}

Regions from Khyati. Google spreadsheet link

In [58]:
regions_from_khyati = """
chr6	122,707,340	122,707,540	Esrrb,Oct4,Sox2->Nanog
chr1	180,933,774	180,933,974	Esrrb,Oct4,Sox2->Lefty
chr5	77262224	77,262,424	Esrrb,Oct4,Sox2->REST
chr6	122707331	122,707,531	Klf4,Pbx1,Oct4,Sox2->Nanog
chr4	55,475,492	55,475,692	Esrrb,Oct4,Sox2,Stat3->Klf4
chr3	34,756,830	34,757,030	Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)
chr3	34,758,000	34,758,200	Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)
chr3	34,761,355	34,761,555	Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)
chr3	34,654,000	34,654,200	Oct4,Sox2,Nanog,P300,Smad1->Sox2(prox)
"""
In [59]:
from io import StringIO
In [86]:
df = pd.read_csv(StringIO(regions_from_khyati), sep='\t', header=None)
df.columns = ['chrom', 'start', 'stop', 'name']
df['start'] = df['start'].str.replace(",", "").astype(int)
df['stop'] = df['stop'].str.replace(",", "").astype(int)
df['score'] = 0
df['strand'] = "."
In [88]:
df["stop"] - df["start"]
Out[88]:
0    200
1    200
2    200
3    200
4    200
5    200
6    200
7    200
8    200
dtype: int64
In [89]:
new_intervals_from_khyati = list(BedTool.from_dataframe(df))
In [62]:
# TODO get the interesting regions from the genome browser
In [63]:
bt = BedTool(klf4_bed).cat(BedTool(oct4_bed), postmerge=False)
intervals = list(bt)
bt_windowed = BedTool(klf4_oct4_windowed_bed)
intervals_windowed = [resize_interval(interval, 1000) for interval in bt_windowed]
resized_intervals = [resize_interval(interval, 1000) for interval in intervals]
In [13]:
# motif widths
for i in intervals:
    print(i.stop - i.start, i.name)
414 KLF4-Downstream_Enhancer_E1_mm10
692 KLF4-Downstream_Enhancer_E2_mm10
2114 Distal and Proximal Oct4 Enhancers, Combined
206 SOX2/OCT4 Binding Site on Distal Enhancer
205 SOX2/OCT4 Binding Site on Proximal Enhancer
4746 OCT4(POU5F1) Gene Region
In [14]:
bpnet = BPNetPredictor.from_mdir(model_dir)
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-10-14 03:34:29,502 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-10-14 03:34:38,379 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

run importance scores in BPNet and visualize all the regions

KLF4-Downstream_Enhancer_E1_mm10

In [22]:
bpnet.plot_predict_grad(resized_intervals[:1], ds, xlim=[350, 650])
Out[22]:
[<Figure size 1440x1728 with 12 Axes>]

Screenshots from last time

KLF4-Downstream_Enhancer_E1_mm10

interval1

KLF4-Downstream_Enhancer_E2_mm10

KLF4-Downstream_Enhancer_E2_mm10

Distal and Proximal Oct4 Enhancers, Combined

Distal and Proximal Oct4 Enhancers, Combined

In [ ]:
chr17	35503943	35506057	Distal and Proximal Oct4 Enhancers, Combined	0	

implement instance finding in bpnet region view

  • pre-specify which patterns to use
In [36]:
oct4_enhancer = [Interval('chr17', 35504050-500, 35504050+500), Interval('chr17', 35505100-500, 35505100+500)]
In [92]:
use_intervals = intervals + oct4_enhancer + new_intervals_from_khyati
In [110]:
len(intervals + oct4_enhancer)
Out[110]:
8
In [111]:
len(new_intervals_from_khyati)
Out[111]:
9
In [112]:
# examples from Khyati start from 8 onwards
In [97]:
resized_intervals = [resize_interval(interval, 1000) for interval in use_intervals]
In [98]:
len(resized_intervals)
Out[98]:
17
In [99]:
preds = bpnet.predict(resized_intervals)
In [100]:
pattern_names = [
    ("Oct4-Sox2", "metacluster_0/pattern_0"),
    ("Errb", "metacluster_0/pattern_1"),
    ("Sox2", "metacluster_0/pattern_2"),
    ("Nanog", "metacluster_0/pattern_3"),
    ("Klf4", "metacluster_2/pattern_0"),
    #("Klf4-1", "metacluster_2/pattern_2"),
    #("Klf4-2", "metacluster_2/pattern_3"),
]
In [101]:
modisco_dir = model_dir / "modisco/by_peak_tasks/weighted/Oct4"
In [102]:
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
In [103]:
seq = np.stack([preds[i]['seq'] for i in range(len(preds))])
contrib = {t: np.stack([mean(preds[i]['grads'][ti]['profile'].values()) for i in range(len(preds))]) * seq
           for ti, t in enumerate(bpnet.tasks)}
In [104]:
contrib['Oct4'].shape
Out[104]:
(17, 1000, 4)
In [105]:
dfm_norm = pd.read_csv(modisco_dir / "centroid_seqlet_matches.csv")
In [106]:
dfm_norm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24133 entries, 0 to 24132
Data columns (total 29 columns):
Unnamed: 0            24133 non-null int64
pattern               24133 non-null object
example_idx           24133 non-null int64
pattern_start         24133 non-null int64
pattern_end           24133 non-null int64
strand                24133 non-null object
pattern_len           24133 non-null int64
pattern_center        24133 non-null int64
match_weighted        24133 non-null float64
match_weighted_p      0 non-null float64
match_weighted_cat    0 non-null float64
match_max             24133 non-null float64
match_max_task        24133 non-null object
imp_weighted          24133 non-null float64
imp_weighted_p        0 non-null float64
imp_weighted_cat      0 non-null float64
imp_max               24133 non-null float64
imp_max_task          24133 non-null object
seq_match             24133 non-null float64
seq_match_p           0 non-null float64
seq_match_cat         0 non-null float64
match/Klf4            24133 non-null float64
match/Nanog           24133 non-null float64
match/Oct4            24133 non-null float64
match/Sox2            24133 non-null float64
imp/Klf4              24133 non-null float64
imp/Nanog             24133 non-null float64
imp/Oct4              24133 non-null float64
imp/Sox2              24133 non-null float64
dtypes: float64(19), int64(6), object(4)
memory usage: 5.3+ MB
In [107]:
trim_frac = 0.08
n_jobs = 1
dfl = []
for tf, pattern_name in tqdm(pattern_names):
    pattern = mr.get_pattern(pattern_name).trim_seq_ic(trim_frac)
    match, importance = pattern.scan_importance(contrib, seq, tasks,
                                                n_jobs=n_jobs, verbose=False)
    seq_match = pattern.scan_seq(seq, n_jobs=n_jobs, verbose=False)
    dfm = pattern.get_instances(tasks, match, importance, seq_match, 
                                norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
                                verbose=False, plot=False)
    dfm['tf'] = tf
    dfl.append(dfm)
dfp = pd.concat(dfl)    
dfp = dfp[dfp.seq_match > 0]
100%|██████████| 5/5 [00:03<00:00,  1.28it/s]
In [108]:
from basepair.modisco.core import dfi2seqlets
In [109]:
def dfi2seqlets(dfi, short_name=False):
    """Convert the data-frame produced by pattern.get_instances()
    to a list of Seqlets

    Args:
      dfi: pd.DataFrame returned by pattern.get_instances()
      short_name: if True, short pattern name will be used for the seqlet name

    Returns:
      Seqlet list
    """
    def extract_name(row):
        if short_name:
            return shorten_pattern(row.pattern)
        else:
            return row.pattern

    return [Seqlet(row.example_idx,
                   row.pattern_start,
                   row.pattern_end,
                   row.tf,
                   row.strand)
            for i, row in dfi.iterrows()]

KLF4-Downstream_Enhancer_E1_mm10

In [306]:
example_idx = 0
In [307]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 150)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[307]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
5 Oct4-Sox2 494 0.454653 medium 1.049908 0.650254 medium 11.140099
17 Klf4 450 0.649865 medium 0.355373 0.414005 medium 7.205196
22 Klf4 630 0.279723 low 0.427455 0.545210 medium 4.420834
In [308]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [309]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 650], fig_width=30, same_ylim=True)
Out[309]:
[<Figure size 2160x1728 with 12 Axes>]

KLF4-Downstream_Enhancer_E2_mm10

In [228]:
example_idx = 1
In [229]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 150)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[229]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
29 Oct4-Sox2 521 0.249577 low 1.046371 0.647208 medium 7.115862
149 Errb 593 0.270833 low 0.240337 0.092593 low 7.765749
In [230]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [231]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 650], fig_width=30, same_ylim=True)
Out[231]:
[<Figure size 2160x1728 with 12 Axes>]

SOX2/OCT4 Binding Site on Distal Enhancer

In [232]:
example_idx = 3
In [233]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 150)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[233]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
55 Oct4-Sox2 541 0.825381 high 1.708725 0.979357 high 10.428925
76 Nanog 515 0.399494 medium 0.420173 0.135272 low 3.137255
79 Nanog 556 0.883692 high 1.205132 0.935525 high 4.346340
82 Klf4 468 0.756445 high 0.551992 0.711427 high 8.679243
92 Klf4 592 0.302809 low 0.446627 0.571758 medium 7.278441
In [234]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [235]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 650], fig_width=30, same_ylim=True)
Out[235]:
[<Figure size 2160x1728 with 12 Axes>]

SOX2/OCT4 Binding Site on Proximal Enhancer

In [238]:
example_idx = 4
In [239]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[239]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
1775 Sox2 591 0.320000 low 0.431432 0.650566 medium 8.980809
97 Nanog 508 0.584071 medium 1.012207 0.845765 high 4.158890
133 Klf4 615 0.435937 medium 0.309316 0.313967 low 5.888764
138 Klf4 686 0.323971 low 0.326105 0.348211 medium 5.971005
In [240]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [241]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)
Out[241]:
[<Figure size 2160x1728 with 12 Axes>]

Distal Oct4 Enhancer

In [283]:
example_idx = len(preds) - 2
In [284]:
example_idx
Out[284]:
6
In [285]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[285]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
91 Oct4-Sox2 506 0.804738 high 1.712109 0.979695 high 10.428925
141 Nanog 480 0.380531 medium 0.428680 0.141593 low 3.137255
144 Nanog 521 0.887484 high 1.250795 0.953224 high 4.346340
174 Klf4 433 0.768372 high 0.545754 0.701424 high 8.679243
184 Klf4 557 0.318969 low 0.460556 0.591381 medium 7.278441
In [286]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [287]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)
Out[287]:
[<Figure size 2160x1728 with 12 Axes>]

Proximal Oct4 Enhancer

In [310]:
example_idx = len(preds) - 1
In [311]:
example_idx
Out[311]:
7
In [312]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[312]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
2932 Sox2 574 0.346415 medium 0.441608 0.673208 high 8.980809
162 Nanog 491 0.589128 medium 1.029103 0.854614 high 4.158890
226 Klf4 598 0.440554 medium 0.314149 0.322432 low 5.888764
230 Klf4 669 0.335514 medium 0.340552 0.383609 medium 5.971005
In [313]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [314]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)
Out[314]:
[<Figure size 2160x1728 with 12 Axes>]

New regions from Khyati

Esrrb,Oct4,Sox2->Nanog

In [122]:
example_idx = 8
In [123]:
resized_intervals[example_idx]
Out[123]:
Interval(chr6:122706940-122707940)
In [124]:
resized_intervals[example_idx].name
Out[124]:
'Esrrb,Oct4,Sox2->Nanog'
In [125]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[125]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
117 Oct4-Sox2 480 1.000000 high 1.594411 0.962944 high 11.306958
249 Klf4 542 0.517891 medium 0.494865 0.643324 medium 8.402104
In [126]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [127]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)
Out[127]:
[<Figure size 2160x1728 with 12 Axes>]

Esrrb,Oct4,Sox2->Lefty

In [131]:
example_idx = 9
In [132]:
resized_intervals[example_idx]
Out[132]:
Interval(chr1:180933374-180934374)
In [133]:
resized_intervals[example_idx].name
Out[133]:
'Esrrb,Oct4,Sox2->Lefty'
In [134]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[134]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
128 Oct4-Sox2 502 0.673096 high 0.990582 0.582910 medium 12.995998
133 Oct4-Sox2 634 0.675973 high 0.362217 0.045685 low 7.886088
In [135]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [136]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)
Out[136]:
[<Figure size 2160x1728 with 12 Axes>]

Esrrb,Oct4,Sox2->REST

In [140]:
example_idx = 10
In [141]:
resized_intervals[example_idx]
Out[141]:
Interval(chr5:77261824-77262824)
In [142]:
resized_intervals[example_idx].name
Out[142]:
'Esrrb,Oct4,Sox2->REST'
In [143]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[143]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
147 Oct4-Sox2 521 0.763621 high 1.153189 0.742470 high 12.113540
4084 Sox2 524 0.320000 low 0.643107 0.912453 high 3.451947
215 Nanog 571 0.312263 low 0.202250 0.008850 low 4.800385
In [144]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [145]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)
Out[145]:
[<Figure size 2160x1728 with 12 Axes>]

Klf4,Pbx1,Oct4,Sox2->Nanog

In [149]:
example_idx = 11
In [150]:
resized_intervals[example_idx]
Out[150]:
Interval(chr6:122706931-122707931)
In [151]:
resized_intervals[example_idx].name
Out[151]:
'Klf4,Pbx1,Oct4,Sox2->Nanog'
In [152]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[152]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
157 Oct4-Sox2 489 1.000000 high 1.59565 0.962944 high 11.306958
355 Klf4 551 0.532128 medium 0.49836 0.647557 medium 8.402104
In [153]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [154]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)
Out[154]:
[<Figure size 2160x1728 with 12 Axes>]

Esrrb,Oct4,Sox2,Stat3->Klf4

In [158]:
example_idx = 12
In [159]:
resized_intervals[example_idx]
Out[159]:
Interval(chr4:55475092-55476092)
In [160]:
resized_intervals[example_idx].name
Out[160]:
'Esrrb,Oct4,Sox2,Stat3->Klf4'
In [161]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[161]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
179 Oct4-Sox2 483 0.247885 low 1.030580 0.625719 medium 7.115862
968 Errb 419 0.220679 low 0.165665 0.006173 low 7.988962
990 Errb 555 0.261574 low 0.273758 0.212963 low 7.765749
In [162]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [163]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)
Out[163]:
[<Figure size 2160x1728 with 12 Axes>]

Oct4,Sox2,Nanog,Klf4,NR5A2,Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)

In [164]:
example_idx = 13
In [165]:
resized_intervals[example_idx]
Out[165]:
Interval(chr3:34756430-34757430)
In [166]:
resized_intervals[example_idx].name
Out[166]:
'Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)'
In [167]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[167]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
276 Nanog 473 0.591656 medium 0.602352 0.407080 medium 6.329269
281 Nanog 547 0.441214 medium 0.254193 0.025284 low 6.368482
In [168]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [169]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)
Out[169]:
[<Figure size 2160x1728 with 12 Axes>]

Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)

In [170]:
example_idx = 14
In [171]:
resized_intervals[example_idx]
Out[171]:
Interval(chr3:34757600-34758600)
In [172]:
resized_intervals[example_idx].name
Out[172]:
'Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)'
In [173]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[173]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
1140 Errb 363 0.791667 high 0.202987 0.030093 low 10.243579
In [174]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [175]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)
Out[175]:
[<Figure size 2160x1728 with 12 Axes>]

Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)

In [176]:
example_idx = 15
In [177]:
resized_intervals[example_idx]
Out[177]:
Interval(chr3:34760955-34761955)
In [178]:
resized_intervals[example_idx].name
Out[178]:
'Oct4,Sox2,Nanog,Klf4,NR5A2, Sat3,Esrrb,Smad1,Ncoa3->Sox2(dist)'
In [179]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[179]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
1214 Errb 344 0.334105 medium 0.176842 0.009259 low 9.71271
In [180]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [181]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)
Out[181]:
[<Figure size 2160x1728 with 12 Axes>]

Oct4,Sox2,Nanog,P300,Smad1->Sox2(prox)

In [182]:
example_idx = 16
In [183]:
resized_intervals[example_idx]
Out[183]:
Interval(chr3:34653600-34654600)
In [184]:
resized_intervals[example_idx].name
Out[184]:
'Oct4,Sox2,Nanog,P300,Smad1->Sox2(prox)'
In [185]:
query = (dfp.match_weighted_p > 0.2) & (dfp.imp_weighted_p > 0) & (dfp.example_idx == example_idx)
dfp[query & (np.abs(dfp.pattern_center - 500) < 200)][['tf', 'pattern_center', 'match_weighted_p', 'match_weighted_cat', 
                           'imp_weighted', 'imp_weighted_p', 'imp_weighted_cat','seq_match']]
Out[185]:
tf pattern_center match_weighted_p match_weighted_cat imp_weighted imp_weighted_p imp_weighted_cat seq_match
243 Oct4-Sox2 455 0.624365 medium 1.269355 0.833164 high 11.758594
6342 Sox2 451 0.391698 medium 0.718507 0.950189 high 6.271067
In [186]:
seqlets = dfi2seqlets(dfp[(dfp.example_idx == example_idx) & query])
In [187]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[400, 700], fig_width=30, same_ylim=True)
Out[187]:
[<Figure size 2160x1728 with 12 Axes>]

Other

In [ ]:
bpnet.plot_predict_grad([resized_intervals[example_idx]], ds, seqlets=seqlets, xlim=[300, 700], fig_width=30, same_ylim=True)
In [150]:
pattern_name = 'metacluster_2/pattern_0'
In [151]:
pattern = mr.get_pattern(pattern_name).trim_seq_ic(trim_frac)
match, importance = pattern.scan_importance(contrib, seq, tasks,
                                            n_jobs=n_jobs, verbose=False)
seq_match = pattern.scan_seq(seq, n_jobs=n_jobs, verbose=False)
dfm = pattern.get_instances(tasks, match, importance, seq_match, 
                            norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
                            verbose=False, plot=False)
dfm['tf'] = tf
In [152]:
from kipoi.data_utils import get_dataset_item
In [153]:
from basepair.plot.tracks import filter_tracks, plot_tracks
In [154]:
def prefix_dict(d, prefix):
    return {prefix + d: v for d,v in d.items()}
In [155]:
tasks
Out[155]:
['Oct4', 'Sox2', 'Nanog', 'Klf4']
In [156]:
plot_dict = {**{"m/" + t: match[0,:,ti].max(axis=-1) for ti, t in enumerate(tasks)}, **get_dataset_item(contrib, 0)}
In [157]:
plot_tracks(filter_tracks(plot_dict, [300, 700]));