Goal

  • Figure out which importance-score variant to use, by comparing the score tracks from different model configurations on a known enhancer region.

Tasks

  • [ ] Compare the importance-score tracks across model configurations and pick one

Required files

-

In [1]:
# Imports
# NOTE(review): the wildcard import presumably provides Path, OrderedDict, ddir,
# DataSpec, BPNet, plot_tracks, filter_tracks, get_figsize, paper_config, etc.,
# which are used below without explicit imports — confirm in basepair.imports.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
Using TensorFlow backend.
In [2]:
import pybedtools
from basepair.utils import flatten_list
paper_config()  # presumably sets paper-wide matplotlib styling — confirm in basepair.imports
In [3]:
create_tf_session(0)
Out[3]:
<tensorflow.python.client.session.Session at 0x7fb067cd2c88>
In [4]:
# Common paths (`ddir` comes from the wildcard import above)
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
# Alternative model directory (oct-sox-nanog-klf-sall variant):
# model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf-sall/models/default/")
modisco_dir = model_dir / f"modisco/all/profile/"  # NOTE(review): not used in the visible cells
figures = f"{ddir}/figures/model-evaluation/chipnexus-bpnet"

# Ensure the output directory for enhancer-profile figures exists
!mkdir -p {figures}/known_enhancer_profiles
In [5]:
ds = DataSpec.load(model_dir / "dataspec.yaml")
In [6]:
# Region of interest: a 1 kb window on chr17.
# NOTE(review): the previous "# Get counts" header was inaccurate — counts are
# loaded in a later cell; this cell only defines the interval.
interval = pybedtools.create_interval_from_list(['chr17', 35503550, 35504550])

In [ ]:
# NOTE(review): dead on a fresh kernel — several cells below reference `bpnet`
# and `pred`, so this cell must be uncommented (or those cells removed) before
# Restart & Run All will succeed.
# bpnet = BPNet.from_mdir(model_dir)

# # Get predictions
# pred = bpnet.predict_intervals([interval], imp_method='deeplift')[0]
In [ ]:
tasks = bpnet.tasks
In [ ]:
obs = {task: ds.task_specs[task].load_counts([interval])[0] for task in tasks}
In [ ]:
# Ordered track dict: per task — predicted profile, observed counts, and the
# weighted importance scores projected onto the one-hot sequence.
# NOTE(review): depends on `pred`/`bpnet` from the commented-out cell above.
viz_dict = OrderedDict(flatten_list([[
                    (f"{task} Pred", pred['pred'][task]),
                    (f"{task} Obs", obs[task]),
                    (f"{task} Imp profile", pred['imp_score'][f"{task}/weighted"] * pred['seq']),
                    # (f"{task} Imp counts", sum(pred['grads'][task_idx]['counts'].values()) / 2 * seq),
                ] for task_idx, task in enumerate(bpnet.tasks)]))

# Restrict tracks to the [420, 575] window (filter_tracks semantics — confirm)
viz_dict = filter_tracks(viz_dict, [420, 575])
In [ ]:
# Shared per-feature y-limits so the same feature type is directly comparable
# across tasks: importance tracks keep their (possibly negative) minimum,
# count/prediction tracks are anchored at 0.
fmax = {}
fmin = {}
for feature in ['Imp profile', 'Obs', 'Pred']:
    track_values = [viz_dict[f"{task} {feature}"] for task in bpnet.tasks]
    fmax[feature] = max(v.max() for v in track_values)
    fmin[feature] = min(v.min() for v in track_values)


def _track_limits(track_name):
    # Feature name is everything after the task prefix, e.g. "Oct4 Imp profile"
    feature = track_name.split(" ", 1)[1]
    return (fmin[feature], fmax[feature]) if "Imp" in feature else (0, fmax[feature])

ylim = [_track_limits(k) for k in viz_dict]
In [ ]:
# Plot all tracks for the interval with the shared y-limits computed above;
# assigned to `fig` so it can be saved to the figures directory if needed.
fig = plot_tracks(viz_dict,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  ylim=ylim,
                  legend=False)

Load the seq-model

In [7]:
# NOTE(review): dead experiment kept for reference — loading an older seq-model
# directly from a run directory. Superseded by SeqModel.from_mdir below;
# consider deleting this cell.
# # mdir2 = Path('/users/avsec//workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/gt/run-20190219_113014-dgg341ac')

# # bpnet2_model = load_model(str(mdir2/ "model.h5"))

# m2_config = read_json(mdir2/ 'config.gin.json')

# input_seqlen=seq_bpnet_cropped_extra_seqlen(conv1_kernel_size=m2_config['conv1_kernel_size'],
#                                             n_dil_layers=m2_config['n_dil_layers'],
#                                             target_seqlen=m2_config['target_seqlen'],
#                                             tconv_kernel_size=m2_config['tconv_kernel_size'])

# bpnet2 = BPNet(bpnet2_model, fasta_file=ds.fasta_file)

# bpnet2.imp_score(seq, task='Oct4', strand='both', method='deeplift', pred_summary='l2')

# mdir2 = Path('/users/avsec//workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/gt/run-20190204_113238-do8kugq4')

In [7]:
# NOTE(review): `unflatten` is imported again in a later cell (redundant);
# `model2tasks` and `seq_bpnet_cropped_extra_seqlen` appear unused in the
# visible cells.
from basepair.BPNet import model2tasks
from basepair.models import seq_bpnet_cropped_extra_seqlen
from basepair.preproc import resize_interval
from basepair.seqmodel import SeqModel
from basepair.utils import unflatten
from genomelake.extractors import FastaExtractor
In [8]:
# mdir2 = Path('../train/seqmodel/output/seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE')
In [9]:
mdir2 = Path('../train/seqmodel/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE')
In [10]:
# mdir2 = Path('../train/seqmodel/output/nexus,peaks,OSNK,0,10,1,TRUE,valid,0.5,64,25,0.004,9,FALSE')
In [11]:
# mdir2 = Path('../train/seqmodel/output/nexus,peaks,OSNK,0,10,1,FALSE,valid,0.5,64,25,0.004,9,')
In [12]:
sm = SeqModel.from_mdir(mdir2)
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2019-03-12 12:44:32,628 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2019-03-12 12:44:45,621 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
In [13]:
input_seqlen = 1000 - sm.body.get_len_change()  - sm.heads[0].net.get_len_change()
In [14]:
fe = FastaExtractor(ds.fasta_file)
In [15]:
seq = fe([resize_interval(interval, input_seqlen)])
In [16]:
# Importance (contribution) scores for all tasks/heads at once via DeepLIFT
# (see DeepExplain log output below); preact_only=True presumably explains
# pre-activation outputs — confirm in SeqModel.imp_score_all.
imp_scores = sm.imp_score_all(seq, preact_only=True)
# x = sm.neutral_bias_inputs(1000, 1000)
# x['seq'] = seq
# preds = sm.predict(x)
WARNING:tensorflow:From /users/avsec/workspace/basepair/basepair/heads.py:323: calling softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.
Instructions for updating:
dim is deprecated, use axis instead
2019-03-12 12:44:50,734 [WARNING] From /users/avsec/workspace/basepair/basepair/heads.py:323: calling softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.
Instructions for updating:
dim is deprecated, use axis instead
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
In [17]:
# Predict with neutral bias inputs so only the sequence drives the prediction
# (neutral_bias_inputs semantics assumed — confirm in SeqModel)
x = sm.neutral_bias_inputs(1000, 1000)
x['seq'] = seq
preds = sm.predict(x)
In [18]:
# TODO - move this helper into the basepair library (shared preprocessing)
def trim_seq(seq_width, peak_width):
    """Compute the (start, end) slice that center-crops a sequence of length
    `seq_width` down to `peak_width`.

    Args:
        seq_width: length of the model input sequence (must be >= peak_width)
        peak_width: desired output length

    Returns:
        (trim_start, trim_end) with trim_end - trim_start == peak_width

    Raises:
        ValueError: if seq_width < peak_width
        AssertionError: if the excess length is odd (cannot trim symmetrically)
    """
    if seq_width < peak_width:
        raise ValueError("seq_width < peak_width")
    # A symmetric crop needs an even excess; this also covers the
    # seq_width == peak_width case (excess 0), removing the redundant
    # equal-width branch of the original implementation.
    assert (seq_width - peak_width) % 2 == 0
    trim_start = (seq_width - peak_width) // 2
    return trim_start, trim_start + peak_width
In [19]:
# Trim indices mapping the model input window back onto the 1000 bp peak
# (here input_seqlen == 1000, so no trimming — see Out below)
trim_i,trim_j = trim_seq(input_seqlen, 1000)
In [20]:
trim_i,trim_j
Out[20]:
(0, 1000)

Pred and observed

In [21]:
from basepair.utils import unflatten
In [22]:
tasks = sm.tasks
In [23]:
obs = {task: ds.task_specs[task].load_counts([interval])[0] for task in tasks}
In [24]:
# Track dict for the seq-model: predicted profile, observed counts, plus one
# importance track per scoring method returned by imp_score_all (each score
# projected onto the one-hot sequence and trimmed to the output window).
viz_dict = OrderedDict(flatten_list([[
                    (f"{task} Pred", preds[f"{task}/profile"][0]),
                    (f"{task} Obs", obs[task]),
                    # (f"{task} Imp counts", sum(pred['grads'][task_idx]['counts'].values()) / 2 * seq),
                ]  + [(f"{task} {imp_score} Imp profile", (v * seq)[0, trim_i:trim_j])
                      for imp_score,v in unflatten(imp_scores, "/")[task]['profile'].items()
                     ]
    for task_idx, task in enumerate(tasks)]))

# Restrict tracks to the [420, 575] window (filter_tracks semantics — confirm)
viz_dict = filter_tracks(viz_dict, [420, 575])
In [25]:
from kipoi.utils import unique_list
In [26]:
features = unique_list([k.split(" ", 1)[1] for k in viz_dict])
In [27]:
# Per-feature y-limits shared across tasks so equivalent tracks are directly
# comparable: importance tracks keep their (possibly negative) minimum, while
# count/prediction tracks are anchored at 0.
fmax = {}
fmin = {}
for feature in features:
    vals = [viz_dict[f"{task} {feature}"] for task in tasks]
    fmax[feature] = max(v.max() for v in vals)
    fmin[feature] = min(v.min() for v in vals)


def _track_ylim(track_name):
    # Feature is everything after the task prefix in the track name
    feat = track_name.split(" ", 1)[1]
    return (fmin[feat], fmax[feat]) if "Imp" in feat else (0, fmax[feat])

ylim = [_track_ylim(k) for k in viz_dict]
In [28]:
a = sm.all_heads['Oct4'][0]
In [29]:
seq.shape
Out[29]:
(1, 1000, 4)

New ChIP-nexus, same padding, separate prediction, no batchnorm, using data augmentation, bias-corrected

In [30]:
# NOTE(review): this identical plot cell is duplicated many times below, each
# copy kept only for its stale output from a different model/run (see the
# markdown headings). On a fresh kernel they all re-render the same figure —
# consider saving each run's figure to {figures} instead of keeping stale cells.
fig = plot_tracks(viz_dict,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  ylim=ylim,
                  legend=False)

New ChIP-nexus, same padding, separate prediction, no batchnorm, using data augmentation

In [33]:
fig = plot_tracks(viz_dict,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  ylim=ylim,
                  legend=False)

New Nexus, same padding, separate prediction, no batchnorm

In [27]:
fig = plot_tracks(viz_dict,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  ylim=ylim,
                  legend=False)

New ChIP-seq, same padding, separate prediction, no batchnorm

In [29]:
fig = plot_tracks(viz_dict,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  ylim=ylim,
                  legend=False)

Valid padding, joint prediction (only profile), no batchnorm

In [27]:
fig = plot_tracks(viz_dict,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  ylim=ylim,
                  legend=False)

Valid padding, separated predictions (profile and counts), no batchnorm

In [26]:
fig = plot_tracks(viz_dict,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  ylim=ylim,
                  legend=False)

Same padding, separated predictions, no batchnorm

In [27]:
fig = plot_tracks(viz_dict,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  ylim=ylim,
                  legend=False)

Same padding, separated predictions, with batchnorm

In [34]:
fig = plot_tracks(viz_dict,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  ylim=ylim,
                  legend=False)
In [30]:
fig = plot_tracks(viz_dict,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  ylim=ylim,
                  legend=False)
In [42]:
fig = plot_tracks(viz_dict,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  ylim=ylim,
                  legend=False)
In [81]:
fig = plot_tracks(viz_dict,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  ylim=ylim,
                  legend=False)

Only observed

In [111]:
# Observed-only view plus weighted/count importance tracks.
# NOTE(review): references `pred` and `bpnet`, which are only defined if the
# commented-out BPNet cell near the top is restored — fails on a fresh kernel.
viz_dict = OrderedDict(flatten_list([[
                    # (f"{task} Pred", pred['pred'][task]),
                    (f"{task} Obs", obs[task]),
                    (f"{task} Imp profile", pred['imp_score'][f"{task}/weighted"] * pred['seq']),
                    (f"{task} Imp counts", pred['imp_score'][f"{task}/count"] * pred['seq']),
                    # (f"{task} Imp counts", sum(pred['grads'][task_idx]['counts'].values()) / 2 * seq),
                ] for task_idx, task in enumerate(bpnet.tasks)]))

viz_dict = filter_tracks(viz_dict, [420, 575])
In [112]:
# NOTE(review): third copy of the fmax/fmin/ylim computation in this notebook —
# worth extracting into a helper, e.g. compute_ylim(viz_dict, tasks, features).
fmax = {feature: max([viz_dict[f"{task} {feature}"].max() for task in bpnet.tasks])
        for feature in ['Imp profile', 'Imp counts', 'Obs']}  # 'Pred', 
fmin = {feature: min([viz_dict[f"{task} {feature}"].min() for task in bpnet.tasks])
        for feature in ['Imp profile', 'Imp counts', 'Obs']}  # 'Pred', 


ylim = []
for k in viz_dict:
    f = k.split(" ", 1)[1]
    if "Imp" in f:
        # Importance scores can be negative — keep the true minimum
        ylim.append((fmin[f], fmax[f]))
    else:
        # Count/prediction tracks start at 0
        ylim.append((0, fmax[f]))
In [113]:
fig = plot_tracks(viz_dict,
                  #seqlets=shifted_seqlets,
                  title="{i.chrom}:{i.start}-{i.end}, {i.name}".format(i=interval),
                  fig_height_per_track=0.5,
                  rotate_y=0,
                  fig_width=get_figsize(frac=1)[0],
                  ylim=ylim,
                  legend=False)