In [1]:
from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    ("Oct4", 'Oct4/m0_p1'),
    # ("Strange-sym-motif", 'Oct4/m0_p5'),
    ("Sox2", 'Sox2/m0_p1'),
    ("Nanog", 'Nanog/m0_p1'),
    ("Zic3", 'Nanog/m0_p2'),
    ("Nanog-partner", 'Nanog/m0_p4'),
    ("Klf4", 'Klf4/m0_p0'),
])

Goal

  • simulate the effect of motif spacing

Tasks

  • [x] generate random sequence
  • [x] inject cannonical motif in the middle
  • [x] predict and visualize
  • [x] average accross multiple random datasets
  • [x] quantify the central motif
  • [x] inject another motif nearby
    • check whether the entropy of the profile / total counts got improved
  • [x] inject at different positions
  • [x] add seq-importance
  • [x] add reverse-complemented motifs

Load the model

In [2]:
from basepair.imports import *
ddir = get_data_dir()

from basepair.BPNet import BPNetPredictor
from basepair.plot.profiles import extract_signal
from basepair.math import softmax
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
from basepair.plot.profiles import  plot_stranded_profile, multiple_plot_stranded_profile
from basepair.plot.tracks import plot_tracks, filter_tracks
from basepair.preproc import rc_seq
from basepair.exp.chipnexus.simulate import (insert_motif, generate_sim, plot_sim, generate_seq, 
                                             model2tasks, motif_coords, interactive_tracks, plot_motif_table,
                                             plot_sim_motif_col)

from scipy.fftpack import fft, ifft
Using TensorFlow backend.
In [3]:
from basepair.exp.paper.config import *
In [4]:
create_tf_session(1)
Out[4]:
<tensorflow.python.client.session.Session at 0x7fc50a4816a0>
In [5]:
model_dir = models_dir / exp
In [6]:
model = SeqModel.from_mdir(model_dir)
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2019-03-21 21:00:34,969 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2019-03-21 21:00:44,653 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

Inject the sim_predict method to SeqModel

In [9]:
from basepair.exp.chipnexus.simulate import generate_seq, postproc, average_profiles, flatten
In [10]:
from basepair.exp.chipnexus.simulate import *
In [11]:
seqs = encodeDNA([generate_seq("ACGAT", side_motif="ACG",
                                   side_distances=[10], seqlen=1000)
                      for i in range(100)])
In [14]:
model.predict(seqs).keys()
Out[14]:
dict_keys(['Oct4/profile', 'Oct4/counts', 'Sox2/profile', 'Sox2/counts', 'Nanog/profile', 'Nanog/counts', 'Klf4/profile', 'Klf4/counts'])
In [29]:
imps.keys()
Out[29]:
dict_keys(['Oct4/profile/wn', 'Oct4/profile/w1', 'Oct4/profile/w2', 'Oct4/profile/winf', 'Oct4/counts/pre-act', 'Sox2/profile/wn', 'Sox2/profile/w1', 'Sox2/profile/w2', 'Sox2/profile/winf', 'Sox2/counts/pre-act', 'Nanog/profile/wn', 'Nanog/profile/w1', 'Nanog/profile/w2', 'Nanog/profile/winf', 'Nanog/counts/pre-act', 'Klf4/profile/wn', 'Klf4/profile/w1', 'Klf4/profile/w2', 'Klf4/profile/winf', 'Klf4/counts/pre-act'])
In [31]:
imps.keys()
Out[31]:
dict_keys(['Oct4/profile/wn', 'Oct4/profile/w1', 'Oct4/profile/w2', 'Oct4/profile/winf', 'Oct4/counts/pre-act', 'Sox2/profile/wn', 'Sox2/profile/w1', 'Sox2/profile/w2', 'Sox2/profile/winf', 'Sox2/counts/pre-act', 'Nanog/profile/wn', 'Nanog/profile/w1', 'Nanog/profile/w2', 'Nanog/profile/winf', 'Nanog/counts/pre-act', 'Klf4/profile/wn', 'Klf4/profile/w1', 'Klf4/profile/w2', 'Klf4/profile/winf', 'Klf4/counts/pre-act'])
In [28]:
imps = model.imp_score_all(seqs, 'deeplift')
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
In [81]:
def sim_pred(self, central_motif, side_motif=None, side_distances=[], repeat=128, importance=[]):
    """
    Args:
      importance: list of importance scores
    """
    # TODO - update?
    from basepair.exp.chipnexus.simulate import generate_seq, postproc, average_profiles, flatten
    batch_size = repeat
    seqlen = self.seqlen
    tasks = self.tasks

    # simulate sequence
    seqs = encodeDNA([generate_seq(central_motif, side_motif=side_motif,
                                   side_distances=side_distances, seqlen=seqlen)
                      for i in range(repeat)])

    # get predictions
    preds = self.predict(seqs, batch_size=batch_size)
    scaled_preds = {t: preds[f'{t}/profile'] * np.exp(preds[f'{t}/counts'][:, np.newaxis])
                    for t in tasks}

    if importance:
        # get the importance scores
        imp_scores_all = self.imp_score_all(seqs)
        imp_scores = {t: {imp_score_name.split("/")[0]: seqs * imp_scores_all[f'{t}/{imp_score_name}']
                          for imp_score_name in importance}
                       for t in tasks}

        # merge and aggregate the profiles
        out = {"imp": imp_scores, "profile": scaled_preds}
    else:
        out = scaled_preds
    return average_profiles(flatten(out, "/"))


def input_seqlen(self):
    return self.seqlen
In [82]:
# add the method
import types
model.sim_pred = types.MethodType(sim_pred, model)
model.input_seqlen = types.MethodType(input_seqlen, model)
In [83]:
tasks = model.tasks

test insert_motif, generate_seq

In [84]:
central_motif = "ATTTGCATAACAAAG"
side_motif = "ATTTGCATAACAAAG"
In [85]:
insert_motif("ACGT", "GG", 1)
Out[85]:
'GGGT'
In [86]:
insert_motif("ACGT", "GG", 2)
Out[86]:
'AGGT'
In [87]:
insert_motif("ACGT", "GG", 3)
Out[87]:
'ACGG'
In [88]:
generate_seq("..", "--", [5], 20)
Out[88]:
'GTTC--CTG..GGGATGAGT'
In [89]:
generate_seq("..", "--", [14], 20)
Out[89]:
'TTGGTACTG..TT--GGCTG'

Learned bias

In [90]:
plot_tracks(model.sim_pred(""), fig_width=15, fig_height_per_track=1.5);
In [91]:
plot_tracks(filter_tracks(model.sim_pred("TTTGCATAACAA", importance=['profile/wn']), xlim=[400, 600]), 
            fig_width=15, fig_height_per_track=1.5);
In [92]:
df_d = {}
res_dict_d = {}
In [40]:
cache_path = f"{ddir}/cache/chipnexus/{exp}/simulation/spacing.pkl"
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
In [53]:
cached = False
In [54]:
if cached:
    df_d, res_dict_d = read_pkl(cache_path)
In [55]:
# write_pkl((df_d, res_dict_d), cache_path)
In [56]:
!du -sh {cache_path}
du: cannot access '/users/avsec/workspace/basepair/data/cache/chipnexus/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/simulation/spacing.pkl': No such file or directory
In [57]:
side_motifs = OrderedDict([
    ("Oct4-Sox2", ("m0_p0", "TTTGCATAACAA")),
    ("Sox2", ("m0_p2", "CCATTGTT")),
    ("Err2", ("m0_p1", "TCAAGGTCA")),
    ("Nanog", ("m0_p2", "TTGATGGC")),
    ("Klf4", ("m2_p0", "GGGTGTGG")),
])
In [58]:
assert rc_seq("TATCG") == "CGATA"
In [59]:
# get also the rc motifs
rc_side_motifs = OrderedDict([
    (m + "/rc", (v[0], rc_seq(v[1])))
    for m,v in side_motifs.items()
])

all_side_motifs = OrderedDict(list(side_motifs.items()) + list(rc_side_motifs.items()))

Oct4

Notes

  • create multiple weak motif sites
    • hash sequence -> affinity
      • get Bussemakers model
        • run their model
        • typical problem in Selex: lookup for the data
          • email X to get the data?
In [46]:
central_motif_name = "Oct4"
central_motif = "TTTGCATAACAA"
In [47]:
plot_tracks(model.sim_pred(central_motif), fig_width=15, fig_height_per_track=1.5, 
            title=f"{central_motif_name}: {central_motif}", same_ylim=True);
plt.xlabel("Position");
In [48]:
# find the right profile crop
center_coords = [485, 520]
plot_tracks(filter_tracks(model.sim_pred(central_motif), center_coords), fig_width=5, fig_height_per_track=1.5);
In [ ]:
# get the motifs 
if not cached:
    res_dict = OrderedDict([(motif, generate_sim(model, central_motif, side_motif, list(range(511, 511+150, 1)), 
                                                 center_coords=center_coords,
                                                 importance=['counts/pre-act', 'profile/wn']))
                           for motif, (pattern, side_motif) in all_side_motifs.items()])
    df = pd.concat([v[0].assign(motif=k) for k,v in res_dict.items()])  # stack the dataframes
    df_d[central_motif_name] = df
    res_dict_d[central_motif_name] = res_dict
100%|██████████| 150/150 [12:29<00:00,  6.82s/it]
100%|██████████| 150/150 [25:35<00:00, 13.33s/it]
 75%|███████▌  | 113/150 [32:00<14:22, 23.32s/it]
In [ ]:
df = df_d[central_motif_name]
res_dict = res_dict_d[central_motif_name]
In [31]:
display(plot_motif_table(mr, side_motifs))
plot_sim_motif_col(df, tasks, ['profile/max_frac', 'profile/counts_frac', 'profile/simmetric_kl', 
                               'imp/count', 'imp/weighted'],
         motifs=list(all_side_motifs), subfigsize=(6, 3), alpha=1)
In [32]:
side_motif_name = 'Nanog'; interactive_tracks(res_dict[side_motif_name][1], central_motif, all_side_motifs[side_motif_name][1])

Conclusions

Nanog strongly interacts with Oct4-Sox2 complex

Sox2

In [ ]:
central_motif_name = "Sox2"
central_motif = "CCATTGTT"
In [ ]:
plot_tracks(model.sim_pred(central_motif), fig_width=15, fig_height_per_track=1.5, 
            title=f"{central_motif_name}: {central_motif}", same_ylim=True);
plt.xlabel("Position");
In [ ]:
# find the right profile crop
center_coords = [485, 520]
plot_tracks(filter_tracks(model.sim_pred(central_motif), center_coords), fig_width=5, fig_height_per_track=1.5);
In [ ]:
# get the motifs 
if not cached:
    res_dict = OrderedDict([(motif, generate_sim(model, central_motif, side_motif, list(range(511, 511+150, 1)), 
                                                 center_coords=center_coords,
                                                 importance=['counts/pre-act', 'profile/wn']))
                           for motif, (pattern, side_motif) in all_side_motifs.items()])
    df = pd.concat([v[0].assign(motif=k) for k,v in res_dict.items()])  # stack the dataframes
    df_d[central_motif_name] = df
    res_dict_d[central_motif_name] = res_dict
In [ ]:
df = df_d[central_motif_name]
res_dict = res_dict_d[central_motif_name]
In [38]:
display(plot_motif_table(mr, side_motifs))
plot_sim_motif_col(df, tasks, ['profile/max_frac', 'profile/counts_frac', 'profile/simmetric_kl', 
                               'imp/count', 'imp/weighted'],
         motifs=list(all_side_motifs), subfigsize=(6, 3), alpha=1)
In [39]:
side_motif_name = 'Sox2'; interactive_tracks(res_dict[side_motif_name][1], central_motif, all_side_motifs[side_motif_name][1])

Nanog

In [ ]:
central_motif_name = "Nanog"
central_motif = "TTGATGGC"
In [ ]:
plot_tracks(model.sim_pred(central_motif), fig_width=15, fig_height_per_track=1.5, 
            title=f"{central_motif_name}: {central_motif}", same_ylim=True);
plt.xlabel("Position");
In [ ]:
# find the right profile crop
center_coords = [485, 520]
plot_tracks(filter_tracks(model.sim_pred(central_motif), center_coords), fig_width=5, fig_height_per_track=1.5);
In [ ]:
# get the motifs 
if not cached:
    res_dict = OrderedDict([(motif, generate_sim(model, central_motif, side_motif, list(range(511, 511+150, 1)), 
                                                 center_coords=center_coords,
                                                 importance=['counts/pre-act', 'profile/wn']))
                           for motif, (pattern, side_motif) in all_side_motifs.items()])
    df = pd.concat([v[0].assign(motif=k) for k,v in res_dict.items()])  # stack the dataframes
    df_d[central_motif_name] = df
    res_dict_d[central_motif_name] = res_dict
In [ ]:
df = df_d[central_motif_name]
res_dict = res_dict_d[central_motif_name]
In [45]:
display(plot_motif_table(mr, side_motifs))
plot_sim_motif_col(df, tasks, ['profile/max_frac', 'profile/counts_frac', 'profile/simmetric_kl', 
                               'imp/count', 'imp/weighted'],
         motifs=list(all_side_motifs), subfigsize=(6, 3), alpha=1)
In [46]:
side_motif_name = 'Nanog'; interactive_tracks(res_dict[side_motif_name][1], central_motif, all_side_motifs[side_motif_name][1])

Klf4

In [ ]:
central_motif_name = "Klf4"
central_motif = "GGGTGTGG"
In [ ]:
a=1
In [ ]:
plot_tracks(model.sim_pred(central_motif), fig_width=15, fig_height_per_track=1.5, 
            title=f"{central_motif_name}: {central_motif}", same_ylim=True);
plt.xlabel("Position");
In [ ]:
# find the right profile crop
center_coords = [485, 520]
plot_tracks(filter_tracks(model.sim_pred(central_motif), center_coords), fig_width=5, fig_height_per_track=1.5);
In [ ]:
# get the motifs 
if not cached:
    res_dict = OrderedDict([(motif, generate_sim(model, central_motif, side_motif, list(range(511, 511+150, 1)), 
                                                 center_coords=center_coords,
                                                 importance=['counts/pre-act', 'profile/wn']))
                           for motif, (pattern, side_motif) in all_side_motifs.items()])
    df = pd.concat([v[0].assign(motif=k) for k,v in res_dict.items()])  # stack the dataframes
    df_d[central_motif_name] = df
    res_dict_d[central_motif_name] = res_dict
In [ ]:
df = df_d[central_motif_name]
res_dict = res_dict_d[central_motif_name]
In [52]:
display(plot_motif_table(mr, side_motifs))
plot_sim_motif_col(df, tasks, ['profile/max_frac', 'profile/counts_frac', 'profile/simmetric_kl', 
                               'imp/count', 'imp/weighted'],
         motifs=list(all_side_motifs), subfigsize=(6, 3), alpha=1)
In [53]:
side_motif_name = 'Nanog'; interactive_tracks(res_dict[side_motif_name][1], central_motif, all_side_motifs[side_motif_name][1])

Nanog periodicity

In [44]:
dfa = pd.concat([df_d[m].assign(central_motif_name=m) for m in df_d])
In [45]:
side_motifs = list(side_motifs)
In [46]:
df_d.keys()
Out[46]:
dict_keys(['Oct4', 'Sox2', 'Nanog', 'Klf4'])
In [47]:
# row = side motif
# column = main motif
# plot = counts of which protein are looked at
In [48]:
central_motifs = ['Sox2', 'Oct4', 'Nanog', 'Klf4']
fig, axes = plt.subplots(len(side_motifs),len(central_motifs), figsize=(25, 10), sharex=True, sharey='row')
for j, central_motif in enumerate(central_motifs):
    for i, m in enumerate(side_motifs):
        ax = axes[i, j]
        if i == 0:
            ax.set_title(central_motif)
        ax.plot(np.abs(np.fft.rfft(dfa[(dfa.task == "Nanog") & (dfa.motif == m) &(dfa.central_motif_name == central_motif)].counts_frac)[3:74])**2, "-o")
        ax.set_xlabel("Frequency [bp]")
        if j == 0:
            ax.set_ylabel(m)
fig.subplots_adjust(wspace=0, hspace=0)
In [49]:
central_motifs = ['Sox2', 'Oct4', 'Nanog', 'Klf4']
fig, axes = plt.subplots(len(side_motifs),len(central_motifs), figsize=(25, 10), sharex=True, sharey='row')
for j, central_motif in enumerate(central_motifs):
    for i, m in enumerate(side_motifs):
        ax = axes[i, j]
        if i == 0:
            ax.set_title(central_motif)
        power_spec = np.abs(np.fft.rfft(dfa[(dfa.task == "Oct4") & (dfa.motif == m) &(dfa.central_motif_name == central_motif)].counts_frac))**2
        power_spec = power_spec / power_spec[1:].sum()
        ax.plot(power_spec[2:], "-o")
        ax.set_xlabel("Frequency [bp]")
        if j == 0:
            ax.set_ylabel(m)
fig.subplots_adjust(wspace=0, hspace=0)
In [ ]:
a=1
In [50]:
central_motifs = ['Sox2', 'Oct4', 'Nanog', 'Klf4']
fig, axes = plt.subplots(len(side_motifs),len(central_motifs), figsize=(25, 10), sharex=True, sharey='row')
for j, central_motif in enumerate(central_motifs):
    for i, m in enumerate(side_motifs):
        ax = axes[i, j]
        if i == 0:
            ax.set_title(central_motif)
        power_spec = np.abs(np.fft.rfft(dfa[(dfa.task == "Sox2") & (dfa.motif == m) &(dfa.central_motif_name == central_motif)].counts_frac))**2
        power_spec = power_spec / power_spec[1:].sum()
        ax.plot(power_spec[2:], "-o")
        ax.set_xlabel("Frequency [bp]")
        if j == 0:
            ax.set_ylabel(m)
fig.subplots_adjust(wspace=0, hspace=0)
In [51]:
central_motifs = ['Sox2', 'Oct4', 'Nanog', 'Klf4']
fig, axes = plt.subplots(len(side_motifs),len(central_motifs), figsize=(25, 10), sharex=True, sharey='row')
for j, central_motif in enumerate(central_motifs):
    for i, m in enumerate(side_motifs):
        ax = axes[i, j]
        if i == 0:
            ax.set_title(central_motif)
        power_spec = np.abs(np.fft.rfft(dfa[(dfa.task == "Nanog") & (dfa.motif == m) &(dfa.central_motif_name == central_motif)].counts_frac))**2
        power_spec = power_spec / power_spec[1:].sum()
        ax.plot(power_spec[2:], "-o")
        ax.set_xlabel("Frequency [bp]")
        if j == 0:
            ax.set_ylabel(m)
fig.subplots_adjust(wspace=0, hspace=0)
In [52]:
central_motifs = ['Sox2', 'Oct4', 'Nanog', 'Klf4']
fig, axes = plt.subplots(len(side_motifs),len(central_motifs), figsize=(25, 10), sharex=True, sharey='row')
for j, central_motif in enumerate(central_motifs):
    for i, m in enumerate(side_motifs):
        ax = axes[i, j]
        if i == 0:
            ax.set_title(central_motif)
        power_spec = np.abs(np.fft.rfft(dfa[(dfa.task == "Klf4") & (dfa.motif == m) &(dfa.central_motif_name == central_motif)].counts_frac))**2
        power_spec = power_spec / power_spec[1:].sum()
        ax.plot(power_spec[2:], "-o")
        ax.set_xlabel("Frequency [bp]")
        if j == 0:
            ax.set_ylabel(m)
fig.subplots_adjust(wspace=0, hspace=0)
In [ ]: