Goal

  • simulate the effect of motif spacing

Tasks

  • [x] generate random sequence
  • [x] inject cannonical motif in the middle
  • [x] predict and visualize
  • [x] average accross multiple random datasets
  • [x] quantify the central motif
  • [x] inject another motif nearby
    • check whether the entropy of the profile / total counts got improved
  • [x] inject at different positions
  • [x] add seq-importance
  • [x] add reverse-complemented motifs

Load the model

In [1]:
from basepair.imports import *
ddir = get_data_dir()

from basepair.BPNet import BPNetPredictor
from basepair.plot.profiles import extract_signal
from basepair.math import softmax
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
from basepair.plot.profiles import  plot_stranded_profile, multiple_plot_stranded_profile
from basepair.plot.tracks import plot_tracks, filter_tracks
from basepair.preproc import rc_seq
from basepair.exp.chipnexus.simulate import (insert_motif, generate_sim, plot_sim, generate_seq, 
                                             model2tasks, motif_coords, interactive_tracks, plot_motif_table,
                                             plot_sim_motif_col)

from scipy.fftpack import fft, ifft
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
In [2]:
create_tf_session(1)
Out[2]:
<tensorflow.python.client.session.Session at 0x7f370c9720b8>
In [3]:
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
In [4]:
# Load the model
model = load_model(model_dir / "model.h5")
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-09-26 11:56:52,993 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-09-26 11:56:59,773 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
In [5]:
model = BPNetPredictor(model)
In [6]:
tasks = model.tasks

test insert_motif, generate_seq

In [7]:
central_motif = "ATTTGCATAACAAAG"
side_motif = "ATTTGCATAACAAAG"
In [8]:
insert_motif("ACGT", "GG", 1)
Out[8]:
'GGGT'
In [9]:
insert_motif("ACGT", "GG", 2)
Out[9]:
'AGGT'
In [10]:
insert_motif("ACGT", "GG", 3)
Out[10]:
'ACGG'
In [11]:
generate_seq("..", "--", [5], 20)
Out[11]:
'GAGG--AGC..TCTCGATGT'
In [12]:
generate_seq("..", "--", [14], 20)
Out[12]:
'GCTAAGCAC..AT--TGATA'

Learned bias

In [13]:
plot_tracks(model.sim_pred(""), fig_width=15, fig_height_per_track=1.5);
In [14]:
plot_tracks(filter_tracks(model.sim_pred("TTTGCATAACAA", importance=['weighted']), xlim=[400, 600]), 
            fig_width=15, fig_height_per_track=1.5);
In [15]:
mr = ModiscoResult(model_dir / f"modisco/by_peak_tasks/weighted/Oct4/modisco.h5")
mr.open()
In [16]:
df_d = {}
res_dict_d = {}
In [17]:
cache_path = f"{ddir}/cache/chipnexus/simulation/spacing.pkl"
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
In [18]:
cached = True
In [19]:
if cached:
    df_d, res_dict_d = read_pkl(cache_path)
In [20]:
# write_pkl((df_d, res_dict_d), cache_path)
In [21]:
!du -sh {cache_path}
2.1G	/users/avsec/workspace/basepair/data/cache/chipnexus/simulation/spacing.pkl
In [22]:
side_motifs = OrderedDict([
    ("Oct4-Sox2", ("m0_p0", "TTTGCATAACAA")),
    ("Sox2", ("m0_p2", "CCATTGTT")),
    ("Err2", ("m0_p1", "TCAAGGTCA")),
    ("Nanog", ("m0_p2", "TTGATGGC")),
    ("Klf4", ("m2_p0", "GGGTGTGG")),
])
In [23]:
assert rc_seq("TATCG") == "CGATA"
In [24]:
# get also the rc motifs
rc_side_motifs = OrderedDict([
    (m + "/rc", (v[0], rc_seq(v[1])))
    for m,v in side_motifs.items()
])

all_side_motifs = OrderedDict(list(side_motifs.items()) + list(rc_side_motifs.items()))

Oct4

Notes

  • create multiple weak motif sites
    • hash sequence -> affinity
      • get Bussemakers model
        • run their model
        • typical problem in Selex: lookup for the data
          • email X to get the data?
In [26]:
central_motif_name = "Oct4"
central_motif = "TTTGCATAACAA"
In [27]:
plot_tracks(model.sim_pred(central_motif), fig_width=15, fig_height_per_track=1.5, 
            title=f"{central_motif_name}: {central_motif}", same_ylim=True);
plt.xlabel("Position");
In [28]:
# find the right profile crop
center_coords = [485, 520]
plot_tracks(filter_tracks(model.sim_pred(central_motif), center_coords), fig_width=5, fig_height_per_track=1.5);
In [29]:
# get the motifs 
if not cached:
    res_dict = OrderedDict([(motif, generate_sim(model, central_motif, side_motif, list(range(511, 511+150, 1)), 
                                                 center_coords=center_coords))
                           for motif, (pattern, side_motif) in all_side_motifs.items()])
    df = pd.concat([v[0].assign(motif=k) for k,v in res_dict.items()])  # stack the dataframes
    df_d[central_motif_name] = df
    res_dict_d[central_motif_name] = res_dict
In [30]:
df = df_d[central_motif_name]
res_dict = res_dict_d[central_motif_name]
In [31]:
display(plot_motif_table(mr, side_motifs))
plot_sim_motif_col(df, tasks, ['profile/max_frac', 'profile/counts_frac', 'profile/simmetric_kl', 
                               'imp/count', 'imp/weighted'],
         motifs=list(all_side_motifs), subfigsize=(6, 3), alpha=1)
In [32]:
side_motif_name = 'Nanog'; interactive_tracks(res_dict[side_motif_name][1], central_motif, all_side_motifs[side_motif_name][1])

Conclusions

Nanog strongly interacts with Oct4-Sox2 complex

Sox2

In [33]:
central_motif_name = "Sox2"
central_motif = "CCATTGTT"
In [34]:
plot_tracks(model.sim_pred(central_motif), fig_width=15, fig_height_per_track=1.5, 
            title=f"{central_motif_name}: {central_motif}", same_ylim=True);
plt.xlabel("Position");
In [35]:
# find the right profile crop
center_coords = [485, 520]
plot_tracks(filter_tracks(model.sim_pred(central_motif), center_coords), fig_width=5, fig_height_per_track=1.5);
In [36]:
# get the motifs 
if not cached:
    res_dict = OrderedDict([(motif, generate_sim(model, central_motif, side_motif, list(range(511, 511+150, 1)), 
                                                 center_coords=center_coords))
                           for motif, (pattern, side_motif) in all_side_motifs.items()])
    df = pd.concat([v[0].assign(motif=k) for k,v in res_dict.items()])  # stack the dataframes
    df_d[central_motif_name] = df
    res_dict_d[central_motif_name] = res_dict
In [37]:
df = df_d[central_motif_name]
res_dict = res_dict_d[central_motif_name]
In [38]:
display(plot_motif_table(mr, side_motifs))
plot_sim_motif_col(df, tasks, ['profile/max_frac', 'profile/counts_frac', 'profile/simmetric_kl', 
                               'imp/count', 'imp/weighted'],
         motifs=list(all_side_motifs), subfigsize=(6, 3), alpha=1)
In [39]:
side_motif_name = 'Sox2'; interactive_tracks(res_dict[side_motif_name][1], central_motif, all_side_motifs[side_motif_name][1])

Nanog

In [40]:
central_motif_name = "Nanog"
central_motif = "TTGATGGC"
In [41]:
plot_tracks(model.sim_pred(central_motif), fig_width=15, fig_height_per_track=1.5, 
            title=f"{central_motif_name}: {central_motif}", same_ylim=True);
plt.xlabel("Position");
In [42]:
# find the right profile crop
center_coords = [485, 520]
plot_tracks(filter_tracks(model.sim_pred(central_motif), center_coords), fig_width=5, fig_height_per_track=1.5);
In [43]:
# get the motifs 
if not cached:
    res_dict = OrderedDict([(motif, generate_sim(model, central_motif, side_motif, list(range(511, 511+150, 1)), 
                                                 center_coords=center_coords))
                           for motif, (pattern, side_motif) in all_side_motifs.items()])
    df = pd.concat([v[0].assign(motif=k) for k,v in res_dict.items()])  # stack the dataframes
    df_d[central_motif_name] = df
    res_dict_d[central_motif_name] = res_dict
In [44]:
df = df_d[central_motif_name]
res_dict = res_dict_d[central_motif_name]
In [45]:
display(plot_motif_table(mr, side_motifs))
plot_sim_motif_col(df, tasks, ['profile/max_frac', 'profile/counts_frac', 'profile/simmetric_kl', 
                               'imp/count', 'imp/weighted'],
         motifs=list(all_side_motifs), subfigsize=(6, 3), alpha=1)
In [46]:
side_motif_name = 'Nanog'; interactive_tracks(res_dict[side_motif_name][1], central_motif, all_side_motifs[side_motif_name][1])

Klf4

In [47]:
central_motif_name = "Klf4"
central_motif = "GGGTGTGG"
In [48]:
plot_tracks(model.sim_pred(central_motif), fig_width=15, fig_height_per_track=1.5, 
            title=f"{central_motif_name}: {central_motif}", same_ylim=True);
plt.xlabel("Position");
In [49]:
# find the right profile crop
center_coords = [485, 520]
plot_tracks(filter_tracks(model.sim_pred(central_motif), center_coords), fig_width=5, fig_height_per_track=1.5);
In [50]:
# get the motifs 
if not cached:
    res_dict = OrderedDict([(motif, generate_sim(model, central_motif, side_motif, list(range(511, 511+150, 1)), 
                                                 center_coords=center_coords))
                           for motif, (pattern, side_motif) in all_side_motifs.items()])
    df = pd.concat([v[0].assign(motif=k) for k,v in res_dict.items()])  # stack the dataframes
    df_d[central_motif_name] = df
    res_dict_d[central_motif_name] = res_dict
In [51]:
df = df_d[central_motif_name]
res_dict = res_dict_d[central_motif_name]
In [52]:
display(plot_motif_table(mr, side_motifs))
plot_sim_motif_col(df, tasks, ['profile/max_frac', 'profile/counts_frac', 'profile/simmetric_kl', 
                               'imp/count', 'imp/weighted'],
         motifs=list(all_side_motifs), subfigsize=(6, 3), alpha=1)
In [53]:
side_motif_name = 'Nanog'; interactive_tracks(res_dict[side_motif_name][1], central_motif, all_side_motifs[side_motif_name][1])

Nanog periodicity

In [44]:
dfa = pd.concat([df_d[m].assign(central_motif_name=m) for m in df_d])
In [45]:
side_motifs = list(side_motifs)
In [46]:
df_d.keys()
Out[46]:
dict_keys(['Oct4', 'Sox2', 'Nanog', 'Klf4'])
In [47]:
# row = side motif
# column = main motif
# plot = counts of which protein are looked at
In [48]:
central_motifs = ['Sox2', 'Oct4', 'Nanog', 'Klf4']
fig, axes = plt.subplots(len(side_motifs),len(central_motifs), figsize=(25, 10), sharex=True, sharey='row')
for j, central_motif in enumerate(central_motifs):
    for i, m in enumerate(side_motifs):
        ax = axes[i, j]
        if i == 0:
            ax.set_title(central_motif)
        ax.plot(np.abs(np.fft.rfft(dfa[(dfa.task == "Nanog") & (dfa.motif == m) &(dfa.central_motif_name == central_motif)].counts_frac)[3:74])**2, "-o")
        ax.set_xlabel("Frequency [bp]")
        if j == 0:
            ax.set_ylabel(m)
fig.subplots_adjust(wspace=0, hspace=0)
In [49]:
central_motifs = ['Sox2', 'Oct4', 'Nanog', 'Klf4']
fig, axes = plt.subplots(len(side_motifs),len(central_motifs), figsize=(25, 10), sharex=True, sharey='row')
for j, central_motif in enumerate(central_motifs):
    for i, m in enumerate(side_motifs):
        ax = axes[i, j]
        if i == 0:
            ax.set_title(central_motif)
        power_spec = np.abs(np.fft.rfft(dfa[(dfa.task == "Oct4") & (dfa.motif == m) &(dfa.central_motif_name == central_motif)].counts_frac))**2
        power_spec = power_spec / power_spec[1:].sum()
        ax.plot(power_spec[2:], "-o")
        ax.set_xlabel("Frequency [bp]")
        if j == 0:
            ax.set_ylabel(m)
fig.subplots_adjust(wspace=0, hspace=0)
In [50]:
central_motifs = ['Sox2', 'Oct4', 'Nanog', 'Klf4']
fig, axes = plt.subplots(len(side_motifs),len(central_motifs), figsize=(25, 10), sharex=True, sharey='row')
for j, central_motif in enumerate(central_motifs):
    for i, m in enumerate(side_motifs):
        ax = axes[i, j]
        if i == 0:
            ax.set_title(central_motif)
        power_spec = np.abs(np.fft.rfft(dfa[(dfa.task == "Sox2") & (dfa.motif == m) &(dfa.central_motif_name == central_motif)].counts_frac))**2
        power_spec = power_spec / power_spec[1:].sum()
        ax.plot(power_spec[2:], "-o")
        ax.set_xlabel("Frequency [bp]")
        if j == 0:
            ax.set_ylabel(m)
fig.subplots_adjust(wspace=0, hspace=0)
In [51]:
central_motifs = ['Sox2', 'Oct4', 'Nanog', 'Klf4']
fig, axes = plt.subplots(len(side_motifs),len(central_motifs), figsize=(25, 10), sharex=True, sharey='row')
for j, central_motif in enumerate(central_motifs):
    for i, m in enumerate(side_motifs):
        ax = axes[i, j]
        if i == 0:
            ax.set_title(central_motif)
        power_spec = np.abs(np.fft.rfft(dfa[(dfa.task == "Nanog") & (dfa.motif == m) &(dfa.central_motif_name == central_motif)].counts_frac))**2
        power_spec = power_spec / power_spec[1:].sum()
        ax.plot(power_spec[2:], "-o")
        ax.set_xlabel("Frequency [bp]")
        if j == 0:
            ax.set_ylabel(m)
fig.subplots_adjust(wspace=0, hspace=0)
In [52]:
central_motifs = ['Sox2', 'Oct4', 'Nanog', 'Klf4']
fig, axes = plt.subplots(len(side_motifs),len(central_motifs), figsize=(25, 10), sharex=True, sharey='row')
for j, central_motif in enumerate(central_motifs):
    for i, m in enumerate(side_motifs):
        ax = axes[i, j]
        if i == 0:
            ax.set_title(central_motif)
        power_spec = np.abs(np.fft.rfft(dfa[(dfa.task == "Klf4") & (dfa.motif == m) &(dfa.central_motif_name == central_motif)].counts_frac))**2
        power_spec = power_spec / power_spec[1:].sum()
        ax.plot(power_spec[2:], "-o")
        ax.set_xlabel("Frequency [bp]")
        if j == 0:
            ax.set_ylabel(m)
fig.subplots_adjust(wspace=0, hspace=0)
In [ ]: