In [1]:
from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    ("Oct4", "Oct4/m0_p1"),
    ("Oct4-Oct4", "Oct4/m0_p6"),
    ("Sox2", "Sox2/m0_p1"),
    ("Nanog", "Nanog/m0_p1"),
    ("Nanog-alt", "Nanog/m0_p4"),
    ("Klf4", "Klf4/m0_p0"),
    ("Klf4-long", "Klf4/m0_p5"),
    ("B-box", "Oct4/m0_p5"),
    ("Zic3", "Nanog/m0_p2"),
    ("Essrb", "Oct4/m0_p16"),
])
In [2]:
profile_mapping={k: v.split("/")[0] for k,v in motifs.items()}

Load the model

In [3]:
from basepair.imports import *
from basepair.exp.paper.config import *
from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs, motif_pair_dfi
Using TensorFlow backend.
In [4]:
# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/modisco/{exp}/spacing/')
fdir_individual = fdir / 'individual'
fdir_individual_sim = fdir / 'individual-simulation'
In [5]:
%config InlineBackend.figure_format = 'retina'
In [6]:
paper_config()

Fix the spacing issue

In [7]:
import os
import pandas as pd
import types
import numpy as np
from basepair.seqmodel import SeqModel
from basepair.BPNet import BPNetSeqModel
from collections import OrderedDict
from basepair.preproc import rc_seq
from basepair.exp.chipnexus.simulate import (insert_motif, generate_sim, plot_sim, generate_seq,
                                             model2tasks, motif_coords, interactive_tracks, plot_motif_table,
                                             plot_sim_motif_col)
from concise.preprocessing import encodeDNA
from basepair.exp.paper.config import models_dir
from basepair.config import get_data_dir, create_tf_session
from basepair.utils import write_pkl
import argparse
import logging
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


# --------------------------------------------
# Motif configuration
motif_seqs = OrderedDict([
    ("Oct4-Sox2", 'TTTGCATAACAA'),
    ("Oct4", "TATGCAAAT"),
    ("Oct4-Oct4", "TATGCATATGCATA"),
    ("Sox2", "GAACAATGG"),
    ("Nanog", "AGCCATCA"),
    ("Nanog-alt", "GATGGCCCATTTCCT"),
    ("Klf4", "CCACGCCC"),
    ("Klf4-long", "GCCCCGCCCCGCCC"),
    ("B-box", "CCGGGGTTCGAACCCGGG"),
    ("Zic3", "TCTCAGCAGGTAGCA"),
    ("Essrb", "TGACCTTGACCTT")
])

# get also the rc motifs
rc_motif_seqs = OrderedDict([
    (m + "/rc", rc_seq(v))
    for m, v in motif_seqs.items()
])
all_motif_seqs = OrderedDict(list(motif_seqs.items()) + list(rc_motif_seqs.items()))

center_coords = [485, 520]
repeat = 128
In [8]:
create_tf_session(3)
Out[8]:
<tensorflow.python.client.session.Session at 0x7fc237271588>
In [9]:
all_motif_seqs
Out[9]:
OrderedDict([('Oct4-Sox2', 'TTTGCATAACAA'),
             ('Oct4', 'TATGCAAAT'),
             ('Oct4-Oct4', 'TATGCATATGCATA'),
             ('Sox2', 'GAACAATGG'),
             ('Nanog', 'AGCCATCA'),
             ('Nanog-alt', 'GATGGCCCATTTCCT'),
             ('Klf4', 'CCACGCCC'),
             ('Klf4-long', 'GCCCCGCCCCGCCC'),
             ('B-box', 'CCGGGGTTCGAACCCGGG'),
             ('Zic3', 'TCTCAGCAGGTAGCA'),
             ('Essrb', 'TGACCTTGACCTT'),
             ('Oct4-Sox2/rc', 'TTGTTATGCAAA'),
             ('Oct4/rc', 'ATTTGCATA'),
             ('Oct4-Oct4/rc', 'TATGCATATGCATA'),
             ('Sox2/rc', 'CCATTGTTC'),
             ('Nanog/rc', 'TGATGGCT'),
             ('Nanog-alt/rc', 'AGGAAATGGGCCATC'),
             ('Klf4/rc', 'GGGCGTGG'),
             ('Klf4-long/rc', 'GGGCGGGGCGGGGC'),
             ('B-box/rc', 'CCCGGGTTCGAACCCCGG'),
             ('Zic3/rc', 'TGCTACCTGCTGAGA'),
             ('Essrb/rc', 'AAGGTCAAGGTCA')])
In [10]:
model_dir = models_dir / exp
In [11]:
bpnet = BPNetSeqModel.from_mdir(model_dir)
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2019-08-10 14:51:42,888 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2019-08-10 14:51:58,414 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
In [107]:
x_both = bpnet.sim_pred('GAACAATGG', 'TTTGCATAACAA', [650])['profile/Sox2']
x_sox2_far = bpnet.sim_pred('GAACAATGG', 'TTTGCATAACAA', [900])['profile/Sox2']
x_sox = bpnet.sim_pred('', 'TTTGCATAACAA', [650])['profile/Sox2']
x_klf4 = bpnet.sim_pred('GAACAATGG', '', [650])['profile/Sox2']
x_none = bpnet.sim_pred('', '', [650])['profile/Sox2']
In [ ]:
plot_tracks({"Klf4": x_both});
In [ ]:
plot_tracks({"Klf4": x_sox2_far});
In [ ]:
plot_tracks({"Klf4": x_sox});
In [156]:
plot_tracks({"Klf4": x_none});
In [157]:
plot_tracks({"Klf4": x_sox - x_none});
In [136]:
# compute the control bias
x_none = bpnet.sim_pred('')
control_bias = {k.split("/")[1]: x_none[k].mean(axis=-1, keepdims=True)/ x_none[k].mean(axis=-1, keepdims=True).min()
                for k in x_none}
plt.plot(control_bias['Oct4']);
In [146]:
for i in range(510, 650, 10):
    x_nanog = bpnet.sim_pred('AGCCATCA', 'AGCCATCA', [i])['profile/Nanog']
    plot_tracks({"Nanog": x_nanog / control_bias['Nanog']});
    plt.ylim([0, 0.4])
In [135]:
for i in range(300, 700, 50):
    x_nanog = bpnet.sim_pred('', 'AGCCATCA', [i])['profile/Nanog']
    plot_tracks({"Nanog": x_nanog / control_bias['Nanog']});
    plt.ylim([0, 0.4])
In [64]:
dfp = pd.DataFrame(
    {"avg-max": [x_both[[491, 502], [0, 1]].mean(),
                 x_sox2_far[[491, 502], [0, 1]].mean(),
             x_sox[[491, 502], [0, 1]].mean(),
                 x_klf4[[491, 502], [0, 1]].mean(),
             x_none[[491, 502], [0, 1]].mean()],
     "which": ['both', 'both, sox2 far', 'sox2 only', 'klf4 only', 'none']})
In [65]:
(ggplot(aes(x='which', y="avg-max"), dfp) + 
 geom_point() + 
 theme_bw() + 
 ylim([0, None])
)
Out[65]:
<ggplot: (-9223363275075085003)>
In [45]:
np.argmax(x_both, axis=0)
Out[45]:
array([491, 502])
In [43]:
plot_tracks({"Klf4": x_both});
In [87]:
res = generate_sim(bpnet, 'CCACGCCC', 'GAACAATGG', list(range(511, 511 + 10, 1)),
            center_coords=center_coords,
            repeat=repeat,
            correct=True,  # Whether to correct
            importance=[])
100%|██████████| 10/10 [00:05<00:00,  1.94it/s]
In [83]:
df_d, res_dict = res
In [84]:
df_d.head()
Out[84]:
central_motif distance position profile/counts profile/counts_frac profile/counts_max_ref profile/counts_max_ref_frac profile/max profile/max_frac profile/simmetric_kl side_motif task
0 CCACGCCC 11 511 2.4152 1.0093 0.1462 1.0630 0.1012 1.1217 0.0090 GAACAATGG Oct4
1 CCACGCCC 11 511 1.3362 0.9967 0.0782 1.1154 0.0488 1.1868 inf GAACAATGG Sox2
2 CCACGCCC 11 511 6.2370 0.7995 0.3073 0.8337 0.2088 0.9910 inf GAACAATGG Nanog
3 CCACGCCC 11 511 7.4679 0.9960 0.4751 0.9686 0.2822 0.9970 0.0069 GAACAATGG Klf4
4 CCACGCCC 12 512 2.6429 1.1045 0.1806 1.3127 0.1234 1.3678 0.0093 GAACAATGG Oct4
In [85]:
df_d.query("task == 'Klf4'").plot("distance", "profile/counts_max_ref", )
Out[85]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f80246dfb70>
In [86]:
df_d.query("task == 'Klf4'").plot("distance", "profile/counts_max_ref_frac", )
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f80244b8128>
In [68]:
df_d.head()
Out[68]:
central_motif distance position profile/counts profile/counts_frac profile/counts_max_ref profile/counts_max_ref_frac profile/max profile/max_frac profile/simmetric_kl side_motif task
0 CCACGCCC 11 511 0.0174 0.8210 0.0753 1.2025 0.0632 1.1742 NaN GAACAATGG Oct4
1 CCACGCCC 11 511 -0.0048 -0.2977 0.0396 1.4784 0.0295 1.4803 NaN GAACAATGG Sox2
2 CCACGCCC 11 511 -2.1797 0.9325 0.0977 1.8840 0.0734 1.5385 NaN GAACAATGG Nanog
3 CCACGCCC 11 511 4.5029 0.9074 0.3883 0.9113 0.2309 0.9172 0.0182 GAACAATGG Klf4
4 CCACGCCC 12 512 0.6436 -0.2751 0.1081 -15.0077 0.1023 5.3914 NaN GAACAATGG Oct4

In silico instances

In [147]:
!ls {ddir}/cache/chipnexus/{exp}/simulation/
spacing;correct=True.pkl  spacing;correct=True.pkl.bak	spacing.pkl
In [12]:
# corrected
sim_df_d_norm, sim_res_dict_d_norm = read_pkl(f"{ddir}/cache/chipnexus/{exp}/simulation/spacing;correct=True.pkl")
In [13]:
# Non-corrected
sim_df_d, sim_res_dict_d = read_pkl(f"{ddir}/cache/chipnexus/{exp}/simulation/spacing.pkl")
In [99]:
from basepair.plot.tracks import plot_track

def plot_track_rev(arr, ax, legend=False, ylim=None, color=None, alpha=None, track=None):
    """Plot a track
    """
    seqlen = len(arr)
    if arr.ndim == 1 or arr.shape[1] == 1:
        # single track
        if color is not None:
            if isinstance(color, collections.Sequence):
                color = color[0]
        ax.plot(np.arange(1, seqlen + 1), np.ravel(arr), color=color)
    elif arr.shape[1] == 4:
        # plot seqlogo
        seqlogo(arr, ax=ax)
    elif arr.shape[1] == 2:
        # plot both strands
        if color is not None:
            assert isinstance(color, collections.Sequence)
            c1 = color[0]
            c2 = color[1]
        else:
            c1, c2 = None, None
            
        if alpha is not None:
            assert isinstance(alpha, collections.Sequence)
            a1 = alpha[0]
            a2 = alpha[1]
        else:
            a1, a2 = None, None
            
        ax.plot(np.arange(1, seqlen + 1), arr[:, 0], label='pos', color=c1, alpha=a1)
        ax.plot(np.arange(1, seqlen + 1), -arr[:, 1], label='neg', color=c2, alpha=a2)
        if legend:
            ax.legend()
    else:
        raise ValueError(f"Don't know how to plot array with shape[1] != {arr.shape[1]}. Valid values are: 1,2 or 4.")
    if ylim is not None:
        ax.set_ylim(ylim)

motif_pairs_sim = [
    ('Nanog', 'Nanog'),
    ('Nanog', 'Sox2'),
    ('Sox2', 'Sox2'),
    ('Sox2', 'Oct4-Sox2'),
    ('Nanog', 'Oct4-Sox2'),
    ('Klf4', 'Oct4-Sox2'),
]
profile_mapping

distance_range = np.arange(11, 161)
In [15]:
def get_motif_pair(data, pair, profile_mapping, dist):
    dist_val, profiles = data[pair[0]][pair[1]][1][dist - 11]
    assert dist + 500 == dist_val
    pair_track_names = (profile_mapping[pair[0]], profile_mapping[pair[1]])
    return {tn: profiles['profile'][tn]
            for tn in pair_track_names}
In [16]:
# compute the maximal values across all the possible situations
max_vals = {}
for dist in distance_range:
    for motif_pair in motif_pairs_sim:
        profiles = get_motif_pair(sim_res_dict_d, motif_pair, profile_mapping, dist)
        for k,v in profiles.items():
            if v.max() > max_vals.get(k, 0):
                max_vals[k] = v.max()
In [17]:
# compute the control bias
x_none = bpnet.sim_pred('')
control_bias = {k.split("/")[1]: x_none[k].mean(axis=-1, keepdims=True)/ x_none[k].mean(axis=-1, keepdims=True).max()
                for k in x_none}
In [66]:
distance_range
Out[66]:
array([ 11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
        29,  30, ..., 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160])
In [95]:
!mkdir -p {fdir}/gifs
In [97]:
len(motif_pairs_sim)
Out[97]:
6
In [104]:
from celluloid import Camera
from IPython.display import HTML, Image
from matplotlib.ticker import FuncFormatter
from basepair.plot.tracks import *

# motif_pair = ('Nanog', 'Nanog')

for motif_pair in tqdm(motif_pairs_sim):
    
    motif_pair_name = "<>".join(motif_pair)
    
    max_vals = {}
    max_val_pos = {}
    for dist in distance_range:
        profiles = get_motif_pair(sim_res_dict_d, motif_pair, profile_mapping, dist)
        profiles = {k:v/control_bias[k] for k,v in profiles.items()}
        for k,v in profiles.items():
            if v.max() > max_vals.get(k, 0):
                max_vals[k] = v.max()
                max_val_pos[k] = dist

    
    fontsize = 10
    shift = 50

    fig, axes = plt.subplots(len(profiles), 1,
                             figsize=(6, 1.5 * len(profiles)),
                             sharex=True)
    if len(profiles) == 1:
        axes = [axes]
    camera = Camera(fig)

    # start from the outside in
    for i, distance in enumerate(reversed([11, 11] + list(range(11, 31, 1)) + list(range(31, 41, 2)) + list(range(41, 61, 4)) + list(range(61, 160, 8)))):

        # TODO - plot different distances

        profiles = get_motif_pair(sim_res_dict_d, motif_pair, profile_mapping, distance)
        profiles = {k:v/control_bias[k] for k,v in profiles.items()}  # normalize
        tracks = filter_tracks(profiles, xlim=[500 - shift, 500-shift + 250])

        # setup ylim
        ylim = [(-max_vals[k], max_vals[k]) for k in profiles]

        # plot the tracks
        for i, (ax, (track, arr)) in enumerate(zip(axes, get_items(tracks))):
            yl = get_list_value(ylim, i)
            plot_track_rev(arr, ax, legend=False, ylim=yl,
                           color=(tf_colors[track], tf_colors[track]),
                           alpha=(1, 0.5),
                           track=track)

            ax.set_ylabel(track)
            simple_yaxis_format(ax)
            if i != len(tracks) - 1:
                ax.xaxis.set_ticks_position('none')

        # add ticks to the final axis
        ax.xaxis.set_major_locator(ticker.AutoLocator())
        fig.subplots_adjust(hspace=0)



        vh = 0.33
        
        fig.axes[0].axvline(x=0+shift, ymin=.5 - vh/2, ymax=.5 + vh/2, color='gray', alpha=0.5)
        fig.axes[0].text(x=.95, y=0, s=f"Motif distance ={distance:3d}", 
                         transform=fig.axes[0].transAxes,
                         fontsize=fontsize,
                         verticalalignment='bottom', 
                         horizontalalignment='right')
        fig.axes[0].text(x=0+shift, y=ylim[0][1]*.9, s=motif_pair[0], 
                         style='italic',
                         fontsize=fontsize,
                         verticalalignment='top', 
                         horizontalalignment='left')
        fig.axes[-1].axvline(x=distance+shift, ymin=.5 - vh/2, ymax=.5 + vh/2, color='gray', alpha=0.5)
        fig.axes[-1].text(x=distance+shift, y=ylim[-1][1]*.9, s=motif_pair[1], 
                         style='italic',
                          fontsize=fontsize,
                         verticalalignment='top', 
                         horizontalalignment='left')

        ax.set_xlabel("Position [bp]")
        fig.axes[-1].xaxis.set_major_formatter(FuncFormatter(lambda val,pos: int(val)-shift))
        sns.despine(top=True, bottom=True, right=True)    
        plt.tight_layout()

        # ---------------------------------
        camera.snap()

    animation = camera.animate()
    animation.save(f'{fdir}/gifs/{motif_pair_name}.gif', writer='imagemagick', fps=5, dpi=300)
  0%|          | 0/6 [00:00<?, ?it/s]2019-08-10 15:46:42,442 [INFO] Animation.save using <class 'matplotlib.animation.ImageMagickWriter'>
2019-08-10 15:46:42,443 [INFO] Disabling savefig.bbox = 'tight', as it may cause frame size to vary, which is inappropriate for animation.
2019-08-10 15:46:42,445 [INFO] MovieWriter.run: running command: ['convert', '-size', '1800x450', '-depth', '8', '-delay', '20.0', '-loop', '0', 'rgba:-', '/users/avsec/workspace/basepair/data/figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/spacing/gifs/Nanog<>Nanog.gif']
 17%|█▋        | 1/6 [00:19<01:37, 19.46s/it]2019-08-10 15:47:03,775 [INFO] Animation.save using <class 'matplotlib.animation.ImageMagickWriter'>
2019-08-10 15:47:03,778 [INFO] Disabling savefig.bbox = 'tight', as it may cause frame size to vary, which is inappropriate for animation.
2019-08-10 15:47:03,779 [INFO] MovieWriter.run: running command: ['convert', '-size', '1800x900', '-depth', '8', '-delay', '20.0', '-loop', '0', 'rgba:-', '/users/avsec/workspace/basepair/data/figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/spacing/gifs/Nanog<>Sox2.gif']
 33%|███▎      | 2/6 [00:51<01:33, 23.35s/it]2019-08-10 15:47:35,085 [INFO] Animation.save using <class 'matplotlib.animation.ImageMagickWriter'>
2019-08-10 15:47:35,087 [INFO] Disabling savefig.bbox = 'tight', as it may cause frame size to vary, which is inappropriate for animation.
2019-08-10 15:47:35,088 [INFO] MovieWriter.run: running command: ['convert', '-size', '1800x450', '-depth', '8', '-delay', '20.0', '-loop', '0', 'rgba:-', '/users/avsec/workspace/basepair/data/figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/spacing/gifs/Sox2<>Sox2.gif']
 50%|█████     | 3/6 [01:12<01:07, 22.41s/it]2019-08-10 15:47:55,773 [INFO] Animation.save using <class 'matplotlib.animation.ImageMagickWriter'>
2019-08-10 15:47:55,774 [INFO] Disabling savefig.bbox = 'tight', as it may cause frame size to vary, which is inappropriate for animation.
2019-08-10 15:47:55,775 [INFO] MovieWriter.run: running command: ['convert', '-size', '1800x900', '-depth', '8', '-delay', '20.0', '-loop', '0', 'rgba:-', '/users/avsec/workspace/basepair/data/figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/spacing/gifs/Sox2<>Oct4-Sox2.gif']
 67%|██████▋   | 4/6 [01:40<00:48, 24.26s/it]2019-08-10 15:48:24,776 [INFO] Animation.save using <class 'matplotlib.animation.ImageMagickWriter'>
2019-08-10 15:48:24,779 [INFO] Disabling savefig.bbox = 'tight', as it may cause frame size to vary, which is inappropriate for animation.
2019-08-10 15:48:24,780 [INFO] MovieWriter.run: running command: ['convert', '-size', '1800x900', '-depth', '8', '-delay', '20.0', '-loop', '0', 'rgba:-', '/users/avsec/workspace/basepair/data/figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/spacing/gifs/Nanog<>Oct4-Sox2.gif']
 83%|████████▎ | 5/6 [02:09<00:25, 25.74s/it]2019-08-10 15:48:54,506 [INFO] Animation.save using <class 'matplotlib.animation.ImageMagickWriter'>
2019-08-10 15:48:54,508 [INFO] Disabling savefig.bbox = 'tight', as it may cause frame size to vary, which is inappropriate for animation.
2019-08-10 15:48:54,509 [INFO] MovieWriter.run: running command: ['convert', '-size', '1800x900', '-depth', '8', '-delay', '20.0', '-loop', '0', 'rgba:-', '/users/avsec/workspace/basepair/data/figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/spacing/gifs/Klf4<>Oct4-Sox2.gif']
100%|██████████| 6/6 [02:40<00:00, 27.24s/it]

TODO

  • [X] normalize all the profiles to the same y axis across all motif pairs
  • [X] shall we show always one motif in the center while the other one gets closer?
    • let's show the influencee in the center
  • [X] compile a list of all motif pairs to show
    • influencee in the center and the other one on the side
  • [X] for each distance and motif get the motif location -> just show motif center
  • [X] show the motif location as a vertical line + add the motif label at the corresponding site
  • [X] use appropriate colors for the motif
  • [X] compile the animation
In [27]:
!ls {ddir}/cache/chipnexus/{exp}/simulation/
spacing;correct=True.pkl  spacing;correct=True.pkl.bak	spacing.pkl
In [28]:
# cache_path = f"{ddir}/cache/chipnexus/{exp}/simulation/spacing.pkl"

# sim_df_d, sim_res_dict_d = read_pkl(cache_path)
In [29]:
from basepair.exp.chipnexus.spacing import comp_strand_compbination


def swap_orientation(o):
    if o=='+':
        return "-"
    if o=="-":
        return "+"
    raise ValueError("")

# Very much a hacked function to map simulation to in vivo data
def get_xy_sim_single(sim_df_d, motif_pair, feature, orientation, profile_mapping):
    # For Nanog, always explicilty swap the orientation
    orientation_pair = [orientation[0], orientation[1]]
    # HACK! Nanog orientation didn't properly match the orientation
    #if motif_pair[0] == "Nanog":
    #    orientation_pair[0] = swap_orientation(orientation_pair[0])
    #if motif_pair[1] == "Nanog":
    #    orientation_pair[1] = swap_orientation(orientation_pair[1])

    mp = list(deepcopy(motif_pair))
    if orientation_pair[0] == "-":
        mp[0] = mp[0] + "/rc"
    if orientation_pair[1] == "-":
        mp[1] = mp[1] + "/rc"
        
    df = sim_df_d[mp[0]]  # choose the central motif
    df = df[df.motif == mp[1]]  # choose the side motif
    
    df = df[df.distance < 150]
    
    # select the task
    df = df[df.task == profile_mapping[motif_pair[0]]]        
    return df.distance.values, df[feature].values
    
def get_xy_sim(sim_df_d, motif_pair, feature, orientation, profile_mapping):
    x1,y1 = get_xy_sim_single(sim_df_d, motif_pair, feature, orientation, profile_mapping)
    x2,y2 = get_xy_sim_single(sim_df_d, list(reversed(motif_pair)), feature, orientation, profile_mapping)
    # x2,y2 = get_xy_sim_single(sim_df_d, list(reversed(motif_pair)), feature, comp_strand_compbination[orientation], profile_mapping)
    # assert np.all(x1 == x2)
    # return x1, y1, y2
    return x1, y1, y2
In [80]:
# motif_pair = ['Oct4-Sox2', 'Sox2']
# x, y, y2 = get_xy_sim(sim_df_d, motif_pair, 'profile/counts_frac', '++', profile_mapping)
In [ ]:
pairs = get_motif_pairs(motifs)
In [ ]:
tested_motif_pairs = [p for p in pairs if p[0] in sim_df_d_norm and p[1] in sim_df_d_norm]
In [ ]:
# fig, axes = plt.subplots(len(tested_motif_pairs), 2, figsize=get_figsize(.5, len(tested_motif_pairs)/3), 
#                          sharex=True, 
#                          sharey=True,
#                          gridspec_kw=dict(wspace=0, hspace=0))
# # motif_pair = ['Oct4-Sox2', 'Sox2']
# for i, motif_pair in enumerate(tested_motif_pairs):
#     x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', '++', profile_mapping)
#     ax = axes[i, 0]
#     ax.plot(x, y, label=motif_pair[0]);
#     ax.plot(x, y2, label=motif_pair[1]);
#     if i == 0:
#         ax.set_title("Bleed-through corrected")
#     ax.axhline(1, color='grey', alpha=0.5)
#     ax.set_ylabel("\n".join(motif_pair))
#     # plt.legend()

#     ax = axes[i, 1]
#     x, y, y2 = get_xy_sim(sim_df_d, motif_pair, 'profile/counts_max_ref_frac', '++', profile_mapping)
#     ax.plot(x, y, label=motif_pair[0]);
#     ax.plot(x, y2, label=motif_pair[1]);
#     if i == 0:
#         ax.set_title("Un-corrected")
#     ax.axhline(1, color='grey', alpha=0.5)
#     ax.legend()
# axes[-1, 0].set_xlabel("Pairwise distance")
# axes[-1, 0].set_xticks([10, 50, 100, 150]);
# fig.savefig(fdir / 'simulated.spacing.all-11.pdf')

Generate per-motif plots

In [32]:
ymax = 5
columns = 4
plot_pairs = get_motif_pairs(['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4'])
rows = len(plot_pairs)
fig, axes = plt.subplots(rows, columns, figsize=get_figsize(.7, 1/5*len(plot_pairs)), 
                         sharex=True, 
                         sharey=True,
                         gridspec_kw=dict(wspace=0, hspace=0))

for j, motif_pair in enumerate(plot_pairs):
    for i, strand_orientation in enumerate(['++', '--', '+-', '-+']):
        x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', strand_orientation, profile_mapping)
        ax = axes[j, i]
        ax.plot(x, y, label=motif_pair[0], color=tf_colors[profile_mapping[motif_pair[0]]]);
        ax.plot(x, y2, label=motif_pair[1], color=tf_colors[profile_mapping[motif_pair[1]]], 
                linestyle='dashed' if motif_pair[0] == motif_pair[1] else 'solid');  # only dashed for homotypic interactions
        ax.set_ylim([0, ymax])
        ax.axhline(1, color='grey', alpha=0.5)
        if j == 0:
            ax.set_title(strand_orientation)
            if i ==0 :
                ax.set_ylabel("Max profile value")
            
        if i == 3:
            ax.legend()
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        if i % rows == rows-1:
            axes[i].set_xticks([10, 50, 100, 150]);
ax.set_xlabel("Motif distance")
motif_pair_name = '<>'.join(motif_pair)
fig.savefig(fdir / f'all-pairs.bleed-through-corrected.strand-specific.pdf')
In [17]:
a=1
In [85]:
ymax = 10
columns = 4
rows = 1

for j, motif_pair in enumerate(tested_motif_pairs):
    fig, axes = plt.subplots(rows, columns, figsize=get_figsize(.7, 1/3.5), 
                             sharex=True, 
                             sharey=True,
                             gridspec_kw=dict(wspace=0, hspace=0))
    for i, strand_orientation in enumerate(['++', '--', '+-', '-+']):
        x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', strand_orientation, profile_mapping)
        ax = axes[i]
        ax.plot(x, y, label=motif_pair[0], color=tf_colors[profile_mapping[motif_pair[0]]]);
        ax.plot(x, y2, label=motif_pair[1], color=tf_colors[profile_mapping[motif_pair[1]]], linestyle='dashed');
        ax.set_ylim([0, ymax])
        #if i == 0:
        ax.set_title(strand_orientation)
        ax.axhline(1, color='grey', alpha=0.5)
        if i == 0:
            ax.set_ylabel("Max profile value")
        if i == 3:
            ax.legend()
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        if i % rows == rows-1:
            axes[i].set_xticks([10, 50, 100, 150]);
    axes[1].set_xlabel("Motif distance")
    motif_pair_name = '<>'.join(motif_pair)
    fig.savefig(fdir_individual_sim / f'{motif_pair_name}.bleed-through-corrected.pdf')
In [64]:
strand_orientation = "--"
ymax = 10
columns = 6
rows = len(tested_motif_pairs)//columns
fig, axes = plt.subplots(rows, columns, figsize=get_figsize(1.1, 1/3.5/9*len(tested_motif_pairs)), 
                         sharex=True, 
                         sharey=True,
                         gridspec_kw=dict(wspace=0, hspace=0))
# motif_pair = ['Oct4-Sox2', 'Sox2']
for i, motif_pair in enumerate(tested_motif_pairs):
    x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', strand_orientation, profile_mapping)
    ax = axes[i%rows, i//rows]
    ax.plot(x, y, label=motif_pair[0], color=tf_colors[profile_mapping[motif_pair[0]]]);
    ax.plot(x, y2, label=motif_pair[1], color=tf_colors[profile_mapping[motif_pair[1]]], linestyle='dashed');
    ax.set_ylim([0, ymax])
    if i == 0:
        ax.set_title("Bleed-through corrected")
    ax.axhline(1, color='grey', alpha=0.5)
    if i == 1:
        ax.set_ylabel("Max profile value")
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    if i % rows == rows-1:
        axes[1, i // rows].set_xticks([10, 50, 100, 150]);
axes[-1, 2].set_xlabel("Motif distance")
fig.savefig(fdir / f'simulated.spacing.all.corrected-only.all-11-motifs.{strand_orientation}.pdf')
In [48]:
columns = 6
rows = len(tested_motif_pairs)//columns
fig, axes = plt.subplots(rows, columns, figsize=get_figsize(1.1, 1/3.5/9*len(tested_motif_pairs)), 
                         sharex=True, 
                         sharey=True,
                         gridspec_kw=dict(wspace=0, hspace=0))
# motif_pair = ['Oct4-Sox2', 'Sox2']
for i, motif_pair in enumerate(tested_motif_pairs):
    x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', '++', profile_mapping)
    ax = axes[i%rows, i//rows]
    ax.plot(x, y, label=motif_pair[0], color=tf_colors[profile_mapping[motif_pair[0]]]);
    ax.plot(x, y2, label=motif_pair[1], color=tf_colors[profile_mapping[motif_pair[1]]], linestyle='dashed');
    if i == 0:
        ax.set_title("Bleed-through corrected")
    ax.axhline(1, color='grey', alpha=0.5)
    if i == 1:
        ax.set_ylabel("Max profile value")
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    if i % rows == rows-1:
        axes[1, i // rows].set_xticks([10, 50, 100, 150]);
axes[-1, 2].set_xlabel("Motif distance")
fig.savefig(fdir / 'simulated.spacing.all.corrected-only.all-11-motifs.pdf')
In [55]:
sim_df_d_norm['Oct4-Sox2'].motif.unique()
Out[55]:
array(['Oct4-Sox2', 'Oct4', 'Oct4-Oct4', 'Sox2', 'Nanog', 'Nanog-alt', 'Klf4', 'Klf4-long',
       'B-box', 'Zic3', 'Essrb', 'Oct4-Sox2/rc', 'Oct4/rc', 'Oct4-Oct4/rc', 'Sox2/rc', 'Nanog/rc',
       'Nanog-alt/rc', 'Klf4/rc', 'Klf4-long/rc', 'B-box/rc', 'Zic3/rc', 'Essrb/rc'], dtype=object)
In [50]:
columns = 6
rows = len(tested_motif_pairs)//columns
fig, axes = plt.subplots(rows, columns, figsize=get_figsize(1.1, 1/3.5/9*len(tested_motif_pairs)), 
                         sharex=True, 
                         sharey=True,
                         gridspec_kw=dict(wspace=0, hspace=0))
# motif_pair = ['Oct4-Sox2', 'Sox2']
for i, motif_pair in enumerate(tested_motif_pairs):
    x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', '++', profile_mapping)
    ax = axes[i%rows, i//rows]
    ax.set_ylim([0, 5])
    ax.plot(x, y, label=motif_pair[0], color=tf_colors[profile_mapping[motif_pair[0]]]);
    ax.plot(x, y2, label=motif_pair[1], color=tf_colors[profile_mapping[motif_pair[1]]], linestyle='dashed');
    if i == 0:
        ax.set_title("Bleed-through corrected")
    ax.axhline(1, color='grey', alpha=0.5)
    if i == 1:
        ax.set_ylabel("Max profile value")
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    if i % rows == rows-1:
        axes[1, i // rows].set_xticks([10, 50, 100, 150]);
axes[-1, 2].set_xlabel("Motif distance")
fig.savefig(fdir / 'simulated.spacing.all.corrected-only.all-11-motifs.ylim=5.pdf')
In [83]:
sim_df_d_norm['Nanog'].query('motif == "Nanog"').query('task == "Nanog"').sort_values("profile/counts_max_ref_frac").head(n=4)
Out[83]:
central_motif distance position profile/counts profile/counts_frac profile/counts_max_ref profile/counts_max_ref_frac profile/max profile/max_frac profile/simmetric_kl side_motif task motif
14 AGCCATCA 14 514 14.3918 0.6976 1.1614 0.6562 0.7731 0.8424 inf AGCCATCA Nanog Nanog
150 AGCCATCA 48 548 18.1084 0.8778 1.4260 0.8057 0.8109 0.8836 0.0093 AGCCATCA Nanog Nanog
22 AGCCATCA 16 516 15.8471 0.7682 1.4684 0.8297 0.7590 0.8270 inf AGCCATCA Nanog Nanog
190 AGCCATCA 58 558 17.6948 0.8578 1.4768 0.8344 0.7723 0.8415 0.0061 AGCCATCA Nanog Nanog
In [84]:
from basepair.exp.chipnexus.simulate import generate_seq
In [89]:
generate_seq('AGCCATCA', 'AGCCATCA', side_distances=[50 + 11], seqlen=100)
Out[89]:
'AATTAATTCCGACGTCACGAGTGGTAGCCTTCGTCGTATCGACCAGAGCCATCAATAAGCCATCATTTCGTCACTGATGTGCGGGGCGTGAGGGGGATTG'
In [90]:
# Best sequence: AGCCATCANNNAGCCATCA
In [91]:
# Worst sequence: AGCCATCANNNNNNAGCCATCA
In [73]:
# motif_pair = ['Oct4-Sox2', 'Sox2']
for i, motif_pair in enumerate(tested_motif_pairs):
    fig, ax = plt.subplots(1, 1, figsize=get_figsize(.25, .618))
    x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', '++', profile_mapping)
    ax.plot(x, y, label=motif_pair[0]);
    ax.plot(x, y2, label=motif_pair[1]);
    if i == 0:
        ax.set_title("Bleed-through corrected")
    ax.axhline(1, color='grey', alpha=0.5)
    ax.set_title("<>".join(motif_pair))
    # plt.legend()
    ax.legend()
    ax.set_xlabel("Pairwise distance")
    ax.set_ylabel("Counts at max profile")
    ax.set_xticks([10, 50, 100, 150]);
    motif_pair_name = '<>'.join(motif_pair)
    fig.savefig(fdir_individual_sim / f'{motif_pair_name}.bleed-through-corrected.pdf')

Group-by differnet plots

In [92]:
tested_motif_pairs
Out[92]:
[['Oct4-Sox2', 'Oct4-Sox2'],
 ['Oct4-Sox2', 'Sox2'],
 ['Oct4-Sox2', 'Nanog'],
 ['Oct4-Sox2', 'Klf4'],
 ['Sox2', 'Sox2'],
 ['Sox2', 'Nanog'],
 ['Sox2', 'Klf4'],
 ['Nanog', 'Nanog'],
 ['Nanog', 'Klf4'],
 ['Klf4', 'Klf4']]
In [43]:
for strand in ['++', '-+', '+-', '--']:
    fig, axes = plt.subplots(3, 1, figsize=get_figsize(.25, 2), 
                             sharex=True, 
                             sharey=True,
                             gridspec_kw=dict(wspace=0, hspace=0))
    # motif_pair = ['Oct4-Sox2', 'Sox2']
    for i, motif_pair in enumerate([['Nanog', 'Nanog'], 
                                    ['Sox2', 'Nanog'],
                                    ['Sox2', 'Sox2']]):
        x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', strand, profile_mapping)
        ax = axes[i]
        ax.plot(x, y, label=motif_pair[0], color=tf_colors[profile_mapping[motif_pair[0]]]);
        ax.plot(x, y2, label=motif_pair[1], color=tf_colors[profile_mapping[motif_pair[1]]]);
        if i == 0:
            ax.set_title(f"Short-range interactions {strand}")
        ax.axhline(1, color='grey', alpha=0.5)
        if i == 1:
            ax.set_ylabel("Max profile value")
        ax.legend()
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
    axes[-1].set_xlabel("Motif distance")
    axes[-1].set_xticks([10, 50, 100, 150]);
    fig.savefig(fdir / f'simulated.short-range.Nanog<>Sox2.strand_orientation={strand}.pdf')
In [45]:
for strand in ['++', '-+', '+-', '--']:

    fig, axes = plt.subplots(3, 1, figsize=get_figsize(.25, 2), 
                             sharex=True, 
                             sharey=True,
                             gridspec_kw=dict(wspace=0, hspace=0))
    # motif_pair = ['Oct4-Sox2', 'Sox2']
    for i, motif_pair in enumerate([['Oct4-Sox2', 'Sox2'], 
                                    ['Oct4-Sox2', 'Nanog'],
                                    ['Oct4-Sox2', 'Klf4']]):
        x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', strand, profile_mapping)
        ax = axes[i]
        ax.plot(x, y, label=motif_pair[0], color=tf_colors[profile_mapping[motif_pair[0]]]);
        ax.plot(x, y2, label=motif_pair[1], color=tf_colors[profile_mapping[motif_pair[1]]]);
        if i == 0:
            ax.set_title(f"Long-range interactions {strand}")
        ax.axhline(1, color='grey', alpha=0.5)
        if i == 1:
            ax.set_ylabel("Max profile value")
        ax.legend()
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
    axes[-1].set_xlabel("Motif distance")
    axes[-1].set_xticks([10, 50, 100, 150]);
    fig.savefig(fdir / f'simulated.long-range.Oct4-Sox2.strand_orientation={strand}.pdf')