from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    ("Oct4", "Oct4/m0_p1"),
    ("Oct4-Oct4", "Oct4/m0_p6"),
    ("Sox2", "Sox2/m0_p1"),
    ("Nanog", "Nanog/m0_p1"),
    ("Nanog-alt", "Nanog/m0_p4"),
    ("Klf4", "Klf4/m0_p0"),
    ("Klf4-long", "Klf4/m0_p5"),
    ("B-box", "Oct4/m0_p5"),
    ("Zic3", "Nanog/m0_p2"),
    ("Essrb", "Oct4/m0_p16"),
])

profile_mapping={k: v.split("/")[0] for k,v in motifs.items()}

Load the model¶

from basepair.imports import *
from basepair.exp.paper.config import *
from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs, motif_pair_dfi

Using TensorFlow backend.

# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/modisco/{exp}/spacing/')
fdir_individual = fdir / 'individual'
fdir_individual_sim = fdir / 'individual-simulation'

%config InlineBackend.figure_format = 'retina'

paper_config()

Fix the spacing issue¶

import os
import pandas as pd
import types
import numpy as np
from basepair.seqmodel import SeqModel
from basepair.BPNet import BPNetSeqModel
from collections import OrderedDict
from basepair.preproc import rc_seq
from basepair.exp.chipnexus.simulate import (insert_motif, generate_sim, plot_sim, generate_seq,
                                             model2tasks, motif_coords, interactive_tracks, plot_motif_table,
                                             plot_sim_motif_col)
from concise.preprocessing import encodeDNA
from basepair.exp.paper.config import models_dir
from basepair.config import get_data_dir, create_tf_session
from basepair.utils import write_pkl
import argparse
import logging
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


# --------------------------------------------
# Motif configuration
motif_seqs = OrderedDict([
    ("Oct4-Sox2", 'TTTGCATAACAA'),
    ("Oct4", "TATGCAAAT"),
    ("Oct4-Oct4", "TATGCATATGCATA"),
    ("Sox2", "GAACAATGG"),
    ("Nanog", "AGCCATCA"),
    ("Nanog-alt", "GATGGCCCATTTCCT"),
    ("Klf4", "CCACGCCC"),
    ("Klf4-long", "GCCCCGCCCCGCCC"),
    ("B-box", "CCGGGGTTCGAACCCGGG"),
    ("Zic3", "TCTCAGCAGGTAGCA"),
    ("Essrb", "TGACCTTGACCTT")
])

# get also the rc motifs
rc_motif_seqs = OrderedDict([
    (m + "/rc", rc_seq(v))
    for m, v in motif_seqs.items()
])
all_motif_seqs = OrderedDict(list(motif_seqs.items()) + list(rc_motif_seqs.items()))

center_coords = [485, 520]
repeat = 128

create_tf_session(3)

<tensorflow.python.client.session.Session at 0x7fc237271588>

all_motif_seqs

OrderedDict([('Oct4-Sox2', 'TTTGCATAACAA'),
             ('Oct4', 'TATGCAAAT'),
             ('Oct4-Oct4', 'TATGCATATGCATA'),
             ('Sox2', 'GAACAATGG'),
             ('Nanog', 'AGCCATCA'),
             ('Nanog-alt', 'GATGGCCCATTTCCT'),
             ('Klf4', 'CCACGCCC'),
             ('Klf4-long', 'GCCCCGCCCCGCCC'),
             ('B-box', 'CCGGGGTTCGAACCCGGG'),
             ('Zic3', 'TCTCAGCAGGTAGCA'),
             ('Essrb', 'TGACCTTGACCTT'),
             ('Oct4-Sox2/rc', 'TTGTTATGCAAA'),
             ('Oct4/rc', 'ATTTGCATA'),
             ('Oct4-Oct4/rc', 'TATGCATATGCATA'),
             ('Sox2/rc', 'CCATTGTTC'),
             ('Nanog/rc', 'TGATGGCT'),
             ('Nanog-alt/rc', 'AGGAAATGGGCCATC'),
             ('Klf4/rc', 'GGGCGTGG'),
             ('Klf4-long/rc', 'GGGCGGGGCGGGGC'),
             ('B-box/rc', 'CCCGGGTTCGAACCCCGG'),
             ('Zic3/rc', 'TGCTACCTGCTGAGA'),
             ('Essrb/rc', 'AAGGTCAAGGTCA')])

model_dir = models_dir / exp

bpnet = BPNetSeqModel.from_mdir(model_dir)

WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead

2019-08-10 14:51:42,888 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead

WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

2019-08-10 14:51:58,414 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

x_both = bpnet.sim_pred('GAACAATGG', 'TTTGCATAACAA', [650])['profile/Sox2']
x_sox2_far = bpnet.sim_pred('GAACAATGG', 'TTTGCATAACAA', [900])['profile/Sox2']
x_sox = bpnet.sim_pred('', 'TTTGCATAACAA', [650])['profile/Sox2']
x_klf4 = bpnet.sim_pred('GAACAATGG', '', [650])['profile/Sox2']
x_none = bpnet.sim_pred('', '', [650])['profile/Sox2']

plot_tracks({"Klf4": x_both});

plot_tracks({"Klf4": x_sox2_far});

plot_tracks({"Klf4": x_sox});

plot_tracks({"Klf4": x_none});

plot_tracks({"Klf4": x_sox - x_none});

# compute the control bias
x_none = bpnet.sim_pred('')
control_bias = {k.split("/")[1]: x_none[k].mean(axis=-1, keepdims=True)/ x_none[k].mean(axis=-1, keepdims=True).min()
                for k in x_none}
plt.plot(control_bias['Oct4']);

for i in range(510, 650, 10):
    x_nanog = bpnet.sim_pred('AGCCATCA', 'AGCCATCA', [i])['profile/Nanog']
    plot_tracks({"Nanog": x_nanog / control_bias['Nanog']});
    plt.ylim([0, 0.4])

for i in range(300, 700, 50):
    x_nanog = bpnet.sim_pred('', 'AGCCATCA', [i])['profile/Nanog']
    plot_tracks({"Nanog": x_nanog / control_bias['Nanog']});
    plt.ylim([0, 0.4])

dfp = pd.DataFrame(
    {"avg-max": [x_both[[491, 502], [0, 1]].mean(),
                 x_sox2_far[[491, 502], [0, 1]].mean(),
             x_sox[[491, 502], [0, 1]].mean(),
                 x_klf4[[491, 502], [0, 1]].mean(),
             x_none[[491, 502], [0, 1]].mean()],
     "which": ['both', 'both, sox2 far', 'sox2 only', 'klf4 only', 'none']})

(ggplot(aes(x='which', y="avg-max"), dfp) + 
 geom_point() + 
 theme_bw() + 
 ylim([0, None])
)

<ggplot: (-9223363275075085003)>

np.argmax(x_both, axis=0)

array([491, 502])

plot_tracks({"Klf4": x_both});

res = generate_sim(bpnet, 'CCACGCCC', 'GAACAATGG', list(range(511, 511 + 10, 1)),
            center_coords=center_coords,
            repeat=repeat,
            correct=True,  # Whether to correct
            importance=[])

100%|██████████| 10/10 [00:05<00:00,  1.94it/s]

df_d, res_dict = res

df_d.head()

df_d.query("task == 'Klf4'").plot("distance", "profile/counts_max_ref", )

<matplotlib.axes._subplots.AxesSubplot at 0x7f80246dfb70>

df_d.query("task == 'Klf4'").plot("distance", "profile/counts_max_ref_frac", )

<matplotlib.axes._subplots.AxesSubplot at 0x7f80244b8128>

df_d.head()

In silico instances¶

!ls {ddir}/cache/chipnexus/{exp}/simulation/

spacing;correct=True.pkl  spacing;correct=True.pkl.bak	spacing.pkl

# corrected
sim_df_d_norm, sim_res_dict_d_norm = read_pkl(f"{ddir}/cache/chipnexus/{exp}/simulation/spacing;correct=True.pkl")

# Non-corrected
sim_df_d, sim_res_dict_d = read_pkl(f"{ddir}/cache/chipnexus/{exp}/simulation/spacing.pkl")

from basepair.plot.tracks import plot_track

def plot_track_rev(arr, ax, legend=False, ylim=None, color=None, alpha=None, track=None):
    """Plot a track
    """
    seqlen = len(arr)
    if arr.ndim == 1 or arr.shape[1] == 1:
        # single track
        if color is not None:
            if isinstance(color, collections.Sequence):
                color = color[0]
        ax.plot(np.arange(1, seqlen + 1), np.ravel(arr), color=color)
    elif arr.shape[1] == 4:
        # plot seqlogo
        seqlogo(arr, ax=ax)
    elif arr.shape[1] == 2:
        # plot both strands
        if color is not None:
            assert isinstance(color, collections.Sequence)
            c1 = color[0]
            c2 = color[1]
        else:
            c1, c2 = None, None
            
        if alpha is not None:
            assert isinstance(alpha, collections.Sequence)
            a1 = alpha[0]
            a2 = alpha[1]
        else:
            a1, a2 = None, None
            
        ax.plot(np.arange(1, seqlen + 1), arr[:, 0], label='pos', color=c1, alpha=a1)
        ax.plot(np.arange(1, seqlen + 1), -arr[:, 1], label='neg', color=c2, alpha=a2)
        if legend:
            ax.legend()
    else:
        raise ValueError(f"Don't know how to plot array with shape[1] != {arr.shape[1]}. Valid values are: 1,2 or 4.")
    if ylim is not None:
        ax.set_ylim(ylim)

motif_pairs_sim = [
    ('Nanog', 'Nanog'),
    ('Nanog', 'Sox2'),
    ('Sox2', 'Sox2'),
    ('Sox2', 'Oct4-Sox2'),
    ('Nanog', 'Oct4-Sox2'),
    ('Klf4', 'Oct4-Sox2'),
]
profile_mapping

distance_range = np.arange(11, 161)

def get_motif_pair(data, pair, profile_mapping, dist):
    dist_val, profiles = data[pair[0]][pair[1]][1][dist - 11]
    assert dist + 500 == dist_val
    pair_track_names = (profile_mapping[pair[0]], profile_mapping[pair[1]])
    return {tn: profiles['profile'][tn]
            for tn in pair_track_names}

# compute the maximal values across all the possible situations
max_vals = {}
for dist in distance_range:
    for motif_pair in motif_pairs_sim:
        profiles = get_motif_pair(sim_res_dict_d, motif_pair, profile_mapping, dist)
        for k,v in profiles.items():
            if v.max() > max_vals.get(k, 0):
                max_vals[k] = v.max()

# compute the control bias
x_none = bpnet.sim_pred('')
control_bias = {k.split("/")[1]: x_none[k].mean(axis=-1, keepdims=True)/ x_none[k].mean(axis=-1, keepdims=True).max()
                for k in x_none}

distance_range

array([ 11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
        29,  30, ..., 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160])

!mkdir -p {fdir}/gifs

len(motif_pairs_sim)

6

from celluloid import Camera
from IPython.display import HTML, Image
from matplotlib.ticker import FuncFormatter
from basepair.plot.tracks import *

# motif_pair = ('Nanog', 'Nanog')

for motif_pair in tqdm(motif_pairs_sim):
    
    motif_pair_name = "<>".join(motif_pair)
    
    max_vals = {}
    max_val_pos = {}
    for dist in distance_range:
        profiles = get_motif_pair(sim_res_dict_d, motif_pair, profile_mapping, dist)
        profiles = {k:v/control_bias[k] for k,v in profiles.items()}
        for k,v in profiles.items():
            if v.max() > max_vals.get(k, 0):
                max_vals[k] = v.max()
                max_val_pos[k] = dist

    
    fontsize = 10
    shift = 50

    fig, axes = plt.subplots(len(profiles), 1,
                             figsize=(6, 1.5 * len(profiles)),
                             sharex=True)
    if len(profiles) == 1:
        axes = [axes]
    camera = Camera(fig)

    # start from the outside in
    for i, distance in enumerate(reversed([11, 11] + list(range(11, 31, 1)) + list(range(31, 41, 2)) + list(range(41, 61, 4)) + list(range(61, 160, 8)))):

        # TODO - plot different distances

        profiles = get_motif_pair(sim_res_dict_d, motif_pair, profile_mapping, distance)
        profiles = {k:v/control_bias[k] for k,v in profiles.items()}  # normalize
        tracks = filter_tracks(profiles, xlim=[500 - shift, 500-shift + 250])

        # setup ylim
        ylim = [(-max_vals[k], max_vals[k]) for k in profiles]

        # plot the tracks
        for i, (ax, (track, arr)) in enumerate(zip(axes, get_items(tracks))):
            yl = get_list_value(ylim, i)
            plot_track_rev(arr, ax, legend=False, ylim=yl,
                           color=(tf_colors[track], tf_colors[track]),
                           alpha=(1, 0.5),
                           track=track)

            ax.set_ylabel(track)
            simple_yaxis_format(ax)
            if i != len(tracks) - 1:
                ax.xaxis.set_ticks_position('none')

        # add ticks to the final axis
        ax.xaxis.set_major_locator(ticker.AutoLocator())
        fig.subplots_adjust(hspace=0)



        vh = 0.33
        
        fig.axes[0].axvline(x=0+shift, ymin=.5 - vh/2, ymax=.5 + vh/2, color='gray', alpha=0.5)
        fig.axes[0].text(x=.95, y=0, s=f"Motif distance ={distance:3d}", 
                         transform=fig.axes[0].transAxes,
                         fontsize=fontsize,
                         verticalalignment='bottom', 
                         horizontalalignment='right')
        fig.axes[0].text(x=0+shift, y=ylim[0][1]*.9, s=motif_pair[0], 
                         style='italic',
                         fontsize=fontsize,
                         verticalalignment='top', 
                         horizontalalignment='left')
        fig.axes[-1].axvline(x=distance+shift, ymin=.5 - vh/2, ymax=.5 + vh/2, color='gray', alpha=0.5)
        fig.axes[-1].text(x=distance+shift, y=ylim[-1][1]*.9, s=motif_pair[1], 
                         style='italic',
                          fontsize=fontsize,
                         verticalalignment='top', 
                         horizontalalignment='left')

        ax.set_xlabel("Position [bp]")
        fig.axes[-1].xaxis.set_major_formatter(FuncFormatter(lambda val,pos: int(val)-shift))
        sns.despine(top=True, bottom=True, right=True)    
        plt.tight_layout()

        # ---------------------------------
        camera.snap()

    animation = camera.animate()
    animation.save(f'{fdir}/gifs/{motif_pair_name}.gif', writer='imagemagick', fps=5, dpi=300)

  0%|          | 0/6 [00:00<?, ?it/s]2019-08-10 15:46:42,442 [INFO] Animation.save using <class 'matplotlib.animation.ImageMagickWriter'>
2019-08-10 15:46:42,443 [INFO] Disabling savefig.bbox = 'tight', as it may cause frame size to vary, which is inappropriate for animation.
2019-08-10 15:46:42,445 [INFO] MovieWriter.run: running command: ['convert', '-size', '1800x450', '-depth', '8', '-delay', '20.0', '-loop', '0', 'rgba:-', '/users/avsec/workspace/basepair/data/figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/spacing/gifs/Nanog<>Nanog.gif']
 17%|█▋        | 1/6 [00:19<01:37, 19.46s/it]2019-08-10 15:47:03,775 [INFO] Animation.save using <class 'matplotlib.animation.ImageMagickWriter'>
2019-08-10 15:47:03,778 [INFO] Disabling savefig.bbox = 'tight', as it may cause frame size to vary, which is inappropriate for animation.
2019-08-10 15:47:03,779 [INFO] MovieWriter.run: running command: ['convert', '-size', '1800x900', '-depth', '8', '-delay', '20.0', '-loop', '0', 'rgba:-', '/users/avsec/workspace/basepair/data/figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/spacing/gifs/Nanog<>Sox2.gif']
 33%|███▎      | 2/6 [00:51<01:33, 23.35s/it]2019-08-10 15:47:35,085 [INFO] Animation.save using <class 'matplotlib.animation.ImageMagickWriter'>
2019-08-10 15:47:35,087 [INFO] Disabling savefig.bbox = 'tight', as it may cause frame size to vary, which is inappropriate for animation.
2019-08-10 15:47:35,088 [INFO] MovieWriter.run: running command: ['convert', '-size', '1800x450', '-depth', '8', '-delay', '20.0', '-loop', '0', 'rgba:-', '/users/avsec/workspace/basepair/data/figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/spacing/gifs/Sox2<>Sox2.gif']
 50%|█████     | 3/6 [01:12<01:07, 22.41s/it]2019-08-10 15:47:55,773 [INFO] Animation.save using <class 'matplotlib.animation.ImageMagickWriter'>
2019-08-10 15:47:55,774 [INFO] Disabling savefig.bbox = 'tight', as it may cause frame size to vary, which is inappropriate for animation.
2019-08-10 15:47:55,775 [INFO] MovieWriter.run: running command: ['convert', '-size', '1800x900', '-depth', '8', '-delay', '20.0', '-loop', '0', 'rgba:-', '/users/avsec/workspace/basepair/data/figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/spacing/gifs/Sox2<>Oct4-Sox2.gif']
 67%|██████▋   | 4/6 [01:40<00:48, 24.26s/it]2019-08-10 15:48:24,776 [INFO] Animation.save using <class 'matplotlib.animation.ImageMagickWriter'>
2019-08-10 15:48:24,779 [INFO] Disabling savefig.bbox = 'tight', as it may cause frame size to vary, which is inappropriate for animation.
2019-08-10 15:48:24,780 [INFO] MovieWriter.run: running command: ['convert', '-size', '1800x900', '-depth', '8', '-delay', '20.0', '-loop', '0', 'rgba:-', '/users/avsec/workspace/basepair/data/figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/spacing/gifs/Nanog<>Oct4-Sox2.gif']
 83%|████████▎ | 5/6 [02:09<00:25, 25.74s/it]2019-08-10 15:48:54,506 [INFO] Animation.save using <class 'matplotlib.animation.ImageMagickWriter'>
2019-08-10 15:48:54,508 [INFO] Disabling savefig.bbox = 'tight', as it may cause frame size to vary, which is inappropriate for animation.
2019-08-10 15:48:54,509 [INFO] MovieWriter.run: running command: ['convert', '-size', '1800x900', '-depth', '8', '-delay', '20.0', '-loop', '0', 'rgba:-', '/users/avsec/workspace/basepair/data/figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/spacing/gifs/Klf4<>Oct4-Sox2.gif']
100%|██████████| 6/6 [02:40<00:00, 27.24s/it]

TODO¶

[X] normalize all the profiles to the same y axis across all motif pairs
[X] shall we show always one motif in the center while the other one gets closer?
- let's show the influencee in the center
[X] compile a list of all motif pairs to show
- influencee in the center and the other one on the side
[X] for each distance and motif get the motif location -> just show motif center
[X] show the motif location as a vertical line + add the motif label at the corresponding site
[X] use appropriate colors for the motif
[X] compile the animation

!ls {ddir}/cache/chipnexus/{exp}/simulation/

spacing;correct=True.pkl  spacing;correct=True.pkl.bak	spacing.pkl

# cache_path = f"{ddir}/cache/chipnexus/{exp}/simulation/spacing.pkl"

# sim_df_d, sim_res_dict_d = read_pkl(cache_path)

from basepair.exp.chipnexus.spacing import comp_strand_compbination


def swap_orientation(o):
    if o=='+':
        return "-"
    if o=="-":
        return "+"
    raise ValueError("")

# Very much a hacked function to map simulation to in vivo data
def get_xy_sim_single(sim_df_d, motif_pair, feature, orientation, profile_mapping):
    # For Nanog, always explicilty swap the orientation
    orientation_pair = [orientation[0], orientation[1]]
    # HACK! Nanog orientation didn't properly match the orientation
    #if motif_pair[0] == "Nanog":
    #    orientation_pair[0] = swap_orientation(orientation_pair[0])
    #if motif_pair[1] == "Nanog":
    #    orientation_pair[1] = swap_orientation(orientation_pair[1])

    mp = list(deepcopy(motif_pair))
    if orientation_pair[0] == "-":
        mp[0] = mp[0] + "/rc"
    if orientation_pair[1] == "-":
        mp[1] = mp[1] + "/rc"
        
    df = sim_df_d[mp[0]]  # choose the central motif
    df = df[df.motif == mp[1]]  # choose the side motif
    
    df = df[df.distance < 150]
    
    # select the task
    df = df[df.task == profile_mapping[motif_pair[0]]]        
    return df.distance.values, df[feature].values
    
def get_xy_sim(sim_df_d, motif_pair, feature, orientation, profile_mapping):
    x1,y1 = get_xy_sim_single(sim_df_d, motif_pair, feature, orientation, profile_mapping)
    x2,y2 = get_xy_sim_single(sim_df_d, list(reversed(motif_pair)), feature, orientation, profile_mapping)
    # x2,y2 = get_xy_sim_single(sim_df_d, list(reversed(motif_pair)), feature, comp_strand_compbination[orientation], profile_mapping)
    # assert np.all(x1 == x2)
    # return x1, y1, y2
    return x1, y1, y2

# motif_pair = ['Oct4-Sox2', 'Sox2']
# x, y, y2 = get_xy_sim(sim_df_d, motif_pair, 'profile/counts_frac', '++', profile_mapping)

pairs = get_motif_pairs(motifs)

tested_motif_pairs = [p for p in pairs if p[0] in sim_df_d_norm and p[1] in sim_df_d_norm]

# fig, axes = plt.subplots(len(tested_motif_pairs), 2, figsize=get_figsize(.5, len(tested_motif_pairs)/3), 
#                          sharex=True, 
#                          sharey=True,
#                          gridspec_kw=dict(wspace=0, hspace=0))
# # motif_pair = ['Oct4-Sox2', 'Sox2']
# for i, motif_pair in enumerate(tested_motif_pairs):
#     x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', '++', profile_mapping)
#     ax = axes[i, 0]
#     ax.plot(x, y, label=motif_pair[0]);
#     ax.plot(x, y2, label=motif_pair[1]);
#     if i == 0:
#         ax.set_title("Bleed-through corrected")
#     ax.axhline(1, color='grey', alpha=0.5)
#     ax.set_ylabel("\n".join(motif_pair))
#     # plt.legend()

#     ax = axes[i, 1]
#     x, y, y2 = get_xy_sim(sim_df_d, motif_pair, 'profile/counts_max_ref_frac', '++', profile_mapping)
#     ax.plot(x, y, label=motif_pair[0]);
#     ax.plot(x, y2, label=motif_pair[1]);
#     if i == 0:
#         ax.set_title("Un-corrected")
#     ax.axhline(1, color='grey', alpha=0.5)
#     ax.legend()
# axes[-1, 0].set_xlabel("Pairwise distance")
# axes[-1, 0].set_xticks([10, 50, 100, 150]);
# fig.savefig(fdir / 'simulated.spacing.all-11.pdf')

Generate per-motif plots¶

ymax = 5
columns = 4
plot_pairs = get_motif_pairs(['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4'])
rows = len(plot_pairs)
fig, axes = plt.subplots(rows, columns, figsize=get_figsize(.7, 1/5*len(plot_pairs)), 
                         sharex=True, 
                         sharey=True,
                         gridspec_kw=dict(wspace=0, hspace=0))

for j, motif_pair in enumerate(plot_pairs):
    for i, strand_orientation in enumerate(['++', '--', '+-', '-+']):
        x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', strand_orientation, profile_mapping)
        ax = axes[j, i]
        ax.plot(x, y, label=motif_pair[0], color=tf_colors[profile_mapping[motif_pair[0]]]);
        ax.plot(x, y2, label=motif_pair[1], color=tf_colors[profile_mapping[motif_pair[1]]], 
                linestyle='dashed' if motif_pair[0] == motif_pair[1] else 'solid');  # only dashed for homotypic interactions
        ax.set_ylim([0, ymax])
        ax.axhline(1, color='grey', alpha=0.5)
        if j == 0:
            ax.set_title(strand_orientation)
            if i ==0 :
                ax.set_ylabel("Max profile value")
            
        if i == 3:
            ax.legend()
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        if i % rows == rows-1:
            axes[i].set_xticks([10, 50, 100, 150]);
ax.set_xlabel("Motif distance")
motif_pair_name = '<>'.join(motif_pair)
fig.savefig(fdir / f'all-pairs.bleed-through-corrected.strand-specific.pdf')

a=1

ymax = 10
columns = 4
rows = 1

for j, motif_pair in enumerate(tested_motif_pairs):
    fig, axes = plt.subplots(rows, columns, figsize=get_figsize(.7, 1/3.5), 
                             sharex=True, 
                             sharey=True,
                             gridspec_kw=dict(wspace=0, hspace=0))
    for i, strand_orientation in enumerate(['++', '--', '+-', '-+']):
        x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', strand_orientation, profile_mapping)
        ax = axes[i]
        ax.plot(x, y, label=motif_pair[0], color=tf_colors[profile_mapping[motif_pair[0]]]);
        ax.plot(x, y2, label=motif_pair[1], color=tf_colors[profile_mapping[motif_pair[1]]], linestyle='dashed');
        ax.set_ylim([0, ymax])
        #if i == 0:
        ax.set_title(strand_orientation)
        ax.axhline(1, color='grey', alpha=0.5)
        if i == 0:
            ax.set_ylabel("Max profile value")
        if i == 3:
            ax.legend()
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        if i % rows == rows-1:
            axes[i].set_xticks([10, 50, 100, 150]);
    axes[1].set_xlabel("Motif distance")
    motif_pair_name = '<>'.join(motif_pair)
    fig.savefig(fdir_individual_sim / f'{motif_pair_name}.bleed-through-corrected.pdf')

strand_orientation = "--"
ymax = 10
columns = 6
rows = len(tested_motif_pairs)//columns
fig, axes = plt.subplots(rows, columns, figsize=get_figsize(1.1, 1/3.5/9*len(tested_motif_pairs)), 
                         sharex=True, 
                         sharey=True,
                         gridspec_kw=dict(wspace=0, hspace=0))
# motif_pair = ['Oct4-Sox2', 'Sox2']
for i, motif_pair in enumerate(tested_motif_pairs):
    x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', strand_orientation, profile_mapping)
    ax = axes[i%rows, i//rows]
    ax.plot(x, y, label=motif_pair[0], color=tf_colors[profile_mapping[motif_pair[0]]]);
    ax.plot(x, y2, label=motif_pair[1], color=tf_colors[profile_mapping[motif_pair[1]]], linestyle='dashed');
    ax.set_ylim([0, ymax])
    if i == 0:
        ax.set_title("Bleed-through corrected")
    ax.axhline(1, color='grey', alpha=0.5)
    if i == 1:
        ax.set_ylabel("Max profile value")
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    if i % rows == rows-1:
        axes[1, i // rows].set_xticks([10, 50, 100, 150]);
axes[-1, 2].set_xlabel("Motif distance")
fig.savefig(fdir / f'simulated.spacing.all.corrected-only.all-11-motifs.{strand_orientation}.pdf')

columns = 6
rows = len(tested_motif_pairs)//columns
fig, axes = plt.subplots(rows, columns, figsize=get_figsize(1.1, 1/3.5/9*len(tested_motif_pairs)), 
                         sharex=True, 
                         sharey=True,
                         gridspec_kw=dict(wspace=0, hspace=0))
# motif_pair = ['Oct4-Sox2', 'Sox2']
for i, motif_pair in enumerate(tested_motif_pairs):
    x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', '++', profile_mapping)
    ax = axes[i%rows, i//rows]
    ax.plot(x, y, label=motif_pair[0], color=tf_colors[profile_mapping[motif_pair[0]]]);
    ax.plot(x, y2, label=motif_pair[1], color=tf_colors[profile_mapping[motif_pair[1]]], linestyle='dashed');
    if i == 0:
        ax.set_title("Bleed-through corrected")
    ax.axhline(1, color='grey', alpha=0.5)
    if i == 1:
        ax.set_ylabel("Max profile value")
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    if i % rows == rows-1:
        axes[1, i // rows].set_xticks([10, 50, 100, 150]);
axes[-1, 2].set_xlabel("Motif distance")
fig.savefig(fdir / 'simulated.spacing.all.corrected-only.all-11-motifs.pdf')

sim_df_d_norm['Oct4-Sox2'].motif.unique()

array(['Oct4-Sox2', 'Oct4', 'Oct4-Oct4', 'Sox2', 'Nanog', 'Nanog-alt', 'Klf4', 'Klf4-long',
       'B-box', 'Zic3', 'Essrb', 'Oct4-Sox2/rc', 'Oct4/rc', 'Oct4-Oct4/rc', 'Sox2/rc', 'Nanog/rc',
       'Nanog-alt/rc', 'Klf4/rc', 'Klf4-long/rc', 'B-box/rc', 'Zic3/rc', 'Essrb/rc'], dtype=object)

columns = 6
rows = len(tested_motif_pairs)//columns
fig, axes = plt.subplots(rows, columns, figsize=get_figsize(1.1, 1/3.5/9*len(tested_motif_pairs)), 
                         sharex=True, 
                         sharey=True,
                         gridspec_kw=dict(wspace=0, hspace=0))
# motif_pair = ['Oct4-Sox2', 'Sox2']
for i, motif_pair in enumerate(tested_motif_pairs):
    x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', '++', profile_mapping)
    ax = axes[i%rows, i//rows]
    ax.set_ylim([0, 5])
    ax.plot(x, y, label=motif_pair[0], color=tf_colors[profile_mapping[motif_pair[0]]]);
    ax.plot(x, y2, label=motif_pair[1], color=tf_colors[profile_mapping[motif_pair[1]]], linestyle='dashed');
    if i == 0:
        ax.set_title("Bleed-through corrected")
    ax.axhline(1, color='grey', alpha=0.5)
    if i == 1:
        ax.set_ylabel("Max profile value")
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    if i % rows == rows-1:
        axes[1, i // rows].set_xticks([10, 50, 100, 150]);
axes[-1, 2].set_xlabel("Motif distance")
fig.savefig(fdir / 'simulated.spacing.all.corrected-only.all-11-motifs.ylim=5.pdf')

sim_df_d_norm['Nanog'].query('motif == "Nanog"').query('task == "Nanog"').sort_values("profile/counts_max_ref_frac").head(n=4)

from basepair.exp.chipnexus.simulate import generate_seq

generate_seq('AGCCATCA', 'AGCCATCA', side_distances=[50 + 11], seqlen=100)

'AATTAATTCCGACGTCACGAGTGGTAGCCTTCGTCGTATCGACCAGAGCCATCAATAAGCCATCATTTCGTCACTGATGTGCGGGGCGTGAGGGGGATTG'

# Best sequence: AGCCATCANNNAGCCATCA

# Worst sequence: AGCCATCANNNNNNAGCCATCA

# motif_pair = ['Oct4-Sox2', 'Sox2']
for i, motif_pair in enumerate(tested_motif_pairs):
    fig, ax = plt.subplots(1, 1, figsize=get_figsize(.25, .618))
    x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', '++', profile_mapping)
    ax.plot(x, y, label=motif_pair[0]);
    ax.plot(x, y2, label=motif_pair[1]);
    if i == 0:
        ax.set_title("Bleed-through corrected")
    ax.axhline(1, color='grey', alpha=0.5)
    ax.set_title("<>".join(motif_pair))
    # plt.legend()
    ax.legend()
    ax.set_xlabel("Pairwise distance")
    ax.set_ylabel("Counts at max profile")
    ax.set_xticks([10, 50, 100, 150]);
    motif_pair_name = '<>'.join(motif_pair)
    fig.savefig(fdir_individual_sim / f'{motif_pair_name}.bleed-through-corrected.pdf')

Group-by differnet plots¶

tested_motif_pairs

[['Oct4-Sox2', 'Oct4-Sox2'],
 ['Oct4-Sox2', 'Sox2'],
 ['Oct4-Sox2', 'Nanog'],
 ['Oct4-Sox2', 'Klf4'],
 ['Sox2', 'Sox2'],
 ['Sox2', 'Nanog'],
 ['Sox2', 'Klf4'],
 ['Nanog', 'Nanog'],
 ['Nanog', 'Klf4'],
 ['Klf4', 'Klf4']]

for strand in ['++', '-+', '+-', '--']:
    fig, axes = plt.subplots(3, 1, figsize=get_figsize(.25, 2), 
                             sharex=True, 
                             sharey=True,
                             gridspec_kw=dict(wspace=0, hspace=0))
    # motif_pair = ['Oct4-Sox2', 'Sox2']
    for i, motif_pair in enumerate([['Nanog', 'Nanog'], 
                                    ['Sox2', 'Nanog'],
                                    ['Sox2', 'Sox2']]):
        x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', strand, profile_mapping)
        ax = axes[i]
        ax.plot(x, y, label=motif_pair[0], color=tf_colors[profile_mapping[motif_pair[0]]]);
        ax.plot(x, y2, label=motif_pair[1], color=tf_colors[profile_mapping[motif_pair[1]]]);
        if i == 0:
            ax.set_title(f"Short-range interactions {strand}")
        ax.axhline(1, color='grey', alpha=0.5)
        if i == 1:
            ax.set_ylabel("Max profile value")
        ax.legend()
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
    axes[-1].set_xlabel("Motif distance")
    axes[-1].set_xticks([10, 50, 100, 150]);
    fig.savefig(fdir / f'simulated.short-range.Nanog<>Sox2.strand_orientation={strand}.pdf')

for strand in ['++', '-+', '+-', '--']:

    fig, axes = plt.subplots(3, 1, figsize=get_figsize(.25, 2), 
                             sharex=True, 
                             sharey=True,
                             gridspec_kw=dict(wspace=0, hspace=0))
    # motif_pair = ['Oct4-Sox2', 'Sox2']
    for i, motif_pair in enumerate([['Oct4-Sox2', 'Sox2'], 
                                    ['Oct4-Sox2', 'Nanog'],
                                    ['Oct4-Sox2', 'Klf4']]):
        x, y, y2 = get_xy_sim(sim_df_d_norm, motif_pair, 'profile/counts_max_ref_frac', strand, profile_mapping)
        ax = axes[i]
        ax.plot(x, y, label=motif_pair[0], color=tf_colors[profile_mapping[motif_pair[0]]]);
        ax.plot(x, y2, label=motif_pair[1], color=tf_colors[profile_mapping[motif_pair[1]]]);
        if i == 0:
            ax.set_title(f"Long-range interactions {strand}")
        ax.axhline(1, color='grey', alpha=0.5)
        if i == 1:
            ax.set_ylabel("Max profile value")
        ax.legend()
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
    axes[-1].set_xlabel("Motif distance")
    axes[-1].set_xticks([10, 50, 100, 150]);
    fig.savefig(fdir / f'simulated.long-range.Oct4-Sox2.strand_orientation={strand}.pdf')

	central_motif	distance	position	profile/counts	profile/counts_frac	profile/counts_max_ref	profile/counts_max_ref_frac	profile/max	profile/max_frac	profile/simmetric_kl	side_motif	task
0	CCACGCCC	11	511	2.4152	1.0093	0.1462	1.0630	0.1012	1.1217	0.0090	GAACAATGG	Oct4
1	CCACGCCC	11	511	1.3362	0.9967	0.0782	1.1154	0.0488	1.1868	inf	GAACAATGG	Sox2
2	CCACGCCC	11	511	6.2370	0.7995	0.3073	0.8337	0.2088	0.9910	inf	GAACAATGG	Nanog
3	CCACGCCC	11	511	7.4679	0.9960	0.4751	0.9686	0.2822	0.9970	0.0069	GAACAATGG	Klf4
4	CCACGCCC	12	512	2.6429	1.1045	0.1806	1.3127	0.1234	1.3678	0.0093	GAACAATGG	Oct4

	central_motif	distance	position	profile/counts	profile/counts_frac	profile/counts_max_ref	profile/counts_max_ref_frac	profile/max	profile/max_frac	profile/simmetric_kl	side_motif	task
0	CCACGCCC	11	511	0.0174	0.8210	0.0753	1.2025	0.0632	1.1742	NaN	GAACAATGG	Oct4
1	CCACGCCC	11	511	-0.0048	-0.2977	0.0396	1.4784	0.0295	1.4803	NaN	GAACAATGG	Sox2
2	CCACGCCC	11	511	-2.1797	0.9325	0.0977	1.8840	0.0734	1.5385	NaN	GAACAATGG	Nanog
3	CCACGCCC	11	511	4.5029	0.9074	0.3883	0.9113	0.2309	0.9172	0.0182	GAACAATGG	Klf4
4	CCACGCCC	12	512	0.6436	-0.2751	0.1081	-15.0077	0.1023	5.3914	NaN	GAACAATGG	Oct4

	central_motif	distance	position	profile/counts	profile/counts_frac	profile/counts_max_ref	profile/counts_max_ref_frac	profile/max	profile/max_frac	profile/simmetric_kl	side_motif	task	motif
14	AGCCATCA	14	514	14.3918	0.6976	1.1614	0.6562	0.7731	0.8424	inf	AGCCATCA	Nanog	Nanog
150	AGCCATCA	48	548	18.1084	0.8778	1.4260	0.8057	0.8109	0.8836	0.0093	AGCCATCA	Nanog	Nanog
22	AGCCATCA	16	516	15.8471	0.7682	1.4684	0.8297	0.7590	0.8270	inf	AGCCATCA	Nanog	Nanog
190	AGCCATCA	58	558	17.6948	0.8578	1.4768	0.8344	0.7723	0.8415	0.0061	AGCCATCA	Nanog	Nanog