Goal

  • analyze the pairwise features for Nanog

Tasks

  • [x] remove duplicates
  • [ ] get an overview over the instances
    • [ ] global overview
      • # instances per sequence
    • [ ] select a few promising seqlets
    • [ ] local overview
      • positional distribution within the sequence
      • match distribution (natural, p-scale)
      • importance distribution (natural, p-scale)
In [1]:
from basepair.imports import *
from basepair.plot.profiles import extract_signal
from basepair.math import softmax
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
from basepair.plot.profiles import  plot_stranded_profile, multiple_plot_stranded_profile
import plotnine
from plotnine import *
import statsmodels.api as sm
import statsmodels.formula.api as smf
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.modisco.core import dfi2seqlets, annotate_profile
from basepair.cli.modisco import load_profiles

import dask.dataframe as dd
import hvplot.pandas
import holoviews as hv

hv.extension('bokeh', 'matplotlib')
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
2018-10-24 03:58:19,799 [WARNING] doc empty for the `info:` field
In [3]:
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / "modisco/all/profile/"

mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()

# Load the data
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()
In [10]:
# specify motifs to use in the analysis
motifs = OrderedDict([
    ("Oct4-Sox2", "m0_p0"),
    ("Oct4-Sox2-deg", "m6_p8"),
    ("Oct4", "m0_p18"),
    ("Sox2", "m0_p1"),
    ("Essrb", "m0_p2"),
    ("Nanog", "m0_p3"),
    ("Nanog-periodic", "m0_p9"),
    ("Klf4", "m2_p0"),
])
In [168]:
# plot them
for tf, p in motifs.items():
    mr.get_pattern(longer_pattern(p)).trim_seq_ic(0.08).resize(40).plot("seq");
    plt.title(tf)

Load instances

In [6]:
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix
In [7]:
dfi = load_instances(f"{modisco_dir}/instances.parq", motifs)

dfi = filter_nonoverlapping_intervals(dfi)

total_examples = len(dfi.example_idx.unique())
total_examples
In [23]:
# pattern lengths
dfi.groupby(['pattern_name', 'pattern_len']).size()
Out[23]:
pattern_name    pattern_len
Essrb           16             247382
Klf4            10             250910
Nanog           10             287074
Nanog-periodic  52             964426
Oct4            39              46795
Oct4-Sox2       16             244433
Oct4-Sox2-deg   46             180267
Sox2            9              365894
dtype: int64

General stats on the scores

  • Total number of instances
  • instances per sequence

Whole 1kb region

In [28]:
# instances per (pattern, match category, importance category)
n_patterns = dfi.groupby(['pattern_name', 'match_weighted_cat', 'imp_weighted_cat']).size()
# NOTE(review): `.size()` returns a Series, so `rename(columns=...)` has no effect
# here (newer pandas versions even raise). A `.reset_index()` first — as done for
# `n_patterns2` further down — would make the rename effective. Confirm intended.
n_patterns = n_patterns.rename(columns={0: "n_instances"})

n_patterns.hvplot.bar('imp_weighted_cat', by='pattern_name',groupby='match_weighted_cat', rot=90)
Out[28]:

Central 200bp region

In [30]:
# same overview restricted to the central 200 bp of the 1 kb sequence
n_patterns = dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].groupby(['pattern_name', 'match_weighted_cat', 'imp_weighted_cat']).size()
# NOTE(review): `.size()` returns a Series, so `rename(columns=...)` has no effect;
# `.reset_index()` first would be needed for the rename to apply. Confirm intended.
n_patterns = n_patterns.rename(columns={0: "n_instances"})

n_patterns.hvplot.bar('imp_weighted_cat', by='pattern_name',groupby='match_weighted_cat', rot=90)
Out[30]:

Simple co-occurrence analysis

In [31]:
from upsetplot import plot as upsetplt
In [32]:
#.query('imp_weighted_cat=="high"')
counts = pd.pivot_table(dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_cat=="high"').query('imp_weighted_cat=="high"'), 
                        'pattern_len', "example_idx", "pattern_name", aggfunc=len, fill_value=0)

c = counts >0
In [33]:
upsetplt(c.groupby(list(c.columns)).size().sort_values(ascending=False)[:50],  sort_by='cardinality');
In [34]:
m = counts[['Nanog', 'Oct4-Sox2']]
m[m.all(1)].sum(1).value_counts().plot.bar()
plt.title("Nanog-Sox2")
plt.xlabel("Number of motifs in the center >=2");
  • let's test for association

imp_weighted_cat='high'

100bp

In [37]:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 450) & (dfi.pattern_center < 550)].query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'), total_examples)

high match

In [38]:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 450) & (dfi.pattern_center < 550)].query('match_weighted_cat=="high"').query('imp_weighted_cat=="high"'), total_examples)

200bp

In [39]:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'), total_examples)

high match

In [40]:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_cat=="high"').query('imp_weighted_cat=="high"'), total_examples)

Whole region

In [49]:
plot_coocurence_matrix(dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'), total_examples)

imp_weighted_cat='high', 'medium'

100bp

In [44]:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 450) & (dfi.pattern_center < 550)].query('match_weighted_cat!="low"').query('imp_weighted_p>0.33'), total_examples)

200bp

In [53]:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_cat!="low"').query('imp_weighted_p>0.33'), total_examples)

Whole region

In [48]:
plot_coocurence_matrix(dfi.query('match_weighted_cat!="low"').query('imp_weighted_p>0.33'), total_examples)

Analyzing seqlet context

Read seqlets from modisco

In [64]:
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale, preproc_df
In [ ]:
# TODO - speedup using IntervalIndex .. .loc[motif_center...]
In [54]:
# Read seqlets from modisco
df_seqlets = mr.seqlet_df_instances(trim_frac=0.08)
df_seqlets = df_seqlets.rename(columns=dict(seqname="example_idx", start='seqlet_start', end='seqlet_end', strand='seqlet_strand', center='seqlet_center'))
df_seqlets['pattern'] = df_seqlets['pattern'].map({k: shorten_pattern(k) for k in df_seqlets.pattern.unique()})
del df_seqlets['name']
In [55]:
# add the `cluster,cluster_order`
dfc = pd.read_csv("http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/motif_clustering/nte_seq.csv")
df_seqlets = df_seqlets.merge(dfc[['pattern', 'cluster', 'cluster_order']], on='pattern')
In [58]:
def non_overlapping(x, df_seqlets):
    """Drop instances in `x` that lie within 15 bp of a matching modisco seqlet.

    Args:
        x: instance table of a single (pattern_short, example_idx) group,
            with a `pattern_center` column.
        df_seqlets: modisco seqlet table with `pattern`, `example_idx` and
            `seqlet_center` columns.

    Returns:
        Subset of `x` whose `pattern_center` is more than 15 bp away from every
        seqlet of the same pattern in the same example. `x` is returned
        unchanged when it is empty or when no seqlets match.
    """
    if len(x) == 0:
        return x
    same_pattern = df_seqlets.pattern == x['pattern_short'].iloc[0]
    same_example = df_seqlets.example_idx == x['example_idx'].iloc[0]
    seqlets = df_seqlets[same_pattern & same_example]
    if len(seqlets) == 0:
        return x
    # pairwise |instance_center - seqlet_center|, then distance to the nearest seqlet
    dist = np.abs(x['pattern_center'].values[:, None] - seqlets.seqlet_center.values[None, :])
    return x[dist.min(axis=1) > 15]
In [61]:
dfi_subset = dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"')

# get rid of overlapping intervals
dfi_subset_subset = dfi_subset.groupby(['pattern_short', 'example_idx']).apply(lambda x: non_overlapping(x, df_seqlets))

counts = pd.pivot_table(dfi_subset_subset, 'pattern_len', "example_idx", "pattern_name", aggfunc=len, fill_value=0)
c = counts >0

df_seqlets_c = c.merge(df_seqlets, on='example_idx')
In [62]:
df_seqlets_c.head(2)
Out[62]:
example_idx Essrb Klf4 Nanog Nanog-periodic Oct4 Oct4-Sox2 Oct4-Sox2-deg Sox2 seqlet_start seqlet_end seqlet_strand pattern seqlet_center cluster cluster_order
0 2 False True True True False False True False 497 513 + m0_p0 505.0 0 86
1 3 False False False False False False True False 499 515 - m0_p0 507.0 0 86
In [63]:
df_seqlets_c_agg = df_seqlets_c.groupby(['cluster', 'cluster_order', 'pattern'])[list(c)].mean()
df_seqlets_c_agg_size = df_seqlets_c.groupby(['cluster', 'cluster_order', 'pattern']).size()
df_seqlets_c_agg = pd.concat([df_seqlets_c_agg, df_seqlets_c_agg_size], axis=1).reset_index().set_index('pattern').rename(columns={0: 'n_seqlets'})

df_seqlets_c_agg = df_seqlets_c_agg.sort_values('cluster_order')

col_anno = df_seqlets_c_agg[['cluster', 'n_seqlets']]
del df_seqlets_c_agg['cluster']
del df_seqlets_c_agg['cluster_order']
del df_seqlets_c_agg['n_seqlets']

col_anno.cluster = pd.Categorical(col_anno.cluster)
In [65]:
# plot
sns.clustermap(df_seqlets_c_agg, 
               row_cluster=False,
               col_cluster=False,
               #col_cluster
               # standard_scale=1,
               row_colors=to_colors(col_anno),
               figsize=(8,25),
               cmap='Blues')
Out[65]:
<seaborn.matrix.ClusterGrid at 0x7f6923060eb8>

Spacing analysis

In [143]:
from basepair.modisco.pattern_instances import construct_motif_pairs
In [163]:
# motif pair whose spacing is analyzed below
motif_pair = ['Oct4-Sox2', 'Sox2']

# central 200 bp, minimally confident matches with positive importance
dfi_filtered = dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_p > 0.2').query('imp_weighted_p > 0')

# NOTE(review): `motif_pairs` is never imported explicitly — the previous cell
# imports `construct_motif_pairs`. Presumably `motif_pairs` leaks in via
# `from basepair.imports import *`; confirm, or call `construct_motif_pairs`.
dftw_filt = motif_pairs(dfi_filtered, motif_pair)

# flatten columns for easier plotting
dftw_filt.set_index('strand_combination', append=True, inplace=True, drop=False)
dftw_filt.columns= dftw_filt.columns.get_level_values(0)+  '/' + dftw_filt.columns.get_level_values(1)

if motif_pair[0] == motif_pair[1]:
    bins=150
    bin_range = (0, 150)
else:
    bins=300
    bin_range = (-150, 150)
(dftw_filt['center_diff/'].hvplot.hist(bins=bins, 
                                       by='strand_combination',
                                       width=700,
                                       height=200,
                                       subplots=True,
                                       bin_range=bin_range).cols(1) + \
dftw_filt.hvplot.scatter(x='center_diff/', y='imp_weighted/'+ motif_pair[0],# 'imp_weighted_p/Sox2'], 
                         by='strand_combination',
                         width=700, height=400, 
                         #width=200, height=200, 
                         subplots=False, 
                         xlim=bin_range,
                         alpha=0.4) + \
dftw_filt.hvplot.scatter(x='center_diff/', y='imp_weighted/'+ motif_pair[1],# 'imp_weighted_p/Sox2'], 
                         by='strand_combination',
                         width=700, height=400, 
                         #width=200, height=200, 
                         subplots=False, 
                         xlim=bin_range,
                         alpha=0.4)
).cols(1)
Out[163]:

Look up the frequent spacing

In [66]:
from pybedtools import Interval
import pybedtools
from genomelake.extractors import FastaExtractor
from basepair.cli.schemas import DataSpec
from basepair.modisco.utils import ic_scale
from basepair.plot.tracks import plot_tracks
from basepair.modisco.core import Pattern
In [119]:
a = dftw_filt.reset_index().query("strand_combination=='-+'")
a = a[a['center_diff/'] == -58]
spaced_example_idx = a.example_idx
print(len(spaced_example_idx))
s = dfi[dfi.example_idx.isin(spaced_example_idx)][['example_idx', 'example_chrom', 'example_start', 'example_end', 'example_interval_from_task']].drop_duplicates()
a = pd.merge(a, s, on='example_idx').sort_values(['example_chrom', 'example_start'])
23
In [120]:
def pair_center(row, motif_pair):
    """Integer midpoint (floor division) between the two motif centers of a pair."""
    center_a = row['pattern_center/' + motif_pair[0]]
    center_b = row['pattern_center/' + motif_pair[1]]
    return (center_a + center_b) // 2

# get the sequences
intervals = [pybedtools.create_interval_from_list([row.example_chrom, row.example_start + pair_center(row, motif_pair) - 35, 
                                                   row.example_start + pair_center(row, motif_pair) + 35, row['example_idx']]) for i,row in a.iterrows()]
ds = DataSpec.load(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml")
fa = FastaExtractor(ds.fasta_file)
seqs = fa(intervals)
In [123]:
# match to the existing pattern
patterns = [mr.get_pattern(p) for p in mr.patterns()]

pwm = seqs.mean(0)
pwm = Pattern('found', pwm, dict(a=pwm), dict(a=pwm))
pwm = pwm.resize(len(patterns[0]))
matches = np.array([pattern.similarity(pwm, track='seq') for pattern in patterns])
In [131]:
# plot
plt.plot(matches)
mid = matches.argmax()
patterns[mid].plot("seq_ic");
pwm.align(patterns[mid]).plot("seq_ic");

Open questions

  • how many times are Nanog and Nanog-periodic actually detecting the same motif?
  • how many times this happens for Oct4-Sox2?
In [1160]:
dfi.head()
Out[1160]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_weighted_p match_weighted_cat ... imp/Sox2 example_chrom example_start example_end example_strand example_interval_from_task pattern_start_abs pattern_end_abs pattern_short TF
0 metacluster_0/pattern_0 0 103 119 + 16 111 0.192108 0.005044 low ... 0.033881 chrX 143482544 143483544 * Oct4 143482647 143482663 m0_p0 Oct4-Sox2
1 metacluster_0/pattern_0 0 157 173 + 16 165 0.115688 0.000215 low ... 0.015019 chrX 143482544 143483544 * Oct4 143482701 143482717 m0_p0 Oct4-Sox2
2 metacluster_0/pattern_0 0 263 279 - 16 271 0.161070 0.001717 low ... 0.024147 chrX 143482544 143483544 * Oct4 143482807 143482823 m0_p0 Oct4-Sox2
3 metacluster_0/pattern_0 0 281 297 + 16 289 0.291691 0.075445 low ... 0.031386 chrX 143482544 143483544 * Oct4 143482825 143482841 m0_p0 Oct4-Sox2
4 metacluster_0/pattern_0 0 440 456 - 16 448 0.165342 0.002254 low ... 0.359250 chrX 143482544 143483544 * Oct4 143482984 143483000 m0_p0 Oct4-Sox2

5 rows × 37 columns

In [1161]:
n_patterns2 = dfi.groupby(['pattern_short', 'match_weighted_cat', 'imp_weighted_cat']).size().reset_index()
n_patterns2 = n_patterns2.rename(columns={0: "n_instances"})
In [1162]:
dfn = pd.DataFrame({'pattern_short': shorten_pattern(p), 'seqlets': mr.n_seqlets(*p.split('/'))} for p in mr.patterns())
In [1163]:
dfn.merge(n_patterns2, on='pattern_short').query('match_weighted_cat=="high"').\
    hvplot.scatter(x='seqlets', y='n_instances', by='imp_weighted_cat', alpha=0.5)
Out[1163]:
In [1164]:
dfn.merge(n_patterns2, on='pattern_short').query('match_weighted_cat=="high"').query('imp_weighted_cat=="high"').\
    hvplot.scatter(x='seqlets', y='n_instances', by='pattern_short', alpha=0.5, legend=False)
Out[1164]:

These points on the diagonal are typically pattern 0 from different metaclusters

In [1168]:
# most frequent seqlets
dfn.sort_values('seqlets', ascending=False).head(9)
Out[1168]:
pattern_short seqlets
50 m2_p0 12798
0 m0_p0 9373
34 m1_p0 5799
74 m3_p0 4479
1 m0_p1 3101
87 m4_p0 2428
35 m1_p1 2386
51 m2_p1 1878
2 m0_p2 1656
In [1169]:
# most frequent motif instances
n_patterns.reset_index().rename(columns={0: 'scanned_instances'}).sort_values('scanned_instances', ascending=False).head(9)
Out[1169]:
TF match_weighted_cat imp_weighted_cat scanned_instances
27 Nanog-periodic low low 377273
63 Sox2 low low 171513
45 Oct4-Sox2 low low 129842
18 Nanog low low 110237
0 Essrb low low 98412
54 Oct4-Sox2-deg low low 83855
9 Klf4 low low 72345
28 Nanog-periodic low medium 51737
55 Oct4-Sox2-deg low medium 24803

Manually select motifs to check

In [1170]:
# define a set of core motifs
In [1171]:
dfn.head()
Out[1171]:
pattern_short seqlets
0 m0_p0 9373
1 m0_p1 3101
2 m0_p2 1656
3 m0_p3 1024
4 m0_p4 991

Number of instances per motif

In [1172]:
dfi_motifs = dfi[dfi.pattern_short.isin(list(motifs.values()))]
dfi_motifs['pattern_name'] = dfi_motifs.pattern_short.map({v: k for k,v in motifs.items()})
In [1174]:
n_patterns_per_example = dfi_motifs.groupby(['pattern_name', 'example_idx', 'match_weighted_cat', 'imp_weighted_cat']).size().reset_index()#.groupby(['pattern_short'])[0].value_counts()
In [1175]:
n_patterns_per_example = n_patterns_per_example.rename(columns={0: "n seqlets per example"})
In [1176]:
n_patterns_per_example.hvplot.box(y='n seqlets per example', by=['pattern_name', 'match_weighted_cat'], invert=True, height=500, width=800, legend=False)
Out[1176]:

Distribution of values for the top motifs

In [1177]:
dfi_motifs.hvplot.kde(y='match_weighted', by='pattern_name')
Out[1177]:
In [1178]:
dfi_motifs.hvplot.kde(y='match_weighted_p', by='pattern_name')
Out[1178]:
In [1179]:
dfi_motifs.hvplot.kde(y='imp_weighted', by='pattern_name')
Out[1179]:
In [1180]:
dfi_motifs.hvplot.kde(y='imp_weighted_p', by='pattern_name')
Out[1180]:
In [1181]:
from basepair.stats import low_medium_high

Re-compute the motif instances

Nanog

In [1182]:
# task = "Nanog"
# pattern = "metacluster_0/pattern_0"

Dev - compute the profile similarity

In [1184]:
model_dir
Out[1184]:
PosixPath('/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9')
In [1185]:
modisco_dir
Out[1185]:
PosixPath('/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile')
In [1187]:
# load profiles
profiles = load_profiles(modisco_dir, model_dir/'grad.all.h5')
In [1189]:
tasks = list(profiles)
In [1190]:
dfim = dfi_motifs[dfi_motifs.match_weighted_cat == 'high']
In [1193]:
dfim = dfim[dfim.imp_weighted_p > 0]
In [1194]:
dfim.shape
Out[1194]:
(91512, 38)

Annotate the major motifs with the profile features

In [1195]:
dfi_motifs_anno = annotate_profile(dfim, mr, profiles)
  0%|          | 0/8 [00:00<?, ?it/s]
100%|██████████| 8/8 [02:49<00:00, 21.23s/it]

Sanity check

In [1197]:
# assert len(dfi_motifs_anno) == len(dfi_filter_valid(dfim, 70, 1000))

correlate importance vs profile counts

In [1198]:
dfi_motifs_anno.query('pattern_name=="Oct4-Sox2"').hvplot.scatter(x='Oct4/profile_counts_p', y='imp_weighted_p', alpha=0.1)
Out[1198]:
In [1199]:
dfi_motifs_anno.query('pattern_name=="Nanog"').hvplot.scatter(x='Nanog/profile_counts_p', y='imp_weighted_p', alpha=0.05, datashade=False)
Out[1199]:

Question: How do others contribute counts?

Features

Count matrix

In [1200]:
from joblib import Parallel, delayed
In [1201]:
def abs_min(x):
    """Element of `x` with the smallest absolute value (first one on ties)."""
    return x[np.argmin(np.abs(x))]

def pairwise_distances_single(dfi_ex, dfi_motifs_anno_ex):
    """Distance summary from each core motif of one example to all other instances.

    Args:
        dfi_ex: all motif instances of a single example (one `example_idx`).
        dfi_motifs_anno_ex: annotated core motifs of the same example.

    Returns:
        pd.DataFrame: one row per (core motif, side pattern, match cat, imp cat)
        with `side/size` (instance count) and `side/abs_min` (signed distance of
        smallest magnitude), plus all core-motif columns broadcast onto each row.
    """
    out = []
    for i, core_motif in dfi_motifs_anno_ex.iterrows():
        #core_motif = dfi_motifs_anno_ex.iloc[i]
        # remove self 
        select = (dfi_ex.pattern == core_motif.pattern) & (dfi_ex.pattern_center == core_motif.pattern_center)
        assert select.sum() == 1
        # NOTE(review): `dfi_ex` shrinks cumulatively — each core motif removed here
        # stays removed in all later iterations as well; presumably intentional
        # (avoids counting core-core pairs twice) — confirm.
        dfi_ex = dfi_ex[~select]

        # relative distance to the pattern_center
        # (column assignment on a filtered frame may trigger SettingWithCopyWarning)
        dfi_ex['pattern_center_rel'] = dfi_ex.pattern_center - core_motif.pattern_center

        # number of other examples
        df_scores = dfi_ex.groupby(['pattern', 'match_weighted_cat', 'imp_weighted_cat']).pattern_center_rel.agg([np.size, abs_min]).reset_index()
        df_scores.columns = [f'side/{c}' for c in df_scores.columns]
        df_scores = df_scores.assign(**core_motif.to_dict())
        out.append(df_scores)
    return pd.concat(out, axis=0)

def pariwise_distances(dfi, dfi_motifs_anno, n_jobs=8):
    """Run `pairwise_distances_single` for every example, in parallel.

    NOTE(review): the name has a typo ("pariwise") — kept as-is because the
    cell below calls it by this name.

    Args:
        dfi: full instance table providing the "side" instances.
        dfi_motifs_anno: annotated core motifs; its unique `example_idx` values
            define which examples are processed.
        n_jobs: number of joblib worker processes.

    Returns:
        pd.DataFrame: concatenation of the per-example distance summaries.
    """
    #dfi_motifs_anno_tf = dfi_motifs_anno[dfi_motifs_anno.tf == tf]
    all_example_idx = dfi_motifs_anno.example_idx.unique()
    out = Parallel(n_jobs=n_jobs)(delayed(pairwise_distances_single)(dfi[dfi.example_idx == example_idx], dfi_motifs_anno[dfi_motifs_anno.example_idx == example_idx])
                              for example_idx in tqdm(all_example_idx))
    return pd.concat(out, axis=0)
In [1202]:
df_dist = pariwise_distances(dfi[dfi.imp_weighted_p > 0], dfi_motifs_anno, n_jobs=20)
100%|██████████| 32985/32985 [08:32<00:00, 64.39it/s]

TODO

  • [ ] the effect of multiple instances on the final value
    • stratify into different classes
  • [ ] effect of distance onto the effect
In [1203]:
df_dist.head()
Out[1203]:
Klf4/profile_counts Klf4/profile_counts_p Klf4/profile_match Klf4/profile_match_p Klf4/profile_max Klf4/profile_max_p Nanog/profile_counts Nanog/profile_counts_p Nanog/profile_match Nanog/profile_match_p ... seq_match_cat seq_match_p side/abs_min side/imp_weighted_cat side/index side/match_weighted_cat side/pattern side/size strand tf
0 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.0 0.441585 None ... high 0.883129 123.0 medium NaN low metacluster_0/pattern_0 2.0 + Oct4-Sox2
1 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.0 0.441585 None ... high 0.883129 18.0 high NaN low metacluster_0/pattern_0 9.0 + Oct4-Sox2
2 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.0 0.441585 None ... high 0.883129 -21.0 high NaN medium metacluster_0/pattern_0 1.0 + Oct4-Sox2
3 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.0 0.441585 None ... high 0.883129 330.0 low NaN high metacluster_0/pattern_0 1.0 + Oct4-Sox2
4 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.0 0.441585 None ... high 0.883129 62.0 high NaN high metacluster_0/pattern_0 1.0 + Oct4-Sox2

5 rows × 69 columns

In [1204]:
df_dist.shape
Out[1204]:
(1179875, 69)
In [1205]:
from plotnine import *
In [1206]:
df_dist['side_pattern_short'] = df_dist['side/pattern'].map({k: shorten_pattern(k) for k in df_dist['side/pattern'].unique()})
In [1207]:
df_dist['side_imp_weighted_cat'] = df_dist['side/imp_weighted_cat']
In [1208]:
df_dist['n_motifs'] = pd.Categorical(np.minimum(df_dist['side/size'], 3))
In [1264]:
df_dist.head(2)
Out[1264]:
Klf4/profile_counts Klf4/profile_counts_p Klf4/profile_match Klf4/profile_match_p Klf4/profile_max Klf4/profile_max_p Nanog/profile_counts Nanog/profile_counts_p Nanog/profile_match Nanog/profile_match_p ... side/imp_weighted_cat side/index side/match_weighted_cat side/pattern side/size strand tf side_pattern_short side_imp_weighted_cat n_motifs
0 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.0 0.441585 None ... medium NaN low metacluster_0/pattern_0 2.0 + Oct4-Sox2 m0_p0 medium 2.0
1 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.0 0.441585 None ... high NaN low metacluster_0/pattern_0 9.0 + Oct4-Sox2 m0_p0 high 3.0

2 rows × 72 columns

In [ ]:
plotnine.options.figure_size = (20,3)
ggplot(aes(x='side/imp_weighted_cat', y='Nanog/profile_counts_p', color='n_motifs'), df_dist[df_dist['side/match_weighted_cat'] == 'high']) +  \
  facet_wrap("~side_tf", ncol = 10, scales='free_x') + geom_boxplot() + \
  theme_bw()
In [1211]:
list(motifs.values())
Out[1211]:
['m0_p0', 'm6_p8', 'm0_p18', 'm0_p1', 'm0_p2', 'm0_p3', 'm0_p9', 'm2_p0']
In [1212]:
motifs
Out[1212]:
OrderedDict([('Oct4-Sox2', 'm0_p0'),
             ('Oct4-Sox2-deg', 'm6_p8'),
             ('Oct4', 'm0_p18'),
             ('Sox2', 'm0_p1'),
             ('Essrb', 'm0_p2'),
             ('Nanog', 'm0_p3'),
             ('Nanog-periodic', 'm0_p9'),
             ('Klf4', 'm2_p0')])
In [1213]:
df_dist_filt = df_dist[(df_dist['side/match_weighted_cat'] == 'high') & (df_dist['side_pattern_short'].isin(motifs.values()))]
In [1214]:
df_dist_filt['side_tf'] = df_dist_filt.side_pattern_short.map({v:k for k,v in motifs.items()})
In [1267]:
plotnine.options.figure_size = (20,10)
ggplot(aes(x='side/imp_weighted_cat', y='imp_weighted', color='n_motifs'), df_dist_filt) +  \
  facet_grid("tf~side_tf") + geom_boxplot() + \
  theme_bw()
Out[1267]:
<ggplot: (-9223363274730318371)>
In [1216]:
# plotnine.options.figure_size = (20,3)
# ggplot(aes(x='side/imp_weighted_cat', y='Nanog/profile_max_p', color='n_motifs'), df_dist[(df_dist['side/match_weighted_cat'] == 'high') & (df_dist['side_pattern_short'].isin(motifs.values()))]) +  \
#   facet_wrap("~side_tf", ncol = 10, scales='free_x') + geom_boxplot() + \
#   theme_bw()
In [1217]:
df_dist_filt['rel_dist'] = df_dist_filt['side/abs_min']
In [1218]:
df_dist_filt['side_imp_weighted_cat'] = pd.Categorical(df_dist_filt['side_imp_weighted_cat'], categories=['high', 'medium', 'low'])
In [1268]:
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist', y='Nanog/profile_max'), df_dist_filt[df_dist_filt.pattern_name=='Nanog']) +  \
  xlim([-150, 150]) + \
  scale_y_log10() + \
  facet_grid("side_imp_weighted_cat~side_tf") + geom_point(alpha=0.05) + \
  theme_classic()
Out[1268]:
<ggplot: (8761907462853)>
In [1220]:
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist', y='Nanog/profile_counts'), df_dist_filt) +  \
  xlim([-150, 150]) + \
  scale_y_log10() + \
  facet_grid("side_imp_weighted_cat~side_tf") + geom_point(alpha=0.1) + \
  theme_classic()
Out[1220]:
<ggplot: (-9223363274688574001)>
In [1221]:
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist', y='imp_weighted'), df_dist_filt) +  \
  xlim([-150, 150]) + \
  facet_grid("side_imp_weighted_cat~side_tf") + geom_point(alpha=0.1) + \
  theme_classic()
Out[1221]:
<ggplot: (8762163310449)>
In [1222]:
def averaged_distance(df_dist_filt, side_tf, variable='imp_weighted', strand='-', window=15, log=False):
    """Rolling mean of `variable` as a function of absolute distance to a side motif.

    Only rows whose side motif is `side_tf` with high importance
    (`side_imp_weighted_cat == 'high'`), within 150 bp, and on `strand` are used.

    Args:
        df_dist_filt: pairwise-distance table with `side_tf`,
            `side_imp_weighted_cat`, `rel_dist`, `strand` and the `variable` column.
        side_tf: side motif (TF) name to condition on.
        variable: column to average.
        strand: strand of the core motif instance to keep ('+' or '-').
        window: rolling-window size in rows (ordered by absolute distance).
        log: if True, average log10(variable) instead of the raw values.

    Returns:
        pd.DataFrame with `rel_dist` and the rolling-averaged `variable` column.
    """
    keep = (df_dist_filt.side_tf == side_tf) & (df_dist_filt.side_imp_weighted_cat == 'high')
    # .copy() so folding the distance below does not mutate a view of the caller's
    # frame (the original `rp.rel_dist = rp.rel_dist.abs()` assignment on a filtered
    # slice triggered SettingWithCopyWarning and is unreliable under copy-on-write)
    rp = df_dist_filt[keep].copy()
    rp['rel_dist'] = rp['rel_dist'].abs()
    rp = rp.sort_values("rel_dist")

    rp = rp[(rp.rel_dist < 150) & (rp.strand == strand)]
    series = rp.set_index('rel_dist')[variable]
    if log:
        series = np.log10(series)
    return series.rolling(window).mean().reset_index()
In [1223]:
tasks = list(motifs)
In [1224]:
variable = 'imp_weighted'
fig, axes = plt.subplots(2, len(tasks), figsize=(30,5), sharex=True);
for i, strand in enumerate(['+', '-']):
    for j, side_tf in enumerate(list(motifs)):
        p = averaged_distance(df_dist_filt, side_tf, variable=variable, strand=strand, window=15)
        p.plot(x='rel_dist', y=variable, ax=axes[i,j], legend=False)
        if i ==0:
            axes[i,j].set_title(side_tf)
        if j == 0:
            axes[i,j].set_ylabel(f"{variable}, {strand}")
            
In [1225]:
variable = 'Nanog/profile_counts'
fig, axes = plt.subplots(2, len(tasks), figsize=(30,5), sharex=True);
for i, strand in enumerate(['+', '-']):
    for j, side_tf in enumerate(list(motifs)):
        p = averaged_distance(df_dist_filt, side_tf, variable=variable, strand=strand, window=15, log=True)
        p.plot(x='rel_dist', y=variable, ax=axes[i,j], legend=False)
        if i ==0:
            axes[i,j].set_title(side_tf)
        if j == 0:
            axes[i,j].set_ylabel(f"{strand}")
            
In [1226]:
variable = 'Nanog/profile_max'
fig, axes = plt.subplots(2, len(tasks), figsize=(30,5), sharex=True);
for i, strand in enumerate(['+', '-']):
    for j, side_tf in enumerate(list(motifs)):
        p = averaged_distance(df_dist_filt, side_tf, variable=variable, strand=strand, window=15, log=True)
        p.plot(x='rel_dist', y=variable, ax=axes[i,j], legend=False)
        if i ==0:
            axes[i,j].set_title(side_tf)
        if j == 0:
            axes[i,j].set_ylabel(f"{strand}")
            
In [1227]:
rp = df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')].sort_values("rel_dist")
rp = rp[(rp.rel_dist < 150) & (rp.strand == '-') & (rp.rel_dist >= 0)]

p = rp.set_index('rel_dist').imp_weighted.rolling(15).mean().reset_index()

fig, ax = plt.subplots(1, 1, figsize=(10,3));
p.plot(x='rel_dist', y='imp_weighted', ax=ax)
Out[1227]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f81786da3c8>
In [1228]:
plotnine.options.figure_size = (6,8)
ggplot(aes(x='rel_dist', y='imp_weighted'), df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')]) +  \
  xlim([0, 150]) + \
  geom_point(alpha=0.1) + \
  facet_grid("strand~.", labeller='label_both') + geom_point(alpha=0.05) + \
  scale_x_continuous(limits = [0, 150], breaks=np.arange(0, 150, 10)) + \
  theme_classic()
Out[1228]:
<ggplot: (-9223363274726408644)>
In [1229]:
plotnine.options.figure_size = (6,8)
ggplot(aes(x='rel_dist', y='Nanog/profile_counts'), df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')]) +  \
  geom_point(alpha=0.1) + \
  scale_y_log10() + \
  facet_grid("strand~.", labeller='label_both') + geom_point(alpha=0.05) + \
  scale_x_continuous(limits = [0, 150], breaks=np.arange(0, 150, 10)) + \
  theme_classic()
Out[1229]:
<ggplot: (-9223363274726333843)>
In [1230]:
plotnine.options.figure_size = (6,8)
ggplot(aes(x='rel_dist', y='Nanog/profile_max'), df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')]) +  \
  xlim([0, 150]) + \
  geom_point(alpha=0.1) + \
  scale_y_log10() + \
  facet_grid("strand~.", labeller='label_both') + geom_point(alpha=0.05) + \
  scale_x_continuous(limits = [0, 150], breaks=np.arange(0, 150, 10)) + \
  theme_classic()
Out[1230]:
<ggplot: (-9223363274692299271)>
In [1231]:
df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high') & (df_dist_filt.strand == '-') & (df_dist_filt.rel_dist < 150)& (df_dist_filt.rel_dist >= 0)].rel_dist.hvplot.hist(bins=145)
Out[1231]:
In [1232]:
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist'), df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')]) +  \
  xlim([0, 150]) + \
  geom_histogram(bins=150) + \
  facet_grid(".~strand", labeller='label_both') + \
  theme_classic()
Out[1232]:
<ggplot: (-9223363274691069296)>
In [1233]:
dfs = df_dist_filt.groupby(['rel_dist', 'side_imp_weighted_cat', 'side_pattern_short']).size().reset_index()
dfs = dfs.rename(columns={0: 'n'})
#dfs = dfs[dfs['side_imp_weighted_cat'] == 'high']
In [1234]:
dfs['pattern_name'] = dfs.side_pattern_short.map({v:k for k,v in motifs.items()})
In [1235]:
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist', y='n'), dfs) +  \
  xlim([-150, 150]) + \
  facet_grid("pattern_name~side_imp_weighted_cat", scales='free_y') + geom_line() + \
  theme_classic()
Out[1235]:
<ggplot: (-9223363274693224339)>
In [1236]:
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist'), df_dist_filt) +  \
  xlim([-150, 150]) + \
  facet_grid("side_imp_weighted_cat~side_pattern_short") + geom_histogram(bins=300) + \
  theme_bw()
Out[1236]:
<ggplot: (8762165688640)>
In [1237]:
df_dist_filt.head()
Out[1237]:
Klf4/profile_counts Klf4/profile_counts_p Klf4/profile_match Klf4/profile_match_p Klf4/profile_max Klf4/profile_max_p Nanog/profile_counts Nanog/profile_counts_p Nanog/profile_match Nanog/profile_match_p ... side/match_weighted_cat side/pattern side/size strand tf side_pattern_short side_imp_weighted_cat n_motifs side_tf rel_dist
3 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.000000 0.441585 NaN ... high metacluster_0/pattern_0 1.0 + Oct4-Sox2 m0_p0 low 1.0 Oct4-Sox2 330.0
4 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.000000 0.441585 NaN ... high metacluster_0/pattern_0 1.0 + Oct4-Sox2 m0_p0 high 1.0 Oct4-Sox2 62.0
3 88.0 0.879441 inf None 5.0 0.911554 4282.0 1.000000 0.223469 0.8 ... high metacluster_0/pattern_0 1.0 + Oct4-Sox2 m0_p0 low 1.0 Oct4-Sox2 268.0
1 330.0 0.994559 inf None 11.0 0.988264 300.0 0.896298 inf NaN ... high metacluster_0/pattern_0 1.0 + Oct4-Sox2 m0_p0 high 1.0 Oct4-Sox2 26.0
6 330.0 0.994559 inf None 11.0 0.988264 300.0 0.896298 inf NaN ... high metacluster_0/pattern_1 1.0 + Oct4-Sox2 m0_p1 low 1.0 Sox2 149.0

5 rows × 74 columns