Goal

  • analyze the pairwise features for Nanog

Tasks

  • [x] remove duplicates
  • [ ] get an overview over the instances
    • [ ] global overview
      • # instances per sequence
    • [ ] select a few promising seqlets
    • [ ] local overview
      • positional distribution within the sequence
      • match distribution (natural, p-scale)
      • importance distribution (natural, p-scale)
In [1]:
from basepair.imports import *
from basepair.plot.profiles import extract_signal
from basepair.math import softmax
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
from basepair.plot.profiles import  plot_stranded_profile, multiple_plot_stranded_profile
import plotnine
from plotnine import *
import statsmodels.api as sm
import statsmodels.formula.api as smf
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.modisco.core import dfi2seqlets, annotate_profile
from basepair.cli.modisco import load_profiles

import dask.dataframe as dd
import hvplot.pandas
import holoviews as hv

hv.extension('bokeh', 'matplotlib')
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
2018-10-24 03:58:19,799 [WARNING] doc empty for the `info:` field
In [3]:
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / "modisco/all/profile/"

mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()

# Load the data
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()
In [10]:
# specify motifs to use in the analysis
motifs = OrderedDict([
    ("Oct4-Sox2", "m0_p0"),
    ("Oct4-Sox2-deg", "m6_p8"),
    ("Oct4", "m0_p18"),
    ("Sox2", "m0_p1"),
    ("Essrb", "m0_p2"),
    ("Nanog", "m0_p3"),
    ("Nanog-periodic", "m0_p9"),
    ("Klf4", "m2_p0"),
])
In [168]:
# plot them
for tf, p in motifs.items():
    mr.get_pattern(longer_pattern(p)).trim_seq_ic(0.08).resize(40).plot("seq");
    plt.title(tf)

Load instances

In [6]:
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix
In [7]:
dfi = load_instances(f"{modisco_dir}/instances.parq", motifs)

dfi = filter_nonoverlapping_intervals(dfi)

total_examples = len(dfi.example_idx.unique())
total_examples
In [23]:
# pattern lengths
dfi.groupby(['pattern_name', 'pattern_len']).size()
Out[23]:
pattern_name    pattern_len
Essrb           16             247382
Klf4            10             250910
Nanog           10             287074
Nanog-periodic  52             964426
Oct4            39              46795
Oct4-Sox2       16             244433
Oct4-Sox2-deg   46             180267
Sox2            9              365894
dtype: int64

General stats on the scores

  • Total number of instances
  • instances per sequence

Whole 1kb region

In [28]:
# instances per (pattern, match category, importance category)
n_patterns = dfi.groupby(['pattern_name', 'match_weighted_cat', 'imp_weighted_cat']).size()
# NOTE(review): `.size()` returns a Series, so `rename(columns=...)` has no effect
# here (newer pandas versions even raise). A `.reset_index()` first — as done for
# `n_patterns2` further down — would make the rename effective. Confirm intended.
n_patterns = n_patterns.rename(columns={0: "n_instances"})

n_patterns.hvplot.bar('imp_weighted_cat', by='pattern_name',groupby='match_weighted_cat', rot=90)
Out[28]:

Central 200bp region

In [30]:
# same overview restricted to the central 200 bp of the 1 kb sequence
n_patterns = dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].groupby(['pattern_name', 'match_weighted_cat', 'imp_weighted_cat']).size()
# NOTE(review): `.size()` returns a Series, so `rename(columns=...)` has no effect;
# `.reset_index()` first would be needed for the rename to apply. Confirm intended.
n_patterns = n_patterns.rename(columns={0: "n_instances"})

n_patterns.hvplot.bar('imp_weighted_cat', by='pattern_name',groupby='match_weighted_cat', rot=90)
Out[30]:

Simple co-occurrence analysis

In [31]:
from upsetplot import plot as upsetplt
In [32]:
#.query('imp_weighted_cat=="high"')
counts = pd.pivot_table(dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_cat=="high"').query('imp_weighted_cat=="high"'), 
                        'pattern_len', "example_idx", "pattern_name", aggfunc=len, fill_value=0)

c = counts >0
In [33]:
upsetplt(c.groupby(list(c.columns)).size().sort_values(ascending=False)[:50],  sort_by='cardinality');
In [34]:
m = counts[['Nanog', 'Oct4-Sox2']]
m[m.all(1)].sum(1).value_counts().plot.bar()
plt.title("Nanog-Sox2")
plt.xlabel("Number of motifs in the center >=2");
  • let's test for association

imp_weighted_cat='high'

100bp

In [37]:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 450) & (dfi.pattern_center < 550)].query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'), total_examples)

high match

In [38]:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 450) & (dfi.pattern_center < 550)].query('match_weighted_cat=="high"').query('imp_weighted_cat=="high"'), total_examples)

200bp

In [39]:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'), total_examples)

high match

In [40]:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_cat=="high"').query('imp_weighted_cat=="high"'), total_examples)

Whole region

In [49]:
plot_coocurence_matrix(dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'), total_examples)

imp_weighted_cat='high', 'medium'

100bp

In [44]:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 450) & (dfi.pattern_center < 550)].query('match_weighted_cat!="low"').query('imp_weighted_p>0.33'), total_examples)

200bp

In [53]:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_cat!="low"').query('imp_weighted_p>0.33'), total_examples)

Whole region

In [48]:
plot_coocurence_matrix(dfi.query('match_weighted_cat!="low"').query('imp_weighted_p>0.33'), total_examples)

Analyzing seqlet context

Read seqlets from modisco

In [64]:
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale, preproc_df
In [ ]:
# TODO - speedup using IntervalIndex .. .loc[motif_center...]
In [54]:
# Read seqlets from modisco
df_seqlets = mr.seqlet_df_instances(trim_frac=0.08)
df_seqlets = df_seqlets.rename(columns=dict(seqname="example_idx", start='seqlet_start', end='seqlet_end', strand='seqlet_strand', center='seqlet_center'))
df_seqlets['pattern'] = df_seqlets['pattern'].map({k: shorten_pattern(k) for k in df_seqlets.pattern.unique()})
del df_seqlets['name']
In [55]:
# add the `cluster,cluster_order`
dfc = pd.read_csv("http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/motif_clustering/nte_seq.csv")
df_seqlets = df_seqlets.merge(dfc[['pattern', 'cluster', 'cluster_order']], on='pattern')
In [58]:
def non_overlapping(x, df_seqlets):
    """Drop instances in `x` that lie within 15 bp of a matching modisco seqlet.

    Args:
        x: instance table of a single (pattern_short, example_idx) group,
            with a `pattern_center` column.
        df_seqlets: modisco seqlet table with `pattern`, `example_idx` and
            `seqlet_center` columns.

    Returns:
        Subset of `x` whose `pattern_center` is more than 15 bp away from every
        seqlet of the same pattern in the same example. `x` is returned
        unchanged when it is empty or when no seqlets match.
    """
    if len(x) == 0:
        return x
    same_pattern = df_seqlets.pattern == x['pattern_short'].iloc[0]
    same_example = df_seqlets.example_idx == x['example_idx'].iloc[0]
    seqlets = df_seqlets[same_pattern & same_example]
    if len(seqlets) == 0:
        return x
    # pairwise |instance_center - seqlet_center|, then distance to the nearest seqlet
    dist = np.abs(x['pattern_center'].values[:, None] - seqlets.seqlet_center.values[None, :])
    return x[dist.min(axis=1) > 15]
In [61]:
dfi_subset = dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"')

# get rid of overlapping intervals
dfi_subset_subset = dfi_subset.groupby(['pattern_short', 'example_idx']).apply(lambda x: non_overlapping(x, df_seqlets))

counts = pd.pivot_table(dfi_subset_subset, 'pattern_len', "example_idx", "pattern_name", aggfunc=len, fill_value=0)
c = counts >0

df_seqlets_c = c.merge(df_seqlets, on='example_idx')
In [62]:
df_seqlets_c.head(2)
Out[62]:
example_idx Essrb Klf4 Nanog Nanog-periodic Oct4 Oct4-Sox2 Oct4-Sox2-deg Sox2 seqlet_start seqlet_end seqlet_strand pattern seqlet_center cluster cluster_order
0 2 False True True True False False True False 497 513 + m0_p0 505.0 0 86
1 3 False False False False False False True False 499 515 - m0_p0 507.0 0 86
In [63]:
df_seqlets_c_agg = df_seqlets_c.groupby(['cluster', 'cluster_order', 'pattern'])[list(c)].mean()
df_seqlets_c_agg_size = df_seqlets_c.groupby(['cluster', 'cluster_order', 'pattern']).size()
df_seqlets_c_agg = pd.concat([df_seqlets_c_agg, df_seqlets_c_agg_size], axis=1).reset_index().set_index('pattern').rename(columns={0: 'n_seqlets'})

df_seqlets_c_agg = df_seqlets_c_agg.sort_values('cluster_order')

col_anno = df_seqlets_c_agg[['cluster', 'n_seqlets']]
del df_seqlets_c_agg['cluster']
del df_seqlets_c_agg['cluster_order']
del df_seqlets_c_agg['n_seqlets']

col_anno.cluster = pd.Categorical(col_anno.cluster)
In [65]:
# plot
sns.clustermap(df_seqlets_c_agg, 
               row_cluster=False,
               col_cluster=False,
               #col_cluster
               # standard_scale=1,
               row_colors=to_colors(col_anno),
               figsize=(8,25),
               cmap='Blues')
Out[65]:
<seaborn.matrix.ClusterGrid at 0x7f6923060eb8>

Spacing analysis

In [143]:
from basepair.modisco.pattern_instances import construct_motif_pairs
In [163]:
# motif pair whose spacing is analyzed below
motif_pair = ['Oct4-Sox2', 'Sox2']

# central 200 bp, minimally confident matches with positive importance
dfi_filtered = dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_p > 0.2').query('imp_weighted_p > 0')

# NOTE(review): `motif_pairs` is never imported explicitly — the previous cell
# imports `construct_motif_pairs`. Presumably `motif_pairs` leaks in via
# `from basepair.imports import *`; confirm, or call `construct_motif_pairs`.
dftw_filt = motif_pairs(dfi_filtered, motif_pair)

# flatten columns for easier plotting
dftw_filt.set_index('strand_combination', append=True, inplace=True, drop=False)
dftw_filt.columns= dftw_filt.columns.get_level_values(0)+  '/' + dftw_filt.columns.get_level_values(1)

if motif_pair[0] == motif_pair[1]:
    bins=150
    bin_range = (0, 150)
else:
    bins=300
    bin_range = (-150, 150)
(dftw_filt['center_diff/'].hvplot.hist(bins=bins, 
                                       by='strand_combination',
                                       width=700,
                                       height=200,
                                       subplots=True,
                                       bin_range=bin_range).cols(1) + \
dftw_filt.hvplot.scatter(x='center_diff/', y='imp_weighted/'+ motif_pair[0],# 'imp_weighted_p/Sox2'], 
                         by='strand_combination',
                         width=700, height=400, 
                         #width=200, height=200, 
                         subplots=False, 
                         xlim=bin_range,
                         alpha=0.4) + \
dftw_filt.hvplot.scatter(x='center_diff/', y='imp_weighted/'+ motif_pair[1],# 'imp_weighted_p/Sox2'], 
                         by='strand_combination',
                         width=700, height=400, 
                         #width=200, height=200, 
                         subplots=False, 
                         xlim=bin_range,
                         alpha=0.4)
).cols(1)
Out[163]:

Look up the frequent spacing

In [66]:
from pybedtools import Interval
import pybedtools
from genomelake.extractors import FastaExtractor
from basepair.cli.schemas import DataSpec
from basepair.modisco.utils import ic_scale
from basepair.plot.tracks import plot_tracks
from basepair.modisco.core import Pattern
In [119]:
a = dftw_filt.reset_index().query("strand_combination=='-+'")
a = a[a['center_diff/'] == -58]
spaced_example_idx = a.example_idx
print(len(spaced_example_idx))
s = dfi[dfi.example_idx.isin(spaced_example_idx)][['example_idx', 'example_chrom', 'example_start', 'example_end', 'example_interval_from_task']].drop_duplicates()
a = pd.merge(a, s, on='example_idx').sort_values(['example_chrom', 'example_start'])
23
In [120]:
def pair_center(row, motif_pair):
    """Integer midpoint (floor division) between the two motif centers of a pair."""
    center_a = row['pattern_center/' + motif_pair[0]]
    center_b = row['pattern_center/' + motif_pair[1]]
    return (center_a + center_b) // 2

# get the sequences
intervals = [pybedtools.create_interval_from_list([row.example_chrom, row.example_start + pair_center(row, motif_pair) - 35, 
                                                   row.example_start + pair_center(row, motif_pair) + 35, row['example_idx']]) for i,row in a.iterrows()]
ds = DataSpec.load(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml")
fa = FastaExtractor(ds.fasta_file)
seqs = fa(intervals)
In [123]:
# match to the existing pattern
patterns = [mr.get_pattern(p) for p in mr.patterns()]

pwm = seqs.mean(0)
pwm = Pattern('found', pwm, dict(a=pwm), dict(a=pwm))
pwm = pwm.resize(len(patterns[0]))
matches = np.array([pattern.similarity(pwm, track='seq') for pattern in patterns])
In [131]:
# plot
plt.plot(matches)
mid = matches.argmax()
patterns[mid].plot("seq_ic");
pwm.align(patterns[mid]).plot("seq_ic");

Open questions

  • how many times are Nanog and Nanog-periodic actually detecting the same motif?
  • how many times this happens for Oct4-Sox2?
In [1160]:
dfi.head()
Out[1160]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_weighted_p match_weighted_cat ... imp/Sox2 example_chrom example_start example_end example_strand example_interval_from_task pattern_start_abs pattern_end_abs pattern_short TF
0 metacluster_0/pattern_0 0 103 119 + 16 111 0.192108 0.005044 low ... 0.033881 chrX 143482544 143483544 * Oct4 143482647 143482663 m0_p0 Oct4-Sox2
1 metacluster_0/pattern_0 0 157 173 + 16 165 0.115688 0.000215 low ... 0.015019 chrX 143482544 143483544 * Oct4 143482701 143482717 m0_p0 Oct4-Sox2
2 metacluster_0/pattern_0 0 263 279 - 16 271 0.161070 0.001717 low ... 0.024147 chrX 143482544 143483544 * Oct4 143482807 143482823 m0_p0 Oct4-Sox2
3 metacluster_0/pattern_0 0 281 297 + 16 289 0.291691 0.075445 low ... 0.031386 chrX 143482544 143483544 * Oct4 143482825 143482841 m0_p0 Oct4-Sox2
4 metacluster_0/pattern_0 0 440 456 - 16 448 0.165342 0.002254 low ... 0.359250 chrX 143482544 143483544 * Oct4 143482984 143483000 m0_p0 Oct4-Sox2

5 rows × 37 columns

In [1161]:
n_patterns2 = dfi.groupby(['pattern_short', 'match_weighted_cat', 'imp_weighted_cat']).size().reset_index()
n_patterns2 = n_patterns2.rename(columns={0: "n_instances"})
In [1162]:
dfn = pd.DataFrame({'pattern_short': shorten_pattern(p), 'seqlets': mr.n_seqlets(*p.split('/'))} for p in mr.patterns())
In [1163]:
dfn.merge(n_patterns2, on='pattern_short').query('match_weighted_cat=="high"').\
    hvplot.scatter(x='seqlets', y='n_instances', by='imp_weighted_cat', alpha=0.5)
Out[1163]:
In [1164]:
dfn.merge(n_patterns2, on='pattern_short').query('match_weighted_cat=="high"').query('imp_weighted_cat=="high"').\
    hvplot.scatter(x='seqlets', y='n_instances', by='pattern_short', alpha=0.5, legend=False)
Out[1164]:

These points on the diagonal are typically pattern 0 from different metaclusters

In [1168]:
# most frequent seqlets
dfn.sort_values('seqlets', ascending=False).head(9)
Out[1168]:
pattern_short seqlets
50 m2_p0 12798
0 m0_p0 9373
34 m1_p0 5799
74 m3_p0 4479
1 m0_p1 3101
87 m4_p0 2428
35 m1_p1 2386
51 m2_p1 1878
2 m0_p2 1656
In [1169]:
# most frequent motif instances
n_patterns.reset_index().rename(columns={0: 'scanned_instances'}).sort_values('scanned_instances', ascending=False).head(9)
Out[1169]:
TF match_weighted_cat imp_weighted_cat scanned_instances
27 Nanog-periodic low low 377273
63 Sox2 low low 171513
45 Oct4-Sox2 low low 129842
18 Nanog low low 110237
0 Essrb low low 98412
54 Oct4-Sox2-deg low low 83855
9 Klf4 low low 72345
28 Nanog-periodic low medium 51737
55 Oct4-Sox2-deg low medium 24803

Manually select motifs to check

In [1170]:
# define a set of core motifs
In [1171]:
dfn.head()
Out[1171]:
pattern_short seqlets
0 m0_p0 9373
1 m0_p1 3101
2 m0_p2 1656
3 m0_p3 1024
4 m0_p4 991

Number of instances per motif

In [1172]:
dfi_motifs = dfi[dfi.pattern_short.isin(list(motifs.values()))]
dfi_motifs['pattern_name'] = dfi_motifs.pattern_short.map({v: k for k,v in motifs.items()})
In [1174]:
n_patterns_per_example = dfi_motifs.groupby(['pattern_name', 'example_idx', 'match_weighted_cat', 'imp_weighted_cat']).size().reset_index()#.groupby(['pattern_short'])[0].value_counts()
In [1175]:
n_patterns_per_example = n_patterns_per_example.rename(columns={0: "n seqlets per example"})
In [1176]:
n_patterns_per_example.hvplot.box(y='n seqlets per example', by=['pattern_name', 'match_weighted_cat'], invert=True, height=500, width=800, legend=False)
Out[1176]:

Distribution of values for the top motifs

In [1177]:
dfi_motifs.hvplot.kde(y='match_weighted', by='pattern_name')
Out[1177]:
In [1178]:
dfi_motifs.hvplot.kde(y='match_weighted_p', by='pattern_name')
Out[1178]:
In [1179]:
dfi_motifs.hvplot.kde(y='imp_weighted', by='pattern_name')
Out[1179]:
In [1180]:
dfi_motifs.hvplot.kde(y='imp_weighted_p', by='pattern_name')
Out[1180]:
In [1181]:
from basepair.stats import low_medium_high

Re-compute the motif instances

Nanog

In [1182]:
# task = "Nanog"
# pattern = "metacluster_0/pattern_0"

Dev - compute the profile similarity

In [1184]:
model_dir
Out[1184]:
PosixPath('/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9')
In [1185]:
modisco_dir
Out[1185]:
PosixPath('/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile')
In [1187]:
# load profiles
profiles = load_profiles(modisco_dir, model_dir/'grad.all.h5')
In [1189]:
tasks = list(profiles)
In [1190]:
dfim = dfi_motifs[dfi_motifs.match_weighted_cat == 'high']
In [1193]:
dfim = dfim[dfim.imp_weighted_p > 0]
In [1194]:
dfim.shape
Out[1194]:
(91512, 38)

Annotate the major motifs with the profile features

In [1195]:
dfi_motifs_anno = annotate_profile(dfim, mr, profiles)
  0%|          | 0/8 [00:00<?, ?it/s]
100%|██████████| 8/8 [02:49<00:00, 21.23s/it]

Sanity check

In [1197]:
# assert len(dfi_motifs_anno) == len(dfi_filter_valid(dfim, 70, 1000))

correlate importance vs profile counts

In [1198]:
dfi_motifs_anno.query('pattern_name=="Oct4-Sox2"').hvplot.scatter(x='Oct4/profile_counts_p', y='imp_weighted_p', alpha=0.1)
Out[1198]:
In [1199]:
dfi_motifs_anno.query('pattern_name=="Nanog"').hvplot.scatter(x='Nanog/profile_counts_p', y='imp_weighted_p', alpha=0.05, datashade=False)
Out[1199]:

Question: How do others contribute counts?

Features

Count matrix

In [1200]:
from joblib import Parallel, delayed
In [1201]:
def abs_min(x):
    """Element of `x` with the smallest absolute value (first one on ties)."""
    return x[np.argmin(np.abs(x))]

def pairwise_distances_single(dfi_ex, dfi_motifs_anno_ex):
    """Distance summary from each core motif of one example to all other instances.

    Args:
        dfi_ex: all motif instances of a single example (one `example_idx`).
        dfi_motifs_anno_ex: annotated core motifs of the same example.

    Returns:
        pd.DataFrame: one row per (core motif, side pattern, match cat, imp cat)
        with `side/size` (instance count) and `side/abs_min` (signed distance of
        smallest magnitude), plus all core-motif columns broadcast onto each row.
    """
    out = []
    for i, core_motif in dfi_motifs_anno_ex.iterrows():
        #core_motif = dfi_motifs_anno_ex.iloc[i]
        # remove self 
        select = (dfi_ex.pattern == core_motif.pattern) & (dfi_ex.pattern_center == core_motif.pattern_center)
        assert select.sum() == 1
        # NOTE(review): `dfi_ex` shrinks cumulatively — each core motif removed here
        # stays removed in all later iterations as well; presumably intentional
        # (avoids counting core-core pairs twice) — confirm.
        dfi_ex = dfi_ex[~select]

        # relative distance to the pattern_center
        # (column assignment on a filtered frame may trigger SettingWithCopyWarning)
        dfi_ex['pattern_center_rel'] = dfi_ex.pattern_center - core_motif.pattern_center

        # number of other examples
        df_scores = dfi_ex.groupby(['pattern', 'match_weighted_cat', 'imp_weighted_cat']).pattern_center_rel.agg([np.size, abs_min]).reset_index()
        df_scores.columns = [f'side/{c}' for c in df_scores.columns]
        df_scores = df_scores.assign(**core_motif.to_dict())
        out.append(df_scores)
    return pd.concat(out, axis=0)

def pariwise_distances(dfi, dfi_motifs_anno, n_jobs=8):
    """Run `pairwise_distances_single` for every example, in parallel.

    NOTE(review): the name has a typo ("pariwise") — kept as-is because the
    cell below calls it by this name.

    Args:
        dfi: full instance table providing the "side" instances.
        dfi_motifs_anno: annotated core motifs; its unique `example_idx` values
            define which examples are processed.
        n_jobs: number of joblib worker processes.

    Returns:
        pd.DataFrame: concatenation of the per-example distance summaries.
    """
    #dfi_motifs_anno_tf = dfi_motifs_anno[dfi_motifs_anno.tf == tf]
    all_example_idx = dfi_motifs_anno.example_idx.unique()
    out = Parallel(n_jobs=n_jobs)(delayed(pairwise_distances_single)(dfi[dfi.example_idx == example_idx], dfi_motifs_anno[dfi_motifs_anno.example_idx == example_idx])
                              for example_idx in tqdm(all_example_idx))
    return pd.concat(out, axis=0)
In [1202]:
df_dist = pariwise_distances(dfi[dfi.imp_weighted_p > 0], dfi_motifs_anno, n_jobs=20)
100%|██████████| 32985/32985 [08:32<00:00, 64.39it/s]

TODO

  • [ ] the effect of multiple instances on the final value
    • stratify into different classes
  • [ ] effect of distance onto the effect
In [1203]:
df_dist.head()
Out[1203]:
Klf4/profile_counts Klf4/profile_counts_p Klf4/profile_match Klf4/profile_match_p Klf4/profile_max Klf4/profile_max_p Nanog/profile_counts Nanog/profile_counts_p Nanog/profile_match Nanog/profile_match_p ... seq_match_cat seq_match_p side/abs_min side/imp_weighted_cat side/index side/match_weighted_cat side/pattern side/size strand tf
0 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.0 0.441585 None ... high 0.883129 123.0 medium NaN low metacluster_0/pattern_0 2.0 + Oct4-Sox2
1 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.0 0.441585 None ... high 0.883129 18.0 high NaN low metacluster_0/pattern_0 9.0 + Oct4-Sox2
2 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.0 0.441585 None ... high 0.883129 -21.0 high NaN medium metacluster_0/pattern_0 1.0 + Oct4-Sox2
3 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.0 0.441585 None ... high 0.883129 330.0 low NaN high metacluster_0/pattern_0 1.0 + Oct4-Sox2
4 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.0 0.441585 None ... high 0.883129 62.0 high NaN high metacluster_0/pattern_0 1.0 + Oct4-Sox2

5 rows × 69 columns

In [1204]:
df_dist.shape
Out[1204]:
(1179875, 69)
In [1205]:
from plotnine import *
In [1206]:
df_dist['side_pattern_short'] = df_dist['side/pattern'].map({k: shorten_pattern(k) for k in df_dist['side/pattern'].unique()})
In [1207]:
df_dist['side_imp_weighted_cat'] = df_dist['side/imp_weighted_cat']
In [1208]:
df_dist['n_motifs'] = pd.Categorical(np.minimum(df_dist['side/size'], 3))
In [1264]:
df_dist.head(2)
Out[1264]:
Klf4/profile_counts Klf4/profile_counts_p Klf4/profile_match Klf4/profile_match_p Klf4/profile_max Klf4/profile_max_p Nanog/profile_counts Nanog/profile_counts_p Nanog/profile_match Nanog/profile_match_p ... side/imp_weighted_cat side/index side/match_weighted_cat side/pattern side/size strand tf side_pattern_short side_imp_weighted_cat n_motifs
0 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.0 0.441585 None ... medium NaN low metacluster_0/pattern_0 2.0 + Oct4-Sox2 m0_p0 medium 2.0
1 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.0 0.441585 None ... high NaN low metacluster_0/pattern_0 9.0 + Oct4-Sox2 m0_p0 high 3.0

2 rows × 72 columns

In [ ]:
plotnine.options.figure_size = (20,3)
ggplot(aes(x='side/imp_weighted_cat', y='Nanog/profile_counts_p', color='n_motifs'), df_dist[df_dist['side/match_weighted_cat'] == 'high']) +  \
  facet_wrap("~side_tf", ncol = 10, scales='free_x') + geom_boxplot() + \
  theme_bw()
In [1211]:
list(motifs.values())
Out[1211]:
['m0_p0', 'm6_p8', 'm0_p18', 'm0_p1', 'm0_p2', 'm0_p3', 'm0_p9', 'm2_p0']
In [1212]:
motifs
Out[1212]:
OrderedDict([('Oct4-Sox2', 'm0_p0'),
             ('Oct4-Sox2-deg', 'm6_p8'),
             ('Oct4', 'm0_p18'),
             ('Sox2', 'm0_p1'),
             ('Essrb', 'm0_p2'),
             ('Nanog', 'm0_p3'),
             ('Nanog-periodic', 'm0_p9'),
             ('Klf4', 'm2_p0')])
In [1213]:
df_dist_filt = df_dist[(df_dist['side/match_weighted_cat'] == 'high') & (df_dist['side_pattern_short'].isin(motifs.values()))]
In [1214]:
df_dist_filt['side_tf'] = df_dist_filt.side_pattern_short.map({v:k for k,v in motifs.items()})
In [1267]:
plotnine.options.figure_size = (20,10)
ggplot(aes(x='side/imp_weighted_cat', y='imp_weighted', color='n_motifs'), df_dist_filt) +  \
  facet_grid("tf~side_tf") + geom_boxplot() + \
  theme_bw()
Out[1267]:
<ggplot: (-9223363274730318371)>
In [1216]:
# plotnine.options.figure_size = (20,3)
# ggplot(aes(x='side/imp_weighted_cat', y='Nanog/profile_max_p', color='n_motifs'), df_dist[(df_dist['side/match_weighted_cat'] == 'high') & (df_dist['side_pattern_short'].isin(motifs.values()))]) +  \
#   facet_wrap("~side_tf", ncol = 10, scales='free_x') + geom_boxplot() + \
#   theme_bw()
In [1217]:
df_dist_filt['rel_dist'] = df_dist_filt['side/abs_min']
In [1218]:
df_dist_filt['side_imp_weighted_cat'] = pd.Categorical(df_dist_filt['side_imp_weighted_cat'], categories=['high', 'medium', 'low'])
In [1268]:
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist', y='Nanog/profile_max'), df_dist_filt[df_dist_filt.pattern_name=='Nanog']) +  \
  xlim([-150, 150]) + \
  scale_y_log10() + \
  facet_grid("side_imp_weighted_cat~side_tf") + geom_point(alpha=0.05) + \
  theme_classic()
Out[1268]:
<ggplot: (8761907462853)>
In [1220]:
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist', y='Nanog/profile_counts'), df_dist_filt) +  \
  xlim([-150, 150]) + \
  scale_y_log10() + \
  facet_grid("side_imp_weighted_cat~side_tf") + geom_point(alpha=0.1) + \
  theme_classic()
Out[1220]:
<ggplot: (-9223363274688574001)>
In [1221]:
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist', y='imp_weighted'), df_dist_filt) +  \
  xlim([-150, 150]) + \
  facet_grid("side_imp_weighted_cat~side_tf") + geom_point(alpha=0.1) + \
  theme_classic()
Out[1221]:
<ggplot: (8762163310449)>
In [1222]:
def averaged_distance(df_dist_filt, side_tf, variable='imp_weighted', strand='-', window=15, log=False):
    """Rolling mean of `variable` as a function of absolute distance to a side motif.

    Only rows whose side motif is `side_tf` with high importance
    (`side_imp_weighted_cat == 'high'`), within 150 bp, and on `strand` are used.

    Args:
        df_dist_filt: pairwise-distance table with `side_tf`,
            `side_imp_weighted_cat`, `rel_dist`, `strand` and the `variable` column.
        side_tf: side motif (TF) name to condition on.
        variable: column to average.
        strand: strand of the core motif instance to keep ('+' or '-').
        window: rolling-window size in rows (ordered by absolute distance).
        log: if True, average log10(variable) instead of the raw values.

    Returns:
        pd.DataFrame with `rel_dist` and the rolling-averaged `variable` column.
    """
    keep = (df_dist_filt.side_tf == side_tf) & (df_dist_filt.side_imp_weighted_cat == 'high')
    # .copy() so folding the distance below does not mutate a view of the caller's
    # frame (the original `rp.rel_dist = rp.rel_dist.abs()` assignment on a filtered
    # slice triggered SettingWithCopyWarning and is unreliable under copy-on-write)
    rp = df_dist_filt[keep].copy()
    rp['rel_dist'] = rp['rel_dist'].abs()
    rp = rp.sort_values("rel_dist")

    rp = rp[(rp.rel_dist < 150) & (rp.strand == strand)]
    series = rp.set_index('rel_dist')[variable]
    if log:
        series = np.log10(series)
    return series.rolling(window).mean().reset_index()
In [1223]:
tasks = list(motifs)
In [1224]:
variable = 'imp_weighted'
fig, axes = plt.subplots(2, len(tasks), figsize=(30,5), sharex=True);
for i, strand in enumerate(['+', '-']):
    for j, side_tf in enumerate(list(motifs)):
        p = averaged_distance(df_dist_filt, side_tf, variable=variable, strand=strand, window=15)
        p.plot(x='rel_dist', y=variable, ax=axes[i,j], legend=False)
        if i ==0:
            axes[i,j].set_title(side_tf)
        if j == 0:
            axes[i,j].set_ylabel(f"{variable}, {strand}")
            
In [1225]:
variable = 'Nanog/profile_counts'
fig, axes = plt.subplots(2, len(tasks), figsize=(30,5), sharex=True);
for i, strand in enumerate(['+', '-']):
    for j, side_tf in enumerate(list(motifs)):
        p = averaged_distance(df_dist_filt, side_tf, variable=variable, strand=strand, window=15, log=True)
        p.plot(x='rel_dist', y=variable, ax=axes[i,j], legend=False)
        if i ==0:
            axes[i,j].set_title(side_tf)
        if j == 0:
            axes[i,j].set_ylabel(f"{strand}")
            
In [1226]:
variable = 'Nanog/profile_max'
fig, axes = plt.subplots(2, len(tasks), figsize=(30,5), sharex=True);
for i, strand in enumerate(['+', '-']):
    for j, side_tf in enumerate(list(motifs)):
        p = averaged_distance(df_dist_filt, side_tf, variable=variable, strand=strand, window=15, log=True)
        p.plot(x='rel_dist', y=variable, ax=axes[i,j], legend=False)
        if i ==0:
            axes[i,j].set_title(side_tf)
        if j == 0:
            axes[i,j].set_ylabel(f"{strand}")
            
In [1227]:
rp = df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')].sort_values("rel_dist")
rp = rp[(rp.rel_dist < 150) & (rp.strand == '-') & (rp.rel_dist >= 0)]

p = rp.set_index('rel_dist').imp_weighted.rolling(15).mean().reset_index()

fig, ax = plt.subplots(1, 1, figsize=(10,3));
p.plot(x='rel_dist', y='imp_weighted', ax=ax)
Out[1227]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f81786da3c8>
In [1228]:
plotnine.options.figure_size = (6,8)
ggplot(aes(x='rel_dist', y='imp_weighted'), df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')]) +  \
  xlim([0, 150]) + \
  geom_point(alpha=0.1) + \
  facet_grid("strand~.", labeller='label_both') + geom_point(alpha=0.05) + \
  scale_x_continuous(limits = [0, 150], breaks=np.arange(0, 150, 10)) + \
  theme_classic()
Out[1228]:
<ggplot: (-9223363274726408644)>
In [1229]:
plotnine.options.figure_size = (6,8)
ggplot(aes(x='rel_dist', y='Nanog/profile_counts'), df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')]) +  \
  geom_point(alpha=0.1) + \
  scale_y_log10() + \
  facet_grid("strand~.", labeller='label_both') + geom_point(alpha=0.05) + \
  scale_x_continuous(limits = [0, 150], breaks=np.arange(0, 150, 10)) + \
  theme_classic()
Out[1229]:
<ggplot: (-9223363274726333843)>
In [1230]:
plotnine.options.figure_size = (6,8)
ggplot(aes(x='rel_dist', y='Nanog/profile_max'), df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')]) +  \
  xlim([0, 150]) + \
  geom_point(alpha=0.1) + \
  scale_y_log10() + \
  facet_grid("strand~.", labeller='label_both') + geom_point(alpha=0.05) + \
  scale_x_continuous(limits = [0, 150], breaks=np.arange(0, 150, 10)) + \
  theme_classic()
Out[1230]:
<ggplot: (-9223363274692299271)>
In [1231]:
df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high') & (df_dist_filt.strand == '-') & (df_dist_filt.rel_dist < 150)& (df_dist_filt.rel_dist >= 0)].rel_dist.hvplot.hist(bins=145)
Out[1231]:
In [1232]:
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist'), df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')]) +  \
  xlim([0, 150]) + \
  geom_histogram(bins=150) + \
  facet_grid(".~strand", labeller='label_both') + \
  theme_classic()
Out[1232]:
<ggplot: (-9223363274691069296)>
In [1233]:
dfs = df_dist_filt.groupby(['rel_dist', 'side_imp_weighted_cat', 'side_pattern_short']).size().reset_index()
dfs = dfs.rename(columns={0: 'n'})
#dfs = dfs[dfs['side_imp_weighted_cat'] == 'high']
In [1234]:
dfs['pattern_name'] = dfs.side_pattern_short.map({v:k for k,v in motifs.items()})
In [1235]:
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist', y='n'), dfs) +  \
  xlim([-150, 150]) + \
  facet_grid("pattern_name~side_imp_weighted_cat", scales='free_y') + geom_line() + \
  theme_classic()
Out[1235]:
<ggplot: (-9223363274693224339)>
In [1236]:
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist'), df_dist_filt) +  \
  xlim([-150, 150]) + \
  facet_grid("side_imp_weighted_cat~side_pattern_short") + geom_histogram(bins=300) + \
  theme_bw()
Out[1236]:
<ggplot: (8762165688640)>
In [1237]:
df_dist_filt.head()
Out[1237]:
Klf4/profile_counts Klf4/profile_counts_p Klf4/profile_match Klf4/profile_match_p Klf4/profile_max Klf4/profile_max_p Nanog/profile_counts Nanog/profile_counts_p Nanog/profile_match Nanog/profile_match_p ... side/match_weighted_cat side/pattern side/size strand tf side_pattern_short side_imp_weighted_cat n_motifs side_tf rel_dist
3 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.000000 0.441585 NaN ... high metacluster_0/pattern_0 1.0 + Oct4-Sox2 m0_p0 low 1.0 Oct4-Sox2 330.0
4 102.0 0.901099 inf None 7.0 0.957004 4877.0 1.000000 0.441585 NaN ... high metacluster_0/pattern_0 1.0 + Oct4-Sox2 m0_p0 high 1.0 Oct4-Sox2 62.0
3 88.0 0.879441 inf None 5.0 0.911554 4282.0 1.000000 0.223469 0.8 ... high metacluster_0/pattern_0 1.0 + Oct4-Sox2 m0_p0 low 1.0 Oct4-Sox2 268.0
1 330.0 0.994559 inf None 11.0 0.988264 300.0 0.896298 inf NaN ... high metacluster_0/pattern_0 1.0 + Oct4-Sox2 m0_p0 high 1.0 Oct4-Sox2 26.0
6 330.0 0.994559 inf None 11.0 0.988264 300.0 0.896298 inf NaN ... high metacluster_0/pattern_1 1.0 + Oct4-Sox2 m0_p1 low 1.0 Sox2 149.0

5 rows × 74 columns