# Project-wide convenience star-import (presumably provides np, pd, plt, Path,
# HDF5Reader, ModiscoResult, DataSpec, read_json, get_data_dir -- TODO confirm
# against basepair.imports).
from basepair.imports import *
# Root directory of the data tree.
ddir = get_data_dir()
from basepair.plot.profiles import extract_signal
from basepair.math import softmax
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
from basepair.plot.profiles import plot_stranded_profile, multiple_plot_stranded_profile
# Model output directory for the oct-sox-nanog-klf ChIP-nexus experiment.
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
# TF-MoDISco results directory (per-task, peak-weighted).
modisco_pdir = model_dir / "modisco/by_peak_tasks/weighted/"
# Inspect the model directory contents. Replaces the leaked IPython magic
# `ls {model_dir}`, which was not valid Python and referenced model_dir
# before it was defined.
for _entry in sorted(model_dir.iterdir()):
    print(_entry.name)
# Load the data
# Gradient/contribution scores and targets stored as HDF5.
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()
# Task (TF) and MoDISco pattern analysed below.
task = "Oct4"
pattern = "metacluster_0/pattern_0"
# Open the TF-MoDISco result for this task.
mr = ModiscoResult(modisco_pdir / f"{task}/modisco.h5")
mr.open()
# All seqlets belonging to the chosen pattern.
seqlets = mr._get_seqlets(pattern)
#mr.close()
# Example mask/index used for the modisco run (a .npy file referenced
# from kwargs.json).
include_samples = np.load(read_json(modisco_pdir / f"{task}/kwargs.json")["filter_npy"])
# Observed profile counts for the task, restricted to the included examples.
# Presumably shaped (examples, seq_len, 2 strands) -- TODO confirm.
profile_obs = d.f[f'/targets/profile/{task}'][:][include_samples]
ds = DataSpec.load(model_dir / "dataspec.yaml")
# Observed profile under each seqlet.
seqlet_profile_obs = extract_signal(profile_obs, seqlets)
# Total read counts per seqlet (sum over positions and strands).
total_counts = seqlet_profile_obs.sum(axis=-1).sum(axis=-1)
# Seqlet order from highest to lowest total counts.
sort_idx = np.argsort(-total_counts)
# Average observed profile across all seqlets (per position, both strands).
# Removed the stray `np.isnan(p[:,0,0])` line that followed: it referenced
# `p` before its definition below, which raises NameError in a plain script.
plot_stranded_profile(seqlet_profile_obs.mean(axis=0))
# probabilities
# Normalize each sorted seqlet profile over positions so each strand becomes
# a probability distribution; all-zero profiles yield NaNs (0/0) here.
p = seqlet_profile_obs[sort_idx] / seqlet_profile_obs[sort_idx].sum(axis=1, keepdims=True)
# drop NA's
# Mask of seqlets with no NaN at any position/strand (in sorted order).
notnan = ~np.any(np.any(np.isnan(p), axis=-1), axis=-1)
# Per-strand total counts of the retained seqlets, sorted order.
total_counts = seqlet_profile_obs[sort_idx].sum(axis=1)[notnan]
p = p[notnan]
# Example (sequence) id for each retained seqlet. `notnan` was computed on the
# count-sorted arrays, so sort_idx must be applied before masking; the original
# indexed the *unsorted* seqnames with the sorted-order mask, misaligning
# seqlet_idx with p and total_counts.
seqlet_idx = np.array([s.seqname for s in seqlets])[sort_idx][notnan]
# dropped
# Report how many all-zero profiles were removed by the NaN filter.
print("Dropped", seqlet_profile_obs.shape[0] - p.shape[0], "profiles with only 0's")
# Heatmap of the 1000 highest-count seqlet probability profiles.
heatmap_stranded_profile(p[:1000], figsize=(20,10))
from scipy.stats import entropy
from scipy.special import rel_entr, kl_div
# S(p_obs)
# Per-seqlet Shannon entropy: scipy's entropy() sums over axis 0, hence the
# swapaxes so positions land on axis 0; the final sum pools the two strands.
entropies = entropy(p.swapaxes(0,1)).sum(axis=-1)
# KL(p_obs, p_average)
# Element-wise KL terms vs. the average profile, averaged over the two strands
# (last axis) and summed over positions.
kl = kl_div(p, p.mean(axis=0, keepdims=True)).mean(axis=-1).sum(axis=-1)
# NOTE(review): rel_entr is the KL integrand x*log(x/y), not a cross-entropy;
# the variable name may be misleading.
crossentropy = rel_entr(p, p.mean(axis=0, keepdims=True)).mean(axis=-1).sum(axis=-1)
# Three diagnostic panels relating the per-seqlet statistics.
fig = plt.figure(figsize=(13,5))
plt.subplot(131)
# Entropy vs. seqlet index (sorted by total counts).
plt.plot(entropies);
plt.ylabel("entropy")
plt.xlabel("idx");
plt.subplot(132)
plt.plot(kl, entropies, "."); # kl divergence and the entropy are almost the same
plt.xlabel("kl")
plt.ylabel("Entropy");
plt.subplot(133)
plt.plot(crossentropy, kl, "."); # rel_entr-based quantity vs. kl_div-based quantity
plt.xlabel("crossentropy")
plt.ylabel("Entropy");
# Relationship between profile entropy and (log) total counts.
fig=plt.figure(figsize=(10,4))
plt.subplot(121)
plt.plot(entropies, np.log(1+total_counts.mean(axis=-1)), ".", alpha=0.3);
plt.xlabel("Entropy")
plt.ylabel("log(1+ counts)");
plt.subplot(122)
# Same, but with squared entropy on the x-axis.
plt.plot(entropies**2, np.log(1+total_counts.mean(axis=-1)), ".", alpha=0.3);
plt.xlabel(r"Entropy^2")
plt.ylabel("log(1+ counts)");
# One row per seqlet instance across all patterns.
df = mr.seqlet_df_instances()
# Occurrence count per example: rows=seqname, cols=pattern.
dfp = df.pivot_table("center", "seqname", "pattern", aggfunc=len, fill_value=0)
# Notebook-style inspection (bare expressions; no effect in a script).
seqlet_idx
seqlet_idx[:3]
len(seqlet_idx)
# Pattern-count features for the retained seqlets' examples.
count_features = dfp.loc[seqlet_idx]
count_features.head()
count_features.shape
# Distribution of how often the core pattern occurs per sequence.
count_features[pattern].value_counts().plot.bar();
plt.xlabel("Number of occurrences in the sequence")  # fixed typo: "occurences"
plt.ylabel("Frequency");
# Total occurrences of every *other* pattern across the retained examples.
count_features.iloc[:,count_features.columns!=pattern].sum().plot.bar(figsize=(20,5));
plt.ylabel("Total count");  # was "Mean", but the aggregation above is a .sum()
# Sanity checks on alignment between entropies and count_features
# (bare expressions; only useful interactively).
entropies.shape
count_features.shape
count_features.index
def rename_pattern(p):
    """Shorten a modisco pattern id, e.g. 'metacluster_0/pattern_0' -> 'm0_p0'.

    Names that do not contain 'metacluster' are returned unchanged.
    """
    if "metacluster" not in p:
        return p
    short = p
    for old, new in (("metacluster_", "m"), ("/", "_"), ("pattern_", "p")):
        short = short.replace(old, new)
    return short
# Use the short pattern names as column labels.
count_features.columns = [rename_pattern(p) for p in count_features.columns]
# Long format: one row per (example, pattern) with its occurrence count,
# carrying along per-example entropy and log10 counts.
dfm = count_features.assign(entropy=entropies, counts=np.log10(1 + total_counts.mean(axis=-1)),
example_idx=count_features.index).melt(id_vars=['entropy', 'counts', 'example_idx'], var_name="pattern")
# Keep only patterns with more than 50 total occurrences across examples.
dfmf = dfm.groupby("pattern").filter(lambda x: x.value.sum() > 50)
# Use bracket (column) assignment: attribute assignment on the frame returned
# by groupby().filter() can trigger SettingWithCopyWarning and is not
# guaranteed to write through in pandas.
dfmf["value"] = pd.Categorical(dfmf["value"])
dfm["value"] = pd.Categorical(dfm["value"])
import plotnine
from plotnine import *
# TODO - add the significance test
# TODO - filter in-frequent ones
plotnine.options.figure_size = (20,20)
# Entropy distribution per occurrence-count level, one facet per pattern.
ggplot(aes(x='value', y='entropy'), dfmf) + geom_boxplot() + facet_wrap("~pattern", ncol = 10, scales='free_x') + theme_bw()
import statsmodels.api as sm
import statsmodels.formula.api as smf
def ols_formula(df, dependent_var, *excluded_cols):
    """Build an R-style (patsy) formula for statsmodels from a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame whose columns are the candidate model terms.
    dependent_var : str, the response column (must be a column of `df`).
    *excluded_cols : str, additional columns to leave out of the RHS.

    Returns
    -------
    str, e.g. "counts ~ m0_p0 + m0_p1".

    Raises
    ------
    ValueError if `dependent_var` is not a column of `df`.
    """
    if dependent_var not in df.columns:
        raise ValueError(f"{dependent_var!r} is not a column of the dataframe")
    # Set-based filter: unlike list.remove(), this tolerates excluded columns
    # that are absent and removes every duplicate occurrence of a column.
    drop = set(excluded_cols) | {dependent_var}
    rhs = [col for col in df.columns if col not in drop]
    return dependent_var + ' ~ ' + ' + '.join(rhs)
# Design matrix: pattern counts plus log10 total read counts as the response.
dm = count_features.assign(counts=np.log10(1+total_counts.sum(axis=-1)))
# Inspect the generated formula (bare expression).
ols_formula(dm, "counts")
# OLS: predict log counts from pattern occurrence counts.
results = smf.ols(ols_formula(dm, "counts"), data=dm).fit()
def tidy_ols(ols_results):
    """Return the coefficient table of a fitted statsmodels OLS result as a DataFrame.

    Fix: the original ignored its (misspelled) `ols_reults` argument and read
    the global `results` instead, so it only worked by accident.

    Parameters
    ----------
    ols_results : fitted statsmodels regression results (has .summary()).

    Returns
    -------
    pandas.DataFrame with one row per coefficient; columns come from the
    summary table header (all values are strings).
    """
    smry = ols_results.summary()
    coef = smry.tables[1]  # table 1 of the summary holds the coefficient estimates
    return pd.DataFrame(coef.data[1:], columns=coef.data[0])
# Coefficient table of the fit (string-valued columns from the summary table).
df_fit = tidy_ols(results)
# significant coefficients
# Keep coefficients with p < 0.05; values must be cast from str to float.
df_fit_signif = df_fit[df_fit['P>|t|'].astype(float) < 0.05]
df_fit_signif = df_fit_signif[df_fit_signif[""] != "Intercept"] # don't show the intercept
# Sort by absolute effect size, largest first (bare expression for display).
df_fit_signif.iloc[df_fit_signif['coef'].astype(float).abs().argsort()].iloc[::-1] # sort by the effect size
# Interesting: motifs of one kind are present on both sides.
# (Was a stray markdown/prose cell leaked into the code export -- not valid Python.)
# Short name of the core pattern (e.g. 'm0_p0'), matching the renamed columns.
pattern_short = rename_pattern(pattern)
# Examples where the core pattern occurs exactly once.
single_motif_idx = count_features.index[count_features[pattern_short] == 1]
# Center position of the single core-pattern instance in each such example.
df_center = df[df.seqname.isin(single_motif_idx)].query(f"pattern == '{pattern}'")[['seqname', 'center']]
# Per-example log10 counts keyed by seqname.
df_counts = pd.DataFrame({"seqname": seqlet_idx,
"log_counts": np.log10(1+total_counts.mean(axis=-1))})
# Pair every *other* pattern instance with the core instance in the same example.
dfd = pd.merge(df[df.pattern != pattern], df_center, on='seqname', suffixes=("_other", "_core"))
# Position of the other motif relative to the core motif.
dfd['rel'] = dfd.center_other - dfd.center_core
dfd = dfd.merge(df_counts, on="seqname")
plotnine.options.figure_size = (20,20)
# Relative motif position vs. log counts, faceted by pattern (wide window).
ggplot(aes(x="rel", y='log_counts'), dfd) + geom_point(alpha=0.5) + facet_wrap("~pattern", ncol = 5) + theme_bw() + xlim([-400, 400])
plotnine.options.figure_size = (20,20)
# Same, zoomed to +/-70 bp around the core motif.
ggplot(aes(x="rel", y='log_counts'), dfd) + geom_point(alpha=0.5) + facet_wrap("~pattern", ncol = 5) + theme_bw() + xlim([-70, 70])
# Inspect the raw instance table (bare expression).
df
# Minimum absolute relative distance: zero would mean another motif instance
# sits at the core motif's exact position.
dfd.rel.abs().min() # -> there are some at the same position ?!?!?!?