Goal

  • analyze the pairwise features for Klf4

Conclusions

  • sensitivity to other factors, ordered from least to most sensitive
    • Nanog
    • Sox2
    • Oct4
    • Klf4
  • regulation model (who influences whom)
    • Klf4 -> Sox2
    • no regulator on Nanog
    • Sox2, ERR2 -> Oct4
    • Nanog -> Klf4
  • long motifs play an important role in regulating Klf4
  • position doesn't play a role in regulating it
In [8]:
from basepair.imports import *
ddir = get_data_dir()
In [9]:
from basepair.plot.profiles import extract_signal
from basepair.math import softmax
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
from basepair.plot.profiles import plot_stranded_profile, multiple_plot_stranded_profile

import plotnine
from plotnine import *

import statsmodels.api as sm
import statsmodels.formula.api as smf
In [ ]:
ls {model_dir}
In [10]:
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_pdir = model_dir / "modisco/by_peak_tasks/weighted/"
In [11]:
# Load the data
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()

Klf4

In [12]:
task = "Klf4"
pattern = "metacluster_0/pattern_0"
In [13]:
mr = ModiscoResult(modisco_pdir / f"{task}/modisco.h5")
mr.open()
seqlets = mr._get_seqlets(pattern)
#mr.close()
In [14]:
include_samples = np.load(read_json(modisco_pdir / f"{task}/kwargs.json")["filter_npy"])
In [15]:
profile_obs = d.f[f'/targets/profile/{task}'][:][include_samples]
In [16]:
ds = DataSpec.load(model_dir / "dataspec.yaml")
In [17]:
seqlet_profile_obs = extract_signal(profile_obs, seqlets)
In [18]:
total_counts = seqlet_profile_obs.sum(axis=-1).sum(axis=-1)
sort_idx = np.argsort(-total_counts)
In [19]:
plot_stranded_profile(seqlet_profile_obs.mean(axis=0))
In [20]:
# probabilities; zero-count profiles yield NaN here and are dropped below
with np.errstate(invalid='ignore'):
    p = seqlet_profile_obs[sort_idx] / seqlet_profile_obs[sort_idx].sum(axis=1, keepdims=True)

# drop NA's (profiles with only 0 counts)
notnan = ~np.any(np.any(np.isnan(p), axis=-1), axis=-1)
total_counts = seqlet_profile_obs[sort_idx].sum(axis=1)[notnan]
p = p[notnan]

seqlet_idx = np.array([s.seqname for s in seqlets])[notnan]
In [21]:
# dropped 
print("Dropped", seqlet_profile_obs.shape[0] - p.shape[0], "profiles with only 0's")
Dropped 488 profiles with only 0's
In [22]:
heatmap_stranded_profile(p[:1000], figsize=(20,10))

1. Quantify the profile effect -> entropy and total counts

In [23]:
from scipy.stats import entropy
from scipy.special import rel_entr, kl_div
In [25]:
# S(p_obs)
entropies = entropy(p.swapaxes(0,1)).sum(axis=-1)
entropies_uniform = entropy(np.ones_like(p).swapaxes(0,1)).sum(axis=-1)

# KL(p_obs || p_average); note kl_div(x, y) = rel_entr(x, y) - x + y
kl = kl_div(p, p.mean(axis=0, keepdims=True)).mean(axis=-1).sum(axis=-1)
crossentropy = rel_entr(p, p.mean(axis=0, keepdims=True)).mean(axis=-1).sum(axis=-1)
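A quick sanity check of the scipy calls used above, on made-up toy probability vectors: `entropy` normalizes its input (so raw counts work), and `kl_div(x, y) = rel_entr(x, y) - x + y`, which means the two agree once both arguments are proper probability vectors.

```python
import numpy as np
from scipy.stats import entropy
from scipy.special import rel_entr, kl_div

p = np.array([0.5, 0.25, 0.25])
q = np.array([0.25, 0.25, 0.5])

# entropy() normalizes its input, so scaled counts give the same value
assert np.isclose(entropy(p), entropy(p * 10))

# for probability vectors, kl_div and rel_entr sum to the same KL divergence
assert np.isclose(kl_div(p, q).sum(), rel_entr(p, q).sum())

# KL(p || q) = sum p * log(p / q)
assert np.isclose(rel_entr(p, q).sum(), np.sum(p * np.log(p / q)))
```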
In [44]:
fig = plt.figure(figsize=(13,5))
plt.subplot(131)
plt.plot(entropies - entropies_uniform);
plt.ylabel("entropy - entropy_uniform")
plt.xlabel("idx");
plt.subplot(132)
plt.plot(kl, entropies, ".");  # KL divergence to the mean profile and the entropy are closely related
plt.xlabel("kl")
plt.ylabel("Entropy");
plt.subplot(133)
plt.plot(crossentropy, kl, ".");  # kl_div and rel_entr give nearly identical values here
plt.xlabel("crossentropy")
plt.ylabel("kl");
In [30]:
fig=plt.figure(figsize=(10,4))
plt.subplot(121)
plt.plot(entropies, np.log(1+total_counts.mean(axis=-1)), ".", alpha=0.3);
plt.xlabel("Entropy")
plt.ylabel("log(1+ counts)");
plt.subplot(122)
plt.plot(entropies**2, np.log(1+total_counts.mean(axis=-1)), ".", alpha=0.3);
plt.xlabel(r"Entropy^2")
plt.ylabel("log(1+ counts)");
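Side note on the count axis above: `np.log1p` is the numerically safer spelling of `log(1 + x)`, and the base-10 variant used here is just a rescaling. A minimal sketch with toy values:

```python
import numpy as np

x = np.array([0.0, 9.0, 99.0])

# log(1 + x) and log1p(x) agree (log1p is safer for tiny x)
assert np.allclose(np.log(1 + x), np.log1p(x))

# log10(1 + x), as used for the count axis, is log1p(x) / log(10)
assert np.allclose(np.log10(1 + x), np.log1p(x) / np.log(10))
```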

Conclusion

  • both metrics, counts and entropy, are good for characterizing the signal

Question: How do others contribute counts?

Features

Count matrix

In [31]:
df = mr.seqlet_df_instances()
In [32]:
dfp = df.pivot_table("center", "seqname", "pattern", aggfunc=len, fill_value=0)
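A minimal sketch (with made-up seqnames and patterns) of how `pivot_table` with `aggfunc=len` turns the per-seqlet instance table into the motif-count matrix used below:

```python
import pandas as pd

# toy instance table: one row per motif occurrence (hypothetical values)
df = pd.DataFrame({
    "seqname": [0, 0, 0, 1, 1],
    "pattern": ["m0_p0", "m0_p0", "m1_p2", "m0_p0", "m1_p3"],
    "center": [100, 250, 400, 120, 300],
})

# one row per sequence, one column per pattern, cells = occurrence counts
dfp = df.pivot_table("center", "seqname", "pattern", aggfunc=len, fill_value=0)
assert dfp.loc[0, "m0_p0"] == 2 and dfp.loc[1, "m1_p2"] == 0
```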
In [33]:
count_features = dfp.loc[seqlet_idx]
In [34]:
count_features[pattern].value_counts().plot.bar();
plt.xlabel("Number of occurrences in the sequence")
plt.ylabel("Frequency");
In [35]:
count_features.iloc[:,count_features.columns!=pattern].sum().plot.bar(figsize=(20,5));
plt.ylabel("Sum");

Boxplot of the change in entropy for each factor

In [36]:
def rename_pattern(p):
    if "metacluster" in p:
        return p.replace("metacluster_", "m").replace("/", "_").replace("pattern_", "p")
    else:
        return p
In [37]:
count_features.columns = [rename_pattern(p) for p in count_features.columns]
In [38]:
dfm = count_features.assign(entropy=entropies, counts=np.log10(1 + total_counts.mean(axis=-1)), 
                            example_idx=count_features.index).melt(id_vars=['entropy', 'counts', 'example_idx'], var_name="pattern")
In [39]:
dfmf = dfm.groupby("pattern").filter(lambda x: x.value.sum()> 50)
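`groupby(...).filter` keeps all rows of every group whose summed value passes the cutoff. A toy sketch of the idiom (hypothetical patterns, threshold of 2 instead of 50 for illustration):

```python
import pandas as pd

dfm = pd.DataFrame({"pattern": ["a", "a", "b", "b", "b"],
                    "value":   [1, 0, 1, 1, 1]})

# keep only patterns whose summed value exceeds the threshold
dfmf = dfm.groupby("pattern").filter(lambda x: x.value.sum() > 2)
assert sorted(dfmf.pattern.unique()) == ["b"]
```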
In [40]:
dfmf = dfmf.assign(value=pd.Categorical(dfmf.value))  # assign avoids the SettingWithCopyWarning
dfm = dfm.assign(value=pd.Categorical(dfm.value))
In [41]:
plotnine.options.figure_size = (20,10)
ggplot(aes(x='value', y='counts'), dfmf) + geom_boxplot() + facet_wrap("~pattern", ncol = 10, scales='free_x') + theme_bw()
Out[41]:
<ggplot: (8778820924985)>

Fit a model to determine the effects

In [42]:
def ols_formula(df, dependent_var, *excluded_cols):
    '''
    Generate an R-style formula for statsmodels (patsy) given the
    dataframe, the dependent variable, and optional excluded column
    names (as strings).
    '''
    df_columns = list(df.columns.values)
    df_columns.remove(dependent_var)
    for col in excluded_cols:
        df_columns.remove(col)
    return dependent_var + ' ~ ' + ' + '.join(df_columns)
In [43]:
dm = count_features.assign(counts=np.log10(1+total_counts.sum(axis=-1)))
In [52]:
ols_formula(dm, "counts")
Out[52]:
'counts ~ m0_p0 + m0_p1 + m0_p10 + m0_p11 + m0_p12 + m0_p13 + m0_p14 + m0_p15 + m0_p16 + m0_p17 + m0_p18 + m0_p19 + m0_p2 + m0_p20 + m0_p21 + m0_p22 + m0_p23 + m0_p24 + m0_p25 + m0_p26 + m0_p27 + m0_p3 + m0_p4 + m0_p5 + m0_p6 + m0_p7 + m0_p8 + m0_p9 + m1_p0 + m1_p1 + m1_p10 + m1_p2 + m1_p3 + m1_p4 + m1_p5 + m1_p6 + m1_p7 + m1_p8 + m1_p9 + m11_p0 + m11_p1 + m11_p2 + m11_p3 + m11_p4 + m2_p0 + m2_p1 + m2_p10 + m2_p11 + m2_p2 + m2_p3 + m2_p4 + m2_p5 + m2_p6 + m2_p7 + m2_p8 + m2_p9 + m4_p0 + m4_p1 + m4_p2 + m4_p3 + m4_p4 + m4_p5 + m4_p6 + m5_p0 + m5_p1 + m5_p2 + m5_p3 + m5_p4 + m5_p5 + m6_p0 + m6_p1 + m6_p2 + m6_p3 + m6_p4 + m6_p5 + m6_p6 + m7_p0 + m7_p1 + m7_p2 + m7_p3 + m7_p4 + m7_p5 + m9_p0 + m9_p1'
In [53]:
results = smf.ols(ols_formula(dm, "counts"), data=dm).fit()
In [54]:
def tidy_ols(ols_results):
    # extract the coefficient table from the statsmodels summary as a DataFrame
    smry = ols_results.summary()
    coef = smry.tables[1]
    return pd.DataFrame(coef.data[1:], columns=coef.data[0])
In [55]:
df_fit = tidy_ols(results)
In [56]:
# significant coefficients
df_fit_signif = df_fit[df_fit['P>|t|'].astype(float) < 0.05]
df_fit_signif = df_fit_signif[df_fit_signif[""] != "Intercept"]  # don't show the intercept
df_fit_signif.iloc[df_fit_signif['coef'].astype(float).abs().argsort()].iloc[::-1]  # sort by the effect size
Out[56]:
idx pattern coef std err t P>|t| [0.025 0.975]
36 m1_p6 -0.3461 0.100 -3.444 0.001 -0.543 -0.149
39 m1_p9 -0.2892 0.096 -3.028 0.002 -0.476 -0.102
62 m4_p5 0.2556 0.104 2.454 0.014 0.051 0.460
76 m6_p6 -0.2541 0.067 -3.781 0.000 -0.386 -0.122
17 m0_p23 -0.2208 0.057 -3.841 0.000 -0.333 -0.108
11 m0_p18 -0.2143 0.039 -5.495 0.000 -0.291 -0.138
22 m0_p3 -0.2140 0.037 -5.709 0.000 -0.288 -0.141
68 m5_p4 0.1936 0.089 2.164 0.031 0.018 0.369
74 m6_p4 -0.1707 0.056 -3.025 0.002 -0.281 -0.060
79 m7_p2 0.1386 0.052 2.669 0.008 0.037 0.240
50 m2_p3 0.1321 0.051 2.576 0.010 0.032 0.233
32 m1_p2 0.1159 0.021 5.536 0.000 0.075 0.157
35 m1_p5 -0.1041 0.036 -2.872 0.004 -0.175 -0.033
8 m0_p15 -0.0911 0.044 -2.057 0.040 -0.178 -0.004
33 m1_p3 0.0893 0.029 3.059 0.002 0.032 0.147
34 m1_p4 -0.0861 0.028 -3.092 0.002 -0.141 -0.032
37 m1_p7 -0.0852 0.038 -2.267 0.023 -0.159 -0.012
26 m0_p7 -0.0808 0.039 -2.057 0.040 -0.158 -0.004
41 m11_p1 0.0800 0.037 2.177 0.029 0.008 0.152
57 m4_p0 0.0474 0.022 2.148 0.032 0.004 0.091
30 m1_p1 -0.0417 0.020 -2.133 0.033 -0.080 -0.003
29 m1_p0 0.0409 0.014 3.029 0.002 0.014 0.067
1 m0_p0 -0.0348 0.008 -4.407 0.000 -0.050 -0.019
In [57]:
# significant coefficients - more stringent
df_fit_signif = df_fit[df_fit['P>|t|'].astype(float) < 0.01]
df_fit_signif = df_fit_signif[df_fit_signif[""] != "Intercept"]  # don't show the intercept
df_fit_signif.iloc[df_fit_signif['coef'].astype(float).abs().argsort()].iloc[::-1]  # sort by the effect size
Out[57]:
idx pattern coef std err t P>|t| [0.025 0.975]
36 m1_p6 -0.3461 0.100 -3.444 0.001 -0.543 -0.149
39 m1_p9 -0.2892 0.096 -3.028 0.002 -0.476 -0.102
76 m6_p6 -0.2541 0.067 -3.781 0.000 -0.386 -0.122
17 m0_p23 -0.2208 0.057 -3.841 0.000 -0.333 -0.108
11 m0_p18 -0.2143 0.039 -5.495 0.000 -0.291 -0.138
22 m0_p3 -0.2140 0.037 -5.709 0.000 -0.288 -0.141
74 m6_p4 -0.1707 0.056 -3.025 0.002 -0.281 -0.060
79 m7_p2 0.1386 0.052 2.669 0.008 0.037 0.240
32 m1_p2 0.1159 0.021 5.536 0.000 0.075 0.157
35 m1_p5 -0.1041 0.036 -2.872 0.004 -0.175 -0.033
33 m1_p3 0.0893 0.029 3.059 0.002 0.032 0.147
34 m1_p4 -0.0861 0.028 -3.092 0.002 -0.141 -0.032
29 m1_p0 0.0409 0.014 3.029 0.002 0.014 0.067
1 m0_p0 -0.0348 0.008 -4.407 0.000 -0.050 -0.019

Conclusions

There is a lot going on in the pairwise binding

Significant negative effect

  • m1_p6 - long motif containing a Sox2 motif and possibly Nanog/Sox2
  • m1_p9 - long motif bound by Nanog, not by Klf4
  • m6_p6 - long Klf4 motif
  • m0_p23 - long Klf4 motif
  • m0_p18 - long Klf4 motif
  • m0_p3 - short Klf4 motif
  • m1_p5 - long Sox2/Nanog motif

Significant positive effect

  • m4_p5 - Nanog motif
  • m7_p2 - Nanog motif
  • m1_p2 - ERR2 motif
In [41]:
pattern_short = rename_pattern(pattern)
single_motif_idx = count_features.index[count_features[pattern_short] == 1]
df_center = df[df.seqname.isin(single_motif_idx)].query(f"pattern == '{pattern}'")[['seqname', 'center']]
df_counts = pd.DataFrame({"seqname": seqlet_idx,
                          "log_counts": np.log10(1+total_counts.mean(axis=-1))})
dfd = pd.merge(df[df.pattern != pattern], df_center, on='seqname', suffixes=("_other", "_core"))
dfd['rel'] = dfd.center_other - dfd.center_core
dfd = dfd.merge(df_counts, on="seqname")
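The relative-position computation above can be illustrated on toy coordinates (hypothetical seqnames and centers): merging other-motif centers against the per-sequence core-motif center and subtracting yields signed offsets.

```python
import pandas as pd

df_other = pd.DataFrame({"seqname": [0, 0, 1],
                         "pattern": ["m1_p2", "m4_p0", "m1_p2"],
                         "center":  [150, 420, 90]})
df_core = pd.DataFrame({"seqname": [0, 1], "center": [200, 100]})

# suffixes disambiguate the two 'center' columns after the merge
dfd = pd.merge(df_other, df_core, on="seqname", suffixes=("_other", "_core"))
dfd["rel"] = dfd.center_other - dfd.center_core
assert dfd.rel.tolist() == [-50, 220, -10]
```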
In [58]:
plotnine.options.figure_size = (20,30)
ggplot(aes(x="rel", y='log_counts'), dfd) + geom_point(alpha=0.5) + facet_wrap("~pattern", ncol = 5) + theme_bw() #+ xlim([-400, 400])
Out[58]:
<ggplot: (-9223363298286387631)>
In [59]:
plotnine.options.figure_size = (20,30)
ggplot(aes(x="rel", y='log_counts'), dfd) + geom_point(alpha=0.5) + facet_wrap("~pattern", ncol = 5) + theme_bw() + xlim([-70, 70])
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:450: UserWarning: geom_point : Removed 3943 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
Out[59]:
<ggplot: (-9223363298533720078)>

Conclusions

  • no specific positioning of the other motifs relative to Klf4