Goal¶

analyze the pairwise features for Nanog

Conclusions¶

there is almost nothing going on in the pairwise interactions

from basepair.imports import *
ddir = get_data_dir()

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

from basepair.plot.profiles import extract_signal
from basepair.math import softmax
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
from basepair.plot.profiles import  plot_stranded_profile, multiple_plot_stranded_profile

import plotnine
from plotnine import *

import statsmodels.api as sm
import statsmodels.formula.api as smf

from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.modisco.core import dfi2seqlets, annotate_profile
from basepair.cli.modisco import load_profiles

model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_pdir = model_dir / "modisco/by_peak_tasks/weighted/"

# Load the data
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()

dfi = pd.read_csv(f"{modisco_pdir}/Nanog/instances.tsv.gz", sep='\t')

dfi.tail()

seqlets_medium = dfi2seqlets(dfi[dfi.match_weighted_cat == 'medium'])

dfi.match_weighted.plot.hist(100);

dfi.match_weighted_p.plot.hist(10);

dfi.match_weighted_cat.value_counts().plot.bar();

dfi.imp_weighted_p.plot.hist(100);

dfi.imp_weighted_cat.value_counts().plot.bar();

dfi.match_weighted_p.plot.hist(10);

from basepair.stats import low_medium_high

Re-compute the motif instances¶

p = mr.get_pattern(pattern).trim_seq_ic(0.08)

dfi.head()

Nanog¶

task = "Nanog"
pattern = "metacluster_0/pattern_0"

Dev - compute the profile similarity¶

# load profiles
profiles = load_profiles(modisco_pdir, model_dir/'grad.all.h5')

tasks = list(profiles)

dfi_anno = annotate_profile(dfi, mr, profiles)

  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [05:48<00:00, 348.17s/it]

dfi_anno['Klf4/profile_match'].isnull().mean()

0.026150592159228804

dfi_anno['match_weighted_cat'] = pd.Categorical(dfi_anno['match_weighted_cat'])

dfi_anno['imp_weighted_cat'] = pd.Categorical(dfi_anno['imp_weighted_cat'])

len(dfi_anno)

295022

len(dfi)

295022

dfif = dfi_anno[~np.isinf(dfi_anno['Oct4/profile_match'])]

dfif.groupby(['match_weighted_cat', 'imp_weighted_cat'])['Oct4/profile_match'].mean().plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x7f0b0bf3fac8>

ggplot(aes(x='match_weighted_cat', 
           color='imp_weighted_cat', 
           y='Oct4/profile_match'), dfi_anno.sample(100000)) + geom_boxplot()

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/utils.py in match(v1, v2, nomatch, incomparables, start)
    139         with suppress(KeyError):
--> 140             lst[i] = lookup[x] + start
    141 

KeyError: nan

During handling of the above exception, another exception occurred:

KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-371-eed5ae626cd0> in <module>()
      1 ggplot(aes(x='match_weighted_cat', 
      2            color='imp_weighted_cat',
----> 3            y='Oct4/profile_match'), dfi_anno.sample(100000)) + geom_boxplot()

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/core/displayhook.py in __call__(self, result)
    255             self.start_displayhook()
    256             self.write_output_prompt()
--> 257             format_dict, md_dict = self.compute_format_data(result)
    258             self.update_user_ns(result)
    259             self.fill_exec_result(result)

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/core/displayhook.py in compute_format_data(self, result)
    149 
    150         """
--> 151         return self.shell.display_formatter.format(result)
    152 
    153     # This can be set to True by the write_output_prompt method in a subclass

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/core/formatters.py in format(self, obj, include, exclude)
    178             md = None
    179             try:
--> 180                 data = formatter(obj)
    181             except:
    182                 # FIXME: log the exception

<decorator-gen-10> in __call__(self, obj)

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/core/formatters.py in catch_format_error(method, self, *args, **kwargs)
    222     """show traceback on failed format call"""
    223     try:
--> 224         r = method(self, *args, **kwargs)
    225     except NotImplementedError:
    226         # don't warn on NotImplementedErrors

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/core/formatters.py in __call__(self, obj)
    700                 type_pprinters=self.type_printers,
    701                 deferred_pprinters=self.deferred_printers)
--> 702             printer.pretty(obj)
    703             printer.flush()
    704             return stream.getvalue()

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/lib/pretty.py in pretty(self, obj)
    398                         if cls is not object \
    399                                 and callable(cls.__dict__.get('__repr__')):
--> 400                             return _repr_pprint(obj, self, cycle)
    401 
    402             return _default_pprint(obj, self, cycle)

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
    693     """A pprint that just redirects to the normal repr function."""
    694     # Find newlines and replace them with p.break_()
--> 695     output = repr(obj)
    696     for idx,output_line in enumerate(output.splitlines()):
    697         if idx:

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py in __repr__(self)
     84         Print/show the plot
     85         """
---> 86         self.draw()
     87         plt.show()
     88         return '<ggplot: (%d)>' % self.__hash__()

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py in draw(self, return_ggplot)
    177         # assign a default theme
    178         self = deepcopy(self)
--> 179         self._build()
    180 
    181         # If no theme we use the default

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py in _build(self)
    274         # Compute aesthetics to produce data with generalised
    275         # variable names
--> 276         layers.compute_aesthetics(self)
    277 
    278         # Transform data using all scales

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py in compute_aesthetics(self, plot)
     80     def compute_aesthetics(self, plot):
     81         for l in self:
---> 82             l.compute_aesthetics(plot)
     83 
     84     def compute_statistic(self, layout):

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py in compute_aesthetics(self, plot)
    348             evaled['PANEL'] = data['PANEL']
    349 
--> 350         self.data = add_group(evaled)
    351 
    352     def compute_statistic(self, layout):

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py in add_group(data)
    479         disc = discrete_columns(data, ignore=['label'])
    480         if disc:
--> 481             data['group'] = ninteraction(data[disc], drop=True)
    482         else:
    483             data['group'] = NO_GROUP

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/utils.py in ninteraction(df, drop)
    288 
    289     if drop:
--> 290         return _id_var(res, drop)
    291     else:
    292         return res

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/utils.py in _id_var(x, drop)
    319             levels = multitype_sort(set(x))
    320 
--> 321         lst = match(x, levels)
    322         lst = [item + 1 for item in lst]
    323 

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/utils.py in match(v1, v2, nomatch, incomparables, start)
    138 
    139         with suppress(KeyError):
--> 140             lst[i] = lookup[x] + start
    141 
    142     return lst

KeyboardInterrupt:

ggplot(aes(x='match_weighted_cat', 
           color='imp_weighted_cat', 
           y='Oct4/profile_match'), dfi_anno.sample(100000)) + geom_boxplot()

<ggplot: (8730573773232)>

dfi_anno.head()

np.isinf(np.array([np.inf]))

array([ True])

mr = ModiscoResult(modisco_pdir / f"{task}/modisco.h5")
mr.open()
seqlets = mr._get_seqlets(pattern, trim_frac=0.08)
seqlets = resize_seqlets(seqlets, 200, seqlen=profile_obs.shape[1])
# mr.close()

list(profiles)

['Klf4', 'Nanog', 'Oct4', 'Sox2']

len(seqlets)

4265

len(seqlets)

4265

from basepair.cli.modisco import load_ranges, load_included_samples

ranges = load_ranges(modisco_pdir / task)

load_included_samples(modisco_pdir / task)

array([False, False, False, ..., False, False, False])

include_samples = np.load(read_json(modisco_pdir / f"{task}/kwargs.json")["filter_npy"])

out_task = 'Oct4'
profile_obs = d.f[f'/targets/profile/{out_task}'][:][include_samples]

ds = DataSpec.load(model_dir / "dataspec.yaml")

seqlets = dfi2seqlets(dfi[(dfi.match_weighted_cat == 'medium')])
seqlets = resize_seqlets(seqlets, 200, seqlen=profile_obs.shape[1])

seqlets[:3]

[Seqlet(seqname=1, start=317, end=517, name='metacluster_0/pattern_0', strand='+'),
 Seqlet(seqname=1, start=698, end=898, name='metacluster_0/pattern_0', strand='-'),
 Seqlet(seqname=3, start=322, end=522, name='metacluster_0/pattern_0', strand='+')]

seqlet_profile_obs = extract_signal(profile_obs, seqlets)

total_counts = seqlet_profile_obs.sum(axis=-1).sum(axis=-1)
sort_idx = np.argsort(-total_counts)

avg_profile = seqlet_profile_obs.mean(axis=0)

# now, compute the KL distance

plot_stranded_profile(avg_profile)

plot_stranded_profile(p.mean(axis=0))

# probabilities
p = seqlet_profile_obs[sort_idx] / seqlet_profile_obs[sort_idx].sum(axis=1, keepdims=True)

# drop NA's
notnan = ~np.any(np.any(np.isnan(p), axis=-1), axis=-1)
total_counts = seqlet_profile_obs[sort_idx].sum(axis=1)[notnan]
p = p[notnan]

seqlet_idx = np.array([s.seqname for s in seqlets])[notnan]

total_counts.shape

(4228, 2)

# dropped 
print("Dropped", seqlet_profile_obs.shape[0] - p.shape[0], "profiles with only 0's")

Dropped 71 profiles with only 0's

p.shape

(14169, 200, 2)

old_total_counts = total_counts

plt.scatter(total_counts[:,0], total_counts[:,1])

<matplotlib.collections.PathCollection at 0x7f0bdbbe39b0>

plt.plot(total_counts.sum(axis=-1)[:400])
plt.plot(old_total_counts.sum(axis=-1)[:400], label='old')
plt.legend()

<matplotlib.legend.Legend at 0x7f0b0ed17278>

heatmap_stranded_profile(p[:5000], figsize=(20,20))

heatmap_stranded_profile(p[:5000], figsize=(20,20))

1. Quantify the profile effect -> entropy and total counts¶

from scipy.stats import entropy
from scipy.special import  rel_entr, kl_div

# S(p_obs)
entropies = entropy(p.swapaxes(0,1)).sum(axis=-1)

# KL(p_obs, p_average)
kl = kl_div(p, p.mean(axis=0, keepdims=True)).mean(axis=-1).sum(axis=-1)
crossentropy = rel_entr(p, p.mean(axis=0, keepdims=True)).mean(axis=-1).sum(axis=-1)

fig = plt.figure(figsize=(13,5))
plt.subplot(131)
plt.plot(entropies);
plt.ylabel("entropy")
plt.xlabel("idx");
plt.subplot(132)
plt.plot(kl, entropies, ".");  # kl divergence and the entropy between the other factor are almost the same
plt.xlabel("kl")
plt.ylabel("Entropy");
plt.subplot(133)
plt.plot(crossentropy, kl, ".");  # kl divergence and the entropy between the other factor are almost the same
plt.xlabel("crossentropy")
plt.ylabel("Entropy");

fig=plt.figure(figsize=(10,4))
plt.subplot(121)
plt.plot(entropies, np.log(1+total_counts.mean(axis=-1)), ".", alpha=0.3);
plt.xlabel("Entropy")
plt.ylabel("log(1+ counts)");
plt.subplot(122)
plt.plot(entropies**2, np.log(1+total_counts.mean(axis=-1)), ".", alpha=0.3);
plt.xlabel(r"Entropy^2")
plt.ylabel("log(1+ counts)");

Conclusion¶

both metrics: counts and entropy are good for characterizing the signal

Question: How do others contribute counts?¶

Features¶

Count matrix¶

df = mr.seqlet_df_instances()

dfp = df.pivot_table("center", "seqname", "pattern", aggfunc=len, fill_value=0)

count_features = dfp.loc[seqlet_idx]

count_features[pattern].value_counts().plot.bar();
plt.xlabel("Number of occurences in the sequence")
plt.ylabel("Frequency");

count_features.iloc[:,count_features.columns!=pattern].sum().plot.bar(figsize=(20,5));
plt.ylabel("Sum");

Boxplot for each factor the change in entropy¶

def rename_pattern(p):
    if "metacluster" in p:
        return p.replace("metacluster_", "m").replace("/", "_").replace("pattern_", "p")
    else:
        return p

count_features.columns = [rename_pattern(p) for p in count_features.columns]

dfm = count_features.assign(entropy=entropies, counts=np.log10(1 + total_counts.mean(axis=-1)), 
                            example_idx=count_features.index).melt(id_vars=['entropy', 'counts', 'example_idx'], var_name="pattern")

dfmf = dfm.groupby("pattern").filter(lambda x: x.value.sum()> 10)

dfmf.value = pd.Categorical(dfmf.value)
dfm.value = pd.Categorical(dfm.value)

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/generic.py:3643: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value

plotnine.options.figure_size = (20,10)
ggplot(aes(x='value', y='entropy'), dfmf) + geom_boxplot() + facet_wrap("~pattern", ncol = 10, scales='free_x') + theme_bw()

<ggplot: (8768024965668)>

Fit a model to determine the effects¶

def ols_formula(df, dependent_var, *excluded_cols):
    '''
    Generates the R style formula for statsmodels (patsy) given
    the dataframe, dependent variable and optional excluded columns
    as strings
    '''
    df_columns = list(df.columns.values)
    df_columns.remove(dependent_var)
    for col in excluded_cols:
        df_columns.remove(col)
    return dependent_var + ' ~ ' + ' + '.join(df_columns)

dm = count_features.assign(counts=np.log10(1+total_counts.sum(axis=-1)))

ols_formula(dm, "counts")

'counts ~ m0_p0 + m0_p1 + m0_p10 + m0_p11 + m0_p12 + m0_p13 + m0_p14 + m0_p15 + m0_p2 + m0_p3 + m0_p4 + m0_p5 + m0_p6 + m0_p7 + m0_p8 + m0_p9 + m1_p0 + m1_p1 + m1_p2 + m1_p3 + m1_p4 + m1_p5 + m1_p6 + m1_p7 + m1_p8 + m10_p0 + m10_p1 + m2_p0 + m2_p1 + m2_p2 + m2_p3 + m2_p4 + m3_p0 + m3_p1 + m3_p2 + m3_p3 + m3_p4 + m3_p5 + m3_p6 + m3_p7 + m3_p8 + m4_p0 + m4_p1 + m4_p2 + m4_p3 + m4_p4 + m4_p5 + m4_p6 + m6_p0 + m6_p1 + m6_p2 + m6_p3 + m7_p0 + m7_p1 + m7_p2 + m7_p3 + m7_p4 + m7_p5 + m8_p0 + m8_p1'

results = smf.ols(ols_formula(dm, "counts"), data=dm).fit()

def tidy_ols(ols_reults):
    smry = results.summary()
    coef = smry.tables[1]
    return pd.DataFrame(coef.data[1:], columns=coef.data[0])

df_fit = tidy_ols(results)

# significant coefficients
df_fit_signif = df_fit[df_fit['P>|t|'].astype(float) < 0.05]
df_fit_signif = df_fit_signif[df_fit_signif[""] != "Intercept"]  # don't show the intercept
df_fit_signif.iloc[df_fit_signif['coef'].astype(float).abs().argsort()].iloc[::-1]  # sort by the effect size

Conclusions¶

Significant negative effect¶

m1_p0 - another Nanog motif

Significant positive effect¶

m3_p4 - Klf4 - homodimer - long motif (only 3 instances...)
m0_p8 - long motif (Oct4?)

pattern_short = rename_pattern(pattern)
single_motif_idx = count_features.index[count_features[pattern_short] == 1]
df_center = df[df.seqname.isin(single_motif_idx)].query(f"pattern == '{pattern}'")[['seqname', 'center']]
df_counts = pd.DataFrame({"seqname": seqlet_idx,
                          "log_counts": np.log10(1+total_counts.mean(axis=-1))})
dfd = pd.merge(df[df.pattern != pattern], df_center, on='seqname', suffixes=("_other", "_core"))
dfd['rel'] = dfd.center_other - dfd.center_core
dfd = dfd.merge(df_counts, on="seqname")

plotnine.options.figure_size = (20,20)
ggplot(aes(x="rel", y='log_counts'), dfd) + geom_point(alpha=0.5) + facet_wrap("~pattern", ncol = 5) + theme_bw() #+ xlim([-400, 400])

<ggplot: (-9223363268808843695)>

plotnine.options.figure_size = (20,20)
ggplot(aes(x="rel", y='log_counts'), dfd) + geom_point(alpha=0.5) + facet_wrap("~pattern", ncol = 5) + theme_bw() + xlim([-70, 70])

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:450: UserWarning: geom_point : Removed 611 rows containing missing values.
  self.data = self.geom.handle_na(self.data)

<ggplot: (-9223363268845628777)>

Conclusions¶

not much going on

	pattern	example_idx	pattern_start	pattern_end	strand	pattern_len	pattern_center	match_weighted	match_weighted_p	match_weighted_cat	...	match/Sox2	imp/Klf4	imp/Nanog	imp/Oct4	imp/Sox2	example_chrom	example_start	example_end	example_strand	example_interval_from_task
295017	metacluster_0/pattern_0	18016	552	568	-	16	560	0.142941	0.193969	low	...	0.160996	0.101324	0.330139	0.052185	0.092031	chr3	10381329	10382329	*	Nanog
295018	metacluster_0/pattern_0	18016	591	607	+	16	599	0.147612	0.193969	low	...	0.128601	0.078536	0.311810	0.050221	0.088571	chr3	10381329	10382329	*	Nanog
295019	metacluster_0/pattern_0	18016	689	705	-	16	697	0.148917	0.193969	low	...	0.199165	0.076908	0.100458	0.134652	0.110449	chr3	10381329	10382329	*	Nanog
295020	metacluster_0/pattern_0	18016	695	711	+	16	703	0.205385	0.215127	low	...	0.215987	0.079238	0.072323	0.128498	0.098791	chr3	10381329	10382329	*	Nanog
295021	metacluster_0/pattern_0	18016	876	892	-	16	884	0.197814	0.215127	low	...	0.246796	0.025204	0.014320	0.031117	0.021162	chr3	10381329	10382329	*	Nanog

	pattern	pattern_start	pattern_end	strand	pattern_len	pattern_center	match_weighted	match_weighted_p	match_weighted_cat	...	match/Sox2	imp/Klf4	imp/Nanog	imp/Oct4	imp/Sox2	example_chrom	example_start	example_end	example_strand	example_interval_from_task
0	metacluster_0/pattern_0	75	91	+	16	83	0.175556	0.193969	low	...	0.206333	0.050587	0.063537	0.090030	0.036130	chrX	143482572	143483572	*	Nanog
1	metacluster_0/pattern_0	152	168	-	16	160	0.141040	0.193969	low	...	0.223384	0.032036	0.020173	0.023113	0.029687	chrX	143482572	143483572	*	Nanog
2	metacluster_0/pattern_0	235	251	-	16	243	0.170543	0.193969	low	...	0.206882	0.018007	0.063500	0.021244	0.022853	chrX	143482572	143483572	*	Nanog
3	metacluster_0/pattern_0	237	253	+	16	245	0.151329	0.193969	low	...	0.186484	0.017459	0.059497	0.021635	0.023506	chrX	143482572	143483572	*	Nanog
4	metacluster_0/pattern_0	245	261	-	16	253	0.205689	0.215127	low	...	0.221023	0.020000	0.051363	0.035843	0.030362	chrX	143482572	143483572	*	Nanog

	Klf4/profile_match	Klf4/profile_match_p	Klf4/profile_counts	Klf4/profile_counts_p	Klf4/profile_max	Klf4/profile_max_p	Nanog/profile_match	Nanog/profile_match_p	Nanog/profile_counts	Nanog/profile_counts_p	...	match/Sox2	imp/Klf4	imp/Nanog	imp/Oct4	imp/Sox2	example_chrom	example_start	example_end	example_strand	example_interval_from_task
0	inf	NaN	0.0	1.0	0.0	1.0	inf	NaN	1.0	5.0	...	0.206333	0.050587	0.063537	0.090030	0.036130	chrX	143482572	143483572	*	Nanog
1	inf	NaN	1.0	1.0	1.0	1.0	inf	NaN	0.0	5.0	...	0.223384	0.032036	0.020173	0.023113	0.029687	chrX	143482572	143483572	*	Nanog
2	inf	NaN	0.0	1.0	0.0	1.0	inf	NaN	1.0	5.0	...	0.206882	0.018007	0.063500	0.021244	0.022853	chrX	143482572	143483572	*	Nanog
3	inf	NaN	0.0	1.0	0.0	1.0	inf	NaN	1.0	5.0	...	0.186484	0.017459	0.059497	0.021635	0.023506	chrX	143482572	143483572	*	Nanog
4	inf	NaN	0.0	1.0	0.0	1.0	inf	NaN	1.0	5.0	...	0.221023	0.020000	0.051363	0.035843	0.030362	chrX	143482572	143483572	*	Nanog

		coef	std err	t	P>\|t\|	[0.025	0.975]
37	m3_p4	0.6035	0.237	2.542	0.011	0.138	1.069
15	m0_p8	0.3219	0.115	2.811	0.005	0.097	0.547
17	m1_p0	-0.1178	0.047	-2.528	0.012	-0.209	-0.026