# Project-wide convenience star-import (presumably provides np, pd, plt, Path,
# HDF5Reader, ModiscoResult, DataSpec, read_json, get_data_dir -- TODO confirm
# against basepair.imports).
from basepair.imports import *
# Root directory of the data tree.
ddir = get_data_dir()
from basepair.plot.profiles import extract_signal
from basepair.math import softmax
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
from basepair.plot.profiles import plot_stranded_profile, multiple_plot_stranded_profile
# Model output directory for the oct-sox-nanog-klf ChIP-nexus experiment.
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
# TF-MoDISco results directory (per-task, peak-weighted).
modisco_pdir = model_dir / "modisco/by_peak_tasks/weighted/"
# Inspect the model directory contents. Replaces the leaked IPython magic
# `ls {model_dir}`, which was not valid Python and referenced model_dir
# before it was defined.
for _entry in sorted(model_dir.iterdir()):
    print(_entry.name)
# Load the data
# Gradient/contribution scores and targets stored as HDF5.
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()
# Task (TF) and MoDISco pattern analysed below.
task = "Oct4"
pattern = "metacluster_0/pattern_0"
# Open the TF-MoDISco result for this task.
mr = ModiscoResult(modisco_pdir / f"{task}/modisco.h5")
mr.open()
# All seqlets belonging to the chosen pattern.
seqlets = mr._get_seqlets(pattern)
#mr.close()
# Example mask/index used for the modisco run (a .npy file referenced
# from kwargs.json).
include_samples = np.load(read_json(modisco_pdir / f"{task}/kwargs.json")["filter_npy"])
# Observed profile counts for the task, restricted to the included examples.
# Presumably shaped (examples, seq_len, 2 strands) -- TODO confirm.
profile_obs = d.f[f'/targets/profile/{task}'][:][include_samples]
ds = DataSpec.load(model_dir / "dataspec.yaml")
# Observed profile under each seqlet.
seqlet_profile_obs = extract_signal(profile_obs, seqlets)
# Total read counts per seqlet (sum over positions and strands).
total_counts = seqlet_profile_obs.sum(axis=-1).sum(axis=-1)
# Seqlet order from highest to lowest total counts.
sort_idx = np.argsort(-total_counts)
# Average observed profile across all seqlets (per position, both strands).
# Removed the stray `np.isnan(p[:,0,0])` line that followed: it referenced
# `p` before its definition below, which raises NameError in a plain script.
plot_stranded_profile(seqlet_profile_obs.mean(axis=0))
# probabilities
# Normalize each sorted seqlet profile over positions so each strand becomes
# a probability distribution; all-zero profiles yield NaNs (0/0) here.
p = seqlet_profile_obs[sort_idx] / seqlet_profile_obs[sort_idx].sum(axis=1, keepdims=True)
# drop NA's
# Mask of seqlets with no NaN at any position/strand (in sorted order).
notnan = ~np.any(np.any(np.isnan(p), axis=-1), axis=-1)
# Per-strand total counts of the retained seqlets, sorted order.
total_counts = seqlet_profile_obs[sort_idx].sum(axis=1)[notnan]
p = p[notnan]
# Example (sequence) id for each retained seqlet. `notnan` was computed on the
# count-sorted arrays, so sort_idx must be applied before masking; the original
# indexed the *unsorted* seqnames with the sorted-order mask, misaligning
# seqlet_idx with p and total_counts.
seqlet_idx = np.array([s.seqname for s in seqlets])[sort_idx][notnan]
# dropped
# Report how many all-zero profiles were removed by the NaN filter.
print("Dropped", seqlet_profile_obs.shape[0] - p.shape[0], "profiles with only 0's")
# Heatmap of the 1000 highest-count seqlet probability profiles.
heatmap_stranded_profile(p[:1000], figsize=(20,10))
from scipy.stats import entropy
from scipy.special import rel_entr, kl_div
# S(p_obs)
# Per-seqlet Shannon entropy: scipy's entropy() sums over axis 0, hence the
# swapaxes so positions land on axis 0; the final sum pools the two strands.
entropies = entropy(p.swapaxes(0,1)).sum(axis=-1)
# KL(p_obs, p_average)
# Element-wise KL terms vs. the average profile, averaged over the two strands
# (last axis) and summed over positions.
kl = kl_div(p, p.mean(axis=0, keepdims=True)).mean(axis=-1).sum(axis=-1)
# NOTE(review): rel_entr is the KL integrand x*log(x/y), not a cross-entropy;
# the variable name may be misleading.
crossentropy = rel_entr(p, p.mean(axis=0, keepdims=True)).mean(axis=-1).sum(axis=-1)
# Three diagnostic panels relating the per-seqlet statistics.
fig = plt.figure(figsize=(13,5))
plt.subplot(131)
# Entropy vs. seqlet index (sorted by total counts).
plt.plot(entropies);
plt.ylabel("entropy")
plt.xlabel("idx");
plt.subplot(132)
plt.plot(kl, entropies, "."); # kl divergence and the entropy are almost the same
plt.xlabel("kl")
plt.ylabel("Entropy");
plt.subplot(133)
plt.plot(crossentropy, kl, "."); # rel_entr-based quantity vs. kl_div-based quantity
plt.xlabel("crossentropy")
plt.ylabel("Entropy");
# Relationship between profile entropy and (log) total counts.
fig=plt.figure(figsize=(10,4))
plt.subplot(121)
plt.plot(entropies, np.log(1+total_counts.mean(axis=-1)), ".", alpha=0.3);
plt.xlabel("Entropy")
plt.ylabel("log(1+ counts)");
plt.subplot(122)
# Same, but with squared entropy on the x-axis.
plt.plot(entropies**2, np.log(1+total_counts.mean(axis=-1)), ".", alpha=0.3);
plt.xlabel(r"Entropy^2")
plt.ylabel("log(1+ counts)");
# One row per seqlet instance across all patterns.
df = mr.seqlet_df_instances()
# Occurrence count per example: rows=seqname, cols=pattern.
dfp = df.pivot_table("center", "seqname", "pattern", aggfunc=len, fill_value=0)
# Notebook-style inspection (bare expressions; no effect in a script).
seqlet_idx
seqlet_idx[:3]
len(seqlet_idx)
# Pattern-count features for the retained seqlets' examples.
count_features = dfp.loc[seqlet_idx]
count_features.head()
count_features.shape
# Distribution of how often the core pattern occurs per sequence.
count_features[pattern].value_counts().plot.bar();
plt.xlabel("Number of occurrences in the sequence")  # fixed typo: "occurences"
plt.ylabel("Frequency");
# Total occurrences of every *other* pattern across the retained examples.
count_features.iloc[:,count_features.columns!=pattern].sum().plot.bar(figsize=(20,5));
plt.ylabel("Total count");  # was "Mean", but the aggregation above is a .sum()
# Sanity checks on alignment between entropies and count_features
# (bare expressions; only useful interactively).
entropies.shape
count_features.shape
count_features.index
def rename_pattern(p):
    """Shorten a modisco pattern id, e.g. 'metacluster_0/pattern_0' -> 'm0_p0'.

    Names that do not contain 'metacluster' are returned unchanged.
    """
    if "metacluster" not in p:
        return p
    short = p
    for old, new in (("metacluster_", "m"), ("/", "_"), ("pattern_", "p")):
        short = short.replace(old, new)
    return short
# Use the short pattern names as column labels.
count_features.columns = [rename_pattern(p) for p in count_features.columns]
# Long format: one row per (example, pattern) with its occurrence count,
# carrying along per-example entropy and log10 counts.
dfm = count_features.assign(entropy=entropies, counts=np.log10(1 + total_counts.mean(axis=-1)),
example_idx=count_features.index).melt(id_vars=['entropy', 'counts', 'example_idx'], var_name="pattern")
# Keep only patterns with more than 50 total occurrences across examples.
dfmf = dfm.groupby("pattern").filter(lambda x: x.value.sum() > 50)
# Use bracket (column) assignment: attribute assignment on the frame returned
# by groupby().filter() can trigger SettingWithCopyWarning and is not
# guaranteed to write through in pandas.
dfmf["value"] = pd.Categorical(dfmf["value"])
dfm["value"] = pd.Categorical(dfm["value"])
import plotnine
from plotnine import *
# TODO - add the significance test
# TODO - filter in-frequent ones
plotnine.options.figure_size = (20,20)
# Entropy distribution per occurrence-count level, one facet per pattern.
ggplot(aes(x='value', y='entropy'), dfmf) + geom_boxplot() + facet_wrap("~pattern", ncol = 10, scales='free_x') + theme_bw()
import statsmodels.api as sm
import statsmodels.formula.api as smf
def ols_formula(df, dependent_var, *excluded_cols):
    """Build an R-style (patsy) formula for statsmodels from a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame whose columns are the candidate model terms.
    dependent_var : str, the response column (must be a column of `df`).
    *excluded_cols : str, additional columns to leave out of the RHS.

    Returns
    -------
    str, e.g. "counts ~ m0_p0 + m0_p1".

    Raises
    ------
    ValueError if `dependent_var` is not a column of `df`.
    """
    if dependent_var not in df.columns:
        raise ValueError(f"{dependent_var!r} is not a column of the dataframe")
    # Set-based filter: unlike list.remove(), this tolerates excluded columns
    # that are absent and removes every duplicate occurrence of a column.
    drop = set(excluded_cols) | {dependent_var}
    rhs = [col for col in df.columns if col not in drop]
    return dependent_var + ' ~ ' + ' + '.join(rhs)
# Design matrix: pattern counts plus log10 total read counts as the response.
dm = count_features.assign(counts=np.log10(1+total_counts.sum(axis=-1)))
# Inspect the generated formula (bare expression).
ols_formula(dm, "counts")
# OLS: predict log counts from pattern occurrence counts.
results = smf.ols(ols_formula(dm, "counts"), data=dm).fit()
def tidy_ols(ols_results):
    """Return the coefficient table of a fitted statsmodels OLS result as a DataFrame.

    Fix: the original ignored its (misspelled) `ols_reults` argument and read
    the global `results` instead, so it only worked by accident.

    Parameters
    ----------
    ols_results : fitted statsmodels regression results (has .summary()).

    Returns
    -------
    pandas.DataFrame with one row per coefficient; columns come from the
    summary table header (all values are strings).
    """
    smry = ols_results.summary()
    coef = smry.tables[1]  # table 1 of the summary holds the coefficient estimates
    return pd.DataFrame(coef.data[1:], columns=coef.data[0])
# Coefficient table of the fit (string-valued columns from the summary table).
df_fit = tidy_ols(results)
# significant coefficients
# Keep coefficients with p < 0.05; values must be cast from str to float.
df_fit_signif = df_fit[df_fit['P>|t|'].astype(float) < 0.05]
df_fit_signif = df_fit_signif[df_fit_signif[""] != "Intercept"] # don't show the intercept
# Sort by absolute effect size, largest first (bare expression for display).
df_fit_signif.iloc[df_fit_signif['coef'].astype(float).abs().argsort()].iloc[::-1] # sort by the effect size
# Interesting: motifs of one kind are present on both sides.
# (Was a stray markdown/prose cell leaked into the code export -- not valid Python.)
# Short name of the core pattern (e.g. 'm0_p0'), matching the renamed columns.
pattern_short = rename_pattern(pattern)
# Examples where the core pattern occurs exactly once.
single_motif_idx = count_features.index[count_features[pattern_short] == 1]
# Center position of the single core-pattern instance in each such example.
df_center = df[df.seqname.isin(single_motif_idx)].query(f"pattern == '{pattern}'")[['seqname', 'center']]
# Per-example log10 counts keyed by seqname.
df_counts = pd.DataFrame({"seqname": seqlet_idx,
"log_counts": np.log10(1+total_counts.mean(axis=-1))})
# Pair every *other* pattern instance with the core instance in the same example.
dfd = pd.merge(df[df.pattern != pattern], df_center, on='seqname', suffixes=("_other", "_core"))
# Position of the other motif relative to the core motif.
dfd['rel'] = dfd.center_other - dfd.center_core
dfd = dfd.merge(df_counts, on="seqname")
plotnine.options.figure_size = (20,20)
# Relative motif position vs. log counts, faceted by pattern (wide window).
ggplot(aes(x="rel", y='log_counts'), dfd) + geom_point(alpha=0.5) + facet_wrap("~pattern", ncol = 5) + theme_bw() + xlim([-400, 400])
plotnine.options.figure_size = (20,20)
# Same, zoomed to +/-70 bp around the core motif.
ggplot(aes(x="rel", y='log_counts'), dfd) + geom_point(alpha=0.5) + facet_wrap("~pattern", ncol = 5) + theme_bw() + xlim([-70, 70])
# Inspect the raw instance table (bare expression).
df
# Minimum absolute relative distance: zero would mean another motif instance
# sits at the core motif's exact position.
dfd.rel.abs().min() # -> there are some at the same position ?!?!?!?