import seaborn as sns
from basepair.config import get_data_dir
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from basepair.modisco import ModiscoResult, append_pattern_loc
from basepair.cli.schemas import DataSpec
from basepair.utils import read_pkl
ddir = Path(get_data_dir())
model_dir = ddir / "processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/"
modisco_dir = model_dir / "modisco/valid"
ds = DataSpec.load(model_dir / "dataspec.yaml")
from kipoi.readers import HDF5Reader
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()
tasks = list(ds.task_specs)
df = pd.DataFrame({task: d.f[f"/targets/profile/{task}"][:].sum(axis=-1).sum(axis=-1) for task in tasks})
df.head()
dfl = np.log(1+df)
from sklearn.preprocessing import StandardScaler
x = StandardScaler().fit_transform(dfl)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents, columns = ['pc1', 'pc2'])
dfl.head()
pca.components_
principalDf.plot("pc1", "pc2", kind='scatter', alpha=0.1);
sns.jointplot(x, y, kind="hex", color="#4CB391")
principalDf.plot("pc1", "pc2", gridsize=100, kind='hexbin');
sns.jointplot(principalDf.pc1, principalDf.pc2, kind="hex", color="#4CB391")
import seaborn as sns
dfls = dfl.sample(frac=0.1)
g = sns.PairGrid(dfls)
g.map_diag(sns.kdeplot)
g.map_offdiag(sns.kdeplot, n_levels=6);
sns.pairplot(dfl,
diag_kind = 'kde',
markers='o',
plot_kws = {'alpha': 0.01, 'edgecolor': None})
def plot_emb(emb, legend=False):
"""Plot embedding. Highlights the held-out points on the plot
"""
plt.scatter(emb[:,0], emb[:,1], alpha=0.3, s=60, label="Training")
plot_emb(tsnec.embedding_)
import umap
um = umap.UMAP(n_neighbors=5,
min_dist=0.3)
plot_emb(mds.embedding_)
from sklearn. import Pip
d.ls()
ls {model_dir}
from keras.models import load_model, Model
from basepair.config import create_tf_session
create_tf_session(0)
m = load_model(model_dir / 'model.h5')
m.summary()
seq = d.f['/inputs'][:]
import keras.layers as kl
m_avg = Model(m.input, m.get_layer("global_average_pooling1d_1").output)
hid_act = m.get_layer("add_9").output
gmax_pool = kl.GlobalMaxPooling1D()(hid_act)
max_avg_vec = kl.concatenate([gmax_pool, m.get_layer("global_average_pooling1d_1").output])
m_avg_max = Model(m.input, max_avg_vec)
act = m_avg_max.predict(seq, verbose=1)
# append also the total counts
act_w_counts = np.concatenate([act, dfl.values], axis=1)
act_norm = StandardScaler().fit_transform(act_w_counts)
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
principalComponents = pca.fit_transform(act_norm)
principalDf = pd.DataFrame(data = principalComponents, columns = ['pc1', 'pc2', 'pc3', 'pc4'])
idx = pd.Series(np.arange(len(act))).sample(frac=0.1, replace=True)
act_norm_small = act_norm[idx]
len(act_norm_small)
act_norm
def plot_emb(emb, legend=False, alpha=0.3, **kwargs):
"""Plot embedding. Highlights the held-out points on the plot
"""
plt.scatter(emb[:,0], emb[:,1], alpha=alpha, s=60, label="Training", **kwargs)
import umap
um = umap.UMAP(n_neighbors=5,
min_dist=0.3)
um.fit(act_norm_small)
sns.jointplot(um.embedding_[:,0], um.embedding_[:,1], kind="hex", color="#4CB391")
from sklearn.cluster import AgglomerativeClustering
cl = AgglomerativeClustering(10)
cl.fit(act_norm)
pd.Series(cl.labels_).value_counts().plot(kind='bar')
# Save the clusters to different masked files
outdir = model_dir / "clustering/agglomerative.model,counts/10/"
outdir.mkdir(parents=True, exist_ok=True)
from basepair.utils import write_pkl
for i in range(10):
mask = cl.labels_ == i
np.save(outdir / f"{i}.npy", mask, )
write_pkl(cl, outdir / "clustering.pkl")
ls {outdir}
mask.sum()
from sklearn import manifold
x = manifold.SpectralEmbedding(n_components=2).fit_transform(act_norm_small)
def plot_clustering(X_red, X, labels, title=None):
x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0)
X_red = (X_red - x_min) / (x_max - x_min)
plt.figure(figsize=(6, 4))
#for i in range(X_red.shape[0]):
plt.scatter(X_red[:, 0], X_red[:, 1], color=plt.cm.nipy_spectral(labels / 10.),
fontdict={'weight': 'bold', 'size': 9})
cl.labels_
plt.cm.nipy_spectral(cl.labels_ / 10)
plt.scatter(x[:,0], x[:,1], color=plt.cm.nipy_spectral(cl.labels_ / 10), alpha=0.01)
cm = plt.get_cmap("tab10")
cm(np.array([10]))
plt.scatter(um.embedding_[:,0], um.embedding_[:,1], color=cm(cl.labels_), alpha=0.05)
pd.Series(cl.labels_).value_counts().plot(kind='bar')
plt.scatter(x[:,0], x[:,1], color=cm(cl.labels_), alpha=0.05)
plot_emb(um.embedding_, alpha=0.01)
principalDf.plot("pc1", "pc2", kind='scatter', alpha=0.1);
sns.jointplot(principalDf.pc1, principalDf.pc2, kind="hex", color="#4CB391")