from basepair.imports import *
from basepair.modisco.utils import longer_pattern
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale, preproc_df
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering
from scipy.spatial.distance import pdist, squareform
tasks = ['Oct4', 'Sox2', 'Klf4', 'Nanog']
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")
def load_df(modisco_run, min_n_seqlets=100):
df = pd.read_csv(f"http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/{modisco_run}/pattern_table.csv")
return preproc_df(df, min_n_seqlets)
modisco_run = 'all/profile'
df = load_df(modisco_run)
# load annotated tables
pattern_table_nte_contrib = preproc_df(pd.read_csv(output_dir / 'motif_clustering/nte_contrib.csv'), 100)
pattern_table_nte_seq = preproc_df(pd.read_csv(output_dir / 'motif_clustering/nte_seq.csv'), 100)
pattern_table_te_contrib = preproc_df(pd.read_csv(output_dir / 'motif_clustering/te_contrib.csv'), None)
dfl = motif_table_long(df, tasks)
features = [c for c in dfl.columns if c not in ['idx', 'consensus', 'n seqlets', 'metacluster', 'pattern', 'task', 'pos unimodal']]
plotnine.options.figure_size = (20, 3)
ggplot(aes(x='value', color='task'), dfl.melt(['task'], value_vars=features)) + geom_density() + facet_wrap('variable', nrow=1, scales='free') + theme_minimal()
modisco_run = 'all/profile'
from basepair.modisco.motif_clustering import *
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
pattern_table_nte_seq = pattern_table_nte_seq[pattern_table_nte_seq.pattern != 'm4_p4']
pattern_table_nte_seq = pattern_table_nte_seq.sort_values('cluster_order')
dfx, row_df, col_df = preproc_motif_table(pattern_table_nte_seq, tasks, unclust_feat=['log n seqlets', 'cluster_order', 'cluster'],
drop_columns=[t+s for t in tasks for s in ['/d_p', '/d']])
x = scale(dfx).T
vmax=max(np.abs(x.values.min()), x.values.max()) # simmatric colors
vmin=-vmax
for cluster in row_df.cluster.unique():
subset_vec = row_df.cluster == cluster
major_pattern_name = row_df[subset_vec].sort_values('log n seqlets').iloc[-1].name
p = mr.get_pattern(longer_pattern(major_pattern_name))
# plot the major motif
p.plot(['seq', 'contrib/mean'], rotate_y=0);
row_df_subset = row_df[subset_vec]
row_df_subset['major_pattern'] = pd.Categorical(row_df_subset.index == major_pattern_name)
del row_df_subset['cluster']
del row_df_subset['cluster_order']
g = sns.clustermap(x.T[subset_vec].T,
col_colors=to_colors(row_df_subset),
row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
col_linkage=linkage(x.values.T[subset_vec], method='weighted', optimal_ordering=True),
#method="weighted",
vmax=vmax,
vmin=vmin,
row_colors=to_colors(col_df),
figsize=(4 + subset_vec.sum()*0.2, 10), cmap='RdBu_r', center=0);
g.fig.suptitle(f'Cluster {cluster}');
def same_cluster_adj(clusters):
"""Get the adjacency matrix:
1. if two nodes are in the same cluster
0. nodes are not in the same cluster
"""
n = len(clusters)
out = np.zeros((n, n))
for i in range(n):
for j in range(n):
if clusters[i] == clusters[j]:
out[i,j] = 1
return out
def pdist_within_clusters(xdist, clusters, across_cluster_max_factor=1):
"""perform clustering only within clusters
Args:
matrix of features (N,D)
clusters: vector of cluster labels (N,)
"""
adj = same_cluster_adj(clusters)
return squareform(squareform(xdist) + (1-adj) * across_cluster_max_factor * xdist.max())
g = sns.clustermap(x.T,
row_colors=to_colors(row_df),
col_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
row_linkage=linkage(pdist_within_clusters(pdist(x.values.T), row_df.cluster),
method='weighted', optimal_ordering=False),
#method="weighted",
col_colors=to_colors(col_df),
figsize=(10, 20), cmap='RdBu_r', center=0);
dfx, row_df, col_df = preproc_motif_table(pattern_table_te_contrib, tasks, unclust_feat=['log n seqlets', 'cluster_order', 'cluster'],
drop_columns=[t+s for t in tasks for s in ['/d_p', '/d']])
x = scale(dfx).T
g = sns.clustermap(x, row_colors=to_colors(col_df),
row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
col_linkage=linkage(x.values.T, method='weighted', optimal_ordering=True),
#method="weighted",
col_colors=to_colors(row_df),
figsize=(20, 14), cmap='RdBu_r', center=0);