from basepair.imports import *
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long
# TF ChIP-nexus tasks the model was trained on; used throughout for per-task columns
tasks = ['Oct4', 'Sox2', 'Klf4', 'Nanog']
def _derive_pattern_columns(df):
    """Add derived columns to a modisco pattern table, in place.

    Expects columns: 'pattern', 'n seqlets', 'consensus', 'ic pwm mean'.
    Adds: 'metacluster' (ordered categorical), 'log n seqlets',
    'motif len', 'motif ic'.
    """
    # pattern looks like "m<metacluster>_p<idx>" -> grab the metacluster id
    df['metacluster'] = df.pattern.str.split("_", expand=True)[0].str.replace("m", "").astype(int)
    df['metacluster'] = pd.Categorical(df.metacluster, ordered=True)
    df['log n seqlets'] = np.log10(df['n seqlets'])
    df['motif len'] = df.consensus.str.len()
    # total information content = motif length * per-position IC
    df['motif ic'] = df['motif len'] * df['ic pwm mean']
    return df


def load_df(modisco_run, min_n_seqlets=100):
    """Load the modisco pattern table for `modisco_run` and post-process it.

    Args:
        modisco_run: sub-path identifying the modisco run
            (e.g. 'valid' or 'by_peak_tasks/weighted/Oct4').
        min_n_seqlets: drop patterns supported by fewer seqlets than this.

    Returns:
        pd.DataFrame with derived columns added (see _derive_pattern_columns)
        and weakly supported patterns filtered out.
    """
    df = pd.read_csv(f"http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/{modisco_run}/pattern_table.csv")
    df = _derive_pattern_columns(df)
    # filter out patterns with too little seqlet support
    df = df[df['n seqlets'] >= min_n_seqlets]
    return df
# Per-task feature distributions for the pooled ('valid') modisco run
modisco_run = 'valid'
df = load_df(modisco_run)
dfl = motif_table_long(df, tasks)
# keep only numeric feature columns (drop identifiers / metadata)
non_features = {'idx', 'consensus', 'n seqlets', 'metacluster', 'pattern', 'task', 'pos unimodal'}
features = [column for column in dfl.columns if column not in non_features]
plotnine.options.figure_size = (20, 3)
(ggplot(aes(x='value', color='task'), dfl.melt(['task'], value_vars=features))
 + geom_density()
 + facet_wrap('variable', nrow=1, scales='free')
 + theme_minimal())
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering


def plot_motif_clustermap(modisco_run):
    """Cluster motifs x features for one modisco run and upload the heatmap.

    Loads the pattern table, z-scores the feature matrix, draws a
    seaborn clustermap with weighted-linkage, optimally-ordered dendrograms
    on both axes, and uploads the figure to gdrive under
    'modisco/<run>/heatmap'.
    """
    df = load_df(modisco_run)
    dfx, row_df, col_df = preproc_motif_table(df, tasks)
    # transpose: features become rows, motifs become columns
    x = scale(dfx).T
    g = sns.clustermap(
        x,
        # NOTE: because x is transposed, row colors come from col_df and vice versa
        row_colors=to_colors(col_df),
        row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
        col_linkage=linkage(x.values.T, method='weighted', optimal_ordering=True),
        col_colors=to_colors(row_df),
        figsize=(20, 10), cmap='RdBu_r', center=0,
    )
    gdrive_upload_fig(g, f"modisco/{modisco_run}/heatmap")


# Same plot for the pooled run and each per-task modisco run.
# (Loop variable intentionally named modisco_run: later cells re-use its
# final value, 'by_peak_tasks/weighted/Klf4', just as the unrolled code did.)
for modisco_run in ['valid',
                    'by_peak_tasks/weighted/Oct4',
                    'by_peak_tasks/weighted/Sox2',
                    'by_peak_tasks/weighted/Nanog',
                    'by_peak_tasks/weighted/Klf4']:
    plot_motif_clustermap(modisco_run)
# Load the pattern table for the last modisco_run and reshape to long format.
# NOTE(review): this duplicates the post-processing in load_df but omits the
# 'motif len' / 'motif ic' columns, so the PCA below only sees the raw features.
df = pd.read_csv(f"http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/{modisco_run}/pattern_table.csv")
# "m<metacluster>_p<idx>" -> integer metacluster id, kept as ordered categorical
metacluster_ids = df.pattern.str.split("_", expand=True)[0].str.replace("m", "")
df['metacluster'] = metacluster_ids.astype(int)
df['metacluster'] = pd.Categorical(df.metacluster, ordered=True)
df['log n seqlets'] = np.log10(df['n seqlets'])
# drop weakly supported patterns
df = df.loc[df['n seqlets'] >= 100]
df = df.reset_index().set_index(['idx', 'pattern', 'consensus', 'n seqlets', 'metacluster'])
# stack the per-task feature columns: one row per (pattern, task)
dfl = pd.concat([rename(df, task) for task in tasks])
dfl.head()
dfl = dfl.reset_index()
dfl.head()
# TODO - do PCA
def _seqlet_scatter(data, x, y):
    """Scatter of two pattern features per task, colored by metacluster.

    Point size encodes seqlet support; rows are sorted largest-first so
    small points are drawn on top and stay visible.
    """
    return (ggplot(aes(x=x, y=y, color='metacluster', size='n seqlets'),
                   data.sort_values('n seqlets', ascending=False))
            + geom_point(alpha=0.6)
            + theme_classic()
            + facet_grid(".~task")
            + theme(legend_position="top"))


plotnine.options.figure_size = (10, 3)
_seqlet_scatter(dfl, 'imp profile', 'imp counts')
_seqlet_scatter(dfl, 'imp profile', 'footprint max')
_seqlet_scatter(dfl, 'imp profile', 'footprint counts')
_seqlet_scatter(dfl, 'footprint max', 'footprint counts')
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# z-score the features, then project onto the top 5 principal components
pca = Pipeline([('scale', StandardScaler()),
                ('pca', PCA(5))])
dfx = df.reset_index().set_index(['idx', 'pattern', 'consensus', 'n seqlets', 'metacluster'])
del dfx['index']  # leftover integer-index column introduced by reset_index() earlier
x = dfx.values
xpca = pca.fit_transform(x)
# 1-based names so that 'pc1' is the first (highest-variance) component.
# BUGFIX: labels were previously 0-based (pc0..pc4), so the pc1-vs-pc2 plot
# below silently showed the 2nd and 3rd components instead of the top two.
pca_cols = [f'pc{i}' for i in range(1, xpca.shape[1] + 1)]
dfpca = pd.DataFrame(xpca, columns=pca_cols)
# row order of dfpca matches df (both RangeIndex after reset), so plain
# column assignment aligns correctly
dfpca['metacluster'] = df.reset_index()['metacluster']
dfpca['n seqlets'] = df.reset_index()['n seqlets']
plotnine.options.figure_size = (6, 4)
ggplot(aes(x='pc1', y='pc2', color='metacluster', size='n seqlets'), dfpca.sort_values('n seqlets', ascending=False)) + \
    geom_point(alpha=0.6) + \
    theme_classic() + theme(legend_position="top")
# Inspect the PCA loadings: which original features drive each component?
pca_step = pca.steps[1][1]
pca_step.components_.shape
components = pd.DataFrame(pca_step.components_,
                          columns=dfx.columns,
                          index=pca_cols)
# cluster the features (rows of components.T) only; components keep pc order
sns.clustermap(components.T, col_cluster=False, cmap='RdBu_r', center=0);
# features to merge
merge_features = {
    "footprint": ('footprint max', 'footprint counts'),
    "imp": ('imp profile', 'imp counts')
}
# pairwise scatter of the PCA scores
sns.pairplot(pd.DataFrame(xpca, columns=[f'pca{i}' for i in range(xpca.shape[1])]))
xpca.shape
PCA(5, whiten=True)