Goal¶

cluster motifs w.r.t their properties

Tasks¶

[x] get the property table for motifs
[x] normalize by quantiles
[x] cluster
[x] cluster also the pattern table and export to html: pattern_table.clustered.html

from basepair.imports import *
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long

tasks = ['Oct4', 'Sox2', 'Klf4', 'Nanog']

Load the data¶

def load_df(modisco_run, min_n_seqlets=100):
    df = pd.read_csv(f"http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/{modisco_run}/pattern_table.csv")

    df['metacluster'] = df.pattern.str.split("_", expand=True)[0].str.replace("m", "").astype(int)
    df['metacluster'] = pd.Categorical(df.metacluster, ordered=True)
    df['log n seqlets'] = np.log10(df['n seqlets'])

    df['motif len'] = df.consensus.str.len()
    df['motif ic'] = df['motif len'] * df['ic pwm mean']  # add also motif ic

    # filter 
    df = df[df['n seqlets'] >= min_n_seqlets]
    return df

modisco_run = 'valid'
df = load_df(modisco_run)

Check right scaling¶

dfl = motif_table_long(df, tasks)
features = [c for c in dfl.columns if c not in ['idx', 'consensus', 'n seqlets', 'metacluster', 'pattern', 'task', 'pos unimodal']]

plotnine.options.figure_size = (20, 3)
ggplot(aes(x='value', color='task'), dfl.melt(['task'], value_vars=features)) + geom_density() + facet_wrap('variable', nrow=1, scales='free') + theme_minimal()

<ggplot: (8774858503046)>

Heatmap¶

TODO¶

run also for individualized factors
add 10bp periodicity in the importance scores
dump motif plots to pdf files (for assembly) (use your own code, strip off any frames)
add motif length (total information content)
export to gdrive
annotate with motifs
lookup the ordering of the heatmap (call R package for the start)

Valid¶

from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering

modisco_run = 'valid'

df = load_df(modisco_run)
dfx, row_df, col_df = preproc_motif_table(df, tasks)
x = scale(dfx).T

g = sns.clustermap(x,  row_colors=to_colors(col_df), 
                   row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
                   col_linkage=linkage(x.values.T, method='weighted', optimal_ordering=True),
                   #method="weighted", 
                   col_colors=to_colors(row_df), 
                   figsize=(20, 10), cmap='RdBu_r', center=0);

gdrive_upload_fig(g, f"modisco/{modisco_run}/heatmap")

Oct4 peaks¶

modisco_run = 'by_peak_tasks/weighted/Oct4'

[autoreload of basepair.gdrive failed: Traceback (most recent call last):
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 384, in superreload
    update_generic(old_obj, new_obj)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 323, in update_generic
    update(a, b)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 266, in update_function
    setattr(old, name, getattr(new, name))
ValueError: list_folder() requires a code object with 0 free vars, not 2
]

df = load_df(modisco_run)
dfx, row_df, col_df = preproc_motif_table(df, tasks)
x = scale(dfx).T

g = sns.clustermap(x,  row_colors=to_colors(col_df), 
                   row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
                   col_linkage=linkage(x.values.T, method='weighted', optimal_ordering=True),
                   #method="weighted", 
                   col_colors=to_colors(row_df), 
                   figsize=(20, 10), cmap='RdBu_r', center=0);

gdrive_upload_fig(g, f"modisco/{modisco_run}/heatmap")

Sox2 peaks¶

modisco_run = 'by_peak_tasks/weighted/Sox2'

df = load_df(modisco_run)
dfx, row_df, col_df = preproc_motif_table(df, tasks)
x = scale(dfx).T

g = sns.clustermap(x,  row_colors=to_colors(col_df), 
                   row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
                   col_linkage=linkage(x.values.T, method='weighted', optimal_ordering=True),
                   #method="weighted", 
                   col_colors=to_colors(row_df), 
                   figsize=(20, 10), cmap='RdBu_r', center=0);

gdrive_upload_fig(g, f"modisco/{modisco_run}/heatmap")

Nanog peaks¶

modisco_run = 'by_peak_tasks/weighted/Nanog'

df = load_df(modisco_run)
dfx, row_df, col_df = preproc_motif_table(df, tasks)
x = scale(dfx).T

g = sns.clustermap(x,  row_colors=to_colors(col_df), 
                   row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
                   col_linkage=linkage(x.values.T, method='weighted', optimal_ordering=True),
                   #method="weighted", 
                   col_colors=to_colors(row_df), 
                   figsize=(20, 10), cmap='RdBu_r', center=0);

gdrive_upload_fig(g, f"modisco/{modisco_run}/heatmap")

Klf4 peaks¶

modisco_run = 'by_peak_tasks/weighted/Klf4'

df = load_df(modisco_run)
dfx, row_df, col_df = preproc_motif_table(df, tasks)
x = scale(dfx).T

g = sns.clustermap(x,  row_colors=to_colors(col_df), 
                   row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
                   col_linkage=linkage(x.values.T, method='weighted', optimal_ordering=True),
                   #method="weighted", 
                   col_colors=to_colors(row_df), 
                   figsize=(20, 10), cmap='RdBu_r', center=0);

gdrive_upload_fig(g, f"modisco/{modisco_run}/heatmap")

Notes¶

The strange motif p22_p0 is sox link

Scatterplot¶

# load the table
df = pd.read_csv(f"http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/{modisco_run}/pattern_table.csv")

df['metacluster'] = df.pattern.str.split("_", expand=True)[0].str.replace("m", "").astype(int)
df['metacluster'] = pd.Categorical(df.metacluster, ordered=True)
df['log n seqlets'] = np.log10(df['n seqlets'])

# filter 
df = df[df['n seqlets'] >= 100]

df = df.reset_index().set_index(['idx', 'pattern', 'consensus', 'n seqlets', 'metacluster'])
dfl = pd.concat([rename(df, task) for task in tasks])

dfl.head()

dfl = dfl.reset_index()

dfl.head()

# TODO - do PCA

plotnine.options.figure_size = (10, 3)
ggplot(aes(x='imp profile', y='imp counts', color='metacluster', size='n seqlets'), dfl.sort_values('n seqlets', ascending=False)) + \
  geom_point(alpha=0.6) + \
  theme_classic() + facet_grid(".~task")+ theme(legend_position="top")

<ggplot: (-9223363261952851645)>

plotnine.options.figure_size = (10, 3)
ggplot(aes(x='imp profile', y='footprint max', color='metacluster', size='n seqlets'), dfl.sort_values('n seqlets', ascending=False)) + \
  geom_point(alpha=0.6) + \
  theme_classic() + facet_grid(".~task")+ theme(legend_position="top")

<ggplot: (-9223363261996917223)>

plotnine.options.figure_size = (10, 3)
ggplot(aes(x='imp profile', y='footprint counts', color='metacluster', size='n seqlets'), dfl.sort_values('n seqlets', ascending=False)) + \
  geom_point(alpha=0.6) + \
  theme_classic() + facet_grid(".~task")+ theme(legend_position="top")

<ggplot: (8774857905998)>

plotnine.options.figure_size = (10, 3)
ggplot(aes(x='footprint max', y='footprint counts', color='metacluster', size='n seqlets'), dfl.sort_values('n seqlets', ascending=False)) + \
  geom_point(alpha=0.6) + \
  theme_classic() + facet_grid(".~task")+ theme(legend_position="top")

<ggplot: (-9223363261996058936)>

PCA¶

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pca = Pipeline([('scale', StandardScaler()),
                 ('pca', PCA(5))])

dfx = df.reset_index().set_index(['idx', 'pattern', 'consensus', 'n seqlets', 'metacluster'])
del dfx['index']
x = dfx.values

xpca = pca.fit_transform(x)

pca_cols = [f'pc{i}' for i in range(xpca.shape[1])]

dfpca = pd.DataFrame(xpca, columns=pca_cols)

dfpca['metacluster'] = df.reset_index()['metacluster']
dfpca['n seqlets'] = df.reset_index()['n seqlets']

plotnine.options.figure_size = (6, 4)
ggplot(aes(x='pc1', y='pc2', color='metacluster', size='n seqlets'), dfpca.sort_values('n seqlets', ascending=False)) + \
  geom_point(alpha=0.6) + \
  theme_classic() + theme(legend_position="top")

<ggplot: (-9223363261988785264)>

# what are the features?

p = pca.steps[1][1]

p.components_.shape

(5, 41)

components = pd.DataFrame(p.components_, columns=dfx.columns, index=pca_cols)

#fig, ax = plt.subplots(figsize=(10, 5))
#sns.heatmap(components, ax=ax);
sns.clustermap(components.T, col_cluster=False, cmap='RdBu_r', center=0);

Other¶

# features to merge
merge_features = {
    "footprint": ('footprint max', 'footprint counts'),
    "imp": ('imp profile', 'imp counts')
}

sns.pairplot(pd.DataFrame(xpca, columns=[f'pca{i}' for i in range(xpca.shape[1])]))

<seaborn.axisgrid.PairGrid at 0x7fb12dd9ec88>

xpca.shape

(70, 5)

PCA(5, whiten=True)

					imp profile	imp counts	footprint entropydiff	footprint max	footprint strandcor	footprint counts	region counts	pos meandiff	pos std	pos unimodal	task
idx	pattern	consensus	n seqlets	metacluster
0	m0_p0	ATTTGCATAACAAA	3675	0	0.042555	0.059322	0.113231	3.613860	0.005538	381.097595	775.774414	1.997996	89.129641	True	Oct4
1	m0_p1	CCATTGTT	1663	0	0.013889	0.045596	0.018575	0.822610	0.005021	210.675293	515.116089	2.062599	83.017328	True	Oct4
2	m0_p2	GTTGATGGC	773	0	0.009207	0.025203	0.062085	1.184347	0.005008	246.592499	537.979248	2.052729	55.446947	False	Oct4
3	m0_p3	AGGCTGACCTTGA	428	0	0.008658	0.016956	0.047439	1.192671	0.004977	214.732849	544.341125	1.961061	97.076959	True	Oct4
4	m0_p4	TGGCAGCAAGTTCAAGGTCAGCCTCAG	290	0	0.003723	0.008185	0.070402	0.975862	0.005046	189.955170	477.631012	2.075499	78.280716	False	Oct4

	idx	pattern	consensus	n seqlets	imp profile	imp counts	footprint entropydiff	footprint max	footprint strandcor	footprint counts	region counts	pos meandiff	pos std	pos unimodal	task
0	0	m0_p0	ATTTGCATAACAAA	3675	0.042555	0.059322	0.113231	3.613860	0.005538	381.097595	775.774414	1.997996	89.129641	True	Oct4
1	1	m0_p1	CCATTGTT	1663	0.013889	0.045596	0.018575	0.822610	0.005021	210.675293	515.116089	2.062599	83.017328	True	Oct4
2	2	m0_p2	GTTGATGGC	773	0.009207	0.025203	0.062085	1.184347	0.005008	246.592499	537.979248	2.052729	55.446947	False	Oct4
3	3	m0_p3	AGGCTGACCTTGA	428	0.008658	0.016956	0.047439	1.192671	0.004977	214.732849	544.341125	1.961061	97.076959	True	Oct4
4	4	m0_p4	TGGCAGCAAGTTCAAGGTCAGCCTCAG	290	0.003723	0.008185	0.070402	0.975862	0.005046	189.955170	477.631012	2.075499	78.280716	False	Oct4