Goal

  • cluster motifs w.r.t their properties

Tasks

  • [x] get the property table for motifs
  • [x] normalize by quantiles
  • [x] cluster
  • [x] cluster also the pattern table and export to html: pattern_table.clustered.html
In [1513]:
from basepair.imports import *
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long
In [1418]:
tasks = ['Oct4', 'Sox2', 'Klf4', 'Nanog']

Load the data

In [1465]:
def load_df(modisco_run, min_n_seqlets=100):
    df = pd.read_csv(f"http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/{modisco_run}/pattern_table.csv")

    df['metacluster'] = df.pattern.str.split("_", expand=True)[0].str.replace("m", "").astype(int)
    df['metacluster'] = pd.Categorical(df.metacluster, ordered=True)
    df['log n seqlets'] = np.log10(df['n seqlets'])

    df['motif len'] = df.consensus.str.len()
    df['motif ic'] = df['motif len'] * df['ic pwm mean']  # add also motif ic

    # filter 
    df = df[df['n seqlets'] >= min_n_seqlets]
    return df
In [1466]:
modisco_run = 'valid'
df = load_df(modisco_run)

Check right scaling

In [1479]:
dfl = motif_table_long(df, tasks)
features = [c for c in dfl.columns if c not in ['idx', 'consensus', 'n seqlets', 'metacluster', 'pattern', 'task', 'pos unimodal']]
In [1468]:
plotnine.options.figure_size = (20, 3)
ggplot(aes(x='value', color='task'), dfl.melt(['task'], value_vars=features)) + geom_density() + facet_wrap('variable', nrow=1, scales='free') + theme_minimal()
Out[1468]:
<ggplot: (8774858503046)>

Heatmap

TODO

  • run also for individualized factors
  • add 10bp periodicity in the importance scores
  • dump motif plots to pdf files (for assembly) (use your own code, strip off any frames)
  • add motif length (total information content)
  • export to gdrive
  • annotate with motifs
  • lookup the ordering of the heatmap (call R package for the start)

Valid

In [1485]:
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering
In [1488]:
modisco_run = 'valid'
In [1486]:
df = load_df(modisco_run)
dfx, row_df, col_df = preproc_motif_table(df, tasks)
x = scale(dfx).T
In [1487]:
g = sns.clustermap(x,  row_colors=to_colors(col_df), 
                   row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
                   col_linkage=linkage(x.values.T, method='weighted', optimal_ordering=True),
                   #method="weighted", 
                   col_colors=to_colors(row_df), 
                   figsize=(20, 10), cmap='RdBu_r', center=0);
In [1491]:
gdrive_upload_fig(g, f"modisco/{modisco_run}/heatmap")

Oct4 peaks

In [1509]:
modisco_run = 'by_peak_tasks/weighted/Oct4'
[autoreload of basepair.gdrive failed: Traceback (most recent call last):
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 384, in superreload
    update_generic(old_obj, new_obj)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 323, in update_generic
    update(a, b)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 266, in update_function
    setattr(old, name, getattr(new, name))
ValueError: list_folder() requires a code object with 0 free vars, not 2
]
In [1510]:
df = load_df(modisco_run)
dfx, row_df, col_df = preproc_motif_table(df, tasks)
x = scale(dfx).T
In [1511]:
g = sns.clustermap(x,  row_colors=to_colors(col_df), 
                   row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
                   col_linkage=linkage(x.values.T, method='weighted', optimal_ordering=True),
                   #method="weighted", 
                   col_colors=to_colors(row_df), 
                   figsize=(20, 10), cmap='RdBu_r', center=0);
In [1512]:
gdrive_upload_fig(g, f"modisco/{modisco_run}/heatmap")

Sox2 peaks

In [1492]:
modisco_run = 'by_peak_tasks/weighted/Sox2'
In [1493]:
df = load_df(modisco_run)
dfx, row_df, col_df = preproc_motif_table(df, tasks)
x = scale(dfx).T
In [1494]:
g = sns.clustermap(x,  row_colors=to_colors(col_df), 
                   row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
                   col_linkage=linkage(x.values.T, method='weighted', optimal_ordering=True),
                   #method="weighted", 
                   col_colors=to_colors(row_df), 
                   figsize=(20, 10), cmap='RdBu_r', center=0);
In [1495]:
gdrive_upload_fig(g, f"modisco/{modisco_run}/heatmap")

Nanog peaks

In [1496]:
modisco_run = 'by_peak_tasks/weighted/Nanog'
In [1497]:
df = load_df(modisco_run)
dfx, row_df, col_df = preproc_motif_table(df, tasks)
x = scale(dfx).T
In [1498]:
g = sns.clustermap(x,  row_colors=to_colors(col_df), 
                   row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
                   col_linkage=linkage(x.values.T, method='weighted', optimal_ordering=True),
                   #method="weighted", 
                   col_colors=to_colors(row_df), 
                   figsize=(20, 10), cmap='RdBu_r', center=0);
In [1499]:
gdrive_upload_fig(g, f"modisco/{modisco_run}/heatmap")

Klf4 peaks

In [1500]:
modisco_run = 'by_peak_tasks/weighted/Klf4'
In [1501]:
df = load_df(modisco_run)
dfx, row_df, col_df = preproc_motif_table(df, tasks)
x = scale(dfx).T
In [1502]:
g = sns.clustermap(x,  row_colors=to_colors(col_df), 
                   row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
                   col_linkage=linkage(x.values.T, method='weighted', optimal_ordering=True),
                   #method="weighted", 
                   col_colors=to_colors(row_df), 
                   figsize=(20, 10), cmap='RdBu_r', center=0);
In [1503]:
gdrive_upload_fig(g, f"modisco/{modisco_run}/heatmap")

Notes

  • The strange motif p22_p0 is sox link

Scatterplot

In [1235]:
# load the table
df = pd.read_csv(f"http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/{modisco_run}/pattern_table.csv")

df['metacluster'] = df.pattern.str.split("_", expand=True)[0].str.replace("m", "").astype(int)
df['metacluster'] = pd.Categorical(df.metacluster, ordered=True)
df['log n seqlets'] = np.log10(df['n seqlets'])

# filter 
df = df[df['n seqlets'] >= 100]
In [902]:
 
In [903]:
df = df.reset_index().set_index(['idx', 'pattern', 'consensus', 'n seqlets', 'metacluster'])
dfl = pd.concat([rename(df, task) for task in tasks])
In [904]:
dfl.head()
Out[904]:
imp profile imp counts footprint entropydiff footprint max footprint strandcor footprint counts region counts pos meandiff pos std pos unimodal task
idx pattern consensus n seqlets metacluster
0 m0_p0 ATTTGCATAACAAA 3675 0 0.042555 0.059322 0.113231 3.613860 0.005538 381.097595 775.774414 1.997996 89.129641 True Oct4
1 m0_p1 CCATTGTT 1663 0 0.013889 0.045596 0.018575 0.822610 0.005021 210.675293 515.116089 2.062599 83.017328 True Oct4
2 m0_p2 GTTGATGGC 773 0 0.009207 0.025203 0.062085 1.184347 0.005008 246.592499 537.979248 2.052729 55.446947 False Oct4
3 m0_p3 AGGCTGACCTTGA 428 0 0.008658 0.016956 0.047439 1.192671 0.004977 214.732849 544.341125 1.961061 97.076959 True Oct4
4 m0_p4 TGGCAGCAAGTTCAAGGTCAGCCTCAG 290 0 0.003723 0.008185 0.070402 0.975862 0.005046 189.955170 477.631012 2.075499 78.280716 False Oct4
In [905]:
dfl = dfl.reset_index()
In [906]:
dfl.head()
Out[906]:
idx pattern consensus n seqlets metacluster imp profile imp counts footprint entropydiff footprint max footprint strandcor footprint counts region counts pos meandiff pos std pos unimodal task
0 0 m0_p0 ATTTGCATAACAAA 3675 0 0.042555 0.059322 0.113231 3.613860 0.005538 381.097595 775.774414 1.997996 89.129641 True Oct4
1 1 m0_p1 CCATTGTT 1663 0 0.013889 0.045596 0.018575 0.822610 0.005021 210.675293 515.116089 2.062599 83.017328 True Oct4
2 2 m0_p2 GTTGATGGC 773 0 0.009207 0.025203 0.062085 1.184347 0.005008 246.592499 537.979248 2.052729 55.446947 False Oct4
3 3 m0_p3 AGGCTGACCTTGA 428 0 0.008658 0.016956 0.047439 1.192671 0.004977 214.732849 544.341125 1.961061 97.076959 True Oct4
4 4 m0_p4 TGGCAGCAAGTTCAAGGTCAGCCTCAG 290 0 0.003723 0.008185 0.070402 0.975862 0.005046 189.955170 477.631012 2.075499 78.280716 False Oct4
In [907]:
# TODO - do PCA
In [908]:
plotnine.options.figure_size = (10, 3)
ggplot(aes(x='imp profile', y='imp counts', color='metacluster', size='n seqlets'), dfl.sort_values('n seqlets', ascending=False)) + \
  geom_point(alpha=0.6) + \
  theme_classic() + facet_grid(".~task")+ theme(legend_position="top")
Out[908]:
<ggplot: (-9223363261952851645)>
In [909]:
plotnine.options.figure_size = (10, 3)
ggplot(aes(x='imp profile', y='footprint max', color='metacluster', size='n seqlets'), dfl.sort_values('n seqlets', ascending=False)) + \
  geom_point(alpha=0.6) + \
  theme_classic() + facet_grid(".~task")+ theme(legend_position="top")
Out[909]:
<ggplot: (-9223363261996917223)>
In [910]:
plotnine.options.figure_size = (10, 3)
ggplot(aes(x='imp profile', y='footprint counts', color='metacluster', size='n seqlets'), dfl.sort_values('n seqlets', ascending=False)) + \
  geom_point(alpha=0.6) + \
  theme_classic() + facet_grid(".~task")+ theme(legend_position="top")
Out[910]:
<ggplot: (8774857905998)>
In [911]:
plotnine.options.figure_size = (10, 3)
ggplot(aes(x='footprint max', y='footprint counts', color='metacluster', size='n seqlets'), dfl.sort_values('n seqlets', ascending=False)) + \
  geom_point(alpha=0.6) + \
  theme_classic() + facet_grid(".~task")+ theme(legend_position="top")
Out[911]:
<ggplot: (-9223363261996058936)>

PCA

In [912]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
In [913]:
pca = Pipeline([('scale', StandardScaler()),
                 ('pca', PCA(5))])
In [914]:
dfx = df.reset_index().set_index(['idx', 'pattern', 'consensus', 'n seqlets', 'metacluster'])
del dfx['index']
x = dfx.values
In [915]:
xpca = pca.fit_transform(x)
In [916]:
pca_cols = [f'pc{i}' for i in range(xpca.shape[1])]
In [917]:
dfpca = pd.DataFrame(xpca, columns=pca_cols)
In [918]:
dfpca['metacluster'] = df.reset_index()['metacluster']
dfpca['n seqlets'] = df.reset_index()['n seqlets']
In [927]:
plotnine.options.figure_size = (6, 4)
ggplot(aes(x='pc1', y='pc2', color='metacluster', size='n seqlets'), dfpca.sort_values('n seqlets', ascending=False)) + \
  geom_point(alpha=0.6) + \
  theme_classic() + theme(legend_position="top")
Out[927]:
<ggplot: (-9223363261988785264)>
In [928]:
# what are the features?
In [929]:
p = pca.steps[1][1]
In [930]:
p.components_.shape
Out[930]:
(5, 41)
In [931]:
components = pd.DataFrame(p.components_, columns=dfx.columns, index=pca_cols)
In [932]:
#fig, ax = plt.subplots(figsize=(10, 5))
#sns.heatmap(components, ax=ax);
sns.clustermap(components.T, col_cluster=False, cmap='RdBu_r', center=0);

Other

In [851]:
# features to merge
merge_features = {
    "footprint": ('footprint max', 'footprint counts'),
    "imp": ('imp profile', 'imp counts')
}
In [147]:
sns.pairplot(pd.DataFrame(xpca, columns=[f'pca{i}' for i in range(xpca.shape[1])]))
Out[147]:
<seaborn.axisgrid.PairGrid at 0x7fb12dd9ec88>
In [131]:
xpca.shape
Out[131]:
(70, 5)
In [ ]:
PCA(5, whiten=True)