Goal¶

cluster motifs w.r.t their properties

Decision¶

use sequence importance scores to cluster the motifs
- reason: this will help us study the heterogeneity of motifs that the model higlights

from basepair.imports import *
from basepair.modisco.utils import longer_pattern
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale, preproc_df
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering
from scipy.spatial.distance import pdist, squareform

tasks = ['Oct4', 'Sox2', 'Klf4', 'Nanog']

output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")

def load_df(modisco_run, min_n_seqlets=100):
    df = pd.read_csv(f"http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/{modisco_run}/pattern_table.csv")
    return preproc_df(df, min_n_seqlets)

Load the data¶

modisco_run = 'all/profile'
df = load_df(modisco_run)

# load annotated tables
pattern_table_nte_contrib = preproc_df(pd.read_csv(output_dir / 'motif_clustering/nte_contrib.csv'), 100)
pattern_table_nte_seq = preproc_df(pd.read_csv(output_dir / 'motif_clustering/nte_seq.csv'), 100)
pattern_table_te_contrib = preproc_df(pd.read_csv(output_dir / 'motif_clustering/te_contrib.csv'), None)

Check right scaling¶

dfl = motif_table_long(df, tasks)
features = [c for c in dfl.columns if c not in ['idx', 'consensus', 'n seqlets', 'metacluster', 'pattern', 'task', 'pos unimodal']]

plotnine.options.figure_size = (20, 3)
ggplot(aes(x='value', color='task'), dfl.melt(['task'], value_vars=features)) + geom_density() + facet_wrap('variable', nrow=1, scales='free') + theme_minimal()

<ggplot: (8753741630191)>

Heatmap¶

Valid¶

modisco_run = 'all/profile'

from basepair.modisco.motif_clustering import *

model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()

Short motifs (seq)¶

pattern_table_nte_seq = pattern_table_nte_seq[pattern_table_nte_seq.pattern != 'm4_p4']

pattern_table_nte_seq = pattern_table_nte_seq.sort_values('cluster_order')

dfx, row_df, col_df = preproc_motif_table(pattern_table_nte_seq, tasks, unclust_feat=['log n seqlets', 'cluster_order', 'cluster'],
                                          drop_columns=[t+s for t in tasks for s in ['/d_p', '/d']])
x = scale(dfx).T

vmax=max(np.abs(x.values.min()), x.values.max())  # simmatric colors
vmin=-vmax

for cluster in row_df.cluster.unique():
    subset_vec = row_df.cluster == cluster
    major_pattern_name = row_df[subset_vec].sort_values('log n seqlets').iloc[-1].name
    p = mr.get_pattern(longer_pattern(major_pattern_name))
    # plot the major motif
    p.plot(['seq', 'contrib/mean'], rotate_y=0);
    row_df_subset = row_df[subset_vec]
    row_df_subset['major_pattern'] = pd.Categorical(row_df_subset.index == major_pattern_name)
    del row_df_subset['cluster']
    del row_df_subset['cluster_order']
    
    g = sns.clustermap(x.T[subset_vec].T,  
                       col_colors=to_colors(row_df_subset), 
                       row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
                       col_linkage=linkage(x.values.T[subset_vec], method='weighted', optimal_ordering=True),
                       #method="weighted", 
                       vmax=vmax,
                       vmin=vmin,
                       row_colors=to_colors(col_df), 
                       figsize=(4 + subset_vec.sum()*0.2, 10), cmap='RdBu_r', center=0);
    g.fig.suptitle(f'Cluster {cluster}');

TF-MoDISco is using the TensorFlow backend.

Merged cluster-maps¶

def same_cluster_adj(clusters):
    """Get the adjacency matrix:
    1. if two nodes are in the same cluster
    0. nodes are not in the same cluster
    """
    n = len(clusters)
    out = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if clusters[i] == clusters[j]:
                out[i,j] = 1
    return out

def pdist_within_clusters(xdist, clusters, across_cluster_max_factor=1):
    """perform clustering only within clusters
    
    Args:
      matrix of features (N,D)
      clusters: vector of cluster labels (N,)
    """
    adj = same_cluster_adj(clusters)
    return squareform(squareform(xdist) + (1-adj) * across_cluster_max_factor * xdist.max())

g = sns.clustermap(x.T,  
                   row_colors=to_colors(row_df), 
                   col_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
                   row_linkage=linkage(pdist_within_clusters(pdist(x.values.T), row_df.cluster), 
                                       method='weighted', optimal_ordering=False),
                   #method="weighted", 
                   col_colors=to_colors(col_df), 
                   figsize=(10, 20), cmap='RdBu_r', center=0);

Long-motifs¶

dfx, row_df, col_df = preproc_motif_table(pattern_table_te_contrib, tasks, unclust_feat=['log n seqlets', 'cluster_order', 'cluster'],
                                          drop_columns=[t+s for t in tasks for s in ['/d_p', '/d']])
x = scale(dfx).T

g = sns.clustermap(x,  row_colors=to_colors(col_df), 
                   row_linkage=linkage(x.values, method='weighted', optimal_ordering=True),
                   col_linkage=linkage(x.values.T, method='weighted', optimal_ordering=True),
                   #method="weighted", 
                   col_colors=to_colors(row_df), 
                   figsize=(20, 14), cmap='RdBu_r', center=0);