Goal

  • cluster the motifs by their sequence similarity. Write the aligned list of patterns to a pkl file

Summary

Requires

  • {modisco_dir}/modisco.h5
  • {output_dir}/pattern_table.csv
  • {output_dir}/footprints.pkl

Produces

  • {output_dir}/patterns.pkl
In [1]:
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable, vdom_footprint

from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *


model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
In [50]:
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
patterns = [mr.get_pattern(p) for p in mr.patterns()]

# patterns = [mr.get_pattern(p) for p in mr.patterns()]
pattern_table = pd.read_csv(output_dir / "pattern_table.csv")

tasks = [x.split("/")[0] for x in mr.tasks()]

footprints = read_pkl(output_dir / 'footprints.pkl')

# Add profile to patterns
patterns = [p.add_profile(footprints[p.name]) for p in patterns]

# Add features
patterns = [p.add_attr('features', OrderedDict(pattern_table[pattern_table.pattern == shorten_pattern(p.name)].iloc[0])) for p in patterns]

# check that the pattern names match
assert patterns[4].attrs['features']['pattern'] == shorten_pattern(patterns[4].name)

In [3]:
patterns[:2]
Out[3]:
[Pattern('metacluster_0/pattern_0', 'AAAAAAATAAAAAATTAAGTGTCGTAATTTGCATAACAAAGGACACCATAGGATCAAATTTTTGAATATT' ...),
 Pattern('metacluster_0/pattern_1', 'TAGGGTGGAGCCTTCCTACCTAAATGGCTCCATTGTTCTGTCACTTCACCCTTTTCTTTTCAATATATTT' ...)]

1. split into two groups: {TE, non-TE}

In [4]:
p = patterns[0]
In [5]:
df = patterns_to_df(patterns, properties=['short_name', 'seq_info_content'])
TF-MoDISco is using the TensorFlow backend.
In [6]:
df.head(2)
Out[6]:
short_name seq_info_content
0 m0_p0 13.674130
1 m0_p1 11.609451
In [7]:
# example borderline motif (ic == 21.498123)
patterns[28].plot(kind='seq');
In [8]:
# one can also plot the profile of the Pattern
patterns[0].plot(kind='profile', rotate_y=0);
In [9]:
df.seq_info_content.plot.hist(bins=50)
plt.xlabel("Sequence information content");
In [10]:
# manually determine the cutoff
TE_cuttof = 30
In [11]:
patterns_te = [p for p in patterns if p.seq_info_content > TE_cuttof]
patterns_nte = [p for p in patterns if p.seq_info_content <= TE_cuttof]
assert len(patterns_te) + len(patterns_nte) == len(patterns)

Pattern len distribution

In [12]:
ns = pattern_table['n seqlets']
In [13]:
np.log10(ns).plot.hist(30)
plt.xlabel("log10(n seqlets)");
In [14]:
ns.quantile(0.1)
Out[14]:
90.2
In [15]:
print("# long motifs: ", len(patterns_te))
print("# short motifs:", len(patterns_nte))
# long motifs:  46
# short motifs: 96

2. cluster by seq-similarty, freeze groups

In [16]:
# Whole pipeline from this notebook
def cluster_patterns(patterns, n_clusters=9, cluster_track='seq_ic'):
    """Cluster patterns
    """
    sim = similarity_matrix(patterns, track=cluster_track)
    
    # cluster
    lm_nte_seq = linkage(1-sim_nte_seq[iu1], 'ward', optimal_ordering=True)
    cluster = cut_tree(lm_nte_seq, n_clusters=n_clusters)[:,0]
    
    cluster_order = np.argsort(leaves_list(lm_nte_seq))
    pattern_table_nte_seq = create_pattern_table(patterns_nte, cluster_order, cluster, 
                                                 align_track='contrib/mean',
                                                 logo_len=70, 
                                                 seqlogo_kwargs=dict(width=320), 
                                                 footprint_width=320,
                                                 footprint_kwargs=dict())
    return sim, lm_nte_seq, cluster, cluster_order, pattern_table_nte_seq
In [17]:
from basepair.exp.chipnexus.motif_clustering import similarity_matrix
In [18]:
sim_nte_seq = similarity_matrix(patterns_nte, track='seq_ic')
sim_te_seq = similarity_matrix(patterns_te, track='seq_ic')
100%|██████████| 96/96 [00:31<00:00,  3.07it/s]
100%|██████████| 46/46 [00:06<00:00,  7.16it/s]

non-TE

In [19]:
iu1 = np.triu_indices(len(sim_nte_seq), 1)
plt.hist(sim_nte_seq[iu1], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (non-TE)");

TE

In [20]:
iu2 = np.triu_indices(len(sim_te_seq), 1)
plt.hist(sim_te_seq[iu2], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (TE)");
In [21]:
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
from basepair.modisco.motif_clustering import to_colors

Cluster the motifs

Non-TE's

In [22]:
# Seq IC-based clustering
lm_nte_seq = linkage(1-sim_nte_seq[iu1], 'ward', optimal_ordering=True)
clusters_nte_seq = cut_tree(lm_nte_seq, n_clusters=9)[:,0]
cm_nte_seq = sns.clustermap(pd.DataFrame(sim_nte_seq),
                            row_linkage=lm_nte_seq,
                            col_linkage=lm_nte_seq,
                            col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_seq)))),
                            cmap='Blues'
                           ).fig.suptitle('Seq-ic sim');

TE's

In [23]:
lm_te_seq = linkage(1-sim_te_seq[iu2], 'ward', optimal_ordering=True)
clusters_te_seq = cut_tree(lm_te_seq, n_clusters=9)[:,0]
cm_te = sns.clustermap(sim_te_seq,
                        row_linkage=lm_te_seq,
                        col_linkage=lm_te_seq,
                        cmap='Blues'
                       ).fig.suptitle('Seq IC sim');

Cluster by profile

In [24]:
# sim_nte = similarity_matrix(patterns_nte, track='profile/mean')
In [25]:
# Contrib-score based clustering
# lm_nte_contrib = linkage(1-sim_nte[iu1], 'ward', optimal_ordering=True)
# clusters_nte_contrib = cut_tree(lm_nte_contrib, n_clusters=9)[:,0]
# cm_nte_contrib = sns.clustermap(pd.DataFrame(sim_nte),
#                                 row_linkage=lm_nte_contrib,
#                                 col_linkage=lm_nte_contrib,
#                                 col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_contrib)))),
#                                 cmap='Blues',
#                                )

3. Visualize the clusters

In [26]:
from basepair.modisco.motif_clustering import create_pattern_table, align_clustered_patterns
In [47]:
cluster_order = np.argsort(leaves_list(lm_nte_seq))
cluster = clusters_nte_seq

patterns_nte_clustered = align_clustered_patterns(patterns_nte, cluster_order, cluster, 
                                                  align_track='seq_ic',
                                                  # don't shit the major patterns 
                                                  # by more than 15 when aligning
                                                  max_shift=15)
# add the major motif group
patterns_nte_clustered = [x.add_attr("pattern_group", 'nte') for x in patterns_nte_clustered]
100%|██████████| 96/96 [00:00<00:00, 269.38it/s]
In [48]:
cluster_order = np.argsort(leaves_list(lm_te_seq))
cluster = clusters_te_seq

patterns_te_clustered = align_clustered_patterns(patterns_te, cluster_order, cluster, 
                                                 align_track='seq_ic',
                                                 # don't shit the major patterns 
                                                 # by more than 15 when aligning                                             
                                                 max_shift=15)
patterns_te_clustered = [x.add_attr("pattern_group", 'te') for x in patterns_te_clustered]
100%|██████████| 46/46 [00:00<00:00, 217.00it/s]
In [49]:
write_pkl(patterns_nte_clustered + patterns_te_clustered, output_dir / 'patterns.pkl')