Goal¶

cluster the motifs by their sequence similarity

Summary¶

See https://docs.google.com/presentation/d/1XCRnpDvYApY8wiUeoqpSGq2JOHXSBMwQbzXE4tbopLU/edit#slide=id.p

0. Load the data¶

as a list of Patterns

from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable

from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern
from basepair.imports import *


model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
2018-10-18 07:30:48,738 [WARNING] doc empty for the `info:` field

mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()

patterns = [mr.get_pattern(p) for p in mr.patterns()]

tasks = [x.split("/")[0] for x in mr.tasks()]

patterns[:2]

[Pattern('metacluster_0/pattern_0', 'AAAAAAATAAAAAATTAAGTGTCGTAATTTGCATAACAAAGGACACCATAGGATCAAATTTTTGAATATT' ...),
 Pattern('metacluster_0/pattern_1', 'TAGGGTGGAGCCTTCCTACCTAAATGGCTCCATTGTTCTGTCACTTCACCCTTTTCTTTTCAATATATTT' ...)]

1. split into two groups: {TE, non-TE}¶

[x] implement patterns_to_df (convenient conversion ot list[Pattern] to pd.DataFrame (including plots, see section 3)
- derive differnent properties of the motif
  - decorate using @property
  - allow to use a list of properties to in patterns_to_df?
[x] visualize information_content as a histogram and decide on the cutoff based on total information content
- two groups:
  - patterns_te
  - patterns_nte

p = patterns[0]

df = patterns_to_df(patterns, properties=['short_name', 'seq_info_content'])

TF-MoDISco is using the TensorFlow backend.

df.head(2)

# example borderline motif (ic == 21.498123)
patterns[28].plot(kind='seq');

df.seq_info_content.plot.hist(bins=50)
plt.xlabel("Sequence information content");

# manually determine the cutoff
TE_cuttof = 30

patterns_te = [p for p in patterns if p.seq_info_content > TE_cuttof]
patterns_nte = [p for p in patterns if p.seq_info_content <= TE_cuttof]
assert len(patterns_te) + len(patterns_nte) == len(patterns)

print("# long motifs:", len(patterns_te))
print("# short motifs: ", len(patterns_nte))

# long motifs: 46
# short motifs:  96

2. cluster by seq-similarty, freeze groups¶

[x] implement: method dist(pattern, metric=..., scores=['seq']) to compute the distance to another pattern
[x] visualize: similarity heatmap with hirearchical clustering (keeping the diagonal)
[x] visualize the distribution of pairwise scores
- is it bi-modal.?

from basepair.exp.chipnexus.motif_clustering import similarity_matrix

sim_nte = similarity_matrix(patterns_nte, track='contrib/mean')
sim_nte_seq = similarity_matrix(patterns_nte, track='seq_ic')
sim_te = similarity_matrix(patterns_te, track='contrib/mean')
sim_te_seq = similarity_matrix(patterns_te, track='seq_ic')

100%|██████████| 96/96 [00:31<00:00,  3.01it/s]
100%|██████████| 96/96 [00:33<00:00,  2.89it/s]
100%|██████████| 46/46 [00:07<00:00,  5.84it/s]
100%|██████████| 46/46 [00:07<00:00,  5.96it/s]

non-TE¶

iu1 = np.triu_indices(len(sim_nte), 1)
plt.hist(sim_nte[iu1], bins=100);
plt.title("Similarity (contrib)")
plt.xlabel("Similarity between two motifs (non-TE)");

iu1 = np.triu_indices(len(sim_nte_seq), 1)
plt.hist(sim_nte_seq[iu1], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (non-TE)");

TE¶

iu2 = np.triu_indices(len(sim_te), 1)
plt.hist(sim_te[iu2], bins=100);
plt.title("Similarity (contrib)")
plt.xlabel("Similarity between two motifs (TE)");

iu2 = np.triu_indices(len(sim_te), 1)
plt.hist(sim_te_seq[iu2], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (TE)");

from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
from basepair.modisco.motif_clustering import to_colors

Cluster by contribution scores¶

# Contrib-score based clustering
lm_nte_contrib = linkage(1-sim_nte[iu1], 'ward', optimal_ordering=True)
clusters_nte_contrib = cut_tree(lm_nte_contrib, n_clusters=9)[:,0]
cm_nte_contrib = sns.clustermap(pd.DataFrame(sim_nte),
                                row_linkage=lm_nte_contrib,
                                col_linkage=lm_nte_contrib,
                                col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_contrib)))),
                                cmap='Blues',
                               )
cm_nte_contrib.fig.suptitle('Contrib sim');
sns.clustermap(pd.DataFrame(sim_nte_seq),
                            row_linkage=lm_nte_contrib,
                            col_linkage=lm_nte_contrib,
                            col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_contrib)))),
                            cmap='Blues',
                           ).fig.suptitle('Seq-ic sim');

Cluster by sequence IC similarity¶

# Seq IC-based clustering
lm_nte_seq = linkage(1-sim_nte_seq[iu1], 'ward', optimal_ordering=True)
clusters_nte_seq = cut_tree(lm_nte_seq, n_clusters=9)[:,0]
cm_nte_seq = sns.clustermap(pd.DataFrame(sim_nte_seq),
                            row_linkage=lm_nte_seq,
                            col_linkage=lm_nte_seq,
                            col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_seq)))),
                            cmap='Blues'
                           ).fig.suptitle('Seq-ic sim');
cm_nte_seq = sns.clustermap(pd.DataFrame(sim_nte),
                            row_linkage=lm_nte_seq,
                            col_linkage=lm_nte_seq,
                            col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_seq)))),
                            cmap='Blues'
                           ).fig.suptitle('Contrib sim');

lm_te_contrib = linkage(1-sim_te[iu2], 'ward', optimal_ordering=True)
cm_te = sns.clustermap(sim_te,
                        row_linkage=lm_te_contrib,
                        col_linkage=lm_te_contrib,
                        cmap='Blues'
                       )
cm_te.fig.suptitle('Contrib sim');
lm_te_seq = linkage(1-sim_te_seq[iu2], 'ward', optimal_ordering=True)
cm_te = sns.clustermap(sim_te_seq,
                        row_linkage=lm_te_seq,
                        col_linkage=lm_te_seq,
                        cmap='Blues'
                       ).fig.suptitle('Seq IC sim');

Interactive plots¶

import holoviews as hv
import hvplot.pandas
hv.extension('bokeh', 'matplotlib')

def to_img(fig):
    fig.savefig("/tmp/fig.png", dpi=150)
    f = hv.RGB.load_image("/tmp/fig.png",bare=True)
    return f

d_nte = {p.short_name: to_img(p.trim_seq_ic(0.08).plot("seq")) for p in tqdm(patterns_nte)}
d_te = {p.short_name: to_img(p.trim_seq_ic(0.08).plot("seq")) for p in tqdm(patterns_te)}

100%|██████████| 96/96 [00:24<00:00,  3.85it/s]
100%|██████████| 46/46 [00:26<00:00,  1.71it/s]

sort_idx = leaves_list(lm_nte_contrib)
sim_nte_df = pd.DataFrame([dict(a=patterns_nte[i].short_name, b=patterns_nte[j].short_name, v=sim_nte[i,j])
                                for i in sort_idx for j in sort_idx])
sort_idx = leaves_list(lm_nte_seq)
sim_nte_df_seq = pd.DataFrame([dict(a=patterns_nte[i].short_name, b=patterns_nte[j].short_name, v=sim_nte_seq[i,j])
                                for i in sort_idx for j in sort_idx])
sort_idx = leaves_list(lm_te_contrib)
sim_te_df = pd.DataFrame([dict(a=patterns_te[i].short_name, b=patterns_te[j].short_name, v=sim_te[i,j])
                          for i in sort_idx for j in sort_idx])

Cluster and display contribution scores¶

%opts HeatMap [xrotation=90] (cmap='Blues') 
heatmap = sim_nte_df.hvplot.heatmap(x='a', y='b', C='v', width=1000, height=1000)
posxy = hv.streams.Tap(source=heatmap, x="m0_p0", y='m0_p0')
# Define function to compute histogram based on tap location
def tap_histogram(x, y):
    return d_nte[x].options(width=800, height=500)

obj = heatmap + hv.DynamicMap(tap_histogram, kdims=[], streams=[posxy])
obj

Cluster and display sequence IC scores¶

%opts HeatMap [xrotation=90] (cmap='Blues') 
heatmap = sim_nte_df_seq.hvplot.heatmap(x='a', y='b', C='v', width=1000, height=1000)
posxy = hv.streams.Tap(source=heatmap, x="m0_p0", y='m0_p0')
# Define function to compute histogram based on tap location
def tap_histogram(x, y):
    return d_nte[x].options(width=800, height=500)

obj = heatmap + hv.DynamicMap(tap_histogram, kdims=[], streams=[posxy])
obj

%opts HeatMap [xrotation=90] (cmap='Blues') 
heatmap = sim_te_df.hvplot.heatmap(x='a', y='b', C='v', width=600, height=600)
heatmap

3. Visualize the clusters¶

[x] implement the align method
- returns the relative position and strand of the aligned pattern
  - align()
    - by default return the pattern in the new aligned coordinates
  - align_coord()
[x] Setup a table with three columns (pattern, logo_imp, logo_seq):
- [x] Port the code bellow to use pattern
- [x] sort by clustering from before
- [x] make sure it renders nicely in Ipynb and the exported html
[x] allow patterns_to_df to also add logo_imp and logo_seq
[x] create the pandas DataFrame, color it by clusters and display it
- columns:
  - cluster
  - pattern
  - n_seqlets
  - logo_imp
  - logo_seq
[x] check where Nanog and Sox2 motifs got assigned

Implement align¶

# query
patterns[1].plot("seq");
# target
patterns[0].plot("seq");
# query
patterns[1].rc().plot("seq");

pa = patterns[1]
pb = patterns[0]

display(pb.resize(30).vdom_plot('seq'))
display(pa.align(pb).resize(30).vdom_plot('seq'))

display(pa.resize(30).vdom_plot('seq'))
display(pb.align(pa).resize(30).vdom_plot('seq'))

TODO¶

[x] take pattern_table and append cluster and cluster_order
[x] split the pandas data-frame into different groups. For each group:
- identify the major motif (one with the most seqlets)
- align all the other seqlets to that motif
- append two new columns to the table:
  - logo_imp
  - logo_seq
[x] merge the table
[X] visualize the table (sort by cluster_order)

# read the property table
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")
pattern_table = pd.read_csv(output_dir / "pattern_table.csv")
# read the footprints
footprints = read_pkl(output_dir / 'footprints.pkl')

# names in nte
pattern_names = np.array([shorten_pattern(p.name) for p in patterns_nte])
patterns_nte_dict = {shorten_pattern(p.name): p for p in patterns_nte}  # organize as a dict
#assert len(pattern_names) == len(cluster_order)

from basepair.exp.chipnexus.motif_clustering import append_logo_cluster

# cluster using contrib scores
cluster_order = np.argsort(leaves_list(lm_nte_contrib))
cluster = clusters_nte_contrib
pattern_table_nte_contrib = append_logo_cluster(pattern_table, patterns_nte, cluster_order, cluster, width=320, logo_len=70)

100%|██████████| 9/9 [03:04<00:00, 20.49s/it]

cluster_order = np.argsort(leaves_list(lm_nte_seq))
cluster = clusters_nte_seq
pattern_table_nte_seq = append_logo_cluster(pattern_table, patterns_nte, cluster_order, cluster, width=320, logo_len=70)

100%|██████████| 9/9 [03:04<00:00, 20.48s/it]

cluster_order = np.argsort(leaves_list(lm_te_contrib))
cluster = np.arange(len(cluster_order))
pattern_table_te_contrib = append_logo_cluster(pattern_table, patterns_te, cluster_order, cluster, width=480, logo_len=70)

100%|██████████| 46/46 [02:06<00:00,  2.76s/it]

# add the 'directness score'
for task in tasks:
    pattern_table_nte_contrib[task + "/d"] = 2*pattern_table_nte_contrib[task + " imp profile"] - pattern_table_nte_contrib[task + " imp profile"]
    # add the normalized version
    pattern_table_nte_contrib[task + "/d_p"] = perc(pattern_table_nte_contrib[task + "/d"])
    
    pattern_table_nte_seq[task + "/d"] = 2*pattern_table_nte_seq[task + " imp profile"] - pattern_table_nte_seq[task + " imp profile"]
    pattern_table_nte_seq[task + "/d_p"] = perc(pattern_table_nte_seq[task + "/d"])
    
    pattern_table_te_contrib[task + "/d"] = 2*pattern_table_te_contrib[task + " imp profile"] - pattern_table_te_contrib[task + " imp profile"]
    # add the normalized version
    pattern_table_te_contrib[task + "/d_p"] = perc(pattern_table_te_contrib[task + "/d"])

Save to file¶

!ls {output_dir}

footprints.pkl
modisco-table.stdout.log
motif_clustering
pattern_table.csv
pattern_table.html
pattern-table.smaller.contrib-cluster.html
pattern-table.smaller.seq-cluster.html
pattern_table.sorted.csv
pattern_table.sorted.html
pattern-table.TE.contrib-cluster.html
plots
results.html
results.ipynb
stdout.log

!mkdir -p {output_dir}/motif_clustering

pattern_table_nte_contrib.reset_index().to_csv(output_dir / 'motif_clustering/nte_contrib.csv', index=False)
pattern_table_nte_seq.reset_index().to_csv(output_dir / 'motif_clustering/nte_seq.csv', index=False)
pattern_table_te_contrib.reset_index().to_csv(output_dir / 'motif_clustering/te_contrib.csv', index=False)

Focus only on the things to visualize¶

display_cols = ['pattern', 'cluster', 'n seqlets', 'logo_imp', 'logo_seq'] + [task+'/d_p' for task in tasks]

pattern_table_nte_contrib['n seqlets'] = pattern_table_nte_contrib['n seqlets'].astype(float)
pattern_table_nte_seq['n seqlets'] = pattern_table_nte_seq['n seqlets'].astype(float)
pattern_table_te_contrib['n seqlets'] = pattern_table_te_contrib['n seqlets'].astype(float)

dfv_contrib = pattern_table_nte_contrib.reset_index().sort_values('cluster_order', ascending=True)[display_cols]
dfv_seq = pattern_table_nte_seq.reset_index().sort_values('cluster_order', ascending=True)[display_cols]
dfv_te_contrib = pattern_table_te_contrib.reset_index().sort_values('cluster_order', ascending=True)[display_cols]

Add manual periodicity annotations¶

from basepair.exp.chipnexus.annotate_profile import read_annotated_csv
dfl = read_annotated_csv("../2018-10-16-ziga-all-profile.csv")

dfl.head()

Add sparklines¶

from basepair.plot.vdom import vdom_footprint, footprint_df

dff = footprint_df(footprints, dfl, figsize=(3,1.5), width=120)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-50-36c6c0911f0c> in <module>()
----> 1 dff = footprint_df(footprints, dfl, figsize=(3,1.5), width=120)

~/workspace/basepair/basepair/plot/vdom.py in footprint_df(footprints, dfl, width, **kwargs)
    422     profile_max_median = {task: np.median([np.max(v[task]) for v in footprints.values()]) for task in tasks}
    423     out = []
--> 424     for p, arr_d in tqdm(footprints.items()):
    425 
    426         try:

NameError: name 'tqdm' is not defined

# add the footprints
dfv_contrib = pd.merge(dfv_contrib, dff, on='pattern', how='left')
dfv_seq = pd.merge(dfv_seq, dff, on='pattern', how='left')
dfv_te_contrib = pd.merge(dfv_te_contrib, dff, on='pattern', how='left')

colorder = ['pattern', 'cluster', 'n seqlets', 'logo_imp', 'logo_seq'] + tasks + [t + '/d_p' for t in tasks]

Visualize¶

Results¶

html_str_contrib = df2html(dfv_contrib[colorder], "table_contrib" + str(uuid4()), "")
with open('/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/pattern-table.smaller.cotrib-cluster.html', 'w') as fo:
    fo.write(html_str_contrib)
# HTML(html_str_contrib)

html_str_seq = df2html(dfv_seq[colorder], "table_seq" + str(uuid4()), "")
with open('/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/pattern-table.smaller.seq-cluster.html', 'w') as fo:
    fo.write(html_str_seq)
# html_str_seq

html_str_te_contrib = df2html(dfv_te_contrib[colorder], "table_seq" + str(uuid4()), "")
with open('/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/pattern-table.TE.contrib-cluster.html', 'w') as fo:
    fo.write(html_str_te_contrib)
# html_str_seq

4. Iterate¶

Explore different:
- metric
- score (seq vs importance)
- cutoff
for each choice, inspect
- heatmap
- distribution of pairwise scores
- visual motif grouping in the pandas data-frame

	Klf4/l	Nanog/l	Oct4/l	Sox2/l	pattern
0	ambiguous	not-bound	not-bound	not-bound	m9_p9
1	ambiguous	not-bound	not-bound	not-bound	m9_p8
2	not-bound	not-bound	ambigous	not-bound	m9_p7
3	not-bound	not-bound	not-bound	not-bound	m9_p6
4	not-bound	direct	not-bound	not-bound	m9_p5

	short_name	seq_info_content
0	m0_p0	13.674130
1	m0_p1	11.609451