0. Load the data

  • as a list of Patterns
In [1]:
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable

from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern
from basepair.imports import *


model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
2018-10-18 07:30:48,738 [WARNING] doc empty for the `info:` field
In [2]:
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()

patterns = [mr.get_pattern(p) for p in mr.patterns()]

tasks = [x.split("/")[0] for x in mr.tasks()]
In [3]:
patterns[:2]
Out[3]:
[Pattern('metacluster_0/pattern_0', 'AAAAAAATAAAAAATTAAGTGTCGTAATTTGCATAACAAAGGACACCATAGGATCAAATTTTTGAATATT' ...),
 Pattern('metacluster_0/pattern_1', 'TAGGGTGGAGCCTTCCTACCTAAATGGCTCCATTGTTCTGTCACTTCACCCTTTTCTTTTCAATATATTT' ...)]

1. split into two groups: {TE, non-TE}

  • [x] implement patterns_to_df (convenient conversion ot list[Pattern] to pd.DataFrame (including plots, see section 3)
    • derive differnent properties of the motif
      • decorate using @property
      • allow to use a list of properties to in patterns_to_df?
  • [x] visualize information_content as a histogram and decide on the cutoff based on total information content
    • two groups:
      • patterns_te
      • patterns_nte
In [4]:
p = patterns[0]
In [5]:
df = patterns_to_df(patterns, properties=['short_name', 'seq_info_content'])
TF-MoDISco is using the TensorFlow backend.
In [6]:
df.head(2)
Out[6]:
short_name seq_info_content
0 m0_p0 13.674130
1 m0_p1 11.609451
In [7]:
# example borderline motif (ic == 21.498123)
patterns[28].plot(kind='seq');
In [8]:
df.seq_info_content.plot.hist(bins=50)
plt.xlabel("Sequence information content");
In [9]:
# manually determine the cutoff
TE_cuttof = 30
In [10]:
patterns_te = [p for p in patterns if p.seq_info_content > TE_cuttof]
patterns_nte = [p for p in patterns if p.seq_info_content <= TE_cuttof]
assert len(patterns_te) + len(patterns_nte) == len(patterns)
In [11]:
print("# long motifs:", len(patterns_te))
print("# short motifs: ", len(patterns_nte))
# long motifs: 46
# short motifs:  96

2. cluster by seq-similarty, freeze groups

  • [x] implement: method dist(pattern, metric=..., scores=['seq']) to compute the distance to another pattern
  • [x] visualize: similarity heatmap with hirearchical clustering (keeping the diagonal)
  • [x] visualize the distribution of pairwise scores
    • is it bi-modal.?
In [12]:
from basepair.exp.chipnexus.motif_clustering import similarity_matrix
In [13]:
sim_nte = similarity_matrix(patterns_nte, track='contrib/mean')
sim_nte_seq = similarity_matrix(patterns_nte, track='seq_ic')
sim_te = similarity_matrix(patterns_te, track='contrib/mean')
sim_te_seq = similarity_matrix(patterns_te, track='seq_ic')
100%|██████████| 96/96 [00:31<00:00,  3.01it/s]
100%|██████████| 96/96 [00:33<00:00,  2.89it/s]
100%|██████████| 46/46 [00:07<00:00,  5.84it/s]
100%|██████████| 46/46 [00:07<00:00,  5.96it/s]

non-TE

In [14]:
iu1 = np.triu_indices(len(sim_nte), 1)
plt.hist(sim_nte[iu1], bins=100);
plt.title("Similarity (contrib)")
plt.xlabel("Similarity between two motifs (non-TE)");
In [15]:
iu1 = np.triu_indices(len(sim_nte_seq), 1)
plt.hist(sim_nte_seq[iu1], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (non-TE)");

TE

In [16]:
iu2 = np.triu_indices(len(sim_te), 1)
plt.hist(sim_te[iu2], bins=100);
plt.title("Similarity (contrib)")
plt.xlabel("Similarity between two motifs (TE)");
In [17]:
iu2 = np.triu_indices(len(sim_te), 1)
plt.hist(sim_te_seq[iu2], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (TE)");
In [18]:
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
from basepair.modisco.motif_clustering import to_colors

Cluster by contribution scores

In [19]:
# Contrib-score based clustering
lm_nte_contrib = linkage(1-sim_nte[iu1], 'ward', optimal_ordering=True)
clusters_nte_contrib = cut_tree(lm_nte_contrib, n_clusters=9)[:,0]
cm_nte_contrib = sns.clustermap(pd.DataFrame(sim_nte),
                                row_linkage=lm_nte_contrib,
                                col_linkage=lm_nte_contrib,
                                col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_contrib)))),
                                cmap='Blues',
                               )
cm_nte_contrib.fig.suptitle('Contrib sim');
sns.clustermap(pd.DataFrame(sim_nte_seq),
                            row_linkage=lm_nte_contrib,
                            col_linkage=lm_nte_contrib,
                            col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_contrib)))),
                            cmap='Blues',
                           ).fig.suptitle('Seq-ic sim');

Cluster by sequence IC similarity

In [20]:
# Seq IC-based clustering
lm_nte_seq = linkage(1-sim_nte_seq[iu1], 'ward', optimal_ordering=True)
clusters_nte_seq = cut_tree(lm_nte_seq, n_clusters=9)[:,0]
cm_nte_seq = sns.clustermap(pd.DataFrame(sim_nte_seq),
                            row_linkage=lm_nte_seq,
                            col_linkage=lm_nte_seq,
                            col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_seq)))),
                            cmap='Blues'
                           ).fig.suptitle('Seq-ic sim');
cm_nte_seq = sns.clustermap(pd.DataFrame(sim_nte),
                            row_linkage=lm_nte_seq,
                            col_linkage=lm_nte_seq,
                            col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_seq)))),
                            cmap='Blues'
                           ).fig.suptitle('Contrib sim');
In [21]:
lm_te_contrib = linkage(1-sim_te[iu2], 'ward', optimal_ordering=True)
cm_te = sns.clustermap(sim_te,
                        row_linkage=lm_te_contrib,
                        col_linkage=lm_te_contrib,
                        cmap='Blues'
                       )
cm_te.fig.suptitle('Contrib sim');
lm_te_seq = linkage(1-sim_te_seq[iu2], 'ward', optimal_ordering=True)
cm_te = sns.clustermap(sim_te_seq,
                        row_linkage=lm_te_seq,
                        col_linkage=lm_te_seq,
                        cmap='Blues'
                       ).fig.suptitle('Seq IC sim');

Interactive plots

In [22]:
import holoviews as hv
import hvplot.pandas
hv.extension('bokeh', 'matplotlib')
In [23]:
def to_img(fig):
    fig.savefig("/tmp/fig.png", dpi=150)
    f = hv.RGB.load_image("/tmp/fig.png",bare=True)
    return f
In [24]:
d_nte = {p.short_name: to_img(p.trim_seq_ic(0.08).plot("seq")) for p in tqdm(patterns_nte)}
d_te = {p.short_name: to_img(p.trim_seq_ic(0.08).plot("seq")) for p in tqdm(patterns_te)}
100%|██████████| 96/96 [00:24<00:00,  3.85it/s]
100%|██████████| 46/46 [00:26<00:00,  1.71it/s]
In [25]:
sort_idx = leaves_list(lm_nte_contrib)
sim_nte_df = pd.DataFrame([dict(a=patterns_nte[i].short_name, b=patterns_nte[j].short_name, v=sim_nte[i,j])
                                for i in sort_idx for j in sort_idx])
sort_idx = leaves_list(lm_nte_seq)
sim_nte_df_seq = pd.DataFrame([dict(a=patterns_nte[i].short_name, b=patterns_nte[j].short_name, v=sim_nte_seq[i,j])
                                for i in sort_idx for j in sort_idx])
sort_idx = leaves_list(lm_te_contrib)
sim_te_df = pd.DataFrame([dict(a=patterns_te[i].short_name, b=patterns_te[j].short_name, v=sim_te[i,j])
                          for i in sort_idx for j in sort_idx])

Cluster and display contribution scores

In [26]:
%opts HeatMap [xrotation=90] (cmap='Blues') 
heatmap = sim_nte_df.hvplot.heatmap(x='a', y='b', C='v', width=1000, height=1000)
posxy = hv.streams.Tap(source=heatmap, x="m0_p0", y='m0_p0')
# Define function to compute histogram based on tap location
def tap_histogram(x, y):
    return d_nte[x].options(width=800, height=500)

obj = heatmap + hv.DynamicMap(tap_histogram, kdims=[], streams=[posxy])
obj
Out[26]:

Cluster and display sequence IC scores

In [27]:
%opts HeatMap [xrotation=90] (cmap='Blues') 
heatmap = sim_nte_df_seq.hvplot.heatmap(x='a', y='b', C='v', width=1000, height=1000)
posxy = hv.streams.Tap(source=heatmap, x="m0_p0", y='m0_p0')
# Define function to compute histogram based on tap location
def tap_histogram(x, y):
    return d_nte[x].options(width=800, height=500)

obj = heatmap + hv.DynamicMap(tap_histogram, kdims=[], streams=[posxy])
obj
Out[27]:
In [28]:
%opts HeatMap [xrotation=90] (cmap='Blues') 
heatmap = sim_te_df.hvplot.heatmap(x='a', y='b', C='v', width=600, height=600)
heatmap
Out[28]:

3. Visualize the clusters

  • [x] implement the align method
    • returns the relative position and strand of the aligned pattern
      • align()
        • by default return the pattern in the new aligned coordinates
      • align_coord()
  • [x] Setup a table with three columns (pattern, logo_imp, logo_seq):

    • [x] Port the code bellow to use pattern
    • [x] sort by clustering from before
    • [x] make sure it renders nicely in Ipynb and the exported html
  • [x] allow patterns_to_df to also add logo_imp and logo_seq

  • [x] create the pandas DataFrame, color it by clusters and display it
    • columns:
      • cluster
      • pattern
      • n_seqlets
      • logo_imp
      • logo_seq
  • [x] check where Nanog and Sox2 motifs got assigned

Implement align

In [29]:
# query
patterns[1].plot("seq");
# target
patterns[0].plot("seq");
# query
patterns[1].rc().plot("seq");
In [30]:
pa = patterns[1]
pb = patterns[0]
In [31]:
display(pb.resize(30).vdom_plot('seq'))
display(pa.align(pb).resize(30).vdom_plot('seq'))
In [32]:
display(pa.resize(30).vdom_plot('seq'))
display(pb.align(pa).resize(30).vdom_plot('seq'))

TODO

  • [x] take pattern_table and append cluster and cluster_order
  • [x] split the pandas data-frame into different groups. For each group:
    • identify the major motif (one with the most seqlets)
    • align all the other seqlets to that motif
    • append two new columns to the table:
      • logo_imp
      • logo_seq
  • [x] merge the table
  • [X] visualize the table (sort by cluster_order)
In [33]:
# read the property table
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")
pattern_table = pd.read_csv(output_dir / "pattern_table.csv")
# read the footprints
footprints = read_pkl(output_dir / 'footprints.pkl')
In [35]:
# names in nte
pattern_names = np.array([shorten_pattern(p.name) for p in patterns_nte])
patterns_nte_dict = {shorten_pattern(p.name): p for p in patterns_nte}  # organize as a dict
#assert len(pattern_names) == len(cluster_order)
In [36]:
from basepair.exp.chipnexus.motif_clustering import append_logo_cluster
In [37]:
# cluster using contrib scores
cluster_order = np.argsort(leaves_list(lm_nte_contrib))
cluster = clusters_nte_contrib
pattern_table_nte_contrib = append_logo_cluster(pattern_table, patterns_nte, cluster_order, cluster, width=320, logo_len=70)
100%|██████████| 9/9 [03:04<00:00, 20.49s/it]
In [38]:
cluster_order = np.argsort(leaves_list(lm_nte_seq))
cluster = clusters_nte_seq
pattern_table_nte_seq = append_logo_cluster(pattern_table, patterns_nte, cluster_order, cluster, width=320, logo_len=70)
100%|██████████| 9/9 [03:04<00:00, 20.48s/it]
In [39]:
cluster_order = np.argsort(leaves_list(lm_te_contrib))
cluster = np.arange(len(cluster_order))
pattern_table_te_contrib = append_logo_cluster(pattern_table, patterns_te, cluster_order, cluster, width=480, logo_len=70)
100%|██████████| 46/46 [02:06<00:00,  2.76s/it]
In [40]:
# add the 'directness score'
for task in tasks:
    pattern_table_nte_contrib[task + "/d"] = 2*pattern_table_nte_contrib[task + " imp profile"] - pattern_table_nte_contrib[task + " imp profile"]
    # add the normalized version
    pattern_table_nte_contrib[task + "/d_p"] = perc(pattern_table_nte_contrib[task + "/d"])
    
    pattern_table_nte_seq[task + "/d"] = 2*pattern_table_nte_seq[task + " imp profile"] - pattern_table_nte_seq[task + " imp profile"]
    pattern_table_nte_seq[task + "/d_p"] = perc(pattern_table_nte_seq[task + "/d"])
    
    pattern_table_te_contrib[task + "/d"] = 2*pattern_table_te_contrib[task + " imp profile"] - pattern_table_te_contrib[task + " imp profile"]
    # add the normalized version
    pattern_table_te_contrib[task + "/d_p"] = perc(pattern_table_te_contrib[task + "/d"])

Save to file

In [41]:
!ls {output_dir}
footprints.pkl
modisco-table.stdout.log
motif_clustering
pattern_table.csv
pattern_table.html
pattern-table.smaller.contrib-cluster.html
pattern-table.smaller.seq-cluster.html
pattern_table.sorted.csv
pattern_table.sorted.html
pattern-table.TE.contrib-cluster.html
plots
results.html
results.ipynb
stdout.log
In [42]:
!mkdir -p {output_dir}/motif_clustering
In [43]:
pattern_table_nte_contrib.reset_index().to_csv(output_dir / 'motif_clustering/nte_contrib.csv', index=False)
pattern_table_nte_seq.reset_index().to_csv(output_dir / 'motif_clustering/nte_seq.csv', index=False)
pattern_table_te_contrib.reset_index().to_csv(output_dir / 'motif_clustering/te_contrib.csv', index=False)

Focus only on the things to visualize

In [44]:
display_cols = ['pattern', 'cluster', 'n seqlets', 'logo_imp', 'logo_seq'] + [task+'/d_p' for task in tasks]
In [45]:
pattern_table_nte_contrib['n seqlets'] = pattern_table_nte_contrib['n seqlets'].astype(float)
pattern_table_nte_seq['n seqlets'] = pattern_table_nte_seq['n seqlets'].astype(float)
pattern_table_te_contrib['n seqlets'] = pattern_table_te_contrib['n seqlets'].astype(float)
In [46]:
dfv_contrib = pattern_table_nte_contrib.reset_index().sort_values('cluster_order', ascending=True)[display_cols]
dfv_seq = pattern_table_nte_seq.reset_index().sort_values('cluster_order', ascending=True)[display_cols]
dfv_te_contrib = pattern_table_te_contrib.reset_index().sort_values('cluster_order', ascending=True)[display_cols]

Add manual periodicity annotations

In [47]:
from basepair.exp.chipnexus.annotate_profile import read_annotated_csv
dfl = read_annotated_csv("../2018-10-16-ziga-all-profile.csv")
In [48]:
dfl.head()
Out[48]:
Klf4/l Nanog/l Oct4/l Sox2/l pattern
0 ambiguous not-bound not-bound not-bound m9_p9
1 ambiguous not-bound not-bound not-bound m9_p8
2 not-bound not-bound ambigous not-bound m9_p7
3 not-bound not-bound not-bound not-bound m9_p6
4 not-bound direct not-bound not-bound m9_p5

Add sparklines

In [49]:
from basepair.plot.vdom import vdom_footprint, footprint_df
In [50]:
dff = footprint_df(footprints, dfl, figsize=(3,1.5), width=120)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-50-36c6c0911f0c> in <module>()
----> 1 dff = footprint_df(footprints, dfl, figsize=(3,1.5), width=120)

~/workspace/basepair/basepair/plot/vdom.py in footprint_df(footprints, dfl, width, **kwargs)
    422     profile_max_median = {task: np.median([np.max(v[task]) for v in footprints.values()]) for task in tasks}
    423     out = []
--> 424     for p, arr_d in tqdm(footprints.items()):
    425 
    426         try:

NameError: name 'tqdm' is not defined
In [ ]:
# add the footprints
dfv_contrib = pd.merge(dfv_contrib, dff, on='pattern', how='left')
dfv_seq = pd.merge(dfv_seq, dff, on='pattern', how='left')
dfv_te_contrib = pd.merge(dfv_te_contrib, dff, on='pattern', how='left')
In [ ]:
colorder = ['pattern', 'cluster', 'n seqlets', 'logo_imp', 'logo_seq'] + tasks + [t + '/d_p' for t in tasks]

Visualize

In [ ]:
html_str_contrib = df2html(dfv_contrib[colorder], "table_contrib" + str(uuid4()), "")
with open('/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/pattern-table.smaller.cotrib-cluster.html', 'w') as fo:
    fo.write(html_str_contrib)
# HTML(html_str_contrib)
In [ ]:
html_str_seq = df2html(dfv_seq[colorder], "table_seq" + str(uuid4()), "")
with open('/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/pattern-table.smaller.seq-cluster.html', 'w') as fo:
    fo.write(html_str_seq)
# html_str_seq
In [ ]:
html_str_te_contrib = df2html(dfv_te_contrib[colorder], "table_seq" + str(uuid4()), "")
with open('/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/pattern-table.TE.contrib-cluster.html', 'w') as fo:
    fo.write(html_str_te_contrib)
# html_str_seq

4. Iterate

  • Explore different:
    • metric
    • score (seq vs importance)
    • cutoff
  • for each choice, inspect
    • heatmap
    • distribution of pairwise scores
    • visual motif grouping in the pandas data-frame