Goal

  • cluster the motifs per tf
  • match the TEs to the all/deeplift/profile run

Tasks

  • Load patterns from each task
    • add the annotation where they came from
  • merge all together
  • get the affinity matrix
  • cluster them
  • write out a table
In [3]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
Using TensorFlow backend.
In [4]:
# Common paths
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
In [5]:
sdir = Path('/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/')
In [6]:
model_dir = sdir / 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
In [7]:
# create_tf_session(0)
In [8]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable, vdom_footprint
from basepair.plot.utils import strip_axis
from basepair.plot.tracks import tidy_motif_plot
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *

paper_config()

Load patterns from each task

  • add the annotation where they came from
In [9]:
# mr = ModiscoResult(modisco_dirs / tf / "modisco.h5")
# mr.open()
In [12]:
patterns = []

def rename(p, tf):
    """Return a copy of pattern `p` renamed to '<tf>_<pattern id>'."""
    renamed = p.copy()
    suffix = shorten_pattern(p.name).split("_")[1]
    renamed.name = tf + "_" + suffix
    return renamed

# Collect the modisco patterns from every per-TF run, tagging each one with
# the TF it came from and its seqlet count.
for tf in tasks:
    mr = ModiscoResult(model_dir / f"deeplift/{tf}/out/profile/wn/modisco.h5")
    mr.open()
    for pname in mr.patterns():
        pattern = rename(mr.get_pattern(pname), tf)
        pattern = pattern.add_attr("features",
                                   {"n seqlets": mr.n_seqlets(*pname.split("/"))})
        patterns.append(pattern)
    mr.close()
In [112]:
# Drop weakly supported motifs: keep only patterns backed by more than 100 seqlets.
patterns = [p for p in patterns if p.attrs['features']['n seqlets'] > 100] # require 100 seqlets at least

get the affinity matrix

In [209]:
# NOTE(review): the original inline comment listed the same value as the
# "alternative" metric — presumably a different option was meant; confirm
# which metrics similarity_matrix supports.
similarity_metric = 'continousjaccard'  # metric name as spelled in the library API
track = 'seq_ic'  # compare patterns on the sequence information-content track
In [210]:
from basepair.exp.chipnexus.motif_clustering import similarity_matrix
# All-vs-all similarity of the merged per-TF patterns on the chosen track.
sim_all_seq = similarity_matrix(patterns, track=track, metric=similarity_metric)
100%|██████████| 49/49 [00:10<00:00,  4.71it/s]
In [211]:
from basepair.modisco.core import *
In [212]:
# Histogram of pairwise motif similarities. Only the upper triangle is used so
# each pair is counted once and the self-similarity diagonal is excluded.
fig = plt.figure(figsize=get_figsize(0.25))
iu_all = np.triu_indices(len(sim_all_seq), 1)
plt.hist(sim_all_seq[iu_all], bins=100);
plt.title("Similarity (seq_ic)")
plt.ylabel("Frequency");
plt.xlabel("Similarity between two motifs (all)");
# plt.savefig(f"{figures}/all.similarity.hist.pdf")
# plt.savefig(f"{figures}/all.similarity.hist.png")
In [213]:
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
from basepair.modisco.motif_clustering import to_colors

cluster them

In [214]:
# Hierarchical clustering of the motifs: turn similarity into a condensed
# distance vector (1 - sim over the upper triangle), run Ward linkage, then
# cut the tree into 9 clusters.
iu_all = np.triu_indices(len(sim_all_seq), 1)
lm_all_seq = linkage(1-sim_all_seq[iu_all], 'ward', optimal_ordering=True)
clusters_all_seq = cut_tree(lm_all_seq, n_clusters=9)[:,0]
names = [x.name for x in patterns]
# The same linkage is used for rows and columns -> symmetric heatmap.
cm_all = sns.clustermap(pd.DataFrame(sim_all_seq, index=names, columns=names),
                        row_linkage=lm_all_seq,
                        col_linkage=lm_all_seq,
                        cmap='Blues',
                        yticklabels=True,
                        figsize=get_figsize(1, aspect=1)
                       ).fig.suptitle('Seq IC sim');
# plt.savefig(f"{figures}/all.heatmap.pdf")
# plt.savefig(f"{figures}/all.heatmap.png")

Align motifs

In [117]:
from basepair.modisco.motif_clustering import create_pattern_table, align_clustered_patterns
# Order the patterns by the dendrogram leaves and align them within clusters
# on the seq-IC track so their logos line up.
np.random.seed(42)
cluster_order = np.argsort(leaves_list(lm_all_seq))
cluster = clusters_all_seq

patterns_all_clustered = align_clustered_patterns(patterns, cluster_order, cluster, 
                                                 align_track='seq_ic',
                                                 metric='continousjaccard',
                                                 # don't shift the major patterns
                                                 # by more than 15 when aligning
                                                 trials=20,
                                                 max_shift=15)
100%|██████████| 49/49 [00:00<00:00, 174.10it/s]
In [118]:
def create_pattern_table(patterns,
                         logo_len=30,
                         footprint_width=320,
                         footprint_kwargs=None,
                         seqlogo_kwargs=None,
                         # report_url='results.html'
                         n_jobs=10):
    """Creates the pattern table given a list of patterns

    For every pattern it emits a row containing the pattern's
    attrs['features'], its cluster id, its name, and inline-HTML
    sequence/contribution logos.

    Args:
      patterns: list of patterns with attrs['features'] (containing
        'n seqlets') and attrs['cluster'] set
      logo_len: length the pattern is resized to before rendering logos
      footprint_width: <img> width for the footprint sparklines
        (only used by the currently commented-out footprint code)
      footprint_kwargs: kwargs for vdom_footprint (currently unused as well)
      seqlogo_kwargs: extra kwargs forwarded to the pattern's vdom_plot
      n_jobs: number of parallel workers used to build the rows

    Returns:
      pd.DataFrame with one row per pattern
    """
    # Normalize the optional kwarg dicts: the previous version expanded
    # **seqlogo_kwargs directly and crashed with TypeError when the
    # default (None) was used.
    seqlogo_kwargs = seqlogo_kwargs or {}
    footprint_kwargs = footprint_kwargs or {}

    # get profile medians for each task across all patterns
    tasks = patterns[0].tasks()
#     profile_max_median = {t: np.median([p.profile[t].max() for p in patterns])
#                           for t in tasks}

    def extract_info(pattern_i):
        """Build the table row (a dict) for patterns[pattern_i]."""
        p = patterns[pattern_i]
        # Copy the features dict so building the table does not mutate the
        # pattern's own attrs as a side effect.
        d = dict(p.attrs['features'])
        # d['pattern'] = pattern_url(d['pattern'], report_url)
        d['cluster'] = p.attrs['cluster']
        d['pattern'] = p.name

        # Add seqlogos (rendered to inline-HTML <img> tags)
        d['logo_imp'] = p.resize(logo_len).vdom_plot('contrib', as_html=True, **seqlogo_kwargs)
        d['logo_seq'] = p.resize(logo_len).vdom_plot('seq', as_html=True, **seqlogo_kwargs)

#         for t in p.tasks():
#             # add profile sparklines
#             d[t + '/f'] = (vdom_footprint(p.profile[t],
#                                           r_height=profile_max_median[t],
#                                           text=None,
#                                           **footprint_kwargs)
#                            .to_html()
#                            .replace("<img", f"<img width={footprint_width}")
#                            )

#             # Add directness score
#             d[t + "/d"] = 2 * d[t + " imp profile"] - d[t + " imp counts"]
        return d

    rows = Parallel(n_jobs=n_jobs)(delayed(extract_info)(i) for i in tqdm(range(len(patterns))))
    df = pd.DataFrame(rows)

    # add normalized features
#     for t in tasks:
#         df[t + '/d_p'] = perc(df[t + '/d'])
        
    # use float for n seqlets
    df['n seqlets'] = df['n seqlets'].astype(float)
    return df
In [119]:
patterns_all_clustered[0].attrs
Out[119]:
{'features': {'n seqlets': 306},
 'align': {'use_rc': False, 'offset': -5},
 'cluster': 0}
In [120]:
pattern_table_all = create_pattern_table(patterns_all_clustered,
                                         logo_len=70, 
                                         seqlogo_kwargs=dict(width=420),
                                         n_jobs=20,
                                         footprint_width=120,
                                         footprint_kwargs=dict(figsize=(3,1.5)))
100%|██████████| 49/49 [00:09<00:00,  5.36it/s]
In [121]:
pattern_table_all.head()
Out[121]:
cluster logo_imp logo_seq n seqlets pattern
0 0 <img width=420 src="d... <img width=420 src="d... 306.0 Oct4_p3
1 0 <img width=420 src="d... <img width=420 src="d... 303.0 Sox2_p3
2 0 <img width=420 src="d... <img width=420 src="d... 614.0 Klf4_p4
3 0 <img width=420 src="d... <img width=420 src="d... 2668.0 Sox2_p0
4 0 <img width=420 src="d... <img width=420 src="d... 5430.0 Oct4_p0
In [122]:
del pattern_table_all['cluster']

write out a table

In [123]:
from basepair.modisco.table import write_modisco_table

def get_first_columns(df, cols):
    """Reorder `df` so `cols` come first; remaining columns keep their order."""
    leading = set(cols)
    ordered = list(cols) + [c for c in df.columns if c not in leading]
    return df[ordered]
In [124]:
first_columns = ['pattern', 'n seqlets', 'logo_seq', 'logo_imp']
In [125]:
# Add a running row index and move the key columns to the front of the table.
pattern_table_all['i'] = np.arange(len(pattern_table_all), dtype=int)
pattern_table_all = get_first_columns(pattern_table_all, ['i'] + first_columns)
# (modisco_dirs / 'motif_clustering').mkdir(exist_ok=True)
# write_modisco_table(pattern_table_all, modisco_dirs, report_url='results.html', prefix='pattern_table', write_csv=False, doc={})
In [126]:
# TODO - for each TF 
In [127]:
ref_modisco_dir = model_dir /f"deeplift/all/out/profile/wn/"
In [128]:
ref_patterns = read_pkl(ref_modisco_dir / 'patterns.pkl')
In [173]:
ref_patterns = [p for p in ref_patterns if p.attrs['features']['n seqlets'] > 100]
In [133]:
dfp = pd.read_csv(ref_modisco_dir / 'motif_clustering/patterns_long.csv')
In [228]:
dfp_short = pd.read_csv(ref_modisco_dir / 'motif_clustering/patterns_short.csv')
In [134]:
shorten_pattern(ref_patterns[0].name)
Out[134]:
'm6_p11'
In [140]:
# Index the joint-run patterns by short name, then select those listed in the
# long-pattern table with more than 100 seqlets (keeping the table's order).
d_ref_patterns_te = {shorten_pattern(p.name): p for p in ref_patterns}
ref_patterns_te =  [d_ref_patterns_te[pn] for pn in dfp[dfp['n seqlets'] > 100].pattern]
In [141]:
p = ref_patterns_te[0]
In [142]:
def similarity_matrix_two_patterns(patterns, patterns_b, **kwargs):
    """Similarity of every pattern in `patterns` against every one in `patterns_b`.

    Each matrix entry is the larger of the forward and reverse-complement
    similarity. Returns (closest_patterns, dfm) where `closest_patterns`
    holds, for every row pattern, a tuple of (row pattern aligned to its best
    match, best match, best similarity), and `dfm` is the full name-indexed
    similarity DataFrame.
    """
    sim = np.zeros((len(patterns), len(patterns_b)))
    closest = []
    for row, pat_a in enumerate(tqdm(patterns)):
        for col, pat_b in enumerate(patterns_b):
            fwd = pat_a.similarity(pat_b, **kwargs)
            rev = pat_a.rc().similarity(pat_b, **kwargs)
            sim[row, col] = max(fwd, rev)
        best = sim[row].argmax()
        closest.append((pat_a.align(patterns_b[best], **kwargs),
                        patterns_b[best],
                        sim[row].max()))
    dfm = pd.DataFrame(sim,
                       index=[p.name for p in patterns],
                       columns=[p.name for p in patterns_b])
    return closest, dfm
In [144]:
# NOTE(review): this filter (seq IC > 34) is recomputed further down with
# > 30 before being used — confirm which threshold is intended.
te_merged_patterns = [p for p in patterns_all_clustered if p.seq_info_content > 34]
In [145]:
plt.hist([p.seq_info_content for p in patterns], 30);

TODO

  1. cluster
  2. how many clustered patterns do we have in common? -> define a similarity threshold
  3. Which motifs does one method find and not the other one?

Long patterns

In [271]:
# "Long" patterns: merged per-TF patterns with high sequence information content.
te_merged_patterns = [p for p in patterns_all_clustered if p.seq_info_content > 30]
In [272]:
# NOTE(review): this duplicates the earlier cell that builds ref_patterns_te.
d_ref_patterns_te = {shorten_pattern(p.name): p for p in ref_patterns}
ref_patterns_te =  [d_ref_patterns_te[pn] for pn in dfp[dfp['n seqlets'] > 100].pattern]
In [273]:
pattern_pairs, m = similarity_matrix_two_patterns(te_merged_patterns, ref_patterns_te)
100%|██████████| 21/21 [00:05<00:00,  4.03it/s]
In [274]:
pattern_pairs2, m2 = similarity_matrix_two_patterns(ref_patterns_te, te_merged_patterns)
100%|██████████| 55/55 [00:05<00:00, 10.26it/s]
In [275]:
# Heatmap with columns clustered only; rows keep their given order
# (row_cluster=False), so lrow is computed but unused.
# NOTE(review): linkage() is given the 2-D matrix directly, which scipy treats
# as an observation matrix (clustering rows by their similarity profiles), not
# as a condensed distance — confirm this is intended.
lrow = linkage(m, 'ward', optimal_ordering=True)
lcol = linkage(m.T, 'ward', optimal_ordering=True)
cm_te = sns.clustermap(m,
                        #row_linkage=lrow,
                        row_cluster=False,
                        col_linkage=lcol,
                       figsize=(12, 7),
                        cmap='Blues'
                       ).fig.suptitle('Seq IC sim');
In [276]:
# Same comparison from the opposite direction (m2 = ref -> merged); rows stay
# unclustered (row_cluster=False), so lrow is unused.
lrow = linkage(m2, 'ward', optimal_ordering=True)
lcol = linkage(m2.T, 'ward', optimal_ordering=True)
cm_te = sns.clustermap(m2,
                        row_linkage=None,
                        row_cluster=False,
                        col_linkage=lcol,
                        cmap='Blues', figsize=(7, 12),
                       ).fig.suptitle('Seq IC sim');
In [277]:
m.max(axis=1).plot.hist(30)  # best match per row
plt.xlabel("Best match from joint run to each per-TF run")
Out[277]:
Text(0.5, 0, 'Best match from joint run to each per-TF run')
In [278]:
m2.max(axis=1).plot.hist(30)  # best match per row
plt.xlabel("Best match from per-TF run to each joint run")
Out[278]:
Text(0.5, 0, 'Best match from per-TF run to each joint run')

per-TF -> Joint run

In [279]:
pattern_pairs[0][0].attrs['features']['n seqlets']
Out[279]:
224

Least similar

At a 0.35 threshold, we would include all the other patterns

In [280]:
# Weak matches (similarity <= 0.4): plot each pair as seq-IC logos — first the
# row pattern, then its best match — with seqlet counts in the titles.
for pa, pb, sim_score in pattern_pairs:
    if sim_score > 0.4:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Most similar

In [281]:
# Strong matches (similarity > 0.4): same pairwise logo plots as above.
for pa, pb, sim_score in pattern_pairs:
    if sim_score <= 0.4:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Joint run -> per-TF

Least similar

In [282]:
# Weak matches (similarity <= 0.4) in the reverse direction (pattern_pairs2:
# joint run rows matched against the merged per-TF patterns).
for pa, pb, sim_score in pattern_pairs2:
    if sim_score > 0.4:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Most similar

In [283]:
# Strong matches (similarity > 0.4) in the reverse direction.
for pa, pb, sim_score in pattern_pairs2:
    if sim_score <= 0.4:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Short patterns

In [284]:
# "Short" patterns: low seq-IC merged patterns with substantial seqlet support.
short_merged_patterns = [p for p in patterns_all_clustered if p.seq_info_content <= 30 and p.attrs['features']['n seqlets'] > 300]
In [285]:
# Joint-run short patterns: indexed by short name, filtered to >300 seqlets in
# the order given by the short-pattern table.
d_ref_patterns_short = {shorten_pattern(p.name): p for p in ref_patterns}
ref_patterns_short =  [d_ref_patterns_short[pn] for pn in dfp_short[dfp_short['n seqlets'] > 300].pattern]
In [286]:
pattern_pairs, m = similarity_matrix_two_patterns(short_merged_patterns, ref_patterns_short)
100%|██████████| 18/18 [00:02<00:00,  7.11it/s]
In [287]:
pattern_pairs2, m2 = similarity_matrix_two_patterns(ref_patterns_short, short_merged_patterns)
100%|██████████| 35/35 [00:03<00:00,  9.39it/s]
In [288]:
m2.shape
Out[288]:
(35, 18)
In [289]:
m.shape
Out[289]:
(18, 35)
In [290]:
# Short-pattern heatmap; columns clustered only (row_cluster=False, lrow unused).
# NOTE(review): linkage() is given the 2-D matrix directly — scipy treats it as
# an observation matrix, not a condensed distance; confirm intended.
lrow = linkage(m, 'ward', optimal_ordering=True)
lcol = linkage(m.T, 'ward', optimal_ordering=True)
cm_te = sns.clustermap(m,
                        #row_linkage=lrow,
                        row_cluster=False,
                        col_linkage=lcol,
                       figsize=(7, 5),
                        cmap='Blues'
                       ).fig.suptitle('Seq IC sim');
In [291]:
# Short-pattern heatmap from the opposite direction; rows unclustered (lrow unused).
lrow = linkage(m2, 'ward', optimal_ordering=True)
lcol = linkage(m2.T, 'ward', optimal_ordering=True)
cm_te = sns.clustermap(m2,
                        row_linkage=None,
                        row_cluster=False,
                        col_linkage=lcol,
                        cmap='Blues', figsize=(5, 7),
                       ).fig.suptitle('Seq IC sim');
In [292]:
m.max(axis=1).plot.hist(30)  # best match per row
plt.xlabel("Best match from joint run to each per-TF run")
Out[292]:
Text(0.5, 0, 'Best match from joint run to each per-TF run')
In [293]:
m2.max(axis=1).plot.hist(30)  # best match per row
plt.xlabel("Best match from per-TF run to each joint run")
Out[293]:
Text(0.5, 0, 'Best match from per-TF run to each joint run')

per-TF -> Joint run

In [294]:
pattern_pairs[0][0].attrs['features']['n seqlets']
Out[294]:
306

Least similar

At a 0.35 threshold, we would include all the other patterns

TODO: the similarity thresholds differ between the long-pattern (0.4) and short-pattern (0.75) comparisons — unify or justify them.

In [295]:
# Short patterns, weak matches (similarity <= 0.75): pairwise seq-IC logo plots.
# Note the threshold differs from the 0.4 used for the long patterns above.
for pa, pb, sim_score in pattern_pairs:
    if sim_score > 0.75:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Most similar

In [296]:
# Short patterns, strong matches (similarity > 0.75).
for pa, pb, sim_score in pattern_pairs:
    if sim_score <= 0.75:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Joint run -> per-TF

Least similar

In [297]:
# Short patterns, weak matches (similarity <= 0.75), reverse direction
# (pattern_pairs2: joint-run rows matched against merged per-TF patterns).
for pa, pb, sim_score in pattern_pairs2:
    if sim_score > 0.75:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Most similar

In [298]:
# Short patterns, strong matches (similarity > 0.75), reverse direction.
for pa, pb, sim_score in pattern_pairs2:
    if sim_score <= 0.75:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Joint run

  • cluster the motifs -> show the old plot and the full table with all 100 motifs?