Goal

  • cluster the motifs per tf
  • match the TEs to the all/deeplift/profile run

Tasks

  • Load patterns from each task
    • add the annotation where they came from
  • merge all together
  • get the affinity matrix
  • cluster them
  • write out a table
In [3]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
Using TensorFlow backend.
In [4]:
# Common paths
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
In [5]:
sdir = Path('/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/')
In [6]:
model_dir = sdir / 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
In [7]:
# create_tf_session(0)
In [8]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable, vdom_footprint
from basepair.plot.utils import strip_axis
from basepair.plot.tracks import tidy_motif_plot
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *

paper_config()

Load patterns from each task

  • add the annotation where they came from
In [9]:
# mr = ModiscoResult(modisco_dirs / tf / "modisco.h5")
# mr.open()
In [12]:
patterns = []

def rename(p, tf):
    """Return a copy of pattern `p` renamed to '<tf>_<pattern id>'."""
    renamed = p.copy()
    suffix = shorten_pattern(p.name).split("_")[1]
    renamed.name = tf + "_" + suffix
    return renamed

# Collect the modisco patterns from every per-TF run, tagging each one with
# the TF it came from and its seqlet count.
for tf in tasks:
    mr = ModiscoResult(model_dir / f"deeplift/{tf}/out/profile/wn/modisco.h5")
    mr.open()
    for pname in mr.patterns():
        pattern = rename(mr.get_pattern(pname), tf)
        pattern = pattern.add_attr("features",
                                   {"n seqlets": mr.n_seqlets(*pname.split("/"))})
        patterns.append(pattern)
    mr.close()
In [112]:
# Drop weakly supported motifs: keep only patterns backed by more than 100 seqlets.
patterns = [p for p in patterns if p.attrs['features']['n seqlets'] > 100] # require 100 seqlets at least

get the affinity matrix

In [209]:
# NOTE(review): the original inline comment listed the same value as the
# "alternative" metric — presumably a different option was meant; confirm
# which metrics similarity_matrix supports.
similarity_metric = 'continousjaccard'  # metric name as spelled in the library API
track = 'seq_ic'  # compare patterns on the sequence information-content track
In [210]:
from basepair.exp.chipnexus.motif_clustering import similarity_matrix
# All-vs-all similarity of the merged per-TF patterns on the chosen track.
sim_all_seq = similarity_matrix(patterns, track=track, metric=similarity_metric)
100%|██████████| 49/49 [00:10<00:00,  4.71it/s]
In [211]:
from basepair.modisco.core import *
In [212]:
# Histogram of pairwise motif similarities. Only the upper triangle is used so
# each pair is counted once and the self-similarity diagonal is excluded.
fig = plt.figure(figsize=get_figsize(0.25))
iu_all = np.triu_indices(len(sim_all_seq), 1)
plt.hist(sim_all_seq[iu_all], bins=100);
plt.title("Similarity (seq_ic)")
plt.ylabel("Frequency");
plt.xlabel("Similarity between two motifs (all)");
# plt.savefig(f"{figures}/all.similarity.hist.pdf")
# plt.savefig(f"{figures}/all.similarity.hist.png")
In [213]:
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
from basepair.modisco.motif_clustering import to_colors

cluster them

In [214]:
# Hierarchical clustering of the motifs: turn similarity into a condensed
# distance vector (1 - sim over the upper triangle), run Ward linkage, then
# cut the tree into 9 clusters.
iu_all = np.triu_indices(len(sim_all_seq), 1)
lm_all_seq = linkage(1-sim_all_seq[iu_all], 'ward', optimal_ordering=True)
clusters_all_seq = cut_tree(lm_all_seq, n_clusters=9)[:,0]
names = [x.name for x in patterns]
# The same linkage is used for rows and columns -> symmetric heatmap.
cm_all = sns.clustermap(pd.DataFrame(sim_all_seq, index=names, columns=names),
                        row_linkage=lm_all_seq,
                        col_linkage=lm_all_seq,
                        cmap='Blues',
                        yticklabels=True,
                        figsize=get_figsize(1, aspect=1)
                       ).fig.suptitle('Seq IC sim');
# plt.savefig(f"{figures}/all.heatmap.pdf")
# plt.savefig(f"{figures}/all.heatmap.png")

Align motifs

In [117]:
from basepair.modisco.motif_clustering import create_pattern_table, align_clustered_patterns
# Order the patterns by the dendrogram leaves and align them within clusters
# on the seq-IC track so their logos line up.
np.random.seed(42)
cluster_order = np.argsort(leaves_list(lm_all_seq))
cluster = clusters_all_seq

patterns_all_clustered = align_clustered_patterns(patterns, cluster_order, cluster, 
                                                 align_track='seq_ic',
                                                 metric='continousjaccard',
                                                 # don't shift the major patterns
                                                 # by more than 15 when aligning
                                                 trials=20,
                                                 max_shift=15)
100%|██████████| 49/49 [00:00<00:00, 174.10it/s]
In [118]:
def create_pattern_table(patterns,
                         logo_len=30,
                         footprint_width=320,
                         footprint_kwargs=None,
                         seqlogo_kwargs=None,
                         # report_url='results.html'
                         n_jobs=10):
    """Creates the pattern table given a list of patterns

    For every pattern it emits a row containing the pattern's
    attrs['features'], its cluster id, its name, and inline-HTML
    sequence/contribution logos.

    Args:
      patterns: list of patterns with attrs['features'] (containing
        'n seqlets') and attrs['cluster'] set
      logo_len: length the pattern is resized to before rendering logos
      footprint_width: <img> width for the footprint sparklines
        (only used by the currently commented-out footprint code)
      footprint_kwargs: kwargs for vdom_footprint (currently unused as well)
      seqlogo_kwargs: extra kwargs forwarded to the pattern's vdom_plot
      n_jobs: number of parallel workers used to build the rows

    Returns:
      pd.DataFrame with one row per pattern
    """
    # Normalize the optional kwarg dicts: the previous version expanded
    # **seqlogo_kwargs directly and crashed with TypeError when the
    # default (None) was used.
    seqlogo_kwargs = seqlogo_kwargs or {}
    footprint_kwargs = footprint_kwargs or {}

    # get profile medians for each task across all patterns
    tasks = patterns[0].tasks()
#     profile_max_median = {t: np.median([p.profile[t].max() for p in patterns])
#                           for t in tasks}

    def extract_info(pattern_i):
        """Build the table row (a dict) for patterns[pattern_i]."""
        p = patterns[pattern_i]
        # Copy the features dict so building the table does not mutate the
        # pattern's own attrs as a side effect.
        d = dict(p.attrs['features'])
        # d['pattern'] = pattern_url(d['pattern'], report_url)
        d['cluster'] = p.attrs['cluster']
        d['pattern'] = p.name

        # Add seqlogos (rendered to inline-HTML <img> tags)
        d['logo_imp'] = p.resize(logo_len).vdom_plot('contrib', as_html=True, **seqlogo_kwargs)
        d['logo_seq'] = p.resize(logo_len).vdom_plot('seq', as_html=True, **seqlogo_kwargs)

#         for t in p.tasks():
#             # add profile sparklines
#             d[t + '/f'] = (vdom_footprint(p.profile[t],
#                                           r_height=profile_max_median[t],
#                                           text=None,
#                                           **footprint_kwargs)
#                            .to_html()
#                            .replace("<img", f"<img width={footprint_width}")
#                            )

#             # Add directness score
#             d[t + "/d"] = 2 * d[t + " imp profile"] - d[t + " imp counts"]
        return d

    rows = Parallel(n_jobs=n_jobs)(delayed(extract_info)(i) for i in tqdm(range(len(patterns))))
    df = pd.DataFrame(rows)

    # add normalized features
#     for t in tasks:
#         df[t + '/d_p'] = perc(df[t + '/d'])
        
    # use float for n seqlets
    df['n seqlets'] = df['n seqlets'].astype(float)
    return df
In [119]:
patterns_all_clustered[0].attrs
Out[119]:
{'features': {'n seqlets': 306},
 'align': {'use_rc': False, 'offset': -5},
 'cluster': 0}
In [120]:
pattern_table_all = create_pattern_table(patterns_all_clustered,
                                         logo_len=70, 
                                         seqlogo_kwargs=dict(width=420),
                                         n_jobs=20,
                                         footprint_width=120,
                                         footprint_kwargs=dict(figsize=(3,1.5)))
100%|██████████| 49/49 [00:09<00:00,  5.36it/s]
In [121]:
pattern_table_all.head()
Out[121]:
cluster logo_imp logo_seq n seqlets pattern
0 0 <img width=420 src="d... <img width=420 src="d... 306.0 Oct4_p3
1 0 <img width=420 src="d... <img width=420 src="d... 303.0 Sox2_p3
2 0 <img width=420 src="d... <img width=420 src="d... 614.0 Klf4_p4
3 0 <img width=420 src="d... <img width=420 src="d... 2668.0 Sox2_p0
4 0 <img width=420 src="d... <img width=420 src="d... 5430.0 Oct4_p0
In [122]:
del pattern_table_all['cluster']

write out a table

In [123]:
from basepair.modisco.table import write_modisco_table

def get_first_columns(df, cols):
    """Reorder `df` so `cols` come first; remaining columns keep their order."""
    leading = set(cols)
    ordered = list(cols) + [c for c in df.columns if c not in leading]
    return df[ordered]
In [124]:
first_columns = ['pattern', 'n seqlets', 'logo_seq', 'logo_imp']
In [125]:
# Add a running row index and move the key columns to the front of the table.
pattern_table_all['i'] = np.arange(len(pattern_table_all), dtype=int)
pattern_table_all = get_first_columns(pattern_table_all, ['i'] + first_columns)
# (modisco_dirs / 'motif_clustering').mkdir(exist_ok=True)
# write_modisco_table(pattern_table_all, modisco_dirs, report_url='results.html', prefix='pattern_table', write_csv=False, doc={})
In [126]:
# TODO - for each TF 
In [127]:
ref_modisco_dir = model_dir /f"deeplift/all/out/profile/wn/"
In [128]:
ref_patterns = read_pkl(ref_modisco_dir / 'patterns.pkl')
In [173]:
ref_patterns = [p for p in ref_patterns if p.attrs['features']['n seqlets'] > 100]
In [133]:
dfp = pd.read_csv(ref_modisco_dir / 'motif_clustering/patterns_long.csv')
In [228]:
dfp_short = pd.read_csv(ref_modisco_dir / 'motif_clustering/patterns_short.csv')
In [134]:
shorten_pattern(ref_patterns[0].name)
Out[134]:
'm6_p11'
In [140]:
# Index the joint-run patterns by short name, then select those listed in the
# long-pattern table with more than 100 seqlets (keeping the table's order).
d_ref_patterns_te = {shorten_pattern(p.name): p for p in ref_patterns}
ref_patterns_te =  [d_ref_patterns_te[pn] for pn in dfp[dfp['n seqlets'] > 100].pattern]
In [141]:
p = ref_patterns_te[0]
In [142]:
def similarity_matrix_two_patterns(patterns, patterns_b, **kwargs):
    """Similarity of every pattern in `patterns` against every one in `patterns_b`.

    Each matrix entry is the larger of the forward and reverse-complement
    similarity. Returns (closest_patterns, dfm) where `closest_patterns`
    holds, for every row pattern, a tuple of (row pattern aligned to its best
    match, best match, best similarity), and `dfm` is the full name-indexed
    similarity DataFrame.
    """
    sim = np.zeros((len(patterns), len(patterns_b)))
    closest = []
    for row, pat_a in enumerate(tqdm(patterns)):
        for col, pat_b in enumerate(patterns_b):
            fwd = pat_a.similarity(pat_b, **kwargs)
            rev = pat_a.rc().similarity(pat_b, **kwargs)
            sim[row, col] = max(fwd, rev)
        best = sim[row].argmax()
        closest.append((pat_a.align(patterns_b[best], **kwargs),
                        patterns_b[best],
                        sim[row].max()))
    dfm = pd.DataFrame(sim,
                       index=[p.name for p in patterns],
                       columns=[p.name for p in patterns_b])
    return closest, dfm
In [144]:
# NOTE(review): this filter (seq IC > 34) is recomputed further down with
# > 30 before being used — confirm which threshold is intended.
te_merged_patterns = [p for p in patterns_all_clustered if p.seq_info_content > 34]
In [145]:
plt.hist([p.seq_info_content for p in patterns], 30);

TODO

  1. cluster
  2. how many clustered patterns do we have in common? -> define a similarity threshold
  3. Which motifs does one method find and not the other one?

Long patterns

In [271]:
# "Long" patterns: merged per-TF patterns with high sequence information content.
te_merged_patterns = [p for p in patterns_all_clustered if p.seq_info_content > 30]
In [272]:
# NOTE(review): this duplicates the earlier cell that builds ref_patterns_te.
d_ref_patterns_te = {shorten_pattern(p.name): p for p in ref_patterns}
ref_patterns_te =  [d_ref_patterns_te[pn] for pn in dfp[dfp['n seqlets'] > 100].pattern]
In [273]:
pattern_pairs, m = similarity_matrix_two_patterns(te_merged_patterns, ref_patterns_te)
100%|██████████| 21/21 [00:05<00:00,  4.03it/s]
In [274]:
pattern_pairs2, m2 = similarity_matrix_two_patterns(ref_patterns_te, te_merged_patterns)
100%|██████████| 55/55 [00:05<00:00, 10.26it/s]
In [275]:
# Heatmap with columns clustered only; rows keep their given order
# (row_cluster=False), so lrow is computed but unused.
# NOTE(review): linkage() is given the 2-D matrix directly, which scipy treats
# as an observation matrix (clustering rows by their similarity profiles), not
# as a condensed distance — confirm this is intended.
lrow = linkage(m, 'ward', optimal_ordering=True)
lcol = linkage(m.T, 'ward', optimal_ordering=True)
cm_te = sns.clustermap(m,
                        #row_linkage=lrow,
                        row_cluster=False,
                        col_linkage=lcol,
                       figsize=(12, 7),
                        cmap='Blues'
                       ).fig.suptitle('Seq IC sim');
In [276]:
# Same comparison from the opposite direction (m2 = ref -> merged); rows stay
# unclustered (row_cluster=False), so lrow is unused.
lrow = linkage(m2, 'ward', optimal_ordering=True)
lcol = linkage(m2.T, 'ward', optimal_ordering=True)
cm_te = sns.clustermap(m2,
                        row_linkage=None,
                        row_cluster=False,
                        col_linkage=lcol,
                        cmap='Blues', figsize=(7, 12),
                       ).fig.suptitle('Seq IC sim');
In [277]:
m.max(axis=1).plot.hist(30)  # best match per row
plt.xlabel("Best match from joint run to each per-TF run")
Out[277]:
Text(0.5, 0, 'Best match from joint run to each per-TF run')
In [278]:
m2.max(axis=1).plot.hist(30)  # best match per row
plt.xlabel("Best match from per-TF run to each joint run")
Out[278]:
Text(0.5, 0, 'Best match from per-TF run to each joint run')

per-TF -> Joint run

In [279]:
pattern_pairs[0][0].attrs['features']['n seqlets']
Out[279]:
224

Least similar

At a 0.35 threshold, we would include all the other patterns

In [280]:
# Weak matches (similarity <= 0.4): plot each pair as seq-IC logos — first the
# row pattern, then its best match — with seqlet counts in the titles.
for pa, pb, sim_score in pattern_pairs:
    if sim_score > 0.4:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Most similar

In [281]:
# Strong matches (similarity > 0.4): same pairwise logo plots as above.
for pa, pb, sim_score in pattern_pairs:
    if sim_score <= 0.4:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Joint run -> per-TF

Least similar

In [282]:
# Weak matches (similarity <= 0.4) in the reverse direction (pattern_pairs2:
# joint run rows matched against the merged per-TF patterns).
for pa, pb, sim_score in pattern_pairs2:
    if sim_score > 0.4:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Most similar

In [283]:
# Strong matches (similarity > 0.4) in the reverse direction.
for pa, pb, sim_score in pattern_pairs2:
    if sim_score <= 0.4:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Short patterns

In [284]:
# "Short" patterns: low seq-IC merged patterns with substantial seqlet support.
short_merged_patterns = [p for p in patterns_all_clustered if p.seq_info_content <= 30 and p.attrs['features']['n seqlets'] > 300]
In [285]:
# Joint-run short patterns: indexed by short name, filtered to >300 seqlets in
# the order given by the short-pattern table.
d_ref_patterns_short = {shorten_pattern(p.name): p for p in ref_patterns}
ref_patterns_short =  [d_ref_patterns_short[pn] for pn in dfp_short[dfp_short['n seqlets'] > 300].pattern]
In [286]:
pattern_pairs, m = similarity_matrix_two_patterns(short_merged_patterns, ref_patterns_short)
100%|██████████| 18/18 [00:02<00:00,  7.11it/s]
In [287]:
pattern_pairs2, m2 = similarity_matrix_two_patterns(ref_patterns_short, short_merged_patterns)
100%|██████████| 35/35 [00:03<00:00,  9.39it/s]
In [288]:
m2.shape
Out[288]:
(35, 18)
In [289]:
m.shape
Out[289]:
(18, 35)
In [290]:
# Short-pattern heatmap; columns clustered only (row_cluster=False, lrow unused).
# NOTE(review): linkage() is given the 2-D matrix directly — scipy treats it as
# an observation matrix, not a condensed distance; confirm intended.
lrow = linkage(m, 'ward', optimal_ordering=True)
lcol = linkage(m.T, 'ward', optimal_ordering=True)
cm_te = sns.clustermap(m,
                        #row_linkage=lrow,
                        row_cluster=False,
                        col_linkage=lcol,
                       figsize=(7, 5),
                        cmap='Blues'
                       ).fig.suptitle('Seq IC sim');
In [291]:
# Short-pattern heatmap from the opposite direction; rows unclustered (lrow unused).
lrow = linkage(m2, 'ward', optimal_ordering=True)
lcol = linkage(m2.T, 'ward', optimal_ordering=True)
cm_te = sns.clustermap(m2,
                        row_linkage=None,
                        row_cluster=False,
                        col_linkage=lcol,
                        cmap='Blues', figsize=(5, 7),
                       ).fig.suptitle('Seq IC sim');
In [292]:
m.max(axis=1).plot.hist(30)  # best match per row
plt.xlabel("Best match from joint run to each per-TF run")
Out[292]:
Text(0.5, 0, 'Best match from joint run to each per-TF run')
In [293]:
m2.max(axis=1).plot.hist(30)  # best match per row
plt.xlabel("Best match from per-TF run to each joint run")
Out[293]:
Text(0.5, 0, 'Best match from per-TF run to each joint run')

per-TF -> Joint run

In [294]:
pattern_pairs[0][0].attrs['features']['n seqlets']
Out[294]:
306

Least similar

At a 0.35 threshold, we would include all the other patterns

TODO: the similarity thresholds differ between the long-pattern (0.4) and short-pattern (0.75) comparisons — unify or justify them.

In [295]:
# Short patterns, weak matches (similarity <= 0.75): pairwise seq-IC logo plots.
# Note the threshold differs from the 0.4 used for the long patterns above.
for pa, pb, sim_score in pattern_pairs:
    if sim_score > 0.75:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Most similar

In [296]:
# Short patterns, strong matches (similarity > 0.75).
for pa, pb, sim_score in pattern_pairs:
    if sim_score <= 0.75:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Joint run -> per-TF

Least similar

In [297]:
# Short patterns, weak matches (similarity <= 0.75), reverse direction
# (pattern_pairs2: joint-run rows matched against merged per-TF patterns).
for pa, pb, sim_score in pattern_pairs2:
    if sim_score > 0.75:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Most similar

In [298]:
# Short patterns, strong matches (similarity > 0.75), reverse direction.
for pa, pb, sim_score in pattern_pairs2:
    if sim_score <= 0.75:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")

Joint run

  • cluster the motifs -> show the old plot and the full table with all 100 motifs?