# Imports
# NOTE: this file is a flattened Jupyter notebook; the %-lines are IPython magics.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
# Common paths
# The four TF ChIP-nexus tasks analyzed throughout this notebook.
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
sdir = Path('/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/')
# Output directory of the jointly-trained (OSNK) model run; the directory name
# encodes the hyper-parameter configuration.
model_dir = sdir / 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
# create_tf_session(0)
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc
from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable, vdom_footprint
from basepair.plot.utils import strip_axis
from basepair.plot.tracks import tidy_motif_plot
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *
# Apply the publication matplotlib style.
paper_config()
# mr = ModiscoResult(modisco_dirs / tf / "modisco.h5")
# mr.open()
# Collect the ModISco patterns from each per-TF run and tag them with their
# source task name.
patterns = []


def rename(p, tf):
    """Return a copy of pattern `p` renamed to '<tf>_<short pattern id>'."""
    renamed = p.copy()
    short_id = shorten_pattern(renamed.name).split("_")[1]
    renamed.name = tf + "_" + short_id
    return renamed


for tf in tasks:
    mr = ModiscoResult(model_dir / f"deeplift/{tf}/out/profile/wn/modisco.h5")
    mr.open()
    for pname in mr.patterns():
        tagged = rename(mr.get_pattern(pname), tf)
        # add_attr's return value is what gets stored (as in the original code)
        patterns.append(tagged.add_attr("features",
                                        {"n seqlets": mr.n_seqlets(*pname.split("/"))}))
    mr.close()

# Keep only well-supported motifs: require at least 100 seqlets.
patterns = [p for p in patterns if p.attrs['features']['n seqlets'] > 100]
# Similarity metric used for all pattern-pattern comparisons below.
similarity_metric = 'continousjaccard'  # (sic) continuous Jaccard
track = 'seq_ic'
from basepair.exp.chipnexus.motif_clustering import similarity_matrix
# All-vs-all similarity of the per-TF patterns on the sequence-IC track.
sim_all_seq = similarity_matrix(patterns, track=track, metric=similarity_metric)
from basepair.modisco.core import *
fig = plt.figure(figsize=get_figsize(0.25))
# Upper-triangle indices (k=1): each unordered pair counted exactly once.
iu_all = np.triu_indices(len(sim_all_seq), 1)
plt.hist(sim_all_seq[iu_all], bins=100);
plt.title("Similarity (seq_ic)")
plt.ylabel("Frequency");
plt.xlabel("Similarity between two motifs (all)");
# plt.savefig(f"{figures}/all.similarity.hist.pdf")
# plt.savefig(f"{figures}/all.similarity.hist.png")
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
from basepair.modisco.motif_clustering import to_colors
iu_all = np.triu_indices(len(sim_all_seq), 1)
# Ward linkage on the condensed distance vector (1 - similarity) built from
# the upper triangle of the similarity matrix.
lm_all_seq = linkage(1-sim_all_seq[iu_all], 'ward', optimal_ordering=True)
# Flat cluster assignment: cut the dendrogram into 9 clusters.
clusters_all_seq = cut_tree(lm_all_seq, n_clusters=9)[:,0]
names = [x.name for x in patterns]
# Heatmap of the full similarity matrix, rows and columns ordered by the
# same linkage.
cm_all = sns.clustermap(pd.DataFrame(sim_all_seq, index=names, columns=names),
                        row_linkage=lm_all_seq,
                        col_linkage=lm_all_seq,
                        cmap='Blues',
                        yticklabels=True,
                        figsize=get_figsize(1, aspect=1)
                        ).fig.suptitle('Seq IC sim');
# plt.savefig(f"{figures}/all.heatmap.pdf")
# plt.savefig(f"{figures}/all.heatmap.png")
from basepair.modisco.motif_clustering import create_pattern_table, align_clustered_patterns
np.random.seed(42)
# Position of each pattern in the optimal leaf ordering of the dendrogram.
cluster_order = np.argsort(leaves_list(lm_all_seq))
cluster = clusters_all_seq
# Align patterns within each cluster on the seq_ic track.
patterns_all_clustered = align_clustered_patterns(patterns, cluster_order, cluster,
                                                  align_track='seq_ic',
                                                  metric='continousjaccard',
                                                  # don't shift the major patterns
                                                  # by more than 15 when aligning
                                                  trials=20,
                                                  max_shift=15)
def create_pattern_table(patterns,
                         logo_len=30,
                         footprint_width=320,
                         footprint_kwargs=None,
                         seqlogo_kwargs=None,
                         # report_url='results.html'
                         n_jobs=10):
    """Create the pattern summary table for a list of patterns.

    In addition to the features already present in ``p.attrs['features']``,
    each row gets the cluster id, the pattern name and two HTML sequence
    logos (contribution scores and sequence).

    Args:
        patterns: list of patterns carrying ``attrs['features']`` and
            ``attrs['cluster']``
        logo_len: width (bp) each pattern is resized to before plotting
        footprint_width: <img> width in px for the footprint sparklines
            (currently unused -- footprint rendering is commented out below)
        footprint_kwargs: kwargs for the footprint plots (currently unused)
        seqlogo_kwargs: extra kwargs forwarded to ``vdom_plot`` for the logos
        n_jobs: number of parallel jobs used to render the rows

    Returns:
        pd.DataFrame with one row per pattern.
    """
    # Bug fix: the None defaults used to be unpacked directly with `**`,
    # which raised a TypeError whenever the caller omitted these kwargs.
    seqlogo_kwargs = {} if seqlogo_kwargs is None else seqlogo_kwargs
    footprint_kwargs = {} if footprint_kwargs is None else footprint_kwargs
    # get profile medians for each task across all patterns
    tasks = patterns[0].tasks()
    # profile_max_median = {t: np.median([p.profile[t].max() for p in patterns])
    #                       for t in tasks}

    def extract_info(pattern_i):
        """Build the table row (a dict) for ``patterns[pattern_i]``."""
        p = patterns[pattern_i]
        # NOTE: this aliases (and therefore mutates) p.attrs['features'];
        # kept as-is to preserve the original behavior.
        d = p.attrs['features']
        # Add motif frequencies
        # d['pattern'] = pattern_url(d['pattern'], report_url)
        d['cluster'] = p.attrs['cluster']
        d['pattern'] = p.name
        # Add seqlogos
        d['logo_imp'] = p.resize(logo_len).vdom_plot('contrib', as_html=True, **seqlogo_kwargs)
        d['logo_seq'] = p.resize(logo_len).vdom_plot('seq', as_html=True, **seqlogo_kwargs)
        # for t in p.tasks():
        #     # add profile sparklines
        #     d[t + '/f'] = (vdom_footprint(p.profile[t],
        #                                   r_height=profile_max_median[t],
        #                                   text=None,
        #                                   **footprint_kwargs)
        #                    .to_html()
        #                    .replace("<img", f"<img width={footprint_width}")
        #                    )
        #     # Add directness score
        #     d[t + "/d"] = 2 * d[t + " imp profile"] - d[t + " imp counts"]
        return d

    rows = Parallel(n_jobs=n_jobs)(delayed(extract_info)(i) for i in tqdm(range(len(patterns))))
    df = pd.DataFrame(rows)
    # add normalized features
    # for t in tasks:
    #     df[t + '/d_p'] = perc(df[t + '/d'])
    # use float for n seqlets
    df['n seqlets'] = df['n seqlets'].astype(float)
    return df
# Inspect the attributes of the first aligned pattern.
patterns_all_clustered[0].attrs
# Build the summary table (HTML logos etc.) for all clustered patterns.
# NOTE: the local `create_pattern_table` defined above shadows the one
# imported from basepair.modisco.motif_clustering.
pattern_table_all = create_pattern_table(patterns_all_clustered,
                                         logo_len=70,
                                         seqlogo_kwargs=dict(width=420),
                                         n_jobs=20,
                                         footprint_width=120,
                                         footprint_kwargs=dict(figsize=(3,1.5)))
pattern_table_all.head()
# Drop the cluster column from the displayed table.
del pattern_table_all['cluster']
from basepair.modisco.table import write_modisco_table
def get_first_columns(df, cols):
    """Reorder `df` so `cols` come first; the other columns keep their order."""
    trailing = [c for c in df.columns if c not in cols]
    return df[cols + trailing]
# Columns to show first in the report table.
first_columns = ['pattern', 'n seqlets', 'logo_seq', 'logo_imp']
# Running row index used as the leading column.
pattern_table_all['i'] = np.arange(len(pattern_table_all), dtype=int)
pattern_table_all = get_first_columns(pattern_table_all, ['i'] + first_columns)
# (modisco_dirs / 'motif_clustering').mkdir(exist_ok=True)
# write_modisco_table(pattern_table_all, modisco_dirs, report_url='results.html', prefix='pattern_table', write_csv=False, doc={})
# TODO - for each TF
# Reference patterns from the jointly-trained ("all" TFs) ModISco run.
ref_modisco_dir = model_dir /f"deeplift/all/out/profile/wn/"
ref_patterns = read_pkl(ref_modisco_dir / 'patterns.pkl')
# Same seqlet-support filter as for the per-TF patterns.
ref_patterns = [p for p in ref_patterns if p.attrs['features']['n seqlets'] > 100]
dfp = pd.read_csv(ref_modisco_dir / 'motif_clustering/patterns_long.csv')
dfp_short = pd.read_csv(ref_modisco_dir / 'motif_clustering/patterns_short.csv')
shorten_pattern(ref_patterns[0].name)
# Map short pattern name -> pattern, then order per the long-pattern table.
d_ref_patterns_te = {shorten_pattern(p.name): p for p in ref_patterns}
ref_patterns_te = [d_ref_patterns_te[pn] for pn in dfp[dfp['n seqlets'] > 100].pattern]
p = ref_patterns_te[0]
def similarity_matrix_two_patterns(patterns, patterns_b, **kwargs):
    """Compute the pairwise similarity matrix between two lists of patterns.

    For every pair, the similarity is the maximum over the forward and the
    reverse-complement orientation of ``patterns[i]``.

    Args:
        patterns: list of patterns (matrix rows)
        patterns_b: list of patterns (matrix columns)
        **kwargs: forwarded to ``Pattern.similarity`` and ``Pattern.align``

    Returns:
        (closest_patterns, dfm) where ``closest_patterns`` is a list of
        tuples (patterns[i] aligned to its best match, the best-matching
        pattern from patterns_b, the best similarity score) and ``dfm`` is
        the (len(patterns), len(patterns_b)) DataFrame of similarities
        indexed by pattern names.
    """
    m = np.zeros((len(patterns), len(patterns_b)))
    closest_patterns = []
    for i in tqdm(range(len(patterns))):
        for j in range(len(patterns_b)):
            # Strand-invariant similarity: best of forward and reverse-complement.
            m[i, j] = max(patterns[i].similarity(patterns_b[j], **kwargs),
                          patterns[i].rc().similarity(patterns_b[j], **kwargs))
        j_best = m[i].argmax()
        # NOTE(review): the forward-orientation pattern is aligned even when
        # the rc orientation scored higher -- confirm `align` handles strand.
        closest_patterns.append((patterns[i].align(patterns_b[j_best], **kwargs), patterns_b[j_best], m[i].max()))
    dfm = pd.DataFrame(m, index=[p.name for p in patterns],
                       columns = [p.name for p in patterns_b])
    return closest_patterns, dfm
# NOTE(review): this line is immediately overridden below (threshold 34 -> 30).
te_merged_patterns = [p for p in patterns_all_clustered if p.seq_info_content > 34]
# Distribution of per-pattern sequence information content.
plt.hist([p.seq_info_content for p in patterns], 30);
# High-information-content motifs from the merged per-TF patterns.
te_merged_patterns = [p for p in patterns_all_clustered if p.seq_info_content > 30]
d_ref_patterns_te = {shorten_pattern(p.name): p for p in ref_patterns}
ref_patterns_te = [d_ref_patterns_te[pn] for pn in dfp[dfp['n seqlets'] > 100].pattern]
# Cross-similarity in both directions: per-TF vs joint run and vice versa.
pattern_pairs, m = similarity_matrix_two_patterns(te_merged_patterns, ref_patterns_te)
pattern_pairs2, m2 = similarity_matrix_two_patterns(ref_patterns_te, te_merged_patterns)
lrow = linkage(m, 'ward', optimal_ordering=True)
lcol = linkage(m.T, 'ward', optimal_ordering=True)
# Rows kept in input order (row_cluster=False); columns clustered.
cm_te = sns.clustermap(m,
                       #row_linkage=lrow,
                       row_cluster=False,
                       col_linkage=lcol,
                       figsize=(12, 7),
                       cmap='Blues'
                       ).fig.suptitle('Seq IC sim');
lrow = linkage(m2, 'ward', optimal_ordering=True)
lcol = linkage(m2.T, 'ward', optimal_ordering=True)
cm_te = sns.clustermap(m2,
                       row_linkage=None,
                       row_cluster=False,
                       col_linkage=lcol,
                       cmap='Blues', figsize=(7, 12),
                       ).fig.suptitle('Seq IC sim');
m.max(axis=1).plot.hist(30)  # best match per row
plt.xlabel("Best match from joint run to each per-TF run")
m2.max(axis=1).plot.hist(30)  # best match per row
plt.xlabel("Best match from per-TF run to each joint run")
# Seqlet support of the first aligned pattern.
pattern_pairs[0][0].attrs['features']['n seqlets']
At the 0.35 threshold, we include all the other patterns
# Plot the pairs whose best-match similarity is BELOW the 0.4 threshold
# (per-TF pattern on top, its closest joint-run pattern below).
for pa, pb, sim_score in pattern_pairs:
    if sim_score > 0.4:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    # Title shows the pattern name and its seqlet count.
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")
# Same plots for the well-matched pairs (similarity ABOVE 0.4).
for pa, pb, sim_score in pattern_pairs:
    if sim_score <= 0.4:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")
# Reverse direction (joint run vs per-TF run): poor matches (< 0.4).
for pa, pb, sim_score in pattern_pairs2:
    if sim_score > 0.4:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")
# Reverse direction: good matches (> 0.4).
for pa, pb, sim_score in pattern_pairs2:
    if sim_score <= 0.4:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")
# Low-information-content ("short") motifs; require stronger support (>300 seqlets).
short_merged_patterns = [p for p in patterns_all_clustered if p.seq_info_content <= 30 and p.attrs['features']['n seqlets'] > 300]
d_ref_patterns_short = {shorten_pattern(p.name): p for p in ref_patterns}
ref_patterns_short = [d_ref_patterns_short[pn] for pn in dfp_short[dfp_short['n seqlets'] > 300].pattern]
# Cross-similarity in both directions: per-TF vs joint run and vice versa.
pattern_pairs, m = similarity_matrix_two_patterns(short_merged_patterns, ref_patterns_short)
pattern_pairs2, m2 = similarity_matrix_two_patterns(ref_patterns_short, short_merged_patterns)
m2.shape
m.shape
lrow = linkage(m, 'ward', optimal_ordering=True)
lcol = linkage(m.T, 'ward', optimal_ordering=True)
# Rows kept in input order (row_cluster=False); columns clustered.
cm_te = sns.clustermap(m,
                       #row_linkage=lrow,
                       row_cluster=False,
                       col_linkage=lcol,
                       figsize=(7, 5),
                       cmap='Blues'
                       ).fig.suptitle('Seq IC sim');
lrow = linkage(m2, 'ward', optimal_ordering=True)
lcol = linkage(m2.T, 'ward', optimal_ordering=True)
cm_te = sns.clustermap(m2,
                       row_linkage=None,
                       row_cluster=False,
                       col_linkage=lcol,
                       cmap='Blues', figsize=(5, 7),
                       ).fig.suptitle('Seq IC sim');
m.max(axis=1).plot.hist(30)  # best match per row
plt.xlabel("Best match from joint run to each per-TF run")
m2.max(axis=1).plot.hist(30)  # best match per row
plt.xlabel("Best match from per-TF run to each joint run")
# Seqlet support of the first aligned pattern.
pattern_pairs[0][0].attrs['features']['n seqlets']
At the 0.35 threshold, we include all the other patterns
# Plot the short-motif pairs with best-match similarity BELOW the 0.75
# threshold (per-TF pattern on top, closest joint-run pattern below).
for pa, pb, sim_score in pattern_pairs:
    if sim_score > 0.75:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    # Title shows the pattern name and its seqlet count.
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")
# Same plots for the well-matched pairs (similarity ABOVE 0.75).
for pa, pb, sim_score in pattern_pairs:
    if sim_score <= 0.75:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")
# Reverse direction (joint run vs per-TF run): poor matches (< 0.75).
for pa, pb, sim_score in pattern_pairs2:
    if sim_score > 0.75:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")
# Reverse direction: good matches (> 0.75).
for pa, pb, sim_score in pattern_pairs2:
    if sim_score <= 0.75:
        continue
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.title(pa.name + f" ({int(pa.attrs['features']['n seqlets'])})")
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    plt.title(pb.name + f" ({int(pb.attrs['features']['n seqlets'])})")