# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
# Common paths
# Root of the trained multi-task ChIP-nexus model (9 dilated conv layers).
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
# Per-TF TF-MoDISco output directories live under this root.
# (plain string: no interpolation needed, dropped the stray f-prefix)
modisco_dirs = model_dir / "modisco/by_peak_tasks/per-tf"
# The four TF tasks the model was trained on, in canonical order.
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
# create_tf_session(0)
# Notebook setup (duplicates the header cell): inline retina figures.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc
from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable, vdom_footprint
from basepair.plot.utils import strip_axis
from basepair.plot.tracks import tidy_motif_plot
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *
# Apply the paper's matplotlib styling (from basepair.imports).
paper_config()
# NOTE(review): the next four lines look like stale scratch cells from the
# notebook — `patterns`, `mr` and `tf` are not defined yet at this point when
# the script is run top-to-bottom (they are created by the loop below), so
# these would raise NameError. Kept verbatim for notebook history.
patterns[0].name
mr
mr = ModiscoResult(modisco_dirs / tf / "modisco.h5")
mr.open()
# Accumulator: renamed patterns from all four TF-specific modisco runs.
patterns = []
def rename(p, tf):
    """Return a copy of pattern `p` renamed to '<tf>_<short-suffix>'.

    The suffix is the part after the first '_' of the shortened pattern
    name, so e.g. 'metacluster_0/pattern_1' -> 'Oct4_p1'-style names.
    """
    renamed = p.copy()
    suffix = shorten_pattern(renamed.name).split("_")[1]
    renamed.name = "{}_{}".format(tf, suffix)
    return renamed
for tf in tasks:
    # Load the per-TF modisco result and collect its patterns, renamed with
    # the TF prefix and annotated with the seqlet count as a table feature.
    mr = ModiscoResult(modisco_dirs / tf / "modisco.h5")
    mr.open()
    tf_patterns = []
    for p in mr.patterns():
        pattern = rename(mr.get_pattern(p), tf)
        pattern = pattern.add_attr("features",
                                   {"n seqlets": mr.n_seqlets(*p.split("/"))})
        tf_patterns.append(pattern)
    # Rebind (rather than mutate) to match the original accumulation style.
    patterns = patterns + tf_patterns
    mr.close()
from basepair.exp.chipnexus.motif_clustering import similarity_matrix
# Pairwise motif-motif similarity on the sequence information-content track.
sim_all_seq = similarity_matrix(patterns, track='seq_ic')
fig = plt.figure(figsize=get_figsize(0.25))
# Upper-triangle indices (k=1): each unordered motif pair counted once,
# diagonal (self-similarity) excluded.
iu_all = np.triu_indices(len(sim_all_seq), 1)
plt.hist(sim_all_seq[iu_all], bins=100);
plt.title("Similarity (seq_ic)")
plt.ylabel("Frequency");
plt.xlabel("Similarity between two motifs (all)");
# plt.savefig(f"{figures}/all.similarity.hist.pdf")
# plt.savefig(f"{figures}/all.similarity.hist.png")
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
from basepair.modisco.motif_clustering import to_colors
# Condensed distance vector = 1 - similarity over the upper triangle.
iu_all = np.triu_indices(len(sim_all_seq), 1)
# Ward hierarchical clustering with optimal leaf ordering for readable dendrograms.
lm_all_seq = linkage(1-sim_all_seq[iu_all], 'ward', optimal_ordering=True)
# Flat cluster assignment per pattern (9 clusters chosen here).
clusters_all_seq = cut_tree(lm_all_seq, n_clusters=9)[:,0]
names = [x.name for x in patterns]
# Clustered similarity heatmap, rows/columns ordered by the same linkage.
cm_all = sns.clustermap(pd.DataFrame(sim_all_seq, index=names, columns=names),
                        row_linkage=lm_all_seq,
                        col_linkage=lm_all_seq,
                        cmap='Blues',
                        yticklabels=True,
                        figsize=get_figsize(1, aspect=1)
                        ).fig.suptitle('Seq IC sim');
# plt.savefig(f"{figures}/all.heatmap.pdf")
# plt.savefig(f"{figures}/all.heatmap.png")
from basepair.modisco.motif_clustering import create_pattern_table, align_clustered_patterns
# Seed for any stochastic steps in the alignment trials below.
np.random.seed(42)
# Display order of patterns derived from the dendrogram leaf order.
cluster_order = np.argsort(leaves_list(lm_all_seq))
cluster = clusters_all_seq
# Align patterns within each cluster so their logos line up visually.
patterns_all_clustered = align_clustered_patterns(patterns, cluster_order, cluster,
                                                  align_track='seq_ic',
                                                  metric='continousjaccard',
                                                  # don't shift the major patterns
                                                  # by more than 15 when aligning
                                                  trials=20,
                                                  max_shift=15)
def create_pattern_table(patterns,
                         logo_len=30,
                         footprint_width=320,
                         footprint_kwargs=None,
                         seqlogo_kwargs=None,
                         # report_url='results.html'
                         n_jobs=10):
    """Create the summary pattern table for a list of aligned patterns.

    For each pattern, starts from its ``attrs['features']`` dict and adds
    the cluster id, the pattern name and two HTML sequence logos
    (contribution-score logo and sequence logo).

    Args:
        patterns: list of patterns with ``attrs['features']`` and
            ``attrs['cluster']`` set (e.g. output of align_clustered_patterns).
        logo_len: width (in bp) to resize each pattern to before plotting.
        footprint_width: unused; kept for backward compatibility (only
            referenced by the commented-out footprint sparkline code).
        footprint_kwargs: unused; kept for backward compatibility.
        seqlogo_kwargs: optional dict of extra kwargs passed to
            ``vdom_plot`` (e.g. ``dict(width=420)``).
        n_jobs: number of parallel workers for logo rendering.

    Returns:
        pd.DataFrame with one row per pattern.
    """
    # Bug fix: the previous version unpacked **seqlogo_kwargs directly, which
    # raised TypeError whenever the caller left it as the default None.
    seqlogo_kwargs = {} if seqlogo_kwargs is None else seqlogo_kwargs

    def extract_info(pattern_i):
        """Build the table row (a plain dict) for one pattern."""
        p = patterns[pattern_i]
        # Copy so we don't mutate the pattern's attrs in place — the previous
        # version injected 'cluster'/'pattern'/logo HTML into attrs['features'],
        # so repeated calls accumulated stale keys on the pattern objects.
        d = dict(p.attrs['features'])
        d['cluster'] = p.attrs['cluster']
        d['pattern'] = p.name
        # HTML sequence logos: contribution scores and raw sequence.
        d['logo_imp'] = p.resize(logo_len).vdom_plot('contrib', as_html=True, **seqlogo_kwargs)
        d['logo_seq'] = p.resize(logo_len).vdom_plot('seq', as_html=True, **seqlogo_kwargs)
        return d

    # Render logos in parallel; this is the expensive step.
    rows = Parallel(n_jobs=n_jobs)(delayed(extract_info)(i)
                                   for i in tqdm(range(len(patterns))))
    df = pd.DataFrame(rows)
    # Use float for seqlet counts so downstream numeric ops/sorting behave.
    df['n seqlets'] = df['n seqlets'].astype(float)
    return df
# Quick sanity check of the attrs available on an aligned pattern.
patterns_all_clustered[0].attrs
# Build the HTML pattern table (wide logos, 20 parallel workers).
pattern_table_all = create_pattern_table(patterns_all_clustered,
                                         logo_len=70,
                                         seqlogo_kwargs=dict(width=420),
                                         n_jobs=20,
                                         footprint_width=120,
                                         footprint_kwargs=dict(figsize=(3,1.5)))
pattern_table_all.head()
# Drop the cluster column from the displayed/written table.
del pattern_table_all['cluster']
from basepair.modisco.table import write_modisco_table
def get_first_columns(df, cols):
    """Return `df` with `cols` moved to the front; remaining columns keep their order."""
    tail = [c for c in df.columns if c not in cols]
    return df[list(cols) + tail]
# Output directory for the clustering results.
(modisco_dirs / 'motif_clustering').mkdir(exist_ok=True)
# Columns to show first in the written table, preceded by a row index 'i'.
first_columns = ['pattern', 'n seqlets', 'logo_seq', 'logo_imp']
pattern_table_all['i'] = np.arange(len(pattern_table_all), dtype=int)
pattern_table_all = get_first_columns(pattern_table_all, ['i'] + first_columns)
# NOTE(review): writes to `modisco_dirs` itself, not the freshly created
# `motif_clustering` subdirectory above — confirm this is intentional.
write_modisco_table(pattern_table_all, modisco_dirs, report_url='results.html', prefix='pattern_table', write_csv=False, doc={})
# TODO - for each TF
# Reference patterns: modisco run on all tasks jointly (deeplift/profile).
ref_modisco_dir = model_dir / f"modisco/all/deeplift/profile"
# IPython shell syntax — lists the directory contents (notebook-only line).
ls {ref_modisco_dir}
ref_patterns = read_pkl(ref_modisco_dir / 'patterns.pkl')
ref_patterns[0].attrs.keys()
ls {ref_modisco_dir / "motif_clustering"}
# Long-format pattern table from the reference clustering run.
dfp = pd.read_csv(ref_modisco_dir / 'motif_clustering/patterns_long.csv')
dfp.head()
shorten_pattern(ref_patterns[0].name)
# Index the reference patterns by their shortened name, then pull them
# out in the row order of the reference table.
d_ref_patterns_te = {shorten_pattern(p.name): p for p in ref_patterns}
ref_patterns_te = [d_ref_patterns_te[pn] for pn in dfp.pattern]
def similarity_matrix_two_patterns(patterns, patterns_b, **kwargs):
    """Compute the pairwise similarity matrix between two pattern lists.

    Similarity of a pair is the max over the forward and reverse-complement
    orientation of the first pattern. For each pattern in `patterns`, the
    best-matching pattern from `patterns_b` is recorded together with the
    first pattern aligned to it.

    Returns:
        (closest_patterns, dfm): list of (aligned pattern, best match)
        tuples, and a DataFrame of similarities indexed by pattern names.
    """
    n_rows, n_cols = len(patterns), len(patterns_b)
    sim = np.zeros((n_rows, n_cols))
    closest_patterns = []
    for i in tqdm(range(n_rows)):
        pa = patterns[i]
        for j in range(n_cols):
            fwd = pa.similarity(patterns_b[j], **kwargs)
            rev = pa.rc().similarity(patterns_b[j], **kwargs)
            sim[i, j] = max(fwd, rev)
        best = sim[i].argmax()
        closest_patterns.append((pa.align(patterns_b[best], **kwargs),
                                 patterns_b[best]))
    dfm = pd.DataFrame(sim,
                       index=[p.name for p in patterns],
                       columns=[p.name for p in patterns_b])
    return closest_patterns, dfm
# Keep only high-information-content patterns for the comparison.
# NOTE(review): threshold 40 is in total seq IC units — confirm against the
# histogram below.
te_merged_patterns = [p for p in patterns if p.seq_info_content > 40]
plt.hist([p.seq_info_content for p in patterns], 30);
# Match each filtered per-TF pattern to its closest reference pattern.
pattern_pairs, m = similarity_matrix_two_patterns(te_merged_patterns, ref_patterns_te)
sns.heatmap(m.T)
# Plot each matched pair: per-TF pattern (pa) above reference pattern (pb),
# axes stripped and y-range fixed so the logos are visually comparable.
for pa, pb in pattern_pairs:
    pa.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False
    pb.plot("seq_ic", height=.3, letter_width=.1)
    strip_axis(plt.gca())
    plt.ylim([0,2])
    plt.gca().axison = False