# Experiment identifier: encodes the model/training configuration of this run
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
# Importance-score track to read from the deeplift/modisco outputs
imp_score = 'profile/wn'
# Imports
# NOTE(review): the %-magics, `??` and bare display expressions below show this
# file is a Jupyter/IPython notebook export; it is not runnable as plain Python.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from basepair.plot.profiles import multiple_plot_stranded_profile
from basepair.exp.paper.config import tf_colors
hv.extension('bokeh')
from basepair.plot.profiles import multiple_plot_stranded_profile
from basepair.plot.table import render_mpl_table
from basepair.exp.paper.fig4 import *
from basepair.exp.paper.config import *
# Use paper config
paper_config()
# Transcription factors analysed in this figure
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
model_dir = models_dir / exp
# Output directory for all generated figure files
figures = Path(f"{ddir}/figures/modisco/{exp}")
# NOTE(review): despite the name, mr_dict is a LIST of (task, ModiscoResult)
# tuples, not a dict — it is iterated as pairs below.
mr_dict = [(task, ModiscoResult(model_dir / f"deeplift/{task}/out/{imp_score}/modisco.h5"))
           for task in tasks]
# Combined view over all per-task modisco result files
mr = MultipleModiscoResult({task: model_dir / f"deeplift/{task}/out/{imp_score}/modisco.h5"
                            for task in tasks})
mrp = mr_dict[0][1]
# Notebook display cells: total contribution score per task for the first pattern,
# via the single-task and the combined result objects
{k: v.sum() for k,v in mrp.get_pattern(mrp.patterns()[0]).contrib.items()}
{k: v.sum() for k,v in mr.get_pattern(mr.patterns()[0]).contrib.items()}
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5')
len(isf)
# Open the underlying HDF5 handles of each per-task result
for k,v in mr_dict: v.open()
# Per-task footprint profiles computed during the modisco run
footprints = {t: read_pkl(model_dir / f"deeplift/{t}/out/{imp_score}/footprints.pkl")
              for t in tasks}
# Seqlet statistics table, rendered with matplotlib and saved as PDF
df = pd.DataFrame([{"TF": t, **mr.stats()} for t,mr in mr_dict])
df[['%clustered']] = np.round(df[['clustered_seqlets_frac']] * 100, decimals=1)
dfp = df[['TF', 'all_seqlets', 'clustered_seqlets', '%clustered', 'patterns']]
dfp
render_mpl_table(dfp, col_width=1.8,
                 row_height=0.3,
                 header_color='white', font_size=10,
                 row_colors = ['w'],
                 header_text_color='black')
plt.savefig(figures / 'seqlet-stats.pdf')
# IPython source introspection of the alignment helper (notebook cell)
optimal_align_patterns??
def cluster_align_patterns(patterns, n_clusters=9):
    """Hierarchically cluster patterns by sequence-IC similarity and align them.

    Args:
        patterns: list of pattern objects exposing a 'seq_ic' track.
        n_clusters: number of flat clusters to cut the dendrogram into.

    Returns:
        (similarity matrix, linkage matrix, inverse dendrogram order,
         aligned pattern list)
    """
    # Pairwise similarity on the sequence information-content track
    sim = similarity_matrix(patterns, track='seq_ic')
    # Ward linkage on the condensed distance vector (1 - similarity, upper triangle)
    upper = np.triu_indices(len(sim), 1)
    linkage_mat = linkage(1 - sim[upper], 'ward', optimal_ordering=True)
    # Flat cluster assignment and dendrogram-derived display order
    assignment = cut_tree(linkage_mat, n_clusters=n_clusters)[:, 0]
    order = np.argsort(leaves_list(linkage_mat))
    # Align patterns within clusters; max_shift=15 keeps the major patterns
    # from being shifted too far during alignment
    aligned = align_clustered_patterns(patterns, order, assignment,
                                       align_track='seq_ic',
                                       metric='continousjaccard',
                                       trials=20,
                                       max_shift=15)
    return sim, linkage_mat, np.argsort(order), aligned
# Extract all patterns supported by >= 100 seqlets, drop near-empty motifs
# (seq IC <= 4), and split the rest into short (< 30 IC) and long (>= 30 IC).
all_patterns = get_patterns(mr_dict, footprints, tasks, min_n_seqlets=100)
patterns = [p for p in all_patterns if p.seq_info_content > 4]
short_patterns = [p for p in patterns if p.seq_info_content < 30]
sim_seq, lm_seq, sort_idx, short_patterns_clustered = cluster_align_patterns(short_patterns, n_clusters=11)
pattern_thr_100 = get_patterns(mr_dict, footprints, tasks, min_n_seqlets=100)
long_patterns = [p for p in pattern_thr_100 if p.seq_info_content >= 30]
# BUGFIX: cluster_align_patterns returns a 4-tuple; the original bound the whole
# tuple to long_patterns_clustered. Unpack only the aligned-pattern element and
# discard the long-pattern similarity/linkage so that sim_seq/lm_seq above
# (needed for the short-pattern heatmap below) are not clobbered.
*_, long_patterns_clustered = cluster_align_patterns(long_patterns, n_clusters=25)
len(patterns)
len(long_patterns)
len(short_patterns)
# all_patterns = get_patterns(mr_dict, footprints, tasks, min_n_seqlets=100)
# sim_seq, lm_seq, sort_idx, all_patterns_clustered = cluster_align_patterns(all_patterns, n_clusters=25)
# all_patterns = get_patterns(mr_dict, footprints, tasks, min_n_seqlets=100)
# patterns = [p for p in all_patterns if p.seq_info_content > 4]
# short_patterns = [p for p in patterns if p.seq_info_content < 30]
# sim_seq, lm_seq, sort_idx, short_patterns_clustered = cluster_align_patterns(short_patterns, n_clusters=10)
# Prefix pattern names with their TF for display
short_patterns = [p.rename(p.attrs['TF'] + '/' + p.name) for p in short_patterns]
short_patterns_clustered = [p.rename(p.attrs['TF'] + '/' + p.name) for p in short_patterns_clustered]
len(short_patterns)
len(long_patterns)
def short_name(pn):
    """Compress a full pattern name 'task/metacluster/pattern_i' to 'task/p<i>'."""
    task, _, pattern = pn.split("/")
    return f"{task}/p{pattern.split('_')[1]}"
p = short_patterns_clustered[0]
# Plot each clustered short pattern's sequence-IC logo, trimmed at IC >= 0.08
# NOTE(review): indentation was lost in this export; strip_axis is assumed to
# run once per plotted pattern — confirm against the original notebook.
for i in range(len(short_patterns_clustered)):
    short_patterns_clustered[i].trim_seq_ic(0.08).plot('seq_ic', height=0.4, letter_width=0.12);
    strip_axis(plt.gca())
# Similarity heatmap of short patterns, rows/cols ordered by the Ward linkage
pnames = [short_name(p.name) for p in short_patterns]
df = pd.DataFrame(sim_seq,
                  index=pnames,
                  columns=pnames)
cm = sns.clustermap(df,
                    row_linkage=lm_seq,
                    col_linkage=lm_seq,
                    cmap='Blues', figsize=get_figsize(.9, 1))
plt.savefig(figures / 'cluster.short-motifs.heatmap.100seqlets.pdf')
# Re-derive long patterns (seq IC >= 30, > 100 seqlets) from the combined
# modisco result, annotated with their seqlet counts. This OVERWRITES
# sim_seq / lm_seq / sort_idx with the long-pattern versions used below.
long_patterns = [p.add_attr('features', {"n seqlets": mr.n_seqlets(p.name)}) for p in mr.get_all_patterns()
                 if p.seq_info_content >= 30 and mr.n_seqlets(p.name) > 100]
sim_seq, lm_seq, sort_idx, long_patterns_clustered = cluster_align_patterns(long_patterns, n_clusters=len(long_patterns))
# Notebook display cell: show the function object
cluster_align_patterns
def short_name(pn):
    """Shorten 'task/metacluster/pattern_i' to 'task/p<i>'.

    NOTE: duplicate of the definition earlier in the notebook.
    """
    parts = pn.split("/")
    pattern_idx = parts[2].split("_")[1]
    return parts[0] + "/p" + pattern_idx
pnames = [short_name(p.name) for p in long_patterns]
# Rename long patterns to their short display names
long_patterns = [p.rename(short_name(p.name)) for p in long_patterns]
df = pd.DataFrame(sim_seq,
                  index=pnames,
                  columns=pnames)
# Logo plots for each clustered long pattern (IC-trimmed at 0.08)
# NOTE(review): indentation was lost in this export; strip_axis is assumed to
# run once per plotted pattern — confirm against the original notebook.
for i in range(len(long_patterns_clustered)):
    long_patterns_clustered[i].trim_seq_ic(0.08).plot('seq_ic', height=0.4, letter_width=0.12);
    strip_axis(plt.gca())
# Similarity heatmap of the long patterns
cm = sns.clustermap(df,
                    row_linkage=lm_seq,
                    col_linkage=lm_seq,
                    cmap='Blues', figsize=get_figsize(.5, 1))
plt.savefig(figures / 'cluster.long-motifs.heatmap.pdf')
# Note -> patterns with higher Sequence IC used a lower threshold (min_n_seqlets > 100)
# non-bias corrected
# Histogram of sequence information content across all retained patterns
fig, ax = plt.subplots(figsize=get_figsize(.25))
ax.hist([p.seq_info_content for p in patterns + long_patterns], 20);
ax.set_ylabel("Frequency");
ax.set_xlabel("Sequence IC");
fig.savefig(figures / 'pattern-length.pdf', bbox_inches='tight', pad_inches=0)
paper_config()
# Non-bias corrected
# Pattern table figure for the short patterns (reversed = top-down dendrogram order)
fig = plot_patterns(list(reversed(short_patterns_clustered)), tasks, pattern_trim=(27, 48))
fig.savefig(figures / 'short-patterns.table.100seqlets-v2.pdf')
fig.savefig(figures / 'short-patterns.table.100seqlets-v2.png')
short_patterns_rev = list(reversed(short_patterns_clustered))
# dfp = pd.read_csv(f"{model_dir}/deeplift/{task}/out/{imp_score}/pattern_table.csv")
short_patterns_rev_list = [p.name for p in short_patterns_rev]
len(short_patterns_rev_list)
# Assemble the per-task pattern tables into one frame indexed by full pattern name
dfp_list = []
for task in tasks:
    dfp = pd.read_csv(f"{model_dir}/deeplift/{task}/out/{imp_score}/pattern_table.csv")
    dfp['task'] = task
    dfp['pattern_name'] = task + '/' + dfp['pattern'].map(longer_pattern)
    dfp_list.append(dfp)
dfpl = pd.concat(dfp_list)
dfpl.head()
dfpl.set_index("pattern_name", inplace=True)
dfpl
# Profile contribution scores per (pattern, task), shown as a heatmap
# NOTE(review): assumes short_patterns_rev_list names match the
# '<task>/<longer_pattern>' index built above — verify after the TF-prefix rename.
dfp_imp = dfpl.loc[short_patterns_rev_list, [t + ' imp profile' for t in tasks]]
dfp_imp.columns = tasks
fig = plt.figure(figsize=get_figsize(0.5, aspect=2.5))
sns.heatmap(dfp_imp, cmap='Blues');
fig.savefig(figures / 'short-patterns.contrib_score.pdf')
# Subset to patterns shown in Figure 4
# Figure-4 pattern subset, read from a shared Google Sheet (requires network access)
df_subset = pd.read_csv("https://docs.google.com/spreadsheets/d/1MVBUbMs_F0UPgZxR8-v9eMM6DCFhVYc7kdH-C6WJnYU/export?gid=0&format=csv")
# Strip stray quote characters from the sheet's text columns
df_subset.name = df_subset.name.str.replace('"', '')
df_subset.task = df_subset.task.str.replace('"', '')
df_subset.head()
df_subset['long_pattern'] = df_subset.task + "/" + df_subset.pattern.map(longer_pattern)
paper_config()
# Contribution-score heatmap restricted to the Figure-4 subset of patterns
fig = plt.figure(figsize=get_figsize(0.5, aspect=2.5))
sns.heatmap(dfp_imp.loc[df_subset.long_pattern.values], cmap='Blues');
fig.savefig(figures / 'short-patterns.contrib_score.figure-subset.pdf')
p = short_patterns_rev[0]
# Total contribution score per (pattern, task) matrix
contrib_scores = np.empty((len(short_patterns_rev), len(tasks)))
for i,p in enumerate(short_patterns_rev):
    for j, task in enumerate(tasks):
        contrib_scores[i,j] = p.contrib[task].sum()
short_patterns_rev[8].plot('contrib');
sns.heatmap(contrib_scores);
# create the table with the histogram
# Non-bias corrected
# NOTE(review): all_patterns_clustered is defined only in a commented-out cell
# earlier in this notebook — as written this cell raises NameError; restore
# that cell (or substitute the clustered pattern lists) before running.
fig = plot_patterns(list(reversed(all_patterns_clustered)), tasks, pattern_trim=(5, 65))
fig.savefig(figures / 'all-patterns.table.pdf')
fig.savefig(figures / 'all-patterns.table.png')