from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'
motifs = OrderedDict([
("Oct4-Sox2", 'Oct4/m0_p0'),
("Oct4", 'Oct4/m0_p1'),
# ("Strange-sym-motif", 'Oct4/m0_p5'),
("Sox2", 'Sox2/m0_p1'),
("Nanog", 'Nanog/m0_p1'),
("Zic3", 'Nanog/m0_p2'),
("Nanog-partner", 'Nanog/m0_p4'),
("Klf4", 'Klf4/m0_p0'),
])
from basepair.imports import *
from basepair.exp.paper.config import *
from basepair.utils import flatten
from basepair.modisco.core import Pattern
from plotnine import *
import plotnine
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac
# models = {
# 'nexus/binary': 'nexus,gw,OSNK,1,0,0,FALSE,same,0.5,64,25,0.001,9,FALSE',
# 'nexus/profile': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE',
# # 'nexus/profile.peaks.bias-corrected.augm': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE',
# # 'nexus/profile.peaks.non-bias-corrected': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE-2',
# 'seq/binary': 'seq,gw,OSN,1,0,0,FALSE,same,0.5,64,50,0.001,9,FALSE',
# #'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE',
# #'seq/profile.peaks.bias-corrected.augm': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
# 'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
# # 'seq/profile.peaks.non-bias-corrected': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE',
# 'nexus/profile.peaks-union': 'nexus,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE',
# 'seq/profile.peaks-union': 'seq,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE'
# }
models = {
'nexus/binary': 'nexus,gw,OSNK,1,0,0,FALSE,same,0.5,64,25,0.001,9,FALSE',
'nexus/profile': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE',
# 'nexus/binary+profile': 'nexus,gw,OSNK,1,0.1,0.01,FALSE,same,0.5,64,25,0.001,9,FALSE',
# 'nexus/profile.gw': 'nexus,gw,OSNK,0,10,1,FALSE,same,0.5,64,25,0.001,9,FALSE',
# 'nexus/profile.peaks.bias-corrected.augm': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE',
# 'nexus/profile.peaks.non-bias-corrected': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE-2',
'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
'seq/binary': 'seq,gw,OSN,1,0,0,FALSE,same,0.5,64,50,0.001,9,FALSE',
#'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE',
#'seq/profile.peaks.bias-corrected.augm': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
# 'seq/profile.peaks.non-bias-corrected': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE',
# 'nexus/profile.peaks-union': 'nexus,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE,0',
# 'nexus/profile.peaks-union.no-aug': 'nexus,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,FALSE,0',
'nexus/profile.peaks-union': 'nexus,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE,0',
'seq/profile.peaks-union': 'seq,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE,0',
# 'basset/binary': 'binary-basset,nexus,gw,OSNK,0.5,64,0.001,FALSE,0.6',
# 'factorized-basset/binary': 'factorized-basset,nexus,gw,OSNK,0.5,64,0.001,FALSE,0.5'
}
models_inv = {v:k for k,v in models.items()}
# TODO - add the nexus/profile.peaksunio
from config import experiments
fdir = Path(f'{ddir}/figures/method-comparison/modisco')
fdir.mkdir(exist_ok=True)
from basepair.exp.paper.config import get_tasks
def load_stats(exp, model):
mr = MultipleModiscoResult({t: models_dir / exp / f'deeplift/{t}/out/{experiments[exp]["imp_score"]}/modisco.h5'
for t in get_tasks(exp)})
return pd.DataFrame([{"task": task, "model": model, **mr2.stats(verbose=False)}
for task,mr2 in mr.mr_dict.items()
])
# TODO - add seq/binary to experiments
dfs = pd.concat([load_stats(exp, model) for model, exp in models.items() if exp in experiments])
dfs['seqlets_per_pattern'] = dfs['clustered_seqlets'] / dfs['patterns']
plotnine.options.figure_size = get_figsize(.4)
fig = (ggplot(aes(x='task', fill='model', y='clustered_seqlets_frac'), dfs) +
geom_bar(stat='identity', position='dodge') +
theme_classic() +
scale_fill_brewer(type='qual', palette=3)
)
fig.save(fdir / 'modisco-stats.clustered_seqlets_frac.pdf')
fig
fig = (ggplot(aes(x='task', fill='model', y='all_seqlets'), dfs) +
geom_bar(stat='identity', position='dodge') +
theme_classic() +
scale_fill_brewer(type='qual', palette=3)
)
fig.save(fdir / 'modisco-stats.n_all_seqlets.pdf')
fig
fig = (ggplot(aes(x='task', fill='model', y='clustered_seqlets'), dfs) +
geom_bar(stat='identity', position='dodge') +
theme_classic() +
scale_fill_brewer(type='qual', palette=3)
)
fig.save(fdir / 'modisco-stats.clustered_seqlets.pdf')
fig
fig = (ggplot(aes(x='task', fill='model', y='patterns'), dfs) +
geom_bar(stat='identity', position='dodge') +
theme_classic() +
scale_fill_brewer(type='qual', palette=3)
)
fig.save(fdir / 'modisco-stats.patterns.pdf')
fig
fig = (ggplot(aes(x='task', fill='model', y='seqlets_per_pattern'), dfs) +
geom_bar(stat='identity', position='dodge') +
theme_classic() +
scale_fill_brewer(type='qual', palette=3)
)
fig.save(fdir / 'modisco-stats.seqlets_per_pattern.pdf')
fig
comparison_dir = Path('../../chipnexus/comparison/output')
# main_motifs = [mr.get_pattern(pattern_name).rename(name)
# for name, pattern_name in motifs.items()]
def rename(name):
task, mc, pattern = name.split("/")
mc_n = mc.split("_")[1]
pattern_n = pattern.split("_")[1]
return f'BPNet/{task}/m{mc_n}_p{pattern_n}'
def load_patterns(exp, model):
mr = MultipleModiscoResult({t: models_dir / exp / f'deeplift/{t}/out/{experiments[exp]["imp_score"]}/modisco.h5'
for t in get_tasks(exp)})
return [p.rename(model + '/' + p.name).add_attr("n_seqlets", mr.n_seqlets(p.name))
for p in mr.get_all_patterns()
if mr.n_seqlets(p.name) > 100]
# bpnet_patterns = [p.rename(rename(p.name)).trim_seq_ic(0.08) for p in mr.get_all_patterns()
# if mr.n_seqlets(p.name) > 100]
all_patterns = []
for model, exp in models.items():
try:
all_patterns += load_patterns(exp, model)
except:
print("Unable to load ", model, exp)
from basepair.exp.paper.fig4 import cluster_align_patterns
len(all_patterns)
# Cluster patterns
from basepair.exp.paper.fig4 import *
patterns = all_patterns
n_clusters = 20
sim_seq = similarity_matrix(patterns, track='seq_ic')
# cluster
iu = np.triu_indices(len(sim_seq), 1)
lm_seq = linkage(1 - sim_seq[iu], 'ward', optimal_ordering=True)
# determine clusters and the cluster order
cluster = cut_tree(lm_seq, n_clusters=n_clusters)[:, 0]
cluster_order = np.argsort(leaves_list(lm_seq))
# align patterns
aligned_patterns = align_clustered_patterns(patterns, cluster_order, cluster,
align_track='seq_ic',
metric='continousjaccard',
# don't shit the major patterns
# by more than 15 when aligning
trials=20,
max_shift=15)
write_pkl(aligned_patterns,
"/users/avsec/gdrive/projects/chipnexus/data/motif_clustering.patterns.pkl")
from basepair.modisco.motif_clustering import to_colors
def shorten_name(pn):
m1, m2, tf, mc, p = pn.split("/")
mcn = mc.split("_")[1]
pn = p.split("_")[1]
return f"{m1}/{m2}/{tf}/m{mcn}_p{pn}"
pnames = [shorten_name(p.name) for p in all_patterns]
df_anno = pd.DataFrame(dict(method=["/".join(pn.split("/")[:2]) for pn in pnames],
tf=[pn.split("/")[2] for pn in pnames]),
index=pnames)
# Alternative code
import matplotlib.colors
df = df_anno
cat_cmap='RdBu_r'
import matplotlib.pyplot as plt
import seaborn as sns
cols = {}
for c in df.columns:
categories = list(df[c].unique())
if c == 'tf':
# cmap_dict = {tf: matplotlib.colors.to_rgb(hex_color)
# for tf, hex_color in tf_colors.items()}
cmap_dict = tf_colors
else:
cmap = sns.color_palette(cat_cmap, len(categories))
cmap_dict = dict(zip(categories, cmap))
cols[c] = df[c].map(cmap_dict)
row_colors = pd.DataFrame(cols)
c = 'method'
categories = list(df[c].unique())
sns.color_palette(cat_cmap, len(categories))
cmap_dict_method = dict(zip(list(df[c].unique()), cmap))
from basepair.plot.utils import plot_colormap
plot_colormap(cmap_dict_method, "Method")
# plt.savefig(fdir / 'cmap.method.pdf')
plot_colormap({k:v for k,v in tf_colors.items() if k !='Esrrb'}, 'TF')
# plt.savefig(fdir / 'cmap.TF.pdf')
df = pd.DataFrame(sim_seq,
index=pnames,
columns=pnames)
cm = sns.clustermap(df,
row_linkage=lm_seq,
col_linkage=lm_seq,
row_colors=row_colors,
cmap='Blues', figsize=get_figsize(1, 1))
plt.savefig(fdir / 'clustered-motifs.heatmap.pdf')
# aligned_patterns
cluster20 = cut_tree(lm_seq, n_clusters=20)[:, 0]
cluster30 = cut_tree(lm_seq, n_clusters=30)[:, 0]
cluster40 = cut_tree(lm_seq, n_clusters=40)[:, 0]
cluster50 = cut_tree(lm_seq, n_clusters=50)[:, 0]
cluster60 = cut_tree(lm_seq, n_clusters=60)[:, 0]
cluster_order = np.argsort(leaves_list(lm_seq))
dfp = pd.DataFrame({"name": pnames,
"cluster20": cut_tree(lm_seq, n_clusters=20)[:, 0],
"cluster30": cut_tree(lm_seq, n_clusters=30)[:, 0],
"cluster40": cut_tree(lm_seq, n_clusters=40)[:, 0],
"cluster50": cut_tree(lm_seq, n_clusters=50)[:, 0],
"cluster60": cut_tree(lm_seq, n_clusters=60)[:, 0],
}).set_index('name').loc[np.array([shorten_name(p.name) for p in aligned_patterns])].reset_index()
dfp.to_csv("/users/avsec/gdrive/projects/chipnexus/data/motif_clustering.csv", index=None)
# Create the summary table
df_n_seqlets = pd.DataFrame({
"name": [shorten_name(p.name) for p in all_patterns],
"n_seqlets": [p.attrs['n_seqlets'] for p in all_patterns],
})
# dfp = pd.read_csv("https://docs.google.com/spreadsheets/d/1enDvKamBZOs4-EIKy--zUebekHvhJekAmNy1yExf1VQ/export?gid=481460659&format=csv")
# dfp = pd.read_csv("https://docs.google.com/spreadsheets/d/1umWAMk-FKN2u4k8noKoaX3FZXmU3_-IurnO922cKUko/export?gid=1666626001&format=csv")
dfp = pd.read_csv("https://docs.google.com/spreadsheets/d/1G_vC2Y9j45WuN08Erzm8M57uNWsXl4Obl9WR2zKA6dc/export?gid=47535911&format=csv")
dfp['label'] = dfp.label.str.replace('"', '')
dfp = pd.merge(dfp, df_n_seqlets, on='name')
dfp['method'] = dfp.name.str.rsplit("/", n=2, expand=True)[0]
dfpm = pd.pivot_table(dfp, values='n_seqlets', index='label', columns='method', aggfunc=np.sum, fill_value=0)
print(dfp.groupby(['label', 'erronious']).size().to_string())
noisy_motifs = list(dfpm.index.unique()[dfpm.index.unique().str.startswith("noisy")])
erronious = dfp.query('erronious').label.unique()
dfpm = dfpm.loc[dfpm.sum(1).sort_values(ascending=False).index]
dfpm = pd.concat([
dfpm[~dfpm.index.isin(erronious)],
dfpm.loc[erronious]
])
def remove_zeroes(df):
df = df[df.max(axis=1)!=0]
dfpm = df.loc[df.sum(1).sort_values(ascending=False).index]
dfpm = pd.concat([
dfpm[~dfpm.index.isin(erronious)],
dfpm[dfpm.index.isin(erronious)]
])
return dfpm
# All motifs
fig, ax = plt.subplots(figsize=get_figsize(1.6, 1/10))
sns.heatmap(np.log10(remove_zeroes(dfpm[['nexus/profile', 'nexus/binary',
'seq/profile', 'seq/binary',
'nexus/profile.peaks-union', 'seq/profile.peaks-union'
]]) + 1).T);
fig.savefig(fdir / 'motif-venn.heatmap.all.pdf')
fig, ax = plt.subplots(figsize=get_figsize(1.4, 1/19))
sns.heatmap(np.log10(remove_zeroes(dfpm[['nexus/profile','nexus/binary',
# 'seq/profile',# 'seq/binary',
# 'nexus/profile.peaks-union', 'seq/profile.peaks-union'
]]) + 1).T);
fig.savefig(fdir / 'nexus.profile-vs-binary.motif-venn.heatmap.pdf')
fig, ax = plt.subplots(figsize=get_figsize(1.1, 1/19))
sns.heatmap(np.log10(remove_zeroes(dfpm[['nexus/profile','nexus/binary',
# 'seq/profile',# 'seq/binary',
# 'nexus/profile.peaks-union', 'seq/profile.peaks-union'
]]) + 1).T,
xticklabels=True);
fig.savefig(fdir / 'nexus.profile-vs-binary.motif-venn.heatmap.pdf')
fig, ax = plt.subplots(figsize=get_figsize(1.2, 1/12))
sns.heatmap(np.log10(remove_zeroes(dfpm[[
'nexus/profile.peaks-union',
'seq/profile.peaks-union',
'seq/profile',
'seq/binary',
]]) + 1).T);
fig.savefig(fdir / 'nexus-vs-seq.motif-venn.heatmap.pdf')
fdir
## TODO - was it more difficult to call motif on the peak union?
## - number of seqlets?
len(dfp)
len(aligned_patterns)
dict(row)
len(aligned_patterns)
dfp
for i, p in enumerate(aligned_patterns):
p.plot("seq_ic");
row = dfp[dfp.name == shorten_name(p.name)].iloc[0]
err = 'erronious ' if row.erronious else ''
plt.title(err + f"{row.label} {row['name']}")
# plt.title(p.name)
plt.ylim([0, 2])
sns.despine(bottom=True, right=True, top=True)