# Imports
from basepair.imports import *
from basepair.exp.paper.config import motifs, profile_mapping
from basepair.exp.chipnexus.perturb.vdom import vdom_motif_pair, plot_spacing_hist
from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs
from basepair.exp.paper.config import tasks
from plotnine import *
import plotnine
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
output_dir = modisco_dir
dataset_dir = output_dir / 'perturbation-analysis'
pairs = get_motif_pairs(motifs)
# ordered names
pair_names = ["<>".join(x) for x in pairs]
!mkdir -p {ddir}/figures/modisco/spacing/perturb
figures_dir = Path(f"{ddir}/figures/modisco/spacing/perturb")
# load the data
dataset_dir = output_dir / 'perturbation-analysis'
from basepair.exp.chipnexus.perturb.scores import ism_compute_features_tidy, compute_features_tidy, SCORES, max_profile_count
# motif_pair_lpdata = read_pkl(dataset_dir / 'motif_pair_lpdata.incl-whole.pkl')
# dfabf_ism = ism_compute_features_tidy(motif_pair_lpdata, tasks)
# dfs = dfabf_ism[['Wt_obs', 'Wt', 'dA', 'dB', 'dAB', 'motif_pair', 'task', 'center_diff', 'strand_combination']]
# dfabf = compute_features_tidy(motif_pair_lpdata, tasks, SCORES,
# pseudo_count_quantile=.2,
# profile_slice=slice(82, 118))
# dfabf.to_parquet(dataset_dir / 'dfabf.parq')
# dfabf.to_csv(dataset_dir / 'dfabf.csv.gz', compression='gzip')
# dfs.to_csv(dataset_dir / 'dfs.csv.gz', compression='gzip')
ls {dataset_dir}
ls {dataset_dir}
!cp {dataset_dir}/dfabf.csv.gz /srv/www//kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/perturbation-analysis/
dfabf.head()
from IPython.display import FileLink
FileLink(dataset_dir / 'dfs.csv.gz')
dfabf = pd.read_csv(dataset_dir / 'dfabf.csv.gz')
# dfabf = pd.read_parquet(dataset_dir / 'dfabf.parq')
dfabf['center_diff_cat'] = pd.Categorical(pd.cut(dfabf['center_diff'], [0, 35, 70, 150, 1000]))
dfs = pd.read_csv(dataset_dir / 'dfs.csv.gz')
dfab = pd.read_csv(dataset_dir / 'dfab.csv.gz')
dfab['motif_pair_cat'] = pd.Categorical(dfab.motif_pair, categories=dfab.groupby("motif_pair").size().sort_values(ascending=False).index)
dfab['strand_combination_cat'] = pd.Categorical(dfab['strand_combination'], )
dfi_subset = pd.read_csv(dataset_dir / 'dfi_subset.csv.gz')
dfs.motif_pair.value_counts()
dfs.head()
# define the global set of distances
dist_subsets = ['center_diff<=35',
'(center_diff>35)&(center_diff<=70)',
'(center_diff>70)&(center_diff<=150)',
'center_diff>150']
dist_subset_labels = ['dist < 35',
'35 < dist <= 70',
'70 < dist <= 150',
'150 < dist',
]
from basepair.exp.chipnexus.spacing import coocurrence_plot
# co-occurence
fig, ax = plt.subplots(figsize=get_figsize(.25, aspect=1))
dist = 150
coocurrence_plot(dfi_subset, list(motifs),
query_string=f"(abs(pattern_center_aln_x- pattern_center_aln_y) <= {dist})")
ax.set_ylabel("Motif of interest")
ax.set_xlabel("Motif partner");
fig.savefig(figures_dir / f'coocurrence-test.center_diff<{dist}.pdf')
dfi_subset.head()
from basepair.exp.chipnexus.spacing import co_occurence_matrix
# co-occurence
fig, ax = plt.subplots(figsize=get_figsize(.25, aspect=1))
dist = 35
coocurrence_plot(dfi_subset, list(motifs),
query_string=f"center_diff <= {dist}")
ax.set_ylabel("Motif of interest")
ax.set_xlabel("Motif partner");
fig.savefig(figures_dir / f'coocurrence-test.center_diff<{dist}.pdf')
# subsets = ['center_diff <= 35', 'center_diff > 35', 'center_diff > 70']
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
if i == len(subsets) - 1:
cbar = True
else:
cbar = False
coocurrence_plot(dfi_subset, list(motifs), query_string=subset, ax=ax, cbar=cbar)
if i == 0:
ax.set_ylabel("Motif of interest")
ax.set_xlabel("Motif partner");
ax.set_title(subset_label)
# plt.tight_layout()
plt.savefig(figures_dir / 'coocurrence-test.all-dist.pdf')
plotnine.options.figure_size = get_figsize(.5, aspect=2)# (10, 10)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist)]) +
geom_histogram(bins=max_dist) +
facet_grid("motif_pair_cat~ .") +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([0, max_dist]) +
xlab("Pairwise distance") +
scale_fill_brewer(type='qual', palette=3))
fig.save(figures_dir / 'histogram.center_diff.all.pdf')
fig
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin(["Nanog<>Nanog"]))]) +
geom_vline(xintercept=10, alpha=0.1) +
geom_vline(xintercept=20, alpha=0.1) +
geom_vline(xintercept=30, alpha=0.1) +
geom_vline(xintercept=40, alpha=0.1) +
geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([0, max_dist]) +
xlab("Pairwise distance") +
ggtitle("Nanog<>Nanog") +
scale_fill_brewer(type='qual', palette=3))
fig.save(figures_dir / 'nanog.spacing.pdf')
fig
Then TF-A depends on A with
(Wt-dA)/(Wt - dAB)
and on B with
(Wt-dB)/(Wt - dAB)
# dfs = dfabf_ism[['Wt_obs', 'Wt', 'dA', 'dB', 'dAB', 'motif_pair', 'task', 'center_diff', 'strand_combination']]
dfs_prox = dfs[dfs.center_diff <= 35]
dfs_dist = dfs[dfs.center_diff > 35]
dfs.head()
# choose the right color
colors = plt.get_cmap("RdBu")(np.linspace(0, 1, 9))
hex_colors = [plt.cm.colors.to_hex(a) for a in colors]
sns.palplot(colors)
cb_red = hex_colors[2]
cb_blue = hex_colors[-2 - 2]
hex_colors
from basepair.plot.utils import PlotninePalette
terms = ['Wt-dA>0', 'Wt-dB>0', 'Wt-dAB>0', 'dA-dAB>0', 'dB-dAB>0']
filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).apply(lambda x: x.eval(term).mean()).
reset_index().
rename(columns={0:'fraction'}).
assign(term=term, subset=ft))
for term in terms for ft in filter_terms], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
plotnine.options.figure_size = get_figsize(1, 0.6)
fig = (ggplot(aes(y='motif_pair', x='task', fill='fraction'), c) +
geom_tile() +
theme_classic(base_size=10, base_family='Arial') +
facet_grid("subset ~ term") +
theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1)) +
scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=.5, limits=[0, 1],
palette=PlotninePalette("RdBu", [.1, .9])) +
ggtitle("Fraction of positive terms")
)
fig.save(figures_dir / 'heatmap.positive_fraction.pdf')
fig
(2*Wt-dA-dB)/(Wt-dAB)¶mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
apply(lambda x: x.eval(term).median() if x.eval(mask).mean() > .9 else np.nan).
reset_index().
rename(columns={0:'average'}).
assign(term=term, subset=ft, subset_name=filter_names[i]))
for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) +
geom_tile(data=c.dropna()) +
# geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) +
geom_text(label='/', data=c[c.average.isnull()]) +
theme_classic(base_size=10, base_family='Arial') +
facet_grid(". ~ subset_name") +
theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1)) +
scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4],
palette=PlotninePalette("RdBu_r", [0, 1])) +
# scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.90perc.Wt-dAB>0.pdf')
fig
def get_syn_effect(c, pairs, motifs, tasks):
motifxtask = [f"{motif} (task={profile_mapping[motif]})" for motif in motifs]
dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
if i>j:
continue
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
dfp.loc[motif_pair[0] + f" (task={t1})"][motif_pair[1]] = c[(c.motif_pair == motif_pair_name) & (c.task == t1)]['average']
dfp.loc[motif_pair[1] + f" (task={t2})"][motif_pair[0]] = c[(c.motif_pair == motif_pair_name) & (c.task == t2)]['average']
# dfp.columns = [r"$\Delta$"+c for c in dfp.columns]
return dfp
# subsets = ['center_diff <= 35', 'center_diff > 35', 'center_diff > 70']
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.90perc.Wt-dAB>0.main_tasks.pdf')
mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
apply(lambda x: x.eval(term).median() if x.eval(mask).mean() > .2 else np.nan).
reset_index().
rename(columns={0:'average'}).
assign(term=term, subset=ft, subset_name=filter_names[i]))
for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) +
geom_tile(data=c.dropna()) +
# geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) +
geom_text(label='/', data=c[c.average.isnull()]) +
theme_classic(base_size=10, base_family='Arial') +
facet_grid(". ~ subset_name") +
theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1)) +
scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4],
palette=PlotninePalette("RdBu_r", [0, 1])) +
# scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.20perc.Wt-dAB>0.pdf')
fig
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.20perc.Wt-dAB>0.main_tasks.pdf')
mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
apply(lambda x: x.eval(term).median()).
reset_index().
rename(columns={0:'average'}).
assign(term=term, subset=ft, subset_name=filter_names[i]))
for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) +
geom_tile(data=c.dropna()) +
# geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) +
geom_text(label='/', data=c[c.average.isnull()]) +
theme_classic(base_size=10, base_family='Arial') +
facet_grid(". ~ subset_name") +
theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1)) +
scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4],
palette=PlotninePalette("RdBu_r", [0, 1])) +
# scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.all.pdf')
fig
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.all.main_tasks.pdf')
from basepair.exp.paper.config import profile_mapping
fig, axes = plt.subplots(len(motifs), len(motifs),
figsize=get_figsize(.65, 1),
gridspec_kw=dict(wspace=0, hspace=0),
sharex=True, sharey=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(motifs)))
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
ax = axes[i,j]
if i>j:
ax.axison = False
continue
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
# t1 = 'Nanog'
# t2 = 'Nanog'
# motif mask
mmask = (dfabf.motif_pair == motif_pair_name) & (dfabf.score == 'max_profile_count')
ab = pd.merge(dfabf[mmask & (dfabf.task == t1)][[idx, 'x_alt_ref_pc', cat]],
dfabf[mmask & (dfabf.task == t2)][[idx, 'y_alt_ref_pc']],
on=idx)
# make a scatterplot
for ci, (label,df) in enumerate(list(ab.groupby(cat))):
ax.scatter(df.y_alt_ref_pc,
df.x_alt_ref_pc, # first one is on the y axis
alpha=0.2, s=1, label=label, color=colors[ci])
# ax.text(0, 0,t1+"-"+t2)
alpha = .5
xl = [0, 2]
ax.plot(xl, xl, c='grey', alpha=alpha)
ax.axvline(1, c='grey', alpha=alpha)
ax.axhline(1, c='grey', alpha=alpha)
ax.set_xlim(xl)
ax.set_ylim(xl)
if i == j:
ax.set_ylabel(motif_pair[1])
ax.set_xlabel(motif_pair[0])
# Use grey spines
plt.setp(ax.spines.values(), color='grey')
plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
plt.legend(loc="bottom right", bbox_to_anchor=(-.2,.7),
scatterpoints=1, markerscale=10, columnspacing=0,
handletextpad=0, borderpad=0, frameon=False, title="Pairwise distance")
fig.savefig(figures_dir / "max_profile_count.upper-triag.width=0.65.pdf")
fig, axes = plt.subplots(len(motifs), len(motifs),
figsize=get_figsize(.65, 1),
gridspec_kw=dict(wspace=0, hspace=0),
sharex=True, sharey=True)
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
ax = axes[i,j]
if i>j:
ax.axison = False
continue
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
# t1 = 'Nanog'
# t2 = 'Nanog'
# motif mask
mmask = (dfabf.motif_pair == motif_pair_name) & (dfabf.score == 'max_profile_count')
ab = pd.merge(dfabf[mmask & (dfabf.task == t1)][[idx, 'x_alt_ref', cat]],
dfabf[mmask & (dfabf.task == t2)][[idx, 'y_alt_ref']],
on=idx)
# make a scatterplot
for ci, (label,df) in enumerate(list(ab.groupby(cat))):
ax.scatter(df.y_alt_ref,
df.x_alt_ref, # first one is on the y axis
alpha=0.2, s=1, label=label, color=colors[ci])
# ax.text(0, 0,t1+"-"+t2)
alpha = .5
xl = [0, 2]
ax.plot(xl, xl, c='grey', alpha=alpha)
ax.axvline(1, c='grey', alpha=alpha)
ax.axhline(1, c='grey', alpha=alpha)
ax.set_xlim(xl)
ax.set_ylim(xl)
if i == j:
ax.set_ylabel(motif_pair[1])
ax.set_xlabel(motif_pair[0])
# Use grey spines
plt.setp(ax.spines.values(), color='grey')
plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
plt.legend(loc="bottom right", bbox_to_anchor=(-.2,.7),
scatterpoints=1, markerscale=10, columnspacing=0,
handletextpad=0, borderpad=0, frameon=False, title="Pairwise distance")
fig.savefig(figures_dir / "max_profile_count.upper-triag.no-pseudocounts.pdf")
import matplotlib.cm as cm
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(motifs)))
hex_colors = [plt.cm.colors.to_hex(a) for a in colors]
# use pair names
dfabf['motif_pair'] = pd.Categorical(dfabf['motif_pair'], pair_names)
from plotnine import *
plotnine.options.figure_size = get_figsize(1, 10/4)
fig = (ggplot(aes(x='x_alt_ref_pc', y='y_alt_ref_pc', color='center_diff_cat'),
dfabf[dfabf.score == 'max_profile_count']) +
geom_point(alpha=0.2, size=.05, shape='.', show_legend=False) +
scale_color_manual(hex_colors)+
geom_point(alpha=1, size=5, data=dfabf[:0], shape='.') + # for the legend
# add vertical lines
geom_abline(intercept=0, slope=1, color='grey', alpha=0.5, show_legend=False) +
geom_vline(xintercept=1, color='grey', alpha=0.5) +
geom_hline(yintercept=1, color='grey', alpha=0.5) +
facet_grid("motif_pair ~ task") +
xlim([0, 2]) +
ylim([0, 2]) +
theme_classic(base_size=10, base_family='Arial') +
xlab("B") +
ylab("A") +
ggtitle('max_profile_count')
)
fig.save(figures_dir / 'max_profile_count.all.pdf', raster=True)
display(fig)
def get_effect(dfabf, pairs, agg_fn=np.mean):
motifxtask = [f"{motif} (task={profile_mapping[motif]})" for motif in motifs]
dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
if i>j:
continue
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
# t1 = 'Nanog'
# t2 = 'Nanog'
# motif mask
mmask = (dfabf.motif_pair == motif_pair_name) & (dfabf.score == 'max_profile_count')
ab = pd.merge(dfabf[mmask & (dfabf.task == t1)][[idx, 'x_alt_ref_pc', cat]],
dfabf[mmask & (dfabf.task == t2)][[idx, 'y_alt_ref_pc']],
on=idx)
dfp.loc[motif_pair[0] + f" (task={t1})"][motif_pair[1]] = agg_fn(ab.x_alt_ref_pc)
dfp.loc[motif_pair[1] + f" (task={t2})"][motif_pair[0]] = agg_fn(ab.y_alt_ref_pc)
dfp.columns = [r"$\Delta$"+c for c in dfp.columns]
return dfp
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_effect(dfabf.query(subset), pairs, agg_fn=np.mean), cmap='RdBu', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.mean_max_profile_count.directional_effect.pdf')
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_effect(dfabf.query(subset), pairs, agg_fn=np.median), cmap='RdBu', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.pdf')
motifxtask = [f"{motif} (task={task})" for motif in motifs for task in tasks]
def get_effect_all_tasks(dfabf, pairs, tasks, agg_fn=np.mean):
motifxtask = [f"{motif} (task={task})" for motif in motifs for task in tasks]
dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
for task in tasks:
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
if i>j:
continue
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
ab = dfabf[(dfabf.motif_pair == motif_pair_name) & (dfabf.score == 'max_profile_count') & (dfabf.task == task)]
dfp.loc[motif_pair[0] + f" (task={task})"][motif_pair[1]] = agg_fn(ab.x_alt_ref_pc)
dfp.loc[motif_pair[1] + f" (task={task})"][motif_pair[0]] = agg_fn(ab.y_alt_ref_pc)
dfp.columns = [r"$\Delta$"+c for c in dfp.columns]
return dfp
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)*4),
sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_effect_all_tasks(dfabf.query(subset), pairs, tasks=tasks, agg_fn=np.median), cmap='RdBu', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.all-tasks.pdf')
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)*4),
sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_effect_all_tasks(dfabf.query(subset), pairs, tasks=tasks, agg_fn=np.mean), cmap='RdBu', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.mean_max_profile_count.directional_effect.all-tasks.pdf')
dfp = pd.read_csv("http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/pattern_table.csv")
dfp.head()
dfp = dfp[dfp.pattern.isin(list(motifs.values()))]
m_hash = {v:k for k,v in motifs.items()}
dfp['pattern_name'] = dfp.pattern.map(m_hash)
dfp = dfp.set_index('pattern_name')
dfp = dfp.loc[list(motifs)]
dfp_max = dfp[[c for c in dfp.columns if c.endswith(' footprint max')]]
dfp_max.columns = [c.split(" ")[0] for c in dfp_max.columns]
dfp_max = dfp_max[tasks]
dfp_max
dfp_counts = dfp[[c for c in dfp.columns if c.endswith(' footprint counts')]]
dfp_counts.columns = [c.split(" ")[0] for c in dfp_max.columns]
dfp_counts = dfp_counts[tasks]
dfp_counts
fig, axes = plt.subplots(1, 2, figsize=get_figsize(1, .35), sharey=True)
ax = axes[0]
sns.heatmap(dfp_max, cmap='Reds', ax=ax)
ax.set_ylabel("Motif")
ax.set_xlabel("Task");
ax.set_title("Avg. max profile counts");
ax = axes[1]
sns.heatmap(dfp_counts, cmap='Reds', ax=ax)
ax.set_ylabel("")
ax.set_xlabel("Task");
ax.set_title("Avg. total profile counts");
plt.savefig(figures_dir / 'heatmap.total-profile-counts.pdf')
For the scatter plots (directional interactions), generate png and pdf files for each interaction so I can use them directly.
- The combined pdfs are too big and also too confusing to show on a slide or figure.
odir = figures_dir / "individual/max_profile_count"
odir.mkdir(parents=True, exist_ok=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(motifs)))
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
if i>j:
continue
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.65 / 3, 1))
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
# t1 = 'Nanog'
# t2 = 'Nanog'
# motif mask
mmask = (dfabf.motif_pair == motif_pair_name) & (dfabf.score == 'max_profile_count')
ab = pd.merge(dfabf[mmask & (dfabf.task == t1)][[idx, 'x_alt_ref_pc', cat]],
dfabf[mmask & (dfabf.task == t2)][[idx, 'y_alt_ref_pc']],
on=idx)
# make a scatterplot
for ci, (label,df) in enumerate(list(ab.groupby(cat))):
ax.scatter(df.y_alt_ref_pc,
df.x_alt_ref_pc, # first one is on the y axis
alpha=0.2, s=1, label=label, color=colors[ci])
# ax.text(0, 0,t1+"-"+t2)
alpha = .5
xl = [0, 2]
ax.plot(xl, xl, c='grey', alpha=alpha)
ax.axvline(1, c='grey', alpha=alpha)
ax.axhline(1, c='grey', alpha=alpha)
ax.set_xlim(xl)
ax.set_ylim(xl)
ax.set_xlabel(r"{} ($\Delta$ {})".format(motif_pair[1], motif_pair[0]))
ax.set_ylabel(r"{} ($\Delta$ {})".format(motif_pair[0], motif_pair[1]))
# Use grey spines
plt.setp(ax.spines.values(), color='grey')
plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
fig.savefig(odir / f'{motif_pair_name}.png')
fig.savefig(odir / f'{motif_pair_name}.pdf')
odir = figures_dir / "individual/max_profile_count/all_tasks"
odir.mkdir(parents=True, exist_ok=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(motifs)))
for task in tasks:
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
if i>j:
continue
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.65 / 3, 1))
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
ab = dfabf[(dfabf.motif_pair == motif_pair_name) & (dfabf.score == 'max_profile_count') & (dfabf.task == task)]
# make a scatterplot
for ci, (label,df) in enumerate(list(ab.groupby(cat))):
ax.scatter(df.y_alt_ref_pc,
df.x_alt_ref_pc, # first one is on the y axis
alpha=0.2, s=1, label=label, color=colors[ci])
# ax.text(0, 0,t1+"-"+t2)
alpha = .5
xl = [0, 2]
ax.plot(xl, xl, c='grey', alpha=alpha)
ax.axvline(1, c='grey', alpha=alpha)
ax.axhline(1, c='grey', alpha=alpha)
ax.set_xlim(xl)
ax.set_ylim(xl)
ax.set_xlabel(r"{} ($\Delta$ {})".format(motif_pair[1], motif_pair[0]))
ax.set_ylabel(r"{} ($\Delta$ {})".format(motif_pair[0], motif_pair[1]))
ax.set_title(task)
# Use grey spines
plt.setp(ax.spines.values(), color='grey')
plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
fig.savefig(odir / f'{motif_pair_name},task={task}.png')
fig.savefig(odir / f'{motif_pair_name},task={task}.pdf')