from collections import OrderedDict
# Experiment identifier string; also names the model directory under models_dir.
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
# Importance-score sub-path used to locate the DeepLIFT/modisco outputs per task.
imp_score = 'profile/wn'
# Motif display name -> modisco pattern id ('<task>/m<metacluster>_p<pattern>').
# Commented-out entries are patterns deliberately excluded from this analysis.
motifs = OrderedDict([
("Oct4-Sox2", 'Oct4/m0_p0'),
# ("Oct4", 'Oct4/m0_p1'),
# ("Strange-sym-motif", 'Oct4/m0_p5'),
("Sox2", 'Sox2/m0_p1'),
("Nanog", 'Nanog/m0_p1'),
# ("Zic3", 'Nanog/m0_p2'),
# ("Nanog-partner", 'Nanog/m0_p4'),
("Klf4", 'Klf4/m0_p0'),
])
# Imports
from basepair.imports import *
from basepair.exp.paper.config import models_dir, profile_mapping, ddir
from basepair.exp.chipnexus.perturb.vdom import vdom_motif_pair, plot_spacing_hist
from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs
from basepair.exp.paper.config import tasks
from plotnine import *
import plotnine
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
model_dir = models_dir / exp
dataset_dir = model_dir / 'perturbation-analysis'
pairs = get_motif_pairs(motifs)
# ordered names
pair_names = ["<>".join(x) for x in pairs]
figures_dir = Path(f"{ddir}/figures/modisco/{exp}/in-vivo-perturb")
!mkdir -p {figures_dir}
# define the global set of distances
dist_subsets = ['center_diff<=35',
'(center_diff>35)&(center_diff<=70)',
'(center_diff>70)&(center_diff<=150)',
'center_diff>150']
dist_subset_labels = ['dist < 35',
'35 < dist <= 70',
'70 < dist <= 150',
'150 < dist',
]
from basepair.plot.utils import PlotninePalette
# choose the right color
colors = plt.get_cmap("RdBu")(np.linspace(0, 1, 9))
hex_colors = [plt.cm.colors.to_hex(a) for a in colors]
sns.palplot(colors)
cb_red = hex_colors[2]
cb_blue = hex_colors[-2 - 2]
hex_colors
from basepair.exp.chipnexus.perturb.scores import ism_compute_features_tidy, compute_features_tidy, SCORES, max_profile_count
ls {dataset_dir}
from IPython.display import FileLink
FileLink(dataset_dir / 'dfs.csv.gz')
dfabf = pd.read_csv(dataset_dir / 'dfabf.csv.gz')
dfabf = dfabf[dfabf.motif_pair.isin(pair_names)]
dfabf['center_diff_cat'] = pd.Categorical(pd.cut(dfabf['center_diff'], [0, 35, 70, 150, 1000]))
# dfs = pd.read_csv(dataset_dir / 'dfs.csv.gz')
# Store the output
dfabf_ism = pd.read_csv(dataset_dir / 'dfabf_ism.csv.gz')
dfabf_ism = dfabf_ism[dfabf_ism.motif_pair.isin(pair_names)]
# dfab = pd.read_csv(dataset_dir / 'dfab.csv.gz')
# dfab['motif_pair_cat'] = pd.Categorical(dfab.motif_pair, categories=dfab.groupby("motif_pair").size().sort_values(ascending=False).index)
# dfab['strand_combination_cat'] = pd.Categorical(dfab['strand_combination'], )
dfi_subset = pd.read_csv(dataset_dir / 'dfi_subset.csv.gz')
dfabf.motif_pair.value_counts()
dfabf.head()
from basepair.exp.paper.config import tasks
# "pattern_center_aln to dfi_subset" -- markdown heading from the original
# notebook cell; it was fused onto the code line below in the export.
# Load the modisco results for every task (one HDF5 file per task).
mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
                            for t in tasks})
from basepair.exp.paper.fig4 import cluster_align_patterns
main_motifs = [mr.get_pattern(pattern_name).add_attr('features', {'n seqlets': mr.n_seqlets(pattern_name)})
for name, pattern_name in motifs.items()]
main_motifs = [p.rename(longer_pattern(p.name)) for p in main_motifs]
main_motifs_clustered = cluster_align_patterns(main_motifs, n_clusters=1)
from basepair.modisco.pattern_instances import align_instance_center
dfi_subset = align_instance_center(dfi_subset, main_motifs, main_motifs_clustered, trim_frac=0.08)
instance_parq_paths = {t: model_dir / f'deeplift/{t}/out/{imp_score}/instances.parq'
for t in tasks}
def shorten_te_pattern(s):
    """Shorten a '<tf>/<pattern>' id by abbreviating the pattern part.

    Splits on the first '/' only, keeps the TF prefix unchanged, and
    runs the remainder through ``shorten_pattern``.
    """
    prefix, pattern = s.split("/", 1)
    return f"{prefix}/{shorten_pattern(pattern)}"
motifs_te = [p.name
for p in mr.get_all_patterns()
if p.seq_info_content > 30 and mr.n_seqlets(p.name) > 100]
motifs_te_d = OrderedDict([(shorten_te_pattern(x), shorten_te_pattern(x)) for x in motifs_te])
from basepair.modisco.pattern_instances import multiple_load_instances
# # get transposable element locations
dfi_te = multiple_load_instances(instance_parq_paths, motifs_te_d)
dfi_te = dfi_te[(dfi_te.match_weighted_p > 0.1) & (dfi_te.seq_match > 20)]
# Get rows without intersecting transposable elements
from basepair.preproc import rc_seq, dfint_no_intersection
interval_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']
non_te_idx = dfint_no_intersection(dfi_subset[interval_cols], dfi_te[interval_cols])
print(f"Not overlapping te's: {non_te_idx.mean()}")
# # annotate with profiles
# profiles = load_profiles(modisco_dir, imp_score_file)
# dfi_anno = annotate_profile(dfi, mr, profiles, profile_width=70, trim_frac=0.08)
# dfi = dfi_anno
valid_row_idx = dfi_subset[non_te_idx].row_idx.values
len(valid_row_idx) # non-te rows
# Append pattern_center_aln to dfabf.
def suffix_colnames(df, suffix):
    """Return *df* with *suffix* appended to every column name.

    Uses ``DataFrame.add_suffix`` so the input frame is left unmodified;
    the original implementation mutated ``df.columns`` in place, which
    silently alters the frame passed in. All call sites in this notebook
    use the returned frame (via ``.pipe``), so they are unaffected.
    """
    return df.add_suffix(suffix)
dfabf['row_idx_x'] = dfabf['row_idx_x'].astype(int)
dfabf['row_idx_y'] = dfabf['row_idx_y'].astype(int)
dfabf2 = pd.merge(dfabf, dfi_subset[['row_idx',
'pattern_center_aln',
'pattern_strand_aln']].pipe(suffix_colnames, '_x'),
on='row_idx_x',
how='left')
dfabf2 = pd.merge(dfabf2, dfi_subset[['row_idx',
'pattern_center_aln',
'pattern_strand_aln']].pipe(suffix_colnames, '_y'),
on='row_idx_y',
how='left')
dfabf2['pattern_diff_aln'] = dfabf2['pattern_center_aln_y'] - dfabf2['pattern_center_aln_x']
exclude_sox2 = dfabf2[(dfabf2.motif_pair == 'Oct4-Sox2<>Sox2') &
(dfabf2['pattern_diff_aln'] == 0)].row_idx_y.values
exclude_oct4 = dfabf2[(dfabf2.motif_pair == 'Oct4-Sox2<>Oct4') &
(dfabf2['pattern_diff_aln'] == 0)].row_idx_y.values
# Exclude the overlapping row
dfabf2 = dfabf2[(dfabf2.pattern_name_x != 'Oct4') | (~dfabf2.row_idx_x.isin(exclude_oct4))]
dfabf2 = dfabf2[(dfabf2.pattern_name_y != 'Oct4') | (~dfabf2.row_idx_y.isin(exclude_oct4))]
dfabf2 = dfabf2[(dfabf2.pattern_name_x != 'Sox2') | (~dfabf2.row_idx_x.isin(exclude_sox2))]
dfabf2 = dfabf2[(dfabf2.pattern_name_y != 'Sox2') | (~dfabf2.row_idx_y.isin(exclude_sox2))]
dfabf2.info()
len(dfabf2)
# Exclude TE's
dfabf2 = dfabf2[dfabf2.row_idx_x.isin(valid_row_idx) & dfabf2.row_idx_y.isin(valid_row_idx)]
dfabf2 = dfabf2.reset_index()
# dfabf2.reset_index().to_csv(dataset_dir / 'dfabf.Oct4-Sox2-subset.csv.gz', compression='gzip', index=False)
dfabf2.head()
# set categorical index
dfabf2.center_diff_cat.cat.categories = dfabf2.center_diff_cat.cat.categories.astype(str)
%time dfabf2.to_parquet(dataset_dir / 'dfabf.Oct4-Sox2-subset.parq', index=False, engine='fastparquet')
!du -sh {dataset_dir}/dfabf.Oct4-Sox2-subset.parq
%time dfabf2 = pd.read_parquet(dataset_dir / 'dfabf.Oct4-Sox2-subset.parq', engine='fastparquet')
!du -sh {dataset_dir}
dfabf_ism.head()
dfabf_ism2 = pd.merge(dfabf_ism, dfi_subset[['row_idx',
'pattern_center_aln',
'pattern_strand_aln']].pipe(suffix_colnames, '_x'),
on='row_idx_x',
how='left')
dfabf_ism2 = pd.merge(dfabf_ism2, dfi_subset[['row_idx',
'pattern_center_aln',
'pattern_strand_aln']].pipe(suffix_colnames, '_y'),
on='row_idx_y',
how='left')
dfabf_ism2['pattern_diff_aln'] = dfabf_ism2['pattern_center_aln_y'] - dfabf_ism2['pattern_center_aln_x']
exclude_sox2 = dfabf_ism2[(dfabf_ism2.motif_pair == 'Oct4-Sox2<>Sox2') &
(dfabf_ism2['pattern_diff_aln'] == 0)].row_idx_y.values
exclude_oct4 = dfabf_ism2[(dfabf_ism2.motif_pair == 'Oct4-Sox2<>Oct4') &
(dfabf_ism2['pattern_diff_aln'] == 0)].row_idx_y.values
# Exclude the overlapping row
dfabf_ism2 = dfabf_ism2[(dfabf_ism2.pattern_name_x != 'Oct4') | (~dfabf_ism2.row_idx_x.isin(exclude_oct4))]
dfabf_ism2 = dfabf_ism2[(dfabf_ism2.pattern_name_y != 'Oct4') | (~dfabf_ism2.row_idx_y.isin(exclude_oct4))]
dfabf_ism2 = dfabf_ism2[(dfabf_ism2.pattern_name_x != 'Sox2') | (~dfabf_ism2.row_idx_x.isin(exclude_sox2))]
dfabf_ism2 = dfabf_ism2[(dfabf_ism2.pattern_name_y != 'Sox2') | (~dfabf_ism2.row_idx_y.isin(exclude_sox2))]
dfs = dfabf_ism2[['Wt_obs', 'Wt', 'dA', 'dB', 'dAB', 'motif_pair',
'task', 'center_diff', 'strand_combination']]
# Then TF-A depends on A with
#   (Wt - dA) / (Wt - dAB)
# and on B with
#   (Wt - dB) / (Wt - dAB)
# dfs = dfabf_ism[['Wt_obs', 'Wt', 'dA', 'dB', 'dAB', 'motif_pair', 'task', 'center_diff', 'strand_combination']]
dfs_prox = dfs[dfs.center_diff <= 35]
dfs_dist = dfs[dfs.center_diff > 35]
dfs.head()
terms = ['Wt-dA>0', 'Wt-dB>0', 'Wt-dAB>0', 'dA-dAB>0', 'dB-dAB>0']
filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).apply(lambda x: x.eval(term).mean()).
reset_index().
rename(columns={0:'fraction'}).
assign(term=term, subset=ft))
for term in terms for ft in filter_terms], axis=0)
c = c[c.motif_pair.isin(pair_names)]
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['motif_pair'] = pd.Categorical(c['motif_pair'], categories=pair_names[::-1], ordered=True)
plotnine.options.figure_size = get_figsize(1, 0.6)
fig = (ggplot(aes(y='motif_pair', x='task', fill='fraction'), c) +
geom_tile() +
theme_classic(base_size=10, base_family='Arial') +
facet_grid("subset ~ term") +
theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1)) +
scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=.5, limits=[0, 1],
palette=PlotninePalette("RdBu", [.1, .9])) +
ggtitle("Fraction of positive terms")
)
fig.save(figures_dir / 'heatmap.positive_fraction.pdf')
fig
# "(2*Wt-dA-dB)/(Wt-dAB)" -- markdown heading from the original notebook
# cell; it was fused onto the code line below in the export.
# Only report the synergy score where this mask holds for most pairs.
mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
apply(lambda x: x.eval(term).median() if x.eval(mask).mean() > .9 else np.nan).
reset_index().
rename(columns={0:'average'}).
assign(term=term, subset=ft, subset_name=filter_names[i]))
for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) +
geom_tile(data=c.dropna()) +
# geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) +
geom_text(label='/', data=c[c.average.isnull()]) +
theme_classic(base_size=10, base_family='Arial') +
facet_grid(". ~ subset_name") +
theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1)) +
scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4],
palette=PlotninePalette("RdBu_r", [0, 1])) +
# scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.90perc.Wt-dAB>0.pdf')
fig
def get_syn_effect(c, pairs, motifs, tasks):
    r"""Pivot per-(motif_pair, task) synergy scores into a motif x motif matrix.

    Rows are ``"<motif> (task=<profile task>)"``, columns are the motif
    names. For each unordered pair (a, b) -- taken in the upper-triangle
    order of *pairs* -- the 'average' score under a's profile task fills
    row a / column b, and the score under b's task fills row b / column a.

    Parameters
    ----------
    c : DataFrame with 'motif_pair', 'task' and 'average' columns
        (one row per motif_pair/task combination).
    pairs : list of motif-name pairs, ordered like the upper triangle
        (incl. diagonal) of the motifs x motifs grid.
    motifs : iterable of motif names.
    tasks : unused; kept for backward-compatible signature.
    """
    motifs = list(motifs)
    motifxtask = [f"{motif} (task={profile_mapping[motif]})" for motif in motifs]
    dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
    k = 0
    for i in range(len(motifs)):
        for j in range(len(motifs)):
            if i > j:  # only the upper triangle (incl. diagonal) has a pair
                continue
            motif_pair = pairs[k]
            motif_pair_name = "<>".join(motif_pair)
            k += 1
            t1 = profile_mapping[motif_pair[0]]
            t2 = profile_mapping[motif_pair[1]]
            # Extract the (expected single) matching score as a scalar and
            # assign with single-step .loc[row, col]: the original chained
            # `dfp.loc[row][col] = ...` can silently write to a temporary
            # copy instead of dfp.
            v1 = c[(c.motif_pair == motif_pair_name) & (c.task == t1)]['average']
            v2 = c[(c.motif_pair == motif_pair_name) & (c.task == t2)]['average']
            dfp.loc[motif_pair[0] + f" (task={t1})", motif_pair[1]] = (
                v1.iloc[0] if len(v1) else np.nan)
            dfp.loc[motif_pair[1] + f" (task={t2})", motif_pair[0]] = (
                v2.iloc[0] if len(v2) else np.nan)
    # dfp.columns = [r"$\Delta$"+c for c in dfp.columns]
    return dfp
# subsets = ['center_diff <= 35', 'center_diff > 35', 'center_diff > 70']
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.90perc.Wt-dAB>0.main_tasks.pdf')
mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
apply(lambda x: x.eval(term).median() if x.eval(mask).mean() > .2 else np.nan).
reset_index().
rename(columns={0:'average'}).
assign(term=term, subset=ft, subset_name=filter_names[i]))
for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) +
geom_tile(data=c.dropna()) +
# geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) +
geom_text(label='/', data=c[c.average.isnull()]) +
theme_classic(base_size=10, base_family='Arial') +
facet_grid(". ~ subset_name") +
theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1)) +
scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4],
palette=PlotninePalette("RdBu_r", [0, 1])) +
# scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.20perc.Wt-dAB>0.pdf')
fig
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.20perc.Wt-dAB>0.main_tasks.pdf')
mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
apply(lambda x: x.eval(term).median()).
reset_index().
rename(columns={0:'average'}).
assign(term=term, subset=ft, subset_name=filter_names[i]))
for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) +
geom_tile(data=c.dropna()) +
# geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) +
geom_text(label='/', data=c[c.average.isnull()]) +
theme_classic(base_size=10, base_family='Arial') +
facet_grid(". ~ subset_name") +
theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1)) +
scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4],
palette=PlotninePalette("RdBu_r", [0, 1])) +
# scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.all.pdf')
fig
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.all.main_tasks.pdf')
from basepair.exp.paper.config import profile_mapping
dfabf_plot = dfabf2.query('center_diff > 5')#.query('center_diff < 150')
score = 'max_profile_count_bt'
# -----------------------------------
fig, axes = plt.subplots(len(motifs), len(motifs),
figsize=get_figsize(.65, 1),
gridspec_kw=dict(wspace=0, hspace=0),
sharex=True, sharey=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf_plot.center_diff_cat.cat.categories)))
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
ax = axes[i,j]
if i>j:
ax.axison = False
continue
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
# t1 = 'Nanog'
# t2 = 'Nanog'
# motif mask
mmask = (dfabf_plot.motif_pair == motif_pair_name) & (dfabf_plot.score == score)
ab = pd.merge(dfabf_plot[mmask & (dfabf_plot.task == t1)][[idx, 'x_alt_ref_pc', 'x_ref', cat]],
dfabf_plot[mmask & (dfabf_plot.task == t2)][[idx, 'y_alt_ref_pc', 'y_ref']],
on=idx)
# make a scatterplot
# Subset ab to only the highest values
ab = ab[(ab.x_ref > np.percentile(ab.x_ref, 80)) & (ab.y_ref > np.percentile(ab.y_ref, 80))]
for ci, (label,df) in enumerate(list(ab.groupby(cat))):
ax.scatter(1/df.y_alt_ref_pc,
1/df.x_alt_ref_pc, # first one is on the y axis
alpha=0.2, s=1, label=label, color=colors[ci], rasterized=True)
# ax.text(0, 0,t1+"-"+t2)
alpha = .5
xl = [1/3, 3]
ax.plot(xl, xl, c='grey', alpha=alpha)
ax.axvline(1, c='grey', alpha=alpha)
ax.axhline(1, c='grey', alpha=alpha)
ax.set_xlim(xl)
ax.set_ylim(xl)
ax.set_yscale('log')
ax.set_xscale('log')
# explicitly add ticks
ax.set_yticks([1/3, 1, 3])
ax.set_xticks([1/3, 1, 3])
ax.set_xticklabels(['1/3', '1', '3'])
ax.set_yticklabels(['1/3', '1', '3'])
ax.minorticks_off() # turns off minor ticks
if i == j:
ax.set_ylabel(motif_pair[1])
ax.set_xlabel(motif_pair[0])
# Use grey spines
plt.setp(ax.spines.values(), color='grey')
plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
# BUG FIX: 'bottom right' is not a valid matplotlib legend location
# (valid: 'lower right'); the bbox_to_anchor still places the legend.
plt.legend(loc="lower right", bbox_to_anchor=(-.2, .7),
           scatterpoints=1, markerscale=10, columnspacing=0,
           handletextpad=0, borderpad=0, frameon=False, title="Pairwise distance")
fig.savefig(figures_dir / "max_profile_count.upper-triag.no-Oct4.width=0.65.center_diff>5.pdf")
import matplotlib.cm as cm
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf.center_diff_cat.cat.categories)))
hex_colors = [plt.cm.colors.to_hex(a) for a in colors]
# use pair names
dfabf['motif_pair'] = pd.Categorical(dfabf['motif_pair'], pair_names)
from plotnine import *
hex_colors
dfabf['task'] = pd.Categorical(dfabf['task'])
print(dfabf[dfabf.score == 'max_profile_count']['center_diff_cat'].value_counts().to_string())
def get_effect(dfabf, pairs, motifs, agg_fn=np.mean, score='max_profile_count'):
    r"""Summarize the directional perturbation effect for every motif pair.

    For each pair (a, b) -- taken in the upper-triangle order of *pairs* --
    rows matching the pair and *score* are merged on 'motif_pair_idx'
    across the two motifs' profile tasks, restricted to the strongest 20%
    of sites on both axes (x_ref / y_ref above their 80th percentile),
    and the alt/ref ratios are aggregated with *agg_fn*.

    Returns a DataFrame with rows ``"<motif> (task=...)"`` and columns
    ``"$\Delta$<motif>"``.
    """
    motifs = list(motifs)
    motifxtask = [f"{motif} (task={profile_mapping[motif]})" for motif in motifs]
    dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
    idx = 'motif_pair_idx'
    cat = 'center_diff_cat'
    k = 0
    for i in range(len(motifs)):
        for j in range(len(motifs)):
            if i > j:  # only the upper triangle (incl. diagonal) has a pair
                continue
            motif_pair = pairs[k]
            motif_pair_name = "<>".join(motif_pair)
            k += 1
            t1 = profile_mapping[motif_pair[0]]
            t2 = profile_mapping[motif_pair[1]]
            # motif mask
            mmask = (dfabf.motif_pair == motif_pair_name) & (dfabf.score == score)
            ab = pd.merge(dfabf[mmask & (dfabf.task == t1)][[idx, 'x_alt_ref_pc', 'x_ref', cat]],
                          dfabf[mmask & (dfabf.task == t2)][[idx, 'y_alt_ref_pc', 'y_ref']],
                          on=idx)
            # keep only the strongest 20% of sites on both axes
            ab = ab[(ab.x_ref > np.percentile(ab.x_ref, 80)) &
                    (ab.y_ref > np.percentile(ab.y_ref, 80))]
            # Single-step .loc[row, col] assignment: the original chained
            # `dfp.loc[row][col] = ...` can silently write to a temporary
            # copy instead of dfp.
            dfp.loc[motif_pair[0] + f" (task={t1})", motif_pair[1]] = agg_fn(ab.x_alt_ref_pc)
            dfp.loc[motif_pair[1] + f" (task={t2})", motif_pair[0]] = agg_fn(ab.y_alt_ref_pc)
    dfp.columns = [r"$\Delta$" + c for c in dfp.columns]
    return dfp
dfabf_plot = dfabf2.query('center_diff > 5')
# (AB - (B - None)) / A
pairs_no_oct4 = [p for p in pairs if p[0] != 'Oct4' and p[1] != 'Oct4']
motifs_no_oct4 = [m for m in motifs if m != 'Oct4']
motifs_no_oct4
dfabf_plot.strand_combination.unique()
subsets = dist_subsets
fig,axes = plt.subplots(4, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)*4),
sharex=True,
sharey=True)
for irow, sc in enumerate(['++', '+-', '-+', '--']):
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes[irow], dist_subset_labels)):
try:
sns.heatmap(get_effect(dfabf_plot[dfabf_plot.strand_combination == sc].query(subset), pairs_no_oct4, motifs_no_oct4,
agg_fn=np.median, score='max_profile_count_bt'),
cmap='RdBu', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
except:
pass
if i == 0:
ax.set_title(sc + " " + subset_label)
else:
ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.top-profile.bleed-through-corrected.strand-specific.pdf')
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_effect(dfabf_plot.query(subset), pairs_no_oct4, motifs_no_oct4,
agg_fn=np.median, score='max_profile_count_bt'),
cmap='RdBu', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.top-profile.bleed-through-corrected.pdf')
# Directional-effect heatmaps per distance subset.
# BUG FIX (all three figures below): get_effect requires `motifs` as its
# third positional argument; the original calls omitted it and would
# raise TypeError.
# 1) median effect, default score ('max_profile_count')
subsets = dist_subsets
fig, axes = plt.subplots(1, len(subsets),
                         figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
                         sharey=True)
for i, (subset, ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect(dfabf_plot.query(subset), pairs, motifs, agg_fn=np.median),
                cmap='RdBu', center=1, ax=ax,
                cbar=i == len(subsets)-1,  # colorbar only on the last panel
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.pdf')
# (AB - (B - None)) / A   <- markdown note leaked from the notebook cell
# 2) mean effect, bleed-through-corrected score
subsets = dist_subsets
fig, axes = plt.subplots(1, len(subsets),
                         figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
                         sharey=True)
for i, (subset, ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect(dfabf_plot.query(subset), pairs, motifs,
                           agg_fn=np.mean, score='max_profile_count_bt'),
                cmap='RdBu', center=1, ax=ax,
                cbar=i == len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.mean_max_profile_count.directional_effect.bleed-through-corrected.pdf')
# 3) mean effect, uncorrected score
subsets = dist_subsets
fig, axes = plt.subplots(1, len(subsets),
                         figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
                         sharey=True)
for i, (subset, ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect(dfabf_plot.query(subset), pairs, motifs,
                           agg_fn=np.mean, score='max_profile_count'),
                cmap='RdBu', center=1, ax=ax,
                cbar=i == len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.mean_max_profile_count.directional_effect.pdf')
motifxtask = [f"{motif} (task={task})" for motif in motifs for task in tasks]
def get_effect_all_tasks(dfabf, pairs, tasks, agg_fn=np.mean):
    r"""Like ``get_effect`` but aggregates every motif pair under every task.

    Rows are ``"<motif> (task=<task>)"`` for all motif x task combinations
    (using the module-level ``motifs``), columns ``"$\Delta$<motif>"``.
    The score is fixed to 'max_profile_count' and no x_ref/y_ref
    thresholding is applied.
    """
    # NOTE(review): intentionally reads the module-level `motifs`, as the
    # original did -- confirm this is the desired coupling.
    motifxtask = [f"{motif} (task={task})" for motif in motifs for task in tasks]
    dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
    for task in tasks:
        k = 0  # pair index restarts for every task
        for i in range(len(motifs)):
            for j in range(len(motifs)):
                if i > j:  # only the upper triangle (incl. diagonal)
                    continue
                motif_pair = pairs[k]
                motif_pair_name = "<>".join(motif_pair)
                k += 1
                ab = dfabf[(dfabf.motif_pair == motif_pair_name) &
                           (dfabf.score == 'max_profile_count') &
                           (dfabf.task == task)]
                # Single-step .loc[row, col] assignment: the original
                # chained `dfp.loc[row][col] = ...` can silently write to
                # a temporary copy instead of dfp.
                dfp.loc[motif_pair[0] + f" (task={task})", motif_pair[1]] = agg_fn(ab.x_alt_ref_pc)
                dfp.loc[motif_pair[1] + f" (task={task})", motif_pair[0]] = agg_fn(ab.y_alt_ref_pc)
    dfp.columns = [r"$\Delta$" + c for c in dfp.columns]
    return dfp
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)*4),
sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_effect_all_tasks(dfabf_plot.query(subset), pairs, tasks=tasks, agg_fn=np.median), cmap='RdBu', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.all-tasks.pdf')
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)*4),
sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_effect_all_tasks(dfabf.query(subset), pairs, tasks=tasks, agg_fn=np.mean), cmap='RdBu', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.mean_max_profile_count.directional_effect.all-tasks.pdf')
# For the scatter plots (directional interactions), generate png and pdf
# files for each interaction so they can be used directly.
# - The combined pdfs are too big and also too confusing to show on a
#   slide or figure.
dfabf_plot = dfabf2.query('center_diff > 5')
odir = figures_dir / "individual/max_profile_count"
odir.mkdir(parents=True, exist_ok=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf_plot.center_diff_cat.cat.categories)))
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
if i>j:
continue
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.65 / 3, 1))
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
# t1 = 'Nanog'
# t2 = 'Nanog'
# motif mask
mmask = (dfabf_plot.motif_pair == motif_pair_name) & (dfabf_plot.score == 'max_profile_count')
ab = pd.merge(dfabf_plot[mmask & (dfabf_plot.task == t1)][[idx, 'x_alt_ref_pc', cat]],
dfabf_plot[mmask & (dfabf_plot.task == t2)][[idx, 'y_alt_ref_pc']],
on=idx)
# make a scatterplot
for ci, (label,df) in enumerate(list(ab.groupby(cat))):
ax.scatter(df.y_alt_ref_pc,
df.x_alt_ref_pc, # first one is on the y axis
alpha=0.2, s=1, label=label, color=colors[ci])
# ax.text(0, 0,t1+"-"+t2)
alpha = .5
xl = [0, 2]
ax.plot(xl, xl, c='grey', alpha=alpha)
ax.axvline(1, c='grey', alpha=alpha)
ax.axhline(1, c='grey', alpha=alpha)
ax.set_xlim(xl)
ax.set_ylim(xl)
ax.set_xlabel(r"{} ($\Delta$ {})".format(motif_pair[1], motif_pair[0]))
ax.set_ylabel(r"{} ($\Delta$ {})".format(motif_pair[0], motif_pair[1]))
# Use grey spines
plt.setp(ax.spines.values(), color='grey')
plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
# BUG FIX: `raster=True` is not a Figure.savefig keyword (old matplotlib
# silently ignored it; modern versions raise). Rasterization is set
# per-artist via `rasterized=True` if needed.
fig.savefig(odir / f'{motif_pair_name}.png')
fig.savefig(odir / f'{motif_pair_name}.pdf')
dfabf_plot = dfabf2.query('center_diff > 5')
odir = figures_dir / "individual/max_profile_count/all_tasks"
odir.mkdir(parents=True, exist_ok=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf_plot.center_diff_cat.cat.categories)))
for task in tasks:
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
if i>j:
continue
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.65 / 3, 1))
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
ab = dfabf_plot[(dfabf_plot.motif_pair == motif_pair_name) &
(dfabf_plot.score == 'max_profile_count') &
(dfabf_plot.task == task)]
# make a scatterplot
for ci, (label,df) in enumerate(list(ab.groupby(cat))):
ax.scatter(df.y_alt_ref_pc,
df.x_alt_ref_pc, # first one is on the y axis
alpha=0.2, s=1, label=label, color=colors[ci])
# ax.text(0, 0,t1+"-"+t2)
alpha = .5
xl = [0, 2]
ax.plot(xl, xl, c='grey', alpha=alpha)
ax.axvline(1, c='grey', alpha=alpha)
ax.axhline(1, c='grey', alpha=alpha)
ax.set_xlim(xl)
ax.set_ylim(xl)
ax.set_xlabel(r"{} ($\Delta$ {})".format(motif_pair[1], motif_pair[0]))
ax.set_ylabel(r"{} ($\Delta$ {})".format(motif_pair[0], motif_pair[1]))
ax.set_title(task)
# Use grey spines
plt.setp(ax.spines.values(), color='grey')
plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
# BUG FIX: `raster=True` is not a Figure.savefig keyword (old matplotlib
# silently ignored it; modern versions raise). Rasterization is set
# per-artist via `rasterized=True` if needed.
fig.savefig(odir / f'{motif_pair_name},task={task}.png')
fig.savefig(odir / f'{motif_pair_name},task={task}.pdf')
dfabf_plot = dfabf2.query('center_diff > 5')
score = 'max_profile_count_bt'
odir = figures_dir / "individual/max_profile_count/all_tasks"
odir.mkdir(parents=True, exist_ok=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf_plot.center_diff_cat.cat.categories)))
for task in tasks:
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
if i>j:
continue
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.65 / 3, 1))
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
ab = dfabf_plot[(dfabf_plot.motif_pair == motif_pair_name) &
(dfabf_plot.score == score) &
(dfabf_plot.task == task)]
# make a scatterplot
for ci, (label,df) in enumerate(list(ab.groupby(cat))):
ax.scatter(df.y_alt_ref_pc,
df.x_alt_ref_pc, # first one is on the y axis
alpha=0.2, s=1, label=label, color=colors[ci])
# ax.text(0, 0,t1+"-"+t2)
alpha = .5
xl = [0, 2]
ax.plot(xl, xl, c='grey', alpha=alpha)
ax.axvline(1, c='grey', alpha=alpha)
ax.axhline(1, c='grey', alpha=alpha)
ax.set_xlim(xl)
ax.set_ylim(xl)
ax.set_xlabel(r"{} ($\Delta$ {})".format(motif_pair[1], motif_pair[0]))
ax.set_ylabel(r"{} ($\Delta$ {})".format(motif_pair[0], motif_pair[1]))
ax.set_title(task)
# Use grey spines
plt.setp(ax.spines.values(), color='grey')
plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
# BUG FIX: `raster=True` is not a Figure.savefig keyword (old matplotlib
# silently ignored it; modern versions raise). Rasterization is set
# per-artist via `rasterized=True` if needed.
fig.savefig(odir / f'{motif_pair_name},task={task}.bt-corrected.png')
fig.savefig(odir / f'{motif_pair_name},task={task}.bt-corrected.pdf')
dfabf_plot = dfabf2.query('center_diff > 5')
dfabf_plot.motif_pair.unique()
dfp = dfabf_plot.query("motif_pair == 'Oct4-Sox2<>Sox2'").query("score == 'max_profile_count_bt'").query("task == 'Sox2'")
dfp.shape
(ggplot(aes(x='center_diff'), data=dfp) +
geom_histogram() +
theme_bw()
)
# .query('match_weighted_p > .2').query('imp_weighted_p > .01')
(ggplot(aes(x='center_diff', fill='strand_combination'), data=dfp[dfp.center_diff < 150]) +
geom_histogram(breaks=np.arange(150)) +
theme_bw() +
facet_grid("strand_combination ~ .") +
scale_fill_brewer(type='qual', palette=3)
)
dfabf_plot.match_weighted_p_x.min()
dfabf_plot.match_weighted_p_y.min()
dfabf_plot.imp_weighted_p_x.min()
dfabf_plot.imp_weighted_p_y.min()
(ggplot(aes(x='center_diff', fill='strand_combination'), data=(dfabf_plot.
query("motif_pair == 'Nanog<>Nanog'").
query("score == 'max_profile_count_bt'").
query("task == 'Nanog'").
query("center_diff < 150"))) +
geom_histogram(breaks=np.arange(150)) +
theme_bw() +
facet_grid("strand_combination ~ .") +
scale_fill_brewer(type='qual', palette=3)
)
dfs = (dfabf_plot.query("motif_pair == 'Oct4-Sox2<>Sox2'").
query("score == 'max_profile_count_bt'").
query("task == 'Sox2'").
query("center_diff < 150"))
fig, axes = plt.subplots(2, 1, figsize=get_figsize(.5, 1/2), sharex=True)
axes[0].hist(dfs.center_diff, bins=np.arange(150))
df = dfs.groupby('center_diff').y_alt_ref_pc.median().reset_index()
axes[1].plot(df.center_diff, 1/df.y_alt_ref_pc);
axes[1].scatter(df.center_diff, 1/df.y_alt_ref_pc, s=2);
fig.subplots_adjust(hspace=0)
dfs = (dfabf_plot.query("motif_pair == 'Oct4-Sox2<>Nanog'").
query("score == 'max_profile_count_bt'").
query("task == 'Nanog'").
query("center_diff < 150"))
fig, axes = plt.subplots(2, 1, figsize=get_figsize(.5, 1/2), sharex=True)
axes[0].hist(dfs.center_diff, bins=np.arange(150))
df = dfs.groupby('center_diff').y_alt_ref_pc.median().reset_index()
axes[1].plot(df.center_diff, 1/df.y_alt_ref_pc);
axes[1].scatter(df.center_diff, 1/df.y_alt_ref_pc, s=2);
fig.subplots_adjust(hspace=0)
dfs = (dfabf_plot.query("motif_pair == 'Nanog<>Nanog'").
query("score == 'max_profile_count_bt'").
query("task == 'Nanog'").
query("center_diff < 150"))
fig, axes = plt.subplots(2, 1, figsize=get_figsize(.5, 1/2), sharex=True)
axes[0].hist(dfs.center_diff, bins=np.arange(150))
df = dfs.groupby('center_diff').y_alt_ref_pc.median().reset_index()
axes[1].plot(df.center_diff, 1/df.y_alt_ref_pc);
axes[1].scatter(df.center_diff, 1/df.y_alt_ref_pc, s=2);
axes[1].axhline(y=1, color='grey', alpha=0.5)
fig.subplots_adjust(hspace=0)
dfs = (dfabf_plot.query("motif_pair == 'Sox2<>Nanog'").
query("score == 'max_profile_count_bt'").
query("task == 'Nanog'").
query("center_diff < 150"))
fig, axes = plt.subplots(2, 1, figsize=get_figsize(1, 1/2), sharex=True)
axes[0].hist(dfs.center_diff, bins=np.arange(150))
df = dfs.groupby('center_diff').y_alt_ref_pc.median().reset_index()
axes[1].plot(df.center_diff, 1/df.y_alt_ref_pc);
axes[1].scatter(df.center_diff, 1/df.y_alt_ref_pc, s=2);
axes[1].axhline(y=1, color='grey', alpha=0.5)
fig.subplots_adjust(hspace=0)
# Y
plt.figure()
h, _, _, _ = plt.hist2d(dfabf_plot.match_weighted_p_x, dfabf_plot.imp_weighted_p_x)
cb = plt.colorbar()
cb.set_label('counts in bin')
plt.xlabel("Match")
plt.ylabel("imp")
# plt.figure()
# # Y
# h, _, _, _ = plt.hist2d(dfabf_plot.match_weighted_p_y, dfabf_plot.imp_weighted_p_y)
# cb = plt.colorbar()
# cb.set_label('counts in bin')
# plt.xlabel("Match")
# plt.ylabel("imp")
# Y
plt.figure()
nn = ~dfabf_plot.seq_match_p_x.isna()
h, _, _, _ = plt.hist2d(dfabf_plot.match_weighted_p_x[nn], dfabf_plot.seq_match_p_x[nn])
cb = plt.colorbar()
cb.set_label('counts in bin')
plt.xlabel("Match")
plt.ylabel("imp")
# plt.figure()
# # Y
# h, _, _, _ = plt.hist2d(dfabf_plot.match_weighted_p_y, dfabf_plot.imp_weighted_p_y)
# cb = plt.colorbar()
# cb.set_label('counts in bin')
# plt.xlabel("Match")
# plt.ylabel("imp")
values, buckets, _ = plt.hist(np.random.randn(100))
dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median'])
df1
motif_pair = ['Oct4-Sox2', 'Sox2']
motif_pair_name = '<>'.join(motif_pair)
score = 'max_profile_count_bt'
dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
query(f"score == '{score}'").
query("center_diff > 6").
query("center_diff < 150"))
dfs[dfs.task == 'Sox2'].y_ref.plot.hist(bins=100)
np.percentile(dfs[dfs.task == 'Sox2'].y_ref, 90)
np.sum(dfs[dfs.task == 'Sox2'].y_ref > .25)
dfs[dfs.task == 'Sox2'].x_ref.plot.hist(bins=100)
np.percentile(dfs[dfs.task == 'Sox2'].x_ref, 90)
np.sum(dfs.x_ref > 1)
np.sum((dfs.x_ref > .5) & (dfs.y_ref > .5))
np.sum(dfs.y_ref > 1)
dfs.head()
dfabf_plot.head()
df1['median'].shape
len(dfs)
len(df1)
dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
dfs.task.value_counts()
dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
# Single-pair figure (motif_pair = ['Oct4-Sox2', 'Sox2'] from the cell above):
# row 0 = pairwise-distance histogram; row 1 = 1/median of x/y_alt_ref_pc vs.
# distance, keeping only instances with both x_ref and y_ref above .25.
fig, axes = plt.subplots(2, 1, figsize=get_figsize(.5, 1), sharex=True)
for j, score in enumerate(['max_profile_count_bt']):
    motif_pair_name = '<>'.join(motif_pair)
    dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
           query(f"score == '{score}'").
           query("center_diff > 6").
           query("center_diff < 150"))
    # Require a minimum reference importance on BOTH motifs of the pair.
    dfs = dfs[(dfs.x_ref > .25) & (dfs.y_ref > .25)]
    # dfs = dfs[(dfs.x_ref > .25)]
    if j == 0:
        # Distance histogram does not depend on `score`; drawn once.
        values, buckets, _ = axes[0].hist(dfs.center_diff, bins=np.arange(150))
    t1 = profile_mapping[motif_pair[0]]
    t2 = profile_mapping[motif_pair[1]]
    # Per-distance group size / median / MAD of the alt-vs-ref change.
    df1 = dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
    #df1 = df1[df1['size'] > 20]
    df2 = dfs[dfs.task == t2].groupby('center_diff').y_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
    #df2 = df2[df2['size'] > 20]
    # Plot #1
    axes[j+1].plot(df1.center_diff, 1/df1['median'], label=motif_pair[0]);
    # axes[j+1].fill_between(df1.center_diff,
    #                        1/(df1['median']+df1['mad']/np.sqrt(df1['size'])),
    #                        1/(df1['median']-df1['mad']/np.sqrt(df1['size'])), alpha=0.1);
    axes[j+1].scatter(df1.center_diff, 1/df1['median'], s=2);
    # Plot #2
    axes[j+1].plot(df2.center_diff, 1/df2['median'], label=motif_pair[1]);
    # axes[j+1].fill_between(df2.center_diff,
    #                        1/(df2['median']+df2['mad']/np.sqrt(df2['size'])),
    #                        1/(df2['median']-df2['mad']/np.sqrt(df2['size'])), alpha=0.1);
    axes[j+1].scatter(df2.center_diff, 1/df2['median'], s=2);
    axes[j+1].set_ylim([.8, 4])
    axes[j+1].axhline(y=1, color='grey', alpha=0.5)
    axes[j+1].set_ylabel(score, rotation=0, ha='right')
plt.legend(labels=motif_pair)
fig.subplots_adjust(hspace=0)
np.percentile([1, 2,3 , 5], 40)
# Leftover interactive binding; immediately rebound by the loop below.
motif_pair = ['Oct4-Sox2', 'Oct4']
for motif_pair in pairs:
    # One figure per motif pair: row 0 = pairwise-distance histogram;
    # rows 1-2 = 1/median of the alt-vs-ref change per distance, one row per
    # score, restricted to instances whose reference importance exceeds its
    # 80th percentile.
    fig, axes = plt.subplots(3, 1, figsize=get_figsize(.5, 1), sharex=True)
    for j, score in enumerate(['max_profile_count_bt', 'max_profile_count']):
        motif_pair_name = '<>'.join(motif_pair)
        dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
               query(f"score == '{score}'").
               query("center_diff > 6").
               query("center_diff < 150"))
        if j == 0:
            # The distance histogram does not depend on `score`; draw it once.
            values, buckets, _ = axes[0].hist(dfs.center_diff, bins=np.arange(150))
        t1 = profile_mapping[motif_pair[0]]
        x_ref_threshold = np.percentile(dfs[dfs.task == t1].x_ref, 80)
        t2 = profile_mapping[motif_pair[1]]
        y_ref_threshold = np.percentile(dfs[dfs.task == t2].y_ref, 80)
        df1 = (dfs[(dfs.task == t1) & (dfs.x_ref > x_ref_threshold)]
               .groupby('center_diff').x_alt_ref_pc
               .agg(['size', 'median', 'mad'])
               .reset_index())
        df2 = (dfs[(dfs.task == t2) & (dfs.y_ref > y_ref_threshold)]
               .groupby('center_diff').y_alt_ref_pc
               .agg(['size', 'median', 'mad'])
               .reset_index())
        ax = axes[j + 1]
        for curve, curve_label in ((df1, motif_pair[0]), (df2, motif_pair[1])):
            ax.plot(curve.center_diff, 1 / curve['median'], label=curve_label)
            ax.scatter(curve.center_diff, 1 / curve['median'], s=2)
        ax.set_ylim([0, 3])
        ax.axhline(y=1, color='grey', alpha=0.5)
        ax.set_ylabel(score, rotation=0, ha='right')
    plt.legend(labels=motif_pair)
    fig.subplots_adjust(hspace=0)
# Leftover interactive binding; immediately rebound by the loop below.
motif_pair = ['Oct4-Sox2', 'Oct4']
for motif_pair in pairs:
    # Same figure as above but without the importance threshold; instead,
    # distances supported by <= 20 instances are dropped as too noisy.
    fig, axes = plt.subplots(3, 1, figsize=get_figsize(.5, 1), sharex=True)
    for j, score in enumerate(['max_profile_count_bt', 'max_profile_count']):
        motif_pair_name = '<>'.join(motif_pair)
        dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
               query(f"score == '{score}'").
               query("center_diff > 6").
               query("center_diff < 150"))
        if j == 0:
            # Distance histogram is score-independent; draw it once.
            values, buckets, _ = axes[0].hist(dfs.center_diff, bins=np.arange(150))
        t1 = profile_mapping[motif_pair[0]]
        t2 = profile_mapping[motif_pair[1]]
        df1 = (dfs[dfs.task == t1]
               .groupby('center_diff').x_alt_ref_pc
               .agg(['size', 'median', 'mad'])
               .reset_index())
        df1 = df1.loc[df1['size'] > 20]
        df2 = (dfs[dfs.task == t2]
               .groupby('center_diff').y_alt_ref_pc
               .agg(['size', 'median', 'mad'])
               .reset_index())
        df2 = df2.loc[df2['size'] > 20]
        ax = axes[j + 1]
        for curve, curve_label in ((df1, motif_pair[0]), (df2, motif_pair[1])):
            ax.plot(curve.center_diff, 1 / curve['median'], label=curve_label)
            ax.scatter(curve.center_diff, 1 / curve['median'], s=2)
        ax.set_ylim([0, 2])
        ax.axhline(y=1, color='grey', alpha=0.5)
        ax.set_ylabel(score, rotation=0, ha='right')
    plt.legend(labels=motif_pair)
    fig.subplots_adjust(hspace=0)
# NOTE(review): this cell is byte-for-byte identical to the previous one
# (distance histogram + per-score 1/median effect for every motif pair, with
# the size > 20 filter) — likely an accidental copy/re-run; consider deleting
# one of the two.
motif_pair = ['Oct4-Sox2', 'Oct4']
for motif_pair in pairs:
    fig, axes = plt.subplots(3, 1, figsize=get_figsize(.5, 1), sharex=True)
    for j, score in enumerate(['max_profile_count_bt', 'max_profile_count']):
        motif_pair_name = '<>'.join(motif_pair)
        dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
               query(f"score == '{score}'").
               query("center_diff > 6").
               query("center_diff < 150"))
        if j == 0:
            values, buckets, _ = axes[0].hist(dfs.center_diff, bins=np.arange(150))
        t1 = profile_mapping[motif_pair[0]]
        t2 = profile_mapping[motif_pair[1]]
        df1 = dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
        df1 = df1[df1['size'] > 20]
        df2 = dfs[dfs.task == t2].groupby('center_diff').y_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
        df2 = df2[df2['size'] > 20]
        # Plot #1
        axes[j+1].plot(df1.center_diff, 1/df1['median'], label=motif_pair[0]);
        # axes[j+1].fill_between(df1.center_diff,
        #                        1/(df1['median']+df1['mad']/np.sqrt(df1['size'])),
        #                        1/(df1['median']-df1['mad']/np.sqrt(df1['size'])), alpha=0.1);
        axes[j+1].scatter(df1.center_diff, 1/df1['median'], s=2);
        # Plot #2
        axes[j+1].plot(df2.center_diff, 1/df2['median'], label=motif_pair[1]);
        # axes[j+1].fill_between(df2.center_diff,
        #                        1/(df2['median']+df2['mad']/np.sqrt(df2['size'])),
        #                        1/(df2['median']-df2['mad']/np.sqrt(df2['size'])), alpha=0.1);
        axes[j+1].scatter(df2.center_diff, 1/df2['median'], s=2);
        axes[j+1].set_ylim([0, 2])
        axes[j+1].axhline(y=1, color='grey', alpha=0.5)
        axes[j+1].set_ylabel(score, rotation=0, ha='right')
    plt.legend(labels=motif_pair)
    fig.subplots_adjust(hspace=0)
# Display the available strand combinations (notebook display only).
dfabf_plot.strand_combination
from basepair.exp.paper.config import tf_colors
# Paper figure: short-range interactions among Nanog/Sox2 pairs. One panel per
# motif pair; each panel shows 1/median of *_alt_ref_pc vs. motif distance,
# with distances supported by <= 20 instances dropped.
# for strand_combination in ['++', '-+', '+-', '--']:
fig, axes = plt.subplots(3, 1, figsize=get_figsize(.25, 2),
                         sharex=True,
                         sharey=True,
                         gridspec_kw=dict(wspace=0, hspace=0))
# motif_pair = ['Oct4-Sox2', 'Sox2']
for i, motif_pair in enumerate([['Nanog', 'Nanog'],
                                ['Sox2', 'Nanog'],
                                ['Sox2', 'Sox2']]):
    motif_pair_name = '<>'.join(motif_pair)
    score = 'max_profile_count_bt'
    dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
           query(f"score == '{score}'").
           query("center_diff > 6").
           # query(f"strand_combination == '{strand_combination}'").
           query("center_diff < 150"))
    t1 = profile_mapping[motif_pair[0]]
    t2 = profile_mapping[motif_pair[1]]
    df1 = dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
    df1 = df1[df1['size'] > 20]
    df2 = dfs[dfs.task == t2].groupby('center_diff').y_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
    df2 = df2[df2['size'] > 20]
    ax = axes[i]
    # Colour each curve by the TF of the task it was measured on.
    ax.plot(df1.center_diff, 1/df1['median'],
            label=motif_pair[0],
            color=tf_colors[profile_mapping[motif_pair[0]]])
    ax.plot(df2.center_diff, 1/df2['median'],
            label=motif_pair[1],
            color=tf_colors[profile_mapping[motif_pair[1]]])
    if i == 0:
        # FIX: was an f-string with no placeholders (ruff F541).
        ax.set_title("Short-range interactions")
    ax.axhline(1, color='grey', alpha=0.5)
    if i == 1:
        ax.set_ylabel("Max profile value")
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.set_ylim([0.8, 2.5])
    ax.spines['right'].set_visible(False)
axes[-1].set_xlabel("Motif distance")
axes[-1].set_xticks([10, 50, 100, 150])
# FIX: was an f-string with no placeholders (ruff F541).
fig.savefig(figures_dir / 'short-range.Nanog<>Sox2.strand_combination=all.pdf')
# Companion figure: the Oct4-Sox2 composite paired with each of the other
# motifs, same construction as the Nanog/Sox2 figure above.
fig, axes = plt.subplots(3, 1, figsize=get_figsize(.25, 2),
                         sharex=True,
                         sharey=True,
                         gridspec_kw=dict(wspace=0, hspace=0))
for i, motif_pair in enumerate([['Oct4-Sox2', 'Sox2'],
                                ['Oct4-Sox2', 'Nanog'],
                                ['Oct4-Sox2', 'Klf4']]):
    motif_pair_name = '<>'.join(motif_pair)
    score = 'max_profile_count_bt'
    dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
           query(f"score == '{score}'").
           query("center_diff > 6").
           # query("strand_combination == '++'").
           query("center_diff < 150"))
    t1 = profile_mapping[motif_pair[0]]
    t2 = profile_mapping[motif_pair[1]]
    df1 = (dfs[dfs.task == t1]
           .groupby('center_diff').x_alt_ref_pc
           .agg(['size', 'median', 'mad'])
           .reset_index())
    df1 = df1.loc[df1['size'] > 20]  # drop sparsely supported distances
    df2 = (dfs[dfs.task == t2]
           .groupby('center_diff').y_alt_ref_pc
           .agg(['size', 'median', 'mad'])
           .reset_index())
    df2 = df2.loc[df2['size'] > 20]
    ax = axes[i]
    for curve, motif in ((df1, motif_pair[0]), (df2, motif_pair[1])):
        ax.plot(curve.center_diff, 1 / curve['median'],
                label=motif,
                color=tf_colors[profile_mapping[motif]])
    if i == 0:
        # NOTE(review): title says "Short-range" while the file below is saved
        # as "long-range" — confirm which is intended.
        ax.set_title("Short-range interactions")
    ax.axhline(1, color='grey', alpha=0.5)
    if i == 1:
        ax.set_ylabel("Max profile value")
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.set_ylim([0.8, 2.5])
    ax.spines['right'].set_visible(False)
axes[-1].set_xlabel("Motif distance")
axes[-1].set_xticks([10, 50, 100, 150])
fig.savefig(figures_dir / 'long-range.Oct4-Sox2.strand_combination=all.pdf')
# Modisco pattern summary table (per-pattern, per-task footprint statistics).
dfp = pd.read_csv("http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/pattern_table.csv")
dfp.head()
# Restrict to the curated motifs and re-index the rows by their readable
# names, preserving the order of the `motifs` OrderedDict.
m_hash = {pattern: name for name, pattern in motifs.items()}
dfp = dfp[dfp.pattern.isin(list(motifs.values()))]
dfp['pattern_name'] = dfp['pattern'].map(m_hash)
dfp = dfp.set_index('pattern_name').loc[list(motifs)]
# Columns holding the per-task footprint maxima, renamed to bare task names
# and ordered like `tasks`.
dfp_max = dfp[[c for c in dfp.columns if c.endswith(' footprint max')]]
dfp_max.columns = [c.split(" ")[0] for c in dfp_max.columns]
dfp_max = dfp_max[tasks]
dfp_max
# Per-task total footprint counts, columns renamed to bare task names and
# ordered like `tasks` (mirrors the dfp_max block above).
dfp_counts = dfp[[c for c in dfp.columns if c.endswith(' footprint counts')]]
# BUG FIX: this previously iterated `dfp_max.columns` (a different frame whose
# names were already stripped to bare task names). It only worked by
# coincidence — splitting an already-bare name is a no-op — and would silently
# mislabel columns if the 'max' and 'counts' column sets ever diverged.
dfp_counts.columns = [c.split(" ")[0] for c in dfp_counts.columns]
dfp_counts = dfp_counts[tasks]
dfp_counts
# Side-by-side heatmaps of the modisco footprint summaries (max vs. counts),
# sharing the motif axis; saved as a paper figure.
fig, axes = plt.subplots(1, 2, figsize=get_figsize(1, .35), sharey=True)
panels = [(dfp_max, "Motif", "Avg. max profile counts"),
          (dfp_counts, "", "Avg. total profile counts")]
for ax, (data, ylabel, title) in zip(axes, panels):
    sns.heatmap(data, cmap='Reds', ax=ax)
    ax.set_ylabel(ylabel)
    ax.set_xlabel("Task")
    ax.set_title(title)
plt.savefig(figures_dir / 'heatmap.total-profile-counts.pdf')
# Select Oct4-Sox2<>Sox2 instances at exactly 50 bp spacing (Sox2 task,
# 'max_profile_count_bt' score) to pull out individual example loci below.
dfs = (dfabf_plot
       .query("motif_pair == 'Oct4-Sox2<>Sox2'")
       .query("center_diff == 50")
       .query("score == 'max_profile_count_bt'")
       .query("task == 'Sox2'"))
# Look at the distribution of 1 / y_alt_ref_pc. The `task == 'Sox2'` mask is
# redundant with the query above (kept from the original exploration).
(1 / dfs[dfs.task == 'Sox2'].y_alt_ref_pc).hist()
1 / dfs[dfs.task == 'Sox2'].y_alt_ref_pc < 1.2
# Keep only instances with 1 / y_alt_ref_pc below 1.2.
dfs = dfs.loc[1 / dfs.y_alt_ref_pc < 1.2]
# NOTE: `ls` is an IPython line magic — this file only runs inside
# IPython/Jupyter, not as a plain Python script.
ls {model_dir}
# Importance scores computed with DeepLIFT; 'profile/wn' matches `imp_score`
# defined at the top of the notebook.
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5', default_imp_score='profile/wn')
isf.get_profiles()
# First look at a single example region.
profiles = isf.get_profiles(idx=100281)
imp_scores = isf.get_contrib(idx=100281)
plot_tracks(profiles);
plot_tracks(filter_tracks(imp_scores, [250, 600]));
plot_tracks(filter_tracks(imp_scores, [350, 500]));
# presumably selects GPU #2 — confirm create_tf_session's argument semantics
create_tf_session(2)
from basepair.BPNet import BPNetSeqModel
# Load the trained BPNet model so predictions can be recomputed from sequence.
bpnet = BPNetSeqModel.from_mdir(model_dir)
def _plot_example(idx, zoom_windows):
    """Plot predictions, observed profiles and importance-score tracks for one
    example region.

    Replaces six copy-pasted notebook cells that differed only in `idx` and
    the zoom windows.

    Args:
        idx: integer row index into the importance-score file `isf`.
        zoom_windows: list of [start, end] windows to zoom the importance
            tracks into, after one full-width importance plot.

    Returns:
        (profiles, imp_scores, seq, preds) so callers can keep inspecting the
        last example interactively, as the original cells did.
    """
    profiles = isf.get_profiles(idx=idx)
    imp_scores = isf.get_contrib(idx=idx)
    seq = isf.get_seq(idx=idx)
    # Re-predict from sequence; strip the batch dimension from each task head.
    preds = {k: v[0] for k, v in bpnet.predict(seq[np.newaxis]).items()}
    plot_tracks(preds)
    plt.xlim([0, 1000])
    plot_tracks(profiles)
    plt.xlim([0, 1000])
    plot_tracks(filter_tracks(imp_scores, None))
    for window in zoom_windows:
        plot_tracks(filter_tracks(imp_scores, window))
    return profiles, imp_scores, seq, preds

# Walk through selected examples ranked by y_alt_ref_pc; the bare dict(...)
# expressions display each row's metadata in the notebook.
dict(dfs.sort_values('y_alt_ref_pc').iloc[-2])
idx = 100281
profiles, imp_scores, seq, preds = _plot_example(idx, [[150, 350], [450, 680]])
dict(dfs.sort_values('y_alt_ref_pc').iloc[-3])
idx = 142628
profiles, imp_scores, seq, preds = _plot_example(idx, [[700, 900]])
dict(dfs.sort_values('y_alt_ref_pc').iloc[-4])
idx = 80917
profiles, imp_scores, seq, preds = _plot_example(idx, [[350, 600]])
dict(dfs.sort_values('y_alt_ref_pc').iloc[-5])
idx = 56054
profiles, imp_scores, seq, preds = _plot_example(idx, [[100, 200], [450, 650]])
dict(dfs.sort_values('y_alt_ref_pc').iloc[0])
idx = 56086
profiles, imp_scores, seq, preds = _plot_example(idx, [[450, 600], [450, 650]])
dict(dfs.sort_values('y_alt_ref_pc').iloc[1])
idx = 7895
profiles, imp_scores, seq, preds = _plot_example(idx, [[600, 800]])