from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'
motifs = OrderedDict([
("Oct4-Sox2", 'Oct4/m0_p0'),
("Oct4", 'Oct4/m0_p1'),
# ("Strange-sym-motif", 'Oct4/m0_p5'),
("Sox2", 'Sox2/m0_p1'),
("Nanog", 'Nanog/m0_p1'),
("Zic3", 'Nanog/m0_p2'),
("Nanog-partner", 'Nanog/m0_p4'),
("Klf4", 'Klf4/m0_p0'),
])
make a nicer Nanog plot where the periodicity can be seen
from basepair.imports import *
from basepair.exp.paper.config import *
from basepair.modisco.periodicity import compute_power_spectrum, periodicity_10bp_frac, plot_power_spectrum
import matplotlib.ticker as ticker
from basepair.plot.profiles import extract_signal
from basepair.utils import pd_col_prepend
from basepair.modisco.table import ModiscoData
paper_config()
# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/modisco/{exp}/periodicity/')
fdir.mkdir(parents=True, exist_ok=True)
task = 'Nanog'
modisco_dir = model_dir / f'deeplift/{task}/out/{imp_score}/'
mr = ModiscoResult(modisco_dir / 'modisco.h5')
p = mr.get_pattern("metacluster_0/pattern_1")
w = get_figsize(.6, 1/12)[0]
w / 70
# fig = plt.figure(figsize=get_figsize(.6, 1/12))
p.plot("contrib", letter_width=0.06, height=0.4);
sns.despine(top=True, right=True, bottom=True)
plt.savefig(fdir / 'm0_p1.contrib.pdf')
# fig = plt.figure(figsize=get_figsize(.6, 1/12))
p.plot("seq_ic", letter_width=0.06, height=0.4);
sns.despine(top=True, right=True, bottom=True)
plt.savefig(fdir / 'm0_p1.pfm.pdf')
md = ModiscoData.load(modisco_dir, None)
def plot_power_spectrum(pattern, task, data, figwidth=10):
seqlets = data.seqlets_per_task[pattern]
wide_seqlets = [s.resize(data.footprint_width)
for s in seqlets
if s.center() > data.footprint_width // 2 and
s.center() < data.get_seqlen(pattern) - data.footprint_width // 2
]
p = extract_signal(data.get_region_grad(task, 'profile'), wide_seqlets)
agg_profile = np.log(np.abs(p).sum(axis=-1).sum(axis=0))
heatmap_importance_profile(normalize(np.abs(p).sum(axis=-1)[:500], pmin=50, pmax=99), figsize=(figwidth, 20))
heatmap_fig = plt.gcf()
# heatmap_importance_profile(np.abs(p*seq).sum(axis=-1)[:500], figsize=(10, 20))
agg_profile = agg_profile - agg_profile.mean()
agg_profile = agg_profile / agg_profile.std()
freq = np.fft.fftfreq(agg_profile[102:].shape[-1])
smooth_part = smooth(agg_profile, 10)
oscilatory_part = agg_profile - smooth_part
avg_fig, axes = plt.subplots(2, 1, figsize=(11, 4), sharex=True)
axes[0].plot(agg_profile, label='original')
axes[0].plot(smooth_part, label="smooth", alpha=0.5)
axes[0].legend()
axes[0].set_ylabel("Avg. importance")
axes[0].set_title("Average importance score")
# axes[0].set_xlabel("Position");
axes[1].plot(oscilatory_part)
axes[1].set_xlabel("Position")
axes[1].set_ylabel("original - smooth")
avg_fig.subplots_adjust(hspace=0) # no space between plots
# plt.savefig('nanog-agg-profile.png', dpi=300)
# plt.savefig('nanog-agg-profile.pdf')
fft_fig = plt.figure(figsize=(11, 2))
plt.plot(1 / freq[:49], np.abs(np.fft.fft(oscilatory_part[102:])[:49])**2 + np.abs(np.fft.fft(oscilatory_part[:98])[:49])**2, "-o")
plt.xlim([0, 50])
plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(25, integer=True))
plt.grid(alpha=0.3)
plt.xlabel("1/Frequency [bp]")
plt.ylabel("Power spectrum")
plt.title("Power spectrum")
plt.gcf().subplots_adjust(bottom=0.4)
return heatmap_fig, avg_fig, fft_fig
from basepair.modisco.periodicity import *
pattern='metacluster_0/pattern_1'
task="Nanog"
data=md
seqlets = data.seqlets_per_task[pattern]
wide_seqlets = [s.resize(data.footprint_width)
for s in seqlets
if s.center() > data.footprint_width // 2 and
s.center() < data.get_seqlen(pattern) - data.footprint_width // 2
]
p = extract_signal(data.get_region_grad(task, 'profile'), wide_seqlets)
agg_profile = np.log(np.abs(p).sum(axis=-1).sum(axis=0))
agg_profile = agg_profile - agg_profile.mean()
agg_profile = agg_profile / agg_profile.std()
freq = np.fft.fftfreq(agg_profile[102:].shape[-1])
smooth_part = smooth(agg_profile, 10)
oscilatory_part = agg_profile - smooth_part
agg_profile
pd.DataFrame()
ls ~/gdrive/projects/chipnexus/data/
agg_profile.argmax()
periodicity_dir = Path('/users/avsec/gdrive/projects/chipnexus/data/periodicity/')
periodicity_dir.mkdir(exist_ok=True)
np.savetxt(periodicity_dir / 'aggregated-signal.Nanog.txt', agg_profile)
agg_profile = np.loadtxt(periodicity_dir / 'aggregated-signal.Nanog.txt')
smooth_part = np.loadtxt(periodicity_dir / 'aggregated-signal-minus-smoothed.Nanog.txt')
smooth_part
np.savetxt(periodicity_dir / 'aggregated-signal-minus-smoothed.Nanog.txt', smooth_part)
ax.plot(agg_profile, label='original')
ax.plot(smooth_part, label="smooth", alpha=0.5)
fig, axes = plt.subplots(1, 1,
figsize=get_figsize(.5, ))
heatmap_importance_profile(normalize(np.abs(p).sum(axis=-1)[:500], pmin=50, pmax=99), aspect='auto',
ax=axes)
fig.savefig(fdir / 'Nanog.p1.heatmap.pdf')
fig.savefig(fdir / 'Nanog.p1.heatmap.png')
fig, axes = plt.subplots(3, 1,
gridspec_kw=dict(height_ratios=[4, 1, 1]),
figsize=get_figsize(.5, 1))
ax = axes[0]
heatmap_importance_profile(normalize(np.abs(p).sum(axis=-1)[:500], pmin=50, pmax=99), aspect='auto',
ax=ax)
ax.get_xaxis().set_visible(False)
ax=axes[1]
ax.plot(agg_profile, label='original')
ax.plot(smooth_part, label="smooth", alpha=0.5)
ax.legend()
ax.set_ylabel("Avg. importance")
# ax.set_title("Average importance score")
ax=axes[2]
# axes[0].set_xlabel("Position");
ax.plot(oscilatory_part)
ax.set_xlabel("Position")
ax.set_ylabel("original - smooth")
ax.set_xlim([0, 200])
ax.set_xlabel("Position")
ax.set_xticklabels([-100, -50, 0, 50, 100])
fig.subplots_adjust(hspace=0) # no space between plots
fig.savefig(fdir / 'Nanog.p1.heatmap+avg+norm.pdf')
fig.savefig(fdir / 'Nanog.p1.heatmap+avg+norm.png')
fig, axes = plt.subplots(2, 1,
gridspec_kw=dict(height_ratios=[4, 1]),
figsize=get_figsize(.5, 1))
ax = axes[0]
heatmap_importance_profile(normalize(np.abs(p).sum(axis=-1)[:500], pmin=50, pmax=99), aspect='auto',
ax=ax)
ax.get_xaxis().set_visible(False)
ax=axes[1]
ax.plot(agg_profile, label='original')
ax.plot(smooth_part, label="smooth", alpha=0.5)
ax.legend()
ax.set_ylabel("Avg. importance")
ax.set_xlim([0, 200])
ax.set_xlabel("Position")
ax.set_xticklabels([-100, -50, 0, 50, 100])
# ax.set_title("Average importance score")
fig.subplots_adjust(hspace=0) # no space between plots
fig.savefig(fdir / 'Nanog.p1.heatmap+avg.pdf')
fig.savefig(fdir / 'Nanog.p1.heatmap+avg.png')
fft_fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.5, 1/5))
x = 1 / freq[:49]
y = np.abs(np.fft.fft(oscilatory_part[102:])[:49])**2 + np.abs(np.fft.fft(oscilatory_part[:98])[:49])**2
t0 = x[np.argmax(y)]
ax.plot(x, y)
ax.scatter(x, y, s=10)
ax.set_xlim([0, 50])
ax.xaxis.set_major_locator(ticker.MaxNLocator(10, integer=True))
# ax.grid(alpha=0.3)
ax.axvline(x=t0, color='grey', alpha=0.5)
ax.set_xlabel("1/Frequency [bp]")
ax.set_ylabel("Power spectrum");
fft_fig.savefig(fdir / 'Nanog.p1.fft.pdf')
fft_fig.savefig(fdir / 'Nanog.p1.fft.png')
fft_fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.5, 1/5))
x = 1 / freq[:49]
freq_ebar = np.diff(freq[:49])[0] / 2
y = np.abs(np.fft.fft(oscilatory_part[102:])[:49])**2 + np.abs(np.fft.fft(oscilatory_part[:98])[:49])**2
t0 = x[np.argmax(y)]
markerline, stemlines, baseline = ax.stem(x, y, linefmt='-', bottom=0.5)
plt.setp(baseline, linestyle="-", color="grey", linewidth=6)
# plt.setp(baseline, color='grey', linewidth=2)
# ax.scatter(x, y, s=10)
ax.set_xlim([0, 50])
#ax.xaxis.set_major_locator(ticker.MaxNLocator(10, integer=True))
# ax.grid(alpha=0.3)
#ax.axvline(x=t0, color='grey', alpha=0.5)
ax.set_xlabel("1/Frequency [bp]")
ax.set_ylabel("Power spectrum");
# fft_fig.savefig(fdir / 'Nanog.p1.fft.pdf')
# fft_fig.savefig(fdir / 'Nanog.p1.fft.png')
sns.despine(top=True, bottom=True, right=True)
oscilatory_part = agg_profile - smooth_part
# Compute the
freq_ebar = np.diff(freq)[0] / 2
lower = 1 / (freq[:49][np.argmax(y)] + freq_ebar)
center = t0
upper = 1 / (freq[:49][np.argmax(y)] - freq_ebar)
ebar = upper - center
print(f"t0: {center} +- {ebar}")
from basepair.utils import pd_col_prepend
def pd_column_replace(df):
df.columns = [c.split(" ")[0] for c in df.columns]
return df
dfp = pd.concat([
(pd.read_csv(model_dir / f'deeplift/{tf}/out/{imp_score}/pattern_table.csv')
.pipe(pd_col_prepend, 'pattern', tf + '/').set_index("pattern"))
for tf in tasks
])
dfps = dfp[[f'{task} periodicity 10bp' for task in tasks]].pipe(pd_column_replace)
dfp= dfp[(dfp['n seqlets'] > 100)]
dfp['seqlen'] = dfp['consensus'].str.len()
dfp['ic'] = dfp['ic pwm mean'] * dfp['seqlen']
dfp['footprint counts'] = sum([dfp[f'{t} footprint counts']
for t in tasks])
dfp['footprint max'] = sum([dfp[f'{t} footprint max']
for t in tasks])
dfp['region counts'] = sum([dfp[f'{t} region counts']
for t in tasks])
dfp.plot.scatter('ic', 'footprint max')
dfp.plot.scatter('seqlen', 'footprint max')
dfpsm = pd.melt(dfps.reset_index(), id_vars='pattern', value_vars=tasks, var_name='TF', value_name='10bp periodicity')
dfpsm['TF'] = pd.Categorical(dfpsm['TF'], categories=tasks, ordered=True)
dfpsm['task'] = dfpsm.pattern.str.split('/', expand=True).iloc[:, 0]
dfpsm.to_csv(fdir / 'dfpsm.csv')
a=1
tf_colors
fig, ax = plt.subplots(figsize=get_figsize(0.5, 4))
sns.heatmap(dfps.iloc[np.argsort(-dfps.max(axis=1))], ax=ax)
fig.savefig(fdir / '10bp-periodicity.heatmap.pdf')
print(dfpsm[dfpsm.TF == dfpsm.task].sort_values('10bp periodicity', ascending=False).groupby('TF').head(5).to_string())
plotnine.options.figure_size = get_figsize(.25, aspect=1.5)# (10, 10)
fig = (ggplot(aes(x='TF', y='10bp periodicity', color='TF'),
dfpsm[dfpsm.TF == dfpsm.task]) +
geom_boxplot(color='grey') +
geom_jitter(size=1, alpha=0.8) +
theme_classic(base_size=10, base_family='Arial') +
theme(legend_position='top') +
scale_color_manual(values=[tf_colors[t] for t in tasks]))
fig.save(fdir / '10bp-periodicity.boxplot.pdf')
fig
dfpsm
dfpsm['short'] = dfpsm.pattern.str.split("/", expand=True)[1].str.replace("m0_p", "")
(p9.ggplot(mtcars, p9.aes('mpg','disp')) + p9.geom_point()
+ p9.geom_text(p9.aes(label='name'), color='blue', position = p9.position_adjust_text(), size=7)
)
import plotnine as p9
plotnine.options.figure_size = get_figsize(.25, aspect=1.5)# (10, 10)
fig = (ggplot(aes(x='TF', y='10bp periodicity', color='TF'),
dfpsm[dfpsm.TF == dfpsm.task]) +
geom_boxplot(color='grey', size=0.1) +
geom_text(aes(label='short'), position = 'jitter', size=6) +
#geom_jitter(size=1, alpha=0.8) +
theme_classic(base_size=10, base_family='Arial') +
theme(legend_position='top') +
scale_color_manual(values=[tf_colors[t] for t in tasks]))
# fig.save(fdir / '10bp-periodicity.boxplot.pdf')
fig
from adjustText import adjust_text
plotnine.options.figure_size = get_figsize(.5, aspect=1.5)# (10, 10)
fig = (ggplot(aes(x='TF', y='10bp periodicity', color='TF'),
dfpsm[dfpsm.TF == dfpsm.task]) +
geom_boxplot(color='grey', size=0.1) +
geom_text(aes(label='short'), position = 'jitter', size=9) +
#geom_jitter(size=1, alpha=0.8) +
theme_classic(base_size=10, base_family='Arial') +
theme(legend_position='top') +
scale_color_manual(values=[tf_colors[t] for t in tasks]))
# fig.save(fdir / '10bp-periodicity.boxplot.pdf')
fig
f = fig.draw()
for a in f.axes:
texts = [t for t in a.texts]
adjust_text(texts,ax=a, only_move={"text": 'x', 'points': 'x'})
plt.savefig(fdir / '10bp-periodicity.boxplot.w-labels.pdf')
dfpsm
dfp['task'] = dfp.reset_index().pattern.str.split("/", expand=True)[0]
dfp['metacluster'] = pd.Categorical(dfp.task, ordered=True)
dfp['log n seqlets'] = np.log10(dfp['n seqlets'])
# filter
dfp = dfp[dfp['n seqlets'] >= 100]
dfp_short = dfp.copy().reset_index()
dfp_short = dfp.copy().reset_index()
dfp_short = dfp_short[dfp_short.pattern.isin(motifs.values())]
dfp_short['pattern'] = dfp_short.pattern.map({v:k for k,v in motifs.items()})
dfx, row_df, col_df = preproc_motif_table(dfp_short, tasks)
x = scale(dfx).T
# x = dfx.T
g = sns.clustermap(x, row_colors=to_colors(col_df), col_colors=to_colors(row_df), method="weighted", figsize=(5, 10), cmap='RdBu_r', center=0);
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale
dfx, row_df, col_df = preproc_motif_table(dfp.reset_index(), tasks)
x = scale(dfx).T
# x = dfx.T
g = sns.clustermap(x, row_colors=to_colors(col_df), col_colors=to_colors(row_df), method="weighted", figsize=(20, 10), cmap='RdBu_r', center=0);