In [1]:
from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'

imp_score = 'profile/wn'

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    ("Oct4", 'Oct4/m0_p1'),
    # ("Strange-sym-motif", 'Oct4/m0_p5'),
    ("Sox2", 'Sox2/m0_p1'),
    ("Nanog", 'Nanog/m0_p1'),
    ("Zic3", 'Nanog/m0_p2'),
    ("Nanog-partner", 'Nanog/m0_p4'),
    ("Klf4", 'Klf4/m0_p0'),
])

Goal

make a nicer Nanog plot where the periodicity can be seen

In [2]:
from basepair.imports import *
from basepair.exp.paper.config import *
from basepair.modisco.periodicity import compute_power_spectrum, periodicity_10bp_frac, plot_power_spectrum

import matplotlib.ticker as ticker
from basepair.plot.profiles import extract_signal
from basepair.utils import pd_col_prepend
from basepair.modisco.table import ModiscoData
Using TensorFlow backend.
In [3]:
paper_config()
In [4]:
# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/modisco/{exp}/periodicity/')
fdir.mkdir(parents=True, exist_ok=True)
In [5]:
task = 'Nanog'
modisco_dir = model_dir / f'deeplift/{task}/out/{imp_score}/'
In [6]:
mr = ModiscoResult(modisco_dir / 'modisco.h5')
In [7]:
p = mr.get_pattern("metacluster_0/pattern_1")
In [8]:
w = get_figsize(.6, 1/12)[0]
In [9]:
w / 70
Out[9]:
0.05871754285714286
In [10]:
# fig = plt.figure(figsize=get_figsize(.6, 1/12))
p.plot("contrib", letter_width=0.06, height=0.4);
sns.despine(top=True, right=True, bottom=True)
plt.savefig(fdir / 'm0_p1.contrib.pdf')
In [11]:
# fig = plt.figure(figsize=get_figsize(.6, 1/12))
p.plot("seq_ic", letter_width=0.06, height=0.4);
sns.despine(top=True, right=True, bottom=True)
plt.savefig(fdir / 'm0_p1.pfm.pdf')
TF-MoDISco is using the TensorFlow backend.
In [12]:
md = ModiscoData.load(modisco_dir, None)
100%|██████████| 19/19 [1:04:30<00:00, 202.69s/it]
In [13]:
def plot_power_spectrum(pattern, task, data, figwidth=10):
    seqlets = data.seqlets_per_task[pattern]
    wide_seqlets = [s.resize(data.footprint_width)
                    for s in seqlets
                    if s.center() > data.footprint_width // 2 and
                    s.center() < data.get_seqlen(pattern) - data.footprint_width // 2
                    ]
    p = extract_signal(data.get_region_grad(task, 'profile'), wide_seqlets)

    agg_profile = np.log(np.abs(p).sum(axis=-1).sum(axis=0))
    heatmap_importance_profile(normalize(np.abs(p).sum(axis=-1)[:500], pmin=50, pmax=99), figsize=(figwidth, 20))
    heatmap_fig = plt.gcf()
    # heatmap_importance_profile(np.abs(p*seq).sum(axis=-1)[:500], figsize=(10, 20))

    agg_profile = agg_profile - agg_profile.mean()
    agg_profile = agg_profile / agg_profile.std()
    freq = np.fft.fftfreq(agg_profile[102:].shape[-1])

    smooth_part = smooth(agg_profile, 10)
    oscilatory_part = agg_profile - smooth_part

    avg_fig, axes = plt.subplots(2, 1, figsize=(11, 4), sharex=True)
    axes[0].plot(agg_profile, label='original')
    axes[0].plot(smooth_part, label="smooth", alpha=0.5)
    axes[0].legend()
    axes[0].set_ylabel("Avg. importance")
    axes[0].set_title("Average importance score")
    # axes[0].set_xlabel("Position");
    axes[1].plot(oscilatory_part)
    axes[1].set_xlabel("Position")
    axes[1].set_ylabel("original - smooth")
    avg_fig.subplots_adjust(hspace=0)  # no space between plots
    # plt.savefig('nanog-agg-profile.png', dpi=300)
    # plt.savefig('nanog-agg-profile.pdf')

    fft_fig = plt.figure(figsize=(11, 2))
    plt.plot(1 / freq[:49], np.abs(np.fft.fft(oscilatory_part[102:])[:49])**2 + np.abs(np.fft.fft(oscilatory_part[:98])[:49])**2, "-o")
    plt.xlim([0, 50])
    plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(25, integer=True))
    plt.grid(alpha=0.3)
    plt.xlabel("1/Frequency [bp]")
    plt.ylabel("Power spectrum")
    plt.title("Power spectrum")
    plt.gcf().subplots_adjust(bottom=0.4)
    return heatmap_fig, avg_fig, fft_fig
In [14]:
from basepair.modisco.periodicity import *
pattern='metacluster_0/pattern_1'
task="Nanog"
data=md

seqlets = data.seqlets_per_task[pattern]
wide_seqlets = [s.resize(data.footprint_width)
                for s in seqlets
                if s.center() > data.footprint_width // 2 and
                s.center() < data.get_seqlen(pattern) - data.footprint_width // 2
                ]
p = extract_signal(data.get_region_grad(task, 'profile'), wide_seqlets)

agg_profile = np.log(np.abs(p).sum(axis=-1).sum(axis=0))
agg_profile = agg_profile - agg_profile.mean()
agg_profile = agg_profile / agg_profile.std()
freq = np.fft.fftfreq(agg_profile[102:].shape[-1])

smooth_part = smooth(agg_profile, 10)
oscilatory_part = agg_profile - smooth_part
In [15]:
agg_profile
Out[15]:
array([-1.3127, -1.3029, -1.2026, -1.1925, -1.2354, -1.1362, -1.217 , -1.2383, -1.1977, -1.1722,
       -1.0307, -0.9261, -0.8692, -0.8143, -0.8409, -0.9097, -0.9606, -1.0243, -1.0351, -1.012 ,
       ..., -1.1064, -1.0156, -0.962 , -0.9978, -1.0171, -1.0258, -0.9766, -1.1623, -1.2684,
       -1.3583, -1.4058, -1.3792, -1.2338, -1.2795, -1.2181, -1.178 , -1.4184, -1.3415, -1.4265,
       -1.5904], dtype=float32)
In [16]:
pd.DataFrame()
Out[16]:
In [17]:
ls ~/gdrive/projects/chipnexus/data/
bpnet/                     motif_clustering.csv
ChIP-Nexus Pipeline.gddoc  motif_clustering.gdsheet
cluster.gdsheet            motif_clustering_old2.gdsheet
data-sheet.gdsheet         motif_clustering_old.gdsheet
dfab.csv.gz                periodicity/
dfabf.csv.gz               profile_aggregated.png
dfi_subset.csv.gz          Regulatory regions of interest in mESCs.gdsheet
dfs.csv.gz                 Regulatory regions of interest in mESCs.xlsx
In [18]:
agg_profile.argmax()
Out[18]:
98
In [19]:
periodicity_dir = Path('/users/avsec/gdrive/projects/chipnexus/data/periodicity/')
In [20]:
periodicity_dir.mkdir(exist_ok=True)
In [23]:
np.savetxt(periodicity_dir / 'aggregated-signal.Nanog.txt', agg_profile)
In [35]:
agg_profile = np.loadtxt(periodicity_dir / 'aggregated-signal.Nanog.txt')
smooth_part = np.loadtxt(periodicity_dir / 'aggregated-signal-minus-smoothed.Nanog.txt')
In [34]:
smooth_part
Out[34]:
array([-0.6246, -0.7382, -0.8599, -0.9838, -1.1035, -1.2207, -1.1925, -1.1549, -1.1215, -1.0837,
       -1.0443, -1.0216, -0.996 , -0.9746, -0.9583, -0.9423, -0.9286, -0.9066, -0.8769, -0.8386,
       ..., -1.0157, -1.039 , -1.0406, -1.0503, -1.0679, -1.089 , -1.119 , -1.1553, -1.1825,
       -1.2107, -1.2308, -1.246 , -1.2902, -1.3081, -1.3239, -1.3471, -1.2065, -1.0686, -0.9452,
       -0.8173])
In [25]:
np.savetxt(periodicity_dir / 'aggregated-signal-minus-smoothed.Nanog.txt', smooth_part)
In [ ]:
ax.plot(agg_profile, label='original')
ax.plot(smooth_part, label="smooth", alpha=0.5)
In [263]:
fig, axes = plt.subplots(1, 1, 
                         figsize=get_figsize(.5, ))
heatmap_importance_profile(normalize(np.abs(p).sum(axis=-1)[:500], pmin=50, pmax=99), aspect='auto',
                           ax=axes)
fig.savefig(fdir / 'Nanog.p1.heatmap.pdf')
fig.savefig(fdir / 'Nanog.p1.heatmap.png')
In [281]:
fig, axes = plt.subplots(3, 1, 
                         gridspec_kw=dict(height_ratios=[4, 1, 1]),
                         figsize=get_figsize(.5, 1))
ax = axes[0]
heatmap_importance_profile(normalize(np.abs(p).sum(axis=-1)[:500], pmin=50, pmax=99), aspect='auto',
                           ax=ax)
ax.get_xaxis().set_visible(False)
ax=axes[1]
ax.plot(agg_profile, label='original')
ax.plot(smooth_part, label="smooth", alpha=0.5)
ax.legend()
ax.set_ylabel("Avg. importance")
# ax.set_title("Average importance score")

ax=axes[2]
# axes[0].set_xlabel("Position");
ax.plot(oscilatory_part)
ax.set_xlabel("Position")
ax.set_ylabel("original - smooth")
ax.set_xlim([0, 200])
ax.set_xlabel("Position")
ax.set_xticklabels([-100, -50, 0, 50, 100])
fig.subplots_adjust(hspace=0)  # no space between plots
fig.savefig(fdir / 'Nanog.p1.heatmap+avg+norm.pdf')
fig.savefig(fdir / 'Nanog.p1.heatmap+avg+norm.png')
In [280]:
fig, axes = plt.subplots(2, 1, 
                         gridspec_kw=dict(height_ratios=[4, 1]),
                         figsize=get_figsize(.5, 1))
ax = axes[0]
heatmap_importance_profile(normalize(np.abs(p).sum(axis=-1)[:500], pmin=50, pmax=99), aspect='auto',
                           ax=ax)
ax.get_xaxis().set_visible(False)

ax=axes[1]
ax.plot(agg_profile, label='original')
ax.plot(smooth_part, label="smooth", alpha=0.5)
ax.legend()
ax.set_ylabel("Avg. importance")
ax.set_xlim([0, 200])
ax.set_xlabel("Position")
ax.set_xticklabels([-100, -50, 0, 50, 100])
# ax.set_title("Average importance score")
fig.subplots_adjust(hspace=0)  # no space between plots
fig.savefig(fdir / 'Nanog.p1.heatmap+avg.pdf')
fig.savefig(fdir / 'Nanog.p1.heatmap+avg.png')
In [90]:
fft_fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.5, 1/5))
x = 1 / freq[:49]
y = np.abs(np.fft.fft(oscilatory_part[102:])[:49])**2 + np.abs(np.fft.fft(oscilatory_part[:98])[:49])**2
t0 = x[np.argmax(y)]
ax.plot(x, y)
ax.scatter(x, y, s=10)
ax.set_xlim([0, 50])
ax.xaxis.set_major_locator(ticker.MaxNLocator(10, integer=True))
# ax.grid(alpha=0.3)
ax.axvline(x=t0, color='grey', alpha=0.5)
ax.set_xlabel("1/Frequency [bp]")
ax.set_ylabel("Power spectrum");
fft_fig.savefig(fdir / 'Nanog.p1.fft.pdf')
fft_fig.savefig(fdir / 'Nanog.p1.fft.png')
In [88]:
fft_fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.5, 1/5))
x = 1 / freq[:49]
freq_ebar = np.diff(freq[:49])[0] / 2
y = np.abs(np.fft.fft(oscilatory_part[102:])[:49])**2 + np.abs(np.fft.fft(oscilatory_part[:98])[:49])**2
t0 = x[np.argmax(y)]
markerline, stemlines, baseline = ax.stem(x, y, linefmt='-', bottom=0.5)

plt.setp(baseline, linestyle="-", color="grey", linewidth=6)

# plt.setp(baseline, color='grey', linewidth=2)
# ax.scatter(x, y, s=10)
ax.set_xlim([0, 50])
#ax.xaxis.set_major_locator(ticker.MaxNLocator(10, integer=True))
# ax.grid(alpha=0.3)
#ax.axvline(x=t0, color='grey', alpha=0.5)
ax.set_xlabel("1/Frequency [bp]")
ax.set_ylabel("Power spectrum");
# fft_fig.savefig(fdir / 'Nanog.p1.fft.pdf')
# fft_fig.savefig(fdir / 'Nanog.p1.fft.png')
sns.despine(top=True, bottom=True, right=True)
In [69]:
oscilatory_part = agg_profile - smooth_part
In [74]:
# Compute the 
freq_ebar = np.diff(freq)[0] / 2
lower = 1 / (freq[:49][np.argmax(y)] + freq_ebar)
center = t0
upper = 1 / (freq[:49][np.argmax(y)] - freq_ebar)

ebar = upper - center
print(f"t0: {center} +- {ebar}")
t0: 10.88888888888889 +- 0.6405228758169947

Load the information from all the modisco runs

In [21]:
from basepair.utils import pd_col_prepend
In [22]:
def pd_column_replace(df):
    df.columns = [c.split(" ")[0] for c in df.columns]
    return df
In [23]:
dfp = pd.concat([
    (pd.read_csv(model_dir / f'deeplift/{tf}/out/{imp_score}/pattern_table.csv')
     .pipe(pd_col_prepend, 'pattern', tf + '/').set_index("pattern"))
    for tf in tasks
])
In [24]:
dfps = dfp[[f'{task} periodicity 10bp' for task in tasks]].pipe(pd_column_replace)
In [25]:
dfp= dfp[(dfp['n seqlets'] > 100)]

dfp['seqlen'] = dfp['consensus'].str.len()

dfp['ic'] = dfp['ic pwm mean'] * dfp['seqlen']
In [26]:
dfp['footprint counts'] = sum([dfp[f'{t} footprint counts']
                              for t in tasks])

dfp['footprint max'] = sum([dfp[f'{t} footprint max']
                              for t in tasks])

dfp['region counts'] = sum([dfp[f'{t} region counts']
                              for t in tasks])
In [27]:
dfp.plot.scatter('ic', 'footprint max')
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f16d6f6c400>
In [28]:
dfp.plot.scatter('seqlen', 'footprint max')
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f16d6f21400>
In [34]:
dfpsm = pd.melt(dfps.reset_index(), id_vars='pattern', value_vars=tasks, var_name='TF', value_name='10bp periodicity')
dfpsm['TF'] = pd.Categorical(dfpsm['TF'], categories=tasks, ordered=True)
dfpsm['task'] = dfpsm.pattern.str.split('/', expand=True).iloc[:, 0]
In [ ]:
dfpsm.to_csv(fdir / 'dfpsm.csv')
In [31]:
a=1
In [53]:
tf_colors
Out[53]:
{'Klf4': '#357C42',
 'Oct4': '#9F1D20',
 'Sox2': '#3A3C97',
 'Nanog': '#9F8A31',
 'Esrrb': '#30BDC4'}
In [56]:
fig, ax = plt.subplots(figsize=get_figsize(0.5, 4))
sns.heatmap(dfps.iloc[np.argsort(-dfps.max(axis=1))], ax=ax)
fig.savefig(fdir / '10bp-periodicity.heatmap.pdf')
In [78]:
print(dfpsm[dfpsm.TF == dfpsm.task].sort_values('10bp periodicity', ascending=False).groupby('TF').head(5).to_string())
         pattern     TF  10bp periodicity   task
152  Nanog/m0_p1  Nanog            0.4675  Nanog
154  Nanog/m0_p3  Nanog            0.4023  Nanog
156  Nanog/m0_p5  Nanog            0.3972  Nanog
159  Nanog/m0_p8  Nanog            0.3935  Nanog
155  Nanog/m0_p4  Nanog            0.3563  Nanog
81    Sox2/m0_p1   Sox2            0.2372   Sox2
89    Sox2/m0_p9   Sox2            0.2080   Sox2
80    Sox2/m0_p0   Sox2            0.2034   Sox2
86    Sox2/m0_p6   Sox2            0.1882   Sox2
85    Sox2/m0_p5   Sox2            0.1567   Sox2
15   Oct4/m0_p15   Oct4            0.1203   Oct4
232   Klf4/m0_p1   Klf4            0.1106   Klf4
243  Klf4/m0_p12   Klf4            0.1084   Klf4
238   Klf4/m0_p7   Klf4            0.0904   Klf4
239   Klf4/m0_p8   Klf4            0.0854   Klf4
9     Oct4/m0_p9   Oct4            0.0829   Oct4
235   Klf4/m0_p4   Klf4            0.0806   Klf4
11   Oct4/m0_p11   Oct4            0.0787   Oct4
3     Oct4/m0_p3   Oct4            0.0769   Oct4
10   Oct4/m0_p10   Oct4            0.0756   Oct4
In [73]:
plotnine.options.figure_size = get_figsize(.25, aspect=1.5)# (10, 10)
fig = (ggplot(aes(x='TF', y='10bp periodicity', color='TF'),
       dfpsm[dfpsm.TF == dfpsm.task]) + 
 geom_boxplot(color='grey') + 
 geom_jitter(size=1, alpha=0.8) + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(legend_position='top') + 
 scale_color_manual(values=[tf_colors[t] for t in tasks]))
fig.save(fdir / '10bp-periodicity.boxplot.pdf')
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 1.712595 x 2.5688925 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/periodicity/10bp-periodicity.boxplot.pdf
  warn('Filename: {}'.format(filename))
Out[73]:
<ggplot: (8759399919659)>
In [36]:
dfpsm
Out[36]:
pattern TF 10bp periodicity task
0 Oct4/m0_p0 Oct4 0.0679 Oct4
1 Oct4/m0_p1 Oct4 0.0723 Oct4
2 Oct4/m0_p2 Oct4 0.0299 Oct4
... ... ... ... ...
241 Klf4/m0_p10 Klf4 0.0327 Klf4
242 Klf4/m0_p11 Klf4 0.0352 Klf4
243 Klf4/m0_p12 Klf4 0.1084 Klf4

244 rows × 4 columns

In [38]:
dfpsm['short'] = dfpsm.pattern.str.split("/", expand=True)[1].str.replace("m0_p", "")
In [ ]:
(p9.ggplot(mtcars, p9.aes('mpg','disp')) + p9.geom_point()
      + p9.geom_text(p9.aes(label='name'), color='blue', position = p9.position_adjust_text(), size=7)
    )
In [42]:
import plotnine as p9
In [59]:
plotnine.options.figure_size = get_figsize(.25, aspect=1.5)# (10, 10)
fig = (ggplot(aes(x='TF', y='10bp periodicity', color='TF'),
       dfpsm[dfpsm.TF == dfpsm.task]) + 
 geom_boxplot(color='grey', size=0.1) + 
 geom_text(aes(label='short'), position = 'jitter', size=6) + 
 #geom_jitter(size=1, alpha=0.8) + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(legend_position='top') + 
 scale_color_manual(values=[tf_colors[t] for t in tasks]))
# fig.save(fdir / '10bp-periodicity.boxplot.pdf')
fig
Out[59]:
<ggplot: (-9223363303357819795)>
In [64]:
from adjustText import adjust_text
In [67]:
plotnine.options.figure_size = get_figsize(.5, aspect=1.5)# (10, 10)
fig = (ggplot(aes(x='TF', y='10bp periodicity', color='TF'),
       dfpsm[dfpsm.TF == dfpsm.task]) + 
 geom_boxplot(color='grey', size=0.1) + 
 geom_text(aes(label='short'), position = 'jitter', size=9) + 
 #geom_jitter(size=1, alpha=0.8) + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(legend_position='top') + 
 scale_color_manual(values=[tf_colors[t] for t in tasks]))
# fig.save(fdir / '10bp-periodicity.boxplot.pdf')
fig
Out[67]:
<ggplot: (8733496795719)>
In [70]:
f = fig.draw()
for a in f.axes:
    texts = [t for t in  a.texts]
    adjust_text(texts,ax=a, only_move={"text": 'x', 'points': 'x'})
plt.savefig(fdir / '10bp-periodicity.boxplot.w-labels.pdf')
In [32]:
dfpsm
Out[32]:
pattern TF 10bp periodicity
0 Oct4/m0_p0 Oct4 0.0679
1 Oct4/m0_p1 Oct4 0.0723
2 Oct4/m0_p2 Oct4 0.0299
... ... ... ...
241 Klf4/m0_p10 Klf4 0.0327
242 Klf4/m0_p11 Klf4 0.0352
243 Klf4/m0_p12 Klf4 0.1084

244 rows × 3 columns

In [183]:
dfp['task'] = dfp.reset_index().pattern.str.split("/", expand=True)[0]
dfp['metacluster'] = pd.Categorical(dfp.task, ordered=True)
dfp['log n seqlets'] = np.log10(dfp['n seqlets'])
    # filter 
dfp = dfp[dfp['n seqlets'] >= 100]
In [209]:
dfp_short = dfp.copy().reset_index()
In [215]:
dfp_short = dfp.copy().reset_index()
dfp_short = dfp_short[dfp_short.pattern.isin(motifs.values())]
dfp_short['pattern'] = dfp_short.pattern.map({v:k for k,v in motifs.items()})
dfx, row_df, col_df = preproc_motif_table(dfp_short, tasks)

x = scale(dfx).T
# x = dfx.T
In [216]:
g = sns.clustermap(x, row_colors=to_colors(col_df), col_colors=to_colors(row_df), method="weighted", figsize=(5, 10), cmap='RdBu_r', center=0);
In [185]:
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale
In [193]:
dfx, row_df, col_df = preproc_motif_table(dfp.reset_index(), tasks)

x = scale(dfx).T
# x = dfx.T
In [194]:
g = sns.clustermap(x, row_colors=to_colors(col_df), col_colors=to_colors(row_df), method="weighted", figsize=(20, 10), cmap='RdBu_r', center=0);