Goal¶

make a nicer Nanog plot where the periodicity can be seen

from basepair.imports import *

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

import matplotlib.ticker as ticker
from basepair.plot.profiles import extract_signal

model_dir =  Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")

modisco_dir =  model_dir / "modisco/by_peak_tasks/weighted/Nanog"

from basepair.modisco.table import ModiscoData

ls {model_dir}

Intervene_results/  figures/        grad.valid.h5  modisco/
clustering/         grad.all.h5     history.csv    preds.all.h5
cometml.json        grad.test.2.h5  hparams.yaml   results.html
dataspec.yaml       grad.test.h5    model.h5       results.ipynb

md = ModiscoData.load(modisco_dir, model_dir/ "grad.all.h5")

100%|██████████| 60/60 [13:55<00:00, 13.92s/it]

ps, t0, freq = compute_power_spectrum('metacluster_0/pattern_0', "Oct4", md)

tasks = md.get_tasks()

l = []
for pattern in tqdm(md.mr.patterns()):
    if md.mr.n_seqlets(*pattern.split("/")) < 300:
        continue
    for task in md.get_tasks():
        l.append(dict(task=task, pattern=pattern, frac=periodicity_10bp_frac(pattern, task, md)))

100%|██████████| 60/60 [02:09<00:00,  2.16s/it]

df = pd.DataFrame(l)

sns.heatmap(df.pivot("pattern", "task", "frac"))

<matplotlib.axes._subplots.AxesSubplot at 0x7fb2de277780>

sns.clustermap(df.pivot("pattern", "task", "frac"))
plt.savefig("fft-frac.png", dpi=300, bbox_inches="tight")
plt.savefig("fft-frac.pdf", bbox_inches="tight")

import seaborn as sns

sns.heatmap(df, annot=True)

md.mr.n_seqlets()

power_spectrum

<function __main__.power_spectrum(md, pattern, task)>

assert t0[9] == 10.88888888888889

ps[0] / ps.sum()

0.02215352944034531

np.where((t0 < 11) & (t0> 10))

(array([9]),)

plt.plot(t0, ps)

[<matplotlib.lines.Line2D at 0x7fb2dd7a32b0>]

power_spectrum(md, 'metacluster_0/pattern_5', "Oct4")

power_spectrum(md, 'metacluster_2/pattern_0', "Nanog")

power_spectrum(md, 'metacluster_0/pattern_1', "Sox2")

Question¶

which motifs have a clear peak?

power_spectrum(md, 'metacluster_0/pattern_4', "Oct4")

power_spectrum(md, 'metacluster_0/pattern_0', "Klf4")

p= md.grad_profile['metacluster_0/pattern_2']['Nanog']

seqlets = md.seqlets_per_task['metacluster_0/pattern_2']

power_spectrum(md, 'metacluster_0/pattern_0', "Sox2")

power_spectrum(md, 'metacluster_0/pattern_1')

pattern= 'metacluster_0/pattern_2'

wide_seqlets = [s.resize(md.footprint_width)
                            for s in seqlets
                            if s.center() > md.footprint_width //2 and \
                               s.center() < md.get_seqlen(pattern) - md.footprint_width //2
                           ]
            

p = extract_signal(md.get_region_grad("Nanog", 'profile'), wide_seqlets)

seq = extract_signal(md.get_region_seq(), wide_seqlets)

from basepair.plot.tracks import plot_tracks

from basepair.plot.heatmaps import heatmap_importance_profile, normalize

np.fft.rfft

fig = plt.figure(figsize=(11, 2))
plt.plot(np.log(np.abs(p*seq).sum(axis=-1).sum(axis=0)));

heatmap_importance_profile(normalize(np.abs(p*seq).sum(axis=-1)[:500], pmin=50, pmax=99), figsize=(10, 20))

heatmap_importance_profile(normalize(np.abs(p).sum(axis=-1)[:500], pmin=50, pmax=99), figsize=(10, 20))
plt.savefig('nanog-imp-heatmap.png', dpi=300, tight_layout=True)
plt.savefig('nanog-imp-heatmap.pdf')

agg_profile = np.log(np.abs(p*seq).sum(axis=-1).sum(axis=0))

agg_profile = agg_profile - agg_profile.mean()
agg_profile = agg_profile / agg_profile.std()

def smooth(y, box_pts):
    box = np.ones(box_pts)/box_pts
    y_smooth = np.convolve(y, box, mode='same')
    return y_smooth

smooth_part = smooth(agg_profile, 10)
oscilatory_part = agg_profile - smooth_part

fig, axes= plt.subplots(2, 1,figsize=(11, 4), sharex=True)
axes[0].plot(agg_profile, label='original')
axes[0].plot(smooth_part, label="smooth", alpha=0.5)
axes[0].legend()
axes[0].set_ylabel("Avg. importance");
axes[0].set_title("Average importance score")
#axes[0].set_xlabel("Position");
axes[1].plot(oscilatory_part);
axes[1].set_xlabel("Position");
axes[1].set_ylabel("original - smooth");
fig.subplots_adjust(hspace=0)  # no space between plots
plt.savefig('nanog-agg-profile.png', dpi=300)
plt.savefig('nanog-agg-profile.pdf')

fig = plt.figure(figsize=(11, 2))
plt.plot(1/freq[:49], np.abs(np.fft.fft(oscilatory_part[102:])[:49])**2 + np.abs(np.fft.fft(oscilatory_part[:98])[:49])**2, "-o")
plt.xlim([0, 50])
plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(25, integer=True))
plt.grid(alpha=0.3)
plt.xlabel("1/Frequency [bp]")
plt.ylabel("Power spectrum")
plt.title("Power spectrum")
plt.gcf().subplots_adjust(bottom=0.4)

plt.savefig('nanog-fft.png', dpi=300)
plt.savefig('nanog-fft.pdf')

plt.plot(agg_profile[99:])
plt.plot(agg_profile[98:0:-1])
plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(13))
plt.grid()

plt.plot(agg_profile[100:])
plt.plot(agg_profile[97:0:-1])

[<matplotlib.lines.Line2D at 0x7fb2fc248278>]

plt.plot()

sp = np.abs(np.fft.fft(agg_profile))**2
freq = np.fft.fftfreq(agg_profile.shape[-1])

plt.plot(sp[2:25])

[<matplotlib.lines.Line2D at 0x7fb2e3a670f0>]

agg_profile = np.log(np.abs(p*seq).sum(axis=-1).sum(axis=0))

plt.plot(agg_profile)

[<matplotlib.lines.Line2D at 0x7fb2e477fe48>]

fig = plt.figure(figsize=(11, 2))
plt.plot(np.log(np.abs(p*seq).sum(axis=-1).sum(axis=0)));