Goal

make a nicer Nanog plot where the periodicity can be seen

In [1]:
from basepair.imports import *
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
In [2]:
import matplotlib.ticker as ticker
from basepair.plot.profiles import extract_signal
In [3]:
model_dir =  Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
In [4]:
modisco_dir =  model_dir / "modisco/by_peak_tasks/weighted/Nanog"
In [5]:
from basepair.modisco.table import ModiscoData
In [6]:
ls {model_dir}
Intervene_results/  figures/        grad.valid.h5  modisco/
clustering/         grad.all.h5     history.csv    preds.all.h5
cometml.json        grad.test.2.h5  hparams.yaml   results.html
dataspec.yaml       grad.test.h5    model.h5       results.ipynb
In [7]:
md = ModiscoData.load(modisco_dir, model_dir/ "grad.all.h5")
100%|██████████| 60/60 [13:55<00:00, 13.92s/it]
In [524]:
ps, t0, freq = compute_power_spectrum('metacluster_0/pattern_0', "Oct4", md)
In [541]:
tasks = md.get_tasks()
In [545]:
l = []
for pattern in tqdm(md.mr.patterns()):
    if md.mr.n_seqlets(*pattern.split("/")) < 300:
        continue
    for task in md.get_tasks():
        l.append(dict(task=task, pattern=pattern, frac=periodicity_10bp_frac(pattern, task, md)))
100%|██████████| 60/60 [02:09<00:00,  2.16s/it]
In [547]:
df = pd.DataFrame(l)
In [549]:
sns.heatmap(df.pivot("pattern", "task", "frac"))
Out[549]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb2de277780>
In [553]:
sns.clustermap(df.pivot("pattern", "task", "frac"))
plt.savefig("fft-frac.png", dpi=300, bbox_inches="tight")
plt.savefig("fft-frac.pdf", bbox_inches="tight")
In [546]:
import seaborn as sns
In [ ]:
sns.heatmap(df, annot=True)
In [ ]:
md.mr.n_seqlets()
In [522]:
power_spectrum
Out[522]:
<function __main__.power_spectrum(md, pattern, task)>
In [537]:
assert t0[9] == 10.88888888888889
In [538]:
ps[0] / ps.sum()
Out[538]:
0.02215352944034531
In [531]:
np.where((t0 < 11) & (t0> 10))
Out[531]:
(array([9]),)
In [527]:
plt.plot(t0, ps)
Out[527]:
[<matplotlib.lines.Line2D at 0x7fb2dd7a32b0>]
In [554]:
power_spectrum(md, 'metacluster_0/pattern_5', "Oct4")
In [555]:
power_spectrum(md, 'metacluster_2/pattern_0', "Nanog")
In [512]:
power_spectrum(md, 'metacluster_0/pattern_1', "Sox2")

Question

  • which motifs have a clear peak?
In [486]:
power_spectrum(md, 'metacluster_0/pattern_4', "Oct4")
In [477]:
power_spectrum(md, 'metacluster_0/pattern_0', "Klf4")
In [28]:
p= md.grad_profile['metacluster_0/pattern_2']['Nanog']
In [54]:
seqlets = md.seqlets_per_task['metacluster_0/pattern_2']
In [472]:
power_spectrum(md, 'metacluster_0/pattern_0', "Sox2")
In [459]:
power_spectrum(md, 'metacluster_0/pattern_1')
In [57]:
pattern= 'metacluster_0/pattern_2'
In [98]:
wide_seqlets = [s.resize(md.footprint_width)
                            for s in seqlets
                            if s.center() > md.footprint_width //2 and \
                               s.center() < md.get_seqlen(pattern) - md.footprint_width //2
                           ]
            

p = extract_signal(md.get_region_grad("Nanog", 'profile'), wide_seqlets)

seq = extract_signal(md.get_region_seq(), wide_seqlets)
In [33]:
from basepair.plot.tracks import plot_tracks

from basepair.plot.heatmaps import heatmap_importance_profile, normalize
In [ ]:
np.fft.rfft
In [324]:
fig = plt.figure(figsize=(11, 2))
plt.plot(np.log(np.abs(p*seq).sum(axis=-1).sum(axis=0)));
In [433]:
heatmap_importance_profile(normalize(np.abs(p*seq).sum(axis=-1)[:500], pmin=50, pmax=99), figsize=(10, 20))
In [450]:
heatmap_importance_profile(normalize(np.abs(p).sum(axis=-1)[:500], pmin=50, pmax=99), figsize=(10, 20))
plt.savefig('nanog-imp-heatmap.png', dpi=300, tight_layout=True)
plt.savefig('nanog-imp-heatmap.pdf')
In [440]:
agg_profile = np.log(np.abs(p*seq).sum(axis=-1).sum(axis=0))
In [441]:
agg_profile = agg_profile - agg_profile.mean()
agg_profile = agg_profile / agg_profile.std()
In [442]:
def smooth(y, box_pts):
    box = np.ones(box_pts)/box_pts
    y_smooth = np.convolve(y, box, mode='same')
    return y_smooth
In [443]:
smooth_part = smooth(agg_profile, 10)
oscilatory_part = agg_profile - smooth_part
In [448]:
fig, axes= plt.subplots(2, 1,figsize=(11, 4), sharex=True)
axes[0].plot(agg_profile, label='original')
axes[0].plot(smooth_part, label="smooth", alpha=0.5)
axes[0].legend()
axes[0].set_ylabel("Avg. importance");
axes[0].set_title("Average importance score")
#axes[0].set_xlabel("Position");
axes[1].plot(oscilatory_part);
axes[1].set_xlabel("Position");
axes[1].set_ylabel("original - smooth");
fig.subplots_adjust(hspace=0)  # no space between plots
plt.savefig('nanog-agg-profile.png', dpi=300)
plt.savefig('nanog-agg-profile.pdf')
In [456]:
fig = plt.figure(figsize=(11, 2))
plt.plot(1/freq[:49], np.abs(np.fft.fft(oscilatory_part[102:])[:49])**2 + np.abs(np.fft.fft(oscilatory_part[:98])[:49])**2, "-o")
plt.xlim([0, 50])
plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(25, integer=True))
plt.grid(alpha=0.3)
plt.xlabel("1/Frequency [bp]")
plt.ylabel("Power spectrum")
plt.title("Power spectrum")
plt.gcf().subplots_adjust(bottom=0.4)

plt.savefig('nanog-fft.png', dpi=300)
plt.savefig('nanog-fft.pdf')
In [213]:
plt.plot(agg_profile[99:])
plt.plot(agg_profile[98:0:-1])
plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(13))
plt.grid()
In [159]:
plt.plot(agg_profile[100:])
plt.plot(agg_profile[97:0:-1])
Out[159]:
[<matplotlib.lines.Line2D at 0x7fb2fc248278>]
In [ ]:
plt.plot()
In [225]:
sp = np.abs(np.fft.fft(agg_profile))**2
freq = np.fft.fftfreq(agg_profile.shape[-1])
In [231]:
plt.plot(sp[2:25])
Out[231]:
[<matplotlib.lines.Line2D at 0x7fb2e3a670f0>]
In [142]:
agg_profile = np.log(np.abs(p*seq).sum(axis=-1).sum(axis=0))
In [143]:
plt.plot(agg_profile)
Out[143]:
[<matplotlib.lines.Line2D at 0x7fb2e477fe48>]
In [133]:
fig = plt.figure(figsize=(11, 2))
plt.plot(np.log(np.abs(p*seq).sum(axis=-1).sum(axis=0)));