# Use gpus 0,1
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1"
from pathlib import Path
import sys
import numpy as np
import matplotlib.pyplot as plt
sys.path.append(str(Path(os.getcwd()).absolute().parent.parent))
sys.path.append('/opt/miniconda3/envs/basepair/lib/python3.6/site-packages')
from basepair import samplers
import basepair
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from basepair.preproc import AppendTotalCounts
from basepair.config import get_data_dir, create_tf_session
ddir = '/home/prime/data'
bdir = "/data/sox2-oct4-chipseq/"
ds = DataSpec(task_specs={"Sox2": TaskSpec(task="Sox2",
pos_counts=f"{bdir}/Sox2/pos.bw",
neg_counts=f"{bdir}/Sox2/neg.bw",
peaks=f"{bdir}/Sox2/Sox2_1_rep1-pr.IDR0.05.filt.12-col.bed.gz",
),
"Oct4": TaskSpec(task="Oct4",
pos_counts=f"{bdir}/Oct4/pos2.bw",
neg_counts=f"{bdir}/Oct4/neg2.bw",
peaks=f"{bdir}/Oct4/Oct4_12_ppr.IDR0.05.filt.12-col.bed.gz",
)
},
fasta_file="/data/mm10_no_alt_analysis_set_ENCODE.fasta"
)
def ds2bws(ds):
return {task: {"pos": task_spec.pos_counts, "neg": task_spec.neg_counts} for task, task_spec in ds.task_specs.items()}
# Get the training data
train, valid, test = chip_exo_nexus(ds, peak_width=1000)
import numpy as np
print (train[0].shape)
print (train[1]['profile/Sox2'].shape)
print(train[1]['counts/Sox2'].shape)
# print (train[1]['profile/Sox2'].max())
for k in range(train[0].shape[0]):
mask1 = train[1]['profile/Sox2'][k] > 0.1
mask2 = train[1]['profile/Sox2'][k] < 0.9
elems = train[1]['profile/Sox2'][k][mask1 & mask2]
if len(elems):
print (elems)
train[1]['profile/Sox2'].shape
import numpy as np
np.unique(train[1]['profile/Sox2'].shape)
class Plotter:
def __init__(self, ys):
#size = (N, 1000, 2)
self.ys = np.array(ys)
def plot(self, binsize=10, n=10, sigma=1.6, lows=10, fft='r', sort='random', figsize=(20, 2), fpath_template=None):
if sort == 'random':
idx_list = samplers.random(self.ys[0], n)
elif sort == 'sum':
idx_list = samplers.top_sum_count(self.ys, n)
idx_list = idx_list[2:]
else: # sort == 'max':
idx_list = samplers.top_max_count(self.ys, n)
for i, idx in enumerate(idx_list):
bin0 = binify(self.ys[idx, :, 0], binsize=binsize)
bin1 = binify(self.ys[idx, :, 1], binsize=binsize)
gauss0 = gaussian_filter1d(bin0, sigma=sigma)
gauss1 = gaussian_filter1d(bin1, sigma=sigma)
fft_func = np.fft.fft if fft == 'd' else np.fft.rfft
ifft_func = np.fft.ifft if fft == 'd' else np.fft.irfft
length = len(bin0)
fft_low0 = fft_func(bin0)[:10]
fft_low1 = fft_func(bin0)[:10]
ifft_low0 = np.maximum(ifft_func(fft_low0, n=length), 0)
ifft_low1 = np.maximum(ifft_func(fft_low1, n=length), 0)
fig = plt.figure(figsize=figsize)
plt.subplot(141)
if i == 0:
plt.title("Binned")
plt.plot(bin0)
plt.plot(bin1)
plt.subplot(142)
if i == 0:
plt.title("Gaussian σ=%.2f" % sigma)
plt.plot(gauss0)
plt.plot(gauss1)
plt.subplot(143)
if i == 0:
plt.title("FFT with low-pass")
plt.plot(fft_low0)
plt.plot(fft_low1)
plt.subplot(144)
if i == 0:
plt.title("IFFT with low-pass")
plt.plot(ifft_low0)
plt.plot(ifft_low1)
if fpath_template is not None:
plt.savefig(fpath_template.format(i) + '.png', dpi=600)
plt.savefig(fpath_template.format(i) + '.pdf', dpi=600)
plt.close(fig) # close the figure
show_figure(fig)
plt.show()
plotter = Plotter(test[1]['profile/Sox2'])
plotter.plot(sort='sum', n=6)
class GaussianPlotter:
def __init__(self, ys):
#size = (N, 1000, 2)
self.ys = np.array(ys)
def plot(self, binsize=10, n=10, fft='r', sort='random', figsize=(20, 2), fpath_template=None):
if sort == 'random':
idx_list = samplers.random(self.ys[0], n)
elif sort == 'sum':
idx_list = samplers.top_sum_count(self.ys, n)
idx_list = idx_list[2:]
else: # sort == 'max':
idx_list = samplers.top_max_count(self.ys, n)
for i, idx in enumerate(idx_list):
bin0 = binify(self.ys[idx, :, 0], binsize=binsize)
bin1 = binify(self.ys[idx, :, 1], binsize=binsize)
fig = plt.figure(figsize=figsize)
plt.subplot(141)
if i == 0:
plt.title("Binned")
plt.plot(bin0)
plt.plot(bin1)
for j, sigma in enumerate([1.6, 2.0, 3.0]):
gauss0 = gaussian_filter1d(bin0, sigma=sigma)
gauss1 = gaussian_filter1d(bin1, sigma=sigma)
plt.subplot(142 + j)
if i == 0:
plt.title("Gaussian σ=%f" % sigma)
plt.plot(gauss0)
plt.plot(gauss1)
if fpath_template is not None:
plt.savefig(fpath_template.format(i) + '.png', dpi=600)
plt.savefig(fpath_template.format(i) + '.pdf', dpi=600)
plt.close(fig) # close the figure
show_figure(fig)
plt.show()
plotter = GaussianPlotter(test[1]['profile/Sox2'])
plotter.plot(sort='sum', n=6)
class IRFFTPlotter:
def __init__(self, ys):
#size = (N, 1000, 2)
self.ys = np.array(ys)
def plot(self, binsize=10, n=10, fft='r', sort='random', figsize=(20, 2), fpath_template=None):
if sort == 'random':
idx_list = samplers.random(self.ys[0], n)
elif sort == 'sum':
idx_list = samplers.top_sum_count(self.ys, n)
idx_list = idx_list[2:]
else: # sort == 'max':
idx_list = samplers.top_max_count(self.ys, n)
for i, idx in enumerate(idx_list):
bin0 = binify(self.ys[idx, :, 0], binsize=binsize)
bin1 = binify(self.ys[idx, :, 1], binsize=binsize)
fft0 = np.fft.rfft(bin0)
fft1 = np.fft.rfft(bin1)
length = len(bin0)
fig = plt.figure(figsize=figsize)
plt.subplot(141)
if i == 0:
plt.title("Binned")
plt.plot(bin0)
plt.plot(bin1)
for j, lows in enumerate([5, 10, 15]):
fft_low0 = fft0[:lows]
fft_low1 = fft1[:lows]
ifft_low0 = np.maximum(np.fft.irfft(fft_low0, n=length), 0)
ifft_low1 = np.maximum(np.fft.irfft(fft_low1, n=length), 0)
plt.subplot(142 + j)
if i == 0:
plt.title("Low pass IFFT with %d freqs" % lows)
plt.plot(ifft_low0)
plt.plot(ifft_low1)
if fpath_template is not None:
plt.savefig(fpath_template.format(i) + '.png', dpi=600)
plt.savefig(fpath_template.format(i) + '.pdf', dpi=600)
plt.close(fig) # close the figure
show_figure(fig)
plt.show()
plotter = IRFFTPlotter(test[1]['profile/Sox2'])
plotter.plot(sort='sum', n=6)
seq = train[1]['profile/Sox2'][100, :, 0]
binned = binify(seq)[20:]
print (binned)
plt.plot(binned)
fft = np.fft.rfft(binned)
plt.plot(fft)
ifft = np.fft.irfft(fft[:15], n=len(binned))
plt.plot(ifft)
from scipy.ndimage.filters import gaussian_filter1d
gauss = gaussian_filter1d(binned, sigma=1.2)
plt.plot(gauss)
from basepair.math import softmax
from basepair import samplers
from basepair.preproc import bin_counts
import numpy as np
class Seq2Sox2Oct4:
def __init__(self, x, y, model):
self.x = x
self.y = y
self.model = model
# Make the prediction
self.y_pred = [softmax(y) for y in model.predict(x)]
def plot(self, n=10, kind='test', sort='random', figsize=(20, 2), fpath_template=None, binsize=1):
import matplotlib.pyplot as plt
if sort == 'random':
idx_list = samplers.random(self.x, n)
elif "_" in sort:
kind, task = sort.split("_")
#task_id = {"Sox2": 0, "Oct4": 1}[task]
if kind == "max":
idx_list = samplers.top_max_count(self.y[f"profile/{task}"], n)
elif kind == "sum":
idx_list = samplers.top_sum_count(self.y[f"profile/{task}"], n)
else:
raise ValueError("")
else:
raise ValueError(f"sort={sort} couldn't be interpreted")
# for visualization, we use bucketize
for i, idx in enumerate(idx_list):
fig = plt.figure(figsize=figsize)
plt.subplot(141)
if i == 0:
plt.title("Predicted Sox2")
plt.plot(bin_counts(self.y_pred[0], binsize=binsize)[idx, :, 0], label='pos,m={}'.format(np.argmax(self.y_pred[0][idx, :, 0])))
plt.plot(bin_counts(self.y_pred[0], binsize=binsize)[idx, :, 1], label='neg,m={}'.format(np.argmax(self.y_pred[0][idx, :, 1])))
plt.legend()
plt.subplot(142)
if i == 0:
plt.title("Observed Sox2")
plt.plot(bin_counts(self.y["profile/Sox2"], binsize=binsize)[idx, :, 0], label='pos,m={}'.format(np.argmax(self.y["profile/Sox2"][idx, :, 0])))
plt.plot(bin_counts(self.y["profile/Sox2"], binsize=binsize)[idx, :, 1], label='neg,m={}'.format(np.argmax(self.y["profile/Sox2"][idx, :, 1])))
plt.legend()
plt.subplot(143)
if i == 0:
plt.title("Predicted Oct4")
plt.plot(bin_counts(self.y_pred[1], binsize=binsize)[idx, :, 0], label='pos,m={}'.format(np.argmax(self.y_pred[1][idx, :, 0])))
plt.plot(bin_counts(self.y_pred[1], binsize=binsize)[idx, :, 1], label='neg,m={}'.format(np.argmax(self.y_pred[1][idx, :, 1])))
plt.legend()
plt.subplot(144)
if i == 0:
plt.title("Observed Oct4")
plt.plot(bin_counts(self.y["profile/Oct4"], binsize=binsize)[idx, :, 0], label='pos,m={}'.format(np.argmax(self.y["profile/Oct4"][idx, :, 0])))
plt.plot(bin_counts(self.y["profile/Oct4"], binsize=binsize)[idx, :, 1], label='neg,m={}'.format(np.argmax(self.y["profile/Oct4"][idx, :, 1])))
plt.legend()
if fpath_template is not None:
plt.savefig(fpath_template.format(i) + '.png', dpi=600)
plt.savefig(fpath_template.format(i) + '.pdf', dpi=600)
plt.close(fig) # close the figure
show_figure(fig)
plt.show()
pl = Seq2Sox2Oct4(test[0], test[1], model)
pl.plot(n=10, sort='sum_Sox2', binsize=50)