import pandas as pd
import numpy as np
from pybedtools import BedTool
from basepair.config import get_data_dir, create_tf_session
from tqdm import tqdm
from concise.preprocessing import encodeDNA
from basepair.datasets import get_sox2_data
from basepair.plots import plot_loss
import pyBigWig
from basepair.math import softmax
import matplotlib.pyplot as plt
from basepair import samplers
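# `softmax` from basepair.math is used further down to turn predicted profile logits of shape
# (n_examples, seq_len, 2 strands) into per-strand probability distributions over positions.
# A minimal NumPy sketch of the assumed behaviour (the name `softmax_sketch` and the axis choice
# are illustrative assumptions, not the basepair implementation):
def softmax_sketch(x, axis=1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))  # subtract the max for numerical stability
    return e / e.sum(axis=axis, keepdims=True)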
create_tf_session(1)
ddir = get_data_dir()
#rm -rf {ddir}/processed/chipnexus/chipseq_pipeline/
#!ln -s /srv/scratch/shared/surya/avsec/chip-nexus/data-stowers/semi_processed {ddir}/processed/chipnexus/chipseq_pipeline
from basepair.datasets import *
from basepair.datasets import sox2_oct4_peaks_sox2
# Get the dataset
df_sox2_oct4 = get_chipnexus_data(
    bigwigs={"sox2_pos": f"{Sox2_BW_DIR}/sox2_pooled_reps_1b_2b_4b.pos_strand.bw",
             "sox2_neg": f"{Sox2_BW_DIR}/sox2_pooled_reps_1b_2b_4b.neg_strand.bw",
             "oct4_pos": f"{Oct4_BW_DIR}/Oct4_1234.pos.bw",
             "oct4_neg": f"{Oct4_BW_DIR}/Oct4_1234.neg.bw",
             })
train, valid, test = sox2_oct4_peaks_sox2()
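# Each split is a tuple: [0] one-hot encoded sequences, [1] a dict of ChIP-nexus count profiles
# per TF, each of shape (n_examples, seq_len, 2 strands), and [2] a DataFrame of the underlying
# genomic intervals (structure inferred from how the splits are indexed below).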
train[0][:2]
train[1]['oct4'][:2,:2]
train[1]['sox2'][:2,:2]
idx_list = samplers.random(train[1]['sox2'], 10)
for i, idx in enumerate(idx_list):
    for protein in ['sox2', 'oct4']:
        plt.figure(figsize=(10, 1))
        plt.plot(train[1][protein][idx, :, 0], label='pos,p={}'.format(protein))
        plt.plot(train[1][protein][idx, :, 1], label='neg,p={}'.format(protein))
        interval_id = train[2].seq_id.iloc[idx]
        plt.title(f"{protein}, {interval_id}")
        plt.legend()
idx_list = samplers.top_max_count(train[1]['sox2'], 20, 10)
for i, idx in enumerate(idx_list):
    for protein in ['sox2', 'oct4']:
        plt.figure(figsize=(10, 1))
        plt.plot(train[1][protein][idx, :, 0], label='pos,p={}'.format(protein))
        plt.plot(train[1][protein][idx, :, 1], label='neg,p={}'.format(protein))
        interval_id = train[2].seq_id.iloc[idx]
        plt.title(f"{protein}, {interval_id}")
        plt.legend()
idx_list = samplers.top_max_count(train[1]['oct4'], 10, 0)
for i, idx in enumerate(idx_list):
    for protein in ['sox2', 'oct4']:
        plt.figure(figsize=(10, 1))
        plt.plot(train[1][protein][idx, :, 0], label='pos,p={}'.format(protein))
        plt.plot(train[1][protein][idx, :, 1], label='neg,p={}'.format(protein))
        interval_id = train[2].seq_id.iloc[idx]
        plt.title(f"{protein}, {interval_id}")
        plt.legend()
import keras.layers as kl
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K
from concise.utils.helper import get_from_module
from basepair.losses import twochannel_multinomial_nll
from basepair.layers import SpatialLifetimeSparsity
from basepair.models import seq_mutlitask
import keras.layers as kl
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
from keras.models import Model, load_model
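# The profile loss `twochannel_multinomial_nll` imported above lives in basepair.losses and is not
# reproduced here. As a hedged sketch only (an assumption about its general shape, not the actual
# implementation), a multinomial negative log-likelihood over both strand channels could be written
# with the Keras backend as follows; the constant multinomial-coefficient term is dropped since it
# does not depend on the predictions:
def multinomial_nll_sketch(y_true, y_pred):
    # y_true: observed counts, y_pred: logits; both of shape (batch, seq_len, 2 strands)
    logp = y_pred - K.logsumexp(y_pred, axis=1, keepdims=True)  # log-softmax over positions
    per_strand = K.sum(y_true * logp, axis=1)                   # sum over positions
    return -K.sum(per_strand, axis=-1)                          # sum over the two strands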
def get_model(mfn, mkwargs):
    """Build the model and return (model, run_name, checkpoint_path)."""
    import datetime
    mdir = f"{ddir}/processed/chipnexus/exp/models/multi-task"
    name = mfn + "_" + \
        ",".join([f'{k}={v}' for k, v in mkwargs.items()]) + \
        "." + str(datetime.datetime.now()).replace(" ", "::")
    !mkdir -p {mdir}
    ckp_file = f"{mdir}/{name}.h5"
    return eval(mfn)(**mkwargs), name, ckp_file
#model = seq_mutlitask()
# hyper-parameters
mfn = "seq_mutlitask"
mkwargs = dict(filters=32,
               conv1_kernel_size=21,
               tconv_kernel_size=25,
               n_dil_layers=6,
               lr=0.004)
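# The actual `seq_mutlitask` architecture is defined in basepair.models and is not shown in this
# notebook. Purely as a hedged sketch of the kind of model these hyper-parameters describe (the
# function name, the 201 bp sequence length and all layer choices below are assumptions, not the
# basepair implementation): a first conv layer, a stack of dilated residual convs, and one
# transposed-conv two-strand profile head per TF, trained with the multinomial profile loss.
def seq_multitask_sketch(seq_len=201, filters=32, conv1_kernel_size=21,
                         tconv_kernel_size=25, n_dil_layers=6, lr=0.004,
                         tasks=('sox2', 'oct4')):
    inp = kl.Input((seq_len, 4))
    x = kl.Conv1D(filters, conv1_kernel_size, padding='same', activation='relu')(inp)
    for i in range(1, n_dil_layers + 1):
        # residual dilated convolutions grow the receptive field exponentially
        x = kl.add([x, kl.Conv1D(filters, 3, padding='same', activation='relu',
                                 dilation_rate=2 ** i)(x)])
    outputs = []
    for task in tasks:
        # Conv2DTranspose trick for a 1D transposed convolution producing a 2-strand profile
        h = kl.Reshape((seq_len, 1, filters))(x)
        h = kl.Conv2DTranspose(2, (tconv_kernel_size, 1), padding='same')(h)
        outputs.append(kl.Reshape((seq_len, 2), name=task + "_profile")(h))
    model = Model(inp, outputs)
    model.compile(Adam(lr=lr), loss=twochannel_multinomial_nll)
    return model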
# best valid so far: 108238.6558
model, name, ckp_file = get_model(mfn, mkwargs)
history = model.fit(train[0],
                    [train[1]['sox2'], train[1]['oct4']],
                    batch_size=256,
                    epochs=100,
                    validation_data=valid[:2],
                    callbacks=[EarlyStopping(patience=5),
                               History(),
                               ModelCheckpoint(ckp_file, save_best_only=True)]
                    )
# get the best model
model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll,
                                             "SpatialLifetimeSparsity": SpatialLifetimeSparsity})
plot_loss(history)
y_pred = model.predict(test[0])
plt.scatter(np.log(1+np.ravel(test[1]['sox2'])), np.log(1+np.ravel(test[1]['oct4'])));
def normalize(x):
    """Normalize count profiles so each example/strand sums to 1 across positions."""
    return x / x.sum(1, keepdims=True)
test[1]['sox2'].shape
plt.scatter(np.log(1+np.ravel(normalize(test[1]['sox2']))),
np.log(1+np.ravel(normalize(test[1]['oct4']))));
plt.scatter(np.ravel(normalize(test[1]['sox2'])),
np.ravel(normalize(test[1]['oct4'])));
plt.scatter(y_pred[0].reshape((len(y_pred[0]), -1)), y_pred[1].reshape((len(y_pred[0]), -1)));
class Seq2Sox2Oct4:

    def __init__(self, x, y, model):
        self.x = x
        self.y = y
        self.model = model
        # Make the prediction
        self.y_pred = [softmax(y) for y in model.predict(x)]

    def plot(self, n=10, kind='test', sort='random', sort_prot='sox2', figsize=(20, 2)):
        import matplotlib.pyplot as plt
        if sort == 'random':
            idx_list = pd.Series(np.arange(len(self.x))).sample(n)
        elif sort == 'top':
            max_counts_pos = pd.Series(np.max(self.y[sort_prot][:, :, 0], axis=-1))
            max_counts_neg = pd.Series(np.max(self.y[sort_prot][:, :, 1], axis=-1))
            idx_list = (max_counts_pos + max_counts_neg).sort_values(ascending=False).index[:n]
        else:
            raise ValueError(f"sort={sort} couldn't be interpreted")
        for i, idx in enumerate(idx_list):
            plt.figure(figsize=figsize)
            plt.subplot(141)
            if i == 0:
                plt.title("Predicted Sox2")
            plt.plot(self.y_pred[0][idx, :, 0], label='pos,m={}'.format(np.argmax(self.y_pred[0][idx, :, 0])))
            plt.plot(self.y_pred[0][idx, :, 1], label='neg,m={}'.format(np.argmax(self.y_pred[0][idx, :, 1])))
            plt.legend()
            plt.subplot(142)
            if i == 0:
                plt.title("Observed Sox2")
            plt.plot(self.y['sox2'][idx, :, 0], label='pos,m={}'.format(np.argmax(self.y['sox2'][idx, :, 0])))
            plt.plot(self.y['sox2'][idx, :, 1], label='neg,m={}'.format(np.argmax(self.y['sox2'][idx, :, 1])))
            plt.legend()
            plt.subplot(143)
            if i == 0:
                plt.title("Predicted Oct4")
            plt.plot(self.y_pred[1][idx, :, 0], label='pos,m={}'.format(np.argmax(self.y_pred[1][idx, :, 0])))
            plt.plot(self.y_pred[1][idx, :, 1], label='neg,m={}'.format(np.argmax(self.y_pred[1][idx, :, 1])))
            plt.legend()
            plt.subplot(144)
            if i == 0:
                plt.title("Observed Oct4")
            plt.plot(self.y['oct4'][idx, :, 0], label='pos,m={}'.format(np.argmax(self.y['oct4'][idx, :, 0])))
            plt.plot(self.y['oct4'][idx, :, 1], label='neg,m={}'.format(np.argmax(self.y['oct4'][idx, :, 1])))
            plt.legend()
p = Seq2Sox2Oct4(test[0], test[1], model)
p.plot(sort='top', sort_prot='oct4',figsize=(20,2))
p.plot(sort='top', sort_prot='sox2',figsize=(20,2))
from basepair import samplers
import pandas as pd
from basepair.math import softmax
import numpy as np
import keras.backend as K
from keras.models import Model
from concise.utils.plot import seqlogo_fig, seqlogo
import matplotlib.pyplot as plt
seqlogo
class Seq2Nexus:

    def __init__(self, x, y, df, model):
        self.x = x
        self.y = y
        self.df = df
        self.labels = df.chr + ":" + df.start.astype(str) + "-" + df.end.astype(str)
        self.model = model
        # Make the prediction
        self.y_pred = [softmax(p) for p in model.predict(x)]
        self.seq_len = self.y_pred[0].shape[1]

    def input_grad(self, x, strand='pos', task_id=0, seq_grad='max'):
        strand_id = {"pos": 0, "neg": 1}[strand]
        if seq_grad == 'max':
            sfn = K.max
        elif seq_grad == 'mean':
            sfn = K.mean
        else:
            raise ValueError(f"seq_grad={seq_grad} couldn't be interpreted")
        inp = self.model.inputs[0]
        fn = K.function([inp], K.gradients(sfn(self.model.outputs[task_id][:, :, strand_id], axis=-1), inp))
        return fn([x])[0]

    def plot(self, n=10, kind='test', sort='random',
             seq_grad='max', figsize=(20, 6)):
        import matplotlib.pyplot as plt
        if sort == 'random':
            idx_list = samplers.random(self.x, n)
        elif "_" in sort:
            kind, task = sort.split("_")
            if kind == "max":
                idx_list = samplers.top_max_count(self.y[task], n)
            elif kind == "sum":
                idx_list = samplers.top_sum_count(self.y[task], n)
        else:
            raise ValueError(f"sort={sort} couldn't be interpreted")
        # compute grads
        grads = [[self.input_grad(self.x[idx_list], 'pos', 0, seq_grad) * self.x[idx_list],
                  self.input_grad(self.x[idx_list], 'neg', 0, seq_grad) * self.x[idx_list]],
                 [self.input_grad(self.x[idx_list], 'pos', 1, seq_grad) * self.x[idx_list],
                  self.input_grad(self.x[idx_list], 'neg', 1, seq_grad) * self.x[idx_list]]]
        for i, idx in enumerate(idx_list):
            fig, axes = plt.subplots(8, 1, sharex=True, figsize=figsize)
            for tid, task in enumerate(['sox2', 'oct4']):
                axes[0 + 4 * tid].plot(np.arange(1, self.seq_len + 1), self.y[task][idx, :, 0], label="pos")
                axes[0 + 4 * tid].plot(np.arange(1, self.seq_len + 1), self.y[task][idx, :, 1], label="neg")
                axes[0 + 4 * tid].set_ylabel("Observed")
                axes[0 + 4 * tid].legend()
                axes[0 + 4 * tid].set_title('{} {}'.format(task, self.labels.iloc[idx]))
                axes[1 + 4 * tid].plot(np.arange(1, self.seq_len + 1), self.y_pred[tid][idx, :, 0], label="pos")
                axes[1 + 4 * tid].plot(np.arange(1, self.seq_len + 1), self.y_pred[tid][idx, :, 1], label="neg")
                axes[1 + 4 * tid].set_ylabel("Predicted")
                axes[1 + 4 * tid].legend()
                # ------------------
                seqlogo(grads[tid][0][i], ax=axes[2 + 4 * tid])
                axes[2 + 4 * tid].set_ylabel("Pos. strand")
                seqlogo(grads[tid][1][i], ax=axes[3 + 4 * tid])
                axes[3 + 4 * tid].set_ylabel("Neg. strand")
                axes[3 + 4 * tid].set_xticks(list(range(0, self.seq_len, 5)))
p = Seq2Nexus(test[0], test[1], test[2], model)
p.plot(sort='max_sox2', seq_grad='max', figsize=(20,10))
p.plot(sort='max_sox2', seq_grad='max', figsize=(20,10))
p.plot(sort='max_oct4', seq_grad='max', figsize=(20,10))
p.plot(sort='max_oct4', seq_grad='mean', figsize=(20,10))
# Functions to get the gradients
out = model.outputs[0]
inp = model.inputs[0]
pos_strand_ginp_avg = K.function([inp], K.gradients(K.mean(out[:,:,0], axis=-1), inp))
neg_strand_ginp_avg = K.function([inp], K.gradients(K.mean(out[:,:,1], axis=-1), inp))
pos_strand_ginp_max = K.function([inp], K.gradients(K.max(out[:,:,0], axis=-1), inp))
neg_strand_ginp_max = K.function([inp], K.gradients(K.max(out[:,:,1], axis=-1), inp))
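# Each function above returns d(profile summary)/d(input) for a batch of one-hot sequences,
# i.e. an array of shape (batch, seq_len, 4). Below, the gradients are computed in minibatches
# and multiplied element-wise with the one-hot input ("grad x input"), which zeroes out the three
# absent bases at each position and leaves a per-base contribution score that can be drawn as a
# sequence logo.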
from basepair.data import numpy_minibatch
# Pre-compute the predictions and bottlenecks
x = valid[0]
y_true = valid[1]['sox2']
y_pred = softmax(model.predict(x)[0])
grads_pos = np.concatenate([pos_strand_ginp_max([batch])[0]
for batch in numpy_minibatch(x, 512)])
grads_neg = np.concatenate([neg_strand_ginp_max([batch])[0]
for batch in numpy_minibatch(x, 512)])
igrads_pos = grads_pos * x
igrads_neg = grads_neg * x
from scipy.spatial.distance import cosine, correlation
grads_pos_ext = grads_pos.reshape((grads_pos.shape[0], -1))
grads_neg_ext = grads_neg.reshape((grads_neg.shape[0], -1))
distances = np.array([correlation(grads_neg_ext[i], grads_pos_ext[i]) for i in range(len(grads_neg_ext))])
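# scipy's `correlation` distance is 1 - Pearson correlation, so values near 0 mean the flattened
# pos- and neg-strand gradient maps agree while values near 1 mean they are uncorrelated.
# Illustrative sanity check of that relationship on a single example:
_a, _b = grads_pos_ext[0], grads_neg_ext[0]
assert np.isclose(correlation(_a, _b), 1 - np.corrcoef(_a, _b)[0, 1])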
import numpy as np
import seaborn as sns
plt.figure(figsize=(6,6))
plt.subplot(211)
plt.hist(distances, bins=50);
plt.subplot(212)
values, base = np.histogram(distances, bins=40)
plt.plot(base[:-1], np.cumsum(values));
plt.grid()
plt.xlabel("Correlation Distance");
plt.ylabel("Fraction of data points");
# Roughly 1/3 of the points have highly correlated pos/neg-strand gradients (correlation distance < 0.5)
top10_idx = pd.Series(np.where(distances<0.5)[0]).sample(10)
top10_idx = pd.Series(distances).sort_values(ascending=True).index[:10]
distances
# Top max-count indices
top10_idx = pd.Series(y_true.max(1).sum(1)).sort_values(ascending=False).index[10:30]
# Top total-count indices
top10_idx = pd.Series(y_true.sum(1).sum(1)).sort_values(ascending=False).index[:10]
# Random indices
top10_idx = pd.Series(np.arange(len(y_true))).sample(10)
for i in top10_idx:
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, sharex=True, figsize=(20, 6))
    ax1.plot(np.arange(1, 202), y_true[i, :, 0], label="pos")
    ax1.plot(np.arange(1, 202), y_true[i, :, 1], label="neg")
    ax1.set_ylabel("Observed\ncounts")
    ax1.legend()
    ax2.plot(np.arange(1, 202), y_pred[i, :, 0], label="pos")
    ax2.plot(np.arange(1, 202), y_pred[i, :, 1], label="neg")
    ax2.set_ylabel("Predicted\n")
    ax2.legend()
    ax3.set_ylabel("Pos. strand")
    seqlogo(igrads_pos[i], ax=ax3)
    ax4.set_ylabel("Neg. strand")
    seqlogo(igrads_neg[i], ax=ax4)
    x_range = [1, 201]
    ax4.set_xticks(list(range(0, 201, 5)))
# Keep only examples where the pos- and neg-strand gradients agree well
top_distances = distances < 0.2
# Hypothetical contribution scores: combine both strands and mean-center across the 4 bases,
# as expected by TF-MoDISco
hyp_scores = grads_pos + grads_neg
hyp_scores = hyp_scores[top_distances]
hyp_scores = hyp_scores - hyp_scores.mean(-1, keepdims=True)
onehot_data = valid[0][top_distances]
# Actual contribution scores = hypothetical scores masked by the observed sequence
scores = hyp_scores * onehot_data
len(hyp_scores)
# Visualize
import modisco.visualization
from modisco.visualization import viz_sequence
viz_sequence.plot_weights(scores[0])
viz_sequence.plot_weights(hyp_scores[0])
viz_sequence.plot_weights(onehot_data[0])
from imp import reload
%env MKL_THREADING_LAYER=GNU
import theano
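# Note: the MKL_THREADING_LAYER=GNU environment variable and the theano import are here because
# this version of TF-MoDISco uses a Theano backend (see modisco.backend.theano_backend below);
# without the env var, Theano built against MKL can fail with a threading-layer conflict.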
onehot_data.shape
import h5py
import numpy as np
%matplotlib inline
import modisco
reload(modisco)
import modisco.backend
reload(modisco.backend.theano_backend)
reload(modisco.backend)
import modisco.nearest_neighbors
reload(modisco.nearest_neighbors)
import modisco.affinitymat
reload(modisco.affinitymat.core)
reload(modisco.affinitymat.transformers)
import modisco.tfmodisco_workflow.seqlets_to_patterns
reload(modisco.tfmodisco_workflow.seqlets_to_patterns)
import modisco.tfmodisco_workflow.workflow
reload(modisco.tfmodisco_workflow.workflow)
import modisco.aggregator
reload(modisco.aggregator)
import modisco.cluster
reload(modisco.cluster.core)
reload(modisco.cluster.phenograph.core)
reload(modisco.cluster.phenograph.cluster)
import modisco.core
reload(modisco.core)
import modisco.coordproducers
reload(modisco.coordproducers)
import modisco.metaclusterers
reload(modisco.metaclusterers)
tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
    sliding_window_size=21,
    flank_size=10,
    histogram_bins=100,
    percentiles_in_bandwidth=10,
    overlap_portion=0.5,
    min_cluster_size=50,
    threshold_for_counting_sign=1.0,
    weak_threshold_for_counting_sign=0.7)(
        task_names=["task0"],
        contrib_scores={'task0': scores},
        hypothetical_contribs={'task0': hyp_scores},
        one_hot=onehot_data)
!mkdir -p {ddir}/processed/chipnexus/motifs/sox2/modisco
!du -sh {ddir}/processed/chipnexus/motifs/sox2/modisco
modisco_file = f"{ddir}/processed/chipnexus/motifs/oct4/modisco/multi-task.hdf5"
import h5py
import modisco.util
reload(modisco.util)
grp = h5py.File(modisco_file, "w")
tfmodisco_results.save_hdf5(grp)
grp.close()
# TODO - write pickle
from collections import Counter
from modisco.visualization import viz_sequence
reload(viz_sequence)
from matplotlib import pyplot as plt
import modisco.affinitymat.core
reload(modisco.affinitymat.core)
import modisco.cluster.phenograph.core
reload(modisco.cluster.phenograph.core)
import modisco.cluster.phenograph.cluster
reload(modisco.cluster.phenograph.cluster)
import modisco.cluster.core
reload(modisco.cluster.core)
import modisco.aggregator
reload(modisco.aggregator)
import sklearn.decomposition
import sklearn.manifold
hdf5_results = h5py.File(modisco_file, "r")
#patterns = (tfmodisco_results
# .metacluster_idx_to_submetacluster_results[0]
# .seqlets_to_patterns_result.patterns);
patterns = (list(hdf5_results
["metacluster_idx_to_submetacluster_results"]
["metacluster0"]
["seqlets_to_patterns_result"]
["patterns"]["all_pattern_names"]))
print(len(patterns))
pattern_grp = (hdf5_results
["metacluster_idx_to_submetacluster_results"]
["metacluster0"]
["seqlets_to_patterns_result"]
["patterns"])
for pattern_name in patterns:
    pattern = pattern_grp[pattern_name]
    print(pattern_name)
    print("total seqlets:", len(pattern["seqlets_and_alnmts"]["seqlets"]))
    # pattern.plot_counts(counts=aggregated_seqlet.get_per_position_seqlet_center_counts())
    background = np.array([0.27, 0.23, 0.23, 0.27])
    print("fwd:")
    viz_sequence.plot_weights(pattern["task0_contrib_scores"]["fwd"])
    viz_sequence.plot_weights(pattern["task0_hypothetical_contribs"]["fwd"])
    viz_sequence.plot_weights(viz_sequence.ic_scale(np.array(pattern["sequence"]["fwd"]),
                                                    background=background))
    print("reverse:")
    viz_sequence.plot_weights(pattern["task0_contrib_scores"]["rev"])
    viz_sequence.plot_weights(pattern["task0_hypothetical_contribs"]["rev"])
    viz_sequence.plot_weights(viz_sequence.ic_scale(np.array(pattern["sequence"]["rev"]),
                                                    background=background))
hdf5_results
sl = tfmodisco_results.multitask_seqlet_creation_results.final_seqlets[0]
cl0 = tfmodisco_results.metacluster_idx_to_submetacluster_results[0]
sl.coor.example_idx
len(pd.Series([sl.coor.example_idx for sl in cl0.seqlets]).unique())
cl0.seqlets[0]
len(cl0.seqlets)
len(cl0.seqlets)
tfmodisco_results["metacluster_idx_to_submetacluster_results"]
["metacluster0"]
["seqlets_to_patterns_result"]
["patterns"]["all_pattern_names"]))
sl.coor.start
sl.exidx_start_end_string
tfmodisco_results.metaclustering_results.metacluster_indices
sl = tfmodisco_results.multitask_seqlet_creation_results.final_seqlets[0]
tfmodisco_results.metaclustering_results.metacluster_indices.max()
len(tfmodisco_results.metaclustering_results.metacluster_indices)