# IPython magic: pin this session to GPU 3 before TensorFlow initializes.
%env CUDA_VISIBLE_DEVICES=3
import os
# Echo the variable to confirm the %env magic took effect (notebook cell output).
os.environ['CUDA_VISIBLE_DEVICES']
import tensorflow as tf
#tf.enable_eager_execution()
import keras
from basepair.config import get_data_dir
import pyBigWig
import matplotlib.pyplot as plt
from concise.preprocessing import encodeDNA
from concise.utils.fasta import read_fasta
from pybedtools import BedTool
from tqdm import tqdm
import pandas as pd
import numpy as np
from basepair.data import get_sox2_data, seq_inp_exo_out
from basepair.math import softmax
# Sanity check: list the devices the Keras TF session can see (should include the GPU).
keras.backend.get_session().list_devices()
# Project data root directory from the basepair config.
ddir = get_data_dir()
# Sox2 ChIP-nexus data and (train, test) splits of (sequence-input, exo-output) pairs.
dfc = get_sox2_data()
train, test = seq_inp_exo_out()
# Inspect array shapes (notebook cell outputs); exact shapes come from seq_inp_exo_out.
train[0].shape
train[1].shape
test[0].shape
test[1].shape
# Peek at one target profile's first channel — presumably the positive strand; verify.
train[1][2][:,0]
# Grab a small batch of targets for ad-hoc experimentation.
y_batch = train[1][:64]
from basepair.losses import twochannel_multinomial_nll
import concise.layers as cl
import concise.initializers as ci
import keras.layers as kl
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model, load_model
# HOMER de-novo motif files discovered in 200bp Sox2 ChIP-nexus windows.
motifs = {"motif1": f"{ddir}/processed/chipnexus/motifs/sox2/homer_200bp/de-novo/motif1.motif",
          "motif2": f"{ddir}/processed/chipnexus/motifs/sox2/homer_200bp/de-novo/motif2.motif"}
from basepair.motif.homer import load_motif, read_motif_hits
# Parse each HOMER motif file into a PWM object.
pwm_list = [load_motif(fname) for k, fname in motifs.items()]
# Plot the information content of each motif.
# (Loop-body indentation restored — it was lost in the notebook export.)
for i, pwm in enumerate(pwm_list):
    pwm.plotPWMInfo((5, 1.5))
    plt.title(f"motif{i+1}")
# TODO - build the reverse complement symmetry into the model
# Dilated residual conv stack: a wide motif-scanning conv followed by 3-wide
# convs with exponentially growing dilation. Each stage receives the SUM of
# all previous conv outputs (dense additive skip connections), replacing the
# original seven manually-unrolled, identical stages with a loop.
inp = cl.Input(shape=(201, 4))
filters = 32
# First layer scans for motifs; kernel_size=21 roughly matches motif width.
first_conv = cl.Conv1D(filters, kernel_size=21, padding='same',
                       #kernel_initializer = ci.PSSMKernelInitializer(pwm_list, stddev=0, add_noise_before_Pwm2Pssm=False),
                       #bias_initializer = 'zeros',
                       activation='relu')(inp)
# Dilation rates 2..64 widen the receptive field at constant parameter cost.
conv_outputs = [first_conv]
x = first_conv
for dilation_rate in [2, 4, 8, 16, 32, 64]:
    conv = kl.Conv1D(filters, kernel_size=3, padding='same', activation='relu',
                     dilation_rate=dilation_rate)(x)
    conv_outputs.append(conv)
    # The running sum of ALL conv outputs feeds the next stage (and the head).
    x = kl.add(conv_outputs)
combined_conv = x
# Bottleneck layer
combined_conv = kl.Conv1D(3, kernel_size=1, padding='same', activation='relu')(combined_conv)
# Deconvolution head: lift to 2D so Conv2DTranspose can expand the 3-channel
# bottleneck into a 2-channel (pos/neg strand) profile of the same length.
x = kl.Reshape((-1, 1, 3))(combined_conv)
x = kl.Conv2DTranspose(2, kernel_size=(25, 1), padding='same')(x)
#kl.Conv2DTranspose(32, kernel_size=(7, 1), padding='same', activation='relu'),
#kl.Conv2DTranspose(2, kernel_size=(3, 1), padding='same'),
out = kl.Reshape((-1, 2))(x)
model = Model(inp, out)
# Multinomial NLL across the two strand channels; lr tuned by hand.
model.compile(Adam(lr=0.004), loss=twochannel_multinomial_nll)
# Started experimenting with the output (deconvolution) kernel size; it was 25 before.
# Dilation helps.
# IPython shell magic: make sure the model output directory exists.
!mkdir -p {ddir}/processed/chipnexus/exp/models
# best valid so far: 108238.6558
# Checkpoint path encodes the hyper-parameters of this run.
ckp_file = f"{ddir}/processed/chipnexus/exp/models/resnest_allconnect_nconv=7_filters=32_lr=0.004_dilated=True,out=25,bottleneck=3.h5"
# Train with early stopping on the 20% validation split; ModelCheckpoint keeps
# only the best-validation-loss weights on disk.
model.fit(train[0], train[1],
batch_size=256,
epochs=200,
validation_split=0.2,
callbacks=[EarlyStopping(patience=5),
ModelCheckpoint(ckp_file, save_best_only=True)]
)
# get the best model
# (the custom loss must be passed explicitly when deserializing the .h5 file)
model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll})
import basepair
# Final held-out test-set loss (notebook cell output).
model.evaluate(test[0], test[1])
# Inspect predictions on the TRAINING set.
# softmax turns the model's logit profiles into per-position probabilities.
y_pred = softmax(model.predict(train[0]))
y_true = train[1]
# Sanity check: probabilities should sum to 1 along the profile axis
# (notebook cell output; the expression's value is otherwise unused).
np.allclose(y_pred.sum(axis=1), 1)
# BUG FIX: sample plotting indices from the *training* set inspected here —
# the original drew from len(test[0]), which only avoided an IndexError
# because the train split happens to be larger.
idx_list = pd.Series(np.arange(len(train[0]))).sample(10)
# (Loop-body indentation restored — it was lost in the notebook export.)
for idx in idx_list:
    plt.figure(figsize=(10, 2))
    # Left panel: predicted strand profiles; label carries the argmax position.
    plt.subplot(121)
    plt.plot(y_pred[idx, :, 0], label='pos,m={}'.format(np.argmax(y_pred[idx, :, 0])))
    plt.plot(y_pred[idx, :, 1], label='neg,m={}'.format(np.argmax(y_pred[idx, :, 1])))
    plt.legend()
    # Right panel: observed (true) profiles.
    plt.subplot(122)
    plt.plot(y_true[idx, :, 0], label='pos,m={}'.format(np.argmax(y_true[idx, :, 0])))
    plt.plot(y_true[idx, :, 1], label='neg,m={}'.format(np.argmax(y_true[idx, :, 1])))
    plt.legend()
# Compare argmax (summit) positions of predicted vs. true profiles per strand.
print("Pos")
print(np.argmax(y_pred[idx_list, :, 0], axis=1))
print(np.argmax(y_true[idx_list, :, 0], axis=1))
print("Neg")
print(np.argmax(y_pred[idx_list, :, 1], axis=1))
print(np.argmax(y_true[idx_list, :, 1], axis=1))
# Same inspection as above, on the held-out TEST set.
y_pred = softmax(model.predict(test[0]))
y_true = test[1]
# Sanity check: per-position probabilities sum to 1 (notebook cell output).
np.allclose(y_pred.sum(axis=1), 1)
idx_list = pd.Series(np.arange(len(test[0]))).sample(10)
# (Loop-body indentation restored — it was lost in the notebook export.)
for idx in idx_list:
    plt.figure(figsize=(10, 2))
    # Left panel: predicted strand profiles, argmax position in the label.
    plt.subplot(121)
    plt.plot(y_pred[idx, :, 0], label='pos,m={}'.format(np.argmax(y_pred[idx, :, 0])))
    plt.plot(y_pred[idx, :, 1], label='neg,m={}'.format(np.argmax(y_pred[idx, :, 1])))
    plt.legend()
    # Right panel: observed (true) profiles.
    plt.subplot(122)
    plt.plot(y_true[idx, :, 0], label='pos,m={}'.format(np.argmax(y_true[idx, :, 0])))
    plt.plot(y_true[idx, :, 1], label='neg,m={}'.format(np.argmax(y_true[idx, :, 1])))
    plt.legend()
# Inspect the intermediate activations
from concise.preprocessing.sequence import one_hot2string, DNA
# Decode the first training sequence back to a DNA string (sanity check).
one_hot2string(train[0][0][np.newaxis], DNA)[0]
# Add a softmax layer to the model
# Weights of the Conv2DTranspose head: layers[-2] (layers[-1] is the final Reshape).
w = model.layers[-2].get_weights()[0]
w.shape
# Plot weights
w = model.layers[-2].get_weights()[0]
plt.figure(figsize=(20, 3))
# NOTE(review): shadows the earlier `filters = 32`; here it is the size of the
# kernel's last axis, iterated over below.
filters = w.shape[-1]
# (Loop-body indentation restored — it was lost in the notebook export.)
for i in range(filters):
    plt.subplot(1, filters, i + 1)
    # Assumes channels 0/1 on the third kernel axis are the pos/neg strand
    # outputs — TODO confirm Conv2DTranspose kernel axis order.
    plt.plot(w[:, 0, 0, i], label='pos')
    plt.plot(w[:, 0, 1, i], label='neg')
    plt.title("Motif{}".format(i))
    plt.legend()
from concise.utils.plot import seqlogo, seqlogo_fig
from concise.utils.pwm import pssm_array2pwm_array, _pwm2pwm_info
# Echo the function object (notebook cell output).
pssm_array2pwm_array
# PSSM of the first HOMER motif, for comparison with the learned filters.
pwm_list[0].plotPSSM(figsize=(5,4));
# Sequence logos of the first conv layer's learned kernels.
seqlogo_fig(model.layers[0].get_weights()[0], figsize=(10,4), ncol=2);
# GC gets strongly enhanced -> GC bias?
model.layers[0].plot_weights();
seqlogo_fig(model.layers[0].get_weights()[0], figsize=(10,2), ncol=2);
# IPython help magic; NOTE(review): `l` is not defined anywhere in this file —
# likely a leftover from an interactive session.
l.plot_weights?
model.layers[0].plot_weights