from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from basepair.preproc import AppendTotalCounts
from basepair.config import get_data_dir, create_tf_session
# Use gpus 3, 5
# NOTE(review): the value below has a space after the comma ("3, 5"); confirm
# that the CUDA runtime in use parses it as two devices.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3, 5"
# Project data directory; used further down to build the model checkpoint path.
ddir = get_data_dir()
# Directory containing the per-TF sub-directories (Sox2/, Oct4/) referenced below.
# It already ends with "/", so the f-strings below produce paths with "//".
bdir = "/srv/scratch/amr1/chipseq/sox2-oct4-chipseq/"
# Data specification: one TaskSpec per TF (keys "Sox2" and "Oct4"), each with a
# positive-strand and a negative-strand count track (.bw) and a peak file
# (.bed.gz), plus the reference FASTA shared by all tasks.
# Note that the Oct4 tracks are named pos2.bw / neg2.bw, unlike the Sox2 ones.
ds = DataSpec(task_specs={"Sox2": TaskSpec(task="Sox2",
pos_counts=f"{bdir}/Sox2/pos.bw",
neg_counts=f"{bdir}/Sox2/neg.bw",
peaks=f"{bdir}/Sox2/Sox2_1_rep1-pr.IDR0.05.filt.12-col.bed.gz",
),
"Oct4": TaskSpec(task="Oct4",
pos_counts=f"{bdir}/Oct4/pos2.bw",
neg_counts=f"{bdir}/Oct4/neg2.bw",
peaks=f"{bdir}/Oct4/Oct4_12_ppr.IDR0.05.filt.12-col.bed.gz",
)
},
fasta_file="/mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta"
)
def ds2bws(ds):
    """Collect the strand-specific count-track paths of every task in ``ds``.

    Args:
      ds: object exposing ``task_specs``, a mapping from task name to a spec
        with ``pos_counts`` and ``neg_counts`` attributes.

    Returns:
      dict of the form ``{task: {"pos": pos_counts, "neg": neg_counts}}``.
    """
    bigwigs = {}
    for name, spec in ds.task_specs.items():
        bigwigs[name] = {"pos": spec.pos_counts, "neg": spec.neg_counts}
    return bigwigs
# Load the three data splits for the peaks described by `ds` (peak width 201).
train, valid, test = chip_exo_nexus(ds, peak_width=201)

# Pre-process the data: the transform is fitted on the training targets only
# and then applied, in place, to the targets (element 1) of every split.
preproc = AppendTotalCounts()
preproc.fit(train[1])
for _split in (train, valid, test):
    _split[1] = preproc.transform(_split[1])
def seq_multitask_chipseq(filters=21,
                          conv1_kernel_size=21,
                          tconv_kernel_size=25,
                          # tconv_kernel_size2=25,
                          n_dil_layers=6,
                          lr=0.004,
                          c_task_weight=100,
                          use_profile=True,
                          use_counts=True,
                          tasks=('sox2', 'oct4'),
                          seq_len=201):
    """Build and compile a multi-task sequence model with profile and count heads.

    The network takes an input of shape ``(seq_len, 4)`` named ``seq``, applies a
    first Conv1D layer followed by ``n_dil_layers`` dilated Conv1D layers
    (each one fed with the sum of all previous layer outputs), and sums all
    layer outputs into a shared representation. From it:

    - profile heads (``profile/<task>``): a transposed convolution yields
      ``2 * len(tasks)`` channels, and each task receives its own 2-channel slice;
    - count heads (``counts/<task>``): global average pooling followed by a
      ``Dense(2)`` layer per task.

    Args:
      filters: number of filters of every Conv1D layer
      conv1_kernel_size: kernel size of the first convolution
      tconv_kernel_size: kernel length of the transposed convolution
      n_dil_layers: number of dilated convolutions (kernel size 3,
        dilation rate ``2**i`` for ``i = 1..n_dil_layers``)
      lr: learning rate passed to Adam
      c_task_weight: how to upweight the count-prediction task (loss weight of
        every count head; profile heads use weight 1)
      use_profile: if True, add the profile heads (loss: twochannel_multinomial_nll)
      use_counts: if True, add the count heads (loss: mse)
      tasks: task names; used in the output layer names
      seq_len: length of the input sequence

    Returns:
      Compiled Keras model. Outputs are ordered as all profile heads
      (in ``tasks`` order) followed by all count heads.

    Raises:
      ValueError: if ``tasks`` is empty or both ``use_profile`` and
        ``use_counts`` are disabled (the model would have no outputs).
    """
    tasks = list(tasks)
    if not tasks:
        raise ValueError("`tasks` must contain at least one task name")
    if not (use_profile or use_counts):
        raise ValueError("at least one of `use_profile` / `use_counts` must be enabled")

    def _sum_layers(layers):
        # kl.add requires at least two inputs; a single tensor is passed through.
        return layers[0] if len(layers) == 1 else kl.add(layers)

    inp = kl.Input(shape=(seq_len, 4), name='seq')
    first_conv = kl.Conv1D(filters,
                           kernel_size=conv1_kernel_size,
                           padding='same',
                           activation='relu')(inp)

    # Dilated convolutions: each layer sees the sum of every earlier output.
    prev_layers = [first_conv]
    for i in range(1, n_dil_layers + 1):
        prev_sum = _sum_layers(prev_layers)
        conv_output = kl.Conv1D(filters, kernel_size=3, padding='same',
                                activation='relu', dilation_rate=2**i)(prev_sum)
        prev_layers.append(conv_output)
    combined_conv = _sum_layers(prev_layers)

    # De-conv
    # A dummy axis is inserted so that the 2D transposed convolution can be
    # applied along the sequence axis; it is removed again right after.
    x = kl.Reshape((-1, 1, filters))(combined_conv)
    x = kl.Conv2DTranspose(2 * len(tasks), kernel_size=(tconv_kernel_size, 1), padding='same')(x)
    #x = kl.UpSampling2D((2, 1))(x)
    #x = kl.Conv2DTranspose(len(tasks), kernel_size=(tconv_kernel_size2, 1), padding='same')(x)
    #x = kl.UpSampling2D((2, 1))(x)
    #x = kl.Conv2DTranspose(int(len(tasks)/2), kernel_size=(tconv_kernel_size3, 1), padding='same')(x)
    out = kl.Reshape((-1, 2 * len(tasks)))(x)

    # setup the output branches
    outputs = []
    losses = []
    loss_weights = []

    if use_profile:
        # Task i owns channels [2*i, 2*i + 2) of the transposed-conv output.
        output = [kl.Lambda(lambda x, i: x[:, :, (2 * i):(2 * i + 2)],
                            output_shape=(seq_len, 2),
                            name="profile/" + task,
                            arguments={"i": i})(out)
                  for i, task in enumerate(tasks)]
        outputs += output
        losses += [twochannel_multinomial_nll] * len(tasks)
        loss_weights += [1] * len(tasks)

    if use_counts:
        pooled = kl.GlobalAvgPool1D()(combined_conv)
        counts = [kl.Dense(2, name="counts/" + task)(pooled)
                  for task in tasks]
        outputs += counts
        losses += ["mse"] * len(tasks)
        loss_weights += [c_task_weight] * len(tasks)

    model = Model(inp, outputs)
    model.compile(Adam(lr=lr), loss=losses, loss_weights=loss_weights)
    return model
import keras.layers as kl
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K
from concise.utils.helper import get_from_module
from basepair.losses import twochannel_multinomial_nll
from basepair.layers import SpatialLifetimeSparsity
import keras.layers as kl
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
from keras.models import Model, load_model
# Run counter; its value is passed to get_model(), which appends it to the run name.
i=1
def get_model(mfn, mkwargs, fkwargs, i):
    """Instantiate a model and derive its run name and checkpoint path.

    Args:
      mfn: name of the model-building function (a string that resolves to a
        callable in this module's scope)
      mkwargs: hyper-parameters; passed to the model function and encoded in
        the run name as ``k=v`` pairs
      fkwargs: additional keyword arguments passed to the model function but
        not encoded in the run name (they override ``mkwargs`` on key clashes)
      i: run index appended to the run name; the caller's variable is not modified

    Returns:
      Tuple ``(model, name, ckp_file)`` where ``ckp_file`` is
      ``<ddir>/processed/chipseq/exp/models/count+profile/<name>.h5``.
    """
    import os

    mdir = f"{ddir}/processed/chipseq/exp/models/count+profile"
    hparams = ",".join(f'{k}={v}' for k, v in mkwargs.items())
    name = mfn + "_" + hparams + "." + str(i)

    # Plain-Python equivalent of `mkdir -p`, so this also works outside IPython.
    os.makedirs(mdir, exist_ok=True)
    ckp_file = f"{mdir}/{name}.h5"

    all_kwargs = {**mkwargs, **fkwargs}
    # NOTE(review): eval() executes `mfn` as an arbitrary expression; only pass
    # trusted, hard-coded function names here.
    return eval(mfn)(**all_kwargs), name, ckp_file
# hyper-parameters
# Name of the model-building function; get_model() resolves it with eval().
mfn = "seq_multitask_chipseq"
use_profile = True
use_counts = True
# Keyword arguments for the model function; get_model() also encodes them in
# the run name. seq_len is read from axis 1 of the training inputs.
mkwargs = dict(filters=32,
conv1_kernel_size=21,
tconv_kernel_size=100,
n_dil_layers=6,
use_profile=use_profile,
use_counts=use_counts,
c_task_weight=10,
seq_len=train[0].shape[1],
lr=0.004)
# Keyword arguments passed to the model function but kept out of the run name.
fixed_kwargs = dict(
tasks=list(ds.task_specs)
)
import numpy as np
# NOTE(review): only NumPy's global RNG is seeded in this section; confirm
# whether the backend needs its own seed for reproducible training.
np.random.seed(20)
# Bump the run counter before it is used as the run-name suffix.
i += 1
model, name, ckp_file = get_model(mfn, mkwargs, fixed_kwargs, i)
# Train with early stopping (patience of 5 epochs); ModelCheckpoint with
# save_best_only=True keeps only the best model seen so far in ckp_file.
history = model.fit(train[0],
train[1],
batch_size=256,
epochs=100,
validation_data=valid[:2],
callbacks=[EarlyStopping(patience=5),
History(),
ModelCheckpoint(ckp_file, save_best_only=True)]
)
# get the best model
# The custom loss has to be supplied so that the saved model can be deserialized.
model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll})
from basepair.eval import evaluate
# Evaluate the reloaded model on the validation split.
evaluate(model, valid[0], valid[1])
# Directory used for the default peak BED / FASTA files of get_chipnexus_data().
BED_DIR = "/srv/scratch/amr1/chipseq/sox2-oct4-chipseq/"
# Directories holding the Sox2 and Oct4 strand-specific bigwig tracks.
Sox2_BW_DIR = "/srv/scratch/amr1/chipseq/sox2-oct4-chipseq/Sox2/"
Oct4_BW_DIR = "/srv/scratch/amr1/chipseq/sox2-oct4-chipseq/Oct4/"
import pandas as pd
import numpy as np
from pybedtools import BedTool, Interval
from basepair.config import get_data_dir
from basepair.preproc import bin_counts
from concise.utils.helper import get_from_module
from tqdm import tqdm
from concise.preprocessing import encodeDNA
from random import Random
import joblib
from basepair.preproc import resize_interval
from genomelake.extractors import FastaExtractor, BigwigExtractor
from kipoi.data_utils import get_dataset_item
import logging
# Module-level logger. The NullHandler gives the logger a handler that emits
# nothing, leaving the actual logging configuration to the application.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
def get_chipnexus_data(bed_file=f"{BED_DIR}//Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.narrowPeak",
                       peak_fasta_file=f"{BED_DIR}//Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.fasta",
                       bigwigs={"cuts_pos": f"{Sox2_BW_DIR}/sox2_pooled_reps_1b_2b_4b.pos_strand.bw",
                                "cuts_neg": f"{Sox2_BW_DIR}/sox2_pooled_reps_1b_2b_4b.neg_strand.bw",
                                }
                       ):
    """Load peak intervals, their sequences and bigwig signal into a DataFrame.

    Args:
      bed_file: path to the peak file; read with pybedtools
      peak_fasta_file: path to a FASTA file with one record per peak
      bigwigs: mapping from output column name to bigwig path. The default
        dict is shared between calls and is only read, never modified.

    Returns:
      pandas.DataFrame with one row per interval of ``bed_file`` and columns:
      ``chr``, ``start``, ``end``; one column per key of ``bigwigs`` holding
      the array returned by pyBigWig for that interval (passed through
      ``np.nan_to_num``); ``seq`` (upper-cased sequence) and ``seq_id``
      (FASTA record name).

    Raises:
      AssertionError: if the FASTA and BED files differ in their number of entries.

    Note:
      Sequences are attached to intervals by position. Only the number of
      entries is checked, so both files are assumed to be in the same order.
    """
    from concise.utils.fasta import read_fasta
    import pyBigWig

    fas = read_fasta(peak_fasta_file)
    bed = BedTool(bed_file)
    n_seqs, n_peaks = len(fas), len(bed)
    assert n_seqs == n_peaks, (
        f"{peak_fasta_file} has {n_seqs} sequences but {bed_file} has {n_peaks} intervals")

    bigwig_obj = {}
    rows = []
    try:
        # Opened inside the try so that already-opened handles get closed
        # even if a later file fails to open.
        for k, v in bigwigs.items():
            bigwig_obj[k] = pyBigWig.open(v)

        for interval in tqdm(bed):
            rows.append({"chr": interval.chrom,
                         "start": interval.start,
                         "end": interval.stop,
                         **{k: np.nan_to_num(bw.values(interval.chrom,
                                                       interval.start,
                                                       interval.stop,
                                                       numpy=True))
                            for k, bw in bigwig_obj.items()}
                         })
    finally:
        # Release the bigwig file handles (previously left open).
        for bw in bigwig_obj.values():
            bw.close()

    dfc = pd.DataFrame(rows)
    dfc['seq'] = list(fas.values())
    dfc['seq_id'] = list(fas)
    dfc['seq'] = dfc.seq.str.upper()
    return dfc
def sox2_oct4_peaks_sox2(valid_chr=('chr2', 'chr3', 'chr4'),
                         test_chr=('chr1', 'chr8', 'chr9')):
    """Load Sox2 and Oct4 signal at the default peak set and split it by chromosome.

    The default chromosome split is roughly 60/20/20.

    Args:
      valid_chr: chromosomes assigned to the validation set
      test_chr: chromosomes assigned to the test set; every other chromosome
        goes to the training set

    Returns:
      Tuple ``(train, valid, test)``. Each element is a tuple ``(x, y, metadata)``:
      ``x`` is the output of ``encodeDNA`` for the subset, ``y`` is a dict with
      keys ``"sox2"`` and ``"oct4"`` whose arrays stack the positive- and
      negative-strand signal on the last axis, and ``metadata`` is the
      corresponding slice of the peak DataFrame.

    Raises:
      AssertionError: if a chromosome is listed in both ``valid_chr`` and ``test_chr``.
    """
    shared = set(valid_chr) & set(test_chr)
    assert not shared, f"chromosomes in both valid_chr and test_chr: {sorted(shared)}"

    dfc = get_chipnexus_data(
        bigwigs={"sox2_pos": f"{Sox2_BW_DIR}/sox2_pooled_reps_1b_2b_4b.pos_strand.bw",
                 "sox2_neg": f"{Sox2_BW_DIR}/sox2_pooled_reps_1b_2b_4b.neg_strand.bw",
                 "oct4_pos": f"{Oct4_BW_DIR}/Oct4_1234.pos.bw",
                 "oct4_neg": f"{Oct4_BW_DIR}/Oct4_1234.neg.bw"})
    seq = encodeDNA(dfc.seq)

    # Prepare the signal: per-row arrays are stacked per strand, then both
    # strands are combined on a new last axis.
    sox2_pos = np.stack(dfc.sox2_pos)
    sox2_neg = np.stack(dfc.sox2_neg)
    oct4_pos = np.stack(dfc.oct4_pos)
    oct4_neg = np.stack(dfc.oct4_neg)
    sox2_cuts = np.stack([sox2_pos, sox2_neg], axis=-1)
    oct4_cuts = np.stack([oct4_pos, oct4_neg], axis=-1)

    # Data splits (boolean masks over the rows of dfc)
    is_test = dfc.chr.isin(list(test_chr))
    is_valid = dfc.chr.isin(list(valid_chr))
    is_train = (~is_test) & (~is_valid)

    return tuple((seq[subset],  # x
                  {"sox2": sox2_cuts[subset],  # y
                   "oct4": oct4_cuts[subset]},
                  dfc[subset])  # metadata
                 for subset in [is_train, is_valid, is_test])
# hyper-parameters
from basepair.models import seq_multitask
# Name of the model-building function; get_model() resolves it with eval(),
# which is why seq_multitask is imported into this namespace above.
mfn2 = "seq_multitask"
use_profile = True
use_counts = True
# Unlike mkwargs, no seq_len is passed here, so the model function's own
# default applies.
mkwargs2 = dict(filters=32,
conv1_kernel_size=21,
tconv_kernel_size=25,
n_dil_layers=6,
use_profile=use_profile,
use_counts=use_counts,
c_task_weight=10,
lr=0.004)
# (train, valid, test) tuples of (x, y, metadata) with targets keyed "sox2" / "oct4".
data2 = sox2_oct4_peaks_sox2()
from basepair.preproc import transform_data
train_nex, valid_nex, test_nex = transform_data(data2, use_profile, use_counts)
# Bump the run counter before it is used as the run-name suffix.
i += 1
# NOTE(review): fixed_kwargs carries tasks=list(ds.task_specs), i.e. "Sox2" / "Oct4",
# while the targets of data2 are keyed "sox2" / "oct4"; confirm that
# transform_data / seq_multitask reconcile the differing task names.
model2, name2, ckp_file2 = get_model(mfn2, mkwargs2, fixed_kwargs, i)
# Same training setup as for the first model: early stopping with patience 5,
# best model checkpointed to ckp_file2.
history2 = model2.fit(train_nex[0],
train_nex[1],
batch_size=256,
epochs=100,
validation_data=valid_nex[:2],
callbacks=[EarlyStopping(patience=5),
History(),
ModelCheckpoint(ckp_file2, save_best_only=True)]
)
# get the best model
# Custom loss and layer are supplied so that the saved model can be deserialized.
model2 = load_model(ckp_file2, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll,
"SpatialLifetimeSparsity": SpatialLifetimeSparsity})
evaluate(model2, valid_nex[0], valid_nex[1])
from basepair import samplers
from basepair.plots import *
import pandas as pd
from basepair.math import softmax
import numpy as np
import keras.backend as K
from keras.models import Model
from concise.utils.plot import seqlogo_fig, seqlogo
import matplotlib.pyplot as plt
# The bare expressions in this section (seqlogo, .head(), .shape) are
# notebook-style inspections; their values are discarded when run as a script.
seqlogo
test_nex[2].head()
# Seq2Nexus is not imported by name in this file; presumably it comes from
# `from basepair.plots import *` - TODO confirm.
p2 = Seq2Nexus(test_nex[0], test_nex[1], test_nex[2], model2)
p2.plot(sort='max_sox2', seq_grad='max', figsize=(20,12))
test[2].head()
from basepair.BPNet import BPNetPredictor
from pybedtools import Interval, BedTool
# First five test intervals of the first dataset, as a BedTool.
bt = BedTool.from_dataframe(test[2][["chr", "start", "end"]][:5])
bpnet = BPNetPredictor(model, ds.fasta_file, list(ds.task_specs), preproc=preproc)
bpnet.predict_plot(intervals=list(bt), bws = ds2bws(ds), profile_grad="max")
# NOTE(review): model2 was trained on targets produced by transform_data, but is
# wrapped here with `preproc`, which was fitted on train[1] of the first
# dataset; confirm that this pairing is intended.
bpnet2 = BPNetPredictor(model2, ds.fasta_file, list(ds.task_specs), preproc=preproc)
#bpnet2.predict_plot(intervals=list(bt), bws = ds2bws(ds), profile_grad="weighted")
# Difference between the two models' input gradients on the same test inputs.
diff = bpnet2.input_grad(test[0], seq_grad='weighted') - bpnet.input_grad(test[0], seq_grad='weighted')
# Element-wise product with the inputs; assuming test[0] is one-hot encoded this
# keeps only the gradient at the observed base - TODO confirm.
diff = diff * test[0]
diff.shape
# Reduce axes 1 and 2, leaving one signed and one absolute total per example.
sums = np.sum(np.sum(diff, axis = 1), axis=1)
abs_sums = np.sum(np.sum(np.abs(diff), axis = 1), axis=1)
# Examples with the largest / smallest signed total and the largest / smallest
# absolute total.
max_diff_index = np.argmax(sums)
no_diff_index = np.argmin(abs_sums)
most_diff_index = np.argmax(abs_sums)
min_diff_index = np.argmin(sums)
# (title, example index) of each panel, from top to bottom.
_diff_panels = (
    ("max_diff_index", max_diff_index),
    ("min_diff_index", min_diff_index),
    ("most_diff_index", most_diff_index),
    ("no_diff_index", no_diff_index),
)

# Figure 1: one sequence logo per selected example, default y-limits.
fig, (ax0, ax1, ax2, ax3) = plt.subplots(4, 1, sharex=True, figsize=(20, 6))
for _panel_ax, (_panel_title, _panel_idx) in zip((ax0, ax1, ax2, ax3), _diff_panels):
    _panel_ax.set_title(_panel_title)
    seqlogo(diff[_panel_idx], ax=_panel_ax)

# Figure 2: the same panels with a common, symmetric y-range of +/- scale.
fig, (ax0, ax1, ax2, ax3) = plt.subplots(4, 1, sharex=True, figsize=(20, 6))
scale = 0.2
for _panel_ax, (_panel_title, _panel_idx) in zip((ax0, ax1, ax2, ax3), _diff_panels):
    _panel_ax.set_title(_panel_title)
    _panel_ax.set_ylim((-1*scale, scale))
    seqlogo(diff[_panel_idx], ax=_panel_ax)

# Notebook-style shape inspection of the two training input arrays.
np.array(train[0]).shape
np.array(train_nex[0]).shape