import matplotlib.pyplot as plt
import basepair
from basepair.plots import regression_eval
from basepair.losses import MultichannelMultinomialNLL
from basepair.exp.dnase.models import KmerBiasModel
from kipoi.readers import HDF5Reader
from basepair.utils import read_pkl
from keras.models import load_model
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import StrandedProfile
from basepair.datasets import chip_exo_nexus
from pathlib import Path
import pandas as pd
import numpy as np
from basepair.config import create_tf_session, get_data_dir
# NOTE(review): this file is a saved IPython/Jupyter session, not a plain
# Python script — `ls`, `!cat`, `%time`, and bare display expressions below
# are IPython magics / shell escapes.
ddir = get_data_dir()
SEQ_WIDTH = 1000  # sequence/profile window width passed to the dataloader below
# Inspect the trained-model directories on disk (IPython `ls` auto-magic).
ls {ddir}/processed/dnase/exp/models/single-task/
ls {ddir}/processed/dnase/exp/models/single-task-background-corrected/models
ls {ddir}/processed/dnase/exp/models/single-task-background-corrected/
!cat {ddir}/processed/dnase/exp/models/single-task-background-corrected/tryout.model.txt
# Collect the hyper-parameter sweep run directories (named "<param>=<value>").
mdir_root = Path(f"{ddir}//processed/dnase/exp/models/single-task-background-corrected/models")
mdirs = list(mdir_root.glob("*=*"))
mdirs  # bare expression — displayed interactively
# performance comparison
def get_performance(mdir):
    """Return the training-history row (as a dict) at the best validation loss.

    Reads ``<mdir>/history.csv`` (a per-epoch Keras training log) and selects
    the epoch with the minimal ``val_loss``.

    Args:
        mdir: directory (str or Path) expected to contain ``history.csv``.

    Returns:
        dict mapping history-column name -> value at the best epoch, or an
        empty dict when the file is missing, empty, or lacks ``val_loss``
        (best-effort: a run without a usable history contributes nothing).
    """
    try:
        dfh = pd.read_csv(f"{mdir}/history.csv")
        # `.loc` with the label returned by idxmin() is index-safe; the
        # original `.iloc` only worked because read_csv yields a default
        # RangeIndex where label == position.
        return dict(dfh.loc[dfh["val_loss"].idxmin()])
    except (OSError, KeyError, ValueError, pd.errors.EmptyDataError):
        # Narrow except (was a bare `except:` that also swallowed
        # KeyboardInterrupt/SystemExit): missing file (OSError), missing
        # column (KeyError), all-NaN idxmin (ValueError), empty CSV.
        return {}
# Aggregate each run's best-epoch metrics (plus its directory name) into one
# comparison table.
df_exp = pd.DataFrame([{**get_performance(md), "name":md.name}
for md in mdirs])
# Rank runs by the DNase counts-head validation loss (displayed interactively).
df_exp.sort_values("val_counts/DNase_loss")
# Peek at the hyper-parameters of the kernel-size=51 run (shell escape).
!cat {str(mdir_root / "tconv_kernel_size=51/hparams.yaml")} | head
mdir_root  # displayed interactively
!cat {str(mdir_root / "tconv_kernel_size=51/hparams.yaml")} | head
from keras.models import load_model  # re-import; already imported at top
create_tf_session(0)  # presumably pins TF to GPU 0 — confirm in basepair.config
# Load the background-corrected kernel-size=51 model; the custom loss must be
# registered under both names Keras may have serialized it as.
mdir = mdir_root / "tconv_kernel_size=51"
model = load_model(mdir / "model.h5",
custom_objects={"mc_multinomial_nll": MultichannelMultinomialNLL(1),
"MultichannelMultinomialNLL": MultichannelMultinomialNLL(1)})
# Switch `mdir`/`model` to the corresponding NON-corrected single-task run
# (note: everything below now refers to this second model).
mdir = mdir_root / "../../single-task/models/tconv_kernel_size=51"
model = load_model(mdir / "model.h5",
custom_objects={"mc_multinomial_nll": MultichannelMultinomialNLL(1),
"MultichannelMultinomialNLL": MultichannelMultinomialNLL(1)})
from basepair.cli.modisco import compute_importance_scores
# Compute importance scores for the run in `mdir` and write them to HDF5.
compute_importance_scores(mdir, "/tmp/imp.h5")
from kipoi.readers import HDF5Reader  # re-import; already imported at top
# Open the importance-score file and explore it.
r = HDF5Reader("/tmp/imp.h5")
r.open()
# One-shot classmethod load of the same file (result immediately shadowed below).
d = HDF5Reader.load("/tmp/imp.h5")
r.ls()  # list the HDF5 keys — displayed interactively
%time d = r.load_all()
r.close()
from basepair.config import test_chr
from basepair.preproc import AppendCounts
# Build the held-out-chromosome test set from the run's dataspec.
ds = DataSpec.load(mdir / "dataspec.yaml")
ds.task_specs['DNase'].ignore_strand = True # fix
dl_test = StrandedProfile(ds,
incl_chromosomes=test_chr,
peak_width=SEQ_WIDTH,
shuffle=False,
target_transformer=AppendCounts())
# Materialize the whole test set into memory as {"inputs": ..., "targets": ...}.
test = dl_test.load_all(batch_size=256, num_workers=10)
from basepair.BPNet import BiasModel
# Attach a pre-trained background/bias model to the DNase task, then run it to
# produce bias-augmented inputs `x` alongside the targets `y_true`.
ds.task_specs['DNase'].bias_model = "/users/avsec/workspace/basepair/data/processed/dnase/exp/models/background/models/w=1000/model.h5"
x,y_true = BiasModel(ds).predict((test["inputs"], test["targets"]), batch_size=512)
y_true.keys()  # displayed interactively
from basepair.plots import regression_eval  # re-import; already imported at top
from basepair.cli.evaluate import eval_profile
from basepair.math import softmax
# Predict with whichever model is currently loaded (see the reloads above).
y_pred = model.predict(x, verbose=1)
# NOTE(review): the two identical-looking cells below were presumably re-run
# interactively after swapping models, so the first reflects the
# non-corrected run and the second the bias-corrected one — confirm against
# the notebook history.
# Not bias-corrected
regression_eval(y_true['counts/DNase'].mean(-1),
y_pred[ds.task2idx("DNase", 'counts')].mean(-1),
alpha=0.1)
# bias-corrected
regression_eval(y_true['counts/DNase'].mean(-1),
y_pred[ds.task2idx("DNase", 'counts')].mean(-1),
alpha=0.1)
# Profile evaluation: softmax the profile head's logits, compare to observed.
task = "DNase"
yp = softmax(y_pred[ds.task2idx(task, "profile")])
yt = y_true["profile/" + task]
df = eval_profile(yt, yp)
# not bias-corrected
df  # displayed interactively
# bias-corrected
df  # displayed interactively (same object; see NOTE above)
from basepair.plots import plot_profiles
from basepair import samplers
tasks = list(ds.task_specs)
# Pick example regions to plot; only the last sampler call (random) survives —
# the two earlier assignments were interactive trials.
idx_list = samplers.top_sum_count(y_true['profile/DNase'], 10)
idx_list = samplers.top_max_count(y_true['profile/DNase'], 10)
idx_list = samplers.random(y_true['profile/DNase'], 10)
plot_profiles(y_true, y_pred, tasks, idx_list, preproc=None, figsize=(20, 2), tic_freq=50)
# get high bias sites
# Rank regions by the bias track's count signal and keep the top 10 indices.
high_bias_idx = list(pd.Series(x['bias/counts/DNase'][:,0]).sort_values(ascending=False)[:10].index)
high_bias_idx  # displayed interactively
# TODO - plot the importance scores
plot_profiles(y_true, y_pred, tasks, high_bias_idx, preproc=None, figsize=(20, 2), tic_freq=50)
from basepair.BPNet import BPNetPredictor
# Wrap the loaded model for sequence-based prediction + gradient computation.
bp = BPNetPredictor(model, ds.fasta_file, tasks, bias_model=BiasModel(ds))
from kipoi.data_utils import get_dataset_item
bp.model.inputs[0]  # displayed interactively
# Sanity check: the first model input should be the one-hot sequence tensor.
assert bp.model.inputs[0].name[:3] == "seq"
bp.model.input_names  # displayed interactively
bp.model.inputs  # displayed interactively
# Input-gradient (importance) for the high-bias regions, positive strand.
bp.input_grad(get_dataset_item(x, high_bias_idx), strand='pos').shape
from kipoi.writers import HDF5BatchWriter
from pybedtools import BedTool
# Convert the high-bias ranges back to BED intervals for plotting.
bt = list(BedTool.from_dataframe(dl_test.dfm.iloc[high_bias_idx]))
# Scratch expressions exploring tuple/list slicing semantics:
list((1,2,3))
list(({"seq": 1}, )[1:])
# non bias-corrected
bp.plot_predict_grad(bt)
# bias-corrected (presumably re-run after swapping models — confirm)
bp.plot_predict_grad(bt)
(1,) + (1,)[1:]  # scratch expression
bt.head()  # displayed interactively
bp.plot_predict_grad()