from pathlib import Path
from basepair.config import get_data_dir
import pandas as pd
import matplotlib.pyplot as plt
# Resolve the project data directory and the two model sweeps being compared:
# models trained with vs. without the DNase task.
ddir = get_data_dir()
models_root = Path(ddir) / "processed/chipnexus/exp/models"
dir_with_dnase = models_root / "oct-sox-nanog-klf-dnase" / "models"
dir_without_dnase = models_root / "oct-sox-nanog-klf" / "models"
# performance comparison
def get_performance(mdir):
    """Return the metrics row of ``history.csv`` at the minimal validation loss.

    Args:
        mdir: model directory (str or Path) expected to contain a
            ``history.csv`` with a ``val_loss`` column.

    Returns:
        dict mapping metric name -> value for the best epoch, or ``{}``
        when the history file is missing or malformed.
    """
    try:
        dfh = pd.read_csv(f"{mdir}/history.csv")
        # read_csv yields a RangeIndex, so the idxmin *label* is also a
        # valid position for .iloc here.
        return dict(dfh.iloc[dfh.val_loss.idxmin()])
    # BUGFIX: was a bare `except:` which also swallowed KeyboardInterrupt and
    # genuine bugs. Catch only what read_csv / the column access can raise:
    # OSError (missing file), AttributeError (no val_loss column),
    # ValueError (empty/unparsable CSV, all-NaN idxmin).
    except (OSError, AttributeError, ValueError, KeyError):
        return {}
# Collect best-epoch metrics for every model variant in the two sweeps;
# one row per model directory, keyed by the directory name.
df_exp_w = pd.DataFrame([{**get_performance(md), "name": md.name}
                         for md in dir_with_dnase.iterdir()])
df_exp_wo = pd.DataFrame([{**get_performance(md), "name": md.name}
                         for md in dir_without_dnase.iterdir()])
# NOTE(review): two duplicated IPython `ls {dir_with_dnase}` lines were here;
# they are shell magics (invalid in plain Python) and have been commented out.
# ls {dir_with_dnase}
# Align the two sweeps by model name; suffixes mark without/with DNase columns.
dfm = pd.merge(df_exp_wo, df_exp_w, on='name', suffixes=("_wo", "_w"))
tasks = ['Sox2', 'Oct4', "Nanog", 'Klf4']
# Label the best performing model (lowest Sox2 profile loss without DNase).
# dfm carries a RangeIndex after the merge, so the idxmin label doubles as a
# position for .iloc below.
imin = dfm['val_profile/Sox2_loss_wo'].idxmin()
fig = plt.figure(figsize=(15, 6))
for r, metric in enumerate(['counts', 'profile']):
    for i, task in enumerate(tasks):
        plt.subplot(2, len(tasks), r*len(tasks) + i+1)
        plt.scatter(dfm[f'val_{metric}/{task}_loss_w'], dfm[f'val_{metric}/{task}_loss_wo'])
        # highlight the overall best model in a second color
        plt.scatter(dfm[f'val_{metric}/{task}_loss_w'].iloc[imin], dfm[f'val_{metric}/{task}_loss_wo'].iloc[imin])
        # BUGFIX: the identity-line range previously took min/max of the `_w`
        # column twice; it must span both axes (`_w` and `_wo`).
        minx = min(dfm[f'val_{metric}/{task}_loss_w'].min(), dfm[f'val_{metric}/{task}_loss_wo'].min())
        maxx = max(dfm[f'val_{metric}/{task}_loss_w'].max(), dfm[f'val_{metric}/{task}_loss_wo'].max())
        plt.plot([minx, maxx], [minx, maxx])  # y = x reference line
        plt.title(f"{metric}/{task}")
        plt.xlabel("with DNase")
        plt.ylabel("without DNase")
plt.tight_layout()
# Best model
dfm.iloc[imin]['name']  # n_dil_layers=9
# Best model
dfm.iloc[imin:(imin+1)]
import h5py
import numpy as np
import modisco
import modisco.util
import modisco.core
import modisco.tfmodisco_workflow.seqlets_to_patterns
from modisco.tfmodisco_workflow import workflow
from kipoi.readers import HDF5Reader
from scipy.spatial.distance import correlation
# Importance scores of the best (no-DNase, n_dil_layers=9) model on the test set.
imp_scores = dir_without_dnase / "n_dil_layers=9/grad.test.h5"
grad_type = "weighted"
# maximal allowed correlation distance between the two strands' scores
threshold = 0.1
outdir = "/tmp/modisco"
num_workers = 10
outdir = Path(outdir)
# BUGFIX: resolves the old TODO -- the output folder must exist before
# np.save() writes into it below.
outdir.mkdir(parents=True, exist_ok=True)
d = HDF5Reader.load(imp_scores)
tasks = list(d['targets']['profile'])
def mean(x):
    """Arithmetic mean of the items of *x* (element-wise for numpy arrays)."""
    n_items = len(x)
    acc = 0
    for item in x:
        acc = acc + item
    return acc / n_items
# Number of examples (entries on the first strand of the first task).
n = len(d['grads'][tasks[0]][grad_type][0])
# Per-example correlation distance between the two strands' importance
# scores, averaged across tasks; low distance = strand-consistent scores.
distances = np.array([np.array([correlation(np.ravel(d['grads'][task][grad_type][0][i]),
                                            np.ravel(d['grads'][task][grad_type][1][i]))
                                for i in range(n)])
                      for task in tasks]).T.mean(axis=-1)  # average the distances across tasks
dist_filter = distances < threshold
np.save(outdir / "distances.npy", dist_filter)
# number of kept sequences
# BUGFIX: the original print was an unterminated, non-f string literal
# (a syntax error); turned into a proper f-string.
print(f"Fraction of sequences kept: {dist_filter.mean()}")
if isinstance(d['inputs'], dict):
    one_hot = d['inputs']['seq']
else:
    one_hot = d['inputs']
# get the importance scores, restricted to the strand-consistent sequences
thr_hypothetical_contribs = {task: mean(d['grads'][task][grad_type])[dist_filter] for task in tasks}
thr_one_hot = one_hot[dist_filter]
thr_contrib_scores = {task: thr_hypothetical_contribs[task] * thr_one_hot for task in tasks}
# Run TF-MoDISco to discover motifs from the filtered contribution scores.
# NOTE(review): argument semantics depend on the installed modisco version --
# confirm these kwargs against the version pinned by the project.
tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
    # Slight modifications from the default settings
    sliding_window_size=15,
    flank_size=5,
    target_seqlet_fdr=0.01,
    min_seqlets_per_task=1000,
    seqlets_to_patterns_factory=
    modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
        trim_to_window_size=15,
        initial_flank_to_add=5,
        kmer_len=5,
        num_gaps=1,
        num_mismatches=0,
        n_cores=num_workers,
        final_min_cluster_size=60)
)(
    task_names=tasks,
    contrib_scores=thr_contrib_scores,  # -> task score
    hypothetical_contribs=thr_hypothetical_contribs,
    one_hot=thr_one_hot)
# Display the results object (notebook-style bare expression).
tfmodisco_results
# save the modisco results to HDF5
import h5py
import modisco.util
# BUGFIX: open explicitly in write mode -- h5py >= 3 defaults to read-only,
# so the previous mode-less open would fail on write. The context manager
# also guarantees the file is closed even if save_hdf5 raises.
with h5py.File(outdir / "modisco.h5", "w") as grp:
    tfmodisco_results.save_hdf5(grp)
# --- notebook scratch cells (exploration; kept for reference) ---------------
d['targets']['profile']['Klf4'].shape
# NOTE(review): the line below was syntactically invalid (unbalanced parens)
# and `plt.correlate` does not exist in matplotlib; commented out.
# plt.correlate(mean(d['grads'][task][grad_type][0].reshape((n, -1)), mean(d['grads'][task][grad_type][1].reshape((n, -1)))
correlation(np.ravel(d['grads'][task][grad_type][0][0]), np.ravel(d['grads'][task][grad_type][1][0]))
import matplotlib.pyplot as plt
task = "Klf4"
# Per-example strand-distance distribution for a single task.
distances = np.array([correlation(np.ravel(d['grads'][task][grad_type][0][i]),
                                  np.ravel(d['grads'][task][grad_type][1][i]))
                      for i in range(n)])
plt.hist(distances, bins=100);
# NOTE(review): `hypothetical_contribs` is never defined in this file (only
# `thr_hypothetical_contribs` is) -- these look like stale notebook cells.
hypothetical_contribs['DNase'].shape
# average across tasks
mean(d['grads'][task][grad_type]).shape
hypothetical_contribs
tasks
d['targets']['profile'].keys()
hypothetical_contribs
# NOTE(review): placeholders below were invalid Python (`TODO` is an
# undefined name; `pass` cannot appear on the right of `=`); commented out.
# one_hot = TODO
# contrib_scores = pass
import os
from basepair.BPNet import BPNetPredictor
from basepair.cli.schemas import DataSpec, HParams
from basepair.modisco import ModiscoResult
from scipy.spatial.distance import correlation
from concise.utils.helper import write_json, read_json
from basepair.data import numpy_minibatch
from kipoi.writers import HDF5BatchWriter
import h5py
import numpy as np
import keras.backend as K
from basepair.config import create_tf_session, valid_chr, test_chr
from keras.models import load_model
from basepair.cli.evaluate import load_data
import matplotlib.pyplot as plt
# Use the non-interactive Agg backend so figures can be rendered/saved
# without a display (e.g. on a cluster node).
plt.switch_backend('agg')
def grad_score(model_dir, output_file,
               batch_size=512,
               num_workers=10,
               chunk_size=512,
               split='valid',
               overwrite=False,
               gpu=None):
    """Compute input gradients (importance scores) of a trained BPNet model
    and stream them batch-by-batch into an HDF5 file.

    Args:
        model_dir: directory containing model.h5, hparams.yaml, dataspec.yaml.
        output_file: destination HDF5 file for the gradient batches.
        batch_size: examples per prediction batch.
        num_workers: data-loader worker processes.
        chunk_size: HDF5 chunk size used by the batch writer.
        split: one of {train, valid, test, all}, selects chromosomes.
        overwrite: if True, remove an existing output_file instead of raising.
        gpu: GPU id to use; None disables GPU use entirely.

    Raises:
        ValueError: if output_file exists and overwrite is False, or if
            the split name is unknown.
    """
    if gpu is not None:
        create_tf_session(gpu)
    else:
        # Don't use any GPU's
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    if os.path.exists(output_file):
        if overwrite:
            os.remove(output_file)
        else:
            raise ValueError(f"File exists {output_file}. Use overwrite=True to overwrite it")
    # Map the split name to chromosome include/exclude lists.
    if split=='valid':
        incl_chromosomes = valid_chr
        excl_chromosomes = None
    elif split=='test':
        incl_chromosomes = test_chr
        excl_chromosomes = None
    elif split=='train':
        incl_chromosomes = None
        excl_chromosomes = valid_chr + test_chr
    elif split=='all':
        incl_chromosomes = None
        excl_chromosomes = None
    else:
        raise ValueError("split needs to be from {train,valid,test,all}")
    from pathlib import Path
    model_dir = Path(model_dir)
    hp = HParams.load(model_dir / "hparams.yaml")
    ds = DataSpec.load(model_dir / "dataspec.yaml")
    tasks = list(ds.task_specs)
    from basepair.datasets import StrandedProfile
    seq_len = hp.data.kwargs['peak_width']
    # Deterministic (unshuffled) loader so output rows align with peak order.
    dl_valid = StrandedProfile(ds,
                               incl_chromosomes=incl_chromosomes,
                               excl_chromosomes=excl_chromosomes,
                               peak_width=seq_len,
                               shuffle=False,
                               target_transformer=None)
    from basepair.BPNet import BPNetPredictor, BiasModel
    from tqdm import tqdm
    from basepair.losses import MultichannelMultinomialNLL, mc_multinomial_nll_2, mc_multinomial_nll_1, twochannel_multinomial_nll
    # Register every custom loss object that may have been used at train time.
    model = load_model(str(model_dir / "model.h5"),
                       custom_objects={"mc_multinomial_nll_1": mc_multinomial_nll_1,
                                       "mc_multinomial_nll_2": mc_multinomial_nll_2,
                                       "mc_multinomial_nll": mc_multinomial_nll_1,
                                       "MultichannelMultinomialNLL": MultichannelMultinomialNLL(1),
                                       "twochannel_multinomial_nll": twochannel_multinomial_nll})
    bpnet = BPNetPredictor(model, fasta_file=ds.fasta_file, tasks=tasks)
    # setup the bias model
    if [task for task, task_spec in ds.task_specs.items() if task_spec.bias_model]:
        bm = BiasModel(ds)
    else:
        # no bias-corrected tasks -> identity pass-through
        bm = lambda x: x
    writer = HDF5BatchWriter(output_file, chunk_size=chunk_size)
    for i, batch in enumerate(tqdm(dl_valid.batch_iter(batch_size=batch_size, num_workers=num_workers))):
        (batch['inputs'], batch['targets']) = bm((batch['inputs'], batch['targets']))  # append the bias model predictions
        # NOTE(review): only the first 4 batches are processed -- this looks
        # like a benchmarking/profiling leftover (see the %lprun calls below);
        # confirm and remove before using the output for real analysis.
        if i > 3:
            break
        wdict=batch
        for task_i, task in enumerate(tasks):
            for seq_grad in ['count', 'weighted']:
                # figure out the number of channels
                nstrands = batch['targets'][f'profile/{task}'].shape[-1]
                strand_hash = ["pos", "neg"]
                for strand_i in range(nstrands):
                    grads_pos = bpnet.input_grad(batch['inputs'],
                                                 strand=strand_hash[strand_i],
                                                 task_id=task_i,
                                                 seq_grad=seq_grad,
                                                 batch_size=None)
                    # one dataset per task / gradient type / strand
                    wdict[f"/grads/{task}/{seq_grad}/{strand_i}"] = grads_pos
        writer.batch_write(wdict)
    writer.close()
# --- IPython-only benchmarking cells ----------------------------------------
# These lines use notebook magics (%load_ext, %lprun) and shell escapes
# (!rm); they are NOT valid plain Python and only execute inside
# IPython/Jupyter. They line-profile BPNetPredictor.input_grad and
# grad_score over a few batches.
%load_ext line_profiler
BPNetPredictor
fname = dir_without_dnase/"n_dil_layers=9/grads.test.benchmark9.h5"
!rm {fname}
%lprun -f BPNetPredictor.input_grad grad_score(dir_without_dnase / "n_dil_layers=9", fname, \
batch_size=512, num_workers=10,chunk_size=512,split='valid',overwrite=False, gpu=1)
fname = dir_without_dnase/"n_dil_layers=9/grads.test.benchmark9.h5"
!rm {fname}
%lprun -f BPNetPredictor.input_grad grad_score(dir_without_dnase / "n_dil_layers=9", fname, \
batch_size=512,num_workers=10,chunk_size=512,split='valid',overwrite=False, gpu=1)
%lprun -f BPNetPredictor.input_grad grad_score(dir_without_dnase / "n_dil_layers=9", dir_without_dnase/"n_dil_layers=9/grads.test.benchmark6.h5", \
batch_size=512,num_workers=10,chunk_size=512,split='valid',overwrite=False, gpu=1)
%lprun -f grad_score grad_score(dir_without_dnase / "n_dil_layers=9", dir_without_dnase/"n_dil_layers=9/grads.test.benchmark4.h5", \
batch_size=512,num_workers=10,chunk_size=512,split='valid',overwrite=False, gpu=1)
%lprun -f grad_score grad_score(dir_without_dnase / "n_dil_layers=9", dir_without_dnase/"n_dil_layers=9/grads.test.benchmark.h5", batch_size=512,num_workers=10,chunk_size=512,split='valid',overwrite=False,gpu=1)