Goal

Analyze models with w=200

  • kmer:
    • predictions:
      • data/processed/dnase/6mer/retrained.IMR90_6mer.w200.normalized.preds.h5
      • old: IMR90_6mer.w1000.preds.h5
    • model: new_model.train-pos3.w200.normalized.pkl
    • [x] compare weights with the obtained pkl file - do they match now - not really
  • BPNet model:
    • w=1000 link
    • w=1000,filters=128 link
    • w=1000,filters=256 link

Conclusions

  • best models: f"{ddir}/processed/dnase/exp/models/background/models/w=200,filters=128"
  • the distribution of total counts is actually bi-modal

TODO

  • use a Poisson loss function
In [59]:
# --- imports: plotting, project helpers (basepair), HDF5 reading, keras model loading ---
import matplotlib.pyplot as plt
import basepair
from basepair.plots import regression_eval
from basepair.losses import MultichannelMultinomialNLL
from basepair.exp.dnase.models import KmerBiasModel
from kipoi.readers import HDF5Reader
from basepair.utils import read_pkl
from keras.models import load_model
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import StrandedProfile
from basepair.datasets import chip_exo_nexus
from pathlib import Path
import pandas as pd
import numpy as np
from basepair.config import create_tf_session, get_data_dir

# --- configuration ---
ddir = get_data_dir()  # project data root
SEQ_WIDTH = 1000  # peak width used below; NOTE(review): the Goal section says w=200 — confirm which is intended

K-mer models

Re-trained

In [172]:
# get the predictions of the re-trained 6-mer model
r = HDF5Reader(f"{ddir}/processed/dnase/6mer/retrained.IMR90_6mer.w{SEQ_WIDTH}.normalized.preds.h5")
r.open()
try:
    preds = r.f['/preds'][:]
    # sum over the last axis (presumably strands) to get per-position totals
    targets = r.f['/targets/profile/DNase'][:].sum(axis=-1)
finally:
    # always release the HDF5 handle, even if a dataset read fails
    r.close()
In [173]:
# normalize: trim edge positions, then compare log(1 + total counts)
# NOTE(review): trims 3 positions at the start and 2 at the end —
# presumably a k-mer edge effect for 6-mers; confirm the offsets
y_true = targets[:, 3:(SEQ_WIDTH - 2)]  # was hard-coded 1000; keep in sync with SEQ_WIDTH
yt_counts = np.log(1 + y_true.sum(axis=-1))
yp_counts = np.log(1 + preds.sum(axis=-1))
In [85]:
# scatter + correlation metrics: observed vs predicted log counts (re-trained model)
regression_eval(yt_counts, yp_counts, alpha=0.1)
In [133]:
# it's actually a bi-modal distribution (7%, 93%)
plt.hexbin(yp_counts, yt_counts, gridsize=(40, 30), cmap=plt.cm.BuGn)
Out[133]:
<matplotlib.collections.PolyCollection at 0x7f80abe91048>
In [174]:
# zoom into the bulk of the distribution (1 < log-counts < 3.5);
# compute the selection mask once instead of repeating the expression
bulk = (yt_counts > 1) & (yt_counts < 3.5)
plt.hexbin(yp_counts[bulk],
           yt_counts[bulk],
           gridsize=(30, 15),
           cmap=plt.cm.BuGn)
Out[174]:
<matplotlib.collections.PolyCollection at 0x7f80ab053780>
In [141]:
# correlation when removing 0's (mask computed once, applied to both arrays)
nonzero = yt_counts > 0
regression_eval(yt_counts[nonzero], yp_counts[nonzero], alpha=0.1)
In [183]:
# outliers (.4%): regions with more than 25 raw counts
# (np.exp(yt_counts) - 1 undoes the log(1 + counts) transform)
outlier = np.exp(yt_counts) - 1 > 25
regression_eval(yt_counts[outlier], yp_counts[outlier], alpha=0.1)

Existing

In [153]:
# get the predictions of the existing (old) 6-mer model
r = HDF5Reader(f"{ddir}/processed/dnase/6mer/IMR90_6mer.w{SEQ_WIDTH}.preds.h5")
r.open()
try:
    preds = r.f['/preds'][:]
    # sum over the last axis (presumably strands) to get per-position totals
    targets = r.f['/targets/profile/DNase'][:].sum(axis=-1)
finally:
    # always release the HDF5 handle, even if a dataset read fails
    r.close()
In [154]:
# normalize: same trimming + log transform as for the re-trained model
y_true = targets[:, 3:(SEQ_WIDTH - 2)]  # was hard-coded 1000; keep in sync with SEQ_WIDTH
yt_counts = np.log(1 + y_true.sum(axis=-1))
yp_counts = np.log(1 + preds.sum(axis=-1))
In [77]:
# overall correlation for the existing (old) 6-mer model
regression_eval(yt_counts, yp_counts, alpha=0.1)
In [171]:
# w=1000,filters=128
# NOTE(review): the label above looks copy-pasted from the BPNet section —
# this cell plots the existing 6-mer model's predictions; confirm
bulk = (yt_counts > 1) & (yt_counts < 3.5)
plt.hexbin(yp_counts[bulk],
           yt_counts[bulk],
           gridsize=(30, 15),
           cmap=plt.cm.BuGn)
Out[171]:
<matplotlib.collections.PolyCollection at 0x7f80aafa22e8>

Re-trained vs existing

In [13]:
# re-trained k-mer model weights
km_new = read_pkl(f"{ddir}/processed/dnase/6mer/new_model.train-pos3.w{SEQ_WIDTH}.normalized.pkl")
In [14]:
# old model pickle: per-kmer dicts; keep only the forward-strand weight
# NOTE(review): presumably there is also a 'reverse' entry — confirm the key schema
km_old_pkl = read_pkl(f"{ddir}/raw/dnase/6mer/IMR90_6mer.pickle")
km_old = KmerBiasModel({k: float(v['forward']) for k,v in km_old_pkl.items()})
In [17]:
def kmermodel2df(km) -> pd.DataFrame:
    """Convert a k-mer model into a tidy DataFrame.

    Parameters
    ----------
    km: object exposing a ``.d`` attribute that maps k-mer string -> weight
        (e.g. a ``KmerBiasModel``).

    Returns
    -------
    pd.DataFrame with columns ``kmer`` and ``value``, one row per k-mer.
    Columns are present even for an empty model.
    """
    # build directly from (kmer, value) pairs instead of a list of dicts;
    # explicit `columns` keeps the schema stable for an empty input
    return pd.DataFrame(list(km.d.items()), columns=["kmer", "value"])

# align new vs old k-mer weights on the k-mer sequence (inner join by default)
dfm = pd.merge(kmermodel2df(km_new), kmermodel2df(km_old), on='kmer', suffixes=('_new', '_old'))
In [18]:
# preview the merged table (new vs old weight per k-mer)
dfm.head()
Out[18]:
kmer value_new value_old
0 AAAAAA 0.000002 0.001431
1 AAAAAC 0.000055 0.003110
2 AAAAAG 0.000080 0.001913
3 AAAAAT 0.000020 0.000993
4 AAAACA 0.000053 0.004403
In [19]:
# compare weights between the two models (they do not really match — see Goal)
plt.scatter(np.log(1+dfm.value_new*100), np.log(1+dfm.value_old), alpha=0.1)
# axis labels now describe the actual plotted quantities (previously they
# claimed raw values while log-transformed, rescaled values were plotted)
plt.xlabel("log(1 + 100 * value_new)")
plt.ylabel("log(1 + value_old)")
Out[19]:
Text(0,0.5,'value_old')

BPNet

In [78]:
# discover the BPNet model directories trained with w=1000
mdir_root = Path(f"{ddir}/processed/dnase/exp/models/background/models")
mdirs = list(mdir_root.glob("w=1000*"))
mdirs
Out[78]:
[PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/background/models/w=1000,filters=256'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/background/models/w=1000'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/background/models/w=1000,filters=128')]
In [79]:
# performance comparison
def get_performance(mdir):
    """Return the training-history row with the lowest validation loss.

    Parameters
    ----------
    mdir: model directory (str or Path) expected to contain ``history.csv``.

    Returns
    -------
    dict of the best-epoch metrics, or ``{}`` when the history file is
    missing, empty, or lacks a ``val_loss`` column.
    """
    try:
        dfh = pd.read_csv(f"{mdir}/history.csv")
        # idxmin() returns the row label of the best epoch; with the default
        # RangeIndex from read_csv this equals the positional index for iloc
        return dict(dfh.iloc[dfh.val_loss.idxmin()])
    except (OSError, pd.errors.EmptyDataError, AttributeError, KeyError):
        # was a bare `except:` that silently swallowed *all* errors
        # (including KeyboardInterrupt); only treat missing/malformed
        # history files as "no result"
        return {}

# one row per model: best-epoch metrics plus the model directory name,
# sorted by validation loss of the count head
df_exp = pd.DataFrame([{**get_performance(md), "name":md.name}
                        for md in mdirs])

df_exp.sort_values("val_counts/DNase_loss")
Out[79]:
counts/DNase_loss epoch loss name profile/DNase_loss val_counts/DNase_loss val_loss val_profile/DNase_loss
2 0.202767 4.0 35.776703 w=1000,filters=128 35.573936 0.130209 38.799657 38.669447
1 0.201894 12.0 35.658200 w=1000 35.456307 0.134396 38.829834 38.695438
0 0.203932 11.0 35.877191 w=1000,filters=256 35.673259 0.134887 38.888322 38.753435
In [80]:
# select the model with the best val_counts/DNase_loss (see table above)
mdir = mdir_root / "w=1000,filters=128"   # for 256 it's the same as for w=1000
In [63]:
# create the TF session (argument presumably selects GPU 0 — confirm in basepair.config)
create_tf_session(0)
Out[63]:
<tensorflow.python.client.session.Session at 0x7f8094fd2208>
In [81]:
# load the trained keras model; the custom loss must be registered for deserialization
# NOTE(review): an *instance* is registered under the class name
# "MultichannelMultinomialNLL" — works here, but verify this is intentional
model = load_model(mdir / "model.h5",
                   custom_objects={"mc_multinomial_nll": MultichannelMultinomialNLL(1),
                                   "MultichannelMultinomialNLL": MultichannelMultinomialNLL(1)})
In [65]:
from basepair.config import test_chr
from basepair.preproc import AppendCounts
# NOTE(review): imports placed here instead of the top import cell
# build the test-set dataloader over held-out chromosomes;
# AppendCounts presumably adds total counts to the targets — confirm
ds = DataSpec.load(mdir / "dataspec.yaml")
dl_test = StrandedProfile(ds, 
                          incl_chromosomes=test_chr, 
                          peak_width=SEQ_WIDTH, 
                          shuffle=False,
                          target_transformer=AppendCounts())
Skipped 3 intervals outside of the genome size
In [66]:
# load the whole test set into memory (parallel workers)
test = dl_test.load_all(num_workers=28)
100%|██████████| 16641/16641 [00:22<00:00, 730.08it/s]
In [82]:
# run predictions for all test examples
y_pred = model.predict(test['inputs'], verbose=1)
532487/532487 [==============================] - 175s 328us/step
In [88]:
# w=1000,filters=128
# count-head correlation: mean observed counts vs predicted counts output
regression_eval(test["targets"]['counts/DNase'].mean(-1), 
                y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.05)
In [142]:
# w=1000,filters=128
# count-head correlation restricted to regions with non-zero observed counts;
# hoist the repeated target/prediction expressions and the mask
yt = test["targets"]['counts/DNase'].mean(-1)
yp = y_pred[ds.task2idx("DNase", 'counts')].mean(-1)
nonzero = yt > 0
regression_eval(yt[nonzero], yp[nonzero], alpha=0.01)
In [166]:
# w=1000,filters=128
plt.hexbin(
    y_pred[ds.task2idx("DNase", 'counts')].mean(-1),
    test["targets"]['counts/DNase'].mean(-1),
    gridsize=(40, 30),
    cmap=plt.cm.BuGn,
);
In [170]:
# w=1000,filters=128
# zoom into the bulk (1 < mean observed counts < 3.5);
# reuse ytc instead of recomputing the target expression, and build the mask once
ytc = test["targets"]['counts/DNase'].mean(-1)
bulk = (ytc > 1) & (ytc < 3.5)
plt.hexbin(y_pred[ds.task2idx("DNase", 'counts')].mean(-1)[bulk],
           ytc[bulk],
           gridsize=(30, 15),
           cmap=plt.cm.BuGn);
In [68]:
# w=1000
# NOTE(review): this is the same code as the filters=128 cell above; y_pred
# was computed with whichever model was last loaded — re-run load/predict
# after switching mdir for this label to be accurate
regression_eval(test["targets"]['counts/DNase'].mean(-1), 
                y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.05)
In [182]:
# w=1000
# NOTE(review): `counts` is defined further down (In [95]) — this cell fails
# under Restart & Run All; move the `counts` definition above this cell
regression_eval(test["targets"]['counts/DNase'].mean(-1)[counts > 25], 
                y_pred[ds.task2idx("DNase", 'counts')].mean(-1)[counts > 25], alpha=0.05)

Explore the distribution of output counts

In [89]:
import scipy.stats as stats
In [95]:
# total counts per region: sum over the last two axes
# (presumably positions and strands — confirm the profile array layout)
counts = test["targets"]['profile/DNase'].sum(axis=-1).sum(axis=-1)
In [163]:
# count distribution for the non-outlier regions (< 50 total counts)
plt.hist(counts[counts<50], bins=50);
In [179]:
# fraction of regions with more than 25 total counts
np.mean(counts > 25) # 0.4% of the data are outliers
Out[179]:
0.004871480430508163
In [178]:
# Poisson: poisson is the right loss function
# Q-Q plot against a Poisson with the empirical mean as its rate
# NOTE(review): sparams=(x) is a parenthesized scalar, not a tuple;
# (x,) would make the intent explicit
stats.probplot(counts[counts<25], sparams=(counts[counts<25].mean()), dist="poisson", plot=plt);
In [96]:
# log-normal
# Q-Q plot of log(1 + counts) against a normal distribution
stats.probplot(np.log(1 + counts), dist="norm", plot=plt);
In [90]:
# same as counts/DNase
# Q-Q plot of the model's count target against a normal distribution
stats.probplot(test["targets"]['counts/DNase'].mean(-1), dist="norm", plot=plt);
In [100]:
# Poisson
# Q-Q against Poisson over the full count range (outliers included)
stats.probplot(counts, sparams=(counts.mean()), dist="poisson", plot=plt);
In [101]:
# Negative binomial
# sparams=(n, p) appear hand-picked — TODO justify or fit them to the data
stats.probplot(counts, sparams=(0.2, 0.001), dist="nbinom", plot=plt);
In [177]:
# Negative binomial
# same hand-picked NB parameters, restricted to counts < 50
stats.probplot(counts[counts<50], sparams=(0.2, 0.001), dist="nbinom", plot=plt);