In [1]:
# Example parameters
ddir= "/users/avsec/workspace/basepair/data/"
model_file = f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/filters=128/model.h5"
dataspec_file = f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/filters=128/dataspec.yaml"
history_file = f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/filters=128/history.csv"
seq_width = 1000
gpu = 0
num_workers = 10
In [2]:
# Parameters
model_file = "/srv/scratch/avsec/workspace/chipnexus/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/model.h5"
dataspec_file = "/srv/scratch/avsec/workspace/chipnexus/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/dataspec.yaml"
history_file = "/srv/scratch/avsec/workspace/chipnexus/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/history.csv"
seq_width = 1000
gpu = 1
num_workers = 10
In [3]:
import basepair
import pandas as pd
import numpy as np
from basepair.cli.schemas import DataSpec, TaskSpec
from pathlib import Path
from keras.models import load_model
from basepair.config import create_tf_session, get_data_dir
from basepair.datasets import StrandedProfile
from basepair.preproc import AppendCounts
from basepair.losses import MultichannelMultinomialNLL
from basepair.config import valid_chr, test_chr
from basepair.plots import regression_eval, plot_loss
from basepair.cli.evaluate import eval_profile
from basepair import samplers 
from basepair.math import softmax

ddir = get_data_dir()
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
In [4]:
create_tf_session(gpu)
Out[4]:
<tensorflow.python.client.session.Session at 0x7f1fc45a2dd8>
In [5]:
# the custom loss must be passed via custom_objects so Keras can deserialize the model
model = load_model(model_file, custom_objects={"mc_multinomial_nll": MultichannelMultinomialNLL(1),
                                               "MultichannelMultinomialNLL": MultichannelMultinomialNLL(1)})
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
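
Loading the model requires passing the custom profile loss via custom_objects so Keras can deserialize it. Conceptually it is a multinomial negative log-likelihood over the positions of each profile channel; the following is only a rough sketch of that idea for a single channel (not the library implementation, and the TF 1.x contrib distribution used here is an assumption about the environment):

import tensorflow as tf

def multinomial_nll_sketch(true_counts, logits):
    """Sketch: multinomial NLL for one profile channel.
    true_counts, logits: tensors of shape (batch, seq_len)."""
    dist = tf.contrib.distributions.Multinomial(
        total_count=tf.reduce_sum(true_counts, axis=-1),
        logits=logits)
    # negative mean log-likelihood of the observed count vector
    return -tf.reduce_mean(dist.log_prob(true_counts))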

Learning curves

In [6]:
ds = DataSpec.load(dataspec_file)
tasks = list(ds.task_specs)
In [7]:
# Best metrics
dfh = pd.read_csv(history_file)
dict(dfh.iloc[dfh.val_loss.idxmin()])
Out[7]:
{'epoch': 18.0,
 'counts/Klf4_loss': 0.5171329292885835,
 'counts/Nanog_loss': 0.5940229764048089,
 'counts/Oct4_loss': 0.39732301427169175,
 'counts/Sox2_loss': 0.25518755363548445,
 'loss': 2806.9261171801936,
 'profile/Klf4_loss': 646.4162220664094,
 'profile/Nanog_loss': 700.4867328182585,
 'profile/Oct4_loss': 880.7845713663878,
 'profile/Sox2_loss': 561.601928508652,
 'val_counts/Klf4_loss': 0.5109416802115981,
 'val_counts/Nanog_loss': 0.6022814974290613,
 'val_counts/Oct4_loss': 0.3760230953767513,
 'val_counts/Sox2_loss': 0.24623416733466486,
 'val_loss': 2858.8069611719043,
 'val_profile/Klf4_loss': 646.0904789701318,
 'val_profile/Nanog_loss': 725.1349713717607,
 'val_profile/Oct4_loss': 894.9416634601201,
 'val_profile/Sox2_loss': 575.2850453742612}
In [8]:
plot_loss(dfh, [f"{p}/{task}_loss"
                for task in tasks
                for p in ['profile']
               ], figsize=(len(tasks)*4, 4))
In [9]:
plot_loss(dfh, [f"{p}/{task}_loss"
                for task in tasks
                for p in ['counts']
               ], figsize=(len(tasks)*4, 4))
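
For reference, the same curves can be drawn by hand from the history dataframe; a minimal sketch, assuming the <metric> / val_<metric> column naming shown above:

import matplotlib.pyplot as plt

metrics = [f"profile/{task}_loss" for task in tasks]
fig, axes = plt.subplots(1, len(metrics), figsize=(4 * len(metrics), 4))
for ax, m in zip(axes, metrics):
    ax.plot(dfh['epoch'], dfh[m], label='train')           # training loss per epoch
    ax.plot(dfh['epoch'], dfh['val_' + m], label='valid')  # validation loss per epoch
    ax.set_title(m)
    ax.set_xlabel('epoch')
    ax.legend()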

Evaluation

In [10]:
dl_valid = StrandedProfile(ds,
                           incl_chromosomes=valid_chr,
                           peak_width=seq_width,
                           shuffle=False,
                           target_transformer=AppendCounts())
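
AppendCounts adds a total-counts target next to each base-resolution profile so the counts head has something to regress against. A rough sketch of the idea (not the library implementation; the exact transform, e.g. the log scaling, is an assumption):

import numpy as np

def append_total_counts(targets, tasks):
    """targets: dict with 'profile/<task>' arrays of shape (N, seq_len, 2)."""
    out = dict(targets)
    for task in tasks:
        profile = targets[f"profile/{task}"]
        # per-strand total counts; the log(1 + x) scaling is an assumption
        out[f"counts/{task}"] = np.log(1 + profile.sum(axis=1))
    return out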
In [11]:
valid = dl_valid.load_all(num_workers=num_workers)
y_true = valid["targets"]
100%|██████████| 599/599 [00:04<00:00, 145.56it/s]
In [12]:
from basepair.BPNet import BiasModel
In [13]:
# if any task specifies a bias model, transform the inputs with it before prediction
if [task for task, task_spec in ds.task_specs.items() if task_spec.bias_model]:
    bm = BiasModel(ds)
    valid['inputs'] = bm.predict([valid["inputs"]])[0]
In [14]:
y_pred = model.predict(valid['inputs'], verbose=1)
19137/19137 [==============================] - 7s 359us/step
In [15]:
import matplotlib.pyplot as plt
In [16]:
for task in tasks:
    plt.figure()
    yt = y_true[f'counts/{task}'].mean(-1)
    yp = y_pred[ds.task2idx(task, 'counts')].mean(-1)
    regression_eval(yt, yp, alpha=0.1, task=task)
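
regression_eval scatters predicted against observed counts per task. As an additional one-number summary, the rank correlation can be computed directly on the same yt/yp arrays; a small sketch using scipy:

from scipy.stats import spearmanr

for task in tasks:
    yt = y_true[f'counts/{task}'].mean(-1)
    yp = y_pred[ds.task2idx(task, 'counts')].mean(-1)
    rho, _ = spearmanr(yt, yp)  # rank correlation of observed vs. predicted counts
    print(f"{task}: Spearman r = {rho:.3f}")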

Profile evaluation

In [17]:
from joblib import Parallel, delayed
import basepair
In [18]:
task = tasks[0]
In [19]:
yp = softmax(y_pred[ds.task2idx(task, "profile")])
yt = y_true["profile/" + task]
In [20]:
# per-position count fraction within each region (+1 pseudocount in the total)
x = np.ravel(yt / (1 + yt.sum(axis=-2, keepdims=True)))
In [21]:
plt.figure(figsize=(12, 3))
plt.subplot(121)
plt.hist(x[x < 0.04], bins=100);
plt.subplot(122)
plt.hist(x[(x < 0.04) & (x > 0.0001)], bins=100);
In [22]:
np.mean(x>0.0001)
Out[22]:
0.1517862517635993
In [23]:
out_df = Parallel(n_jobs=len(tasks))(delayed(basepair.cli.evaluate.eval_profile)(y_true["profile/" + task], 
                                                                                 softmax(y_pred[ds.task2idx(task, "profile")]), 
                                                                                 binsizes=[1,10],
                                                                                 pos_min_threshold=0.015,
                                                                                 neg_max_threshold=0.005,)
                 for task in tasks)
df = pd.concat([out_df[i].assign(task=task) for i,task in enumerate(tasks)])
/users/avsec/workspace/basepair/basepair/cli/evaluate.py:147: RuntimeWarning: invalid value encountered in true_divide
  fracs = yt / yt.sum(axis=1, keepdims=True)
/users/avsec/workspace/basepair/basepair/cli/evaluate.py:148: RuntimeWarning: invalid value encountered in greater_equal
  is_peak = (fracs >= pos_min_threshold).astype(float)
/users/avsec/workspace/basepair/basepair/cli/evaluate.py:149: RuntimeWarning: invalid value encountered in less
  ambigous = (fracs < pos_min_threshold) & (fracs >= neg_max_threshold)
/users/avsec/workspace/basepair/basepair/cli/evaluate.py:149: RuntimeWarning: invalid value encountered in greater_equal
  ambigous = (fracs < pos_min_threshold) & (fracs >= neg_max_threshold)
In [24]:
df
Out[24]:
   auprc     binsize  frac_ambigous  imbalance  n_positives  random_auprc  task
0  0.198621  1        0.070386       0.002733   41672        0.007400      Oct4
1  0.551732  10       0.352983       0.032840   34855        0.154641      Oct4
0  0.364433  1        0.073087       0.005662   20079        0.019916      Sox2
1  0.733935  10       0.338500       0.054846   13881        0.289232      Sox2
0  0.439036  1        0.060686       0.006543   73217        0.028797      Nanog
1  0.773083  10       0.236276       0.049070   44649        0.305059      Nanog
0  0.246301  1        0.069568       0.004085   35146        0.014666      Klf4
1  0.689117  10       0.310053       0.040917   26102        0.228612      Klf4
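
eval_profile labels a position as positive when its within-region count fraction reaches pos_min_threshold, treats the band between neg_max_threshold and pos_min_threshold as ambiguous, and reports auPRC at base resolution and after 10 bp binning (random_auprc is a baseline for comparison). A sketch of the labelling step, following the fracs/is_peak lines visible in the warnings above (the NaN masking of the ambiguous band is an assumption):

import numpy as np

def label_positions(yt, pos_min_threshold=0.015, neg_max_threshold=0.005):
    """yt: observed counts of shape (n_regions, seq_len)."""
    fracs = yt / yt.sum(axis=1, keepdims=True)            # count fraction per position
    is_peak = (fracs >= pos_min_threshold).astype(float)  # positive label
    ambiguous = (fracs < pos_min_threshold) & (fracs >= neg_max_threshold)
    is_peak[ambiguous] = np.nan                           # assumed to be excluded from the auPRC
    return is_peak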

Profile plots

In [25]:
from basepair.plots import plot_profiles
from basepair import samplers
In [26]:
tasks = list(ds.task_specs)
task = tasks[0]
In [27]:
#idx_list = samplers.top_sum_count(y_true['profile/Nanog'], 10)
In [28]:
#idx_list = samplers.top_sum_count(y_true['profile/Oct4'], 10)
In [29]:
# Total sum
#idx_list = samplers.top_sum_count(sum([y_true[f'profile/{task}'].mean(-1) for task in tasks])[:,:,np.newaxis], 10)
In [30]:
#idx_list = samplers.top_max_count(y_true['profile/Oct4'], 10)
In [31]:
np.random.seed(42)
idx_list = samplers.random(y_true[f'profile/{task}'], 20)
In [32]:
idx_list[:4]
Out[32]:
[12650, 11668, 8499, 11526]
In [33]:
plot_profiles(y_true, y_pred, tasks, idx_list, preproc=None, figsize=(24, 8), tic_freq=50, xlim=[200,800])
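
plot_profiles overlays the observed and predicted tracks for the sampled regions. For a single region and task, a hand-rolled version looks roughly like this (a sketch using the arrays above; scaling the softmax output to the observed per-strand depth is an assumption):

import matplotlib.pyplot as plt

i = idx_list[0]
obs = y_true[f'profile/{task}'][i]                        # (seq_len, 2), observed counts
prob = softmax(y_pred[ds.task2idx(task, 'profile')])[i]   # predicted positional probabilities
pred = obs.sum(axis=0, keepdims=True) * prob              # scale to observed per-strand totals

fig, axes = plt.subplots(2, 1, figsize=(12, 4), sharex=True)
for strand, ax in enumerate(axes):
    ax.plot(obs[:, strand], lw=0.8, label='observed')
    ax.plot(pred[:, strand], lw=0.8, label='predicted')
    ax.set_xlim(200, 800)
axes[0].set_title(task)
axes[0].legend()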