Goal

compare the results for oct-sox-nanog-klf5 (with and without dnase)

TODO

  • [x] compile the notebook for each

Load the results

In [1]:
from pathlib import Path
from basepair.config import get_data_dir
import pandas as pd
import matplotlib.pyplot as plt

ddir = get_data_dir()

dir_with_dnase = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf-dnase/models/")

dir_without_dnase = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models")
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
In [5]:
# performance comparison
def get_performance(mdir):
    try:
        dfh = pd.read_csv(f"{mdir}/history.csv")
        return dict(dfh.iloc[dfh.val_loss.idxmin()])
    except:
        return {}
In [6]:
df_exp_w = pd.DataFrame([{**get_performance(md), "name":md.name}
                        for md in dir_with_dnase.iterdir()])
In [7]:
df_exp_wo = pd.DataFrame([{**get_performance(md), "name":md.name}
                        for md in dir_without_dnase.iterdir()])
In [8]:
ls {dir_with_dnase}
c_task_weight=1/    filters=256/      tconv_kernel_size=101/
c_task_weight=100/  filters=32/       tconv_kernel_size=31/
c_task_weight=5/    n_dil_layers=12/  tconv_kernel_size=51/
c_task_weight=50/   n_dil_layers=3/
filters=128/        n_dil_layers=9/
In [9]:
ls {dir_with_dnase}
c_task_weight=1/    filters=256/      tconv_kernel_size=101/
c_task_weight=100/  filters=32/       tconv_kernel_size=31/
c_task_weight=5/    n_dil_layers=12/  tconv_kernel_size=51/
c_task_weight=50/   n_dil_layers=3/
filters=128/        n_dil_layers=9/

scatter-plot the best performance for each metric

In [10]:
dfm = pd.merge(df_exp_wo, df_exp_w, on='name', suffixes=("_wo", "_w"))
In [11]:
tasks = ['Sox2', 'Oct4', "Nanog", 'Klf4']
In [12]:
# Label the best performing model
imin = dfm['val_profile/Sox2_loss_wo'].idxmin()

fig = plt.figure(figsize=(15,6))
for r, metric in enumerate(['counts', 'profile']):
    for i, task in enumerate(tasks):
        plt.subplot(2, len(tasks), r*len(tasks) + i+1)
        plt.scatter(dfm[f'val_{metric}/{task}_loss_w'], dfm[f'val_{metric}/{task}_loss_wo'])
        plt.scatter(dfm[f'val_{metric}/{task}_loss_w'].iloc[imin], dfm[f'val_{metric}/{task}_loss_wo'].iloc[imin])
        minx = min(dfm[f'val_{metric}/{task}_loss_w'].min(), dfm[f'val_{metric}/{task}_loss_w'].min())
        maxx = max(dfm[f'val_{metric}/{task}_loss_w'].max(), dfm[f'val_{metric}/{task}_loss_w'].max())
        plt.plot([minx, maxx], [minx, maxx])
        plt.title(f"{metric}/{task}")
    
plt.xlabel("with DNase")
plt.ylabel("without DNase")
plt.tight_layout()

get the best results for each

In [13]:
# Best model
dfm.iloc[imin]['name']  # n_dil_layers=9
Out[13]:
'n_dil_layers=9'
In [14]:
# Best model
dfm.iloc[imin:(imin+1)]
Out[14]:
counts/Klf4_loss_wo counts/Nanog_loss_wo counts/Oct4_loss_wo counts/Sox2_loss_wo epoch_wo loss_wo name profile/Klf4_loss_wo profile/Nanog_loss_wo profile/Oct4_loss_wo ... val_counts/Klf4_loss_w val_counts/Nanog_loss_w val_counts/Oct4_loss_w val_counts/Sox2_loss_w val_loss_w val_profile/DNase_loss val_profile/Klf4_loss_w val_profile/Nanog_loss_w val_profile/Oct4_loss_w val_profile/Sox2_loss_w
12 0.517133 0.594023 0.397323 0.255188 18.0 2806.926117 n_dil_layers=9 646.416222 700.486733 880.784571 ... 0.537659 0.632297 0.389396 0.245566 4876.575714 1970.435313 655.200428 739.589288 903.619765 579.854008

1 rows × 43 columns

Run modisco template

In [15]:
import h5py
import numpy as np
import modisco
import modisco.util
import modisco.core
import modisco.tfmodisco_workflow.seqlets_to_patterns
from modisco.tfmodisco_workflow import workflow
from kipoi.readers import HDF5Reader
from scipy.spatial.distance import correlation
Can not use cuDNN on context None: cannot compile with cuDNN. We got this error:
b'/tmp/try_flags_u_hkwv9b.c:4:19: fatal error: cudnn.h: No such file or directory\ncompilation terminated.\n'
Mapped name None to device cuda: TITAN X (Pascal) (0000:02:00.0)
In [16]:
imp_scores = dir_without_dnase / "n_dil_layers=9/grad.test.h5"
grad_type = "weighted"
threshold = 0.1
outdir = "/tmp/modisco"
num_workers = 10
In [ ]:
# TODO - make sure the output folder exist
outdir = Path(outdir)
In [27]:
d = HDF5Reader.load(imp_scores)

tasks = list(d['targets']['profile'])

def mean(x):
    return sum(x) / len(x)

n = len(d['grads'][tasks[0]][grad_type][0])

distances = np.array([np.array([correlation(np.ravel(d['grads'][task][grad_type][0][i]), 
                                  np.ravel(d['grads'][task][grad_type][1][i]))
                          for i in range(n)])
             for task in tasks]).T.mean(axis=-1)  # average the distances across tasks

dist_filter = distances < threshold
np.save(outdir / "distances.npy", dist_filter)
In [ ]:
# number of kept sequences
print("Fraction of sequences kept: {dist_filter.mean()})

if isinstance(d['inputs'], dict):
    one_hot = d['inputs']['seq']
else:
    one_hot = d['inputs']

# get the importance scores
thr_hypothetical_contribs = {task: mean(d['grads'][task][grad_type])[dist_filter] for task in tasks}
thr_one_hot = one_hot[dist_filter]
thr_contrib_scores = {task: thr_hypothetical_contribs[task] * thr_one_hot for task in tasks}

tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
                    #Slight modifications from the default settings
                    sliding_window_size=15,
                    flank_size=5,
                    target_seqlet_fdr=0.01,
                    min_seqlets_per_task=1000,
                    seqlets_to_patterns_factory=
                     modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
                        trim_to_window_size=15,
                        initial_flank_to_add=5,
                        kmer_len=5, 
                        num_gaps=1,
                        num_mismatches=0,
                        n_cores=num_workers,
                        final_min_cluster_size=60)
                )(
                task_names=tasks,
                contrib_scores=thr_contrib_scores,  # -> task score 
                hypothetical_contribs=thr_hypothetical_contribs,
                one_hot=thr_one_hot)
In [36]:
tfmodisco_results
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-36-a07b72135197> in <module>()
----> 1 tfmodisco_results

NameError: name 'tfmodisco_results' is not defined
In [ ]:
# save the results
import h5py
import modisco.util
# !rm modisco_results_on_valid.hdf5
grp = h5py.File(outdir / "modisco.h5")
tfmodisco_results.save_hdf5(grp)
grp.close()
In [40]:
d['targets']['profile']['Klf4'].shape
Out[40]:
(18086, 1000, 2)

Filter the strands

In [ ]:
plt.correlate(mean(d['grads'][task][grad_type][0].reshape((n, -1)), mean(d['grads'][task][grad_type][1].reshape((n, -1)))
In [42]:
correlation(np.ravel(d['grads'][task][grad_type][0][0]), np.ravel(d['grads'][task][grad_type][1][0]))
Out[42]:
0.06321585178375244
In [55]:
import matplotlib.pyplot as plt
In [72]:
task = "Klf4"
In [73]:
distances = np.array([correlation(np.ravel(d['grads'][task][grad_type][0][i]), 
                                  np.ravel(d['grads'][task][grad_type][1][i]))
                          for i in range(n)])
In [74]:
plt.hist(distances, bins=100);
In [37]:
hypothetical_contribs['DNase'].shape
Out[37]:
(61670, 1000, 4)
In [30]:
# average across tasks
mean(d['grads'][task][grad_type]).shape
Out[30]:
(61670, 1000, 4)
In [ ]:
hypothetical_contribs
In [9]:
tasks
Out[9]:
['DNase']
In [7]:
d['targets']['profile'].keys()
Out[7]:
dict_keys(['DNase'])
In [10]:
hypothetical_contribs
one_hot = TODO
contrib_scores = pass
  File "<ipython-input-10-ba096094ff28>", line 3
    contrib_scores = pass
                        ^
SyntaxError: invalid syntax
In [2]:
import os
from basepair.BPNet import BPNetPredictor
from basepair.cli.schemas import DataSpec, HParams
from basepair.modisco import ModiscoResult
from scipy.spatial.distance import correlation
from concise.utils.helper import write_json, read_json
from basepair.data import numpy_minibatch
from kipoi.writers import HDF5BatchWriter
import h5py
import numpy as np
import keras.backend as K
from basepair.config import create_tf_session, valid_chr, test_chr
from keras.models import load_model
from basepair.cli.evaluate import load_data
import matplotlib.pyplot as plt
plt.switch_backend('agg')
In [3]:
def grad_score(model_dir, output_file, 
               batch_size=512,
               num_workers=10,
               chunk_size=512,
               split='valid',
               overwrite=False,
               gpu=None):
    if gpu is not None:
        create_tf_session(gpu)
    else:
        # Don't use any GPU's
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        
    if os.path.exists(output_file):
        if overwrite:
            os.remove(output_file)
        else:
            raise ValueError(f"File exists {output_file}. Use overwrite=True to overwrite it")

    if split=='valid':
        incl_chromosomes = valid_chr
        excl_chromosomes = None
    elif split=='test':
        incl_chromosomes = test_chr
        excl_chromosomes = None
    elif split=='train':
        incl_chromosomes = None
        excl_chromosomes = valid_chr + test_chr
    elif split=='all':
        incl_chromosomes = None
        excl_chromosomes = None
    else:
        raise ValueError("split needs to be from {train,valid,test,all}")
        
    from pathlib import Path
    model_dir = Path(model_dir)
    hp = HParams.load(model_dir / "hparams.yaml")
    ds = DataSpec.load(model_dir / "dataspec.yaml")
    tasks = list(ds.task_specs)
    
    from basepair.datasets import StrandedProfile
    seq_len = hp.data.kwargs['peak_width']
    dl_valid = StrandedProfile(ds, 
                              incl_chromosomes=incl_chromosomes, 
                              excl_chromosomes=excl_chromosomes, 
                              peak_width=seq_len,
                              shuffle=False,
                              target_transformer=None)
    
    from basepair.BPNet import BPNetPredictor, BiasModel
    from tqdm import tqdm 
    from basepair.losses import MultichannelMultinomialNLL, mc_multinomial_nll_2, mc_multinomial_nll_1, twochannel_multinomial_nll
    model = load_model(str(model_dir / "model.h5"),
                       custom_objects={"mc_multinomial_nll_1": mc_multinomial_nll_1,
                                       "mc_multinomial_nll_2": mc_multinomial_nll_2,
                                       "mc_multinomial_nll": mc_multinomial_nll_1,
                                       "MultichannelMultinomialNLL": MultichannelMultinomialNLL(1),
                                       "twochannel_multinomial_nll": twochannel_multinomial_nll})
    
    bpnet = BPNetPredictor(model, fasta_file=ds.fasta_file, tasks=tasks)
    
    # setup the bias model
    if [task for task, task_spec in ds.task_specs.items() if task_spec.bias_model]:
        bm = BiasModel(ds)
    else:
        bm = lambda x: x
        
    writer = HDF5BatchWriter(output_file, chunk_size=chunk_size)
    for i, batch in enumerate(tqdm(dl_valid.batch_iter(batch_size=batch_size, num_workers=num_workers))):
        (batch['inputs'], batch['targets']) = bm((batch['inputs'], batch['targets']))  # append the bias model predictions
        
        
        if i > 3:
            break
        wdict=batch
        for task_i, task in enumerate(tasks):
            for seq_grad in ['count', 'weighted']:
                # figure out the number of channels
                nstrands = batch['targets'][f'profile/{task}'].shape[-1]
                strand_hash = ["pos", "neg"]
                
                for strand_i in range(nstrands):
                    grads_pos = bpnet.input_grad(batch['inputs'], 
                                                 strand=strand_hash[strand_i], 
                                                 task_id=task_i, 
                                                 seq_grad=seq_grad, 
                                                 batch_size=None)
                    wdict[f"/grads/{task}/{seq_grad}/{strand_i}"] = grads_pos
        writer.batch_write(wdict)
    writer.close()
In [4]:
%load_ext line_profiler
In [5]:
BPNetPredictor
Out[5]:
basepair.BPNet.BPNetPredictor
In [6]:
fname = dir_without_dnase/"n_dil_layers=9/grads.test.benchmark9.h5"
!rm {fname}
%lprun -f BPNetPredictor.input_grad grad_score(dir_without_dnase / "n_dil_layers=9", fname, \
                                batch_size=512, num_workers=10,chunk_size=512,split='valid',overwrite=False, gpu=1)
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-08-20 16:45:44,075 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-08-20 16:45:55,885 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
 11%|█         | 4/38 [01:04<09:04, 16.02s/it]
Timer unit: 1e-06 s

Total time: 38.3393 s
File: /users/avsec/workspace/basepair/basepair/BPNet.py
Function: input_grad at line 68

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    68                                               def input_grad(self, x, strand='pos', task_id=0, seq_grad='max', batch_size=512):
    69                                                   """
    70                                                   Args:
    71                                                     strand: 'pos' or 'neg'
    72                                                     seq_grad: max, count, mean or weighted
    73                                                   """
    74        64        268.0      4.2      0.0          def input_to_list(input_names, x):
    75                                                       if isinstance(x, list):
    76                                                           return x
    77                                                       elif isinstance(x, dict):
    78                                                           return [x[k] for k in input_names]
    79                                                       else:
    80                                                           return [x]
    81                                                       
    82        64        307.0      4.8      0.0          input_names = self.model.input_names
    83        64        167.0      2.6      0.0          assert input_names[0] == "seq"
    84                                                   # get the gradient function
    85        64   16072678.0 251135.6     41.9          fn = self.grad_fn(strand, task_id, seq_grad)
    86        64        150.0      2.3      0.0          if batch_size is None:
    87        64   22265714.0 347901.8     58.1              return fn(input_to_list(input_names, x))[0]
    88                                                   else:
    89                                                       return numpy_collate_concat([fn(input_to_list(input_names, batch))[0]
    90                                                                                    for batch in nested_numpy_minibatch(x, batch_size=batch_size)])
In [36]:
fname = dir_without_dnase/"n_dil_layers=9/grads.test.benchmark9.h5"
!rm {fname}
%lprun -f BPNetPredictor.input_grad grad_score(dir_without_dnase / "n_dil_layers=9", fname, \
                                batch_size=512,num_workers=10,chunk_size=512,split='valid',overwrite=False, gpu=1)
  0%|          | 0/38 [00:00<?, ?it/s]
 11%|█         | 4/38 [04:16<36:17, 64.04s/it]
Timer unit: 1e-06 s

Total time: 230.569 s
File: /users/avsec/workspace/basepair/basepair/BPNet.py
Function: input_grad at line 68

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    68                                               def input_grad(self, x, strand='pos', task_id=0, seq_grad='max', batch_size=512):
    69                                                   """
    70                                                   Args:
    71                                                     strand: 'pos' or 'neg'
    72                                                     seq_grad: max, count, mean or weighted
    73                                                   """
    74        64        553.0      8.6      0.0          def input_to_list(input_names, x):
    75                                                       if isinstance(x, list):
    76                                                           return x
    77                                                       elif isinstance(x, dict):
    78                                                           return [x[k] for k in input_names]
    79                                                       else:
    80                                                           return [x]
    81                                                       
    82        64        726.0     11.3      0.0          input_names = self.model.input_names
    83        64        249.0      3.9      0.0          assert input_names[0] == "seq"
    84                                                   # get the gradient function
    85        64   17581380.0 274709.1      7.6          fn = self.grad_fn(strand, task_id, seq_grad)
    86        64        245.0      3.8      0.0          return numpy_collate_concat([fn(input_to_list(input_names, batch))[0]
    87        64  212985549.0 3327899.2     92.4                                       for batch in nested_numpy_minibatch(x, batch_size=batch_size)])
In [22]:
%lprun -f BPNetPredictor.input_grad grad_score(dir_without_dnase / "n_dil_layers=9", dir_without_dnase/"n_dil_layers=9/grads.test.benchmark6.h5", \
                                batch_size=512,num_workers=10,chunk_size=512,split='valid',overwrite=False, gpu=1)
  0%|          | 0/38 [00:00<?, ?it/s]
 11%|█         | 4/38 [08:46<1:14:38, 131.71s/it]
Timer unit: 1e-06 s

Total time: 500.275 s
File: /users/avsec/workspace/basepair/basepair/BPNet.py
Function: input_grad at line 38

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    38                                               def input_grad(self, x, strand='pos', task_id=0, seq_grad='max', batch_size=512):
    39                                                   """
    40                                                   Args:
    41                                                     strand: 'pos' or 'neg'
    42                                                     seq_grad: max, count, mean or weighted
    43                                                   """
    44                                           
    45        64        382.0      6.0      0.0          strand_id = {"pos": 0, "neg": 1}[strand]
    46        64       1696.0     26.5      0.0          inp = self.model.inputs[0] # always compute the gradient w.r.t. the sequene
    47        64        185.0      2.9      0.0          input_names = self.model.input_names
    48        64        383.0      6.0      0.0          assert input_names[0] == "seq"
    49                                                   
    50                                                   
    51        64        169.0      2.6      0.0          def input_to_list(input_names, x):
    52                                                       if isinstance(x, list):
    53                                                           return x
    54                                                       elif isinstance(x, dict):
    55                                                           return [x[k] for k in input_names]
    56                                                       else:
    57                                                           return [x]
    58                                                   
    59        64        147.0      2.3      0.0          if seq_grad == 'count':
    60        32   36493254.0 1140414.2      7.3              fn = K.function(self.model.inputs,
    61        32        182.0      5.7      0.0                              K.gradients(self.model.outputs[len(self.tasks) + task_id][:, strand_id], inp))
    62        32  213549799.0 6673431.2     42.7              return numpy_collate_concat([fn(input_to_list(input_names, batch))[0]
    63                                                                                    for batch in nested_numpy_minibatch(x, batch_size=batch_size)])
    64                                           
    65        32         70.0      2.2      0.0          
    66                                                   if seq_grad == 'weighted':
    67        32     504805.0  15775.2      0.1              # \sum softmax(y)[i] * dy[i]
    68        32   34798434.0 1087451.1      7.0              fn = K.function(self.model.inputs, K.gradients(K.sum(K.stop_gradient(K.softmax(self.model.outputs[task_id][:, :, strand_id])) *
    69                                                                                                self.model.outputs[task_id][:, :, strand_id], axis=-1), inp))
    70                                                   else:
    71                                                       if seq_grad == 'max':
    72                                                           sfn = K.max
    73                                                       elif seq_grad == 'mean':
    74                                                           sfn = K.mean
    75                                                       else:
    76                                                           raise ValueError(f"seq_grad={seq_grad} couldn't be interpreted")
    77                                                       fn = K.function(self.model.inputs, K.gradients(sfn(self.model.outputs[task_id][:, :, strand_id], axis=-1), inp))
    78        32        172.0      5.4      0.0          
    79        32  214925810.0 6716431.6     43.0          return numpy_collate_concat([fn(input_to_list(input_names, batch))[0]
    80                                                                                for batch in nested_numpy_minibatch(x, batch_size=batch_size)])
In [13]:
%lprun -f grad_score grad_score(dir_without_dnase / "n_dil_layers=9", dir_without_dnase/"n_dil_layers=9/grads.test.benchmark4.h5", \
                                batch_size=512,num_workers=10,chunk_size=512,split='valid',overwrite=False, gpu=1)
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-08-20 14:28:04,474 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-08-20 14:28:17,970 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
 29%|██▉       | 11/38 [11:13<27:32, 61.22s/it]
Timer unit: 1e-06 s

Total time: 697.415 s
File: <ipython-input-4-aff5fe2039ef>
Function: grad_score at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def grad_score(model_dir, output_file, 
     2                                                          batch_size=512,
     3                                                          num_workers=10,
     4                                                          chunk_size=512,
     5                                                          split='valid',
     6                                                          overwrite=False,
     7                                                          gpu=None):
     8         1          6.0      6.0      0.0      if gpu is not None:
     9         1     642948.0 642948.0      0.1          create_tf_session(gpu)
    10                                               else:
    11                                                   # Don't use any GPU's
    12                                                   os.environ['CUDA_VISIBLE_DEVICES'] = ''
    13                                                   
    14         1        147.0    147.0      0.0      if os.path.exists(output_file):
    15                                                   if overwrite:
    16                                                       os.remove(output_file)
    17                                                   else:
    18                                                       raise ValueError(f"File exists {output_file}. Use overwrite=True to overwrite it")
    19                                           
    20         1          2.0      2.0      0.0      if split=='valid':
    21         1          3.0      3.0      0.0          incl_chromosomes = valid_chr
    22         1          1.0      1.0      0.0          excl_chromosomes = None
    23                                               elif split=='test':
    24                                                   incl_chromosomes = test_chr
    25                                                   excl_chromosomes = None
    26                                               elif split=='train':
    27                                                   incl_chromosomes = None
    28                                                   excl_chromosomes = valid_chr + test_chr
    29                                               elif split=='all':
    30                                                   incl_chromosomes = None
    31                                                   excl_chromosomes = None
    32                                               else:
    33                                                   raise ValueError("split needs to be from {train,valid,test,all}")
    34                                                   
    35         1         26.0     26.0      0.0      from pathlib import Path
    36         1        115.0    115.0      0.0      model_dir = Path(model_dir)
    37         1      50173.0  50173.0      0.0      hp = HParams.load(model_dir / "hparams.yaml")
    38         1      28092.0  28092.0      0.0      ds = DataSpec.load(model_dir / "dataspec.yaml")
    39         1         12.0     12.0      0.0      tasks = list(ds.task_specs)
    40                                               
    41         1         18.0     18.0      0.0      from basepair.datasets import StrandedProfile
    42         1          4.0      4.0      0.0      seq_len = hp.data.kwargs['peak_width']
    43         1          3.0      3.0      0.0      dl_valid = StrandedProfile(ds, 
    44         1          3.0      3.0      0.0                                incl_chromosomes=incl_chromosomes, 
    45         1          3.0      3.0      0.0                                excl_chromosomes=excl_chromosomes, 
    46         1          3.0      3.0      0.0                                peak_width=seq_len,
    47         1          3.0      3.0      0.0                                shuffle=False,
    48         1     399332.0 399332.0      0.1                                target_transformer=None)
    49                                               
    50         1          9.0      9.0      0.0      from basepair.BPNet import BPNetPredictor, BiasModel
    51         1          6.0      6.0      0.0      from tqdm import tqdm 
    52         1          6.0      6.0      0.0      from basepair.losses import MultichannelMultinomialNLL, mc_multinomial_nll_2, mc_multinomial_nll_1, twochannel_multinomial_nll
    53         1         63.0     63.0      0.0      model = load_model(str(model_dir / "model.h5"),
    54         1          3.0      3.0      0.0                         custom_objects={"mc_multinomial_nll_1": mc_multinomial_nll_1,
    55         1          2.0      2.0      0.0                                         "mc_multinomial_nll_2": mc_multinomial_nll_2,
    56         1          2.0      2.0      0.0                                         "mc_multinomial_nll": mc_multinomial_nll_1,
    57         1         13.0     13.0      0.0                                         "MultichannelMultinomialNLL": MultichannelMultinomialNLL(1),
    58         1   22638616.0 22638616.0      3.2                                         "twochannel_multinomial_nll": twochannel_multinomial_nll})
    59                                               
    60         1         12.0     12.0      0.0      bpnet = BPNetPredictor(model, fasta_file=ds.fasta_file, tasks=tasks)
    61                                               
    62                                               # setup the bias model
    63         1         38.0     38.0      0.0      if [task for task, task_spec in ds.task_specs.items() if task_spec.bias_model]:
    64                                                   bm = BiasModel(ds)
    65                                               else:
    66         1          2.0      2.0      0.0          bm = lambda x: x
    67                                                   
    68         1        892.0    892.0      0.0      writer = HDF5BatchWriter(output_file, chunk_size=chunk_size)
    69         1          3.0      3.0      0.0      grad_pos = None
    70        12    2032977.0 169414.8      0.3      for i, batch in enumerate(tqdm(dl_valid.batch_iter(batch_size=batch_size, num_workers=num_workers))):
    71        12        158.0     13.2      0.0          (batch['inputs'], batch['targets']) = bm((batch['inputs'], batch['targets']))  # append the bias model predictions
    72                                                   
    73                                                   
    74        12         38.0      3.2      0.0          if i > 10:
    75         1          6.0      6.0      0.0              break
    76        11        836.0     76.0      0.0          wdict=batch
    77        55        617.0     11.2      0.0          for task_i, task in enumerate(tasks):
    78       132        665.0      5.0      0.0              for seq_grad in ['count', 'weighted']:
    79                                                           # figure out the number of channels
    80        88       1037.0     11.8      0.0                  nstrands = batch['targets'][f'profile/{task}'].shape[-1]
    81        88        508.0      5.8      0.0                  strand_hash = ["pos", "neg"]
    82                                                           
    83       264       1563.0      5.9      0.0                  for strand_i in range(nstrands):
    84       176        649.0      3.7      0.0                      if grad_pos is None:
    85       176       1607.0      9.1      0.0                          grads_pos = bpnet.input_grad(batch['inputs'], 
    86       176        662.0      3.8      0.0                                                       strand=strand_hash[strand_i], 
    87       176        605.0      3.4      0.0                                                       task_id=task_i, 
    88       176        597.0      3.4      0.0                                                       seq_grad=seq_grad, 
    89       176  605497562.0 3440327.1     86.8                                                       batch_size=batch_size)
    90       176       2541.0     14.4      0.0                      wdict[f"/grads/{task}/{seq_grad}/{strand_i}"] = grads_pos
    91        11   66110947.0 6010086.1      9.5          writer.batch_write(wdict)
    92         1        684.0    684.0      0.0      writer.close()
In [11]:
%lprun -f grad_score grad_score(dir_without_dnase / "n_dil_layers=9", dir_without_dnase/"n_dil_layers=9/grads.test.benchmark.h5", batch_size=512,num_workers=10,chunk_size=512,split='valid',overwrite=False,gpu=1)
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-08-19 04:49:29,842 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-08-19 04:49:41,625 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
  8%|▊         | 3/38 [01:56<22:40, 38.88s/it]
Timer unit: 1e-06 s

Total time: 136.238 s
File: <ipython-input-2-1a922ed4377f>
Function: grad_score at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def grad_score(model_dir, output_file, 
     2                                                          batch_size=512,
     3                                                          num_workers=10,
     4                                                          chunk_size=512,
     5                                                          split='valid',
     6                                                          overwrite=False,
     7                                                          gpu=None):
     8         1          4.0      4.0      0.0      if gpu is not None:
     9         1       1571.0   1571.0      0.0          create_tf_session(gpu)
    10                                               else:
    11                                                   # Don't use any GPU's
    12                                                   os.environ['CUDA_VISIBLE_DEVICES'] = ''
    13                                                   
    14         1         74.0     74.0      0.0      if os.path.exists(output_file):
    15                                                   if overwrite:
    16                                                       os.remove(output_file)
    17                                                   else:
    18                                                       raise ValueError(f"File exists {output_file}. Use overwrite=True to overwrite it")
    19                                           
    20         1          3.0      3.0      0.0      if split=='valid':
    21         1          2.0      2.0      0.0          incl_chromosomes = valid_chr
    22         1          2.0      2.0      0.0          excl_chromosomes = None
    23                                               elif split=='test':
    24                                                   incl_chromosomes = test_chr
    25                                                   excl_chromosomes = None
    26                                               elif split=='train':
    27                                                   incl_chromosomes = None
    28                                                   excl_chromosomes = valid_chr + test_chr
    29                                               elif split=='all':
    30                                                   incl_chromosomes = None
    31                                                   excl_chromosomes = None
    32                                               else:
    33                                                   raise ValueError("split needs to be from {train,valid,test,all}")
    34                                                   
    35         1          8.0      8.0      0.0      from pathlib import Path
    36         1         72.0     72.0      0.0      model_dir = Path(model_dir)
    37         1      21738.0  21738.0      0.0      hp = HParams.load(model_dir / "hparams.yaml")
    38         1      18532.0  18532.0      0.0      ds = DataSpec.load(model_dir / "dataspec.yaml")
    39         1         10.0     10.0      0.0      tasks = list(ds.task_specs)
    40                                               
    41         1         14.0     14.0      0.0      from basepair.datasets import StrandedProfile
    42         1          3.0      3.0      0.0      seq_len = hp.data.kwargs['peak_width']
    43         1          3.0      3.0      0.0      dl_valid = StrandedProfile(ds, 
    44         1          2.0      2.0      0.0                                incl_chromosomes=incl_chromosomes, 
    45         1          2.0      2.0      0.0                                excl_chromosomes=excl_chromosomes, 
    46         1          2.0      2.0      0.0                                peak_width=seq_len,
    47         1          2.0      2.0      0.0                                shuffle=False,
    48         1     223989.0 223989.0      0.2                                target_transformer=None)
    49                                               
    50         1         10.0     10.0      0.0      from basepair.BPNet import BPNetPredictor, BiasModel
    51         1          7.0      7.0      0.0      from tqdm import tqdm 
    52         1          7.0      7.0      0.0      from basepair.losses import MultichannelMultinomialNLL, mc_multinomial_nll_2, mc_multinomial_nll_1, twochannel_multinomial_nll
    53         1         61.0     61.0      0.0      model = load_model(str(model_dir / "model.h5"),
    54         1          2.0      2.0      0.0                         custom_objects={"mc_multinomial_nll_1": mc_multinomial_nll_1,
    55         1          2.0      2.0      0.0                                         "mc_multinomial_nll_2": mc_multinomial_nll_2,
    56         1          2.0      2.0      0.0                                         "mc_multinomial_nll": mc_multinomial_nll_1,
    57         1         16.0     16.0      0.0                                         "MultichannelMultinomialNLL": MultichannelMultinomialNLL(1),
    58         1   19092428.0 19092428.0     14.0                                         "twochannel_multinomial_nll": twochannel_multinomial_nll})
    59                                               
    60         1         18.0     18.0      0.0      bpnet = BPNetPredictor(model, fasta_file=ds.fasta_file, tasks=tasks)
    61                                               
    62                                               # setup the bias model
    63         1         44.0     44.0      0.0      if [task for task, task_spec in ds.task_specs.items() if task_spec.bias_model]:
    64                                                   bm = BiasModel(ds)
    65                                               else:
    66         1          4.0      4.0      0.0          bm = lambda x: x
    67                                                   
    68         1        972.0    972.0      0.0      writer = HDF5BatchWriter(output_file, chunk_size=chunk_size)
    69         4    1938957.0 484739.2      1.4      for i, batch in enumerate(tqdm(dl_valid.batch_iter(batch_size=batch_size, num_workers=num_workers))):
    70         4         32.0      8.0      0.0          (batch['inputs'], batch['targets']) = bm((batch['inputs'], batch['targets']))  # append the bias model predictions
    71                                                   
    72         4          8.0      2.0      0.0          if i > 2:
    73         1          6.0      6.0      0.0              break
    74         3         63.0     21.0      0.0          wdict=batch
    75        15        167.0     11.1      0.0          for task_i, task in enumerate(tasks):
    76        36        144.0      4.0      0.0              for seq_grad in ['count', 'weighted']:
    77                                                           # figure out the number of channels
    78        24        211.0      8.8      0.0                  nstrands = batch['targets'][f'profile/{task}'].shape[-1]
    79        24        107.0      4.5      0.0                  strand_hash = ["pos", "neg"]
    80                                                           
    81        72        365.0      5.1      0.0                  for strand_i in range(nstrands):
    82        48        273.0      5.7      0.0                      grads_pos = bpnet.input_grad(batch['inputs'], 
    83        48        172.0      3.6      0.0                                                   strand=strand_hash[strand_i], 
    84        48        158.0      3.3      0.0                                                   task_id=task_i, 
    85        48        148.0      3.1      0.0                                                   seq_grad=seq_grad, 
    86        48   97994255.0 2041547.0     71.9                                                   batch_size=batch_size)
    87        48        604.0     12.6      0.0                      wdict[f"/grads/{task}/{seq_grad}/{strand_i}"] = grads_pos
    88         3   16941897.0 5647299.0     12.4          writer.batch_write(wdict)
    89         1        537.0    537.0      0.0      writer.close()