Goal

  • benchmark the dataloader

get the training data

In [2]:
import basepair
import pandas as pd
import numpy as np
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from pathlib import Path
from basepair.config import create_tf_session, get_data_dir

ddir = get_data_dir()

create_tf_session(0)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
Out[2]:
<tensorflow.python.client.session.Session at 0x7f4e40816e48>
In [3]:
dpath = Path("/mnt/lab_data/users/avsec/dnase_bias")

SEQ_WIDTH = 1000

ds = DataSpec(
    task_specs={"DNase": TaskSpec(task="DNase",
                                 pos_counts=dpath / "IMR90_Naked_DNase.pos.bw",
                                 neg_counts=dpath / "IMR90_Naked_DNase.neg.bw",
                                 peaks=dpath / "genome.stride200.w200.nonblacklist.bed")},
    fasta_file=dpath / "GRCh38.genome.fa",
              )
In [4]:
dpath = Path("/mnt/lab_data/users/avsec/dnase_bias")

SEQ_WIDTH = 1000

ds = DataSpec(
    task_specs={"DNase": TaskSpec(task="DNase",
                                 pos_counts="/dev/shm/avsec/IMR90_Naked_DNase.pos.bw",
                                 neg_counts="/dev/shm/avsec/IMR90_Naked_DNase.pos.bw",
                                 peaks=dpath / "genome.stride200.w200.nonblacklist.bed")},
    fasta_file=dpath / "hg38.genome.fa.bc.gl",
              )
In [5]:
dpath = Path("/mnt/lab_data/users/avsec/dnase_bias")

SEQ_WIDTH = 1000

ds = DataSpec(
    task_specs={"DNase": TaskSpec(task="DNase",
                                 pos_counts=dpath / "IMR90_Naked_DNase.pos.bw.bc.gl",
                                 neg_counts=dpath / "IMR90_Naked_DNase.neg.bw.bc.gl",
                                 peaks=dpath / "genome.stride200.w200.nonblacklist.bed")},
    fasta_file="/dev/shm/avsec/IMR90_Naked_DNase.pos.bw.bc.gl",
              )

Benchmarking

In [280]:
from genomelake.extractors import ArrayExtractor, BigwigExtractor
from pybedtools import Interval
In [281]:
intervals = [Interval("chr1", 20000, 21000)]

Bigwig in memory

In [434]:
%time a = ca[100000:101000]

%time a = ca[:1000]

%time a = ca[200000:210000]
In [440]:
ae = BigwigExtractor("/dev/shm/avsec/IMR90_Naked_DNase.pos.bw")

%time a=ae([Interval("chr1", 0, 1000)])
%time a=ae([Interval("chr1", 20000, 21000)])

%time a=ae([Interval("chr1", 100000, 101000)])
%time a=ae([Interval("chr1", 200000, 210000)])

%time a=ae([Interval("chr10", 200000, 201000)])

ae.bw.close()
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 249 µs
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 338 µs
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 287 µs
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.14 ms
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 372 µs
In [299]:
ae.bw.chroms()['chr1']
Out[299]:
248956422
In [300]:
values = ae.bw.values('chr1', 0, ae.bw.chroms()['chr1'], numpy=True)
In [304]:
values[np.isnan(values)] = 0
In [305]:
values[:5]
Out[305]:
array([0., 0., 0., 0., 0.], dtype=float32)
In [390]:
# save the array
_blosc_params = bcolz.cparams(clevel=5, shuffle=bcolz.SHUFFLE, cname='lz4')
bcolz.carray(values, rootdir="/dev/shm/avsec/test/bw.bc", chunklen=1000, cparams=_blosc_params, mode='w').flush()
In [433]:
ca = bcolz.open("/dev/shm/avsec/test/bw.bc")
In [434]:
%time a = ca[100000:101000]
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 316 µs
In [435]:
%time a = ca[:1000]
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 265 µs
In [436]:
%time a = ca[200000:210000]
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 769 µs
In [303]:
values[:4]
Out[303]:
array([nan, nan, nan, nan], dtype=float32)
In [420]:
write_tiledb(values, "/dev/shm/avsec/test/bw.tiledb")
In [422]:
ca = load_tiledb("/dev/shm/avsec/test/bw.tiledb")
In [424]:
%time a = ca[100000:101000]
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 476 µs
In [426]:
%time a = ca[:1000]
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 464 µs
In [429]:
%time a = ca[200000:210000]
CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 528 µs

Bigwig on disk

In [351]:
ae = BigwigExtractor(str(dpath / "IMR90_Naked_DNase.pos.bw"))
%time a=ae(intervals)

ae.bw.close()

# 452 µs
CPU times: user 0 ns, sys: 4 ms, total: 4 ms
Wall time: 495 µs

Both, bigwig on disk and big-wig in memory yield rougly the same results

Bcolz

In memory

In [286]:
ae = ArrayExtractor("/dev/shm/avsec/IMR90_Naked_DNase.pos.bw.bc.gl")
In [287]:
%time a=ae(intervals)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.94 ms
In [292]:
chr1_track = ae._data['chr1'][:]
In [293]:
chr1_track[:4]
Out[293]:
array(['0.0', '0.0', '0.0', '0.0'], dtype='<U32')
In [294]:
chr1_track.
Out[294]:
dtype('<U32')

On disk

In [288]:
ae = ArrayExtractor(dpath / "IMR90_Naked_DNase.pos.bw.bc.gl", in_memory=False)
In [291]:
%time a=ae(intervals)
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.59 ms

Genome

Fasta

On disk

In [48]:
from genomelake.extractors import FastaExtractor
In [49]:
fa = FastaExtractor(str(dpath / "GRCh38.genome.fa"))
In [67]:
%time a=fa(intervals)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 599 µs
In [69]:
%time a=fa([Interval("chr1", 120000, 121000)]*100)
CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 7.79 ms
In [82]:
%time a=fa([Interval("chr1", 0, 1000)])
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 317 µs
In [83]:
%time a=fa([Interval("chr1", 0, 10000)])
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 851 µs
In [84]:
%time a=fa([Interval("chr1", 0, 100000)])
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 7.18 ms

In memory

In [173]:
fa = FastaExtractor("/dev/shm/avsec/GRCh38.genome.fa")
In [180]:
%time a=fa(intervals)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 302 µs

Bcolz

In [51]:
ae = ArrayExtractor(str(dpath / "hg38.genome.fa.bc.gl"))
In [52]:
# on disk
%time a=ae(intervals)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 2.92 ms
In [53]:
# in-memory
%time a=ae(intervals)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 480 µs
In [142]:
bc = ae._data['chr1']
In [143]:
ae._data['chr1'].partitions[:4] ## the partitions are really large -> 32kb instead of ~1kb
Out[143]:
[(0, 32768), (32768, 65536), (65536, 98304), (98304, 131072)]
In [144]:
%time a=ae(intervals)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.9 ms

Bcolz converted by others

In [215]:
chr1 = ae._data["chr1"][:]
In [217]:
import bcolz
In [219]:
mkdir /dev/shm/avsec/test/
In [257]:
# save the array
_blosc_params = bcolz.cparams(clevel=5, shuffle=bcolz.SHUFFLE, cname='lz4')
bcolz.carray(chr1, rootdir="/dev/shm/avsec/test/chr1.bc", chunklen=9000, cparams=_blosc_params, mode='w').flush()
In [372]:
ca = bcolz.open("/dev/shm/avsec/test/chr1.bc")
In [373]:
%time a= ca[100000:101000]
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 655 µs
In [277]:
!ln -s /users/cprobert/dev/genomelake/benchmark .
In [363]:
write_tiledb(chr1, "/dev/shm/avsec/test/chr1.tiledb")
In [374]:
ca = load_tiledb("/dev/shm/avsec/test/chr1.tiledb")
In [375]:
%time a= ca[100000:101000]
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.32 ms
In [377]:
%time a= ca[0:1000]
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 2.24 ms
In [362]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os.path
import shutil

import numpy as np
import tiledb

GENOME_DOMAIN_NAME = "genome_coord"
SECONDARY_DOMAIN_NAME = "signal_coord"
GENOME_VALUE_NAME = "v"

DEFAULT_GENOME_TILE_EXTENT = 9000
DEFAULT_COMPRESSOR = "blosc-lz"
DEFAULT_COMPRESSOR_LEVEL = -1


def write_tiledb(arr, path, overwrite=True):
    """Write a tiledb to disk.
    """
    if os.path.exists(path) and os.path.isdir(path) and overwrite:
        shutil.rmtree(path)

    if os.path.exists(path):
        raise FileExistsError("Output path {} already exists".format(path))

    ctx = tiledb.Ctx()

    n = arr.shape[0]
    n_tile_extent = min(DEFAULT_GENOME_TILE_EXTENT, n)

    d1 = tiledb.Dim(
        ctx, GENOME_DOMAIN_NAME, domain=(0, n - 1), tile=n_tile_extent, dtype="uint32"
    )

    if arr.ndim == 1:
        domain = tiledb.Domain(ctx, d1)

    elif arr.ndim == 2:
        m = arr.shape[1]
        d2 = tiledb.Dim(
            ctx, SECONDARY_DOMAIN_NAME, domain=(0, m - 1), tile=m, dtype="uint32"
        )
        domain = tiledb.Domain(ctx, d1, d2)

    else:
        raise ValueError("tiledb backend only supports 1D or 2D arrays")

    v = tiledb.Attr(
        ctx,
        GENOME_VALUE_NAME,
        compressor=(DEFAULT_COMPRESSOR, DEFAULT_COMPRESSOR_LEVEL),
        dtype="float32",
    )

    schema = tiledb.ArraySchema(
        ctx, domain=domain, attrs=(v,), cell_order="row-major", tile_order="row-major"
    )
    A = tiledb.DenseArray.create(path, schema)

    values = arr.astype(np.float32)

    with tiledb.DenseArray(ctx, path, mode="w") as A:
        A[:] = {GENOME_VALUE_NAME: values}


def load_tiledb(path):
    return TDBDenseArray(path)


class TDBDenseArray(object):
    """A read-only wrapper of tiledb.DenseArray"""

    def __init__(self, array):
        if isinstance(array, tiledb.DenseArray):
            self._arr = array
        else:
            ctx = tiledb.Ctx()
            self._arr = tiledb.DenseArray(ctx, array, mode="r")

    def __getitem__(self, key):
        return self._arr[key][GENOME_VALUE_NAME]

    def __setitem__(self, key, item):
        raise NotImplemented("TDBDenseArray is read-only")

    @property
    def shape(self):
        return self._arr.shape

    @property
    def ndim(self):
        return self._arr.ndim
In [361]:
ls /dev/shm/cproberttmp/outputs/GM12878_CTCF_ENCFF312KXX.bigWig_tiledb/
ls: cannot open directory '/dev/shm/cproberttmp/outputs/GM12878_CTCF_ENCFF312KXX.bigWig_tiledb/chr1': Permission denied
In [354]:
arr = load_tiledb("/users/cprobert/dev/genomelake/benchmark/outputs/GM12878_CTCF_ENCFF312KXX.bigWig_tiledb/chr1")
---------------------------------------------------------------------------
TileDBError                               Traceback (most recent call last)
<ipython-input-354-8e1dbbfcac30> in <module>()
----> 1 arr = load_tiledb("/users/cprobert/dev/genomelake/benchmark/outputs/GM12878_CTCF_ENCFF312KXX.bigWig_tiledb/chr1")

<ipython-input-352-3c22d0878ab8> in load_tiledb(path)
     68 
     69 def load_tiledb(path):
---> 70     return TDBDenseArray(path)
     71 
     72 

<ipython-input-352-3c22d0878ab8> in __init__(self, array)
     79         else:
     80             ctx = tiledb.Ctx()
---> 81             self._arr = tiledb.DenseArray(ctx, array, mode="r")
     82 
     83     def __getitem__(self, key):

tiledb/libtiledb.pyx in tiledb.libtiledb.DenseArray.__init__()

tiledb/libtiledb.pyx in tiledb.libtiledb.Array.__init__()

tiledb/libtiledb.pyx in tiledb.libtiledb._raise_ctx_err()

tiledb/libtiledb.pyx in tiledb.libtiledb._raise_tiledb_error()

TileDBError: [TileDB::StorageManager] Error: Cannot open array; Array does not exist
In [275]:
ls /users/cprobert/dev/genomelake/benchmark/outputs/
GM12878_CTCF_ENCFF312KXX.bigWig_bcolz/
GM12878_CTCF_ENCFF312KXX.bigWig_tiledb/
GM12878_DNase_ENCFF264NMW.bigWig_bcolz/
GM12878_DNase_ENCFF264NMW.bigWig_tiledb/
hg19.genome.fa_bcolz/
hg19.genome.fa_tiledb/
In [ ]:
ls /mnt/data/memmap/bcolz_data/DNASE.H1-hESC.fc.signal.bigwig/
In [212]:
!du -sh /dev/shm/cproberttmp/outputs/hg19.genome.fa_bcolz/
11G	/dev/shm/cproberttmp/outputs/hg19.genome.fa_bcolz/
In [188]:
ae = ArrayExtractor("/dev/shm/cproberttmp/outputs/hg19.genome.fa_bcolz/")
In [190]:
%time a=ae([Interval("chr1", 120000, 121000)])
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 419 µs
In [195]:
%time a=ae([Interval("chr1", 0, 1000)])
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 463 µs
In [197]:
%time a=ae([Interval("chr1", 0, 10000)])
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 436 µs
In [200]:
%time a=ae([Interval("chr1", 0, 100000)])
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 2.93 ms
In [72]:
carr.chunklen
Out[72]:
131072
In [201]:
!vmtouch -vt /users/cprobert/dev/genomelake/benchmark/inputs/hg19.genome.fa
/users/cprobert/dev/genomelake/benchmark/inputs/hg19.genome.fa
[OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO] 781228/781228

           Files: 1
     Directories: 0
   Touched Pages: 781228 (2G)
         Elapsed: 0.34503 seconds
In [240]:
fa = FastaExtractor("/users/cprobert/dev/genomelake/benchmark/inputs/hg19.genome.fa")
In [243]:
%time o=fa([Interval("chr1", 0, 1000)])
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 296 µs
In [245]:
%time o=fa([Interval("chr1", 100000, 101000)])
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 377 µs
In [244]:
%time o=fa([Interval("chr1", 0, 10000)])
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.16 ms
In [209]:
%time o=fa([Interval("chr1", 0, 100000)])
CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 14 ms
In [205]:
%time o=fa([Interval("chr2", 0, 10000)])
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 2.08 ms
In [206]:
%time o=fa([Interval("chr3", 0, 1000)])
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 391 µs
In [187]:
%time o=fa([Interval("chr4", 0, 1000)])
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 298 µs
In [140]:
%time o=fa([Interval("chr4", 9000, 10000)])
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 965 µs
In [138]:
%time o=fa([Interval("chr15", 0, 10000)])
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.01 ms
In [210]:
%time o=fa([Interval("chr20", 0, 1000)])
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 293 µs
In [211]:
%time o=fa([Interval("chr20", 100000, 101000)])
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 376 µs
In [14]:
ae = ArrayExtractor("/mnt/data/memmap/genomes/hg19.GRCh37")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-14-25f29f961787> in <module>()
----> 1 ae = ArrayExtractor("/mnt/data/memmap/genomes/hg19.GRCh37")

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/genomelake/extractors.py in __init__(self, datafile, in_memory, **kwargs)
     56     def __init__(self, datafile, in_memory=False, **kwargs):
     57         super(ArrayExtractor, self).__init__(datafile, **kwargs)
---> 58         self._data = backend.load_directory(datafile, in_memory=in_memory)
     59         self.multiprocessing_safe = in_memory
     60 

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/genomelake/backend.py in load_directory(base_dir, in_memory)
    104                                                         data[chrom].shape))
    105     else:
--> 106         raise ValueError('Can only extract from array_bcolz and array_numpy')
    107 
    108     return data

ValueError: Can only extract from array_bcolz and array_numpy
In [ ]:
ae = ArrayExtractor(str(dpath / "hg38.genome.fa.bc.gl"))
In [9]:
valid_chr=['chr2', 'chr3', 'chr4']
test_chr=['chr1', 'chr8', 'chr9']
In [6]:
from basepair.datasets import StrandedProfile
In [59]:
class AppendCounts:
    def transform(self, x):
        counts = {k.replace("profile/", "counts/"): np.log(1 + x[k].sum(0))
                  for k in x}
        return {**x, **counts}
In [70]:
dl = StrandedProfile(ds, 
                     excl_chromosomes=valid_chr+test_chr, 
                     peak_width=SEQ_WIDTH, shuffle=True,
                     bcolz=False,
                     in_memory=False,
                     target_transformer=AppendCounts())
Skipped 90 intervals outside of the genome size
In [11]:
from tqdm import tqdm
In [64]:
%timeit dl[100000]
1.81 ms ± 140 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [ ]:
it = dl.batch_iter(batch_size=256, num_workers=100)
next(it)
%timeit next(it)
In [72]:
%load_ext line_profiler
In [ ]:
# TODO - use ix, iat functions
In [39]:
row = dl.dfm.iloc[10000]
108 µs ± 1.67 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
In [45]:
dl.dfm.head().index[0]
Out[45]:
5447559
In [23]:
%time row = dl.dfm.iloc[10000]
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 489 µs
In [68]:
%time row = [dl.dfm.iat[1000, i] for i in range(4)]
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 90.4 µs
In [35]:
%time row = dl.dfm.iat[1000, 0]
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 79.4 µs
In [38]:
%time row = dl.dfm.iat[1000, 1]
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 86.5 µs
In [17]:
%time row = dl.dfm.iloc[10000]
CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 13.5 ms
In [7]:
from basepair.datasets import *
In [19]:
%time intervals = [pd_row2interval(row)]
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 256 µs
In [87]:
from pybedtools import Interval
In [94]:
%time Interval("chr1", 10, 20)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 170 µs
Out[94]:
Interval(chr1:10-20)
In [163]:
%time a=dl.bw_extractors['DNase'][0](intervals)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.12 ms
In [153]:
%time a=dl.fasta_extractor(intervals*1000)
CPU times: user 300 ms, sys: 4 ms, total: 304 ms
Wall time: 293 ms
In [95]:
ls -latr /tmp/ | tail -n 5
-rw-------  1 root     root           579 Aug  7 15:54 filekxC3OG
-rw-------  1 root     root           579 Aug  8 15:54 fileZ4kHlf
drwxr-xr-x  2 avsec    kundaje       4096 Aug  9 14:54 hsperfdata_avsec/
-rw-------  1 root     root           579 Aug  9 15:54 filec8QQSh
drwxrwxrwt 82 root     root     174465024 Aug 10 07:55 ./
In [86]:
pd_row2interval??
Signature: pd_row2interval(row)
Docstring: <no docstring>
Source:   
def pd_row2interval(row):
    return pybedtools.create_interval_from_list([str(row.chrom), int(row.start), int(row.end)])
File:      ~/workspace/basepair/basepair/datasets.py
Type:      function
In [101]:
%lprun -f dl.__getitem__ dl[100000]
Timer unit: 1e-06 s

Total time: 0.001339 s
File: /users/avsec/workspace/basepair/basepair/datasets.py
Function: __getitem__ at line 248

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
   248                                               def __getitem__(self, idx):
   249         1          3.0      3.0      0.2          if self.fasta_extractor is None:
   250                                                       # Use array extractors
   251                                                       if self.bcolz:
   252                                                           self.fasta_extractor = ArrayExtractor(self.ds.fasta_file, in_memory=False)
   253                                                           self.bw_extractors = {task: [ArrayExtractor(task_spec.pos_counts, in_memory=False),
   254                                                                                        ArrayExtractor(task_spec.neg_counts, in_memory=False)]
   255                                                                                 for task, task_spec in self.ds.task_specs.items()}
   256                                                       else:
   257                                                           # Use normal fasta/bigwig extractors
   258                                                           assert not self.bcolz
   259                                                           # first call
   260                                                           self.bw_extractors = {task: [BigwigExtractor(task_spec.pos_counts),
   261                                                                                        BigwigExtractor(task_spec.neg_counts)]
   262                                                                                 for task, task_spec in self.ds.task_specs.items()}
   263                                                           self.fasta_extractor = FastaExtractor(self.ds.fasta_file)
   264                                           
   265         1         58.0     58.0      4.3          intervals = [Interval(self.dfm.iat[idx,0], # chrom
   266         1         35.0     35.0      2.6                                self.dfm.iat[idx,1], # start
   267         1         61.0     61.0      4.6                                self.dfm.iat[idx,2])] # end
   268         1         31.0     31.0      2.3          task = self.dfm.iat[idx,3]  # task
   269                                           
   270         1        261.0    261.0     19.5          sequence = self.fasta_extractor(intervals)[0]
   271         1          3.0      3.0      0.2          cuts = {f"profile/{task}": run_extractors(self.bw_extractors[task], intervals)[0]
   272         1        758.0    758.0     56.6                  for task, spec in self.ds.task_specs.items()}
   273         1          3.0      3.0      0.2          if self.target_transformer is not None:
   274         1         94.0     94.0      7.0              cuts = self.target_transformer.transform(cuts)
   275                                           
   276         1          2.0      2.0      0.1          return {"inputs": sequence,
   277         1          2.0      2.0      0.1                  "targets": cuts,
   278         1         25.0     25.0      1.9                  "metadata": {"range": GenomicRanges.from_interval(intervals[0]),
   279         1          3.0      3.0      0.2                               "interval_from_task": task}}
In [36]:
3e9 * 2 /1024 / 1024  # 1 whole bigwig ~ 5G
Out[36]:
5722.0458984375
In [71]:
%lprun -f dl.__getitem__ dl[100000]
UsageError: Line magic function `%lprun` not found.
In [32]:
%time dl.dfm.iloc[100000]
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 457 µs
Out[32]:
chrom       chr10
start    44230200
end      44231200
task        DNase
Name: 8595534, dtype: object
In [16]:
for i, batch in enumerate(tqdm(dl.batch_iter(batch_size=256, num_workers=40))):
    if i > 1000:
        break
  0%|          | 0/37599 [00:00<?, ?it/s]
  0%|          | 1/37599 [00:12<129:28:45, 12.40s/it]
  0%|          | 2/37599 [00:12<66:02:39,  6.32s/it] 
  0%|          | 18/37599 [00:15<9:02:34,  1.15it/s]
  0%|          | 41/37599 [00:18<4:39:31,  2.24it/s]
  0%|          | 42/37599 [00:18<4:36:09,  2.27it/s]
  0%|          | 45/37599 [00:18<4:20:34,  2.40it/s]
  0%|          | 63/37599 [00:18<3:07:46,  3.33it/s]
  0%|          | 74/37599 [00:19<2:40:48,  3.89it/s]
  0%|          | 80/37599 [00:20<2:44:04,  3.81it/s]
  0%|          | 84/37599 [00:23<2:56:04,  3.55it/s]
  0%|          | 101/37599 [00:23<2:27:08,  4.25it/s]
  0%|          | 119/37599 [00:24<2:06:32,  4.94it/s]
  0%|          | 125/37599 [00:28<2:20:14,  4.45it/s]
  0%|          | 134/37599 [00:28<2:11:34,  4.75it/s]
  0%|          | 149/37599 [00:28<1:58:43,  5.26it/s]
  0%|          | 157/37599 [00:28<1:53:17,  5.51it/s]
  0%|          | 164/37599 [00:31<2:00:21,  5.18it/s]
  0%|          | 169/37599 [00:31<1:57:23,  5.31it/s]
  1%|          | 189/37599 [00:32<1:45:46,  5.89it/s]
  1%|          | 197/37599 [00:32<1:41:56,  6.12it/s]
  1%|          | 203/37599 [00:34<1:47:10,  5.81it/s]
  1%|          | 207/37599 [00:35<1:45:24,  5.91it/s]
  1%|          | 221/37599 [00:35<1:38:58,  6.29it/s]
  1%|          | 230/37599 [00:35<1:35:23,  6.53it/s]
  1%|          | 238/37599 [00:35<1:32:42,  6.72it/s]
  1%|          | 245/37599 [00:38<1:36:45,  6.43it/s]
  1%|          | 250/37599 [00:38<1:35:11,  6.54it/s]
  1%|          | 268/37599 [00:38<1:29:18,  6.97it/s]
  1%|          | 276/37599 [00:38<1:26:59,  7.15it/s]
  1%|          | 282/37599 [00:40<1:29:38,  6.94it/s]
  1%|          | 288/37599 [00:40<1:28:15,  7.05it/s]
  1%|          | 306/37599 [00:41<1:23:48,  7.42it/s]
  1%|          | 313/37599 [00:41<1:22:13,  7.56it/s]
  1%|          | 320/37599 [00:42<1:21:47,  7.60it/s]
  1%|          | 324/37599 [00:43<1:22:47,  7.50it/s]
  1%|          | 337/37599 [00:43<1:19:45,  7.79it/s]
  1%|          | 352/37599 [00:43<1:16:32,  8.11it/s]
  1%|          | 360/37599 [00:44<1:16:12,  8.14it/s]
  1%|          | 366/37599 [00:45<1:16:34,  8.10it/s]
  1%|          | 386/37599 [00:45<1:12:48,  8.52it/s]
  1%|          | 400/37599 [00:46<1:11:20,  8.69it/s]
  1%|          | 406/37599 [00:46<1:11:27,  8.67it/s]
  1%|          | 419/37599 [00:46<1:09:30,  8.92it/s]
  1%|          | 437/37599 [00:47<1:06:45,  9.28it/s]
  1%|          | 446/37599 [00:48<1:07:02,  9.24it/s]
  1%|          | 457/37599 [00:48<1:05:33,  9.44it/s]
  1%|          | 465/37599 [00:48<1:04:33,  9.59it/s]
  1%|▏         | 475/37599 [00:48<1:03:20,  9.77it/s]
  1%|▏         | 483/37599 [00:49<1:03:27,  9.75it/s]
  1%|▏         | 489/37599 [00:49<1:02:56,  9.83it/s]
  1%|▏         | 494/37599 [00:49<1:02:25,  9.91it/s]
  1%|▏         | 511/37599 [00:49<1:00:27, 10.23it/s]
  1%|▏         | 519/37599 [00:50<59:37, 10.36it/s]  
  1%|▏         | 527/37599 [00:51<59:49, 10.33it/s]
  1%|▏         | 533/37599 [00:51<59:16, 10.42it/s]
  1%|▏         | 543/37599 [00:51<58:20, 10.59it/s]
  1%|▏         | 555/37599 [00:51<57:13, 10.79it/s]
  1%|▏         | 562/37599 [00:51<56:47, 10.87it/s]
  2%|▏         | 568/37599 [00:52<56:52, 10.85it/s]
  2%|▏         | 582/37599 [00:52<55:40, 11.08it/s]
  2%|▏         | 598/37599 [00:52<54:20, 11.35it/s]
  2%|▏         | 605/37599 [00:53<54:23, 11.34it/s]
  2%|▏         | 610/37599 [00:53<54:03, 11.41it/s]
  2%|▏         | 615/37599 [00:53<53:43, 11.47it/s]
  2%|▏         | 629/37599 [00:53<52:41, 11.70it/s]
  2%|▏         | 640/37599 [00:54<52:03, 11.83it/s]
  2%|▏         | 645/37599 [00:54<52:22, 11.76it/s]
  2%|▏         | 649/37599 [00:55<52:15, 11.79it/s]
  2%|▏         | 661/37599 [00:55<51:25, 11.97it/s]
  2%|▏         | 682/37599 [00:55<49:55, 12.32it/s]
  2%|▏         | 691/37599 [00:56<49:52, 12.33it/s]
  2%|▏         | 698/37599 [00:56<49:28, 12.43it/s]
  2%|▏         | 704/37599 [00:56<49:08, 12.51it/s]
  2%|▏         | 719/37599 [00:56<48:13, 12.75it/s]
  2%|▏         | 727/37599 [00:57<48:14, 12.74it/s]
  2%|▏         | 733/37599 [00:57<47:55, 12.82it/s]
  2%|▏         | 744/37599 [00:57<47:26, 12.95it/s]
  2%|▏         | 755/37599 [00:57<46:51, 13.11it/s]
  2%|▏         | 764/37599 [00:57<46:26, 13.22it/s]
  2%|▏         | 770/37599 [00:58<46:26, 13.22it/s]
  2%|▏         | 776/37599 [00:58<46:11, 13.29it/s]
  2%|▏         | 786/37599 [00:58<45:40, 13.43it/s]
  2%|▏         | 793/37599 [00:58<45:21, 13.52it/s]
  2%|▏         | 803/37599 [00:58<44:54, 13.66it/s]
  2%|▏         | 809/37599 [00:59<44:55, 13.65it/s]
  2%|▏         | 814/37599 [00:59<44:44, 13.70it/s]
  2%|▏         | 823/37599 [00:59<44:22, 13.81it/s]
  2%|▏         | 832/37599 [00:59<43:58, 13.94it/s]
  2%|▏         | 841/37599 [00:59<43:35, 14.06it/s]
  2%|▏         | 848/37599 [01:00<43:24, 14.11it/s]
  2%|▏         | 853/37599 [01:00<43:21, 14.13it/s]
  2%|▏         | 857/37599 [01:00<43:13, 14.17it/s]
  2%|▏         | 861/37599 [01:00<43:07, 14.20it/s]
  2%|▏         | 877/37599 [01:00<42:23, 14.44it/s]
  2%|▏         | 886/37599 [01:00<42:04, 14.54it/s]
  2%|▏         | 893/37599 [01:01<42:06, 14.53it/s]
  2%|▏         | 903/37599 [01:01<41:42, 14.66it/s]
  2%|▏         | 910/37599 [01:01<41:27, 14.75it/s]
  2%|▏         | 916/37599 [01:01<41:15, 14.82it/s]
  2%|▏         | 924/37599 [01:01<40:58, 14.92it/s]
  2%|▏         | 930/37599 [01:02<40:57, 14.92it/s]
  2%|▏         | 935/37599 [01:02<40:48, 14.97it/s]
  3%|▎         | 940/37599 [01:02<40:40, 15.02it/s]
  3%|▎         | 946/37599 [01:02<40:29, 15.09it/s]
  3%|▎         | 951/37599 [01:02<40:20, 15.14it/s]
  3%|▎         | 960/37599 [01:02<40:01, 15.25it/s]
  3%|▎         | 970/37599 [01:03<39:40, 15.38it/s]
  3%|▎         | 977/37599 [01:03<39:40, 15.38it/s]
  3%|▎         | 982/37599 [01:03<39:34, 15.42it/s]
  3%|▎         | 989/37599 [01:03<39:20, 15.51it/s]
  3%|▎         | 995/37599 [01:03<39:10, 15.57it/s]
  3%|▎         | 1001/37599 [01:04<39:00, 15.63it/s]