import basepair
import pandas as pd
import numpy as np
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from pathlib import Path
from basepair.config import create_tf_session, get_data_dir
# --- Session setup: resolve data dir, grab GPU 0, and build DataSpecs ---
ddir = get_data_dir()
create_tf_session(0)
# Variant 1: raw bigwigs + fasta read directly from the lab data mount.
dpath = Path("/mnt/lab_data/users/avsec/dnase_bias")
SEQ_WIDTH = 1000
ds = DataSpec(
task_specs={"DNase": TaskSpec(task="DNase",
pos_counts=dpath / "IMR90_Naked_DNase.pos.bw",
neg_counts=dpath / "IMR90_Naked_DNase.neg.bw",
peaks=dpath / "genome.stride200.w200.nonblacklist.bed")},
fasta_file=dpath / "GRCh38.genome.fa",
)
# Variant 2: bigwigs staged on /dev/shm (ramdisk) to benchmark I/O.
# NOTE(review): neg_counts points at the *.pos.bw file here — looks like a
# copy-paste slip; confirm the negative-strand bigwig was intended.
dpath = Path("/mnt/lab_data/users/avsec/dnase_bias")
SEQ_WIDTH = 1000
ds = DataSpec(
task_specs={"DNase": TaskSpec(task="DNase",
pos_counts="/dev/shm/avsec/IMR90_Naked_DNase.pos.bw",
neg_counts="/dev/shm/avsec/IMR90_Naked_DNase.pos.bw",
peaks=dpath / "genome.stride200.w200.nonblacklist.bed")},
fasta_file=dpath / "hg38.genome.fa.bc.gl",
)
# Variant 3: bcolz/genomelake (.bc.gl) pre-extracted tracks.
# NOTE(review): fasta_file is set to a bigwig-derived .bc.gl path — presumably
# the genome .bc.gl was meant; verify before reusing this spec.
dpath = Path("/mnt/lab_data/users/avsec/dnase_bias")
SEQ_WIDTH = 1000
ds = DataSpec(
task_specs={"DNase": TaskSpec(task="DNase",
pos_counts=dpath / "IMR90_Naked_DNase.pos.bw.bc.gl",
neg_counts=dpath / "IMR90_Naked_DNase.neg.bw.bc.gl",
peaks=dpath / "genome.stride200.w200.nonblacklist.bed")},
fasta_file="/dev/shm/avsec/IMR90_Naked_DNase.pos.bw.bc.gl",
)
# --- Interactive benchmark: bigwig vs bcolz slicing latency ---
# IPython session lines; %time magics and out-of-order history below.
from genomelake.extractors import ArrayExtractor, BigwigExtractor
from pybedtools import Interval
intervals = [Interval("chr1", 20000, 21000)]
# NOTE(review): `ca` is only created further down (bcolz.open); this history
# is not linearly executable as a script.
%time a = ca[100000:101000]
%time a = ca[:1000]
%time a = ca[200000:210000]
# Time BigwigExtractor lookups at several genomic offsets / sizes.
ae = BigwigExtractor("/dev/shm/avsec/IMR90_Naked_DNase.pos.bw")
%time a=ae([Interval("chr1", 0, 1000)])
%time a=ae([Interval("chr1", 20000, 21000)])
%time a=ae([Interval("chr1", 100000, 101000)])
%time a=ae([Interval("chr1", 200000, 210000)])
%time a=ae([Interval("chr10", 200000, 201000)])
ae.bw.close()
ae.bw.chroms()['chr1']
# Pull the full chr1 signal track into a numpy array; replace NaN with 0.
values = ae.bw.values('chr1', 0, ae.bw.chroms()['chr1'], numpy=True)
values[np.isnan(values)] = 0
values[:5]
# save the array
_blosc_params = bcolz.cparams(clevel=5, shuffle=bcolz.SHUFFLE, cname='lz4')
bcolz.carray(values, rootdir="/dev/shm/avsec/test/bw.bc", chunklen=1000, cparams=_blosc_params, mode='w').flush()
# Re-open the bcolz carray from ramdisk and time the same slices.
ca = bcolz.open("/dev/shm/avsec/test/bw.bc")
%time a = ca[100000:101000]
%time a = ca[:1000]
%time a = ca[200000:210000]
values[:4]
# Same track stored via the tiledb backend defined later in this file.
write_tiledb(values, "/dev/shm/avsec/test/bw.tiledb")
ca = load_tiledb("/dev/shm/avsec/test/bw.tiledb")
%time a = ca[100000:101000]
%time a = ca[:1000]
%time a = ca[200000:210000]
# Compare against the on-disk (non-ramdisk) bigwig.
ae = BigwigExtractor(str(dpath / "IMR90_Naked_DNase.pos.bw"))
%time a=ae(intervals)
ae.bw.close()
# 452 µs
# Both the on-disk bigwig and the in-memory bigwig yield roughly the same results
# --- Interactive benchmark: ArrayExtractor / FastaExtractor / bcolz / tiledb ---
ae = ArrayExtractor("/dev/shm/avsec/IMR90_Naked_DNase.pos.bw.bc.gl")
%time a=ae(intervals)
chr1_track = ae._data['chr1'][:]
chr1_track[:4]
# NOTE(review): incomplete expression left over from tab-completion.
chr1_track.
# Same extractor without loading into memory.
ae = ArrayExtractor(dpath / "IMR90_Naked_DNase.pos.bw.bc.gl", in_memory=False)
%time a=ae(intervals)
from genomelake.extractors import FastaExtractor
fa = FastaExtractor(str(dpath / "GRCh38.genome.fa"))
%time a=fa(intervals)
%time a=fa([Interval("chr1", 120000, 121000)]*100)
%time a=fa([Interval("chr1", 0, 1000)])
%time a=fa([Interval("chr1", 0, 10000)])
%time a=fa([Interval("chr1", 0, 100000)])
# Fasta staged on ramdisk for comparison.
fa = FastaExtractor("/dev/shm/avsec/GRCh38.genome.fa")
%time a=fa(intervals)
ae = ArrayExtractor(str(dpath / "hg38.genome.fa.bc.gl"))
# on disk
%time a=ae(intervals)
# in-memory
%time a=ae(intervals)
bc = ae._data['chr1']
ae._data['chr1'].partitions[:4] ## the partitions are really large -> 32kb instead of ~1kb
%time a=ae(intervals)
# Re-chunk chr1 with a smaller chunklen to test slicing latency.
chr1 = ae._data["chr1"][:]
import bcolz
mkdir /dev/shm/avsec/test/
# save the array
_blosc_params = bcolz.cparams(clevel=5, shuffle=bcolz.SHUFFLE, cname='lz4')
bcolz.carray(chr1, rootdir="/dev/shm/avsec/test/chr1.bc", chunklen=9000, cparams=_blosc_params, mode='w').flush()
ca = bcolz.open("/dev/shm/avsec/test/chr1.bc")
%time a= ca[100000:101000]
!ln -s /users/cprobert/dev/genomelake/benchmark .
# Same chr1 slice timings via the tiledb backend.
write_tiledb(chr1, "/dev/shm/avsec/test/chr1.tiledb")
ca = load_tiledb("/dev/shm/avsec/test/chr1.tiledb")
%time a= ca[100000:101000]
%time a= ca[0:1000]
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os.path
import shutil
import numpy as np
import tiledb
# Constants for the tiledb-backed genome-track storage below.
GENOME_DOMAIN_NAME = "genome_coord"  # primary dimension: genomic position
SECONDARY_DOMAIN_NAME = "signal_coord"  # second dimension for 2D tracks
GENOME_VALUE_NAME = "v"  # tiledb attribute name holding the signal values
DEFAULT_GENOME_TILE_EXTENT = 9000  # tile (chunk) size along the genome axis
DEFAULT_COMPRESSOR = "blosc-lz"
DEFAULT_COMPRESSOR_LEVEL = -1  # -1: library-default compression level
def write_tiledb(arr, path, overwrite=True):
    """Write a 1D or 2D array to disk as a dense tiledb array.

    The values are stored as float32 under the attribute
    ``GENOME_VALUE_NAME``; the first dimension is tiled with
    ``DEFAULT_GENOME_TILE_EXTENT`` (capped at the array length).

    Parameters
    ----------
    arr : np.ndarray
        1D or 2D numpy array to store.
    path : str
        Output directory for the tiledb array.
    overwrite : bool
        If True (default), remove an existing array directory at `path`.

    Raises
    ------
    FileExistsError
        If `path` still exists after the overwrite check (e.g. it is a
        file, or `overwrite` is False).
    ValueError
        If `arr` has more than 2 dimensions.
    """
    # NOTE(review): written against the legacy tiledb-py API (Ctx passed to
    # Dim/Domain/Attr/ArraySchema); confirm against the pinned tiledb version.
    if os.path.exists(path) and os.path.isdir(path) and overwrite:
        shutil.rmtree(path)
    if os.path.exists(path):
        raise FileExistsError("Output path {} already exists".format(path))

    ctx = tiledb.Ctx()
    n = arr.shape[0]
    # Cap the tile extent for arrays shorter than the default tile size.
    n_tile_extent = min(DEFAULT_GENOME_TILE_EXTENT, n)

    d1 = tiledb.Dim(
        ctx, GENOME_DOMAIN_NAME, domain=(0, n - 1), tile=n_tile_extent, dtype="uint32"
    )
    if arr.ndim == 1:
        domain = tiledb.Domain(ctx, d1)
    elif arr.ndim == 2:
        # 2D: the secondary dimension is stored as a single tile per row span.
        m = arr.shape[1]
        d2 = tiledb.Dim(
            ctx, SECONDARY_DOMAIN_NAME, domain=(0, m - 1), tile=m, dtype="uint32"
        )
        domain = tiledb.Domain(ctx, d1, d2)
    else:
        raise ValueError("tiledb backend only supports 1D or 2D arrays")

    v = tiledb.Attr(
        ctx,
        GENOME_VALUE_NAME,
        compressor=(DEFAULT_COMPRESSOR, DEFAULT_COMPRESSOR_LEVEL),
        dtype="float32",
    )
    schema = tiledb.ArraySchema(
        ctx, domain=domain, attrs=(v,), cell_order="row-major", tile_order="row-major"
    )
    # Create the on-disk array, then reopen it for writing.  The previous
    # code bound create()'s return value to `A` only to shadow it in the
    # `with` statement below; the dead binding is dropped.
    tiledb.DenseArray.create(path, schema)
    values = arr.astype(np.float32)
    with tiledb.DenseArray(ctx, path, mode="w") as A:
        A[:] = {GENOME_VALUE_NAME: values}
def load_tiledb(path):
    """Open a dense tiledb array (written by ``write_tiledb``) read-only.

    Returns a ``TDBDenseArray`` wrapper exposing numpy-style slicing.
    """
    wrapped = TDBDenseArray(path)
    return wrapped
class TDBDenseArray(object):
    """A read-only wrapper of tiledb.DenseArray.

    Slicing returns the raw values stored under ``GENOME_VALUE_NAME``
    instead of tiledb's ``{attribute_name: values}`` dict.
    """

    def __init__(self, array):
        """
        Parameters
        ----------
        array : tiledb.DenseArray or str
            An already-open DenseArray, or a path to open read-only.
        """
        if isinstance(array, tiledb.DenseArray):
            self._arr = array
        else:
            ctx = tiledb.Ctx()
            self._arr = tiledb.DenseArray(ctx, array, mode="r")

    def __getitem__(self, key):
        # Unwrap the attribute dict returned by tiledb slicing.
        return self._arr[key][GENOME_VALUE_NAME]

    def __setitem__(self, key, item):
        # Bug fix: was `raise NotImplemented(...)` — NotImplemented is a
        # constant, not an exception class, so that line raised TypeError
        # ("'NotImplementedType' object is not callable") instead of the
        # intended error.
        raise NotImplementedError("TDBDenseArray is read-only")

    @property
    def shape(self):
        return self._arr.shape

    @property
    def ndim(self):
        return self._arr.ndim
# --- Interactive inspection of benchmark outputs + chromosome split setup ---
ls /dev/shm/cproberttmp/outputs/GM12878_CTCF_ENCFF312KXX.bigWig_tiledb/
arr = load_tiledb("/users/cprobert/dev/genomelake/benchmark/outputs/GM12878_CTCF_ENCFF312KXX.bigWig_tiledb/chr1")
ls /users/cprobert/dev/genomelake/benchmark/outputs/
ls /mnt/data/memmap/bcolz_data/DNASE.H1-hESC.fc.signal.bigwig/
!du -sh /dev/shm/cproberttmp/outputs/hg19.genome.fa_bcolz/
# Time ArrayExtractor on a bcolz-ified hg19 genome.
ae = ArrayExtractor("/dev/shm/cproberttmp/outputs/hg19.genome.fa_bcolz/")
%time a=ae([Interval("chr1", 120000, 121000)])
%time a=ae([Interval("chr1", 0, 1000)])
%time a=ae([Interval("chr1", 0, 10000)])
%time a=ae([Interval("chr1", 0, 100000)])
# NOTE(review): `carr` is not defined anywhere in this chunk.
carr.chunklen
# Warm the OS page cache for the fasta, then time FastaExtractor lookups.
!vmtouch -vt /users/cprobert/dev/genomelake/benchmark/inputs/hg19.genome.fa
fa = FastaExtractor("/users/cprobert/dev/genomelake/benchmark/inputs/hg19.genome.fa")
%time o=fa([Interval("chr1", 0, 1000)])
%time o=fa([Interval("chr1", 100000, 101000)])
%time o=fa([Interval("chr1", 0, 10000)])
%time o=fa([Interval("chr1", 0, 100000)])
%time o=fa([Interval("chr2", 0, 10000)])
%time o=fa([Interval("chr3", 0, 1000)])
%time o=fa([Interval("chr4", 0, 1000)])
%time o=fa([Interval("chr4", 9000, 10000)])
%time o=fa([Interval("chr15", 0, 10000)])
%time o=fa([Interval("chr20", 0, 1000)])
%time o=fa([Interval("chr20", 100000, 101000)])
ae = ArrayExtractor("/mnt/data/memmap/genomes/hg19.GRCh37")
ae = ArrayExtractor(str(dpath / "hg38.genome.fa.bc.gl"))
# Chromosome hold-out split for validation/test.
valid_chr=['chr2', 'chr3', 'chr4']
test_chr=['chr1', 'chr8', 'chr9']
from basepair.datasets import StrandedProfile
class AppendCounts:
    """Target transformer that augments profile targets with count targets.

    For every key in the input dict, adds a companion entry whose key has
    the ``profile/`` prefix replaced by ``counts/`` and whose value is the
    log of (1 + per-strand total counts), i.e. ``log(1 + value.sum(0))``.
    Existing entries are kept unchanged.
    """

    def transform(self, x):
        out = dict(x)
        for key, value in x.items():
            counts_key = key.replace("profile/", "counts/")
            out[counts_key] = np.log(1 + value.sum(0))
        return out
# --- Interactive profiling of the StrandedProfile data loader ---
dl = StrandedProfile(ds,
excl_chromosomes=valid_chr+test_chr,
peak_width=SEQ_WIDTH, shuffle=True,
bcolz=False,
in_memory=False,
target_transformer=AppendCounts())
from tqdm import tqdm
# Single-example and batched throughput.
%timeit dl[100000]
it = dl.batch_iter(batch_size=256, num_workers=100)
next(it)
%timeit next(it)
%load_ext line_profiler
# TODO - use ix, iat functions
# Compare pandas row-access strategies (iloc vs iat) for the interval table.
row = dl.dfm.iloc[10000]
dl.dfm.head().index[0]
%time row = dl.dfm.iloc[10000]
%time row = [dl.dfm.iat[1000, i] for i in range(4)]
%time row = dl.dfm.iat[1000, 0]
%time row = dl.dfm.iat[1000, 1]
%time row = dl.dfm.iloc[10000]
from basepair.datasets import *
%time intervals = [pd_row2interval(row)]
from pybedtools import Interval
%time Interval("chr1", 10, 20)
# Time the individual extractor calls used inside __getitem__.
%time a=dl.bw_extractors['DNase'][0](intervals)
%time a=dl.fasta_extractor(intervals*1000)
ls -latr /tmp/ | tail -n 5
pd_row2interval??
# Line-profile __getitem__ to find the hot spots.
%lprun -f dl.__getitem__ dl[100000]
3e9 * 2 /1024 / 1024 # 1 whole bigwig ~ 5G
%lprun -f dl.__getitem__ dl[100000]
%time dl.dfm.iloc[100000]
# Iterate ~1000 batches to measure sustained throughput.
for i, batch in enumerate(tqdm(dl.batch_iter(batch_size=256, num_workers=40))):
if i > 1000:
break