In [1]:
import os.path
import shutil

from datetime import datetime

import bcolz
import numpy as np
import tiledb
In [15]:
GENOME_DOMAIN_NAME = "genome_coord"
SECONDARY_DOMAIN_NAME = "signal_coord"
GENOME_VALUE_NAME = "v"

DEFAULT_GENOME_TILE_EXTENT = 1000
DEFAULT_COMPRESSOR = "lz4"
DEFAULT_COMPRESSION_LEVEL = -1


def write_tiledb(arr, path, overwrite=True):
    """Write a tiledb to disk.
    """
    if os.path.exists(path) and os.path.isdir(path) and overwrite:
        shutil.rmtree(path)

    if os.path.exists(path):
        raise FileExistsError("Output path {} already exists".format(path))

    ctx = tiledb.Ctx()

    n = arr.shape[0]
    n_tile_extent = min(DEFAULT_GENOME_TILE_EXTENT, n)

    d1 = tiledb.Dim(
        ctx, GENOME_DOMAIN_NAME, domain=(0, n - 1), tile=n_tile_extent, dtype="uint32"
    )

    if arr.ndim == 1:
        domain = tiledb.Domain(ctx, d1)

    elif arr.ndim == 2:
        m = arr.shape[1]
        d2 = tiledb.Dim(
            ctx, SECONDARY_DOMAIN_NAME, domain=(0, m - 1), tile=m, dtype="uint32"
        )
        domain = tiledb.Domain(ctx, d1, d2)

    else:
        raise ValueError("tiledb backend only supports 1D or 2D arrays")

    v = tiledb.Attr(ctx, GENOME_VALUE_NAME, compressor=(DEFAULT_COMPRESSOR, DEFAULT_COMPRESSION_LEVEL), dtype="float32")

    A = tiledb.DenseArray(
        ctx,
        path,
        domain=domain,
        attrs=(v,),
        cell_order="row-major",
        tile_order="row-major",
    )

    values = arr.astype(np.float32)

    A[:] = {GENOME_VALUE_NAME: values}


def load_tiledb(path):
    return TDBDenseArray(path)


class TDBDenseArray(object):
    """A read-only wrapper of tiledb.DenseArray"""

    def __init__(self, array):
        if isinstance(array, tiledb.DenseArray):
            self._arr = array
        else:
            ctx = tiledb.Ctx()
            self._arr = tiledb.DenseArray.load(ctx, array)

    def __getitem__(self, key):
        return self._arr[key][GENOME_VALUE_NAME]

    def __setitem__(self, key, item):
        raise NotImplemented("TDBDenseArray is read-only")

    @property
    def shape(self):
        return self._arr.shape

    @property
    def ndim(self):
        return self._arr.ndim
In [3]:
a = load_tiledb('outputs/GM12878_CTCF_ENCFF312KXX.bigWig_tiledb/chr10')
am = a[:]
write_tiledb(am, 'test_blosc_lz4')
In [4]:
def bench(path, n_access=1000, access_len=1000):
    if os.path.basename(os.path.dirname(path)).endswith('bcolz'):
        arr = bcolz.open(path, mode='r')
    else:
        arr = load_tiledb(path)
    nd = arr.ndim

    starts = np.random.randint(1, arr.shape[0] - access_len - 1, n_access).astype(np.int32)
    stops = np.array([s + access_len for s in starts]).astype(np.int32)

    dt1 = datetime.now()

    for start, stop in zip(starts, stops):
        if nd == 1:
            _ = arr[start:stop]
        else:
            _ = arr[start:stop, :]

    dt2 = datetime.now()
    ms = (dt2 - dt1).total_seconds() * 1000
    return ms
In [31]:
print(bench('test_blosc_lz4'))
1672.436
In [32]:
! cp -aR test_blosc_lz4 /dev/shm/cproberttmp/
In [33]:
print(bench('/dev/shm/cproberttmp/test_blosc_lz4'))
1948.912
In [17]:
bench('outputs/GM12878_CTCF_ENCFF312KXX.bigWig_bcolz/chr1')
Out[17]:
574.139
In [5]:
res = []
compr = []
ext = []

for DEFAULT_COMPRESSOR in ("zstd", "lz4", "blosc-lz", "blosc-lz4", "blosc-lz4hc", "blosc-snappy", "blosc-zstd", "rle", "bzip2"):
    for DEFAULT_GENOME_TILE_EXTENT in (2000, 5000, 10000, 50000, 100000):
        write_tiledb(am, 'test_tiledb')
        tm = bench('test_tiledb')
        print('{} {}: {}'.format(DEFAULT_COMPRESSOR, DEFAULT_GENOME_TILE_EXTENT, tm))
        res.append(tm)
        compr.append(DEFAULT_COMPRESSOR)
        ext.append(DEFAULT_GENOME_TILE_EXTENT)
        
zstd 2000: 1405.721
zstd 5000: 1360.0339999999999
zstd 10000: 1380.107
zstd 50000: 1281.666
zstd 100000: 1496.938
lz4 2000: 1406.1509999999998
lz4 5000: 1359.3519999999999
lz4 10000: 1244.5539999999999
lz4 50000: 1278.28
lz4 100000: 1403.815
blosc-lz 2000: 1447.289
blosc-lz 5000: 1379.931
blosc-lz 10000: 1341.0059999999999
blosc-lz 50000: 1431.286
blosc-lz 100000: 1553.364
blosc-lz4 2000: 1596.674
blosc-lz4 5000: 1306.951
blosc-lz4 10000: 1276.172
blosc-lz4 50000: 1475.3210000000001
blosc-lz4 100000: 1695.127
blosc-lz4hc 2000: 1497.9869999999999
blosc-lz4hc 5000: 1293.73
blosc-lz4hc 10000: 1320.573
blosc-lz4hc 50000: 1412.194
blosc-lz4hc 100000: 1475.411
blosc-snappy 2000: 1431.843
blosc-snappy 5000: 1394.558
blosc-snappy 10000: 1347.4360000000001
blosc-snappy 50000: 1481.7640000000001
blosc-snappy 100000: 1767.907
blosc-zstd 2000: 1535.282
blosc-zstd 5000: 1302.819
blosc-zstd 10000: 1335.585
blosc-zstd 50000: 1466.615
blosc-zstd 100000: 1639.568
rle 2000: 1508.159
rle 5000: 1479.039
rle 10000: 1394.659
rle 50000: 1699.71
rle 100000: 2154.429
bzip2 2000: 1411.124
bzip2 5000: 1466.675
bzip2 10000: 1630.79
bzip2 50000: 2906.723
bzip2 100000: 4582.713
In [6]:
import matplotlib.pylab as plt

%matplotlib inline
In [8]:
plt.scatter(ext, res)
plt.xscale('log')
In [12]:
res2 = []
compr2 = []
ext2 = []

for DEFAULT_COMPRESSOR in ("lz4", "blosc-lz", "blosc-lz4", "blosc-lz4hc", "blosc-snappy", "blosc-zstd"):
    for DEFAULT_GENOME_TILE_EXTENT in (5000, 7500, 10000, 15000, 20000):
        write_tiledb(am, 'test_tiledb')
        tm = bench('test_tiledb')
        print('{} {}: {}'.format(DEFAULT_COMPRESSOR, DEFAULT_GENOME_TILE_EXTENT, tm))
        res2.append(tm)
        compr2.append(DEFAULT_COMPRESSOR)
        ext2.append(DEFAULT_GENOME_TILE_EXTENT)
        
lz4 5000: 1221.5990000000002
lz4 7500: 1190.306
lz4 10000: 1159.8790000000001
lz4 15000: 1174.877
lz4 20000: 1204.637
blosc-lz 5000: 1107.088
blosc-lz 7500: 1103.329
blosc-lz 10000: 1202.474
blosc-lz 15000: 1280.661
blosc-lz 20000: 1335.5269999999998
blosc-lz4 5000: 1234.7740000000001
blosc-lz4 7500: 1287.494
blosc-lz4 10000: 1165.4
blosc-lz4 15000: 1173.7150000000001
blosc-lz4 20000: 1310.246
blosc-lz4hc 5000: 1191.2469999999998
blosc-lz4hc 7500: 1384.5829999999999
blosc-lz4hc 10000: 1408.038
blosc-lz4hc 15000: 1388.881
blosc-lz4hc 20000: 1406.938
blosc-snappy 5000: 1525.759
blosc-snappy 7500: 1457.4009999999998
blosc-snappy 10000: 1589.112
blosc-snappy 15000: 1462.237
blosc-snappy 20000: 1483.2559999999999
blosc-zstd 5000: 1466.255
blosc-zstd 7500: 1307.422
blosc-zstd 10000: 1292.1750000000002
blosc-zstd 15000: 1421.24
blosc-zstd 20000: 1348.5339999999999
In [11]:
for r, c, e in zip(res, compr, ext):
    if e == 10000:
        print('{} {}'.format(c, r))
zstd 1380.107
lz4 1244.5539999999999
blosc-lz 1341.0059999999999
blosc-lz4 1276.172
blosc-lz4hc 1320.573
blosc-snappy 1347.4360000000001
blosc-zstd 1335.585
rle 1394.659
bzip2 1630.79
In [14]:
plt.plot(ext2, res2)
plt.xscale('log')
In [16]:
res3 = []
compr3 = []
ext3 = []
lev3 = []

for DEFAULT_COMPRESSOR in ("lz4", "blosc-lz", "blosc-lz4"):
    for DEFAULT_GENOME_TILE_EXTENT in (5000, 7000, 9000):
        for DEFAULT_COMPRESSION_LEVEL in (3, 5, 7, -1):
            write_tiledb(am, 'test_tiledb')
            tm = bench('test_tiledb')
            print('{} {} {}: {}'.format(DEFAULT_COMPRESSOR, DEFAULT_GENOME_TILE_EXTENT, DEFAULT_COMPRESSION_LEVEL, tm))
            res2.append(tm)
            compr2.append(DEFAULT_COMPRESSOR)
            ext2.append(DEFAULT_GENOME_TILE_EXTENT)
            lev3.append(DEFAULT_COMPRESSION_LEVEL)
lz4 5000 3: 1316.221
lz4 5000 5: 1391.139
lz4 5000 7: 1440.3999999999999
lz4 5000 -1: 1438.993
lz4 7000 3: 1386.919
lz4 7000 5: 1294.948
lz4 7000 7: 1291.392
lz4 7000 -1: 1430.771
lz4 9000 3: 1382.828
lz4 9000 5: 1388.346
lz4 9000 7: 1326.0810000000001
lz4 9000 -1: 1337.2939999999999
blosc-lz 5000 3: 1415.153
blosc-lz 5000 5: 1442.309
blosc-lz 5000 7: 1343.074
blosc-lz 5000 -1: 1337.529
blosc-lz 7000 3: 1385.436
blosc-lz 7000 5: 1537.8970000000002
blosc-lz 7000 7: 1294.312
blosc-lz 7000 -1: 1335.772
blosc-lz 9000 3: 1250.115
blosc-lz 9000 5: 1299.641
blosc-lz 9000 7: 1227.048
blosc-lz 9000 -1: 1200.7810000000002
blosc-lz4 5000 3: 1399.435
blosc-lz4 5000 5: 1508.3310000000001
blosc-lz4 5000 7: 1444.266
blosc-lz4 5000 -1: 1384.952
blosc-lz4 7000 3: 1579.838
blosc-lz4 7000 5: 1451.074
blosc-lz4 7000 7: 1377.594
blosc-lz4 7000 -1: 1527.376
blosc-lz4 9000 3: 1375.8310000000001
blosc-lz4 9000 5: 1359.25
blosc-lz4 9000 7: 1336.91
blosc-lz4 9000 -1: 1475.5510000000002
In [ ]:
DEFAULT_GENOME_TILE_EXTENT = 1000

def write_sparse_array(path, n, n_idxs, values, overwrite=True):
    if os.path.exists(path) and os.path.isdir(path) and overwrite:
        shutil.rmtree(path)

    if os.path.exists(path):
        raise FileExistsError("Output path {} already exists".format(path))

    if n_idxs.min() < 0 or n_idxs.max() >= n:
        raise ValueError('row indexes must be in range [0, n - 1]')

    ctx = tiledb.Ctx()

    n_tile_extent = min(DEFAULT_GENOME_TILE_EXTENT, n)

    d1 = tiledb.Dim(ctx, GENOME_DOMAIN_NAME, domain=(0, n - 1),
                    tile=n_tile_extent, dtype="uint32")

    domain = tiledb.Domain(ctx, d1)

    v = tiledb.Attr(ctx, "v", compressor=('lz4', -1), dtype="float32")

    A = tiledb.SparseArray(ctx,
                           path,
                           domain=domain,
                           attrs=(v,),
                           capacity=1000,
                           cell_order='row-major',
                           tile_order='row-major')

    values = values.astype(np.float32)

    A[n_idxs] = {'v': values}
In [ ]:
nz = a.nonzero()[0]
write_sparse_array('test_sparse', a.shape[0], nz, a[nz])
In [ ]:
def read_space()
In [34]:
np.unique(am).shape[0]
Out[34]:
14740
In [35]:
2 ** 32
Out[35]:
4294967296
In [37]:
2 ** 16
Out[37]:
65536