import basepair
import pandas as pd
import numpy as np
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from pathlib import Path
from basepair.config import create_tf_session, get_data_dir
# --- Session setup: resolve data dir, grab GPU 0, and build DataSpecs ---
ddir = get_data_dir()
create_tf_session(0)
# Variant 1: raw bigwigs + fasta read directly from the lab data mount.
dpath = Path("/mnt/lab_data/users/avsec/dnase_bias")
SEQ_WIDTH = 1000
ds = DataSpec(
task_specs={"DNase": TaskSpec(task="DNase",
pos_counts=dpath / "IMR90_Naked_DNase.pos.bw",
neg_counts=dpath / "IMR90_Naked_DNase.neg.bw",
peaks=dpath / "genome.stride200.w200.nonblacklist.bed")},
fasta_file=dpath / "GRCh38.genome.fa",
)
# Variant 2: bigwigs staged on /dev/shm (ramdisk) to benchmark I/O.
# NOTE(review): neg_counts points at the *.pos.bw file here — looks like a
# copy-paste slip; confirm the negative-strand bigwig was intended.
dpath = Path("/mnt/lab_data/users/avsec/dnase_bias")
SEQ_WIDTH = 1000
ds = DataSpec(
task_specs={"DNase": TaskSpec(task="DNase",
pos_counts="/dev/shm/avsec/IMR90_Naked_DNase.pos.bw",
neg_counts="/dev/shm/avsec/IMR90_Naked_DNase.pos.bw",
peaks=dpath / "genome.stride200.w200.nonblacklist.bed")},
fasta_file=dpath / "hg38.genome.fa.bc.gl",
)
# Variant 3: bcolz/genomelake (.bc.gl) pre-extracted tracks.
# NOTE(review): fasta_file is set to a bigwig-derived .bc.gl path — presumably
# the genome .bc.gl was meant; verify before reusing this spec.
dpath = Path("/mnt/lab_data/users/avsec/dnase_bias")
SEQ_WIDTH = 1000
ds = DataSpec(
task_specs={"DNase": TaskSpec(task="DNase",
pos_counts=dpath / "IMR90_Naked_DNase.pos.bw.bc.gl",
neg_counts=dpath / "IMR90_Naked_DNase.neg.bw.bc.gl",
peaks=dpath / "genome.stride200.w200.nonblacklist.bed")},
fasta_file="/dev/shm/avsec/IMR90_Naked_DNase.pos.bw.bc.gl",
)
# --- Interactive benchmark: bigwig vs bcolz slicing latency ---
# IPython session lines; %time magics and out-of-order history below.
from genomelake.extractors import ArrayExtractor, BigwigExtractor
from pybedtools import Interval
intervals = [Interval("chr1", 20000, 21000)]
# NOTE(review): `ca` is only created further down (bcolz.open); this history
# is not linearly executable as a script.
%time a = ca[100000:101000]
%time a = ca[:1000]
%time a = ca[200000:210000]
# Time BigwigExtractor lookups at several genomic offsets / sizes.
ae = BigwigExtractor("/dev/shm/avsec/IMR90_Naked_DNase.pos.bw")
%time a=ae([Interval("chr1", 0, 1000)])
%time a=ae([Interval("chr1", 20000, 21000)])
%time a=ae([Interval("chr1", 100000, 101000)])
%time a=ae([Interval("chr1", 200000, 210000)])
%time a=ae([Interval("chr10", 200000, 201000)])
ae.bw.close()
ae.bw.chroms()['chr1']
# Pull the full chr1 signal track into a numpy array; replace NaN with 0.
values = ae.bw.values('chr1', 0, ae.bw.chroms()['chr1'], numpy=True)
values[np.isnan(values)] = 0
values[:5]
# save the array
_blosc_params = bcolz.cparams(clevel=5, shuffle=bcolz.SHUFFLE, cname='lz4')
bcolz.carray(values, rootdir="/dev/shm/avsec/test/bw.bc", chunklen=1000, cparams=_blosc_params, mode='w').flush()
# Re-open the bcolz carray from ramdisk and time the same slices.
ca = bcolz.open("/dev/shm/avsec/test/bw.bc")
%time a = ca[100000:101000]
%time a = ca[:1000]
%time a = ca[200000:210000]
values[:4]
# Same track stored via the tiledb backend defined later in this file.
write_tiledb(values, "/dev/shm/avsec/test/bw.tiledb")
ca = load_tiledb("/dev/shm/avsec/test/bw.tiledb")
%time a = ca[100000:101000]
%time a = ca[:1000]
%time a = ca[200000:210000]
# Compare against the on-disk (non-ramdisk) bigwig.
ae = BigwigExtractor(str(dpath / "IMR90_Naked_DNase.pos.bw"))
%time a=ae(intervals)
ae.bw.close()
# 452 µs
# Both the on-disk bigwig and the in-memory bigwig yield roughly the same results
# --- Interactive benchmark: ArrayExtractor / FastaExtractor / bcolz / tiledb ---
ae = ArrayExtractor("/dev/shm/avsec/IMR90_Naked_DNase.pos.bw.bc.gl")
%time a=ae(intervals)
chr1_track = ae._data['chr1'][:]
chr1_track[:4]
# NOTE(review): incomplete expression left over from tab-completion.
chr1_track.
# Same extractor without loading into memory.
ae = ArrayExtractor(dpath / "IMR90_Naked_DNase.pos.bw.bc.gl", in_memory=False)
%time a=ae(intervals)
from genomelake.extractors import FastaExtractor
fa = FastaExtractor(str(dpath / "GRCh38.genome.fa"))
%time a=fa(intervals)
%time a=fa([Interval("chr1", 120000, 121000)]*100)
%time a=fa([Interval("chr1", 0, 1000)])
%time a=fa([Interval("chr1", 0, 10000)])
%time a=fa([Interval("chr1", 0, 100000)])
# Fasta staged on ramdisk for comparison.
fa = FastaExtractor("/dev/shm/avsec/GRCh38.genome.fa")
%time a=fa(intervals)
ae = ArrayExtractor(str(dpath / "hg38.genome.fa.bc.gl"))
# on disk
%time a=ae(intervals)
# in-memory
%time a=ae(intervals)
bc = ae._data['chr1']
ae._data['chr1'].partitions[:4] ## the partitions are really large -> 32kb instead of ~1kb
%time a=ae(intervals)
# Re-chunk chr1 with a smaller chunklen to test slicing latency.
chr1 = ae._data["chr1"][:]
import bcolz
mkdir /dev/shm/avsec/test/
# save the array
_blosc_params = bcolz.cparams(clevel=5, shuffle=bcolz.SHUFFLE, cname='lz4')
bcolz.carray(chr1, rootdir="/dev/shm/avsec/test/chr1.bc", chunklen=9000, cparams=_blosc_params, mode='w').flush()
ca = bcolz.open("/dev/shm/avsec/test/chr1.bc")
%time a= ca[100000:101000]
!ln -s /users/cprobert/dev/genomelake/benchmark .
# Same chr1 slice timings via the tiledb backend.
write_tiledb(chr1, "/dev/shm/avsec/test/chr1.tiledb")
ca = load_tiledb("/dev/shm/avsec/test/chr1.tiledb")
%time a= ca[100000:101000]
%time a= ca[0:1000]
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os.path
import shutil
import numpy as np
import tiledb
# Constants for the tiledb-backed genome-track storage below.
GENOME_DOMAIN_NAME = "genome_coord"  # primary dimension: genomic position
SECONDARY_DOMAIN_NAME = "signal_coord"  # second dimension for 2D tracks
GENOME_VALUE_NAME = "v"  # tiledb attribute name holding the signal values
DEFAULT_GENOME_TILE_EXTENT = 9000  # tile (chunk) size along the genome axis
DEFAULT_COMPRESSOR = "blosc-lz"
DEFAULT_COMPRESSOR_LEVEL = -1  # -1: library-default compression level
def write_tiledb(arr, path, overwrite=True):
    """Write a 1D or 2D array to disk as a dense tiledb array.

    The values are stored as float32 under the attribute
    ``GENOME_VALUE_NAME``; the first dimension is tiled with
    ``DEFAULT_GENOME_TILE_EXTENT`` (capped at the array length).

    Parameters
    ----------
    arr : np.ndarray
        1D or 2D numpy array to store.
    path : str
        Output directory for the tiledb array.
    overwrite : bool
        If True (default), remove an existing array directory at `path`.

    Raises
    ------
    FileExistsError
        If `path` still exists after the overwrite check (e.g. it is a
        file, or `overwrite` is False).
    ValueError
        If `arr` has more than 2 dimensions.
    """
    # NOTE(review): written against the legacy tiledb-py API (Ctx passed to
    # Dim/Domain/Attr/ArraySchema); confirm against the pinned tiledb version.
    if os.path.exists(path) and os.path.isdir(path) and overwrite:
        shutil.rmtree(path)
    if os.path.exists(path):
        raise FileExistsError("Output path {} already exists".format(path))

    ctx = tiledb.Ctx()
    n = arr.shape[0]
    # Cap the tile extent for arrays shorter than the default tile size.
    n_tile_extent = min(DEFAULT_GENOME_TILE_EXTENT, n)

    d1 = tiledb.Dim(
        ctx, GENOME_DOMAIN_NAME, domain=(0, n - 1), tile=n_tile_extent, dtype="uint32"
    )
    if arr.ndim == 1:
        domain = tiledb.Domain(ctx, d1)
    elif arr.ndim == 2:
        # 2D: the secondary dimension is stored as a single tile per row span.
        m = arr.shape[1]
        d2 = tiledb.Dim(
            ctx, SECONDARY_DOMAIN_NAME, domain=(0, m - 1), tile=m, dtype="uint32"
        )
        domain = tiledb.Domain(ctx, d1, d2)
    else:
        raise ValueError("tiledb backend only supports 1D or 2D arrays")

    v = tiledb.Attr(
        ctx,
        GENOME_VALUE_NAME,
        compressor=(DEFAULT_COMPRESSOR, DEFAULT_COMPRESSOR_LEVEL),
        dtype="float32",
    )
    schema = tiledb.ArraySchema(
        ctx, domain=domain, attrs=(v,), cell_order="row-major", tile_order="row-major"
    )
    # Create the on-disk array, then reopen it for writing.  The previous
    # code bound create()'s return value to `A` only to shadow it in the
    # `with` statement below; the dead binding is dropped.
    tiledb.DenseArray.create(path, schema)
    values = arr.astype(np.float32)
    with tiledb.DenseArray(ctx, path, mode="w") as A:
        A[:] = {GENOME_VALUE_NAME: values}
def load_tiledb(path):
    """Open a dense tiledb array (written by ``write_tiledb``) read-only.

    Returns a ``TDBDenseArray`` wrapper exposing numpy-style slicing.
    """
    wrapped = TDBDenseArray(path)
    return wrapped
class TDBDenseArray(object):
    """A read-only wrapper of tiledb.DenseArray.

    Slicing returns the raw values stored under ``GENOME_VALUE_NAME``
    instead of tiledb's ``{attribute_name: values}`` dict.
    """

    def __init__(self, array):
        """
        Parameters
        ----------
        array : tiledb.DenseArray or str
            An already-open DenseArray, or a path to open read-only.
        """
        if isinstance(array, tiledb.DenseArray):
            self._arr = array
        else:
            ctx = tiledb.Ctx()
            self._arr = tiledb.DenseArray(ctx, array, mode="r")

    def __getitem__(self, key):
        # Unwrap the attribute dict returned by tiledb slicing.
        return self._arr[key][GENOME_VALUE_NAME]

    def __setitem__(self, key, item):
        # Bug fix: was `raise NotImplemented(...)` — NotImplemented is a
        # constant, not an exception class, so that line raised TypeError
        # ("'NotImplementedType' object is not callable") instead of the
        # intended error.
        raise NotImplementedError("TDBDenseArray is read-only")

    @property
    def shape(self):
        return self._arr.shape

    @property
    def ndim(self):
        return self._arr.ndim
# --- Interactive inspection of benchmark outputs + chromosome split setup ---
ls /dev/shm/cproberttmp/outputs/GM12878_CTCF_ENCFF312KXX.bigWig_tiledb/
arr = load_tiledb("/users/cprobert/dev/genomelake/benchmark/outputs/GM12878_CTCF_ENCFF312KXX.bigWig_tiledb/chr1")
ls /users/cprobert/dev/genomelake/benchmark/outputs/
ls /mnt/data/memmap/bcolz_data/DNASE.H1-hESC.fc.signal.bigwig/
!du -sh /dev/shm/cproberttmp/outputs/hg19.genome.fa_bcolz/
# Time ArrayExtractor on a bcolz-ified hg19 genome.
ae = ArrayExtractor("/dev/shm/cproberttmp/outputs/hg19.genome.fa_bcolz/")
%time a=ae([Interval("chr1", 120000, 121000)])
%time a=ae([Interval("chr1", 0, 1000)])
%time a=ae([Interval("chr1", 0, 10000)])
%time a=ae([Interval("chr1", 0, 100000)])
# NOTE(review): `carr` is not defined anywhere in this chunk.
carr.chunklen
# Warm the OS page cache for the fasta, then time FastaExtractor lookups.
!vmtouch -vt /users/cprobert/dev/genomelake/benchmark/inputs/hg19.genome.fa
fa = FastaExtractor("/users/cprobert/dev/genomelake/benchmark/inputs/hg19.genome.fa")
%time o=fa([Interval("chr1", 0, 1000)])
%time o=fa([Interval("chr1", 100000, 101000)])
%time o=fa([Interval("chr1", 0, 10000)])
%time o=fa([Interval("chr1", 0, 100000)])
%time o=fa([Interval("chr2", 0, 10000)])
%time o=fa([Interval("chr3", 0, 1000)])
%time o=fa([Interval("chr4", 0, 1000)])
%time o=fa([Interval("chr4", 9000, 10000)])
%time o=fa([Interval("chr15", 0, 10000)])
%time o=fa([Interval("chr20", 0, 1000)])
%time o=fa([Interval("chr20", 100000, 101000)])
ae = ArrayExtractor("/mnt/data/memmap/genomes/hg19.GRCh37")
ae = ArrayExtractor(str(dpath / "hg38.genome.fa.bc.gl"))
# Chromosome hold-out split for validation/test.
valid_chr=['chr2', 'chr3', 'chr4']
test_chr=['chr1', 'chr8', 'chr9']
from basepair.datasets import StrandedProfile
class AppendCounts:
    """Target transformer that augments profile targets with count targets.

    For every key in the input dict, adds a companion entry whose key has
    the ``profile/`` prefix replaced by ``counts/`` and whose value is the
    log of (1 + per-strand total counts), i.e. ``log(1 + value.sum(0))``.
    Existing entries are kept unchanged.
    """

    def transform(self, x):
        out = dict(x)
        for key, value in x.items():
            counts_key = key.replace("profile/", "counts/")
            out[counts_key] = np.log(1 + value.sum(0))
        return out
# --- Interactive profiling of the StrandedProfile data loader ---
dl = StrandedProfile(ds,
excl_chromosomes=valid_chr+test_chr,
peak_width=SEQ_WIDTH, shuffle=True,
bcolz=False,
in_memory=False,
target_transformer=AppendCounts())
from tqdm import tqdm
# Single-example and batched throughput.
%timeit dl[100000]
it = dl.batch_iter(batch_size=256, num_workers=100)
next(it)
%timeit next(it)
%load_ext line_profiler
# TODO - use ix, iat functions
# Compare pandas row-access strategies (iloc vs iat) for the interval table.
row = dl.dfm.iloc[10000]
dl.dfm.head().index[0]
%time row = dl.dfm.iloc[10000]
%time row = [dl.dfm.iat[1000, i] for i in range(4)]
%time row = dl.dfm.iat[1000, 0]
%time row = dl.dfm.iat[1000, 1]
%time row = dl.dfm.iloc[10000]
from basepair.datasets import *
%time intervals = [pd_row2interval(row)]
from pybedtools import Interval
%time Interval("chr1", 10, 20)
# Time the individual extractor calls used inside __getitem__.
%time a=dl.bw_extractors['DNase'][0](intervals)
%time a=dl.fasta_extractor(intervals*1000)
ls -latr /tmp/ | tail -n 5
pd_row2interval??
# Line-profile __getitem__ to find the hot spots.
%lprun -f dl.__getitem__ dl[100000]
3e9 * 2 /1024 / 1024 # 1 whole bigwig ~ 5G
%lprun -f dl.__getitem__ dl[100000]
%time dl.dfm.iloc[100000]
# Iterate ~1000 batches to measure sustained throughput.
for i, batch in enumerate(tqdm(dl.batch_iter(batch_size=256, num_workers=40))):
if i > 1000:
break