-
# NOTE(review): this file is an exported IPython/Jupyter session history.
# Cell magics (%matplotlib, %config) and shell escapes (ls, !zcat) are only
# valid inside IPython — this is not importable as a plain Python module.
# Imports
from basepair.imports import *
hv.extension('bokeh')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Common paths
# NOTE(review): `ddir` is used here but only assigned further down
# (ddir = get_data_dir()) — cells were executed out of order; confirm
# before re-running top-to-bottom.
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")
# create_tf_session(0)
data = f"{ddir}/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf"
# IPython shell escape: inspect the contents of the data directory.
ls {data}
from pybedtools import BedTool
# NOTE(review): `btg` is inspected before it is assigned on the next line —
# another out-of-order cell; harmless in a notebook, NameError in a script.
btg.head()
# 1 kb genome-wide tiling windows (blacklist-removed) as a BedTool.
btg = BedTool(f"{ddir}/raw/annotation/mm10/mm10.genome.stride1000.w1000.no-blacklist.bed.gz")
dfg = btg.to_dataframe()
# Use the row index as the interval name so intersection hits can be
# mapped back to dataframe rows below.
dfg['name'] = dfg.index
btg = BedTool.from_dataframe(dfg)
ls {data}
# Sox2 ChIP-nexus summits resized to 200 bp.
b = BedTool(f"{data}/Sox2-summits.200bp.bed.gz")
b.head()
feature = 'feature1'
import pybedtools
# NOTE(review): incomplete statement (tab-completion artifact) — a syntax
# error outside an interactive session; delete before reuse.
pybedtools.
from basepair.config import get_data_dir
from basepair.preproc import label_bed
ddir = get_data_dir()
data = f"{ddir}/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf"
# Label every 1 kb genome window with binary overlap indicators for the
# four TF summit files, then persist as the training-interval tsv.
dfo = label_bed(f"{ddir}/raw/annotation/mm10/mm10.genome.stride1000.w1000.no-blacklist.bed.gz",
                {t: f"{data}/{t}-summits.200bp.bed.gz" for t in ['Oct4', 'Sox2', 'Nanog', 'Klf4']})
dfo.to_csv(f"{data}/1kb.osnk.tsv.gz", compression='gzip', sep='\t', index=False)
# Sanity check: summit counts per TF.
for t in ['Oct4', 'Sox2', 'Nanog', 'Klf4']:
    print(t)
    # IPython shell escape: line count of the gzipped summit bed file.
    !zcat {data}/{t}-summits.200bp.bed.gz | wc -l
dfo.set_index(['chrom', 'start', 'end'], inplace=True)
# Per-task positive totals and class balance of the binary labels.
dfo.sum(axis=0)
dfo.mean(axis=0)
# Names of the 1 kb windows overlapping at least one Sox2 summit
# (wa=True, u=True: report each window at most once).
intersected = btg.intersect(b, wa=True, u=True).to_dataframe()['name']
dfg[feature] = 0
dfg.loc[intersected, feature] = 1
# NOTE(review): `dfi` is not defined anywhere in this file — stale cell.
dfi.head()
dfg.head()
from gin_train.samplers import StratifiedRandomBatchSampler,iterable_cycle
# Synthetic balanced class vector (2e7 elements) to stress-test the
# sampler's memory footprint.
classes = np.concatenate([np.ones((int(1e7),)), np.zeros((int(1e7),))])
import ipython_memory_usage.ipython_memory_usage as imu
imu.start_watching_memory()
sampler = StratifiedRandomBatchSampler(classes, [0.5, 0.5], 128)
it = iter(sampler)
# Exhaust the sampler once while imu reports per-cell memory usage.
for i in sampler:
    pass
len(sampler)
# NOTE(review): called with no arguments — presumably a signature /
# docstring inspection cell, not a real call; verify before reuse.
iterable_cycle()
# Inputs for the genome-wide stranded-profile dataset.
intervals_file = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf/1kb.osnk.tsv.gz'
dataspec = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml'
from basepair.datasets import get_gw_StrandedProfile_datasets
train, valid = get_gw_StrandedProfile_datasets(dataspec = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml',
                                               intervals_file = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf/1kb.osnk.tsv.gz',
                                               peak_width=1000,
                                               seq_width=3088,
                                               exclude_chr = ['chrX', 'chrY']
                                               ) # use the default train and valid chromosomes
# NOTE(review): indexing suggests `valid` is a list of (weight?, dataset)
# pairs — confirm against get_gw_StrandedProfile_datasets.
ds = valid[0][1]
valid[0][0]
# Throughput benchmark: pull 100 batches with 6 worker processes.
it = ds.batch_train_iter(64, num_workers=6)
%tqdm_restart
for i in tqdm(range(100)):
    # ds[i]
    batch = next(it)
# Same benchmark without cycling the iterator.
it = ds.batch_train_iter(cycle=False, batch_size=64, num_workers=6)
for i in tqdm(range(100)):
    # ds[i]
    batch = next(it)
# NOTE(review): tqdm is imported after its first use above — out-of-order
# cell execution.
from tqdm import tqdm
%tqdm_restart
from kipoi.data_utils import iterable_cycle
train.tsv.df[0]
from basepair.data import Dataset
from basepair.datasets import *
import torch
class StrandedProfile(Dataset):
    # Dataset yielding one-hot DNA sequence windows from labelled genomic
    # intervals. NOTE(review): target (bigWig profile) extraction is
    # commented out in this snapshot, so "targets" is always an empty dict.

    def __init__(self, ds,
                 peak_width=200,
                 seq_width=None,
                 incl_chromosomes=None,
                 excl_chromosomes=None,
                 intervals_file=None,
                 shuffle=True, target_transformer=None):
        """Dataset for loading the bigwigs and fastas

        Args:
            ds (basepair.src.schemas.DataSpec or str): data specification
                containing the fasta file, bed files and bigWig file paths;
                a str is treated as a path and loaded via DataSpec.load
            peak_width (int): resize the target interval to this width
            seq_width (int): width of the input-sequence interval;
                defaults to `peak_width` when None
            incl_chromosomes (list of str): restrict intervals to these
                chromosomes (passed through to TsvReader)
            excl_chromosomes (list of str): exclude these chromosomes
            intervals_file (str): tsv of labelled intervals read by TsvReader
            shuffle (bool): if True, shuffle the interval rows once at init
            target_transformer: trained preprocessor with a .transform method
                (stored but unused in this snapshot — TODO confirm)
        """
        if isinstance(ds, str):
            self.ds = DataSpec.load(ds)
        else:
            self.ds = ds
        self.peak_width = peak_width
        if seq_width is None:
            self.seq_width = peak_width
        else:
            self.seq_width = seq_width
        self.shuffle = shuffle
        self.intervals_file = intervals_file
        self.incl_chromosomes = incl_chromosomes
        self.excl_chromosomes = excl_chromosomes
        self.target_transformer = target_transformer
        # not specified yet
        # Extractors are created lazily in __getitem__ so each DataLoader
        # worker process opens its own file handles.
        self.fasta_extractor = None
        self.bw_extractors = None
        # Load chromosome lengths
        fa = FastaFile(self.ds.fasta_file)
        self.chrom_lens = {name: l for name, l in zip(fa.references, fa.lengths)}
        del fa
        self.tsv = TsvReader(self.intervals_file,
                             num_chr=False,
                             label_dtype=int,
                             mask_ambigous=-1,
                             incl_chromosomes=incl_chromosomes,
                             excl_chromosomes=excl_chromosomes,
                             )
        self.dfm = self.tsv.df  # use the data-frame from tsv
        # self.dfmo = {"chrom": self.dfm}
        if self.shuffle:
            # Shuffle rows once; __getitem__ indexes positionally (iat),
            # so the scrambled index labels are harmless.
            self.dfm = self.dfm.sample(frac=1)

    def __len__(self):
        # Number of intervals after chromosome filtering.
        return len(self.dfm)

    def get_targets(self):
        """Return the label matrix ('targets') parsed from the intervals tsv."""
        assert self.intervals_file is not None
        return self.tsv.get_targets()

    def __getitem__(self, idx):
        """Return {"inputs", "targets", "metadata"} for interval row `idx`."""
        if self.fasta_extractor is None:
            # first call in this process: open bigWig/fasta handles lazily
            self.bw_extractors = {task: [BigwigExtractor(task_spec.pos_counts),
                                         BigwigExtractor(task_spec.neg_counts)]
                                  for task, task_spec in self.ds.task_specs.items()}
            self.fasta_extractor = FastaExtractor(self.ds.fasta_file)
        # Load the bias model if available
        interval = Interval(self.dfm.iat[idx, 0],  # chrom
                            self.dfm.iat[idx, 1],  # start
                            self.dfm.iat[idx, 2])  # end
        # deepcopy: resize_interval presumably mutates in place — keep the
        # original interval intact for the second resize. TODO confirm.
        target_interval = resize_interval(deepcopy(interval), self.peak_width)
        seq_interval = resize_interval(deepcopy(interval), self.seq_width)
        # task = self.dfm.iat[idx, 3]  # task
        # TODO - add data augmentation
        sequence = self.fasta_extractor([seq_interval])[0]
        # NOTE(review): profile extraction disabled — targets returned empty.
        # cuts = {f"profile/{task}": run_extractors(self.bw_extractors[task],
        #                                           [target_interval],
        #                                           ignore_strand=spec.ignore_strand)[0]
        #         for task, spec in self.ds.task_specs.items()}
        cuts = {}
        task = ''
        return {"inputs": sequence,
                "targets": cuts,
                "metadata": {"range": GenomicRanges(target_interval.chrom,
                                                    target_interval.start,
                                                    target_interval.stop,
                                                    idx),
                             "interval_from_task": task}}
class Ds(Dataset):
    """Minimal benchmarking dataset that returns a constant tensor per item.

    The real fasta-extraction path is stubbed out so that DataLoader /
    multiprocessing overhead can be measured in isolation from I/O.
    """

    def __init__(self, ds, seq_width=1000, intervals_file=None):
        """Load the data specification and the interval table.

        Args:
            ds (basepair.src.schemas.DataSpec or str): data specification
                containing the fasta file path; a str is treated as a path
                and loaded via DataSpec.load
            seq_width (int): width the intervals would be resized to
                (unused by the stubbed __getitem__)
            intervals_file (str): tsv of labelled intervals read by TsvReader
        """
        if isinstance(ds, str):
            self.ds = DataSpec.load(ds)
        else:
            self.ds = ds
        # BUGFIX: intervals_file was never stored, so get_targets() raised
        # AttributeError on self.intervals_file instead of working.
        self.intervals_file = intervals_file
        self.tsv = TsvReader(intervals_file,
                             num_chr=False,
                             label_dtype=int,
                             mask_ambigous=-1
                             )
        self.seq_width = seq_width
        self.dfm = self.tsv.df  # use the data-frame from tsv
        # Ensure the chromosome column is str (it can parse as int, e.g. "1").
        self.dfm[0] = self.dfm[0].astype(str)
        self.fasta_extractor = None

    def __len__(self):
        # Number of intervals in the tsv.
        return len(self.dfm)

    def get_targets(self):
        """Return the label matrix ('targets') parsed from the intervals tsv."""
        assert self.intervals_file is not None
        return self.tsv.get_targets()

    def __getitem__(self, idx):
        # Constant payload: isolates data-loader overhead from real extraction.
        return torch.ones((10000, 10))
from torch.utils.data import DataLoader
%tqdm_restart
import torch
intervals_file = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf/1kb.osnk.tsv.gz'
dataspec = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml'
# NOTE(review): the positional 10000 is peak_width (signature is
# ds, peak_width, seq_width, ...) — confirm that was intended.
train = StrandedProfile(dataspec, 10000, intervals_file=intervals_file)
it = train.batch_iter(batch_size=32, num_workers=12)
from kipoi.data_utils import numpy_collate
# Compare against a raw torch DataLoader over the same dataset.
dl = DataLoader(train, num_workers=12, batch_size=32)
it = iter(dl)
next(it)
np.array("asd")
from basepair.data import to_numpy
# Pull 10k batches to profile throughput / memory growth.
for i in tqdm(range(10000)):
    # if i % 1000 == 0:
    #     gc.collect()
    batch = next(it)
    # o = to_numpybatch)
    # a= batch['inputs'].numpy()
    # del batch
del train
# NOTE(review): the remaining cells reference objects after `del`
# (train, it) — out-of-order notebook execution; they fail if run
# top-to-bottom.
it = train.batch_train_iter(batch_size=32, num_workers=6)
del it
for i in tqdm(range(1000)):
    next(it)
import torch