-
# NOTE(review): this file is an exported IPython/Jupyter session history.
# Cell magics (%matplotlib, %config) and shell escapes (ls, !zcat) are only
# valid inside IPython — this is not importable as a plain Python module.
# Imports
from basepair.imports import *
hv.extension('bokeh')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Common paths
# NOTE(review): `ddir` is used here but only assigned further down
# (ddir = get_data_dir()) — cells were executed out of order; confirm
# before re-running top-to-bottom.
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")
# create_tf_session(0)
data = f"{ddir}/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf"
# IPython shell escape: inspect the contents of the data directory.
ls {data}
from pybedtools import BedTool
# NOTE(review): `btg` is inspected before it is assigned on the next line —
# another out-of-order cell; harmless in a notebook, NameError in a script.
btg.head()
# 1 kb genome-wide tiling windows (blacklist-removed) as a BedTool.
btg = BedTool(f"{ddir}/raw/annotation/mm10/mm10.genome.stride1000.w1000.no-blacklist.bed.gz")
dfg = btg.to_dataframe()
# Use the row index as the interval name so intersection hits can be
# mapped back to dataframe rows below.
dfg['name'] = dfg.index
btg = BedTool.from_dataframe(dfg)
ls {data}
# Sox2 ChIP-nexus summits resized to 200 bp.
b = BedTool(f"{data}/Sox2-summits.200bp.bed.gz")
b.head()
feature = 'feature1'
import pybedtools
# NOTE(review): incomplete statement (tab-completion artifact) — a syntax
# error outside an interactive session; delete before reuse.
pybedtools.
from basepair.config import get_data_dir
from basepair.preproc import label_bed
ddir = get_data_dir()
data = f"{ddir}/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf"
# Label every 1 kb genome window with binary overlap indicators for the
# four TF summit files, then persist as the training-interval tsv.
dfo = label_bed(f"{ddir}/raw/annotation/mm10/mm10.genome.stride1000.w1000.no-blacklist.bed.gz",
                {t: f"{data}/{t}-summits.200bp.bed.gz" for t in ['Oct4', 'Sox2', 'Nanog', 'Klf4']})
dfo.to_csv(f"{data}/1kb.osnk.tsv.gz", compression='gzip', sep='\t', index=False)
# Sanity check: summit counts per TF.
for t in ['Oct4', 'Sox2', 'Nanog', 'Klf4']:
    print(t)
    # IPython shell escape: line count of the gzipped summit bed file.
    !zcat {data}/{t}-summits.200bp.bed.gz | wc -l
dfo.set_index(['chrom', 'start', 'end'], inplace=True)
# Per-task positive totals and class balance of the binary labels.
dfo.sum(axis=0)
dfo.mean(axis=0)
# Names of the 1 kb windows overlapping at least one Sox2 summit
# (wa=True, u=True: report each window at most once).
intersected = btg.intersect(b, wa=True, u=True).to_dataframe()['name']
dfg[feature] = 0
dfg.loc[intersected, feature] = 1
# NOTE(review): `dfi` is not defined anywhere in this file — stale cell.
dfi.head()
dfg.head()
from gin_train.samplers import StratifiedRandomBatchSampler,iterable_cycle
# Synthetic balanced class vector (2e7 elements) to stress-test the
# sampler's memory footprint.
classes = np.concatenate([np.ones((int(1e7),)), np.zeros((int(1e7),))])
import ipython_memory_usage.ipython_memory_usage as imu
imu.start_watching_memory()
sampler = StratifiedRandomBatchSampler(classes, [0.5, 0.5], 128)
it = iter(sampler)
# Exhaust the sampler once while imu reports per-cell memory usage.
for i in sampler:
    pass
len(sampler)
# NOTE(review): called with no arguments — presumably a signature /
# docstring inspection cell, not a real call; verify before reuse.
iterable_cycle()
# Inputs for the genome-wide stranded-profile dataset.
intervals_file = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf/1kb.osnk.tsv.gz'
dataspec = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml'
from basepair.datasets import get_gw_StrandedProfile_datasets
train, valid = get_gw_StrandedProfile_datasets(dataspec = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml',
                                               intervals_file = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf/1kb.osnk.tsv.gz',
                                               peak_width=1000,
                                               seq_width=3088,
                                               exclude_chr = ['chrX', 'chrY']
                                               ) # use the default train and valid chromosomes
# NOTE(review): indexing suggests `valid` is a list of (weight?, dataset)
# pairs — confirm against get_gw_StrandedProfile_datasets.
ds = valid[0][1]
valid[0][0]
# Throughput benchmark: pull 100 batches with 6 worker processes.
it = ds.batch_train_iter(64, num_workers=6)
%tqdm_restart
for i in tqdm(range(100)):
    # ds[i]
    batch = next(it)
# Same benchmark without cycling the iterator.
it = ds.batch_train_iter(cycle=False, batch_size=64, num_workers=6)
for i in tqdm(range(100)):
    # ds[i]
    batch = next(it)
# NOTE(review): tqdm is imported after its first use above — out-of-order
# cell execution.
from tqdm import tqdm
%tqdm_restart
from kipoi.data_utils import iterable_cycle
train.tsv.df[0]
from basepair.data import Dataset
from basepair.datasets import *
import torch
class StrandedProfile(Dataset):
    # Dataset yielding one-hot DNA sequence windows from labelled genomic
    # intervals. NOTE(review): target (bigWig profile) extraction is
    # commented out in this snapshot, so "targets" is always an empty dict.

    def __init__(self, ds,
                 peak_width=200,
                 seq_width=None,
                 incl_chromosomes=None,
                 excl_chromosomes=None,
                 intervals_file=None,
                 shuffle=True, target_transformer=None):
        """Dataset for loading the bigwigs and fastas

        Args:
            ds (basepair.src.schemas.DataSpec or str): data specification
                containing the fasta file, bed files and bigWig file paths;
                a str is treated as a path and loaded via DataSpec.load
            peak_width (int): resize the target interval to this width
            seq_width (int): width of the input-sequence interval;
                defaults to `peak_width` when None
            incl_chromosomes (list of str): restrict intervals to these
                chromosomes (passed through to TsvReader)
            excl_chromosomes (list of str): exclude these chromosomes
            intervals_file (str): tsv of labelled intervals read by TsvReader
            shuffle (bool): if True, shuffle the interval rows once at init
            target_transformer: trained preprocessor with a .transform method
                (stored but unused in this snapshot — TODO confirm)
        """
        if isinstance(ds, str):
            self.ds = DataSpec.load(ds)
        else:
            self.ds = ds
        self.peak_width = peak_width
        if seq_width is None:
            self.seq_width = peak_width
        else:
            self.seq_width = seq_width
        self.shuffle = shuffle
        self.intervals_file = intervals_file
        self.incl_chromosomes = incl_chromosomes
        self.excl_chromosomes = excl_chromosomes
        self.target_transformer = target_transformer
        # not specified yet
        # Extractors are created lazily in __getitem__ so each DataLoader
        # worker process opens its own file handles.
        self.fasta_extractor = None
        self.bw_extractors = None
        # Load chromosome lengths
        fa = FastaFile(self.ds.fasta_file)
        self.chrom_lens = {name: l for name, l in zip(fa.references, fa.lengths)}
        del fa
        self.tsv = TsvReader(self.intervals_file,
                             num_chr=False,
                             label_dtype=int,
                             mask_ambigous=-1,
                             incl_chromosomes=incl_chromosomes,
                             excl_chromosomes=excl_chromosomes,
                             )
        self.dfm = self.tsv.df  # use the data-frame from tsv
        # self.dfmo = {"chrom": self.dfm}
        if self.shuffle:
            # Shuffle rows once; __getitem__ indexes positionally (iat),
            # so the scrambled index labels are harmless.
            self.dfm = self.dfm.sample(frac=1)

    def __len__(self):
        # Number of intervals after chromosome filtering.
        return len(self.dfm)

    def get_targets(self):
        """Return the label matrix ('targets') parsed from the intervals tsv."""
        assert self.intervals_file is not None
        return self.tsv.get_targets()

    def __getitem__(self, idx):
        """Return {"inputs", "targets", "metadata"} for interval row `idx`."""
        if self.fasta_extractor is None:
            # first call in this process: open bigWig/fasta handles lazily
            self.bw_extractors = {task: [BigwigExtractor(task_spec.pos_counts),
                                         BigwigExtractor(task_spec.neg_counts)]
                                  for task, task_spec in self.ds.task_specs.items()}
            self.fasta_extractor = FastaExtractor(self.ds.fasta_file)
        # Load the bias model if available
        interval = Interval(self.dfm.iat[idx, 0],  # chrom
                            self.dfm.iat[idx, 1],  # start
                            self.dfm.iat[idx, 2])  # end
        # deepcopy: resize_interval presumably mutates in place — keep the
        # original interval intact for the second resize. TODO confirm.
        target_interval = resize_interval(deepcopy(interval), self.peak_width)
        seq_interval = resize_interval(deepcopy(interval), self.seq_width)
        # task = self.dfm.iat[idx, 3]  # task
        # TODO - add data augmentation
        sequence = self.fasta_extractor([seq_interval])[0]
        # NOTE(review): profile extraction disabled — targets returned empty.
        # cuts = {f"profile/{task}": run_extractors(self.bw_extractors[task],
        #                                           [target_interval],
        #                                           ignore_strand=spec.ignore_strand)[0]
        #         for task, spec in self.ds.task_specs.items()}
        cuts = {}
        task = ''
        return {"inputs": sequence,
                "targets": cuts,
                "metadata": {"range": GenomicRanges(target_interval.chrom,
                                                    target_interval.start,
                                                    target_interval.stop,
                                                    idx),
                             "interval_from_task": task}}
class Ds(Dataset):
    """Minimal benchmarking dataset that returns a constant tensor per item.

    The real fasta-extraction path is stubbed out so that DataLoader /
    multiprocessing overhead can be measured in isolation from I/O.
    """

    def __init__(self, ds, seq_width=1000, intervals_file=None):
        """Load the data specification and the interval table.

        Args:
            ds (basepair.src.schemas.DataSpec or str): data specification
                containing the fasta file path; a str is treated as a path
                and loaded via DataSpec.load
            seq_width (int): width the intervals would be resized to
                (unused by the stubbed __getitem__)
            intervals_file (str): tsv of labelled intervals read by TsvReader
        """
        if isinstance(ds, str):
            self.ds = DataSpec.load(ds)
        else:
            self.ds = ds
        # BUGFIX: intervals_file was never stored, so get_targets() raised
        # AttributeError on self.intervals_file instead of working.
        self.intervals_file = intervals_file
        self.tsv = TsvReader(intervals_file,
                             num_chr=False,
                             label_dtype=int,
                             mask_ambigous=-1
                             )
        self.seq_width = seq_width
        self.dfm = self.tsv.df  # use the data-frame from tsv
        # Ensure the chromosome column is str (it can parse as int, e.g. "1").
        self.dfm[0] = self.dfm[0].astype(str)
        self.fasta_extractor = None

    def __len__(self):
        # Number of intervals in the tsv.
        return len(self.dfm)

    def get_targets(self):
        """Return the label matrix ('targets') parsed from the intervals tsv."""
        assert self.intervals_file is not None
        return self.tsv.get_targets()

    def __getitem__(self, idx):
        # Constant payload: isolates data-loader overhead from real extraction.
        return torch.ones((10000, 10))
from torch.utils.data import DataLoader
%tqdm_restart
import torch
intervals_file = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf/1kb.osnk.tsv.gz'
dataspec = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml'
# NOTE(review): the positional 10000 is peak_width (signature is
# ds, peak_width, seq_width, ...) — confirm that was intended.
train = StrandedProfile(dataspec, 10000, intervals_file=intervals_file)
it = train.batch_iter(batch_size=32, num_workers=12)
from kipoi.data_utils import numpy_collate
# Compare against a raw torch DataLoader over the same dataset.
dl = DataLoader(train, num_workers=12, batch_size=32)
it = iter(dl)
next(it)
np.array("asd")
from basepair.data import to_numpy
# Pull 10k batches to profile throughput / memory growth.
for i in tqdm(range(10000)):
    # if i % 1000 == 0:
    #     gc.collect()
    batch = next(it)
    # o = to_numpybatch)
    # a= batch['inputs'].numpy()
    # del batch
del train
# NOTE(review): the remaining cells reference objects after `del`
# (train, it) — out-of-order notebook execution; they fail if run
# top-to-bottom.
it = train.batch_train_iter(batch_size=32, num_workers=6)
del it
for i in tqdm(range(1000)):
    next(it)
import torch