Goal

-

Tasks

  • [ ]

Required files

-

In [1]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
Using TensorFlow backend.
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/concise/utils/plot.py:115: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  min_coords = np.vstack(data.min(0) for data in polygons_data).min(0)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/concise/utils/plot.py:116: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  max_coords = np.vstack(data.max(0) for data in polygons_data).max(0)
In [2]:
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")
In [ ]:
# create_tf_session(0)
In [6]:
from basepair.datasets import get_gw_StrandedProfile_datasets, StrandedProfile
In [4]:
dataspec_path = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml'
intervals_file = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf/1kb.osnk.tsv.gz'
In [9]:
ds = StrandedProfile(dataspec_path, peak_width=1000, seq_width=10000, intervals_file=intervals_file, shuffle=False)
In [11]:
from basepair.models import seq_bpnet_cropped_extra_seqlen
In [13]:
seq_bpnet_cropped_extra_seqlen(conv1_kernel_size=21, n_dil_layers=9, tconv_kernel_size=25)
Out[13]:
2088
In [10]:
ds[0]
---------------------------------------------------------------------------
OverflowError                             Traceback (most recent call last)
<ipython-input-10-d4162fd57828> in <module>
----> 1 ds[0]

~/workspace/basepair/basepair/datasets.py in __getitem__(self, idx)
    471                             self.dfm.iat[idx, 2])  # end
    472         target_interval = resize_interval(deepcopy(interval), self.peak_width)
--> 473         seq_interval = resize_interval(deepcopy(interval), self.seq_width)
    474         task = self.dfm.iat[idx, 3]  # task
    475         # TODO - add data augmentation

~/workspace/basepair/basepair/preproc.py in resize_interval(interval, width)
    185                                                      interval.name,
    186                                                      interval.score,
--> 187                                                      interval.strand])
    188     else:
    189         interval.start = start

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pybedtools/cbedtools.pyx in pybedtools.cbedtools.create_interval_from_list (pybedtools/cbedtools.cxx:10027)()

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pybedtools/cbedtools.pyx in pybedtools.cbedtools.create_interval_from_list (pybedtools/cbedtools.cxx:9360)()

OverflowError: can't convert negative value to CHRPOS
In [13]:
for i in tqdm(range(len(ds))):
    a = ds[i]
  4%|▎         | 101705/2725346 [03:04<1:17:06, 567.13it/s]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-5323b7a46300> in <module>
      1 for i in tqdm(range(len(ds))):
----> 2     a = ds[i]

~/workspace/basepair/basepair/datasets.py in __getitem__(self, idx)
    475         # TODO - add data augmentation
    476 
--> 477         sequence = self.fasta_extractor([seq_interval])[0]
    478         cuts = {f"profile/{task}": run_extractors(self.bw_extractors[task],
    479                                                   [target_interval],

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/genomelake/extractors.py in __call__(self, intervals, out, **kwargs)
     24     def __call__(self, intervals, out=None, **kwargs):
     25         data = self._check_or_create_output_array(intervals, out)
---> 26         self._extract(intervals, data, **kwargs)
     27         return data
     28 

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/genomelake/extractors.py in _extract(self, intervals, out, **kwargs)
     92             seq = self.fasta.fetch(str(interval.chrom), interval.start,
     93                                        interval.stop)
---> 94             one_hot_encode_sequence(seq, out[index, :, :])
     95 
     96             # reverse-complement seq the negative strand

genomelake/util.pyx in genomelake.util.one_hot_encode_sequence (genomelake/util.c:2179)()

genomelake/util.pyx in genomelake.util.one_hot_encode_sequence (genomelake/util.c:1910)()

ValueError: encoded array not the same length as given seq
In [15]:
%debug
> /users/avsec/workspace/basepair/src/chipnexus/train/genome-wide/genomelake/util.pyx(37)genomelake.util.one_hot_encode_sequence (genomelake/util.c:1910)()

> /users/avsec/workspace/basepair/src/chipnexus/train/genome-wide/genomelake/util.pyx(32)genomelake.util.one_hot_encode_sequence (genomelake/util.c:2179)()

> /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/genomelake/extractors.py(94)_extract()
     92             seq = self.fasta.fetch(str(interval.chrom), interval.start,
     93                                        interval.stop)
---> 94             one_hot_encode_sequence(seq, out[index, :, :])
     95 
     96             # reverse-complement seq the negative strand

> /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/genomelake/extractors.py(26)__call__()
     24     def __call__(self, intervals, out=None, **kwargs):
     25         data = self._check_or_create_output_array(intervals, out)
---> 26         self._extract(intervals, data, **kwargs)
     27         return data
     28 

> /users/avsec/workspace/basepair/basepair/datasets.py(477)__getitem__()
    475         # TODO - add data augmentation
    476 
--> 477         sequence = self.fasta_extractor([seq_interval])[0]
    478         cuts = {f"profile/{task}": run_extractors(self.bw_extractors[task],
    479                                                   [target_interval],

Interval(chr11:122081771-122082771)
True
False
In [14]:
ds[i]
[autoreload of basepair.datasets failed: Traceback (most recent call last):
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 244, in check
    superreload(m, reload, self.old_objects)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 376, in superreload
    module = reload(module)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/imp.py", line 315, in reload
    return importlib.reload(module)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/importlib/__init__.py", line 166, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 618, in _exec
  File "<frozen importlib._bootstrap_external>", line 678, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/users/avsec/workspace/basepair/basepair/datasets.py", line 83, in <module>
    test_chr=test_chr):
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/gin/config.py", line 1129, in configurable
    return perform_decoration(decoration_target)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/gin/config.py", line 1126, in perform_decoration
    return _make_configurable(fn_or_cls, name, module, whitelist, blacklist)
ValueError: A configurable matching 'basepair.datasets.chip_exo_nexus' already exists.
]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-14-c48b138b8749> in <module>
----> 1 ds[i]

~/workspace/basepair/basepair/datasets.py in __getitem__(self, idx)
    475         # TODO - add data augmentation
    476 
--> 477         sequence = self.fasta_extractor([seq_interval])[0]
    478         cuts = {f"profile/{task}": run_extractors(self.bw_extractors[task],
    479                                                   [target_interval],

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/genomelake/extractors.py in __call__(self, intervals, out, **kwargs)
     24     def __call__(self, intervals, out=None, **kwargs):
     25         data = self._check_or_create_output_array(intervals, out)
---> 26         self._extract(intervals, data, **kwargs)
     27         return data
     28 

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/genomelake/extractors.py in _extract(self, intervals, out, **kwargs)
     92             seq = self.fasta.fetch(str(interval.chrom), interval.start,
     93                                        interval.stop)
---> 94             one_hot_encode_sequence(seq, out[index, :, :])
     95 
     96             # reverse-complement seq the negative strand

genomelake/util.pyx in genomelake.util.one_hot_encode_sequence (genomelake/util.c:2179)()

genomelake/util.pyx in genomelake.util.one_hot_encode_sequence (genomelake/util.c:1910)()

ValueError: encoded array not the same length as given seq
  4%|▎         | 101705/2725346 [03:20<1:17:06, 567.13it/s]
In [ ]:
train, valid = get_gw_StrandedProfile_datasets('/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml',
                                               intervals_file='/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf/1kb.osnk.tsv.gz'
                                           
                                          )
In [ ]:
train.data = @get_gw_StrandedProfile_datasets()  # use the default train and valid chromosomes
get_gw_StrandedProfile_datasets.dataspec = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml'
# use genome-wide training
get_gw_StrandedProfile_datasets.intervals_file = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf/1kb.osnk.tsv.gz'
get_gw_StrandedProfile_datasets.peak_width = 1000
get_gw_StrandedProfile_datasets.seq_width = 1000  # TODO - infer from the model
# get_gw_StrandedProfile_datasets.exclude_chr = ['chrX', 'chrY']