Goal

  • debug model
In [1]:
# Imports
from basepair.imports import *
hv.extension('bokeh')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
Using TensorFlow backend.
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/concise/utils/plot.py:115: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  min_coords = np.vstack(data.min(0) for data in polygons_data).min(0)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/concise/utils/plot.py:116: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  max_coords = np.vstack(data.max(0) for data in polygons_data).max(0)
In [2]:
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
output_dir = modisco_dir
In [ ]:
# create_tf_session(0)
In [4]:
from basepair.datasets import get_gw_StrandedProfile_datasets
In [4]:
get_gw_StrandedProfile_datasets??
Signature:
get_gw_StrandedProfile_datasets(
    ['dataspec', 'intervals_file=None', 'peak_width=200', 'seq_width=None', 'shuffle=True', 'target_transformer=<basepair.preproc.AppendCounts object at 0x7f7fd3702c88>', 'include_metadata=False', "valid_chr=['chr2', 'chr3', 'chr4']", "test_chr=['chr1', 'chr8', 'chr9']", 'exclude_chr=[]', 'vmtouch=True'],
)
Docstring: <no docstring>
Source:   
@gin.configurable
def get_gw_StrandedProfile_datasets(dataspec,
                                    intervals_file=None,
                                    peak_width=200,
                                    seq_width=None,
                                    shuffle=True,
                                    target_transformer=AppendCounts(),
                                    include_metadata=False,
                                    valid_chr=['chr2', 'chr3', 'chr4'],
                                    test_chr=['chr1', 'chr8', 'chr9'],
                                    exclude_chr=[],
                                    vmtouch=True):
    from basepair.metrics import BPNetMetric, PeakPredictionProfileMetric, pearson_spearman
    # test and valid shouldn't be in the valid or test sets
    for vc in valid_chr:
        assert vc not in exclude_chr
    for vc in test_chr:
        assert vc not in exclude_chr
        
    dataspec = DataSpec.load(dataspec)
    if vmtouch:
        # use vmtouch to load all file to memory
        dataspec.touch_all_files()
        
    tasks = list(dataspec.task_specs)
        
    nonprofile_metric = BPNetMetric(tasks=list(dataspec.task_specs),
                                    count_metric=pearson_spearman,
                                    profile_metric=None)
    
    return (StrandedProfile(dataspec, peak_width,
                            seq_width=seq_width,
                            intervals_file=intervals_file,
                            include_metadata=include_metadata,
                            excl_chromosomes=valid_chr + test_chr + exclude_chr,
                            shuffle=shuffle, target_transformer=target_transformer),
            [('valid-genome-wide', 
              StrandedProfile(dataspec, peak_width,
                              seq_width=seq_width,
                              intervals_file=intervals_file,
                              include_metadata=include_metadata,
                              incl_chromosomes=valid_chr,
                              shuffle=shuffle, target_transformer=target_transformer),
              nonprofile_metric),
             ('valid-peaks', StrandedProfile(dataspec, peak_width,
                                            seq_width=seq_width,
                                            intervals_file=None,
                                            include_metadata=include_metadata,
                                            incl_chromosomes=valid_chr,
                                            shuffle=shuffle, target_transformer=target_transformer)),
             ('train-peaks', StrandedProfile(dataspec, peak_width,
                                            seq_width=seq_width,
                                            intervals_file=None,
                                             include_metadata=include_metadata,
                                            excl_chromosomes=valid_chr + test_chr + exclude_chr,
                                            shuffle=shuffle, target_transformer=target_transformer)),
             # use the default metric for the peak sets
            ])
File:      ~/workspace/basepair/basepair/datasets.py
Type:      function
In [5]:
train,valid = get_gw_StrandedProfile_datasets(
    dataspec = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml',
    intervals_file = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf/1kb.osnk.tsv.gz',
    peak_width = 1000,
    include_metadata = True,
    exclude_chr = ['chrX', 'chrY'])
In [6]:
len(train)
Out[6]:
1514227
In [7]:
df = train.dfm
In [12]:
pos_labels = train.get_targets().max(axis=1)
In [13]:
pos_labels.sum()
Out[13]:
45357
In [14]:
pos_labels.mean()
Out[14]:
0.02995389726903562
In [38]:
!zcat {train.intervals_file} | grep '^chr10' | head
chr10	10000	11000	0	0	0	0
chr10	11000	12000	0	0	0	0
chr10	12000	13000	0	0	0	0
chr10	13000	14000	0	0	0	0
chr10	14000	15000	0	0	0	0
chr10	15000	16000	0	0	0	0
chr10	16000	17000	0	0	0	0
chr10	17000	18000	0	0	0	0
chr10	18000	19000	0	0	0	0
chr10	19000	20000	0	0	0	0
grep: write error: Broken pipe

gzip: stdout: Broken pipe
In [19]:
list(train.ds.task_specs)
Out[19]:
['Oct4', 'Sox2', 'Nanog', 'Klf4']
In [34]:
df.columns = ['chrom', 'start', 'end', 'Oct4', 'Sox2', 'Nanog', 'Klf4']
In [36]:
df.chrom = df.chrom.astype(str)
In [33]:
df[df.astype(str) == 'chr1']
Out[33]:
0 1 2 3 4 5 6
In [46]:
train[1]['metadata']
Out[46]:
{'range': GenomicRanges(chr='chr13', start=111184000, end=111185000, id=1, strand='*'),
 'interval_from_task': 0}
In [52]:
train.get_targets().max(axis=1).max()
Out[52]:
1
In [6]:
train.dfm.iloc[train.get_targets().max(axis=1) == 1]
Out[6]:
0 1 2 3 4 5 6
456738 chr12 8620000 8621000 1 0 0 0
430255 chr11 104195000 104196000 0 0 0 1
679799 chr13 111580000 111581000 0 0 0 1
... ... ... ... ... ... ... ...
715478 chr14 26880000 26881000 1 0 0 0
993815 chr16 76319000 76320000 1 0 0 1
1007892 chr16 90396000 90397000 0 0 1 0

45357 rows × 7 columns

In [44]:
df.head()
Out[44]:
chrom start end Oct4 Sox2 Nanog Klf4
1973242 chr6 60148000 60149000 0 0 0 0
679403 chr13 111184000 111185000 0 0 0 0
527383 chr12 79265000 79266000 0 0 0 0
2116323 chr7 53516000 53517000 0 0 0 0
2034393 chr6 121301000 121302000 0 0 0 0
In [40]:
df.sort_index().head()
Out[40]:
chrom start end Oct4 Sox2 Nanog Klf4
195435 chr10 10000 11000 0 0 0 0
195436 chr10 11000 12000 0 0 0 0
195437 chr10 12000 13000 0 0 0 0
195438 chr10 13000 14000 0 0 0 0
195439 chr10 14000 15000 0 0 0 0
In [21]:
df.sort_values([0,1])
Out[21]:
0 1 2 3 4 5 6
195435 chr10 10000 11000 0 0 0 0
195436 chr10 11000 12000 0 0 0 0
195437 chr10 12000 13000 0 0 0 0
... ... ... ... ... ... ... ...
2208235 chr7 145428000 145429000 0 0 0 0
2208236 chr7 145429000 145430000 0 0 0 0
2208237 chr7 145430000 145431000 0 0 0 0

1514227 rows × 7 columns

In [8]:
df.head()
Out[8]:
0 1 2 3 4 5 6
1973242 chr6 60148000 60149000 0 0 0 0
679403 chr13 111184000 111185000 0 0 0 0
527383 chr12 79265000 79266000 0 0 0 0
2116323 chr7 53516000 53517000 0 0 0 0
2034393 chr6 121301000 121302000 0 0 0 0