Goal¶

debug model

# Imports
from basepair.imports import *
hv.extension('bokeh')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()

Using TensorFlow backend.
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/concise/utils/plot.py:115: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  min_coords = np.vstack(data.min(0) for data in polygons_data).min(0)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/concise/utils/plot.py:116: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  max_coords = np.vstack(data.max(0) for data in polygons_data).max(0)

# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
output_dir = modisco_dir

# create_tf_session(0)

from basepair.datasets import get_gw_StrandedProfile_datasets

get_gw_StrandedProfile_datasets??

Signature:
get_gw_StrandedProfile_datasets(
    ['dataspec', 'intervals_file=None', 'peak_width=200', 'seq_width=None', 'shuffle=True', 'target_transformer=<basepair.preproc.AppendCounts object at 0x7f7fd3702c88>', 'include_metadata=False', "valid_chr=['chr2', 'chr3', 'chr4']", "test_chr=['chr1', 'chr8', 'chr9']", 'exclude_chr=[]', 'vmtouch=True'],
)
Docstring: <no docstring>
Source:   
@gin.configurable
def get_gw_StrandedProfile_datasets(dataspec,
                                    intervals_file=None,
                                    peak_width=200,
                                    seq_width=None,
                                    shuffle=True,
                                    target_transformer=AppendCounts(),
                                    include_metadata=False,
                                    valid_chr=['chr2', 'chr3', 'chr4'],
                                    test_chr=['chr1', 'chr8', 'chr9'],
                                    exclude_chr=[],
                                    vmtouch=True):
    from basepair.metrics import BPNetMetric, PeakPredictionProfileMetric, pearson_spearman
    # test and valid shouldn't be in the valid or test sets
    for vc in valid_chr:
        assert vc not in exclude_chr
    for vc in test_chr:
        assert vc not in exclude_chr
        
    dataspec = DataSpec.load(dataspec)
    if vmtouch:
        # use vmtouch to load all file to memory
        dataspec.touch_all_files()
        
    tasks = list(dataspec.task_specs)
        
    nonprofile_metric = BPNetMetric(tasks=list(dataspec.task_specs),
                                    count_metric=pearson_spearman,
                                    profile_metric=None)
    
    return (StrandedProfile(dataspec, peak_width,
                            seq_width=seq_width,
                            intervals_file=intervals_file,
                            include_metadata=include_metadata,
                            excl_chromosomes=valid_chr + test_chr + exclude_chr,
                            shuffle=shuffle, target_transformer=target_transformer),
            [('valid-genome-wide', 
              StrandedProfile(dataspec, peak_width,
                              seq_width=seq_width,
                              intervals_file=intervals_file,
                              include_metadata=include_metadata,
                              incl_chromosomes=valid_chr,
                              shuffle=shuffle, target_transformer=target_transformer),
              nonprofile_metric),
             ('valid-peaks', StrandedProfile(dataspec, peak_width,
                                            seq_width=seq_width,
                                            intervals_file=None,
                                            include_metadata=include_metadata,
                                            incl_chromosomes=valid_chr,
                                            shuffle=shuffle, target_transformer=target_transformer)),
             ('train-peaks', StrandedProfile(dataspec, peak_width,
                                            seq_width=seq_width,
                                            intervals_file=None,
                                             include_metadata=include_metadata,
                                            excl_chromosomes=valid_chr + test_chr + exclude_chr,
                                            shuffle=shuffle, target_transformer=target_transformer)),
             # use the default metric for the peak sets
            ])
File:      ~/workspace/basepair/basepair/datasets.py
Type:      function

train,valid = get_gw_StrandedProfile_datasets(
    dataspec = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml',
    intervals_file = '/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/genomewide/oct-sox-nanog-klf/1kb.osnk.tsv.gz',
    peak_width = 1000,
    include_metadata = True,
    exclude_chr = ['chrX', 'chrY'])

len(train)

1514227

df = train.dfm

pos_labels = train.get_targets().max(axis=1)

pos_labels.sum()

45357

pos_labels.mean()

0.02995389726903562

!zcat {train.intervals_file} | grep '^chr10' | head

chr10	10000	11000	0	0	0	0
chr10	11000	12000	0	0	0	0
chr10	12000	13000	0	0	0	0
chr10	13000	14000	0	0	0	0
chr10	14000	15000	0	0	0	0
chr10	15000	16000	0	0	0	0
chr10	16000	17000	0	0	0	0
chr10	17000	18000	0	0	0	0
chr10	18000	19000	0	0	0	0
chr10	19000	20000	0	0	0	0
grep: write error: Broken pipe

gzip: stdout: Broken pipe

list(train.ds.task_specs)

['Oct4', 'Sox2', 'Nanog', 'Klf4']

df.columns = ['chrom', 'start', 'end', 'Oct4', 'Sox2', 'Nanog', 'Klf4']

df.chrom = df.chrom.astype(str)

df[df.astype(str) == 'chr1']

train[1]['metadata']

{'range': GenomicRanges(chr='chr13', start=111184000, end=111185000, id=1, strand='*'),
 'interval_from_task': 0}

train.get_targets().max(axis=1).max()

1

train.dfm.iloc[train.get_targets().max(axis=1) == 1]

df.head()

df.sort_index().head()

df.sort_values([0,1])

df.head()

	0	1	2	3	4	5	6
456738	chr12	8620000	8621000	1	0	0	0
430255	chr11	104195000	104196000	0	0	0	1
679799	chr13	111580000	111581000	0	0	0	1
...	...	...	...	...	...	...	...
715478	chr14	26880000	26881000	1	0	0	0
993815	chr16	76319000	76320000	1	0	0	1
1007892	chr16	90396000	90397000	0	0	1	0

	chrom	start	end
1973242	chr6	60148000	60149000
679403	chr13	111184000	111185000
527383	chr12	79265000	79266000
2116323	chr7	53516000	53517000
2034393	chr6	121301000	121302000

	0	1	2	3	4	5	6
195435	chr10	10000	11000	0	0	0	0
195436	chr10	11000	12000	0	0	0	0
195437	chr10	12000	13000	0	0	0	0
...	...	...	...	...	...	...	...
2208235	chr7	145428000	145429000	0	0	0	0
2208236	chr7	145429000	145430000	0	0	0	0
2208237	chr7	145430000	145431000	0	0	0	0

	0	1	2
1973242	chr6	60148000	60149000
679403	chr13	111184000	111185000
527383	chr12	79265000	79266000
2116323	chr7	53516000	53517000
2034393	chr6	121301000	121302000