Goal

  • test ImpScoreFile and StackedSeqletImp

Tasks

  • [x] add seq_width and profile_width arguments to ImpScoreFile.extract()
  • [x] run the extraction function for one pattern
    • [X] check that the exact footprint is at the right location
  • [X] run the extraction for the shifted patern
    • [X] check that the exact footprint is at the right location
  • [x] write a CLI function to run the extraction and enrich the patterns
  • [x] run this extraction for all the patterns

Required files

-

In [104]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
In [191]:
from basepair.cli.imp_score import ImpScoreFile
from basepair.modisco.core import StackedSeqletImp
In [3]:
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
In [4]:
# create_tf_session(0)
In [343]:
%time patterns = read_pkl(modisco_dir / 'patterns.pkl')
CPU times: user 3.54 s, sys: 1.82 s, total: 5.37 s
Wall time: 5.37 s
In [6]:
mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()
In [19]:
# Cache
imp_file.cache()
Out[19]:
<basepair.cli.imp_score.ImpScoreFile at 0x7f15b3e69240>
In [248]:
p = patterns[0]

seqlets = mr._get_seqlets(p.name)

p_orig = mr.get_pattern(p.name)

Original

In [249]:
imp_file.extract(seqlets).aggregate().plot('seq_ic');
In [250]:
p_orig.plot('seq_ic');

Shift the seqlets

In [259]:
p2 = imp_file.extract([s.pattern_align(**p.attrs['align']) for s in valid_seqlets], profile_width=200).aggregate()
In [260]:
# shift the original pattern
po = p_orig.copy()
if p.attrs['align']['use_rc']:
    po = po.rc()
po.shift(p.attrs['align']['offset']).plot('seq_ic');
In [263]:
p2.plot('seq_ic');
In [264]:
p.plot('seq_ic');

Profile

In [265]:
profile_width = 200
In [266]:
valid_seqlets = [s for s in seqlets if s.valid_resize(profile_width, imp_file.get_seqlen())]
In [267]:
print(len(valid_seqlets), "/", len(seqlets), "seqlets were valid")
156 / 157 seqlets were valid
In [268]:
p2.plot('profile');
In [269]:
p.plot('profile');
In [ ]:
imp_file.extract([s.pattern_align(**p.attrs['align']) for s in valid_seqlets], profile_width=200).aggregate()
In [5]:
patterns = read_pkl(modisco_dir / 'patterns.pkl')

mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()
In [270]:
p = patterns[0]
In [311]:
valid_seqlets = [s for s in mr._get_seqlets(p.name) 
                     if s.valid_resize(profile_width+ 2, imp_file.get_seqlen())]
In [316]:
%tqdm_restart
In [334]:
extended_patterns = []
for p in tqdm(patterns):
    p = p.copy()
    profile_width = p.len_profile()
    # get the shifted seqlets
    seqlets = [s.pattern_align(**p.attrs['align']) for s in mr._get_seqlets(p.name) ]
    
    # keep only valid seqlets
    valid_seqlets = [s for s in seqlets
                     if s.valid_resize(profile_width, imp_file.get_seqlen() + 1)]
    # extract the importance scores
    p.attrs['stacked_seqlet_imp'] = imp_file.extract(valid_seqlets, profile_width=profile_width)
    extended_patterns.append(p)
100%|██████████| 122/122 [00:32<00:00,  3.72it/s]
In [335]:
!ls -latr {output_dir}
total 99788
drwxrwxr-x 11 avsec kundaje     4096 Dec 10 10:31 plots
-rw-rw-r--  1 avsec kundaje 36998707 Dec 10 12:38 results.ipynb
-rw-rw-r--  1 avsec kundaje 12478221 Dec 10 12:38 results.html
-rw-rw-r--  1 avsec kundaje  1117156 Dec 10 14:10 footprints.pkl
-rw-rw-r--  1 avsec kundaje 21016986 Dec 10 15:14 centroid_seqlet_matches.csv
drwxrwxr-x 11 avsec kundaje     4096 Dec 11 11:05 seqlets
drwxrwxr-x  6 avsec kundaje     4096 Dec 13 07:04 ..
-rw-rw-r--  1 avsec kundaje   128070 Dec 13 11:23 pattern_table.clustered.csv
drwxrwxr-x  2 avsec kundaje     4096 Dec 13 11:29 motif_clustering
-rw-rw-r--  1 avsec kundaje  5987056 Dec 13 17:12 patterns.pkl
-rw-rw-r--  1 avsec kundaje   128051 Dec 13 17:15 pattern_table.csv
-rw-rw-r--  1 avsec kundaje 24293324 Dec 13 17:15 pattern_table.html
drwxrwxr-x  5 avsec kundaje     4096 Dec 16 05:35 .
In [329]:
ls {output_dir}
ls: cannot access '{output_dir}': No such file or directory
In [339]:
write_pkl(extended_patterns, modisco_dir / 'patterns.pkl')
In [342]:
!du -sh {modisco_dir}/*
21M	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/centroid_seqlet_matches.csv
64K	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/figures
1.1M	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/footprints.pkl
4.0K	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/hparams.yaml
11G	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/instances.parq
4.0K	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/kwargs.json
228K	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/log
99M	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/modisco.h5
46M	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/motif_clustering
128K	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/pattern_table.clustered.csv
128K	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/pattern_table.csv
24M	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/pattern_table.html
1.3G	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/patterns.pkl
580M	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/plots
12M	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/results.html
36M	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/results.ipynb
6.6M	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/seqlets
In [336]:
extended_patterns[0].plot('seq_ic');
extended_patterns[0].attrs['stacked_seqlet_imp'].aggregate().plot('seq_ic');