ImpScoreFile and StackedSeqletImpseq_width and profile_width arguments to ImpScoreFile.extract()-
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
from basepair.cli.imp_score import ImpScoreFile
from basepair.modisco.core import StackedSeqletImp
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
# create_tf_session(0)
%time patterns = read_pkl(modisco_dir / 'patterns.pkl')
mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()
# Cache
imp_file.cache()
p = patterns[0]
seqlets = mr._get_seqlets(p.name)
p_orig = mr.get_pattern(p.name)
imp_file.extract(seqlets).aggregate().plot('seq_ic');
p_orig.plot('seq_ic');
p2 = imp_file.extract([s.pattern_align(**p.attrs['align']) for s in valid_seqlets], profile_width=200).aggregate()
# shift the original pattern
po = p_orig.copy()
if p.attrs['align']['use_rc']:
po = po.rc()
po.shift(p.attrs['align']['offset']).plot('seq_ic');
p2.plot('seq_ic');
p.plot('seq_ic');
profile_width = 200
valid_seqlets = [s for s in seqlets if s.valid_resize(profile_width, imp_file.get_seqlen())]
print(len(valid_seqlets), "/", len(seqlets), "seqlets were valid")
p2.plot('profile');
p.plot('profile');
imp_file.extract([s.pattern_align(**p.attrs['align']) for s in valid_seqlets], profile_width=200).aggregate()
patterns = read_pkl(modisco_dir / 'patterns.pkl')
mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()
p = patterns[0]
valid_seqlets = [s for s in mr._get_seqlets(p.name)
if s.valid_resize(profile_width+ 2, imp_file.get_seqlen())]
%tqdm_restart
extended_patterns = []
for p in tqdm(patterns):
p = p.copy()
profile_width = p.len_profile()
# get the shifted seqlets
seqlets = [s.pattern_align(**p.attrs['align']) for s in mr._get_seqlets(p.name) ]
# keep only valid seqlets
valid_seqlets = [s for s in seqlets
if s.valid_resize(profile_width, imp_file.get_seqlen() + 1)]
# extract the importance scores
p.attrs['stacked_seqlet_imp'] = imp_file.extract(valid_seqlets, profile_width=profile_width)
extended_patterns.append(p)
!ls -latr {output_dir}
ls {output_dir}
write_pkl(extended_patterns, modisco_dir / 'patterns.pkl')
!du -sh {modisco_dir}/*
extended_patterns[0].plot('seq_ic');
extended_patterns[0].attrs['stacked_seqlet_imp'].aggregate().plot('seq_ic');