In [6]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.extractors import FastaFile
from genomelake.extractors import FastaExtractor
from pybedtools import BedTool
from basepair.plot.utils import seqlogo_clean
from basepair.modisco.utils import ic_scale
from basepair.exp.paper.config import motifs
from basepair.imports import *
hv.extension('bokeh')
In [7]:
%matplotlib inline
In [8]:
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
In [9]:
mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()
In [10]:
motifs
Out[10]:
OrderedDict([('Oct4-Sox2', 'm0_p0'),
             ('Sox2', 'm0_p1'),
             ('Nanog', 'm2_p0'),
             ('Klf4', 'm1_p0')])
In [11]:
ref_fa = FastaExtractor('/mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta', use_strand=True)
In [12]:
ds = DataSpec.load(model_dir / 'dataspec.yaml')
In [13]:
bt = BedTool(str(modisco_dir / 'seqlets/metacluster_0/pattern_1.bedgz'))
bt_wide = bt.slop(l=50, r=50, s=True, g='/mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes')
bt_wide = BedTool.from_dataframe(bt_wide.to_dataframe().drop_duplicates())
In [223]:
counts = ds.task_specs['Sox2'].load_counts(list(bt_wide))
sort_idx = np.argsort(-counts.max(1).max(1))
In [224]:
sort_idx = np.argsort(-counts.sum(1).sum(1))
In [225]:
strands = np.array([i.strand == '+' for i in bt_wide])
pos_counts = counts[strands]
neg_counts = counts[~strands]

pos_sort_idx = np.argsort(-pos_counts.max(1).max(1))
neg_sort_idx = np.argsort(-neg_counts.max(1).max(1))
In [226]:
from basepair.plot.heatmaps import *
In [ ]:
heatmap_stranded_profile(counts[sort_idx][:1000], aspect=.1, figsize=(15,15), pmin=50, pmax=99);
In [ ]:
heatmap_stranded_profile(pos_counts[pos_sort_idx][:500], aspect=.1, figsize=(10,10), pmin=50, pmax=99);
heatmap_stranded_profile(neg_counts[neg_sort_idx][:500], aspect=.1, figsize=(10,10), pmin=50, pmax=99);
In [64]:
seqs = ref_fa(list(bt))
strands = np.array([i.strand == '+' for i in bt])
In [67]:
seqlogo_clean(ic_scale(seqs.mean(0)))
Out[67]:
In [68]:
seqlogo_clean(ic_scale(seqs[strands].mean(0)))
Out[68]:
In [69]:
seqlogo_clean(ic_scale(seqs[~strands].mean(0)))
Out[69]: