# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.extractors import FastaFile
from genomelake.extractors import FastaExtractor
from pybedtools import BedTool
from basepair.plot.utils import seqlogo_clean
from basepair.modisco.utils import ic_scale
from basepair.exp.paper.config import motifs
from basepair.imports import *
from basepair.extractors import StrandedBigWigExtractor
from basepair.utils import flatten
hv.extension('bokeh')
paper_config()
%matplotlib inline
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()
patterns = read_pkl(modisco_dir / 'patterns.pkl')
pdict = {shorten_pattern(p.name): p for p in patterns}
te_df = pd.read_csv(modisco_dir / 'motif_clustering/patterns_long.csv')
tes = [pdict[pn] for pn in te_df.pattern]
phastCons = StrandedBigWigExtractor('/users/avsec//oak-nfs/anno/mm10/conservation/phastcons/mm10.60way.phastCons.bw', use_strand=True)
phyloP = StrandedBigWigExtractor('/users/avsec//oak-nfs/anno/mm10/conservation/phylop/mm10.60way.phyloP60way.bw', use_strand=True)
g = '/mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes'
from basepair.preproc import resize_interval
# Extract the conservation
scores = {}
wide_scores = {}
resized_scores = {}
for p in tqdm(patterns):
bt = BedTool(str(modisco_dir / f'seqlets/{p.name}.bedgz'))
bt = BedTool.from_dataframe(bt.to_dataframe().drop_duplicates())
bt_wide = [resize_interval(i, 1000) for i in bt]
bt_resized_100 = [resize_interval(i, 100) for i in bt]
scores[p.name] = dict(
phastCons = phastCons.extract(list(bt)),
phyloP = phyloP.extract(list(bt))
)
wide_scores[p.name] = dict(
phastCons = phastCons.extract(list(bt_wide)),
phyloP = phyloP.extract(list(bt_wide))
)
resized_scores[p.name] = dict(
phastCons = phastCons.extract(list(bt_resized_100)),
phyloP = phyloP.extract(list(bt_resized_100))
)
def te_stat(p, scores, resized_scores, wide_scores):
return flatten({"cons": {k:v.mean(0).mean(0) for k,v in scores.items()},
"cons_100bp": {k:v.mean(0).mean(0) for k,v in resized_scores.items()},
"cons_1kb": {k:v.mean(0).mean(0) for k,v in wide_scores.items()},
"ic": p.seq_info_content,
"name": p.name,
})
def te_plot(p, scores):
plot_tracks(flatten({"cons": {k:v.mean(0) for k,v in scores.items()},
"contrib": p.contrib, "ic": p.get_seq_ic()}),
title=p.name, rotate_y=0, fig_height_per_track=.5, fig_width=10)
dft = pd.DataFrame([te_stat(p, scores[p.name], resized_scores[p.name], wide_scores[p.name]) for p in patterns])
!mkdir -p {ddir}/figures/modisco/TE/
fig = plt.figure(figsize=get_figsize(1, aspect=1/3))
plt.subplot(131)
dft.plot.scatter("ic", "cons_phastCons", alpha=0.7, ax=plt.gca());
plt.subplot(132)
dft.plot.scatter("ic", "cons_phyloP", alpha=0.7, ax=plt.gca());
plt.subplot(133)
dft.plot.scatter("cons_phastCons", "cons_phyloP", alpha=0.7, ax=plt.gca());
plt.tight_layout()
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.pdf")
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.png")
fig = plt.figure(figsize=get_figsize(1, aspect=1/3))
plt.subplot(131)
dft.plot.scatter("ic", "cons_100bp_phastCons", alpha=0.7, ax=plt.gca());
plt.subplot(132)
dft.plot.scatter("ic", "cons_100bp_phyloP", alpha=0.7, ax=plt.gca());
plt.subplot(133)
dft.plot.scatter("cons_100bp_phastCons", "cons_100bp_phyloP", alpha=0.7, ax=plt.gca());
plt.tight_layout()
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.100bp.pdf")
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.100bp.png")
fig = plt.figure(figsize=get_figsize(1, aspect=1/3))
plt.subplot(131)
dft.plot.scatter("ic", "cons_1kb_phastCons", alpha=0.7, ax=plt.gca());
plt.subplot(132)
dft.plot.scatter("ic", "cons_1kb_phyloP", alpha=0.7, ax=plt.gca());
plt.subplot(133)
dft.plot.scatter("cons_1kb_phastCons", "cons_1kb_phyloP", alpha=0.7, ax=plt.gca());
plt.tight_layout()
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.1kb.pdf")
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.1kb.png")
fig = plt.figure(figsize=get_figsize(1, aspect=1/3))
plt.subplot(131)
dft.plot.scatter("cons_phastCons", "cons_100bp_phastCons", alpha=0.7, ax=plt.gca());
plt.subplot(132)
dft.plot.scatter("cons_100bp_phastCons", "cons_1kb_phastCons", alpha=0.7, ax=plt.gca());
plt.subplot(133)
dft.plot.scatter("cons_phastCons", "cons_1kb_phastCons", alpha=0.7, ax=plt.gca());
plt.tight_layout()
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.phastCons-context.pdf")
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.phastCons-context.png")
fig = plt.figure(figsize=get_figsize(1, aspect=1/3))
plt.subplot(131)
dft.plot.scatter("cons_phyloP", "cons_100bp_phyloP", alpha=0.7, ax=plt.gca());
plt.subplot(132)
dft.plot.scatter("cons_100bp_phyloP", "cons_1kb_phyloP", alpha=0.7, ax=plt.gca());
plt.subplot(133)
dft.plot.scatter("cons_phyloP", "cons_1kb_phyloP", alpha=0.7, ax=plt.gca());
plt.tight_layout()
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.phyloP-context.pdf")
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.phyloP-context.png")
def te_plot(p, scores, wide_scores):
plot_tracks(flatten({"cons": {k:(v - wide_scores[k].mean(1, keepdims=True)).mean(0) for k,v in scores.items()},
"contrib": p.contrib, "ic": p.get_seq_ic()}),
title=p.name, rotate_y=0, fig_height_per_track=.8,
ylim=[[-.1, .1],
[-.2, .2]] + [[0, .175]]*4 + [[0, 2]],
fig_width=14)
te_plot(p, scores[p.name], wide_scores[p.name])
for p in tes:
te_plot(p, scores[p.name], wide_scores[p.name])