Goal

  • check the conservation of transposable elements
In [25]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.extractors import FastaFile
from genomelake.extractors import FastaExtractor
from pybedtools import BedTool
from basepair.plot.utils import seqlogo_clean
from basepair.modisco.utils import ic_scale
from basepair.imports import *
from basepair.exp.paper.config import motifs
from basepair.extractors import StrandedBigWigExtractor
from basepair.utils import flatten
hv.extension('bokeh')
In [21]:
paper_config()
In [121]:
%matplotlib inline
In [8]:
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
In [9]:
mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()
In [14]:
patterns = read_pkl(modisco_dir / 'patterns.pkl')
pdict = {shorten_pattern(p.name): p for p in patterns}
te_df = pd.read_csv(modisco_dir / 'motif_clustering/patterns_long.csv')
tes = [pdict[pn] for pn in te_df.pattern]
In [19]:
phastCons = StrandedBigWigExtractor('/users/avsec//oak-nfs/anno/mm10/conservation/phastcons/mm10.60way.phastCons.bw', use_strand=True)
phyloP = StrandedBigWigExtractor('/users/avsec//oak-nfs/anno/mm10/conservation/phylop/mm10.60way.phyloP60way.bw', use_strand=True)
In [65]:
g = '/mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes'
In [88]:
from basepair.preproc import resize_interval
In [95]:
# Extract the conservation
scores = {}
wide_scores = {}
resized_scores = {}
for p in tqdm(patterns):
    bt = BedTool(str(modisco_dir / f'seqlets/{p.name}.bedgz'))
    bt = BedTool.from_dataframe(bt.to_dataframe().drop_duplicates())
    bt_wide = [resize_interval(i, 1000) for i in bt]
    bt_resized_100 = [resize_interval(i, 100) for i in bt]
    scores[p.name] = dict(
        phastCons = phastCons.extract(list(bt)),
        phyloP = phyloP.extract(list(bt))
    )
    wide_scores[p.name] = dict(
        phastCons = phastCons.extract(list(bt_wide)),
        phyloP = phyloP.extract(list(bt_wide))
    )
    resized_scores[p.name] = dict(
        phastCons = phastCons.extract(list(bt_resized_100)),
        phyloP = phyloP.extract(list(bt_resized_100))
    )
100%|██████████| 122/122 [01:32<00:00,  1.32it/s]
In [90]:
def te_stat(p, scores, resized_scores, wide_scores):
    return flatten({"cons": {k:v.mean(0).mean(0) for k,v in scores.items()}, 
                    "cons_100bp": {k:v.mean(0).mean(0) for k,v in resized_scores.items()}, 
                    "cons_1kb": {k:v.mean(0).mean(0) for k,v in wide_scores.items()}, 
                    "ic": p.seq_info_content,
                    "name": p.name,
                   })

def te_plot(p, scores):
    plot_tracks(flatten({"cons": {k:v.mean(0) for k,v in scores.items()}, 
                         "contrib": p.contrib, "ic": p.get_seq_ic()}),
               title=p.name, rotate_y=0, fig_height_per_track=.5, fig_width=10)
In [92]:
dft = pd.DataFrame([te_stat(p, scores[p.name], resized_scores[p.name], wide_scores[p.name]) for p in patterns])
In [35]:
!mkdir -p {ddir}/figures/modisco/TE/
In [102]:
fig = plt.figure(figsize=get_figsize(1, aspect=1/3))
plt.subplot(131)
dft.plot.scatter("ic", "cons_phastCons", alpha=0.7, ax=plt.gca());
plt.subplot(132)
dft.plot.scatter("ic", "cons_phyloP", alpha=0.7, ax=plt.gca());
plt.subplot(133)
dft.plot.scatter("cons_phastCons", "cons_phyloP", alpha=0.7, ax=plt.gca());
plt.tight_layout()
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.pdf")
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.png")
In [103]:
fig = plt.figure(figsize=get_figsize(1, aspect=1/3))
plt.subplot(131)
dft.plot.scatter("ic", "cons_100bp_phastCons", alpha=0.7, ax=plt.gca());
plt.subplot(132)
dft.plot.scatter("ic", "cons_100bp_phyloP", alpha=0.7, ax=plt.gca());
plt.subplot(133)
dft.plot.scatter("cons_100bp_phastCons", "cons_100bp_phyloP", alpha=0.7, ax=plt.gca());
plt.tight_layout()
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.100bp.pdf")
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.100bp.png")
In [104]:
fig = plt.figure(figsize=get_figsize(1, aspect=1/3))
plt.subplot(131)
dft.plot.scatter("ic", "cons_1kb_phastCons", alpha=0.7, ax=plt.gca());
plt.subplot(132)
dft.plot.scatter("ic", "cons_1kb_phyloP", alpha=0.7, ax=plt.gca());
plt.subplot(133)
dft.plot.scatter("cons_1kb_phastCons", "cons_1kb_phyloP", alpha=0.7, ax=plt.gca());
plt.tight_layout()
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.1kb.pdf")
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.1kb.png")
In [101]:
fig = plt.figure(figsize=get_figsize(1, aspect=1/3))
plt.subplot(131)
dft.plot.scatter("cons_phastCons", "cons_100bp_phastCons", alpha=0.7, ax=plt.gca());
plt.subplot(132)
dft.plot.scatter("cons_100bp_phastCons", "cons_1kb_phastCons", alpha=0.7, ax=plt.gca());
plt.subplot(133)
dft.plot.scatter("cons_phastCons", "cons_1kb_phastCons", alpha=0.7, ax=plt.gca());
plt.tight_layout()
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.phastCons-context.pdf")
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.phastCons-context.png")
In [100]:
fig = plt.figure(figsize=get_figsize(1, aspect=1/3))
plt.subplot(131)
dft.plot.scatter("cons_phyloP", "cons_100bp_phyloP", alpha=0.7, ax=plt.gca());
plt.subplot(132)
dft.plot.scatter("cons_100bp_phyloP", "cons_1kb_phyloP", alpha=0.7, ax=plt.gca());
plt.subplot(133)
dft.plot.scatter("cons_phyloP", "cons_1kb_phyloP", alpha=0.7, ax=plt.gca());
plt.tight_layout()
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.phyloP-context.pdf")
plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.phyloP-context.png")

TOOD

Relate per-base variation.

In [111]:
cs = 'phastCons'
In [114]:
scores[p.name][cs].reshape((-1))
scores[p.name][cs].reshape((-1))
Out[114]:
(25, 70)
In [117]:
a = p.attrs['stacked_seqlet_imp']
In [120]:
a.contrib['Klf4'].shape
Out[120]:
(63, 70, 4)

NOTE: you haven't de-duplicated the seqlets. You should match the per-base importance scores with conservation scores.

In [ ]:
dft = pd.DataFrame([te_stat(p, scores[p.name], resized_scores[p.name], wide_scores[p.name]) for p in patterns])
In [105]:
def te_plot(p, scores, wide_scores):
    plot_tracks(flatten({"cons": {k:(v - wide_scores[k].mean(1, keepdims=True)).mean(0) for k,v in scores.items()}, 
                         "contrib": p.contrib, "ic": p.get_seq_ic()}),
               title=p.name, rotate_y=0, fig_height_per_track=.8, 
                ylim=[[-.1, .1], 
                      [-.2, .2]] + [[0, .175]]*4 + [[0, 2]], 
                fig_width=14)
In [106]:
te_plot(p, scores[p.name], wide_scores[p.name])

Plot the conservation tracks per TE

In [107]:
for p in tes:
    te_plot(p, scores[p.name], wide_scores[p.name])