Three things to show for the long motifs
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from basepair.plot.profiles import multiple_plot_stranded_profile
from basepair.exp.paper.config import tf_colors
from basepair.plot.profiles import multiple_plot_stranded_profile
from basepair.exp.paper.config import tf_colors, tasks
from basepair.plot.table import render_mpl_table
from basepair.exp.paper.fig4 import *
from basepair.plot.tracks import tidy_motif_plot
paper_config()
sdir = Path('/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/')
model_dir = sdir / 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
mr_dict = [(task, ModiscoResult(model_dir / f"deeplift/{task}/out/profile/wn/modisco.h5"))
for task in tasks]
for k,v in mr_dict: v.open()
patterns_dict = [(task, read_pkl(model_dir / f"deeplift/{task}/out/profile/wn/patterns.pkl"))
for task in tasks]
footprints = {t: read_pkl(model_dir / f"deeplift/{t}/out/profile/wn/footprints.pkl")
for t in tasks}
mr = ModiscoResult(model_dir / f"deeplift/all/out/profile/wn/modisco.h5")
mr.open()
patterns = [mr.get_pattern(pname) for pname in mr.patterns()]
patterns = read_pkl(model_dir / f"deeplift/all/out/profile/wn/patterns.pkl")
def get_seqlets_bedtool(p):
return BedTool(str(model_dir / f"deeplift/all/out/profile/wn/seqlets/{p.name}.bed.gz"))
patterns2 = [p for p in out if p.attrs['n_seqlets'] > 100]
patterns_long_name = [p for p in patterns if p.attrs['features']['n seqlets'] > 100]
from basepair.exp.paper.repeat_masker import download_repeat_masker, read_repeat_masker, intersect_repeat_masker
from pybedtools import BedTool
dfrm = read_repeat_masker(download_repeat_masker())
from basepair.modisco.core import patterns_to_df
bt_te = BedTool.from_dataframe(dfrm)
dfi_raw = pd.concat(Parallel(n_jobs=20)(delayed(intersect_repeat_masker)(p.name, get_seqlets_bedtool(p), bt_te, f=0.9) for p in tqdm(patterns)))
dfi = dfi_raw.copy()
# Restrict only to LTR/
# dfi = dfi[dfi.repeat_family.str.startswith("LTR/")]
# Append some stats
unique_elements = dfi.groupby(['pattern_name']).interval.nunique()
dfiu = dfi[['pattern_name', 'n_pattern']].drop_duplicates().set_index('pattern_name').join(unique_elements)
dfiu['LTR_overlap_frac'] = dfiu.interval / dfiu.n_pattern
dfi = pd.merge(dfi, dfiu.reset_index()[['pattern_name', 'LTR_overlap_frac']], on='pattern_name', how='left')
dfi = dfi.drop_duplicates()
# Append properties
df = patterns_to_df(patterns_long_name, properties=['name', 'short_name', 'seq_info_content'])
df = df.set_index('name').join(dfiu).reset_index()
TE_min_seq_IC = 30
TE_min_LTR_Overlap_frac = 0.7
height = get_figsize(0.25)[0]
fig = sns.jointplot("LTR_overlap_frac", "seq_info_content", marginal_kws=dict(bins=15), s=5, data=df, height=height);
fig.ax_joint.axvline(TE_min_LTR_Overlap_frac, linestyle='--', color='grey', alpha=.4);
fig.ax_joint.axhline(TE_min_seq_IC, linestyle='--', color='grey', alpha=.4);
# fig.savefig(f"{figures}/TE-match,seq-IC.scatter.pdf")
# fig.savefig(f"{figures}/TE-match,seq-IC.scatter.png")
df['group'] = (df['seq_info_content'] > 30).map({False: 'short', True: 'long'})
plotnine.options.figure_size = get_figsize(0.25)
(ggplot(aes(x='LTR_overlap_frac', color='group'), data=df) +
geom_density() +
scale_color_brewer(type='qual', palette=3) +
theme_classic(base_size=10, base_family='Arial')
)
# TE's are in the top right
# pattern_te_names = df[(df.seq_info_content > TE_min_seq_IC) & (df.LTR_overlap_frac > TE_min_LTR_Overlap_frac)].sort_values('LTR_overlap_frac', ascending=False).name.unique()
# TE's are in the top right
pattern_te_names = df[(df.seq_info_content > TE_min_seq_IC)].sort_values('LTR_overlap_frac', ascending=False).name.unique()
len(pattern_te_names)
We can see that motif with high IC are frequently in the region of known TE's
dfi.repeat_name.value_counts()
dfi.repeat_family.value_counts()
dfi_top = dfi.drop_duplicates()
dfi_top['n_repeat_name'] = dfi_top.groupby(['pattern_name', 'repeat_name']).repeat_name.transform('size')
dfi_top['n_repeat_frac'] = dfi_top['n_repeat_name'] / dfi_top['n_pattern']
dfi_top1 = dfi_top.groupby(['pattern_name']).apply(lambda x: x.loc[x.n_repeat_frac.idxmax()])
del dfi_top1['chrom']
del dfi_top1['start']
del dfi_top1['end']
del dfi_top1['interval']
del dfi_top1['pattern_name']
len(patterns)
len(dfi_top1)
long_patterns_clustered = cluster_align_patterns([p for p in patterns_long_name if p.seq_info_content >= 30], n_clusters=len(patterns_long_name))
dft = dfi_top1.loc[[p.name for p in long_patterns_clustered]]
main_patterns = dft.groupby("repeat_name").LTR_overlap_frac.idxmax().values
main_patterns
for p in patterns:
if p.name not in main_patterns:
continue
ymax = max([v.max() for v in p.contrib.values()])
ymin = min([v.min() for v in p.contrib.values()])
ltr_name = dft.loc[p.name].repeat_name
ltr_frac = dft.loc[p.name].LTR_overlap_frac
p.plot(['seq_ic', 'contrib'],
rotate_y=0,
letter_width=0.1,
height=0.4,
ylim=[(0, 2)] + [(ymin, ymax)]*4,
title=f"{p.name}, LTR: {ltr_name}, any LTR frac: {ltr_frac:.2f}");
sns.despine(top=True, right=True, left=False, bottom=True)