# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
figures = f"{ddir}/figures/modisco/TE/"
output_dir = modisco_dir
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc
from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable, vdom_footprint
from basepair.plot.utils import strip_axis
from basepair.plot.tracks import tidy_motif_plot
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *
modisco_dir = Path(modisco_dir)
output_dir = Path(output_dir)
!mkdir -p {figures}
paper_config()
patterns = read_pkl(modisco_dir / 'patterns.pkl')
pdict = {shorten_pattern(p.name): p for p in patterns}
te_df = pd.read_csv(modisco_dir / 'motif_clustering/patterns_long.csv')
tes = [pdict[pn] for pn in te_df.pattern]
df = patterns_to_df(patterns, properties=['name', 'short_name', 'seq_info_content'])
dfl = pd.DataFrame({"length": [len(p.trim_seq_ic(trim_frac=0.08)) for p in patterns],
"consensus": [p.trim_seq_ic(trim_frac=0.08).get_consensus() for p in patterns],
"n_seqlets": [len(p.attrs['stacked_seqlet_imp']) for p in patterns]
}, index=[p.name for p in patterns])
dfl.index.name = 'pattern'
from basepair.exp.te.repeat_masker import read_repeat_masker, wget_repeat_masker
repeat_masker_file = f"{ddir}/raw/annotation/mm10/RepeatMasker/mm10.fa.out.gz"
wget_repeat_masker(repeat_masker_file)
dfrm = read_repeat_masker(repeat_masker_file)
dfrm.head()
from pybedtools import BedTool
bt_te = BedTool.from_dataframe(dfrm)
def intersect_repeat_masker_single(pattern, f=1.0):
from pybedtools import BedTool
bt = BedTool(f"{modisco_dir}/seqlets/{pattern}.bedgz")
try:
dfint = bt.intersect(bt_te, wa=True, wb=True, f=f).to_dataframe()
except:
return None
t = dfint.blockCount.str.split("//", expand=True)
dfint['pattern_name'] = pattern
dfint['repeat_name'] = t[0]
dfint['repeat_family'] = t[1]
dfint['n_pattern'] = bt.to_dataframe()[['chrom', 'start', 'end']].drop_duplicates().shape[0]
dfint['interval'] = dfint['chrom'] + ":" + dfint['start'].astype(str) + "-" + dfint['end'].astype(str)
return dfint[['chrom', 'start', 'end', 'interval', 'pattern_name', 'n_pattern', 'repeat_name', 'repeat_family']]
dfi_raw = pd.concat(Parallel(n_jobs=20)(delayed(intersect_repeat_masker_single)(pattern, f=0.9) for pattern in tqdm(mr.patterns())))
dfi = dfi_raw.copy()
# Restrict only to LTR/
dfi = dfi[dfi.repeat_family.str.startswith("LTR/")]
# Append some stats
unique_elements = dfi.groupby(['pattern_name']).interval.nunique()
dfiu = dfi[['pattern_name', 'n_pattern']].drop_duplicates().set_index('pattern_name').join(unique_elements)
dfiu['LTR_overlap_frac'] = dfiu.interval / dfiu.n_pattern
dfi = pd.merge(dfi, dfiu.reset_index()[['pattern_name', 'LTR_overlap_frac']], on='pattern_name', how='left')
dfi = dfi.drop_duplicates()
# Append properties
df = patterns_to_df(patterns, properties=['name', 'short_name', 'seq_info_content'])
df = df.set_index('name').join(dfiu).reset_index()
df
TE_min_seq_IC = 50
TE_min_LTR_Overlap_frac = 0.7
height = get_figsize(0.25)[0]
fig = sns.jointplot("LTR_overlap_frac", "seq_info_content", marginal_kws=dict(bins=15), s=5, data=df, height=height);
fig.ax_joint.axvline(TE_min_LTR_Overlap_frac, linestyle='--', color='grey', alpha=.4);
fig.ax_joint.axhline(TE_min_seq_IC, linestyle='--', color='grey', alpha=.4);
# fig.savefig(f"{figures}/TE-match,seq-IC.scatter.pdf")
# fig.savefig(f"{figures}/TE-match,seq-IC.scatter.png")
# TE's are in the top right
pattern_te_names = df[(df.seq_info_content > TE_min_seq_IC) & (df.LTR_overlap_frac > TE_min_LTR_Overlap_frac)].sort_values('LTR_overlap_frac', ascending=False).name.unique()
df['is_te'] = df.name.isin(pattern_te_names)
pattern_te_names
We can see that motif with high IC are frequently in the region of known TE's
dfi.repeat_name.value_counts()
dfi.repeat_family.value_counts()
dfi
dfi_top = dfi.drop_duplicates()
dfi_top['n_repeat_name'] = dfi_top.groupby(['pattern_name', 'repeat_name']).repeat_name.transform('size')
dfi_top['n_repeat_frac'] = dfi_top['n_repeat_name'] / dfi_top['n_pattern']
dfi_top1 = dfi_top.groupby(['pattern_name']).apply(lambda x: x.loc[x.n_repeat_frac.idxmax()])
del dfi_top1['chrom']
del dfi_top1['start']
del dfi_top1['end']
del dfi_top1['interval']
del dfi_top1['pattern_name']
dfi_top1
from basepair.preproc import resize_interval
from basepair.exp.paper.config import motifs
from basepair.extractors import StrandedBigWigExtractor
from basepair.utils import flatten
phastCons = StrandedBigWigExtractor('/users/avsec//oak-nfs/anno/mm10/conservation/phastcons/mm10.60way.phastCons.bw', use_strand=True)
phyloP = StrandedBigWigExtractor('/users/avsec//oak-nfs/anno/mm10/conservation/phylop/mm10.60way.phyloP60way.bw', use_strand=True)
g = '/mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes'
# Extract the conservation
scores = {}
wide_scores = {}
resized_scores = {}
for p in tqdm(patterns):
bt = BedTool(str(modisco_dir / f'seqlets/{p.name}.bedgz'))
bt = BedTool.from_dataframe(bt.to_dataframe().drop_duplicates())
bt_wide = [resize_interval(i, 1000) for i in bt]
bt_resized_100 = [resize_interval(i, 100) for i in bt]
scores[p.name] = dict(
phastCons = phastCons.extract(list(bt)),
phyloP = phyloP.extract(list(bt))
)
wide_scores[p.name] = dict(
phastCons = phastCons.extract(list(bt_wide)),
phyloP = phyloP.extract(list(bt_wide))
)
resized_scores[p.name] = dict(
phastCons = phastCons.extract(list(bt_resized_100)),
phyloP = phyloP.extract(list(bt_resized_100))
)
def te_stat(p, scores, resized_scores, wide_scores):
return flatten({"cons": {k:v.mean(0).mean(0) for k,v in scores.items()},
"cons_100bp": {k:v.mean(0).mean(0) for k,v in resized_scores.items()},
"cons_1kb": {k:v.mean(0).mean(0) for k,v in wide_scores.items()},
"ic": p.seq_info_content,
"name": p.name,
})
def te_plot(p, scores):
plot_tracks(flatten({"cons": {k:v.mean(0) for k,v in scores.items()},
"contrib": p.contrib, "ic": p.get_seq_ic()}),
title=p.name, rotate_y=0, fig_height_per_track=.5, fig_width=10)
dft = pd.DataFrame([te_stat(p, scores[p.name], resized_scores[p.name], wide_scores[p.name]) for p in patterns])
fig = plt.figure(figsize=get_figsize(1, aspect=1/3))
plt.subplot(131)
dft.plot.scatter("ic", "cons_phastCons", alpha=0.7, ax=plt.gca());
plt.subplot(132)
dft.plot.scatter("ic", "cons_phyloP", alpha=0.7, ax=plt.gca());
plt.subplot(133)
dft.plot.scatter("cons_phastCons", "cons_phyloP", alpha=0.7, ax=plt.gca());
plt.tight_layout()
# plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.pdf")
# plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.png")
from basepair.exp.te.dist import kimura_2p_distance
a = 'AATGAGCCGGTGGCTGATCCTTCCTCTGTACATGGGCCTCATAGCCTAGTCTGATGGTGATGAGCTGCCCAATTATAGCAGGGTGAAAATGCGCAACCCGCCGAAAACTTGCCACGGATTGGATCTTACATCTTGCGGGGGTACATCGACGATCAGGCACTAGTTGATCCTTCACACGAGTTCAACAGCTATTGACTCCCTCCAGAGGAACGGAGACTTACACTCTAGTTATTCTTTGACTGCGCCTGGTCGGCCCCCCCTGTACAGGTTGTACGAAAAAATAAGATCACCGCGCCAAGGCCCCCCTCCCGACCGCACACAATGCTACGGCCGTAATCTGGTTCTGTGCATATAGAGCAACTAGCATCACCCCGAAATGCAGCCGGTAGTTGGAAATATGTCGACGGCCTAAGGTCGGTAGCATTGCCCTTAGAGCGGAATACCTGGGCTACCTAAATCTTTCTCGCTCACTACCACTGGCCTAAGCCGATTAGAGGGCCAATTAGTTATTGTGCGTTAGTCGGACGGCAGTACGTGGAGACTTGGCGTGGAGATGCTAATCAGCTGCGCGCTGTCCAGTGGATCTATGCAGGGCATGGTGAGATCAAAATATCGGACGGAGTTGGTTACCCAATATCTCACAGAAGACCGGCTACCAAAAGGTAAGCAATCGCTCGGGAGGACCGAACCGGGGACTAGGTTACAATTTGTTCGGTACAACGCGCTATCCATGACAGGGTATTCCTTTGTAGTTACGCCCCATGCTTATGCCCCCGTTTTACCGTATTCGCATGAGAACGCTGAACACGGAGACGTACGTCTGAATCGCAGTGCTCTATCTGCCCCGCCGAGTTGGATACCTGGACCGCGACAAAGAATCGCTTAGAATTGTAGGTATTCGGGGATCGCCAAGGGATCGGTGCGTGCGCGCCTTAGCATGTACTCATCCCTAGCGCCCAGAGAAGACAACGGCCCCACTCGCAAACTTACGAAGACCTCCCAATCTCTCAAAACCTCACTAGTCCACTCCCAGTCACTGGAGGACAGCATTCGGTGAATAGCACTACTTACACAACGCCCGGACGCGTATCGGAGTATGCTGAGAATAATTTGTTATCGAAGCAGTAGGGTTGGGCTCTTCCGAACTGTTGACCTCGCGCGCTCGATTCTGCCCATGCAGCGGAAGGCACTCCGATGCGCATAAGTTCTGCAAAGCCACCCTGACTTCCAAGATCCGGGTGCTTACTACCAGTTGTAGCCTCTGCCAGAAGAGAAGTGGTGCTCAATGGTCAACCGTGTCCTCAGGCACCTTCAATTCCCCCCGTCCAAAGCGACGGGTATCATACCTACCCGCTAGGTTTCATGAGTGCAGAGTTTCATTTCCTGAAGCGAGGAGGCCAAATCTACACTTGCTTATATCTAGGTTAAATCTCCTGCTTGACATGTATCCTATTCGAAGTTGTTCGGTCACCAACGGTTAGTTACTGTTGCTCCGGTCTAGGCCCATTGAGGGACCACAAAGCACCGCGCAACTAGCACCCCGTTGTGCCGAAGTTACGAAGTTATGCCCCCTAATCATTAGTAAAAATCTTAACGGTCACCTTGGCTGGTATCTACTGCCTACGCCGAGCCTATCACGTGGCCTGGGCACAATATTCCGAGACACTGCGTATCTGACCTAGCTATCCATTTGACTATTCAGCGGTGTCCACTCTGCTTTATAGTGTTGATCTGGTAGGACAGCCATCGCGTTATCGCCCTGGCCAGCAAACCTCTACAGAGAGGTCACTGCAGAACCGTATAATTCTGGGGTACTACTGGGTAGGCCTTGGCTGGTCTTGTCCAACCCCAGGCCGTAGCGGATATGAGAGAACTTAGCTCAAACGTAATCGCCTGTTGCACGCACTTCTCGAGAGGATGACACTAATTGGCCTACGAAAATAGGACCAACTAATTGTCAAAAACCACCACTTCTTCACTATGAGGTGAGGAACGCGCCTTTACTGAGAACGTATCGTAAAATGAATAATCGACTCTCGGTCTCACGGCCGGAGATCTGCTGTGGGAGGTTATTTGTGAGGACGACAGTTGGCTCCAGCGCTGTAGAACGCCGGGTTTCTACTTTCAGAGACCACGGGTAAACAAATTCGTAGGATTTCGACTCTCTAAAATCATGACGACCGCGCTCTATAGTCGACTGTGTATTGTAGAGAAAAAGCCAAGTAAGTTGCGGGTTTCCCCCGGGCACGTGTACCGGGTTTACGTTTAAACGGATTGACGGGACTATAGATAACATTGTTTAGACCACGCATCTTAGTGAGTGAGTCGATTTTCTATTGTTGGATAGATGCCCGGACTAGAACGGTGATTAACCCCCGGCCATTCTGGCGTATGTGATGGCGCGGGGGAATTTTCACCAGATTCGCTTTCTGGTCCCCGCCCTTCGGGTTCCGACTTGTAGAGAGTCAACAGGTACCCCTAGGCGTAATTCTAAGTGGGGAACTAAGAGGTTTTCGTATTCCTCTCCATTACGTCATCGGATAAGCACAATGTAATGTGTAATAGCTCCCTTTAGTTGGAGTGTAGCCATATCAAAACTGAGCACGATAACTGACCTTCCTCTTTAATGATCGGGGCAACTACAATAACATGGGACCGAATTAAAGATAGGCTGAGCTAGCCTGATGTACCTACGCGCATTTGGAAATAGATCGCTCAACTTCCCGGCCCTAAGCAACTACCAGACGTTTGTGGTACGATGATCGGGAGTCCCCGGCGGAGGCATCGTCACGAAAACTGTCTGTTCATGCGGACAGTAAATCCCTATCTATGATGTTCGCGGGCTCATCTACCAGAAACGTCCCCCGGACCCGAACAGTTTCGGTGTAAATACTTGCTAGCCGGGTCCGAGGCTTCCGTGCCGCACAGTACTTTAGTCGGAGGCAGGTGACTGTTAGATTGTGCGAGCGCTCAAAGCGTGATATAGTTACGGGGGGTCTAATGTCAGCTGTGACAAAGTCCTGTCTGTCCGTTTTTGCCCAGACACGCTCGTCTCTCCATGCTCTTTTACAGTGCTGGAAATGGGAGCTTCTGGTGCATTGAAACGAACCTTTTAGCAGATGTGGCATACCTCTTGGTACAGACGAAGTATAGACTACACAAAAGCGCTCGAGGCAAGTTTTGCGTAATCGGCCCGTTTCGCGCAGGTTGGAAACCTATACTGCACACTCCAATTAGTTAATTCACACTAGATTTTGGATCTTACGTCTCAGCGTTCCACTTGCGAATGAACATCCAGTCGATTAACTACCAGCTATACTTCCCTTAGATTAATTGTGACCAAATTAACGCATGTTGATCCGAACATCTGCTGTTGACTCGCTGTAGTCTTACGTGCTAAAATTTCGTCACCCCAACAGCATCCTTATGGTTTAAAGGTAGCCCCATCAACGGAATATTGCAAACTACTAACGTGGTTGGCTCGAAGATTTTTACCTTTGGTTATGCCTCGGACCTGCCGTGCTGTATATAGGTCCCAGAGTCACACATGGATGTACTACTTTCATAATACCATGGTGAACCACGATTATACTAGGTATGCGTGGAGTGATCAGTTTGTGCTACGACCGAGCGGAGCGGGCGTGACATGGGCGGTGTACGACGACCTTCTGGAAAGGACCCCCTGGTTCAGTTCAGCAACAGGCGGACTTGTCTATAAACACGTCTTAAGGAGTAAAGTAGAGGCGGTTTGCTAGGTGCAGATTGTTTGGCCACGCTGCCTGAGCGTAATGTTAACGAATTATTGGACAGTTAGGGCGTTCGAGTGGGATGTTAATTTGGCCTCTTGCACCGGAACCGTTAGCCGGACCACCGGCTTCTGTGGGGGTCCGTAGGAACACAGGTGCCTAGAGCGCAGCAGGCTGTGCCTGTGGGCCCGCCAATGAAGCCAAATCAGATTTCGATGAGCATGATCAGCACACGCCAGTGCGATACCGTTCTCTCGGTAGGGCGTCCCAATGGGAAGAGCGCAGTGAATCTTGTGGGTGCGGACTATAACGGATCCGTGATTGCCTCCGGCGGAAATGAAGTCTGTGGTAGTGTTTTAATAAATATAATGCGCCAACGTGGACTCTCGGATAACGTCGGTGTATCCGGCATATCCAGACACCGTTGAACAATTTAAATCGTCTGCCCGTGCTCTAGAGTCCTCATAGTCTAAGTGTAGTCTATTCCCTCTAAAATTTTTGAGATTAGTAAGTCTTGACTCTCAGGTCTTGTTCTTAAGCGGGGCAATGCTCGACTAAATAACACGATCACGCCCACGCCCGTAGCCAGTGCCTCAATCTATGCGCACGTGAGGTAACTGCGGACGGCGTAACGCGTTCTGGATATTATATCCTTGCCGTGGACCCGGTCAGAAGCAGGGTCGGAAGACCCCAGTACTGCCGGCCCGGGGCGTACAAAGTGTCTGTGCTTTGCGCCGGGGGACGGTAAGGCTTTGTGATCCACCGTCAGTTCCGCTAATAGTGTTCACTGGTAATACCTAGGGTAGCAACGCACAGATTCGAGAATTAAGCACATAGGTTTGACCGACATGGTGGGCGCACACGACCGAAGTAGATGTACGATGTTGACGTTTACGAACACTATCTAATTCTCTGCAATATGTAGGAATGCAGCCCAAAATTTACATTGAGTAGCCGGACGAAGCTCTGTGATCTAGGGCATAATATTGATAGCACTCCAAGACTGTCAACTTATAGATGAGCGAGAACACTGGTCCTGAGCCCCAAGGGAAAGCTCTGCAAGAGGCACCTCCTTGACCAAGGAGCCCATTGGTAGCCGGTTACATAGAGTCCGCAAATCTCGAAACCGGTGTCCAGATGAGAGATAAAGAACTTGTTGTTGGCTGAGTGGGCAGAAAACACCGATGCTGCGCTTGCGGACGGCATGCAAGCCACTCAGGCTCCCGTTGAAACACGCGCCCCTTCTGTCAAGCC'
b = 'ATTCCCCGGTAAAAGCAAAATTCTGTAGTCAGACGCCCGTACCACGTAGATGAGGATCTGCCCCCGTCATGAGGGCGACCACAATAGTAGGCGGCCACTGTCGGCAGTTACAAGAGCAGCGGCGGATTTCATCGGTACAATCGCGGTTGCGCCGCCCGCCTAGTAAGCATTGCGCTCGAGTCCCTGTAATTCTCAGGCTAGGCAAGCACACAGATATGTGAGTGTATCGTCATAATGTGCTCGTTTAAGAGCTCATAATTTGTTAACTGTATGGCCGCATTTTCAAGATGGATCGTGCGCCCAATAACAATCCGGCAGTAAGAGTTCGCTCCAGGAGATTGCCTGGCGCGCTAGTACTACGGTAGAGACGTCAAACACACGAGCGGGGCGAGGATTTTGGCTTGGGTTCTAGCTACCCTAAACCAGCGGGTATCGGCCCAAGGGCTTTTCTCTCCTGTTTCCAGCGAAATTAGGGGTACGGTGGCAATCACTGAAATCACCCCCGATGTGTAGTGACGCTATACCTAAGTCTAAGAGAATCTGGGGCAAGTGTCTAAGGACTGCTGGCTCCAAAGAGCCTGGATACACCGCTGTACCTTTGAGACGTTGGTGGTATCTAATGCATAACGCGCTCGGAGCGAACGCCTCCACAACGGTGCGTTTCCCGTATCGTATAGAGGATAAGGATATCGTCTCTATTAAAGCCCCAATGGGGTGCAACAGCAACGAATAAGTCTGTATTTAGAAGAGCGAAAAAATTCGCAACCTTTAGAAATCCTTAGGCCTACGGCTATCCACGCATTGGTCGGGGCACTAAAGTCATAGATTCACCACGATCCCCCAGTTTCGGATAGGAATTGCGTCGTGGGCGTGATGAGTCTACGTCAACCAAGCCAGCTACATGCTCAGTGCAGCGATAATTTGCCTGGTGAGGATAATACAAATCGCATTGCGGAATGCTTTCTAACCTACGTTAGGTCGGTAAGTTGCCATGCTTGTCGGTGTTGTACTACTCTCAGACTTAAGATGTTCAAATGGACGGTATCCCATGGGTAGCCAGTGATGTGGATGTCGTCGGGCCTATAAAGGGCGTGGCGTGGCCCCTAACCAATCCACTAGTCCGGCGTAAGCAGCACCCTCACGGGGGAAAGACCGAGCATCTCGAGCAGTTCGTTCCACGATATTATCAGGTGACTTCCTATCTAGGTAACAGTCTCCAGCTGCAGACGCCTATTGGTACCCCAACGGCGCGGAGTTTAATAAGTGACCTAGCATCCTGGGTCTCCCGGTATACAAAGTCGAACGTCGACGTAGTCCCCCCCGTGACGCTATGAGGCTGGTCGGGAGGTTTCCGTAGACCGAGAGTGATACACACCAAAGGTTCACAATGTTGTTTGATATGCAAACGTCACCTTAATGGATAAGAACAGTGCTCATCAGAAGTCAGAATACAGATATCAATAAGAACGGCCTCACGGCCGTTAGAGCTATTGAACGAAAGAGCGCCTCCAACACATAGGATCGGCCAATCGACAGTCTAACCTGGCACATTGTGGGCCGCGGCAAAGCGTCCGCGACAGCGTTGTAGGTCCGGAGCCTTATCTAGAAGTGAGATCAGCACAACATCTCATAGTATCCCATGGTTTCCCCTTCGGAAGTATATGGAGAGGTGACATGTAACCGAGGGTCATCAACGGACTAGGTGATGATTCTTTCCCGTGAAACCCCATCTGATGCTGTGTCTGCGGGACTTGGATTGTACGGAACGACTCGGCTTGTTGAATCATCGAATTTGGGGAGGGGTGCGTATGTGGTGGGTTTGACAACTAGCTCCTTCCCTCGATCTTGAAGTCAGTTGTCTGAACGTTTTAGACATATGACATCCCGACCGCAGGGAAGAATCGTCTGATAGGGGATTGGAGCTTGATAGTAAGTGGCCCCTAATACGGCTGTATAGGTCGTGAAGGTCAAGGTAACGGTGACATATAAACACTCCCCTGATTGTAGTATAGCACATCGAGTTGTACACGAGTACACCTCTTATTGACCAATGGGCATCGCTTCCGCTACGGCCCCAAAGGGTGGCCGCCTGAGCCCTGGGCTAACGACGATGAAATAATCAGCTTGTCTCGTGTGCAGGCATAGATTAGGTGACCCAAGGAGCAATAGCCATTGGCTCATCTTAGGGGTTCAACGGACAAATTTTGTCGAGAAGACACAATCCCGGATCAGTTGAGTGGCAATGCGAGATCTTCCTCCTCTTAATTACGTCCCTTATCGTTTGGCTTCTGGAAGACGCGATAACCACGGAATGCAGTGTAGCATTCTTTGCGCGCAAAGGTTCAGATGGAACTACAAAGCCGCATGGTTATTCCCCGTCGAAGAAGTCGCACCTTACTTTTGCACTCTTCGTGCTCGAGACCAAAAGTGCGAATTTAGGGGCTCCTAATGGGTCGCCTAGACTTGTATCGACTACACACATGCCTTCATAACTGTGTGATTTATAGAGCTAACACACTTGCGAGTTGCACGTTGAGCTCCCGCGTAAGTGTTCTCTACAGCCCGATGCCTGCTTGTCTTCTGTCCGCGGCTTGTTGCAGGTCTGACCGCCATGTTCGTTCCCTACTAACACACAGACGGTGTATAGCCCTATCATATCGATCACTTAGGACTGTGTTTTATCGGAACCGAGGTGACCATGCGTAGTCGACTAGTTGAGCGAGAATGCCCAGACGTTAAAAGGACAACGTCGCGAATGGCGTTGCGCGGGTATCCAATCGCGTAGCCAGAGGTAACCAAGTGACCCGATTAACTCCGACATAACACTAGATTGAGCGGTCAGGACTTTCGCCAAGAAGCGTCAATGGGTGGGCGATGGCGTATACTCGCTCTAACCTCGGAGAGAAGCCATTCAGCATTACCAGAGAACTGTGGCGATAGGTAACCGCTTTCTCAACTATTATACATTCGGTGCACCCTGCTTTAGGCATGTTCGAATCGCTGGGAACACAAGCCATCCAACTTCGCAGTCTGGGTAGCCGCGGACGTTTTAACGGTCGTACAGTTGAGCGCCGCGGTATTGGTACGTTCAGGCTAATTCGATCTCCGTTGGCGCCCGGTGCCGTGAGACTTGAATGAATGCCCTATGGAATGTGGTAGACATTGGTAGCTGAGTATTGCGAAGTCCGCGAATAGGGCCATACCAGGTTAGTAAATGTAACCAAAGTAAGGCTTGGTGTTGCTGACCGAGTTTTTCGTTTTAGTAGATATCGTAACACGACTGACGATGCCCTGCTTACCCAATTTAGTTGACGTTAGATAAGATTGATTCGATAGAAACCTTTCCTGTTTCTTAATTTCTTATCATACTATATTGCGGACGGGTGTAACATAGCGGTTTAAAGGTCGACAGCTGAAGCCTTGTTAGGCTGATTATGCCTCCTCACCACTCAATCTAACGAATTTTCCAAAGCGGTCGCTTTAAACTAACGGCGAGACTATGCGTGCTCGGTCATTTCCGCCTACGCGAAATCACGTCGGAGCGCGGACCCGGCGGGTAGTGACGGCGAACTCTACTCCCCCGAAATCAACCAAGGCACGTTGCGACCACTTTTGGGGTGGAGCCGCCCACGCAATCCCTCAACCGATATTTGGAGTAGCGGACGCCCCGGGTATCCTAGGTCTGGAGCACCTATCCGTACAGCGATGGCTGCCTACCAGTACTGCAAAGAGACCTACATCGTTTCTAAGAACCTCTTACACGTCCCGATGCACAGCATGACTCCCCTGGCTGATTCCCACCCTTGTGGGACCGCTTAGATGTTAACGGTTCTCAATGTGCATACTAGACCCAGGGGCCGGTACAAGAACCCATAAGGAGAACACATCTATCAACGGTCGTGTATAACAAGGCAGCGTTGTTTTGGATATCAGATGTAGATACCATCGTACTGTACCACCCTAGAGGTGATTCCTGGAGCGAGTGTCGACGGACGGACCACGCTATGCGCCTTCCCCATCAAGATAAAGTGGGTAGCTTCGACTGACTGTGATTGAACAGATTTTTGGAGGTCCAAAGACAGCTTCGGCGCGTGCGCACGCCGAAGCACAACAGTACCTACTTGATGAGCTTGTTTCGGTTGGTAAGGTGGACCAGTACGATCCGACGTTTGTGAAATTTTACGGACAACCCCAAGAAGTCGGGAGCATGACCTTGATCTTGGGCTTGACCACGCATTGTCAGAGAACAGTCTACCTATACTAGCCACCTGTACGGTCCTTAATTATCACTATTACCATACGCTATTTTGAAACCCATCTCATTTCCTTCCGTTTCCACTCATCATGTGAGATATAAACGTACCAGGACCGCGTGCCGATGCGACGGATTCGGTGGTGTCCGTTGGCCAAACGGCCTCTCTACACAGTAGGGATACATCATCCGCCGTGTAATAGCAGGGGTGGCCGGACTAGCTTATGCACTGATAGCAAACTTTAGACCGGTCCGAATGGTCGTGGCTCGGTCTCGGTCCGCTGATATTGCTTGAGTCCCCGCTGACGTGATTCTAACTGAATGTATCGCGGTTAATGTACGCGCTTAGCTACCCCAAAGGCCATAGGCCGCGGCGACCGGTATTGCAGATTTTATAGCTCTAGTTCGTTAGTCAAGGGTCCGGATTGTTCCCATTTGCGACTGAAAAGCGCTTTAAGTGGCTACTACCCTTGTGACTAGCTAGCAGACTCTCGGCACTACGTGTATAATAACGCAACAAGTCTGTTCCACGAGAAAGCTGGAGGCAGGAATTTGTGTGAAAAGTCGTCTTTAGAACTTAGACCATGATACCGAACCTGGGGCTGTCGCTCCGTCAGCGGCGCAATTACCGTCACGGCCAGACGGTCCACCCCTATTGTAAATATAGCTTCTTGCCTGCTGATGATCGTTAGATGAGCCACCTGGCATTAAAGCCAACGGCACAACTTACCATATGAATTCGCAGGTT'
kimura_2p_distance(a,b)
# Kimura 80 distance
p = patterns[0]
patterns[0].attrs.keys()
simp = p.attrs['stacked_seqlet_imp']
def get_kimura_dist(p, verbose=True):
"""Get the Kimura distance for a certain pattern
"""
from basepair.exp.te.dist import kimura_2p_distance
simp = p.attrs['stacked_seqlet_imp']
seqs = one_hot2string(simp.seq, DNA)
consensus = p.get_consensus()
distances = np.array([kimura_2p_distance(seq, consensus, verbose=verbose) for seq in seqs])
return distances
kd = pd.concat([pd.DataFrame({"dist": get_kimura_dist(p, verbose=False),"pattern": p.name}) for p in patterns])
dfk = kd.groupby('pattern').dist.agg(['mean', 'std'])
dfk.columns = ["kimura_" + c for c in dfk.columns]
dfk
dfp = df.rename(columns=dict(name="pattern")).set_index("pattern")
dfp
df_rm = dfi_top1.copy()
df_rm.index.name = 'pattern'
del df_rm['LTR_overlap_frac']
del df_rm['n_pattern']
dfti = dft.rename(columns=dict(name="pattern")).set_index("pattern")
dfpa = dfp.join(df_rm).join(dfk).join(dfti).join(dfl)
dfpa
length: trimmed pattern lengthseq_info_content: sequence information contentkimura_mean: average Kimura distance from the consensus sequencekumura_std: std of the Kimura distance from the consensus sequenceLTR_overlap_frac: fraction of the LTR's overlapping the seqletn_repeat_frac: fraction of the seqlets overlapping the most common LTRcons_phyloP: PhyloP average conservation of the seqletcons_1kb_phyloP: PhyloP average conservation of the seqlet's 1kb windowcons_phastCons: same as for cons_phyloP but using phastConscons_1kb_phastCons: same as for cons_1kb_phyloP but using phastConsrepeat_name: Name of the most commonly overlaped LTRrepeat_family: Family of the most commonly overlaped LTRdfpa['type'] = pd.Categorical(dfpa.is_te.map({False:"non-TE", True: "TE"}))
dfpa_subset = dfpa[['length',
'consensus',
'n_seqlets',
'type',
'seq_info_content',
'kimura_mean', 'kimura_std',
'LTR_overlap_frac', 'n_repeat_frac',
'repeat_name', 'repeat_family',
'cons_phyloP', 'cons_1kb_phyloP',
'cons_phastCons', 'cons_1kb_phastCons']]
sns.pairplot(dfpa[['length',
'type',
'seq_info_content',
'kimura_mean', 'kimura_std',
'LTR_overlap_frac', 'n_repeat_frac',
'cons_phyloP', 'cons_1kb_phyloP',
'cons_phastCons', 'cons_1kb_phastCons']],
hue='type',
# ag_kws=dict(bins=20),
height=get_figsize(.3, aspect=1)[0],
plot_kws=dict(alpha=.5, s=20))
plt.savefig(f"{figures}/pairplot.conservation.pdf")
# save to csv
dfpa_subset.reset_index().to_csv(modisco_dir / 'pattern.conservation-prop.csv', index=False)