Goal

  • [x] make a spreadsheet with
    • the motifs
    • their LTR overlap
      • total
      • best specific LTR
    • their conservation scores
      • motif
      • 100bp
      • 1kb
    • length
    • average IC
    • their mutation rate
      • average kimura distance
In [2]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
Using TensorFlow backend.
In [61]:
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
figures = f"{ddir}/figures/modisco/TE/"
output_dir = modisco_dir
In [26]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable, vdom_footprint
from basepair.plot.utils import strip_axis
from basepair.plot.tracks import tidy_motif_plot
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *

modisco_dir = Path(modisco_dir)
output_dir = Path(output_dir)
!mkdir -p {figures}

paper_config()
In [89]:
patterns = read_pkl(modisco_dir / 'patterns.pkl')
In [34]:
pdict = {shorten_pattern(p.name): p for p in patterns}
te_df = pd.read_csv(modisco_dir / 'motif_clustering/patterns_long.csv')
tes = [pdict[pn] for pn in te_df.pattern]
In [11]:
df = patterns_to_df(patterns, properties=['name', 'short_name', 'seq_info_content'])
TF-MoDISco is using the TensorFlow backend.
In [268]:
dfl = pd.DataFrame({"length": [len(p.trim_seq_ic(trim_frac=0.08)) for p in patterns],
                    "consensus": [p.trim_seq_ic(trim_frac=0.08).get_consensus() for p in patterns],
                    "n_seqlets": [len(p.attrs['stacked_seqlet_imp']) for p in patterns]
                   }, index=[p.name for p in patterns])
dfl.index.name = 'pattern'

Overlapping seqlet regiosn with RepeatMasker regions

Load the repeat masker file

In [81]:
from basepair.exp.te.repeat_masker import read_repeat_masker, wget_repeat_masker
In [82]:
repeat_masker_file = f"{ddir}/raw/annotation/mm10/RepeatMasker/mm10.fa.out.gz"
In [85]:
wget_repeat_masker(repeat_masker_file)
Using downloaded and verified file: /users/avsec/workspace/basepair/data/raw/annotation/mm10/RepeatMasker/mm10.fa.out.gz
In [86]:
dfrm = read_repeat_masker(repeat_masker_file)
In [15]:
dfrm.head()
Out[15]:
chrom start end name
14737 chr1 3000001 3000097 L1MdFanc_I//LINE/L1
27 chr1 3000098 3000123 (T)n//Simple_repeat
14737 chr1 3000124 3002128 L1MdFanc_I//LINE/L1
2853 chr1 3003148 3004054 L1MdFanc_I//LINE/L1
3936 chr1 3004041 3004206 L1_Rod//LINE/L1

Intersect seqlet locations with repeat masker

In [17]:
from pybedtools import BedTool

bt_te = BedTool.from_dataframe(dfrm)
def intersect_repeat_masker_single(pattern, f=1.0):
    from pybedtools import BedTool
    bt = BedTool(f"{modisco_dir}/seqlets/{pattern}.bedgz")
    try:
        dfint = bt.intersect(bt_te, wa=True, wb=True, f=f).to_dataframe()
    except:
        return None
    t = dfint.blockCount.str.split("//", expand=True)
    dfint['pattern_name'] = pattern
    dfint['repeat_name'] = t[0]
    dfint['repeat_family'] = t[1]
    dfint['n_pattern'] = bt.to_dataframe()[['chrom', 'start', 'end']].drop_duplicates().shape[0]
    dfint['interval'] = dfint['chrom'] + ":" + dfint['start'].astype(str) + "-" + dfint['end'].astype(str)
    return dfint[['chrom', 'start', 'end', 'interval', 'pattern_name', 'n_pattern', 'repeat_name', 'repeat_family']]
In [17]:
dfi_raw = pd.concat(Parallel(n_jobs=20)(delayed(intersect_repeat_masker_single)(pattern, f=0.9) for pattern in tqdm(mr.patterns())))
In [19]:
dfi = dfi_raw.copy()
In [20]:
# Restrict only to LTR/
dfi = dfi[dfi.repeat_family.str.startswith("LTR/")]
In [21]:
# Append some stats
unique_elements = dfi.groupby(['pattern_name']).interval.nunique()
dfiu = dfi[['pattern_name', 'n_pattern']].drop_duplicates().set_index('pattern_name').join(unique_elements)
dfiu['LTR_overlap_frac'] = dfiu.interval / dfiu.n_pattern
dfi = pd.merge(dfi, dfiu.reset_index()[['pattern_name', 'LTR_overlap_frac']], on='pattern_name', how='left')
dfi = dfi.drop_duplicates()
In [59]:
# Append properties
df = patterns_to_df(patterns, properties=['name', 'short_name', 'seq_info_content'])
df = df.set_index('name').join(dfiu).reset_index()
In [60]:
df
Out[60]:
name short_name seq_info_content n_pattern interval LTR_overlap_frac
0 metacluster_0/pattern_0 m0_p0 15.3199 6569 2174 0.3309
1 metacluster_0/pattern_1 m0_p1 16.9463 975 394 0.4041
2 metacluster_0/pattern_2 m0_p2 9.7451 930 233 0.2505
... ... ... ... ... ... ...
119 metacluster_6/pattern_12 m6_p12 10.4935 79 1 0.0127
120 metacluster_7/pattern_0 m7_p0 14.2622 640 180 0.2812
121 metacluster_7/pattern_1 m7_p1 18.3106 158 25 0.1582

122 rows × 6 columns

Scatter-plot: Seq IC ~ LTR_overlap_frac

In [24]:
TE_min_seq_IC = 50
TE_min_LTR_Overlap_frac = 0.7
In [27]:
height = get_figsize(0.25)[0]
fig = sns.jointplot("LTR_overlap_frac", "seq_info_content", marginal_kws=dict(bins=15), s=5, data=df, height=height);
fig.ax_joint.axvline(TE_min_LTR_Overlap_frac, linestyle='--', color='grey', alpha=.4);
fig.ax_joint.axhline(TE_min_seq_IC, linestyle='--', color='grey', alpha=.4);
# fig.savefig(f"{figures}/TE-match,seq-IC.scatter.pdf")
# fig.savefig(f"{figures}/TE-match,seq-IC.scatter.png")
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/tight_layout.py:199: UserWarning: Tight layout not applied. tight_layout cannot make axes width small enough to accommodate all axes decorations
  warnings.warn('Tight layout not applied. '
In [28]:
# TE's are in the top right
pattern_te_names = df[(df.seq_info_content > TE_min_seq_IC) & (df.LTR_overlap_frac > TE_min_LTR_Overlap_frac)].sort_values('LTR_overlap_frac', ascending=False).name.unique()
In [244]:
df['is_te'] = df.name.isin(pattern_te_names)
In [242]:
pattern_te_names
Out[242]:
array(['metacluster_0/pattern_28', 'metacluster_0/pattern_10', 'metacluster_0/pattern_16',
       'metacluster_0/pattern_22', 'metacluster_0/pattern_18', 'metacluster_5/pattern_3',
       'metacluster_3/pattern_9', 'metacluster_5/pattern_9', 'metacluster_0/pattern_23',
       'metacluster_0/pattern_9', 'metacluster_6/pattern_5', 'metacluster_0/pattern_24',
       'metacluster_5/pattern_11', 'metacluster_0/pattern_14', 'metacluster_2/pattern_19',
       'metacluster_0/pattern_6', 'metacluster_0/pattern_25', 'metacluster_0/pattern_13',
       'metacluster_1/pattern_15', 'metacluster_5/pattern_8', 'metacluster_0/pattern_21',
       'metacluster_2/pattern_16', 'metacluster_0/pattern_29', 'metacluster_5/pattern_2',
       'metacluster_2/pattern_15', 'metacluster_5/pattern_7', 'metacluster_6/pattern_6',
       'metacluster_0/pattern_3'], dtype=object)

We can see that motif with high IC are frequently in the region of known TE's

In [29]:
dfi.repeat_name.value_counts()
Out[29]:
RLTR9E      1541
RLTR13D6     683
RLTR9D       598
            ... 
LTR52          1
LTR89B         1
LTR86B1        1
Name: repeat_name, Length: 410, dtype: int64
In [30]:
dfi.repeat_family.value_counts()
Out[30]:
LTR/ERVK         8747
LTR/ERVL-MaLR    3315
LTR/ERVL          800
LTR/ERV1          579
LTR/Gypsy          22
LTR/ERV1?           2
LTR/ERVL?           2
Name: repeat_family, dtype: int64
In [49]:
dfi
Out[49]:
chrom start end interval pattern_name n_pattern repeat_name repeat_family LTR_overlap_frac
0 chr1 40303313 40303328 chr1:40303313-40303328 metacluster_0/pattern_0 6569 RLTR17 LTR/ERVK 0.3309
1 chr17 63583030 63583045 chr17:63583030-63583045 metacluster_0/pattern_0 6569 ORR1D1 LTR/ERVL-MaLR 0.3309
2 chr1 56624517 56624532 chr1:56624517-56624532 metacluster_0/pattern_0 6569 RMER3D-int LTR/ERVK 0.3309
... ... ... ... ... ... ... ... ... ...
23863 chr8 27657802 27657816 chr8:27657802-27657816 metacluster_7/pattern_1 158 MLTR18B_MM LTR/ERVK 0.1582
23865 chr14 118070816 118070830 chr14:118070816-11807... metacluster_7/pattern_1 158 RLTR11C_MM LTR/ERVK 0.1582
23867 chr7 139334374 139334388 chr7:139334374-139334388 metacluster_7/pattern_1 158 RMER15-int LTR/ERVL 0.1582

13467 rows × 9 columns

For each pattern, are all seqlets falling into regions of the same LTRs?

In [176]:
dfi_top = dfi.drop_duplicates()
dfi_top['n_repeat_name'] = dfi_top.groupby(['pattern_name', 'repeat_name']).repeat_name.transform('size')
dfi_top['n_repeat_frac'] = dfi_top['n_repeat_name'] / dfi_top['n_pattern']
dfi_top1 = dfi_top.groupby(['pattern_name']).apply(lambda x: x.loc[x.n_repeat_frac.idxmax()])
In [177]:
del dfi_top1['chrom']
del dfi_top1['start']
del dfi_top1['end']
del dfi_top1['interval']
del dfi_top1['pattern_name']
In [179]:
dfi_top1
Out[179]:
n_pattern repeat_name repeat_family LTR_overlap_frac n_repeat_name n_repeat_frac
pattern_name
metacluster_0/pattern_0 6569 ORR1C2 LTR/ERVL-MaLR 0.3309 170 0.0259
metacluster_0/pattern_1 975 RLTR9E LTR/ERVK 0.4041 130 0.1333
metacluster_0/pattern_10 114 RLTR9A3 LTR/ERVK 1.0000 80 0.7018
... ... ... ... ... ... ...
metacluster_6/pattern_9 84 RLTR9E LTR/ERVK 0.1905 15 0.1786
metacluster_7/pattern_0 640 RLTR11B LTR/ERVK 0.2812 23 0.0359
metacluster_7/pattern_1 158 RLTR17B_Mm LTR/ERVK 0.1582 7 0.0443

122 rows × 6 columns

Overlap the conservation scores

In [37]:
from basepair.preproc import resize_interval
from basepair.exp.paper.config import motifs
from basepair.extractors import StrandedBigWigExtractor
from basepair.utils import flatten
In [38]:
phastCons = StrandedBigWigExtractor('/users/avsec//oak-nfs/anno/mm10/conservation/phastcons/mm10.60way.phastCons.bw', use_strand=True)
phyloP = StrandedBigWigExtractor('/users/avsec//oak-nfs/anno/mm10/conservation/phylop/mm10.60way.phyloP60way.bw', use_strand=True)
In [39]:
g = '/mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes'
In [40]:
# Extract the conservation
scores = {}
wide_scores = {}
resized_scores = {}
for p in tqdm(patterns):
    bt = BedTool(str(modisco_dir / f'seqlets/{p.name}.bedgz'))
    bt = BedTool.from_dataframe(bt.to_dataframe().drop_duplicates())
    bt_wide = [resize_interval(i, 1000) for i in bt]
    bt_resized_100 = [resize_interval(i, 100) for i in bt]
    scores[p.name] = dict(
        phastCons = phastCons.extract(list(bt)),
        phyloP = phyloP.extract(list(bt))
    )
    wide_scores[p.name] = dict(
        phastCons = phastCons.extract(list(bt_wide)),
        phyloP = phyloP.extract(list(bt_wide))
    )
    resized_scores[p.name] = dict(
        phastCons = phastCons.extract(list(bt_resized_100)),
        phyloP = phyloP.extract(list(bt_resized_100))
    )
100%|██████████| 122/122 [07:56<00:00,  2.19it/s]
In [41]:
def te_stat(p, scores, resized_scores, wide_scores):
    return flatten({"cons": {k:v.mean(0).mean(0) for k,v in scores.items()}, 
                    "cons_100bp": {k:v.mean(0).mean(0) for k,v in resized_scores.items()}, 
                    "cons_1kb": {k:v.mean(0).mean(0) for k,v in wide_scores.items()}, 
                    "ic": p.seq_info_content,
                    "name": p.name,
                   })

def te_plot(p, scores):
    plot_tracks(flatten({"cons": {k:v.mean(0) for k,v in scores.items()}, 
                         "contrib": p.contrib, "ic": p.get_seq_ic()}),
               title=p.name, rotate_y=0, fig_height_per_track=.5, fig_width=10)
In [42]:
dft = pd.DataFrame([te_stat(p, scores[p.name], resized_scores[p.name], wide_scores[p.name]) for p in patterns])
In [48]:
fig = plt.figure(figsize=get_figsize(1, aspect=1/3))
plt.subplot(131)
dft.plot.scatter("ic", "cons_phastCons", alpha=0.7, ax=plt.gca());
plt.subplot(132)
dft.plot.scatter("ic", "cons_phyloP", alpha=0.7, ax=plt.gca());
plt.subplot(133)
dft.plot.scatter("cons_phastCons", "cons_phyloP", alpha=0.7, ax=plt.gca());
plt.tight_layout()
# plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.pdf")
# plt.savefig(f"{ddir}/figures/modisco/TE/conservation-scores.png")

Compute the mutation rate

  • average kimura distance
In [ ]:
from basepair.exp.te.dist import kimura_2p_distance
In [46]:
a = 'AATGAGCCGGTGGCTGATCCTTCCTCTGTACATGGGCCTCATAGCCTAGTCTGATGGTGATGAGCTGCCCAATTATAGCAGGGTGAAAATGCGCAACCCGCCGAAAACTTGCCACGGATTGGATCTTACATCTTGCGGGGGTACATCGACGATCAGGCACTAGTTGATCCTTCACACGAGTTCAACAGCTATTGACTCCCTCCAGAGGAACGGAGACTTACACTCTAGTTATTCTTTGACTGCGCCTGGTCGGCCCCCCCTGTACAGGTTGTACGAAAAAATAAGATCACCGCGCCAAGGCCCCCCTCCCGACCGCACACAATGCTACGGCCGTAATCTGGTTCTGTGCATATAGAGCAACTAGCATCACCCCGAAATGCAGCCGGTAGTTGGAAATATGTCGACGGCCTAAGGTCGGTAGCATTGCCCTTAGAGCGGAATACCTGGGCTACCTAAATCTTTCTCGCTCACTACCACTGGCCTAAGCCGATTAGAGGGCCAATTAGTTATTGTGCGTTAGTCGGACGGCAGTACGTGGAGACTTGGCGTGGAGATGCTAATCAGCTGCGCGCTGTCCAGTGGATCTATGCAGGGCATGGTGAGATCAAAATATCGGACGGAGTTGGTTACCCAATATCTCACAGAAGACCGGCTACCAAAAGGTAAGCAATCGCTCGGGAGGACCGAACCGGGGACTAGGTTACAATTTGTTCGGTACAACGCGCTATCCATGACAGGGTATTCCTTTGTAGTTACGCCCCATGCTTATGCCCCCGTTTTACCGTATTCGCATGAGAACGCTGAACACGGAGACGTACGTCTGAATCGCAGTGCTCTATCTGCCCCGCCGAGTTGGATACCTGGACCGCGACAAAGAATCGCTTAGAATTGTAGGTATTCGGGGATCGCCAAGGGATCGGTGCGTGCGCGCCTTAGCATGTACTCATCCCTAGCGCCCAGAGAAGACAACGGCCCCACTCGCAAACTTACGAAGACCTCCCAATCTCTCAAAACCTCACTAGTCCACTCCCAGTCACTGGAGGACAGCATTCGGTGAATAGCACTACTTACACAACGCCCGGACGCGTATCGGAGTATGCTGAGAATAATTTGTTATCGAAGCAGTAGGGTTGGGCTCTTCCGAACTGTTGACCTCGCGCGCTCGATTCTGCCCATGCAGCGGAAGGCACTCCGATGCGCATAAGTTCTGCAAAGCCACCCTGACTTCCAAGATCCGGGTGCTTACTACCAGTTGTAGCCTCTGCCAGAAGAGAAGTGGTGCTCAATGGTCAACCGTGTCCTCAGGCACCTTCAATTCCCCCCGTCCAAAGCGACGGGTATCATACCTACCCGCTAGGTTTCATGAGTGCAGAGTTTCATTTCCTGAAGCGAGGAGGCCAAATCTACACTTGCTTATATCTAGGTTAAATCTCCTGCTTGACATGTATCCTATTCGAAGTTGTTCGGTCACCAACGGTTAGTTACTGTTGCTCCGGTCTAGGCCCATTGAGGGACCACAAAGCACCGCGCAACTAGCACCCCGTTGTGCCGAAGTTACGAAGTTATGCCCCCTAATCATTAGTAAAAATCTTAACGGTCACCTTGGCTGGTATCTACTGCCTACGCCGAGCCTATCACGTGGCCTGGGCACAATATTCCGAGACACTGCGTATCTGACCTAGCTATCCATTTGACTATTCAGCGGTGTCCACTCTGCTTTATAGTGTTGATCTGGTAGGACAGCCATCGCGTTATCGCCCTGGCCAGCAAACCTCTACAGAGAGGTCACTGCAGAACCGTATAATTCTGGGGTACTACTGGGTAGGCCTTGGCTGGTCTTGTCCAACCCCAGGCCGTAGCGGATATGAGAGAACTTAGCTCAAACGTAATCGCCTGTTGCACGCACTTCTCGAGAGGATGACACTAATTGGCCTACGAAAATAGGACCAACTAATTGTCAAAAACCACCACTTCTTCACTATGAGGTGAGGAACGCGCCTTTACTGAGAACGTATCGTAAAATGAATAATCGACTCTCGGTCTCACGGCCGGAGATCTGCTGTGGGAGGTTATTTGTGAGGACGACAGTTGGCTCCAGCGCTGTAGAACGCCGGGTTTCTACTTTCAGAGACCACGGGTAAACAAATTCGTAGGATTTCGACTCTCTAAAATCATGACGACCGCGCTCTATAGTCGACTGTGTATTGTAGAGAAAAAGCCAAGTAAGTTGCGGGTTTCCCCCGGGCACGTGTACCGGGTTTACGTTTAAACGGATTGACGGGACTATAGATAACATTGTTTAGACCACGCATCTTAGTGAGTGAGTCGATTTTCTATTGTTGGATAGATGCCCGGACTAGAACGGTGATTAACCCCCGGCCATTCTGGCGTATGTGATGGCGCGGGGGAATTTTCACCAGATTCGCTTTCTGGTCCCCGCCCTTCGGGTTCCGACTTGTAGAGAGTCAACAGGTACCCCTAGGCGTAATTCTAAGTGGGGAACTAAGAGGTTTTCGTATTCCTCTCCATTACGTCATCGGATAAGCACAATGTAATGTGTAATAGCTCCCTTTAGTTGGAGTGTAGCCATATCAAAACTGAGCACGATAACTGACCTTCCTCTTTAATGATCGGGGCAACTACAATAACATGGGACCGAATTAAAGATAGGCTGAGCTAGCCTGATGTACCTACGCGCATTTGGAAATAGATCGCTCAACTTCCCGGCCCTAAGCAACTACCAGACGTTTGTGGTACGATGATCGGGAGTCCCCGGCGGAGGCATCGTCACGAAAACTGTCTGTTCATGCGGACAGTAAATCCCTATCTATGATGTTCGCGGGCTCATCTACCAGAAACGTCCCCCGGACCCGAACAGTTTCGGTGTAAATACTTGCTAGCCGGGTCCGAGGCTTCCGTGCCGCACAGTACTTTAGTCGGAGGCAGGTGACTGTTAGATTGTGCGAGCGCTCAAAGCGTGATATAGTTACGGGGGGTCTAATGTCAGCTGTGACAAAGTCCTGTCTGTCCGTTTTTGCCCAGACACGCTCGTCTCTCCATGCTCTTTTACAGTGCTGGAAATGGGAGCTTCTGGTGCATTGAAACGAACCTTTTAGCAGATGTGGCATACCTCTTGGTACAGACGAAGTATAGACTACACAAAAGCGCTCGAGGCAAGTTTTGCGTAATCGGCCCGTTTCGCGCAGGTTGGAAACCTATACTGCACACTCCAATTAGTTAATTCACACTAGATTTTGGATCTTACGTCTCAGCGTTCCACTTGCGAATGAACATCCAGTCGATTAACTACCAGCTATACTTCCCTTAGATTAATTGTGACCAAATTAACGCATGTTGATCCGAACATCTGCTGTTGACTCGCTGTAGTCTTACGTGCTAAAATTTCGTCACCCCAACAGCATCCTTATGGTTTAAAGGTAGCCCCATCAACGGAATATTGCAAACTACTAACGTGGTTGGCTCGAAGATTTTTACCTTTGGTTATGCCTCGGACCTGCCGTGCTGTATATAGGTCCCAGAGTCACACATGGATGTACTACTTTCATAATACCATGGTGAACCACGATTATACTAGGTATGCGTGGAGTGATCAGTTTGTGCTACGACCGAGCGGAGCGGGCGTGACATGGGCGGTGTACGACGACCTTCTGGAAAGGACCCCCTGGTTCAGTTCAGCAACAGGCGGACTTGTCTATAAACACGTCTTAAGGAGTAAAGTAGAGGCGGTTTGCTAGGTGCAGATTGTTTGGCCACGCTGCCTGAGCGTAATGTTAACGAATTATTGGACAGTTAGGGCGTTCGAGTGGGATGTTAATTTGGCCTCTTGCACCGGAACCGTTAGCCGGACCACCGGCTTCTGTGGGGGTCCGTAGGAACACAGGTGCCTAGAGCGCAGCAGGCTGTGCCTGTGGGCCCGCCAATGAAGCCAAATCAGATTTCGATGAGCATGATCAGCACACGCCAGTGCGATACCGTTCTCTCGGTAGGGCGTCCCAATGGGAAGAGCGCAGTGAATCTTGTGGGTGCGGACTATAACGGATCCGTGATTGCCTCCGGCGGAAATGAAGTCTGTGGTAGTGTTTTAATAAATATAATGCGCCAACGTGGACTCTCGGATAACGTCGGTGTATCCGGCATATCCAGACACCGTTGAACAATTTAAATCGTCTGCCCGTGCTCTAGAGTCCTCATAGTCTAAGTGTAGTCTATTCCCTCTAAAATTTTTGAGATTAGTAAGTCTTGACTCTCAGGTCTTGTTCTTAAGCGGGGCAATGCTCGACTAAATAACACGATCACGCCCACGCCCGTAGCCAGTGCCTCAATCTATGCGCACGTGAGGTAACTGCGGACGGCGTAACGCGTTCTGGATATTATATCCTTGCCGTGGACCCGGTCAGAAGCAGGGTCGGAAGACCCCAGTACTGCCGGCCCGGGGCGTACAAAGTGTCTGTGCTTTGCGCCGGGGGACGGTAAGGCTTTGTGATCCACCGTCAGTTCCGCTAATAGTGTTCACTGGTAATACCTAGGGTAGCAACGCACAGATTCGAGAATTAAGCACATAGGTTTGACCGACATGGTGGGCGCACACGACCGAAGTAGATGTACGATGTTGACGTTTACGAACACTATCTAATTCTCTGCAATATGTAGGAATGCAGCCCAAAATTTACATTGAGTAGCCGGACGAAGCTCTGTGATCTAGGGCATAATATTGATAGCACTCCAAGACTGTCAACTTATAGATGAGCGAGAACACTGGTCCTGAGCCCCAAGGGAAAGCTCTGCAAGAGGCACCTCCTTGACCAAGGAGCCCATTGGTAGCCGGTTACATAGAGTCCGCAAATCTCGAAACCGGTGTCCAGATGAGAGATAAAGAACTTGTTGTTGGCTGAGTGGGCAGAAAACACCGATGCTGCGCTTGCGGACGGCATGCAAGCCACTCAGGCTCCCGTTGAAACACGCGCCCCTTCTGTCAAGCC'
b = 'ATTCCCCGGTAAAAGCAAAATTCTGTAGTCAGACGCCCGTACCACGTAGATGAGGATCTGCCCCCGTCATGAGGGCGACCACAATAGTAGGCGGCCACTGTCGGCAGTTACAAGAGCAGCGGCGGATTTCATCGGTACAATCGCGGTTGCGCCGCCCGCCTAGTAAGCATTGCGCTCGAGTCCCTGTAATTCTCAGGCTAGGCAAGCACACAGATATGTGAGTGTATCGTCATAATGTGCTCGTTTAAGAGCTCATAATTTGTTAACTGTATGGCCGCATTTTCAAGATGGATCGTGCGCCCAATAACAATCCGGCAGTAAGAGTTCGCTCCAGGAGATTGCCTGGCGCGCTAGTACTACGGTAGAGACGTCAAACACACGAGCGGGGCGAGGATTTTGGCTTGGGTTCTAGCTACCCTAAACCAGCGGGTATCGGCCCAAGGGCTTTTCTCTCCTGTTTCCAGCGAAATTAGGGGTACGGTGGCAATCACTGAAATCACCCCCGATGTGTAGTGACGCTATACCTAAGTCTAAGAGAATCTGGGGCAAGTGTCTAAGGACTGCTGGCTCCAAAGAGCCTGGATACACCGCTGTACCTTTGAGACGTTGGTGGTATCTAATGCATAACGCGCTCGGAGCGAACGCCTCCACAACGGTGCGTTTCCCGTATCGTATAGAGGATAAGGATATCGTCTCTATTAAAGCCCCAATGGGGTGCAACAGCAACGAATAAGTCTGTATTTAGAAGAGCGAAAAAATTCGCAACCTTTAGAAATCCTTAGGCCTACGGCTATCCACGCATTGGTCGGGGCACTAAAGTCATAGATTCACCACGATCCCCCAGTTTCGGATAGGAATTGCGTCGTGGGCGTGATGAGTCTACGTCAACCAAGCCAGCTACATGCTCAGTGCAGCGATAATTTGCCTGGTGAGGATAATACAAATCGCATTGCGGAATGCTTTCTAACCTACGTTAGGTCGGTAAGTTGCCATGCTTGTCGGTGTTGTACTACTCTCAGACTTAAGATGTTCAAATGGACGGTATCCCATGGGTAGCCAGTGATGTGGATGTCGTCGGGCCTATAAAGGGCGTGGCGTGGCCCCTAACCAATCCACTAGTCCGGCGTAAGCAGCACCCTCACGGGGGAAAGACCGAGCATCTCGAGCAGTTCGTTCCACGATATTATCAGGTGACTTCCTATCTAGGTAACAGTCTCCAGCTGCAGACGCCTATTGGTACCCCAACGGCGCGGAGTTTAATAAGTGACCTAGCATCCTGGGTCTCCCGGTATACAAAGTCGAACGTCGACGTAGTCCCCCCCGTGACGCTATGAGGCTGGTCGGGAGGTTTCCGTAGACCGAGAGTGATACACACCAAAGGTTCACAATGTTGTTTGATATGCAAACGTCACCTTAATGGATAAGAACAGTGCTCATCAGAAGTCAGAATACAGATATCAATAAGAACGGCCTCACGGCCGTTAGAGCTATTGAACGAAAGAGCGCCTCCAACACATAGGATCGGCCAATCGACAGTCTAACCTGGCACATTGTGGGCCGCGGCAAAGCGTCCGCGACAGCGTTGTAGGTCCGGAGCCTTATCTAGAAGTGAGATCAGCACAACATCTCATAGTATCCCATGGTTTCCCCTTCGGAAGTATATGGAGAGGTGACATGTAACCGAGGGTCATCAACGGACTAGGTGATGATTCTTTCCCGTGAAACCCCATCTGATGCTGTGTCTGCGGGACTTGGATTGTACGGAACGACTCGGCTTGTTGAATCATCGAATTTGGGGAGGGGTGCGTATGTGGTGGGTTTGACAACTAGCTCCTTCCCTCGATCTTGAAGTCAGTTGTCTGAACGTTTTAGACATATGACATCCCGACCGCAGGGAAGAATCGTCTGATAGGGGATTGGAGCTTGATAGTAAGTGGCCCCTAATACGGCTGTATAGGTCGTGAAGGTCAAGGTAACGGTGACATATAAACACTCCCCTGATTGTAGTATAGCACATCGAGTTGTACACGAGTACACCTCTTATTGACCAATGGGCATCGCTTCCGCTACGGCCCCAAAGGGTGGCCGCCTGAGCCCTGGGCTAACGACGATGAAATAATCAGCTTGTCTCGTGTGCAGGCATAGATTAGGTGACCCAAGGAGCAATAGCCATTGGCTCATCTTAGGGGTTCAACGGACAAATTTTGTCGAGAAGACACAATCCCGGATCAGTTGAGTGGCAATGCGAGATCTTCCTCCTCTTAATTACGTCCCTTATCGTTTGGCTTCTGGAAGACGCGATAACCACGGAATGCAGTGTAGCATTCTTTGCGCGCAAAGGTTCAGATGGAACTACAAAGCCGCATGGTTATTCCCCGTCGAAGAAGTCGCACCTTACTTTTGCACTCTTCGTGCTCGAGACCAAAAGTGCGAATTTAGGGGCTCCTAATGGGTCGCCTAGACTTGTATCGACTACACACATGCCTTCATAACTGTGTGATTTATAGAGCTAACACACTTGCGAGTTGCACGTTGAGCTCCCGCGTAAGTGTTCTCTACAGCCCGATGCCTGCTTGTCTTCTGTCCGCGGCTTGTTGCAGGTCTGACCGCCATGTTCGTTCCCTACTAACACACAGACGGTGTATAGCCCTATCATATCGATCACTTAGGACTGTGTTTTATCGGAACCGAGGTGACCATGCGTAGTCGACTAGTTGAGCGAGAATGCCCAGACGTTAAAAGGACAACGTCGCGAATGGCGTTGCGCGGGTATCCAATCGCGTAGCCAGAGGTAACCAAGTGACCCGATTAACTCCGACATAACACTAGATTGAGCGGTCAGGACTTTCGCCAAGAAGCGTCAATGGGTGGGCGATGGCGTATACTCGCTCTAACCTCGGAGAGAAGCCATTCAGCATTACCAGAGAACTGTGGCGATAGGTAACCGCTTTCTCAACTATTATACATTCGGTGCACCCTGCTTTAGGCATGTTCGAATCGCTGGGAACACAAGCCATCCAACTTCGCAGTCTGGGTAGCCGCGGACGTTTTAACGGTCGTACAGTTGAGCGCCGCGGTATTGGTACGTTCAGGCTAATTCGATCTCCGTTGGCGCCCGGTGCCGTGAGACTTGAATGAATGCCCTATGGAATGTGGTAGACATTGGTAGCTGAGTATTGCGAAGTCCGCGAATAGGGCCATACCAGGTTAGTAAATGTAACCAAAGTAAGGCTTGGTGTTGCTGACCGAGTTTTTCGTTTTAGTAGATATCGTAACACGACTGACGATGCCCTGCTTACCCAATTTAGTTGACGTTAGATAAGATTGATTCGATAGAAACCTTTCCTGTTTCTTAATTTCTTATCATACTATATTGCGGACGGGTGTAACATAGCGGTTTAAAGGTCGACAGCTGAAGCCTTGTTAGGCTGATTATGCCTCCTCACCACTCAATCTAACGAATTTTCCAAAGCGGTCGCTTTAAACTAACGGCGAGACTATGCGTGCTCGGTCATTTCCGCCTACGCGAAATCACGTCGGAGCGCGGACCCGGCGGGTAGTGACGGCGAACTCTACTCCCCCGAAATCAACCAAGGCACGTTGCGACCACTTTTGGGGTGGAGCCGCCCACGCAATCCCTCAACCGATATTTGGAGTAGCGGACGCCCCGGGTATCCTAGGTCTGGAGCACCTATCCGTACAGCGATGGCTGCCTACCAGTACTGCAAAGAGACCTACATCGTTTCTAAGAACCTCTTACACGTCCCGATGCACAGCATGACTCCCCTGGCTGATTCCCACCCTTGTGGGACCGCTTAGATGTTAACGGTTCTCAATGTGCATACTAGACCCAGGGGCCGGTACAAGAACCCATAAGGAGAACACATCTATCAACGGTCGTGTATAACAAGGCAGCGTTGTTTTGGATATCAGATGTAGATACCATCGTACTGTACCACCCTAGAGGTGATTCCTGGAGCGAGTGTCGACGGACGGACCACGCTATGCGCCTTCCCCATCAAGATAAAGTGGGTAGCTTCGACTGACTGTGATTGAACAGATTTTTGGAGGTCCAAAGACAGCTTCGGCGCGTGCGCACGCCGAAGCACAACAGTACCTACTTGATGAGCTTGTTTCGGTTGGTAAGGTGGACCAGTACGATCCGACGTTTGTGAAATTTTACGGACAACCCCAAGAAGTCGGGAGCATGACCTTGATCTTGGGCTTGACCACGCATTGTCAGAGAACAGTCTACCTATACTAGCCACCTGTACGGTCCTTAATTATCACTATTACCATACGCTATTTTGAAACCCATCTCATTTCCTTCCGTTTCCACTCATCATGTGAGATATAAACGTACCAGGACCGCGTGCCGATGCGACGGATTCGGTGGTGTCCGTTGGCCAAACGGCCTCTCTACACAGTAGGGATACATCATCCGCCGTGTAATAGCAGGGGTGGCCGGACTAGCTTATGCACTGATAGCAAACTTTAGACCGGTCCGAATGGTCGTGGCTCGGTCTCGGTCCGCTGATATTGCTTGAGTCCCCGCTGACGTGATTCTAACTGAATGTATCGCGGTTAATGTACGCGCTTAGCTACCCCAAAGGCCATAGGCCGCGGCGACCGGTATTGCAGATTTTATAGCTCTAGTTCGTTAGTCAAGGGTCCGGATTGTTCCCATTTGCGACTGAAAAGCGCTTTAAGTGGCTACTACCCTTGTGACTAGCTAGCAGACTCTCGGCACTACGTGTATAATAACGCAACAAGTCTGTTCCACGAGAAAGCTGGAGGCAGGAATTTGTGTGAAAAGTCGTCTTTAGAACTTAGACCATGATACCGAACCTGGGGCTGTCGCTCCGTCAGCGGCGCAATTACCGTCACGGCCAGACGGTCCACCCCTATTGTAAATATAGCTTCTTGCCTGCTGATGATCGTTAGATGAGCCACCTGGCATTAAAGCCAACGGCACAACTTACCATATGAATTCGCAGGTT'
In [47]:
kimura_2p_distance(a,b)
Out[47]:
2.6031087353225875
In [ ]:
# Kimura 80 distance
In [101]:
p = patterns[0]
In [90]:
patterns[0].attrs.keys()
Out[90]:
dict_keys(['features', 'align', 'cluster', 'stacked_seqlet_imp'])
In [102]:
simp = p.attrs['stacked_seqlet_imp']
In [151]:
def get_kimura_dist(p, verbose=True):
    """Get the Kimura distance for a certain pattern
    """
    from basepair.exp.te.dist import kimura_2p_distance
    simp = p.attrs['stacked_seqlet_imp']
    seqs = one_hot2string(simp.seq, DNA)
    consensus = p.get_consensus()
    distances = np.array([kimura_2p_distance(seq, consensus, verbose=verbose) for seq in seqs])
    return distances
In [147]:
kd = pd.concat([pd.DataFrame({"dist": get_kimura_dist(p, verbose=False),"pattern": p.name}) for p in patterns])
dfk = kd.groupby('pattern').dist.agg(['mean', 'std'])
dfk.columns = ["kimura_" + c for c in dfk.columns]
In [157]:
dfk
Out[157]:
kimura_mean kimura_std
pattern
metacluster_0/pattern_0 1.4588 1.4168
metacluster_0/pattern_1 1.5296 2.4358
metacluster_0/pattern_10 0.2687 0.2003
... ... ...
metacluster_6/pattern_9 1.5804 1.9648
metacluster_7/pattern_0 1.3900 1.2231
metacluster_7/pattern_1 1.2126 0.3488

122 rows × 2 columns

Merge all together

In [245]:
dfp = df.rename(columns=dict(name="pattern")).set_index("pattern")
In [246]:
dfp
Out[246]:
short_name seq_info_content n_pattern interval LTR_overlap_frac is_te
pattern
metacluster_0/pattern_0 m0_p0 15.3199 6569 2174 0.3309 False
metacluster_0/pattern_1 m0_p1 16.9463 975 394 0.4041 False
metacluster_0/pattern_2 m0_p2 9.7451 930 233 0.2505 False
... ... ... ... ... ... ...
metacluster_6/pattern_12 m6_p12 10.4935 79 1 0.0127 False
metacluster_7/pattern_0 m7_p0 14.2622 640 180 0.2812 False
metacluster_7/pattern_1 m7_p1 18.3106 158 25 0.1582 False

122 rows × 6 columns

In [ ]:
df_rm = dfi_top1.copy()
df_rm.index.name = 'pattern'
del df_rm['LTR_overlap_frac']
del df_rm['n_pattern']
In [191]:
dfti = dft.rename(columns=dict(name="pattern")).set_index("pattern")
In [270]:
dfpa = dfp.join(df_rm).join(dfk).join(dfti).join(dfl)
In [271]:
dfpa
Out[271]:
short_name seq_info_content n_pattern interval LTR_overlap_frac is_te repeat_name repeat_family n_repeat_name n_repeat_frac kimura_mean kimura_std cons_100bp_phastCons cons_100bp_phyloP cons_1kb_phastCons cons_1kb_phyloP cons_phastCons cons_phyloP ic length consensus n_seqlets
pattern
metacluster_0/pattern_0 m0_p0 15.3199 6569 2174 0.3309 False ORR1C2 LTR/ERVL-MaLR 170 0.0259 1.4588 1.4168 0.1895 0.2662 0.1488 0.1807 0.2339 0.3937 15.3199 15 TTTGCATAACAAAGG 13251
metacluster_0/pattern_1 m0_p1 16.9463 975 394 0.4041 False RLTR9E LTR/ERVK 130 0.1333 1.5296 2.4358 0.1781 0.2583 0.1363 0.1686 0.1961 0.2839 16.9463 22 TGGCAGAACAATGGAGTCATCT 1862
metacluster_0/pattern_2 m0_p2 9.7451 930 233 0.2505 False RMER16B2 LTR/ERVK 33 0.0355 1.9008 2.6584 0.1880 0.2691 0.1459 0.1775 0.1927 0.2922 9.7451 17 GAGCCATCAAGATCCAA 1647
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
metacluster_6/pattern_12 m6_p12 10.4935 79 1 0.0127 False RLTR9E LTR/ERVK 1 0.0127 1.2812 0.3043 0.3421 0.6196 0.2493 0.3746 0.3322 0.6014 10.4935 69 CCCCGGGGCGAGGCGCCGCGG... 94
metacluster_7/pattern_0 m7_p0 14.2622 640 180 0.2812 False RLTR11B LTR/ERVK 23 0.0359 1.3900 1.2231 0.1553 0.1857 0.1376 0.1537 0.1803 0.2457 14.2622 16 ATTTGCATAACAAAGG 833
metacluster_7/pattern_1 m7_p1 18.3106 158 25 0.1582 False RLTR17B_Mm LTR/ERVK 7 0.0443 1.2126 0.3488 0.1858 0.2171 0.1614 0.1994 0.2088 0.2830 18.3106 14 TATGCATATGCATA 192

122 rows × 22 columns

Columns:

  • length: trimmed pattern length
  • seq_info_content: sequence information content
  • kimura_mean: average Kimura distance from the consensus sequence
  • kumura_std: std of the Kimura distance from the consensus sequence
  • LTR_overlap_frac: fraction of the LTR's overlapping the seqlet
  • n_repeat_frac: fraction of the seqlets overlapping the most common LTR
  • cons_phyloP: PhyloP average conservation of the seqlet
  • cons_1kb_phyloP: PhyloP average conservation of the seqlet's 1kb window
  • cons_phastCons: same as for cons_phyloP but using phastCons
  • cons_1kb_phastCons: same as for cons_1kb_phyloP but using phastCons
  • repeat_name: Name of the most commonly overlaped LTR
  • repeat_family: Family of the most commonly overlaped LTR
In [278]:
dfpa['type'] = pd.Categorical(dfpa.is_te.map({False:"non-TE", True: "TE"}))

dfpa_subset = dfpa[['length', 
                    'consensus',
                    'n_seqlets',
                   'type',
                   'seq_info_content', 
                   'kimura_mean', 'kimura_std', 
                   'LTR_overlap_frac', 'n_repeat_frac', 
                   'repeat_name', 'repeat_family',                     
                   'cons_phyloP', 'cons_1kb_phyloP',
                   'cons_phastCons', 'cons_1kb_phastCons']]
In [275]:
sns.pairplot(dfpa[['length', 
                   'type',
                   'seq_info_content', 
                   'kimura_mean', 'kimura_std', 
                   'LTR_overlap_frac', 'n_repeat_frac', 
                   'cons_phyloP', 'cons_1kb_phyloP',
                   'cons_phastCons', 'cons_1kb_phastCons']], 
             hue='type',
             # ag_kws=dict(bins=20),
             height=get_figsize(.3, aspect=1)[0],
             plot_kws=dict(alpha=.5, s=20))
plt.savefig(f"{figures}/pairplot.conservation.pdf")
In [279]:
# save to csv
dfpa_subset.reset_index().to_csv(modisco_dir / 'pattern.conservation-prop.csv', index=False)