Analyze the patterns discovered by modisco¶

sequence information content of motifs
for long motifs, do they match known transposable elements?
align and cluster the motifs by their pairwise distance
export the aligned and clustered motifs into a table

# All required paths
%matplotlib inline
from basepair.config import get_data_dir
ddir = get_data_dir()
modisco_subdir = "modisco/all/deeplift/profile/"
model_dir = f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/"
modisco_dir = os.path.join(model_dir, modisco_subdir)
output_dir = modisco_dir
figures = f"{ddir}/figures/modisco/pattern_clustering"

Using TensorFlow backend.

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable, vdom_footprint
from basepair.plot.utils import strip_axis
from basepair.plot.tracks import tidy_motif_plot
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *

modisco_dir = Path(modisco_dir)
output_dir = Path(output_dir)
!mkdir -p {figures}

paper_config()

# Load modisco patterns
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
patterns = [mr.get_pattern(p) for p in mr.patterns()]

# patterns = [mr.get_pattern(p) for p in mr.patterns()]
pattern_table = pd.read_csv(output_dir / "pattern_table.csv")

tasks = [x.split("/")[0] for x in mr.tasks()]

footprints = read_pkl(output_dir / 'footprints.pkl')

# Add profile to patterns
patterns = [p.add_profile(footprints[p.name]) for p in patterns]

# Add features
patterns = [p.add_attr('features', OrderedDict(pattern_table[pattern_table.pattern == shorten_pattern(p.name)].iloc[0])) for p in patterns]

# check that the pattern names match
assert patterns[4].attrs['features']['pattern'] == shorten_pattern(patterns[4].name)

Pattern len distribution¶

df = patterns_to_df(patterns, properties=['name', 'short_name', 'seq_info_content'])

TF-MoDISco is using the TensorFlow backend.

fig = plt.figure(figsize=get_figsize(0.25))
df.seq_info_content.plot.hist(30)
plt.xlabel("Sequence IC")
fig.savefig(f"{figures}/../pattern_IC.hist.pdf")
fig.savefig(f"{figures}/../pattern_IC.hist.png")

Motif with the lowest ic:

patterns[df.seq_info_content.idxmin()].plot("seq_ic");
tidy_motif_plot()

Motif with the highest ic:

patterns[df.seq_info_content.idxmax()].plot("seq_ic");
tidy_motif_plot()

We see that many patterns have high information content. The question now is: are these motifs overlapping known transposable elements?

Overlapping seqlet regions with RepeatMasker regions¶

Load the repeat masker file¶

repeat_masker_file = f"{ddir}/raw/annotation/mm10/RepeatMasker/mm10.fa.out.gz"
!mkdir -p {ddir}/raw/annotation/mm10/RepeatMasker/
if not os.path.exists(repeat_masker_file):
    !wget http://www.repeatmasker.org/genomes/mm10/RepeatMasker-rm405-db20140131/mm10.fa.out.gz -O {repeat_masker_file}

def read_repeat_masker(file_path):    
    dfrm = pd.read_table(file_path, delim_whitespace=True, header=[1])

    dfrm.columns = [x.replace("\n", "_") for x in dfrm.columns]

    dfrm['name'] = dfrm['repeat'] + "//" + dfrm['class/family']

    dfrm = dfrm[['ins.', 'sequence', 'begin', 'name']]
    dfrm.columns = ['chrom', 'start', 'end', 'name']
    return dfrm

dfrm = read_repeat_masker(repeat_masker_file)

dfrm.head()

Intersect seqlet locations with repeat masker¶

from pybedtools import BedTool

bt_te = BedTool.from_dataframe(dfrm)
def intersect_repeat_masker(pattern, f=1.0):
    bt = BedTool(f"{modisco_dir}/seqlets/{pattern}.bedgz")
    try:
        dfint = bt.intersect(bt_te, wa=True, wb=True, f=f).to_dataframe()
    except:
        return None
    t = dfint.blockCount.str.split("//", expand=True)
    dfint['pattern_name'] = pattern
    dfint['repeat_name'] = t[0]
    dfint['repeat_family'] = t[1]
    dfint['n_pattern'] = bt.to_dataframe()[['chrom', 'start', 'end']].drop_duplicates().shape[0]
    dfint['interval'] = dfint['chrom'] + ":" + dfint['start'].astype(str) + "-" + dfint['end'].astype(str)
    return dfint[['chrom', 'start', 'end', 'interval', 'pattern_name', 'n_pattern', 'repeat_name', 'repeat_family']]

dfi_raw = pd.concat(Parallel(n_jobs=20)(delayed(intersect_repeat_masker)(pattern, f=0.9) for pattern in tqdm(mr.patterns())))

100%|██████████| 122/122 [00:41<00:00,  2.95it/s]

dfi = dfi_raw.copy()

# Restrict only to LTR/
dfi = dfi[dfi.repeat_family.str.startswith("LTR/")]

# Append some stats
unique_elements = dfi.groupby(['pattern_name']).interval.nunique()
dfiu = dfi[['pattern_name', 'n_pattern']].drop_duplicates().set_index('pattern_name').join(unique_elements)
dfiu['LTR_overlap_frac'] = dfiu.interval / dfiu.n_pattern
dfi = pd.merge(dfi, dfiu.reset_index()[['pattern_name', 'LTR_overlap_frac']], on='pattern_name', how='left')
dfi = dfi.drop_duplicates()

# Append properties
df = patterns_to_df(patterns, properties=['name', 'short_name', 'seq_info_content'])
df = df.set_index('name').join(dfiu).reset_index()

Scatter-plot: Seq IC ~ LTR_overlap_frac¶

TE_min_seq_IC = 50
TE_min_LTR_Overlap_frac = 0.7

height = get_figsize(0.25)[0]
fig = sns.jointplot("LTR_overlap_frac", "seq_info_content", marginal_kws=dict(bins=15), s=5, data=df, height=height);
fig.ax_joint.axvline(TE_min_LTR_Overlap_frac, linestyle='--', color='grey', alpha=.4);
fig.ax_joint.axhline(TE_min_seq_IC, linestyle='--', color='grey', alpha=.4);
fig.savefig(f"{figures}/TE-match,seq-IC.scatter.pdf")
fig.savefig(f"{figures}/TE-match,seq-IC.scatter.png")

# TE's are in the top right
pattern_te_names = df[(df.seq_info_content > TE_min_seq_IC) & (df.LTR_overlap_frac > TE_min_LTR_Overlap_frac)].sort_values('LTR_overlap_frac', ascending=False).name.unique()

We can see that motif with high IC are frequently in the region of known TE's

dfi.repeat_name.value_counts()

RLTR9E       1541
RLTR13D6      683
RLTR9D        598
             ... 
LTR16A2         1
MER21-int       1
LTR78           1
Name: repeat_name, Length: 410, dtype: int64

dfi.repeat_family.value_counts()

LTR/ERVK         8747
LTR/ERVL-MaLR    3315
LTR/ERVL          800
LTR/ERV1          579
LTR/Gypsy          22
LTR/ERV1?           2
LTR/ERVL?           2
Name: repeat_family, dtype: int64

For each pattern, are all seqlets falling into regions of the same LTRs?¶

dfi_top = dfi[dfi.pattern_name.isin(pattern_te_names)].drop_duplicates()
dfi_top['n_repeat_name'] = dfi_top.groupby(['pattern_name', 'repeat_name']).repeat_name.transform('size')
dfi_top['n_repeat_frac'] = dfi_top['n_repeat_name'] / dfi_top['n_pattern']
dfi_top1 = dfi_top.groupby(['pattern_name']).apply(lambda x: x.loc[x.n_repeat_frac.idxmax()])

del dfi_top1['chrom']
del dfi_top1['start']
del dfi_top1['end']
del dfi_top1['interval']
del dfi_top1['pattern_name']

print(dfi_top1.sort_values('n_repeat_frac', ascending=False).to_string())

                          n_pattern     repeat_name repeat_family  LTR_overlap_frac  n_repeat_name  n_repeat_frac
pattern_name                                                                                                     
metacluster_0/pattern_23         28         RLTR41C      LTR/ERV1            0.9286             26         0.9286
metacluster_0/pattern_18         53        RLTR13D6      LTR/ERVK            0.9811             47         0.8868
metacluster_5/pattern_11         35        RLTR13D6      LTR/ERVK            0.8857             31         0.8857
metacluster_0/pattern_16        101          RLTR9E      LTR/ERVK            1.0000             85         0.8416
metacluster_5/pattern_9          47        RLTR13D6      LTR/ERVK            0.9362             39         0.8298
metacluster_2/pattern_19         39           BGLII      LTR/ERVK            0.8718             31         0.7949
metacluster_0/pattern_24         50          RLTR9E      LTR/ERVK            0.9200             39         0.7800
metacluster_0/pattern_25         49          RLTR41      LTR/ERV1            0.8367             37         0.7551
metacluster_5/pattern_7          65        RLTR13D6      LTR/ERVK            0.7692             47         0.7231
metacluster_0/pattern_14         73          RLTR9D      LTR/ERVK            0.8767             52         0.7123
metacluster_0/pattern_10        114         RLTR9A3      LTR/ERVK            1.0000             80         0.7018
metacluster_2/pattern_15         60           BGLII      LTR/ERVK            0.7833             42         0.7000
metacluster_0/pattern_29         25        RLTR13D6      LTR/ERVK            0.8000             17         0.6800
metacluster_0/pattern_22         43          RLTR9D      LTR/ERVK            1.0000             24         0.5581
metacluster_5/pattern_2         410          RLTR9E      LTR/ERVK            0.7951            218         0.5317
metacluster_6/pattern_6         116          RLTR9E      LTR/ERVK            0.7672             61         0.5259
metacluster_0/pattern_28         33         RLTR9D2      LTR/ERVK            1.0000             17         0.5152
metacluster_3/pattern_9          35        RLTR13D6      LTR/ERVK            0.9429             16         0.4571
metacluster_5/pattern_3         339          RLTR9E      LTR/ERVK            0.9469            153         0.4513
metacluster_0/pattern_6         254          RLTR17      LTR/ERVK            0.8425            109         0.4291
metacluster_2/pattern_16         56        RLTR13D6      LTR/ERVK            0.8036             24         0.4286
metacluster_0/pattern_13        106         RMER10A      LTR/ERVL            0.8302             44         0.4151
metacluster_5/pattern_8          57          RLTR9D      LTR/ERVK            0.8246             23         0.4035
metacluster_6/pattern_5         118          RLTR9E      LTR/ERVK            0.9237             46         0.3898
metacluster_0/pattern_9         153          RLTR9E      LTR/ERVK            0.9281             57         0.3725
metacluster_0/pattern_3         354  MMERVK9C_I-int      LTR/ERVK            0.7429            127         0.3588
metacluster_0/pattern_21         49         MLTR25A      LTR/ERVK            0.8163             14         0.2857
metacluster_1/pattern_15         64        RLTR13C3      LTR/ERVK            0.8281             13         0.2031

Seems not, but for some of them it's pretty consistent.

Here is the result of querying these motif in RepBase: link

Save motifs to file¶

w,h = get_figsize(.5, aspect=1/12)

os.makedirs(f"{figures}/TE_motifs", exist_ok=True)
for pname in pattern_te_names:
    mr.plot_pattern(pname, kind='seq_ic', letter_width=w/70, height=h);
    tidy_motif_plot()
    plt.savefig(f"{figures}/TE_motifs/{shorten_pattern(pname)}.pdf")
    plt.savefig(f"{figures}/TE_motifs/{shorten_pattern(pname)}.png")
    plt.close()

os.makedirs(f"{figures}/non-TE_motifs", exist_ok=True)
for p in tqdm(patterns):
    pname = p.name
    if pname in pattern_te_names:
        continue
    mr.plot_pattern(pname, kind='seq_ic', letter_width=w/70, height=h, trim_frac=0.08);
    tidy_motif_plot()
    plt.savefig(f"{figures}/non-TE_motifs/{shorten_pattern(pname)}.pdf")
    plt.savefig(f"{figures}/non-TE_motifs/{shorten_pattern(pname)}.png")
    plt.close()

100%|██████████| 122/122 [01:08<00:00,  1.41it/s]

1. split into two groups: {TE, non-TE}¶

For further analysis, let's split the patterns into two groups: TE's and non-TE's:

patterns_te = [p for p in patterns if p.name in pattern_te_names]
patterns_nte = [p for p in patterns if p.name not in pattern_te_names]
assert len(patterns_te) + len(patterns_nte) == len(patterns)

# Fasta-sequences for the consensu
# for p in patterns_te:
#     print(f">{p.name}")
#     print(p.get_consensus())

print("# long motifs: ", len(patterns_te))
print("# short motifs:", len(patterns_nte))

# long motifs:  28
# short motifs: 94

2. cluster by seq-similarty, freeze groups¶

from basepair.exp.chipnexus.motif_clustering import similarity_matrix

sim_nte_seq = similarity_matrix(patterns_nte, track='seq_ic')
sim_te_seq = similarity_matrix(patterns_te, track='seq_ic')
sim_all_seq = similarity_matrix(patterns, track='seq_ic')

100%|██████████| 94/94 [00:27<00:00,  3.44it/s]
100%|██████████| 28/28 [00:02<00:00, 11.57it/s]
100%|██████████| 122/122 [00:45<00:00,  2.70it/s]

%tqdm_restart

non-TE¶

fig = plt.figure(figsize=get_figsize(0.25))
iu1 = np.triu_indices(len(sim_nte_seq), 1)
plt.hist(sim_nte_seq[iu1], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (non-TE)");
plt.ylabel("Frequency");
plt.savefig(f"{figures}/non-TE.similarity.hist.pdf")
plt.savefig(f"{figures}/non-TE.similarity.hist.png")

TE¶

fig = plt.figure(figsize=get_figsize(0.25))
iu2 = np.triu_indices(len(sim_te_seq), 1)
plt.hist(sim_te_seq[iu2], bins=100);
plt.title("Similarity (seq_ic)")
plt.ylabel("Frequency");
plt.xlabel("Similarity between two motifs (TE)");
plt.savefig(f"{figures}/TE.similarity.hist.pdf")
plt.savefig(f"{figures}/TE.similarity.hist.png")

All¶

fig = plt.figure(figsize=get_figsize(0.25))
iu_all = np.triu_indices(len(sim_all_seq), 1)
plt.hist(sim_all_seq[iu2], bins=100);
plt.title("Similarity (seq_ic)")
plt.ylabel("Frequency");
plt.xlabel("Similarity between two motifs (all)");
plt.savefig(f"{figures}/all.similarity.hist.pdf")
plt.savefig(f"{figures}/all.similarity.hist.png")

from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
from basepair.modisco.motif_clustering import to_colors

Cluster the motifs¶

non-TE¶

Report table

# Seq IC-based clustering
iu_nte = np.triu_indices(len(sim_nte_seq), 1)
lm_nte_seq = linkage(1-sim_nte_seq[iu_nte], 'ward', optimal_ordering=True)
clusters_nte_seq = cut_tree(lm_nte_seq, n_clusters=9)[:,0]

names = [shorten_pattern(x.name) for x in patterns_nte]

cm_nte_seq = sns.clustermap(pd.DataFrame(sim_nte_seq, index=names, columns=names),
                            row_linkage=lm_nte_seq,
                            col_linkage=lm_nte_seq,
                            col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_seq)), index=names)),
                            cmap='Blues',
                            figsize=get_figsize(0.5, aspect=1)
                           ).fig.suptitle('Seq-ic sim');
plt.savefig(f"{figures}/non-TE.heatmap.pdf")
plt.savefig(f"{figures}/non-TE.heatmap.png")

TE¶

Report table

iu_te = np.triu_indices(len(sim_te_seq), 1)
lm_te_seq = linkage(1-sim_te_seq[iu_te], 'ward', optimal_ordering=True)
clusters_te_seq = cut_tree(lm_te_seq, n_clusters=9)[:,0]
names = [shorten_pattern(x.name) for x in patterns_te]
rnames = [shorten_pattern(x.name).ljust(9) + " " + dfi_top1.loc[x.name].repeat_name for x in patterns_te]

df_anno = dfi_top1.loc[[x.name for x in patterns_te]][['LTR_overlap_frac', 'n_repeat_frac']]
df_anno.index = rnames
cm_te = sns.clustermap(pd.DataFrame(sim_te_seq, index=rnames, columns=names),
                        row_linkage=lm_te_seq,
                        col_linkage=lm_te_seq,
                        row_colors=to_colors(df_anno),
                        cmap='Blues',
                        yticklabels=True,
                        xticklabels=True,
                        figsize=get_figsize(0.7, aspect=1)
                       ).fig.suptitle('Seq IC sim');
plt.savefig(f"{figures}/TE.heatmap.pdf")
plt.savefig(f"{figures}/TE.heatmap.png")

All motifs¶

Report table

iu_all = np.triu_indices(len(sim_all_seq), 1)
lm_all_seq = linkage(1-sim_all_seq[iu_all], 'ward', optimal_ordering=True)
clusters_all_seq = cut_tree(lm_all_seq, n_clusters=9)[:,0]
names = [shorten_pattern(x.name) for x in patterns]
cm_all = sns.clustermap(pd.DataFrame(sim_all_seq, index=names, columns=names),
                        row_linkage=lm_all_seq,
                        col_linkage=lm_all_seq,
                        cmap='Blues',
                        figsize=get_figsize(0.5, aspect=1)
                       ).fig.suptitle('Seq IC sim');
plt.savefig(f"{figures}/all.heatmap.pdf")
plt.savefig(f"{figures}/all.heatmap.png")

3. Visualize the clusters¶

from basepair.modisco.motif_clustering import create_pattern_table, align_clustered_patterns

np.random.seed(42)

cluster_order = np.argsort(leaves_list(lm_nte_seq))
cluster = clusters_nte_seq

patterns_nte_clustered = align_clustered_patterns(patterns_nte, cluster_order, cluster, 
                                                  align_track='seq_ic',
                                                  metric='continousjaccard',
                                                  # don't shit the major patterns 
                                                  # by more than 15 when aligning
                                                  trials=20,
                                                  max_shift=15)
# add the major motif group
patterns_nte_clustered = [x.add_attr("pattern_group", 'nte') for x in patterns_nte_clustered]

100%|██████████| 94/94 [00:00<00:00, 263.66it/s]

cluster_order = np.argsort(leaves_list(lm_te_seq))
cluster = clusters_te_seq

patterns_te_clustered = align_clustered_patterns(patterns_te, cluster_order, cluster, 
                                                 align_track='seq_ic',
                                                 metric='continousjaccard',
                                                 # don't shit the major patterns 
                                                 # by more than 15 when aligning
                                                 trials=20,
                                                 max_shift=15)
patterns_te_clustered = [x.add_attr("pattern_group", 'te') for x in patterns_te_clustered]

100%|██████████| 28/28 [00:00<00:00, 258.91it/s]

cluster_order = np.argsort(leaves_list(lm_all_seq))
cluster = clusters_all_seq

patterns_all_clustered = align_clustered_patterns(patterns, cluster_order, cluster, 
                                                 align_track='seq_ic',
                                                 metric='continousjaccard',
                                                 # don't shit the major patterns 
                                                 # by more than 15 when aligning 
                                                 trials=20,
                                                 max_shift=15)
# patterns_te_clustered = [x.add_attr("pattern_group", 'te') for x in patterns_te_clustered]

100%|██████████| 122/122 [00:00<00:00, 270.46it/s]

Refining motif alignment¶

exclude TE's

write_pkl(patterns_all_clustered, output_dir / 'patterns.pkl')

Write a simple table¶

Patterns -> pd.DataFrame¶

pattern_table_nte_seq = create_pattern_table(patterns_nte_clustered,
                                             logo_len=50, 
                                             seqlogo_kwargs=dict(width=420),
                                             n_jobs=20,
                                             footprint_width=120,
                                             footprint_kwargs=dict(figsize=(3,1.5)))

100%|██████████| 94/94 [00:16<00:00,  5.28it/s]

pattern_table_te_seq = create_pattern_table(patterns_te_clustered,
                                            logo_len=70, 
                                            seqlogo_kwargs=dict(width=420),
                                            n_jobs=20,
                                            footprint_width=120,
                                            footprint_kwargs=dict(figsize=(3,1.5)))

100%|██████████| 28/28 [00:00<00:00, 305.77it/s]

pattern_table_all = create_pattern_table(patterns_all_clustered,
                                         logo_len=70, 
                                         seqlogo_kwargs=dict(width=420),
                                         n_jobs=20,
                                         footprint_width=120,
                                         footprint_kwargs=dict(figsize=(3,1.5)))

100%|██████████| 122/122 [00:21<00:00,  3.15it/s]

from basepair.modisco.table import write_modisco_table

def get_first_columns(df, cols):
    remaining_columns = [c for c in df.columns if c not in cols]
    return df[cols + remaining_columns]

(output_dir / 'motif_clustering').mkdir(exist_ok=True)

background_motifs = ['Essrb', 'Klf4', 'Nanog','Oct4','Oct4-Sox2', 'Sox2']

first_columns = ['pattern', 'cluster', 'n seqlets', 'logo_imp', 'logo_seq'] + [task+'/f' for task in tasks] + [t + '/d_p' for t in tasks] # + [m+'/odds' for m in background_motifs]

remove = [task + '/f' for task in tasks] + ['logo_imp', 'logo_seq']

pd.DataFrame -> .html, .csv¶

pattern_table_nte_seq['i'] = np.arange(len(pattern_table_nte_seq), dtype=int)
pattern_table_nte_seq = get_first_columns(pattern_table_nte_seq, ['i'] + first_columns)
write_modisco_table(pattern_table_nte_seq, output_dir / 'motif_clustering', report_url='../results.html', prefix='patterns_short', exclude_when_writing=remove)

pattern_table_te_seq['i'] = np.arange(len(pattern_table_te_seq), dtype=int)
pattern_table_te_seq = get_first_columns(pattern_table_te_seq, ['i'] + first_columns)
write_modisco_table(pattern_table_te_seq, output_dir / 'motif_clustering', report_url='../results.html', prefix='patterns_long', exclude_when_writing=remove)

pattern_table_all['i'] = np.arange(len(pattern_table_all), dtype=int)
pattern_table_all = get_first_columns(pattern_table_all, ['i'] + first_columns)
write_modisco_table(pattern_table_all, output_dir, report_url='results.html', prefix='pattern_table', exclude_when_writing=remove)

	chrom	start	end	name
14737	chr1	3000001	3000097	L1MdFanc_I//LINE/L1
27	chr1	3000098	3000123	(T)n//Simple_repeat
14737	chr1	3000124	3002128	L1MdFanc_I//LINE/L1
2853	chr1	3003148	3004054	L1MdFanc_I//LINE/L1
3936	chr1	3004041	3004206	L1_Rod//LINE/L1