Analyze the patterns discovered by modisco

  • sequence information content of motifs
  • for long motifs, do they match known transposable elements?
  • align and cluster the motifs by their pairwise distance
  • export the aligned and clustered motifs into a table
In [1]:
# All required paths
%matplotlib inline
from basepair.config import get_data_dir
ddir = get_data_dir()
modisco_subdir = "modisco/all/deeplift/profile/"
model_dir = f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/"
modisco_dir = os.path.join(model_dir, modisco_subdir)
output_dir = modisco_dir
figures = f"{ddir}/figures/modisco/pattern_clustering"
Using TensorFlow backend.
In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable, vdom_footprint
from basepair.plot.utils import strip_axis
from basepair.plot.tracks import tidy_motif_plot
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *

modisco_dir = Path(modisco_dir)
output_dir = Path(output_dir)
!mkdir -p {figures}

paper_config()
In [3]:
# Load modisco patterns
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
patterns = [mr.get_pattern(p) for p in mr.patterns()]

# patterns = [mr.get_pattern(p) for p in mr.patterns()]
pattern_table = pd.read_csv(output_dir / "pattern_table.csv")

tasks = [x.split("/")[0] for x in mr.tasks()]

footprints = read_pkl(output_dir / 'footprints.pkl')

# Add profile to patterns
patterns = [p.add_profile(footprints[p.name]) for p in patterns]

# Add features
patterns = [p.add_attr('features', OrderedDict(pattern_table[pattern_table.pattern == shorten_pattern(p.name)].iloc[0])) for p in patterns]

# check that the pattern names match
assert patterns[4].attrs['features']['pattern'] == shorten_pattern(patterns[4].name)

Pattern len distribution

In [4]:
df = patterns_to_df(patterns, properties=['name', 'short_name', 'seq_info_content'])
TF-MoDISco is using the TensorFlow backend.
In [5]:
fig = plt.figure(figsize=get_figsize(0.25))
df.seq_info_content.plot.hist(30)
plt.xlabel("Sequence IC")
fig.savefig(f"{figures}/../pattern_IC.hist.pdf")
fig.savefig(f"{figures}/../pattern_IC.hist.png")

Motif with the lowest ic:

In [9]:
patterns[df.seq_info_content.idxmin()].plot("seq_ic");
tidy_motif_plot()

Motif with the highest ic:

In [8]:
patterns[df.seq_info_content.idxmax()].plot("seq_ic");
tidy_motif_plot()

We see that many patterns have high information content. The question now is: are these motifs overlapping known transposable elements?

Overlapping seqlet regions with RepeatMasker regions

Load the repeat masker file

In [10]:
repeat_masker_file = f"{ddir}/raw/annotation/mm10/RepeatMasker/mm10.fa.out.gz"
!mkdir -p {ddir}/raw/annotation/mm10/RepeatMasker/
if not os.path.exists(repeat_masker_file):
    !wget http://www.repeatmasker.org/genomes/mm10/RepeatMasker-rm405-db20140131/mm10.fa.out.gz -O {repeat_masker_file}
In [11]:
def read_repeat_masker(file_path):    
    dfrm = pd.read_table(file_path, delim_whitespace=True, header=[1])

    dfrm.columns = [x.replace("\n", "_") for x in dfrm.columns]

    dfrm['name'] = dfrm['repeat'] + "//" + dfrm['class/family']

    dfrm = dfrm[['ins.', 'sequence', 'begin', 'name']]
    dfrm.columns = ['chrom', 'start', 'end', 'name']
    return dfrm
In [12]:
dfrm = read_repeat_masker(repeat_masker_file)
In [13]:
dfrm.head()
Out[13]:
chrom start end name
14737 chr1 3000001 3000097 L1MdFanc_I//LINE/L1
27 chr1 3000098 3000123 (T)n//Simple_repeat
14737 chr1 3000124 3002128 L1MdFanc_I//LINE/L1
2853 chr1 3003148 3004054 L1MdFanc_I//LINE/L1
3936 chr1 3004041 3004206 L1_Rod//LINE/L1

Intersect seqlet locations with repeat masker

In [14]:
from pybedtools import BedTool
In [15]:
bt_te = BedTool.from_dataframe(dfrm)
def intersect_repeat_masker(pattern, f=1.0):
    bt = BedTool(f"{modisco_dir}/seqlets/{pattern}.bedgz")
    try:
        dfint = bt.intersect(bt_te, wa=True, wb=True, f=f).to_dataframe()
    except:
        return None
    t = dfint.blockCount.str.split("//", expand=True)
    dfint['pattern_name'] = pattern
    dfint['repeat_name'] = t[0]
    dfint['repeat_family'] = t[1]
    dfint['n_pattern'] = bt.to_dataframe()[['chrom', 'start', 'end']].drop_duplicates().shape[0]
    dfint['interval'] = dfint['chrom'] + ":" + dfint['start'].astype(str) + "-" + dfint['end'].astype(str)
    return dfint[['chrom', 'start', 'end', 'interval', 'pattern_name', 'n_pattern', 'repeat_name', 'repeat_family']]
In [16]:
dfi_raw = pd.concat(Parallel(n_jobs=20)(delayed(intersect_repeat_masker)(pattern, f=0.9) for pattern in tqdm(mr.patterns())))
100%|██████████| 122/122 [00:41<00:00,  2.95it/s]
In [17]:
dfi = dfi_raw.copy()
In [18]:
# Restrict only to LTR/
dfi = dfi[dfi.repeat_family.str.startswith("LTR/")]
In [19]:
# Append some stats
unique_elements = dfi.groupby(['pattern_name']).interval.nunique()
dfiu = dfi[['pattern_name', 'n_pattern']].drop_duplicates().set_index('pattern_name').join(unique_elements)
dfiu['LTR_overlap_frac'] = dfiu.interval / dfiu.n_pattern
dfi = pd.merge(dfi, dfiu.reset_index()[['pattern_name', 'LTR_overlap_frac']], on='pattern_name', how='left')
dfi = dfi.drop_duplicates()
In [20]:
# Append properties
df = patterns_to_df(patterns, properties=['name', 'short_name', 'seq_info_content'])
df = df.set_index('name').join(dfiu).reset_index()

Scatter-plot: Seq IC ~ LTR_overlap_frac

In [21]:
TE_min_seq_IC = 50
TE_min_LTR_Overlap_frac = 0.7
In [22]:
height = get_figsize(0.25)[0]
fig = sns.jointplot("LTR_overlap_frac", "seq_info_content", marginal_kws=dict(bins=15), s=5, data=df, height=height);
fig.ax_joint.axvline(TE_min_LTR_Overlap_frac, linestyle='--', color='grey', alpha=.4);
fig.ax_joint.axhline(TE_min_seq_IC, linestyle='--', color='grey', alpha=.4);
fig.savefig(f"{figures}/TE-match,seq-IC.scatter.pdf")
fig.savefig(f"{figures}/TE-match,seq-IC.scatter.png")
In [23]:
# TE's are in the top right
pattern_te_names = df[(df.seq_info_content > TE_min_seq_IC) & (df.LTR_overlap_frac > TE_min_LTR_Overlap_frac)].sort_values('LTR_overlap_frac', ascending=False).name.unique()

We can see that motif with high IC are frequently in the region of known TE's

In [24]:
dfi.repeat_name.value_counts()
Out[24]:
RLTR9E       1541
RLTR13D6      683
RLTR9D        598
             ... 
LTR16A2         1
MER21-int       1
LTR78           1
Name: repeat_name, Length: 410, dtype: int64
In [25]:
dfi.repeat_family.value_counts()
Out[25]:
LTR/ERVK         8747
LTR/ERVL-MaLR    3315
LTR/ERVL          800
LTR/ERV1          579
LTR/Gypsy          22
LTR/ERV1?           2
LTR/ERVL?           2
Name: repeat_family, dtype: int64

For each pattern, are all seqlets falling into regions of the same LTRs?

In [26]:
dfi_top = dfi[dfi.pattern_name.isin(pattern_te_names)].drop_duplicates()
dfi_top['n_repeat_name'] = dfi_top.groupby(['pattern_name', 'repeat_name']).repeat_name.transform('size')
dfi_top['n_repeat_frac'] = dfi_top['n_repeat_name'] / dfi_top['n_pattern']
dfi_top1 = dfi_top.groupby(['pattern_name']).apply(lambda x: x.loc[x.n_repeat_frac.idxmax()])
In [27]:
del dfi_top1['chrom']
del dfi_top1['start']
del dfi_top1['end']
del dfi_top1['interval']
del dfi_top1['pattern_name']
In [28]:
print(dfi_top1.sort_values('n_repeat_frac', ascending=False).to_string())
                          n_pattern     repeat_name repeat_family  LTR_overlap_frac  n_repeat_name  n_repeat_frac
pattern_name                                                                                                     
metacluster_0/pattern_23         28         RLTR41C      LTR/ERV1            0.9286             26         0.9286
metacluster_0/pattern_18         53        RLTR13D6      LTR/ERVK            0.9811             47         0.8868
metacluster_5/pattern_11         35        RLTR13D6      LTR/ERVK            0.8857             31         0.8857
metacluster_0/pattern_16        101          RLTR9E      LTR/ERVK            1.0000             85         0.8416
metacluster_5/pattern_9          47        RLTR13D6      LTR/ERVK            0.9362             39         0.8298
metacluster_2/pattern_19         39           BGLII      LTR/ERVK            0.8718             31         0.7949
metacluster_0/pattern_24         50          RLTR9E      LTR/ERVK            0.9200             39         0.7800
metacluster_0/pattern_25         49          RLTR41      LTR/ERV1            0.8367             37         0.7551
metacluster_5/pattern_7          65        RLTR13D6      LTR/ERVK            0.7692             47         0.7231
metacluster_0/pattern_14         73          RLTR9D      LTR/ERVK            0.8767             52         0.7123
metacluster_0/pattern_10        114         RLTR9A3      LTR/ERVK            1.0000             80         0.7018
metacluster_2/pattern_15         60           BGLII      LTR/ERVK            0.7833             42         0.7000
metacluster_0/pattern_29         25        RLTR13D6      LTR/ERVK            0.8000             17         0.6800
metacluster_0/pattern_22         43          RLTR9D      LTR/ERVK            1.0000             24         0.5581
metacluster_5/pattern_2         410          RLTR9E      LTR/ERVK            0.7951            218         0.5317
metacluster_6/pattern_6         116          RLTR9E      LTR/ERVK            0.7672             61         0.5259
metacluster_0/pattern_28         33         RLTR9D2      LTR/ERVK            1.0000             17         0.5152
metacluster_3/pattern_9          35        RLTR13D6      LTR/ERVK            0.9429             16         0.4571
metacluster_5/pattern_3         339          RLTR9E      LTR/ERVK            0.9469            153         0.4513
metacluster_0/pattern_6         254          RLTR17      LTR/ERVK            0.8425            109         0.4291
metacluster_2/pattern_16         56        RLTR13D6      LTR/ERVK            0.8036             24         0.4286
metacluster_0/pattern_13        106         RMER10A      LTR/ERVL            0.8302             44         0.4151
metacluster_5/pattern_8          57          RLTR9D      LTR/ERVK            0.8246             23         0.4035
metacluster_6/pattern_5         118          RLTR9E      LTR/ERVK            0.9237             46         0.3898
metacluster_0/pattern_9         153          RLTR9E      LTR/ERVK            0.9281             57         0.3725
metacluster_0/pattern_3         354  MMERVK9C_I-int      LTR/ERVK            0.7429            127         0.3588
metacluster_0/pattern_21         49         MLTR25A      LTR/ERVK            0.8163             14         0.2857
metacluster_1/pattern_15         64        RLTR13C3      LTR/ERVK            0.8281             13         0.2031

Seems not, but for some of them it's pretty consistent.

Here is the result of querying these motif in RepBase: link

Save motifs to file

In [29]:
w,h = get_figsize(.5, aspect=1/12)
In [55]:
os.makedirs(f"{figures}/TE_motifs", exist_ok=True)
for pname in pattern_te_names:
    mr.plot_pattern(pname, kind='seq_ic', letter_width=w/70, height=h);
    tidy_motif_plot()
    plt.savefig(f"{figures}/TE_motifs/{shorten_pattern(pname)}.pdf")
    plt.savefig(f"{figures}/TE_motifs/{shorten_pattern(pname)}.png")
    plt.close()
In [56]:
os.makedirs(f"{figures}/non-TE_motifs", exist_ok=True)
for p in tqdm(patterns):
    pname = p.name
    if pname in pattern_te_names:
        continue
    mr.plot_pattern(pname, kind='seq_ic', letter_width=w/70, height=h, trim_frac=0.08);
    tidy_motif_plot()
    plt.savefig(f"{figures}/non-TE_motifs/{shorten_pattern(pname)}.pdf")
    plt.savefig(f"{figures}/non-TE_motifs/{shorten_pattern(pname)}.png")
    plt.close()
100%|██████████| 122/122 [01:08<00:00,  1.41it/s]

1. split into two groups: {TE, non-TE}

For further analysis, let's split the patterns into two groups: TE's and non-TE's:

In [30]:
patterns_te = [p for p in patterns if p.name in pattern_te_names]
patterns_nte = [p for p in patterns if p.name not in pattern_te_names]
assert len(patterns_te) + len(patterns_nte) == len(patterns)
In [31]:
# Fasta-sequences for the consensu
# for p in patterns_te:
#     print(f">{p.name}")
#     print(p.get_consensus())
In [32]:
print("# long motifs: ", len(patterns_te))
print("# short motifs:", len(patterns_nte))
# long motifs:  28
# short motifs: 94

2. cluster by seq-similarty, freeze groups

In [33]:
from basepair.exp.chipnexus.motif_clustering import similarity_matrix
In [34]:
sim_nte_seq = similarity_matrix(patterns_nte, track='seq_ic')
sim_te_seq = similarity_matrix(patterns_te, track='seq_ic')
sim_all_seq = similarity_matrix(patterns, track='seq_ic')
100%|██████████| 94/94 [00:27<00:00,  3.44it/s]
100%|██████████| 28/28 [00:02<00:00, 11.57it/s]
100%|██████████| 122/122 [00:45<00:00,  2.70it/s]
In [62]:
%tqdm_restart

non-TE

In [35]:
fig = plt.figure(figsize=get_figsize(0.25))
iu1 = np.triu_indices(len(sim_nte_seq), 1)
plt.hist(sim_nte_seq[iu1], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (non-TE)");
plt.ylabel("Frequency");
plt.savefig(f"{figures}/non-TE.similarity.hist.pdf")
plt.savefig(f"{figures}/non-TE.similarity.hist.png")

TE

In [36]:
fig = plt.figure(figsize=get_figsize(0.25))
iu2 = np.triu_indices(len(sim_te_seq), 1)
plt.hist(sim_te_seq[iu2], bins=100);
plt.title("Similarity (seq_ic)")
plt.ylabel("Frequency");
plt.xlabel("Similarity between two motifs (TE)");
plt.savefig(f"{figures}/TE.similarity.hist.pdf")
plt.savefig(f"{figures}/TE.similarity.hist.png")

All

In [37]:
fig = plt.figure(figsize=get_figsize(0.25))
iu_all = np.triu_indices(len(sim_all_seq), 1)
plt.hist(sim_all_seq[iu2], bins=100);
plt.title("Similarity (seq_ic)")
plt.ylabel("Frequency");
plt.xlabel("Similarity between two motifs (all)");
plt.savefig(f"{figures}/all.similarity.hist.pdf")
plt.savefig(f"{figures}/all.similarity.hist.png")
In [38]:
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
from basepair.modisco.motif_clustering import to_colors

Cluster the motifs

In [36]:
# Seq IC-based clustering
iu_nte = np.triu_indices(len(sim_nte_seq), 1)
lm_nte_seq = linkage(1-sim_nte_seq[iu_nte], 'ward', optimal_ordering=True)
clusters_nte_seq = cut_tree(lm_nte_seq, n_clusters=9)[:,0]

names = [shorten_pattern(x.name) for x in patterns_nte]

cm_nte_seq = sns.clustermap(pd.DataFrame(sim_nte_seq, index=names, columns=names),
                            row_linkage=lm_nte_seq,
                            col_linkage=lm_nte_seq,
                            col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_seq)), index=names)),
                            cmap='Blues',
                            figsize=get_figsize(0.5, aspect=1)
                           ).fig.suptitle('Seq-ic sim');
plt.savefig(f"{figures}/non-TE.heatmap.pdf")
plt.savefig(f"{figures}/non-TE.heatmap.png")
In [39]:
iu_te = np.triu_indices(len(sim_te_seq), 1)
lm_te_seq = linkage(1-sim_te_seq[iu_te], 'ward', optimal_ordering=True)
clusters_te_seq = cut_tree(lm_te_seq, n_clusters=9)[:,0]
names = [shorten_pattern(x.name) for x in patterns_te]
rnames = [shorten_pattern(x.name).ljust(9) + " " + dfi_top1.loc[x.name].repeat_name for x in patterns_te]

df_anno = dfi_top1.loc[[x.name for x in patterns_te]][['LTR_overlap_frac', 'n_repeat_frac']]
df_anno.index = rnames
cm_te = sns.clustermap(pd.DataFrame(sim_te_seq, index=rnames, columns=names),
                        row_linkage=lm_te_seq,
                        col_linkage=lm_te_seq,
                        row_colors=to_colors(df_anno),
                        cmap='Blues',
                        yticklabels=True,
                        xticklabels=True,
                        figsize=get_figsize(0.7, aspect=1)
                       ).fig.suptitle('Seq IC sim');
plt.savefig(f"{figures}/TE.heatmap.pdf")
plt.savefig(f"{figures}/TE.heatmap.png")

All motifs

Report table

In [43]:
iu_all = np.triu_indices(len(sim_all_seq), 1)
lm_all_seq = linkage(1-sim_all_seq[iu_all], 'ward', optimal_ordering=True)
clusters_all_seq = cut_tree(lm_all_seq, n_clusters=9)[:,0]
names = [shorten_pattern(x.name) for x in patterns]
cm_all = sns.clustermap(pd.DataFrame(sim_all_seq, index=names, columns=names),
                        row_linkage=lm_all_seq,
                        col_linkage=lm_all_seq,
                        cmap='Blues',
                        figsize=get_figsize(0.5, aspect=1)
                       ).fig.suptitle('Seq IC sim');
plt.savefig(f"{figures}/all.heatmap.pdf")
plt.savefig(f"{figures}/all.heatmap.png")

3. Visualize the clusters

In [44]:
from basepair.modisco.motif_clustering import create_pattern_table, align_clustered_patterns
In [45]:
np.random.seed(42)
In [46]:
cluster_order = np.argsort(leaves_list(lm_nte_seq))
cluster = clusters_nte_seq

patterns_nte_clustered = align_clustered_patterns(patterns_nte, cluster_order, cluster, 
                                                  align_track='seq_ic',
                                                  metric='continousjaccard',
                                                  # don't shit the major patterns 
                                                  # by more than 15 when aligning
                                                  trials=20,
                                                  max_shift=15)
# add the major motif group
patterns_nte_clustered = [x.add_attr("pattern_group", 'nte') for x in patterns_nte_clustered]
100%|██████████| 94/94 [00:00<00:00, 263.66it/s]
In [47]:
cluster_order = np.argsort(leaves_list(lm_te_seq))
cluster = clusters_te_seq

patterns_te_clustered = align_clustered_patterns(patterns_te, cluster_order, cluster, 
                                                 align_track='seq_ic',
                                                 metric='continousjaccard',
                                                 # don't shit the major patterns 
                                                 # by more than 15 when aligning
                                                 trials=20,
                                                 max_shift=15)
patterns_te_clustered = [x.add_attr("pattern_group", 'te') for x in patterns_te_clustered]
100%|██████████| 28/28 [00:00<00:00, 258.91it/s]
In [48]:
cluster_order = np.argsort(leaves_list(lm_all_seq))
cluster = clusters_all_seq

patterns_all_clustered = align_clustered_patterns(patterns, cluster_order, cluster, 
                                                 align_track='seq_ic',
                                                 metric='continousjaccard',
                                                 # don't shit the major patterns 
                                                 # by more than 15 when aligning 
                                                 trials=20,
                                                 max_shift=15)
# patterns_te_clustered = [x.add_attr("pattern_group", 'te') for x in patterns_te_clustered]
100%|██████████| 122/122 [00:00<00:00, 270.46it/s]

Refining motif alignment

  • exclude TE's
In [51]:
write_pkl(patterns_all_clustered, output_dir / 'patterns.pkl')

Write a simple table

Patterns -> pd.DataFrame

In [52]:
pattern_table_nte_seq = create_pattern_table(patterns_nte_clustered,
                                             logo_len=50, 
                                             seqlogo_kwargs=dict(width=420),
                                             n_jobs=20,
                                             footprint_width=120,
                                             footprint_kwargs=dict(figsize=(3,1.5)))
100%|██████████| 94/94 [00:16<00:00,  5.28it/s]
In [53]:
pattern_table_te_seq = create_pattern_table(patterns_te_clustered,
                                            logo_len=70, 
                                            seqlogo_kwargs=dict(width=420),
                                            n_jobs=20,
                                            footprint_width=120,
                                            footprint_kwargs=dict(figsize=(3,1.5)))
100%|██████████| 28/28 [00:00<00:00, 305.77it/s]
In [54]:
pattern_table_all = create_pattern_table(patterns_all_clustered,
                                         logo_len=70, 
                                         seqlogo_kwargs=dict(width=420),
                                         n_jobs=20,
                                         footprint_width=120,
                                         footprint_kwargs=dict(figsize=(3,1.5)))
100%|██████████| 122/122 [00:21<00:00,  3.15it/s]
In [55]:
from basepair.modisco.table import write_modisco_table

def get_first_columns(df, cols):
    remaining_columns = [c for c in df.columns if c not in cols]
    return df[cols + remaining_columns]

(output_dir / 'motif_clustering').mkdir(exist_ok=True)
In [56]:
background_motifs = ['Essrb', 'Klf4', 'Nanog','Oct4','Oct4-Sox2', 'Sox2']

first_columns = ['pattern', 'cluster', 'n seqlets', 'logo_imp', 'logo_seq'] + [task+'/f' for task in tasks] + [t + '/d_p' for t in tasks] # + [m+'/odds' for m in background_motifs]

remove = [task + '/f' for task in tasks] + ['logo_imp', 'logo_seq']

pd.DataFrame -> .html, .csv

In [57]:
pattern_table_nte_seq['i'] = np.arange(len(pattern_table_nte_seq), dtype=int)
pattern_table_nte_seq = get_first_columns(pattern_table_nte_seq, ['i'] + first_columns)
write_modisco_table(pattern_table_nte_seq, output_dir / 'motif_clustering', report_url='../results.html', prefix='patterns_short', exclude_when_writing=remove)
In [58]:
pattern_table_te_seq['i'] = np.arange(len(pattern_table_te_seq), dtype=int)
pattern_table_te_seq = get_first_columns(pattern_table_te_seq, ['i'] + first_columns)
write_modisco_table(pattern_table_te_seq, output_dir / 'motif_clustering', report_url='../results.html', prefix='patterns_long', exclude_when_writing=remove)
In [59]:
pattern_table_all['i'] = np.arange(len(pattern_table_all), dtype=int)
pattern_table_all = get_first_columns(pattern_table_all, ['i'] + first_columns)
write_modisco_table(pattern_table_all, output_dir, report_url='results.html', prefix='pattern_table', exclude_when_writing=remove)