from basepair.config import get_data_dir
ddir = get_data_dir()
modisco_subdir = "modisco/all/deeplift/profile/"
model_dir = f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/"
modisco_dir = os.path.join(model_dir, modisco_subdir)
output_dir = modisco_dir
print(plt.style.available)
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc
from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable, vdom_footprint
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_short
from basepair.imports import *
model_dir = Path(model_dir)
modisco_dir = Path(modisco_dir)
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
patterns = [mr.get_pattern(p) for p in mr.patterns()]
# patterns = [mr.get_pattern(p) for p in mr.patterns()]
pattern_table = pd.read_csv(output_dir / "pattern_table.csv")
tasks = [x.split("/")[0] for x in mr.tasks()]
footprints = read_pkl(output_dir / 'footprints.pkl')
# Add profile to patterns
patterns = [p.add_profile(footprints[p.name]) for p in patterns]
# Add features
patterns = [p.add_attr('features', OrderedDict(pattern_table[pattern_table.pattern == shorten_pattern(p.name)].iloc[0])) for p in patterns]
# check that the pattern names match
assert patterns[4].attrs['features']['pattern'] == shorten_pattern(patterns[4].name)
patterns[:2]
p = patterns[0]
df = patterns_to_df(patterns, properties=['short_name', 'seq_info_content'])
df.head(2)
# example borderline motif (ic == 21.498123)
patterns[28].plot(kind='seq');
# one can also plot the profile of the Pattern
patterns[0].plot(kind='profile', rotate_y=0);
TE_cutoff = 34
df.seq_info_content.plot.hist(bins=50)
plt.xlabel("Sequence information content");
plt.axvline(TE_cutoff, color='orange', linewidth=2);
patterns_te = [p for p in patterns if p.seq_info_content > TE_cuttof]
patterns_nte = [p for p in patterns if p.seq_info_content <= TE_cuttof]
assert len(patterns_te) + len(patterns_nte) == len(patterns)
sizes = np.array([p.seq_info_content for p in patterns_te])
p = patterns_te[0]
for p in patterns_nte:
p.plot('seq_ic');
plt.title(f"{p.name} {p.seq_info_content:.2f}")
for p in patterns_nte:
print(f">{p.name}")
print(p.get_consensus())
for p in patterns_te:
print(f">{p.name}")
print(p.get_consensus())
for i in np.argsort([p.seq_info_content for p in patterns_te]):
p = patterns_te[i]
p.plot('seq_ic');
plt.title(f"{p.name} {p.seq_info_content:.2f}")
ns = pattern_table['n seqlets']
np.log10(ns).plot.hist(30)
plt.xlabel("log10(n seqlets)");
ns.quantile(0.1)
print("# long motifs: ", len(patterns_te))
print("# short motifs:", len(patterns_nte))
# Whole pipeline from this notebook
def cluster_patterns(patterns, n_clusters=9, cluster_track='seq_ic'):
"""Cluster patterns
"""
sim = similarity_matrix(patterns, track=cluster_track)
# cluster
lm_nte_seq = linkage(1-sim_nte_seq[iu1], 'ward', optimal_ordering=True)
cluster = cut_tree(lm_nte_seq, n_clusters=n_clusters)[:,0]
cluster_order = np.argsort(leaves_list(lm_nte_seq))
pattern_table_nte_seq = create_pattern_table(patterns_nte, cluster_order, cluster,
align_track='contrib/mean',
logo_len=70,
seqlogo_kwargs=dict(width=320),
footprint_width=320,
footprint_kwargs=dict())
return sim, lm_nte_seq, cluster, cluster_order, pattern_table_nte_seq
from basepair.exp.chipnexus.motif_clustering import similarity_matrix
sim_nte_seq = similarity_matrix(patterns_nte, track='seq_ic')
sim_te_seq = similarity_matrix(patterns_te, track='seq_ic')
%tqdm_restart
sim_all_seq = similarity_matrix(patterns, track='seq_ic')
iu1 = np.triu_indices(len(sim_nte_seq), 1)
plt.hist(sim_nte_seq[iu1], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (non-TE)");
iu2 = np.triu_indices(len(sim_te_seq), 1)
plt.hist(sim_te_seq[iu2], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (TE)");
iu_all = np.triu_indices(len(sim_all_seq), 1)
plt.hist(sim_all_seq[iu2], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (all)");
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
from basepair.modisco.motif_clustering import to_colors
# Seq IC-based clustering
iu_nte = np.triu_indices(len(sim_nte_seq), 1)
lm_nte_seq = linkage(1-sim_nte_seq[iu_nte], 'ward', optimal_ordering=True)
clusters_nte_seq = cut_tree(lm_nte_seq, n_clusters=9)[:,0]
cm_nte_seq = sns.clustermap(pd.DataFrame(sim_nte_seq),
row_linkage=lm_nte_seq,
col_linkage=lm_nte_seq,
col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_seq)))),
cmap='Blues'
).fig.suptitle('Seq-ic sim');
iu_te = np.triu_indices(len(sim_te_seq), 1)
lm_te_seq = linkage(1-sim_te_seq[iu_te], 'ward', optimal_ordering=True)
clusters_te_seq = cut_tree(lm_te_seq, n_clusters=9)[:,0]
cm_te = sns.clustermap(sim_te_seq,
row_linkage=lm_te_seq,
col_linkage=lm_te_seq,
cmap='Blues'
).fig.suptitle('Seq IC sim');
iu_all = np.triu_indices(len(sim_all_seq), 1)
lm_all_seq = linkage(1-sim_all_seq[iu_all], 'ward', optimal_ordering=True)
clusters_all_seq = cut_tree(lm_all_seq, n_clusters=9)[:,0]
cm_all = sns.clustermap(sim_all_seq,
row_linkage=lm_all_seq,
col_linkage=lm_all_seq,
cmap='Blues'
).fig.suptitle('Seq IC sim');
# sim_nte = similarity_matrix(patterns_nte, track='profile/mean')
# Contrib-score based clustering
# lm_nte_contrib = linkage(1-sim_nte[iu1], 'ward', optimal_ordering=True)
# clusters_nte_contrib = cut_tree(lm_nte_contrib, n_clusters=9)[:,0]
# cm_nte_contrib = sns.clustermap(pd.DataFrame(sim_nte),
# row_linkage=lm_nte_contrib,
# col_linkage=lm_nte_contrib,
# col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_contrib)))),
# cmap='Blues',
# )
from basepair.modisco.motif_clustering import create_pattern_table, align_clustered_patterns
cluster_order = np.argsort(leaves_list(lm_nte_seq))
cluster = clusters_nte_seq
patterns_nte_clustered = align_clustered_patterns(patterns_nte, cluster_order, cluster,
align_track='seq_ic',
# don't shit the major patterns
# by more than 15 when aligning
max_shift=15)
# add the major motif group
patterns_nte_clustered = [x.add_attr("pattern_group", 'nte') for x in patterns_nte_clustered]
cluster_order = np.argsort(leaves_list(lm_te_seq))
cluster = clusters_te_seq
patterns_te_clustered = align_clustered_patterns(patterns_te, cluster_order, cluster,
align_track='seq_ic',
# don't shit the major patterns
# by more than 15 when aligning
max_shift=15)
patterns_te_clustered = [x.add_attr("pattern_group", 'te') for x in patterns_te_clustered]
cluster_order = np.argsort(leaves_list(lm_all_seq))
cluster = clusters_all_seq
patterns_all_clustered = align_clustered_patterns(patterns, cluster_order, cluster,
align_track='seq_ic',
# don't shit the major patterns
# by more than 15 when aligning
max_shift=15)
# patterns_te_clustered = [x.add_attr("pattern_group", 'te') for x in patterns_te_clustered]
write_pkl(patterns_nte_clustered + patterns_te_clustered, output_dir / 'patterns.pkl')
patterns = read_pkl(modisco_dir / 'patterns.pkl')
# split the patterns back into the groups
patterns_nte_clustered = [x for x in patterns if x.attrs['pattern_group'] == 'nte']
patterns_te_clustered = [x for x in patterns if x.attrs['pattern_group'] == 'te']
pattern_table_nte_seq = create_pattern_table(patterns_nte_clustered,
logo_len=50,
seqlogo_kwargs=dict(width=420),
n_jobs=20,
footprint_width=120,
footprint_kwargs=dict(figsize=(3,1.5)))
pattern_table_te_seq = create_pattern_table(patterns_te_clustered,
logo_len=70,
seqlogo_kwargs=dict(width=420),
n_jobs=20,
footprint_width=120,
footprint_kwargs=dict(figsize=(3,1.5)))
pattern_table_all = create_pattern_table(patterns,
logo_len=70,
seqlogo_kwargs=dict(width=420),
n_jobs=20,
footprint_width=120,
footprint_kwargs=dict(figsize=(3,1.5)))
background_motifs = ['Essrb', 'Klf4', 'Nanog','Oct4','Oct4-Sox2', 'Sox2']
first_columns = ['pattern', 'cluster', 'n seqlets', 'logo_imp', 'logo_seq'] + [task+'/f' for task in tasks] + [t + '/d_p' for t in tasks] # + [m+'/odds' for m in background_motifs]
remaining_columns = [c for c in pattern_table_nte_seq.columns if c not in first_columns]
colorder = first_columns + remaining_columns
from basepair.modisco.table import write_modisco_table
remove = [task + '/f' for task in tasks] + ['logo_imp', 'logo_seq']
pattern_table_nte_seq['i'] = pattern_table_nte_seq.index.astype(int)
pattern_table_nte_seq = pattern_table_nte_seq[['i'] + colorder]
write_modisco_table(pattern_table_nte_seq, output_dir / 'motif_clustering', report_url='../results.html', prefix='patterns_short', exclude_when_writing=remove)
pattern_table_te_seq['i'] = pattern_table_te_seq.index.astype(int)
pattern_table_te_seq = pattern_table_te_seq[['i'] + colorder]
write_modisco_table(pattern_table_te_seq, output_dir / 'motif_clustering', report_url='../results.html', prefix='patterns_long', exclude_when_writing=remove)
pattern_table_all['i'] = pattern_table_all.index.astype(int)
pattern_table_all = pattern_table_all[['i'] + colorder]
write_modisco_table(pattern_table_all, output_dir, report_url='../results.html', prefix='patterns_all.datatable', exclude_when_writing=remove)