from uuid import uuid4
from basepair.math import mean
from basepair.stats import perc
from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern
from basepair.imports import *
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
patterns = [mr.get_pattern(p) for p in mr.patterns()]
tasks = [x.split("/")[0] for x in mr.tasks()]
patterns[:2]
patterns_to_df (convenient conversion ot list[Pattern] to pd.DataFrame (including plots, see section 3)patterns_to_df?patterns_tepatterns_ntep = patterns[0]
df = patterns_to_df(patterns, properties=['short_name', 'seq_info_content'])
df.head(2)
# example borderline motif (ic == 21.498123)
patterns[28].plot(kind='seq');
df.seq_info_content.plot.hist(bins=50)
plt.xlabel("Sequence information content");
# manually determine the cutoff
TE_cuttof = 30
patterns_te = [p for p in patterns if p.seq_info_content > TE_cuttof]
patterns_nte = [p for p in patterns if p.seq_info_content <= TE_cuttof]
assert len(patterns_te) + len(patterns_nte) == len(patterns)
print("# long motifs:", len(patterns_te))
print("# short motifs: ", len(patterns_nte))
dist(pattern, metric=..., scores=['seq']) to compute the distance to another patternfrom basepair.exp.chipnexus.motif_clustering import similarity_matrix
sim_nte = similarity_matrix(patterns_nte, track='contrib/mean')
sim_nte_seq = similarity_matrix(patterns_nte, track='seq_ic')
sim_te = similarity_matrix(patterns_te, track='contrib/mean')
sim_te_seq = similarity_matrix(patterns_te, track='seq_ic')
iu1 = np.triu_indices(len(sim_nte), 1)
plt.hist(sim_nte[iu1], bins=100);
plt.title("Similarity (contrib)")
plt.xlabel("Similarity between two motifs (non-TE)");
iu1 = np.triu_indices(len(sim_nte_seq), 1)
plt.hist(sim_nte_seq[iu1], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (non-TE)");
iu2 = np.triu_indices(len(sim_te), 1)
plt.hist(sim_te[iu2], bins=100);
plt.title("Similarity (contrib)")
plt.xlabel("Similarity between two motifs (TE)");
iu2 = np.triu_indices(len(sim_te), 1)
plt.hist(sim_te_seq[iu2], bins=100);
plt.title("Similarity (seq_ic)")
plt.xlabel("Similarity between two motifs (TE)");
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
from basepair.modisco.motif_clustering import to_colors
# Contrib-score based clustering
lm_nte_contrib = linkage(1-sim_nte[iu1], 'ward', optimal_ordering=True)
clusters_nte_contrib = cut_tree(lm_nte_contrib, n_clusters=9)[:,0]
cm_nte_contrib = sns.clustermap(pd.DataFrame(sim_nte),
row_linkage=lm_nte_contrib,
col_linkage=lm_nte_contrib,
col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_contrib)))),
cmap='Blues',
)
cm_nte_contrib.fig.suptitle('Contrib sim');
sns.clustermap(pd.DataFrame(sim_nte_seq),
row_linkage=lm_nte_contrib,
col_linkage=lm_nte_contrib,
col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_contrib)))),
cmap='Blues',
).fig.suptitle('Seq-ic sim');
# Seq IC-based clustering
lm_nte_seq = linkage(1-sim_nte_seq[iu1], 'ward', optimal_ordering=True)
clusters_nte_seq = cut_tree(lm_nte_seq, n_clusters=9)[:,0]
cm_nte_seq = sns.clustermap(pd.DataFrame(sim_nte_seq),
row_linkage=lm_nte_seq,
col_linkage=lm_nte_seq,
col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_seq)))),
cmap='Blues'
).fig.suptitle('Seq-ic sim');
cm_nte_seq = sns.clustermap(pd.DataFrame(sim_nte),
row_linkage=lm_nte_seq,
col_linkage=lm_nte_seq,
col_colors=to_colors(pd.DataFrame(dict(cluster=pd.Categorical(clusters_nte_seq)))),
cmap='Blues'
).fig.suptitle('Contrib sim');
lm_te_contrib = linkage(1-sim_te[iu2], 'ward', optimal_ordering=True)
cm_te = sns.clustermap(sim_te,
row_linkage=lm_te_contrib,
col_linkage=lm_te_contrib,
cmap='Blues'
)
cm_te.fig.suptitle('Contrib sim');
lm_te_seq = linkage(1-sim_te_seq[iu2], 'ward', optimal_ordering=True)
cm_te = sns.clustermap(sim_te_seq,
row_linkage=lm_te_seq,
col_linkage=lm_te_seq,
cmap='Blues'
).fig.suptitle('Seq IC sim');
import holoviews as hv
import hvplot.pandas
hv.extension('bokeh', 'matplotlib')
def to_img(fig):
fig.savefig("/tmp/fig.png", dpi=150)
f = hv.RGB.load_image("/tmp/fig.png",bare=True)
return f
d_nte = {p.short_name: to_img(p.trim_seq_ic(0.08).plot("seq")) for p in tqdm(patterns_nte)}
d_te = {p.short_name: to_img(p.trim_seq_ic(0.08).plot("seq")) for p in tqdm(patterns_te)}
sort_idx = leaves_list(lm_nte_contrib)
sim_nte_df = pd.DataFrame([dict(a=patterns_nte[i].short_name, b=patterns_nte[j].short_name, v=sim_nte[i,j])
for i in sort_idx for j in sort_idx])
sort_idx = leaves_list(lm_nte_seq)
sim_nte_df_seq = pd.DataFrame([dict(a=patterns_nte[i].short_name, b=patterns_nte[j].short_name, v=sim_nte_seq[i,j])
for i in sort_idx for j in sort_idx])
sort_idx = leaves_list(lm_te_contrib)
sim_te_df = pd.DataFrame([dict(a=patterns_te[i].short_name, b=patterns_te[j].short_name, v=sim_te[i,j])
for i in sort_idx for j in sort_idx])
%opts HeatMap [xrotation=90] (cmap='Blues')
heatmap = sim_nte_df.hvplot.heatmap(x='a', y='b', C='v', width=1000, height=1000)
posxy = hv.streams.Tap(source=heatmap, x="m0_p0", y='m0_p0')
# Define function to compute histogram based on tap location
def tap_histogram(x, y):
return d_nte[x].options(width=800, height=500)
obj = heatmap + hv.DynamicMap(tap_histogram, kdims=[], streams=[posxy])
obj
%opts HeatMap [xrotation=90] (cmap='Blues')
heatmap = sim_nte_df_seq.hvplot.heatmap(x='a', y='b', C='v', width=1000, height=1000)
posxy = hv.streams.Tap(source=heatmap, x="m0_p0", y='m0_p0')
# Define function to compute histogram based on tap location
def tap_histogram(x, y):
return d_nte[x].options(width=800, height=500)
obj = heatmap + hv.DynamicMap(tap_histogram, kdims=[], streams=[posxy])
obj
%opts HeatMap [xrotation=90] (cmap='Blues')
heatmap = sim_te_df.hvplot.heatmap(x='a', y='b', C='v', width=600, height=600)
heatmap
[x] Setup a table with three columns (pattern, logo_imp, logo_seq):
[x] allow patterns_to_df to also add logo_imp and logo_seq
# query
patterns[1].plot("seq");
# target
patterns[0].plot("seq");
# query
patterns[1].rc().plot("seq");
pa = patterns[1]
pb = patterns[0]
display(pb.resize(30).vdom_plot('seq'))
display(pa.align(pb).resize(30).vdom_plot('seq'))
display(pa.resize(30).vdom_plot('seq'))
display(pb.align(pa).resize(30).vdom_plot('seq'))
cluster and cluster_order# read the property table
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")
pattern_table = pd.read_csv(output_dir / "pattern_table.csv")
# read the footprints
footprints = read_pkl(output_dir / 'footprints.pkl')
# names in nte
pattern_names = np.array([shorten_pattern(p.name) for p in patterns_nte])
patterns_nte_dict = {shorten_pattern(p.name): p for p in patterns_nte} # organize as a dict
#assert len(pattern_names) == len(cluster_order)
from basepair.exp.chipnexus.motif_clustering import append_logo_cluster
# cluster using contrib scores
cluster_order = np.argsort(leaves_list(lm_nte_contrib))
cluster = clusters_nte_contrib
pattern_table_nte_contrib = append_logo_cluster(pattern_table, patterns_nte, cluster_order, cluster, width=320, logo_len=70)
cluster_order = np.argsort(leaves_list(lm_nte_seq))
cluster = clusters_nte_seq
pattern_table_nte_seq = append_logo_cluster(pattern_table, patterns_nte, cluster_order, cluster, width=320, logo_len=70)
cluster_order = np.argsort(leaves_list(lm_te_contrib))
cluster = np.arange(len(cluster_order))
pattern_table_te_contrib = append_logo_cluster(pattern_table, patterns_te, cluster_order, cluster, width=480, logo_len=70)
# add the 'directness score'
for task in tasks:
pattern_table_nte_contrib[task + "/d"] = 2*pattern_table_nte_contrib[task + " imp profile"] - pattern_table_nte_contrib[task + " imp profile"]
# add the normalized version
pattern_table_nte_contrib[task + "/d_p"] = perc(pattern_table_nte_contrib[task + "/d"])
pattern_table_nte_seq[task + "/d"] = 2*pattern_table_nte_seq[task + " imp profile"] - pattern_table_nte_seq[task + " imp profile"]
pattern_table_nte_seq[task + "/d_p"] = perc(pattern_table_nte_seq[task + "/d"])
pattern_table_te_contrib[task + "/d"] = 2*pattern_table_te_contrib[task + " imp profile"] - pattern_table_te_contrib[task + " imp profile"]
# add the normalized version
pattern_table_te_contrib[task + "/d_p"] = perc(pattern_table_te_contrib[task + "/d"])
!ls {output_dir}
!mkdir -p {output_dir}/motif_clustering
pattern_table_nte_contrib.reset_index().to_csv(output_dir / 'motif_clustering/nte_contrib.csv', index=False)
pattern_table_nte_seq.reset_index().to_csv(output_dir / 'motif_clustering/nte_seq.csv', index=False)
pattern_table_te_contrib.reset_index().to_csv(output_dir / 'motif_clustering/te_contrib.csv', index=False)
display_cols = ['pattern', 'cluster', 'n seqlets', 'logo_imp', 'logo_seq'] + [task+'/d_p' for task in tasks]
pattern_table_nte_contrib['n seqlets'] = pattern_table_nte_contrib['n seqlets'].astype(float)
pattern_table_nte_seq['n seqlets'] = pattern_table_nte_seq['n seqlets'].astype(float)
pattern_table_te_contrib['n seqlets'] = pattern_table_te_contrib['n seqlets'].astype(float)
dfv_contrib = pattern_table_nte_contrib.reset_index().sort_values('cluster_order', ascending=True)[display_cols]
dfv_seq = pattern_table_nte_seq.reset_index().sort_values('cluster_order', ascending=True)[display_cols]
dfv_te_contrib = pattern_table_te_contrib.reset_index().sort_values('cluster_order', ascending=True)[display_cols]
from basepair.exp.chipnexus.annotate_profile import read_annotated_csv
dfl = read_annotated_csv("../2018-10-16-ziga-all-profile.csv")
dfl.head()
from basepair.plot.vdom import vdom_footprint, footprint_df
dff = footprint_df(footprints, dfl, figsize=(3,1.5), width=120)
# add the footprints
dfv_contrib = pd.merge(dfv_contrib, dff, on='pattern', how='left')
dfv_seq = pd.merge(dfv_seq, dff, on='pattern', how='left')
dfv_te_contrib = pd.merge(dfv_te_contrib, dff, on='pattern', how='left')
colorder = ['pattern', 'cluster', 'n seqlets', 'logo_imp', 'logo_seq'] + tasks + [t + '/d_p' for t in tasks]
html_str_contrib = df2html(dfv_contrib[colorder], "table_contrib" + str(uuid4()), "")
with open('/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/pattern-table.smaller.cotrib-cluster.html', 'w') as fo:
fo.write(html_str_contrib)
# HTML(html_str_contrib)
html_str_seq = df2html(dfv_seq[colorder], "table_seq" + str(uuid4()), "")
with open('/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/pattern-table.smaller.seq-cluster.html', 'w') as fo:
fo.write(html_str_seq)
# html_str_seq
html_str_te_contrib = df2html(dfv_te_contrib[colorder], "table_seq" + str(uuid4()), "")
with open('/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/pattern-table.TE.contrib-cluster.html', 'w') as fo:
fo.write(html_str_te_contrib)
# html_str_seq