%matplotlib inline
from uuid import uuid4
import basepair
from collections import OrderedDict
from basepair.math import mean
from basepair.stats import perc
from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_long
from basepair.imports import *
from basepair.utils import apply_parallel
from basepair.modisco.pattern_instances import load_instances
from basepair.modisco.gspan import (dfi2graph_data, run_gspan,
load_gspan_results, load_gspan_as_features)
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")
# setup file paths
gspan_dir = modisco_dir / 'interpretable-model/gSpan'
graph_file = gspan_dir / 'match!=low,imp=high,dist=10-50.data'
graph_file_num_cat = gspan_dir / 'match!=low,imp=high,dist=10-50.numcat.data'
gspan_output = gspan_dir / "gspan.output.txt"
# specify motifs to use in the analysis
motifs = OrderedDict([
("Oct4-Sox2", "m0_p0"),
("Oct4-Sox2-deg", "m6_p8"),
("Oct4", "m0_p18"),
("Sox2", "m0_p1"),
("Essrb", "m0_p2"),
("Nanog", "m0_p3"),
("Nanog-periodic", "m0_p9"),
("Klf4", "m2_p0"),
])
# Load instances
try:
dfi = load_instances(f"{modisco_dir}/instances.subset.parq", motifs, dedup=False)
except:
dfi = load_instances(f"{modisco_dir}/instances.parq", motifs, dedup=False)
dfi.set_index("example_idx", inplace=True)
# prepare dfi_subset
dfi_subset = dfi.query("match_weighted_cat!= 'low'").query("imp_weighted_cat == 'high'")
# Uncomment the following line to distinguish differnet categories
# dfi_subset['pattern_cat'] = pd.Categorical(dfi_subset['pattern_name'].astype(str) + '/match=' + dfi_subset.match_weighted_cat.astype(str) + "/imp=" + dfi_subset.imp_weighted_cat.astype(str))
dfi_subset['pattern_cat'] = dfi_subset['pattern_name']
dfi_subset = dfi_subset[['pattern_cat', 'pattern_center']].dropna()
dfi_subset.head()
dfi_subset.index.is_monotonic
# Write files
dfi2graph_data(dfi_subset.groupby(dfi_subset.index), graph_file, num_cat=False, n_jobs=20)
from gspan_mining import gSpan
# run gSpan
run_gspan(graph_file, gspan_output, min_support=50, min_num_vertices=2, where=True)
# load results
graphs = load_gspan_results(gspan_output)
dfg = pd.DataFrame({"n_edges": [len(g.edges) for g in graphs],
"n_nodes": [len(g.nodes) for g in graphs],
"support": [g.graph['support'] for g in graphs],
})
print("Number of graphs: ", len(dfg))
print("Most support: ", dfg.support.max())
print("Most nodes: ", dfg.n_nodes.max())
print("Most edges: ", dfg.n_edges.max())
from basepair.plot.config import paper_config
paper_config()
def plot_graph(g, ax=None, figsize=(3.5, 3.5)):
import networkx as nx
fig, ax = plt.subplots(1,1, figsize=figsize)
pos = nx.spring_layout(g, iterations=100, scale=0.5)
nx.draw(g, pos, node_color="white", node_shape="s", linewidths=40, node_size=100, ax=ax)
nx.draw_networkx_labels(g, pos, {k: v['name'].replace("/", "\n") for k,v in g.nodes.data()},
font_size=8,
font_weight='bold', ax=ax);
print(dfg.sort_values("n_nodes", ascending=False)[:15].to_string())
for i in dfg.sort_values("support", ascending=False).index.values[:20]:
g = graphs[i]
plot_graph(g, figsize=(3.5,3.5))
plt.margins(.2)
plt.title(f"Graph: {i}, Support {g.graph['support']}")
for i in dfg[dfg.n_nodes >= 3].sort_values("support", ascending=False).index.values[:10]:
g = graphs[i]
plot_graph(g, figsize=(4,4))
plt.margins(.3)
plt.title(f"Graph: {i}, Support {g.graph['support']}")
for i in dfg.sort_values("n_nodes", ascending=False).index.values[:10]:
g = graphs[i]
plot_graph(g, figsize=(8,6))
plt.margins(.3)
plt.title(f"Graph: {i}, Support {g.graph['support']}")
for i in dfg.sort_values("n_edges", ascending=False).index.values[:10]:
g = graphs[i]
plot_graph(g, figsize=(6,6))
plt.margins(.3)
plt.title(f"Graph: {i}, Support {g.graph['support']}")
dfg = load_gspan_as_features(gspan_output, str(graph_file) + ".idx")
plt.figure(figsize=(5,2))
plt.hist(dfg.sum(0), 50);
plt.xlabel("Number of features per region");
plt.figure(figsize=(5,2))
n = dfg.sum(0)
plt.hist(n[n<200], 50);
plt.xlabel("Number of features per region");
other keywords: graph motif detection https://en.wikipedia.org/wiki/Network_motif
Grasping frequent subgraph mining for bioinformatics applications - https://biodatamining.biomedcentral.com/articles/10.1186/s13040-018-0181-9
### Prepare the output for sgmine
# Write files
dfi_subset['pattern_cat'] = pd.Categorical(dfi_subset['pattern_cat'])
pattern_cat_categories = dfi_subset['pattern_cat'].cat.categories
dfi_subset['pattern_cat'] = dfi_subset['pattern_cat'].cat.codes # use categorical variables
dfi2graph_data(dfi_subset.groupby(dfi_subset.index), graph_file_num_cat, num_cat=True)
# Write also the output classes
with open(str(graph_file_num_cat) + ".classes", "w") as f:
f.writelines(['1\n' for i in range(dfi_subset.index.unique().shape[0])])
# `sgmine