Goal¶

Run gspan

Run gSpan¶

Nodes¶

match=medium,high, imp=high

Edges¶

distance = [10, 50] bp

%matplotlib inline
from uuid import uuid4
import basepair
from collections import OrderedDict
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable

from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_long
from basepair.imports import *
from basepair.utils import apply_parallel
from basepair.modisco.pattern_instances import load_instances

from basepair.modisco.gspan import (dfi2graph_data, run_gspan, 
                                    load_gspan_results, load_gspan_as_features)

model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")

# setup file paths
gspan_dir = modisco_dir / 'interpretable-model/gSpan'
graph_file = gspan_dir / 'match!=low,imp=high,dist=10-50.data'
graph_file_num_cat = gspan_dir / 'match!=low,imp=high,dist=10-50.numcat.data'
gspan_output = gspan_dir / "gspan.output.txt"

Load and filter instances¶

# specify motifs to use in the analysis
motifs = OrderedDict([
    ("Oct4-Sox2", "m0_p0"),
    ("Oct4-Sox2-deg", "m6_p8"),
    ("Oct4", "m0_p18"),
    ("Sox2", "m0_p1"),
    ("Essrb", "m0_p2"),
    ("Nanog", "m0_p3"),
    ("Nanog-periodic", "m0_p9"),
    ("Klf4", "m2_p0"),
])

# Load instances
try:
    dfi = load_instances(f"{modisco_dir}/instances.subset.parq", motifs, dedup=False)
except:
    dfi = load_instances(f"{modisco_dir}/instances.parq", motifs, dedup=False)
    dfi.set_index("example_idx", inplace=True)

# prepare dfi_subset
dfi_subset = dfi.query("match_weighted_cat!= 'low'").query("imp_weighted_cat == 'high'")
# Uncomment the following line to distinguish differnet categories
# dfi_subset['pattern_cat'] = pd.Categorical(dfi_subset['pattern_name'].astype(str) + '/match=' + dfi_subset.match_weighted_cat.astype(str) + "/imp=" + dfi_subset.imp_weighted_cat.astype(str))
dfi_subset['pattern_cat'] = dfi_subset['pattern_name']
dfi_subset = dfi_subset[['pattern_cat', 'pattern_center']].dropna()

dfi_subset.head()

Run gSpan¶

dfi_subset.index.is_monotonic

False

# Write files
dfi2graph_data(dfi_subset.groupby(dfi_subset.index), graph_file, num_cat=False, n_jobs=20)

100%|██████████| 33755/33755 [04:05<00:00, 137.51it/s]

from gspan_mining import gSpan

# run gSpan
run_gspan(graph_file, gspan_output, min_support=50, min_num_vertices=2, where=True)

# load results
graphs = load_gspan_results(gspan_output)

dfg = pd.DataFrame({"n_edges": [len(g.edges) for g in graphs],
                    "n_nodes": [len(g.nodes) for g in graphs],
                    "support": [g.graph['support'] for g in graphs],
                    })
print("Number of graphs: ", len(dfg))
print("Most support: ", dfg.support.max())
print("Most nodes: ", dfg.n_nodes.max())
print("Most edges: ", dfg.n_edges.max())

Number of graphs:  100
Most support:  1788
Most nodes:  4
Most edges:  4

Visualize the results¶

from basepair.plot.config import paper_config
paper_config()

def plot_graph(g, ax=None, figsize=(3.5, 3.5)):
    import networkx as nx
    fig, ax = plt.subplots(1,1, figsize=figsize)
    pos = nx.spring_layout(g, iterations=100, scale=0.5)
    nx.draw(g, pos, node_color="white", node_shape="s", linewidths=40, node_size=100, ax=ax)
    nx.draw_networkx_labels(g, pos, {k: v['name'].replace("/", "\n") for k,v in g.nodes.data()}, 
                            font_size=8, 
                            font_weight='bold', ax=ax);

print(dfg.sort_values("n_nodes", ascending=False)[:15].to_string())

    n_edges  n_nodes  support
78        3        4       52
84        4        4       87
48        3        4       52
42        3        4       51
40        3        4       58
39        4        4       50
51        3        4       50
90        3        4       52
87        3        4       50
86        3        4       50
85        3        4       89
60        3        4      108
25        3        4       50
46        3        4       52
22        3        4      127

Some interesting graphs¶

20 most frequent¶

for i in dfg.sort_values("support", ascending=False).index.values[:20]:
    g = graphs[i]
    plot_graph(g, figsize=(3.5,3.5))
    plt.margins(.2)
    plt.title(f"Graph: {i}, Support {g.graph['support']}")

5 most frequent with >= 3 nodes¶

for i in dfg[dfg.n_nodes >= 3].sort_values("support", ascending=False).index.values[:10]:
    g = graphs[i]
    plot_graph(g, figsize=(4,4))
    plt.margins(.3)
    plt.title(f"Graph: {i}, Support {g.graph['support']}")

Most nodes¶

for i in dfg.sort_values("n_nodes", ascending=False).index.values[:10]:
    g = graphs[i]
    plot_graph(g, figsize=(8,6))
    plt.margins(.3)
    plt.title(f"Graph: {i}, Support {g.graph['support']}")

Most edges¶

for i in dfg.sort_values("n_edges", ascending=False).index.values[:10]:
    g = graphs[i]
    plot_graph(g, figsize=(6,6))
    plt.margins(.3)
    plt.title(f"Graph: {i}, Support {g.graph['support']}")

Graph results to features¶

dfg = load_gspan_as_features(gspan_output, str(graph_file) + ".idx")

plt.figure(figsize=(5,2))
plt.hist(dfg.sum(0), 50);
plt.xlabel("Number of features per region");
plt.figure(figsize=(5,2))
n = dfg.sum(0)
plt.hist(n[n<200], 50);
plt.xlabel("Number of features per region");

TODO¶

[x] create a design matrix based on gspan output
- binary indicator variable whether the example is in the category
[x] investigate the results
[ ] use the following statistical test for it:
- https://github.com/BorgwardtLab/significant-subgraph-mining, https://arxiv.org/abs/1407.0316 "significant pattern mining"
- Thesis on that topic: https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/273086/1/thesis.pdf
- [ ] fix the third columns, use indicator variables

other keywords: graph motif detection https://en.wikipedia.org/wiki/Network_motif

Grasping frequent subgraph mining for bioinformatics applications - https://biodatamining.biomedcentral.com/articles/10.1186/s13040-018-0181-9

### Prepare the output for sgmine

# Write files
dfi_subset['pattern_cat'] = pd.Categorical(dfi_subset['pattern_cat'])
pattern_cat_categories = dfi_subset['pattern_cat'].cat.categories
dfi_subset['pattern_cat'] = dfi_subset['pattern_cat'].cat.codes  # use categorical variables

dfi2graph_data(dfi_subset.groupby(dfi_subset.index), graph_file_num_cat, num_cat=True)

# Write also the output classes
with open(str(graph_file_num_cat) + ".classes", "w") as f:
    f.writelines(['1\n' for i in range(dfi_subset.index.unique().shape[0])])

# `sgmine

Goal¶

Run gSpan¶

Nodes¶

Edges¶

Load and filter instances¶

Run gSpan¶

Visualize the results¶

Some interesting graphs¶

20 most frequent¶

5 most frequent with >= 3 nodes¶

Most nodes¶

Most edges¶

Graph results to features¶

TODO¶

Other ideas for notes / edges¶

Node categories¶

Edge categories¶

	pattern_cat	pattern_center
example_idx
1	Oct4-Sox2	444
1	Oct4-Sox2	465
1	Oct4-Sox2	527
2	Oct4-Sox2	505
3	Oct4-Sox2	507