Goal

  • Run gspan

Run gSpan

Nodes

  • match=medium,high, imp=high

Edges

  • distance = [10, 50] bp
In [203]:
%matplotlib inline
from uuid import uuid4
import basepair
from collections import OrderedDict
from basepair.math import mean
from basepair.stats import perc

from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable

from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_long
from basepair.imports import *
from basepair.utils import apply_parallel
from basepair.modisco.pattern_instances import load_instances

from basepair.modisco.gspan import (dfi2graph_data, run_gspan, 
                                    load_gspan_results, load_gspan_as_features)

model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")

# setup file paths
gspan_dir = modisco_dir / 'interpretable-model/gSpan'
graph_file = gspan_dir / 'match!=low,imp=high,dist=10-50.data'
graph_file_num_cat = gspan_dir / 'match!=low,imp=high,dist=10-50.numcat.data'
gspan_output = gspan_dir / "gspan.output.txt"

Load and filter instances

In [2]:
# specify motifs to use in the analysis
motifs = OrderedDict([
    ("Oct4-Sox2", "m0_p0"),
    ("Oct4-Sox2-deg", "m6_p8"),
    ("Oct4", "m0_p18"),
    ("Sox2", "m0_p1"),
    ("Essrb", "m0_p2"),
    ("Nanog", "m0_p3"),
    ("Nanog-periodic", "m0_p9"),
    ("Klf4", "m2_p0"),
])
In [3]:
# Load instances
try:
    dfi = load_instances(f"{modisco_dir}/instances.subset.parq", motifs, dedup=False)
except:
    dfi = load_instances(f"{modisco_dir}/instances.parq", motifs, dedup=False)
    dfi.set_index("example_idx", inplace=True)
In [11]:
# prepare dfi_subset
dfi_subset = dfi.query("match_weighted_cat!= 'low'").query("imp_weighted_cat == 'high'")
# Uncomment the following line to distinguish differnet categories
# dfi_subset['pattern_cat'] = pd.Categorical(dfi_subset['pattern_name'].astype(str) + '/match=' + dfi_subset.match_weighted_cat.astype(str) + "/imp=" + dfi_subset.imp_weighted_cat.astype(str))
dfi_subset['pattern_cat'] = dfi_subset['pattern_name']
dfi_subset = dfi_subset[['pattern_cat', 'pattern_center']].dropna()
In [5]:
dfi_subset.head()
Out[5]:
pattern_cat pattern_center
example_idx
1 Oct4-Sox2 444
1 Oct4-Sox2 465
1 Oct4-Sox2 527
2 Oct4-Sox2 505
3 Oct4-Sox2 507

Run gSpan

In [85]:
dfi_subset.index.is_monotonic
Out[85]:
False
In [10]:
# Write files
dfi2graph_data(dfi_subset.groupby(dfi_subset.index), graph_file, num_cat=False, n_jobs=20)
100%|██████████| 33755/33755 [04:05<00:00, 137.51it/s]
In [169]:
from gspan_mining import gSpan
In [182]:
# run gSpan
run_gspan(graph_file, gspan_output, min_support=50, min_num_vertices=2, where=True)
In [183]:
# load results
graphs = load_gspan_results(gspan_output)

dfg = pd.DataFrame({"n_edges": [len(g.edges) for g in graphs],
                    "n_nodes": [len(g.nodes) for g in graphs],
                    "support": [g.graph['support'] for g in graphs],
                    })
print("Number of graphs: ", len(dfg))
print("Most support: ", dfg.support.max())
print("Most nodes: ", dfg.n_nodes.max())
print("Most edges: ", dfg.n_edges.max())
Number of graphs:  100
Most support:  1788
Most nodes:  4
Most edges:  4

Visualize the results

In [184]:
from basepair.plot.config import paper_config
paper_config()

def plot_graph(g, ax=None, figsize=(3.5, 3.5)):
    import networkx as nx
    fig, ax = plt.subplots(1,1, figsize=figsize)
    pos = nx.spring_layout(g, iterations=100, scale=0.5)
    nx.draw(g, pos, node_color="white", node_shape="s", linewidths=40, node_size=100, ax=ax)
    nx.draw_networkx_labels(g, pos, {k: v['name'].replace("/", "\n") for k,v in g.nodes.data()}, 
                            font_size=8, 
                            font_weight='bold', ax=ax);    
In [185]:
print(dfg.sort_values("n_nodes", ascending=False)[:15].to_string())
    n_edges  n_nodes  support
78        3        4       52
84        4        4       87
48        3        4       52
42        3        4       51
40        3        4       58
39        4        4       50
51        3        4       50
90        3        4       52
87        3        4       50
86        3        4       50
85        3        4       89
60        3        4      108
25        3        4       50
46        3        4       52
22        3        4      127

Some interesting graphs

20 most frequent

In [186]:
for i in dfg.sort_values("support", ascending=False).index.values[:20]:
    g = graphs[i]
    plot_graph(g, figsize=(3.5,3.5))
    plt.margins(.2)
    plt.title(f"Graph: {i}, Support {g.graph['support']}")

5 most frequent with >= 3 nodes

In [187]:
for i in dfg[dfg.n_nodes >= 3].sort_values("support", ascending=False).index.values[:10]:
    g = graphs[i]
    plot_graph(g, figsize=(4,4))
    plt.margins(.3)
    plt.title(f"Graph: {i}, Support {g.graph['support']}")

Most nodes

In [188]:
for i in dfg.sort_values("n_nodes", ascending=False).index.values[:10]:
    g = graphs[i]
    plot_graph(g, figsize=(8,6))
    plt.margins(.3)
    plt.title(f"Graph: {i}, Support {g.graph['support']}")

Most edges

In [189]:
for i in dfg.sort_values("n_edges", ascending=False).index.values[:10]:
    g = graphs[i]
    plot_graph(g, figsize=(6,6))
    plt.margins(.3)
    plt.title(f"Graph: {i}, Support {g.graph['support']}")

Graph results to features

In [196]:
dfg = load_gspan_as_features(gspan_output, str(graph_file) + ".idx")
In [202]:
plt.figure(figsize=(5,2))
plt.hist(dfg.sum(0), 50);
plt.xlabel("Number of features per region");
plt.figure(figsize=(5,2))
n = dfg.sum(0)
plt.hist(n[n<200], 50);
plt.xlabel("Number of features per region");

TODO

other keywords: graph motif detection https://en.wikipedia.org/wiki/Network_motif

Grasping frequent subgraph mining for bioinformatics applications - https://biodatamining.biomedcentral.com/articles/10.1186/s13040-018-0181-9

In [23]:
### Prepare the output for sgmine

# Write files
dfi_subset['pattern_cat'] = pd.Categorical(dfi_subset['pattern_cat'])
pattern_cat_categories = dfi_subset['pattern_cat'].cat.categories
dfi_subset['pattern_cat'] = dfi_subset['pattern_cat'].cat.codes  # use categorical variables

dfi2graph_data(dfi_subset.groupby(dfi_subset.index), graph_file_num_cat, num_cat=True)

# Write also the output classes
with open(str(graph_file_num_cat) + ".classes", "w") as f:
    f.writelines(['1\n' for i in range(dfi_subset.index.unique().shape[0])])
In [ ]:
# `sgmine

Other ideas for notes / edges

Node categories

  • Pattern X 3x3 option: match=high,cat=low
  • Pattern X Subset to only high importance: 1x3 option: match=high, match=low etc
  • Pattern X 1x2 option: eighter match!=low,imp=high and match=low,imp=high
  • Pattern X 1 option: vmatch!=low,imp=high

Edge categories

  • 3 cls: 5-15, 15-50, 50-150
  • 1 cls: 15-50 or 15-80