url_dir = "http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/valid/new-hparams/plots/"
modisco_dir = "/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/valid/new-hparams"
# Parameters
url_dir = "http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/plots"
modisco_dir = "."
from basepair.modisco.results import ModiscoResult
from basepair.config import get_data_dir
from basepair.utils import read_json
from basepair.plot.vdom import vdom_modisco
from kipoi.readers import HDF5Reader
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from plotnine import *
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`. from ._conv import register_converters as _register_converters Using TensorFlow backend. /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead. from pandas.core import datetools
mr = ModiscoResult(f"{modisco_dir}/modisco.h5")
mr.open()
# load the data
modisco_kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
d = HDF5Reader(modisco_kwargs['imp_scores'])
d.open()
strand_dist_file = f"{modisco_dir}/strand_distances.h5"
if modisco_kwargs.get("ignore_strand_dist", False) and os.path.exists(strand_dist_file):
included_samples = HDF5Reader.load(strand_dist_file)['included_samples']
else:
included_samples = np.ones(d.f['inputs'].shape[:1], dtype=bool)
if modisco_kwargs.get("filter_npy", None) is not None:
included_samples = np.load(modisco_kwargs['filter_npy']) * included_samples
id_hash = pd.DataFrame({"peak_id": d.f['/metadata/interval_from_task'][:][included_samples],
"example_idx": np.arange(d.f['/metadata/interval_from_task'][included_samples].shape[0])})
tasks = list(d.f["targets"]["profile"].keys())
# get all seqlet instances
dfp = mr.seqlet_df_instances().rename(columns=dict(seqname="example_idx"))
dfp = pd.merge(dfp, id_hash, on="example_idx")
TF-MoDISco is using the TensorFlow backend.
# row = example_idx
total_counts = pd.DataFrame({task: d.f[f"/targets/profile/{task}"][:][included_samples].sum(axis=-1).sum(axis=-1)
for task in tasks
})
len(mr.patterns())
142
# total number of seqlets
len(dfp)
84425
# Number of metaclusters
len(mr.metaclusters())
15
mc_stat = mr.metacluster_stats()
ggplot(aes(x="pattern", y='n'), mc_stat) + geom_bar(stat='identity') + \
facet_wrap("~metacluster", ncol=4, labeller='label_both') + \
ylab("Number of seqlets") + theme_classic()
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/utils.py:281: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. ndistinct = ids.apply(len_unique, axis=0).as_matrix() /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/generic.py:4388: FutureWarning: Attribute 'is_copy' is deprecated and will be removed in a future version. object.__getattribute__(self, name) /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/generic.py:4389: FutureWarning: Attribute 'is_copy' is deprecated and will be removed in a future version. return object.__setattr__(self, name, value) /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/positions/position.py:188: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. intervals = data[xminmax].drop_duplicates().as_matrix().flatten()
<ggplot: (-9223363269748035384)>
ggplot(aes(x="pattern", y='n'), mc_stat) + geom_bar(stat='identity') + \
facet_wrap("~metacluster", ncol=4, labeller='label_both') + \
ylab("Number of seqlets") + theme_classic() + coord_cartesian(ylim=[0, 500])
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/utils.py:281: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. ndistinct = ids.apply(len_unique, axis=0).as_matrix() /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/generic.py:4388: FutureWarning: Attribute 'is_copy' is deprecated and will be removed in a future version. object.__getattribute__(self, name) /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/generic.py:4389: FutureWarning: Attribute 'is_copy' is deprecated and will be removed in a future version. return object.__setattr__(self, name, value) /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/positions/position.py:188: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. intervals = data[xminmax].drop_duplicates().as_matrix().flatten()
<ggplot: (-9223363269747513775)>
mcs_grouped = mc_stat.groupby("metacluster").n.agg(["count", "sum"]).reset_index()
fig, ax = plt.subplots(2, 1, sharex=False, figsize=(18,6),
gridspec_kw={'height_ratios': [2,1]})
mcs_grouped.plot("metacluster", "count",
label="# patterns per metacluster", style="o--",
ax=ax[0],
yticks=range(mcs_grouped['count'].max()+1),
xticks=range(38),
fontsize='large',
xlim=(-.5, len(mr.metaclusters()) - .5 ))
mcs_grouped.plot("metacluster", "sum",
label="# seqlets per metacluster",
style="o--", ax=ax[0], secondary_y=True)
ax[0].grid(linewidth=0.2)
mr.plot_metacluster_activity(ax[1], cbar=False)
ax[1].set_title("Importance score activity: Red = positive, Blue = negative");
vdom_modisco(mr, url_dir, total_counts, dfp, is_open=True, trim_frac=0.08, letter_width=0.15, height=0.5)







Pattern occurs in 8424 / 98428 regions (8.6%)







Pattern occurs in 2945 / 98428 regions (3.0%)







Pattern occurs in 1594 / 98428 regions (1.6%)







Pattern occurs in 996 / 98428 regions (1.0%)







Pattern occurs in 947 / 98428 regions (1.0%)







Pattern occurs in 691 / 98428 regions (0.7%)







Pattern occurs in 487 / 98428 regions (0.5%)







Pattern occurs in 448 / 98428 regions (0.5%)







Pattern occurs in 404 / 98428 regions (0.4%)







Pattern occurs in 346 / 98428 regions (0.4%)







Pattern occurs in 288 / 98428 regions (0.3%)







Pattern occurs in 228 / 98428 regions (0.2%)







Pattern occurs in 235 / 98428 regions (0.2%)







Pattern occurs in 222 / 98428 regions (0.2%)







Pattern occurs in 211 / 98428 regions (0.2%)







Pattern occurs in 224 / 98428 regions (0.2%)







Pattern occurs in 190 / 98428 regions (0.2%)







Pattern occurs in 192 / 98428 regions (0.2%)







Pattern occurs in 153 / 98428 regions (0.2%)







Pattern occurs in 185 / 98428 regions (0.2%)







Pattern occurs in 132 / 98428 regions (0.1%)







Pattern occurs in 125 / 98428 regions (0.1%)







Pattern occurs in 119 / 98428 regions (0.1%)







Pattern occurs in 107 / 98428 regions (0.1%)







Pattern occurs in 92 / 98428 regions (0.1%)







Pattern occurs in 145 / 98428 regions (0.1%)







Pattern occurs in 90 / 98428 regions (0.1%)







Pattern occurs in 93 / 98428 regions (0.1%)







Pattern occurs in 93 / 98428 regions (0.1%)







Pattern occurs in 172 / 98428 regions (0.2%)







Pattern occurs in 82 / 98428 regions (0.1%)







Pattern occurs in 87 / 98428 regions (0.1%)







Pattern occurs in 107 / 98428 regions (0.1%)







Pattern occurs in 77 / 98428 regions (0.1%)







Pattern occurs in 5545 / 98428 regions (5.6%)







Pattern occurs in 2337 / 98428 regions (2.4%)







Pattern occurs in 466 / 98428 regions (0.5%)







Pattern occurs in 396 / 98428 regions (0.4%)







Pattern occurs in 450 / 98428 regions (0.5%)







Pattern occurs in 295 / 98428 regions (0.3%)







Pattern occurs in 303 / 98428 regions (0.3%)







Pattern occurs in 301 / 98428 regions (0.3%)







Pattern occurs in 232 / 98428 regions (0.2%)







Pattern occurs in 180 / 98428 regions (0.2%)







Pattern occurs in 163 / 98428 regions (0.2%)







Pattern occurs in 175 / 98428 regions (0.2%)







Pattern occurs in 157 / 98428 regions (0.2%)







Pattern occurs in 111 / 98428 regions (0.1%)







Pattern occurs in 96 / 98428 regions (0.1%)







Pattern occurs in 78 / 98428 regions (0.1%)







Pattern occurs in 11571 / 98428 regions (11.8%)







Pattern occurs in 1848 / 98428 regions (1.9%)







Pattern occurs in 1511 / 98428 regions (1.5%)







Pattern occurs in 604 / 98428 regions (0.6%)







Pattern occurs in 523 / 98428 regions (0.5%)







Pattern occurs in 420 / 98428 regions (0.4%)







Pattern occurs in 444 / 98428 regions (0.5%)







Pattern occurs in 402 / 98428 regions (0.4%)







Pattern occurs in 389 / 98428 regions (0.4%)







Pattern occurs in 298 / 98428 regions (0.3%)







Pattern occurs in 310 / 98428 regions (0.3%)







Pattern occurs in 269 / 98428 regions (0.3%)







Pattern occurs in 268 / 98428 regions (0.3%)







Pattern occurs in 269 / 98428 regions (0.3%)







Pattern occurs in 252 / 98428 regions (0.3%)







Pattern occurs in 159 / 98428 regions (0.2%)







Pattern occurs in 160 / 98428 regions (0.2%)







Pattern occurs in 132 / 98428 regions (0.1%)







Pattern occurs in 124 / 98428 regions (0.1%)







Pattern occurs in 102 / 98428 regions (0.1%)







Pattern occurs in 94 / 98428 regions (0.1%)







Pattern occurs in 77 / 98428 regions (0.1%)







Pattern occurs in 108 / 98428 regions (0.1%)







Pattern occurs in 62 / 98428 regions (0.1%)







Pattern occurs in 4348 / 98428 regions (4.4%)







Pattern occurs in 1300 / 98428 regions (1.3%)







Pattern occurs in 1134 / 98428 regions (1.2%)







Pattern occurs in 799 / 98428 regions (0.8%)







Pattern occurs in 684 / 98428 regions (0.7%)







Pattern occurs in 389 / 98428 regions (0.4%)







Pattern occurs in 193 / 98428 regions (0.2%)







Pattern occurs in 192 / 98428 regions (0.2%)







Pattern occurs in 110 / 98428 regions (0.1%)







Pattern occurs in 77 / 98428 regions (0.1%)







Pattern occurs in 148 / 98428 regions (0.2%)







Pattern occurs in 104 / 98428 regions (0.1%)







Pattern occurs in 100 / 98428 regions (0.1%)







Pattern occurs in 2377 / 98428 regions (2.4%)







Pattern occurs in 1544 / 98428 regions (1.6%)







Pattern occurs in 1005 / 98428 regions (1.0%)







Pattern occurs in 344 / 98428 regions (0.3%)







Pattern occurs in 226 / 98428 regions (0.2%)







Pattern occurs in 194 / 98428 regions (0.2%)







Pattern occurs in 183 / 98428 regions (0.2%)







Pattern occurs in 177 / 98428 regions (0.2%)







Pattern occurs in 159 / 98428 regions (0.2%)







Pattern occurs in 118 / 98428 regions (0.1%)







Pattern occurs in 146 / 98428 regions (0.1%)







Pattern occurs in 96 / 98428 regions (0.1%)







Pattern occurs in 94 / 98428 regions (0.1%)







Pattern occurs in 94 / 98428 regions (0.1%)







Pattern occurs in 1465 / 98428 regions (1.5%)







Pattern occurs in 793 / 98428 regions (0.8%)







Pattern occurs in 653 / 98428 regions (0.7%)







Pattern occurs in 536 / 98428 regions (0.5%)







Pattern occurs in 214 / 98428 regions (0.2%)







Pattern occurs in 129 / 98428 regions (0.1%)







Pattern occurs in 135 / 98428 regions (0.1%)







Pattern occurs in 73 / 98428 regions (0.1%)







Pattern occurs in 120 / 98428 regions (0.1%)







Pattern occurs in 69 / 98428 regions (0.1%)







Pattern occurs in 1400 / 98428 regions (1.4%)







Pattern occurs in 729 / 98428 regions (0.7%)







Pattern occurs in 667 / 98428 regions (0.7%)







Pattern occurs in 421 / 98428 regions (0.4%)







Pattern occurs in 225 / 98428 regions (0.2%)







Pattern occurs in 224 / 98428 regions (0.2%)







Pattern occurs in 159 / 98428 regions (0.2%)







Pattern occurs in 114 / 98428 regions (0.1%)







Pattern occurs in 95 / 98428 regions (0.1%)







Pattern occurs in 63 / 98428 regions (0.1%)







Pattern occurs in 93 / 98428 regions (0.1%)







Pattern occurs in 661 / 98428 regions (0.7%)







Pattern occurs in 492 / 98428 regions (0.5%)







Pattern occurs in 231 / 98428 regions (0.2%)







Pattern occurs in 237 / 98428 regions (0.2%)







Pattern occurs in 139 / 98428 regions (0.1%)







Pattern occurs in 139 / 98428 regions (0.1%)







Pattern occurs in 119 / 98428 regions (0.1%)







Pattern occurs in 113 / 98428 regions (0.1%)







Pattern occurs in 73 / 98428 regions (0.1%)







Pattern occurs in 71 / 98428 regions (0.1%)







Pattern occurs in 92 / 98428 regions (0.1%)







Pattern occurs in 151 / 98428 regions (0.2%)







Pattern occurs in 97 / 98428 regions (0.1%)







Pattern occurs in 357 / 98428 regions (0.4%)







Pattern occurs in 261 / 98428 regions (0.3%)







Pattern occurs in 147 / 98428 regions (0.1%)







Pattern occurs in 122 / 98428 regions (0.1%)







Pattern occurs in 81 / 98428 regions (0.1%)







Pattern occurs in 81 / 98428 regions (0.1%)







Pattern occurs in 96 / 98428 regions (0.1%)
print("Metaclusters heatmap")
import seaborn as sns
activity_patterns = np.array(mr.f.f['metaclustering_results']['attribute_vectors'])[
np.array(
[x[0] for x in sorted(
enumerate(mr.f.f['metaclustering_results']['metacluster_indices']),
key=lambda x: x[1])])]
sns.heatmap(activity_patterns, center=0);
Metaclusters heatmap