# Notebook-style setup: inline plotting, project-wide imports, and the paths
# for the ChIP-nexus Oct4/Sox2/Nanog/Klf4 model and its TF-MoDISco results.
%matplotlib inline
from uuid import uuid4
from collections import OrderedDict
from basepair.math import mean
from basepair.stats import perc
from IPython.display import display, HTML
from basepair.plot.vdom import df2html, df2html_old, render_datatable
from basepair.modisco.core import patterns_to_df
from basepair.modisco.utils import longer_pattern, shorten_pattern, extract_name_long
from basepair.imports import *
from basepair.utils import apply_parallel
import holoviews as hv
hv.extension('bokeh')
# Directory layout: trained model -> modisco run -> interpretable-model outputs.
# NOTE(review): `ddir` and `Path` are assumed to come from the
# `basepair.imports` star-import — confirm.
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
output_dir = modisco_dir / 'interpretable-model'
gspan_dir = output_dir / 'gSpan'
gspan_output = gspan_dir / "gspan.output.txt"
# Open the TF-MoDISco result file (holds the discovered motifs/patterns)
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
# Task names with any "/<suffix>" stripped
tasks = [x.split("/")[0] for x in mr.tasks()]
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale, preproc_df
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix
# specify motifs to use in the analysis
motifs = OrderedDict([
("Oct4-Sox2", "m0_p0"),
("Oct4-Sox2-deg", "m6_p8"),
("Oct4", "m0_p18"),
("Sox2", "m0_p1"),
("Essrb", "m0_p2"),
("Nanog", "m0_p3"),
("Nanog-periodic", "m0_p9"),
("Klf4", "m2_p0"),
])
# Load instances
try:
dfi = load_instances(f"{modisco_dir}/instances.subset.parq", motifs, dedup=False)
except:
dfi = load_instances(f"{modisco_dir}/instances.parq", motifs, dedup=False)
dfi.set_index("example_idx", inplace=True)
# save back to a file
# dfi.to_parquet(f"{modisco_dir}/instances.subset.parq")
!du -sh {modisco_dir}/instances.*parq
from basepair.imports import *
import pybedtools
from genomelake.extractors import BigwigExtractor
from basepair.extractors import StrandedBigWigExtractor, bw_extract
from kipoiseq.transforms import ResizeInterval
from basepair.modisco.table import ModiscoData
from pybedtools import BedTool
from basepair.cli.modisco import load_ranges
from basepair.plot.heatmaps import (heatmap_importance_profile, normalize, multiple_heatmap_stranded_profile,
heatmap_stranded_profile)
from basepair.plot.profiles import plot_stranded_profile
import hvplot.pandas
from basepair.plots import regression_eval
create_tf_session(0)
df = pd.read_csv(f"{ddir}/processed/chipnexus/external-data.tsv", sep='\t')
df = df.set_index('assay')
df
dfi.head()
Write the bed file
!mkdir -p {ddir}/processed/activity/data/
dfi.reset_index()[['example_chrom', 'example_start', 'example_end', 'example_idx']].drop_duplicates().to_csv(f"{ddir}/processed/activity/data/peak.instances.bed", sep='\t', index=False, header=False)
from basepair.cli.schemas import DataSpec
# DataSpec lists the bigwig tracks / fasta file the model was trained with
ds = DataSpec.load(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml")
!head -n 2 {ddir}/processed/activity/data/peak.instances.bed
# External activity assays to correlate against motif features
assays = ['H3K27ac', 'PolII', 'Groseq']
def tolist(s):
    """Coerce *s* to a list, wrapping a bare string as a one-element list."""
    return [s] if isinstance(s, str) else list(s)
# assay -> list of bigwig paths: external assays from the table, plus the
# ChIP-nexus tasks' positive/negative strand count tracks from the dataspec
bigwigs = {a: tolist(df.loc[a].path) for a in assays}
bigwigs = {**bigwigs, **{t: [ds.task_specs[t].pos_counts, ds.task_specs[t].neg_counts] for t in ds.task_specs}}
bigwigs
from basepair.datasets import *
from basepair.config import valid_chr, test_chr
from kipoi.data_utils import numpy_collate
class ActivityDataset(Dataset):
    """Dataset yielding the total bigwig signal over each bed interval.

    For every interval in `intervals_file`, a per-task resized copy of the
    interval is extracted from each of the task's bigwigs; the coverages are
    summed into one scalar target per task.

    Args:
        intervals_file: bed4 file containing chrom start end name
        fasta_file: file path; Genome sequence
        bigwigs: dict: task -> list of bigwig file paths (summed per task)
        track_width: int or dict task -> int; width each interval is resized to
        incl_chromosomes: if given, keep only intervals on these chromosomes
        excl_chromosomes: if given, drop intervals on these chromosomes
        num_chr_fasta: if True, the tsv-loader will make sure that the chromosomes
            don't start with chr
    """

    def __init__(self, intervals_file,
                 fasta_file,
                 bigwigs,
                 track_width=2000,
                 incl_chromosomes=None,
                 excl_chromosomes=None,
                 num_chr_fasta=False):
        self.num_chr_fasta = num_chr_fasta
        self.intervals_file = intervals_file
        self.fasta_file = fasta_file
        self.bigwigs = bigwigs
        self.incl_chromosomes = incl_chromosomes
        self.excl_chromosomes = excl_chromosomes
        self.tasks = list(self.bigwigs)
        if isinstance(track_width, dict):
            self.track_width = track_width
        else:
            # track_width always needs to be the dictionary
            self.track_width = {t: track_width for t in self.tasks}
        self.tsv = BedDataset(self.intervals_file,
                              num_chr=self.num_chr_fasta,
                              bed_columns=4,
                              ignore_targets=True,
                              incl_chromosomes=incl_chromosomes,
                              excl_chromosomes=excl_chromosomes,
                              )
        # Extractors are created lazily on first __getitem__ (presumably so the
        # dataset object pickles cleanly for worker processes — TODO confirm)
        self.fasta_extractor = None
        self.bigwig_extractors = None

    def __len__(self):
        return len(self.tsv)

    def __getitem__(self, idx):
        if self.fasta_extractor is None:
            self.fasta_extractor = FastaExtractor(self.fasta_file)
            self.bigwig_extractors = {a: [BigwigExtractor(f) for f in self.bigwigs[a]]
                                      for a in self.bigwigs}
        interval, labels = self.tsv[idx]
        # interval = resize_interval(interval, self.track_width[t])
        # # Intervals need to be 1000bp wide
        # assert interval.stop - interval.start == 1000
        # Run the fasta extractor
        # seq = np.squeeze(self.fasta_extractor([interval]))
        # Each task gets its own resized copy of the interval (deepcopy so the
        # original interval and the other tasks' copies are not mutated)
        resized_intervals = {t: resize_interval(deepcopy(interval), self.track_width[t])
                             for t in self.tasks}
        # Scalar target per task: sum over the task's bigwigs, then over positions
        targets = {a: sum([e([resized_intervals[a]])[0]
                           for e in self.bigwig_extractors[a]]).sum()
                   for a in self.tasks}
        return {
            # "inputs": {"seq": seq},
            "targets": targets,
            "metadata": {
                "ranges": GenomicRanges(interval.chrom, interval.start, interval.stop, str(idx)),
                "ranges_wide": {t: GenomicRanges.from_interval(interval_wide)
                                for t, interval_wide in resized_intervals.items()},
                "name": interval.name
            }
        }

    def get_targets(self):
        return self.tsv.get_targets()
# External assays use 2kb windows; ChIP-nexus tasks use 1kb
track_width = {t: 2000 if t in assays else 1000 for t in bigwigs}
track_width
# Training set: exclude validation + test chromosomes
dl = ActivityDataset(f"{ddir}/processed/activity/data/peak.instances.bed", ds.fasta_file, bigwigs,
                     track_width=track_width,
                     excl_chromosomes=valid_chr + test_chr)
train = numpy_collate([dl[i] for i in tqdm(range(len(dl)))])
dfy = pd.DataFrame(train['targets'])
# Validation set: only validation chromosomes
dl = ActivityDataset(f"{ddir}/processed/activity/data/peak.instances.bed", ds.fasta_file, bigwigs,
                     track_width=track_width,
                     incl_chromosomes=valid_chr)
valid = numpy_collate([dl[i] for i in tqdm(range(len(dl)))])
dfy_valid = pd.DataFrame(valid['targets'])
dfi.head()
# Columns
# pattern_short, match_weighted_p, imp_weighted_p, pattern_center,
columns = ["example_idx", "pattern_name", "example_center", "imp_weighted_p", "match_weighted_p"]
# Toy frame `x` used below to develop/sanity-check the feature functions
values = ['a', 'b', 'c']
data = {"example_idx": [1, 1, 1],
        # 'c' is an unobserved category -> exercises the empty-group path
        "pattern_name": pd.Categorical(['a', 'a', 'b'], values),
        'pattern_center': [10, 20, 50],
        'imp_weighted_p': [.5] * 3,
        'match_weighted_p': [.5] * 3}
x = pd.DataFrame(data)
# Get all features for the strongest motif
# imp_weighted_p and match_weighted_p for the strongest motif
def index_prefix(s, value):
    """Return a copy of *s* with *value* prepended to every index label."""
    return s.rename(index=lambda label: value + str(label))
def index_suffix(s, value):
    """Return a copy of *s* with *value* appended to every index label."""
    return s.rename(index=lambda label: str(label) + value)
# --------------------------------------------
# Features
def counts(x):
    """Per-pattern occurrence counts, indexed as '<pattern>/counts'."""
    per_pattern = x['pattern_name'].value_counts()
    return index_suffix(per_pattern, '/counts')
def present(x):
    """Boolean per-pattern presence flags, indexed as '<pattern>/present'."""
    occurred = x['pattern_name'].value_counts() > 0
    return index_suffix(occurred, '/present')
def features_max(x, max_feat='imp_weighted_p',
                 output_feat=('imp_weighted_p', 'match_weighted_p', 'pattern_center')):
    """Per pattern, report `output_feat` values of the strongest instance.

    Args:
        x: DataFrame with a 'pattern_name' column plus the `output_feat` columns.
        max_feat: column used to rank instances within each pattern.
        output_feat: columns reported for the top-ranked instance.

    Returns:
        pd.Series indexed as '<pattern>/max/<feature>'; all-NaN entries for
        unobserved pattern categories.
    """
    # Tuple default replaces the original mutable-default list (shared across
    # calls); normalize to a list for column selection.
    output_feat = list(output_feat)

    def subset_max(group):
        # Empty group (unobserved category) -> a single all-NaN row.
        # (Original shadowed the outer `x` here and in the comprehension.)
        if len(group) == 0:
            return pd.DataFrame({feat: [np.nan] for feat in output_feat})
        else:
            return group.loc[[group[max_feat].idxmax()]][output_feat]
    # Select the strongest motif instance within each pattern
    x_single = x.groupby("pattern_name").apply(subset_max).reset_index()
    del x_single['level_1']
    # Flatten (pattern, feature) pairs into a single wide row
    x_single = pd.DataFrame(x_single.set_index('pattern_name').stack(dropna=False)).T
    x_single.columns = ['%s%s' % (a, '/max/%s' % b if b else '')
                        for a, b in x_single.columns]
    return x_single.iloc[0]
# --------------------------------------------
# run for all the features (sanity check on the toy frame `x`)
pd.concat([counts(x), present(x), features_max(x)])
# TODO - make combination of different features
# make pattern_name categorical (fixes category set/order to `motifs`)
dfi.pattern_name = pd.Categorical(dfi.pattern_name, list(motifs))
# Missing scores are treated as zero evidence
dfi.imp_weighted_p = dfi.imp_weighted_p.fillna(0)
dfi.match_weighted_p = dfi.match_weighted_p.fillna(0)
x
# Single-example feature row, indexed by the example's index label
df = pd.DataFrame(pd.concat([counts(x), present(x), features_max(x)])).T
df.index = [x.index[0]]
df
def apply_fn(x):
    """Collapse all motif instances of one example into one feature row."""
    feats = pd.concat([counts(x), present(x), features_max(x)])
    row = pd.DataFrame(feats).T
    row['example_idx'] = x['example_idx'].iloc[0]
    return row
# TODO - put as a jupyter notebook magic?
# Close any lingering tqdm progress bars before spawning parallel workers
from tqdm import tqdm as tqdm_cls
inst = tqdm_cls._instances
for i in range(len(inst)):
    inst.pop().close()
# Build the per-example feature matrix in parallel (-1 = use all cores)
X_feat = apply_parallel(dfi.reset_index()[['example_idx', 'pattern_name',
                                           'match_weighted_p', 'imp_weighted_p',
                                           'pattern_center']].groupby('example_idx'), apply_fn, -1)
# Assemble train/validation design matrices from the per-example motif features.
X = X_feat
y_cols = ['H3K27ac', 'PolII', 'Groseq', 'Oct4', 'Sox2', 'Nanog', 'Klf4']
x_cols = [c for c in X.columns if c != 'example_idx']
from sklearn.preprocessing import StandardScaler
# Training targets: log10(counts + 1), then standardized
y_train = pd.DataFrame(train['targets'])
y_train['example_idx'] = train['metadata']['name'].astype(int)
Xy_train = pd.merge(y_train, X, on='example_idx')
y = Xy_train[y_cols]
yscaler = StandardScaler()
y = yscaler.fit_transform(np.log10(y + 1).values)
x = Xy_train[x_cols]
x = x.fillna(0)
# Validation targets: scaled with the TRAIN-fitted scaler. (The original
# called fit_transform here, silently re-fitting the scaler on validation
# data — a leakage/consistency bug.)
y_valid = pd.DataFrame(valid['targets'])
y_valid['example_idx'] = valid['metadata']['name'].astype(int)
Xy_valid = pd.merge(y_valid, X, on='example_idx')
y_valid = Xy_valid[y_cols]
y_valid = yscaler.transform(np.log10(y_valid + 1).values)
x_valid = Xy_valid[x_cols]
x_valid = x_valid.fillna(0)
# NOTE(review): the next line was truncated in the source and could never
# parse/run — kept for reference, disabled:
# HDF5BatchWriter.dump(output_dir, {"inputs"
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
from basepair.plot.evaluate import regression_eval
from basepair.plot.config import paper_config
paper_config()
# Linear-regression baseline: fit one model per target assay, evaluate on
# the validation chromosomes
i = 0
for i in range(len(y_cols)):
    fig, ax = plt.subplots(figsize=(3, 3))
    print(i, y_cols[i])
    m = LinearRegression()
    m.fit(x.values, y[:, i])
    preds = m.predict(x_valid)
    regression_eval(y_valid[:, i], preds, task=y_cols[i], alpha=0.5, ax=ax);
# Random forest: same setup, nonlinear model
i = 0
for i in range(len(y_cols)):
    fig, ax = plt.subplots(figsize=(3, 3))
    print(i, y_cols[i])
    m = RandomForestRegressor(n_estimators=100, n_jobs=5)
    m.fit(x.values, y[:, i])
    preds = m.predict(x_valid)
    regression_eval(y_valid[:, i], preds, task=y_cols[i], alpha=0.5, ax=ax);
from basepair.modisco.gspan import load_gspan_as_features
# Frequent-subgraph (gSpan) features: motif co-occurrence patterns per example
gspan_dir = modisco_dir / 'interpretable-model/gSpan'
graph_file = gspan_dir / 'match!=low,imp=high,dist=10-50.data'
gspan_output = gspan_dir / "gspan.output.txt"
dfg = load_gspan_as_features(gspan_output, str(graph_file) + ".idx")
dfg.head()
# Append to X; examples with no graph match get False for all graph features
X = pd.merge(X_feat, dfg.reset_index(), on='example_idx', how='left')
X[dfg.columns] = X[dfg.columns].fillna(False)
# Rebuild the train/validation matrices, now with gSpan graph features in X.
y_cols = ['H3K27ac', 'PolII', 'Groseq', 'Oct4', 'Sox2', 'Nanog', 'Klf4']
x_cols = [c for c in X.columns if c != 'example_idx']
from sklearn.preprocessing import StandardScaler
# Training targets: log10(counts + 1), then standardized
y_train = pd.DataFrame(train['targets'])
y_train['example_idx'] = train['metadata']['name'].astype(int)
Xy_train = pd.merge(y_train, X, on='example_idx')
y = Xy_train[y_cols]
yscaler = StandardScaler()
y = yscaler.fit_transform(np.log10(y + 1).values)
x = Xy_train[x_cols]
x = x.fillna(0)
# Validation targets: scaled with the TRAIN-fitted scaler. (The original
# called fit_transform here, re-fitting the scaler on validation data.)
y_valid = pd.DataFrame(valid['targets'])
y_valid['example_idx'] = valid['metadata']['name'].astype(int)
Xy_valid = pd.merge(y_valid, X, on='example_idx')
y_valid = Xy_valid[y_cols]
y_valid = yscaler.transform(np.log10(y_valid + 1).values)
x_valid = Xy_valid[x_cols]
x_valid = x_valid.fillna(0)
x.shape
# Re-run both baselines with the graph-augmented feature matrix
# Linear regression, one model per target assay
i = 0
for i in range(len(y_cols)):
    fig, ax = plt.subplots(figsize=(3, 3))
    print(i, y_cols[i])
    m = LinearRegression()
    m.fit(x.values, y[:, i])
    preds = m.predict(x_valid)
    regression_eval(y_valid[:, i], preds, task=y_cols[i], alpha=0.5, ax=ax);
# Random forest, one model per target assay
i = 0
for i in range(len(y_cols)):
    fig, ax = plt.subplots(figsize=(3, 3))
    print(i, y_cols[i])
    m = RandomForestRegressor(n_estimators=100, n_jobs=5)
    m.fit(x.values, y[:, i])
    preds = m.predict(x_valid)
    regression_eval(y_valid[:, i], preds, task=y_cols[i], alpha=0.5, ax=ax);