# instances per sequence
from basepair.imports import *
from basepair.plot.profiles import extract_signal
from basepair.math import softmax
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
from basepair.plot.profiles import plot_stranded_profile, multiple_plot_stranded_profile
import plotnine
from plotnine import *
import statsmodels.api as sm
import statsmodels.formula.api as smf
from basepair.modisco.results import Seqlet, resize_seqlets
from basepair.modisco.core import dfi2seqlets, annotate_profile
from basepair.cli.modisco import load_profiles
import dask.dataframe as dd
import hvplot.pandas
import holoviews as hv
hv.extension('bokeh', 'matplotlib')
# Paths to the trained model and its TF-MoDISco output.
# NOTE(review): `ddir`, `Path`, `ModiscoResult`, `HDF5Reader` presumably come
# from the `basepair.imports` star import above — confirm.
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / "modisco/all/profile/"
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
# Load the data
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()
# Curated motifs used throughout the analysis: display name -> short pattern id.
motifs = OrderedDict()
motifs["Oct4-Sox2"] = "m0_p0"
motifs["Oct4-Sox2-deg"] = "m6_p8"
motifs["Oct4"] = "m0_p18"
motifs["Sox2"] = "m0_p1"
motifs["Essrb"] = "m0_p2"
motifs["Nanog"] = "m0_p3"
motifs["Nanog-periodic"] = "m0_p9"
motifs["Klf4"] = "m2_p0"
# plot them
# Sequence logo (IC-trimmed, resized to 40 bp) for each curated motif.
for tf, p in motifs.items():
    mr.get_pattern(longer_pattern(p)).trim_seq_ic(0.08).resize(40).plot("seq");
    plt.title(tf)
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix
# Load all scanned motif instances and drop overlapping hits.
dfi = load_instances(f"{modisco_dir}/instances.parq", motifs)
dfi = filter_nonoverlapping_intervals(dfi)
# Number of distinct sequences (examples) containing at least one instance.
total_examples = len(dfi.example_idx.unique())
total_examples
# pattern lengths
dfi.groupby(['pattern_name', 'pattern_len']).size()
# Instance counts per pattern, stratified by match/importance category.
n_patterns = dfi.groupby(['pattern_name', 'match_weighted_cat', 'imp_weighted_cat']).size()
# NOTE(review): .size() returns a Series, so rename(columns=...) does not
# rename anything (compare the .size().reset_index() idiom used further down).
# Left as-is because a later cell (`n_patterns.reset_index()...`) relies on
# n_patterns still being a Series with a MultiIndex — confirm before changing.
n_patterns = n_patterns.rename(columns={0: "n_instances"})
n_patterns.hvplot.bar('imp_weighted_cat', by='pattern_name',groupby='match_weighted_cat', rot=90)
# Same counts, restricted to instances near the sequence center (400-600).
n_patterns = dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].groupby(['pattern_name', 'match_weighted_cat', 'imp_weighted_cat']).size()
n_patterns = n_patterns.rename(columns={0: "n_instances"})
n_patterns.hvplot.bar('imp_weighted_cat', by='pattern_name',groupby='match_weighted_cat', rot=90)
from upsetplot import plot as upsetplt
#.query('imp_weighted_cat=="high"')
# Per-example motif count table for central, high-confidence instances.
counts = pd.pivot_table(dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_cat=="high"').query('imp_weighted_cat=="high"'),
                        'pattern_len', "example_idx", "pattern_name", aggfunc=len, fill_value=0)
# Binarize to motif presence/absence per example.
c = counts >0
# UpSet plot of the 50 most frequent motif combinations.
upsetplt(c.groupby(list(c.columns)).size().sort_values(ascending=False)[:50], sort_by='cardinality');
# Examples containing both Nanog and Oct4-Sox2: distribution of total counts.
m = counts[['Nanog', 'Oct4-Sox2']]
m[m.all(1)].sum(1).value_counts().plot.bar()
plt.title("Nanog-Sox2")
plt.xlabel("Number of motifs in the center >=2");
# Motif co-occurrence matrices under several window / stringency settings.
# Window 450-550, match not low:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 450) & (dfi.pattern_center < 550)].query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'), total_examples)
# Window 450-550, match high:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 450) & (dfi.pattern_center < 550)].query('match_weighted_cat=="high"').query('imp_weighted_cat=="high"'), total_examples)
# Window 400-600, match not low:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'), total_examples)
# Window 400-600, match high:
plot_coocurence_matrix(dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_cat=="high"').query('imp_weighted_cat=="high"'), total_examples)
# No positional window:
plot_coocurence_matrix(dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"'), total_examples)
# Same, but thresholding importance on the percentile instead of the category.
plot_coocurence_matrix(dfi[(dfi.pattern_center > 450) & (dfi.pattern_center < 550)].query('match_weighted_cat!="low"').query('imp_weighted_p>0.33'), total_examples)
plot_coocurence_matrix(dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_cat!="low"').query('imp_weighted_p>0.33'), total_examples)
plot_coocurence_matrix(dfi.query('match_weighted_cat!="low"').query('imp_weighted_p>0.33'), total_examples)
from basepair.modisco.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale, preproc_df
# TODO - speedup using IntervalIndex .. .loc[motif_center...]
# Read seqlets from modisco
df_seqlets = mr.seqlet_df_instances(trim_frac=0.08)
# Align column names with the instance table `dfi`.
df_seqlets = df_seqlets.rename(columns=dict(seqname="example_idx", start='seqlet_start', end='seqlet_end', strand='seqlet_strand', center='seqlet_center'))
# Use short pattern ids ("mX_pY"), consistent with `dfi`.
df_seqlets['pattern'] = df_seqlets['pattern'].map({k: shorten_pattern(k) for k in df_seqlets.pattern.unique()})
del df_seqlets['name']
# add the `cluster,cluster_order`
dfc = pd.read_csv("http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/motif_clustering/nte_seq.csv")
df_seqlets = df_seqlets.merge(dfc[['pattern', 'cluster', 'cluster_order']], on='pattern')
def non_overlapping(x, df_seqlets, min_dist=15):
    """Drop motif instances in `x` that lie close to a modisco seqlet.

    Args:
        x: instances from a single (pattern_short, example_idx) group; must
            have `pattern_short`, `example_idx` and `pattern_center` columns.
        df_seqlets: modisco seqlets with `pattern`, `example_idx` and
            `seqlet_center` columns.
        min_dist: minimum center-to-center distance (bp) to the nearest
            matching seqlet for an instance to be kept (default 15, the
            previously hard-coded value).

    Returns:
        Subset of `x` whose nearest matching seqlet is farther than `min_dist`;
        `x` unchanged if it is empty or no seqlet matches.
    """
    if len(x) == 0:
        return x
    # Seqlets of the same pattern within the same example.
    dfs = df_seqlets[(df_seqlets.pattern == x['pattern_short'].iloc[0]) &
                     (df_seqlets.example_idx == x['example_idx'].iloc[0])]
    if len(dfs) == 0:
        # Nothing to collide with -> keep everything.
        return x
    # Pairwise |instance_center - seqlet_center|, then the distance to the
    # nearest seqlet per instance; keep instances farther than min_dist.
    dist = np.abs(x['pattern_center'].values.reshape((-1, 1)) -
                  dfs.seqlet_center.values.reshape((1, -1))).min(1)
    return x[dist > min_dist]
# High-confidence instances only.
dfi_subset = dfi.query('match_weighted_cat!="low"').query('imp_weighted_cat=="high"')
# get rid of overlapping intervals
dfi_subset_subset = dfi_subset.groupby(['pattern_short', 'example_idx']).apply(lambda x: non_overlapping(x, df_seqlets))
# Per-example count table of the remaining scanned instances.
counts = pd.pivot_table(dfi_subset_subset, 'pattern_len', "example_idx", "pattern_name", aggfunc=len, fill_value=0)
# Binarize to presence/absence per example.
c = counts >0
# Attach the presence columns to every seqlet of the same example.
df_seqlets_c = c.merge(df_seqlets, on='example_idx')
df_seqlets_c.head(2)
# Per modisco pattern: fraction of its seqlet examples containing each motif.
df_seqlets_c_agg = df_seqlets_c.groupby(['cluster', 'cluster_order', 'pattern'])[list(c)].mean()
df_seqlets_c_agg_size = df_seqlets_c.groupby(['cluster', 'cluster_order', 'pattern']).size()
df_seqlets_c_agg = pd.concat([df_seqlets_c_agg, df_seqlets_c_agg_size], axis=1).reset_index().set_index('pattern').rename(columns={0: 'n_seqlets'})
df_seqlets_c_agg = df_seqlets_c_agg.sort_values('cluster_order')
# Row annotation (cluster id, seqlet count); drop the metadata columns
# from the heatmap body itself.
col_anno = df_seqlets_c_agg[['cluster', 'n_seqlets']]
del df_seqlets_c_agg['cluster']
del df_seqlets_c_agg['cluster_order']
del df_seqlets_c_agg['n_seqlets']
col_anno.cluster = pd.Categorical(col_anno.cluster)
# plot
# Heatmap of motif-presence fractions per modisco pattern, ordered by
# cluster_order; rows colored by cluster.
sns.clustermap(df_seqlets_c_agg,
               row_cluster=False,
               col_cluster=False,
               #col_cluster
               # standard_scale=1,
               row_colors=to_colors(col_anno),
               figsize=(8,25),
               cmap='Blues')
from basepair.modisco.pattern_instances import construct_motif_pairs
# Motif pair whose spacing we analyze below.
motif_pair = ['Oct4-Sox2', 'Sox2']
# Central, reasonably well-matching, positively important instances only.
dfi_filtered = dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)].query('match_weighted_p > 0.2').query('imp_weighted_p > 0')
# NOTE(review): `motif_pairs` is not the name imported above
# (`construct_motif_pairs`) — presumably it comes from the star import;
# verify, otherwise this raises NameError.
dftw_filt = motif_pairs(dfi_filtered, motif_pair)
# flatten columns for easier plotting
dftw_filt.set_index('strand_combination', append=True, inplace=True, drop=False)
dftw_filt.columns= dftw_filt.columns.get_level_values(0)+ '/' + dftw_filt.columns.get_level_values(1)
# Histogram range: same-motif pairs use only non-negative distances.
if motif_pair[0] == motif_pair[1]:
    bins=150
    bin_range = (0, 150)
else:
    bins=300
    bin_range = (-150, 150)
# Spacing histogram per strand combination, plus importance-vs-spacing
# scatters for each motif of the pair, stacked into one column.
(dftw_filt['center_diff/'].hvplot.hist(bins=bins,
                                       by='strand_combination',
                                       width=700,
                                       height=200,
                                       subplots=True,
                                       bin_range=bin_range).cols(1) + \
 dftw_filt.hvplot.scatter(x='center_diff/', y='imp_weighted/'+ motif_pair[0],# 'imp_weighted_p/Sox2'],
                          by='strand_combination',
                          width=700, height=400,
                          #width=200, height=200,
                          subplots=False,
                          xlim=bin_range,
                          alpha=0.4) + \
 dftw_filt.hvplot.scatter(x='center_diff/', y='imp_weighted/'+ motif_pair[1],# 'imp_weighted_p/Sox2'],
                          by='strand_combination',
                          width=700, height=400,
                          #width=200, height=200,
                          subplots=False,
                          xlim=bin_range,
                          alpha=0.4)
).cols(1)
from pybedtools import Interval
import pybedtools
from genomelake.extractors import FastaExtractor
from basepair.cli.schemas import DataSpec
from basepair.modisco.utils import ic_scale
from basepair.plot.tracks import plot_tracks
from basepair.modisco.core import Pattern
# Examples where the pair sits on '-+' strands exactly 58 bp apart.
a = dftw_filt.reset_index().query("strand_combination=='-+'")
a = a[a['center_diff/'] == -58]
spaced_example_idx = a.example_idx
print(len(spaced_example_idx))
# Genomic coordinates of those examples (one row per example).
s = dfi[dfi.example_idx.isin(spaced_example_idx)][['example_idx', 'example_chrom', 'example_start', 'example_end', 'example_interval_from_task']].drop_duplicates()
a = pd.merge(a, s, on='example_idx').sort_values(['example_chrom', 'example_start'])
def pair_center(row, motif_pair):
    """Integer midpoint between the centers of the two motifs of `motif_pair`."""
    center_a = row['pattern_center/' + motif_pair[0]]
    center_b = row['pattern_center/' + motif_pair[1]]
    return (center_a + center_b) // 2
# get the sequences
# 70 bp windows centered on the midpoint between the paired motifs.
intervals = [pybedtools.create_interval_from_list([row.example_chrom, row.example_start + pair_center(row, motif_pair) - 35,
                                                   row.example_start + pair_center(row, motif_pair) + 35, row['example_idx']]) for i,row in a.iterrows()]
ds = DataSpec.load(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml")
fa = FastaExtractor(ds.fasta_file)
seqs = fa(intervals)
# match to the existing pattern
patterns = [mr.get_pattern(p) for p in mr.patterns()]
# Average base frequency across the extracted sequences -> PWM-like pattern.
pwm = seqs.mean(0)
pwm = Pattern('found', pwm, dict(a=pwm), dict(a=pwm))
pwm = pwm.resize(len(patterns[0]))
# Similarity of the averaged sequence to every modisco pattern.
matches = np.array([pattern.similarity(pwm, track='seq') for pattern in patterns])
# plot
plt.plot(matches)
# Best-matching modisco pattern, plotted next to the aligned average.
mid = matches.argmax()
patterns[mid].plot("seq_ic");
pwm.align(patterns[mid]).plot("seq_ic");
dfi.head()
# Scanned instance counts per pattern / category.
n_patterns2 = dfi.groupby(['pattern_short', 'match_weighted_cat', 'imp_weighted_cat']).size().reset_index()
n_patterns2 = n_patterns2.rename(columns={0: "n_instances"})
# Modisco seqlet counts per pattern.
dfn = pd.DataFrame({'pattern_short': shorten_pattern(p), 'seqlets': mr.n_seqlets(*p.split('/'))} for p in mr.patterns())
# Seqlet count vs scanned-instance count per pattern.
dfn.merge(n_patterns2, on='pattern_short').query('match_weighted_cat=="high"').\
    hvplot.scatter(x='seqlets', y='n_instances', by='imp_weighted_cat', alpha=0.5)
dfn.merge(n_patterns2, on='pattern_short').query('match_weighted_cat=="high"').query('imp_weighted_cat=="high"').\
    hvplot.scatter(x='seqlets', y='n_instances', by='pattern_short', alpha=0.5, legend=False)
# NOTE: these points on the diagonal are typically pattern 0 from different metaclusters
# most frequent seqlets
dfn.sort_values('seqlets', ascending=False).head(9)
# most frequent motif instances
n_patterns.reset_index().rename(columns={0: 'scanned_instances'}).sort_values('scanned_instances', ascending=False).head(9)
# define a set of core motifs
dfn.head()
# Restrict to the curated core motifs and give them readable names.
dfi_motifs = dfi[dfi.pattern_short.isin(list(motifs.values()))]
dfi_motifs['pattern_name'] = dfi_motifs.pattern_short.map({v: k for k,v in motifs.items()})
# Instances per example, per motif / category.
n_patterns_per_example = dfi_motifs.groupby(['pattern_name', 'example_idx', 'match_weighted_cat', 'imp_weighted_cat']).size().reset_index()#.groupby(['pattern_short'])[0].value_counts()
n_patterns_per_example = n_patterns_per_example.rename(columns={0: "n seqlets per example"})
n_patterns_per_example.hvplot.box(y='n seqlets per example', by=['pattern_name', 'match_weighted_cat'], invert=True, height=500, width=800, legend=False)
# Score distributions per motif (raw and percentile).
dfi_motifs.hvplot.kde(y='match_weighted', by='pattern_name')
dfi_motifs.hvplot.kde(y='match_weighted_p', by='pattern_name')
dfi_motifs.hvplot.kde(y='imp_weighted', by='pattern_name')
dfi_motifs.hvplot.kde(y='imp_weighted_p', by='pattern_name')
from basepair.stats import low_medium_high
# task = "Nanog"
# pattern = "metacluster_0/pattern_0"
model_dir
modisco_dir
# load profiles
profiles = load_profiles(modisco_dir, model_dir/'grad.all.h5')
tasks = list(profiles)
# Keep only confidently matched, positively important core-motif instances.
dfim = dfi_motifs[dfi_motifs.match_weighted_cat == 'high']
dfim = dfim[dfim.imp_weighted_p > 0]
dfim.shape
# Annotate the major motifs with the profile features
# Attach per-task profile summary features to each motif instance.
dfi_motifs_anno = annotate_profile(dfim, mr, profiles)
# assert len(dfi_motifs_anno) == len(dfi_filter_valid(dfim, 70, 1000))
# Correlate importance vs. profile counts
# Importance percentile vs profile-count percentile for the two main motifs.
dfi_motifs_anno.query('pattern_name=="Oct4-Sox2"').hvplot.scatter(x='Oct4/profile_counts_p', y='imp_weighted_p', alpha=0.1)
dfi_motifs_anno.query('pattern_name=="Nanog"').hvplot.scatter(x='Nanog/profile_counts_p', y='imp_weighted_p', alpha=0.05, datashade=False)
from joblib import Parallel, delayed
def abs_min(x):
    """Return the element of `x` with the smallest absolute value (sign kept)."""
    closest = np.abs(x).argmin()
    return x[closest]
def pairwise_distances_single(dfi_ex, dfi_motifs_anno_ex):
    """Per-example pairwise motif distances.

    For every core motif in `dfi_motifs_anno_ex`, summarize all OTHER
    instances of the same example (`dfi_ex`): per (pattern, match cat,
    importance cat) the neighbor count (`size`) and the signed distance of
    the closest neighbor (`abs_min`), prefixed 'side/', with the core
    motif's own row attached as extra columns.

    NOTE(review): `dfi_ex` shrinks across iterations (each core motif is
    removed and never restored), so later core motifs do not see earlier
    ones as neighbors — confirm this is intended.
    """
    out = []
    for i, core_motif in dfi_motifs_anno_ex.iterrows():
        #core_motif = dfi_motifs_anno_ex.iloc[i]
        # remove self
        select = (dfi_ex.pattern == core_motif.pattern) & (dfi_ex.pattern_center == core_motif.pattern_center)
        assert select.sum() == 1
        dfi_ex = dfi_ex[~select]
        # relative distance to the pattern_center
        dfi_ex['pattern_center_rel'] = dfi_ex.pattern_center - core_motif.pattern_center
        # number of other examples
        df_scores = dfi_ex.groupby(['pattern', 'match_weighted_cat', 'imp_weighted_cat']).pattern_center_rel.agg([np.size, abs_min]).reset_index()
        # Prefix summary columns and attach the core motif's metadata.
        df_scores.columns = [f'side/{c}' for c in df_scores.columns]
        df_scores = df_scores.assign(**core_motif.to_dict())
        out.append(df_scores)
    return pd.concat(out, axis=0)
def pariwise_distances(dfi, dfi_motifs_anno, n_jobs=8):
    """Run `pairwise_distances_single` for every example in parallel and
    concatenate the results.

    (Name typo 'pariwise' kept on purpose — the call site below uses it.)
    """
    #dfi_motifs_anno_tf = dfi_motifs_anno[dfi_motifs_anno.tf == tf]
    all_example_idx = dfi_motifs_anno.example_idx.unique()
    out = Parallel(n_jobs=n_jobs)(delayed(pairwise_distances_single)(dfi[dfi.example_idx == example_idx], dfi_motifs_anno[dfi_motifs_anno.example_idx == example_idx])
                                  for example_idx in tqdm(all_example_idx))
    return pd.concat(out, axis=0)
# Compute pairwise motif distances for all positively-important instances.
df_dist = pariwise_distances(dfi[dfi.imp_weighted_p > 0], dfi_motifs_anno, n_jobs=20)
df_dist.head()
df_dist.shape
from plotnine import *
# Convenience columns for plotting.
df_dist['side_pattern_short'] = df_dist['side/pattern'].map({k: shorten_pattern(k) for k in df_dist['side/pattern'].unique()})
df_dist['side_imp_weighted_cat'] = df_dist['side/imp_weighted_cat']
# Cap the neighbor count at 3 for a readable legend.
df_dist['n_motifs'] = pd.Categorical(np.minimum(df_dist['side/size'], 3))
df_dist.head(2)
plotnine.options.figure_size = (20,3)
# NOTE(review): `side_tf` is only created further below (on df_dist_filt),
# so this facet likely fails on df_dist as-is — verify cell ordering.
ggplot(aes(x='side/imp_weighted_cat', y='Nanog/profile_counts_p', color='n_motifs'), df_dist[df_dist['side/match_weighted_cat'] == 'high']) + \
    facet_wrap("~side_tf", ncol = 10, scales='free_x') + geom_boxplot() + \
    theme_bw()
list(motifs.values())
motifs
# Restrict to high-match neighbors that are core motifs; give them names.
df_dist_filt = df_dist[(df_dist['side/match_weighted_cat'] == 'high') & (df_dist['side_pattern_short'].isin(motifs.values()))]
df_dist_filt['side_tf'] = df_dist_filt.side_pattern_short.map({v:k for k,v in motifs.items()})
plotnine.options.figure_size = (20,10)
ggplot(aes(x='side/imp_weighted_cat', y='imp_weighted', color='n_motifs'), df_dist_filt) + \
    facet_grid("tf~side_tf") + geom_boxplot() + \
    theme_bw()
# plotnine.options.figure_size = (20,3)
# ggplot(aes(x='side/imp_weighted_cat', y='Nanog/profile_max_p', color='n_motifs'), df_dist[(df_dist['side/match_weighted_cat'] == 'high') & (df_dist['side_pattern_short'].isin(motifs.values()))]) + \
# facet_wrap("~side_tf", ncol = 10, scales='free_x') + geom_boxplot() + \
# theme_bw()
# Signed distance to the nearest neighboring instance.
df_dist_filt['rel_dist'] = df_dist_filt['side/abs_min']
df_dist_filt['side_imp_weighted_cat'] = pd.Categorical(df_dist_filt['side_imp_weighted_cat'], categories=['high', 'medium', 'low'])
# Profile/importance vs signed distance, faceted by
# neighbor-importance category x neighbor motif.
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist', y='Nanog/profile_max'), df_dist_filt[df_dist_filt.pattern_name=='Nanog']) + \
    xlim([-150, 150]) + \
    scale_y_log10() + \
    facet_grid("side_imp_weighted_cat~side_tf") + geom_point(alpha=0.05) + \
    theme_classic()
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist', y='Nanog/profile_counts'), df_dist_filt) + \
    xlim([-150, 150]) + \
    scale_y_log10() + \
    facet_grid("side_imp_weighted_cat~side_tf") + geom_point(alpha=0.1) + \
    theme_classic()
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist', y='imp_weighted'), df_dist_filt) + \
    xlim([-150, 150]) + \
    facet_grid("side_imp_weighted_cat~side_tf") + geom_point(alpha=0.1) + \
    theme_classic()
def averaged_distance(df_dist_filt, side_tf, variable='imp_weighted', strand='-', window=15, log=False, max_dist=150):
    """Rolling mean of `variable` as a function of absolute motif distance.

    Args:
        df_dist_filt: pairwise-distance table with `side_tf`,
            `side_imp_weighted_cat`, `rel_dist`, `strand` and `variable` columns.
        side_tf: neighbor motif name to select.
        variable: column to average.
        strand: strand of the core motif to keep.
        window: rolling-mean window size (rows).
        log: if True, average log10(variable) instead of the raw values.
        max_dist: keep distances strictly below this many bp (default 150,
            the previously hard-coded value).

    Returns:
        DataFrame with `rel_dist` and the rolling-averaged `variable`.
    """
    mask = (df_dist_filt.side_tf == side_tf) & (df_dist_filt.side_imp_weighted_cat == 'high')
    # Explicit copy: the original attribute-assignment on a filtered slice
    # triggered SettingWithCopyWarning and could mutate the caller's frame.
    rp = df_dist_filt[mask].copy()
    rp['rel_dist'] = rp['rel_dist'].abs()
    rp = rp.sort_values("rel_dist")
    rp = rp[(rp.rel_dist < max_dist) & (rp.strand == strand)]
    series = rp.set_index('rel_dist')[variable]
    if log:
        series = np.log10(series)
    return series.rolling(window).mean().reset_index()
# Rolling-average curves of `variable` vs distance: rows = strand,
# columns = neighbor motif.
tasks = list(motifs)
variable = 'imp_weighted'
fig, axes = plt.subplots(2, len(tasks), figsize=(30,5), sharex=True);
for i, strand in enumerate(['+', '-']):
    for j, side_tf in enumerate(list(motifs)):
        p = averaged_distance(df_dist_filt, side_tf, variable=variable, strand=strand, window=15)
        p.plot(x='rel_dist', y=variable, ax=axes[i,j], legend=False)
        if i ==0:
            axes[i,j].set_title(side_tf)
        if j == 0:
            axes[i,j].set_ylabel(f"{variable}, {strand}")
# Same, for Nanog profile counts (log10-averaged).
variable = 'Nanog/profile_counts'
fig, axes = plt.subplots(2, len(tasks), figsize=(30,5), sharex=True);
for i, strand in enumerate(['+', '-']):
    for j, side_tf in enumerate(list(motifs)):
        p = averaged_distance(df_dist_filt, side_tf, variable=variable, strand=strand, window=15, log=True)
        p.plot(x='rel_dist', y=variable, ax=axes[i,j], legend=False)
        if i ==0:
            axes[i,j].set_title(side_tf)
        if j == 0:
            axes[i,j].set_ylabel(f"{strand}")
# Same, for Nanog profile max (log10-averaged).
variable = 'Nanog/profile_max'
fig, axes = plt.subplots(2, len(tasks), figsize=(30,5), sharex=True);
for i, strand in enumerate(['+', '-']):
    for j, side_tf in enumerate(list(motifs)):
        p = averaged_distance(df_dist_filt, side_tf, variable=variable, strand=strand, window=15, log=True)
        p.plot(x='rel_dist', y=variable, ax=axes[i,j], legend=False)
        if i ==0:
            axes[i,j].set_title(side_tf)
        if j == 0:
            axes[i,j].set_ylabel(f"{strand}")
# Focus: instances with a high-importance Nanog neighbor, '-' strand,
# non-negative distances — rolling-averaged importance vs distance.
rp = df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')].sort_values("rel_dist")
rp = rp[(rp.rel_dist < 150) & (rp.strand == '-') & (rp.rel_dist >= 0)]
p = rp.set_index('rel_dist').imp_weighted.rolling(15).mean().reset_index()
fig, ax = plt.subplots(1, 1, figsize=(10,3));
p.plot(x='rel_dist', y='imp_weighted', ax=ax)
# Raw scatter of the same relationship, per strand.
plotnine.options.figure_size = (6,8)
ggplot(aes(x='rel_dist', y='imp_weighted'), df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')]) + \
    xlim([0, 150]) + \
    geom_point(alpha=0.1) + \
    facet_grid("strand~.", labeller='label_both') + geom_point(alpha=0.05) + \
    scale_x_continuous(limits = [0, 150], breaks=np.arange(0, 150, 10)) + \
    theme_classic()
plotnine.options.figure_size = (6,8)
ggplot(aes(x='rel_dist', y='Nanog/profile_counts'), df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')]) + \
    geom_point(alpha=0.1) + \
    scale_y_log10() + \
    facet_grid("strand~.", labeller='label_both') + geom_point(alpha=0.05) + \
    scale_x_continuous(limits = [0, 150], breaks=np.arange(0, 150, 10)) + \
    theme_classic()
plotnine.options.figure_size = (6,8)
ggplot(aes(x='rel_dist', y='Nanog/profile_max'), df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')]) + \
    xlim([0, 150]) + \
    geom_point(alpha=0.1) + \
    scale_y_log10() + \
    facet_grid("strand~.", labeller='label_both') + geom_point(alpha=0.05) + \
    scale_x_continuous(limits = [0, 150], breaks=np.arange(0, 150, 10)) + \
    theme_classic()
# Histogram of distances to high-importance Nanog neighbors ('-' strand).
df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high') & (df_dist_filt.strand == '-') & (df_dist_filt.rel_dist < 150)& (df_dist_filt.rel_dist >= 0)].rel_dist.hvplot.hist(bins=145)
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist'), df_dist_filt[(df_dist_filt.side_tf == 'Nanog') & (df_dist_filt.side_imp_weighted_cat == 'high')]) + \
    xlim([0, 150]) + \
    geom_histogram(bins=150) + \
    facet_grid(".~strand", labeller='label_both') + \
    theme_classic()
# Distance histograms for every neighbor motif x importance category.
dfs = df_dist_filt.groupby(['rel_dist', 'side_imp_weighted_cat', 'side_pattern_short']).size().reset_index()
dfs = dfs.rename(columns={0: 'n'})
#dfs = dfs[dfs['side_imp_weighted_cat'] == 'high']
dfs['pattern_name'] = dfs.side_pattern_short.map({v:k for k,v in motifs.items()})
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist', y='n'), dfs) + \
    xlim([-150, 150]) + \
    facet_grid("pattern_name~side_imp_weighted_cat", scales='free_y') + geom_line() + \
    theme_classic()
plotnine.options.figure_size = (20,5)
ggplot(aes(x='rel_dist'), df_dist_filt) + \
    xlim([-150, 150]) + \
    facet_grid("side_imp_weighted_cat~side_pattern_short") + geom_histogram(bins=300) + \
    theme_bw()
df_dist_filt.head()