Goal

  • analyze instance locations. Focus on Oct4 peaks

TODO

  • sample a few points from the scatterplot: importance vs match to see what you are getting
  • try the same analysis for a few factors
    • try to find the right cutoff based on the number of motifs you expect
  • eyeball a few regions (highlight the found seqlets in there)
    • before doing the search, merge all the metaclusters (ignore the importance score magnitude)

Conclusions

  • ignore hypothetical scores when scoring seqlet instances
In [2]:
from basepair.imports import *
from basepair.samplers import top_sum_count, top_max_count
from basepair.plot.tracks import plot_tracks, filter_tracks
from kipoi.data_utils import get_dataset_item

from plotnine import *
import plotnine
In [3]:
model_dir = Path(f"{ddir}/processed//chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
In [4]:
modisco_dir = model_dir / "modisco/by_peak_tasks/weighted/Oct4"
In [5]:
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
In [6]:
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
In [7]:
mr.plot_pssm("metacluster_0", "pattern_0", trim_frac=0.08);
mr.plot_pssm("metacluster_0", "pattern_2", trim_frac=0.08);
In [8]:
pattern_name = 'metacluster_0/pattern_0'
In [9]:
pattern = mr.get_pattern(pattern_name)
In [10]:
pattern.plot(rotate_y=0);
In [11]:
def load_scores(h5_file, importance='weighted'):
    """Load sequences, contribution scores and profiles from a gradients HDF5 file.

    Args:
        h5_file: path to the HDF5 file produced by the gradient computation
            (contains `inputs`, `grads/<task>/<importance>` and
            `targets/profile/<task>` entries).
        importance: which importance-score group to read (e.g. 'weighted').

    Returns:
        Tuple `(seq, contrib, hyp_contrib, profile)` where
        - seq: one-hot encoded input sequences,
        - hyp_contrib: {task -> averaged hypothetical contribution scores},
        - contrib: {task -> hyp_contrib * seq} (actual contributions),
        - profile: {'t/<task>' -> measured profile counts}.
    """
    im = HDF5Reader(h5_file)
    im.open()
    d = im.load_all()
    seq = d['inputs']
    tasks = list(d['grads'])
    # NOTE(review): `mean` comes from `basepair.imports`; presumably it averages
    # the stored gradient replicates for the chosen importance group -- confirm.
    hyp_contrib = {task: mean(d['grads'][task][importance])
                   for task in tasks}
    # actual contribution = hypothetical scores masked by the observed sequence
    contrib = {task: hyp_contrib[task] * seq
               for task in tasks}
    profile = {f"t/{task}": d['targets']['profile'][task] for task in tasks}
    im.close()
    return seq, contrib, hyp_contrib, profile


# subset
def get_scores(im, idx, tasks):
    """Load scores for a subset of examples straight from an open HDF5 reader.

    For each task, the two stored 'weighted' gradient replicates are averaged
    to obtain the hypothetical contributions; multiplying by the one-hot
    sequence yields the actual contributions.

    Returns:
        Tuple `(seq, contrib, hyp_contrib, profile)` for the rows in `idx`.
    """
    hyp_contrib = {}
    for task in tasks:
        replicate_sum = (im.f[f'/grads/{task}/weighted/0'][idx] +
                         im.f[f'/grads/{task}/weighted/1'][idx])
        hyp_contrib[task] = replicate_sum / 2
    seq = im.f['/inputs/'][idx]
    contrib = {task: hyp_contrib[task] * seq for task in tasks}
    profile = {task: im.f[f'/targets/profile/{task}'][idx] for task in tasks}
    return seq, contrib, hyp_contrib, profile
In [12]:
seq, contrib, hyp_contrib, profile = load_scores(model_dir / "grad.valid.h5")
In [103]:
example_idx = 11128
In [104]:
idx = top_max_count(profile['t/Oct4'][:, 400:600], 80)
In [105]:
viz_dict = filter_tracks({**get_dataset_item(profile, example_idx), 
                          **get_dataset_item(contrib, example_idx)}, xlim=[400, 600])
In [106]:
plot_tracks(viz_dict);

Focus on one specific example

In [19]:
t = dict(contrib=contrib['Oct4'][example_idx, 400:600],
         hyp_contrib=hyp_contrib['Oct4'][example_idx, 400:600],
         seq=seq[example_idx, 400:600])
In [20]:
# target
plot_tracks(dict(contrib=t['contrib'], hyp_contrib=t['hyp_contrib']));
In [21]:
from basepair.modisco.results import *
In [22]:
pattern.trim_seq_ic(0.08).plot(['seq', 'contrib/Oct4', 'hyp_contrib/Oct4'], 
                               rotate_y=0, height=1, letter_width=0.2);

Get the instances

In [23]:
pattern_names = [
    ("Oct4-Sox2", "metacluster_0/pattern_0"),
    ("Errb", "metacluster_0/pattern_1"),
    ("Sox2", "metacluster_0/pattern_2"),
    ("Nanog", "metacluster_0/pattern_3"),
    ("Klf4", "metacluster_2/pattern_0"),
]
In [24]:
cached = True
In [25]:
cache_dir = Path(Path(ddir) / "cache/dev/")
cache_dir.mkdir(parents=True, exist_ok=True)
cache_file = cache_dir / "hit-scoring-contrib-only.hdf5"
In [26]:
dfp = pd.read_hdf(cache_file, "/scores")
In [26]:
!du -sh {cache_file}
137M	/users/avsec/workspace/basepair/data/cache/dev/hit-scoring-contrib-only.hdf5
In [27]:
#dfp.to_hdf(cache_file, "/scores")
In [28]:
# Scan all patterns against the importance scores and sequences, collect the
# per-pattern instance tables, and cache the concatenated result to HDF5.
# When `cached` is True the scan is skipped and `dfp` is expected to have been
# loaded from `cache_file` already.
if not cached:
    dfl = []
    # NOTE(review): match_l/importance_l/seq_match_l only exist after this
    # branch runs -- the `np.stack` cell below fails when `cached` was True.
    match_l, importance_l, seq_match_l = [], [], []
    for tf, pattern_name in pattern_names:
        # trim the pattern to its information-rich core before scanning
        pattern = mr.get_pattern(pattern_name).trim_seq_ic(0.08)
        match, importance = pattern.scan_importance(contrib, hyp_contrib, tasks, 
                                                    n_jobs=20, verbose=True)
        seq_match = pattern.scan_seq(seq, n_jobs=20, verbose=True)
        # call instances at a 1% FDR and tag them with the motif/TF label
        dfm = pattern.get_instances(tasks, match, importance, seq_match, fdr=0.01, verbose=True)
        dfm['tf'] = tf
        dfl.append(dfm)
        match_l.append(match)
        importance_l.append(importance)
        seq_match_l.append(seq_match)
    dfp = pd.concat(dfl)
    # cache
    cached = True
    dfp.to_hdf(cache_file, "/scores")
# stable row id used later when merging the long-format tables back
dfp['idx'] = np.arange(len(dfp))
# keep only instances with a positive sequence-match score
dfp = dfp[dfp.score_seq_match > 0]
# get also the match scores
# get also the match scores
In [28]:
match = np.stack(match_l, axis=-1)
importance = np.stack(importance_l, axis=-1)
seq_match = np.stack(seq_match_l, axis=-1)
In [30]:
dfp.head()
Out[30]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_max match_max_task ... match/Oct4 match/Sox2 match/Nanog match/Klf4 imp/Oct4 imp/Sox2 imp/Nanog imp/Klf4 tf idx
0 metacluster_0/pattern_0 0 434 450 - 16 442 0.252062 0.275759 Oct4 ... 0.275759 0.237059 0.242851 0.233977 1.241835 1.133492 1.741973 0.686105 Oct4-Sox2 0
1 metacluster_0/pattern_0 0 436 452 + 16 444 0.432034 0.489517 Oct4 ... 0.489517 0.460554 0.337755 0.361674 1.309108 1.175500 1.664400 0.695356 Oct4-Sox2 1
3 metacluster_0/pattern_0 0 457 473 + 16 465 0.388216 0.436506 Oct4 ... 0.436506 0.397625 0.358199 0.299836 1.689514 1.843320 2.113065 0.911170 Oct4-Sox2 3
4 metacluster_0/pattern_0 0 475 491 - 16 483 0.173301 0.201869 Nanog ... 0.195371 0.156338 0.201869 0.121640 1.672300 1.999877 3.030394 1.244052 Oct4-Sox2 4
5 metacluster_0/pattern_0 0 477 493 + 16 485 0.348796 0.370780 Oct4 ... 0.370780 0.341117 0.351014 0.310645 1.765607 2.086354 3.086851 1.298601 Oct4-Sox2 5

5 rows × 24 columns

In [32]:
# convert a wide format to a long format
def melt_prefix(dfp, var='imp', tasks=None):
    """Convert prefixed wide-format columns (e.g. 'imp/<task>') to long format.

    Args:
        dfp: dataframe containing an 'idx' column and '<var>/<task>' columns.
        var: column-name prefix to melt (e.g. 'imp' or 'match').
        tasks: list of task names. If None, tasks are inferred from the
            '<var>/' columns of `dfp` (in column order) instead of relying on
            a module-level global.

    Returns:
        Long-format dataframe with columns ['idx', <var>, 'task'].
    """
    prefix = var + "/"
    if tasks is None:
        tasks = [c[len(prefix):] for c in dfp.columns if c.startswith(prefix)]
    dfp_long = dfp[['idx'] + [prefix + t for t in tasks]].melt(id_vars='idx', value_name=var)
    # Slice off the leading prefix instead of str.replace: replace() would also
    # match later occurrences and treats the pattern as a regex in older pandas.
    dfp_long['task'] = dfp_long.variable.str[len(prefix):]
    del dfp_long['variable']
    return dfp_long


dfpm = melt_prefix(dfp, 'imp').merge(melt_prefix(dfp, 'match'), on=['idx', 'task'])
dfpl = dfp[[c for c in dfp.columns 
            if not c.startswith("imp/") 
            and not c.startswith("match/")]].merge(dfpm, on='idx')
dfpl['log_imp'] = np.log(1+dfpl.imp)
In [32]:
dfp['log_imp_weighted'] = np.log(1+dfp['imp_weighted'])
dfp['log_match_weighted'] = np.log(1+dfp['match_weighted'])

# partition the scores into 4 categories (according to the median)
dfp_medians = pd.DataFrame({'log_match_weighted': dfp.groupby("tf").log_match_weighted.median(),
                            'log_imp_weighted': dfp.groupby("tf").log_imp_weighted.median()}).reset_index()

log_imp_weighted_median = dfp.tf.map(dfp.groupby("tf").log_imp_weighted.median())
log_match_weighted_median = dfp.tf.map(dfp.groupby("tf").log_match_weighted.median())
dfp['imp_cat'] = (dfp.log_imp_weighted  > log_imp_weighted_median).map({False: 'low', True: 'high'})
dfp['match_cat'] = (dfp.log_match_weighted  > log_match_weighted_median).map({False: 'low', True: 'high'})

TODO

  • add percentiles

TODO

  • sample a few points from the scatterplot: importance vs match to see what you are getting
  • try the same analysis for a few factors
    • try to find the right cutoff based on the number of motifs you expect
  • eyeball a few regions (highlight the found seqlets in there)
    • before doing the search, merge all the metaclusters (ignore the importance score magnitude)
In [33]:
dfpl.shape
Out[33]:
(2421396, 20)
In [34]:
dfpl_subset = dfpl.sample(10000)  # focus on the subset of the data
dfp_subset = dfp.sample(10000)  # focus on the subset of the data
In [35]:
dfpl_subset.head()
Out[35]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_max match_max_task imp_weighted imp_max imp_max_task score_seq_match tf idx imp task match log_imp
1823109 metacluster_0/pattern_3 16987 441 450 + 9 445 0.265346 0.401434 Sox2 0.207118 0.262213 Klf4 3.484303 Nanog 693032 0.043545 Sox2 0.401434 0.042623
1988177 metacluster_2/pattern_0 3477 850 859 - 9 854 0.511327 0.570913 Klf4 0.050606 0.053549 Klf4 4.292175 Klf4 745750 0.026993 Sox2 -0.425327 0.026635
734692 metacluster_0/pattern_2 644 850 859 - 9 854 0.293186 0.524033 Sox2 0.030897 0.049519 Sox2 3.549739 Sox2 320944 0.014209 Oct4 -0.079685 0.014109
692936 metacluster_0/pattern_1 17675 835 851 - 16 843 0.242067 0.353556 Nanog 0.046924 0.054159 Klf4 0.617845 Errb 303673 0.027362 Oct4 0.094728 0.026994
404589 metacluster_0/pattern_1 315 258 274 + 16 266 0.166622 0.233833 Klf4 0.044707 0.068878 Nanog 6.597100 Errb 167300 0.014944 Sox2 0.152988 0.014833

Match vs importance

Match score specifically for each task

In [36]:
import warnings
warnings.filterwarnings("ignore")
In [37]:
plotnine.options.figure_size = (12, 7)
ggplot(aes(x="match", y="log_imp"), dfpl_subset) + geom_point(alpha=0.1) + \
  facet_grid("tf~task", labeller='label_both') + theme_bw()
Out[37]:
<ggplot: (8727825595505)>

Match score with highest importance across tasks

In [38]:
plotnine.options.figure_size = (12, 7)
ggplot(aes(x="match_weighted", y="log_imp"), dfpl_subset) + geom_point(alpha=0.1) + \
  facet_grid("tf~task", labeller='label_both') + theme_bw()
Out[38]:
<ggplot: (-9223363309028540251)>

Match vs imp

In [39]:
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="log_match_weighted", y="log_imp_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
  facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[39]:
<ggplot: (-9223363309027851120)>

Seq vs match

In [40]:
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="score_seq_match", y="log_match_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
  facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[40]:
<ggplot: (8727826509958)>

Seq vs importance

In [41]:
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="score_seq_match", y="log_imp_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
  facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[41]:
<ggplot: (-9223363309028498365)>

Seq vs match

In [42]:
ggplot(aes(x='log_match_weighted'), dfp_subset) + geom_histogram() + facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[42]:
<ggplot: (8727825980231)>
In [43]:
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="log_match_weighted", y="log_imp_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
  geom_vline(aes(xintercept='log_match_weighted'), data=dfp_medians, alpha=1, color='orange', linetype='dashed') + \
  geom_hline(aes(yintercept='log_imp_weighted'), data=dfp_medians, alpha=1, color='orange', linetype='dashed') + \
  facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[43]:
<ggplot: (8727823780921)>
In [44]:
np.log(1+dfp.imp_max).plot.hist(200);
#plt.xlim([0, 20])
plt.xlabel("Maximum importance plot");
In [52]:
plotnine.options.figure_size = (7, 7)
ggplot(aes(x='tf', color='task', y='match'), dfpl) + geom_boxplot() + theme_bw()
Out[52]:
<ggplot: (-9223363306565456391)>
In [53]:
plotnine.options.figure_size = (7, 7)
ggplot(aes(x='tf', color='task', y='log_imp'), dfpl) + geom_boxplot() + theme_bw()
Out[53]:
<ggplot: (8730528235570)>
In [54]:
dfpl.groupby(['tf', 'task']).imp.mean().unstack().plot.bar()
plt.ylabel("Average importance");
In [47]:
dfpl.groupby(['tf', 'task']).match.mean().unstack().plot.bar()
plt.title("Average match per task");
plt.ylabel("Average Match");
In [48]:
dfp.groupby("tf").imp_max_task.value_counts().unstack().plot.bar();
plt.ylabel("Frequency")
plt.title("Number of times a particular task has max importance ");
In [49]:
dfp.groupby("tf").match_max_task.value_counts().unstack().plot.bar();
plt.ylabel("Frequency")
plt.title("Number of times a particular task has max match ");

Select the right values for visualization

In [216]:
fig, axes = plt.subplots(1, len(tasks), figsize=(15,3))
for task, ax in zip(tasks, axes):
    p = profile[f't/{task}'].max(axis=(1,2))
    p = p[p<20]
    ax.hist(p, 20);
    #ax.set_xlim([0,20]);
    ax.set_title(task)
    ax.set_xlabel("Max counts in the region")
In [98]:
fig, axes = plt.subplots(1, len(tasks), figsize=(15,3))
for task, ax in zip(tasks, axes):
    p = np.abs(contrib[f'{task}']).max(axis=(1,2))
    #p = p[p<20]
    ax.hist(p, 20);
    #ax.set_xlim([0,20]);
    ax.set_title(task)
    ax.set_xlabel("Max importance in the region")

Plot some examples

In [50]:
pattern_names = [
    ("Oct4-Sox2", "metacluster_0/pattern_0"),
    ("Errb", "metacluster_0/pattern_1"),
    ("Sox2", "metacluster_0/pattern_2"),
    ("Nanog", "metacluster_0/pattern_3"),
    ("Klf4", "metacluster_2/pattern_0"),
]
In [51]:
pattern.plot("contrib");
In [45]:
pattern = mr.get_pattern(pattern.name).trim_seq_ic(0.08)
In [46]:
dfpl_subset[(dfpl_subset.task == 'Klf4') & (dfpl_subset.tf == 'Oct4-Sox2')& (dfpl_subset.imp > 40)]
Out[46]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_max match_max_task imp_weighted imp_max imp_max_task score_seq_match tf idx imp task match log_imp
In [47]:
viz_dict = filter_tracks({**get_dataset_item(profile, example_idx), 
                          **get_dataset_item(contrib, example_idx)}, xlim=[400, 600])
In [48]:
example_idx = 1288
In [49]:
dfp_subset_idx = dfp[(dfp.example_idx == example_idx) & (dfp.pattern_start > 400) & (dfp.pattern_end < 600)]
dfp_subset_idx
Out[49]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_max match_max_task ... imp/Oct4 imp/Sox2 imp/Nanog imp/Klf4 tf idx log_imp_weighted log_match_weighted imp_cat match_cat
12835 metacluster_0/pattern_0 1288 481 497 - 16 489 0.175347 0.198611 Oct4 ... 0.200184 0.278294 0.480298 0.181074 Oct4-Sox2 12835 0.238454 0.161563 high low
12836 metacluster_0/pattern_0 1288 523 539 - 16 531 0.473083 0.495400 Sox2 ... 2.173852 2.219619 1.448849 1.102148 Oct4-Sox2 12836 1.054440 0.387358 high high
9957 metacluster_0/pattern_1 1288 566 582 + 16 574 0.201671 0.219057 Sox2 ... 0.150967 0.260928 0.706322 0.139661 Errb 174889 0.291117 0.183713 high low
9958 metacluster_0/pattern_1 1288 572 588 + 16 580 0.264234 0.316895 Klf4 ... 0.116667 0.257177 0.663679 0.136698 Errb 174890 0.275893 0.234467 high high
11690 metacluster_0/pattern_2 1288 481 490 + 9 485 0.300741 0.400579 Oct4 ... 0.138503 0.190852 0.306008 0.119629 Sox2 326847 0.196988 0.262934 high low
11692 metacluster_0/pattern_2 1288 512 521 - 9 516 0.260668 0.381429 Sox2 ... 0.224723 0.250794 0.262640 0.119058 Sox2 326849 0.206402 0.231642 high low
11693 metacluster_0/pattern_2 1288 523 532 + 9 527 0.433320 0.461441 Nanog ... 1.206914 1.443564 0.904968 0.646655 Sox2 326850 0.726485 0.359994 high high
15527 metacluster_0/pattern_3 1288 425 434 - 9 429 0.339477 0.478657 Nanog ... 0.024767 0.019119 0.130528 0.060662 Nanog 497170 0.095054 0.292279 high high
15528 metacluster_0/pattern_3 1288 449 458 - 9 453 0.337640 0.429814 Nanog ... 0.028087 0.034292 0.268384 0.115667 Nanog 497171 0.183660 0.290907 high high
15529 metacluster_0/pattern_3 1288 521 530 - 9 525 0.304798 0.319052 Nanog ... 0.856278 1.225770 0.951009 0.503223 Nanog 497172 0.656838 0.266048 high high
15531 metacluster_0/pattern_3 1288 570 579 - 9 574 0.311657 0.360529 Nanog ... 0.072717 0.164210 0.497861 0.076846 Nanog 497174 0.316995 0.271292 high high
8946 metacluster_2/pattern_0 1288 461 470 - 9 465 0.483994 0.544808 Klf4 ... 0.066188 0.082523 0.203821 0.470699 Klf4 729298 0.366259 0.394737 high high

12 rows × 28 columns

In [50]:
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
In [51]:
# TODO - is this thing normalized by importance?
In [99]:
def truncate(profile, value):
    """Clip each track in `profile` to [-value, value], preserving sign."""
    clipped = {}
    for name, track in profile.items():
        clipped[name] = np.sign(track) * np.minimum(np.abs(track), value)
    return clipped
In [101]:
plot_tracks(filter_tracks({**get_dataset_item(truncate(profile, 20), example_idx), 
                           **get_dataset_item(truncate(contrib, 0.5), example_idx)}, xlim=[400, 600]), 
            ylim=[(0, 20)]*4 + [(-0.5, 0.5)]*4,
            seqlets=seqlets, rotate_y=0, legend=True);

Number of motifs per example

In [53]:
dfp[(np.log(1+dfp.imp_max) > 2) & (dfp.pattern_start > 400) & (dfp.pattern_end < 600)].groupby('example_idx').size().plot.hist()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-53-1b7d262f2faf> in <module>()
----> 1 dfp[(np.log(1+dfp.imp_max) > 2) & (dfp.pattern_start > 400) & (dfp.pattern_end < 600)].groupby('example_idx').size().plot.hist()

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/plotting/_core.py in hist(self, bins, **kwds)
   2831         axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
   2832         """
-> 2833         return self(kind='hist', bins=bins, **kwds)
   2834 
   2835     @Appender(_kde_docstring % {

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/plotting/_core.py in __call__(self, kind, ax, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, label, secondary_y, **kwds)
   2739                            colormap=colormap, table=table, yerr=yerr,
   2740                            xerr=xerr, label=label, secondary_y=secondary_y,
-> 2741                            **kwds)
   2742     __call__.__doc__ = plot_series.__doc__
   2743 

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/plotting/_core.py in plot_series(data, kind, ax, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, label, secondary_y, **kwds)
   2000                  yerr=yerr, xerr=xerr,
   2001                  label=label, secondary_y=secondary_y,
-> 2002                  **kwds)
   2003 
   2004 

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/plotting/_core.py in _plot(data, x, y, subplots, ax, kind, **kwds)
   1802         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   1803 
-> 1804     plot_obj.generate()
   1805     plot_obj.draw()
   1806     return plot_obj.result

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/plotting/_core.py in generate(self)
    256     def generate(self):
    257         self._args_adjust()
--> 258         self._compute_plot_data()
    259         self._setup_subplots()
    260         self._make_plot()

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/plotting/_core.py in _compute_plot_data(self)
    371         if is_empty:
    372             raise TypeError('Empty {0!r}: no numeric data to '
--> 373                             'plot'.format(numeric_data.__class__.__name__))
    374 
    375         self.data = numeric_data

TypeError: Empty 'DataFrame': no numeric data to plot

Examples

In [54]:
def prefix_dict(d, prefix):
    """Return a copy of `d` with `prefix` prepended to every key.

    The original comprehension reused `d` as its loop variable, shadowing the
    dict argument (it only worked because `d.items()` is evaluated before the
    loop variable rebinds). Renamed for clarity; behavior unchanged.
    """
    return {prefix + key: value for key, value in d.items()}
In [55]:
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
576
In [108]:
example_idx = top_max_count(profile['t/Oct4'][:, 400:600], 80)[0]
In [109]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     #(dfp.tf == "Errb") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(truncate(profile, 20), example_idx), 
                           **get_dataset_item(truncate(contrib, 0.5), example_idx)}, xlim=[400, 600]), 
            ylim=[(0, 20)]*4 + [(-0.5, 0.5)]*4,
            seqlets=seqlets, rotate_y=0, legend=True);

Others

In [42]:
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
10833
In [43]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     #(dfp.tf == "Errb") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(profile, example_idx), 
                           **get_dataset_item(contrib, example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);
In [129]:
counts = profile['t/Oct4'][:, 400:600].sum(axis=(1,2))

Regions with high counts

In [173]:
from basepair.samplers import top_max_count
In [174]:
top_counts = top_max_count(profile['t/Oct4'][:, 400:600])
In [175]:
example_idx = top_counts[0]
In [160]:
example_idx
Out[160]:
0
In [230]:
# region needs to have at least > 5 max counts
valid_idx = np.arange(len(profile['t/Oct4']))[profile['t/Sox2'][:, 400:600].max((1,2)) > 5]
In [146]:
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
13898
In [150]:
# stringent (~50%)
thresholds = {"Klf4": 0.55,
              "Nanog": 0.45,
              "Sox2": 0.55,
              "Errb": 0.4,
              "Oct4-Sox2": 0.4
               }
In [243]:
# loose (~25%)
thresholds = {"Klf4": 0.45,
              "Nanog": 0.35,
              "Sox2": 0.4,
              "Errb": 0.3,
              "Oct4-Sox2": 0.3
               }
In [151]:
dfp[dfp.match_weighted > dfp.tf.map(thresholds)]
Out[151]:
array(['Oct4-Sox2', 'Errb', 'Sox2', 'Nanog', 'Klf4'], dtype=object)
In [213]:
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
2851
In [261]:
example_idx = pd.Series(valid_idx).sample(1).iloc[0]
In [262]:
count_lim = 20
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.match_weighted > dfp.tf.map(thresholds)) &
                     #(dfp.imp_cat == 'high') & 
                     #(dfp.match_cat == 'high') & 
                     #(dfp.tf == "Errb") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(truncate(profile, count_lim), example_idx), 
                           **get_dataset_item(truncate(contrib, 0.5), example_idx)}, xlim=[400, 600]), 
            ylim=[(0, count_lim)]*4 + [(-0.5, 0.5)]*4,
            seqlets=seqlets, rotate_y=0, legend=True);
In [285]:
for tf, pattern_name in pattern_names:
    p = mr.get_pattern(pattern_name).trim_seq_ic(0.08)
    p.name = "+" + tf
    p.plot(['seq'] + [f'contrib/{t}' for t in tasks], rotate_y=0)
    p.name = "-" + tf
    p.rc().plot(['seq'] + [f'contrib/{t}' for t in tasks], rotate_y=0)

Regions with high counts

In [48]:
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
18679
In [49]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     #(dfp.tf == "Errb") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(profile, example_idx), 
                           **get_dataset_item(contrib, example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);
In [50]:
dfp_subset_idx[['tf', 'pattern_center', 'strand', 'match_weighted', 'imp_weighted', 'score_seq_match']].sort_values('pattern_center')
Out[50]:
tf pattern_center strand match_weighted imp_weighted score_seq_match
232850 Nanog 419 - 0.291701 0.386205 3.786917
232851 Nanog 472 + 0.370044 0.432529 2.352270
162766 Sox2 481 + 0.538665 0.498735 6.725247
153105 Klf4 533 + 0.548452 0.794147 8.627083
232854 Nanog 544 - 0.339172 0.132204 3.140144

Example of strong reliance on hypothetical scores

In [254]:
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
1335
In [272]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     #(dfp.tf == "Errb") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);
In [275]:
dfp_subset_idx[['tf', 'pattern_center', 'strand', 'match_weighted', 'imp_weighted', 'score_seq_match']].sort_values('pattern_center')
Out[275]:
tf pattern_center strand match_weighted imp_weighted score_seq_match
13427 Sox2 410 + 7.563056 11.159393 4.537363
16054 Oct4-Sox2 414 - 9.051900 12.778213 7.640690
13428 Sox2 439 - 6.265966 6.403757 5.395059
15976 Nanog 467 - 4.711840 6.146054 0.432334
15977 Nanog 468 + 4.784056 6.087443 2.183998
16056 Oct4-Sox2 473 - 5.308060 7.876490 1.675158
16057 Oct4-Sox2 475 + 5.714996 8.055885 1.497434
15978 Nanog 479 - 5.457193 10.562017 4.525884
15983 Nanog 572 - 6.116345 4.657692 6.155791
In [268]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     (dfp.tf == "Sox2") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);

Conclusion

  • hypothetical contribution scores can be confusing to inspect
In [269]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     (dfp.tf == "Oct4-Sox2") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);
In [270]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     (dfp.tf == "Nanog") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
In [279]:
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(profile, example_idx)}, xlim=[400, 600]), 
            rotate_y=0, legend=True);
In [270]:
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);