Goal

  • analyze instance locations. Focus on Oct4 peaks

TODO

  • sample a few points from the scatterplot: importance vs match to see what you are getting
  • try the same analysis for a few factors
    • try to find the right cutoff based on the number of motifs you expect
  • eyeball a few regions (highlight the found seqlets in there)
    • before doing the search, merge all the metaclusters (ignore the importance score magnitude)

Conclusions

  • ignore hypothetical scores when scoring seqlet instances
In [2]:
from basepair.imports import *
from basepair.samplers import top_sum_count, top_max_count
from basepair.plot.tracks import plot_tracks, filter_tracks
from kipoi.data_utils import get_dataset_item

from plotnine import *
import plotnine
In [3]:
model_dir = Path(f"{ddir}/processed//chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
In [4]:
modisco_dir = model_dir / "modisco/by_peak_tasks/weighted/Oct4"
In [5]:
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
In [6]:
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
In [7]:
mr.plot_pssm("metacluster_0", "pattern_0", trim_frac=0.08);
mr.plot_pssm("metacluster_0", "pattern_2", trim_frac=0.08);
In [8]:
pattern_name = 'metacluster_0/pattern_0'
In [9]:
pattern = mr.get_pattern(pattern_name)
In [10]:
pattern.plot(rotate_y=0);
In [11]:
def load_scores(h5_file, importance='weighted'):
    """Load sequences, contribution scores and profiles from a gradients HDF5 file.

    Args:
        h5_file: path to the HDF5 file produced by the gradient computation
            (contains `inputs`, `grads/<task>/<importance>` and
            `targets/profile/<task>` entries).
        importance: which importance-score group to read (e.g. 'weighted').

    Returns:
        Tuple `(seq, contrib, hyp_contrib, profile)` where
        - seq: one-hot encoded input sequences,
        - hyp_contrib: {task -> averaged hypothetical contribution scores},
        - contrib: {task -> hyp_contrib * seq} (actual contributions),
        - profile: {'t/<task>' -> measured profile counts}.
    """
    im = HDF5Reader(h5_file)
    im.open()
    d = im.load_all()
    seq = d['inputs']
    tasks = list(d['grads'])
    # NOTE(review): `mean` comes from `basepair.imports`; presumably it averages
    # the stored gradient replicates for the chosen importance group -- confirm.
    hyp_contrib = {task: mean(d['grads'][task][importance])
                   for task in tasks}
    # actual contribution = hypothetical scores masked by the observed sequence
    contrib = {task: hyp_contrib[task] * seq
               for task in tasks}
    profile = {f"t/{task}": d['targets']['profile'][task] for task in tasks}
    im.close()
    return seq, contrib, hyp_contrib, profile


# subset
def get_scores(im, idx, tasks):
    """Load scores for a subset of examples straight from an open HDF5 reader.

    For each task, the two stored 'weighted' gradient replicates are averaged
    to obtain the hypothetical contributions; multiplying by the one-hot
    sequence yields the actual contributions.

    Returns:
        Tuple `(seq, contrib, hyp_contrib, profile)` for the rows in `idx`.
    """
    hyp_contrib = {}
    for task in tasks:
        replicate_sum = (im.f[f'/grads/{task}/weighted/0'][idx] +
                         im.f[f'/grads/{task}/weighted/1'][idx])
        hyp_contrib[task] = replicate_sum / 2
    seq = im.f['/inputs/'][idx]
    contrib = {task: hyp_contrib[task] * seq for task in tasks}
    profile = {task: im.f[f'/targets/profile/{task}'][idx] for task in tasks}
    return seq, contrib, hyp_contrib, profile
In [12]:
seq, contrib, hyp_contrib, profile = load_scores(model_dir / "grad.valid.h5")
In [103]:
example_idx = 11128
In [104]:
idx = top_max_count(profile['t/Oct4'][:, 400:600], 80)
In [105]:
viz_dict = filter_tracks({**get_dataset_item(profile, example_idx), 
                          **get_dataset_item(contrib, example_idx)}, xlim=[400, 600])
In [106]:
plot_tracks(viz_dict);

Focus on one specific example

In [19]:
t = dict(contrib=contrib['Oct4'][example_idx, 400:600],
         hyp_contrib=hyp_contrib['Oct4'][example_idx, 400:600],
         seq=seq[example_idx, 400:600])
In [20]:
# target
plot_tracks(dict(contrib=t['contrib'], hyp_contrib=t['hyp_contrib']));
In [21]:
from basepair.modisco.results import *
In [22]:
pattern.trim_seq_ic(0.08).plot(['seq', 'contrib/Oct4', 'hyp_contrib/Oct4'], 
                               rotate_y=0, height=1, letter_width=0.2);

Get the instances

In [23]:
pattern_names = [
    ("Oct4-Sox2", "metacluster_0/pattern_0"),
    ("Errb", "metacluster_0/pattern_1"),
    ("Sox2", "metacluster_0/pattern_2"),
    ("Nanog", "metacluster_0/pattern_3"),
    ("Klf4", "metacluster_2/pattern_0"),
]
In [24]:
cached = True
In [25]:
cache_dir = Path(Path(ddir) / "cache/dev/")
cache_dir.mkdir(parents=True, exist_ok=True)
cache_file = cache_dir / "hit-scoring-contrib-only.hdf5"
In [26]:
dfp = pd.read_hdf(cache_file, "/scores")
In [26]:
!du -sh {cache_file}
137M	/users/avsec/workspace/basepair/data/cache/dev/hit-scoring-contrib-only.hdf5
In [27]:
#dfp.to_hdf(cache_file, "/scores")
In [28]:
# Scan all patterns against the importance scores and sequences, collect the
# per-pattern instance tables, and cache the concatenated result to HDF5.
# When `cached` is True the scan is skipped and `dfp` is expected to have been
# loaded from `cache_file` already.
if not cached:
    dfl = []
    # NOTE(review): match_l/importance_l/seq_match_l only exist after this
    # branch runs -- the `np.stack` cell below fails when `cached` was True.
    match_l, importance_l, seq_match_l = [], [], []
    for tf, pattern_name in pattern_names:
        # trim the pattern to its information-rich core before scanning
        pattern = mr.get_pattern(pattern_name).trim_seq_ic(0.08)
        match, importance = pattern.scan_importance(contrib, hyp_contrib, tasks, 
                                                    n_jobs=20, verbose=True)
        seq_match = pattern.scan_seq(seq, n_jobs=20, verbose=True)
        # call instances at a 1% FDR and tag them with the motif/TF label
        dfm = pattern.get_instances(tasks, match, importance, seq_match, fdr=0.01, verbose=True)
        dfm['tf'] = tf
        dfl.append(dfm)
        match_l.append(match)
        importance_l.append(importance)
        seq_match_l.append(seq_match)
    dfp = pd.concat(dfl)
    # cache
    cached = True
    dfp.to_hdf(cache_file, "/scores")
# stable row id used later when merging the long-format tables back
dfp['idx'] = np.arange(len(dfp))
# keep only instances with a positive sequence-match score
dfp = dfp[dfp.score_seq_match > 0]
# get also the match scores
# get also the match scores
In [28]:
match = np.stack(match_l, axis=-1)
importance = np.stack(importance_l, axis=-1)
seq_match = np.stack(seq_match_l, axis=-1)
In [30]:
dfp.head()
Out[30]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_max match_max_task ... match/Oct4 match/Sox2 match/Nanog match/Klf4 imp/Oct4 imp/Sox2 imp/Nanog imp/Klf4 tf idx
0 metacluster_0/pattern_0 0 434 450 - 16 442 0.252062 0.275759 Oct4 ... 0.275759 0.237059 0.242851 0.233977 1.241835 1.133492 1.741973 0.686105 Oct4-Sox2 0
1 metacluster_0/pattern_0 0 436 452 + 16 444 0.432034 0.489517 Oct4 ... 0.489517 0.460554 0.337755 0.361674 1.309108 1.175500 1.664400 0.695356 Oct4-Sox2 1
3 metacluster_0/pattern_0 0 457 473 + 16 465 0.388216 0.436506 Oct4 ... 0.436506 0.397625 0.358199 0.299836 1.689514 1.843320 2.113065 0.911170 Oct4-Sox2 3
4 metacluster_0/pattern_0 0 475 491 - 16 483 0.173301 0.201869 Nanog ... 0.195371 0.156338 0.201869 0.121640 1.672300 1.999877 3.030394 1.244052 Oct4-Sox2 4
5 metacluster_0/pattern_0 0 477 493 + 16 485 0.348796 0.370780 Oct4 ... 0.370780 0.341117 0.351014 0.310645 1.765607 2.086354 3.086851 1.298601 Oct4-Sox2 5

5 rows × 24 columns

In [32]:
# convert a wide format to a long format
def melt_prefix(dfp, var='imp', tasks=None):
    """Convert prefixed wide-format columns (e.g. 'imp/<task>') to long format.

    Args:
        dfp: dataframe containing an 'idx' column and '<var>/<task>' columns.
        var: column-name prefix to melt (e.g. 'imp' or 'match').
        tasks: list of task names. If None, tasks are inferred from the
            '<var>/' columns of `dfp` (in column order) instead of relying on
            a module-level global.

    Returns:
        Long-format dataframe with columns ['idx', <var>, 'task'].
    """
    prefix = var + "/"
    if tasks is None:
        tasks = [c[len(prefix):] for c in dfp.columns if c.startswith(prefix)]
    dfp_long = dfp[['idx'] + [prefix + t for t in tasks]].melt(id_vars='idx', value_name=var)
    # Slice off the leading prefix instead of str.replace: replace() would also
    # match later occurrences and treats the pattern as a regex in older pandas.
    dfp_long['task'] = dfp_long.variable.str[len(prefix):]
    del dfp_long['variable']
    return dfp_long


dfpm = melt_prefix(dfp, 'imp').merge(melt_prefix(dfp, 'match'), on=['idx', 'task'])
dfpl = dfp[[c for c in dfp.columns 
            if not c.startswith("imp/") 
            and not c.startswith("match/")]].merge(dfpm, on='idx')
dfpl['log_imp'] = np.log(1+dfpl.imp)
In [32]:
dfp['log_imp_weighted'] = np.log(1+dfp['imp_weighted'])
dfp['log_match_weighted'] = np.log(1+dfp['match_weighted'])

# partition the scores into 4 categories (according to the median)
dfp_medians = pd.DataFrame({'log_match_weighted': dfp.groupby("tf").log_match_weighted.median(),
                            'log_imp_weighted': dfp.groupby("tf").log_imp_weighted.median()}).reset_index()

log_imp_weighted_median = dfp.tf.map(dfp.groupby("tf").log_imp_weighted.median())
log_match_weighted_median = dfp.tf.map(dfp.groupby("tf").log_match_weighted.median())
dfp['imp_cat'] = (dfp.log_imp_weighted  > log_imp_weighted_median).map({False: 'low', True: 'high'})
dfp['match_cat'] = (dfp.log_match_weighted  > log_match_weighted_median).map({False: 'low', True: 'high'})

TODO

  • add percentiles

TODO

  • sample a few points from the scatterplot: importance vs match to see what you are getting
  • try the same analysis for a few factors
    • try to find the right cutoff based on the number of motifs you expect
  • eyeball a few regions (highlight the found seqlets in there)
    • before doing the search, merge all the metaclusters (ignore the importance score magnitude)
In [33]:
dfpl.shape
Out[33]:
(2421396, 20)
In [34]:
dfpl_subset = dfpl.sample(10000)  # focus on the subset of the data
dfp_subset = dfp.sample(10000)  # focus on the subset of the data
In [35]:
dfpl_subset.head()
Out[35]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_max match_max_task imp_weighted imp_max imp_max_task score_seq_match tf idx imp task match log_imp
1823109 metacluster_0/pattern_3 16987 441 450 + 9 445 0.265346 0.401434 Sox2 0.207118 0.262213 Klf4 3.484303 Nanog 693032 0.043545 Sox2 0.401434 0.042623
1988177 metacluster_2/pattern_0 3477 850 859 - 9 854 0.511327 0.570913 Klf4 0.050606 0.053549 Klf4 4.292175 Klf4 745750 0.026993 Sox2 -0.425327 0.026635
734692 metacluster_0/pattern_2 644 850 859 - 9 854 0.293186 0.524033 Sox2 0.030897 0.049519 Sox2 3.549739 Sox2 320944 0.014209 Oct4 -0.079685 0.014109
692936 metacluster_0/pattern_1 17675 835 851 - 16 843 0.242067 0.353556 Nanog 0.046924 0.054159 Klf4 0.617845 Errb 303673 0.027362 Oct4 0.094728 0.026994
404589 metacluster_0/pattern_1 315 258 274 + 16 266 0.166622 0.233833 Klf4 0.044707 0.068878 Nanog 6.597100 Errb 167300 0.014944 Sox2 0.152988 0.014833

Match vs importance

Match score specifically for each task

In [36]:
import warnings
warnings.filterwarnings("ignore")
In [37]:
plotnine.options.figure_size = (12, 7)
ggplot(aes(x="match", y="log_imp"), dfpl_subset) + geom_point(alpha=0.1) + \
  facet_grid("tf~task", labeller='label_both') + theme_bw()
Out[37]:
<ggplot: (8727825595505)>

Match score with highest importance across tasks

In [38]:
plotnine.options.figure_size = (12, 7)
ggplot(aes(x="match_weighted", y="log_imp"), dfpl_subset) + geom_point(alpha=0.1) + \
  facet_grid("tf~task", labeller='label_both') + theme_bw()
Out[38]:
<ggplot: (-9223363309028540251)>

Match vs imp

In [39]:
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="log_match_weighted", y="log_imp_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
  facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[39]:
<ggplot: (-9223363309027851120)>

Seq vs match

In [40]:
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="score_seq_match", y="log_match_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
  facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[40]:
<ggplot: (8727826509958)>

Seq vs importance

In [41]:
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="score_seq_match", y="log_imp_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
  facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[41]:
<ggplot: (-9223363309028498365)>

Seq vs match

In [42]:
ggplot(aes(x='log_match_weighted'), dfp_subset) + geom_histogram() + facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[42]:
<ggplot: (8727825980231)>
In [43]:
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="log_match_weighted", y="log_imp_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
  geom_vline(aes(xintercept='log_match_weighted'), data=dfp_medians, alpha=1, color='orange', linetype='dashed') + \
  geom_hline(aes(yintercept='log_imp_weighted'), data=dfp_medians, alpha=1, color='orange', linetype='dashed') + \
  facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[43]:
<ggplot: (8727823780921)>
In [44]:
np.log(1+dfp.imp_max).plot.hist(200);
#plt.xlim([0, 20])
plt.xlabel("Maximum importance plot");
In [52]:
plotnine.options.figure_size = (7, 7)
ggplot(aes(x='tf', color='task', y='match'), dfpl) + geom_boxplot() + theme_bw()
Out[52]:
<ggplot: (-9223363306565456391)>
In [53]:
plotnine.options.figure_size = (7, 7)
ggplot(aes(x='tf', color='task', y='log_imp'), dfpl) + geom_boxplot() + theme_bw()
Out[53]:
<ggplot: (8730528235570)>
In [54]:
dfpl.groupby(['tf', 'task']).imp.mean().unstack().plot.bar()
plt.ylabel("Average importance");
In [47]:
dfpl.groupby(['tf', 'task']).match.mean().unstack().plot.bar()
plt.title("Average match per task");
plt.ylabel("Average Match");
In [48]:
dfp.groupby("tf").imp_max_task.value_counts().unstack().plot.bar();
plt.ylabel("Frequency")
plt.title("Number of times a particular task has max importance ");
In [49]:
dfp.groupby("tf").match_max_task.value_counts().unstack().plot.bar();
plt.ylabel("Frequency")
plt.title("Number of times a particular task has max match ");

Select the right values for visualization

In [216]:
fig, axes = plt.subplots(1, len(tasks), figsize=(15,3))
for task, ax in zip(tasks, axes):
    p = profile[f't/{task}'].max(axis=(1,2))
    p = p[p<20]
    ax.hist(p, 20);
    #ax.set_xlim([0,20]);
    ax.set_title(task)
    ax.set_xlabel("Max counts in the region")
In [98]:
fig, axes = plt.subplots(1, len(tasks), figsize=(15,3))
for task, ax in zip(tasks, axes):
    p = np.abs(contrib[f'{task}']).max(axis=(1,2))
    #p = p[p<20]
    ax.hist(p, 20);
    #ax.set_xlim([0,20]);
    ax.set_title(task)
    ax.set_xlabel("Max importance in the region")

Plot some examples

In [50]:
pattern_names = [
    ("Oct4-Sox2", "metacluster_0/pattern_0"),
    ("Errb", "metacluster_0/pattern_1"),
    ("Sox2", "metacluster_0/pattern_2"),
    ("Nanog", "metacluster_0/pattern_3"),
    ("Klf4", "metacluster_2/pattern_0"),
]
In [51]:
pattern.plot("contrib");
In [45]:
pattern = mr.get_pattern(pattern.name).trim_seq_ic(0.08)
In [46]:
dfpl_subset[(dfpl_subset.task == 'Klf4') & (dfpl_subset.tf == 'Oct4-Sox2')& (dfpl_subset.imp > 40)]
Out[46]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_max match_max_task imp_weighted imp_max imp_max_task score_seq_match tf idx imp task match log_imp
In [47]:
viz_dict = filter_tracks({**get_dataset_item(profile, example_idx), 
                          **get_dataset_item(contrib, example_idx)}, xlim=[400, 600])
In [48]:
example_idx = 1288
In [49]:
dfp_subset_idx = dfp[(dfp.example_idx == example_idx) & (dfp.pattern_start > 400) & (dfp.pattern_end < 600)]
dfp_subset_idx
Out[49]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_max match_max_task ... imp/Oct4 imp/Sox2 imp/Nanog imp/Klf4 tf idx log_imp_weighted log_match_weighted imp_cat match_cat
12835 metacluster_0/pattern_0 1288 481 497 - 16 489 0.175347 0.198611 Oct4 ... 0.200184 0.278294 0.480298 0.181074 Oct4-Sox2 12835 0.238454 0.161563 high low
12836 metacluster_0/pattern_0 1288 523 539 - 16 531 0.473083 0.495400 Sox2 ... 2.173852 2.219619 1.448849 1.102148 Oct4-Sox2 12836 1.054440 0.387358 high high
9957 metacluster_0/pattern_1 1288 566 582 + 16 574 0.201671 0.219057 Sox2 ... 0.150967 0.260928 0.706322 0.139661 Errb 174889 0.291117 0.183713 high low
9958 metacluster_0/pattern_1 1288 572 588 + 16 580 0.264234 0.316895 Klf4 ... 0.116667 0.257177 0.663679 0.136698 Errb 174890 0.275893 0.234467 high high
11690 metacluster_0/pattern_2 1288 481 490 + 9 485 0.300741 0.400579 Oct4 ... 0.138503 0.190852 0.306008 0.119629 Sox2 326847 0.196988 0.262934 high low
11692 metacluster_0/pattern_2 1288 512 521 - 9 516 0.260668 0.381429 Sox2 ... 0.224723 0.250794 0.262640 0.119058 Sox2 326849 0.206402 0.231642 high low
11693 metacluster_0/pattern_2 1288 523 532 + 9 527 0.433320 0.461441 Nanog ... 1.206914 1.443564 0.904968 0.646655 Sox2 326850 0.726485 0.359994 high high
15527 metacluster_0/pattern_3 1288 425 434 - 9 429 0.339477 0.478657 Nanog ... 0.024767 0.019119 0.130528 0.060662 Nanog 497170 0.095054 0.292279 high high
15528 metacluster_0/pattern_3 1288 449 458 - 9 453 0.337640 0.429814 Nanog ... 0.028087 0.034292 0.268384 0.115667 Nanog 497171 0.183660 0.290907 high high
15529 metacluster_0/pattern_3 1288 521 530 - 9 525 0.304798 0.319052 Nanog ... 0.856278 1.225770 0.951009 0.503223 Nanog 497172 0.656838 0.266048 high high
15531 metacluster_0/pattern_3 1288 570 579 - 9 574 0.311657 0.360529 Nanog ... 0.072717 0.164210 0.497861 0.076846 Nanog 497174 0.316995 0.271292 high high
8946 metacluster_2/pattern_0 1288 461 470 - 9 465 0.483994 0.544808 Klf4 ... 0.066188 0.082523 0.203821 0.470699 Klf4 729298 0.366259 0.394737 high high

12 rows × 28 columns

In [50]:
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
In [51]:
# TODO - is this thing normalized by importance?
In [99]:
def truncate(profile, value):
    """Clip each track in `profile` to [-value, value], preserving sign."""
    clipped = {}
    for name, track in profile.items():
        clipped[name] = np.sign(track) * np.minimum(np.abs(track), value)
    return clipped
In [101]:
plot_tracks(filter_tracks({**get_dataset_item(truncate(profile, 20), example_idx), 
                           **get_dataset_item(truncate(contrib, 0.5), example_idx)}, xlim=[400, 600]), 
            ylim=[(0, 20)]*4 + [(-0.5, 0.5)]*4,
            seqlets=seqlets, rotate_y=0, legend=True);

Number of motifs per example

In [53]:
dfp[(np.log(1+dfp.imp_max) > 2) & (dfp.pattern_start > 400) & (dfp.pattern_end < 600)].groupby('example_idx').size().plot.hist()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-53-1b7d262f2faf> in <module>()
----> 1 dfp[(np.log(1+dfp.imp_max) > 2) & (dfp.pattern_start > 400) & (dfp.pattern_end < 600)].groupby('example_idx').size().plot.hist()

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/plotting/_core.py in hist(self, bins, **kwds)
   2831         axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
   2832         """
-> 2833         return self(kind='hist', bins=bins, **kwds)
   2834 
   2835     @Appender(_kde_docstring % {

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/plotting/_core.py in __call__(self, kind, ax, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, label, secondary_y, **kwds)
   2739                            colormap=colormap, table=table, yerr=yerr,
   2740                            xerr=xerr, label=label, secondary_y=secondary_y,
-> 2741                            **kwds)
   2742     __call__.__doc__ = plot_series.__doc__
   2743 

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/plotting/_core.py in plot_series(data, kind, ax, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, label, secondary_y, **kwds)
   2000                  yerr=yerr, xerr=xerr,
   2001                  label=label, secondary_y=secondary_y,
-> 2002                  **kwds)
   2003 
   2004 

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/plotting/_core.py in _plot(data, x, y, subplots, ax, kind, **kwds)
   1802         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   1803 
-> 1804     plot_obj.generate()
   1805     plot_obj.draw()
   1806     return plot_obj.result

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/plotting/_core.py in generate(self)
    256     def generate(self):
    257         self._args_adjust()
--> 258         self._compute_plot_data()
    259         self._setup_subplots()
    260         self._make_plot()

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/plotting/_core.py in _compute_plot_data(self)
    371         if is_empty:
    372             raise TypeError('Empty {0!r}: no numeric data to '
--> 373                             'plot'.format(numeric_data.__class__.__name__))
    374 
    375         self.data = numeric_data

TypeError: Empty 'DataFrame': no numeric data to plot

Examples

In [54]:
def prefix_dict(d, prefix):
    """Return a copy of `d` with `prefix` prepended to every key.

    The original comprehension reused `d` as its loop variable, shadowing the
    dict argument (it only worked because `d.items()` is evaluated before the
    loop variable rebinds). Renamed for clarity; behavior unchanged.
    """
    return {prefix + key: value for key, value in d.items()}
In [55]:
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
576
In [108]:
example_idx = top_max_count(profile['t/Oct4'][:, 400:600], 80)[0]
In [109]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     #(dfp.tf == "Errb") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(truncate(profile, 20), example_idx), 
                           **get_dataset_item(truncate(contrib, 0.5), example_idx)}, xlim=[400, 600]), 
            ylim=[(0, 20)]*4 + [(-0.5, 0.5)]*4,
            seqlets=seqlets, rotate_y=0, legend=True);

Others

In [42]:
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
10833
In [43]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     #(dfp.tf == "Errb") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(profile, example_idx), 
                           **get_dataset_item(contrib, example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);
In [129]:
counts = profile['t/Oct4'][:, 400:600].sum(axis=(1,2))

Regions with high counts

In [173]:
from basepair.samplers import top_max_count
In [174]:
top_counts = top_max_count(profile['t/Oct4'][:, 400:600])
In [175]:
example_idx = top_counts[0]
In [160]:
example_idx
Out[160]:
0
In [230]:
# region needs to have at least > 5 max counts
valid_idx = np.arange(len(profile['t/Oct4']))[profile['t/Sox2'][:, 400:600].max((1,2)) > 5]
In [146]:
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
13898
In [150]:
# stringent (~50%)
thresholds = {"Klf4": 0.55,
              "Nanog": 0.45,
              "Sox2": 0.55,
              "Errb": 0.4,
              "Oct4-Sox2": 0.4
               }
In [243]:
# loose (~25%)
thresholds = {"Klf4": 0.45,
              "Nanog": 0.35,
              "Sox2": 0.4,
              "Errb": 0.3,
              "Oct4-Sox2": 0.3
               }
In [151]:
dfp[dfp.match_weighted > dfp.tf.map(thresholds)]
Out[151]:
array(['Oct4-Sox2', 'Errb', 'Sox2', 'Nanog', 'Klf4'], dtype=object)
In [213]:
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
2851
In [261]:
example_idx = pd.Series(valid_idx).sample(1).iloc[0]
In [262]:
count_lim = 20
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.match_weighted > dfp.tf.map(thresholds)) &
                     #(dfp.imp_cat == 'high') & 
                     #(dfp.match_cat == 'high') & 
                     #(dfp.tf == "Errb") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(truncate(profile, count_lim), example_idx), 
                           **get_dataset_item(truncate(contrib, 0.5), example_idx)}, xlim=[400, 600]), 
            ylim=[(0, count_lim)]*4 + [(-0.5, 0.5)]*4,
            seqlets=seqlets, rotate_y=0, legend=True);
In [285]:
for tf, pattern_name in pattern_names:
    p = mr.get_pattern(pattern_name).trim_seq_ic(0.08)
    p.name = "+" + tf
    p.plot(['seq'] + [f'contrib/{t}' for t in tasks], rotate_y=0)
    p.name = "-" + tf
    p.rc().plot(['seq'] + [f'contrib/{t}' for t in tasks], rotate_y=0)

Regions with high counts

In [48]:
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
18679
In [49]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     #(dfp.tf == "Errb") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(profile, example_idx), 
                           **get_dataset_item(contrib, example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);
In [50]:
dfp_subset_idx[['tf', 'pattern_center', 'strand', 'match_weighted', 'imp_weighted', 'score_seq_match']].sort_values('pattern_center')
Out[50]:
tf pattern_center strand match_weighted imp_weighted score_seq_match
232850 Nanog 419 - 0.291701 0.386205 3.786917
232851 Nanog 472 + 0.370044 0.432529 2.352270
162766 Sox2 481 + 0.538665 0.498735 6.725247
153105 Klf4 533 + 0.548452 0.794147 8.627083
232854 Nanog 544 - 0.339172 0.132204 3.140144

Example of strong reliance on hypothetical scores

In [254]:
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
1335
In [272]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     #(dfp.tf == "Errb") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);
In [275]:
dfp_subset_idx[['tf', 'pattern_center', 'strand', 'match_weighted', 'imp_weighted', 'score_seq_match']].sort_values('pattern_center')
Out[275]:
tf pattern_center strand match_weighted imp_weighted score_seq_match
13427 Sox2 410 + 7.563056 11.159393 4.537363
16054 Oct4-Sox2 414 - 9.051900 12.778213 7.640690
13428 Sox2 439 - 6.265966 6.403757 5.395059
15976 Nanog 467 - 4.711840 6.146054 0.432334
15977 Nanog 468 + 4.784056 6.087443 2.183998
16056 Oct4-Sox2 473 - 5.308060 7.876490 1.675158
16057 Oct4-Sox2 475 + 5.714996 8.055885 1.497434
15978 Nanog 479 - 5.457193 10.562017 4.525884
15983 Nanog 572 - 6.116345 4.657692 6.155791
In [268]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     (dfp.tf == "Sox2") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);

Conclusion

  • hypothetical contribution scores can be confusing to inspect
In [269]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     (dfp.tf == "Oct4-Sox2") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);
In [270]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     (dfp.tf == "Nanog") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
In [279]:
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(profile, example_idx)}, xlim=[400, 600]), 
            rotate_y=0, legend=True);
In [270]:
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);