Goal

  • analyze instance locations. Focus on Oct4 peaks

TODO

  • sample a few points from the scatterplot: importance vs match to see what you are getting
  • try the same analysis for a few factors
    • try to find the right cutoff based on the number of motifs you expect
  • eyeball a few regions (highlight the found seqlets in there)
    • before doing the search, merge all the metaclusters (ignore the importance score magnitude)

Conclusions

  • ignore hypothetical scores when scoring seqlet instances
In [212]:
from basepair.imports import *
from basepair.samplers import top_sum_count, top_max_count
from basepair.plot.tracks import plot_tracks, filter_tracks
from kipoi.data_utils import get_dataset_item
In [213]:
model_dir = Path(f"{ddir}/processed//chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
In [214]:
modisco_dir = model_dir / "modisco/by_peak_tasks/weighted/Oct4"
In [215]:
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
In [216]:
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
In [217]:
mr.plot_pssm("metacluster_0", "pattern_0", trim_frac=0.08);
mr.plot_pssm("metacluster_0", "pattern_2", trim_frac=0.08);
In [218]:
pattern_name = 'metacluster_0/pattern_0'
In [219]:
pattern = mr.get_pattern(pattern_name)
In [220]:
pattern.plot(rotate_y=0);
In [221]:
def load_scores(h5_file, importance='weighted'):
    """Load sequences, contribution scores and profiles from a gradient HDF5 file.

    Args:
        h5_file: path to the HDF5 file produced by the gradient computation.
        importance: which gradient type to read (e.g. 'weighted').

    Returns:
        (seq, contrib, hyp_contrib, profile) where `contrib` and `hyp_contrib`
        are dicts keyed by task name and `profile` is keyed by 't/<task>'.
    """
    reader = HDF5Reader(h5_file)
    reader.open()
    data = reader.load_all()
    seq = data['inputs']
    hyp_contrib = {}
    contrib = {}
    profile = {}
    for task in data['grads']:
        # hypothetical scores: mean over the stored entries for this grad type
        hyp_contrib[task] = mean(data['grads'][task][importance])
        # observed contribution = hypothetical scores masked by the one-hot sequence
        contrib[task] = hyp_contrib[task] * seq
        profile[f"t/{task}"] = data['targets']['profile'][task]
    reader.close()
    return seq, contrib, hyp_contrib, profile


# subset
def get_scores(im, idx, tasks):
    """Fetch scores for a subset of examples directly from an open HDF5 reader.

    Averages the two 'weighted' gradient datasets (indices 0 and 1 — presumably
    the two strands; TODO confirm) per task.

    Args:
        im: an open HDF5 reader exposing the raw file handle as `im.f`.
        idx: index/slice selecting the examples to load.
        tasks: list of task names to read.

    Returns:
        (seq, contrib, hyp_contrib, profile), the dicts keyed by task name.
    """
    seq = im.f['/inputs/'][idx]
    hyp_contrib = {}
    contrib = {}
    profile = {}
    for task in tasks:
        a = im.f[f'/grads/{task}/weighted/0'][idx]
        b = im.f[f'/grads/{task}/weighted/1'][idx]
        hyp_contrib[task] = (a + b) / 2
        contrib[task] = hyp_contrib[task] * seq
        profile[task] = im.f[f'/targets/profile/{task}'][idx]
    return seq, contrib, hyp_contrib, profile
In [11]:
seq, contrib, hyp_contrib, profile = load_scores(model_dir / "grad.valid.h5")
In [12]:
idx = top_max_count(profile['t/Oct4'][:, 400:600], 80)
In [13]:
seq.shape
Out[13]:
(19137, 1000, 4)
In [14]:
example_idx = 11128
In [15]:
viz_dict = filter_tracks({**get_dataset_item(profile, example_idx), 
                          **get_dataset_item(contrib, example_idx)}, xlim=[400, 600])
In [16]:
plot_tracks(viz_dict);

Focus on one specific example

In [17]:
t = dict(contrib=contrib['Oct4'][example_idx, 400:600],
         hyp_contrib=hyp_contrib['Oct4'][example_idx, 400:600],
         seq=seq[example_idx, 400:600])
In [18]:
# target
plot_tracks(dict(contrib=t['contrib'], hyp_contrib=t['hyp_contrib']));
In [19]:
from basepair.modisco.results import *
In [20]:
pattern.trim_seq_ic(0.08).plot(['seq', 'contrib/Oct4', 'hyp_contrib/Oct4'], 
                               rotate_y=0, height=1, letter_width=0.2);

Get the instances

In [128]:
pattern_names = [
    ("Oct4-Sox2", "metacluster_0/pattern_0"),
    ("Errb", "metacluster_0/pattern_1"),
    ("Sox2", "metacluster_0/pattern_2"),
    ("Nanog", "metacluster_0/pattern_3"),
    ("Klf4", "metacluster_2/pattern_0"),
]
In [223]:
cached = True
In [130]:
cache_dir = Path(Path(ddir) / "cache/dev/")
cache_dir.mkdir(parents=True, exist_ok=True)
cache_file = cache_dir / "hit-scoring.hdf5"
In [131]:
dfp = pd.read_hdf(cache_file, "/scores")
In [132]:
!du -sh {cache_file}
266M	/users/avsec/workspace/basepair/data/cache/dev/hit-scoring.hdf5
In [133]:
#dfp.to_hdf(cache_file, "/scores")
In [224]:
# Scan all patterns for instances (or load the cached result).
# BUG FIX: the np.stack(...) calls were previously unconditional, but the
# *_l lists only exist on a fresh (non-cached) run — with cached=True the
# cell crashed with NameError. Stacking now happens inside the branch.
if not cached:
    dfl = []
    match_l, importance_l, seq_match_l = [], [], []
    for tf, pattern_name in pattern_names:
        pattern = mr.get_pattern(pattern_name).trim_seq_ic(0.08)
        # per-position match and importance scores across all examples
        match, importance = pattern.scan_importance(contrib, hyp_contrib, tasks,
                                                    n_jobs=20, verbose=True)
        seq_match = pattern.scan_seq(seq, n_jobs=20, verbose=True)
        # FDR-thresholded instance table for this pattern
        dfm = pattern.get_instances(tasks, match, importance, seq_match, fdr=0.01, verbose=True)
        dfm['tf'] = tf
        dfl.append(dfm)
        match_l.append(match)
        importance_l.append(importance)
        seq_match_l.append(seq_match)
    dfp = pd.concat(dfl)
    # cache
    cached = True
    dfp.to_hdf(cache_file, "/scores")
    # get also the raw score arrays (only available on a fresh run)
    match = np.stack(match_l, axis=-1)
    importance = np.stack(importance_l, axis=-1)
    seq_match = np.stack(seq_match_l, axis=-1)
dfp['idx'] = np.arange(len(dfp))
# keep only instances whose plain PWM sequence match is positive
dfp = dfp[dfp.score_seq_match > 0]
100%|██████████| 19137/19137 [00:10<00:00, 1742.14it/s]
100%|██████████| 19137/19137 [00:02<00:00, 7962.66it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8723.86it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8746.84it/s]
100%|██████████| 19137/19137 [00:02<00:00, 6662.45it/s]
100%|██████████| 19137/19137 [00:02<00:00, 7716.18it/s]
100%|██████████| 19137/19137 [00:02<00:00, 9164.94it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8671.83it/s]
100%|██████████| 19137/19137 [00:02<00:00, 7945.76it/s]
100%|██████████| 19137/19137 [00:03<00:00, 6300.24it/s]
100%|██████████| 19137/19137 [00:02<00:00, 9279.22it/s]
100%|██████████| 19137/19137 [00:02<00:00, 7970.74it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8482.25it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8546.49it/s]
100%|██████████| 19137/19137 [00:02<00:00, 7329.75it/s]
100%|██████████| 19137/19137 [00:02<00:00, 7930.96it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11955.52it/s]
100%|██████████| 19137/19137 [00:01<00:00, 12333.38it/s]
Keeping 204848/19137000 (1.07%) of the instances. Match score cutoff=3.08
100%|██████████| 19137/19137 [00:02<00:00, 8447.92it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8108.99it/s]
100%|██████████| 19137/19137 [00:02<00:00, 7492.27it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8474.65it/s]
100%|██████████| 19137/19137 [00:02<00:00, 7745.69it/s]
100%|██████████| 19137/19137 [00:02<00:00, 7834.36it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8659.57it/s]
100%|██████████| 19137/19137 [00:02<00:00, 7959.10it/s]
100%|██████████| 19137/19137 [00:02<00:00, 7778.28it/s]
100%|██████████| 19137/19137 [00:03<00:00, 4976.72it/s]
100%|██████████| 19137/19137 [00:02<00:00, 6596.14it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8550.98it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8518.67it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8406.46it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8604.23it/s]
100%|██████████| 19137/19137 [00:02<00:00, 6911.11it/s]
100%|██████████| 19137/19137 [00:01<00:00, 12148.28it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11264.05it/s]
Keeping 137240/19137000 (0.72%) of the instances. Match score cutoff=3.39
100%|██████████| 19137/19137 [00:01<00:00, 10600.66it/s]
100%|██████████| 19137/19137 [00:01<00:00, 10047.84it/s]
100%|██████████| 19137/19137 [00:01<00:00, 10764.77it/s]
100%|██████████| 19137/19137 [00:01<00:00, 10143.65it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11325.76it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11820.98it/s]
100%|██████████| 19137/19137 [00:01<00:00, 12256.08it/s]
100%|██████████| 19137/19137 [00:01<00:00, 10305.40it/s]
100%|██████████| 19137/19137 [00:02<00:00, 7372.69it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11202.33it/s]
100%|██████████| 19137/19137 [00:01<00:00, 10245.95it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11120.58it/s]
100%|██████████| 19137/19137 [00:02<00:00, 9064.21it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11146.57it/s]
100%|██████████| 19137/19137 [00:01<00:00, 10913.38it/s]
100%|██████████| 19137/19137 [00:01<00:00, 12207.83it/s]
100%|██████████| 19137/19137 [00:01<00:00, 12440.87it/s]
100%|██████████| 19137/19137 [00:01<00:00, 12459.28it/s]
Keeping 184603/19137000 (0.96%) of the instances. Match score cutoff=3.35
100%|██████████| 19137/19137 [00:02<00:00, 9399.96it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11983.18it/s]
100%|██████████| 19137/19137 [00:01<00:00, 12227.05it/s]
100%|██████████| 19137/19137 [00:01<00:00, 12017.54it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8617.93it/s] 
100%|██████████| 19137/19137 [00:01<00:00, 10606.40it/s]
100%|██████████| 19137/19137 [00:01<00:00, 9800.29it/s]
100%|██████████| 19137/19137 [00:01<00:00, 10676.40it/s]
100%|██████████| 19137/19137 [00:01<00:00, 10824.75it/s]
100%|██████████| 19137/19137 [00:01<00:00, 10626.79it/s]
100%|██████████| 19137/19137 [00:01<00:00, 9684.57it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11851.83it/s]
100%|██████████| 19137/19137 [00:01<00:00, 10861.11it/s]
100%|██████████| 19137/19137 [00:01<00:00, 10674.73it/s]
100%|██████████| 19137/19137 [00:02<00:00, 7885.78it/s]
100%|██████████| 19137/19137 [00:01<00:00, 12526.63it/s]
100%|██████████| 19137/19137 [00:01<00:00, 15065.85it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11444.54it/s]
Keeping 238999/19137000 (1.25%) of the instances. Match score cutoff=3.65
100%|██████████| 19137/19137 [00:01<00:00, 11981.46it/s]
100%|██████████| 19137/19137 [00:01<00:00, 12373.74it/s]
100%|██████████| 19137/19137 [00:01<00:00, 9673.13it/s]
100%|██████████| 19137/19137 [00:01<00:00, 10301.31it/s]
100%|██████████| 19137/19137 [00:01<00:00, 10758.57it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11662.30it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11241.89it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8102.43it/s]
100%|██████████| 19137/19137 [00:01<00:00, 9870.33it/s]
100%|██████████| 19137/19137 [00:01<00:00, 12217.03it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11611.04it/s]
100%|██████████| 19137/19137 [00:01<00:00, 13138.68it/s]
100%|██████████| 19137/19137 [00:02<00:00, 8603.95it/s]
100%|██████████| 19137/19137 [00:01<00:00, 10406.28it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11478.78it/s]
100%|██████████| 19137/19137 [00:01<00:00, 11621.85it/s]
100%|██████████| 19137/19137 [00:01<00:00, 12667.78it/s]
100%|██████████| 19137/19137 [00:01<00:00, 13642.64it/s]
Keeping 137500/19137000 (0.72%) of the instances. Match score cutoff=3.7
In [29]:
dfp.head()
Out[29]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_score match_max match_max_task ... match/Oct4 match/Sox2 match/Nanog match/Klf4 imp/Oct4 imp/Sox2 imp/Nanog imp/Klf4 tf idx
0 metacluster_0/pattern_0 0 434 450 - 16 442 4.436324 5.085370 Oct4 ... 5.085370 4.306160 3.336833 4.397326 25.292760 18.074471 10.993076 6.595614 Oct4-Sox2 0
1 metacluster_0/pattern_0 0 436 452 + 16 444 8.239908 9.589859 Oct4 ... 9.589859 9.110400 4.635757 7.749240 26.049998 18.633804 10.866910 6.611798 Oct4-Sox2 1
2 metacluster_0/pattern_0 0 440 456 - 16 448 3.561162 4.125266 Oct4 ... 4.125266 3.806090 1.889072 3.721674 23.658365 17.657431 10.061822 6.011966 Oct4-Sox2 2
3 metacluster_0/pattern_0 0 457 473 + 16 465 7.014139 7.907351 Oct4 ... 7.907351 7.310353 5.991961 5.682630 28.729819 27.834638 14.706600 8.676175 Oct4-Sox2 3
4 metacluster_0/pattern_0 0 477 493 + 16 485 7.122949 7.235220 Oct4 ... 7.235220 6.972675 7.212878 7.020781 32.487208 31.967045 21.090343 12.860154 Oct4-Sox2 4

5 rows × 23 columns

In [247]:
# convert a wide format to a long format
def melt_prefix(dfp, var='imp', tasks=None):
    """Melt wide per-task columns '<var>/<task>' into long format.

    Args:
        dfp: dataframe with an 'idx' column plus '<var>/<task>' columns.
        var: column-name prefix to melt (e.g. 'imp' or 'match').
        tasks: list of task names; defaults to the notebook-level `tasks`
            list (the previous implicit global dependency, kept for
            backward compatibility).

    Returns:
        Long dataframe with columns: idx, <var>, task.
    """
    if tasks is None:
        # fall back to the module-global task list
        tasks = globals()['tasks']
    prefix = var + "/"
    dfp_long = dfp[['idx'] + [prefix + t for t in tasks]].melt(id_vars='idx', value_name=var)
    # strip the '<var>/' prefix to recover the bare task name
    dfp_long['task'] = dfp_long.variable.str.replace(prefix, "", regex=False)
    del dfp_long['variable']
    return dfp_long


# Build the long-format table: one row per (instance, task) with its
# importance and match score joined on ('idx', 'task').
dfpm = melt_prefix(dfp, 'imp').merge(melt_prefix(dfp, 'match'), on=['idx', 'task'])
# Attach the non-per-task columns of dfp to every melted row.
dfpl = dfp[[c for c in dfp.columns 
            if not c.startswith("imp/") 
            and not c.startswith("match/")]].merge(dfpm, on='idx')
dfpl['log_imp'] = np.log(1+dfpl.imp)

# Log-transform the weighted scores on the wide table as well.
# NOTE(review): imp gets log1p but match gets a plain log — presumably
# match_weighted is assumed strictly positive here; verify.
dfp['log_imp_weighted'] = np.log(1+dfp['imp_weighted'])
dfp['log_match_weighted'] = np.log(dfp['match_weighted'])
In [248]:
# partition the scores into 4 categories (according to the median)
# (the stray no-op expression `dfp` that sat in the middle of this cell
# has been removed — it had no effect)
dfp_medians = pd.DataFrame({'log_match_weighted': dfp.groupby("tf").log_match_weighted.median(),
                            'log_imp_weighted': dfp.groupby("tf").log_imp_weighted.median()}).reset_index()

# per-row median of the row's own tf group, broadcast via Series.map
log_imp_weighted_median = dfp.tf.map(dfp.groupby("tf").log_imp_weighted.median())
log_match_weighted_median = dfp.tf.map(dfp.groupby("tf").log_match_weighted.median())
dfp['imp_cat'] = (dfp.log_imp_weighted  > log_imp_weighted_median).map({False: 'low', True: 'high'})
dfp['match_cat'] = (dfp.log_match_weighted  > log_match_weighted_median).map({False: 'low', True: 'high'})

TODO

  • add percentiles
In [136]:
# dfp['match_max_task'] = dfp.match_max__task

# dfpl['match_max_task'] = dfpl.match_max__task
In [137]:
dfpl.head()
Out[137]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_max match_max_task imp_weighted imp_max imp_max_task score_seq_match tf idx imp task match log_imp
0 metacluster_0/pattern_0 0 434 450 - 16 442 4.436324 5.085370 Oct4 17.515070 25.292760 Oct4 -0.358489 Oct4-Sox2 0 25.292760 Oct4 5.085370 3.269294
1 metacluster_0/pattern_0 0 434 450 - 16 442 4.436324 5.085370 Oct4 17.515070 25.292760 Oct4 -0.358489 Oct4-Sox2 0 18.074471 Sox2 4.306160 2.948351
2 metacluster_0/pattern_0 0 434 450 - 16 442 4.436324 5.085370 Oct4 17.515070 25.292760 Oct4 -0.358489 Oct4-Sox2 0 10.993076 Nanog 3.336833 2.484329
3 metacluster_0/pattern_0 0 434 450 - 16 442 4.436324 5.085370 Oct4 17.515070 25.292760 Oct4 -0.358489 Oct4-Sox2 0 6.595614 Klf4 4.397326 2.027571
4 metacluster_0/pattern_0 0 436 452 + 16 444 8.239908 9.589859 Oct4 17.930257 26.049998 Oct4 10.260480 Oct4-Sox2 1 26.049998 Oct4 9.589859 3.297687

TODO

  • sample a few points from the scatterplot: importance vs match to see what you are getting
  • try the same analysis for a few factors
    • try to find the right cutoff based on the number of motifs you expect
  • eyeball a few regions (highlight the found seqlets in there)
    • before doing the search, merge all the metaclusters (ignore the importance score magnitude)
In [249]:
from plotnine import *
import plotnine
In [250]:
dfpl.shape
Out[250]:
(2624168, 24)
In [251]:
dfpl_subset = dfpl.sample(10000)  # focus on the subset of the data
dfp_subset = dfp.sample(10000)  # focus on the subset of the data
In [252]:
dfpl_subset.head()
Out[252]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_max match_max_task ... tf idx log_imp_weighted log_match_weighted imp_cat match_cat imp task match log_imp
2360977 metacluster_2/pattern_0 10341 500 509 - 9 504 6.981125 7.401968 Klf4 ... Klf4 830929 3.092235 1.943210 high high 0.744257 Sox2 1.914759 0.556328
1178593 metacluster_0/pattern_2 12262 119 128 - 9 123 4.760456 5.745961 Oct4 ... Sox2 464809 1.398745 1.560343 high low 5.099994 Sox2 5.349951 1.808288
897050 metacluster_0/pattern_2 3624 56 65 + 9 60 7.472459 8.716251 Sox2 ... Sox2 378253 0.439719 2.011224 low high 0.129061 Nanog 7.353604 0.121386
970151 metacluster_0/pattern_2 5774 419 428 - 9 423 4.816924 6.851645 Sox2 ... Sox2 400522 2.479368 1.572136 high low 6.436266 Klf4 5.997974 2.006369
1855531 metacluster_0/pattern_3 12378 169 178 - 9 173 4.409331 5.580443 Nanog ... Nanog 679873 0.546656 1.483723 low high 0.424428 Klf4 2.972585 0.353770

5 rows × 24 columns

Match vs importance

Match score specifically for each task

In [231]:
import warnings
warnings.filterwarnings("ignore")
In [143]:
plotnine.options.figure_size = (12, 7)
ggplot(aes(x="match", y="log_imp"), dfpl_subset) + geom_point(alpha=0.1) + \
  facet_grid("tf~task", labeller='label_both') + theme_bw()
Out[143]:
<ggplot: (-9223363266424025242)>

Match score with highest importance across tasks

In [145]:
plotnine.options.figure_size = (12, 7)
ggplot(aes(x="match_weighted", y="log_imp"), dfpl_subset) + geom_point(alpha=0.1) + \
  facet_grid("tf~task", labeller='label_both') + theme_bw()
Out[145]:
<ggplot: (-9223363266427861423)>

Match vs imp

In [242]:
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="log_match_weighted", y="log_imp_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
  facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[242]:
<ggplot: (8770372708416)>

Seq vs match

In [243]:
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="score_seq_match", y="log_match_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
  facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[243]:
<ggplot: (-9223363266420723452)>

Seq vs importance

In [245]:
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="score_seq_match", y="log_imp_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
  facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[245]:
<ggplot: (8770434063880)>

Seq vs match

In [234]:
ggplot(aes(x='log_match_weighted'), dfp_subset) + geom_histogram() + facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[234]:
<ggplot: (8770434001875)>

Non-PWM filtered

In [240]:
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="log_match_weighted", y="log_imp_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
  geom_vline(aes(xintercept='log_match_weighted'), data=dfp_medians, alpha=1, color='orange', linetype='dashed') + \
  geom_hline(aes(yintercept='log_imp_weighted'), data=dfp_medians, alpha=1, color='orange', linetype='dashed') + \
  facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[240]:
<ggplot: (-9223363266324861815)>

PWM filtered

In [253]:
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="log_match_weighted", y="log_imp_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
  geom_vline(aes(xintercept='log_match_weighted'), data=dfp_medians, alpha=1, color='orange', linetype='dashed') + \
  geom_hline(aes(yintercept='log_imp_weighted'), data=dfp_medians, alpha=1, color='orange', linetype='dashed') + \
  facet_grid(".~tf", labeller='label_both') + theme_bw()
Out[253]:
<ggplot: (8770430771355)>
In [ ]:
# fix: the column is 'log_imp_weighted'; the previous attribute chain
# `.log_imp.weighted` raised AttributeError (dfp has no 'log_imp' column)
dfp_subset.log_imp_weighted.median()
In [66]:
np.log(1+dfp.imp_max).plot.hist(200);
#plt.xlim([0, 20])
plt.xlabel("Maximum importance plot");
In [44]:
plotnine.options.figure_size = (7, 7)
ggplot(aes(x='tf', color='task', y='match'), dfpl) + geom_boxplot() + theme_bw()
Out[44]:
<ggplot: (8770372654186)>
In [45]:
plotnine.options.figure_size = (7, 7)
ggplot(aes(x='tf', color='task', y='log_imp'), dfpl) + geom_boxplot() + theme_bw()
Out[45]:
<ggplot: (-9223363266482419389)>
In [46]:
dfpl.groupby(['tf', 'task']).imp.mean().unstack().plot.bar()
plt.ylabel("Average importance");
In [47]:
dfpl.groupby(['tf', 'task']).match.mean().unstack().plot.bar()
plt.title("Average match per task");
plt.ylabel("Average Match");
In [48]:
dfp.groupby("tf").imp_max_task.value_counts().unstack().plot.bar();
plt.ylabel("Frequency")
plt.title("Number of times a particular task has max importance ");
In [49]:
dfp.groupby("tf").match_max_task.value_counts().unstack().plot.bar();
plt.ylabel("Frequency")
plt.title("Number of times a particular task has max match ");
In [50]:
pattern_names = [
    ("Oct4-Sox2", "metacluster_0/pattern_0"),
    ("Errb", "metacluster_0/pattern_1"),
    ("Sox2", "metacluster_0/pattern_2"),
    ("Nanog", "metacluster_0/pattern_3"),
    ("Klf4", "metacluster_2/pattern_0"),
]
In [51]:
pattern.plot("contrib");
In [52]:
tasks
task_imp = np.array([np.abs(pattern.contrib[t]).mean() for t in tasks])
task_imp = task_imp / task_imp.sum()
In [53]:
pattern = mr.get_pattern(pattern.name).trim_seq_ic(0.08)
In [40]:
dfpl_subset[(dfpl_subset.task == 'Klf4') & (dfpl_subset.tf == 'Oct4-Sox2')& (dfpl_subset.imp > 40)]
Out[40]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_score match_max match_max_task imp_max imp_max_task score_seq_match tf idx imp task match log_imp
In [38]:
viz_dict = filter_tracks({**get_dataset_item(profile, example_idx), 
                          **get_dataset_item(contrib, example_idx)}, xlim=[400, 600])
In [41]:
example_idx = 1288
In [67]:
dfp_subset_idx = dfp[(dfp.example_idx == example_idx) & (dfp.pattern_start > 400) & (dfp.pattern_end < 600)]
dfp_subset_idx
Out[67]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_score match_max match_max_task ... match/Oct4 match/Sox2 match/Nanog match/Klf4 imp/Oct4 imp/Sox2 imp/Nanog imp/Klf4 tf idx
15468 metacluster_0/pattern_0 1288 481 497 - 16 489 4.520667 4.931519 Sox2 ... 4.450363 4.931519 3.989680 4.587109 3.852208 4.707893 3.883829 2.103986 Oct4-Sox2 15468
15469 metacluster_0/pattern_0 1288 482 498 + 16 490 3.143075 3.517850 Klf4 ... 3.022358 3.017903 3.225264 3.517850 3.866364 4.774422 3.898947 2.115534 Oct4-Sox2 15469
15470 metacluster_0/pattern_0 1288 523 539 - 16 531 10.138856 10.968470 Klf4 ... 9.777510 10.578058 9.445521 10.968470 41.614139 37.723788 11.668232 11.860842 Oct4-Sox2 15470
15471 metacluster_0/pattern_0 1288 525 541 + 16 533 3.667514 4.364790 Nanog ... 3.712812 3.073689 4.364790 3.769160 37.773879 33.113699 10.446346 10.660964 Oct4-Sox2 15471
9035 metacluster_0/pattern_1 1288 516 532 - 16 524 3.399173 4.172987 Nanog ... 2.778824 3.511239 4.172987 2.968923 27.140151 30.284350 10.641942 8.648113 Errb 213883
9036 metacluster_0/pattern_1 1288 517 533 - 16 525 3.616138 3.979899 Nanog ... 3.455617 3.890882 3.979899 3.286359 30.567106 32.256126 10.840259 9.518879 Errb 213884
9037 metacluster_0/pattern_1 1288 566 582 + 16 574 3.826363 4.095667 Sox2 ... 3.676426 4.095667 3.668151 3.912041 3.115768 4.546923 5.303988 1.534756 Errb 213885
9038 metacluster_0/pattern_1 1288 572 588 + 16 580 5.230893 6.569507 Klf4 ... 5.329646 5.032655 3.599923 6.569507 2.524323 4.284051 4.627897 1.554792 Errb 213886
12954 metacluster_0/pattern_2 1288 480 489 + 9 485 5.473097 6.404852 Oct4 ... 6.404852 6.304051 4.732015 5.023636 4.905771 6.078678 4.868028 2.584665 Sox2 355042
12955 metacluster_0/pattern_2 1288 488 497 - 9 493 3.688067 4.210517 Klf4 ... 3.785836 3.617310 3.483407 4.210517 2.273475 2.815127 2.638144 1.469825 Sox2 355043
12956 metacluster_0/pattern_2 1288 511 520 - 9 516 5.117610 6.474059 Sox2 ... 1.423716 6.474059 5.298361 4.259889 9.242240 11.221710 4.775025 3.616715 Sox2 355044
12957 metacluster_0/pattern_2 1288 522 531 + 9 527 6.908667 7.161598 Nanog ... 5.905656 7.129154 7.161598 6.506771 40.932951 44.091636 13.611524 12.327282 Sox2 355045
12958 metacluster_0/pattern_2 1288 572 581 - 9 577 5.147044 5.922225 Sox2 ... 4.420929 5.922225 5.213804 3.965695 2.960440 5.240326 5.775572 1.710346 Sox2 355046
15414 metacluster_0/pattern_3 1288 411 420 - 9 416 3.776633 4.369342 Nanog ... 2.862567 3.310912 4.369342 1.433792 0.814543 0.658071 1.611839 1.312913 Nanog 542105
15415 metacluster_0/pattern_3 1288 424 433 - 9 429 4.858609 6.604256 Nanog ... -1.922662 2.457443 6.604256 1.803888 0.749030 0.535248 1.750176 1.127528 Nanog 542106
15416 metacluster_0/pattern_3 1288 448 457 - 9 453 5.508692 7.009302 Nanog ... -0.834571 3.944422 7.009302 2.642171 0.872530 0.849825 3.080530 1.853544 Nanog 542107
15417 metacluster_0/pattern_3 1288 520 529 - 9 525 4.766256 5.052603 Nanog ... 3.636848 4.340019 5.052603 4.313495 30.428372 38.201649 13.799849 9.916406 Nanog 542108
15418 metacluster_0/pattern_3 1288 533 542 + 9 538 4.141324 4.202407 Oct4 ... 4.202407 3.852317 4.193808 4.126471 21.300420 14.967832 5.236810 5.660789 Nanog 542109
15419 metacluster_0/pattern_3 1288 569 578 - 9 574 4.744068 5.347129 Nanog ... 3.009729 4.155099 5.347129 3.016147 2.892792 4.975846 6.161192 1.486753 Nanog 542110
7846 metacluster_2/pattern_0 1288 460 469 - 9 465 7.010387 7.890176 Klf4 ... -5.006579 -4.531986 0.301801 7.890176 2.064340 2.193574 2.609387 9.171772 Klf4 773536

20 rows × 23 columns

In [69]:
np.log(1+dfp_subset_idx.imp_max)
Out[69]:
15468    1.741850
15469    1.753438
15470    3.752186
15471    3.657747
9035     3.443118
9036     3.504239
9037     1.841183
9038     1.727736
12954    1.957087
12955    1.338974
12956    2.503214
12957    3.808697
12958    1.913324
15414    0.960055
15415    1.011665
15416    1.406227
15417    3.668719
15418    3.104606
15419    1.968676
7846     2.319616
Name: imp_max, dtype: float64
In [70]:
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
In [72]:
# TODO - is this thing normalized by importance?
In [71]:
plot_tracks(filter_tracks({**get_dataset_item(profile, example_idx), 
                           **get_dataset_item(contrib, example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);

Number of motifs per example

In [102]:
dfp[(np.log(1+dfp.imp_max) > 2) & (dfp.pattern_start > 400) & (dfp.pattern_end < 600)].groupby('example_idx').size().plot.hist()
Out[102]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa0d36ef860>
In [256]:
def prefix_dict(d, prefix):
    """Return a copy of `d` with `prefix` prepended to every key.

    Fix: the original comprehension iterated `for d, v in d.items()`,
    shadowing the dict parameter with each key — it only worked because
    the items iterator is created before the rebinding.
    """
    return {prefix + k: v for k, v in d.items()}

Example of strong reliance on hypothetical scores

In [282]:
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
6999
In [283]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     #(dfp.tf == "Errb") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);
In [287]:
dfp_subset_idx[['tf', 'pattern_center', 'strand', 'match_weighted', 'imp_weighted', 'score_seq_match']].sort_values('pattern_center')
Out[287]:
tf pattern_center strand match_weighted imp_weighted score_seq_match
85404 Nanog 410 + 5.718244 2.441144 6.756690
49818 Errb 430 + 7.712369 1.951875 3.740249
70962 Sox2 575 + 6.281482 11.888993 2.246133
81732 Oct4-Sox2 579 - 9.271116 16.520833 9.200202
85413 Nanog 586 + 5.691527 8.747549 4.093247
85414 Nanog 592 - 4.622986 6.233724 1.951879
In [285]:
for tf, pattern_name in pattern_names:
    p = mr.get_pattern(pattern_name).trim_seq_ic(0.08)
    p.name = "+" + tf
    p.plot(['seq'] + [f'contrib/{t}' for t in tasks], rotate_y=0)
    p.name = "-" + tf
    p.rc().plot(['seq'] + [f'contrib/{t}' for t in tasks], rotate_y=0)

Example of strong reliance on hypothetical scores

In [254]:
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
1335
In [272]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     #(dfp.tf == "Errb") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);
In [275]:
dfp_subset_idx[['tf', 'pattern_center', 'strand', 'match_weighted', 'imp_weighted', 'score_seq_match']].sort_values('pattern_center')
Out[275]:
tf pattern_center strand match_weighted imp_weighted score_seq_match
13427 Sox2 410 + 7.563056 11.159393 4.537363
16054 Oct4-Sox2 414 - 9.051900 12.778213 7.640690
13428 Sox2 439 - 6.265966 6.403757 5.395059
15976 Nanog 467 - 4.711840 6.146054 0.432334
15977 Nanog 468 + 4.784056 6.087443 2.183998
16056 Oct4-Sox2 473 - 5.308060 7.876490 1.675158
16057 Oct4-Sox2 475 + 5.714996 8.055885 1.497434
15978 Nanog 479 - 5.457193 10.562017 4.525884
15983 Nanog 572 - 6.116345 4.657692 6.155791
In [268]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     (dfp.tf == "Sox2") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);

Conclusion

  • hypothetical contribution scores can be confusing to inspect
In [269]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     (dfp.tf == "Oct4-Sox2") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);
In [270]:
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) & 
                     (dfp.imp_cat == 'high') & 
                     (dfp.match_cat == 'high') & 
                     (dfp.tf == "Nanog") & 
                     (dfp.example_idx == example_idx) & 
                     (dfp.pattern_start > 400) & 
                     (dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand) 
           for i,row in dfp_subset_idx.iterrows()]
In [279]:
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(profile, example_idx)}, xlim=[400, 600]), 
            rotate_y=0, legend=True);
In [270]:
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx), 
                           **get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]), 
            seqlets, rotate_y=0, legend=True);