from basepair.imports import *
from basepair.samplers import top_sum_count, top_max_count
from basepair.plot.tracks import plot_tracks, filter_tracks
from kipoi.data_utils import get_dataset_item
# Paths to the trained multi-task ChIP-nexus model and its TF-MoDISco results.
# NOTE(review): `ddir`, `Path` and `ModiscoResult` are presumably provided by
# the star imports above — confirm.
model_dir = Path(f"{ddir}/processed//chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / "modisco/by_peak_tasks/weighted/Oct4"
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
# Tasks (TFs) the model was trained on
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
# Quick look at two discovered motifs (trailing ';' suppresses notebook output)
mr.plot_pssm("metacluster_0", "pattern_0", trim_frac=0.08);
mr.plot_pssm("metacluster_0", "pattern_2", trim_frac=0.08);
# Inspect one pattern in detail
pattern_name = 'metacluster_0/pattern_0'
pattern = mr.get_pattern(pattern_name)
pattern.plot(rotate_y=0);
def load_scores(h5_file, importance='weighted'):
    """Load sequences, contribution scores and profiles from a gradients HDF5 file.

    Args:
        h5_file: path to the HDF5 file written during model interpretation.
        importance: gradient type to read (group name under each task's grads).

    Returns:
        Tuple ``(seq, contrib, hyp_contrib, profile)`` where
          - seq: input sequences (one-hot encoded — TODO confirm),
          - contrib: per-task hypothetical contributions masked by `seq`,
          - hyp_contrib: per-task hypothetical contribution scores
            (``mean`` over the stored gradient entries),
          - profile: per-task measured profiles, keyed as ``'t/<task>'``.
    """
    reader = HDF5Reader(h5_file)
    reader.open()
    # Ensure the file handle is released even if loading fails
    try:
        data = reader.load_all()
        seq = data['inputs']
        task_names = list(data['grads'])
        # NOTE(review): `mean` is assumed to come from the star imports
        hyp_contrib = {task: mean(data['grads'][task][importance])
                       for task in task_names}
        contrib = {task: hyp_contrib[task] * seq for task in task_names}
        profile = {f"t/{task}": data['targets']['profile'][task]
                   for task in task_names}
    finally:
        reader.close()
    return seq, contrib, hyp_contrib, profile
# subset
def get_scores(im, idx, tasks):
    """Read scores for a subset of examples directly from an open HDF5 reader.

    Unlike `load_scores`, only the rows selected by `idx` are read, avoiding
    loading the full dataset into memory.

    Args:
        im: open reader whose ``.f`` attribute supports h5py-style path
            indexing (e.g. ``im.f['/inputs/']``).
        idx: integer index (or index array) of the example(s) to read.
        tasks: task names to read.

    Returns:
        Tuple ``(seq, contrib, hyp_contrib, profile)`` with per-task dicts.
    """
    # Average the two stored gradient entries ('0' and '1') per task
    hyp_contrib = {task: (im.f[f'/grads/{task}/weighted/0'][idx] +
                          im.f[f'/grads/{task}/weighted/1'][idx]) / 2
                   for task in tasks}
    seq = im.f['/inputs/'][idx]
    # Mask hypothetical contributions by the observed sequence
    contrib = {task: hyp_contrib[task] * seq for task in tasks}
    profile = {task: im.f[f'/targets/profile/{task}'][idx] for task in tasks}
    return seq, contrib, hyp_contrib, profile
# Load sequences, contributions and profiles for the validation set
seq, contrib, hyp_contrib, profile = load_scores(model_dir / "grad.valid.h5")
# Indices of the 80 examples with the highest Oct4 profile counts in the
# central 400-600 window
idx = top_max_count(profile['t/Oct4'][:, 400:600], 80)
seq.shape
example_idx = 11128
# Profile + contribution tracks for one example, zoomed to positions 400-600
viz_dict = filter_tracks({**get_dataset_item(profile, example_idx),
**get_dataset_item(contrib, example_idx)}, xlim=[400, 600])
plot_tracks(viz_dict);
# Crop the Oct4 tracks of this example to the same window
t = dict(contrib=contrib['Oct4'][example_idx, 400:600],
hyp_contrib=hyp_contrib['Oct4'][example_idx, 400:600],
seq=seq[example_idx, 400:600])
# target
plot_tracks(dict(contrib=t['contrib'], hyp_contrib=t['hyp_contrib']));
from basepair.modisco.results import *
# Plot the IC-trimmed pattern with sequence and Oct4 contribution tracks
pattern.trim_seq_ic(0.08).plot(['seq', 'contrib/Oct4', 'hyp_contrib/Oct4'],
rotate_y=0, height=1, letter_width=0.2);
# (human-readable TF label, modisco pattern id) pairs to scan for
pattern_names = [
("Oct4-Sox2", "metacluster_0/pattern_0"),
("Errb", "metacluster_0/pattern_1"),
("Sox2", "metacluster_0/pattern_2"),
("Nanog", "metacluster_0/pattern_3"),
("Klf4", "metacluster_2/pattern_0"),
]
# Hit-scoring results are cached on disk; set cached=False to recompute below
cached = True
cache_dir = Path(Path(ddir) / "cache/dev/")
cache_dir.mkdir(parents=True, exist_ok=True)
cache_file = cache_dir / "hit-scoring.hdf5"
dfp = pd.read_hdf(cache_file, "/scores")
# IPython shell magic (notebook cell): show the cache file size on disk
!du -sh {cache_file}
#dfp.to_hdf(cache_file, "/scores")
# Recompute motif hit scores when no cached result is available.
# For every (tf, pattern) pair: scan the contribution scores and the raw
# sequence for pattern matches, extract instance tables, and concatenate them.
if not cached:
    dfl = []
    match_l, importance_l, seq_match_l = [], [], []
    for tf, pattern_name in pattern_names:
        pattern = mr.get_pattern(pattern_name).trim_seq_ic(0.08)
        match, importance = pattern.scan_importance(contrib, hyp_contrib, tasks,
                                                    n_jobs=20, verbose=True)
        seq_match = pattern.scan_seq(seq, n_jobs=20, verbose=True)
        dfm = pattern.get_instances(tasks, match, importance, seq_match, fdr=0.01, verbose=True)
        dfm['tf'] = tf
        dfl.append(dfm)
        # Keep the raw per-position score arrays for later stacking
        match_l.append(match)
        importance_l.append(importance)
        seq_match_l.append(seq_match)
    dfp = pd.concat(dfl)
    # cache the result so subsequent runs can skip the expensive scan
    cached = True
    dfp.to_hdf(cache_file, "/scores")
# Stable row identifier used later for wide->long merging
dfp['idx'] = np.arange(len(dfp))
# Keep only instances with a positive sequence-match score
dfp = dfp[dfp.score_seq_match > 0]
# get also the match scores
# NOTE(review): match_l/importance_l/seq_match_l only exist when the
# recompute branch ran (cached=False) — these lines fail on a cached run
match = np.stack(match_l, axis=-1)
importance = np.stack(importance_l, axis=-1)
seq_match = np.stack(seq_match_l, axis=-1)
dfp.head()
def melt_prefix(dfp, var='imp', task_names=None):
    """Convert wide-format ``<var>/<task>`` columns into long format.

    Args:
        dfp: DataFrame with an 'idx' column plus one ``<var>/<task>``
            column per task.
        var: column-name prefix to melt (e.g. 'imp' or 'match').
        task_names: task names to melt; defaults to the module-level
            ``tasks`` list for backward compatibility.

    Returns:
        DataFrame with columns ``['idx', var, 'task']``.
    """
    if task_names is None:
        task_names = tasks  # module-level task list
    prefix = var + "/"
    long_df = dfp[['idx'] + [prefix + t for t in task_names]].melt(
        id_vars='idx', value_name=var)
    # Slice the prefix off instead of str.replace: avoids regex
    # interpretation of the prefix (pandas-version-dependent default) and
    # only strips the leading occurrence
    long_df['task'] = long_df.variable.str[len(prefix):]
    del long_df['variable']
    return long_df
# Merge importance and match long tables into a single long-format frame
dfpm = melt_prefix(dfp, 'imp').merge(melt_prefix(dfp, 'match'), on=['idx', 'task'])
dfpl = dfp[[c for c in dfp.columns
if not c.startswith("imp/")
and not c.startswith("match/")]].merge(dfpm, on='idx')
# log1p-transform importance for plotting
dfpl['log_imp'] = np.log(1+dfpl.imp)
dfp['log_imp_weighted'] = np.log(1+dfp['imp_weighted'])
# NOTE(review): plain log here (no 1+) unlike the importance columns —
# confirm match_weighted is strictly positive
dfp['log_match_weighted'] = np.log(dfp['match_weighted'])
# partition the scores into 4 categories (according to the median)
dfp_medians = pd.DataFrame({'log_match_weighted': dfp.groupby("tf").log_match_weighted.median(),
'log_imp_weighted': dfp.groupby("tf").log_imp_weighted.median()}).reset_index()
dfp
# Per-row per-TF medians used as low/high cutoffs
log_imp_weighted_median = dfp.tf.map(dfp.groupby("tf").log_imp_weighted.median())
log_match_weighted_median = dfp.tf.map(dfp.groupby("tf").log_match_weighted.median())
dfp['imp_cat'] = (dfp.log_imp_weighted > log_imp_weighted_median).map({False: 'low', True: 'high'})
dfp['match_cat'] = (dfp.log_match_weighted > log_match_weighted_median).map({False: 'low', True: 'high'})
# dfp['match_max_task'] = dfp.match_max__task
# dfpl['match_max_task'] = dfpl.match_max__task
dfpl.head()
from plotnine import *
import plotnine
dfpl.shape
# Down-sample to keep the scatter plots fast
dfpl_subset = dfpl.sample(10000) # focus on the subset of the data
dfp_subset = dfp.sample(10000) # focus on the subset of the data
dfpl_subset.head()
import warnings
# Silence plotnine/pandas warnings in the notebook output
warnings.filterwarnings("ignore")
plotnine.options.figure_size = (12, 7)
# Match vs. log-importance, faceted by motif (tf) and model task
ggplot(aes(x="match", y="log_imp"), dfpl_subset) + geom_point(alpha=0.1) + \
facet_grid("tf~task", labeller='label_both') + theme_bw()
plotnine.options.figure_size = (12, 7)
ggplot(aes(x="match_weighted", y="log_imp"), dfpl_subset) + geom_point(alpha=0.1) + \
facet_grid("tf~task", labeller='label_both') + theme_bw()
plotnine.options.figure_size = (12, 3)
# Weighted match vs. weighted importance per motif
ggplot(aes(x="log_match_weighted", y="log_imp_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
facet_grid(".~tf", labeller='label_both') + theme_bw()
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="score_seq_match", y="log_match_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
facet_grid(".~tf", labeller='label_both') + theme_bw()
plotnine.options.figure_size = (12, 3)
ggplot(aes(x="score_seq_match", y="log_imp_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
facet_grid(".~tf", labeller='label_both') + theme_bw()
ggplot(aes(x='log_match_weighted'), dfp_subset) + geom_histogram() + facet_grid(".~tf", labeller='label_both') + theme_bw()
plotnine.options.figure_size = (12, 3)
# Same scatter with per-TF median guide lines (the low/high category cutoffs)
ggplot(aes(x="log_match_weighted", y="log_imp_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
geom_vline(aes(xintercept='log_match_weighted'), data=dfp_medians, alpha=1, color='orange', linetype='dashed') + \
geom_hline(aes(yintercept='log_imp_weighted'), data=dfp_medians, alpha=1, color='orange', linetype='dashed') + \
facet_grid(".~tf", labeller='label_both') + theme_bw()
plotnine.options.figure_size = (12, 3)
# NOTE(review): identical to the previous plot — likely a duplicated cell
ggplot(aes(x="log_match_weighted", y="log_imp_weighted"), dfp_subset) + geom_point(alpha=0.1) + \
geom_vline(aes(xintercept='log_match_weighted'), data=dfp_medians, alpha=1, color='orange', linetype='dashed') + \
geom_hline(aes(yintercept='log_imp_weighted'), data=dfp_medians, alpha=1, color='orange', linetype='dashed') + \
facet_grid(".~tf", labeller='label_both') + theme_bw()
# Median weighted log-importance of the subset.
# Fix: the original `dfp_subset.log_imp.weighted.median()` raises
# AttributeError (a Series has no `.weighted`); the column is named
# 'log_imp_weighted'.
dfp_subset.log_imp_weighted.median()
# Distribution of the maximum importance per instance
np.log(1+dfp.imp_max).plot.hist(200);
#plt.xlim([0, 20])
plt.xlabel("Maximum importance plot");
plotnine.options.figure_size = (7, 7)
# Match score distribution per motif, colored by task
ggplot(aes(x='tf', color='task', y='match'), dfpl) + geom_boxplot() + theme_bw()
plotnine.options.figure_size = (7, 7)
ggplot(aes(x='tf', color='task', y='log_imp'), dfpl) + geom_boxplot() + theme_bw()
dfpl.groupby(['tf', 'task']).imp.mean().unstack().plot.bar()
plt.ylabel("Average importance");
dfpl.groupby(['tf', 'task']).match.mean().unstack().plot.bar()
plt.title("Average match per task");
plt.ylabel("Average Match");
# How often each task attains the maximum importance/match per motif
dfp.groupby("tf").imp_max_task.value_counts().unstack().plot.bar();
plt.ylabel("Frequency")
plt.title("Number of times a particular task has max importance ");
dfp.groupby("tf").match_max_task.value_counts().unstack().plot.bar();
plt.ylabel("Frequency")
plt.title("Number of times a particular task has max match ");
# NOTE(review): duplicate of the pattern_names list defined earlier
pattern_names = [
("Oct4-Sox2", "metacluster_0/pattern_0"),
("Errb", "metacluster_0/pattern_1"),
("Sox2", "metacluster_0/pattern_2"),
("Nanog", "metacluster_0/pattern_3"),
("Klf4", "metacluster_2/pattern_0"),
]
pattern.plot("contrib");
tasks
# Relative per-task importance of the current pattern (normalized to sum 1)
task_imp = np.array([np.abs(pattern.contrib[t]).mean() for t in tasks])
task_imp = task_imp / task_imp.sum()
# NOTE(review): re-trims the pattern only after task_imp was computed on the
# untrimmed one — confirm this ordering is intended
pattern = mr.get_pattern(pattern.name).trim_seq_ic(0.08)
# Instances where the Klf4 task has high importance under the Oct4-Sox2 motif
dfpl_subset[(dfpl_subset.task == 'Klf4') & (dfpl_subset.tf == 'Oct4-Sox2')& (dfpl_subset.imp > 40)]
# NOTE(review): uses example_idx from the previous cell; it is reassigned to
# 1288 only on the next line — confirm the intended execution order
viz_dict = filter_tracks({**get_dataset_item(profile, example_idx),
**get_dataset_item(contrib, example_idx)}, xlim=[400, 600])
example_idx = 1288
# Motif instances of this example falling inside the plotted 400-600 window
dfp_subset_idx = dfp[(dfp.example_idx == example_idx) & (dfp.pattern_start > 400) & (dfp.pattern_end < 600)]
dfp_subset_idx
np.log(1+dfp_subset_idx.imp_max)
# Seqlet coordinates shifted by -400 to match the xlim-cropped tracks
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand)
for i,row in dfp_subset_idx.iterrows()]
# TODO - is this thing normalized by importance?
plot_tracks(filter_tracks({**get_dataset_item(profile, example_idx),
**get_dataset_item(contrib, example_idx)}, xlim=[400, 600]),
seqlets, rotate_y=0, legend=True);
# Distribution of the number of strong, in-window hits per example
dfp[(np.log(1+dfp.imp_max) > 2) & (dfp.pattern_start > 400) & (dfp.pattern_end < 600)].groupby('example_idx').size().plot.hist()
def prefix_dict(d, prefix):
    """Return a copy of `d` with `prefix` prepended to every key.

    Fix: the original comprehension reused `d` as the loop variable,
    shadowing the parameter — it only worked because the iterable is
    evaluated in the enclosing scope. Renamed for clarity.
    """
    return {prefix + key: value for key, value in d.items()}
# Pick a random example and show its high-importance, high-match motif hits
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) &
(dfp.imp_cat == 'high') &
(dfp.match_cat == 'high') &
#(dfp.tf == "Errb") &
(dfp.example_idx == example_idx) &
(dfp.pattern_start > 400) &
(dfp.pattern_end < 600)]
# Seqlet coordinates shifted by -400 to match the xlim-cropped tracks
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand)
for i,row in dfp_subset_idx.iterrows()]
# Contribution + hypothetical-contribution ("h/") tracks with hits overlaid
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx),
**get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]),
seqlets, rotate_y=0, legend=True);
dfp_subset_idx[['tf', 'pattern_center', 'strand', 'match_weighted', 'imp_weighted', 'score_seq_match']].sort_values('pattern_center')
# Plot each trimmed pattern and its reverse complement with the sequence
# logo and per-task contribution tracks.
for tf, pattern_name in pattern_names:
    p = mr.get_pattern(pattern_name).trim_seq_ic(0.08)
    # Forward orientation, labelled "+<tf>"
    p.name = "+" + tf
    p.plot(['seq'] + [f'contrib/{t}' for t in tasks], rotate_y=0)
    # Reverse-complement orientation, labelled "-<tf>"
    p.name = "-" + tf
    p.rc().plot(['seq'] + [f'contrib/{t}' for t in tasks], rotate_y=0)
# Repeat the visualization for another random example
example_idx = dfp.example_idx.sample(1).iloc[0]
print(example_idx)
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) &
(dfp.imp_cat == 'high') &
(dfp.match_cat == 'high') &
#(dfp.tf == "Errb") &
(dfp.example_idx == example_idx) &
(dfp.pattern_start > 400) &
(dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand)
for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx),
**get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]),
seqlets, rotate_y=0, legend=True);
dfp_subset_idx[['tf', 'pattern_center', 'strand', 'match_weighted', 'imp_weighted', 'score_seq_match']].sort_values('pattern_center')
# Same example, restricted to Sox2 motif hits
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) &
(dfp.imp_cat == 'high') &
(dfp.match_cat == 'high') &
(dfp.tf == "Sox2") &
(dfp.example_idx == example_idx) &
(dfp.pattern_start > 400) &
(dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand)
for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx),
**get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]),
seqlets, rotate_y=0, legend=True);
# Same example, restricted to Oct4-Sox2 motif hits
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) &
(dfp.imp_cat == 'high') &
(dfp.match_cat == 'high') &
(dfp.tf == "Oct4-Sox2") &
(dfp.example_idx == example_idx) &
(dfp.pattern_start > 400) &
(dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand)
for i,row in dfp_subset_idx.iterrows()]
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx),
**get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]),
seqlets, rotate_y=0, legend=True);
# Same example, restricted to Nanog motif hits
dfp_subset_idx = dfp[#(np.log(1+dfp.imp_max) > 2) &
(dfp.imp_cat == 'high') &
(dfp.match_cat == 'high') &
(dfp.tf == "Nanog") &
(dfp.example_idx == example_idx) &
(dfp.pattern_start > 400) &
(dfp.pattern_end < 600)]
seqlets = [Seqlet(row.example_idx, row.pattern_start-400, row.pattern_end-400, row.tf, row.strand)
for i,row in dfp_subset_idx.iterrows()]
# Profile + contribution tracks
# NOTE(review): seqlets are built above but not passed to this first plot —
# confirm whether the hit overlay was intended here as well
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx),
**get_dataset_item(profile, example_idx)}, xlim=[400, 600]),
rotate_y=0, legend=True);
# Contribution + hypothetical-contribution tracks with the Nanog hits overlaid
plot_tracks(filter_tracks({**get_dataset_item(contrib, example_idx),
**get_dataset_item(prefix_dict(hyp_contrib, "h/"), example_idx)}, xlim=[400, 600]),
seqlets, rotate_y=0, legend=True);