from collections import OrderedDict
# Experiment identifier string; also names the model directory under models_dir.
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
# Importance-score sub-path used to locate the DeepLIFT/modisco outputs per task.
imp_score = 'profile/wn'
# Motif display name -> modisco pattern id ('<task>/m<metacluster>_p<pattern>').
# Commented-out entries are patterns deliberately excluded from this analysis.
motifs = OrderedDict([
("Oct4-Sox2", 'Oct4/m0_p0'),
# ("Oct4", 'Oct4/m0_p1'),
# ("Strange-sym-motif", 'Oct4/m0_p5'),
("Sox2", 'Sox2/m0_p1'),
("Nanog", 'Nanog/m0_p1'),
# ("Zic3", 'Nanog/m0_p2'),
# ("Nanog-partner", 'Nanog/m0_p4'),
("Klf4", 'Klf4/m0_p0'),
])
# Imports
from basepair.imports import *
from basepair.exp.paper.config import models_dir, profile_mapping, ddir
from basepair.exp.chipnexus.perturb.vdom import vdom_motif_pair, plot_spacing_hist
from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs
from basepair.exp.paper.config import tasks
from plotnine import *
import plotnine
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
model_dir = models_dir / exp
dataset_dir = model_dir / 'perturbation-analysis'
pairs = get_motif_pairs(motifs)
# ordered names
pair_names = ["<>".join(x) for x in pairs]
figures_dir = Path(f"{ddir}/figures/modisco/{exp}/in-vivo-perturb")
!mkdir -p {figures_dir}
# define the global set of distances
dist_subsets = ['center_diff<=35',
'(center_diff>35)&(center_diff<=70)',
'(center_diff>70)&(center_diff<=150)',
'center_diff>150']
dist_subset_labels = ['dist < 35',
'35 < dist <= 70',
'70 < dist <= 150',
'150 < dist',
]
from basepair.plot.utils import PlotninePalette
# choose the right color
colors = plt.get_cmap("RdBu")(np.linspace(0, 1, 9))
hex_colors = [plt.cm.colors.to_hex(a) for a in colors]
sns.palplot(colors)
cb_red = hex_colors[2]
cb_blue = hex_colors[-2 - 2]
hex_colors
from basepair.exp.chipnexus.perturb.scores import ism_compute_features_tidy, compute_features_tidy, SCORES, max_profile_count
ls {dataset_dir}
from IPython.display import FileLink
FileLink(dataset_dir / 'dfs.csv.gz')
dfabf = pd.read_csv(dataset_dir / 'dfabf.csv.gz')
dfabf = dfabf[dfabf.motif_pair.isin(pair_names)]
dfabf['center_diff_cat'] = pd.Categorical(pd.cut(dfabf['center_diff'], [0, 35, 70, 150, 1000]))
# dfs = pd.read_csv(dataset_dir / 'dfs.csv.gz')
# Store the output
dfabf_ism = pd.read_csv(dataset_dir / 'dfabf_ism.csv.gz')
dfabf_ism = dfabf_ism[dfabf_ism.motif_pair.isin(pair_names)]
# dfab = pd.read_csv(dataset_dir / 'dfab.csv.gz')
# dfab['motif_pair_cat'] = pd.Categorical(dfab.motif_pair, categories=dfab.groupby("motif_pair").size().sort_values(ascending=False).index)
# dfab['strand_combination_cat'] = pd.Categorical(dfab['strand_combination'], )
dfi_subset = pd.read_csv(dataset_dir / 'dfi_subset.csv.gz')
dfabf.motif_pair.value_counts()
dfabf.head()
from basepair.exp.paper.config import tasks
# "pattern_center_aln to dfi_subset" -- markdown heading from the original
# notebook cell; it was fused onto the code line below in the export.
# Load the modisco results for every task (one HDF5 file per task).
mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
                            for t in tasks})
from basepair.exp.paper.fig4 import cluster_align_patterns
main_motifs = [mr.get_pattern(pattern_name).add_attr('features', {'n seqlets': mr.n_seqlets(pattern_name)})
for name, pattern_name in motifs.items()]
main_motifs = [p.rename(longer_pattern(p.name)) for p in main_motifs]
main_motifs_clustered = cluster_align_patterns(main_motifs, n_clusters=1)
from basepair.modisco.pattern_instances import align_instance_center
dfi_subset = align_instance_center(dfi_subset, main_motifs, main_motifs_clustered, trim_frac=0.08)
instance_parq_paths = {t: model_dir / f'deeplift/{t}/out/{imp_score}/instances.parq'
for t in tasks}
def shorten_te_pattern(s):
    """Shorten a '<tf>/<pattern>' id by abbreviating the pattern part.

    Splits on the first '/' only, keeps the TF prefix unchanged, and
    runs the remainder through ``shorten_pattern``.
    """
    prefix, pattern = s.split("/", 1)
    return f"{prefix}/{shorten_pattern(pattern)}"
motifs_te = [p.name
for p in mr.get_all_patterns()
if p.seq_info_content > 30 and mr.n_seqlets(p.name) > 100]
motifs_te_d = OrderedDict([(shorten_te_pattern(x), shorten_te_pattern(x)) for x in motifs_te])
from basepair.modisco.pattern_instances import multiple_load_instances
# # get transposable element locations
dfi_te = multiple_load_instances(instance_parq_paths, motifs_te_d)
dfi_te = dfi_te[(dfi_te.match_weighted_p > 0.1) & (dfi_te.seq_match > 20)]
# Get rows without intersecting transposable elements
from basepair.preproc import rc_seq, dfint_no_intersection
interval_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']
non_te_idx = dfint_no_intersection(dfi_subset[interval_cols], dfi_te[interval_cols])
print(f"Not overlapping te's: {non_te_idx.mean()}")
# # annotate with profiles
# profiles = load_profiles(modisco_dir, imp_score_file)
# dfi_anno = annotate_profile(dfi, mr, profiles, profile_width=70, trim_frac=0.08)
# dfi = dfi_anno
valid_row_idx = dfi_subset[non_te_idx].row_idx.values
len(valid_row_idx) # non-te rows
# Append pattern_center_aln to dfabf.
def suffix_colnames(df, suffix):
    """Return *df* with *suffix* appended to every column name.

    Uses ``DataFrame.add_suffix`` so the input frame is left unmodified;
    the original implementation mutated ``df.columns`` in place, which
    silently alters the frame passed in. All call sites in this notebook
    use the returned frame (via ``.pipe``), so they are unaffected.
    """
    return df.add_suffix(suffix)
dfabf['row_idx_x'] = dfabf['row_idx_x'].astype(int)
dfabf['row_idx_y'] = dfabf['row_idx_y'].astype(int)
dfabf2 = pd.merge(dfabf, dfi_subset[['row_idx',
'pattern_center_aln',
'pattern_strand_aln']].pipe(suffix_colnames, '_x'),
on='row_idx_x',
how='left')
dfabf2 = pd.merge(dfabf2, dfi_subset[['row_idx',
'pattern_center_aln',
'pattern_strand_aln']].pipe(suffix_colnames, '_y'),
on='row_idx_y',
how='left')
dfabf2['pattern_diff_aln'] = dfabf2['pattern_center_aln_y'] - dfabf2['pattern_center_aln_x']
exclude_sox2 = dfabf2[(dfabf2.motif_pair == 'Oct4-Sox2<>Sox2') &
(dfabf2['pattern_diff_aln'] == 0)].row_idx_y.values
exclude_oct4 = dfabf2[(dfabf2.motif_pair == 'Oct4-Sox2<>Oct4') &
(dfabf2['pattern_diff_aln'] == 0)].row_idx_y.values
# Exclude the overlapping row
dfabf2 = dfabf2[(dfabf2.pattern_name_x != 'Oct4') | (~dfabf2.row_idx_x.isin(exclude_oct4))]
dfabf2 = dfabf2[(dfabf2.pattern_name_y != 'Oct4') | (~dfabf2.row_idx_y.isin(exclude_oct4))]
dfabf2 = dfabf2[(dfabf2.pattern_name_x != 'Sox2') | (~dfabf2.row_idx_x.isin(exclude_sox2))]
dfabf2 = dfabf2[(dfabf2.pattern_name_y != 'Sox2') | (~dfabf2.row_idx_y.isin(exclude_sox2))]
dfabf2.info()
len(dfabf2)
# Exclude TE's
dfabf2 = dfabf2[dfabf2.row_idx_x.isin(valid_row_idx) & dfabf2.row_idx_y.isin(valid_row_idx)]
dfabf2 = dfabf2.reset_index()
# dfabf2.reset_index().to_csv(dataset_dir / 'dfabf.Oct4-Sox2-subset.csv.gz', compression='gzip', index=False)
dfabf2.head()
# set categorical index
dfabf2.center_diff_cat.cat.categories = dfabf2.center_diff_cat.cat.categories.astype(str)
%time dfabf2.to_parquet(dataset_dir / 'dfabf.Oct4-Sox2-subset.parq', index=False, engine='fastparquet')
!du -sh {dataset_dir}/dfabf.Oct4-Sox2-subset.parq
%time dfabf2 = pd.read_parquet(dataset_dir / 'dfabf.Oct4-Sox2-subset.parq', engine='fastparquet')
!du -sh {dataset_dir}
dfabf_ism.head()
dfabf_ism2 = pd.merge(dfabf_ism, dfi_subset[['row_idx',
'pattern_center_aln',
'pattern_strand_aln']].pipe(suffix_colnames, '_x'),
on='row_idx_x',
how='left')
dfabf_ism2 = pd.merge(dfabf_ism2, dfi_subset[['row_idx',
'pattern_center_aln',
'pattern_strand_aln']].pipe(suffix_colnames, '_y'),
on='row_idx_y',
how='left')
dfabf_ism2['pattern_diff_aln'] = dfabf_ism2['pattern_center_aln_y'] - dfabf_ism2['pattern_center_aln_x']
exclude_sox2 = dfabf_ism2[(dfabf_ism2.motif_pair == 'Oct4-Sox2<>Sox2') &
(dfabf_ism2['pattern_diff_aln'] == 0)].row_idx_y.values
exclude_oct4 = dfabf_ism2[(dfabf_ism2.motif_pair == 'Oct4-Sox2<>Oct4') &
(dfabf_ism2['pattern_diff_aln'] == 0)].row_idx_y.values
# Exclude the overlapping row
dfabf_ism2 = dfabf_ism2[(dfabf_ism2.pattern_name_x != 'Oct4') | (~dfabf_ism2.row_idx_x.isin(exclude_oct4))]
dfabf_ism2 = dfabf_ism2[(dfabf_ism2.pattern_name_y != 'Oct4') | (~dfabf_ism2.row_idx_y.isin(exclude_oct4))]
dfabf_ism2 = dfabf_ism2[(dfabf_ism2.pattern_name_x != 'Sox2') | (~dfabf_ism2.row_idx_x.isin(exclude_sox2))]
dfabf_ism2 = dfabf_ism2[(dfabf_ism2.pattern_name_y != 'Sox2') | (~dfabf_ism2.row_idx_y.isin(exclude_sox2))]
dfs = dfabf_ism2[['Wt_obs', 'Wt', 'dA', 'dB', 'dAB', 'motif_pair',
'task', 'center_diff', 'strand_combination']]
# Then TF-A depends on A with
#   (Wt - dA) / (Wt - dAB)
# and on B with
#   (Wt - dB) / (Wt - dAB)
# dfs = dfabf_ism[['Wt_obs', 'Wt', 'dA', 'dB', 'dAB', 'motif_pair', 'task', 'center_diff', 'strand_combination']]
dfs_prox = dfs[dfs.center_diff <= 35]
dfs_dist = dfs[dfs.center_diff > 35]
dfs.head()
terms = ['Wt-dA>0', 'Wt-dB>0', 'Wt-dAB>0', 'dA-dAB>0', 'dB-dAB>0']
filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).apply(lambda x: x.eval(term).mean()).
reset_index().
rename(columns={0:'fraction'}).
assign(term=term, subset=ft))
for term in terms for ft in filter_terms], axis=0)
c = c[c.motif_pair.isin(pair_names)]
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['motif_pair'] = pd.Categorical(c['motif_pair'], categories=pair_names[::-1], ordered=True)
plotnine.options.figure_size = get_figsize(1, 0.6)
fig = (ggplot(aes(y='motif_pair', x='task', fill='fraction'), c) +
geom_tile() +
theme_classic(base_size=10, base_family='Arial') +
facet_grid("subset ~ term") +
theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1)) +
scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=.5, limits=[0, 1],
palette=PlotninePalette("RdBu", [.1, .9])) +
ggtitle("Fraction of positive terms")
)
fig.save(figures_dir / 'heatmap.positive_fraction.pdf')
fig
# "(2*Wt-dA-dB)/(Wt-dAB)" -- markdown heading from the original notebook
# cell; it was fused onto the code line below in the export.
# Only report the synergy score where this mask holds for most pairs.
mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
apply(lambda x: x.eval(term).median() if x.eval(mask).mean() > .9 else np.nan).
reset_index().
rename(columns={0:'average'}).
assign(term=term, subset=ft, subset_name=filter_names[i]))
for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) +
geom_tile(data=c.dropna()) +
# geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) +
geom_text(label='/', data=c[c.average.isnull()]) +
theme_classic(base_size=10, base_family='Arial') +
facet_grid(". ~ subset_name") +
theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1)) +
scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4],
palette=PlotninePalette("RdBu_r", [0, 1])) +
# scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.90perc.Wt-dAB>0.pdf')
fig
def get_syn_effect(c, pairs, motifs, tasks):
    r"""Pivot per-(motif_pair, task) synergy scores into a motif x motif matrix.

    Rows are ``"<motif> (task=<profile task>)"``, columns are the motif
    names. For each unordered pair (a, b) -- taken in the upper-triangle
    order of *pairs* -- the 'average' score under a's profile task fills
    row a / column b, and the score under b's task fills row b / column a.

    Parameters
    ----------
    c : DataFrame with 'motif_pair', 'task' and 'average' columns
        (one row per motif_pair/task combination).
    pairs : list of motif-name pairs, ordered like the upper triangle
        (incl. diagonal) of the motifs x motifs grid.
    motifs : iterable of motif names.
    tasks : unused; kept for backward-compatible signature.
    """
    motifs = list(motifs)
    motifxtask = [f"{motif} (task={profile_mapping[motif]})" for motif in motifs]
    dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
    k = 0
    for i in range(len(motifs)):
        for j in range(len(motifs)):
            if i > j:  # only the upper triangle (incl. diagonal) has a pair
                continue
            motif_pair = pairs[k]
            motif_pair_name = "<>".join(motif_pair)
            k += 1
            t1 = profile_mapping[motif_pair[0]]
            t2 = profile_mapping[motif_pair[1]]
            # Extract the (expected single) matching score as a scalar and
            # assign with single-step .loc[row, col]: the original chained
            # `dfp.loc[row][col] = ...` can silently write to a temporary
            # copy instead of dfp.
            v1 = c[(c.motif_pair == motif_pair_name) & (c.task == t1)]['average']
            v2 = c[(c.motif_pair == motif_pair_name) & (c.task == t2)]['average']
            dfp.loc[motif_pair[0] + f" (task={t1})", motif_pair[1]] = (
                v1.iloc[0] if len(v1) else np.nan)
            dfp.loc[motif_pair[1] + f" (task={t2})", motif_pair[0]] = (
                v2.iloc[0] if len(v2) else np.nan)
    # dfp.columns = [r"$\Delta$"+c for c in dfp.columns]
    return dfp
# subsets = ['center_diff <= 35', 'center_diff > 35', 'center_diff > 70']
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.90perc.Wt-dAB>0.main_tasks.pdf')
mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
apply(lambda x: x.eval(term).median() if x.eval(mask).mean() > .2 else np.nan).
reset_index().
rename(columns={0:'average'}).
assign(term=term, subset=ft, subset_name=filter_names[i]))
for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) +
geom_tile(data=c.dropna()) +
# geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) +
geom_text(label='/', data=c[c.average.isnull()]) +
theme_classic(base_size=10, base_family='Arial') +
facet_grid(". ~ subset_name") +
theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1)) +
scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4],
palette=PlotninePalette("RdBu_r", [0, 1])) +
# scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.20perc.Wt-dAB>0.pdf')
fig
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.20perc.Wt-dAB>0.main_tasks.pdf')
mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
apply(lambda x: x.eval(term).median()).
reset_index().
rename(columns={0:'average'}).
assign(term=term, subset=ft, subset_name=filter_names[i]))
for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) +
geom_tile(data=c.dropna()) +
# geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) +
geom_text(label='/', data=c[c.average.isnull()]) +
theme_classic(base_size=10, base_family='Arial') +
facet_grid(". ~ subset_name") +
theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1)) +
scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4],
palette=PlotninePalette("RdBu_r", [0, 1])) +
# scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.all.pdf')
fig
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.all.main_tasks.pdf')
from basepair.exp.paper.config import profile_mapping
dfabf_plot = dfabf2.query('center_diff > 5')#.query('center_diff < 150')
score = 'max_profile_count_bt'
# -----------------------------------
fig, axes = plt.subplots(len(motifs), len(motifs),
figsize=get_figsize(.65, 1),
gridspec_kw=dict(wspace=0, hspace=0),
sharex=True, sharey=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf_plot.center_diff_cat.cat.categories)))
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
ax = axes[i,j]
if i>j:
ax.axison = False
continue
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
# t1 = 'Nanog'
# t2 = 'Nanog'
# motif mask
mmask = (dfabf_plot.motif_pair == motif_pair_name) & (dfabf_plot.score == score)
ab = pd.merge(dfabf_plot[mmask & (dfabf_plot.task == t1)][[idx, 'x_alt_ref_pc', 'x_ref', cat]],
dfabf_plot[mmask & (dfabf_plot.task == t2)][[idx, 'y_alt_ref_pc', 'y_ref']],
on=idx)
# make a scatterplot
# Subset ab to only the highest values
ab = ab[(ab.x_ref > np.percentile(ab.x_ref, 80)) & (ab.y_ref > np.percentile(ab.y_ref, 80))]
for ci, (label,df) in enumerate(list(ab.groupby(cat))):
ax.scatter(1/df.y_alt_ref_pc,
1/df.x_alt_ref_pc, # first one is on the y axis
alpha=0.2, s=1, label=label, color=colors[ci], rasterized=True)
# ax.text(0, 0,t1+"-"+t2)
alpha = .5
xl = [1/3, 3]
ax.plot(xl, xl, c='grey', alpha=alpha)
ax.axvline(1, c='grey', alpha=alpha)
ax.axhline(1, c='grey', alpha=alpha)
ax.set_xlim(xl)
ax.set_ylim(xl)
ax.set_yscale('log')
ax.set_xscale('log')
# explicitly add ticks
ax.set_yticks([1/3, 1, 3])
ax.set_xticks([1/3, 1, 3])
ax.set_xticklabels(['1/3', '1', '3'])
ax.set_yticklabels(['1/3', '1', '3'])
ax.minorticks_off() # turns off minor ticks
if i == j:
ax.set_ylabel(motif_pair[1])
ax.set_xlabel(motif_pair[0])
# Use grey spines
plt.setp(ax.spines.values(), color='grey')
plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
# BUG FIX: 'bottom right' is not a valid matplotlib legend location
# (valid: 'lower right'); the bbox_to_anchor still places the legend.
plt.legend(loc="lower right", bbox_to_anchor=(-.2, .7),
           scatterpoints=1, markerscale=10, columnspacing=0,
           handletextpad=0, borderpad=0, frameon=False, title="Pairwise distance")
fig.savefig(figures_dir / "max_profile_count.upper-triag.no-Oct4.width=0.65.center_diff>5.pdf")
import matplotlib.cm as cm
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf.center_diff_cat.cat.categories)))
hex_colors = [plt.cm.colors.to_hex(a) for a in colors]
# use pair names
dfabf['motif_pair'] = pd.Categorical(dfabf['motif_pair'], pair_names)
from plotnine import *
hex_colors
dfabf['task'] = pd.Categorical(dfabf['task'])
print(dfabf[dfabf.score == 'max_profile_count']['center_diff_cat'].value_counts().to_string())
def get_effect(dfabf, pairs, motifs, agg_fn=np.mean, score='max_profile_count'):
    r"""Summarize the directional perturbation effect for every motif pair.

    For each pair (a, b) -- taken in the upper-triangle order of *pairs* --
    rows matching the pair and *score* are merged on 'motif_pair_idx'
    across the two motifs' profile tasks, restricted to the strongest 20%
    of sites on both axes (x_ref / y_ref above their 80th percentile),
    and the alt/ref ratios are aggregated with *agg_fn*.

    Returns a DataFrame with rows ``"<motif> (task=...)"`` and columns
    ``"$\Delta$<motif>"``.
    """
    motifs = list(motifs)
    motifxtask = [f"{motif} (task={profile_mapping[motif]})" for motif in motifs]
    dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
    idx = 'motif_pair_idx'
    cat = 'center_diff_cat'
    k = 0
    for i in range(len(motifs)):
        for j in range(len(motifs)):
            if i > j:  # only the upper triangle (incl. diagonal) has a pair
                continue
            motif_pair = pairs[k]
            motif_pair_name = "<>".join(motif_pair)
            k += 1
            t1 = profile_mapping[motif_pair[0]]
            t2 = profile_mapping[motif_pair[1]]
            # motif mask
            mmask = (dfabf.motif_pair == motif_pair_name) & (dfabf.score == score)
            ab = pd.merge(dfabf[mmask & (dfabf.task == t1)][[idx, 'x_alt_ref_pc', 'x_ref', cat]],
                          dfabf[mmask & (dfabf.task == t2)][[idx, 'y_alt_ref_pc', 'y_ref']],
                          on=idx)
            # keep only the strongest 20% of sites on both axes
            ab = ab[(ab.x_ref > np.percentile(ab.x_ref, 80)) &
                    (ab.y_ref > np.percentile(ab.y_ref, 80))]
            # Single-step .loc[row, col] assignment: the original chained
            # `dfp.loc[row][col] = ...` can silently write to a temporary
            # copy instead of dfp.
            dfp.loc[motif_pair[0] + f" (task={t1})", motif_pair[1]] = agg_fn(ab.x_alt_ref_pc)
            dfp.loc[motif_pair[1] + f" (task={t2})", motif_pair[0]] = agg_fn(ab.y_alt_ref_pc)
    dfp.columns = [r"$\Delta$" + c for c in dfp.columns]
    return dfp
dfabf_plot = dfabf2.query('center_diff > 5')
# (AB - (B - None)) / A
pairs_no_oct4 = [p for p in pairs if p[0] != 'Oct4' and p[1] != 'Oct4']
motifs_no_oct4 = [m for m in motifs if m != 'Oct4']
motifs_no_oct4
dfabf_plot.strand_combination.unique()
subsets = dist_subsets
fig,axes = plt.subplots(4, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)*4),
sharex=True,
sharey=True)
for irow, sc in enumerate(['++', '+-', '-+', '--']):
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes[irow], dist_subset_labels)):
try:
sns.heatmap(get_effect(dfabf_plot[dfabf_plot.strand_combination == sc].query(subset), pairs_no_oct4, motifs_no_oct4,
agg_fn=np.median, score='max_profile_count_bt'),
cmap='RdBu', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
except:
pass
if i == 0:
ax.set_title(sc + " " + subset_label)
else:
ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.top-profile.bleed-through-corrected.strand-specific.pdf')
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_effect(dfabf_plot.query(subset), pairs_no_oct4, motifs_no_oct4,
agg_fn=np.median, score='max_profile_count_bt'),
cmap='RdBu', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.top-profile.bleed-through-corrected.pdf')
# Directional-effect heatmaps per distance subset.
# BUG FIX (all three figures below): get_effect requires `motifs` as its
# third positional argument; the original calls omitted it and would
# raise TypeError.
# 1) median effect, default score ('max_profile_count')
subsets = dist_subsets
fig, axes = plt.subplots(1, len(subsets),
                         figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
                         sharey=True)
for i, (subset, ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect(dfabf_plot.query(subset), pairs, motifs, agg_fn=np.median),
                cmap='RdBu', center=1, ax=ax,
                cbar=i == len(subsets)-1,  # colorbar only on the last panel
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.pdf')
# (AB - (B - None)) / A   <- markdown note leaked from the notebook cell
# 2) mean effect, bleed-through-corrected score
subsets = dist_subsets
fig, axes = plt.subplots(1, len(subsets),
                         figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
                         sharey=True)
for i, (subset, ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect(dfabf_plot.query(subset), pairs, motifs,
                           agg_fn=np.mean, score='max_profile_count_bt'),
                cmap='RdBu', center=1, ax=ax,
                cbar=i == len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.mean_max_profile_count.directional_effect.bleed-through-corrected.pdf')
# 3) mean effect, uncorrected score
subsets = dist_subsets
fig, axes = plt.subplots(1, len(subsets),
                         figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
                         sharey=True)
for i, (subset, ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect(dfabf_plot.query(subset), pairs, motifs,
                           agg_fn=np.mean, score='max_profile_count'),
                cmap='RdBu', center=1, ax=ax,
                cbar=i == len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.mean_max_profile_count.directional_effect.pdf')
motifxtask = [f"{motif} (task={task})" for motif in motifs for task in tasks]
def get_effect_all_tasks(dfabf, pairs, tasks, agg_fn=np.mean):
    r"""Like ``get_effect`` but aggregates every motif pair under every task.

    Rows are ``"<motif> (task=<task>)"`` for all motif x task combinations
    (using the module-level ``motifs``), columns ``"$\Delta$<motif>"``.
    The score is fixed to 'max_profile_count' and no x_ref/y_ref
    thresholding is applied.
    """
    # NOTE(review): intentionally reads the module-level `motifs`, as the
    # original did -- confirm this is the desired coupling.
    motifxtask = [f"{motif} (task={task})" for motif in motifs for task in tasks]
    dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
    for task in tasks:
        k = 0  # pair index restarts for every task
        for i in range(len(motifs)):
            for j in range(len(motifs)):
                if i > j:  # only the upper triangle (incl. diagonal)
                    continue
                motif_pair = pairs[k]
                motif_pair_name = "<>".join(motif_pair)
                k += 1
                ab = dfabf[(dfabf.motif_pair == motif_pair_name) &
                           (dfabf.score == 'max_profile_count') &
                           (dfabf.task == task)]
                # Single-step .loc[row, col] assignment: the original
                # chained `dfp.loc[row][col] = ...` can silently write to
                # a temporary copy instead of dfp.
                dfp.loc[motif_pair[0] + f" (task={task})", motif_pair[1]] = agg_fn(ab.x_alt_ref_pc)
                dfp.loc[motif_pair[1] + f" (task={task})", motif_pair[0]] = agg_fn(ab.y_alt_ref_pc)
    dfp.columns = [r"$\Delta$" + c for c in dfp.columns]
    return dfp
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)*4),
sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_effect_all_tasks(dfabf_plot.query(subset), pairs, tasks=tasks, agg_fn=np.median), cmap='RdBu', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.all-tasks.pdf')
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)*4),
sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
sns.heatmap(get_effect_all_tasks(dfabf.query(subset), pairs, tasks=tasks, agg_fn=np.mean), cmap='RdBu', center=1, ax=ax,
cbar=i==len(subsets)-1,
vmin=.7, vmax=1.3)
ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.mean_max_profile_count.directional_effect.all-tasks.pdf')
# For the scatter plots (directional interactions), generate png and pdf
# files for each interaction so they can be used directly.
# - The combined pdfs are too big and also too confusing to show on a
#   slide or figure.
dfabf_plot = dfabf2.query('center_diff > 5')
odir = figures_dir / "individual/max_profile_count"
odir.mkdir(parents=True, exist_ok=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf_plot.center_diff_cat.cat.categories)))
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
if i>j:
continue
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.65 / 3, 1))
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
# t1 = 'Nanog'
# t2 = 'Nanog'
# motif mask
mmask = (dfabf_plot.motif_pair == motif_pair_name) & (dfabf_plot.score == 'max_profile_count')
ab = pd.merge(dfabf_plot[mmask & (dfabf_plot.task == t1)][[idx, 'x_alt_ref_pc', cat]],
dfabf_plot[mmask & (dfabf_plot.task == t2)][[idx, 'y_alt_ref_pc']],
on=idx)
# make a scatterplot
for ci, (label,df) in enumerate(list(ab.groupby(cat))):
ax.scatter(df.y_alt_ref_pc,
df.x_alt_ref_pc, # first one is on the y axis
alpha=0.2, s=1, label=label, color=colors[ci])
# ax.text(0, 0,t1+"-"+t2)
alpha = .5
xl = [0, 2]
ax.plot(xl, xl, c='grey', alpha=alpha)
ax.axvline(1, c='grey', alpha=alpha)
ax.axhline(1, c='grey', alpha=alpha)
ax.set_xlim(xl)
ax.set_ylim(xl)
ax.set_xlabel(r"{} ($\Delta$ {})".format(motif_pair[1], motif_pair[0]))
ax.set_ylabel(r"{} ($\Delta$ {})".format(motif_pair[0], motif_pair[1]))
# Use grey spines
plt.setp(ax.spines.values(), color='grey')
plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
# BUG FIX: `raster=True` is not a Figure.savefig keyword (old matplotlib
# silently ignored it; modern versions raise). Rasterization is set
# per-artist via `rasterized=True` if needed.
fig.savefig(odir / f'{motif_pair_name}.png')
fig.savefig(odir / f'{motif_pair_name}.pdf')
dfabf_plot = dfabf2.query('center_diff > 5')
odir = figures_dir / "individual/max_profile_count/all_tasks"
odir.mkdir(parents=True, exist_ok=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf_plot.center_diff_cat.cat.categories)))
for task in tasks:
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
if i>j:
continue
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.65 / 3, 1))
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
ab = dfabf_plot[(dfabf_plot.motif_pair == motif_pair_name) &
(dfabf_plot.score == 'max_profile_count') &
(dfabf_plot.task == task)]
# make a scatterplot
for ci, (label,df) in enumerate(list(ab.groupby(cat))):
ax.scatter(df.y_alt_ref_pc,
df.x_alt_ref_pc, # first one is on the y axis
alpha=0.2, s=1, label=label, color=colors[ci])
# ax.text(0, 0,t1+"-"+t2)
alpha = .5
xl = [0, 2]
ax.plot(xl, xl, c='grey', alpha=alpha)
ax.axvline(1, c='grey', alpha=alpha)
ax.axhline(1, c='grey', alpha=alpha)
ax.set_xlim(xl)
ax.set_ylim(xl)
ax.set_xlabel(r"{} ($\Delta$ {})".format(motif_pair[1], motif_pair[0]))
ax.set_ylabel(r"{} ($\Delta$ {})".format(motif_pair[0], motif_pair[1]))
ax.set_title(task)
# Use grey spines
plt.setp(ax.spines.values(), color='grey')
plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
# BUG FIX: `raster=True` is not a Figure.savefig keyword (old matplotlib
# silently ignored it; modern versions raise). Rasterization is set
# per-artist via `rasterized=True` if needed.
fig.savefig(odir / f'{motif_pair_name},task={task}.png')
fig.savefig(odir / f'{motif_pair_name},task={task}.pdf')
dfabf_plot = dfabf2.query('center_diff > 5')
score = 'max_profile_count_bt'
odir = figures_dir / "individual/max_profile_count/all_tasks"
odir.mkdir(parents=True, exist_ok=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf_plot.center_diff_cat.cat.categories)))
for task in tasks:
k = 0
for i in range(len(motifs)):
for j in range(len(motifs)):
if i>j:
continue
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.65 / 3, 1))
idx = 'motif_pair_idx'
cat = 'center_diff_cat'
motif_pair = pairs[k]
motif_pair_name = "<>".join(motif_pair)
k+=1
t1 = profile_mapping[motif_pair[0]]
t2 = profile_mapping[motif_pair[1]]
ab = dfabf_plot[(dfabf_plot.motif_pair == motif_pair_name) &
(dfabf_plot.score == score) &
(dfabf_plot.task == task)]
# make a scatterplot
for ci, (label,df) in enumerate(list(ab.groupby(cat))):
ax.scatter(df.y_alt_ref_pc,
df.x_alt_ref_pc, # first one is on the y axis
alpha=0.2, s=1, label=label, color=colors[ci])
# ax.text(0, 0,t1+"-"+t2)
alpha = .5
xl = [0, 2]
ax.plot(xl, xl, c='grey', alpha=alpha)
ax.axvline(1, c='grey', alpha=alpha)
ax.axhline(1, c='grey', alpha=alpha)
ax.set_xlim(xl)
ax.set_ylim(xl)
ax.set_xlabel(r"{} ($\Delta$ {})".format(motif_pair[1], motif_pair[0]))
ax.set_ylabel(r"{} ($\Delta$ {})".format(motif_pair[0], motif_pair[1]))
ax.set_title(task)
# Use grey spines
plt.setp(ax.spines.values(), color='grey')
plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
# BUG FIX: `raster=True` is not a Figure.savefig keyword (old matplotlib
# silently ignored it; modern versions raise). Rasterization is set
# per-artist via `rasterized=True` if needed.
fig.savefig(odir / f'{motif_pair_name},task={task}.bt-corrected.png')
fig.savefig(odir / f'{motif_pair_name},task={task}.bt-corrected.pdf')
dfabf_plot = dfabf2.query('center_diff > 5')
dfabf_plot.motif_pair.unique()
dfp = dfabf_plot.query("motif_pair == 'Oct4-Sox2<>Sox2'").query("score == 'max_profile_count_bt'").query("task == 'Sox2'")
dfp.shape
(ggplot(aes(x='center_diff'), data=dfp) +
geom_histogram() +
theme_bw()
)
# .query('match_weighted_p > .2').query('imp_weighted_p > .01')
(ggplot(aes(x='center_diff', fill='strand_combination'), data=dfp[dfp.center_diff < 150]) +
geom_histogram(breaks=np.arange(150)) +
theme_bw() +
facet_grid("strand_combination ~ .") +
scale_fill_brewer(type='qual', palette=3)
)
dfabf_plot.match_weighted_p_x.min()
dfabf_plot.match_weighted_p_y.min()
dfabf_plot.imp_weighted_p_x.min()
dfabf_plot.imp_weighted_p_y.min()
(ggplot(aes(x='center_diff', fill='strand_combination'), data=(dfabf_plot.
query("motif_pair == 'Nanog<>Nanog'").
query("score == 'max_profile_count_bt'").
query("task == 'Nanog'").
query("center_diff < 150"))) +
geom_histogram(breaks=np.arange(150)) +
theme_bw() +
facet_grid("strand_combination ~ .") +
scale_fill_brewer(type='qual', palette=3)
)
dfs = (dfabf_plot.query("motif_pair == 'Oct4-Sox2<>Sox2'").
query("score == 'max_profile_count_bt'").
query("task == 'Sox2'").
query("center_diff < 150"))
fig, axes = plt.subplots(2, 1, figsize=get_figsize(.5, 1/2), sharex=True)
axes[0].hist(dfs.center_diff, bins=np.arange(150))
df = dfs.groupby('center_diff').y_alt_ref_pc.median().reset_index()
axes[1].plot(df.center_diff, 1/df.y_alt_ref_pc);
axes[1].scatter(df.center_diff, 1/df.y_alt_ref_pc, s=2);
fig.subplots_adjust(hspace=0)
dfs = (dfabf_plot.query("motif_pair == 'Oct4-Sox2<>Nanog'").
query("score == 'max_profile_count_bt'").
query("task == 'Nanog'").
query("center_diff < 150"))
fig, axes = plt.subplots(2, 1, figsize=get_figsize(.5, 1/2), sharex=True)
axes[0].hist(dfs.center_diff, bins=np.arange(150))
df = dfs.groupby('center_diff').y_alt_ref_pc.median().reset_index()
axes[1].plot(df.center_diff, 1/df.y_alt_ref_pc);
axes[1].scatter(df.center_diff, 1/df.y_alt_ref_pc, s=2);
fig.subplots_adjust(hspace=0)
dfs = (dfabf_plot.query("motif_pair == 'Nanog<>Nanog'").
query("score == 'max_profile_count_bt'").
query("task == 'Nanog'").
query("center_diff < 150"))
fig, axes = plt.subplots(2, 1, figsize=get_figsize(.5, 1/2), sharex=True)
axes[0].hist(dfs.center_diff, bins=np.arange(150))
df = dfs.groupby('center_diff').y_alt_ref_pc.median().reset_index()
axes[1].plot(df.center_diff, 1/df.y_alt_ref_pc);
axes[1].scatter(df.center_diff, 1/df.y_alt_ref_pc, s=2);
axes[1].axhline(y=1, color='grey', alpha=0.5)
fig.subplots_adjust(hspace=0)
dfs = (dfabf_plot.query("motif_pair == 'Sox2<>Nanog'").
query("score == 'max_profile_count_bt'").
query("task == 'Nanog'").
query("center_diff < 150"))
fig, axes = plt.subplots(2, 1, figsize=get_figsize(1, 1/2), sharex=True)
axes[0].hist(dfs.center_diff, bins=np.arange(150))
df = dfs.groupby('center_diff').y_alt_ref_pc.median().reset_index()
axes[1].plot(df.center_diff, 1/df.y_alt_ref_pc);
axes[1].scatter(df.center_diff, 1/df.y_alt_ref_pc, s=2);
axes[1].axhline(y=1, color='grey', alpha=0.5)
fig.subplots_adjust(hspace=0)
# Y
plt.figure()
h, _, _, _ = plt.hist2d(dfabf_plot.match_weighted_p_x, dfabf_plot.imp_weighted_p_x)
cb = plt.colorbar()
cb.set_label('counts in bin')
plt.xlabel("Match")
plt.ylabel("imp")
# plt.figure()
# # Y
# h, _, _, _ = plt.hist2d(dfabf_plot.match_weighted_p_y, dfabf_plot.imp_weighted_p_y)
# cb = plt.colorbar()
# cb.set_label('counts in bin')
# plt.xlabel("Match")
# plt.ylabel("imp")
# Y
plt.figure()
nn = ~dfabf_plot.seq_match_p_x.isna()
h, _, _, _ = plt.hist2d(dfabf_plot.match_weighted_p_x[nn], dfabf_plot.seq_match_p_x[nn])
cb = plt.colorbar()
cb.set_label('counts in bin')
plt.xlabel("Match")
plt.ylabel("imp")
# plt.figure()
# # Y
# h, _, _, _ = plt.hist2d(dfabf_plot.match_weighted_p_y, dfabf_plot.imp_weighted_p_y)
# cb = plt.colorbar()
# cb.set_label('counts in bin')
# plt.xlabel("Match")
# plt.ylabel("imp")
values, buckets, _ = plt.hist(np.random.randn(100))
dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median'])
df1
motif_pair = ['Oct4-Sox2', 'Sox2']
motif_pair_name = '<>'.join(motif_pair)
score = 'max_profile_count_bt'
dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
query(f"score == '{score}'").
query("center_diff > 6").
query("center_diff < 150"))
dfs[dfs.task == 'Sox2'].y_ref.plot.hist(bins=100)
np.percentile(dfs[dfs.task == 'Sox2'].y_ref, 90)
np.sum(dfs[dfs.task == 'Sox2'].y_ref > .25)
dfs[dfs.task == 'Sox2'].x_ref.plot.hist(bins=100)
np.percentile(dfs[dfs.task == 'Sox2'].x_ref, 90)
np.sum(dfs.x_ref > 1)
np.sum((dfs.x_ref > .5) & (dfs.y_ref > .5))
np.sum(dfs.y_ref > 1)
dfs.head()
dfabf_plot.head()
df1['median'].shape
len(dfs)
len(df1)
dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
dfs.task.value_counts()
dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
# Single-pair figure (motif_pair = ['Oct4-Sox2', 'Sox2'] from the cell above):
# row 0 = pairwise-distance histogram; row 1 = 1/median of x/y_alt_ref_pc vs.
# distance, keeping only instances with both x_ref and y_ref above .25.
fig, axes = plt.subplots(2, 1, figsize=get_figsize(.5, 1), sharex=True)
for j, score in enumerate(['max_profile_count_bt']):
    motif_pair_name = '<>'.join(motif_pair)
    dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
           query(f"score == '{score}'").
           query("center_diff > 6").
           query("center_diff < 150"))
    # Require a minimum reference importance on BOTH motifs of the pair.
    dfs = dfs[(dfs.x_ref > .25) & (dfs.y_ref > .25)]
    # dfs = dfs[(dfs.x_ref > .25)]
    if j == 0:
        # Distance histogram does not depend on `score`; drawn once.
        values, buckets, _ = axes[0].hist(dfs.center_diff, bins=np.arange(150))
    t1 = profile_mapping[motif_pair[0]]
    t2 = profile_mapping[motif_pair[1]]
    # Per-distance group size / median / MAD of the alt-vs-ref change.
    df1 = dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
    #df1 = df1[df1['size'] > 20]
    df2 = dfs[dfs.task == t2].groupby('center_diff').y_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
    #df2 = df2[df2['size'] > 20]
    # Plot #1
    axes[j+1].plot(df1.center_diff, 1/df1['median'], label=motif_pair[0]);
    # axes[j+1].fill_between(df1.center_diff,
    #                        1/(df1['median']+df1['mad']/np.sqrt(df1['size'])),
    #                        1/(df1['median']-df1['mad']/np.sqrt(df1['size'])), alpha=0.1);
    axes[j+1].scatter(df1.center_diff, 1/df1['median'], s=2);
    # Plot #2
    axes[j+1].plot(df2.center_diff, 1/df2['median'], label=motif_pair[1]);
    # axes[j+1].fill_between(df2.center_diff,
    #                        1/(df2['median']+df2['mad']/np.sqrt(df2['size'])),
    #                        1/(df2['median']-df2['mad']/np.sqrt(df2['size'])), alpha=0.1);
    axes[j+1].scatter(df2.center_diff, 1/df2['median'], s=2);
    axes[j+1].set_ylim([.8, 4])
    axes[j+1].axhline(y=1, color='grey', alpha=0.5)
    axes[j+1].set_ylabel(score, rotation=0, ha='right')
plt.legend(labels=motif_pair)
fig.subplots_adjust(hspace=0)
np.percentile([1, 2,3 , 5], 40)
# Leftover interactive binding; immediately rebound by the loop below.
motif_pair = ['Oct4-Sox2', 'Oct4']
for motif_pair in pairs:
    # One figure per motif pair: row 0 = pairwise-distance histogram;
    # rows 1-2 = 1/median of the alt-vs-ref change per distance, one row per
    # score, restricted to instances whose reference importance exceeds its
    # 80th percentile.
    fig, axes = plt.subplots(3, 1, figsize=get_figsize(.5, 1), sharex=True)
    for j, score in enumerate(['max_profile_count_bt', 'max_profile_count']):
        motif_pair_name = '<>'.join(motif_pair)
        dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
               query(f"score == '{score}'").
               query("center_diff > 6").
               query("center_diff < 150"))
        if j == 0:
            # The distance histogram does not depend on `score`; draw it once.
            values, buckets, _ = axes[0].hist(dfs.center_diff, bins=np.arange(150))
        t1 = profile_mapping[motif_pair[0]]
        x_ref_threshold = np.percentile(dfs[dfs.task == t1].x_ref, 80)
        t2 = profile_mapping[motif_pair[1]]
        y_ref_threshold = np.percentile(dfs[dfs.task == t2].y_ref, 80)
        df1 = (dfs[(dfs.task == t1) & (dfs.x_ref > x_ref_threshold)]
               .groupby('center_diff').x_alt_ref_pc
               .agg(['size', 'median', 'mad'])
               .reset_index())
        df2 = (dfs[(dfs.task == t2) & (dfs.y_ref > y_ref_threshold)]
               .groupby('center_diff').y_alt_ref_pc
               .agg(['size', 'median', 'mad'])
               .reset_index())
        ax = axes[j + 1]
        for curve, curve_label in ((df1, motif_pair[0]), (df2, motif_pair[1])):
            ax.plot(curve.center_diff, 1 / curve['median'], label=curve_label)
            ax.scatter(curve.center_diff, 1 / curve['median'], s=2)
        ax.set_ylim([0, 3])
        ax.axhline(y=1, color='grey', alpha=0.5)
        ax.set_ylabel(score, rotation=0, ha='right')
    plt.legend(labels=motif_pair)
    fig.subplots_adjust(hspace=0)
# Leftover interactive binding; immediately rebound by the loop below.
motif_pair = ['Oct4-Sox2', 'Oct4']
for motif_pair in pairs:
    # Same figure as above but without the importance threshold; instead,
    # distances supported by <= 20 instances are dropped as too noisy.
    fig, axes = plt.subplots(3, 1, figsize=get_figsize(.5, 1), sharex=True)
    for j, score in enumerate(['max_profile_count_bt', 'max_profile_count']):
        motif_pair_name = '<>'.join(motif_pair)
        dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
               query(f"score == '{score}'").
               query("center_diff > 6").
               query("center_diff < 150"))
        if j == 0:
            # Distance histogram is score-independent; draw it once.
            values, buckets, _ = axes[0].hist(dfs.center_diff, bins=np.arange(150))
        t1 = profile_mapping[motif_pair[0]]
        t2 = profile_mapping[motif_pair[1]]
        df1 = (dfs[dfs.task == t1]
               .groupby('center_diff').x_alt_ref_pc
               .agg(['size', 'median', 'mad'])
               .reset_index())
        df1 = df1.loc[df1['size'] > 20]
        df2 = (dfs[dfs.task == t2]
               .groupby('center_diff').y_alt_ref_pc
               .agg(['size', 'median', 'mad'])
               .reset_index())
        df2 = df2.loc[df2['size'] > 20]
        ax = axes[j + 1]
        for curve, curve_label in ((df1, motif_pair[0]), (df2, motif_pair[1])):
            ax.plot(curve.center_diff, 1 / curve['median'], label=curve_label)
            ax.scatter(curve.center_diff, 1 / curve['median'], s=2)
        ax.set_ylim([0, 2])
        ax.axhline(y=1, color='grey', alpha=0.5)
        ax.set_ylabel(score, rotation=0, ha='right')
    plt.legend(labels=motif_pair)
    fig.subplots_adjust(hspace=0)
# NOTE(review): this cell is byte-for-byte identical to the previous one
# (distance histogram + per-score 1/median effect for every motif pair, with
# the size > 20 filter) — likely an accidental copy/re-run; consider deleting
# one of the two.
motif_pair = ['Oct4-Sox2', 'Oct4']
for motif_pair in pairs:
    fig, axes = plt.subplots(3, 1, figsize=get_figsize(.5, 1), sharex=True)
    for j, score in enumerate(['max_profile_count_bt', 'max_profile_count']):
        motif_pair_name = '<>'.join(motif_pair)
        dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
               query(f"score == '{score}'").
               query("center_diff > 6").
               query("center_diff < 150"))
        if j == 0:
            values, buckets, _ = axes[0].hist(dfs.center_diff, bins=np.arange(150))
        t1 = profile_mapping[motif_pair[0]]
        t2 = profile_mapping[motif_pair[1]]
        df1 = dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
        df1 = df1[df1['size'] > 20]
        df2 = dfs[dfs.task == t2].groupby('center_diff').y_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
        df2 = df2[df2['size'] > 20]
        # Plot #1
        axes[j+1].plot(df1.center_diff, 1/df1['median'], label=motif_pair[0]);
        # axes[j+1].fill_between(df1.center_diff,
        #                        1/(df1['median']+df1['mad']/np.sqrt(df1['size'])),
        #                        1/(df1['median']-df1['mad']/np.sqrt(df1['size'])), alpha=0.1);
        axes[j+1].scatter(df1.center_diff, 1/df1['median'], s=2);
        # Plot #2
        axes[j+1].plot(df2.center_diff, 1/df2['median'], label=motif_pair[1]);
        # axes[j+1].fill_between(df2.center_diff,
        #                        1/(df2['median']+df2['mad']/np.sqrt(df2['size'])),
        #                        1/(df2['median']-df2['mad']/np.sqrt(df2['size'])), alpha=0.1);
        axes[j+1].scatter(df2.center_diff, 1/df2['median'], s=2);
        axes[j+1].set_ylim([0, 2])
        axes[j+1].axhline(y=1, color='grey', alpha=0.5)
        axes[j+1].set_ylabel(score, rotation=0, ha='right')
    plt.legend(labels=motif_pair)
    fig.subplots_adjust(hspace=0)
# Display the available strand combinations (notebook display only).
dfabf_plot.strand_combination
from basepair.exp.paper.config import tf_colors
# Paper figure: short-range interactions among Nanog/Sox2 pairs. One panel per
# motif pair; each panel shows 1/median of *_alt_ref_pc vs. motif distance,
# with distances supported by <= 20 instances dropped.
# for strand_combination in ['++', '-+', '+-', '--']:
fig, axes = plt.subplots(3, 1, figsize=get_figsize(.25, 2),
                         sharex=True,
                         sharey=True,
                         gridspec_kw=dict(wspace=0, hspace=0))
# motif_pair = ['Oct4-Sox2', 'Sox2']
for i, motif_pair in enumerate([['Nanog', 'Nanog'],
                                ['Sox2', 'Nanog'],
                                ['Sox2', 'Sox2']]):
    motif_pair_name = '<>'.join(motif_pair)
    score = 'max_profile_count_bt'
    dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
           query(f"score == '{score}'").
           query("center_diff > 6").
           # query(f"strand_combination == '{strand_combination}'").
           query("center_diff < 150"))
    t1 = profile_mapping[motif_pair[0]]
    t2 = profile_mapping[motif_pair[1]]
    df1 = dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
    df1 = df1[df1['size'] > 20]
    df2 = dfs[dfs.task == t2].groupby('center_diff').y_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
    df2 = df2[df2['size'] > 20]
    ax = axes[i]
    # Colour each curve by the TF of the task it was measured on.
    ax.plot(df1.center_diff, 1/df1['median'],
            label=motif_pair[0],
            color=tf_colors[profile_mapping[motif_pair[0]]])
    ax.plot(df2.center_diff, 1/df2['median'],
            label=motif_pair[1],
            color=tf_colors[profile_mapping[motif_pair[1]]])
    if i == 0:
        # FIX: was an f-string with no placeholders (ruff F541).
        ax.set_title("Short-range interactions")
    ax.axhline(1, color='grey', alpha=0.5)
    if i == 1:
        ax.set_ylabel("Max profile value")
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.set_ylim([0.8, 2.5])
    ax.spines['right'].set_visible(False)
axes[-1].set_xlabel("Motif distance")
axes[-1].set_xticks([10, 50, 100, 150])
# FIX: was an f-string with no placeholders (ruff F541).
fig.savefig(figures_dir / 'short-range.Nanog<>Sox2.strand_combination=all.pdf')
# Companion figure: the Oct4-Sox2 composite paired with each of the other
# motifs, same construction as the Nanog/Sox2 figure above.
fig, axes = plt.subplots(3, 1, figsize=get_figsize(.25, 2),
                         sharex=True,
                         sharey=True,
                         gridspec_kw=dict(wspace=0, hspace=0))
for i, motif_pair in enumerate([['Oct4-Sox2', 'Sox2'],
                                ['Oct4-Sox2', 'Nanog'],
                                ['Oct4-Sox2', 'Klf4']]):
    motif_pair_name = '<>'.join(motif_pair)
    score = 'max_profile_count_bt'
    dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
           query(f"score == '{score}'").
           query("center_diff > 6").
           # query("strand_combination == '++'").
           query("center_diff < 150"))
    t1 = profile_mapping[motif_pair[0]]
    t2 = profile_mapping[motif_pair[1]]
    df1 = (dfs[dfs.task == t1]
           .groupby('center_diff').x_alt_ref_pc
           .agg(['size', 'median', 'mad'])
           .reset_index())
    df1 = df1.loc[df1['size'] > 20]  # drop sparsely supported distances
    df2 = (dfs[dfs.task == t2]
           .groupby('center_diff').y_alt_ref_pc
           .agg(['size', 'median', 'mad'])
           .reset_index())
    df2 = df2.loc[df2['size'] > 20]
    ax = axes[i]
    for curve, motif in ((df1, motif_pair[0]), (df2, motif_pair[1])):
        ax.plot(curve.center_diff, 1 / curve['median'],
                label=motif,
                color=tf_colors[profile_mapping[motif]])
    if i == 0:
        # NOTE(review): title says "Short-range" while the file below is saved
        # as "long-range" — confirm which is intended.
        ax.set_title("Short-range interactions")
    ax.axhline(1, color='grey', alpha=0.5)
    if i == 1:
        ax.set_ylabel("Max profile value")
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.set_ylim([0.8, 2.5])
    ax.spines['right'].set_visible(False)
axes[-1].set_xlabel("Motif distance")
axes[-1].set_xticks([10, 50, 100, 150])
fig.savefig(figures_dir / 'long-range.Oct4-Sox2.strand_combination=all.pdf')
# Modisco pattern summary table (per-pattern, per-task footprint statistics).
dfp = pd.read_csv("http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/pattern_table.csv")
dfp.head()
# Restrict to the curated motifs and re-index the rows by their readable
# names, preserving the order of the `motifs` OrderedDict.
m_hash = {pattern: name for name, pattern in motifs.items()}
dfp = dfp[dfp.pattern.isin(list(motifs.values()))]
dfp['pattern_name'] = dfp['pattern'].map(m_hash)
dfp = dfp.set_index('pattern_name').loc[list(motifs)]
# Columns holding the per-task footprint maxima, renamed to bare task names
# and ordered like `tasks`.
dfp_max = dfp[[c for c in dfp.columns if c.endswith(' footprint max')]]
dfp_max.columns = [c.split(" ")[0] for c in dfp_max.columns]
dfp_max = dfp_max[tasks]
dfp_max
# Per-task total footprint counts, columns renamed to bare task names and
# ordered like `tasks` (mirrors the dfp_max block above).
dfp_counts = dfp[[c for c in dfp.columns if c.endswith(' footprint counts')]]
# BUG FIX: this previously iterated `dfp_max.columns` (a different frame whose
# names were already stripped to bare task names). It only worked by
# coincidence — splitting an already-bare name is a no-op — and would silently
# mislabel columns if the 'max' and 'counts' column sets ever diverged.
dfp_counts.columns = [c.split(" ")[0] for c in dfp_counts.columns]
dfp_counts = dfp_counts[tasks]
dfp_counts
# Side-by-side heatmaps of the modisco footprint summaries (max vs. counts),
# sharing the motif axis; saved as a paper figure.
fig, axes = plt.subplots(1, 2, figsize=get_figsize(1, .35), sharey=True)
panels = [(dfp_max, "Motif", "Avg. max profile counts"),
          (dfp_counts, "", "Avg. total profile counts")]
for ax, (data, ylabel, title) in zip(axes, panels):
    sns.heatmap(data, cmap='Reds', ax=ax)
    ax.set_ylabel(ylabel)
    ax.set_xlabel("Task")
    ax.set_title(title)
plt.savefig(figures_dir / 'heatmap.total-profile-counts.pdf')
# Select Oct4-Sox2<>Sox2 instances at exactly 50 bp spacing (Sox2 task,
# 'max_profile_count_bt' score) to pull out individual example loci below.
dfs = (dfabf_plot
       .query("motif_pair == 'Oct4-Sox2<>Sox2'")
       .query("center_diff == 50")
       .query("score == 'max_profile_count_bt'")
       .query("task == 'Sox2'"))
# Look at the distribution of 1 / y_alt_ref_pc. The `task == 'Sox2'` mask is
# redundant with the query above (kept from the original exploration).
(1 / dfs[dfs.task == 'Sox2'].y_alt_ref_pc).hist()
1 / dfs[dfs.task == 'Sox2'].y_alt_ref_pc < 1.2
# Keep only instances with 1 / y_alt_ref_pc below 1.2.
dfs = dfs.loc[1 / dfs.y_alt_ref_pc < 1.2]
# NOTE: `ls` is an IPython line magic — this file only runs inside
# IPython/Jupyter, not as a plain Python script.
ls {model_dir}
# Importance scores computed with DeepLIFT; 'profile/wn' matches `imp_score`
# defined at the top of the notebook.
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5', default_imp_score='profile/wn')
isf.get_profiles()
# First look at a single example region.
profiles = isf.get_profiles(idx=100281)
imp_scores = isf.get_contrib(idx=100281)
plot_tracks(profiles);
plot_tracks(filter_tracks(imp_scores, [250, 600]));
plot_tracks(filter_tracks(imp_scores, [350, 500]));
# presumably selects GPU #2 — confirm create_tf_session's argument semantics
create_tf_session(2)
from basepair.BPNet import BPNetSeqModel
# Load the trained BPNet model so predictions can be recomputed from sequence.
bpnet = BPNetSeqModel.from_mdir(model_dir)
def _plot_example(idx, zoom_windows):
    """Plot predictions, observed profiles and importance-score tracks for one
    example region.

    Replaces six copy-pasted notebook cells that differed only in `idx` and
    the zoom windows.

    Args:
        idx: integer row index into the importance-score file `isf`.
        zoom_windows: list of [start, end] windows to zoom the importance
            tracks into, after one full-width importance plot.

    Returns:
        (profiles, imp_scores, seq, preds) so callers can keep inspecting the
        last example interactively, as the original cells did.
    """
    profiles = isf.get_profiles(idx=idx)
    imp_scores = isf.get_contrib(idx=idx)
    seq = isf.get_seq(idx=idx)
    # Re-predict from sequence; strip the batch dimension from each task head.
    preds = {k: v[0] for k, v in bpnet.predict(seq[np.newaxis]).items()}
    plot_tracks(preds)
    plt.xlim([0, 1000])
    plot_tracks(profiles)
    plt.xlim([0, 1000])
    plot_tracks(filter_tracks(imp_scores, None))
    for window in zoom_windows:
        plot_tracks(filter_tracks(imp_scores, window))
    return profiles, imp_scores, seq, preds

# Walk through selected examples ranked by y_alt_ref_pc; the bare dict(...)
# expressions display each row's metadata in the notebook.
dict(dfs.sort_values('y_alt_ref_pc').iloc[-2])
idx = 100281
profiles, imp_scores, seq, preds = _plot_example(idx, [[150, 350], [450, 680]])
dict(dfs.sort_values('y_alt_ref_pc').iloc[-3])
idx = 142628
profiles, imp_scores, seq, preds = _plot_example(idx, [[700, 900]])
dict(dfs.sort_values('y_alt_ref_pc').iloc[-4])
idx = 80917
profiles, imp_scores, seq, preds = _plot_example(idx, [[350, 600]])
dict(dfs.sort_values('y_alt_ref_pc').iloc[-5])
idx = 56054
profiles, imp_scores, seq, preds = _plot_example(idx, [[100, 200], [450, 650]])
dict(dfs.sort_values('y_alt_ref_pc').iloc[0])
idx = 56086
profiles, imp_scores, seq, preds = _plot_example(idx, [[450, 600], [450, 650]])
dict(dfs.sort_values('y_alt_ref_pc').iloc[1])
idx = 7895
profiles, imp_scores, seq, preds = _plot_example(idx, [[600, 800]])