In [25]:
from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    # ("Oct4", 'Oct4/m0_p1'),
    # ("Strange-sym-motif", 'Oct4/m0_p5'),
    ("Sox2", 'Sox2/m0_p1'),
    ("Nanog", 'Nanog/m0_p1'),
    # ("Zic3", 'Nanog/m0_p2'),
    # ("Nanog-partner", 'Nanog/m0_p4'),
    ("Klf4", 'Klf4/m0_p0'),
])

Figure 5,6

In [2]:
# Imports
from basepair.imports import *
from basepair.exp.paper.config import models_dir, profile_mapping, ddir
from basepair.exp.chipnexus.perturb.vdom import vdom_motif_pair, plot_spacing_hist
from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs
from basepair.exp.paper.config import tasks
from plotnine import *
import plotnine
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
Using TensorFlow backend.
In [3]:
model_dir = models_dir / exp
dataset_dir = model_dir / 'perturbation-analysis'
In [28]:
pairs = get_motif_pairs(motifs)

# ordered names
pair_names = ["<>".join(x) for x in pairs]
In [5]:
figures_dir = Path(f"{ddir}/figures/modisco/{exp}/in-vivo-perturb")
In [6]:
!mkdir -p {figures_dir}
In [7]:
# define the global set of distances
dist_subsets = ['center_diff<=35',
               '(center_diff>35)&(center_diff<=70)', 
               '(center_diff>70)&(center_diff<=150)', 
               'center_diff>150']
dist_subset_labels = ['dist < 35',
                      '35 < dist <= 70',
                      '70 < dist <= 150',
                      '150 < dist',
                     ]
In [8]:
from basepair.plot.utils import PlotninePalette

# choose the right color
colors = plt.get_cmap("RdBu")(np.linspace(0, 1, 9))
hex_colors = [plt.cm.colors.to_hex(a) for a in colors]
sns.palplot(colors)

cb_red = hex_colors[2]
cb_blue = hex_colors[-2 - 2]

hex_colors
Out[8]:
['#67001f',
 '#bb2a34',
 '#e58368',
 '#fbceb7',
 '#f6f7f7',
 '#c0dceb',
 '#68abd0',
 '#2870b1',
 '#053061']

Load the data

In vitro instances

In [9]:
from basepair.exp.chipnexus.perturb.scores import ism_compute_features_tidy, compute_features_tidy, SCORES, max_profile_count
In [10]:
ls {dataset_dir}
dfab.csv.gz                  dfi_subset.csv.gz  ref.h5
dfabf.csv.gz                 dfs.csv.gz         ref.h5.bak
dfabf_ism.csv.gz             double_mut.h5      single_mut.h5
dfabf.Oct4-Sox2-subset.parq  double_mut.h5.bak  single_mut.h5.bak
In [12]:
dfabf = pd.read_csv(dataset_dir / 'dfabf.csv.gz')
dfabf = dfabf[dfabf.motif_pair.isin(pair_names)]
dfabf['center_diff_cat'] = pd.Categorical(pd.cut(dfabf['center_diff'], [0, 35, 70, 150, 1000]))
In [13]:
# dfs = pd.read_csv(dataset_dir / 'dfs.csv.gz')
In [14]:
# Store the output
dfabf_ism = pd.read_csv(dataset_dir / 'dfabf_ism.csv.gz')
dfabf_ism = dfabf_ism[dfabf_ism.motif_pair.isin(pair_names)]
In [15]:
# dfab = pd.read_csv(dataset_dir / 'dfab.csv.gz')
# dfab['motif_pair_cat'] = pd.Categorical(dfab.motif_pair, categories=dfab.groupby("motif_pair").size().sort_values(ascending=False).index)
# dfab['strand_combination_cat'] = pd.Categorical(dfab['strand_combination'], )
In [16]:
dfi_subset = pd.read_csv(dataset_dir / 'dfi_subset.csv.gz')
In [17]:
dfabf.motif_pair.value_counts()
Out[17]:
Nanog<>Klf4             1775508
Oct4<>Nanog             1531264
Oct4-Sox2<>Oct4         1233428
                         ...   
Oct4-Sox2<>Sox2          380268
Oct4-Sox2<>Oct4-Sox2     210812
Sox2<>Sox2               183960
Name: motif_pair, Length: 15, dtype: int64
In [18]:
dfabf.head()
Out[18]:
Unnamed: 0 example_chrom_x example_end_x example_idx example_interval_from_task_x example_start_x example_strand_x imp/Klf4_x imp/Nanog_x imp/Oct4_x imp/Sox2_x imp_max_x imp_max_task_x imp_weighted_x imp_weighted_cat_x imp_weighted_p_x match/Klf4_x match/Nanog_x match/Oct4_x match/Sox2_x match_max_x match_max_task_x match_weighted_x match_weighted_cat_x match_weighted_p_x pattern_x pattern_center_x pattern_end_x pattern_end_abs_x pattern_len_x pattern_name_x pattern_short_x pattern_start_x pattern_start_abs_x seq_match_x seq_match_cat_x seq_match_p_x strand_x tf_x row_idx_x example_chrom_y example_end_y example_interval_from_task_y example_start_y example_strand_y imp/Klf4_y imp/Nanog_y imp/Oct4_y imp/Sox2_y imp_max_y imp_max_task_y imp_weighted_y imp_weighted_cat_y imp_weighted_p_y match/Klf4_y match/Nanog_y match/Oct4_y match/Sox2_y match_max_y match_max_task_y match_weighted_y match_weighted_cat_y match_weighted_p_y pattern_y pattern_center_y pattern_end_y pattern_end_abs_y pattern_len_y pattern_name_y pattern_short_y pattern_start_y pattern_start_abs_y seq_match_y seq_match_cat_y seq_match_p_y strand_y tf_y row_idx_y center_diff strand_combination motif_pair motif_pair_idx task score x_alt x_ref x_alt_ref x_alt_pc x_ref_pc x_alt_ref_pc y_alt y_ref y_alt_ref y_alt_pc y_ref_pc y_alt_ref_pc center_diff_cat
0 1 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 0.0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.992 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 1.0 21.0 ++ Oct4-Sox2<>Oct4-Sox2 0 Oct4 profile_count 2.6246 13.7886 0.1903 2.6566 13.8206 0.1922 11.6847 20.6220 0.5666 11.7136 20.6510 0.5672 (0, 35]
1 2 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 0.0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.2758 NaN 1.2758 Oct4 1.2758 high 0.967 NaN NaN 0.4219 NaN 0.4219 Oct4 0.4219 low 0.2234 Oct4/metacluster_0/pa... 492.0 500.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 484.0 1.2215e+08 10.7122 high 0.7777 - Oct4 2.0 62.0 ++ Oct4-Sox2<>Oct4-Sox2 1 Oct4 profile_count 4.2233 13.7886 0.3063 4.2553 13.8206 0.3079 13.3997 19.3488 0.6925 13.4286 19.3777 0.6930 (35, 70]
2 3 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 0.0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.0455 NaN 1.0455 Oct4 1.0455 high 0.905 NaN NaN 0.4155 NaN 0.4155 Oct4 0.4155 low 0.2005 Oct4/metacluster_0/pa... 534.0 542.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 526.0 1.2215e+08 10.2722 high 0.6902 - Oct4 3.0 104.0 ++ Oct4-Sox2<>Oct4-Sox2 2 Oct4 profile_count 5.0190 13.7886 0.3640 5.0510 13.8206 0.3655 8.8362 12.9900 0.6802 8.8651 13.0189 0.6809 (70, 150]
3 6 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.9920 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 1.0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.2758 NaN 1.2758 Oct4 1.2758 high 0.967 NaN NaN 0.4219 NaN 0.4219 Oct4 0.4219 low 0.2234 Oct4/metacluster_0/pa... 492.0 500.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 484.0 1.2215e+08 10.7122 high 0.7777 - Oct4 2.0 41.0 ++ Oct4-Sox2<>Oct4-Sox2 3 Oct4 profile_count 6.0170 20.6220 0.2918 6.0490 20.6540 0.2929 7.1250 19.3488 0.3682 7.1539 19.3777 0.3692 (35, 70]
4 7 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.9920 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 1.0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.0455 NaN 1.0455 Oct4 1.0455 high 0.905 NaN NaN 0.4155 NaN 0.4155 Oct4 0.4155 low 0.2005 Oct4/metacluster_0/pa... 534.0 542.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 526.0 1.2215e+08 10.2722 high 0.6902 - Oct4 3.0 83.0 ++ Oct4-Sox2<>Oct4-Sox2 4 Oct4 profile_count 7.2364 20.6220 0.3509 7.2684 20.6540 0.3519 5.7126 12.9900 0.4398 5.7415 13.0189 0.4410 (70, 150]
In [19]:
from basepair.exp.paper.config import tasks

Add pattern_center_aln to dfi_subset

In [21]:
mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
                           for t in tasks})
In [22]:
from basepair.exp.paper.fig4 import cluster_align_patterns
main_motifs = [mr.get_pattern(pattern_name).add_attr('features', {'n seqlets': mr.n_seqlets(pattern_name)})
               for name, pattern_name in motifs.items()]
main_motifs = [p.rename(longer_pattern(p.name)) for p in main_motifs]
main_motifs_clustered = cluster_align_patterns(main_motifs, n_clusters=1)
  0%|          | 0/5 [00:00<?, ?it/s]TF-MoDISco is using the TensorFlow backend.
100%|██████████| 5/5 [00:00<00:00, 20.09it/s]
100%|██████████| 5/5 [00:00<00:00, 226.71it/s]
In [23]:
from basepair.modisco.pattern_instances import align_instance_center
dfi_subset = align_instance_center(dfi_subset, main_motifs, main_motifs_clustered, trim_frac=0.08)

Exclude TEs

In [24]:
instance_parq_paths = {t: model_dir / f'deeplift/{t}/out/{imp_score}/instances.parq' 
                       for t in tasks}
In [25]:
def shorten_te_pattern(s):
    tf, p = s.split("/", 1)
    return tf + "/" + shorten_pattern(p)
In [26]:
motifs_te = [p.name
             for p in mr.get_all_patterns()
             if p.seq_info_content > 30 and mr.n_seqlets(p.name) > 100]
motifs_te_d = OrderedDict([(shorten_te_pattern(x), shorten_te_pattern(x)) for x in motifs_te])
In [27]:
from basepair.modisco.pattern_instances import multiple_load_instances
In [28]:
# # get transposable element locations
dfi_te = multiple_load_instances(instance_parq_paths, motifs_te_d)
dfi_te = dfi_te[(dfi_te.match_weighted_p > 0.1) & (dfi_te.seq_match > 20)]
100%|██████████| 4/4 [13:15<00:00, 258.15s/it]
In [29]:
# Get rows without intersecting transposable elements
from basepair.preproc import rc_seq, dfint_no_intersection
interval_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']
non_te_idx = dfint_no_intersection(dfi_subset[interval_cols], dfi_te[interval_cols])
print(f"Not overlapping te's: {non_te_idx.mean()}")
# # annotate with profiles
# profiles = load_profiles(modisco_dir, imp_score_file)
# dfi_anno = annotate_profile(dfi, mr, profiles, profile_width=70, trim_frac=0.08)

# dfi = dfi_anno
Not overlapping te's: 0.9650362162006175
In [30]:
valid_row_idx = dfi_subset[non_te_idx].row_idx.values
In [31]:
len(valid_row_idx) # non-te rows
Out[31]:
334414

Remove all Sox2 or Oct4 instances comming from the Oct4-Sox2 motif

Append pattern_center_aln to dfabf.

In [32]:
def suffix_colnames(df, suffix):
    df.columns = [c + suffix for c in df.columns]
    return df
In [33]:
dfabf['row_idx_x'] = dfabf['row_idx_x'].astype(int)
dfabf['row_idx_y'] = dfabf['row_idx_y'].astype(int)
In [34]:
dfabf2 = pd.merge(dfabf, dfi_subset[['row_idx',
                                     'pattern_center_aln',
                                     'pattern_strand_aln']].pipe(suffix_colnames, '_x'),
                 on='row_idx_x',
                 how='left')
dfabf2 = pd.merge(dfabf2, dfi_subset[['row_idx',
                                      'pattern_center_aln', 
                                      'pattern_strand_aln']].pipe(suffix_colnames, '_y'),
                  on='row_idx_y',
                  how='left')
dfabf2['pattern_diff_aln'] = dfabf2['pattern_center_aln_y'] - dfabf2['pattern_center_aln_x']
In [35]:
exclude_sox2 = dfabf2[(dfabf2.motif_pair == 'Oct4-Sox2<>Sox2') & 
                      (dfabf2['pattern_diff_aln'] == 0)].row_idx_y.values
exclude_oct4 = dfabf2[(dfabf2.motif_pair == 'Oct4-Sox2<>Oct4') & 
                      (dfabf2['pattern_diff_aln'] == 0)].row_idx_y.values
In [36]:
# Exclude the overlapping row
dfabf2 = dfabf2[(dfabf2.pattern_name_x != 'Oct4') | (~dfabf2.row_idx_x.isin(exclude_oct4))]
dfabf2 = dfabf2[(dfabf2.pattern_name_y != 'Oct4') | (~dfabf2.row_idx_y.isin(exclude_oct4))]
dfabf2 = dfabf2[(dfabf2.pattern_name_x != 'Sox2') | (~dfabf2.row_idx_x.isin(exclude_sox2))]
dfabf2 = dfabf2[(dfabf2.pattern_name_y != 'Sox2') | (~dfabf2.row_idx_y.isin(exclude_sox2))]
In [46]:
dfabf2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10292128 entries, 0 to 12490911
Columns: 102 entries, Unnamed: 0 to pattern_diff_aln
dtypes: category(1), float64(64), int64(5), object(32)
memory usage: 7.8+ GB
In [47]:
len(dfabf2)
Out[47]:
10292128

Exclude TE's

In [37]:
# Exclude TE's
dfabf2 = dfabf2[dfabf2.row_idx_x.isin(valid_row_idx) & dfabf2.row_idx_y.isin(valid_row_idx)]
In [38]:
dfabf2 = dfabf2.reset_index()

Save dfabf

In [50]:
# dfabf2.reset_index().to_csv(dataset_dir / 'dfabf.Oct4-Sox2-subset.csv.gz', compression='gzip', index=False)
In [51]:
dfabf2.head()
Out[51]:
index Unnamed: 0 example_chrom_x example_end_x example_idx example_interval_from_task_x example_start_x example_strand_x imp/Klf4_x imp/Nanog_x imp/Oct4_x imp/Sox2_x imp_max_x imp_max_task_x imp_weighted_x imp_weighted_cat_x imp_weighted_p_x match/Klf4_x match/Nanog_x match/Oct4_x match/Sox2_x match_max_x match_max_task_x match_weighted_x match_weighted_cat_x match_weighted_p_x pattern_x pattern_center_x pattern_end_x pattern_end_abs_x pattern_len_x pattern_name_x pattern_short_x pattern_start_x pattern_start_abs_x seq_match_x seq_match_cat_x seq_match_p_x strand_x tf_x row_idx_x example_chrom_y example_end_y example_interval_from_task_y example_start_y example_strand_y imp/Klf4_y imp/Nanog_y imp/Oct4_y imp/Sox2_y imp_max_y imp_max_task_y imp_weighted_y imp_weighted_cat_y imp_weighted_p_y match/Klf4_y match/Nanog_y match/Oct4_y match/Sox2_y match_max_y match_max_task_y match_weighted_y match_weighted_cat_y match_weighted_p_y pattern_y pattern_center_y pattern_end_y pattern_end_abs_y pattern_len_y pattern_name_y pattern_short_y pattern_start_y pattern_start_abs_y seq_match_y seq_match_cat_y seq_match_p_y strand_y tf_y row_idx_y center_diff strand_combination motif_pair motif_pair_idx task score x_alt x_ref x_alt_ref x_alt_pc x_ref_pc x_alt_ref_pc y_alt y_ref y_alt_ref y_alt_pc y_ref_pc y_alt_ref_pc center_diff_cat pattern_center_aln_x pattern_strand_aln_x pattern_center_aln_y pattern_strand_aln_y pattern_diff_aln
0 0 1 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.992 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 1 21.0 ++ Oct4-Sox2<>Oct4-Sox2 0 Oct4 profile_count 2.5930 13.7886 0.1881 2.6250 13.8206 0.1899 10.8607 20.6220 0.5267 10.8896 20.6510 0.5273 (0, 35] 429.0 - 450.0 - 21.0
1 1 2 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.2758 NaN 1.2758 Oct4 1.2758 high 0.967 NaN NaN 0.4219 NaN 0.4219 Oct4 0.4219 low 0.2234 Oct4/metacluster_0/pa... 492.0 500.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 484.0 1.2215e+08 10.7122 high 0.7777 - Oct4 2 62.0 ++ Oct4-Sox2<>Oct4-Sox2 1 Oct4 profile_count 3.5153 13.7886 0.2549 3.5473 13.8206 0.2567 12.1548 19.3488 0.6282 12.1837 19.3777 0.6287 (35, 70] 429.0 - 491.0 - 62.0
2 2 3 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.0455 NaN 1.0455 Oct4 1.0455 high 0.905 NaN NaN 0.4155 NaN 0.4155 Oct4 0.4155 low 0.2005 Oct4/metacluster_0/pa... 534.0 542.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 526.0 1.2215e+08 10.2722 high 0.6902 - Oct4 3 104.0 ++ Oct4-Sox2<>Oct4-Sox2 2 Oct4 profile_count 5.0137 13.7886 0.3636 5.0457 13.8206 0.3651 8.3057 12.9900 0.6394 8.3347 13.0189 0.6402 (70, 150] 429.0 - 533.0 - 104.0
3 3 6 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.9920 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 1 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.2758 NaN 1.2758 Oct4 1.2758 high 0.967 NaN NaN 0.4219 NaN 0.4219 Oct4 0.4219 low 0.2234 Oct4/metacluster_0/pa... 492.0 500.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 484.0 1.2215e+08 10.7122 high 0.7777 - Oct4 2 41.0 ++ Oct4-Sox2<>Oct4-Sox2 3 Oct4 profile_count 4.6573 20.6220 0.2258 4.6893 20.6540 0.2270 7.5864 19.3488 0.3921 7.6153 19.3777 0.3930 (35, 70] 450.0 - 491.0 - 41.0
4 4 7 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.9920 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 1 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.0455 NaN 1.0455 Oct4 1.0455 high 0.905 NaN NaN 0.4155 NaN 0.4155 Oct4 0.4155 low 0.2005 Oct4/metacluster_0/pa... 534.0 542.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 526.0 1.2215e+08 10.2722 high 0.6902 - Oct4 3 83.0 ++ Oct4-Sox2<>Oct4-Sox2 4 Oct4 profile_count 7.1093 20.6220 0.3447 7.1413 20.6540 0.3458 5.8591 12.9900 0.4510 5.8880 13.0189 0.4523 (70, 150] 450.0 - 533.0 - 83.0
In [39]:
# set categorical index
dfabf2.center_diff_cat.cat.categories = dfabf2.center_diff_cat.cat.categories.astype(str)
In [53]:
%time dfabf2.to_parquet(dataset_dir / 'dfabf.Oct4-Sox2-subset.parq', index=False, engine='fastparquet')
CPU times: user 1min 45s, sys: 18.4 s, total: 2min 3s
Wall time: 1min 57s
In [54]:
!du -sh {dataset_dir}/dfabf.Oct4-Sox2-subset.parq
2.9G	/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/perturbation-analysis/dfabf.Oct4-Sox2-subset.parq
In [20]:
%time dfabf2 = pd.read_parquet(dataset_dir / 'dfabf.Oct4-Sox2-subset.parq', engine='fastparquet')
CPU times: user 1min 9s, sys: 18.9 s, total: 1min 28s
Wall time: 1min 53s
In [21]:
!du -sh {dataset_dir}
161G	/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/perturbation-analysis

Subset dfs

In [30]:
dfabf_ism.head()
Out[30]:
Unnamed: 0 Unnamed: 0.1 example_chrom_x example_end_x example_idx example_interval_from_task_x example_start_x example_strand_x imp/Klf4_x imp/Nanog_x imp/Oct4_x imp/Sox2_x imp_max_x imp_max_task_x imp_weighted_x imp_weighted_cat_x imp_weighted_p_x match/Klf4_x match/Nanog_x match/Oct4_x match/Sox2_x match_max_x match_max_task_x match_weighted_x match_weighted_cat_x match_weighted_p_x pattern_x pattern_center_x pattern_end_x pattern_end_abs_x pattern_len_x pattern_name_x pattern_short_x pattern_start_x pattern_start_abs_x seq_match_x seq_match_cat_x seq_match_p_x strand_x tf_x row_idx_x example_chrom_y example_end_y example_interval_from_task_y example_start_y example_strand_y imp/Klf4_y imp/Nanog_y imp/Oct4_y imp/Sox2_y imp_max_y imp_max_task_y imp_weighted_y imp_weighted_cat_y imp_weighted_p_y match/Klf4_y match/Nanog_y match/Oct4_y match/Sox2_y match_max_y match_max_task_y match_weighted_y match_weighted_cat_y match_weighted_p_y pattern_y pattern_center_y pattern_end_y pattern_end_abs_y pattern_len_y pattern_name_y pattern_short_y pattern_start_y pattern_start_abs_y seq_match_y seq_match_cat_y seq_match_p_y strand_y tf_y row_idx_y center_diff strand_combination motif_pair motif_pair_idx task Wt_obs Wt dA dB dAB
0 0 1 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 0.0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.992 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 1.0 21.0 ++ Oct4-Sox2<>Oct4-Sox2 0 Oct4 12070.0 5831.35 3665.2786 2096.8750 1037.8267
1 1 2 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 0.0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.2758 NaN 1.2758 Oct4 1.2758 high 0.967 NaN NaN 0.4219 NaN 0.4219 Oct4 0.4219 low 0.2234 Oct4/metacluster_0/pa... 492.0 500.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 484.0 1.2215e+08 10.7122 high 0.7777 - Oct4 2.0 62.0 ++ Oct4-Sox2<>Oct4-Sox2 1 Oct4 12070.0 5831.35 3665.2786 1875.4661 1405.9451
2 2 3 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 0.0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.0455 NaN 1.0455 Oct4 1.0455 high 0.905 NaN NaN 0.4155 NaN 0.4155 Oct4 0.4155 low 0.2005 Oct4/metacluster_0/pa... 534.0 542.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 526.0 1.2215e+08 10.2722 high 0.6902 - Oct4 3.0 104.0 ++ Oct4-Sox2<>Oct4-Sox2 2 Oct4 12070.0 5831.35 3665.2786 1574.0667 801.1036
3 3 6 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.9920 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 1.0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.2758 NaN 1.2758 Oct4 1.2758 high 0.967 NaN NaN 0.4219 NaN 0.4219 Oct4 0.4219 low 0.2234 Oct4/metacluster_0/pa... 492.0 500.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 484.0 1.2215e+08 10.7122 high 0.7777 - Oct4 2.0 41.0 ++ Oct4-Sox2<>Oct4-Sox2 3 Oct4 12070.0 5831.35 2096.8750 1875.4661 656.2836
4 4 7 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.9920 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 1.0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.0455 NaN 1.0455 Oct4 1.0455 high 0.905 NaN NaN 0.4155 NaN 0.4155 Oct4 0.4155 low 0.2005 Oct4/metacluster_0/pa... 534.0 542.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 526.0 1.2215e+08 10.2722 high 0.6902 - Oct4 3.0 83.0 ++ Oct4-Sox2<>Oct4-Sox2 4 Oct4 12070.0 5831.35 2096.8750 1574.0667 513.7097
In [40]:
dfabf_ism2 = pd.merge(dfabf_ism, dfi_subset[['row_idx',
                                     'pattern_center_aln',
                                     'pattern_strand_aln']].pipe(suffix_colnames, '_x'),
                 on='row_idx_x',
                 how='left')
dfabf_ism2 = pd.merge(dfabf_ism2, dfi_subset[['row_idx',
                                      'pattern_center_aln', 
                                      'pattern_strand_aln']].pipe(suffix_colnames, '_y'),
                  on='row_idx_y',
                  how='left')
dfabf_ism2['pattern_diff_aln'] = dfabf_ism2['pattern_center_aln_y'] - dfabf_ism2['pattern_center_aln_x']
In [41]:
exclude_sox2 = dfabf_ism2[(dfabf_ism2.motif_pair == 'Oct4-Sox2<>Sox2') & 
                          (dfabf_ism2['pattern_diff_aln'] == 0)].row_idx_y.values
exclude_oct4 = dfabf_ism2[(dfabf_ism2.motif_pair == 'Oct4-Sox2<>Oct4') & 
                          (dfabf_ism2['pattern_diff_aln'] == 0)].row_idx_y.values
In [42]:
# Exclude the overlapping row
dfabf_ism2 = dfabf_ism2[(dfabf_ism2.pattern_name_x != 'Oct4') | (~dfabf_ism2.row_idx_x.isin(exclude_oct4))]
dfabf_ism2 = dfabf_ism2[(dfabf_ism2.pattern_name_y != 'Oct4') | (~dfabf_ism2.row_idx_y.isin(exclude_oct4))]
dfabf_ism2 = dfabf_ism2[(dfabf_ism2.pattern_name_x != 'Sox2') | (~dfabf_ism2.row_idx_x.isin(exclude_sox2))]
dfabf_ism2 = dfabf_ism2[(dfabf_ism2.pattern_name_y != 'Sox2') | (~dfabf_ism2.row_idx_y.isin(exclude_sox2))]
In [43]:
dfs = dfabf_ism2[['Wt_obs', 'Wt', 'dA', 'dB', 'dAB', 'motif_pair',
                  'task', 'center_diff', 'strand_combination']]

Synergy scores

  • Wt = total counts in ref
  • dA = total counts after removing A
  • dB = total counts after removing B
  • dAB = total counts after removing both

Then TF-A depends on A with

(Wt-dA)/(Wt - dAB)

and on B with

(Wt-dB)/(Wt - dAB)

In [44]:
# dfs = dfabf_ism[['Wt_obs', 'Wt', 'dA', 'dB', 'dAB', 'motif_pair', 'task', 'center_diff', 'strand_combination']]
dfs_prox = dfs[dfs.center_diff <= 35]
dfs_dist = dfs[dfs.center_diff > 35]
dfs.head()
Out[44]:
Wt_obs Wt dA dB dAB motif_pair task center_diff strand_combination
0 12070.0 5831.3506 3569.0996 1977.2351 1536.7383 Oct4-Sox2<>Oct4-Sox2 Oct4 21.0 ++
1 12070.0 5831.3506 3569.0996 1946.5161 871.3293 Oct4-Sox2<>Oct4-Sox2 Oct4 62.0 ++
2 12070.0 5831.3506 3569.0996 1818.8110 725.8771 Oct4-Sox2<>Oct4-Sox2 Oct4 104.0 ++
3 12070.0 5831.3506 1977.2351 1946.5161 533.5667 Oct4-Sox2<>Oct4-Sox2 Oct4 41.0 ++
4 12070.0 5831.3506 1977.2351 1818.8110 544.4917 Oct4-Sox2<>Oct4-Sox2 Oct4 83.0 ++

Fraction of positive values

In [60]:
terms = ['Wt-dA>0', 'Wt-dB>0', 'Wt-dAB>0', 'dA-dAB>0', 'dB-dAB>0']
filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).apply(lambda x: x.eval(term).mean()).
                reset_index().
                rename(columns={0:'fraction'}).
                assign(term=term, subset=ft))
               for term in terms for ft in filter_terms], axis=0)
c = c[c.motif_pair.isin(pair_names)]
In [61]:
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['motif_pair'] = pd.Categorical(c['motif_pair'], categories=pair_names[::-1], ordered=True)
In [45]:
plotnine.options.figure_size = get_figsize(1, 0.6)
fig = (ggplot(aes(y='motif_pair', x='task', fill='fraction'), c) + 
       geom_tile() + 
       theme_classic(base_size=10, base_family='Arial') +  
       facet_grid("subset ~ term") + 
       theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1))  + 
       scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=.5, limits=[0, 1], 
                            palette=PlotninePalette("RdBu", [.1, .9])) +
       ggtitle("Fraction of positive terms")
)
fig.save(figures_dir / 'heatmap.positive_fraction.pdf')
fig
Out[45]:
<ggplot: (-9223363280239612114)>

Synergy scores: (2*Wt-dA-dB)/(Wt-dAB)

Require: 90% Wt-dAB>0

In [46]:
mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
                apply(lambda x: x.eval(term).median() if x.eval(mask).mean() > .9 else np.nan).
                reset_index().
                rename(columns={0:'average'}).
                assign(term=term, subset=ft, subset_name=filter_names[i]))
               for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
In [47]:
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) + 
       geom_tile(data=c.dropna()) + 
       # geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) + 
       geom_text(label='/', data=c[c.average.isnull()]) + 
       theme_classic(base_size=10, base_family='Arial') + 
       facet_grid(". ~ subset_name") + 
       theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1))  + 
       scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4], 
                            palette=PlotninePalette("RdBu_r", [0, 1])) +
       # scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
       ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.90perc.Wt-dAB>0.pdf')
fig
Out[47]:
<ggplot: (8756614591394)>

Setup table showing only the major TF's

  • Row -> target motif of interest (and the task)
  • column -> other motif
In [48]:
def get_syn_effect(c, pairs, motifs, tasks):
    motifxtask = [f"{motif} (task={profile_mapping[motif]})" for motif in motifs]
    dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
    k = 0
    for i in range(len(motifs)):
        for j in range(len(motifs)):
            if i>j:
                continue
            idx = 'motif_pair_idx'
            cat = 'center_diff_cat'
            motif_pair = pairs[k]
            motif_pair_name = "<>".join(motif_pair)
            k+=1
            t1 = profile_mapping[motif_pair[0]]
            t2 = profile_mapping[motif_pair[1]]
            
            dfp.loc[motif_pair[0] + f" (task={t1})"][motif_pair[1]] = c[(c.motif_pair == motif_pair_name) & (c.task == t1)]['average']
            dfp.loc[motif_pair[1] + f" (task={t2})"][motif_pair[0]] = c[(c.motif_pair == motif_pair_name) & (c.task == t2)]['average']
    # dfp.columns = [r"$\Delta$"+c for c in dfp.columns]
    return dfp

Major TF per motif

In [49]:
# subsets = ['center_diff <= 35', 'center_diff > 35', 'center_diff > 70']
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.90perc.Wt-dAB>0.main_tasks.pdf')

Require: 20% Wt-dAB>0

In [50]:
mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
                apply(lambda x: x.eval(term).median() if x.eval(mask).mean() > .2 else np.nan).
                reset_index().
                rename(columns={0:'average'}).
                assign(term=term, subset=ft, subset_name=filter_names[i]))
               for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
In [51]:
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) + 
       geom_tile(data=c.dropna()) + 
       # geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) + 
       geom_text(label='/', data=c[c.average.isnull()]) + 
       theme_classic(base_size=10, base_family='Arial') + 
       facet_grid(". ~ subset_name") + 
       theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1))  + 
       scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4], 
                            palette=PlotninePalette("RdBu_r", [0, 1])) +
       # scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
       ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.20perc.Wt-dAB>0.pdf')
fig
Out[51]:
<ggplot: (-9223363280254873796)>

Major TF per motif

In [52]:
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.20perc.Wt-dAB>0.main_tasks.pdf')

Show all

In [53]:
mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
                apply(lambda x: x.eval(term).median()).
                reset_index().
                rename(columns={0:'average'}).
                assign(term=term, subset=ft, subset_name=filter_names[i]))
               for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
In [54]:
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) + 
       geom_tile(data=c.dropna()) + 
       # geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) + 
       geom_text(label='/', data=c[c.average.isnull()]) + 
       theme_classic(base_size=10, base_family='Arial') + 
       facet_grid(". ~ subset_name") + 
       theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1))  + 
       scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4], 
                            palette=PlotninePalette("RdBu_r", [0, 1])) +
       # scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
       ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.all.pdf')
fig
Out[54]:
<ggplot: (8756615188004)>

Major TF per motif

In [55]:
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.all.main_tasks.pdf')

Directional effect

Scatterplot showing the major TF per motif

In [84]:
from basepair.exp.paper.config import profile_mapping
In [30]:
dfabf_plot = dfabf2.query('center_diff > 5')#.query('center_diff < 150')
score = 'max_profile_count_bt'
# -----------------------------------
fig, axes = plt.subplots(len(motifs), len(motifs), 
                         figsize=get_figsize(.65, 1), 
                         gridspec_kw=dict(wspace=0, hspace=0),
                         sharex=True, sharey=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf_plot.center_diff_cat.cat.categories)))

k = 0
for i in range(len(motifs)):
    for j in range(len(motifs)):
        ax = axes[i,j]
        if i>j:
            ax.axison = False
            continue
        idx = 'motif_pair_idx'
        cat = 'center_diff_cat'
        motif_pair = pairs[k]
        motif_pair_name = "<>".join(motif_pair)
        k+=1
        t1 = profile_mapping[motif_pair[0]]
        t2 = profile_mapping[motif_pair[1]]
        # t1 = 'Nanog'
        # t2 = 'Nanog'
        # motif mask
        mmask = (dfabf_plot.motif_pair == motif_pair_name) & (dfabf_plot.score == score)
        ab = pd.merge(dfabf_plot[mmask & (dfabf_plot.task == t1)][[idx, 'x_alt_ref_pc', 'x_ref', cat]],
                      dfabf_plot[mmask & (dfabf_plot.task == t2)][[idx, 'y_alt_ref_pc', 'y_ref']],
                      on=idx)
        # make a scatterplot
        
        # Subset ab to only the highest values
        ab = ab[(ab.x_ref > np.percentile(ab.x_ref, 80)) & (ab.y_ref > np.percentile(ab.y_ref, 80))]
        for ci, (label,df) in enumerate(list(ab.groupby(cat))):
            ax.scatter(1/df.y_alt_ref_pc,
                       1/df.x_alt_ref_pc,   # first one is on the y axis
                       alpha=0.2, s=1, label=label, color=colors[ci], rasterized=True)
        
        # ax.text(0, 0,t1+"-"+t2)
        alpha = .5
        xl = [1/3, 3]
        ax.plot(xl, xl, c='grey', alpha=alpha)
        ax.axvline(1, c='grey', alpha=alpha)
        ax.axhline(1, c='grey', alpha=alpha)
        ax.set_xlim(xl)
        ax.set_ylim(xl)
        
        ax.set_yscale('log')
        ax.set_xscale('log')
        
        # explicitly add ticks
        ax.set_yticks([1/3, 1, 3])
        ax.set_xticks([1/3, 1, 3])
        ax.set_xticklabels(['1/3', '1', '3'])
        ax.set_yticklabels(['1/3', '1', '3'])
        ax.minorticks_off() # turns off minor ticks
        
        if i == j:
            ax.set_ylabel(motif_pair[1])
            ax.set_xlabel(motif_pair[0])
            
        # Use grey spines
        plt.setp(ax.spines.values(), color='grey')
        plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')

plt.legend(loc="bottom right", bbox_to_anchor=(-.2,.7),
           scatterpoints=1, markerscale=10, columnspacing=0,
           handletextpad=0, borderpad=0, frameon=False, title="Pairwise distance")
fig.savefig(figures_dir / "max_profile_count.upper-triag.no-Oct4.width=0.65.center_diff>5.pdf")

All TF's per motif

In [62]:
import matplotlib.cm as cm
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf.center_diff_cat.cat.categories)))
hex_colors = [plt.cm.colors.to_hex(a) for a in colors]
In [63]:
# use pair names
dfabf['motif_pair'] = pd.Categorical(dfabf['motif_pair'], pair_names)
In [64]:
from plotnine import *
In [65]:
hex_colors
Out[65]:
['#440154', '#32648e', '#2ab07f', '#dfe318']
In [66]:
dfabf['task'] = pd.Categorical(dfabf['task'])
In [67]:
print(dfabf[dfabf.score == 'max_profile_count']['center_diff_cat'].value_counts().to_string())
(150, 1000]    808604
(70, 150]      433820
(0, 35]        301120
(35, 70]       240868

Heatmap version

In [68]:
def get_effect(dfabf, pairs, motifs, agg_fn=np.mean, score='max_profile_count'):
    motifxtask = [f"{motif} (task={profile_mapping[motif]})" for motif in motifs]
    dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
    k = 0
    for i in range(len(motifs)):
        for j in range(len(motifs)):
            if i>j:
                continue
            idx = 'motif_pair_idx'
            cat = 'center_diff_cat'
            motif_pair = pairs[k]
            motif_pair_name = "<>".join(motif_pair)
            k+=1
            t1 = profile_mapping[motif_pair[0]]
            t2 = profile_mapping[motif_pair[1]]
            # t1 = 'Nanog'
            # t2 = 'Nanog'
            # motif mask
            mmask = (dfabf.motif_pair == motif_pair_name) & (dfabf.score == score)
            ab = pd.merge(dfabf[mmask & (dfabf.task == t1)][[idx, 'x_alt_ref_pc', 'x_ref', cat]],
                          dfabf[mmask & (dfabf.task == t2)][[idx, 'y_alt_ref_pc', 'y_ref']],
                          on=idx)
            ab = ab[(ab.x_ref > np.percentile(ab.x_ref, 80)) & 
                    (ab.y_ref > np.percentile(ab.y_ref, 80))]
            dfp.loc[motif_pair[0] + f" (task={t1})"][motif_pair[1]] = agg_fn(ab.x_alt_ref_pc)
            dfp.loc[motif_pair[1] + f" (task={t2})"][motif_pair[0]] = agg_fn(ab.y_alt_ref_pc)
    dfp.columns = [r"$\Delta$"+c for c in dfp.columns]
    return dfp

Median value

In [69]:
dfabf_plot = dfabf2.query('center_diff > 5')

(AB - (B - None)) / A

In [70]:
pairs_no_oct4 = [p for p in pairs if p[0] != 'Oct4' and p[1] != 'Oct4']
In [71]:
motifs_no_oct4 = [m for m in motifs if m != 'Oct4']
In [72]:
motifs_no_oct4
Out[72]:
['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']
In [80]:
dfabf_plot.strand_combination.unique()
Out[80]:
array(['++', '+-', '-+', '--'], dtype=object)
In [85]:
subsets = dist_subsets
fig,axes = plt.subplots(4, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)*4), 
                        sharex=True,
                        sharey=True)
for irow, sc in enumerate(['++', '+-', '-+', '--']):
    for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes[irow], dist_subset_labels)):
        try:
            sns.heatmap(get_effect(dfabf_plot[dfabf_plot.strand_combination == sc].query(subset), pairs_no_oct4, motifs_no_oct4,
                                   agg_fn=np.median, score='max_profile_count_bt'), 
                        cmap='RdBu', center=1, ax=ax, 
                        cbar=i==len(subsets)-1,
                        vmin=.7, vmax=1.3)
        except:
            pass
        if i == 0:
            ax.set_title(sc + " " + subset_label)
        else:
            ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.top-profile.bleed-through-corrected.strand-specific.pdf')
In [112]:
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect(dfabf_plot.query(subset), pairs_no_oct4, motifs_no_oct4,
                           agg_fn=np.median, score='max_profile_count_bt'), 
                cmap='RdBu', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.top-profile.bleed-through-corrected.pdf')

AB / A

In [77]:
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect(dfabf_plot.query(subset), pairs, agg_fn=np.median), cmap='RdBu', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.pdf')

Mean value

(AB - (B - None)) / A

In [79]:
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect(dfabf_plot.query(subset), pairs, agg_fn=np.mean, score='max_profile_count_bt'), cmap='RdBu', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.mean_max_profile_count.directional_effect.bleed-through-corrected.pdf')

AB / A

In [78]:
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect(dfabf_plot.query(subset), pairs, agg_fn=np.mean, score='max_profile_count'), cmap='RdBu', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.mean_max_profile_count.directional_effect.pdf')

Values for all the tasks

In [102]:
motifxtask = [f"{motif} (task={task})" for motif in motifs for task in tasks]
In [103]:
def get_effect_all_tasks(dfabf, pairs, tasks, agg_fn=np.mean):
    motifxtask = [f"{motif} (task={task})" for motif in motifs for task in tasks]
    dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
    for task in tasks:
        k = 0
        for i in range(len(motifs)):
            for j in range(len(motifs)):
                if i>j:
                    continue
                idx = 'motif_pair_idx'
                cat = 'center_diff_cat'
                motif_pair = pairs[k]
                motif_pair_name = "<>".join(motif_pair)
                k+=1
                t1 = profile_mapping[motif_pair[0]]
                t2 = profile_mapping[motif_pair[1]]
                ab = dfabf[(dfabf.motif_pair == motif_pair_name) & (dfabf.score == 'max_profile_count') & (dfabf.task == task)]
                dfp.loc[motif_pair[0] + f" (task={task})"][motif_pair[1]] = agg_fn(ab.x_alt_ref_pc)
                dfp.loc[motif_pair[1] + f" (task={task})"][motif_pair[0]] = agg_fn(ab.y_alt_ref_pc)
    dfp.columns = [r"$\Delta$"+c for c in dfp.columns]
    return dfp

Median

In [104]:
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)*4), 
                        sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect_all_tasks(dfabf_plot.query(subset), pairs, tasks=tasks, agg_fn=np.median), cmap='RdBu', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.all-tasks.pdf')

Mean

In [105]:
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)*4), 
                        sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect_all_tasks(dfabf.query(subset), pairs, tasks=tasks, agg_fn=np.mean), cmap='RdBu', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.mean_max_profile_count.directional_effect.all-tasks.pdf')

Generate individual plots

For the scatter plots (directional interactions), generate png and pdf files for each interaction so I can use them directly.

  • The combined pdfs are too big and also too confusing to show on a slide or figure.
In [31]:
dfabf_plot = dfabf2.query('center_diff > 5')
odir = figures_dir / "individual/max_profile_count"
odir.mkdir(parents=True, exist_ok=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf_plot.center_diff_cat.cat.categories)))
k = 0
for i in range(len(motifs)):
    for j in range(len(motifs)):
        if i>j:
            continue
        fig, ax = plt.subplots(1, 1, figsize=get_figsize(.65 / 3, 1))
        idx = 'motif_pair_idx'
        cat = 'center_diff_cat'
        motif_pair = pairs[k]
        motif_pair_name = "<>".join(motif_pair)
        k+=1
        t1 = profile_mapping[motif_pair[0]]
        t2 = profile_mapping[motif_pair[1]]
        # t1 = 'Nanog'
        # t2 = 'Nanog'
        # motif mask
        mmask = (dfabf_plot.motif_pair == motif_pair_name) & (dfabf_plot.score == 'max_profile_count')
        ab = pd.merge(dfabf_plot[mmask & (dfabf_plot.task == t1)][[idx, 'x_alt_ref_pc', cat]],
                      dfabf_plot[mmask & (dfabf_plot.task == t2)][[idx, 'y_alt_ref_pc']],
                      on=idx)
        # make a scatterplot
        for ci, (label,df) in enumerate(list(ab.groupby(cat))):
            ax.scatter(df.y_alt_ref_pc,
                       df.x_alt_ref_pc,   # first one is on the y axis
                       alpha=0.2, s=1, label=label, color=colors[ci])
        
        # ax.text(0, 0,t1+"-"+t2)
        alpha = .5
        xl = [0, 2]
        ax.plot(xl, xl, c='grey', alpha=alpha)
        ax.axvline(1, c='grey', alpha=alpha)
        ax.axhline(1, c='grey', alpha=alpha)
        ax.set_xlim(xl)
        ax.set_ylim(xl)
        ax.set_xlabel(r"{} ($\Delta$ {})".format(motif_pair[1], motif_pair[0]))
        ax.set_ylabel(r"{} ($\Delta$ {})".format(motif_pair[0], motif_pair[1]))
            
        # Use grey spines
        plt.setp(ax.spines.values(), color='grey')
        plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
        fig.savefig(odir / f'{motif_pair_name}.png', raster=True)
        fig.savefig(odir / f'{motif_pair_name}.pdf', raster=True)
In [32]:
dfabf_plot = dfabf2.query('center_diff > 5')
odir = figures_dir / "individual/max_profile_count/all_tasks"
odir.mkdir(parents=True, exist_ok=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf_plot.center_diff_cat.cat.categories)))

for task in tasks:
    k = 0
    for i in range(len(motifs)):
        for j in range(len(motifs)):
            if i>j:
                continue
            fig, ax = plt.subplots(1, 1, figsize=get_figsize(.65 / 3, 1))
            idx = 'motif_pair_idx'
            cat = 'center_diff_cat'
            motif_pair = pairs[k]
            motif_pair_name = "<>".join(motif_pair)
            k+=1
            t1 = profile_mapping[motif_pair[0]]
            t2 = profile_mapping[motif_pair[1]]
            
            ab = dfabf_plot[(dfabf_plot.motif_pair == motif_pair_name) & 
                            (dfabf_plot.score == 'max_profile_count') & 
                            (dfabf_plot.task == task)]
            # make a scatterplot
            for ci, (label,df) in enumerate(list(ab.groupby(cat))):
                ax.scatter(df.y_alt_ref_pc,
                           df.x_alt_ref_pc,   # first one is on the y axis
                           alpha=0.2, s=1, label=label, color=colors[ci])

            # ax.text(0, 0,t1+"-"+t2)
            alpha = .5
            xl = [0, 2]
            ax.plot(xl, xl, c='grey', alpha=alpha)
            ax.axvline(1, c='grey', alpha=alpha)
            ax.axhline(1, c='grey', alpha=alpha)
            ax.set_xlim(xl)
            ax.set_ylim(xl)
            ax.set_xlabel(r"{} ($\Delta$ {})".format(motif_pair[1], motif_pair[0]))
            ax.set_ylabel(r"{} ($\Delta$ {})".format(motif_pair[0], motif_pair[1]))
            ax.set_title(task)

            # Use grey spines
            plt.setp(ax.spines.values(), color='grey')
            plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
            fig.savefig(odir / f'{motif_pair_name},task={task}.png', raster=True)
            fig.savefig(odir / f'{motif_pair_name},task={task}.pdf', raster=True)
In [80]:
dfabf_plot = dfabf2.query('center_diff > 5')
score = 'max_profile_count_bt'
odir = figures_dir / "individual/max_profile_count/all_tasks"
odir.mkdir(parents=True, exist_ok=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(dfabf_plot.center_diff_cat.cat.categories)))

for task in tasks:
    k = 0
    for i in range(len(motifs)):
        for j in range(len(motifs)):
            if i>j:
                continue
            fig, ax = plt.subplots(1, 1, figsize=get_figsize(.65 / 3, 1))
            idx = 'motif_pair_idx'
            cat = 'center_diff_cat'
            motif_pair = pairs[k]
            motif_pair_name = "<>".join(motif_pair)
            k+=1
            t1 = profile_mapping[motif_pair[0]]
            t2 = profile_mapping[motif_pair[1]]
            
            ab = dfabf_plot[(dfabf_plot.motif_pair == motif_pair_name) & 
                            (dfabf_plot.score == score) & 
                            (dfabf_plot.task == task)]
            # make a scatterplot
            for ci, (label,df) in enumerate(list(ab.groupby(cat))):
                ax.scatter(df.y_alt_ref_pc,
                           df.x_alt_ref_pc,   # first one is on the y axis
                           alpha=0.2, s=1, label=label, color=colors[ci])

            # ax.text(0, 0,t1+"-"+t2)
            alpha = .5
            xl = [0, 2]
            ax.plot(xl, xl, c='grey', alpha=alpha)
            ax.axvline(1, c='grey', alpha=alpha)
            ax.axhline(1, c='grey', alpha=alpha)
            ax.set_xlim(xl)
            ax.set_ylim(xl)
            ax.set_xlabel(r"{} ($\Delta$ {})".format(motif_pair[1], motif_pair[0]))
            ax.set_ylabel(r"{} ($\Delta$ {})".format(motif_pair[0], motif_pair[1]))
            ax.set_title(task)

            # Use grey spines
            plt.setp(ax.spines.values(), color='grey')
            plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
            fig.savefig(odir / f'{motif_pair_name},task={task}.bt-corrected.png', raster=True)
            fig.savefig(odir / f'{motif_pair_name},task={task}.bt-corrected.pdf', raster=True)

Pairwise plot

In [238]:
dfabf_plot = dfabf2.query('center_diff > 5')
In [239]:
dfabf_plot.motif_pair.unique()
Out[239]:
array(['Oct4-Sox2<>Oct4-Sox2', 'Oct4-Sox2<>Oct4', 'Oct4-Sox2<>Sox2', 'Oct4-Sox2<>Nanog',
       'Oct4-Sox2<>Klf4', 'Oct4<>Oct4', 'Oct4<>Sox2', 'Oct4<>Nanog', 'Oct4<>Klf4', 'Sox2<>Sox2',
       'Sox2<>Nanog', 'Sox2<>Klf4', 'Nanog<>Nanog', 'Nanog<>Klf4', 'Klf4<>Klf4'], dtype=object)
In [91]:
dfp = dfabf_plot.query("motif_pair == 'Oct4-Sox2<>Sox2'").query("score == 'max_profile_count_bt'").query("task == 'Sox2'")
In [93]:
dfp.shape
Out[93]:
(12307, 103)
In [96]:
(ggplot(aes(x='center_diff'), data=dfp) + 
 geom_histogram() + 
 theme_bw()
)
Out[96]:
<ggplot: (8774732277852)>
In [ ]:
# .query('match_weighted_p > .2').query('imp_weighted_p > .01')
In [109]:
(ggplot(aes(x='center_diff', fill='strand_combination'), data=dfp[dfp.center_diff < 150]) + 
 geom_histogram(breaks=np.arange(150)) + 
 theme_bw() + 
 facet_grid("strand_combination ~ .") +
 scale_fill_brewer(type='qual', palette=3)
)
Out[109]:
<ggplot: (-9223363262120541885)>
In [119]:
dfabf_plot.match_weighted_p_x.min()
Out[119]:
0.2000656311529206
In [120]:
dfabf_plot.match_weighted_p_y.min()
Out[120]:
0.2000656311529206
In [121]:
dfabf_plot.imp_weighted_p_x.min()
Out[121]:
0.00018637592023110607
In [122]:
dfabf_plot.imp_weighted_p_y.min()
Out[122]:
0.00018637592023110607
In [125]:
(ggplot(aes(x='center_diff', fill='strand_combination'), data=(dfabf_plot.
                                                               query("motif_pair == 'Nanog<>Nanog'").
                                                               query("score == 'max_profile_count_bt'").
                                                               query("task == 'Nanog'").
                                                               query("center_diff < 150"))) + 
 geom_histogram(breaks=np.arange(150)) + 
 theme_bw() + 
 facet_grid("strand_combination ~ .") +
 scale_fill_brewer(type='qual', palette=3)
)
Out[125]:
<ggplot: (8774733266191)>

TODO

  • [X] scatterplot: avg score ~ distance
  • [ ] scatterplot: avg log(score) ~ distance
In [180]:
dfs = (dfabf_plot.query("motif_pair == 'Oct4-Sox2<>Sox2'").
            query("score == 'max_profile_count_bt'").
            query("task == 'Sox2'").
            query("center_diff < 150"))
fig, axes = plt.subplots(2, 1, figsize=get_figsize(.5, 1/2), sharex=True)
axes[0].hist(dfs.center_diff, bins=np.arange(150))
df = dfs.groupby('center_diff').y_alt_ref_pc.median().reset_index()
axes[1].plot(df.center_diff, 1/df.y_alt_ref_pc);
axes[1].scatter(df.center_diff, 1/df.y_alt_ref_pc, s=2);
fig.subplots_adjust(hspace=0)
In [179]:
dfs = (dfabf_plot.query("motif_pair == 'Oct4-Sox2<>Nanog'").
            query("score == 'max_profile_count_bt'").
            query("task == 'Nanog'").
            query("center_diff < 150"))
fig, axes = plt.subplots(2, 1, figsize=get_figsize(.5, 1/2), sharex=True)
axes[0].hist(dfs.center_diff, bins=np.arange(150))
df = dfs.groupby('center_diff').y_alt_ref_pc.median().reset_index()
axes[1].plot(df.center_diff, 1/df.y_alt_ref_pc);
axes[1].scatter(df.center_diff, 1/df.y_alt_ref_pc, s=2);
fig.subplots_adjust(hspace=0)
In [193]:
dfs = (dfabf_plot.query("motif_pair == 'Nanog<>Nanog'").
            query("score == 'max_profile_count_bt'").
            query("task == 'Nanog'").
            query("center_diff < 150"))
fig, axes = plt.subplots(2, 1, figsize=get_figsize(.5, 1/2), sharex=True)
axes[0].hist(dfs.center_diff, bins=np.arange(150))
df = dfs.groupby('center_diff').y_alt_ref_pc.median().reset_index()
axes[1].plot(df.center_diff, 1/df.y_alt_ref_pc);
axes[1].scatter(df.center_diff, 1/df.y_alt_ref_pc, s=2);
axes[1].axhline(y=1, color='grey', alpha=0.5)
fig.subplots_adjust(hspace=0)
In [249]:
dfs = (dfabf_plot.query("motif_pair == 'Sox2<>Nanog'").
            query("score == 'max_profile_count_bt'").
            query("task == 'Nanog'").
            query("center_diff < 150"))
fig, axes = plt.subplots(2, 1, figsize=get_figsize(1, 1/2), sharex=True)
axes[0].hist(dfs.center_diff, bins=np.arange(150))
df = dfs.groupby('center_diff').y_alt_ref_pc.median().reset_index()
axes[1].plot(df.center_diff, 1/df.y_alt_ref_pc);
axes[1].scatter(df.center_diff, 1/df.y_alt_ref_pc, s=2);
axes[1].axhline(y=1, color='grey', alpha=0.5)
fig.subplots_adjust(hspace=0)

Bleed-through corrected

Open questions:

  • how to add error-bars? ( for median...)
  • remove TE's?
In [244]:
# Y
plt.figure()
h, _, _, _ = plt.hist2d(dfabf_plot.match_weighted_p_x, dfabf_plot.imp_weighted_p_x)
cb = plt.colorbar()
cb.set_label('counts in bin')
plt.xlabel("Match")
plt.ylabel("imp")

# plt.figure()
# # Y
# h, _, _, _ = plt.hist2d(dfabf_plot.match_weighted_p_y, dfabf_plot.imp_weighted_p_y)
# cb = plt.colorbar()
# cb.set_label('counts in bin')
# plt.xlabel("Match")
# plt.ylabel("imp")
Out[244]:
Text(0, 0.5, 'imp')
In [519]:
# Y
plt.figure()
nn = ~dfabf_plot.seq_match_p_x.isna()
h, _, _, _ = plt.hist2d(dfabf_plot.match_weighted_p_x[nn], dfabf_plot.seq_match_p_x[nn])
cb = plt.colorbar()
cb.set_label('counts in bin')
plt.xlabel("Match")
plt.ylabel("imp")

# plt.figure()
# # Y
# h, _, _, _ = plt.hist2d(dfabf_plot.match_weighted_p_y, dfabf_plot.imp_weighted_p_y)
# cb = plt.colorbar()
# cb.set_label('counts in bin')
# plt.xlabel("Match")
# plt.ylabel("imp")
Out[519]:
Text(0, 0.5, 'imp')
In [267]:
values, buckets, _ = plt.hist(np.random.randn(100))
In [281]:
dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median'])
Out[281]:
size median
center_diff
8.0 1 0.4575
9.0 14 0.7890
10.0 39 0.8930
... ... ...
147.0 73 0.9789
148.0 64 0.9799
149.0 59 0.9559

142 rows × 2 columns

In [287]:
df1
Out[287]:
center_diff size median mad
2 10.0 39 0.8930 0.1113
3 11.0 62 0.8319 0.1825
4 12.0 69 0.9561 0.1143
... ... ... ... ...
139 147.0 73 0.9789 0.0339
140 148.0 64 0.9799 0.0216
141 149.0 59 0.9559 0.0302

140 rows × 4 columns

TODO

  • [ ] subset to sites which show a reliable footprint and then run your analysis
  • [ ] visualize Oct4-Sox2 <> Nanog/Sox2 instances in the genome which are 50bp apart
In [501]:
motif_pair = ['Oct4-Sox2', 'Sox2']
motif_pair_name = '<>'.join(motif_pair)
score = 'max_profile_count_bt'
In [502]:
dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
                query(f"score == '{score}'").
                query("center_diff > 6").
                query("center_diff < 150"))
In [503]:
dfs[dfs.task == 'Sox2'].y_ref.plot.hist(bins=100)
Out[503]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fae7915ed30>
In [507]:
np.percentile(dfs[dfs.task == 'Sox2'].y_ref, 90)
Out[507]:
0.25800478800000004
In [465]:
np.sum(dfs[dfs.task == 'Sox2'].y_ref > .25)
Out[465]:
592
In [508]:
dfs[dfs.task == 'Sox2'].x_ref.plot.hist(bins=100)
Out[508]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb00fe7af98>
In [510]:
np.percentile(dfs[dfs.task == 'Sox2'].x_ref, 90)
Out[510]:
0.3526743440000004
In [443]:
np.sum(dfs.x_ref > 1)
Out[443]:
698
In [447]:
np.sum((dfs.x_ref > .5) & (dfs.y_ref > .5))
Out[447]:
859
In [444]:
np.sum(dfs.y_ref > 1)
Out[444]:
441
In [439]:
dfs.head()
Out[439]:
Unnamed: 0 Unnamed: 0.1 example_chrom_x example_end_x example_idx example_interval_from_task_x example_start_x example_strand_x imp/Klf4_x imp/Nanog_x imp/Oct4_x imp/Sox2_x imp_max_x imp_max_task_x imp_weighted_x imp_weighted_cat_x imp_weighted_p_x match/Klf4_x match/Nanog_x match/Oct4_x match/Sox2_x match_max_x match_max_task_x match_weighted_x match_weighted_cat_x match_weighted_p_x pattern_x pattern_center_x pattern_end_x pattern_end_abs_x pattern_len_x pattern_name_x pattern_short_x pattern_start_x pattern_start_abs_x seq_match_x seq_match_cat_x seq_match_p_x strand_x tf_x row_idx_x example_chrom_y example_end_y example_interval_from_task_y example_start_y example_strand_y imp/Klf4_y imp/Nanog_y imp/Oct4_y imp/Sox2_y imp_max_y imp_max_task_y imp_weighted_y imp_weighted_cat_y imp_weighted_p_y match/Klf4_y match/Nanog_y match/Oct4_y match/Sox2_y match_max_y match_max_task_y match_weighted_y match_weighted_cat_y match_weighted_p_y pattern_y pattern_center_y pattern_end_y pattern_end_abs_y pattern_len_y pattern_name_y pattern_short_y pattern_start_y pattern_start_abs_y seq_match_y seq_match_cat_y seq_match_p_y strand_y tf_y row_idx_y center_diff strand_combination motif_pair motif_pair_idx task score x_alt x_ref x_alt_ref x_alt_pc x_ref_pc x_alt_ref_pc y_alt y_ref y_alt_ref y_alt_pc y_ref_pc y_alt_ref_pc center_diff_cat pattern_center_aln_x pattern_strand_aln_x pattern_center_aln_y pattern_strand_aln_y pattern_diff_aln
1484740 61436 6 chr5 2.8211e+07 10 Oct4 2.8210e+07 . NaN NaN 1.7529 NaN 1.7529 Oct4 1.7529 high 0.9973 NaN NaN 0.5800 NaN 0.5800 Oct4 0.5800 high 0.9643 Oct4/metacluster_0/pa... 481.0 489.0 2.8210e+07 16.0 Oct4-Sox2 Oct4/m0_p0 473.0 2.8210e+07 8.8721 medium 0.4155 - Oct4 6 chr5 2.8211e+07 Oct4 2.8210e+07 . NaN NaN NaN 0.3789 0.3789 Sox2 0.3789 medium 0.5832 NaN NaN NaN 0.4990 0.4990 Sox2 0.4990 low 0.2939 Sox2/metacluster_0/pa... 551.0 559.0 2.8210e+07 16.0 Sox2 Sox2/m0_p1 543.0 2.8210e+07 7.1866 low 0.2134 + Sox2 107109 70.0 -+ Oct4-Sox2<>Sox2 61436 Oct4 max_profile_count_bt 0.8474 1.0098 0.8391 0.8827 1.0452 0.8446 0.1881 0.3576 0.5259 0.2275 0.3970 0.5729 (35, 70] 480.0 - 558.0 + 78.0
1484741 61437 7 chr5 2.8211e+07 10 Oct4 2.8210e+07 . NaN NaN 1.3879 NaN 1.3879 Oct4 1.3879 high 0.9823 NaN NaN 0.4648 NaN 0.4648 Oct4 0.4648 medium 0.4248 Oct4/metacluster_0/pa... 507.0 515.0 2.8210e+07 16.0 Oct4-Sox2 Oct4/m0_p0 499.0 2.8210e+07 7.4766 low 0.1803 - Oct4 7 chr5 2.8211e+07 Oct4 2.8210e+07 . NaN NaN NaN 0.3789 0.3789 Sox2 0.3789 medium 0.5832 NaN NaN NaN 0.4990 0.4990 Sox2 0.4990 low 0.2939 Sox2/metacluster_0/pa... 551.0 559.0 2.8210e+07 16.0 Sox2 Sox2/m0_p1 543.0 2.8210e+07 7.1866 low 0.2134 + Sox2 107109 44.0 -+ Oct4-Sox2<>Sox2 61437 Oct4 max_profile_count_bt 0.7806 0.8259 0.9452 0.8160 0.8613 0.9474 0.2393 0.3743 0.6394 0.2787 0.4137 0.6737 (35, 70] 506.0 - 558.0 + 52.0
1484743 61439 10 chr14 8.6397e+07 28 Oct4 8.6396e+07 . NaN NaN 0.4323 NaN 0.4323 Oct4 0.4323 medium 0.3711 NaN NaN 0.4789 NaN 0.4789 Oct4 0.4789 medium 0.5101 Oct4/metacluster_0/pa... 433.0 441.0 8.6396e+07 16.0 Oct4-Sox2 Oct4/m0_p0 425.0 8.6396e+07 8.4841 medium 0.3326 - Oct4 10 chr14 8.6397e+07 Oct4 8.6396e+07 . NaN NaN NaN 0.2097 0.2097 Sox2 0.2097 low 0.2024 NaN NaN NaN 0.5218 0.5218 Sox2 0.5218 medium 0.3908 Sox2/metacluster_0/pa... 567.0 575.0 8.6396e+07 16.0 Sox2 Sox2/m0_p1 559.0 8.6396e+07 3.1809 low 0.0160 - Sox2 107112 134.0 -- Oct4-Sox2<>Sox2 61439 Oct4 max_profile_count_bt 0.1837 0.2576 0.7132 0.2191 0.2929 0.7478 0.1815 0.2363 0.7684 0.2209 0.2756 0.8015 (70, 150] 432.0 - 560.0 - 128.0
1484744 61440 11 chr14 8.6397e+07 28 Oct4 8.6396e+07 . NaN NaN 1.0223 NaN 1.0223 Oct4 1.0223 high 0.8950 NaN NaN 0.4931 NaN 0.4931 Oct4 0.4931 medium 0.5949 Oct4/metacluster_0/pa... 463.0 471.0 8.6396e+07 16.0 Oct4-Sox2 Oct4/m0_p0 455.0 8.6396e+07 10.5521 high 0.7476 - Oct4 11 chr14 8.6397e+07 Oct4 8.6396e+07 . NaN NaN NaN 0.2097 0.2097 Sox2 0.2097 low 0.2024 NaN NaN NaN 0.5218 0.5218 Sox2 0.5218 medium 0.3908 Sox2/metacluster_0/pa... 567.0 575.0 8.6396e+07 16.0 Sox2 Sox2/m0_p1 559.0 8.6396e+07 3.1809 low 0.0160 - Sox2 107112 104.0 -- Oct4-Sox2<>Sox2 61440 Oct4 max_profile_count_bt 0.7108 0.7512 0.9462 0.7462 0.7865 0.9487 0.1032 0.2063 0.5004 0.1426 0.2457 0.5805 (70, 150] 462.0 - 560.0 - 98.0
1484745 61441 12 chr14 8.6397e+07 28 Oct4 8.6396e+07 . NaN NaN 1.3324 NaN 1.3324 Oct4 1.3324 high 0.9751 NaN NaN 0.4873 NaN 0.4873 Oct4 0.4873 medium 0.5576 Oct4/metacluster_0/pa... 506.0 514.0 8.6396e+07 16.0 Oct4-Sox2 Oct4/m0_p0 498.0 8.6396e+07 6.6522 low 0.0915 - Oct4 12 chr14 8.6397e+07 Oct4 8.6396e+07 . NaN NaN NaN 0.2097 0.2097 Sox2 0.2097 low 0.2024 NaN NaN NaN 0.5218 0.5218 Sox2 0.5218 medium 0.3908 Sox2/metacluster_0/pa... 567.0 575.0 8.6396e+07 16.0 Sox2 Sox2/m0_p1 559.0 8.6396e+07 3.1809 low 0.0160 - Sox2 107112 61.0 -- Oct4-Sox2<>Sox2 61441 Oct4 max_profile_count_bt 1.0052 0.9887 1.0167 1.0405 1.0240 1.0161 0.1268 0.1670 0.7593 0.1662 0.2064 0.8052 (35, 70] 505.0 - 560.0 - 55.0
In [436]:
dfabf_plot.head()
Out[436]:
Unnamed: 0 Unnamed: 0.1 example_chrom_x example_end_x example_idx example_interval_from_task_x example_start_x example_strand_x imp/Klf4_x imp/Nanog_x imp/Oct4_x imp/Sox2_x imp_max_x imp_max_task_x imp_weighted_x imp_weighted_cat_x imp_weighted_p_x match/Klf4_x match/Nanog_x match/Oct4_x match/Sox2_x match_max_x match_max_task_x match_weighted_x match_weighted_cat_x match_weighted_p_x pattern_x pattern_center_x pattern_end_x pattern_end_abs_x pattern_len_x pattern_name_x pattern_short_x pattern_start_x pattern_start_abs_x seq_match_x seq_match_cat_x seq_match_p_x strand_x tf_x row_idx_x example_chrom_y example_end_y example_interval_from_task_y example_start_y example_strand_y imp/Klf4_y imp/Nanog_y imp/Oct4_y imp/Sox2_y imp_max_y imp_max_task_y imp_weighted_y imp_weighted_cat_y imp_weighted_p_y match/Klf4_y match/Nanog_y match/Oct4_y match/Sox2_y match_max_y match_max_task_y match_weighted_y match_weighted_cat_y match_weighted_p_y pattern_y pattern_center_y pattern_end_y pattern_end_abs_y pattern_len_y pattern_name_y pattern_short_y pattern_start_y pattern_start_abs_y seq_match_y seq_match_cat_y seq_match_p_y strand_y tf_y row_idx_y center_diff strand_combination motif_pair motif_pair_idx task score x_alt x_ref x_alt_ref x_alt_pc x_ref_pc x_alt_ref_pc y_alt y_ref y_alt_ref y_alt_pc y_ref_pc y_alt_ref_pc center_diff_cat pattern_center_aln_x pattern_strand_aln_x pattern_center_aln_y pattern_strand_aln_y pattern_diff_aln
0 0 1 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.992 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 1 21.0 ++ Oct4-Sox2<>Oct4-Sox2 0 Oct4 profile_count 2.4636 16.0985 0.1530 2.4963 16.1312 0.1547 10.9299 23.2048 0.4710 10.9592 23.2341 0.4717 (0, 35] 429.0 - 450.0 - 21.0
1 1 2 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.2758 NaN 1.2758 Oct4 1.2758 high 0.967 NaN NaN 0.4219 NaN 0.4219 Oct4 0.4219 low 0.2234 Oct4/metacluster_0/pa... 492.0 500.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 484.0 1.2215e+08 10.7122 high 0.7777 - Oct4 2 62.0 ++ Oct4-Sox2<>Oct4-Sox2 1 Oct4 profile_count 5.1420 16.0985 0.3194 5.1746 16.1312 0.3208 14.5598 20.8518 0.6983 14.5891 20.8811 0.6987 (35, 70] 429.0 - 491.0 - 62.0
2 2 3 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 0 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.0455 NaN 1.0455 Oct4 1.0455 high 0.905 NaN NaN 0.4155 NaN 0.4155 Oct4 0.4155 low 0.2005 Oct4/metacluster_0/pa... 534.0 542.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 526.0 1.2215e+08 10.2722 high 0.6902 - Oct4 3 104.0 ++ Oct4-Sox2<>Oct4-Sox2 2 Oct4 profile_count 5.2543 16.0985 0.3264 5.2870 16.1312 0.3278 11.1567 14.7501 0.7564 11.1859 14.7794 0.7569 (70, 150] 429.0 - 533.0 - 104.0
3 3 6 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.9920 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 1 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.2758 NaN 1.2758 Oct4 1.2758 high 0.967 NaN NaN 0.4219 NaN 0.4219 Oct4 0.4219 low 0.2234 Oct4/metacluster_0/pa... 492.0 500.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 484.0 1.2215e+08 10.7122 high 0.7777 - Oct4 2 41.0 ++ Oct4-Sox2<>Oct4-Sox2 3 Oct4 profile_count 6.9377 23.2048 0.2990 6.9704 23.2375 0.3000 8.8887 20.8518 0.4263 8.9179 20.8811 0.4271 (35, 70] 450.0 - 491.0 - 41.0
4 4 7 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.9920 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 1 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.0455 NaN 1.0455 Oct4 1.0455 high 0.905 NaN NaN 0.4155 NaN 0.4155 Oct4 0.4155 low 0.2005 Oct4/metacluster_0/pa... 534.0 542.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 526.0 1.2215e+08 10.2722 high 0.6902 - Oct4 3 83.0 ++ Oct4-Sox2<>Oct4-Sox2 4 Oct4 profile_count 6.8628 23.2048 0.2957 6.8954 23.2375 0.2967 6.8670 14.7501 0.4656 6.8963 14.7794 0.4666 (70, 150] 450.0 - 533.0 - 83.0
In [454]:
df1['median'].shape
Out[454]:
(6,)
In [456]:
len(dfs)
Out[456]:
859
In [455]:
len(df1)
Out[455]:
6
In [458]:
dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
Out[458]:
center_diff size median mad
0 11.0 1 0.1575 0.0000
1 18.0 2 0.4648 0.0174
2 25.0 2 0.5192 0.0413
3 26.0 1 0.4054 0.0000
4 29.0 1 0.5173 0.0000
5 67.0 1 0.7914 0.0000
In [459]:
dfs.task.value_counts()
Out[459]:
Nanog    829
Sox2      22
Oct4       8
Name: task, dtype: int64
In [458]:
dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
Out[458]:
center_diff size median mad
0 11.0 1 0.1575 0.0000
1 18.0 2 0.4648 0.0174
2 25.0 2 0.5192 0.0413
3 26.0 1 0.4054 0.0000
4 29.0 1 0.5173 0.0000
5 67.0 1 0.7914 0.0000
In [113]:
fig, axes = plt.subplots(2, 1, figsize=get_figsize(.5, 1), sharex=True)
for j, score in enumerate(['max_profile_count_bt']):
    motif_pair_name = '<>'.join(motif_pair)
    dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
                query(f"score == '{score}'").
                query("center_diff > 6").
                query("center_diff < 150"))
    dfs = dfs[(dfs.x_ref > .25) & (dfs.y_ref > .25)]
    # dfs = dfs[(dfs.x_ref > .25)]
    if j == 0:
        values, buckets, _ = axes[0].hist(dfs.center_diff, bins=np.arange(150))
    t1 = profile_mapping[motif_pair[0]]
    t2 = profile_mapping[motif_pair[1]]
    df1 = dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
    #df1 = df1[df1['size'] > 20]
    df2 = dfs[dfs.task == t2].groupby('center_diff').y_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
    #df2 = df2[df2['size'] > 20]
    # Plot #1
    axes[j+1].plot(df1.center_diff, 1/df1['median'], label=motif_pair[0]);
    #         axes[j+1].fill_between(df1.center_diff, 
    #                                1/(df1['median']+df1['mad']/np.sqrt(df1['size'])), 
    #                                1/(df1['median']-df1['mad']/np.sqrt(df1['size'])), alpha=0.1);
    axes[j+1].scatter(df1.center_diff, 1/df1['median'], s=2);

    # Plot #2
    axes[j+1].plot(df2.center_diff, 1/df2['median'], label=motif_pair[1]);
    #         axes[j+1].fill_between(df2.center_diff, 
    #                                1/(df2['median']+df2['mad']/np.sqrt(df2['size'])), 
    #                                1/(df2['median']-df2['mad']/np.sqrt(df2['size'])), alpha=0.1);
    axes[j+1].scatter(df2.center_diff, 1/df2['median'], s=2);
    axes[j+1].set_ylim([.8, 4])
    axes[j+1].axhline(y=1, color='grey', alpha=0.5)
    axes[j+1].set_ylabel(score, rotation=0, ha='right')
    plt.legend(labels=motif_pair)
    fig.subplots_adjust(hspace=0)
In [483]:
np.percentile([1, 2,3 , 5], 40)
Out[483]:
2.2
In [114]:
motif_pair = ['Oct4-Sox2', 'Oct4']
for motif_pair in pairs:
    fig, axes = plt.subplots(3, 1, figsize=get_figsize(.5, 1), sharex=True)
    for j, score in enumerate(['max_profile_count_bt', 'max_profile_count']):
        motif_pair_name = '<>'.join(motif_pair)
        dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
                    query(f"score == '{score}'").
                    query("center_diff > 6").
                    query("center_diff < 150"))
        if j == 0:
            values, buckets, _ = axes[0].hist(dfs.center_diff, bins=np.arange(150))
        t1 = profile_mapping[motif_pair[0]]
        x_ref_threshold = np.percentile(dfs[dfs.task == t1].x_ref, 80)
        t2 = profile_mapping[motif_pair[1]]
        y_ref_threshold = np.percentile(dfs[dfs.task == t2].y_ref, 80)
        df1 = dfs[(dfs.task == t1) & (dfs.x_ref > x_ref_threshold)].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
        # df1 = df1[df1['size'] > 10]
        df2 = dfs[(dfs.task == t2) & (dfs.y_ref > y_ref_threshold)].groupby('center_diff').y_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
        # df2 = df2[df2['size'] > 10]
        # Plot #1
        axes[j+1].plot(df1.center_diff, 1/df1['median'], label=motif_pair[0]);
#         axes[j+1].fill_between(df1.center_diff, 
#                                1/(df1['median']+df1['mad']/np.sqrt(df1['size'])), 
#                                1/(df1['median']-df1['mad']/np.sqrt(df1['size'])), alpha=0.1);
        axes[j+1].scatter(df1.center_diff, 1/df1['median'], s=2);
        
        # Plot #2
        axes[j+1].plot(df2.center_diff, 1/df2['median'], label=motif_pair[1]);
#         axes[j+1].fill_between(df2.center_diff, 
#                                1/(df2['median']+df2['mad']/np.sqrt(df2['size'])), 
#                                1/(df2['median']-df2['mad']/np.sqrt(df2['size'])), alpha=0.1);
        axes[j+1].scatter(df2.center_diff, 1/df2['median'], s=2);
        axes[j+1].set_ylim([0, 3])
        axes[j+1].axhline(y=1, color='grey', alpha=0.5)
        axes[j+1].set_ylabel(score, rotation=0, ha='right')
        plt.legend(labels=motif_pair)
        fig.subplots_adjust(hspace=0)
In [312]:
motif_pair = ['Oct4-Sox2', 'Oct4']
for motif_pair in pairs:
    fig, axes = plt.subplots(3, 1, figsize=get_figsize(.5, 1), sharex=True)
    for j, score in enumerate(['max_profile_count_bt', 'max_profile_count']):
        motif_pair_name = '<>'.join(motif_pair)
        dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
                    query(f"score == '{score}'").
                    query("center_diff > 6").
                    query("center_diff < 150"))
        if j == 0:
            values, buckets, _ = axes[0].hist(dfs.center_diff, bins=np.arange(150))
        t1 = profile_mapping[motif_pair[0]]
        t2 = profile_mapping[motif_pair[1]]
        df1 = dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
        df1 = df1[df1['size'] > 20]
        df2 = dfs[dfs.task == t2].groupby('center_diff').y_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
        df2 = df2[df2['size'] > 20]
        # Plot #1
        axes[j+1].plot(df1.center_diff, 1/df1['median'], label=motif_pair[0]);
#         axes[j+1].fill_between(df1.center_diff, 
#                                1/(df1['median']+df1['mad']/np.sqrt(df1['size'])), 
#                                1/(df1['median']-df1['mad']/np.sqrt(df1['size'])), alpha=0.1);
        axes[j+1].scatter(df1.center_diff, 1/df1['median'], s=2);
        
        # Plot #2
        axes[j+1].plot(df2.center_diff, 1/df2['median'], label=motif_pair[1]);
#         axes[j+1].fill_between(df2.center_diff, 
#                                1/(df2['median']+df2['mad']/np.sqrt(df2['size'])), 
#                                1/(df2['median']-df2['mad']/np.sqrt(df2['size'])), alpha=0.1);
        axes[j+1].scatter(df2.center_diff, 1/df2['median'], s=2);
        axes[j+1].set_ylim([0, 2])
        axes[j+1].axhline(y=1, color='grey', alpha=0.5)
        axes[j+1].set_ylabel(score, rotation=0, ha='right')
        plt.legend(labels=motif_pair)
        fig.subplots_adjust(hspace=0)

Make a nice plot

In [ ]:
motif_pair = ['Oct4-Sox2', 'Oct4']
for motif_pair in pairs:
    fig, axes = plt.subplots(3, 1, figsize=get_figsize(.5, 1), sharex=True)
    for j, score in enumerate(['max_profile_count_bt', 'max_profile_count']):
        motif_pair_name = '<>'.join(motif_pair)
        dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
                    query(f"score == '{score}'").
                    query("center_diff > 6").
                    query("center_diff < 150"))
        if j == 0:
            values, buckets, _ = axes[0].hist(dfs.center_diff, bins=np.arange(150))
        t1 = profile_mapping[motif_pair[0]]
        t2 = profile_mapping[motif_pair[1]]
        df1 = dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
        df1 = df1[df1['size'] > 20]
        df2 = dfs[dfs.task == t2].groupby('center_diff').y_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
        df2 = df2[df2['size'] > 20]
        # Plot #1
        axes[j+1].plot(df1.center_diff, 1/df1['median'], label=motif_pair[0]);
#         axes[j+1].fill_between(df1.center_diff, 
#                                1/(df1['median']+df1['mad']/np.sqrt(df1['size'])), 
#                                1/(df1['median']-df1['mad']/np.sqrt(df1['size'])), alpha=0.1);
        axes[j+1].scatter(df1.center_diff, 1/df1['median'], s=2);
        
        # Plot #2
        axes[j+1].plot(df2.center_diff, 1/df2['median'], label=motif_pair[1]);
#         axes[j+1].fill_between(df2.center_diff, 
#                                1/(df2['median']+df2['mad']/np.sqrt(df2['size'])), 
#                                1/(df2['median']-df2['mad']/np.sqrt(df2['size'])), alpha=0.1);
        axes[j+1].scatter(df2.center_diff, 1/df2['median'], s=2);
        axes[j+1].set_ylim([0, 2])
        axes[j+1].axhline(y=1, color='grey', alpha=0.5)
        axes[j+1].set_ylabel(score, rotation=0, ha='right')
        plt.legend(labels=motif_pair)
        fig.subplots_adjust(hspace=0)
In [117]:
dfabf_plot.strand_combination
Out[117]:
0          ++
1          ++
2          ++
           ..
9750325    -+
9750326    -+
9750327    ++
Name: strand_combination, Length: 9728040, dtype: object
In [69]:
from basepair.exp.paper.config import tf_colors
In [81]:
# for strand_combination in ['++', '-+', '+-', '--']:
fig, axes = plt.subplots(3, 1, figsize=get_figsize(.25, 2), 
                         sharex=True, 
                         sharey=True,
                         gridspec_kw=dict(wspace=0, hspace=0))
# motif_pair = ['Oct4-Sox2', 'Sox2']
for i, motif_pair in enumerate([['Nanog', 'Nanog'], 
                                ['Sox2', 'Nanog'],
                                ['Sox2', 'Sox2']]):
    motif_pair_name = '<>'.join(motif_pair)
    score = 'max_profile_count_bt'
    dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
                query(f"score == '{score}'").
                query("center_diff > 6").
                # query(f"strand_combination == '{strand_combination}'").
                query("center_diff < 150"))
    t1 = profile_mapping[motif_pair[0]]
    t2 = profile_mapping[motif_pair[1]]
    df1 = dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
    df1 = df1[df1['size'] > 20]
    df2 = dfs[dfs.task == t2].groupby('center_diff').y_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
    df2 = df2[df2['size'] > 20]

    ax = axes[i]
    ax.plot(df1.center_diff, 1/df1['median'], 
            label=motif_pair[0],
            color=tf_colors[profile_mapping[motif_pair[0]]]);
    ax.plot(df2.center_diff, 1/df2['median'], 
            label=motif_pair[1],
            color=tf_colors[profile_mapping[motif_pair[1]]]);

    if i == 0:
        ax.set_title(f"Short-range interactions")
    ax.axhline(1, color='grey', alpha=0.5)
    if i == 1:
        ax.set_ylabel("Max profile value")
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.set_ylim([0.8, 2.5])
    ax.spines['right'].set_visible(False)
axes[-1].set_xlabel("Motif distance")
axes[-1].set_xticks([10, 50, 100, 150]);
fig.savefig(figures_dir / f'short-range.Nanog<>Sox2.strand_combination=all.pdf')
In [82]:
fig, axes = plt.subplots(3, 1, figsize=get_figsize(.25, 2), 
                         sharex=True, 
                         sharey=True,
                         gridspec_kw=dict(wspace=0, hspace=0))
# motif_pair = ['Oct4-Sox2', 'Sox2']
for i, motif_pair in enumerate([['Oct4-Sox2', 'Sox2'], 
                                ['Oct4-Sox2', 'Nanog'],
                                ['Oct4-Sox2', 'Klf4']]):
    motif_pair_name = '<>'.join(motif_pair)
    score = 'max_profile_count_bt'
    dfs = (dfabf_plot.query(f"motif_pair == '{motif_pair_name}'").
                query(f"score == '{score}'").
                query("center_diff > 6").
                # query("strand_combination == '++'").
                query("center_diff < 150"))
    t1 = profile_mapping[motif_pair[0]]
    t2 = profile_mapping[motif_pair[1]]
    df1 = dfs[dfs.task == t1].groupby('center_diff').x_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
    df1 = df1[df1['size'] > 20]
    df2 = dfs[dfs.task == t2].groupby('center_diff').y_alt_ref_pc.agg(['size', 'median', 'mad']).reset_index()
    df2 = df2[df2['size'] > 20]
    
    ax = axes[i]
    ax.plot(df1.center_diff, 1/df1['median'], 
            label=motif_pair[0],
            color=tf_colors[profile_mapping[motif_pair[0]]]);
    ax.plot(df2.center_diff, 1/df2['median'], 
            label=motif_pair[1],
            color=tf_colors[profile_mapping[motif_pair[1]]]);
    
    if i == 0:
        ax.set_title("Short-range interactions")
    ax.axhline(1, color='grey', alpha=0.5)
    if i == 1:
        ax.set_ylabel("Max profile value")
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.set_ylim([0.8, 2.5])
    ax.spines['right'].set_visible(False)
axes[-1].set_xlabel("Motif distance")
axes[-1].set_xticks([10, 50, 100, 150]);
fig.savefig(figures_dir / 'long-range.Oct4-Sox2.strand_combination=all.pdf')

Directness score

In [243]:
dfp = pd.read_csv("http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/pattern_table.csv")
In [244]:
dfp.head()
Out[244]:
i pattern cluster n seqlets Klf4/d_p Nanog/d_p Oct4/d_p Sox2/d_p Klf4 footprint counts Klf4 footprint entropydiff Klf4 footprint max Klf4 footprint strandcor Klf4 imp counts Klf4 imp profile Klf4 periodicity 10bp Klf4 pos meandiff Klf4 pos std Klf4 pos unimodal Klf4 region counts Klf4/d Nanog footprint counts Nanog footprint entropydiff Nanog footprint max Nanog footprint strandcor Nanog imp counts Nanog imp profile Nanog periodicity 10bp Nanog pos meandiff Nanog pos std Nanog pos unimodal Nanog region counts Nanog/d Oct4 footprint counts Oct4 footprint entropydiff Oct4 footprint max Oct4 footprint strandcor Oct4 imp counts Oct4 imp profile Oct4 periodicity 10bp Oct4 pos meandiff Oct4 pos std Oct4 pos unimodal Oct4 region counts Oct4/d Sox2 footprint counts Sox2 footprint entropydiff Sox2 footprint max Sox2 footprint strandcor Sox2 imp counts Sox2 imp profile Sox2 periodicity 10bp Sox2 pos meandiff Sox2 pos std Sox2 pos unimodal Sox2 region counts Sox2/d consensus ic pwm mean idx
0 0 m1_p7 8 157.0 95.0820 11.4754 0.8197 0.8197 183.4872 0.2647 1.5417 0.0059 0.0168 0.0285 0.0077 2.0164 95.7851 True 387.5287 0.0401 133.1731 0.1340 0.8974 0.0051 -0.0103 -9.0438e-04 0.0009 1.8610 185.0268 True 367.1019 0.0085 149.9744 0.1425 0.9423 0.0050 -4.8469e-04 -8.0106e-04 0.0019 1.9562 127.6654 True 422.9363 -0.0011 60.1218 0.2124 0.5737 0.0048 -0.0016 -2.6529e-03 0.0016 1.8105 121.0656 True 199.4968 -0.0038 GCCCCGCCCCACCCT 1.3623 37
1 1 m1_p8 8 162.0 90.9836 13.1148 8.1967 4.9180 154.7716 0.2412 1.2716 0.0059 0.0106 0.0198 0.0572 2.0037 61.0182 True 302.7778 0.0291 74.3827 0.1929 0.5123 0.0049 -0.0038 2.4813e-03 0.0169 2.3406 104.3285 True 212.5494 0.0088 85.1667 0.1113 0.4969 0.0049 5.1559e-04 1.1783e-03 0.0248 2.2167 96.4021 True 276.4321 0.0018 37.7284 0.1553 0.2593 0.0049 0.0005 -4.5393e-05 0.0183 2.0258 93.9946 True 146.6173 -0.0006 GGGAAGGGTGGGGCAGTAGA 0.9815 38
2 2 m6_p2 8 355.0 80.3279 24.5902 40.9836 27.8689 220.2955 0.2445 1.9148 0.0061 0.0053 0.0123 0.0073 2.0044 87.2336 True 421.7324 0.0193 288.6136 0.1758 1.9446 0.0052 0.0027 9.0652e-03 0.0038 1.9253 75.9526 True 537.9211 0.0154 167.0597 0.1038 1.1847 0.0050 2.4447e-03 3.4294e-03 0.0055 1.9122 139.0525 True 430.0113 0.0044 68.9744 0.1055 0.4290 0.0051 0.0027 3.2214e-03 0.0027 1.7937 138.7371 True 202.9465 0.0037 CAGAGTGCTGGGATTAAAGGC... 0.3143 109
3 3 m1_p0 8 12735.0 100.0000 25.4098 4.9180 3.2787 182.8839 0.2046 1.5890 0.0061 0.0263 0.0500 0.0159 1.9985 70.1436 True 355.6721 0.0736 115.8733 0.0249 0.5408 0.0051 -0.0140 7.4599e-04 0.0065 2.0307 146.5852 True 307.2621 0.0155 102.3714 0.0258 0.5333 0.0050 -8.0866e-04 9.4931e-05 0.0132 2.0174 155.3548 True 339.2477 0.0010 48.2119 0.0220 0.2416 0.0050 -0.0007 -1.2597e-03 0.0053 2.0489 165.0139 True 173.3736 -0.0018 TGGGTGTGGC 1.2984 30
4 4 m1_p1 8 2064.0 95.9016 16.3934 2.4590 2.4590 164.3267 0.1382 1.0644 0.0056 0.0204 0.0303 0.0954 1.9978 59.0731 True 349.1899 0.0402 38.5148 0.0186 0.1699 0.0050 -0.0103 4.1849e-06 0.1099 2.0319 154.9271 True 139.5499 0.0103 77.7647 0.0188 0.3607 0.0050 -9.5728e-06 -1.6580e-04 0.0898 1.9667 109.5886 True 286.3324 -0.0003 28.1585 0.0213 0.1181 0.0050 -0.0002 -1.5025e-03 0.0829 1.9582 172.8593 True 135.0795 -0.0028 GGCCCCGCCCT 1.1518 31
In [245]:
dfp = dfp[dfp.pattern.isin(list(motifs.values()))]
m_hash = {v:k for k,v in motifs.items()}
dfp['pattern_name'] = dfp.pattern.map(m_hash)
dfp = dfp.set_index('pattern_name')
dfp = dfp.loc[list(motifs)]
In [246]:
dfp_max = dfp[[c for c in dfp.columns if c.endswith(' footprint max')]]
dfp_max.columns = [c.split(" ")[0] for c in dfp_max.columns]
dfp_max = dfp_max[tasks]
dfp_max
Out[246]:
Oct4 Sox2 Nanog Klf4
pattern_name
Oct4-Sox2 5.1700 1.9023 1.4743 0.5510
Sox2 1.0024 3.0287 4.7277 1.2223
Nanog 0.7602 0.2945 2.9368 0.4001
Klf4 0.5333 0.2416 0.5408 1.5890
In [247]:
dfp_counts = dfp[[c for c in dfp.columns if c.endswith(' footprint counts')]]
dfp_counts.columns = [c.split(" ")[0] for c in dfp_max.columns]
dfp_counts = dfp_counts[tasks]
dfp_counts
Out[247]:
Oct4 Sox2 Nanog Klf4
pattern_name
Oct4-Sox2 116.8492 289.9074 401.0843 135.1212
Sox2 200.3641 516.2857 216.5016 188.5086
Nanog 96.6386 274.3910 154.1314 63.4457
Klf4 182.8839 115.8733 102.3714 48.2119
In [250]:
fig, axes = plt.subplots(1, 2, figsize=get_figsize(1, .35), sharey=True)
ax = axes[0]
sns.heatmap(dfp_max, cmap='Reds', ax=ax)
ax.set_ylabel("Motif")
ax.set_xlabel("Task");
ax.set_title("Avg. max profile counts");
ax = axes[1]
sns.heatmap(dfp_counts, cmap='Reds', ax=ax)
ax.set_ylabel("")
ax.set_xlabel("Task");
ax.set_title("Avg. total profile counts");
plt.savefig(figures_dir / 'heatmap.total-profile-counts.pdf')

Debug the pioneering activity

In [337]:
dfs = (dfabf_plot.query("motif_pair == 'Oct4-Sox2<>Sox2'").
       query("center_diff == 50").
       query("score == 'max_profile_count_bt'").
       query("task == 'Sox2'"))
In [320]:
(1 / dfs[dfs.task == 'Sox2'].y_alt_ref_pc).hist()
Out[320]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb0174cfbe0>
In [338]:
1 / dfs[dfs.task == 'Sox2'].y_alt_ref_pc < 1.2
Out[338]:
1580040    False
1580214    False
1580280    False
           ...  
1592859     True
1593120     True
1593129    False
Name: y_alt_ref_pc, Length: 50, dtype: bool
In [339]:
dfs = dfs[1 / dfs.y_alt_ref_pc < 1.2]
In [346]:
ls {model_dir}
activity/                                     log/
config.gin                                    model.h5
config.gin.json                               note_params.json
deeplift/                                     null.deeplift.imp_score.h5
deeplift.imp_score.h5                         perturbation-analysis/
evaluation.valid.json                         preds.test.pkl
events.out.tfevents.1552394698.sh-112-11.int  seq_model.pkl
history.csv                                   train.smk-benchmark.tsv
imp_score_deeplift.smk-benchmark.tsv          wandb.json
In [347]:
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5', default_imp_score='profile/wn')
In [ ]:
isf.get_profiles()

Smallest effect

  • not a good Oct4-Sox2 motif
In [359]:
profiles = isf.get_profiles(idx=100281)
In [360]:
imp_scores = isf.get_contrib(idx=100281)
In [354]:
plot_tracks(profiles);
In [356]:
plot_tracks(filter_tracks(imp_scores, [250, 600]));
In [357]:
plot_tracks(filter_tracks(imp_scores, [350, 500]));
In [369]:
create_tf_session(2)
Out[369]:
<tensorflow.python.client.session.Session at 0x7fb0173b2c18>
In [371]:
from basepair.BPNet import BPNetSeqModel
In [372]:
bpnet = BPNetSeqModel.from_mdir(model_dir)

2nd smallest effect

  • issue: many other motifs nearby
In [358]:
dict(dfs.sort_values('y_alt_ref_pc').iloc[-2])
Out[358]:
{'Unnamed: 0': 74081,
 'Unnamed: 0.1': 35478,
 'example_chrom_x': 'chr7',
 'example_end_x': 49679157.0,
 'example_idx': 100281,
 'example_interval_from_task_x': 'Klf4',
 'example_start_x': 49678157.0,
 'example_strand_x': '.',
 'imp/Klf4_x': nan,
 'imp/Nanog_x': nan,
 'imp/Oct4_x': 0.2470316290855408,
 'imp/Sox2_x': nan,
 'imp_max_x': 0.2470316290855408,
 'imp_max_task_x': 'Oct4',
 'imp_weighted_x': 0.2470316290855408,
 'imp_weighted_cat_x': 'low',
 'imp_weighted_p_x': 0.17267729009411986,
 'match/Klf4_x': nan,
 'match/Nanog_x': nan,
 'match/Oct4_x': 0.5360119539089431,
 'match/Sox2_x': nan,
 'match_max_x': 0.5360119539089431,
 'match_max_task_x': 'Oct4',
 'match_weighted_x': 0.5360119539089431,
 'match_weighted_cat_x': 'high',
 'match_weighted_p_x': 0.8333799273133911,
 'pattern_x': 'Oct4/metacluster_0/pattern_0',
 'pattern_center_x': 539.0,
 'pattern_end_x': 547.0,
 'pattern_end_abs_x': 49678704.0,
 'pattern_len_x': 16.0,
 'pattern_name_x': 'Oct4-Sox2',
 'pattern_short_x': 'Oct4/m0_p0',
 'pattern_start_x': 531.0,
 'pattern_start_abs_x': 49678688.0,
 'seq_match_x': 9.09128254391644,
 'seq_match_cat_x': 'medium',
 'seq_match_p_x': 0.4510297269592768,
 'strand_x': '-',
 'tf_x': 'Oct4',
 'row_idx_x': 33299,
 'example_chrom_y': 'chr7',
 'example_end_y': 49679157.0,
 'example_interval_from_task_y': 'Klf4',
 'example_start_y': 49678157.0,
 'example_strand_y': '.',
 'imp/Klf4_y': nan,
 'imp/Nanog_y': nan,
 'imp/Oct4_y': nan,
 'imp/Sox2_y': 0.1804771721363068,
 'imp_max_y': 0.1804771721363068,
 'imp_max_task_y': 'Sox2',
 'imp_weighted_y': 0.1804771721363068,
 'imp_weighted_cat_y': 'low',
 'imp_weighted_p_y': 0.10944527736131934,
 'match/Klf4_y': nan,
 'match/Nanog_y': nan,
 'match/Oct4_y': nan,
 'match/Sox2_y': 0.5070820014841294,
 'match_max_y': 0.5070820014841294,
 'match_max_task_y': 'Sox2',
 'match_weighted_y': 0.5070820014841294,
 'match_weighted_cat_y': 'medium',
 'match_weighted_p_y': 0.33183408295852074,
 'pattern_y': 'Sox2/metacluster_0/pattern_1',
 'pattern_center_y': 589.0,
 'pattern_end_y': 597.0,
 'pattern_end_abs_y': 49678754.0,
 'pattern_len_y': 16.0,
 'pattern_name_y': 'Sox2',
 'pattern_short_y': 'Sox2/m0_p1',
 'pattern_start_y': 581.0,
 'pattern_start_abs_y': 49678738.0,
 'seq_match_y': 5.2207650519352455,
 'seq_match_cat_y': 'low',
 'seq_match_p_y': 0.06296851574212893,
 'strand_y': '-',
 'tf_y': 'Sox2',
 'row_idx_y': 133545,
 'center_diff': 50.0,
 'strand_combination': '--',
 'motif_pair': 'Oct4-Sox2<>Sox2',
 'motif_pair_idx': 74081,
 'task': 'Sox2',
 'score': 'max_profile_count_bt',
 'x_alt': 0.14282344,
 'x_ref': 0.20891881,
 'x_alt_ref': 0.6836313,
 'x_alt_pc': 0.16735701,
 'x_ref_pc': 0.23345238,
 'x_alt_ref_pc': 0.7168786,
 'y_alt': 0.05080743,
 'y_ref': 0.046952803,
 'y_alt_ref': 1.0820958999999999,
 'y_alt_pc': 0.10186459,
 'y_ref_pc': 0.09800996,
 'y_alt_ref_pc': 1.0393289,
 'center_diff_cat': Interval(35, 70, closed='right'),
 'pattern_center_aln_x': 538.0,
 'pattern_strand_aln_x': '-',
 'pattern_center_aln_y': 582.0,
 'pattern_strand_aln_y': '-',
 'pattern_diff_aln': 44.0}
In [359]:
idx = 100281
profiles = isf.get_profiles(idx=idx)
imp_scores = isf.get_contrib(idx=idx)
seq = isf.get_seq(idx=idx)
In [387]:
preds = {k:v[0] for k,v in bpnet.predict(seq[np.newaxis]).items()}
In [390]:
plot_tracks(preds);
plt.xlim([0, 1000]);
In [389]:
plot_tracks(profiles);
plt.xlim([0, 1000]);
Out[389]:
(0, 1000)
In [364]:
plot_tracks(filter_tracks(imp_scores, None));
In [367]:
plot_tracks(filter_tracks(imp_scores, [150, 350]));
In [393]:
plot_tracks(filter_tracks(imp_scores, [450, 680]));

3rd smallest effect

  • crappy Oct4-Sox2 match
In [394]:
dict(dfs.sort_values('y_alt_ref_pc').iloc[-3])
Out[394]:
{'Unnamed: 0': 76535,
 'Unnamed: 0.1': 42906,
 'example_chrom_x': 'chr6',
 'example_end_x': 144340321.0,
 'example_idx': 142628,
 'example_interval_from_task_x': 'Klf4',
 'example_start_x': 144339321.0,
 'example_strand_x': '.',
 'imp/Klf4_x': nan,
 'imp/Nanog_x': nan,
 'imp/Oct4_x': 0.03695483878254889,
 'imp/Sox2_x': nan,
 'imp_max_x': 0.03695483878254889,
 'imp_max_task_x': 'Oct4',
 'imp_weighted_x': 0.03695483878254889,
 'imp_weighted_cat_x': 'low',
 'imp_weighted_p_x': 0.00018637592023110607,
 'match/Klf4_x': nan,
 'match/Nanog_x': nan,
 'match/Oct4_x': 0.4164715891127139,
 'match/Sox2_x': nan,
 'match_max_x': 0.4164715891127139,
 'match_max_task_x': 'Oct4',
 'match_weighted_x': 0.4164715891127139,
 'match_weighted_cat_x': 'low',
 'match_weighted_p_x': 0.20408163265306128,
 'pattern_x': 'Oct4/metacluster_0/pattern_0',
 'pattern_center_x': 789.0,
 'pattern_end_x': 797.0,
 'pattern_end_abs_x': 144340118.0,
 'pattern_len_x': 16.0,
 'pattern_name_x': 'Oct4-Sox2',
 'pattern_short_x': 'Oct4/m0_p0',
 'pattern_start_x': 781.0,
 'pattern_start_abs_x': 144340102.0,
 'seq_match_x': 5.896903681088381,
 'seq_match_cat_x': 'low',
 'seq_match_p_x': 0.04445065697511882,
 'strand_x': '-',
 'tf_x': 'Oct4',
 'row_idx_x': 40299,
 'example_chrom_y': 'chr6',
 'example_end_y': 144340321.0,
 'example_interval_from_task_y': 'Klf4',
 'example_start_y': 144339321.0,
 'example_strand_y': '.',
 'imp/Klf4_y': nan,
 'imp/Nanog_y': nan,
 'imp/Oct4_y': nan,
 'imp/Sox2_y': 0.08268684148788452,
 'imp_max_y': 0.08268684148788452,
 'imp_max_task_y': 'Sox2',
 'imp_weighted_y': 0.08268684148788452,
 'imp_weighted_cat_y': 'low',
 'imp_weighted_p_y': 0.0009995002498750622,
 'match/Klf4_y': nan,
 'match/Nanog_y': nan,
 'match/Oct4_y': nan,
 'match/Sox2_y': 0.4926420428226211,
 'match_max_y': 0.4926420428226211,
 'match_max_task_y': 'Sox2',
 'match_weighted_y': 0.4926420428226211,
 'match_weighted_cat_y': 'low',
 'match_weighted_p_y': 0.2748625687156422,
 'pattern_y': 'Sox2/metacluster_0/pattern_1',
 'pattern_center_y': 739.0,
 'pattern_end_y': 747.0,
 'pattern_end_abs_y': 144340068.0,
 'pattern_len_y': 16.0,
 'pattern_name_y': 'Sox2',
 'pattern_short_y': 'Sox2/m0_p1',
 'pattern_start_y': 731.0,
 'pattern_start_abs_y': 144340052.0,
 'seq_match_y': 5.667162795146028,
 'seq_match_cat_y': 'low',
 'seq_match_p_y': 0.09245377311344327,
 'strand_y': '+',
 'tf_y': 'Sox2',
 'row_idx_y': 141147,
 'center_diff': 50.0,
 'strand_combination': '-+',
 'motif_pair': 'Oct4-Sox2<>Sox2',
 'motif_pair_idx': 76535,
 'task': 'Sox2',
 'score': 'max_profile_count_bt',
 'x_alt': 0.011093924,
 'x_ref': 0.0069532990000000005,
 'x_alt_ref': 1.5954906999999998,
 'x_alt_pc': 0.035627488,
 'x_ref_pc': 0.03148686,
 'x_alt_ref_pc': 1.1315033,
 'y_alt': 0.037026532,
 'y_ref': 0.034897726000000004,
 'y_alt_ref': 1.0610013,
 'y_alt_pc': 0.08808368400000001,
 'y_ref_pc': 0.08595488,
 'y_alt_ref_pc': 1.0247666,
 'center_diff_cat': Interval(35, 70, closed='right'),
 'pattern_center_aln_x': 788.0,
 'pattern_strand_aln_x': '-',
 'pattern_center_aln_y': 746.0,
 'pattern_strand_aln_y': '+',
 'pattern_diff_aln': -42.0}
In [395]:
idx = 142628
profiles = isf.get_profiles(idx=idx)
imp_scores = isf.get_contrib(idx=idx)
seq = isf.get_seq(idx=idx)
In [396]:
preds = {k:v[0] for k,v in bpnet.predict(seq[np.newaxis]).items()}
In [397]:
plot_tracks(preds);
plt.xlim([0, 1000]);
In [398]:
plot_tracks(profiles);
plt.xlim([0, 1000]);
In [399]:
plot_tracks(filter_tracks(imp_scores, None));
In [401]:
plot_tracks(filter_tracks(imp_scores, [700, 900]));

4th smallest effect

  • crappy Sox2 and Oct4-Sox2 matches
In [402]:
dict(dfs.sort_values('y_alt_ref_pc').iloc[-4])
Out[402]:
{'Unnamed: 0': 72592,
 'Unnamed: 0.1': 31230,
 'example_chrom_x': 'chr16',
 'example_end_x': 10839660.0,
 'example_idx': 80917,
 'example_interval_from_task_x': 'Nanog',
 'example_start_x': 10838660.0,
 'example_strand_x': '.',
 'imp/Klf4_x': nan,
 'imp/Nanog_x': nan,
 'imp/Oct4_x': 0.3402984440326691,
 'imp/Sox2_x': nan,
 'imp_max_x': 0.3402984440326691,
 'imp_max_task_x': 'Oct4',
 'imp_weighted_x': 0.3402984440326691,
 'imp_weighted_cat_x': 'low',
 'imp_weighted_p_x': 0.2766750535830771,
 'match/Klf4_x': nan,
 'match/Nanog_x': nan,
 'match/Oct4_x': 0.41829988490049,
 'match/Sox2_x': nan,
 'match_max_x': 0.41829988490049,
 'match_max_task_x': 'Oct4',
 'match_weighted_x': 0.41829988490049,
 'match_weighted_cat_x': 'low',
 'match_weighted_p_x': 0.20976609822010997,
 'pattern_x': 'Oct4/metacluster_0/pattern_0',
 'pattern_center_x': 435.0,
 'pattern_end_x': 443.0,
 'pattern_end_abs_x': 10839103.0,
 'pattern_len_x': 16.0,
 'pattern_name_x': 'Oct4-Sox2',
 'pattern_short_x': 'Oct4/m0_p0',
 'pattern_start_x': 427.0,
 'pattern_start_abs_x': 10839087.0,
 'seq_match_x': 9.69120805974702,
 'seq_match_cat_x': 'medium',
 'seq_match_p_x': 0.5653713540210604,
 'strand_x': '-',
 'tf_x': 'Oct4',
 'row_idx_x': 29311,
 'example_chrom_y': 'chr16',
 'example_end_y': 10839660.0,
 'example_interval_from_task_y': 'Nanog',
 'example_start_y': 10838660.0,
 'example_strand_y': '.',
 'imp/Klf4_y': nan,
 'imp/Nanog_y': nan,
 'imp/Oct4_y': nan,
 'imp/Sox2_y': 0.11376836150884627,
 'imp_max_y': 0.11376836150884627,
 'imp_max_task_y': 'Sox2',
 'imp_weighted_y': 0.11376836150884627,
 'imp_weighted_cat_y': 'low',
 'imp_weighted_p_y': 0.004997501249375314,
 'match/Klf4_y': nan,
 'match/Nanog_y': nan,
 'match/Oct4_y': nan,
 'match/Sox2_y': 0.4977048674596135,
 'match_max_y': 0.4977048674596135,
 'match_max_task_y': 'Sox2',
 'match_weighted_y': 0.4977048674596135,
 'match_weighted_cat_y': 'low',
 'match_weighted_p_y': 0.2913543228385807,
 'pattern_y': 'Sox2/metacluster_0/pattern_1',
 'pattern_center_y': 385.0,
 'pattern_end_y': 393.0,
 'pattern_end_abs_y': 10839053.0,
 'pattern_len_y': 16.0,
 'pattern_name_y': 'Sox2',
 'pattern_short_y': 'Sox2/m0_p1',
 'pattern_start_y': 377.0,
 'pattern_start_abs_y': 10839037.0,
 'seq_match_y': 2.4394019425113287,
 'seq_match_cat_y': 'low',
 'seq_match_p_y': 0.0074962518740629685,
 'strand_y': '-',
 'tf_y': 'Sox2',
 'row_idx_y': 129476,
 'center_diff': 50.0,
 'strand_combination': '--',
 'motif_pair': 'Oct4-Sox2<>Sox2',
 'motif_pair_idx': 72592,
 'task': 'Sox2',
 'score': 'max_profile_count_bt',
 'x_alt': 0.061950665,
 'x_ref': 0.07106447,
 'x_alt_ref': 0.8717530000000001,
 'x_alt_pc': 0.08648423,
 'x_ref_pc': 0.095598035,
 'x_alt_ref_pc': 0.90466535,
 'y_alt': 0.033524055,
 'y_ref': 0.033593252000000004,
 'y_alt_ref': 0.9979401,
 'y_alt_pc': 0.08458121,
 'y_ref_pc': 0.08465041,
 'y_alt_ref_pc': 0.9991825000000001,
 'center_diff_cat': Interval(35, 70, closed='right'),
 'pattern_center_aln_x': 434.0,
 'pattern_strand_aln_x': '-',
 'pattern_center_aln_y': 378.0,
 'pattern_strand_aln_y': '-',
 'pattern_diff_aln': -56.0}
In [403]:
idx = 80917
profiles = isf.get_profiles(idx=idx)
imp_scores = isf.get_contrib(idx=idx)
seq = isf.get_seq(idx=idx)
In [404]:
preds = {k:v[0] for k,v in bpnet.predict(seq[np.newaxis]).items()}
In [405]:
plot_tracks(preds);
plt.xlim([0, 1000]);
In [406]:
plot_tracks(profiles);
plt.xlim([0, 1000]);
In [407]:
plot_tracks(filter_tracks(imp_scores, None));
In [408]:
plot_tracks(filter_tracks(imp_scores, [350, 600]));

5th worst effect

  • another Oct4-Sox2 motif in the vicinity
  • another Klf4 motif
In [409]:
dict(dfs.sort_values('y_alt_ref_pc').iloc[-5])
Out[409]:
{'Unnamed: 0': 69419,
 'Unnamed: 0.1': 22074,
 'example_chrom_x': 'chr7',
 'example_end_x': 49679158.0,
 'example_idx': 56054,
 'example_interval_from_task_x': 'Nanog',
 'example_start_x': 49678158.0,
 'example_strand_x': '.',
 'imp/Klf4_x': nan,
 'imp/Nanog_x': nan,
 'imp/Oct4_x': 0.2477083504199981,
 'imp/Sox2_x': nan,
 'imp_max_x': 0.2477083504199981,
 'imp_max_task_x': 'Oct4',
 'imp_weighted_x': 0.2477083504199981,
 'imp_weighted_cat_x': 'low',
 'imp_weighted_p_x': 0.1733296058149287,
 'match/Klf4_x': nan,
 'match/Nanog_x': nan,
 'match/Oct4_x': 0.536355694460538,
 'match/Sox2_x': nan,
 'match_max_x': 0.536355694460538,
 'match_max_task_x': 'Oct4',
 'match_weighted_x': 0.536355694460538,
 'match_weighted_cat_x': 'high',
 'match_weighted_p_x': 0.8343118069145466,
 'pattern_x': 'Oct4/metacluster_0/pattern_0',
 'pattern_center_x': 538.0,
 'pattern_end_x': 546.0,
 'pattern_end_abs_x': 49678704.0,
 'pattern_len_x': 16.0,
 'pattern_name_x': 'Oct4-Sox2',
 'pattern_short_x': 'Oct4/m0_p0',
 'pattern_start_x': 530.0,
 'pattern_start_abs_x': 49678688.0,
 'seq_match_x': 9.09128254391644,
 'seq_match_cat_x': 'medium',
 'seq_match_p_x': 0.4510297269592768,
 'strand_x': '-',
 'tf_x': 'Oct4',
 'row_idx_x': 20627,
 'example_chrom_y': 'chr7',
 'example_end_y': 49679158.0,
 'example_interval_from_task_y': 'Nanog',
 'example_start_y': 49678158.0,
 'example_strand_y': '.',
 'imp/Klf4_y': nan,
 'imp/Nanog_y': nan,
 'imp/Oct4_y': nan,
 'imp/Sox2_y': 0.18059472739696505,
 'imp_max_y': 0.18059472739696505,
 'imp_max_task_y': 'Sox2',
 'imp_weighted_y': 0.18059472739696505,
 'imp_weighted_cat_y': 'low',
 'imp_weighted_p_y': 0.10944527736131934,
 'match/Klf4_y': nan,
 'match/Nanog_y': nan,
 'match/Oct4_y': nan,
 'match/Sox2_y': 0.5066622523892046,
 'match_max_y': 0.5066622523892046,
 'match_max_task_y': 'Sox2',
 'match_weighted_y': 0.5066622523892046,
 'match_weighted_cat_y': 'low',
 'match_weighted_p_y': 0.3273363318340829,
 'pattern_y': 'Sox2/metacluster_0/pattern_1',
 'pattern_center_y': 588.0,
 'pattern_end_y': 596.0,
 'pattern_end_abs_y': 49678754.0,
 'pattern_len_y': 16.0,
 'pattern_name_y': 'Sox2',
 'pattern_short_y': 'Sox2/m0_p1',
 'pattern_start_y': 580.0,
 'pattern_start_abs_y': 49678738.0,
 'seq_match_y': 5.2207650519352455,
 'seq_match_cat_y': 'low',
 'seq_match_p_y': 0.06296851574212893,
 'strand_y': '-',
 'tf_y': 'Sox2',
 'row_idx_y': 121273,
 'center_diff': 50.0,
 'strand_combination': '--',
 'motif_pair': 'Oct4-Sox2<>Sox2',
 'motif_pair_idx': 69419,
 'task': 'Sox2',
 'score': 'max_profile_count_bt',
 'x_alt': 0.15319549999999998,
 'x_ref': 0.17042047,
 'x_alt_ref': 0.8989267,
 'x_alt_pc': 0.17772907,
 'x_ref_pc': 0.19495404,
 'x_alt_ref_pc': 0.911646,
 'y_alt': 0.04134101,
 'y_ref': 0.043782405999999996,
 'y_alt_ref': 0.9442379999999999,
 'y_alt_pc': 0.09239817,
 'y_ref_pc': 0.09483956,
 'y_alt_ref_pc': 0.97425765,
 'center_diff_cat': Interval(35, 70, closed='right'),
 'pattern_center_aln_x': 537.0,
 'pattern_strand_aln_x': '-',
 'pattern_center_aln_y': 581.0,
 'pattern_strand_aln_y': '-',
 'pattern_diff_aln': 44.0}
In [410]:
idx = 56054
profiles = isf.get_profiles(idx=idx)
imp_scores = isf.get_contrib(idx=idx)
seq = isf.get_seq(idx=idx)
In [411]:
preds = {k:v[0] for k,v in bpnet.predict(seq[np.newaxis]).items()}
In [412]:
plot_tracks(preds);
plt.xlim([0, 1000]);
In [413]:
plot_tracks(profiles);
plt.xlim([0, 1000]);
In [414]:
plot_tracks(filter_tracks(imp_scores, None));
In [416]:
plot_tracks(filter_tracks(imp_scores, [100, 200]));
In [415]:
plot_tracks(filter_tracks(imp_scores, [450, 650]));

Best effect

In [422]:
dict(dfs.sort_values('y_alt_ref_pc').iloc[0])
Out[422]:
{'Unnamed: 0': 69420,
 'Unnamed: 0.1': 22082,
 'example_chrom_x': 'chr11',
 'example_end_x': 76388970.0,
 'example_idx': 56086,
 'example_interval_from_task_x': 'Nanog',
 'example_start_x': 76387970.0,
 'example_strand_x': '.',
 'imp/Klf4_x': nan,
 'imp/Nanog_x': nan,
 'imp/Oct4_x': 0.14150290191173553,
 'imp/Sox2_x': nan,
 'imp_max_x': 0.14150290191173553,
 'imp_max_task_x': 'Oct4',
 'imp_weighted_x': 0.14150290191173553,
 'imp_weighted_cat_x': 'low',
 'imp_weighted_p_x': 0.039791258969341166,
 'match/Klf4_x': nan,
 'match/Nanog_x': nan,
 'match/Oct4_x': 0.42153619461549,
 'match/Sox2_x': nan,
 'match_max_x': 0.42153619461549,
 'match_max_task_x': 'Oct4',
 'match_weighted_x': 0.42153619461549,
 'match_weighted_cat_x': 'low',
 'match_weighted_p_x': 0.2225328487559408,
 'pattern_x': 'Oct4/metacluster_0/pattern_0',
 'pattern_center_x': 482.0,
 'pattern_end_x': 490.0,
 'pattern_end_abs_x': 76388460.0,
 'pattern_len_x': 16.0,
 'pattern_name_x': 'Oct4-Sox2',
 'pattern_short_x': 'Oct4/m0_p0',
 'pattern_start_x': 474.0,
 'pattern_start_abs_x': 76388444.0,
 'seq_match_x': 6.026784046015417,
 'seq_match_cat_x': 'low',
 'seq_match_p_x': 0.050787438262976416,
 'strand_x': '-',
 'tf_x': 'Oct4',
 'row_idx_x': 20635,
 'example_chrom_y': 'chr11',
 'example_end_y': 76388970.0,
 'example_interval_from_task_y': 'Nanog',
 'example_start_y': 76387970.0,
 'example_strand_y': '.',
 'imp/Klf4_y': nan,
 'imp/Nanog_y': nan,
 'imp/Oct4_y': nan,
 'imp/Sox2_y': 0.13634876906871796,
 'imp_max_y': 0.13634876906871796,
 'imp_max_task_y': 'Sox2',
 'imp_weighted_y': 0.13634876906871796,
 'imp_weighted_cat_y': 'low',
 'imp_weighted_p_y': 0.020989505247376312,
 'match/Klf4_y': nan,
 'match/Nanog_y': nan,
 'match/Oct4_y': nan,
 'match/Sox2_y': 0.4784038875654268,
 'match_max_y': 0.4784038875654268,
 'match_max_task_y': 'Sox2',
 'match_weighted_y': 0.4784038875654268,
 'match_weighted_cat_y': 'low',
 'match_weighted_p_y': 0.2263868065967017,
 'pattern_y': 'Sox2/metacluster_0/pattern_1',
 'pattern_center_y': 532.0,
 'pattern_end_y': 540.0,
 'pattern_end_abs_y': 76388510.0,
 'pattern_len_y': 16.0,
 'pattern_name_y': 'Sox2',
 'pattern_short_y': 'Sox2/m0_p1',
 'pattern_start_y': 524.0,
 'pattern_start_abs_y': 76388494.0,
 'seq_match_y': 2.8958057307741734,
 'seq_match_cat_y': 'low',
 'seq_match_p_y': 0.010494752623688156,
 'strand_y': '-',
 'tf_y': 'Sox2',
 'row_idx_y': 121285,
 'center_diff': 50.0,
 'strand_combination': '--',
 'motif_pair': 'Oct4-Sox2<>Sox2',
 'motif_pair_idx': 69420,
 'task': 'Sox2',
 'score': 'max_profile_count_bt',
 'x_alt': 0.03517713,
 'x_ref': 0.05559849,
 'x_alt_ref': 0.6326994,
 'x_alt_pc': 0.059710695999999994,
 'x_ref_pc': 0.08013205,
 'x_alt_ref_pc': 0.7451536999999999,
 'y_alt': 0.049314,
 'y_ref': 0.06586266,
 'y_alt_ref': 0.74873984,
 'y_alt_pc': 0.10037115,
 'y_ref_pc': 0.116919816,
 'y_alt_ref_pc': 0.85846144,
 'center_diff_cat': Interval(35, 70, closed='right'),
 'pattern_center_aln_x': 481.0,
 'pattern_strand_aln_x': '-',
 'pattern_center_aln_y': 525.0,
 'pattern_strand_aln_y': '-',
 'pattern_diff_aln': 44.0}
In [423]:
idx = 56086
profiles = isf.get_profiles(idx=idx)
imp_scores = isf.get_contrib(idx=idx)
seq = isf.get_seq(idx=idx)
In [424]:
preds = {k:v[0] for k,v in bpnet.predict(seq[np.newaxis]).items()}
In [425]:
plot_tracks(preds);
plt.xlim([0, 1000]);
In [426]:
plot_tracks(profiles);
plt.xlim([0, 1000]);
In [427]:
plot_tracks(filter_tracks(imp_scores, None));
In [428]:
plot_tracks(filter_tracks(imp_scores, [450, 600]));
In [415]:
plot_tracks(filter_tracks(imp_scores, [450, 650]));

2nd best effect

In [429]:
dict(dfs.sort_values('y_alt_ref_pc').iloc[1])
Out[429]:
{'Unnamed: 0': 63767,
 'Unnamed: 0.1': 6804,
 'example_chrom_x': 'chr5',
 'example_end_x': 45515831.0,
 'example_idx': 7895,
 'example_interval_from_task_x': 'Oct4',
 'example_start_x': 45514831.0,
 'example_strand_x': '.',
 'imp/Klf4_x': nan,
 'imp/Nanog_x': nan,
 'imp/Oct4_x': 0.04964468255639076,
 'imp/Sox2_x': nan,
 'imp_max_x': 0.04964468255639076,
 'imp_max_task_x': 'Oct4',
 'imp_weighted_x': 0.04964468255639076,
 'imp_weighted_cat_x': 'low',
 'imp_weighted_p_x': 0.00018637592023110607,
 'match/Klf4_x': nan,
 'match/Nanog_x': nan,
 'match/Oct4_x': 0.4393814096022649,
 'match/Sox2_x': nan,
 'match_max_x': 0.4393814096022649,
 'match_max_task_x': 'Oct4',
 'match_weighted_x': 0.4393814096022649,
 'match_weighted_cat_x': 'low',
 'match_weighted_p_x': 0.2968036529680365,
 'pattern_x': 'Oct4/metacluster_0/pattern_0',
 'pattern_center_x': 710.0,
 'pattern_end_x': 718.0,
 'pattern_end_abs_x': 45515549.0,
 'pattern_len_x': 16.0,
 'pattern_name_x': 'Oct4-Sox2',
 'pattern_short_x': 'Oct4/m0_p0',
 'pattern_start_x': 702.0,
 'pattern_start_abs_x': 45515533.0,
 'seq_match_x': 5.036626686383653,
 'seq_match_cat_x': 'low',
 'seq_match_p_x': 0.0182648401826484,
 'strand_x': '+',
 'tf_x': 'Oct4',
 'row_idx_x': 6416,
 'example_chrom_y': 'chr5',
 'example_end_y': 45515831.0,
 'example_interval_from_task_y': 'Oct4',
 'example_start_y': 45514831.0,
 'example_strand_y': '.',
 'imp/Klf4_y': nan,
 'imp/Nanog_y': nan,
 'imp/Oct4_y': nan,
 'imp/Sox2_y': 0.1218717321753502,
 'imp_max_y': 0.1218717321753502,
 'imp_max_task_y': 'Sox2',
 'imp_weighted_y': 0.1218717321753502,
 'imp_weighted_cat_y': 'low',
 'imp_weighted_p_y': 0.00849575212393803,
 'match/Klf4_y': nan,
 'match/Nanog_y': nan,
 'match/Oct4_y': nan,
 'match/Sox2_y': 0.5199034029192633,
 'match_max_y': 0.5199034029192633,
 'match_max_task_y': 'Sox2',
 'match_weighted_y': 0.5199034029192633,
 'match_weighted_cat_y': 'medium',
 'match_weighted_p_y': 0.3828085957021489,
 'pattern_y': 'Sox2/metacluster_0/pattern_1',
 'pattern_center_y': 660.0,
 'pattern_end_y': 668.0,
 'pattern_end_abs_y': 45515499.0,
 'pattern_len_y': 16.0,
 'pattern_name_y': 'Sox2',
 'pattern_short_y': 'Sox2/m0_p1',
 'pattern_start_y': 652.0,
 'pattern_start_abs_y': 45515483.0,
 'seq_match_y': 6.324524210843803,
 'seq_match_cat_y': 'low',
 'seq_match_p_y': 0.1344327836081959,
 'strand_y': '+',
 'tf_y': 'Sox2',
 'row_idx_y': 109978,
 'center_diff': 50.0,
 'strand_combination': '++',
 'motif_pair': 'Oct4-Sox2<>Sox2',
 'motif_pair_idx': 63767,
 'task': 'Sox2',
 'score': 'max_profile_count_bt',
 'x_alt': 0.012075658000000001,
 'x_ref': 0.020065458,
 'x_alt_ref': 0.6018131999999999,
 'x_alt_pc': 0.03660922,
 'x_ref_pc': 0.044599023,
 'x_alt_ref_pc': 0.8208525,
 'y_alt': 0.0666105,
 'y_ref': 0.08502854,
 'y_alt_ref': 0.7833899000000001,
 'y_alt_pc': 0.11766766,
 'y_ref_pc': 0.13608569,
 'y_alt_ref_pc': 0.8646585999999999,
 'center_diff_cat': Interval(35, 70, closed='right'),
 'pattern_center_aln_x': 711.0,
 'pattern_strand_aln_x': '+',
 'pattern_center_aln_y': 667.0,
 'pattern_strand_aln_y': '+',
 'pattern_diff_aln': -44.0}
In [430]:
idx = 7895
profiles = isf.get_profiles(idx=idx)
imp_scores = isf.get_contrib(idx=idx)
seq = isf.get_seq(idx=idx)
In [431]:
preds = {k:v[0] for k,v in bpnet.predict(seq[np.newaxis]).items()}
In [432]:
plot_tracks(preds);
plt.xlim([0, 1000]);
In [433]:
plot_tracks(profiles);
plt.xlim([0, 1000]);
In [434]:
plot_tracks(filter_tracks(imp_scores, None));
In [435]:
plot_tracks(filter_tracks(imp_scores, [600, 800]));