Figure 5,6

TODO

  • [x] generate all the pdf plots
  • [x] for scatterplots, show only the corresponding TF in the main (triangular) figure
    • [x] for the supplementary figures generate all 40 plots
    • [x] use max count
  • [x] add the heatmap of A->B type of analysis
    • [x] show <35 and >35 distance versions
  • [x] make sure colors look good and are consistent
  • [x] switch colors so that red means the 'interesting' effect
  • [x] add the co-occurence heatmap
  • [ ] simulation - generate the max score (same as in the perturbation analysis)
  • [ ] re-compute motif_centers and motif_diff using proper rounding schemes
In [1]:
# Imports
from basepair.imports import *
from basepair.exp.paper.config import motifs, profile_mapping
from basepair.exp.chipnexus.perturb.vdom import vdom_motif_pair, plot_spacing_hist
from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs
from basepair.exp.paper.config import tasks
from plotnine import *
import plotnine
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
Using TensorFlow backend.
In [2]:
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
output_dir = modisco_dir
dataset_dir = output_dir / 'perturbation-analysis'
In [3]:
pairs = get_motif_pairs(motifs)

# ordered names
pair_names = ["<>".join(x) for x in pairs]
In [4]:
!mkdir -p {ddir}/figures/modisco/spacing/perturb
In [5]:
figures_dir = Path(f"{ddir}/figures/modisco/spacing/perturb")

Load the data

In [6]:
# load the data
dataset_dir = output_dir / 'perturbation-analysis'
In [7]:
from basepair.exp.chipnexus.perturb.scores import ism_compute_features_tidy, compute_features_tidy, SCORES, max_profile_count
In [8]:
# motif_pair_lpdata = read_pkl(dataset_dir / 'motif_pair_lpdata.incl-whole.pkl')
In [9]:
# dfabf_ism = ism_compute_features_tidy(motif_pair_lpdata, tasks)
# dfs = dfabf_ism[['Wt_obs', 'Wt', 'dA', 'dB', 'dAB', 'motif_pair', 'task', 'center_diff', 'strand_combination']]
In [10]:
# dfabf = compute_features_tidy(motif_pair_lpdata, tasks, SCORES, 
#                               pseudo_count_quantile=.2, 
#                               profile_slice=slice(82, 118))
In [11]:
# dfabf.to_parquet(dataset_dir / 'dfabf.parq')
In [12]:
# dfabf.to_csv(dataset_dir / 'dfabf.csv.gz', compression='gzip')
# dfs.to_csv(dataset_dir / 'dfs.csv.gz', compression='gzip')
In [13]:
ls {dataset_dir}
dfab.csv.gz     dfi_subset.csv.gz                 pair.total_counts.csv
dfabf.csv.gz    dfs.csv.gz                        pair.total_counts.csv.gz
dfabf.feather   double_mut.h5                     ref.h5
dfabf_ism.parq  motif_pair_lpdata.incl-whole.pkl  single_mut.h5
dfabf.parq      motif_pair_lpdata.pkl
In [52]:
ls {dataset_dir}
dfab.csv.gz     dfi_subset.csv.gz                 pair.total_counts.csv
dfabf.csv.gz    dfs.csv.gz                        pair.total_counts.csv.gz
dfabf.feather   double_mut.h5                     ref.h5
dfabf_ism.parq  motif_pair_lpdata.incl-whole.pkl  single_mut.h5
dfabf.parq      motif_pair_lpdata.pkl
In [62]:
!cp {dataset_dir}/dfabf.csv.gz /srv/www//kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/perturbation-analysis/
In [55]:
dfabf.head()
Out[55]:
Unnamed: 0 Unnamed: 0.1 pattern_x example_idx pattern_start_x pattern_end_x strand_x pattern_len_x pattern_center_x match_weighted_x match_weighted_p_x match_weighted_cat_x match_max_x match_max_task_x imp_weighted_x imp_weighted_p_x imp_weighted_cat_x imp_max_x imp_max_task_x seq_match_x seq_match_p_x seq_match_cat_x match/Klf4_x match/Nanog_x match/Oct4_x match/Sox2_x imp/Klf4_x imp/Nanog_x imp/Oct4_x imp/Sox2_x example_chrom_x example_start_x example_end_x example_strand_x example_interval_from_task_x pattern_short_x pattern_name_x pattern_start_abs_x pattern_end_abs_x pattern_center_aln_x pattern_strand_aln_x row_idx_x pattern_y pattern_start_y pattern_end_y strand_y pattern_len_y pattern_center_y match_weighted_y match_weighted_p_y match_weighted_cat_y match_max_y match_max_task_y imp_weighted_y imp_weighted_p_y imp_weighted_cat_y imp_max_y imp_max_task_y seq_match_y seq_match_p_y seq_match_cat_y match/Klf4_y match/Nanog_y match/Oct4_y match/Sox2_y imp/Klf4_y imp/Nanog_y imp/Oct4_y imp/Sox2_y example_chrom_y example_start_y example_end_y example_strand_y example_interval_from_task_y pattern_short_y pattern_name_y pattern_start_abs_y pattern_end_abs_y pattern_center_aln_y pattern_strand_aln_y row_idx_y center_diff center_diff_aln strand_combination motif_pair motif_pair_idx task score x_alt x_ref x_alt_ref x_alt_pc x_ref_pc x_alt_ref_pc y_alt y_ref y_alt_ref y_alt_pc y_ref_pc y_alt_ref_pc center_diff_cat
0 0 1 metacluster_0/pattern_0 1 437.0 452.0 + 15.0 444.0 0.5391 0.5105 medium 0.5627 Sox2 0.9221 0.8271 low 1.0504 Oct4 9.6558 0.4369 medium 0.5452 0.4714 0.5524 0.5627 0.5605 1.0090 1.0504 0.9147 chr3 1.2215e+08 1.2215e+08 * Oct4 m0_p0 Oct4-Sox2 1.2215e+08 1.2215e+08 448.0 + 0.0 metacluster_0/pattern_0 458.0 473.0 + 15.0 465.0 0.5174 0.3763 medium 0.5613 Oct4 1.0389 0.9077 low 1.2170 Sox2 11.3139 0.7737 high 0.4683 0.4598 0.5613 0.5274 0.5500 1.1198 1.0884 1.2170 chr3 1.2215e+08 1.2215e+08 * Oct4 m0_p0 Oct4-Sox2 1.2215e+08 1.2215e+08 469.0 + 1.0 21.0 21.0 ++ Oct4-Sox2<>Oct4-Sox2 0 Oct4 max_profile_count 97.6125 162.8281 0.5995 98.3200 163.5357 0.6012 138.5507 488.4471 0.2837 139.3577 489.2541 0.2848 (0, 35]
1 1 2 metacluster_0/pattern_0 1 437.0 452.0 + 15.0 444.0 0.5391 0.5105 medium 0.5627 Sox2 0.9221 0.8271 low 1.0504 Oct4 9.6558 0.4369 medium 0.5452 0.4714 0.5524 0.5627 0.5605 1.0090 1.0504 0.9147 chr3 1.2215e+08 1.2215e+08 * Oct4 m0_p0 Oct4-Sox2 1.2215e+08 1.2215e+08 448.0 + 0.0 metacluster_0/pattern_0 499.0 514.0 + 15.0 506.0 0.5154 0.3673 medium 0.5390 Oct4 1.3206 0.9862 low 1.5825 Oct4 9.8925 0.4740 medium 0.4916 0.4731 0.5390 0.5267 0.6803 1.0429 1.5825 1.5485 chr3 1.2215e+08 1.2215e+08 * Oct4 m0_p0 Oct4-Sox2 1.2215e+08 1.2215e+08 510.0 + 2.0 62.0 62.0 ++ Oct4-Sox2<>Oct4-Sox2 1 Oct4 max_profile_count 71.4936 162.8281 0.4391 72.2012 163.5357 0.4415 403.7719 852.8242 0.4735 404.5789 853.6312 0.4740 (35, 70]
2 2 3 metacluster_0/pattern_0 1 437.0 452.0 + 15.0 444.0 0.5391 0.5105 medium 0.5627 Sox2 0.9221 0.8271 low 1.0504 Oct4 9.6558 0.4369 medium 0.5452 0.4714 0.5524 0.5627 0.5605 1.0090 1.0504 0.9147 chr3 1.2215e+08 1.2215e+08 * Oct4 m0_p0 Oct4-Sox2 1.2215e+08 1.2215e+08 448.0 + 0.0 metacluster_0/pattern_0 520.0 535.0 + 15.0 527.0 0.4897 0.2392 low 0.5008 Oct4 1.4250 0.9950 low 1.6570 Sox2 7.7218 0.1351 low 0.4555 0.4992 0.5008 0.4895 0.8025 1.2853 1.6062 1.6570 chr3 1.2215e+08 1.2215e+08 * Oct4 m0_p0 Oct4-Sox2 1.2215e+08 1.2215e+08 531.0 + 3.0 83.0 83.0 ++ Oct4-Sox2<>Oct4-Sox2 2 Oct4 max_profile_count 131.1055 162.8281 0.8052 131.8130 163.5357 0.8060 113.7232 261.8408 0.4343 114.5302 262.6478 0.4361 (70, 150]
3 3 4 metacluster_0/pattern_0 1 437.0 452.0 + 15.0 444.0 0.5391 0.5105 medium 0.5627 Sox2 0.9221 0.8271 low 1.0504 Oct4 9.6558 0.4369 medium 0.5452 0.4714 0.5524 0.5627 0.5605 1.0090 1.0504 0.9147 chr3 1.2215e+08 1.2215e+08 * Oct4 m0_p0 Oct4-Sox2 1.2215e+08 1.2215e+08 448.0 + 0.0 metacluster_0/pattern_0 541.0 556.0 + 15.0 548.0 0.4843 0.2165 low 0.5198 Oct4 1.2319 0.9746 low 1.4439 Oct4 9.6558 0.4369 medium 0.4183 0.4421 0.5198 0.5058 0.6828 1.0318 1.4439 1.4188 chr3 1.2215e+08 1.2215e+08 * Oct4 m0_p0 Oct4-Sox2 1.2215e+08 1.2215e+08 552.0 + 4.0 104.0 104.0 ++ Oct4-Sox2<>Oct4-Sox2 3 Oct4 max_profile_count 105.7563 162.8281 0.6495 106.4639 163.5357 0.6510 340.2409 652.8372 0.5212 341.0479 653.6442 0.5218 (70, 150]
4 4 7 metacluster_0/pattern_0 1 458.0 473.0 + 15.0 465.0 0.5174 0.3763 medium 0.5613 Oct4 1.0389 0.9077 low 1.2170 Sox2 11.3139 0.7737 high 0.4683 0.4598 0.5613 0.5274 0.5500 1.1198 1.0884 1.2170 chr3 1.2215e+08 1.2215e+08 * Oct4 m0_p0 Oct4-Sox2 1.2215e+08 1.2215e+08 469.0 + 1.0 metacluster_0/pattern_0 499.0 514.0 + 15.0 506.0 0.5154 0.3673 medium 0.5390 Oct4 1.3206 0.9862 low 1.5825 Oct4 9.8925 0.4740 medium 0.4916 0.4731 0.5390 0.5267 0.6803 1.0429 1.5825 1.5485 chr3 1.2215e+08 1.2215e+08 * Oct4 m0_p0 Oct4-Sox2 1.2215e+08 1.2215e+08 510.0 + 2.0 41.0 41.0 ++ Oct4-Sox2<>Oct4-Sox2 4 Oct4 max_profile_count 148.8586 488.4471 0.3048 149.5661 489.1546 0.3058 655.5236 852.8242 0.7687 656.3306 853.6312 0.7689 (35, 70]
In [14]:
dfabf = pd.read_csv(dataset_dir / 'dfabf.csv.gz')
In [15]:
# dfabf = pd.read_parquet(dataset_dir / 'dfabf.parq')
dfabf['center_diff_cat'] = pd.Categorical(pd.cut(dfabf['center_diff'], [0, 35, 70, 150, 1000]))
dfs = pd.read_csv(dataset_dir / 'dfs.csv.gz')
In [16]:
dfab = pd.read_csv(dataset_dir / 'dfab.csv.gz')
dfab['motif_pair_cat'] = pd.Categorical(dfab.motif_pair, categories=dfab.groupby("motif_pair").size().sort_values(ascending=False).index)
dfab['strand_combination_cat'] = pd.Categorical(dfab['strand_combination'], )
In [17]:
dfi_subset = pd.read_csv(dataset_dir / 'dfi_subset.csv.gz')
In [51]:
dfs.motif_pair.value_counts()
Out[51]:
Nanog<>Klf4             72768
Nanog<>Nanog            62860
Oct4-Sox2<>Nanog        36956
                        ...  
Oct4-Sox2<>Sox2          9320
Oct4-Sox2<>Oct4-Sox2     5176
Sox2<>Sox2               4904
Name: motif_pair, Length: 10, dtype: int64
In [50]:
dfs.head()
Out[50]:
Unnamed: 0 Wt_obs Wt dA dB dAB motif_pair task center_diff strand_combination
0 0 16198.0 58234.414 28042.203 38972.977 23831.658 Oct4-Sox2<>Oct4-Sox2 Oct4 21.0 ++
1 1 16198.0 58234.414 28042.203 17461.543 9037.368 Oct4-Sox2<>Oct4-Sox2 Oct4 62.0 ++
2 2 16198.0 58234.414 28042.203 37178.540 22600.363 Oct4-Sox2<>Oct4-Sox2 Oct4 83.0 ++
3 3 16198.0 58234.414 28042.203 23178.790 13922.051 Oct4-Sox2<>Oct4-Sox2 Oct4 104.0 ++
4 4 16198.0 58234.414 38972.977 17461.543 9234.978 Oct4-Sox2<>Oct4-Sox2 Oct4 41.0 ++

TODO

  • have uniform distance filters
In [18]:
# define the global set of distances
dist_subsets = ['center_diff<=35',
               '(center_diff>35)&(center_diff<=70)', 
               '(center_diff>70)&(center_diff<=150)', 
               'center_diff>150']
dist_subset_labels = ['dist < 35',
                      '35 < dist <= 70',
                      '70 < dist <= 150',
                      '150 < dist',
                     ]

Co-occurence test

<150bp

In [19]:
from basepair.exp.chipnexus.spacing import coocurrence_plot
In [20]:
# co-occurence
fig, ax = plt.subplots(figsize=get_figsize(.25, aspect=1))
dist = 150
coocurrence_plot(dfi_subset, list(motifs),
                 query_string=f"(abs(pattern_center_aln_x- pattern_center_aln_y) <= {dist})")
ax.set_ylabel("Motif of interest")
ax.set_xlabel("Motif partner");
fig.savefig(figures_dir / f'coocurrence-test.center_diff<{dist}.pdf')
In [21]:
dfi_subset.head()
Out[21]:
Unnamed: 0 pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_weighted_p match_weighted_cat match_max match_max_task imp_weighted imp_weighted_p imp_weighted_cat imp_max imp_max_task seq_match seq_match_p seq_match_cat match/Klf4 match/Nanog match/Oct4 match/Sox2 imp/Klf4 imp/Nanog imp/Oct4 imp/Sox2 example_chrom example_start example_end example_strand example_interval_from_task pattern_short pattern_name pattern_start_abs pattern_end_abs pattern_center_aln pattern_strand_aln row_idx
0 10 metacluster_0/pattern_0 1 437 452 + 15 444 0.5391 0.5105 medium 0.5627 Sox2 0.9221 0.8271 low 1.0504 Oct4 9.6558 0.4369 medium 0.5452 0.4714 0.5524 0.5627 0.5605 1.0090 1.0504 0.9147 chr3 122145063 122146063 * Oct4 m0_p0 Oct4-Sox2 122145500 122145515 448 + 0
1 11 metacluster_0/pattern_0 1 458 473 + 15 465 0.5174 0.3763 medium 0.5613 Oct4 1.0389 0.9077 low 1.2170 Sox2 11.3139 0.7737 high 0.4683 0.4598 0.5613 0.5274 0.5500 1.1198 1.0884 1.2170 chr3 122145063 122146063 * Oct4 m0_p0 Oct4-Sox2 122145521 122145536 469 + 1
2 14 metacluster_0/pattern_0 1 499 514 + 15 506 0.5154 0.3673 medium 0.5390 Oct4 1.3206 0.9862 low 1.5825 Oct4 9.8925 0.4740 medium 0.4916 0.4731 0.5390 0.5267 0.6803 1.0429 1.5825 1.5485 chr3 122145063 122146063 * Oct4 m0_p0 Oct4-Sox2 122145562 122145577 510 + 2
3 16 metacluster_0/pattern_0 1 520 535 + 15 527 0.4897 0.2392 low 0.5008 Oct4 1.4250 0.9950 low 1.6570 Sox2 7.7218 0.1351 low 0.4555 0.4992 0.5008 0.4895 0.8025 1.2853 1.6062 1.6570 chr3 122145063 122146063 * Oct4 m0_p0 Oct4-Sox2 122145583 122145598 531 + 3
4 18 metacluster_0/pattern_0 1 541 556 + 15 548 0.4843 0.2165 low 0.5198 Oct4 1.2319 0.9746 low 1.4439 Oct4 9.6558 0.4369 medium 0.4183 0.4421 0.5198 0.5058 0.6828 1.0318 1.4439 1.4188 chr3 122145063 122146063 * Oct4 m0_p0 Oct4-Sox2 122145604 122145619 552 + 4

<35bp

In [116]:
from basepair.exp.chipnexus.spacing import co_occurence_matrix
In [121]:
# co-occurence
fig, ax = plt.subplots(figsize=get_figsize(.25, aspect=1))
dist = 35
coocurrence_plot(dfi_subset, list(motifs),
                 query_string=f"center_diff <= {dist}")
ax.set_ylabel("Motif of interest")
ax.set_xlabel("Motif partner");
fig.savefig(figures_dir / f'coocurrence-test.center_diff<{dist}.pdf')

All distance ranges

In [136]:
# subsets = ['center_diff <= 35', 'center_diff > 35', 'center_diff > 70']
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    if i == len(subsets) - 1:
        cbar = True
    else:
        cbar = False
    coocurrence_plot(dfi_subset, list(motifs), query_string=subset, ax=ax, cbar=cbar)
    if i == 0:
        ax.set_ylabel("Motif of interest")
        ax.set_xlabel("Motif partner");
    
    ax.set_title(subset_label)
# plt.tight_layout()
plt.savefig(figures_dir / 'coocurrence-test.all-dist.pdf')

Pairwise distance distribution

In [22]:
plotnine.options.figure_size = get_figsize(.5, aspect=2)# (10, 10)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist)]) + 
 geom_histogram(bins=max_dist) + 
 facet_grid("motif_pair_cat~ .") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(figures_dir / 'histogram.center_diff.all.pdf')
fig 
Out[22]:
<ggplot: (-9223363310486774314)>

Nanog

In [23]:
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin(["Nanog<>Nanog"]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle("Nanog<>Nanog") + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(figures_dir / 'nanog.spacing.pdf')
fig
Out[23]:
<ggplot: (-9223363310659608965)>

Synergy scores

  • Wt = total counts in ref
  • dA = total counts after removing A
  • dB = total counts after removing B
  • dAB = total counts after removing both

Then TF-A depends on A with

(Wt-dA)/(Wt - dAB)

and on B with

(Wt-dB)/(Wt - dAB)

In [27]:
# dfs = dfabf_ism[['Wt_obs', 'Wt', 'dA', 'dB', 'dAB', 'motif_pair', 'task', 'center_diff', 'strand_combination']]
dfs_prox = dfs[dfs.center_diff <= 35]
dfs_dist = dfs[dfs.center_diff > 35]
dfs.head()
Out[27]:
Unnamed: 0 Wt_obs Wt dA dB dAB motif_pair task center_diff strand_combination
0 0 16198.0 58234.414 28042.203 38972.977 23831.658 Oct4-Sox2<>Oct4-Sox2 Oct4 21.0 ++
1 1 16198.0 58234.414 28042.203 17461.543 9037.368 Oct4-Sox2<>Oct4-Sox2 Oct4 62.0 ++
2 2 16198.0 58234.414 28042.203 37178.540 22600.363 Oct4-Sox2<>Oct4-Sox2 Oct4 83.0 ++
3 3 16198.0 58234.414 28042.203 23178.790 13922.051 Oct4-Sox2<>Oct4-Sox2 Oct4 104.0 ++
4 4 16198.0 58234.414 38972.977 17461.543 9234.978 Oct4-Sox2<>Oct4-Sox2 Oct4 41.0 ++

Fraction of positive values

In [23]:
# choose the right color
colors = plt.get_cmap("RdBu")(np.linspace(0, 1, 9))
hex_colors = [plt.cm.colors.to_hex(a) for a in colors]
sns.palplot(colors)
In [24]:
cb_red = hex_colors[2]
cb_blue = hex_colors[-2 - 2]
In [25]:
hex_colors
Out[25]:
['#67001f',
 '#bb2a34',
 '#e58368',
 '#fbceb7',
 '#f6f7f7',
 '#c0dceb',
 '#68abd0',
 '#2870b1',
 '#053061']
In [26]:
from basepair.plot.utils import PlotninePalette
In [67]:
terms = ['Wt-dA>0', 'Wt-dB>0', 'Wt-dAB>0', 'dA-dAB>0', 'dB-dAB>0']
filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).apply(lambda x: x.eval(term).mean()).
                reset_index().
                rename(columns={0:'fraction'}).
                assign(term=term, subset=ft))
               for term in terms for ft in filter_terms], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
In [70]:
plotnine.options.figure_size = get_figsize(1, 0.6)
fig = (ggplot(aes(y='motif_pair', x='task', fill='fraction'), c) + 
       geom_tile() + 
       theme_classic(base_size=10, base_family='Arial') +  
       facet_grid("subset ~ term") + 
       theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1))  + 
       scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=.5, limits=[0, 1], 
                            palette=PlotninePalette("RdBu", [.1, .9])) +
       ggtitle("Fraction of positive terms")
)
fig.save(figures_dir / 'heatmap.positive_fraction.pdf')
fig
Out[70]:
<ggplot: (8726136487751)>

Synergy scores: (2*Wt-dA-dB)/(Wt-dAB)

Require: 90% Wt-dAB>0

In [38]:
mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
                apply(lambda x: x.eval(term).median() if x.eval(mask).mean() > .9 else np.nan).
                reset_index().
                rename(columns={0:'average'}).
                assign(term=term, subset=ft, subset_name=filter_names[i]))
               for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
In [39]:
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) + 
       geom_tile(data=c.dropna()) + 
       # geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) + 
       geom_text(label='/', data=c[c.average.isnull()]) + 
       theme_classic(base_size=10, base_family='Arial') + 
       facet_grid(". ~ subset_name") + 
       theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1))  + 
       scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4], 
                            palette=PlotninePalette("RdBu_r", [0, 1])) +
       # scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
       ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.90perc.Wt-dAB>0.pdf')
fig
Out[39]:
<ggplot: (8759288233094)>

Setup table showing only the major TF's

  • Row -> target motif of interest (and the task)
  • column -> other motif
In [40]:
def get_syn_effect(c, pairs, motifs, tasks):
    motifxtask = [f"{motif} (task={profile_mapping[motif]})" for motif in motifs]
    dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
    k = 0
    for i in range(len(motifs)):
        for j in range(len(motifs)):
            if i>j:
                continue
            idx = 'motif_pair_idx'
            cat = 'center_diff_cat'
            motif_pair = pairs[k]
            motif_pair_name = "<>".join(motif_pair)
            k+=1
            t1 = profile_mapping[motif_pair[0]]
            t2 = profile_mapping[motif_pair[1]]
            
            dfp.loc[motif_pair[0] + f" (task={t1})"][motif_pair[1]] = c[(c.motif_pair == motif_pair_name) & (c.task == t1)]['average']
            dfp.loc[motif_pair[1] + f" (task={t2})"][motif_pair[0]] = c[(c.motif_pair == motif_pair_name) & (c.task == t2)]['average']
    # dfp.columns = [r"$\Delta$"+c for c in dfp.columns]
    return dfp

Major TF per motif

In [41]:
# subsets = ['center_diff <= 35', 'center_diff > 35', 'center_diff > 70']
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.90perc.Wt-dAB>0.main_tasks.pdf')

Require: 20% Wt-dAB>0

In [42]:
mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
                apply(lambda x: x.eval(term).median() if x.eval(mask).mean() > .2 else np.nan).
                reset_index().
                rename(columns={0:'average'}).
                assign(term=term, subset=ft, subset_name=filter_names[i]))
               for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
In [43]:
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) + 
       geom_tile(data=c.dropna()) + 
       # geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) + 
       geom_text(label='/', data=c[c.average.isnull()]) + 
       theme_classic(base_size=10, base_family='Arial') + 
       facet_grid(". ~ subset_name") + 
       theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1))  + 
       scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4], 
                            palette=PlotninePalette("RdBu_r", [0, 1])) +
       # scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
       ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.20perc.Wt-dAB>0.pdf')
fig
Out[43]:
<ggplot: (-9223363277898274777)>

Major TF per motif

In [44]:
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.20perc.Wt-dAB>0.main_tasks.pdf')

Show all

In [45]:
mask = 'Wt-dAB>0'
terms = ['(2*Wt-dA-dB)/(Wt-dAB)']
filter_terms = dist_subsets
filter_names = dist_subset_labels
# filter_terms = ['center_diff <= 35', 'center_diff > 35']
c = pd.concat([(dfs.query(ft).groupby(['motif_pair', 'task']).
                apply(lambda x: x.eval(term).median()).
                reset_index().
                rename(columns={0:'average'}).
                assign(term=term, subset=ft, subset_name=filter_names[i]))
               for term in terms for i, ft in enumerate(filter_terms)], axis=0)
c['term'] = pd.Categorical(c['term'], categories=terms)
c['subset'] = pd.Categorical(c['subset'], categories=filter_terms)
c['subset_name'] = pd.Categorical(c['subset_name'], categories=filter_names)
c['motif_pair'] = pd.Categorical(c['motif_pair'], pair_names[::-1])
In [46]:
plotnine.options.figure_size = get_figsize(1, 0.6)
assert len(terms) == 1
fig = (ggplot(aes(y='motif_pair', x='task', fill='average')) + 
       geom_tile(data=c.dropna()) + 
       # geom_tile(fill='#d9d9d9', data=c[c.average.isnull()]) + 
       geom_text(label='/', data=c[c.average.isnull()]) + 
       theme_classic(base_size=10, base_family='Arial') + 
       facet_grid(". ~ subset_name") + 
       theme(legend_position='top', axis_text_x=element_text(rotation=30, hjust=1))  + 
       scale_fill_gradient2(low=cb_red, mid='white', high=cb_blue, midpoint=1, limits=[.6, 1.4], 
                            palette=PlotninePalette("RdBu_r", [0, 1])) +
       # scale_fill_gradient2(low=cb_blue, mid='white', high=cb_red, midpoint=1, limits=[0, 2]) +
       ggtitle(terms[0] + ", 90% Wt-dAB>0")
)
fig.save(figures_dir / 'heatmap.synergy-score.all.pdf')
fig
Out[46]:
<ggplot: (-9223363277898619062)>

Major TF per motif

In [47]:
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
for i, (subset,ax, subset_title) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_syn_effect(c[c.subset == subset], pairs, motifs, tasks), cmap='RdBu_r', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_title)
plt.savefig(figures_dir / 'heatmap.synergy-score.all.main_tasks.pdf')

Directional effect

Scatterplot showing the major TF per motif

In [25]:
from basepair.exp.paper.config import profile_mapping
In [26]:
fig, axes = plt.subplots(len(motifs), len(motifs), 
                         figsize=get_figsize(.65, 1), 
                         gridspec_kw=dict(wspace=0, hspace=0),
                         sharex=True, sharey=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(motifs)))
k = 0
for i in range(len(motifs)):
    for j in range(len(motifs)):
        ax = axes[i,j]
        if i>j:
            ax.axison = False
            continue
        idx = 'motif_pair_idx'
        cat = 'center_diff_cat'
        motif_pair = pairs[k]
        motif_pair_name = "<>".join(motif_pair)
        k+=1
        t1 = profile_mapping[motif_pair[0]]
        t2 = profile_mapping[motif_pair[1]]
        # t1 = 'Nanog'
        # t2 = 'Nanog'
        # motif mask
        mmask = (dfabf.motif_pair == motif_pair_name) & (dfabf.score == 'max_profile_count')
        ab = pd.merge(dfabf[mmask & (dfabf.task == t1)][[idx, 'x_alt_ref_pc', cat]],
                      dfabf[mmask & (dfabf.task == t2)][[idx, 'y_alt_ref_pc']],
                      on=idx)
        # make a scatterplot
        for ci, (label,df) in enumerate(list(ab.groupby(cat))):
            ax.scatter(df.y_alt_ref_pc,
                       df.x_alt_ref_pc,   # first one is on the y axis
                       alpha=0.2, s=1, label=label, color=colors[ci])
        
        # ax.text(0, 0,t1+"-"+t2)
        alpha = .5
        xl = [0, 2]
        ax.plot(xl, xl, c='grey', alpha=alpha)
        ax.axvline(1, c='grey', alpha=alpha)
        ax.axhline(1, c='grey', alpha=alpha)
        ax.set_xlim(xl)
        ax.set_ylim(xl)
        if i == j:
            ax.set_ylabel(motif_pair[1])
            ax.set_xlabel(motif_pair[0])
            
        # Use grey spines
        plt.setp(ax.spines.values(), color='grey')
        plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
plt.legend(loc="bottom right", bbox_to_anchor=(-.2,.7),
           scatterpoints=1, markerscale=10, columnspacing=0,
           handletextpad=0, borderpad=0, frameon=False, title="Pairwise distance")
fig.savefig(figures_dir / "max_profile_count.upper-triag.width=0.65.pdf")

No pseudo-counts

In [31]:
fig, axes = plt.subplots(len(motifs), len(motifs), 
                         figsize=get_figsize(.65, 1),
                         gridspec_kw=dict(wspace=0, hspace=0),
                         sharex=True, sharey=True)
k = 0
for i in range(len(motifs)):
    for j in range(len(motifs)):
        ax = axes[i,j]
        if i>j:
            ax.axison = False
            continue
        idx = 'motif_pair_idx'
        cat = 'center_diff_cat'
        motif_pair = pairs[k]
        motif_pair_name = "<>".join(motif_pair)
        k+=1
        t1 = profile_mapping[motif_pair[0]]
        t2 = profile_mapping[motif_pair[1]]
        # t1 = 'Nanog'
        # t2 = 'Nanog'
        # motif mask
        mmask = (dfabf.motif_pair == motif_pair_name) & (dfabf.score == 'max_profile_count')
        ab = pd.merge(dfabf[mmask & (dfabf.task == t1)][[idx, 'x_alt_ref', cat]],
                      dfabf[mmask & (dfabf.task == t2)][[idx, 'y_alt_ref']],
                      on=idx)
        # make a scatterplot
        for ci, (label,df) in enumerate(list(ab.groupby(cat))):
            ax.scatter(df.y_alt_ref,
                       df.x_alt_ref,   # first one is on the y axis
                       alpha=0.2, s=1, label=label, color=colors[ci])
        # ax.text(0, 0,t1+"-"+t2)
        alpha = .5
        xl = [0, 2]
        ax.plot(xl, xl, c='grey', alpha=alpha)
        ax.axvline(1, c='grey', alpha=alpha)
        ax.axhline(1, c='grey', alpha=alpha)
        ax.set_xlim(xl)
        ax.set_ylim(xl)
        if i == j:
            ax.set_ylabel(motif_pair[1])
            ax.set_xlabel(motif_pair[0])
            
        # Use grey spines
        plt.setp(ax.spines.values(), color='grey')
        plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
plt.legend(loc="bottom right", bbox_to_anchor=(-.2,.7),
           scatterpoints=1, markerscale=10, columnspacing=0,
           handletextpad=0, borderpad=0, frameon=False, title="Pairwise distance")
fig.savefig(figures_dir / "max_profile_count.upper-triag.no-pseudocounts.pdf")

All TF's per motif

In [17]:
import matplotlib.cm as cm
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(motifs)))
hex_colors = [plt.cm.colors.to_hex(a) for a in colors]
In [18]:
# use pair names
dfabf['motif_pair'] = pd.Categorical(dfabf['motif_pair'], pair_names)
In [25]:
from plotnine import *
In [20]:
plotnine.options.figure_size = get_figsize(1, 10/4)
fig = (ggplot(aes(x='x_alt_ref_pc', y='y_alt_ref_pc', color='center_diff_cat'), 
              dfabf[dfabf.score == 'max_profile_count']) + 
 geom_point(alpha=0.2, size=.05, shape='.', show_legend=False) + 
 scale_color_manual(hex_colors)+
 geom_point(alpha=1, size=5, data=dfabf[:0], shape='.') +   # for the legend
 # add vertical lines
 geom_abline(intercept=0, slope=1, color='grey', alpha=0.5, show_legend=False) + 
 geom_vline(xintercept=1, color='grey', alpha=0.5) + 
 geom_hline(yintercept=1, color='grey', alpha=0.5) + 
 facet_grid("motif_pair ~ task") + 
 xlim([0, 2]) + 
 ylim([0, 2]) + 
 theme_classic(base_size=10, base_family='Arial') + 
 xlab("B") + 
 ylab("A") + 
 ggtitle('max_profile_count')
)
fig.save(figures_dir / 'max_profile_count.all.pdf', raster=True)
display(fig)
<ggplot: (8769465047062)>

Heatmap version

In [54]:
def get_effect(dfabf, pairs, agg_fn=np.mean):
    motifxtask = [f"{motif} (task={profile_mapping[motif]})" for motif in motifs]
    dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
    k = 0
    for i in range(len(motifs)):
        for j in range(len(motifs)):
            if i>j:
                continue
            idx = 'motif_pair_idx'
            cat = 'center_diff_cat'
            motif_pair = pairs[k]
            motif_pair_name = "<>".join(motif_pair)
            k+=1
            t1 = profile_mapping[motif_pair[0]]
            t2 = profile_mapping[motif_pair[1]]
            # t1 = 'Nanog'
            # t2 = 'Nanog'
            # motif mask
            mmask = (dfabf.motif_pair == motif_pair_name) & (dfabf.score == 'max_profile_count')
            ab = pd.merge(dfabf[mmask & (dfabf.task == t1)][[idx, 'x_alt_ref_pc', cat]],
                          dfabf[mmask & (dfabf.task == t2)][[idx, 'y_alt_ref_pc']],
                          on=idx)
            dfp.loc[motif_pair[0] + f" (task={t1})"][motif_pair[1]] = agg_fn(ab.x_alt_ref_pc)
            dfp.loc[motif_pair[1] + f" (task={t2})"][motif_pair[0]] = agg_fn(ab.y_alt_ref_pc)
    dfp.columns = [r"$\Delta$"+c for c in dfp.columns]
    return dfp

Mean value

In [183]:
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect(dfabf.query(subset), pairs, agg_fn=np.mean), cmap='RdBu', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.mean_max_profile_count.directional_effect.pdf')

Median value

In [184]:
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect(dfabf.query(subset), pairs, agg_fn=np.median), cmap='RdBu', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.pdf')

Values for all the tasks

In [185]:
motifxtask = [f"{motif} (task={task})" for motif in motifs for task in tasks]
In [186]:
def get_effect_all_tasks(dfabf, pairs, tasks, agg_fn=np.mean):
    motifxtask = [f"{motif} (task={task})" for motif in motifs for task in tasks]
    dfp = pd.DataFrame(index=motifxtask, columns=motifs, dtype=float)
    for task in tasks:
        k = 0
        for i in range(len(motifs)):
            for j in range(len(motifs)):
                if i>j:
                    continue
                idx = 'motif_pair_idx'
                cat = 'center_diff_cat'
                motif_pair = pairs[k]
                motif_pair_name = "<>".join(motif_pair)
                k+=1
                t1 = profile_mapping[motif_pair[0]]
                t2 = profile_mapping[motif_pair[1]]
                ab = dfabf[(dfabf.motif_pair == motif_pair_name) & (dfabf.score == 'max_profile_count') & (dfabf.task == task)]
                dfp.loc[motif_pair[0] + f" (task={task})"][motif_pair[1]] = agg_fn(ab.x_alt_ref_pc)
                dfp.loc[motif_pair[1] + f" (task={task})"][motif_pair[0]] = agg_fn(ab.y_alt_ref_pc)
    dfp.columns = [r"$\Delta$"+c for c in dfp.columns]
    return dfp

Median

In [187]:
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)*4), 
                        sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect_all_tasks(dfabf.query(subset), pairs, tasks=tasks, agg_fn=np.median), cmap='RdBu', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.median_max_profile_count.directional_effect.all-tasks.pdf')

Mean

In [188]:
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)*4), 
                        sharey=True)
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    sns.heatmap(get_effect_all_tasks(dfabf.query(subset), pairs, tasks=tasks, agg_fn=np.mean), cmap='RdBu', center=1, ax=ax, 
                cbar=i==len(subsets)-1,
                vmin=.7, vmax=1.3)
    ax.set_title(subset_label)
plt.savefig(figures_dir / 'heatmap.mean_max_profile_count.directional_effect.all-tasks.pdf')

Directness score

In [243]:
dfp = pd.read_csv("http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/pattern_table.csv")
In [244]:
dfp.head()
Out[244]:
i pattern cluster n seqlets Klf4/d_p Nanog/d_p Oct4/d_p Sox2/d_p Klf4 footprint counts Klf4 footprint entropydiff Klf4 footprint max Klf4 footprint strandcor Klf4 imp counts Klf4 imp profile Klf4 periodicity 10bp Klf4 pos meandiff Klf4 pos std Klf4 pos unimodal Klf4 region counts Klf4/d Nanog footprint counts Nanog footprint entropydiff Nanog footprint max Nanog footprint strandcor Nanog imp counts Nanog imp profile Nanog periodicity 10bp Nanog pos meandiff Nanog pos std Nanog pos unimodal Nanog region counts Nanog/d Oct4 footprint counts Oct4 footprint entropydiff Oct4 footprint max Oct4 footprint strandcor Oct4 imp counts Oct4 imp profile Oct4 periodicity 10bp Oct4 pos meandiff Oct4 pos std Oct4 pos unimodal Oct4 region counts Oct4/d Sox2 footprint counts Sox2 footprint entropydiff Sox2 footprint max Sox2 footprint strandcor Sox2 imp counts Sox2 imp profile Sox2 periodicity 10bp Sox2 pos meandiff Sox2 pos std Sox2 pos unimodal Sox2 region counts Sox2/d consensus ic pwm mean idx
0 0 m1_p7 8 157.0 95.0820 11.4754 0.8197 0.8197 183.4872 0.2647 1.5417 0.0059 0.0168 0.0285 0.0077 2.0164 95.7851 True 387.5287 0.0401 133.1731 0.1340 0.8974 0.0051 -0.0103 -9.0438e-04 0.0009 1.8610 185.0268 True 367.1019 0.0085 149.9744 0.1425 0.9423 0.0050 -4.8469e-04 -8.0106e-04 0.0019 1.9562 127.6654 True 422.9363 -0.0011 60.1218 0.2124 0.5737 0.0048 -0.0016 -2.6529e-03 0.0016 1.8105 121.0656 True 199.4968 -0.0038 GCCCCGCCCCACCCT 1.3623 37
1 1 m1_p8 8 162.0 90.9836 13.1148 8.1967 4.9180 154.7716 0.2412 1.2716 0.0059 0.0106 0.0198 0.0572 2.0037 61.0182 True 302.7778 0.0291 74.3827 0.1929 0.5123 0.0049 -0.0038 2.4813e-03 0.0169 2.3406 104.3285 True 212.5494 0.0088 85.1667 0.1113 0.4969 0.0049 5.1559e-04 1.1783e-03 0.0248 2.2167 96.4021 True 276.4321 0.0018 37.7284 0.1553 0.2593 0.0049 0.0005 -4.5393e-05 0.0183 2.0258 93.9946 True 146.6173 -0.0006 GGGAAGGGTGGGGCAGTAGA 0.9815 38
2 2 m6_p2 8 355.0 80.3279 24.5902 40.9836 27.8689 220.2955 0.2445 1.9148 0.0061 0.0053 0.0123 0.0073 2.0044 87.2336 True 421.7324 0.0193 288.6136 0.1758 1.9446 0.0052 0.0027 9.0652e-03 0.0038 1.9253 75.9526 True 537.9211 0.0154 167.0597 0.1038 1.1847 0.0050 2.4447e-03 3.4294e-03 0.0055 1.9122 139.0525 True 430.0113 0.0044 68.9744 0.1055 0.4290 0.0051 0.0027 3.2214e-03 0.0027 1.7937 138.7371 True 202.9465 0.0037 CAGAGTGCTGGGATTAAAGGC... 0.3143 109
3 3 m1_p0 8 12735.0 100.0000 25.4098 4.9180 3.2787 182.8839 0.2046 1.5890 0.0061 0.0263 0.0500 0.0159 1.9985 70.1436 True 355.6721 0.0736 115.8733 0.0249 0.5408 0.0051 -0.0140 7.4599e-04 0.0065 2.0307 146.5852 True 307.2621 0.0155 102.3714 0.0258 0.5333 0.0050 -8.0866e-04 9.4931e-05 0.0132 2.0174 155.3548 True 339.2477 0.0010 48.2119 0.0220 0.2416 0.0050 -0.0007 -1.2597e-03 0.0053 2.0489 165.0139 True 173.3736 -0.0018 TGGGTGTGGC 1.2984 30
4 4 m1_p1 8 2064.0 95.9016 16.3934 2.4590 2.4590 164.3267 0.1382 1.0644 0.0056 0.0204 0.0303 0.0954 1.9978 59.0731 True 349.1899 0.0402 38.5148 0.0186 0.1699 0.0050 -0.0103 4.1849e-06 0.1099 2.0319 154.9271 True 139.5499 0.0103 77.7647 0.0188 0.3607 0.0050 -9.5728e-06 -1.6580e-04 0.0898 1.9667 109.5886 True 286.3324 -0.0003 28.1585 0.0213 0.1181 0.0050 -0.0002 -1.5025e-03 0.0829 1.9582 172.8593 True 135.0795 -0.0028 GGCCCCGCCCT 1.1518 31
In [245]:
dfp = dfp[dfp.pattern.isin(list(motifs.values()))]
m_hash = {v:k for k,v in motifs.items()}
dfp['pattern_name'] = dfp.pattern.map(m_hash)
dfp = dfp.set_index('pattern_name')
dfp = dfp.loc[list(motifs)]
In [246]:
dfp_max = dfp[[c for c in dfp.columns if c.endswith(' footprint max')]]
dfp_max.columns = [c.split(" ")[0] for c in dfp_max.columns]
dfp_max = dfp_max[tasks]
dfp_max
Out[246]:
Oct4 Sox2 Nanog Klf4
pattern_name
Oct4-Sox2 5.1700 1.9023 1.4743 0.5510
Sox2 1.0024 3.0287 4.7277 1.2223
Nanog 0.7602 0.2945 2.9368 0.4001
Klf4 0.5333 0.2416 0.5408 1.5890
In [247]:
dfp_counts = dfp[[c for c in dfp.columns if c.endswith(' footprint counts')]]
dfp_counts.columns = [c.split(" ")[0] for c in dfp_max.columns]
dfp_counts = dfp_counts[tasks]
dfp_counts
Out[247]:
Oct4 Sox2 Nanog Klf4
pattern_name
Oct4-Sox2 116.8492 289.9074 401.0843 135.1212
Sox2 200.3641 516.2857 216.5016 188.5086
Nanog 96.6386 274.3910 154.1314 63.4457
Klf4 182.8839 115.8733 102.3714 48.2119
In [250]:
fig, axes = plt.subplots(1, 2, figsize=get_figsize(1, .35), sharey=True)
ax = axes[0]
sns.heatmap(dfp_max, cmap='Reds', ax=ax)
ax.set_ylabel("Motif")
ax.set_xlabel("Task");
ax.set_title("Avg. max profile counts");
ax = axes[1]
sns.heatmap(dfp_counts, cmap='Reds', ax=ax)
ax.set_ylabel("")
ax.set_xlabel("Task");
ax.set_title("Avg. total profile counts");
plt.savefig(figures_dir / 'heatmap.total-profile-counts.pdf')

Scratch space

Extra plots

Generate individual plots

For the scatter plots (directional interactions), generate png and pdf files for each interaction so I can use them directly.

  • The combined pdfs are too big and also too confusing to show on a slide or figure.
In [ ]:
odir = figures_dir / "individual/max_profile_count"
odir.mkdir(parents=True, exist_ok=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(motifs)))
k = 0
for i in range(len(motifs)):
    for j in range(len(motifs)):
        if i>j:
            continue
        fig, ax = plt.subplots(1, 1, figsize=get_figsize(.65 / 3, 1))
        idx = 'motif_pair_idx'
        cat = 'center_diff_cat'
        motif_pair = pairs[k]
        motif_pair_name = "<>".join(motif_pair)
        k+=1
        t1 = profile_mapping[motif_pair[0]]
        t2 = profile_mapping[motif_pair[1]]
        # t1 = 'Nanog'
        # t2 = 'Nanog'
        # motif mask
        mmask = (dfabf.motif_pair == motif_pair_name) & (dfabf.score == 'max_profile_count')
        ab = pd.merge(dfabf[mmask & (dfabf.task == t1)][[idx, 'x_alt_ref_pc', cat]],
                      dfabf[mmask & (dfabf.task == t2)][[idx, 'y_alt_ref_pc']],
                      on=idx)
        # make a scatterplot
        for ci, (label,df) in enumerate(list(ab.groupby(cat))):
            ax.scatter(df.y_alt_ref_pc,
                       df.x_alt_ref_pc,   # first one is on the y axis
                       alpha=0.2, s=1, label=label, color=colors[ci])
        
        # ax.text(0, 0,t1+"-"+t2)
        alpha = .5
        xl = [0, 2]
        ax.plot(xl, xl, c='grey', alpha=alpha)
        ax.axvline(1, c='grey', alpha=alpha)
        ax.axhline(1, c='grey', alpha=alpha)
        ax.set_xlim(xl)
        ax.set_ylim(xl)
        ax.set_xlabel(r"{} ($\Delta$ {})".format(motif_pair[1], motif_pair[0]))
        ax.set_ylabel(r"{} ($\Delta$ {})".format(motif_pair[0], motif_pair[1]))
            
        # Use grey spines
        plt.setp(ax.spines.values(), color='grey')
        plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
        fig.savefig(odir / f'{motif_pair_name}.png')
        fig.savefig(odir / f'{motif_pair_name}.pdf')
In [ ]:
odir = figures_dir / "individual/max_profile_count/all_tasks"
odir.mkdir(parents=True, exist_ok=True)
colors = plt.get_cmap("viridis")(np.linspace(0, .95, len(motifs)))

for task in tasks:
    k = 0
    for i in range(len(motifs)):
        for j in range(len(motifs)):
            if i>j:
                continue
            fig, ax = plt.subplots(1, 1, figsize=get_figsize(.65 / 3, 1))
            idx = 'motif_pair_idx'
            cat = 'center_diff_cat'
            motif_pair = pairs[k]
            motif_pair_name = "<>".join(motif_pair)
            k+=1
            t1 = profile_mapping[motif_pair[0]]
            t2 = profile_mapping[motif_pair[1]]
            
            ab = dfabf[(dfabf.motif_pair == motif_pair_name) & (dfabf.score == 'max_profile_count') & (dfabf.task == task)]
            # make a scatterplot
            for ci, (label,df) in enumerate(list(ab.groupby(cat))):
                ax.scatter(df.y_alt_ref_pc,
                           df.x_alt_ref_pc,   # first one is on the y axis
                           alpha=0.2, s=1, label=label, color=colors[ci])

            # ax.text(0, 0,t1+"-"+t2)
            alpha = .5
            xl = [0, 2]
            ax.plot(xl, xl, c='grey', alpha=alpha)
            ax.axvline(1, c='grey', alpha=alpha)
            ax.axhline(1, c='grey', alpha=alpha)
            ax.set_xlim(xl)
            ax.set_ylim(xl)
            ax.set_xlabel(r"{} ($\Delta$ {})".format(motif_pair[1], motif_pair[0]))
            ax.set_ylabel(r"{} ($\Delta$ {})".format(motif_pair[0], motif_pair[1]))
            ax.set_title(task)

            # Use grey spines
            plt.setp(ax.spines.values(), color='grey')
            plt.setp([ax.get_xticklines(), ax.get_yticklines()], color='grey')
            fig.savefig(odir / f'{motif_pair_name},task={task}.png')
            fig.savefig(odir / f'{motif_pair_name},task={task}.pdf')