In [1]:
from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

# motifs = OrderedDict([
#     ("Oct4-Sox2", 'Oct4/m0_p0'),
#     ("Oct4", 'Oct4/m0_p1'),
#     # ("Strange-sym-motif", 'Oct4/m0_p5'),
#     ("Sox2", 'Sox2/m0_p1'),
#     ("Nanog", 'Nanog/m0_p1'),
#     ("Zic3", 'Nanog/m0_p2'),
#     ("Nanog-partner", 'Nanog/m0_p4'),
#     ("Klf4", 'Klf4/m0_p0'),
# ])

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    ("Oct4", "Oct4/m0_p1"),
    ("Oct4-Oct4", "Oct4/m0_p6"),
    ("Sox2", "Sox2/m0_p1"),
    ("Nanog", "Nanog/m0_p1"),
    ("Nanog-partner", "Nanog/m0_p4"),
    ("Klf4", "Klf4/m0_p0"),
    ("Klf4-Klf4", "Klf4/m0_p5"),
    ("B-Box", "Oct4/m0_p5"),
    ("Zic3", "Nanog/m0_p2"),
    ("Essrb", "Oct4/m0_p16"),
])
In [2]:
# dfi_subset.pattern_name.unique()

Goal

  • make sub-plots for Figure 5

Open questions

  • should we make a larger co-occurence plot?
    • plot more motifs (one from each cluster at least)
In [3]:
# Imports
from basepair.imports import *
from basepair.math import softmax
import pyranges as pr
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
from basepair.plot.profiles import plot_stranded_profile, multiple_plot_stranded_profile, extract_signal
from basepair.plot.tracks import plot_tracks, filter_tracks
from basepair.modisco.results import MultipleModiscoResult, Seqlet, resize_seqlets
from basepair.modisco.pattern_instances import (multiple_load_instances, load_instances, filter_nonoverlapping_intervals, 
                                                plot_coocurence_matrix, align_instance_center, dfi2seqlets, annotate_profile)
from basepair.exp.chipnexus.perturb.vdom import vdom_motif_pair, plot_spacing_hist
from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs, motif_pair_dfi
from basepair.exp.chipnexus.simulate import (insert_motif, generate_sim, plot_sim, generate_seq, 
                                             model2tasks, motif_coords, interactive_tracks, plot_motif_table,
                                             plot_sim_motif_col)
from basepair.exp.paper.config import *
from basepair.cli.modisco import load_profiles
from basepair.cli.imp_score import ImpScoreFile
from basepair.preproc import rc_seq, dfint_no_intersection
from copy import deepcopy
from scipy.fftpack import fft, ifft
from plotnine import *
import plotnine
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()

# interval columns in dfi
interval_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']
Using TensorFlow backend.
In [4]:
# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/modisco/{exp}/spacing/')
fdir_individual = fdir / 'individual'
fdir_individual_sim = fdir / 'individual-simulation'
In [5]:
from basepair.cli.imp_score import ImpScoreFile

isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5')
In [6]:
peaks_per_task = pd.value_counts(isf.data["/metadata/interval_from_task"][:])
In [7]:
!mkdir -p {fdir_individual}
!mkdir -p {fdir_individual_sim}
!mkdir -p {fdir}
In [8]:
# Generate motif pairs
pairs = get_motif_pairs(motifs)

# ordered names
pair_names = ["<>".join(x) for x in pairs]
In [9]:
# define the global set of distances
dist_subsets = ['center_diff<=35',
               '(center_diff>35)&(center_diff<=70)', 
               '(center_diff>70)&(center_diff<=150)', 
               'center_diff>150']
dist_subset_labels = ['dist < 35',
                      '35 < dist <= 70',
                      '70 < dist <= 150',
                      '150 < dist',
                     ]
In [10]:
task_map = {"O": "Oct4", "S": "Sox2", "N": "Nanog", "K": "Klf4"}
# Figure out the tasks from the name
tasks = [task_map[s] for s in exp.split(",")[2]]
In [11]:
mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
                           for t in tasks})
In [12]:
main_motifs = [mr.get_pattern(pattern_name).rename(name)
               for name, pattern_name in motifs.items()]
In [12]:
len(main_motifs)
Out[12]:
11
In [14]:
from basepair.exp.paper.fig4 import similarity_matrix, cluster_align_patterns
In [152]:
sim_seq = similarity_matrix(main_motifs, track='seq_ic')
np.fill_diagonal(sim_seq, 0)
100%|██████████| 11/11 [00:00<00:00, 21.12it/s]
In [153]:
motif_names = [m.name for m in main_motifs]
df_sim_seq = pd.DataFrame(sim_seq, index=motif_names, columns=motif_names)
In [154]:
main_motifs[4].align(main_motifs[5], 'contrib/Nanog').attrs
Out[154]:
{'align': {'use_rc': False, 'offset': -1}}
In [155]:
main_motifs[4].align(main_motifs[5], 'contrib/Nanog').plot('contrib')
main_motifs[5].plot("contrib");
In [71]:
main_motifs[5].plot("seq_ic");
In [72]:
main_motifs[1].align(main_motifs[4], 'seq_ic').plot('seq_ic')
main_motifs[4].plot("seq_ic");
Exception ignored in: <bound method ImpScoreFile.__del__ of <basepair.cli.imp_score.ImpScoreFile object at 0x7f807e4fb390>>
Traceback (most recent call last):
  File "/users/avsec/workspace/basepair/basepair/cli/imp_score.py", line 360, in __del__
    self.close()
  File "/users/avsec/workspace/basepair/basepair/cli/imp_score.py", line 357, in close
    self.f.close()
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/kipoi/readers.py", line 115, in close
    self.__exit__()
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/kipoi/readers.py", line 105, in __exit__
    self.f.close()
AttributeError: 'NoneType' object has no attribute 'close'
In [73]:
main_motifs[-2].align(main_motifs[-4], 'seq_ic').plot('seq_ic')
main_motifs[-4].plot("seq_ic");
In [13]:
for p in main_motifs:
    p.trim_seq_ic(0.08).plot("seq_ic");
TF-MoDISco is using the TensorFlow backend.
In [79]:
dfrm.name.str.match("ERV").sum()
Out[79]:
12770
In [80]:
dfrm.name.str.match("ERV").mean()
Out[80]:
0.002411897283017542
In [13]:
dfi_subset = pd.read_parquet(f"{model_dir}/deeplift/dfi_subset.parq", engine='fastparquet')
dfi_subset['row_idx'] = np.arange(len(dfi_subset))

dfi_subset['Chromosome'] = dfi_subset.example_chrom
dfi_subset['Start'] = dfi_subset.pattern_start_abs
dfi_subset['End'] = dfi_subset.pattern_end_abs
dfi_subset_pr = pr.PyRanges(dfi_subset)
In [16]:
assert len(dfi_subset.pattern.unique()) == len(main_motifs)
In [17]:
len(dfi_subset)
Out[17]:
397924

Add repeat masker annotation

In [16]:
# exclude erv's
import pyranges as pr

# load repeat masker file
from basepair.exp.paper.repeat_masker import download_repeat_masker, read_repeat_masker, intersect_repeat_masker
dfrm = read_repeat_masker(download_repeat_masker())
dfrm_erv = dfrm[dfrm.name.str.contains("ERV")]
dfrm_erv['Start'] = dfrm_erv['start']
dfrm_erv['End'] = dfrm_erv['end']
dfrm_erv['Chromosome'] = dfrm_erv['chrom']
dfrm_erv = pr.PyRanges(dfrm_erv)

# exclude erv's
exclude = dfi_subset_pr.overlap(dfrm_erv).df.row_idx.unique()
exclude_rows = dfi_subset.row_idx.isin(exclude)
print(f"Excluding: {len(exclude_rows)} / {len(dfi_subset)} rows")
dfi_subset['is_erv'] = exclude_rows
Using downloaded and verified file: /users/avsec/workspace/basepair/data/raw/annotation/mm10/RepeatMasker/mm10.fa.out.gz
Excluding: 397924 / 397924 rows
In [17]:
pd.crosstab(dfi_subset.is_erv, dfi_subset.is_te)
Out[17]:
is_te False True
is_erv
False 310848 290
True 65802 20984
In [18]:
a = pr.PyRanges(chromosomes=['chr1'], starts=[10], ends=[20])
b = pr.PyRanges(chromosomes=['chr1'], starts=[19], ends=[22])
In [19]:
a.overlap(b)
Out[19]:
+--------------+-----------+-----------+
| Chromosome   |     Start |       End |
| (category)   |   (int32) |   (int32) |
|--------------+-----------+-----------|
| chr1         |        10 |        20 |
+--------------+-----------+-----------+
Unstranded PyRanges object has 1 rows and 3 columns from 1 chromosomes.
In [20]:
len(dfi_subset)
Out[20]:
397924
In [21]:
# many still remain
dfi_subset.is_te.sum()
Out[21]:
21274

Exclude motifs overlapping the repeat elements

In [22]:
dfi_subset_pr = pr.PyRanges(dfi_subset)
In [23]:
# duplicated number of instances
len(dfi_subset_pr)
Out[23]:
397924
In [24]:
# non-duplicated number of instances
len(dfi_subset_pr.drop_duplicate_positions())
Out[24]:
241005
In [25]:
dfi_subset.head()
Out[25]:
example_chrom example_end example_idx example_interval_from_task example_start example_strand imp/Klf4 imp/Nanog imp/Oct4 imp/Sox2 imp_max imp_max_task imp_weighted imp_weighted_cat imp_weighted_p match/Klf4 match/Nanog match/Oct4 match/Sox2 match_max match_max_task match_weighted match_weighted_cat match_weighted_p pattern pattern_center pattern_end pattern_end_abs pattern_len pattern_name pattern_short pattern_start pattern_start_abs seq_match seq_match_cat seq_match_p strand tf is_te pattern_center_aln pattern_strand_aln row_idx Chromosome Start End is_erv
0 chr3 122146077 1 Oct4 122145077 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430 438 122145515 16 Oct4-Sox2 Oct4/m0_p0 422 122145499 10.2722 high 0.6902 - Oct4 False 429 - 0 chr3 122145499 122145515 False
1 chr3 122146077 1 Oct4 122145077 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.9920 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451 459 122145536 16 Oct4-Sox2 Oct4/m0_p0 443 122145520 11.5528 high 0.9044 - Oct4 False 450 - 1 chr3 122145520 122145536 False
2 chr3 122146077 1 Oct4 122145077 . NaN NaN 1.2758 NaN 1.2758 Oct4 1.2758 high 0.9670 NaN NaN 0.4219 NaN 0.4219 Oct4 0.4219 low 0.2234 Oct4/metacluster_0/pa... 492 500 122145577 16 Oct4-Sox2 Oct4/m0_p0 484 122145561 10.7122 high 0.7777 - Oct4 False 491 - 2 chr3 122145561 122145577 False
3 chr3 122146077 1 Oct4 122145077 . NaN NaN 1.0455 NaN 1.0455 Oct4 1.0455 high 0.9050 NaN NaN 0.4155 NaN 0.4155 Oct4 0.4155 low 0.2005 Oct4/metacluster_0/pa... 534 542 122145619 16 Oct4-Sox2 Oct4/m0_p0 526 122145603 10.2722 high 0.6902 - Oct4 False 533 - 3 chr3 122145603 122145619 False
4 chr2 52072742 5 Oct4 52071742 . NaN NaN 1.4742 NaN 1.4742 Oct4 1.4742 high 0.9890 NaN NaN 0.5071 NaN 0.5071 Oct4 0.5071 high 0.6851 Oct4/metacluster_0/pa... 516 524 52072266 16 Oct4-Sox2 Oct4/m0_p0 508 52072250 9.2183 medium 0.4731 + Oct4 False 517 + 4 chr2 52072250 52072266 False
In [26]:
dict(dfi_subset.pattern_name.value_counts())
Out[26]:
{'Klf4': 103044,
 'Nanog': 76337,
 'Oct4': 74594,
 'Oct4-Sox2': 52475,
 'Sox2': 40193,
 'Zic3': 14927,
 'Oct4-Oct4': 12511,
 'B-Box': 7516,
 'Nanog-partner': 7066,
 'Essrb': 5353,
 'Klf4-Klf4': 3908}
In [27]:
fig, ax = plt.subplots(figsize=get_figsize(0.23))
dfi_subset.pattern_name.value_counts().plot.bar(width=.9)
plt.ylabel("Motif instances")
plt.xlabel("Motif")
fig.savefig(fdir / '../n-instances-per-motif.pdf')
In [103]:
# Number of regions with a motif (total = 150k. Only 14k lack a motif)
len(dfi_subset.groupby('example_idx').size())
Out[103]:
136302
In [104]:
dfi_subset.groupby('example_idx').size().value_counts()
Out[104]:
2     35091
3     31293
1     27308
      ...  
17        1
25        1
30        1
Length: 23, dtype: int64
In [113]:
total_regions = len(ranges)
In [114]:
fig, ax = plt.subplots(figsize=get_figsize(0.4))
vec = dfi_subset.groupby('example_idx').size()
vec = pd.concat([vec, pd.Series(np.zeros(total_regions - len(vec)))])
vec.plot.hist(bins=np.arange(11), rwidth=0.9, align='left')
plt.xlabel("Number of motifs per region")
fig.savefig(fdir / '../n-motifs-per-region.pdf')
In [16]:
dfi_subset.head()
Out[16]:
example_chrom example_end example_idx example_interval_from_task example_start example_strand imp/Klf4 imp/Nanog imp/Oct4 imp/Sox2 imp_max imp_max_task imp_weighted imp_weighted_cat imp_weighted_p match/Klf4 match/Nanog match/Oct4 match/Sox2 match_max match_max_task match_weighted match_weighted_cat match_weighted_p pattern pattern_center pattern_end pattern_end_abs pattern_len pattern_name pattern_short pattern_start pattern_start_abs seq_match seq_match_cat seq_match_p strand tf is_te pattern_center_aln pattern_strand_aln
0 chr3 122146077 1 Oct4 122145077 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430 438 122145515 16 Oct4-Sox2 Oct4/m0_p0 422 122145499 10.2722 high 0.6902 - Oct4 False 428 -
1 chr3 122146077 1 Oct4 122145077 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.9920 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451 459 122145536 16 Oct4-Sox2 Oct4/m0_p0 443 122145520 11.5528 high 0.9044 - Oct4 False 449 -
2 chr3 122146077 1 Oct4 122145077 . NaN NaN 1.2758 NaN 1.2758 Oct4 1.2758 high 0.9670 NaN NaN 0.4219 NaN 0.4219 Oct4 0.4219 low 0.2234 Oct4/metacluster_0/pa... 492 500 122145577 16 Oct4-Sox2 Oct4/m0_p0 484 122145561 10.7122 high 0.7777 - Oct4 False 490 -
3 chr3 122146077 1 Oct4 122145077 . NaN NaN 1.0455 NaN 1.0455 Oct4 1.0455 high 0.9050 NaN NaN 0.4155 NaN 0.4155 Oct4 0.4155 low 0.2005 Oct4/metacluster_0/pa... 534 542 122145619 16 Oct4-Sox2 Oct4/m0_p0 526 122145603 10.2722 high 0.6902 - Oct4 False 532 -
4 chr2 52072742 5 Oct4 52071742 . NaN NaN 1.4742 NaN 1.4742 Oct4 1.4742 high 0.9890 NaN NaN 0.5071 NaN 0.5071 Oct4 0.5071 high 0.6851 Oct4/metacluster_0/pa... 516 524 52072266 16 Oct4-Sox2 Oct4/m0_p0 508 52072250 9.2183 medium 0.4731 + Oct4 False 518 +
In [198]:
19.535 / (212.63-196.32) * 0.3
Out[198]:
0.3593194359288779
In [199]:
3.505 / (212.63-196.32) * 0.3
Out[199]:
0.06446965052115265
In [18]:
dfi_subset.groupby(['pattern_name', 'example_idx']).size().reset_index()
Out[18]:
pattern_name example_idx 0
0 B-Box 2 2
1 B-Box 4 2
2 B-Box 7 4
... ... ... ...
311598 Zic3 147928 1
311599 Zic3 147950 1
311600 Zic3 147966 1

311601 rows × 3 columns

In [ ]:
dfi_subset.head()
In [120]:
fig, axes = plt.subplots(len(tasks), 1, 
                         figsize=get_figsize(0.23*1.4, aspect=len(tasks)* 0.618/1.4),
                         sharey=True,
                         gridspec_kw=dict(hspace=0),
                         sharex=True)
for i, task in enumerate(tasks):
    ax = axes[i]
    vec = dfi_subset[dfi_subset.example_interval_from_task == task].groupby('example_idx').size()
    vec = pd.concat([vec, pd.Series(np.zeros(peaks_per_task[task] - vec.index.nunique()))])
    vec.plot.hist(bins=np.arange(10), rwidth=0.9, align='left', ax=ax)
    ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
ax.set_xlabel("Number of motifs per region")
fig.savefig(fdir / '../n-motifs-per-region.per-task.pdf')
In [121]:
fig, axes = plt.subplots(len(tasks), 1, 
                         figsize=get_figsize(0.23, aspect=len(tasks)* 0.618),
                         sharey=True,
                         gridspec_kw=dict(hspace=0),
                         sharex=True)
pattern_names = dfi_subset.pattern_name.unique()
for i, task in enumerate(tasks):
    ax = axes[i]
    if i == 0:
        ax.set_title("Total number of motifs")
    vec = dfi_subset[dfi_subset.example_interval_from_task == task].pattern_name.value_counts().reindex(pattern_names, fill_value=0)
    vec.plot.bar(width=.9, ax=ax)
    ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
ax.set_xlabel("Motif")
fig.savefig(fdir / '../n-instances-per-motif.per-task.pdf')
In [122]:
fig, axes = plt.subplots(len(tasks), 1, 
                         figsize=get_figsize(0.23, aspect=len(tasks)* 0.618),
                         sharey=True,
                         gridspec_kw=dict(hspace=0),
                         sharex=True)
pattern_names = dfi_subset.pattern_name.unique()
for i, task in enumerate(tasks):
    ax = axes[i]
    if i == 0:
        ax.set_title("Average number of motifs per peak")
    vec = dfi_subset[dfi_subset.example_interval_from_task == task].pattern_name.value_counts().reindex(pattern_names, fill_value=0)
    vec = vec / peaks_per_task[task]
    vec.plot.bar(width=.9, ax=ax)
    ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
ax.set_xlabel("Motif")

fig.savefig(fdir / '../n-instances-per-motif.per-task.avg-motifs-per-peak.pdf')
In [16]:
np.sum(dfi_subset.groupby('example_idx').size() >= 3)
Out[16]:
72696
In [19]:
np.sum(dfi_subset.groupby('example_idx').size() >= 3) / 150908
Out[19]:
0.4817239642696212
In [18]:
np.sum(dfi_subset.groupby('example_idx').size() >= 5)
Out[18]:
20352
In [17]:
72696 / 150000
Out[17]:
0.48464
In [14]:
dfi_subset.groupby('example_idx').size()
Out[14]:
example_idx
1         8
2         2
3         3
         ..
147971    1
147972    3
147973    4
Length: 135913, dtype: int64
In [47]:
patterns = [p for p in mr.get_all_patterns()]
In [93]:
ic = [p.seq_info_content for p in mr.get_all_patterns() if mr.n_seqlets(p.name) > 100]
In [99]:
n_seqlets = [mr.n_seqlets(p.name) for p in mr.get_all_patterns()]
In [94]:
len(ic)
Out[94]:
56
In [97]:
ic = [p.seq_info_content for p in mr.get_all_patterns()]
In [98]:
fig, ax = plt.subplots(figsize=get_figsize(0.4))
plt.hist(ic, bins=np.arange(0, 110, step=8), rwidth=0.9);
# plt.hist(ic, bins=15, rwidth=0.9, align='left');
plt.xlabel("IC")
plt.ylabel("Motifs")
fig.savefig(fdir / '../seq-IC.hist.all.pdf')
In [48]:
p = patterns[0]
In [50]:
p.seq_info_content
TF-MoDISco is using the TensorFlow backend.
Out[50]:
14.260665650571447
Check if they occur at the same location

Create motif pairs

In [21]:
# create motif pairs
# dfab_w_te = pd.concat([motif_pair_dfi(dfi_subset, pair).assign(motif_pair="<>".join(pair)) for pair in pairs], axis=0)
In [22]:
# create motif pairs
dfab = pd.concat([motif_pair_dfi(dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)], pair).assign(motif_pair="<>".join(pair)) for pair in pairs], axis=0)
In [23]:
# Remove self matches
dfab = dfab.query('~((pattern_center_aln_x == pattern_center_aln_y) & (pattern_strand_aln_x == pattern_strand_aln_x))')

Remove Sox2 and Oct4 matching exactly to Oct4-Sox2

In [24]:
exclude_sox2 = dfab[(dfab.motif_pair == 'Oct4-Sox2<>Sox2') & 
                    (dfab['center_diff_aln'] == 0)].row_idx_y.values
exclude_oct4 = dfab[(dfab.motif_pair == 'Oct4-Sox2<>Oct4') & 
                    (dfab['center_diff_aln'] == 0)].row_idx_y.values
exclude_oct4_v2 = dfab[(dfab.motif_pair == 'Oct4<>Oct4-Oct4') & 
                    (dfab['center_diff_aln'] == 0)].row_idx_y.values


old_len = len(dfab)
# Exclude the overlapping row
dfab = dfab[(dfab.pattern_name_x != 'Oct4') | (~dfab.row_idx_x.isin(exclude_oct4))]
dfab = dfab[(dfab.pattern_name_y != 'Oct4') | (~dfab.row_idx_y.isin(exclude_oct4))]
dfab = dfab[(dfab.pattern_name_x != 'Oct4') | (~dfab.row_idx_x.isin(exclude_oct4_v2))]
dfab = dfab[(dfab.pattern_name_y != 'Oct4') | (~dfab.row_idx_y.isin(exclude_oct4_v2))]
dfab = dfab[(dfab.pattern_name_x != 'Sox2') | (~dfab.row_idx_x.isin(exclude_sox2))]
dfab = dfab[(dfab.pattern_name_y != 'Sox2') | (~dfab.row_idx_y.isin(exclude_sox2))]
nd = len(dfab) - old_len
print(f"Removed {nd}/{len(dfab)} instances")
Removed 0/353922 instances
In [20]:
# dfab.to_csv(f"{model_dir}/deeplift/dfab.csv.gz", index=False, compression='gzip')

Sanity-check -> plot the meta profile

motif_subset = OrderedDict[(k,v )

In [22]:
# seqlets = dfi2seqlets(dfi_subset[dfi_subset.pattern_name == 'Oct4-Sox2'])
# seqlets = resize_seqlets(seqlets, 70, seqlen=1000)
# seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
In [24]:
# sort_idx = np.argsort(-seqlet_profiles['Oct4'].sum(axis=(-1,-2)))[:1000]
sort_idx = np.arange(1000)
In [23]:
# multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
# multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx, figsize=(10,10));

Co-occurence test

  • [ ] TODO - fix the test
    • [ ] use fisher exact
    • [ ] remove TE's?

<100bp

In [28]:
from basepair.exp.chipnexus.spacing import coocurrence_plot
from basepair.exp.chipnexus.spacing import co_occurence_matrix
In [29]:
dfi_subset.pattern_name.unique()
Out[29]:
array(['Oct4-Sox2', 'Oct4', 'B-Box', 'Oct4-Oct4', 'Essrb', 'Sox2', 'Nanog', 'Zic3',
       'Nanog-partner', 'Klf4', 'Klf4-Klf4'], dtype=object)
In [30]:
def co_occurence_matrix(dfi_subset, query_string=""):
    """Returns the fraction of times pattern x (row) overlaps pattern y (column)
    """
    from basepair.stats import norm_matrix
    total_number = dfi_subset.groupby(['pattern']).size()
    norm_counts = norm_matrix(total_number)

    # normalization: minimum number of counts
    total_number = dfi_subset.groupby(['pattern_name']).size()
    norm_counts = norm_matrix(total_number)

    # cross-product
    dfi_filt_crossp = pd.merge(dfi_subset[['pattern_name', 'pattern_center_aln',
                                           'pattern_strand_aln', 'pattern_center', 'example_idx']].set_index('example_idx'),
                               dfi_subset[['pattern_name', 'pattern_center_aln',
                                           'pattern_strand_aln', 'pattern_center', 'example_idx']].set_index('example_idx'),
                               how='outer', left_index=True, right_index=True).reset_index()
    # remove self-matches
    dfi_filt_crossp = dfi_filt_crossp.query('~((pattern_name_x == pattern_name_y) & '
                                            '(pattern_center_aln_x == pattern_center_aln_y) & '
                                            '(pattern_strand_aln_x == pattern_strand_aln_x))')
    dfi_filt_crossp['center_diff'] = dfi_filt_crossp.eval("abs(pattern_center_x- pattern_center_y)")
    dfi_filt_crossp['center_diff_aln'] = dfi_filt_crossp.eval("abs(pattern_center_aln_x- pattern_center_aln_y)")

    if query_string:
        dfi_filt_crossp = dfi_filt_crossp.query(query_string)
    match_sizes = dfi_filt_crossp.groupby(['pattern_name_x', 'pattern_name_y']).size()
    count_matrix = match_sizes.unstack(fill_value=0)

    norm_count_matrix = count_matrix / norm_counts  # .truediv(min_counts, axis='columns').truediv(total_number, axis='index')
    norm_count_matrix = norm_count_matrix.fillna(0)  # these examples didn't have any paired pattern

    return count_matrix, norm_count_matrix, norm_counts


def chi2_test_coc(random_coocurrence_counts, random_coocurrence,
                    random_coocurrence_norm, coocurrence_counts, coocurrence, coocurrence_norm):
    from scipy.stats import chi2_contingency
    cols = list(coocurrence_norm.columns)
    n = len(random_coocurrence_counts)
    o = np.zeros((n, n))
    op = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            
            # [[# not randomly found together , # not found together],
            #  [# randomly found together     , # found together]]
            ct = [[random_coocurrence_norm.iloc[i, j] - random_coocurrence_counts.iloc[i, j],
                   coocurrence_norm.iloc[i, j] - coocurrence_counts.iloc[i, j]],
                  [random_coocurrence_counts.iloc[i, j],
                   coocurrence_counts.iloc[i, j]]]
            ct = np.array(ct)
            # TODO - make this an actual fisher exact test from
            # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html
            # or the chi-square contingency table:
            # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html#scipy.stats.chi2_contingency
            chi2, p, dof, ex = chi2_contingency(ct, correction=False)
            # t22 = sm.stats.contingency_tables.Table2x2(np.array(ct))
            
            o[i, j] = (ct[1, 1] / ct[0, 1]) / (ct[1, 0] / ct[0, 0 ])
            op[i, j] = p
    return pd.DataFrame(o, columns=cols, index=cols), pd.DataFrame(op, columns=cols, index=cols)


def coocurrence_plot(dfi_subset, motif_list, query_string="(abs(pattern_center_aln_x- pattern_center_aln_y) <= 150)",
                     signif_threshold=1e-5, ax=None, **kwargs):
    """Test for co-occurence

    Args:
      dfi_subset: desired subset of dfi
      motif_list: list of motifs used to order the heatmap
      query_string: string used with df_cross.query() to detering the valid motif pairs
      signif_threshold: significance threshold for Fisher's exact test
    """
    import seaborn as sns
    if ax is None:
        ax = plt.gca()
    c_counts, c, c_norm = co_occurence_matrix(dfi_subset, query_string=query_string)

    # Generate the NULL
    dfi_subset_random = dfi_subset.copy()
    np.random.seed(42)
    # TODO - shall we draw the sample indices completely at random?
    dfi_subset_random['example_idx'] = dfi_subset_random['example_idx'].sample(frac=1).values
    rc_counts, rc, rc_norm = co_occurence_matrix(dfi_subset_random, query_string=query_string)

    # test for significance
    o, op = chi2_test_coc(rc_counts, rc, rc_norm, c_counts, c, c_norm)

    # re-order
    o = o[motif_list].loc[motif_list]
    op = op[motif_list].loc[motif_list]

    signif = op < signif_threshold
    a = np.zeros_like(signif).astype(str)
    a[signif] = "*"
    a[~signif] = ""

    sns.heatmap(o, annot=a, fmt="", vmin=0, vmax=2,
                cmap='RdBu_r', ax=ax, **kwargs)
    ax.set_title(f"odds-ratio (proximal / non-proximal) (*: p<{signif_threshold})")
In [72]:
# count_matrix, norm_count_matrix, norm_counts = co_occurence_matrix(dfi_subset, 
#                  query_string=f"(abs(pattern_center_aln_x- pattern_center_aln_y) <= {dist}) & center_diff_aln > 5")
In [141]:
# co-occurence
fig, ax = plt.subplots(figsize=get_figsize(.4, aspect=1))
dist = 100
coocurrence_plot(dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)], list(motifs),
                 query_string=f"(abs(pattern_center_aln_x- pattern_center_aln_y) <= {dist}) & center_diff_aln > 5")
ax.set_ylabel("Motif of interest")
ax.set_xlabel("Motif partner");
fig.savefig(fdir / f'coocurrence.test.center_diff<{dist}.motifs=10.exclude-te.pdf')

All distance ranges

In [142]:
# subsets = ['center_diff <= 35', 'center_diff > 35', 'center_diff > 70']
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.4*len(subsets), .8/len(subsets)), 
                        sharey=True)
# dfs = dfi_subset.query('match_weighted_p > .2').query('imp_weighted_p > .2')
dfs = dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)]
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    if i == len(subsets) - 1:
        cbar = True
    else:
        cbar = False
    coocurrence_plot(dfs, list(motifs), query_string=f"({subset}) & (center_diff_aln > 5)" , ax=ax, cbar=cbar)
    if i == 0:
        ax.set_ylabel("Motif of interest")
        ax.set_xlabel("Motif partner");
    
    ax.set_title(subset_label)
# plt.tight_layout()
fig.savefig(fdir / f'coocurrence.test.all-dist.motifs=10.exclude-TE.pdf')

Only the major motifs

In [31]:
main_motifs = OrderedDict([(k,v)
                            for k,v in motifs.items()
                            if k in ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']
                           ])
In [32]:
# co-occurence
fig, ax = plt.subplots(figsize=get_figsize(.25, aspect=1))
dist = 100
coocurrence_plot(dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)], list(main_motifs),
                 query_string=f"(abs(pattern_center_aln_x- pattern_center_aln_y) <= {dist}) & center_diff_aln > 5")
ax.set_ylabel("Motif of interest")
ax.set_xlabel("Motif partner");
fig.savefig(fdir / f'coocurrence.test.center_diff<{dist}.main-motifs.pdf')

All distance ranges

In [34]:
# subsets = ['center_diff <= 35', 'center_diff > 35', 'center_diff > 70']
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
# dfs = dfi_subset.query('match_weighted_p > .2').query('imp_weighted_p > .2')
dfs = dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)]
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    if i == len(subsets) - 1:
        cbar = True
    else:
        cbar = False
    coocurrence_plot(dfs, list(main_motifs), query_string=f"({subset}) & (center_diff_aln > 5)" , ax=ax, cbar=cbar)
    if i == 0:
        ax.set_ylabel("Motif of interest")
        ax.set_xlabel("Motif partner");
    
    ax.set_title(subset_label)
# plt.tight_layout()
fig.savefig(fdir / f'coocurrence.test.all-dist.main-motifs.pdf')

Pairwise distance distribution

In [28]:
main_motif_pairs = [k + "<>" + v for k,v in get_motif_pairs(['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4'])]
In [29]:
dfab_main = dfab[dfab.motif_pair.isin(main_motif_pairs)]
dfab_main['motif_pair'] = pd.Categorical(dfab_main['motif_pair'],
                                         categories=main_motif_pairs)
In [35]:
plotnine.options.figure_size = get_figsize(.4, aspect=3.1875)# (10, 10)
max_dist = 100
min_dist = 6

fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab_main[(dfab_main.center_diff <= max_dist) & 
                                                                    (dfab_main.center_diff > min_dist)]) + 
 geom_histogram(breaks=np.arange(min_dist, max_dist + 1)) + 
 facet_grid("motif_pair~ .") +
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 # ylim([0, 1000]) +
 xlab("Pairwise distance") + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / 'histogram.center_diff.main-4-motifs.imp>.2,match>.0.pdf')
fig
Out[35]:
<ggplot: (-9223363275141050620)>
In [16]:
df_instances = pd.read_csv("https://docs.google.com/spreadsheets/d/1uAtJZJR_dmlWwer5Afcd2q72Uuj7A4Ln/export?gid=826411817&format=csv")
In [17]:
df_instances
Out[17]:
task pattern imp_score is_te length IC n_pwm_gw n_pwm_peaks n_cwm_peaks n_seqlets n_cwm_and_pwm_peaks frac_cwm_in_pwm_peaks frac_pwm_in_cwm_peaks is_short_motif_>100 seqlets_and_ic<30_and_ic>4 Unnamed: 14 Unnamed: 15
0 Oct4 metacluster_0/pattern_9 profile/wn True 68.0 52.2238 76989.0 1446.0 9.1400e+02 355.0 309.0 3.3807e-01 0.2137 FALSE NaN NaN
1 Oct4 metacluster_0/pattern_13 profile/wn True 70.0 62.8465 94727.0 1179.0 1.3800e+02 138.0 97.0 7.0290e-01 0.0823 FALSE NaN NaN
2 Oct4 metacluster_0/pattern_14 profile/wn True 70.0 108.1187 90990.0 1538.0 5.4100e+02 125.0 373.0 6.8946e-01 0.2425 FALSE NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
59 Klf4 metacluster_0/pattern_9 profile/wn False 70.0 3.0499 54457.0 1257.0 1.1689e+07 214.0 526.0 4.4999e-05 0.4185 FALSE NaN NaN
60 Klf4 metacluster_0/pattern_11 profile/wn False 33.0 19.3212 55242.0 2501.0 6.3700e+02 174.0 370.0 5.8085e-01 0.1479 TRUE NaN NaN
61 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 33 NaN NaN

62 rows × 16 columns

In [215]:
df_instances[df_instances.IC > 4].n_cwm_peaks.sum()
Out[215]:
965844.0
In [217]:
df_instances[(df_instances.IC > 4) & (df_instances.is_te==False)].n_cwm_peaks.sum()
Out[217]:
956134.0
In [243]:
main_motif_names = [longer_pattern(v) for k,v in motifs.items()] + ["Oct4/metacluster_0/pattern_16"]
In [226]:
df_instances['pattern']
Out[226]:
0      metacluster_0/pattern_9
1     metacluster_0/pattern_13
2     metacluster_0/pattern_14
                ...           
59     metacluster_0/pattern_9
60    metacluster_0/pattern_11
61                         NaN
Name: pattern, Length: 62, dtype: object
In [231]:
df_instances['pattern']
Out[231]:
0      metacluster_0/pattern_9
1     metacluster_0/pattern_13
2     metacluster_0/pattern_14
                ...           
59     metacluster_0/pattern_9
60    metacluster_0/pattern_11
61                         NaN
Name: pattern, Length: 62, dtype: object
In [244]:
df_instances['pattern_name'] = df_instances['task'] + '/' + df_instances['pattern']
In [251]:
dfi_subset.pattern_name.unique()
Out[251]:
array(['Oct4-Sox2', 'Oct4', 'B-Box', 'Oct4-Oct4', 'Sox2', 'Nanog', 'Zic3', 'Nanog-partner', 'Klf4',
       'Klf4-Klf4'], dtype=object)
In [ ]:
dfi_subset.
In [239]:
df_instances[(df_instances.IC > 4) & (df_instances.is_te==False)].n_cwm_peaks.sum()
Out[239]:
956134.0
In [214]:
df_instances.head()
Out[214]:
task pattern imp_score is_te length IC n_pwm_gw n_pwm_peaks n_cwm_peaks n_seqlets n_cwm_and_pwm_peaks frac_cwm_in_pwm_peaks frac_pwm_in_cwm_peaks is_short_motif_>100 seqlets_and_ic<30_and_ic>4 Unnamed: 14 Unnamed: 15
0 Oct4 metacluster_0/pattern_9 profile/wn True 68.0 52.2238 76989.0 1446.0 914.0 355.0 309.0 0.3381 0.2137 FALSE NaN NaN
1 Oct4 metacluster_0/pattern_13 profile/wn True 70.0 62.8465 94727.0 1179.0 138.0 138.0 97.0 0.7029 0.0823 FALSE NaN NaN
2 Oct4 metacluster_0/pattern_14 profile/wn True 70.0 108.1187 90990.0 1538.0 541.0 125.0 373.0 0.6895 0.2425 FALSE NaN NaN
3 Oct4 metacluster_0/pattern_15 profile/wn True 69.0 63.6217 93066.0 1205.0 145.0 142.0 98.0 0.6759 0.0813 FALSE NaN NaN
4 Sox2 metacluster_0/pattern_6 profile/wn True 60.0 34.9974 64904.0 550.0 1731.0 104.0 217.0 0.1254 0.3945 FALSE NaN NaN
In [143]:
# Generate all spacing plots
individual_plots = {}
for motif_pair_name in pair_names:
    plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
    max_dist = 100
    min_dist = 0
    df = dfab[(dfab.center_diff <= max_dist) & 
              (dfab.motif_pair == motif_pair_name)]
    if len(df) == 0:
        continue
    fig = (ggplot(aes(x='center_diff', fill='strand_combination'), df) + 
     # Spacing vertical lines
     geom_vline(xintercept=10, alpha=0.1) + 
     geom_vline(xintercept=20, alpha=0.1) + 
     geom_vline(xintercept=30, alpha=0.1) + 
     geom_vline(xintercept=40, alpha=0.1) + 
     # plot
     geom_histogram(breaks=np.arange(min_dist, max_dist + 1)) + 
     facet_grid("strand_combination~.") + 
     # Theme, labels, colors
     theme_classic(base_size=10, base_family='Arial') + 
     theme(strip_text = element_text(rotation=0), legend_position='top') + 
     xlim([min_dist, max_dist]) + 
     xlab("Pairwise distance") +
     ggtitle(motif_pair_name) + 
     scale_fill_brewer(type='qual', palette=3))
    
    # Save
    fig.save(fdir_individual / f'{motif_pair_name}.pdf')
    individual_plots[motif_pair_name] = fig
In [383]:
individual_plots['Sox2<>Nanog']
Out[383]:
<ggplot: (-9223363287224432780)>
In [270]:
individual_plots['Nanog<>Nanog']
Out[270]:
<ggplot: (-9223363277389710634)>

Detailed individual plots

  • [X] TODO - include a detailed major and minor tiks (Minor = 1bp, Major = 5bp)

Considered TF's:

  • Nanog-Nanog
  • Nanog-Sox2
  • Zic3-Nanog
In [415]:
for motif_pair_name in ['Nanog<>Nanog', 'Sox2<>Nanog', 'Nanog<>Zic3']:
    plotnine.options.figure_size = get_figsize(2, aspect=2/10*4 / 2)
    xmin = 5
    xmax = 50
    df = dfab[(dfab.center_diff <= xmax) & 
              (dfab.motif_pair == motif_pair_name)]
    fig = (ggplot(aes(x='center_diff', fill='strand_combination'), df) + 
     # plot
     geom_histogram(breaks=np.arange(xmin, xmax+1)) + facet_grid("strand_combination~.") + 
     # Theme, labels, colors
     theme_bw(base_size=10, base_family='Arial') + 
     theme(strip_text = element_text(rotation=0), legend_position='top') +  
     xlab("Pairwise distance") +
     ggtitle(motif_pair_name) + 
     scale_x_continuous(breaks=np.arange(xmin, xmax, step=5),
                        minor_breaks=np.arange(xmin, xmax, step=1)) +
     scale_fill_brewer(type='qual', palette=3))
     # axis_ticks_major_x()
    display(fig)
    fig.save(fdir_individual / f'{motif_pair_name}.large.pdf')
<ggplot: (8749611385327)>
<ggplot: (8749628652068)>
<ggplot: (-9223363287240838428)>

PWM for the main motifs

  • [ ] Plot a PWM of all the sequences overlapping a certain region
    • [ ] Query the motif pairs -> construct intervals
    • [ ] Create seqlets
    • [ ] Fetch the sequence from ImpScoreFile
  • [ ] Consider each orientation
  • [ ] Flank the sequence
  • [ ] Say the peak at 10bp or peak at 20 bp

TODO - take into account the strand

In [124]:
rdir = Path('../../../')
In [125]:
ls {rdir}
basepair/           conda-env-gpu.yml  error.afmat-compute.log  setup.py*
basepair.egg-info/  conda-env.yml      example/                 Snakefile
chipnexus-gdrive@   data@              py27-conda-env.yml       src/
conda-env-cpu.yml   docs/              README.md                tests/
In [128]:
dataspec_file = rdir / "src/chipnexus/train/seqmodel/ChIP-nexus.dataspec.yml"
ds = DataSpec.load(dataspec_file)

# Load all files into the page cache
ds.touch_all_files()
In [134]:
from basepair.extractors import Interval
from basepair.exp.paper.config import fasta_file
from genomelake.extractors import FastaExtractor

from basepair.utils import halve

from basepair.preproc import resize_interval
In [130]:
fe = FastaExtractor(fasta_file, use_strand=True)
In [131]:
def plot_pwm(pwm, ax):
    from matplotlib import ticker
    from concise.utils.pwm import seqlogo
    from basepair.modisco.utils import ic_scale
    seqlogo(ic_scale(pwm), ax=ax)
    # strip_axis(ax)
    ax.set_xlabel(None)
    # ax.get_xaxis().set_visible(False)
    ax.set_ylim([0, 2.0])
    ax.set_ylabel("IC")
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    
    ax.xaxis.set_major_locator(ticker.AutoLocator())


def create_intervals(dfab, ax=None):
    from basepair.extractors import Interval
    intervals = []
    rev_strand = {"+": "-", "-": "+"}
    for i,row in dfab.iterrows():
        diff = row.pattern_center_y - row.pattern_center_x
        width = np.abs(diff) + 30
#         if diff > 0:
#             start = min(row.pattern_start_x, row.pattern_end_x) - 10
#             end = start + width
#             strand = row.strand_x
#         else:
#             end = max(row.pattern_start_x, row.pattern_end_x) + 10 + 1
#             start = end - width
#             strand = row.strand_x
        if diff > 0:
            start = row.pattern_center_x - 15
            end = start + width
            strand = row.strand_x
        else:
            end = row.pattern_center_x + 15
            start = end - width
            strand = row.strand_x
        intervals.append(Interval(chrom=row.example_chrom_x, 
                     start=row.example_start_x + start, 
                     stop=row.example_start_x + end,
                     name=row.motif_pair, 
                     strand=strand,
                     # strand="+" if row.pattern_center_x > row.pattern_center_y else "-"
                    ))
    return intervals

Notes

It's nice that the discovered motif pairs are not comming from a TE

In [ ]:
mrn = mr.mr_dict['Nanog']
In [ ]:
ls /oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/deeplift/Nanog/out/profile/wn/
In [159]:
patterns = read_pkl("/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/deeplift/Nanog/out/profile/wn/patterns.pkl")
In [ ]:
p = {p.name:p for p in patterns}['metacluster_0/pattern_1']
In [ ]:
p = p.shift(4).resize(50).resize_profile(50)

TODO

  • can you come up with a way to nicely visualize this?
    • overplot the two tracks?
In [204]:
p.rc().plot("seq_ic")
Out[204]:
In [271]:
dfab[dfab.motif_pair == 'Nanog<>Nanog-partner'].groupby(['strand_combination', 'center_diff']).size().sort_values()
Out[271]:
strand_combination  center_diff
++                  3.0              1
+-                  306.0            1
                    302.0            1
                                  ... 
++                  10.0            25
                    9.0             28
-+                  6.0            278
Length: 726, dtype: int64
In [295]:
np.sum(dfi_subset.pattern_name == 'Nanog-partner')
Out[295]:
7066
In [296]:
278/7066
Out[296]:
0.03934333427681857
In [ ]:
from basepair.BPNet import BPNetSeqModel
In [40]:
# create_tf_session(0)
In [288]:
bpnet = BPNetSeqModel.from_mdir(model_dir)
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2019-07-13 14:00:47,770 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2019-07-13 14:01:01,067 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
In [ ]:
to_plot = [
    ("Nanog<>Nanog-partner", "-+", 6),
    ("Nanog<>Nanog-partner", "++", 9),
    ("Nanog<>Nanog-partner", "++", 10),
#     ("Sox2<>Nanog", "--", 10),
#     ("Sox2<>Nanog", "--", 20),
#     ("Sox2<>Nanog", "--", 30),
#     ("Sox2<>Nanog", "++", 13),
#     ("Sox2<>Nanog", "++", 22),
#     ("Sox2<>Nanog", "++", 23),
#     ("Nanog<>Zic3", "++", 12),
#     ("Nanog<>Zic3", "+-", 11),
#     ("Nanog<>Zic3", "+-", 21),
#     ("Nanog<>Zic3", "+-", 31),
#     ("Oct4<>Sox2", "++", 10),
]

for motif_pair, strand_combination, dist in to_plot:
#     dist = 10
#     strand_combination = '++'
#     motif_pair = 'Nanog<>Nanog'
    fig, axes = plt.subplots(2,1, 
                           gridspec_kw={'height_ratios':[1/7, 1/2]},
                           sharex=True,
                           figsize=get_figsize(.5, 1/7+1/2))
    dfp = (dfab
           .query(f'strand_combination=="{strand_combination}"')
           .query(f'motif_pair=="{motif_pair}"')
           .query(f'center_diff=={dist}'))
    intervals = create_intervals(dfp, ax=ax)
    seqs = fe(intervals)
    
    counts = ds.load_counts(intervals)
    ax = axes[0]
    plot_pwm(seqs.mean(0), ax=ax)
    ax.set_title(f"{motif_pair} {strand_combination}, dist = {dist}, ({len(dfp)})");
    
    
    cmap = plt.get_cmap("tab10")
    cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
    
    ao = .7
    # plot aggregate profile
    agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
    ax = axes[1]
    # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
    ax.plot(agg_profile[:, 0], alpha=ao, label='Obs', color=cycle[0])
    ax.plot(-agg_profile[:, 1], alpha=ao, color=cycle[0])
    ax.set_ylabel("Counts")
    # ax.set_title("Observed")
    sns.despine(top=True, bottom=True, right=True)
    
    # params
    seqlen = seqs.shape[1]
    
    # Plot model
    agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
    # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
    seqs_1kb = fe([resize_interval(interval, 1000) for interval in intervals])
    preds = bpnet.predict(seqs_1kb)
    agg_preds = preds['Nanog'].mean(0)
    upstream_seq, downstream_seq = halve(seqlen)
    xs = slice(500-upstream_seq, 500+downstream_seq)
    a = agg_profile.sum() / agg_preds[xs].sum()
    ax.plot(a*agg_preds[xs, 0], label='BPNet', alpha=ao, color=cycle[1])
    ax.plot(-a*agg_preds[xs, 1], alpha=ao, color=cycle[1])
    ax.set_ylabel("Counts")
    # ax.set_title("BPNet")
    sns.despine(top=True, bottom=True, right=True)
    
    
#     pshifted = p.rc().shift(-3).resize(seqlen).resize_profile(seqlen)
#     upstream, downstream = halve(dist)
#     double_profile = pshifted.shift(-upstream).profile['Nanog'] + pshifted.shift(downstream).profile['Nanog']
#     # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
#     a = agg_profile.sum() / double_profile.sum()
#     ax.plot(a*double_profile[:, 0], label='Mixture', alpha=ao, color=cycle[2])
#     ax.plot(-a*double_profile[:, 1], alpha=ao, color=cycle[2])
#     ax.set_ylabel("Counts")
    # ax.set_title("Mixture")
    sns.despine(top=True, bottom=True, right=True)
    ax.legend()#labels=['Obs', 'BPNet', 'Mixure'])
    fig.align_ylabels()
    
    fig.savefig(fdir_individual / f'{motif_pair}.{strand_combination}.{dist}.pwm+profile.pdf')

I looked at all the motif pairs of Nanog and Nanog-alt and then visualized the aggregate footprints for the following scenarios:

  1. only those where Nanog is 6 bp away at -+ orientation
  2. All those where Nanog is not 6 bp away at -+ orientation (e.g. the opposite of 1.)
  3. All those where Nanog is > 15 bp away
  4. All
In [136]:
# Nanog
from basepair.extractors import Interval
to_plot = [
    ("Nanog<>Nanog-partner", "-+", 6),
    ("Nanog<>Nanog-partner", "-+", -6),
    ("Nanog<>Nanog-partner", None, -15),
    ("Nanog<>Nanog-partner", "-+", None),
]

for motif_pair, strand_combination, dist in to_plot:
#     dist = 10
#     strand_combination = '++'
#     motif_pair = 'Nanog<>Nanog'
    fig, axes = plt.subplots(2,1, 
                           gridspec_kw={'height_ratios':[1/7, 1/2]},
                           sharex=True,
                           figsize=get_figsize(.5, 1/7+1/2))
    dfp = (dfab
           .query(f'motif_pair=="{motif_pair}"'))
    if dist is not None:
        if dist > 0 :
            dfp = (dfp
                   .query(f'((strand_combination=="{strand_combination}") &  (center_diff=={dist}))'))
        else:
            if strand_combination is not None:
                dfp = (dfp
                       .query(f'~((strand_combination=="{strand_combination}") &  (center_diff==-{dist}))'))
            else:
                dfp = (dfp
                       .query(f'center_diff>-{dist}'))
    intervals = [Interval(chrom=row.Chromosome_y,
                          start=row.Start_y - 20,
                          stop=row.End_y + 20,
                          strand=row.strand_y)
                 for i, row in dfp.iterrows()]
    # intervals = create_intervals(dfp, ax=ax)
    seqs = fe(intervals)
    
    counts = ds.load_counts(intervals)
    ax = axes[0]
    plot_pwm(seqs.mean(0), ax=ax)
    ax.set_title(f"{motif_pair} {strand_combination}, dist = {dist}, ({len(dfp)})");
    
    
    cmap = plt.get_cmap("tab10")
    cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
    
    ao = .7
    # plot aggregate profile
    agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
    ax = axes[1]
    # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
    ax.plot(agg_profile[:, 0], alpha=ao, label='Obs', color=cycle[0])
    ax.plot(-agg_profile[:, 1], alpha=ao, color=cycle[0])
    ax.set_ylabel("Counts")
    # ax.set_title("Observed")
    sns.despine(top=True, bottom=True, right=True)
    
    # params
    seqlen = seqs.shape[1]
    
    # Plot model
    agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
    # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
    seqs_1kb = fe([resize_interval(interval, 1000) for interval in intervals])
    # preds = bpnet.predict(seqs_1kb)
    # agg_preds = preds['Nanog'].mean(0)
    upstream_seq, downstream_seq = halve(seqlen)
    xs = slice(500-upstream_seq, 500+downstream_seq)
    # a = agg_profile.sum() / agg_preds[xs].sum()
    # ax.plot(a*agg_preds[xs, 0], label='BPNet', alpha=ao, color=cycle[1])
    # ax.plot(-a*agg_preds[xs, 1], alpha=ao, color=cycle[1])
    ax.set_ylabel("Counts")
    ax.set_ylim([-11, 11])
    # ax.set_title("BPNet")
    sns.despine(top=True, bottom=True, right=True)
    
    
#     pshifted = p.rc().shift(-3).resize(seqlen).resize_profile(seqlen)
#     upstream, downstream = halve(dist)
#     double_profile = pshifted.shift(-upstream).profile['Nanog'] + pshifted.shift(downstream).profile['Nanog']
#     # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
#     a = agg_profile.sum() / double_profile.sum()
#     ax.plot(a*double_profile[:, 0], label='Mixture', alpha=ao, color=cycle[2])
#     ax.plot(-a*double_profile[:, 1], alpha=ao, color=cycle[2])
#     ax.set_ylabel("Counts")
    # ax.set_title("Mixture")
    sns.despine(top=True, bottom=True, right=True)
    ax.legend()#labels=['Obs', 'BPNet', 'Mixure'])
    fig.align_ylabels()
    
    fig.savefig(fdir_individual / f'{motif_pair}.{strand_combination}.{dist}.pwm+profile.Nanog-mix.pdf')
In [162]:
nanog_patterns = read_pkl("/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/deeplift/Nanog/out/profile/wn/patterns.pkl")
nanog_patterns = {p.name: p for p in nanog_patterns}
In [166]:
nanog_patterns
Out[166]:
{'metacluster_0/pattern_13': Pattern('metacluster_0/pattern_13', 'TAGCCAGTCTCACCCAAACAAGGGAATACACAATGGCCTAGCGAATAGGTGGGTTTACCATGAAAGAGAA' ...),
 'metacluster_0/pattern_12': Pattern('metacluster_0/pattern_12', 'GTTACCTATTCAGTAACTAATTAAGCATTACAAAAGACGGAGCCAGCAGCTCAGAGAAAAAAAAAAAAAA' ...),
 'metacluster_0/pattern_7': Pattern('metacluster_0/pattern_7', 'AGTAGAGACAGTGAAACTAATTAGGCATTACAAAGAACAGAGCTAGAATAGCTCAGGGAAAAAAAAAAAA' ...),
 'metacluster_0/pattern_18': Pattern('metacluster_0/pattern_18', 'TAAACCTGCTTCCCACCTTAATTTGTAAAATAAAGGAGAGCTGATGACTGGGCAGATAAAAGGGAAGGTG' ...),
 'metacluster_0/pattern_15': Pattern('metacluster_0/pattern_15', 'CAAAGAACAGAGCTAGAATAGCTCAGGGCTAGTCTGCAGAGTGATTACTTAGGACAGTAGCCAGTCTCAA' ...),
 'metacluster_0/pattern_11': Pattern('metacluster_0/pattern_11', 'AGGGCTAGTCTGCATAGTAATTACTTAGGACAATAGCCGAGTCACACCCAAACACAGAAAAAAAAAAAAA' ...),
 'metacluster_0/pattern_17': Pattern('metacluster_0/pattern_17', 'ACAACTGGCTGTTATCATGTAGAAGAATGCAAATTGATCCATTCCTATCTCCTTGTAAAAAAAAAAAAAA' ...),
 'metacluster_0/pattern_10': Pattern('metacluster_0/pattern_10', 'GGTCAGAGTCTGAGCTTGATCGTTGCTAATTTGCAATTGAATGGAGTACCTGGACTTCCCCAAAGAAAAA' ...),
 'metacluster_0/pattern_16': Pattern('metacluster_0/pattern_16', 'AAACCAAAGCCCAGTTTTTTATTTACATATCAATTCAGTCATTTACTCCAGGTTCCCTTTTGATTATCCA' ...),
 'metacluster_0/pattern_14': Pattern('metacluster_0/pattern_14', 'AAAAATCCTGGGAAAGCAGAATTAGCATCAAAATACAAACAGCACCAGGGCCATCCACAACACTTACCCA' ...),
 'metacluster_0/pattern_6': Pattern('metacluster_0/pattern_6', 'TCTTGAAGAGAAAAAGGGCAAGTGGCAGAACAATAGAGCCATCTAGGTAGGAAGGCTCCACCCTAGAAAA' ...),
 'metacluster_0/pattern_0': Pattern('metacluster_0/pattern_0', 'AAAGAATTTTAAATATGGTAATTTGCATAACAATGGAAACAATAGGATCATAAATTTTAAAAAAAAAAAA' ...),
 'metacluster_0/pattern_9': Pattern('metacluster_0/pattern_9', 'AGGTTGGGCATCGGGCAGGTAATGGCACATCAAAGGAGACTGATGGAAGGCAGCAAGGCATAAGGAGAAA' ...),
 'metacluster_0/pattern_8': Pattern('metacluster_0/pattern_8', 'AAAAAAAAAAAAAATGGATTTAAGGTAGAACAAAGGAGTCGTTATGGAAGTCTTGATAATCCTTTTAAAC' ...),
 'metacluster_0/pattern_3': Pattern('metacluster_0/pattern_3', 'ATTGTTAATATAAATAGTAATTTCCCAAAACAATGGCCAATTTTAATCGGACAATGAAATTAACAATGAA' ...),
 'metacluster_0/pattern_4': Pattern('metacluster_0/pattern_4', 'AAAAAAAAAAAAAAAAATAATAAAGTGCAGGAAATGGGCCATCAAAAAAACAATTAGAAATTATTAAAAA' ...),
 'metacluster_0/pattern_5': Pattern('metacluster_0/pattern_5', 'AAAAAAAAAAATATTCCATATTAGTACTTTGATGGCCCATTAAGTCCCACTTAAAGCATTTTTTTGCTTT' ...),
 'metacluster_0/pattern_1': Pattern('metacluster_0/pattern_1', 'TTTTTTTAAAAATTTTTTTATTATTGTCTTGATGGCTTTTTTTTTTTTTTTTTATAAATTTTTTTAAAAA' ...),
 'metacluster_0/pattern_2': Pattern('metacluster_0/pattern_2', 'TAGTCTCCGGAATAGCTCAAGGGCCTCTCAGCAGGTAGCAGTGTCTTGGAGAAGAACAGCGGCAAAAAAA' ...)}
In [175]:
p_nanog = nanog_patterns[longer_pattern(motifs['Nanog'].split("/")[1])]
In [176]:
p_nanog_alt = longer_pattern(motifs['Nanog-partner'].split("/")[1])
In [189]:
from basepair.exp.paper.fig4 import *
In [195]:
pattern_names = ['Nanog', 'Nanog-partner']
for pname in pattern_names:
    p = nanog_patterns[longer_pattern(motifs[pname].split("/")[1])]
    agg_profile = p.profile['Nanog'][70:130]
    pfm = p.seq[5:65]
    fig, axes = plt.subplots(2,1, 
                               gridspec_kw={'height_ratios':[1/7, 1/2]},
                               sharex=True,
                               figsize=get_figsize(.5, 1/7+1/2))
    cmap = plt.get_cmap("tab10")
    cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']

    ax = axes[0]
    plot_pwm(pfm, ax=ax)
    ax.set_title(f"Nanog");


    ao = .7
    # plot aggregate profile
    ax = axes[1]
    ax.plot(agg_profile[:, 0], alpha=ao, label='Obs', color=cycle[0])
    ax.plot(-agg_profile[:, 1], alpha=ao, color=cycle[0])

    ax.set_ylabel("Counts")
    ax.set_ylim([-11, 11])
    # ax.set_title("BPNet")
    sns.despine(top=True, bottom=True, right=True)
    ax.legend()
    fig.align_ylabels()

    fig.savefig(fdir_individual / f'only-{pname}.pdf')
In [ ]:
p_nanog.plot(['seq_ic', 'profile'])
In [ ]:
to_plot = [
    ("Nanog<>Nanog", "++", 10),
    ("Nanog<>Nanog", "++", 20),
    ("Nanog<>Nanog", "++", 21),
    ("Nanog<>Nanog", "++", 30),
    ("Nanog<>Nanog", "++", 31),
    ("Nanog<>Nanog", "++", 40),
    ("Nanog<>Nanog", "++", 41),
#     ("Sox2<>Nanog", "--", 10),
#     ("Sox2<>Nanog", "--", 20),
#     ("Sox2<>Nanog", "--", 30),
#     ("Sox2<>Nanog", "++", 13),
#     ("Sox2<>Nanog", "++", 22),
#     ("Sox2<>Nanog", "++", 23),
#     ("Nanog<>Zic3", "++", 12),
#     ("Nanog<>Zic3", "+-", 11),
#     ("Nanog<>Zic3", "+-", 21),
#     ("Nanog<>Zic3", "+-", 31),
#     ("Oct4<>Sox2", "++", 10),
]

for motif_pair, strand_combination, dist in to_plot:
#     dist = 10
#     strand_combination = '++'
#     motif_pair = 'Nanog<>Nanog'
    fig, axes = plt.subplots(2,1, 
                           gridspec_kw={'height_ratios':[1/7, 1/2]},
                           sharex=True,
                           figsize=get_figsize(.5, 1/7+1/2))
    dfp = (dfab
           .query(f'strand_combination=="{strand_combination}"')
           .query(f'motif_pair=="{motif_pair}"')
           .query(f'center_diff=={dist}'))
    intervals = create_intervals(dfp, ax=ax)
    seqs = fe(intervals)
    
    counts = ds.load_counts(intervals)
    ax = axes[0]
    plot_pwm(seqs.mean(0), ax=ax)
    ax.set_title(f"{motif_pair} {strand_combination}, dist = {dist}, ({len(dfp)})");
    
    
    cmap = plt.get_cmap("tab10")
    cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
    
    ao = .7
    # plot aggregate profile
    agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
    ax = axes[1]
    # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
    ax.plot(agg_profile[:, 0], alpha=ao, label='Obs', color=cycle[0])
    ax.plot(-agg_profile[:, 1], alpha=ao, color=cycle[0])
    ax.set_ylabel("Counts")
    # ax.set_title("Observed")
    sns.despine(top=True, bottom=True, right=True)
    
    # params
    seqlen = seqs.shape[1]
    
    # Plot model
    agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
    # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
    seqs_1kb = fe([resize_interval(interval, 1000) for interval in intervals])
    preds = bpnet.predict(seqs_1kb)
    agg_preds = preds['Nanog'].mean(0)
    upstream_seq, downstream_seq = halve(seqlen)
    xs = slice(500-upstream_seq, 500+downstream_seq)
    a = agg_profile.sum() / agg_preds[xs].sum()
    ax.plot(a*agg_preds[xs, 0], label='BPNet', alpha=ao, color=cycle[1])
    ax.plot(-a*agg_preds[xs, 1], alpha=ao, color=cycle[1])
    ax.set_ylabel("Counts")
    # ax.set_title("BPNet")
    sns.despine(top=True, bottom=True, right=True)
    
    
    pshifted = p.rc().shift(-3).resize(seqlen).resize_profile(seqlen)
    upstream, downstream = halve(dist)
    double_profile = pshifted.shift(-upstream).profile['Nanog'] + pshifted.shift(downstream).profile['Nanog']
    # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
    a = agg_profile.sum() / double_profile.sum()
    ax.plot(a*double_profile[:, 0], label='Mixture', alpha=ao, color=cycle[2])
    ax.plot(-a*double_profile[:, 1], alpha=ao, color=cycle[2])
    ax.set_ylabel("Counts")
    # ax.set_title("Mixture")
    sns.despine(top=True, bottom=True, right=True)
    ax.legend()#labels=['Obs', 'BPNet', 'Mixure'])
    fig.align_ylabels()
    
    fig.savefig(fdir_individual / f'{motif_pair}.{strand_combination}.{dist}.pwm+profile.pdf')
In [424]:
def extract_stacked_seqlets(dfab, isf):
    from matplotlib import ticker
    from concise.utils.pwm import seqlogo
    from basepair.modisco.utils import ic_scale
    from basepair.extractors import Interval
    seqlets = []
    rev_strand = {"+": "-", "-": "+"}
    for i,row in dfab.iterrows():
        diff = row.pattern_center_y - row.pattern_center_x
        width = np.abs(diff) + 30
        if diff > 0:
            start = row.pattern_center_x - 15
            end = start + width
            strand = row.strand_x
        else:
            end = row.pattern_center_x + 15
            start = end - width
            strand = row.strand_x
        seqlets.append(Seqlet(seqname=row.example_idx, 
                     start=start, 
                     end=end,
                     name=row.motif_pair, 
                     strand=strand,
                     # strand="+" if row.pattern_center_x > row.pattern_center_y else "-"
                    ))
    sts = isf.extract(seqlets)
    return sts
In [425]:
# to_plot = [
#     ("Nanog<>Nanog", "++", 10),
#     ("Nanog<>Nanog", "++", 20),
#     ("Nanog<>Nanog", "++", 21),
#     ("Nanog<>Nanog", "++", 30),
#     ("Nanog<>Nanog", "++", 31),
#     ("Nanog<>Nanog", "++", 40),
#     ("Nanog<>Nanog", "++", 41),
#     ("Sox2<>Nanog", "--", 10),
#     ("Sox2<>Nanog", "--", 20),
#     ("Sox2<>Nanog", "--", 30),
#     ("Sox2<>Nanog", "++", 13),
#     ("Sox2<>Nanog", "++", 22),
#     ("Sox2<>Nanog", "++", 23),
#     ("Nanog<>Zic3", "++", 12),
#     ("Nanog<>Zic3", "+-", 11),
#     ("Nanog<>Zic3", "+-", 21),
#     ("Nanog<>Zic3", "+-", 31),
# ]

# for motif_pair, strand_combination, dist in to_plot:
# #     dist = 10
# #     strand_combination = '++'
# #     motif_pair = 'Nanog<>Nanog'
#     fig, ax = plt.subplots(1,1, figsize=get_figsize(.5, 1/7))
#     dfp = (dfab
#            .query(f'strand_combination=="{strand_combination}"')
#            .query(f'motif_pair=="{motif_pair}"')
#            .query(f'center_diff=={dist}'))
#     sts = extract_stacked_seqlets(dfp, isf)
#     plot_pwm(sts.seq.mean(0), ax=ax)
#     ax.set_title(f"{motif_pair} {strand_combination}, dist = {dist}, ({len(dfp)})");

TODO

  • [ ] plot the aggregate strand plot (for each distance)
  • [ ] Plot the spacing for Oct4-Sox2 -> is that the cannonical spacing?
  • [X] Annotate dfi_subset with profile scores
    • [X] Implement _get_seqlets(pattern, trim_frac=trim_frac) to MultipleModiscoResult
    • [X] is dfi.pattern the same as in mr.get_pattern(dfi.pattern)
    • [X] move the code up & re-run dfab
  • [ ] Generate the pairwise spacing plots showing differnet profile properties
In [387]:
# isf.cache()
Out[387]:
<basepair.cli.imp_score.ImpScoreFile at 0x7f51fc2f3a58>

Pairwise scatter-plot

Oct4-Sox2 <-> Nanog

In [489]:
motif_pair_name = 'Oct4-Sox2<>Nanog'
In [490]:
dfab['Nanog/profile_counts_max_ref_y_log'] = np.log10(1+dfab['Nanog/profile_counts_max_ref_y'])
dfab['Sox2/profile_counts_max_ref_x_log'] = np.log10(1+dfab['Sox2/profile_counts_max_ref_x'])

Nanog site (Nanog counts)

In [492]:
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Nanog/profile_counts_max_ref_y_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'
                 ), 
              df) + 
 # plot
 geom_point(alpha=0.5, size=.05, shape='.') +
 facet_grid("strand_combination~.") + 
 # scale_y_log10() + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig
Out[492]:
<ggplot: (8749623594574)>
In [496]:
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Nanog/profile_counts_max_ref_y_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'), 
              df.groupby(["center_diff", 'strand_combination'])['Nanog/profile_counts_max_ref_y_log'].mean().reset_index()) + 
 # plot
 geom_line() +
 facet_grid("strand_combination~.") + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig
Out[496]:
<ggplot: (-9223363287230738495)>

Oct4-Sox2 site (Sox2 counts)

In [507]:
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Sox2/profile_counts_max_ref_x_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'
                 ), 
              df) + 
 # plot
 geom_jitter(alpha=0.4, size=.05, shape='.', height=0, width=1) +
 facet_grid("strand_combination~.") + 
 # scale_y_log10() + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig
Out[507]:
<ggplot: (-9223363287223337121)>
In [494]:
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Sox2/profile_counts_max_ref_x_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'), 
              df.groupby(["center_diff", 'strand_combination'])['Sox2/profile_counts_max_ref_x_log'].mean().reset_index()) + 
 # plot
 geom_line() +
 facet_grid("strand_combination~.") + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig
Out[494]:
<ggplot: (8749630463275)>

Sox2 <-> Nanog

In [482]:
motif_pair_name = 'Sox2<>Nanog'
In [483]:
dfab['Nanog/profile_counts_max_ref_y_log'] = np.log10(1+dfab['Nanog/profile_counts_max_ref_y'])
dfab['Sox2/profile_counts_max_ref_x_log'] = np.log10(1+dfab['Sox2/profile_counts_max_ref_x'])
In [485]:
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Sox2/profile_counts_max_ref_x_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'
                 ), 
              df) + 
 # plot
 geom_point(alpha=0.5, size=.05, shape='.') +
 facet_grid("strand_combination~.") + 
 # scale_y_log10() + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig
Out[485]:
<ggplot: (-9223363287305922237)>
In [486]:
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Nanog/profile_counts_max_ref_y_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'
                 ), 
              df) + 
 # plot
 geom_point(alpha=0.5, size=.05, shape='.') +
 facet_grid("strand_combination~.") + 
 # scale_y_log10() + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig
Out[486]:
<ggplot: (-9223363287261764271)>
In [523]:
dfab.head()
Out[523]:
example_chrom_x example_end_x example_idx example_interval_from_task_x example_start_x example_strand_x imp/Klf4_x imp/Nanog_x imp/Oct4_x imp/Sox2_x imp_max_x imp_max_task_x imp_weighted_x imp_weighted_cat_x imp_weighted_p_x match/Klf4_x match/Nanog_x match/Oct4_x match/Sox2_x match_max_x match_max_task_x match_weighted_x match_weighted_cat_x match_weighted_p_x pattern_x pattern_center_x pattern_end_x pattern_end_abs_x pattern_len_x pattern_name_x pattern_short_x pattern_start_x pattern_start_abs_x seq_match_x seq_match_cat_x seq_match_p_x strand_x tf_x pattern_center_aln_x pattern_strand_aln_x row_idx_x id_x Klf4/profile_match_x Klf4/profile_match_p_x Klf4/profile_counts_x Klf4/profile_counts_p_x Klf4/profile_max_x Klf4/profile_max_p_x Klf4/profile_counts_max_ref_x Klf4/profile_counts_max_ref_p_x Nanog/profile_match_x Nanog/profile_match_p_x Nanog/profile_counts_x Nanog/profile_counts_p_x Nanog/profile_max_x Nanog/profile_max_p_x Nanog/profile_counts_max_ref_x Nanog/profile_counts_max_ref_p_x Oct4/profile_match_x Oct4/profile_match_p_x Oct4/profile_counts_x Oct4/profile_counts_p_x Oct4/profile_max_x Oct4/profile_max_p_x Oct4/profile_counts_max_ref_x Oct4/profile_counts_max_ref_p_x Sox2/profile_match_x Sox2/profile_match_p_x Sox2/profile_counts_x Sox2/profile_counts_p_x Sox2/profile_max_x Sox2/profile_max_p_x Sox2/profile_counts_max_ref_x Sox2/profile_counts_max_ref_p_x example_chrom_y example_end_y example_interval_from_task_y example_start_y example_strand_y imp/Klf4_y imp/Nanog_y imp/Oct4_y imp/Sox2_y imp_max_y imp_max_task_y imp_weighted_y imp_weighted_cat_y imp_weighted_p_y match/Klf4_y match/Nanog_y match/Oct4_y match/Sox2_y match_max_y match_max_task_y match_weighted_y match_weighted_cat_y match_weighted_p_y pattern_y pattern_center_y pattern_end_y pattern_end_abs_y pattern_len_y pattern_name_y pattern_short_y pattern_start_y pattern_start_abs_y seq_match_y seq_match_cat_y seq_match_p_y strand_y tf_y pattern_center_aln_y pattern_strand_aln_y row_idx_y id_y Klf4/profile_match_y Klf4/profile_match_p_y Klf4/profile_counts_y Klf4/profile_counts_p_y Klf4/profile_max_y Klf4/profile_max_p_y Klf4/profile_counts_max_ref_y Klf4/profile_counts_max_ref_p_y Nanog/profile_match_y Nanog/profile_match_p_y Nanog/profile_counts_y Nanog/profile_counts_p_y Nanog/profile_max_y Nanog/profile_max_p_y Nanog/profile_counts_max_ref_y Nanog/profile_counts_max_ref_p_y Oct4/profile_match_y Oct4/profile_match_p_y Oct4/profile_counts_y Oct4/profile_counts_p_y Oct4/profile_max_y Oct4/profile_max_p_y Oct4/profile_counts_max_ref_y Oct4/profile_counts_max_ref_p_y Sox2/profile_match_y Sox2/profile_match_p_y Sox2/profile_counts_y Sox2/profile_counts_p_y Sox2/profile_max_y Sox2/profile_max_p_y Sox2/profile_counts_max_ref_y Sox2/profile_counts_max_ref_p_y center_diff center_diff_aln strand_combination motif_pair Nanog/profile_counts_max_ref_x_log Nanog/profile_counts_max_ref_y_log Sox2/profile_counts_max_ref_x_log
1 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 429.0 - 0.0 0.0 5.3935 0.4228 44.0001 0.6462 4.0 0.8385 2.0000e-06 0.7681 3.1092 0.3125 2726.0 0.9965 281.0 0.9993 48.0 0.9884 1.5704 0.0673 2310.0 0.9998 93.0 0.9994 60.0 0.9934 2.4112 0.0222 443.0001 0.9982 22.0 0.9934 9.0000e+00 0.9520 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.992 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 450.0 - 1.0 1.0 4.9558 0.3114 59.0001 0.7495 4.0 0.8385 2.0000e-06 0.7681 3.8785 0.4346 3366.0 0.9981 281.0 0.9993 1.0 0.5260 3.5481 0.3387 2339.0 0.9999 104.0 0.9996 1.0 0.3723 4.3891 0.1790 537.0001 0.9988 27.0 0.9960 2.0000e-06 0.6075 21.0 21.0 ++ Oct4-Sox2<>Oct4-Sox2 1.6902 0.3010 1.0000e+00
2 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 429.0 - 0.0 0.0 5.3935 0.4228 44.0001 0.6462 4.0 0.8385 2.0000e-06 0.7681 3.1092 0.3125 2726.0 0.9965 281.0 0.9993 48.0 0.9884 1.5704 0.0673 2310.0 0.9998 93.0 0.9994 60.0 0.9934 2.4112 0.0222 443.0001 0.9982 22.0 0.9934 9.0000e+00 0.9520 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.2758 NaN 1.2758 Oct4 1.2758 high 0.967 NaN NaN 0.4219 NaN 0.4219 Oct4 0.4219 low 0.2234 Oct4/metacluster_0/pa... 492.0 500.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 484.0 1.2215e+08 10.7122 high 0.7777 - Oct4 491.0 - 2.0 2.0 4.5805 0.2406 66.0001 0.7776 4.0 0.8385 2.0000e-06 0.7681 1.8579 0.1491 4406.0 0.9993 281.0 0.9993 56.0 0.9910 1.4478 0.0585 2964.0 1.0000 104.0 0.9996 166.0 0.9998 1.6449 0.0099 866.0001 0.9998 29.0 0.9968 3.3000e+01 0.9967 62.0 62.0 ++ Oct4-Sox2<>Oct4-Sox2 1.6902 1.7559 1.0000e+00
3 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430.0 438.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 422.0 1.2215e+08 10.2722 high 0.6902 - Oct4 429.0 - 0.0 0.0 5.3935 0.4228 44.0001 0.6462 4.0 0.8385 2.0000e-06 0.7681 3.1092 0.3125 2726.0 0.9965 281.0 0.9993 48.0 0.9884 1.5704 0.0673 2310.0 0.9998 93.0 0.9994 60.0 0.9934 2.4112 0.0222 443.0001 0.9982 22.0 0.9934 9.0000e+00 0.9520 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.0455 NaN 1.0455 Oct4 1.0455 high 0.905 NaN NaN 0.4155 NaN 0.4155 Oct4 0.4155 low 0.2005 Oct4/metacluster_0/pa... 534.0 542.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 526.0 1.2215e+08 10.2722 high 0.6902 - Oct4 533.0 - 3.0 3.0 3.8271 0.1487 98.0001 0.8662 4.0 0.8385 4.0000e+00 0.9512 0.3850 0.0107 5699.0 0.9995 248.0 0.9989 91.0 0.9964 0.2652 0.0049 3446.0 1.0000 98.0 0.9995 132.0 0.9994 0.5203 0.0060 1039.0000 1.0000 33.0 0.9979 2.4000e+01 0.9919 104.0 104.0 ++ Oct4-Sox2<>Oct4-Sox2 1.6902 1.9638 1.0000e+00
6 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.9920 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 450.0 - 1.0 1.0 4.9558 0.3114 59.0001 0.7495 4.0 0.8385 2.0000e-06 0.7681 3.8785 0.4346 3366.0 0.9981 281.0 0.9993 1.0 0.5260 3.5481 0.3387 2339.0 0.9999 104.0 0.9996 1.0 0.3723 4.3891 0.1790 537.0001 0.9988 27.0 0.9960 2.0000e-06 0.6075 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.2758 NaN 1.2758 Oct4 1.2758 high 0.967 NaN NaN 0.4219 NaN 0.4219 Oct4 0.4219 low 0.2234 Oct4/metacluster_0/pa... 492.0 500.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 484.0 1.2215e+08 10.7122 high 0.7777 - Oct4 491.0 - 2.0 2.0 4.5805 0.2406 66.0001 0.7776 4.0 0.8385 2.0000e-06 0.7681 1.8579 0.1491 4406.0 0.9993 281.0 0.9993 56.0 0.9910 1.4478 0.0585 2964.0 1.0000 104.0 0.9996 166.0 0.9998 1.6449 0.0099 866.0001 0.9998 29.0 0.9968 3.3000e+01 0.9967 41.0 41.0 ++ Oct4-Sox2<>Oct4-Sox2 0.3010 1.7559 8.6859e-07
7 chr3 1.2215e+08 1 Oct4 1.2215e+08 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.9920 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451.0 459.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 443.0 1.2215e+08 11.5528 high 0.9044 - Oct4 450.0 - 1.0 1.0 4.9558 0.3114 59.0001 0.7495 4.0 0.8385 2.0000e-06 0.7681 3.8785 0.4346 3366.0 0.9981 281.0 0.9993 1.0 0.5260 3.5481 0.3387 2339.0 0.9999 104.0 0.9996 1.0 0.3723 4.3891 0.1790 537.0001 0.9988 27.0 0.9960 2.0000e-06 0.6075 chr3 1.2215e+08 Oct4 1.2215e+08 . NaN NaN 1.0455 NaN 1.0455 Oct4 1.0455 high 0.905 NaN NaN 0.4155 NaN 0.4155 Oct4 0.4155 low 0.2005 Oct4/metacluster_0/pa... 534.0 542.0 1.2215e+08 16.0 Oct4-Sox2 Oct4/m0_p0 526.0 1.2215e+08 10.2722 high 0.6902 - Oct4 533.0 - 3.0 3.0 3.8271 0.1487 98.0001 0.8662 4.0 0.8385 4.0000e+00 0.9512 0.3850 0.0107 5699.0 0.9995 248.0 0.9989 91.0 0.9964 0.2652 0.0049 3446.0 1.0000 98.0 0.9995 132.0 0.9994 0.5203 0.0060 1039.0000 1.0000 33.0 0.9979 2.4000e+01 0.9919 83.0 83.0 ++ Oct4-Sox2<>Oct4-Sox2 0.3010 1.9638 8.6859e-07
In [474]:
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Nanog/profile_counts_max_ref_x_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'
                 ), 
              df) + 
 # plot
 geom_point(alpha=0.2, size=.05, shape='.') +
 facet_grid("strand_combination~.") + 
 # scale_y_log10() + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig
Out[474]:
<ggplot: (8749453687928)>
In [522]:
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Nanog/profile_counts_max_ref_x_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'
                 ), 
              df) + 
 # plot
 geom_point(alpha=0.2, size=.05, shape='.') +
 facet_grid("strand_combination~.") + 
 # scale_y_log10() + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig
Out[522]:
<ggplot: (-9223363287241288546)>
In [481]:
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Nanog/profile_counts_max_ref_x_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'), 
              df.groupby(["center_diff", 'strand_combination'])['Nanog/profile_counts_max_ref_x_log'].quantile(0.95).reset_index()) + 
 # plot
 geom_line() +
 facet_grid("strand_combination~.") + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig
Out[481]:
<ggplot: (8749453634375)>
In [521]:
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Nanog/profile_counts_max_ref_x_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'), 
              df.groupby(["center_diff", 'strand_combination'])['Nanog/profile_counts_max_ref_x_log'].quantile(.5).reset_index()) + 
 # plot
 geom_line() +
 facet_grid("strand_combination~.") + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig
Out[521]:
<ggplot: (8749549577634)>