Goal

  • compare the profiles of weak affinity sites

Conclusions

  • Weak affinity sites have stronger footprints when they are highlighted as important
  • Potential issue: the important poor matches might actually come from other motifs (e.g. Sox2 might have Oct4 flanks etc)
  • Nanog-Nanog 10bp spacing can be only seen with our motif instances

Next steps

  • study the spacing of motifs?
In [1]:
# Imports
from basepair.imports import *
from basepair.exp.paper.config import motifs
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix, dfi_filter_valid
from basepair.cli.imp_score import ImpScoreFile
from basepair.modisco.core import Pattern
hv.extension('bokeh')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
paper_config()
Using TensorFlow backend.
In [364]:
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
output_dir = modisco_dir / "pwm_comparison"  # this notebook's output directory
output_dir.mkdir(exist_ok=True)
figures_dir = Path(f"{ddir}/figures/modisco/pwm_comparison")
figures_dir.mkdir(exist_ok=True)
In [363]:
!mkdir -p {ddir}/figures/modisco/pwm_comparison
In [6]:
# load all the required data
dfi = load_instances(f"{modisco_dir}/instances.parq", motifs)

mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()

imp_scores = ImpScoreFile.from_modisco_dir(modisco_dir)
imp_scores.cache()  # load all the scores
seqs = imp_scores.get_seq()

Considered motifs

In [99]:
# plot them
patterns = {m: mr.get_pattern(longer_pattern(p)).trim_seq_ic(0.08)
            for m, p in motifs.items()}
for tf, p in patterns.items():
    p.plot("seq");
    plt.title(tf)

Extract profiles and sequences

Sequence (PWM) instances

In [214]:
# sequences
p_seq = {m: imp_scores.extract_dfi(dfi_filter_valid(patterns[m].get_instances_seq_scan(seqs), profile_width, seqlen=1000), 
                                   profile_width=70, verbose=True)
         for m in motifs}
dfi_seq = pd.concat([p_seq[m].dfi.assign(motif=m) for m in motifs])
100%|██████████| 98428/98428 [00:26<00:00, 3735.63it/s]
100%|██████████| 98428/98428 [00:17<00:00, 5561.87it/s]
100%|██████████| 453467/453467 [03:08<00:00, 2411.01it/s]
100%|██████████| 98428/98428 [00:22<00:00, 4397.69it/s]
100%|██████████| 98428/98428 [00:22<00:00, 4344.63it/s]
100%|██████████| 469791/469791 [03:17<00:00, 2380.60it/s]
100%|██████████| 98428/98428 [00:13<00:00, 7339.42it/s]
100%|██████████| 98428/98428 [00:13<00:00, 7227.47it/s]
100%|██████████| 958034/958034 [06:33<00:00, 2432.15it/s]
100%|██████████| 98428/98428 [00:24<00:00, 4069.31it/s]
100%|██████████| 98428/98428 [00:15<00:00, 6476.03it/s]
100%|██████████| 654974/654974 [04:32<00:00, 2400.62it/s]
In [ ]:
# Load the cached results
# p_seq = read_pkl(output_dir / 'p_seq.pkl')
# dfi_seq = pd.read_csv(output_dir / 'pwm_instances.csv.gz')
In [325]:
# save the instances
dfi_seq.to_csv(output_dir / 'pwm_instances.csv.gz', index=False, compression='gzip')
In [329]:
write_pkl(p_seq, output_dir / 'p_seq.pkl')
In [330]:
!du -sh {output_dir}/*
12G	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/pwm_comparison/p_seq.pkl
39M	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/deeplift/profile/pwm_comparison/pwm_instances.csv.gz

Importance score instances

In [187]:
dfis = dfi.query("imp_weighted_p > 0")
dfis = dfi_filter_valid(dfis, profile_width, seqlen=1000)
ps_motifs = {m: imp_scores.extract_dfi(dfis[dfis.pattern_name == m], profile_width=profile_width, verbose=True) for m in motifs}

Visualize the PWM match distribution

Sequence (PWM) instances

In [229]:
(ggplot(aes(x='seq_match_score'), dfi_seq) + 
 geom_histogram(bins=100) + 
 facet_grid("motif ~ .") + 
 theme_classic() + 
 xlim([0, 15])
)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:360: UserWarning: stat_bin : Removed 1186 rows containing non-finite values.
  data = self.stat.compute_layer(data, params, layout)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 8 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
Out[229]:
<ggplot: (-9223363253352059995)>

Importance score instances

In [228]:
(ggplot(aes(x='seq_match', fill='imp_weighted_cat'), dfis) + 
 geom_histogram(bins=100) + 
 facet_grid("pattern_name ~ .") + 
 theme_classic() + 
 xlim([0, 15])
)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:360: UserWarning: stat_bin : Removed 515 rows containing non-finite values.
  data = self.stat.compute_layer(data, params, layout)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 24 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
Out[228]:
<ggplot: (-9223363253634832752)>

Visualize motifs

In [230]:
t_high = 10
t_low = [3, 5]
In [279]:
motif = 'Sox2'
In [232]:
h2(motif)
Out[232]:

Sox2

In [280]:
patterns[motif].plot('seq');

Good matches

Sequence (PWM) instances

In [281]:
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 21726

Importance score instances

In [282]:
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 5277

Poor matches

Sequence (PWM) instances

In [283]:
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 314983

Importance score instances

In [284]:
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0])
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 14254

Poor matches but important

In [285]:
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0]) &
                             (ps_motifs[motif].dfi.imp_weighted_p > .5)
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 2295
In [291]:
motif = 'Oct4-Sox2'
In [292]:
h2(motif)
Out[292]:

Oct4-Sox2

In [293]:
patterns[motif].plot('seq');

Good matches

Sequence (PWM) instances

In [294]:
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 33861

Importance score instances

In [295]:
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 13633

Poor matches

Sequence (PWM) instances

In [296]:
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 303857

Importance score instances

In [297]:
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0])
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 10936

Poor matches but important

In [298]:
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0]) &
                             (ps_motifs[motif].dfi.imp_weighted_p > .5)
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 141
In [299]:
motif = 'Nanog'
In [300]:
h2(motif)
Out[300]:

Nanog

In [301]:
patterns[motif].plot('seq');
In [302]:
t_high = 8
t_low = [3, 5]

Good matches

Sequence (PWM) instances

In [303]:
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 6922

Importance score instances

In [304]:
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 2559

Poor matches

Sequence (PWM) instances

In [305]:
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 754128

Importance score instances

In [306]:
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0])
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 54548

Poor matches but important

In [307]:
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0]) &
                             (ps_motifs[motif].dfi.imp_weighted_p > .5)
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 10163
In [308]:
t_high = 10
t_low = [3, 5]
In [309]:
motif = 'Klf4'
In [310]:
h2(motif)
Out[310]:

Klf4

In [311]:
patterns[motif].plot('seq');
In [312]:
t_high = 8
t_low = [3, 5]

Good matches

Sequence (PWM) instances

In [313]:
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 66410

Importance score instances

In [314]:
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 35561

Poor matches

Sequence (PWM) instances

In [318]:
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 386581

Importance score instances

In [319]:
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0])
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 12296

Poor matches but important

In [320]:
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0]) &
                             (ps_motifs[motif].dfi.imp_weighted_p > .5)
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 278

Scratch space (e.g. not part of the main notebook analysis)

In [335]:
from basepair.exp.chipnexus.spacing import get_motif_pairs, motif_pair_dfi
In [339]:
dfi_seq['pattern_name'] = dfi_seq.motif
In [338]:
dfi_seq.head()
Out[338]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center seq_match_score motif
0 metacluster_0/pattern_0 0 97 112 - 15 104 3.0758 Oct4-Sox2
1 metacluster_0/pattern_0 0 104 119 + 15 111 3.9825 Oct4-Sox2
2 metacluster_0/pattern_0 0 282 297 + 15 289 5.0100 Oct4-Sox2
3 metacluster_0/pattern_0 0 388 403 + 15 395 3.1126 Oct4-Sox2
4 metacluster_0/pattern_0 0 459 474 - 15 466 4.4375 Oct4-Sox2
In [336]:
dfis.head()
Out[336]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_weighted_p match_weighted_cat match_max match_max_task imp_weighted imp_weighted_p imp_weighted_cat imp_max imp_max_task seq_match seq_match_p seq_match_cat match/Klf4 match/Nanog match/Oct4 match/Sox2 imp/Klf4 imp/Nanog imp/Oct4 imp/Sox2 example_chrom example_start example_end example_strand example_interval_from_task pattern_short pattern_name pattern_start_abs pattern_end_abs
3 metacluster_0/pattern_0 0 445 460 + 15 452 0.2399 0.0008 low 0.2469 Sox2 0.1852 0.0008 high 0.4145 Nanog 1.7775 0.0012 low 0.2458 0.2408 0.2314 0.2469 0.1234 0.4145 0.0949 0.1896 chrX 143482544 143483544 * Oct4 m0_p0 Oct4-Sox2 143482989 143483004
4 metacluster_0/pattern_0 0 475 490 - 15 482 0.2903 0.0025 low 0.3322 Oct4 0.2889 0.0179 high 0.5196 Nanog 3.3390 0.0057 low 0.2092 0.2681 0.3322 0.2991 0.1613 0.5196 0.2049 0.3241 chrX 143482544 143483544 * Oct4 m0_p0 Oct4-Sox2 143483019 143483034
9 metacluster_0/pattern_0 1 434 449 - 15 441 0.2848 0.0023 low 0.3127 Nanog 0.8842 0.7923 low 1.0064 Nanog 1.1376 0.0008 low 0.2786 0.3127 0.2929 0.2593 0.5520 1.0064 0.9892 0.8662 chr3 122145063 122146063 * Oct4 m0_p0 Oct4-Sox2 122145497 122145512
10 metacluster_0/pattern_0 1 437 452 + 15 444 0.5391 0.5105 medium 0.5627 Sox2 0.9221 0.8271 low 1.0504 Oct4 9.6558 0.4369 medium 0.5452 0.4714 0.5524 0.5627 0.5605 1.0090 1.0504 0.9147 chr3 122145063 122146063 * Oct4 m0_p0 Oct4-Sox2 122145500 122145515
11 metacluster_0/pattern_0 1 458 473 + 15 465 0.5174 0.3763 medium 0.5613 Oct4 1.0389 0.9077 low 1.2170 Sox2 11.3139 0.7737 high 0.4683 0.4598 0.5613 0.5274 0.5500 1.1198 1.0884 1.2170 chr3 122145063 122146063 * Oct4 m0_p0 Oct4-Sox2 122145521 122145536
In [346]:
motif_pair = ['Nanog', 'Nanog']
In [358]:
dfab = motif_pair_dfi(dfi_seq, motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
In [345]:
dfab.head()
Out[345]:
pattern_x example_idx pattern_start_x pattern_end_x strand_x pattern_len_x pattern_center_x seq_match_score_x motif_x pattern_name_x pattern_y pattern_start_y pattern_end_y strand_y pattern_len_y pattern_center_y seq_match_score_y motif_y pattern_name_y center_diff strand_combination
1 metacluster_2/pattern_0 0 165 174 - 9 169 4.793 Nanog Nanog metacluster_2/pattern_0 188 197 + 9 192 3.9526 Nanog Nanog 23 -+
2 metacluster_2/pattern_0 0 165 174 - 9 169 4.793 Nanog Nanog metacluster_2/pattern_0 226 235 + 9 230 3.8959 Nanog Nanog 61 -+
3 metacluster_2/pattern_0 0 165 174 - 9 169 4.793 Nanog Nanog metacluster_2/pattern_0 398 407 + 9 402 3.7102 Nanog Nanog 233 -+
4 metacluster_2/pattern_0 0 165 174 - 9 169 4.793 Nanog Nanog metacluster_2/pattern_0 443 452 - 9 447 3.0285 Nanog Nanog 278 ++
5 metacluster_2/pattern_0 0 165 174 - 9 169 4.793 Nanog Nanog metacluster_2/pattern_0 536 545 + 9 540 3.6167 Nanog Nanog 371 -+
In [359]:
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin(["Nanog<>Nanog"]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle("Nanog<>Nanog") + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(figures_dir / 'nanog.spacing.pdf')
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
Out[359]:
<ggplot: (-9223363254351077411)>

Very good matches

In [350]:
dfi_seq.head()
Out[350]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center seq_match_score motif pattern_name
0 metacluster_0/pattern_0 0 97 112 - 15 104 3.0758 Oct4-Sox2 Oct4-Sox2
1 metacluster_0/pattern_0 0 104 119 + 15 111 3.9825 Oct4-Sox2 Oct4-Sox2
2 metacluster_0/pattern_0 0 282 297 + 15 289 5.0100 Oct4-Sox2 Oct4-Sox2
3 metacluster_0/pattern_0 0 388 403 + 15 395 3.1126 Oct4-Sox2 Oct4-Sox2
4 metacluster_0/pattern_0 0 459 474 - 15 466 4.4375 Oct4-Sox2 Oct4-Sox2
In [369]:
threshold = 6
dfab = motif_pair_dfi(dfi_seq[dfi_seq.seq_match_score > threshold], motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
In [370]:
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin(["Nanog<>Nanog"]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle("Nanog<>Nanog") + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(figures_dir / f'nanog.spacing.match_score>{threshold}.pdf')
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/modisco/pwm_comparison/nanog.spacing.match_score>7.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
Out[370]:
<ggplot: (-9223363253324358221)>

Original figure

In [298]:
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin(["Nanog<>Nanog"]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle("Nanog<>Nanog") + 
 scale_fill_brewer(type='qual', palette=3))
# fig.save(figures_dir / 'nanog.spacing.pdf')
fig
Out[298]:
<ggplot: (8777610748627)>
In [342]:
dfab.head()
Out[342]:
pattern_x example_idx pattern_start_x pattern_end_x strand_x pattern_len_x pattern_center_x seq_match_score_x motif_x pattern_name_x pattern_y pattern_start_y pattern_end_y strand_y pattern_len_y pattern_center_y seq_match_score_y motif_y pattern_name_y center_diff strand_combination
0 metacluster_0/pattern_0 0 97.0 112.0 - 15.0 104.0 3.0758 Oct4-Sox2 Oct4-Sox2 metacluster_0/pattern_1 145.0 167.0 + 22.0 156.0 3.0777 Sox2 Sox2 52.0 -+
1 metacluster_0/pattern_0 0 97.0 112.0 - 15.0 104.0 3.0758 Oct4-Sox2 Oct4-Sox2 metacluster_0/pattern_1 234.0 256.0 + 22.0 245.0 3.0009 Sox2 Sox2 141.0 -+
2 metacluster_0/pattern_0 0 97.0 112.0 - 15.0 104.0 3.0758 Oct4-Sox2 Oct4-Sox2 metacluster_0/pattern_1 290.0 312.0 - 22.0 301.0 4.9312 Sox2 Sox2 197.0 --
3 metacluster_0/pattern_0 0 97.0 112.0 - 15.0 104.0 3.0758 Oct4-Sox2 Oct4-Sox2 metacluster_0/pattern_1 359.0 381.0 + 22.0 370.0 3.6757 Sox2 Sox2 266.0 -+
4 metacluster_0/pattern_0 0 97.0 112.0 - 15.0 104.0 3.0758 Oct4-Sox2 Oct4-Sox2 metacluster_0/pattern_1 657.0 679.0 - 22.0 668.0 3.9737 Sox2 Sox2 564.0 --
In [ ]:
 

Fimo motifs

In [ ]:
# Show the distribution of differne
In [ ]:
psm = ps_motifs["Oct4-Sox2"]
In [ ]:
psm
In [ ]:
 
In [57]:
from concise.preprocessing import encodeDNA
from concise.utils.pwm import PWM
In [47]:
melanie_motifs = {"nanog": "Nanog",
                  "sox2": "Sox2",
                  "oct4": "Oct4-Sox2",
                  "klf4": "Klf4"}
In [51]:
dff = pd.concat([pd.read_csv(f"ftp://ftp.stowers.org/pub/mw2098/for_ziga/FIMO_results/{k}_top1000peaks_window201bp/fimo.tsv", sep='\t').assign(motif=v)
                 for k,v in melanie_motifs.items()])
In [55]:
dff = dff.dropna()
In [56]:
dff.groupby("motif").size()
Out[56]:
motif
Klf4          96871
Nanog        100066
Oct4-Sox2     55364
Sox2         138543
dtype: int64
In [ ]:
# TODO - take RC into account
In [88]:
def make_pwm(seqs, strand):
    return np.concatenate([encodeDNA(list(seqs[strand == '+'])),
                           encodeDNA(list(seqs[strand == '-']))[:, ::-1, ::-1]], axis=0)
In [92]:
pwms = {m: PWM(encodeDNA(list(dfs.matched_sequence.str.upper())).mean(0) + 0.0001, name=m) for m,dfs in dff.groupby("motif")}
pwms
Out[92]:
{'Klf4': PWM(name: Klf4, consensus: GCCCCACCCAGCCCC),
 'Nanog': PWM(name: Nanog, consensus: TTTGAATGCCATCAA),
 'Oct4-Sox2': PWM(name: Oct4-Sox2, consensus: TGCATAACAAA),
 'Sox2': PWM(name: Sox2, consensus: AGAACAAACGTAACAAA)}
In [93]:
for k,v in pwms.items():
    v.plotPWMInfo(figsize=get_figsize(1, 1/5))
    plt.title(k)
In [52]:
dff.head()
Out[52]:
motif_id motif_alt_id sequence_name start stop strand score p-value q-value matched_sequence motif
0 CAMHWGRGCCATCMA MEME-8 chr8 4.7659e+06 4.7660e+06 + 21.1184 7.8500e-10 0.338 CAACAGGGCCATCAA Nanog
1 CAMHWGRGCCATCMA MEME-8 chr16 2.2674e+07 2.2674e+07 + 21.1184 7.8500e-10 0.338 CAACAGGGCCATCAA Nanog
2 CAMHWGRGCCATCMA MEME-8 chr1 4.8170e+07 4.8170e+07 + 21.1184 7.8500e-10 0.338 caacagggccatcaa Nanog
3 CAMHWGRGCCATCMA MEME-8 chr16 8.0013e+07 8.0013e+07 + 21.1184 7.8500e-10 0.338 CAACAGGGCCATCAA Nanog
4 CAMHWGRGCCATCMA MEME-8 chr7 1.2546e+08 1.2546e+08 + 21.1184 7.8500e-10 0.338 CAACAGGGCCATCAA Nanog
In [42]:
dff['p-value'].plot.hist(30)
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd1e70a04e0>
In [15]:
dff = dff.dropna()
In [16]:
dff.shape
Out[16]:
(96871, 10)
In [17]:
dff.matched_sequence
Out[17]:
0        GCCACACCCTGCCCC
1        GCCACACCCTGCCCC
2        GCCACACCCTGCCCC
              ...       
96868    CCCACACCCAGCCTA
96869    ACCCCACCCCGCCGC
96870    GCCCCACCCTTTGTC
Name: matched_sequence, Length: 96871, dtype: object
In [35]:
pwm = encodeDNA(dff.matched_sequence.str.upper(), ).mean(axis=0)
In [22]:
 
In [36]:
pwm = PWM(pwm + 0.001, )  # add some pseudo-counts
In [37]:
pwm.pwm = pwm.pwm[::-1, ::-1]  # Tc
In [25]:
pwm.shape
Out[25]:
(15, 4)
In [38]:
pwm.plotPWMInfo()
Out[38]:

Compare pwm scanning using the modisco motifs