In [1]:
from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    ("Oct4", 'Oct4/m0_p1'),
    # ("Strange-sym-motif", 'Oct4/m0_p5'),
    ("Sox2", 'Sox2/m0_p1'),
    ("Nanog", 'Nanog/m0_p1'),
    ("Zic3", 'Nanog/m0_p2'),
    ("Nanog-partner", 'Nanog/m0_p4'),
    ("Klf4", 'Klf4/m0_p0'),
])

motifs_chexmix = OrderedDict([
    ("Oct4-Sox2", 'Oct4/1'),
    ("Sox2", 'Sox2/1'),
    ("Nanog", 'Nanog/1'),
    ("Klf4", 'Klf4/1'),
])

main_motifs = ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']

Goal

  • compare the profiles of top-ranked sites (according to sequence)
  • compare the profiles of weak affinity sites
  • compare the spacing between motifs

Conclusions

  • Weak affinity sites have stronger footprints when they are highlighted as important
  • Potential issue: the important poor matches might actually come from other motifs (e.g. Sox2 might have Oct4 flanks etc)
  • Nanog-Nanog 10bp spacing can be only seen with our motif instances
  • 63.5% of Chexmix peaks (138,098) are in our MACS2 peaks (98,428) and 74.1% of Peakxus peaks (175,259)

Methods

  • Use max counts to compare methods
    • use 10% of the highest counts as the threshold (e.g. 8 counts)

TODO

  • [ ] Update TE annotation code
    • shall we generate a bed file with intervals to be excluded?
  • What fraction of the peaks can we explain? E.g. compute the overlap between

    • peakxus peaks
    • seqlets (un-clustered)

      • Q: How to get this number?

        • Can we take just modisco's unclustered seqlet locations?
        • There could be a possible problem when excluding seqlets...

          • we could also tighten the bin-size to say 10nt and then call peaks
        • What are the two nearest peakxus peaks?

    • seqlets (clustered)
    • Q: What's the right cut-off?
In [2]:
# Imports
from basepair.imports import *
from basepair.exp.paper.config import tasks
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix, dfi_filter_valid, dfi_add_ranges, annotate_profile
from basepair.cli.imp_score import ImpScoreFile
from basepair.modisco.core import Pattern
from basepair.preproc import dfint_no_intersection
hv.extension('bokeh')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
paper_config()

interval_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']
Using TensorFlow backend.

Peakxus/MEME and chexmix

  1. Load peakxus
    • dfi
  2. Load chexmix
    • dfi
    • motifs
  3. Load MEME
    • dfi (fimo)
    • motifs
In [3]:
# load all the required data
# mr = ModiscoResult(modisco_dir / "modisco.h5")
# mr.open()

# imp_scores = ImpScoreFile.from_modisco_dir(modisco_dir)
# imp_scores.cache()  # load all the scores
# ranges = imp_scores.get_ranges()
# ranges.columns = ["example_" + v for v in ranges.columns]
# seqs = imp_scores.get_seq()
# profiles = imp_scores.get_profiles()
In [4]:
from basepair.exp.paper.config import models_dir
In [5]:
!mkdir -p {ddir}/figures/method-comparison
In [6]:
# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/method-comparison')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-4683f7f6d67f> in <module>
      1 # figures dir
      2 model_dir = models_dir / exp
----> 3 fdir = Path(f'{ddir}/figures/method-comparison')

NameError: name 'ddir' is not defined
In [ ]:
%config InlineBackend.figure_format = 'retina'
In [ ]:
paper_config()
In [ ]:
profile_width = 70
seqlen = 1000
In [ ]:
instance_parq_paths = {t: model_dir / f'deeplift/{t}/out/{imp_score}/instances.parq' 
                       for t in tasks}
In [ ]:
from basepair.modisco.pattern_instances import (multiple_load_instances, load_instances, filter_nonoverlapping_intervals, 
                                                plot_coocurence_matrix, align_instance_center, dfi2seqlets, annotate_profile)
In [ ]:
mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
                           for t in tasks})
In [ ]:
# Exclude TEs
def shorten_te_pattern(s):
    tf, p = s.split("/", 1)
    return tf + "/" + shorten_pattern(p)

ChExMix motifs

In [ ]:
from basepair.utils import flatten
In [ ]:
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac
In [ ]:
comparison_dir = Path('../../chipnexus/comparison/output')
In [ ]:
chexmix_motifs = flatten({tf: read_transfac(comparison_dir / f"chexmix/{tf}/intermediate-results/{tf}.experiment.transfac")
                          for tf in tasks}, separator='/')
chexmix_patterns = [Pattern('chexmix/' + k, v.pwm, dict(), dict()) 
                    for k,v in chexmix_motifs.items()]
In [ ]:
from basepair.plot.utils import seqlogo_clean
In [ ]:
import matplotlib as mpl
In [ ]:
list(map(len, chexmix_patterns))
In [315]:
p = chexmix_patterns[0]
from concise.utils.plot import seqlogo

fig, axes = plt.subplots(len(chexmix_patterns), 1, 
                         sharex=True, sharey=True,
                         figsize=get_figsize(.25, len(chexmix_patterns)/3))
for i, p in enumerate(chexmix_patterns):
    pwm = p.resize(16).get_seq_ic()
    ax = axes[i]
    seqlogo(pwm, ax=axes[i])
    # p.resize(16).plot('seq_ic', letter_width=.1, height=0.5);
    ax.xaxis.set_major_locator(mpl.ticker.AutoLocator())
    _, tf, n = p.name.split("/")
    ax.set_ylabel(f"{tf}/{n}", rotation=0,
                  multialignment='center',
                  ha='right', labelpad=5)
    ax.set_ylim([0, 2])
    sns.despine(bottom=True, top=True, right=True)
fig.savefig(fdir / 'ChExMix.motifs.pdf')

Considered motifs

In [13]:
# plot them
patterns = {m: mr.get_pattern(longer_pattern(p)).trim_seq_ic(0.08)
            for m, p in motifs.items()}
for tf, p in patterns.items():
    p.plot("seq");
    plt.title(tf)
TF-MoDISco is using the TensorFlow backend.
In [35]:
len(dfi_chexmix)
Out[35]:
169377

Support the following operations:

  • dfi_filter_valid
    • pattern_center
  • extract_dfi (dfi_row2seqlet)
    • example_idx
    • pattern_start
    • pattern_end
    • pattern
    • strand

Extract profiles and sequences

Importance score instances

Sequence (PWM) instances

In [74]:
%tqdm_restart

Annotate dfi and

Load

In [7]:
# Store the results
chexmix_output_dir = Path("../../chipnexus/comparison/output/chexmix/")
In [8]:
# Load the cached results
%time p_seq = read_pkl(chexmix_output_dir / 'pwm_instances.stacked_seqlets.pkl')
%time dfi_seq = pd.read_csv(chexmix_output_dir / 'pwm_instances.csv.gz')
CPU times: user 58 s, sys: 47.8 s, total: 1min 45s
Wall time: 3min 18s
CPU times: user 58.7 s, sys: 5.15 s, total: 1min 3s
Wall time: 57.8 s
In [9]:
# Load the cached results
%time ps_chexmix_motifs = read_pkl(chexmix_output_dir / 'chexmix.stacked_seqlets.pkl')
%time dfi_chexmix = pd.read_csv(chexmix_output_dir / 'chexmix.csv.gz')
CPU times: user 3.14 s, sys: 1.26 s, total: 4.41 s
Wall time: 5.64 s
CPU times: user 2.62 s, sys: 92 ms, total: 2.71 s
Wall time: 2.79 s
In [10]:
!mv {chexmix_output_dir}/benchmark/instances.stacked_seqlets.pkl {model_dir}/benchmark/instances.stacked_seqlets.pkl
mv: cannot stat '../../chipnexus/comparison/output/chexmix/benchmark/instances.stacked_seqlets.pkl': No such file or directory
In [11]:
ls -latr {model_dir}/benchmark
total 12305656
-rw-rw-r-- 1 avsec oak_akundaje 12138112499 Apr 15 11:40 instances.stacked_seqlets.pkl
-rw-rw---- 1 avsec oak_akundaje   462846104 Apr 16 07:06 instances.csv.gz
drwxrws--- 2 avsec oak_akundaje        4096 Apr 16 09:44 ./
drwxrws--- 8 avsec oak_akundaje        4096 May  3 14:15 ../
In [12]:
# Load the cached results
%time ps_motifs = read_pkl(model_dir / 'benchmark/instances.stacked_seqlets.pkl')
%time dfis = pd.read_csv(model_dir / 'benchmark/instances.csv.gz')
CPU times: user 35.2 s, sys: 23.4 s, total: 58.5 s
Wall time: 4min 12s
CPU times: user 41.8 s, sys: 2.11 s, total: 43.9 s
Wall time: 36.6 s
In [13]:
!du -sh {output_dir}/*
du: cannot access '{output_dir}/*': No such file or directory

Visualize the PWM match distribution

Sequence (PWM) instances

In [37]:
plotnine.options.figure_size = get_figsize(1)
(ggplot(aes(x='seq_match_score'), dfi_seq) + 
 geom_histogram(bins=100) + 
 facet_grid("motif ~ .") + 
 theme_classic() + 
 xlim([0, 15])
)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 8 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
Out[37]:
<ggplot: (-9223363289620938587)>

Importance score instances

In [38]:
(ggplot(aes(x='seq_match', fill='imp_weighted_cat'), dfis) + 
 geom_histogram(bins=100) + 
 facet_grid("pattern_name ~ .") + 
 theme_classic() + 
 xlim([0, 15])
)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:360: UserWarning: stat_bin : Removed 2969 rows containing non-finite values.
  data = self.stat.compute_layer(data, params, layout)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 42 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
Out[38]:
<ggplot: (8747192044636)>

Do the Precision-recall curve for good profile matches

  • decide on the cut-offs

TODO

  • [x] re-implement the profile quantification from p_seq
    • [x] max, counts, symmetric_kl, cosine-distance
      • figure out which one to get (always sort by top 1k values)
  • [ ] decide on the profile cut-off
    • [ ] visualize the hits for that cut-off (for both, PWM and BPNet scores)
  • [ ] Add motifs from MEME
  • [ ] Make it into paper figures
In [14]:
def profile_metrics(profiles, target_profile, dist_metrics=['cosine']):
    """
    
    Args:
      profiles: np.array of a profile
      target_profile: aggregated profile
    """
    from scipy.spatial.distance import cdist
    from basepair.stats import symmetric_kl
    # total counts
    total_counts = profiles.sum(axis=(1,2))
    
    # value at the maximum position
    max_positions = np.argmax(target_profile, axis=0)
    max_counts = profiles[:, max_positions, [0, 1]].sum(axis=-1)
    
    # cosine distance
    Xa = profiles.reshape((len(profiles), -1))
    Xb = target_profile[np.newaxis].reshape((1, -1))
    cosine = cdist(Xa + 1e-6, Xb + 1e-6, metric='cosine')  # cosine distance
    sym_kl = symmetric_kl(Xa.T + 1e-6, Xb.T + 1e-6)  # Make sure we don't have 0 values
    # KL divergence

    assert cosine.ndim == 2
    assert cosine.shape[1] == 1
    return pd.DataFrame({
        "counts": total_counts,
        "max": max_counts,
        "sym_kl": sym_kl,
        "cosine": cosine[:, 0]
    })
In [15]:
from basepair.exp.paper.config import profile_mapping
In [16]:
dfis['imp_weighted_cat'] = pd.Categorical(dfis['imp_weighted_cat'].astype(str), categories = ['low', 'medium', 'high'], ordered=True)
In [17]:
dfis.head()
Out[17]:
example_chrom example_end example_idx example_interval_from_task example_start example_strand imp/Klf4 imp/Nanog imp/Oct4 imp/Sox2 imp_max imp_max_task imp_weighted imp_weighted_cat imp_weighted_p match/Klf4 match/Nanog match/Oct4 match/Sox2 match_max match_max_task match_weighted match_weighted_cat match_weighted_p pattern pattern_center pattern_end pattern_end_abs pattern_len pattern_name pattern_short pattern_start pattern_start_abs seq_match seq_match_cat seq_match_p strand tf id Klf4/profile_match Klf4/profile_match_p Klf4/profile_counts Klf4/profile_counts_p Klf4/profile_max Klf4/profile_max_p Klf4/profile_counts_max_ref Klf4/profile_counts_max_ref_p Nanog/profile_match Nanog/profile_match_p Nanog/profile_counts Nanog/profile_counts_p Nanog/profile_max Nanog/profile_max_p Nanog/profile_counts_max_ref Nanog/profile_counts_max_ref_p Oct4/profile_match Oct4/profile_match_p Oct4/profile_counts Oct4/profile_counts_p Oct4/profile_max Oct4/profile_max_p Oct4/profile_counts_max_ref Oct4/profile_counts_max_ref_p Sox2/profile_match Sox2/profile_match_p Sox2/profile_counts Sox2/profile_counts_p Sox2/profile_max Sox2/profile_max_p Sox2/profile_counts_max_ref Sox2/profile_counts_max_ref_p
0 chr9 3002633 0 Oct4 3001633 . NaN NaN 0.0448 NaN 0.0448 Oct4 0.0448 low 0.0002 NaN NaN 0.2644 NaN 0.2644 Oct4 0.2644 low 0.0043 Oct4/metacluster_0/pa... 232 240 3001873 16 Oct4-Sox2 Oct4/m0_p0 224 3001857 2.3456 low 0.0021 + Oct4 0 0.0225 0.0047 0.0001 0.0047 1.0000e-06 0.1831 2.0000e-06 0.7681 0.0390 0.0038 0.0001 0.0038 1.0000e-06 0.0842 2.0000e-06 0.5260 0.0611 0.0049 0.0001 0.0049 1.0000e-06 0.0303 2.0000e-06 0.3723 0.1121 0.0060 0.0001 0.0061 1.0000e-06 0.2067 2.0000e-06 0.6075
1 chr3 122146077 1 Oct4 122145077 . NaN NaN 0.9439 NaN 0.9439 Oct4 0.9439 high 0.8507 NaN NaN 0.2953 NaN 0.2953 Oct4 0.2953 low 0.0106 Oct4/metacluster_0/pa... 428 436 122145513 16 Oct4-Sox2 Oct4/m0_p0 420 122145497 1.8069 low 0.0014 + Oct4 1 5.2561 0.3833 39.0001 0.6100 2.0000e+00 0.5320 2.0000e-06 0.7681 2.8647 0.2738 2572.0000 0.9958 2.8100e+02 0.9993 1.0400e+02 0.9967 1.7678 0.0818 2258.0000 0.9998 9.3000e+01 0.9994 8.4000e+01 0.9976 2.4488 0.0228 424.0001 0.9979 2.2000e+01 0.9934 4.0000e+00 0.8520
2 chr3 122146077 1 Oct4 122145077 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430 438 122145515 16 Oct4-Sox2 Oct4/m0_p0 422 122145499 10.2722 high 0.6902 - Oct4 2 5.3935 0.4228 44.0001 0.6462 4.0000e+00 0.8385 2.0000e-06 0.7681 3.1092 0.3125 2726.0000 0.9965 2.8100e+02 0.9993 4.8000e+01 0.9884 1.5704 0.0673 2310.0000 0.9998 9.3000e+01 0.9994 6.0000e+01 0.9934 2.4112 0.0222 443.0001 0.9982 2.2000e+01 0.9934 9.0000e+00 0.9520
3 chr3 122146077 1 Oct4 122145077 . NaN NaN 1.5355 NaN 1.5355 Oct4 1.5355 high 0.9920 NaN NaN 0.5287 NaN 0.5287 Oct4 0.5287 high 0.7992 Oct4/metacluster_0/pa... 451 459 122145536 16 Oct4-Sox2 Oct4/m0_p0 443 122145520 11.5528 high 0.9044 - Oct4 3 4.9558 0.3114 59.0001 0.7495 4.0000e+00 0.8385 2.0000e-06 0.7681 3.8785 0.4346 3366.0000 0.9981 2.8100e+02 0.9993 1.0000e+00 0.5260 3.5481 0.3387 2339.0000 0.9999 1.0400e+02 0.9996 1.0000e+00 0.3723 4.3891 0.1790 537.0001 0.9988 2.7000e+01 0.9960 2.0000e-06 0.6075
4 chr3 122146077 1 Oct4 122145077 . NaN NaN 1.0385 NaN 1.0385 Oct4 1.0385 high 0.9024 NaN NaN 0.2392 NaN 0.2392 Oct4 0.2392 low 0.0028 Oct4/metacluster_0/pa... 469 477 122145554 16 Oct4-Sox2 Oct4/m0_p0 461 122145538 5.0157 low 0.0179 + Oct4 4 4.8723 0.2939 59.0001 0.7467 4.0000e+00 0.8385 1.0000e+00 0.7681 3.2607 0.3358 3721.0000 0.9987 2.8100e+02 0.9993 4.8000e+01 0.9884 3.2816 0.2846 2455.0000 0.9999 1.0400e+02 0.9996 2.0000e+00 0.4958 4.0302 0.1345 631.0001 0.9994 2.7000e+01 0.9960 2.0000e-06 0.6075
In [18]:
from basepair.plot.profiles import multiple_plot_stranded_profile
In [44]:
ps_subset = p_seq['Klf4']
In [45]:
sort_idx = np.argsort(-ps_subset.profile['Klf4'].sum(axis=(-2, -1)))
In [46]:
multiple_plot_stranded_profile(ps_subset[sort_idx[:1000]].profile, figsize_tmpl=(2.5, 1.5));
In [47]:
ref = ps_subset[sort_idx[:1000]].profile['Klf4'].mean(axis=0)
In [48]:
dfm = profile_metrics(ps_subset.profile['Klf4'], ref)
In [49]:
dfm
Out[49]:
counts max sym_kl cosine
0 8.0 1.0 6.5650 0.7727
1 7.0 0.0 6.5187 0.7663
2 13.0 0.0 6.1979 0.6819
... ... ... ... ...
1510952 6.0 0.0 6.6950 0.8125
1510953 9.0 1.0 6.3318 0.7112
1510954 11.0 0.0 6.3736 0.7246

1510955 rows × 4 columns

In [50]:
dfm.counts.plot.hist(100)
Out[50]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f49d46e1f60>
In [51]:
dfm[dfm.counts > 2].plot.scatter('sym_kl', 'cosine', s=0.1, alpha=0.5)
Out[51]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f49d4732eb8>
In [52]:
dfm[dfm.counts > 2].plot.scatter('sym_kl', 'max', s=0.1, alpha=0.5)
plt.yscale("symlog")
In [53]:
dfm[dfm.counts > 2].plot.scatter('counts', 'max', s=0.5, alpha=0.5)
plt.yscale("symlog")
plt.xscale("symlog")
In [54]:
dfm[dfm.counts > 2].plot.scatter('sym_kl', 'counts', s=0.5, alpha=0.5)
plt.yscale("symlog")
In [187]:
plotnine.options.figure_size = get_figsize(1/3, .6)
(ggplot(aes(x='seq_match_cat', y="Oct4/profile_counts_p"), dfis[(~dfis['seq_match_cat'].isnull()) & (dfis.pattern_name == 'Oct4-Sox2')]) + 
 # geom_violin() + 
 geom_boxplot() + 
 theme_classic()
)
Out[187]:
<ggplot: (-9223363281642180853)>
In [55]:
from basepair.config import test_chr
In [112]:
dfis
Out[112]:
example_chrom example_end example_idx example_interval_from_task example_start example_strand imp/Klf4 imp/Nanog imp/Oct4 imp/Sox2 imp_max imp_max_task imp_weighted imp_weighted_cat imp_weighted_p match/Klf4 match/Nanog match/Oct4 match/Sox2 match_max match_max_task match_weighted match_weighted_cat match_weighted_p pattern pattern_center pattern_end pattern_end_abs pattern_len pattern_name pattern_short pattern_start pattern_start_abs seq_match seq_match_cat seq_match_p strand tf id Klf4/profile_match Klf4/profile_match_p Klf4/profile_counts Klf4/profile_counts_p Klf4/profile_max Klf4/profile_max_p Klf4/profile_counts_max_ref Klf4/profile_counts_max_ref_p Nanog/profile_match Nanog/profile_match_p Nanog/profile_counts Nanog/profile_counts_p Nanog/profile_max Nanog/profile_max_p Nanog/profile_counts_max_ref Nanog/profile_counts_max_ref_p Oct4/profile_match Oct4/profile_match_p Oct4/profile_counts Oct4/profile_counts_p Oct4/profile_max Oct4/profile_max_p Oct4/profile_counts_max_ref Oct4/profile_counts_max_ref_p Sox2/profile_match Sox2/profile_match_p Sox2/profile_counts Sox2/profile_counts_p Sox2/profile_max Sox2/profile_max_p Sox2/profile_counts_max_ref Sox2/profile_counts_max_ref_p
0 chr9 3002633 0 Oct4 3001633 . NaN NaN 0.0448 NaN 0.0448 Oct4 0.0448 low 0.0002 NaN NaN 0.2644 NaN 0.2644 Oct4 0.2644 low 0.0043 Oct4/metacluster_0/pa... 232 240 3001873 16 Oct4-Sox2 Oct4/m0_p0 224 3001857 2.3456 low 0.0021 + Oct4 0 0.0225 0.0047 0.0001 0.0047 1.0000e-06 0.1831 2.0000e-06 0.7681 0.0390 0.0038 0.0001 0.0038 1.0000e-06 0.0842 2.0000e-06 0.5260 0.0611 0.0049 0.0001 0.0049 1.0000e-06 0.0303 2.0000e-06 0.3723 0.1121 0.0060 0.0001 0.0061 1.0000e-06 0.2067 2.0000e-06 0.6075
1 chr3 122146077 1 Oct4 122145077 . NaN NaN 0.9439 NaN 0.9439 Oct4 0.9439 high 0.8507 NaN NaN 0.2953 NaN 0.2953 Oct4 0.2953 low 0.0106 Oct4/metacluster_0/pa... 428 436 122145513 16 Oct4-Sox2 Oct4/m0_p0 420 122145497 1.8069 low 0.0014 + Oct4 1 5.2561 0.3833 39.0001 0.6100 2.0000e+00 0.5320 2.0000e-06 0.7681 2.8647 0.2738 2572.0000 0.9958 2.8100e+02 0.9993 1.0400e+02 0.9967 1.7678 0.0818 2258.0000 0.9998 9.3000e+01 0.9994 8.4000e+01 0.9976 2.4488 0.0228 424.0001 0.9979 2.2000e+01 0.9934 4.0000e+00 0.8520
2 chr3 122146077 1 Oct4 122145077 . NaN NaN 0.9374 NaN 0.9374 Oct4 0.9374 high 0.8462 NaN NaN 0.5077 NaN 0.5077 Oct4 0.5077 high 0.6886 Oct4/metacluster_0/pa... 430 438 122145515 16 Oct4-Sox2 Oct4/m0_p0 422 122145499 10.2722 high 0.6902 - Oct4 2 5.3935 0.4228 44.0001 0.6462 4.0000e+00 0.8385 2.0000e-06 0.7681 3.1092 0.3125 2726.0000 0.9965 2.8100e+02 0.9993 4.8000e+01 0.9884 1.5704 0.0673 2310.0000 0.9998 9.3000e+01 0.9994 6.0000e+01 0.9934 2.4112 0.0222 443.0001 0.9982 2.2000e+01 0.9934 9.0000e+00 0.9520
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2357835 chr13 101785985 147973 Klf4 101784985 . 0.1612 NaN NaN NaN 0.1612 Klf4 0.1612 low 0.0049 0.5796 NaN NaN NaN 0.5796 Klf4 0.5796 low 0.1028 Klf4/metacluster_0/pa... 84 89 101785074 10 Klf4 Klf4/m0_p0 79 101785064 5.6424 low 0.0609 + Klf4 2357835 6.5724 0.9484 7.0001 0.0634 1.0000e+00 0.1455 1.0000e+00 0.7716 6.5273 0.9315 9.0001 0.0894 1.0000e+00 0.1645 2.0000e-06 0.6936 6.3493 0.9397 10.0001 0.0406 1.0000e+00 0.0620 2.0000e-06 0.7011 6.5499 0.7909 8.0001 0.2300 1.0000e+00 0.3974 2.0000e-06 0.9172
2357836 chr13 101785985 147973 Klf4 101784985 . 0.1716 NaN NaN NaN 0.1716 Klf4 0.1716 low 0.0119 0.6673 NaN NaN NaN 0.6673 Klf4 0.6673 medium 0.4704 Klf4/metacluster_0/pa... 275 280 101785265 10 Klf4 Klf4/m0_p0 270 101785255 8.6962 medium 0.4549 + Klf4 2357836 6.3685 0.8627 11.0001 0.1342 1.0000e+00 0.1455 2.0000e-06 0.7716 6.4790 0.9106 11.0001 0.1401 2.0000e+00 0.4399 2.0000e-06 0.6936 6.0341 0.8601 16.0001 0.0950 1.0000e+00 0.0620 2.0000e-06 0.7011 6.5535 0.7955 6.0001 0.1183 1.0000e+00 0.3974 2.0000e-06 0.9172
2357837 chr13 101785985 147973 Klf4 101784985 . 0.1189 NaN NaN NaN 0.1189 Klf4 0.1189 low 0.0004 0.5369 NaN NaN NaN 0.5369 Klf4 0.5369 low 0.0494 Klf4/metacluster_0/pa... 872 877 101785862 10 Klf4 Klf4/m0_p0 867 101785852 3.8225 low 0.0103 + Klf4 2357837 5.5081 0.5507 32.0001 0.4303 2.0000e+00 0.4441 2.0000e+00 0.8699 6.1054 0.7263 18.0001 0.2866 2.0000e+00 0.4399 1.0000e+00 0.6936 5.1945 0.5322 40.0001 0.4060 3.0000e+00 0.5786 2.0000e-06 0.7011 6.4968 0.7322 8.0001 0.2300 1.0000e+00 0.3974 2.0000e-06 0.9172

2357838 rows × 71 columns

In [141]:
def get_profile_score(dfi, score, profile_cls):
    df = dfi[(~dfi[score].isnull())]
    return df.sort_values(score, ascending=False)[profile_cls]

motif_profile_counts = []
for motif in main_motifs:
    motif_profile_counts.append((motif, {
        "BPNet": get_profile_score(dfis[(dfis.pattern_name == motif) & dfis.example_chrom.isin(test_chr)], 'imp_weighted', profile_mapping[motif]+ "/profile_max").values,
        "PWM": get_profile_score(dfi_seq[(dfi_seq.motif == motif) & dfi_seq.example_chrom.isin(test_chr)], 'seq_match_score', profile_mapping[motif]+ "/profile_max").values,
        # "ChExMix profile": get_profile_score(dfi_chexmix[(dfi_chexmix.pattern_name == motif) & dfi_chexmix.example_chrom.isin(test_chr)], 'profile_score', profile_mapping[motif]+ "/profile_match").values,
        "ChExMix pwm": get_profile_score(dfi_chexmix[(dfi_chexmix.pattern_name == motif) & dfi_chexmix.example_chrom.isin(test_chr)], 'motif_score', profile_mapping[motif]+ "/profile_max").values,
    }))
In [140]:
## TODO - where is ref refined
In [241]:
fig, ax = plt.subplots(1,1,figsize=get_figsize(0.25))
scores = get_profile_score(dfis[(dfis.pattern_name == motif) & dfis.example_chrom.isin(test_chr)], 'imp_weighted', profile_mapping[motif]+ "/profile_max")
scores.plot.hist(np.arange(30), ax=ax)
plt.axvline(8, color='grey')
plt.xlabel("Profile score (max value)");
fig.savefig(fdir / 'profile-score.hist.pdf')
In [240]:
np.mean(scores > 8)
Out[240]:
0.09404622261674539
In [234]:
np.percentile(scores, 90)
Out[234]:
7.000000953674316
In [325]:
dfis[['pattern_name', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']]
Out[325]:
pattern_name example_chrom pattern_start_abs pattern_end_abs strand
0 Oct4-Sox2 chr9 3001857 3001873 +
1 Oct4-Sox2 chr3 122145497 122145513 +
2 Oct4-Sox2 chr3 122145499 122145515 -
... ... ... ... ... ...
2357835 Klf4 chr13 101785064 101785074 +
2357836 Klf4 chr13 101785255 101785265 +
2357837 Klf4 chr13 101785852 101785862 +

2357838 rows × 5 columns

In [19]:
# Remove duplicates in dfis
dfis = dfis.sort_values("imp_weighted", ascending=False)
duplicated = dfis[['pattern_name', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()
dfis = dfis[~duplicated]
In [20]:
# Remove duplicates in dfis
dfi_chexmix = dfi_chexmix.sort_values("profile_score", ascending=False)
duplicated = dfi_chexmix[['pattern_name', 'example_chrom', 'pattern_center_abs', 'strand']].duplicated()
dfi_chexmix = dfi_chexmix[~duplicated]
In [21]:
# Remove duplicates in dfis
dfi_seq = dfi_seq.sort_values("seq_match_score", ascending=False)
duplicated = dfi_seq[['pattern_name', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()
dfi_seq = dfi_seq[~duplicated]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-21-3d411bfa287d> in <module>
      1 # Remove duplicates in dfis
      2 dfi_seq = dfi_seq.sort_values("seq_match_score", ascending=False)
----> 3 duplicated = dfi_seq[['pattern_name', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()
      4 dfi_seq = dfi_seq[~duplicated]

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2932                 key = list(key)
   2933             indexer = self.loc._convert_to_indexer(key, axis=1,
-> 2934                                                    raise_missing=True)
   2935 
   2936         # take() does not accept boolean indexers

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter, raise_missing)
   1352                 kwargs = {'raise_missing': True if is_setter else
   1353                           raise_missing}
-> 1354                 return self._get_listlike_indexer(obj, axis, **kwargs)[1]
   1355         else:
   1356             try:

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1159         self._validate_read_indexer(keyarr, indexer,
   1160                                     o._get_axis_number(axis),
-> 1161                                     raise_missing=raise_missing)
   1162         return keyarr, indexer
   1163 

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1250             if not(self.name == 'loc' and not raise_missing):
   1251                 not_found = list(set(key) - set(ax))
-> 1252                 raise KeyError("{} not in index".format(not_found))
   1253 
   1254             # we skip the warning on Categorical/Interval

KeyError: "['pattern_name'] not in index"
In [ ]:
def get_profile_score(dfi, score, profile_cls):
    df = dfi[(~dfi[score].isnull())]
    return df.sort_values(score, ascending=False)[profile_cls]

motif_profile_counts = []
for motif in main_motifs:
    motif_profile_counts.append((motif, {
        "BPNet": get_profile_score(dfis[(dfis.pattern_name == motif) & dfis.example_chrom.isin(test_chr)], 'imp_weighted', profile_mapping[motif]+ "/profile_max").values,
        "ChExMix": get_profile_score(dfi_chexmix[(dfi_chexmix.pattern_name == motif) & dfi_chexmix.example_chrom.isin(test_chr)], 'profile_score', profile_mapping[motif]+ "/profile_max").values,
        "PWM": get_profile_score(dfi_seq[(dfi_seq.motif == motif) & dfi_seq.example_chrom.isin(test_chr)], 'seq_match_score', profile_mapping[motif]+ "/profile_max").values,
        # "ChExMix pwm": get_profile_score(dfi_chexmix[(dfi_chexmix.pattern_name == motif) & dfi_chexmix.example_chrom.isin(test_chr)], 'motif_score', profile_mapping[motif]+ "/profile_max_p").values,
    }))
In [ ]:
# Profile scores
model = 'BPNet'
thresholds = dict()
fig, axes = plt.subplots(len(main_motifs),1,figsize=get_figsize(0.25, 1), 
                         gridspec_kw=dict(hspace=0),
                         sharex=True)
for i, motif in enumerate(main_motifs):
    ax = axes[i]
    scores = motif_profile_counts[i][1][model]
    thresholds[motif] = np.percentile(scores, 95)
    #if len(scores) == 0:
    #    continue
    ax.hist(scores, bins=np.arange(30))
    ax.axvline(thresholds[motif], color='grey')
    ax.set_ylabel(motif, rotation=0)
    if i == len(main_motifs)-1:
        ax.set_xlabel("Profile score (max value)");
    fig.savefig(fdir / 'profile-score.hist.pdf')
In [ ]:
cutoff = 8
fig, axes = plt.subplots(1, len(main_motifs), figsize=get_figsize(1, 1/(1*len(main_motifs))), 
                         sharex=True, 
                         sharey=True, gridspec_kw=dict(wspace=0, hspace=0))
fig.subplots_adjust(wspace=.5)#, wspace=0)
max_val = 0
for i, (motif, res) in enumerate(motif_profile_counts):
    ax = axes[i]
    # top_n = min([len(x) for x in res.values()])
    top_n = 10000
    cutoff = thresholds[motif]
    for k,v in res.items():
        ax.plot(np.cumsum(v[:top_n] > cutoff), label=k)

    max_val = max(max_val, ax.get_ylim()[1])
    if i == 0:
        ax.set_ylabel("# motif instances\nwith high coverage")
    if i == 1:
        ax.set_xlabel("Top # of motif instances")
    if i == 0:
        leg = ax.legend(loc='lower right', framealpha=1, borderpad=0)
        leg.get_frame().set_linewidth(0.0)
    ax.set_title(motif)

for i, (motif, res) in enumerate(motif_profile_counts):
    axes[i].plot([0, max_val], [0, max_val], '--', color='grey', label=None)
# fig.savefig(fdir / 'footprint-recall-curve.with-ChExMix.pdf')
In [347]:
cutoff = 8
fig, axes = plt.subplots(1, len(main_motifs), figsize=get_figsize(1, 1/(1*len(main_motifs))), 
                         sharex=True, 
                         sharey=True, gridspec_kw=dict(wspace=0, hspace=0))
fig.subplots_adjust(wspace=.5)#, wspace=0)
max_val = 0
for i, (motif, res) in enumerate(motif_profile_counts):
    ax = axes[i]
    # top_n = min([len(x) for x in res.values()])
    top_n = 10000
    cutoff = thresholds[motif]
    for k,v in res.items():
        ax.plot(np.cumsum(v[:top_n] > cutoff), label=k)

    max_val = max(max_val, ax.get_ylim()[1])
    if i == 0:
        ax.set_ylabel("# motif instances\nwith high coverage")
    if i == 1:
        ax.set_xlabel("Top # of motif instances")
    if i == 0:
        leg = ax.legend(loc='lower right', framealpha=1, borderpad=0)
        leg.get_frame().set_linewidth(0.0)
    ax.set_title(motif)

for i, (motif, res) in enumerate(motif_profile_counts):
    axes[i].plot([0, max_val], [0, max_val], '--', color='grey', label=None)
fig.savefig(fdir / 'footprint-recall-curve.with-ChExMix.pdf')

Spearman correlation

In [242]:
from gin_train.metrics import spearmanr
In [246]:
def corelate_cols(dfi, score, profile_cls, metric=spearmanr):
    df = dfi[(~dfi[score].isnull())]
    return metric(dfi[profile_cls], dfi[score])
In [247]:
motif_spearmanr = []
for motif in main_motifs:
    
    motif_spearmanr.append({
        "spearmanr": corelate_cols(dfis[(dfis.pattern_name == motif) & dfis.example_chrom.isin(test_chr)], 'imp_weighted', profile_mapping[motif]+ "/profile_max"),
        "motif_score": "BPNet",
        "motif": motif
    })
    motif_spearmanr.append({
        "spearmanr": corelate_cols(dfi_seq[(dfi_seq.motif == motif) & dfi_seq.example_chrom.isin(test_chr)], 'seq_match_score', profile_mapping[motif]+ "/profile_max"),
        "motif_score": "PWM",
        "motif": motif
    })
    
dfm_spearman = pd.DataFrame(motif_spearmanr)
dfm_spearman['motif'] = pd.Categorical(dfm_spearman['motif'], categories=list(motifs), ordered=True)
In [249]:
plotnine.options.figure_size = get_figsize(.4)
fig = (ggplot(aes(x='motif', fill='motif_score', y='spearmanr'), dfm_spearman) + 
 geom_bar(stat='identity', position='dodge') + 
 theme_classic() + 
 scale_fill_brewer(type='qual', palette=3) + 
 theme(legend_position='top') + 
 ylab("Spearman rank correlation\nwith profile height")
)
fig.save(fdir / 'profile-max-correlation.bar.pdf')
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.740152 x 1.693413936 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/profile-max-correlation.bar.pdf
  warn('Filename: {}'.format(filename))
Out[249]:
<ggplot: (8746853109554)>

TODO

  • [x] make a bar-plot with spearman correlation for different TF's and differnet methods (test chromosome)
  • [x] fix the profile distance
In [295]:
plotnine.options.figure_size = get_figsize(1, .25)
(ggplot(aes(x='Oct4/profile_max_p', y="Oct4/profile_counts_p"), dfis[(~dfis['seq_match_cat'].isnull()) & (dfis.pattern_name == 'Oct4-Sox2')]) + 
 # geom_violin() + 
 geom_point(alpha=0.01, size=0.1, color='black') + 
 theme_classic()
)
Out[295]:
<ggplot: (8743950590391)>
In [84]:
plotnine.options.figure_size = get_figsize(.25, 1)
fig = (ggplot(aes(x='seq_match', y="match_weighted"), 
        dfis[(~dfis['seq_match_cat'].isnull()) & (dfis.pattern_name == 'Oct4-Sox2')].sample(frac=0.1)) + 
 # geom_violin() + 
 geom_point(alpha=0.01, size=0.1, color='black') + 
 xlab("PWM match") + 
 ylab("CWM match (jaccard)") + 
 theme_classic()
)
fig.save(fdir / 'pwm_match_vs_cwm_match.pdf', raster=True)
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 1.712595 x 1.712595 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/pwm_match_vs_cwm_match.pdf
  warn('Filename: {}'.format(filename))
Out[84]:
<ggplot: (-9223363289661771811)>
In [85]:
plotnine.options.figure_size = get_figsize(.25, 1)
fig = (ggplot(aes(x='imp_weighted', y="match_weighted"), 
        dfis[(~dfis['seq_match_cat'].isnull()) & (dfis.pattern_name == 'Oct4-Sox2')].sample(frac=0.1)) + 
 # geom_violin() + 
 geom_point(alpha=0.01, size=0.1, color='black') + 
 xlab("Importance") + 
 ylab("CWM match (jaccard)") + 
 theme_classic()
)
fig.save(fdir / 'importance_vs_cwm_match.pdf', raster=True)
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 1.712595 x 1.712595 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/importance_vs_cwm_match.pdf
  warn('Filename: {}'.format(filename))
Out[85]:
<ggplot: (8747193213872)>

Spacing analysis

TODO

  • plot a big pairwise matrix
    • `method + strand ~ motif_pair
In [86]:
from basepair.exp.chipnexus.spacing import get_motif_pairs, motif_pair_dfi
In [87]:
dfi_seq['pattern_name'] = dfi_seq.motif
In [88]:
motif_pairs = get_motif_pairs(main_motifs)
motif_pairs
Out[88]:
[['Oct4-Sox2', 'Oct4-Sox2'],
 ['Oct4-Sox2', 'Sox2'],
 ['Oct4-Sox2', 'Nanog'],
 ['Oct4-Sox2', 'Klf4'],
 ['Sox2', 'Sox2'],
 ['Sox2', 'Nanog'],
 ['Sox2', 'Klf4'],
 ['Nanog', 'Nanog'],
 ['Nanog', 'Klf4'],
 ['Klf4', 'Klf4']]
In [89]:
motif_pair = ['Nanog', 'Nanog']
motif_pair_name = "<>".join(motif_pair)
In [94]:
threshold = 5
dfab = motif_pair_dfi(dfi_seq[dfi_seq.seq_match_score > threshold], motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
In [95]:
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / 'Nanog<>Nanog.PWM.threshold=5.pdf')
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/Nanog<>Nanog.PWM.threshold=5.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
Out[95]:
<ggplot: (8746850227576)>
In [97]:
threshold = 6
dfab = motif_pair_dfi(dfi_seq[dfi_seq.seq_match_score > threshold], motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
In [98]:
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / f'Nanog<>Nanog.PWM.threshold={threshold}.pdf')
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/Nanog<>Nanog.PWM.threshold=6.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
Out[98]:
<ggplot: (8746850231630)>
In [99]:
threshold = 7
dfab = motif_pair_dfi(dfi_seq[dfi_seq.seq_match_score > threshold], motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
In [100]:
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / f'Nanog<>Nanog.PWM.threshold={threshold}.pdf')
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/Nanog<>Nanog.PWM.threshold=7.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
Out[100]:
<ggplot: (8746850223473)>

Very good matches

In [91]:
dfi_seq.seq_match_score.plot.hist(100);

Original figure

In [101]:
dfab = motif_pair_dfi(dfis.query('match_weighted_p > .2').query('imp_weighted_p > 0'), motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
In [102]:
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / 'Nanog<>Nanog.BPNet.default.pdf')
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/Nanog<>Nanog.BPNet.default.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
Out[102]:
<ggplot: (-9223363290004483987)>

Chexmix

In [251]:
dfi_chexmix.motif_score.plot.hist(100);
In [103]:
dfab = motif_pair_dfi(dfi_chexmix, motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
In [104]:
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / 'Nanog<>Nanog.ChExMix.default.pdf')
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/Nanog<>Nanog.ChExMix.default.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
Out[104]:
<ggplot: (-9223363290004543905)>
In [105]:
motif_score = 1
dfab = motif_pair_dfi(dfi_chexmix.query(f'motif_score > {motif_score}'), motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)

plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / f'Nanog<>Nanog.ChExMix.motif_score={motif_score}.pdf')
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/Nanog<>Nanog.ChExMix.motif_score=1.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
Out[105]:
<ggplot: (-9223363290004516562)>
In [106]:
motif_score = 5
dfab = motif_pair_dfi(dfi_chexmix.query(f'motif_score > {motif_score}'), motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)

plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / f'Nanog<>Nanog.ChExMix.motif_score={motif_score}.pdf')
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/Nanog<>Nanog.ChExMix.motif_score=5.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
Out[106]:
<ggplot: (8746853501311)>

Weak affinity site analysis

Visualize motifs

In [67]:
t_high = 10
t_low = [3, 5]
In [68]:
motif = 'Sox2'
In [69]:
h2(motif)
Out[69]:

Sox2

In [70]:
patterns[motif].plot('seq');

Good matches

Sequence (PWM) instances

In [71]:
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 2486

Importance score instances

In [72]:
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 1711

Poor matches

Sequence (PWM) instances

In [73]:
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 190455

Importance score instances

In [74]:
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0])
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 14254

Poor matches but important

In [75]:
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0]) &
                             (ps_motifs[motif].dfi.imp_weighted_p > .5)
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 2295
In [76]:
motif = 'Oct4-Sox2'
In [77]:
h2(motif)
Out[77]:

Oct4-Sox2

In [78]:
patterns[motif].plot('seq');

Good matches

Sequence (PWM) instances

In [79]:
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 5393

Importance score instances

In [80]:
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 5275

Poor matches

Sequence (PWM) instances

In [81]:
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 179424

Importance score instances

In [82]:
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0])
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 10936

Poor matches but important

In [83]:
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0]) &
                             (ps_motifs[motif].dfi.imp_weighted_p > .5)
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 141
In [84]:
motif = 'Nanog'
In [85]:
h2(motif)
Out[85]:

Nanog

In [86]:
patterns[motif].plot('seq');
In [87]:
t_high = 8
t_low = [3, 5]

Good matches

Sequence (PWM) instances

In [88]:
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 4193

Importance score instances

In [89]:
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 2559

Poor matches

Sequence (PWM) instances

In [90]:
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 472663

Importance score instances

In [91]:
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0])
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 54548

Poor matches but important

In [92]:
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0]) &
                             (ps_motifs[motif].dfi.imp_weighted_p > .5)
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 10163
In [93]:
t_high = 10
t_low = [3, 5]
In [94]:
motif = 'Klf4'
In [95]:
h2(motif)
Out[95]:

Klf4

In [96]:
patterns[motif].plot('seq');
In [97]:
t_high = 8
t_low = [3, 5]

Good matches

Sequence (PWM) instances

In [98]:
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 47921

Importance score instances

In [99]:
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 35561

Poor matches

Sequence (PWM) instances

In [100]:
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 258975

Importance score instances

In [101]:
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0])
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 12296

Poor matches but important

In [102]:
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0]) &
                             (ps_motifs[motif].dfi.imp_weighted_p > .5)
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# locations: 278

Scratch space

Fimo motifs

In [47]:
melanie_motifs = {"nanog": "Nanog",
                  "sox2": "Sox2",
                  "oct4": "Oct4-Sox2",
                  "klf4": "Klf4"}
In [26]:
dff = pd.concat([pd.read_csv(f"ftp://ftp.stowers.org/pub/mw2098/for_ziga/FIMO_results/{k}_top1000peaks_window201bp/fimo.tsv",
                             sep='\t', comment='#').assign(motif=v)
                 for k,v in melanie_motifs.items()])
In [55]:
dff = dff.dropna()
In [56]:
dff.groupby("motif").size()
Out[56]:
motif
Klf4          96871
Nanog        100066
Oct4-Sox2     55364
Sox2         138543
dtype: int64
In [92]:
from concise.preprocessing import encodeDNA
from concise.utils.pwm import PWM


def make_pwm(seqs, strand):
    return np.concatenate([encodeDNA(list(seqs[strand == '+'])),
                           encodeDNA(list(seqs[strand == '-']))[:, ::-1, ::-1]], axis=0)

pwms = {m: PWM(encodeDNA(list(dfs.matched_sequence.str.upper())).mean(0) + 0.0001, name=m) for m,dfs in dff.groupby("motif")}
pwms
In [93]:
for k,v in pwms.items():
    v.plotPWMInfo(figsize=get_figsize(1, 1/5))
    plt.title(k)
In [182]:
dff.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-182-c84b700d8754> in <module>
----> 1 dff.head()

NameError: name 'dff' is not defined
In [180]:
from concise.preprocessing import encodeDNA
In [181]:
dff.motif.unique()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-181-a66f5c7e3f9b> in <module>
----> 1 dff.motif.unique()

NameError: name 'dff' is not defined
In [42]:
dff['p-value'].plot.hist(30)
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd1e70a04e0>
In [15]:
dff = dff.dropna()
In [16]:
dff.shape
Out[16]:
(96871, 10)
In [17]:
dff.matched_sequence
Out[17]:
0        GCCACACCCTGCCCC
1        GCCACACCCTGCCCC
2        GCCACACCCTGCCCC
              ...       
96868    CCCACACCCAGCCTA
96869    ACCCCACCCCGCCGC
96870    GCCCCACCCTTTGTC
Name: matched_sequence, Length: 96871, dtype: object
In [35]:
pwm = encodeDNA(dff.matched_sequence.str.upper(), ).mean(axis=0)
In [36]:
pwm = PWM(pwm + 0.001, )  # add some pseudo-counts
In [37]:
pwm.pwm = pwm.pwm[::-1, ::-1]  # Tc
In [25]:
pwm.shape
Out[25]:
(15, 4)
In [38]:
pwm.plotPWMInfo()
Out[38]: