from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    ("Oct4", 'Oct4/m0_p1'),
    # ("Strange-sym-motif", 'Oct4/m0_p5'),
    ("Sox2", 'Sox2/m0_p1'),
    ("Nanog", 'Nanog/m0_p1'),
    ("Zic3", 'Nanog/m0_p2'),
    ("Nanog-partner", 'Nanog/m0_p4'),
    ("Klf4", 'Klf4/m0_p0'),
])

motifs_chexmix = OrderedDict([
    ("Oct4-Sox2", 'Oct4/1'),
    ("Sox2", 'Sox2/1'),
    ("Nanog", 'Nanog/1'),
    ("Klf4", 'Klf4/1'),
])

main_motifs = ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']

Goal¶

compare the profiles of top-ranked sites (according to sequence)
compare the profiles of weak affinity sites
compare the spacing between motifs

Conclusions¶

Weak affinity sites have stronger footprints when they are highlighted as important
Potential issue: the important poor matches might actually come from other motifs (e.g. Sox2 might have Oct4 flanks etc)
Nanog-Nanog 10bp spacing can be only seen with our motif instances
63.5% of Chexmix peaks (138,098) are in our MACS2 peaks (98,428) and 74.1% of Peakxus peaks (175,259)

Methods¶

Use max counts to compare methods
- use 10% of the highest counts as the threshold (e.g. 8 counts)

TODO¶

[ ] Update TE annotation code
- shall we generate a bed file with intervals to be excluded?
What fraction of the peaks can we explain? E.g. compute the overlap between
- peakxus peaks
- seqlets (un-clustered)
  - Q: How to get this number?
    - Can we take just modisco's unclustered seqlet locations?
    - There could be a possible problem when excluding seqlets...
      - we could also tighten the bin-size to say 10nt and then call peaks
    - What are the two nearest peakxus peaks?
- seqlets (clustered)
- Q: What's the right cut-off?

# Imports
from basepair.imports import *
from basepair.exp.paper.config import tasks
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix, dfi_filter_valid, dfi_add_ranges, annotate_profile
from basepair.cli.imp_score import ImpScoreFile
from basepair.modisco.core import Pattern
from basepair.preproc import dfint_no_intersection
hv.extension('bokeh')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
paper_config()

interval_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']

Using TensorFlow backend.

Peakxus/MEME and chexmix¶

Load peakxus
- dfi
Load chexmix
- dfi
- motifs
Load MEME
- dfi (fimo)
- motifs

# load all the required data
# mr = ModiscoResult(modisco_dir / "modisco.h5")
# mr.open()

# imp_scores = ImpScoreFile.from_modisco_dir(modisco_dir)
# imp_scores.cache()  # load all the scores
# ranges = imp_scores.get_ranges()
# ranges.columns = ["example_" + v for v in ranges.columns]
# seqs = imp_scores.get_seq()
# profiles = imp_scores.get_profiles()

from basepair.exp.paper.config import models_dir

!mkdir -p {ddir}/figures/method-comparison

# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/method-comparison')

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-4683f7f6d67f> in <module>
      1 # figures dir
      2 model_dir = models_dir / exp
----> 3 fdir = Path(f'{ddir}/figures/method-comparison')

NameError: name 'ddir' is not defined

%config InlineBackend.figure_format = 'retina'

paper_config()

profile_width = 70
seqlen = 1000

instance_parq_paths = {t: model_dir / f'deeplift/{t}/out/{imp_score}/instances.parq' 
                       for t in tasks}

from basepair.modisco.pattern_instances import (multiple_load_instances, load_instances, filter_nonoverlapping_intervals, 
                                                plot_coocurence_matrix, align_instance_center, dfi2seqlets, annotate_profile)

mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
                           for t in tasks})

# Exclude TEs
def shorten_te_pattern(s):
    tf, p = s.split("/", 1)
    return tf + "/" + shorten_pattern(p)

ChExMix motifs¶

from basepair.utils import flatten

from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac

comparison_dir = Path('../../chipnexus/comparison/output')

chexmix_motifs = flatten({tf: read_transfac(comparison_dir / f"chexmix/{tf}/intermediate-results/{tf}.experiment.transfac")
                          for tf in tasks}, separator='/')
chexmix_patterns = [Pattern('chexmix/' + k, v.pwm, dict(), dict()) 
                    for k,v in chexmix_motifs.items()]

from basepair.plot.utils import seqlogo_clean

import matplotlib as mpl

list(map(len, chexmix_patterns))

p = chexmix_patterns[0]
from concise.utils.plot import seqlogo

fig, axes = plt.subplots(len(chexmix_patterns), 1, 
                         sharex=True, sharey=True,
                         figsize=get_figsize(.25, len(chexmix_patterns)/3))
for i, p in enumerate(chexmix_patterns):
    pwm = p.resize(16).get_seq_ic()
    ax = axes[i]
    seqlogo(pwm, ax=axes[i])
    # p.resize(16).plot('seq_ic', letter_width=.1, height=0.5);
    ax.xaxis.set_major_locator(mpl.ticker.AutoLocator())
    _, tf, n = p.name.split("/")
    ax.set_ylabel(f"{tf}/{n}", rotation=0,
                  multialignment='center',
                  ha='right', labelpad=5)
    ax.set_ylim([0, 2])
    sns.despine(bottom=True, top=True, right=True)
fig.savefig(fdir / 'ChExMix.motifs.pdf')

Considered motifs¶

# plot them
patterns = {m: mr.get_pattern(longer_pattern(p)).trim_seq_ic(0.08)
            for m, p in motifs.items()}
for tf, p in patterns.items():
    p.plot("seq");
    plt.title(tf)

TF-MoDISco is using the TensorFlow backend.

len(dfi_chexmix)

169377

Support the following operations:

dfi_filter_valid
- pattern_center
extract_dfi (dfi_row2seqlet)
- example_idx
- pattern_start
- pattern_end
- pattern
- strand

Extract profiles and sequences¶

Importance score instances¶

Sequence (PWM) instances¶

%tqdm_restart

Annotate dfi and¶

Load¶

# Store the results
chexmix_output_dir = Path("../../chipnexus/comparison/output/chexmix/")

# Load the cached results
%time p_seq = read_pkl(chexmix_output_dir / 'pwm_instances.stacked_seqlets.pkl')
%time dfi_seq = pd.read_csv(chexmix_output_dir / 'pwm_instances.csv.gz')

CPU times: user 58 s, sys: 47.8 s, total: 1min 45s
Wall time: 3min 18s
CPU times: user 58.7 s, sys: 5.15 s, total: 1min 3s
Wall time: 57.8 s

# Load the cached results
%time ps_chexmix_motifs = read_pkl(chexmix_output_dir / 'chexmix.stacked_seqlets.pkl')
%time dfi_chexmix = pd.read_csv(chexmix_output_dir / 'chexmix.csv.gz')

CPU times: user 3.14 s, sys: 1.26 s, total: 4.41 s
Wall time: 5.64 s
CPU times: user 2.62 s, sys: 92 ms, total: 2.71 s
Wall time: 2.79 s

!mv {chexmix_output_dir}/benchmark/instances.stacked_seqlets.pkl {model_dir}/benchmark/instances.stacked_seqlets.pkl

mv: cannot stat '../../chipnexus/comparison/output/chexmix/benchmark/instances.stacked_seqlets.pkl': No such file or directory

ls -latr {model_dir}/benchmark

total 12305656
-rw-rw-r-- 1 avsec oak_akundaje 12138112499 Apr 15 11:40 instances.stacked_seqlets.pkl
-rw-rw---- 1 avsec oak_akundaje   462846104 Apr 16 07:06 instances.csv.gz
drwxrws--- 2 avsec oak_akundaje        4096 Apr 16 09:44 ./
drwxrws--- 8 avsec oak_akundaje        4096 May  3 14:15 ../

# Load the cached results
%time ps_motifs = read_pkl(model_dir / 'benchmark/instances.stacked_seqlets.pkl')
%time dfis = pd.read_csv(model_dir / 'benchmark/instances.csv.gz')

CPU times: user 35.2 s, sys: 23.4 s, total: 58.5 s
Wall time: 4min 12s
CPU times: user 41.8 s, sys: 2.11 s, total: 43.9 s
Wall time: 36.6 s

!du -sh {output_dir}/*

du: cannot access '{output_dir}/*': No such file or directory

Visualize the PWM match distribution¶

Sequence (PWM) instances¶

plotnine.options.figure_size = get_figsize(1)
(ggplot(aes(x='seq_match_score'), dfi_seq) + 
 geom_histogram(bins=100) + 
 facet_grid("motif ~ .") + 
 theme_classic() + 
 xlim([0, 15])
)

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 8 rows containing missing values.
  self.data = self.geom.handle_na(self.data)

<ggplot: (-9223363289620938587)>

Importance score instances¶

(ggplot(aes(x='seq_match', fill='imp_weighted_cat'), dfis) + 
 geom_histogram(bins=100) + 
 facet_grid("pattern_name ~ .") + 
 theme_classic() + 
 xlim([0, 15])
)

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:360: UserWarning: stat_bin : Removed 2969 rows containing non-finite values.
  data = self.stat.compute_layer(data, params, layout)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 42 rows containing missing values.
  self.data = self.geom.handle_na(self.data)

<ggplot: (8747192044636)>

Do the Precision-recall curve for good profile matches¶

decide on the cut-offs

TODO¶

[x] re-implement the profile quantification from p_seq
- [x] max, counts, symmetric_kl, cosine-distance
  - figure out which one to get (always sort by top 1k values)
[ ] decide on the profile cut-off
- [ ] visualize the hits for that cut-off (for both, PWM and BPNet scores)
[ ] Add motifs from MEME
[ ] Make it into paper figures

def profile_metrics(profiles, target_profile, dist_metrics=['cosine']):
    """
    
    Args:
      profiles: np.array of a profile
      target_profile: aggregated profile
    """
    from scipy.spatial.distance import cdist
    from basepair.stats import symmetric_kl
    # total counts
    total_counts = profiles.sum(axis=(1,2))
    
    # value at the maximum position
    max_positions = np.argmax(target_profile, axis=0)
    max_counts = profiles[:, max_positions, [0, 1]].sum(axis=-1)
    
    # cosine distance
    Xa = profiles.reshape((len(profiles), -1))
    Xb = target_profile[np.newaxis].reshape((1, -1))
    cosine = cdist(Xa + 1e-6, Xb + 1e-6, metric='cosine')  # cosine distance
    sym_kl = symmetric_kl(Xa.T + 1e-6, Xb.T + 1e-6)  # Make sure we don't have 0 values
    # KL divergence

    assert cosine.ndim == 2
    assert cosine.shape[1] == 1
    return pd.DataFrame({
        "counts": total_counts,
        "max": max_counts,
        "sym_kl": sym_kl,
        "cosine": cosine[:, 0]
    })

from basepair.exp.paper.config import profile_mapping

dfis['imp_weighted_cat'] = pd.Categorical(dfis['imp_weighted_cat'].astype(str), categories = ['low', 'medium', 'high'], ordered=True)

dfis.head()

from basepair.plot.profiles import multiple_plot_stranded_profile

ps_subset = p_seq['Klf4']

sort_idx = np.argsort(-ps_subset.profile['Klf4'].sum(axis=(-2, -1)))

multiple_plot_stranded_profile(ps_subset[sort_idx[:1000]].profile, figsize_tmpl=(2.5, 1.5));

ref = ps_subset[sort_idx[:1000]].profile['Klf4'].mean(axis=0)

dfm = profile_metrics(ps_subset.profile['Klf4'], ref)

dfm

dfm.counts.plot.hist(100)

<matplotlib.axes._subplots.AxesSubplot at 0x7f49d46e1f60>

dfm[dfm.counts > 2].plot.scatter('sym_kl', 'cosine', s=0.1, alpha=0.5)

<matplotlib.axes._subplots.AxesSubplot at 0x7f49d4732eb8>

dfm[dfm.counts > 2].plot.scatter('sym_kl', 'max', s=0.1, alpha=0.5)
plt.yscale("symlog")

dfm[dfm.counts > 2].plot.scatter('counts', 'max', s=0.5, alpha=0.5)
plt.yscale("symlog")
plt.xscale("symlog")

dfm[dfm.counts > 2].plot.scatter('sym_kl', 'counts', s=0.5, alpha=0.5)
plt.yscale("symlog")

plotnine.options.figure_size = get_figsize(1/3, .6)
(ggplot(aes(x='seq_match_cat', y="Oct4/profile_counts_p"), dfis[(~dfis['seq_match_cat'].isnull()) & (dfis.pattern_name == 'Oct4-Sox2')]) + 
 # geom_violin() + 
 geom_boxplot() + 
 theme_classic()
)

<ggplot: (-9223363281642180853)>

from basepair.config import test_chr

dfis

def get_profile_score(dfi, score, profile_cls):
    df = dfi[(~dfi[score].isnull())]
    return df.sort_values(score, ascending=False)[profile_cls]

motif_profile_counts = []
for motif in main_motifs:
    motif_profile_counts.append((motif, {
        "BPNet": get_profile_score(dfis[(dfis.pattern_name == motif) & dfis.example_chrom.isin(test_chr)], 'imp_weighted', profile_mapping[motif]+ "/profile_max").values,
        "PWM": get_profile_score(dfi_seq[(dfi_seq.motif == motif) & dfi_seq.example_chrom.isin(test_chr)], 'seq_match_score', profile_mapping[motif]+ "/profile_max").values,
        # "ChExMix profile": get_profile_score(dfi_chexmix[(dfi_chexmix.pattern_name == motif) & dfi_chexmix.example_chrom.isin(test_chr)], 'profile_score', profile_mapping[motif]+ "/profile_match").values,
        "ChExMix pwm": get_profile_score(dfi_chexmix[(dfi_chexmix.pattern_name == motif) & dfi_chexmix.example_chrom.isin(test_chr)], 'motif_score', profile_mapping[motif]+ "/profile_max").values,
    }))

## TODO - where is ref refined

fig, ax = plt.subplots(1,1,figsize=get_figsize(0.25))
scores = get_profile_score(dfis[(dfis.pattern_name == motif) & dfis.example_chrom.isin(test_chr)], 'imp_weighted', profile_mapping[motif]+ "/profile_max")
scores.plot.hist(np.arange(30), ax=ax)
plt.axvline(8, color='grey')
plt.xlabel("Profile score (max value)");
fig.savefig(fdir / 'profile-score.hist.pdf')

np.mean(scores > 8)

0.09404622261674539

np.percentile(scores, 90)

7.000000953674316

dfis[['pattern_name', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']]

# Remove duplicates in dfis
dfis = dfis.sort_values("imp_weighted", ascending=False)
duplicated = dfis[['pattern_name', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()
dfis = dfis[~duplicated]

# Remove duplicates in dfis
dfi_chexmix = dfi_chexmix.sort_values("profile_score", ascending=False)
duplicated = dfi_chexmix[['pattern_name', 'example_chrom', 'pattern_center_abs', 'strand']].duplicated()
dfi_chexmix = dfi_chexmix[~duplicated]

# Remove duplicates in dfis
dfi_seq = dfi_seq.sort_values("seq_match_score", ascending=False)
duplicated = dfi_seq[['pattern_name', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()
dfi_seq = dfi_seq[~duplicated]

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-21-3d411bfa287d> in <module>
      1 # Remove duplicates in dfis
      2 dfi_seq = dfi_seq.sort_values("seq_match_score", ascending=False)
----> 3 duplicated = dfi_seq[['pattern_name', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()
      4 dfi_seq = dfi_seq[~duplicated]

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2932                 key = list(key)
   2933             indexer = self.loc._convert_to_indexer(key, axis=1,
-> 2934                                                    raise_missing=True)
   2935 
   2936         # take() does not accept boolean indexers

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter, raise_missing)
   1352                 kwargs = {'raise_missing': True if is_setter else
   1353                           raise_missing}
-> 1354                 return self._get_listlike_indexer(obj, axis, **kwargs)[1]
   1355         else:
   1356             try:

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1159         self._validate_read_indexer(keyarr, indexer,
   1160                                     o._get_axis_number(axis),
-> 1161                                     raise_missing=raise_missing)
   1162         return keyarr, indexer
   1163 

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1250             if not(self.name == 'loc' and not raise_missing):
   1251                 not_found = list(set(key) - set(ax))
-> 1252                 raise KeyError("{} not in index".format(not_found))
   1253 
   1254             # we skip the warning on Categorical/Interval

KeyError: "['pattern_name'] not in index"

def get_profile_score(dfi, score, profile_cls):
    df = dfi[(~dfi[score].isnull())]
    return df.sort_values(score, ascending=False)[profile_cls]

motif_profile_counts = []
for motif in main_motifs:
    motif_profile_counts.append((motif, {
        "BPNet": get_profile_score(dfis[(dfis.pattern_name == motif) & dfis.example_chrom.isin(test_chr)], 'imp_weighted', profile_mapping[motif]+ "/profile_max").values,
        "ChExMix": get_profile_score(dfi_chexmix[(dfi_chexmix.pattern_name == motif) & dfi_chexmix.example_chrom.isin(test_chr)], 'profile_score', profile_mapping[motif]+ "/profile_max").values,
        "PWM": get_profile_score(dfi_seq[(dfi_seq.motif == motif) & dfi_seq.example_chrom.isin(test_chr)], 'seq_match_score', profile_mapping[motif]+ "/profile_max").values,
        # "ChExMix pwm": get_profile_score(dfi_chexmix[(dfi_chexmix.pattern_name == motif) & dfi_chexmix.example_chrom.isin(test_chr)], 'motif_score', profile_mapping[motif]+ "/profile_max_p").values,
    }))

# Profile scores
model = 'BPNet'
thresholds = dict()
fig, axes = plt.subplots(len(main_motifs),1,figsize=get_figsize(0.25, 1), 
                         gridspec_kw=dict(hspace=0),
                         sharex=True)
for i, motif in enumerate(main_motifs):
    ax = axes[i]
    scores = motif_profile_counts[i][1][model]
    thresholds[motif] = np.percentile(scores, 95)
    #if len(scores) == 0:
    #    continue
    ax.hist(scores, bins=np.arange(30))
    ax.axvline(thresholds[motif], color='grey')
    ax.set_ylabel(motif, rotation=0)
    if i == len(main_motifs)-1:
        ax.set_xlabel("Profile score (max value)");
    fig.savefig(fdir / 'profile-score.hist.pdf')

cutoff = 8
fig, axes = plt.subplots(1, len(main_motifs), figsize=get_figsize(1, 1/(1*len(main_motifs))), 
                         sharex=True, 
                         sharey=True, gridspec_kw=dict(wspace=0, hspace=0))
fig.subplots_adjust(wspace=.5)#, wspace=0)
max_val = 0
for i, (motif, res) in enumerate(motif_profile_counts):
    ax = axes[i]
    # top_n = min([len(x) for x in res.values()])
    top_n = 10000
    cutoff = thresholds[motif]
    for k,v in res.items():
        ax.plot(np.cumsum(v[:top_n] > cutoff), label=k)

    max_val = max(max_val, ax.get_ylim()[1])
    if i == 0:
        ax.set_ylabel("# motif instances\nwith high coverage")
    if i == 1:
        ax.set_xlabel("Top # of motif instances")
    if i == 0:
        leg = ax.legend(loc='lower right', framealpha=1, borderpad=0)
        leg.get_frame().set_linewidth(0.0)
    ax.set_title(motif)

for i, (motif, res) in enumerate(motif_profile_counts):
    axes[i].plot([0, max_val], [0, max_val], '--', color='grey', label=None)
# fig.savefig(fdir / 'footprint-recall-curve.with-ChExMix.pdf')

cutoff = 8
fig, axes = plt.subplots(1, len(main_motifs), figsize=get_figsize(1, 1/(1*len(main_motifs))), 
                         sharex=True, 
                         sharey=True, gridspec_kw=dict(wspace=0, hspace=0))
fig.subplots_adjust(wspace=.5)#, wspace=0)
max_val = 0
for i, (motif, res) in enumerate(motif_profile_counts):
    ax = axes[i]
    # top_n = min([len(x) for x in res.values()])
    top_n = 10000
    cutoff = thresholds[motif]
    for k,v in res.items():
        ax.plot(np.cumsum(v[:top_n] > cutoff), label=k)

    max_val = max(max_val, ax.get_ylim()[1])
    if i == 0:
        ax.set_ylabel("# motif instances\nwith high coverage")
    if i == 1:
        ax.set_xlabel("Top # of motif instances")
    if i == 0:
        leg = ax.legend(loc='lower right', framealpha=1, borderpad=0)
        leg.get_frame().set_linewidth(0.0)
    ax.set_title(motif)

for i, (motif, res) in enumerate(motif_profile_counts):
    axes[i].plot([0, max_val], [0, max_val], '--', color='grey', label=None)
fig.savefig(fdir / 'footprint-recall-curve.with-ChExMix.pdf')

Spearman correlation¶

from gin_train.metrics import spearmanr

def corelate_cols(dfi, score, profile_cls, metric=spearmanr):
    df = dfi[(~dfi[score].isnull())]
    return metric(dfi[profile_cls], dfi[score])

motif_spearmanr = []
for motif in main_motifs:
    
    motif_spearmanr.append({
        "spearmanr": corelate_cols(dfis[(dfis.pattern_name == motif) & dfis.example_chrom.isin(test_chr)], 'imp_weighted', profile_mapping[motif]+ "/profile_max"),
        "motif_score": "BPNet",
        "motif": motif
    })
    motif_spearmanr.append({
        "spearmanr": corelate_cols(dfi_seq[(dfi_seq.motif == motif) & dfi_seq.example_chrom.isin(test_chr)], 'seq_match_score', profile_mapping[motif]+ "/profile_max"),
        "motif_score": "PWM",
        "motif": motif
    })
    
dfm_spearman = pd.DataFrame(motif_spearmanr)
dfm_spearman['motif'] = pd.Categorical(dfm_spearman['motif'], categories=list(motifs), ordered=True)

plotnine.options.figure_size = get_figsize(.4)
fig = (ggplot(aes(x='motif', fill='motif_score', y='spearmanr'), dfm_spearman) + 
 geom_bar(stat='identity', position='dodge') + 
 theme_classic() + 
 scale_fill_brewer(type='qual', palette=3) + 
 theme(legend_position='top') + 
 ylab("Spearman rank correlation\nwith profile height")
)
fig.save(fdir / 'profile-max-correlation.bar.pdf')
fig

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.740152 x 1.693413936 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/profile-max-correlation.bar.pdf
  warn('Filename: {}'.format(filename))

<ggplot: (8746853109554)>

TODO¶

[x] make a bar-plot with spearman correlation for different TF's and differnet methods (test chromosome)
[x] fix the profile distance

plotnine.options.figure_size = get_figsize(1, .25)
(ggplot(aes(x='Oct4/profile_max_p', y="Oct4/profile_counts_p"), dfis[(~dfis['seq_match_cat'].isnull()) & (dfis.pattern_name == 'Oct4-Sox2')]) + 
 # geom_violin() + 
 geom_point(alpha=0.01, size=0.1, color='black') + 
 theme_classic()
)

<ggplot: (8743950590391)>

plotnine.options.figure_size = get_figsize(.25, 1)
fig = (ggplot(aes(x='seq_match', y="match_weighted"), 
        dfis[(~dfis['seq_match_cat'].isnull()) & (dfis.pattern_name == 'Oct4-Sox2')].sample(frac=0.1)) + 
 # geom_violin() + 
 geom_point(alpha=0.01, size=0.1, color='black') + 
 xlab("PWM match") + 
 ylab("CWM match (jaccard)") + 
 theme_classic()
)
fig.save(fdir / 'pwm_match_vs_cwm_match.pdf', raster=True)
fig

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 1.712595 x 1.712595 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/pwm_match_vs_cwm_match.pdf
  warn('Filename: {}'.format(filename))

<ggplot: (-9223363289661771811)>

plotnine.options.figure_size = get_figsize(.25, 1)
fig = (ggplot(aes(x='imp_weighted', y="match_weighted"), 
        dfis[(~dfis['seq_match_cat'].isnull()) & (dfis.pattern_name == 'Oct4-Sox2')].sample(frac=0.1)) + 
 # geom_violin() + 
 geom_point(alpha=0.01, size=0.1, color='black') + 
 xlab("Importance") + 
 ylab("CWM match (jaccard)") + 
 theme_classic()
)
fig.save(fdir / 'importance_vs_cwm_match.pdf', raster=True)
fig

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 1.712595 x 1.712595 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/importance_vs_cwm_match.pdf
  warn('Filename: {}'.format(filename))

<ggplot: (8747193213872)>

Spacing analysis¶

TODO¶

plot a big pairwise matrix
- `method + strand ~ motif_pair

from basepair.exp.chipnexus.spacing import get_motif_pairs, motif_pair_dfi

dfi_seq['pattern_name'] = dfi_seq.motif

motif_pairs = get_motif_pairs(main_motifs)
motif_pairs

[['Oct4-Sox2', 'Oct4-Sox2'],
 ['Oct4-Sox2', 'Sox2'],
 ['Oct4-Sox2', 'Nanog'],
 ['Oct4-Sox2', 'Klf4'],
 ['Sox2', 'Sox2'],
 ['Sox2', 'Nanog'],
 ['Sox2', 'Klf4'],
 ['Nanog', 'Nanog'],
 ['Nanog', 'Klf4'],
 ['Klf4', 'Klf4']]

motif_pair = ['Nanog', 'Nanog']
motif_pair_name = "<>".join(motif_pair)

threshold = 5
dfab = motif_pair_dfi(dfi_seq[dfi_seq.seq_match_score > threshold], motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)

plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / 'Nanog<>Nanog.PWM.threshold=5.pdf')
fig

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/Nanog<>Nanog.PWM.threshold=5.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)

<ggplot: (8746850227576)>

threshold = 6
dfab = motif_pair_dfi(dfi_seq[dfi_seq.seq_match_score > threshold], motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)

plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / f'Nanog<>Nanog.PWM.threshold={threshold}.pdf')
fig

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/Nanog<>Nanog.PWM.threshold=6.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)

<ggplot: (8746850231630)>

threshold = 7
dfab = motif_pair_dfi(dfi_seq[dfi_seq.seq_match_score > threshold], motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)

plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / f'Nanog<>Nanog.PWM.threshold={threshold}.pdf')
fig

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/Nanog<>Nanog.PWM.threshold=7.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)

<ggplot: (8746850223473)>

Very good matches¶

dfi_seq.seq_match_score.plot.hist(100);

Original figure¶

dfab = motif_pair_dfi(dfis.query('match_weighted_p > .2').query('imp_weighted_p > 0'), motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)

plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / 'Nanog<>Nanog.BPNet.default.pdf')
fig

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/Nanog<>Nanog.BPNet.default.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)

<ggplot: (-9223363290004483987)>

Chexmix¶

dfi_chexmix.motif_score.plot.hist(100);

dfab = motif_pair_dfi(dfi_chexmix, motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)

plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / 'Nanog<>Nanog.ChExMix.default.pdf')
fig

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/Nanog<>Nanog.ChExMix.default.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)

<ggplot: (-9223363290004543905)>

motif_score = 1
dfab = motif_pair_dfi(dfi_chexmix.query(f'motif_score > {motif_score}'), motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)

plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / f'Nanog<>Nanog.ChExMix.motif_score={motif_score}.pdf')
fig

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/Nanog<>Nanog.ChExMix.motif_score=1.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)

<ggplot: (-9223363290004516562)>

motif_score = 5
dfab = motif_pair_dfi(dfi_chexmix.query(f'motif_score > {motif_score}'), motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)

plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / f'Nanog<>Nanog.ChExMix.motif_score={motif_score}.pdf')
fig

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 2.055114 x 1.6440912 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/method-comparison/Nanog<>Nanog.ChExMix.motif_score=5.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 6 rows containing missing values.
  self.data = self.geom.handle_na(self.data)

<ggplot: (8746853501311)>

Weak affinity site analysis¶

Visualize motifs¶

t_high = 10
t_low = [3, 5]

motif = 'Sox2'

h2(motif)

patterns[motif].plot('seq');

Good matches¶

Sequence (PWM) instances¶

# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 2486

Importance score instances¶

# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 1711

Poor matches¶

Sequence (PWM) instances¶

# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 190455

Importance score instances¶

# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0])
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 14254

Poor matches but important¶

# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0]) &
                             (ps_motifs[motif].dfi.imp_weighted_p > .5)
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 2295

motif = 'Oct4-Sox2'

h2(motif)

patterns[motif].plot('seq');

Good matches¶

Sequence (PWM) instances¶

# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 5393

Importance score instances¶

# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 5275

Poor matches¶

Sequence (PWM) instances¶

# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 179424

Importance score instances¶

# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0])
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 10936

Poor matches but important¶

# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0]) &
                             (ps_motifs[motif].dfi.imp_weighted_p > .5)
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 141

motif = 'Nanog'

h2(motif)

patterns[motif].plot('seq');

t_high = 8
t_low = [3, 5]

Good matches¶

Sequence (PWM) instances¶

# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 4193

Importance score instances¶

# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 2559

Poor matches¶

Sequence (PWM) instances¶

# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 472663

Importance score instances¶

# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0])
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 54548

Poor matches but important¶

# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0]) &
                             (ps_motifs[motif].dfi.imp_weighted_p > .5)
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 10163

t_high = 10
t_low = [3, 5]

motif = 'Klf4'

h2(motif)

patterns[motif].plot('seq');

t_high = 8
t_low = [3, 5]

Good matches¶

Sequence (PWM) instances¶

# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 47921

Importance score instances¶

# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 35561

Poor matches¶

Sequence (PWM) instances¶

# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 258975

Importance score instances¶

# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0])
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 12296

Poor matches but important¶

# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) & 
                             (ps_motifs[motif].dfi.seq_match > t_low[0]) &
                             (ps_motifs[motif].dfi.imp_weighted_p > .5)
                            ]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);

# locations: 278

Scratch space¶

Fimo motifs¶

melanie_motifs = {"nanog": "Nanog",
                  "sox2": "Sox2",
                  "oct4": "Oct4-Sox2",
                  "klf4": "Klf4"}

dff = pd.concat([pd.read_csv(f"ftp://ftp.stowers.org/pub/mw2098/for_ziga/FIMO_results/{k}_top1000peaks_window201bp/fimo.tsv",
                             sep='\t', comment='#').assign(motif=v)
                 for k,v in melanie_motifs.items()])

dff = dff.dropna()

dff.groupby("motif").size()

motif
Klf4          96871
Nanog        100066
Oct4-Sox2     55364
Sox2         138543
dtype: int64

from concise.preprocessing import encodeDNA
from concise.utils.pwm import PWM


def make_pwm(seqs, strand):
    return np.concatenate([encodeDNA(list(seqs[strand == '+'])),
                           encodeDNA(list(seqs[strand == '-']))[:, ::-1, ::-1]], axis=0)

pwms = {m: PWM(encodeDNA(list(dfs.matched_sequence.str.upper())).mean(0) + 0.0001, name=m) for m,dfs in dff.groupby("motif")}
pwms

for k,v in pwms.items():
    v.plotPWMInfo(figsize=get_figsize(1, 1/5))
    plt.title(k)

dff.head()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-182-c84b700d8754> in <module>
----> 1 dff.head()

NameError: name 'dff' is not defined

from concise.preprocessing import encodeDNA

dff.motif.unique()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-181-a66f5c7e3f9b> in <module>
----> 1 dff.motif.unique()

NameError: name 'dff' is not defined

dff['p-value'].plot.hist(30)

<matplotlib.axes._subplots.AxesSubplot at 0x7fd1e70a04e0>

dff = dff.dropna()

dff.shape

(96871, 10)

dff.matched_sequence

0        GCCACACCCTGCCCC
1        GCCACACCCTGCCCC
2        GCCACACCCTGCCCC
              ...       
96868    CCCACACCCAGCCTA
96869    ACCCCACCCCGCCGC
96870    GCCCCACCCTTTGTC
Name: matched_sequence, Length: 96871, dtype: object

pwm = encodeDNA(dff.matched_sequence.str.upper(), ).mean(axis=0)

pwm = PWM(pwm + 0.001, )  # add some pseudo-counts

pwm.pwm = pwm.pwm[::-1, ::-1]  # Tc

pwm.shape

(15, 4)

pwm.plotPWMInfo()

	example_chrom	example_end	example_idx	example_interval_from_task	example_start	example_strand	imp/Klf4	imp/Nanog	imp/Oct4	imp/Sox2	imp_max	imp_max_task	imp_weighted	imp_weighted_cat	imp_weighted_p	match/Klf4	match/Nanog	match/Oct4	match/Sox2	match_max	match_max_task	match_weighted	match_weighted_cat	match_weighted_p	pattern	pattern_center	pattern_end	pattern_end_abs	pattern_len	pattern_name	pattern_short	pattern_start	pattern_start_abs	seq_match	seq_match_cat	seq_match_p	strand	tf	id	Klf4/profile_match	Klf4/profile_match_p	Klf4/profile_counts	Klf4/profile_counts_p	Klf4/profile_max	Klf4/profile_max_p	Klf4/profile_counts_max_ref	Klf4/profile_counts_max_ref_p	Nanog/profile_match	Nanog/profile_match_p	Nanog/profile_counts	Nanog/profile_counts_p	Nanog/profile_max	Nanog/profile_max_p	Nanog/profile_counts_max_ref	Nanog/profile_counts_max_ref_p	Oct4/profile_match	Oct4/profile_match_p	Oct4/profile_counts	Oct4/profile_counts_p	Oct4/profile_max	Oct4/profile_max_p	Oct4/profile_counts_max_ref	Oct4/profile_counts_max_ref_p	Sox2/profile_match	Sox2/profile_match_p	Sox2/profile_counts	Sox2/profile_counts_p	Sox2/profile_max	Sox2/profile_max_p	Sox2/profile_counts_max_ref	Sox2/profile_counts_max_ref_p
0	chr9	3002633	0	Oct4	3001633	.	NaN	NaN	0.0448	NaN	0.0448	Oct4	0.0448	low	0.0002	NaN	NaN	0.2644	NaN	0.2644	Oct4	0.2644	low	0.0043	Oct4/metacluster_0/pa...	232	240	3001873	16	Oct4-Sox2	Oct4/m0_p0	224	3001857	2.3456	low	0.0021	+	Oct4	0	0.0225	0.0047	0.0001	0.0047	1.0000e-06	0.1831	2.0000e-06	0.7681	0.0390	0.0038	0.0001	0.0038	1.0000e-06	0.0842	2.0000e-06	0.5260	0.0611	0.0049	0.0001	0.0049	1.0000e-06	0.0303	2.0000e-06	0.3723	0.1121	0.0060	0.0001	0.0061	1.0000e-06	0.2067	2.0000e-06	0.6075
1	chr3	122146077	1	Oct4	122145077	.	NaN	NaN	0.9439	NaN	0.9439	Oct4	0.9439	high	0.8507	NaN	NaN	0.2953	NaN	0.2953	Oct4	0.2953	low	0.0106	Oct4/metacluster_0/pa...	428	436	122145513	16	Oct4-Sox2	Oct4/m0_p0	420	122145497	1.8069	low	0.0014	+	Oct4	1	5.2561	0.3833	39.0001	0.6100	2.0000e+00	0.5320	2.0000e-06	0.7681	2.8647	0.2738	2572.0000	0.9958	2.8100e+02	0.9993	1.0400e+02	0.9967	1.7678	0.0818	2258.0000	0.9998	9.3000e+01	0.9994	8.4000e+01	0.9976	2.4488	0.0228	424.0001	0.9979	2.2000e+01	0.9934	4.0000e+00	0.8520
2	chr3	122146077	1	Oct4	122145077	.	NaN	NaN	0.9374	NaN	0.9374	Oct4	0.9374	high	0.8462	NaN	NaN	0.5077	NaN	0.5077	Oct4	0.5077	high	0.6886	Oct4/metacluster_0/pa...	430	438	122145515	16	Oct4-Sox2	Oct4/m0_p0	422	122145499	10.2722	high	0.6902	-	Oct4	2	5.3935	0.4228	44.0001	0.6462	4.0000e+00	0.8385	2.0000e-06	0.7681	3.1092	0.3125	2726.0000	0.9965	2.8100e+02	0.9993	4.8000e+01	0.9884	1.5704	0.0673	2310.0000	0.9998	9.3000e+01	0.9994	6.0000e+01	0.9934	2.4112	0.0222	443.0001	0.9982	2.2000e+01	0.9934	9.0000e+00	0.9520
3	chr3	122146077	1	Oct4	122145077	.	NaN	NaN	1.5355	NaN	1.5355	Oct4	1.5355	high	0.9920	NaN	NaN	0.5287	NaN	0.5287	Oct4	0.5287	high	0.7992	Oct4/metacluster_0/pa...	451	459	122145536	16	Oct4-Sox2	Oct4/m0_p0	443	122145520	11.5528	high	0.9044	-	Oct4	3	4.9558	0.3114	59.0001	0.7495	4.0000e+00	0.8385	2.0000e-06	0.7681	3.8785	0.4346	3366.0000	0.9981	2.8100e+02	0.9993	1.0000e+00	0.5260	3.5481	0.3387	2339.0000	0.9999	1.0400e+02	0.9996	1.0000e+00	0.3723	4.3891	0.1790	537.0001	0.9988	2.7000e+01	0.9960	2.0000e-06	0.6075
4	chr3	122146077	1	Oct4	122145077	.	NaN	NaN	1.0385	NaN	1.0385	Oct4	1.0385	high	0.9024	NaN	NaN	0.2392	NaN	0.2392	Oct4	0.2392	low	0.0028	Oct4/metacluster_0/pa...	469	477	122145554	16	Oct4-Sox2	Oct4/m0_p0	461	122145538	5.0157	low	0.0179	+	Oct4	4	4.8723	0.2939	59.0001	0.7467	4.0000e+00	0.8385	1.0000e+00	0.7681	3.2607	0.3358	3721.0000	0.9987	2.8100e+02	0.9993	4.8000e+01	0.9884	3.2816	0.2846	2455.0000	0.9999	1.0400e+02	0.9996	2.0000e+00	0.4958	4.0302	0.1345	631.0001	0.9994	2.7000e+01	0.9960	2.0000e-06	0.6075

	counts	max	sym_kl	cosine
0	8.0	1.0	6.5650	0.7727
1	7.0	0.0	6.5187	0.7663
2	13.0	0.0	6.1979	0.6819
...	...	...	...	...
1510952	6.0	0.0	6.6950	0.8125
1510953	9.0	1.0	6.3318	0.7112
1510954	11.0	0.0	6.3736	0.7246

	example_chrom	example_end	example_idx	example_interval_from_task	example_start	example_strand	imp/Klf4	imp/Nanog	imp/Oct4	imp/Sox2	imp_max	imp_max_task	imp_weighted	imp_weighted_cat	imp_weighted_p	match/Klf4	match/Nanog	match/Oct4	match/Sox2	match_max	match_max_task	match_weighted	match_weighted_cat	match_weighted_p	pattern	pattern_center	pattern_end	pattern_end_abs	pattern_len	pattern_name	pattern_short	pattern_start	pattern_start_abs	seq_match	seq_match_cat	seq_match_p	strand	tf	id	Klf4/profile_match	Klf4/profile_match_p	Klf4/profile_counts	Klf4/profile_counts_p	Klf4/profile_max	Klf4/profile_max_p	Klf4/profile_counts_max_ref	Klf4/profile_counts_max_ref_p	Nanog/profile_match	Nanog/profile_match_p	Nanog/profile_counts	Nanog/profile_counts_p	Nanog/profile_max	Nanog/profile_max_p	Nanog/profile_counts_max_ref	Nanog/profile_counts_max_ref_p	Oct4/profile_match	Oct4/profile_match_p	Oct4/profile_counts	Oct4/profile_counts_p	Oct4/profile_max	Oct4/profile_max_p	Oct4/profile_counts_max_ref	Oct4/profile_counts_max_ref_p	Sox2/profile_match	Sox2/profile_match_p	Sox2/profile_counts	Sox2/profile_counts_p	Sox2/profile_max	Sox2/profile_max_p	Sox2/profile_counts_max_ref	Sox2/profile_counts_max_ref_p
0	chr9	3002633	0	Oct4	3001633	.	NaN	NaN	0.0448	NaN	0.0448	Oct4	0.0448	low	0.0002	NaN	NaN	0.2644	NaN	0.2644	Oct4	0.2644	low	0.0043	Oct4/metacluster_0/pa...	232	240	3001873	16	Oct4-Sox2	Oct4/m0_p0	224	3001857	2.3456	low	0.0021	+	Oct4	0	0.0225	0.0047	0.0001	0.0047	1.0000e-06	0.1831	2.0000e-06	0.7681	0.0390	0.0038	0.0001	0.0038	1.0000e-06	0.0842	2.0000e-06	0.5260	0.0611	0.0049	0.0001	0.0049	1.0000e-06	0.0303	2.0000e-06	0.3723	0.1121	0.0060	0.0001	0.0061	1.0000e-06	0.2067	2.0000e-06	0.6075
1	chr3	122146077	1	Oct4	122145077	.	NaN	NaN	0.9439	NaN	0.9439	Oct4	0.9439	high	0.8507	NaN	NaN	0.2953	NaN	0.2953	Oct4	0.2953	low	0.0106	Oct4/metacluster_0/pa...	428	436	122145513	16	Oct4-Sox2	Oct4/m0_p0	420	122145497	1.8069	low	0.0014	+	Oct4	1	5.2561	0.3833	39.0001	0.6100	2.0000e+00	0.5320	2.0000e-06	0.7681	2.8647	0.2738	2572.0000	0.9958	2.8100e+02	0.9993	1.0400e+02	0.9967	1.7678	0.0818	2258.0000	0.9998	9.3000e+01	0.9994	8.4000e+01	0.9976	2.4488	0.0228	424.0001	0.9979	2.2000e+01	0.9934	4.0000e+00	0.8520
2	chr3	122146077	1	Oct4	122145077	.	NaN	NaN	0.9374	NaN	0.9374	Oct4	0.9374	high	0.8462	NaN	NaN	0.5077	NaN	0.5077	Oct4	0.5077	high	0.6886	Oct4/metacluster_0/pa...	430	438	122145515	16	Oct4-Sox2	Oct4/m0_p0	422	122145499	10.2722	high	0.6902	-	Oct4	2	5.3935	0.4228	44.0001	0.6462	4.0000e+00	0.8385	2.0000e-06	0.7681	3.1092	0.3125	2726.0000	0.9965	2.8100e+02	0.9993	4.8000e+01	0.9884	1.5704	0.0673	2310.0000	0.9998	9.3000e+01	0.9994	6.0000e+01	0.9934	2.4112	0.0222	443.0001	0.9982	2.2000e+01	0.9934	9.0000e+00	0.9520
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2357835	chr13	101785985	147973	Klf4	101784985	.	0.1612	NaN	NaN	NaN	0.1612	Klf4	0.1612	low	0.0049	0.5796	NaN	NaN	NaN	0.5796	Klf4	0.5796	low	0.1028	Klf4/metacluster_0/pa...	84	89	101785074	10	Klf4	Klf4/m0_p0	79	101785064	5.6424	low	0.0609	+	Klf4	2357835	6.5724	0.9484	7.0001	0.0634	1.0000e+00	0.1455	1.0000e+00	0.7716	6.5273	0.9315	9.0001	0.0894	1.0000e+00	0.1645	2.0000e-06	0.6936	6.3493	0.9397	10.0001	0.0406	1.0000e+00	0.0620	2.0000e-06	0.7011	6.5499	0.7909	8.0001	0.2300	1.0000e+00	0.3974	2.0000e-06	0.9172
2357836	chr13	101785985	147973	Klf4	101784985	.	0.1716	NaN	NaN	NaN	0.1716	Klf4	0.1716	low	0.0119	0.6673	NaN	NaN	NaN	0.6673	Klf4	0.6673	medium	0.4704	Klf4/metacluster_0/pa...	275	280	101785265	10	Klf4	Klf4/m0_p0	270	101785255	8.6962	medium	0.4549	+	Klf4	2357836	6.3685	0.8627	11.0001	0.1342	1.0000e+00	0.1455	2.0000e-06	0.7716	6.4790	0.9106	11.0001	0.1401	2.0000e+00	0.4399	2.0000e-06	0.6936	6.0341	0.8601	16.0001	0.0950	1.0000e+00	0.0620	2.0000e-06	0.7011	6.5535	0.7955	6.0001	0.1183	1.0000e+00	0.3974	2.0000e-06	0.9172
2357837	chr13	101785985	147973	Klf4	101784985	.	0.1189	NaN	NaN	NaN	0.1189	Klf4	0.1189	low	0.0004	0.5369	NaN	NaN	NaN	0.5369	Klf4	0.5369	low	0.0494	Klf4/metacluster_0/pa...	872	877	101785862	10	Klf4	Klf4/m0_p0	867	101785852	3.8225	low	0.0103	+	Klf4	2357837	5.5081	0.5507	32.0001	0.4303	2.0000e+00	0.4441	2.0000e+00	0.8699	6.1054	0.7263	18.0001	0.2866	2.0000e+00	0.4399	1.0000e+00	0.6936	5.1945	0.5322	40.0001	0.4060	3.0000e+00	0.5786	2.0000e-06	0.7011	6.4968	0.7322	8.0001	0.2300	1.0000e+00	0.3974	2.0000e-06	0.9172

	pattern_name	example_chrom	pattern_start_abs	pattern_end_abs	strand
0	Oct4-Sox2	chr9	3001857	3001873	+
1	Oct4-Sox2	chr3	122145497	122145513	+
2	Oct4-Sox2	chr3	122145499	122145515	-
...	...	...	...	...	...
2357835	Klf4	chr13	101785064	101785074	+
2357836	Klf4	chr13	101785255	101785265	+
2357837	Klf4	chr13	101785852	101785862	+