from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'
motifs = OrderedDict([
("Oct4-Sox2", 'Oct4/m0_p0'),
("Oct4", 'Oct4/m0_p1'),
# ("Strange-sym-motif", 'Oct4/m0_p5'),
("Sox2", 'Sox2/m0_p1'),
("Nanog", 'Nanog/m0_p1'),
("Zic3", 'Nanog/m0_p2'),
("Nanog-partner", 'Nanog/m0_p4'),
("Klf4", 'Klf4/m0_p0'),
])
motifs_chexmix = OrderedDict([
("Oct4-Sox2", 'Oct4/1'),
("Sox2", 'Sox2/1'),
("Nanog", 'Nanog/1'),
("Klf4", 'Klf4/1'),
])
main_motifs = ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']
What fraction of the peaks can we explain? E.g. compute the overlap between
seqlets (un-clustered)
Q: How to get this number?
There could be a possible problem when excluding seqlets...
What are the two nearest peakxus peaks?
# Imports
from basepair.imports import *
from basepair.exp.paper.config import tasks
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix, dfi_filter_valid, dfi_add_ranges, annotate_profile
from basepair.cli.imp_score import ImpScoreFile
from basepair.modisco.core import Pattern
from basepair.preproc import dfint_no_intersection
hv.extension('bokeh')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
paper_config()
interval_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']
# load all the required data
# mr = ModiscoResult(modisco_dir / "modisco.h5")
# mr.open()
# imp_scores = ImpScoreFile.from_modisco_dir(modisco_dir)
# imp_scores.cache() # load all the scores
# ranges = imp_scores.get_ranges()
# ranges.columns = ["example_" + v for v in ranges.columns]
# seqs = imp_scores.get_seq()
# profiles = imp_scores.get_profiles()
from basepair.exp.paper.config import models_dir
!mkdir -p {ddir}/figures/method-comparison
# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/method-comparison')
%config InlineBackend.figure_format = 'retina'
paper_config()
profile_width = 70
seqlen = 1000
instance_parq_paths = {t: model_dir / f'deeplift/{t}/out/{imp_score}/instances.parq'
for t in tasks}
from basepair.modisco.pattern_instances import (multiple_load_instances, load_instances, filter_nonoverlapping_intervals,
plot_coocurence_matrix, align_instance_center, dfi2seqlets, annotate_profile)
mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
for t in tasks})
# Exclude TEs
def shorten_te_pattern(s):
tf, p = s.split("/", 1)
return tf + "/" + shorten_pattern(p)
from basepair.utils import flatten
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac
comparison_dir = Path('../../chipnexus/comparison/output')
chexmix_motifs = flatten({tf: read_transfac(comparison_dir / f"chexmix/{tf}/intermediate-results/{tf}.experiment.transfac")
for tf in tasks}, separator='/')
chexmix_patterns = [Pattern('chexmix/' + k, v.pwm, dict(), dict())
for k,v in chexmix_motifs.items()]
from basepair.plot.utils import seqlogo_clean
import matplotlib as mpl
list(map(len, chexmix_patterns))
p = chexmix_patterns[0]
from concise.utils.plot import seqlogo
fig, axes = plt.subplots(len(chexmix_patterns), 1,
sharex=True, sharey=True,
figsize=get_figsize(.25, len(chexmix_patterns)/3))
for i, p in enumerate(chexmix_patterns):
pwm = p.resize(16).get_seq_ic()
ax = axes[i]
seqlogo(pwm, ax=axes[i])
# p.resize(16).plot('seq_ic', letter_width=.1, height=0.5);
ax.xaxis.set_major_locator(mpl.ticker.AutoLocator())
_, tf, n = p.name.split("/")
ax.set_ylabel(f"{tf}/{n}", rotation=0,
multialignment='center',
ha='right', labelpad=5)
ax.set_ylim([0, 2])
sns.despine(bottom=True, top=True, right=True)
fig.savefig(fdir / 'ChExMix.motifs.pdf')
# plot them
patterns = {m: mr.get_pattern(longer_pattern(p)).trim_seq_ic(0.08)
for m, p in motifs.items()}
for tf, p in patterns.items():
p.plot("seq");
plt.title(tf)
len(dfi_chexmix)
Support the following operations:
dfi_filter_validpattern_centerextract_dfi (dfi_row2seqlet)example_idxpattern_startpattern_endpatternstrand%tqdm_restart
# Store the results
chexmix_output_dir = Path("../../chipnexus/comparison/output/chexmix/")
# Load the cached results
%time p_seq = read_pkl(chexmix_output_dir / 'pwm_instances.stacked_seqlets.pkl')
%time dfi_seq = pd.read_csv(chexmix_output_dir / 'pwm_instances.csv.gz')
# Load the cached results
%time ps_chexmix_motifs = read_pkl(chexmix_output_dir / 'chexmix.stacked_seqlets.pkl')
%time dfi_chexmix = pd.read_csv(chexmix_output_dir / 'chexmix.csv.gz')
!mv {chexmix_output_dir}/benchmark/instances.stacked_seqlets.pkl {model_dir}/benchmark/instances.stacked_seqlets.pkl
ls -latr {model_dir}/benchmark
# Load the cached results
%time ps_motifs = read_pkl(model_dir / 'benchmark/instances.stacked_seqlets.pkl')
%time dfis = pd.read_csv(model_dir / 'benchmark/instances.csv.gz')
!du -sh {output_dir}/*
plotnine.options.figure_size = get_figsize(1)
(ggplot(aes(x='seq_match_score'), dfi_seq) +
geom_histogram(bins=100) +
facet_grid("motif ~ .") +
theme_classic() +
xlim([0, 15])
)
(ggplot(aes(x='seq_match', fill='imp_weighted_cat'), dfis) +
geom_histogram(bins=100) +
facet_grid("pattern_name ~ .") +
theme_classic() +
xlim([0, 15])
)
p_seqdef profile_metrics(profiles, target_profile, dist_metrics=['cosine']):
"""
Args:
profiles: np.array of a profile
target_profile: aggregated profile
"""
from scipy.spatial.distance import cdist
from basepair.stats import symmetric_kl
# total counts
total_counts = profiles.sum(axis=(1,2))
# value at the maximum position
max_positions = np.argmax(target_profile, axis=0)
max_counts = profiles[:, max_positions, [0, 1]].sum(axis=-1)
# cosine distance
Xa = profiles.reshape((len(profiles), -1))
Xb = target_profile[np.newaxis].reshape((1, -1))
cosine = cdist(Xa + 1e-6, Xb + 1e-6, metric='cosine') # cosine distance
sym_kl = symmetric_kl(Xa.T + 1e-6, Xb.T + 1e-6) # Make sure we don't have 0 values
# KL divergence
assert cosine.ndim == 2
assert cosine.shape[1] == 1
return pd.DataFrame({
"counts": total_counts,
"max": max_counts,
"sym_kl": sym_kl,
"cosine": cosine[:, 0]
})
from basepair.exp.paper.config import profile_mapping
dfis['imp_weighted_cat'] = pd.Categorical(dfis['imp_weighted_cat'].astype(str), categories = ['low', 'medium', 'high'], ordered=True)
dfis.head()
from basepair.plot.profiles import multiple_plot_stranded_profile
ps_subset = p_seq['Klf4']
sort_idx = np.argsort(-ps_subset.profile['Klf4'].sum(axis=(-2, -1)))
multiple_plot_stranded_profile(ps_subset[sort_idx[:1000]].profile, figsize_tmpl=(2.5, 1.5));
ref = ps_subset[sort_idx[:1000]].profile['Klf4'].mean(axis=0)
dfm = profile_metrics(ps_subset.profile['Klf4'], ref)
dfm
dfm.counts.plot.hist(100)
dfm[dfm.counts > 2].plot.scatter('sym_kl', 'cosine', s=0.1, alpha=0.5)
dfm[dfm.counts > 2].plot.scatter('sym_kl', 'max', s=0.1, alpha=0.5)
plt.yscale("symlog")
dfm[dfm.counts > 2].plot.scatter('counts', 'max', s=0.5, alpha=0.5)
plt.yscale("symlog")
plt.xscale("symlog")
dfm[dfm.counts > 2].plot.scatter('sym_kl', 'counts', s=0.5, alpha=0.5)
plt.yscale("symlog")
plotnine.options.figure_size = get_figsize(1/3, .6)
(ggplot(aes(x='seq_match_cat', y="Oct4/profile_counts_p"), dfis[(~dfis['seq_match_cat'].isnull()) & (dfis.pattern_name == 'Oct4-Sox2')]) +
# geom_violin() +
geom_boxplot() +
theme_classic()
)
from basepair.config import test_chr
dfis
def get_profile_score(dfi, score, profile_cls):
df = dfi[(~dfi[score].isnull())]
return df.sort_values(score, ascending=False)[profile_cls]
motif_profile_counts = []
for motif in main_motifs:
motif_profile_counts.append((motif, {
"BPNet": get_profile_score(dfis[(dfis.pattern_name == motif) & dfis.example_chrom.isin(test_chr)], 'imp_weighted', profile_mapping[motif]+ "/profile_max").values,
"PWM": get_profile_score(dfi_seq[(dfi_seq.motif == motif) & dfi_seq.example_chrom.isin(test_chr)], 'seq_match_score', profile_mapping[motif]+ "/profile_max").values,
# "ChExMix profile": get_profile_score(dfi_chexmix[(dfi_chexmix.pattern_name == motif) & dfi_chexmix.example_chrom.isin(test_chr)], 'profile_score', profile_mapping[motif]+ "/profile_match").values,
"ChExMix pwm": get_profile_score(dfi_chexmix[(dfi_chexmix.pattern_name == motif) & dfi_chexmix.example_chrom.isin(test_chr)], 'motif_score', profile_mapping[motif]+ "/profile_max").values,
}))
## TODO - where is ref refined
fig, ax = plt.subplots(1,1,figsize=get_figsize(0.25))
scores = get_profile_score(dfis[(dfis.pattern_name == motif) & dfis.example_chrom.isin(test_chr)], 'imp_weighted', profile_mapping[motif]+ "/profile_max")
scores.plot.hist(np.arange(30), ax=ax)
plt.axvline(8, color='grey')
plt.xlabel("Profile score (max value)");
fig.savefig(fdir / 'profile-score.hist.pdf')
np.mean(scores > 8)
np.percentile(scores, 90)
dfis[['pattern_name', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']]
# Remove duplicates in dfis
dfis = dfis.sort_values("imp_weighted", ascending=False)
duplicated = dfis[['pattern_name', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()
dfis = dfis[~duplicated]
# Remove duplicates in dfis
dfi_chexmix = dfi_chexmix.sort_values("profile_score", ascending=False)
duplicated = dfi_chexmix[['pattern_name', 'example_chrom', 'pattern_center_abs', 'strand']].duplicated()
dfi_chexmix = dfi_chexmix[~duplicated]
# Remove duplicates in dfis
dfi_seq = dfi_seq.sort_values("seq_match_score", ascending=False)
duplicated = dfi_seq[['pattern_name', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()
dfi_seq = dfi_seq[~duplicated]
def get_profile_score(dfi, score, profile_cls):
df = dfi[(~dfi[score].isnull())]
return df.sort_values(score, ascending=False)[profile_cls]
motif_profile_counts = []
for motif in main_motifs:
motif_profile_counts.append((motif, {
"BPNet": get_profile_score(dfis[(dfis.pattern_name == motif) & dfis.example_chrom.isin(test_chr)], 'imp_weighted', profile_mapping[motif]+ "/profile_max").values,
"ChExMix": get_profile_score(dfi_chexmix[(dfi_chexmix.pattern_name == motif) & dfi_chexmix.example_chrom.isin(test_chr)], 'profile_score', profile_mapping[motif]+ "/profile_max").values,
"PWM": get_profile_score(dfi_seq[(dfi_seq.motif == motif) & dfi_seq.example_chrom.isin(test_chr)], 'seq_match_score', profile_mapping[motif]+ "/profile_max").values,
# "ChExMix pwm": get_profile_score(dfi_chexmix[(dfi_chexmix.pattern_name == motif) & dfi_chexmix.example_chrom.isin(test_chr)], 'motif_score', profile_mapping[motif]+ "/profile_max_p").values,
}))
# Profile scores
model = 'BPNet'
thresholds = dict()
fig, axes = plt.subplots(len(main_motifs),1,figsize=get_figsize(0.25, 1),
gridspec_kw=dict(hspace=0),
sharex=True)
for i, motif in enumerate(main_motifs):
ax = axes[i]
scores = motif_profile_counts[i][1][model]
thresholds[motif] = np.percentile(scores, 95)
#if len(scores) == 0:
# continue
ax.hist(scores, bins=np.arange(30))
ax.axvline(thresholds[motif], color='grey')
ax.set_ylabel(motif, rotation=0)
if i == len(main_motifs)-1:
ax.set_xlabel("Profile score (max value)");
fig.savefig(fdir / 'profile-score.hist.pdf')
cutoff = 8
fig, axes = plt.subplots(1, len(main_motifs), figsize=get_figsize(1, 1/(1*len(main_motifs))),
sharex=True,
sharey=True, gridspec_kw=dict(wspace=0, hspace=0))
fig.subplots_adjust(wspace=.5)#, wspace=0)
max_val = 0
for i, (motif, res) in enumerate(motif_profile_counts):
ax = axes[i]
# top_n = min([len(x) for x in res.values()])
top_n = 10000
cutoff = thresholds[motif]
for k,v in res.items():
ax.plot(np.cumsum(v[:top_n] > cutoff), label=k)
max_val = max(max_val, ax.get_ylim()[1])
if i == 0:
ax.set_ylabel("# motif instances\nwith high coverage")
if i == 1:
ax.set_xlabel("Top # of motif instances")
if i == 0:
leg = ax.legend(loc='lower right', framealpha=1, borderpad=0)
leg.get_frame().set_linewidth(0.0)
ax.set_title(motif)
for i, (motif, res) in enumerate(motif_profile_counts):
axes[i].plot([0, max_val], [0, max_val], '--', color='grey', label=None)
# fig.savefig(fdir / 'footprint-recall-curve.with-ChExMix.pdf')
cutoff = 8
fig, axes = plt.subplots(1, len(main_motifs), figsize=get_figsize(1, 1/(1*len(main_motifs))),
sharex=True,
sharey=True, gridspec_kw=dict(wspace=0, hspace=0))
fig.subplots_adjust(wspace=.5)#, wspace=0)
max_val = 0
for i, (motif, res) in enumerate(motif_profile_counts):
ax = axes[i]
# top_n = min([len(x) for x in res.values()])
top_n = 10000
cutoff = thresholds[motif]
for k,v in res.items():
ax.plot(np.cumsum(v[:top_n] > cutoff), label=k)
max_val = max(max_val, ax.get_ylim()[1])
if i == 0:
ax.set_ylabel("# motif instances\nwith high coverage")
if i == 1:
ax.set_xlabel("Top # of motif instances")
if i == 0:
leg = ax.legend(loc='lower right', framealpha=1, borderpad=0)
leg.get_frame().set_linewidth(0.0)
ax.set_title(motif)
for i, (motif, res) in enumerate(motif_profile_counts):
axes[i].plot([0, max_val], [0, max_val], '--', color='grey', label=None)
fig.savefig(fdir / 'footprint-recall-curve.with-ChExMix.pdf')
from gin_train.metrics import spearmanr
def corelate_cols(dfi, score, profile_cls, metric=spearmanr):
df = dfi[(~dfi[score].isnull())]
return metric(dfi[profile_cls], dfi[score])
motif_spearmanr = []
for motif in main_motifs:
motif_spearmanr.append({
"spearmanr": corelate_cols(dfis[(dfis.pattern_name == motif) & dfis.example_chrom.isin(test_chr)], 'imp_weighted', profile_mapping[motif]+ "/profile_max"),
"motif_score": "BPNet",
"motif": motif
})
motif_spearmanr.append({
"spearmanr": corelate_cols(dfi_seq[(dfi_seq.motif == motif) & dfi_seq.example_chrom.isin(test_chr)], 'seq_match_score', profile_mapping[motif]+ "/profile_max"),
"motif_score": "PWM",
"motif": motif
})
dfm_spearman = pd.DataFrame(motif_spearmanr)
dfm_spearman['motif'] = pd.Categorical(dfm_spearman['motif'], categories=list(motifs), ordered=True)
plotnine.options.figure_size = get_figsize(.4)
fig = (ggplot(aes(x='motif', fill='motif_score', y='spearmanr'), dfm_spearman) +
geom_bar(stat='identity', position='dodge') +
theme_classic() +
scale_fill_brewer(type='qual', palette=3) +
theme(legend_position='top') +
ylab("Spearman rank correlation\nwith profile height")
)
fig.save(fdir / 'profile-max-correlation.bar.pdf')
fig
plotnine.options.figure_size = get_figsize(1, .25)
(ggplot(aes(x='Oct4/profile_max_p', y="Oct4/profile_counts_p"), dfis[(~dfis['seq_match_cat'].isnull()) & (dfis.pattern_name == 'Oct4-Sox2')]) +
# geom_violin() +
geom_point(alpha=0.01, size=0.1, color='black') +
theme_classic()
)
plotnine.options.figure_size = get_figsize(.25, 1)
fig = (ggplot(aes(x='seq_match', y="match_weighted"),
dfis[(~dfis['seq_match_cat'].isnull()) & (dfis.pattern_name == 'Oct4-Sox2')].sample(frac=0.1)) +
# geom_violin() +
geom_point(alpha=0.01, size=0.1, color='black') +
xlab("PWM match") +
ylab("CWM match (jaccard)") +
theme_classic()
)
fig.save(fdir / 'pwm_match_vs_cwm_match.pdf', raster=True)
fig
plotnine.options.figure_size = get_figsize(.25, 1)
fig = (ggplot(aes(x='imp_weighted', y="match_weighted"),
dfis[(~dfis['seq_match_cat'].isnull()) & (dfis.pattern_name == 'Oct4-Sox2')].sample(frac=0.1)) +
# geom_violin() +
geom_point(alpha=0.01, size=0.1, color='black') +
xlab("Importance") +
ylab("CWM match (jaccard)") +
theme_classic()
)
fig.save(fdir / 'importance_vs_cwm_match.pdf', raster=True)
fig
from basepair.exp.chipnexus.spacing import get_motif_pairs, motif_pair_dfi
dfi_seq['pattern_name'] = dfi_seq.motif
motif_pairs = get_motif_pairs(main_motifs)
motif_pairs
motif_pair = ['Nanog', 'Nanog']
motif_pair_name = "<>".join(motif_pair)
threshold = 5
dfab = motif_pair_dfi(dfi_seq[dfi_seq.seq_match_score > threshold], motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) +
geom_vline(xintercept=10, alpha=0.1) +
geom_vline(xintercept=20, alpha=0.1) +
geom_vline(xintercept=30, alpha=0.1) +
geom_vline(xintercept=40, alpha=0.1) +
geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([0, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / 'Nanog<>Nanog.PWM.threshold=5.pdf')
fig
threshold = 6
dfab = motif_pair_dfi(dfi_seq[dfi_seq.seq_match_score > threshold], motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) +
geom_vline(xintercept=10, alpha=0.1) +
geom_vline(xintercept=20, alpha=0.1) +
geom_vline(xintercept=30, alpha=0.1) +
geom_vline(xintercept=40, alpha=0.1) +
geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([0, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / f'Nanog<>Nanog.PWM.threshold={threshold}.pdf')
fig
threshold = 7
dfab = motif_pair_dfi(dfi_seq[dfi_seq.seq_match_score > threshold], motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) +
geom_vline(xintercept=10, alpha=0.1) +
geom_vline(xintercept=20, alpha=0.1) +
geom_vline(xintercept=30, alpha=0.1) +
geom_vline(xintercept=40, alpha=0.1) +
geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([0, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / f'Nanog<>Nanog.PWM.threshold={threshold}.pdf')
fig
dfi_seq.seq_match_score.plot.hist(100);
dfab = motif_pair_dfi(dfis.query('match_weighted_p > .2').query('imp_weighted_p > 0'), motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) +
geom_vline(xintercept=10, alpha=0.1) +
geom_vline(xintercept=20, alpha=0.1) +
geom_vline(xintercept=30, alpha=0.1) +
geom_vline(xintercept=40, alpha=0.1) +
geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([0, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / 'Nanog<>Nanog.BPNet.default.pdf')
fig
dfi_chexmix.motif_score.plot.hist(100);
dfab = motif_pair_dfi(dfi_chexmix, motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) +
geom_vline(xintercept=10, alpha=0.1) +
geom_vline(xintercept=20, alpha=0.1) +
geom_vline(xintercept=30, alpha=0.1) +
geom_vline(xintercept=40, alpha=0.1) +
geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([0, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / 'Nanog<>Nanog.ChExMix.default.pdf')
fig
motif_score = 1
dfab = motif_pair_dfi(dfi_chexmix.query(f'motif_score > {motif_score}'), motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) +
geom_vline(xintercept=10, alpha=0.1) +
geom_vline(xintercept=20, alpha=0.1) +
geom_vline(xintercept=30, alpha=0.1) +
geom_vline(xintercept=40, alpha=0.1) +
geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([0, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / f'Nanog<>Nanog.ChExMix.motif_score={motif_score}.pdf')
fig
motif_score = 5
dfab = motif_pair_dfi(dfi_chexmix.query(f'motif_score > {motif_score}'), motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin([motif_pair_name]))]) +
geom_vline(xintercept=10, alpha=0.1) +
geom_vline(xintercept=20, alpha=0.1) +
geom_vline(xintercept=30, alpha=0.1) +
geom_vline(xintercept=40, alpha=0.1) +
geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([0, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / f'Nanog<>Nanog.ChExMix.motif_score={motif_score}.pdf')
fig
t_high = 10
t_low = [3, 5]
motif = 'Sox2'
h2(motif)
patterns[motif].plot('seq');
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0])
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0]) &
(ps_motifs[motif].dfi.imp_weighted_p > .5)
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
motif = 'Oct4-Sox2'
h2(motif)
patterns[motif].plot('seq');
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0])
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0]) &
(ps_motifs[motif].dfi.imp_weighted_p > .5)
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
motif = 'Nanog'
h2(motif)
patterns[motif].plot('seq');
t_high = 8
t_low = [3, 5]
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0])
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0]) &
(ps_motifs[motif].dfi.imp_weighted_p > .5)
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
t_high = 10
t_low = [3, 5]
motif = 'Klf4'
h2(motif)
patterns[motif].plot('seq');
t_high = 8
t_low = [3, 5]
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0])
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0]) &
(ps_motifs[motif].dfi.imp_weighted_p > .5)
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
melanie_motifs = {"nanog": "Nanog",
"sox2": "Sox2",
"oct4": "Oct4-Sox2",
"klf4": "Klf4"}
dff = pd.concat([pd.read_csv(f"ftp://ftp.stowers.org/pub/mw2098/for_ziga/FIMO_results/{k}_top1000peaks_window201bp/fimo.tsv",
sep='\t', comment='#').assign(motif=v)
for k,v in melanie_motifs.items()])
dff = dff.dropna()
dff.groupby("motif").size()
from concise.preprocessing import encodeDNA
from concise.utils.pwm import PWM
def make_pwm(seqs, strand):
return np.concatenate([encodeDNA(list(seqs[strand == '+'])),
encodeDNA(list(seqs[strand == '-']))[:, ::-1, ::-1]], axis=0)
pwms = {m: PWM(encodeDNA(list(dfs.matched_sequence.str.upper())).mean(0) + 0.0001, name=m) for m,dfs in dff.groupby("motif")}
pwms
for k,v in pwms.items():
v.plotPWMInfo(figsize=get_figsize(1, 1/5))
plt.title(k)
dff.head()
from concise.preprocessing import encodeDNA
dff.motif.unique()
dff['p-value'].plot.hist(30)
dff = dff.dropna()
dff.shape
dff.matched_sequence
pwm = encodeDNA(dff.matched_sequence.str.upper(), ).mean(axis=0)
pwm = PWM(pwm + 0.001, ) # add some pseudo-counts
pwm.pwm = pwm.pwm[::-1, ::-1] # Tc
pwm.shape
pwm.plotPWMInfo()