# Imports
from basepair.imports import *
from basepair.exp.paper.config import motifs
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix, dfi_filter_valid
from basepair.cli.imp_score import ImpScoreFile
from basepair.modisco.core import Pattern
hv.extension('bokeh')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
paper_config()
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
output_dir = modisco_dir / "pwm_comparison" # this notebook's output directory
output_dir.mkdir(exist_ok=True)
figures_dir = Path(f"{ddir}/figures/modisco/pwm_comparison")
figures_dir.mkdir(exist_ok=True)
!mkdir -p {ddir}/figures/modisco/pwm_comparison
# load all the required data
dfi = load_instances(f"{modisco_dir}/instances.parq", motifs)
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
imp_scores = ImpScoreFile.from_modisco_dir(modisco_dir)
imp_scores.cache() # load all the scores
seqs = imp_scores.get_seq()
# plot them
patterns = {m: mr.get_pattern(longer_pattern(p)).trim_seq_ic(0.08)
for m, p in motifs.items()}
for tf, p in patterns.items():
p.plot("seq");
plt.title(tf)
# sequences
p_seq = {m: imp_scores.extract_dfi(dfi_filter_valid(patterns[m].get_instances_seq_scan(seqs), profile_width, seqlen=1000),
profile_width=70, verbose=True)
for m in motifs}
dfi_seq = pd.concat([p_seq[m].dfi.assign(motif=m) for m in motifs])
# Load the cached results
# p_seq = read_pkl(output_dir / 'p_seq.pkl')
# dfi_seq = pd.read_csv(output_dir / 'pwm_instances.csv.gz')
# save the instances
dfi_seq.to_csv(output_dir / 'pwm_instances.csv.gz', index=False, compression='gzip')
write_pkl(p_seq, output_dir / 'p_seq.pkl')
!du -sh {output_dir}/*
dfis = dfi.query("imp_weighted_p > 0")
dfis = dfi_filter_valid(dfis, profile_width, seqlen=1000)
ps_motifs = {m: imp_scores.extract_dfi(dfis[dfis.pattern_name == m], profile_width=profile_width, verbose=True) for m in motifs}
(ggplot(aes(x='seq_match_score'), dfi_seq) +
geom_histogram(bins=100) +
facet_grid("motif ~ .") +
theme_classic() +
xlim([0, 15])
)
(ggplot(aes(x='seq_match', fill='imp_weighted_cat'), dfis) +
geom_histogram(bins=100) +
facet_grid("pattern_name ~ .") +
theme_classic() +
xlim([0, 15])
)
t_high = 10
t_low = [3, 5]
motif = 'Sox2'
h2(motif)
patterns[motif].plot('seq');
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0])
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0]) &
(ps_motifs[motif].dfi.imp_weighted_p > .5)
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
motif = 'Oct4-Sox2'
h2(motif)
patterns[motif].plot('seq');
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0])
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0]) &
(ps_motifs[motif].dfi.imp_weighted_p > .5)
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
motif = 'Nanog'
h2(motif)
patterns[motif].plot('seq');
t_high = 8
t_low = [3, 5]
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0])
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0]) &
(ps_motifs[motif].dfi.imp_weighted_p > .5)
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
t_high = 10
t_low = [3, 5]
motif = 'Klf4'
h2(motif)
patterns[motif].plot('seq');
t_high = 8
t_low = [3, 5]
# Good matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Good matches
ps_subset = ps_motifs[motif][ps_motifs[motif].dfi.seq_match > t_high]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = p_seq[motif][p_seq[motif].dfi.seq_match_score < t_low[1]]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0])
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
# Poor matches but important
ps_subset = ps_motifs[motif][(ps_motifs[motif].dfi.seq_match < t_low[1]) &
(ps_motifs[motif].dfi.seq_match > t_low[0]) &
(ps_motifs[motif].dfi.imp_weighted_p > .5)
]
ps = ps_subset.aggregate(np.mean)
print("# locations:", len(ps_subset))
ps.plot(kind='seq_ic');
# aggregated footprint
ps.plot(kind='profile', rotate_y=0);
from basepair.exp.chipnexus.spacing import get_motif_pairs, motif_pair_dfi
dfi_seq['pattern_name'] = dfi_seq.motif
dfi_seq.head()
dfis.head()
motif_pair = ['Nanog', 'Nanog']
dfab = motif_pair_dfi(dfi_seq, motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
dfab.head()
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin(["Nanog<>Nanog"]))]) +
geom_vline(xintercept=10, alpha=0.1) +
geom_vline(xintercept=20, alpha=0.1) +
geom_vline(xintercept=30, alpha=0.1) +
geom_vline(xintercept=40, alpha=0.1) +
geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([0, max_dist]) +
xlab("Pairwise distance") +
ggtitle("Nanog<>Nanog") +
scale_fill_brewer(type='qual', palette=3))
fig.save(figures_dir / 'nanog.spacing.pdf')
fig
dfi_seq.head()
threshold = 6
dfab = motif_pair_dfi(dfi_seq[dfi_seq.seq_match_score > threshold], motif_pair)
dfab['motif_pair'] = "<>".join(motif_pair)
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin(["Nanog<>Nanog"]))]) +
geom_vline(xintercept=10, alpha=0.1) +
geom_vline(xintercept=20, alpha=0.1) +
geom_vline(xintercept=30, alpha=0.1) +
geom_vline(xintercept=40, alpha=0.1) +
geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([0, max_dist]) +
xlab("Pairwise distance") +
ggtitle("Nanog<>Nanog") +
scale_fill_brewer(type='qual', palette=3))
fig.save(figures_dir / f'nanog.spacing.match_score>{threshold}.pdf')
fig
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & (dfab.motif_pair.isin(["Nanog<>Nanog"]))]) +
geom_vline(xintercept=10, alpha=0.1) +
geom_vline(xintercept=20, alpha=0.1) +
geom_vline(xintercept=30, alpha=0.1) +
geom_vline(xintercept=40, alpha=0.1) +
geom_histogram(bins=max_dist) + facet_grid("strand_combination~.") +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([0, max_dist]) +
xlab("Pairwise distance") +
ggtitle("Nanog<>Nanog") +
scale_fill_brewer(type='qual', palette=3))
# fig.save(figures_dir / 'nanog.spacing.pdf')
fig
dfab.head()
# Show the distribution of differne
psm = ps_motifs["Oct4-Sox2"]
psm
from concise.preprocessing import encodeDNA
from concise.utils.pwm import PWM
melanie_motifs = {"nanog": "Nanog",
"sox2": "Sox2",
"oct4": "Oct4-Sox2",
"klf4": "Klf4"}
dff = pd.concat([pd.read_csv(f"ftp://ftp.stowers.org/pub/mw2098/for_ziga/FIMO_results/{k}_top1000peaks_window201bp/fimo.tsv", sep='\t').assign(motif=v)
for k,v in melanie_motifs.items()])
dff = dff.dropna()
dff.groupby("motif").size()
# TODO - take RC into account
def make_pwm(seqs, strand):
return np.concatenate([encodeDNA(list(seqs[strand == '+'])),
encodeDNA(list(seqs[strand == '-']))[:, ::-1, ::-1]], axis=0)
pwms = {m: PWM(encodeDNA(list(dfs.matched_sequence.str.upper())).mean(0) + 0.0001, name=m) for m,dfs in dff.groupby("motif")}
pwms
for k,v in pwms.items():
v.plotPWMInfo(figsize=get_figsize(1, 1/5))
plt.title(k)
dff.head()
dff['p-value'].plot.hist(30)
dff = dff.dropna()
dff.shape
dff.matched_sequence
pwm = encodeDNA(dff.matched_sequence.str.upper(), ).mean(axis=0)
pwm = PWM(pwm + 0.001, ) # add some pseudo-counts
pwm.pwm = pwm.pwm[::-1, ::-1] # Tc
pwm.shape
pwm.plotPWMInfo()