example_idx# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
tf = 'Oct4'
from basepair.exp.paper.config import tasks
from basepair.preproc import interval_center
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac
def prepend(df, col, name, sep="/"):
df[col] = name + sep + df[col]
return df
dfi_peakxus = pd.concat([read_peakxus_dfi(f'output/peakxus/{tf}/all_transition_points.igv').assign(task=tf)
for tf in tasks], axis=0)
dfi_chexmix = pd.concat([pd_first_cols(read_chexmix_dfi(f"output/chexmix/{tf}/{tf}_experiment.events").
pipe(prepend, col='pattern_name', name=tf),
['chrom', 'pattern_center_abs', 'pattern_name', 'profile_score', 'motif_score', 'strand'])
for tf in tasks], axis=0)
dfi_fimo = pd.concat([pd_first_cols(read_fimo_dfi(f"output/peakxus/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=40/FIMO/fimo.tsv").
pipe(prepend, col='pattern_name', name=tf),
['chrom', 'pattern_center_abs', 'pattern_name', 'score', 'strand'])
for tf in tasks], axis=0)
# df = read_fimo_dfi(f"output/peakxus/{tf}/MEME/FIMO/fimo.tsv")
from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'
from basepair.exp.paper.config import models_dir
model_dir = models_dir / exp
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5', default_imp_score=imp_score)
isf.cache() # load everything into memory
ranges = isf.get_ranges()
ranges.columns = ['example_' + c for c in ranges.columns]
from basepair.exp.paper.comparison import dfint_overlap_idx, dfi_append_example_idx
# append example_idx + make pattern_start absolute
dfi_chexmix = dfi_append_example_idx(dfi_chexmix, ranges)
dfi_fimo = dfi_append_example_idx(dfi_fimo, ranges)
dfi_peakxus = dfi_append_example_idx(dfi_peakxus, ranges)
dfi_peakxus.pattern_center.plot.hist(100)
dfi_chexmix.pattern_center.plot.hist(100)
dfi_chexmix_nanog = dfi_chexmix[dfi_chexmix.pattern_name == 'Nanog/1']
dfi_chexmix_nanog['pattern'] = 'Nanog'
dfi_chexmix.pattern_name.unique()
dfi_filtered = dfi_chexmix
dfi_filtered['pattern'] = 'Nanog'
motif_pair = ['Nanog/1', 'Nanog/1']
dfa = dfi_filtered[dfi_filtered.pattern_name == motif_pair[0]]
dfb = dfi_filtered[dfi_filtered.pattern_name == motif_pair[1]]
dfa.head()
del dfa['SubtypeSequence']
del dfb['SubtypeSequence']
dfab = motif_pair_dfi(dfi_chexmix, ['Nanog/1', 'Nanog/1'])
dfab.head()
plotnine.options.figure_size = get_figsize(.5, aspect=2/10)
max_dist = 100
fig = (ggplot(aes(x='center_diff'), dfab[(dfab.center_diff <= max_dist)]) +
geom_vline(xintercept=10, alpha=0.1) +
geom_vline(xintercept=20, alpha=0.1) +
geom_vline(xintercept=30, alpha=0.1) +
geom_vline(xintercept=40, alpha=0.1) +
geom_histogram(bins=max_dist) +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([0, max_dist]) +
xlab("Pairwise distance") +
ggtitle("Nanog<>Nanog") +
scale_fill_brewer(type='qual', palette=3))
# fig.save(figures_dir / 'nanog.spacing.pdf')
fig
dfab
len(dfi_chexmix)
len(dfi_fimo)
len(dfi_peakxus)
dfi_chexmix.head()
comparison/spacing/<method>/...from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs, motif_pair_dfi
# TODO - for each table, annotate the motifs with human-readable motifs
motifs_chexmix = OrderedDict([
("Oct4-Sox2", 'Oct4/1'),
("Sox2", 'Sox2/1'),
("Nanog", 'Nanog/1'),
("Klf4", 'Klf4/1'),
])
motifs_fimo = OrderedDict([
("Oct4-Sox2", 'Oct4/2'),
("Sox2", 'Sox2/1'),
#("Nanog", 'Nanog/1'),
("Klf4", 'Klf4/1'),
])
motifs_bpnet = OrderedDict([
("Oct4-Sox2", 'Oct4/m0_p0'),
("Oct4", 'Oct4/m0_p1'),
# ("Strange-sym-motif", 'Oct4/m0_p5'),
("Sox2", 'Sox2/m0_p1'),
("Nanog", 'Nanog/m0_p1'),
("Zic3", 'Nanog/m0_p2'),
("Nanog-partner", 'Nanog/m0_p4'),
("Klf4", 'Klf4/m0_p0'),
])
pairs = get_motif_pairs(motifs_bpnet)
# ordered names
pair_names = ["<>".join(x) for x in pairs]
# create motif pairs
dfab_w_te = pd.concat([motif_pair_dfi(dfi_subset, pair).assign(motif_pair="<>".join(pair)) for pair in pairs], axis=0)
plotnine.options.figure_size = get_figsize(.5, aspect=4)# (10, 10)
max_dist = 100
min_dist = 6
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) &
(dfab.center_diff > min_dist)]) +
geom_histogram(bins=max_dist-min_dist+1) +
facet_grid("motif_pair~ .", scales='free_y') +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([min_dist, max_dist]) +
# ylim([0, 1000]) +
xlab("Pairwise distance") +
scale_fill_brewer(type='qual', palette=3))
# fig.save(fdir / 'histogram.center_diff.all.imp>.2,match>.0.pdf')
fig
# Generate all spacing plots
individual_plots = {}
for motif_pair_name in pair_names:
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) &
(dfab.motif_pair == motif_pair_name)]
if len(df) == 0:
continue
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), df) +
# Spacing vertical lines
geom_vline(xintercept=10, alpha=0.1) +
geom_vline(xintercept=20, alpha=0.1) +
geom_vline(xintercept=30, alpha=0.1) +
geom_vline(xintercept=40, alpha=0.1) +
# plot
geom_histogram(bins=max_dist - min_dist + 1) + facet_grid("strand_combination~.") +
# Theme, labels, colors
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([min_dist, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_fill_brewer(type='qual', palette=3))
# Save
# fig.save(fdir_individual / f'{motif_pair_name}.pdf')
individual_plots[motif_pair_name] = fig
# NOTE: this was the exact same code snippet as before
dfi_peakxus2.head()
chexmix_motifs = flatten({tf: read_transfac(f"output/chexmix/{tf}/intermediate-results/{tf}.experiment.transfac")
for tf in tasks}, separator='/')
for name, motif in chexmix_motifs.items():
motif.plotPWMInfo(figsize=(4, 1.2));
plt.ylim([0, 2])
sns.despine(top=True, right=True, bottom=True)
plt.title(name)
from concise.utils.pwm import PWM, load_motif_db
homer_motifs = flatten({tf: {str(i+1): PWM(np.array(v) + 0.001, name=k.split("\t")[0])
for i, (k,v) in enumerate(load_motif_db(f"output/peakxus/{tf}/HOMER/len=12,size=200/homerMotifs.all.motifs").items())}
for tf in tasks}, separator='/')
for name, motif in homer_motifs.items():
motif.plotPWMInfo(figsize=(4, 1.2));
plt.ylim([0, 2])
sns.despine(top=True, right=True, bottom=True)
plt.title(name)
homer_motifs = flatten({tf: {str(i+1): PWM(np.array(v) + 0.001, name=k.split("\t")[0])
for i, (k,v) in enumerate(load_motif_db(f"output/macs2/{tf}/HOMER/len=12,size=200/homerMotifs.all.motifs").items())}
for tf in tasks}, separator='/')
for name, motif in homer_motifs.items():
motif.plotPWMInfo(figsize=(4, 1.2));
plt.ylim([0, 2])
sns.despine(top=True, right=True, bottom=True)
plt.title(name)
Peakxus peaks overlapping
# Peakxus filter paramters
width_filter = [30, 60],
n_peaks = 1000,
seq_width = 200,
# MEME
meme {input.top_peak_seqs} \
-oc output/{wildcards.peak_caller}/{wildcards.tf}/MEME/{wildcards.run} \
-mod zoops -dna -revcomp \
-p {threads} \
-nmotifs 15 -evt 0.01 -maxw 20
peakxus_motifs = flatten({tf: read_meme_motifs(f'output/peakxus-in-macs2/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=20/meme.txt')
for tf in tasks}, separator='/')
for name, motif in peakxus_motifs.items():
if int(motif.name) < 100:
continue
motif.plotPWMInfo(figsize=(5, 1.2));
plt.ylim([0, 2])
sns.despine(top=True, right=True, bottom=True)
plt.title(name + f" ({motif.name})")
# MEME
meme {input.top_peak_seqs} \
-oc output/{wildcards.peak_caller}/{wildcards.tf}/MEME/{wildcards.run} \
-mod zoops -dna -revcomp \
-p {threads} \
-nmotifs 15 -evt 0.01 -maxw 20
peakxus_motifs = flatten({tf: read_meme_motifs(f'output/macs2/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=20/meme.txt')
for tf in tasks}, separator='/')
for name, motif in peakxus_motifs.items():
if int(motif.name) < 100:
continue
motif.plotPWMInfo(figsize=(5, 1.5));
plt.ylim([0, 2])
sns.despine(top=True, right=True, bottom=True)
plt.title(name + f" ({motif.name})")
peakxus_motifs = flatten({tf: read_meme_motifs(f'output/peakxus/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=20/meme.txt')
for tf in tasks}, separator='/')
for name, motif in peakxus_motifs.items():
if int(motif.name) < 100:
continue
motif.plotPWMInfo(figsize=(5, 1.2));
plt.ylim([0, 2])
sns.despine(top=True, right=True, bottom=True)
plt.title(name + f" ({motif.name})")
melanie_motifs = {"nanog": "Nanog",
"sox2": "Sox2",
"oct4": "Oct4-Sox2",
"klf4": "Klf4"}
dff = pd.concat([pd.read_csv(f"ftp://ftp.stowers.org/pub/mw2098/for_ziga/FIMO_results/{k}_top1000peaks_window201bp/fimo.tsv",
sep='\t', comment='#').assign(motif=v)
for k,v in melanie_motifs.items()])
dff
dfi_peakxus.head()
dfi.chromosome.unique()
dfi_columns = [
'pattern_name',
'pattern_short'.
'pattern_start_abs',
'pattern_end_abs',
]
Required columns for construct_motif_pairs
dfi['score'].plot.hist(50)
plt.xlabel("Score (experiment_log2Fold)");
dfi.SubtypeMotifScore.plot.hist(50);
plt.xlabel("Motif score");
from basepair.exp.chipnexus.motif_clustering import similarity_matrix
from basepair.modisco.motif_clustering import create_pattern_table, align_clustered_patterns
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list
def cluster_align_patterns(patterns, n_clusters=9):
# get the similarity matrix
sim_seq = similarity_matrix(patterns, track='seq_ic')
# cluster
iu = np.triu_indices(len(sim_seq), 1)
lm_seq = linkage(1-sim_seq[iu], 'ward', optimal_ordering=True)
# determine clusters and the cluster order
cluster = cut_tree(lm_seq, n_clusters=n_clusters)[:,0]
cluster_order = np.argsort(leaves_list(lm_seq))
# align patterns
patterns_clustered = align_clustered_patterns(patterns, cluster_order, cluster,
align_track='seq_ic',
metric='continousjaccard',
# don't shit the major patterns
# by more than 15 when aligning
trials=20,
max_shift=15)
return patterns_clustered
short_patterns = [p for p in patterns if p.seq_info_content < 30]
short_patterns_clustered = cluster_align_patterns(short_patterns, n_clusters=len(short_patterns))
pinfo = []
for p in short_patterns_clustered:
d = {}
d['name'] = p.name
d['n_seqlets'] = p.attrs['n_seqlets']
d['TF'] = p.attrs['TF']
d_footprint = {t + "_max": p.profile[t].max()
for t in tasks}
d = {**d, **d_footprint}
pinfo.append(d)
df_info = pd.DataFrame(pinfo)
from basepair.plot.tracks import plot_track, plot_tracks
from concise.utils.plot import seqlogo
# fp_slice = slice(25, 175)
fp_slice = slice(10, 190)
max_vals = {t: df_info.max()[t + "_max"] for t in tasks}
fig, axes = plt.subplots(len(short_patterns_clustered), 6, figsize=get_figsize(2, aspect=1))
fig.subplots_adjust(hspace=0, wspace=0)
for i, p in enumerate(short_patterns_clustered):
# p = p.trim_seq_ic(0.3) # typical cut-offf was 0.08
ax = axes[i, 0]
p = p.trim(24, 41)
seqlogo(p.get_seq_ic(), ax=ax)
ax.set_ylim([0, 2]) # all plots have the same shape
strip_axis(ax)
ax.axison = False
pos1 = ax.get_position() # get the original position
pos2 = [pos1.x0, pos1.y0 + pos1.height * 0.4, pos1.width*1.2, pos1.height * .5]
ax.set_position(pos2) # set a new position
ax.text(22, 1, p.attrs["n_seqlets"], fontsize=12, horizontalalignment='right')
ax.text(28, 1, p.attrs["TF"], fontsize=12, horizontalalignment='right')
ax_empty = axes[i, 1]
strip_axis(ax_empty)
ax_empty.axison = False
for j, task in enumerate(tasks):
ax = axes[i, 2 + j]
fp = p.profile[task]
ax.plot(fp[fp_slice, 0], color=tf_colors[task])
ax.plot(-fp[fp_slice, 1], color=tf_colors[task], alpha=0.5) # negative strand
ax.set_ylim([-max_vals[task], max_vals[task]]) # all plots have the same shape
strip_axis(ax)
ax.axison = False
if i == 0:
ax.set_title(task)
fig.savefig(figures / 'per-tf.short-patterns.pdf')
fig.savefig(figures / 'per-tf.short-patterns.png')