from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'
motifs = OrderedDict([
("Oct4-Sox2", 'Oct4/m0_p0'),
("Oct4", 'Oct4/m0_p1'),
# ("Strange-sym-motif", 'Oct4/m0_p5'),
("Sox2", 'Sox2/m0_p1'),
("Nanog", 'Nanog/m0_p1'),
("Zic3", 'Nanog/m0_p2'),
("Nanog-partner", 'Nanog/m0_p4'),
("Klf4", 'Klf4/m0_p0'),
])
<method>/<TF>/<number>w=200,n=1000,nmotifs=15,evt=0.01,maxw=20len=12,size=200from basepair.imports import *
from basepair.exp.paper.config import *
from basepair.utils import flatten
from basepair.modisco.core import Pattern
from plotnine import *
import plotnine
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac
model_dir = models_dir / exp
mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
for t in tasks})
# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/method-comparison')
comparison_dir = Path('../../chipnexus/comparison/output')
main_motifs = [mr.get_pattern(pattern_name).rename(name)
for name, pattern_name in motifs.items()]
def rename(name):
task, mc, pattern = name.split("/")
mc_n = mc.split("_")[1]
pattern_n = pattern.split("_")[1]
return f'BPNet/{task}/m{mc_n}_p{pattern_n}'
bpnet_patterns = [p.rename(rename(p.name)).trim_seq_ic(0.08) for p in mr.get_all_patterns()
if mr.n_seqlets(p.name) > 100]
chexmix_motifs = flatten({tf: read_transfac(comparison_dir / f"chexmix/{tf}/intermediate-results/{tf}.experiment.transfac")
for tf in tasks}, separator='/')
chexmix_patterns = [Pattern('chexmix/' + k, v.pwm, dict(), dict())
for k,v in chexmix_motifs.items()]
# for name, motif in chexmix_motifs.items():
# motif.plotPWMInfo(figsize=(4, 1.2));
# plt.ylim([0, 2])
# sns.despine(top=True, right=True, bottom=True)
# plt.title(name)
from concise.utils.pwm import PWM, load_motif_db
def n_motifs_homer(key):
return int(float(key.split("\t")[5].split("(")[0].split("T:")[1]))
homer_motifs = flatten({tf: {str(i+1): PWM(np.array(v) + 0.001, name=k.split("\t")[0])
for i, (k,v) in enumerate(load_motif_db(str(comparison_dir / f"peakxus/{tf}/HOMER/len=12,size=200/homerMotifs.all.motifs")).items())
if n_motifs_homer(k) > 100
}
for tf in tasks}, separator='/')
home_peakxus_patterns = [Pattern('homer/peakxus/' + k, v.pwm, dict(), dict())
for k,v in homer_motifs.items()]
# for name, motif in homer_motifs.items():
# motif.plotPWMInfo(figsize=(4, 1.2));
# plt.ylim([0, 2])
# sns.despine(top=True, right=True, bottom=True)
# plt.title(name)
homer_motifs = flatten({tf: {str(i+1): PWM(np.array(v) + 0.001, name=k.split("\t")[0])
for i, (k,v) in enumerate(load_motif_db(str(comparison_dir / f"macs2/{tf}/HOMER/len=12,size=200/homerMotifs.all.motifs")).items())
if n_motifs_homer(k) > 100}
for tf in tasks}, separator='/')
homer_macs_patterns = [Pattern('homer/' + k, v.pwm, dict(), dict())
for k,v in homer_motifs.items()]
# for name, motif in homer_motifs.items():
# motif.plotPWMInfo(figsize=(4, 1.2));
# plt.ylim([0, 2])
# sns.despine(top=True, right=True, bottom=True)
# plt.title(name)
Peakxus peaks overlapping
# Peakxus filter paramters
width_filter = [30, 60],
n_peaks = 1000,
seq_width = 200,
# MEME
meme {input.top_peak_seqs} \
-oc output/{wildcards.peak_caller}/{wildcards.tf}/MEME/{wildcards.run} \
-mod zoops -dna -revcomp \
-p {threads} \
-nmotifs 15 -evt 0.01 -maxw 20
peakxus_motifs = flatten({tf: read_meme_motifs(comparison_dir / f'peakxus-in-macs2/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=20/meme.txt')
for tf in tasks}, separator='/')
meme_patterns = [Pattern('meme/' + k, v.pwm, dict(), dict())
for k,v in peakxus_motifs.items() if v.name > 100]
# for name, motif in peakxus_motifs.items():
# if int(motif.name) < 100:
# continue
# motif.plotPWMInfo(figsize=(5, 1.2));
# plt.ylim([0, 2])
# sns.despine(top=True, right=True, bottom=True)
# plt.title(name + f" ({motif.name})")
all_patterns = chexmix_patterns + homer_macs_patterns + meme_patterns + bpnet_patterns
all_patterns = [p.resize(70) for p in all_patterns]
from basepair.exp.paper.fig4 import cluster_align_patterns
len(all_patterns)
%tqdm_restart
# Cluster patterns
from basepair.exp.paper.fig4 import *
patterns = all_patterns
n_clusters = 20
sim_seq = similarity_matrix(patterns, track='seq_ic')
# cluster
iu = np.triu_indices(len(sim_seq), 1)
lm_seq = linkage(1 - sim_seq[iu], 'ward', optimal_ordering=True)
# determine clusters and the cluster order
cluster = cut_tree(lm_seq, n_clusters=n_clusters)[:, 0]
cluster_order = np.argsort(leaves_list(lm_seq))
# align patterns
aligned_patterns = align_clustered_patterns(patterns, cluster_order, cluster,
align_track='seq_ic',
metric='continousjaccard',
# don't shit the major patterns
# by more than 15 when aligning
trials=20,
max_shift=15)
from basepair.modisco.motif_clustering import to_colors
df_anno = pd.DataFrame(dict(method=[pn.split("/")[0] for pn in pnames],
tf=[pn.split("/")[1] for pn in pnames]),
index=pnames)
# Alternative code
import matplotlib.colors
df = df_anno
cat_cmap='RdBu_r'
import matplotlib.pyplot as plt
import seaborn as sns
cols = {}
for c in df.columns:
categories = list(df[c].unique())
if c == 'tf':
# cmap_dict = {tf: matplotlib.colors.to_rgb(hex_color)
# for tf, hex_color in tf_colors.items()}
cmap_dict = tf_colors
else:
cmap = sns.color_palette(cat_cmap, len(categories))
cmap_dict = dict(zip(categories, cmap))
cols[c] = df[c].map(cmap_dict)
row_colors = pd.DataFrame(cols)
c = 'method'
categories = list(df[c].unique())
sns.color_palette(cat_cmap, len(categories))
cmap_dict_method = dict(zip(list(df[c].unique()), cmap))
from basepair.plot.utils import plot_colormap
plot_colormap(cmap_dict_method, "Method")
plt.savefig(fdir / 'cmap.method.pdf')
plot_colormap({k:v for k,v in tf_colors.items() if k !='Esrrb'}, 'TF')
plt.savefig(fdir / 'cmap.TF.pdf')
pnames = [p.name for p in patterns]
df = pd.DataFrame(sim_seq,
index=pnames,
columns=pnames)
cm = sns.clustermap(df,
row_linkage=lm_seq,
col_linkage=lm_seq,
row_colors=row_colors,
cmap='Blues', figsize=get_figsize(1, 1))
plt.savefig(fdir / 'clustered-motifs.heatmap.pdf')
for p in aligned_patterns:
p.plot("seq_ic");
plt.ylim([0, 2])
sns.despine(bottom=True, right=True, top=True)
# MEME
meme {input.top_peak_seqs} \
-oc output/{wildcards.peak_caller}/{wildcards.tf}/MEME/{wildcards.run} \
-mod zoops -dna -revcomp \
-p {threads} \
-nmotifs 15 -evt 0.01 -maxw 20
peakxus_motifs = flatten({tf: read_meme_motifs(comparison_dir / f'macs2/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=20/meme.txt')
for tf in tasks}, separator='/')
for name, motif in peakxus_motifs.items():
if int(motif.name) < 100:
continue
motif.plotPWMInfo(figsize=(5, 1.5));
plt.ylim([0, 2])
sns.despine(top=True, right=True, bottom=True)
plt.title(name + f" ({motif.name})")
peakxus_motifs = flatten({tf: read_meme_motifs(comparison_dir / f'peakxus/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=20/meme.txt')
for tf in tasks}, separator='/')
for name, motif in peakxus_motifs.items():
if int(motif.name) < 100:
continue
motif.plotPWMInfo(figsize=(5, 1.2));
plt.ylim([0, 2])
sns.despine(top=True, right=True, bottom=True)
plt.title(name + f" ({motif.name})")