In [148]:
from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    ("Oct4", 'Oct4/m0_p1'),
    # ("Strange-sym-motif", 'Oct4/m0_p5'),
    ("Sox2", 'Sox2/m0_p1'),
    ("Nanog", 'Nanog/m0_p1'),
    ("Zic3", 'Nanog/m0_p2'),
    ("Nanog-partner", 'Nanog/m0_p4'),
    ("Klf4", 'Klf4/m0_p0'),
])

Goal

  • cluster all motifs for all explored methods

Tasks

  • [X] prefix motifs with the method name: <method>/<TF>/<number>
    • use the cannonical motif names for BPNet
  • [X] plot the major motif clusters
  • [X] Add a filter to all motifs: require support by more than 100 sequences

Open questions

  • How to visualize this in the paper?

Methods

  • MEME: peakxus peaks in MACS2, params: w=200,n=1000,nmotifs=15,evt=0.01,maxw=20
  • HOMER: MACS2 peaks len=12,size=200
  • ChExMix: default genome-wide
  • BPnet: MACS2 peaks

Conclusions

  • Nanog motif not really discovered by HOMER and Meme (maybe a weak form of it was discovered by HOMER)
  • Oct4-Sox2, Sox2 and Klf4 were found regardless of the framework
In [195]:
from basepair.imports import *
from basepair.exp.paper.config import *
from basepair.utils import flatten
from basepair.modisco.core import Pattern
from plotnine import *
import plotnine
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
In [196]:
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac
In [197]:
model_dir = models_dir / exp
In [198]:
mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
                           for t in tasks})
In [447]:
# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/method-comparison')
In [199]:
comparison_dir = Path('../../chipnexus/comparison/output')
In [200]:
main_motifs = [mr.get_pattern(pattern_name).rename(name)
               for name, pattern_name in motifs.items()]
In [238]:
def rename(name):
    task, mc, pattern = name.split("/")
    mc_n = mc.split("_")[1]
    pattern_n = pattern.split("_")[1]
    return f'BPNet/{task}/m{mc_n}_p{pattern_n}'
In [239]:
bpnet_patterns = [p.rename(rename(p.name)).trim_seq_ic(0.08) for p in mr.get_all_patterns()
                  if mr.n_seqlets(p.name) > 100]

Chexmix

In [240]:
chexmix_motifs = flatten({tf: read_transfac(comparison_dir / f"chexmix/{tf}/intermediate-results/{tf}.experiment.transfac")
                          for tf in tasks}, separator='/')
chexmix_patterns = [Pattern('chexmix/' + k, v.pwm, dict(), dict()) 
                    for k,v in chexmix_motifs.items()]
In [241]:
# for name, motif in chexmix_motifs.items():
#     motif.plotPWMInfo(figsize=(4, 1.2));
#     plt.ylim([0, 2])
#     sns.despine(top=True, right=True, bottom=True)
#     plt.title(name)

Homer motifs

Peakxus peaks

In [242]:
from concise.utils.pwm import PWM, load_motif_db
In [243]:
def n_motifs_homer(key):
    return int(float(key.split("\t")[5].split("(")[0].split("T:")[1]))
In [244]:
homer_motifs = flatten({tf: {str(i+1): PWM(np.array(v) + 0.001, name=k.split("\t")[0])
                             for i, (k,v) in enumerate(load_motif_db(str(comparison_dir / f"peakxus/{tf}/HOMER/len=12,size=200/homerMotifs.all.motifs")).items())
                             if n_motifs_homer(k) > 100
                            }
                        for tf in tasks}, separator='/')
home_peakxus_patterns = [Pattern('homer/peakxus/' + k, v.pwm, dict(), dict()) 
                    for k,v in homer_motifs.items()]
In [245]:
# for name, motif in homer_motifs.items():
#     motif.plotPWMInfo(figsize=(4, 1.2));
#     plt.ylim([0, 2])
#     sns.despine(top=True, right=True, bottom=True)
#     plt.title(name)

Macs2 peaks

In [246]:
homer_motifs = flatten({tf: {str(i+1): PWM(np.array(v) + 0.001, name=k.split("\t")[0])
                             for i, (k,v) in enumerate(load_motif_db(str(comparison_dir / f"macs2/{tf}/HOMER/len=12,size=200/homerMotifs.all.motifs")).items())
                             if n_motifs_homer(k) > 100}
                        for tf in tasks}, separator='/')
homer_macs_patterns = [Pattern('homer/' + k, v.pwm, dict(), dict()) 
                    for k,v in homer_motifs.items()]
In [247]:
# for name, motif in homer_motifs.items():
#     motif.plotPWMInfo(figsize=(4, 1.2));
#     plt.ylim([0, 2])
#     sns.despine(top=True, right=True, bottom=True)
#     plt.title(name)

MEME

Peakxus and Macs2 peaks (intersection)

Peakxus peaks overlapping

# Peakxus filter paramters
width_filter = [30, 60],
n_peaks = 1000,
seq_width = 200,

# MEME
meme {input.top_peak_seqs} \
  -oc output/{wildcards.peak_caller}/{wildcards.tf}/MEME/{wildcards.run} \
  -mod zoops -dna -revcomp \
  -p {threads} \
  -nmotifs 15 -evt 0.01 -maxw 20
In [248]:
peakxus_motifs = flatten({tf: read_meme_motifs(comparison_dir / f'peakxus-in-macs2/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=20/meme.txt')
                          for tf in tasks}, separator='/')
meme_patterns = [Pattern('meme/' + k, v.pwm, dict(), dict()) 
                    for k,v in peakxus_motifs.items() if v.name > 100]
In [249]:
# for name, motif in peakxus_motifs.items():
#     if int(motif.name) < 100:
#         continue
#     motif.plotPWMInfo(figsize=(5, 1.2));
#     plt.ylim([0, 2])
#     sns.despine(top=True, right=True, bottom=True)
#     plt.title(name + f" ({motif.name})")

All motifs

In [250]:
all_patterns = chexmix_patterns + homer_macs_patterns + meme_patterns + bpnet_patterns
all_patterns = [p.resize(70) for p in all_patterns]
In [251]:
from basepair.exp.paper.fig4 import cluster_align_patterns
In [252]:
len(all_patterns)
Out[252]:
172
In [128]:
%tqdm_restart

In [253]:
# Cluster patterns
from basepair.exp.paper.fig4 import *
patterns = all_patterns
n_clusters = 20
sim_seq = similarity_matrix(patterns, track='seq_ic')

# cluster
iu = np.triu_indices(len(sim_seq), 1)
lm_seq = linkage(1 - sim_seq[iu], 'ward', optimal_ordering=True)

# determine clusters and the cluster order
cluster = cut_tree(lm_seq, n_clusters=n_clusters)[:, 0]
cluster_order = np.argsort(leaves_list(lm_seq))

# align patterns
aligned_patterns = align_clustered_patterns(patterns, cluster_order, cluster,
                                              align_track='seq_ic',
                                              metric='continousjaccard',
                                              # don't shit the major patterns
                                              # by more than 15 when aligning
                                              trials=20,
                                              max_shift=15)
100%|██████████| 172/172 [01:36<00:00,  1.78it/s]
Number of seqlets not provided, using random number of seqlets
100%|██████████| 172/172 [00:00<00:00, 295.16it/s]
In [220]:
from basepair.modisco.motif_clustering import to_colors
In [375]:
df_anno = pd.DataFrame(dict(method=[pn.split("/")[0] for pn in pnames],
                            tf=[pn.split("/")[1] for pn in pnames]),
                       index=pnames)
In [394]:
# Alternative code
import matplotlib.colors
df = df_anno
cat_cmap='RdBu_r'
import matplotlib.pyplot as plt
import seaborn as sns
cols = {}
for c in df.columns:
    categories = list(df[c].unique())
    if c == 'tf':
        # cmap_dict = {tf: matplotlib.colors.to_rgb(hex_color)
        #             for tf, hex_color in tf_colors.items()}
        cmap_dict = tf_colors
    else:
        cmap = sns.color_palette(cat_cmap, len(categories))
        cmap_dict = dict(zip(categories, cmap))
    cols[c] = df[c].map(cmap_dict)
row_colors = pd.DataFrame(cols)
In [443]:
c = 'method'
categories = list(df[c].unique())
sns.color_palette(cat_cmap, len(categories))
cmap_dict_method = dict(zip(list(df[c].unique()), cmap))
In [444]:
from basepair.plot.utils import plot_colormap
In [448]:
plot_colormap(cmap_dict_method, "Method")
plt.savefig(fdir / 'cmap.method.pdf')
In [449]:
plot_colormap({k:v for k,v in tf_colors.items() if k !='Esrrb'}, 'TF')
plt.savefig(fdir / 'cmap.TF.pdf')
In [451]:
pnames = [p.name for p in patterns]
df = pd.DataFrame(sim_seq, 
                  index=pnames,
                  columns=pnames)
cm = sns.clustermap(df, 
                    row_linkage=lm_seq,
                    col_linkage=lm_seq,
                    row_colors=row_colors,
                    cmap='Blues', figsize=get_figsize(1, 1))
plt.savefig(fdir / 'clustered-motifs.heatmap.pdf')
In [379]:
for p in aligned_patterns:
    p.plot("seq_ic");
    plt.ylim([0, 2])
    sns.despine(bottom=True, right=True, top=True)

Possible improvements

  • [ ] draw on the left-hand side the BPNet motifs?

Macs2 top 1000 peaks

# MEME
meme {input.top_peak_seqs} \
  -oc output/{wildcards.peak_caller}/{wildcards.tf}/MEME/{wildcards.run} \
  -mod zoops -dna -revcomp \
  -p {threads} \
  -nmotifs 15 -evt 0.01 -maxw 20
In [39]:
peakxus_motifs = flatten({tf: read_meme_motifs(comparison_dir / f'macs2/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=20/meme.txt')
                          for tf in tasks}, separator='/')
In [40]:
for name, motif in peakxus_motifs.items():
    if int(motif.name) < 100:
        continue
    motif.plotPWMInfo(figsize=(5, 1.5));
    plt.ylim([0, 2])
    sns.despine(top=True, right=True, bottom=True)
    plt.title(name + f" ({motif.name})")

Peakxus top 1000 peaks

In [46]:
peakxus_motifs = flatten({tf: read_meme_motifs(comparison_dir / f'peakxus/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=20/meme.txt')
                          for tf in tasks}, separator='/')
In [47]:
for name, motif in peakxus_motifs.items():
    if int(motif.name) < 100:
        continue
    motif.plotPWMInfo(figsize=(5, 1.2));
    plt.ylim([0, 2])
    sns.despine(top=True, right=True, bottom=True)
    plt.title(name + f" ({motif.name})")