Goal

  • write functions to load the data

Tasks

  • [x] load the results into dfi
  • [x] load the produced motifs
  • On the test chromosome, compare the peak recall with our approach
  • Run pairwise spacing of Nanog
  • Methods
    • BPNet / modisco motif instances
    • PWM approach based on Chexmix (or modisco motifs)
    • Chexmix
    • ? BPnet models for classification / ChIP-seq
  • [ ] Load all motifs from all the tools into a single matrix
    • cluster them

TODO

  • [x] run MEME for more motifs
  • [x] load new MEME results
  • [x] load HOMER motifs
  • [ ] get the pwm instances from chexmix
    • using peaks
    • not using peaks
      • run your own PWM scanning
  • [ ] run the following analysis
    • overlap chexmix and fimo peaks with our (non-overlapping) peaks
      • [X] Load ranges & remove overlapping intervals
      • [X] add example_idx
    • run pairwise spacing of Nanog (chexmix, fimo)
      • use differnet thresholds
    • on the test chromosome, compare the peak recall with our approach:
      • BPNet / modisco scores
      • PWM approach based on our PWM's
      • PWM's from ChExmix
      • (///) PWM's from MEME (e.g. FIMO)
In [100]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')

load the results into dfi

Peakxus

In [101]:
tf = 'Oct4'
In [102]:
from basepair.exp.paper.config import tasks
from basepair.preproc import interval_center
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac

def prepend(df, col, name, sep="/"):
    df[col] = name + sep + df[col]
    return df

dfi_peakxus = pd.concat([read_peakxus_dfi(f'output/peakxus/{tf}/all_transition_points.igv').assign(task=tf)
                         for tf in tasks], axis=0)

dfi_chexmix = pd.concat([pd_first_cols(read_chexmix_dfi(f"output/chexmix/{tf}/{tf}_experiment.events").
                                       pipe(prepend, col='pattern_name', name=tf),
                                       ['chrom', 'pattern_center_abs', 'pattern_name', 'profile_score', 'motif_score', 'strand'])
                         for tf in tasks], axis=0)
In [103]:
dfi_fimo = pd.concat([pd_first_cols(read_fimo_dfi(f"output/peakxus/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=40/FIMO/fimo.tsv").
                                    pipe(prepend, col='pattern_name', name=tf),
                                    ['chrom', 'pattern_center_abs', 'pattern_name', 'score', 'strand'])
                         for tf in tasks], axis=0)
In [104]:
# df = read_fimo_dfi(f"output/peakxus/{tf}/MEME/FIMO/fimo.tsv")

Load the ranges

In [105]:
from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'
In [106]:
from basepair.exp.paper.config import models_dir
In [107]:
model_dir = models_dir / exp
In [108]:
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5', default_imp_score=imp_score)
In [109]:
isf.cache() # load everything into memory
Out[109]:
<basepair.cli.imp_score.ImpScoreFile at 0x7f73fc244518>
In [110]:
ranges = isf.get_ranges()
In [111]:
ranges.columns = ['example_' + c for c in ranges.columns]
In [127]:
from basepair.exp.paper.comparison import dfint_overlap_idx, dfi_append_example_idx
In [114]:
# append example_idx + make pattern_start absolute
dfi_chexmix = dfi_append_example_idx(dfi_chexmix, ranges)
dfi_fimo = dfi_append_example_idx(dfi_fimo, ranges)
dfi_peakxus = dfi_append_example_idx(dfi_peakxus, ranges)
In [119]:
dfi_peakxus.pattern_center.plot.hist(100)
Out[119]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f74077a2f28>
In [122]:
dfi_chexmix.pattern_center.plot.hist(100)
Out[122]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f740779bef0>
In [61]:
dfi_chexmix_nanog = dfi_chexmix[dfi_chexmix.pattern_name == 'Nanog/1']
In [64]:
dfi_chexmix_nanog['pattern'] = 'Nanog'
In [70]:
dfi_chexmix.pattern_name.unique()
Out[70]:
array(['Oct4/2', 'Oct4/0', 'Oct4/1', 'Sox2/5', 'Sox2/0', 'Sox2/1', 'Sox2/4', 'Sox2/3', 'Sox2/2',
       'Nanog/0', 'Nanog/1', 'Klf4/2', 'Klf4/0', 'Klf4/1'], dtype=object)
In [73]:
dfi_filtered = dfi_chexmix
In [92]:
dfi_filtered['pattern'] = 'Nanog'
In [74]:
motif_pair = ['Nanog/1', 'Nanog/1']
In [77]:
dfa = dfi_filtered[dfi_filtered.pattern_name == motif_pair[0]]
dfb = dfi_filtered[dfi_filtered.pattern_name == motif_pair[1]]
In [80]:
dfa.head()
Out[80]:
chrom pattern_center_abs pattern_name profile_score motif_score strand #Point experiment_Sig experiment_Ctrl experiment_log2Fold experiment_log2P SubtypePoint Tau SubtypeName SubtypeSequence SubtypeMotifScore example_idx example_chrom example_start example_end example_strand example_interval_from_task pattern_center
57500 chr4 99636221 Nanog/1 8.108 0.0 99636221 chr4:99636221 18737.8 67.9 8.108 -Infinity chr4:99636221:- 0.52 Subtype1 NaN 0.0 388 chr4 99635681 99636681 . Oct4 540
57505 chr14 75505326 Nanog/1 7.653 0.0 75505326 chr14:75505321 15351.7 76.3 7.653 -Infinity chr14:75505326:- 0.67 Subtype1 NaN 0.0 3083 chr14 75504831 75505831 . Oct4 495
57506 chr4 144312954 Nanog/1 7.622 0.0 144312954 chr4:144312959 15202.1 77.1 7.622 -Infinity chr4:144312954:- 0.58 Subtype1 NaN 0.0 236 chr4 144312354 144313354 . Oct4 600
57517 chr3 122145642 Nanog/1 7.260 0.0 122145642 chr3:122145643 11135.9 72.7 7.260 -Infinity chr3:122145642:- 0.76 Subtype1 NaN 0.0 1 chr3 122145077 122146077 . Oct4 565
57518 chr11 62215296 Nanog/1 7.360 0.0 62215296 chr11:62215292 10777.6 65.6 7.360 -Infinity chr11:62215296:- 0.83 Subtype1 NaN 0.0 706 chr11 62214786 62215786 . Oct4 510
In [85]:
del dfa['SubtypeSequence']
In [86]:
del dfb['SubtypeSequence']
In [93]:
dfab = motif_pair_dfi(dfi_chexmix, ['Nanog/1', 'Nanog/1'])
In [96]:
dfab.head()
Out[96]:
chrom_x pattern_center_abs_x pattern_name_x profile_score_x motif_score_x strand_x #Point_x experiment_Sig_x experiment_Ctrl_x experiment_log2Fold_x experiment_log2P_x SubtypePoint_x Tau_x SubtypeName_x SubtypeSequence_x SubtypeMotifScore_x example_idx example_chrom_x example_start_x example_end_x example_strand_x example_interval_from_task_x pattern_center_x pattern_x chrom_y pattern_center_abs_y pattern_name_y profile_score_y motif_score_y strand_y #Point_y experiment_Sig_y experiment_Ctrl_y experiment_log2Fold_y experiment_log2P_y SubtypePoint_y Tau_y SubtypeName_y SubtypeSequence_y SubtypeMotifScore_y example_chrom_y example_start_y example_end_y example_strand_y example_interval_from_task_y pattern_center_y pattern_y center_diff strand_combination
2 chr14 75505326 Nanog/1 7.653 0.0 75505326 chr14:75505321 15351.7 76.3 7.653 -Infinity chr14:75505326:- 0.67 Subtype1 NaN 0.0 3083 chr14 75504831 75505831 . Oct4 495 Nanog chr14 75505333 Nanog/1 7.496 0.0 75505333 chr14:75505333 180.5 0.9 7.496 -124.559 chr14:75505333:+ 0.63 Subtype1 NaN 0.0 chr14 75504831 75505831 . Oct4 502 Nanog 7 7550532675505333
10 chr9 3002144 Nanog/1 9.188 0.0 3002144 chr9:3002144 10252.0 17.6 9.188 -Infinity chr9:3002144:- 0.98 Subtype1 NaN 0.0 0 chr9 3001633 3002633 . Oct4 511 Nanog chr9 3002145 Nanog/1 9.188 0.0 3002145 chr9:3002145 9368.8 16.1 9.188 -Infinity chr9:3002145:- 0.98 Subtype1 NaN 0.0 chr9 3001633 3002633 . Oct4 512 Nanog 1 30021443002145
11 chr9 3002144 Nanog/1 9.188 0.0 3002144 chr9:3002144 10252.0 17.6 9.188 -Infinity chr9:3002144:- 0.98 Subtype1 NaN 0.0 0 chr9 3001633 3002633 . Oct4 511 Nanog chr9 3002582 Nanog/1 7.209 0.0 3002582 chr9:3002582 2340.9 15.8 7.209 -Infinity chr9:3002582:- 1.00 Subtype1 NaN 0.0 chr9 3001633 3002633 . Oct4 949 Nanog 438 30021443002582
14 chr9 3002145 Nanog/1 9.188 0.0 3002145 chr9:3002145 9368.8 16.1 9.188 -Infinity chr9:3002145:- 0.98 Subtype1 NaN 0.0 0 chr9 3001633 3002633 . Oct4 512 Nanog chr9 3002582 Nanog/1 7.209 0.0 3002582 chr9:3002582 2340.9 15.8 7.209 -Infinity chr9:3002582:- 1.00 Subtype1 NaN 0.0 chr9 3001633 3002633 . Oct4 949 Nanog 437 30021453002582
23 chr18 69782002 Nanog/1 6.775 0.0 69782002 chr18:69782002 194.6 1.8 6.775 -129.415 chr18:69782002:+ 1.00 Subtype1 NaN 0.0 1526 chr18 69781867 69782867 . Oct4 135 Nanog chr18 69782367 Nanog/1 6.775 0.0 69782367 chr18:69782372 8533.0 77.9 6.775 -Infinity chr18:69782367:- 0.57 Subtype1 NaN 0.0 chr18 69781867 69782867 . Oct4 500 Nanog 365 6978200269782367
In [98]:
plotnine.options.figure_size = get_figsize(.5, aspect=2/10)
max_dist = 100
fig = (ggplot(aes(x='center_diff'), dfab[(dfab.center_diff <= max_dist)]) + 
 geom_vline(xintercept=10, alpha=0.1) + 
 geom_vline(xintercept=20, alpha=0.1) + 
 geom_vline(xintercept=30, alpha=0.1) + 
 geom_vline(xintercept=40, alpha=0.1) + 
 geom_histogram(bins=max_dist) + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([0, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle("Nanog<>Nanog") + 
 scale_fill_brewer(type='qual', palette=3))
# fig.save(figures_dir / 'nanog.spacing.pdf')
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:449: UserWarning: geom_histogram : Removed 2 rows containing missing values.
  self.data = self.geom.handle_na(self.data)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/guides/guides.py:198: UserWarning: Cannot generate legend for the 'fill' aesthetic. Make sure you have mapped a variable to it
  "variable to it".format(output))
Out[98]:
<ggplot: (-9223363278316330275)>

TODO

  • [ ] why is there no strand in chexmix?
  • [ ] Label things with a human-readable pattern_name (pattern=other name)
In [94]:
dfab
Out[94]:
chrom_x pattern_center_abs_x pattern_name_x profile_score_x motif_score_x strand_x #Point_x experiment_Sig_x experiment_Ctrl_x experiment_log2Fold_x experiment_log2P_x SubtypePoint_x Tau_x SubtypeName_x SubtypeSequence_x SubtypeMotifScore_x example_idx example_chrom_x example_start_x example_end_x example_strand_x example_interval_from_task_x pattern_center_x pattern_x chrom_y pattern_center_abs_y pattern_name_y profile_score_y motif_score_y strand_y #Point_y experiment_Sig_y experiment_Ctrl_y experiment_log2Fold_y experiment_log2P_y SubtypePoint_y Tau_y SubtypeName_y SubtypeSequence_y SubtypeMotifScore_y example_chrom_y example_start_y example_end_y example_strand_y example_interval_from_task_y pattern_center_y pattern_y center_diff strand_combination
2 chr14 75505326 Nanog/1 7.653 0.0 75505326 chr14:75505321 15351.7 76.3 7.653 -Infinity chr14:75505326:- 0.67 Subtype1 NaN 0.0 3083 chr14 75504831 75505831 . Oct4 495 Nanog chr14 75505333 Nanog/1 7.496 0.0 75505333 chr14:75505333 180.5 0.9 7.496 -124.559 chr14:75505333:+ 0.63 Subtype1 NaN 0.0 chr14 75504831 75505831 . Oct4 502 Nanog 7 7550532675505333
10 chr9 3002144 Nanog/1 9.188 0.0 3002144 chr9:3002144 10252.0 17.6 9.188 -Infinity chr9:3002144:- 0.98 Subtype1 NaN 0.0 0 chr9 3001633 3002633 . Oct4 511 Nanog chr9 3002145 Nanog/1 9.188 0.0 3002145 chr9:3002145 9368.8 16.1 9.188 -Infinity chr9:3002145:- 0.98 Subtype1 NaN 0.0 chr9 3001633 3002633 . Oct4 512 Nanog 1 30021443002145
11 chr9 3002144 Nanog/1 9.188 0.0 3002144 chr9:3002144 10252.0 17.6 9.188 -Infinity chr9:3002144:- 0.98 Subtype1 NaN 0.0 0 chr9 3001633 3002633 . Oct4 511 Nanog chr9 3002582 Nanog/1 7.209 0.0 3002582 chr9:3002582 2340.9 15.8 7.209 -Infinity chr9:3002582:- 1.00 Subtype1 NaN 0.0 chr9 3001633 3002633 . Oct4 949 Nanog 438 30021443002582
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
37015 chr17 20945660 Nanog/1 1.837 0.0 20945660 chr17:20945660 35.4 9.9 1.837 -6.644 chr17:20945660:- 1.00 Subtype1 NaN 0.0 24279 chr17 20945259 20946259 . Oct4 401 Nanog chr17 20945790 Nanog/1 2.031 0.0 20945790 chr17:20945792 35.2 8.6 2.031 -7.123 chr17:20945790:- 0.84 Subtype1 NaN 0.0 chr17 20945259 20946259 . Oct4 531 Nanog 130 2094566020945790
37083 chr12 40445780 Nanog/1 2.036 0.0 40445780 chr12:40445780 35.1 8.6 2.036 -7.124 chr12:40445780:- 1.00 Subtype1 NaN 0.0 90223 chr12 40445423 40446423 . Nanog 357 Nanog chr12 40445931 Nanog/1 1.977 0.0 40445931 chr12:40445931 33.6 8.5 1.977 -6.741 chr12:40445931:- 1.00 Subtype1 NaN 0.0 chr12 40445423 40446423 . Nanog 508 Nanog 151 4044578040445931
37477 chr15 66911289 Nanog/1 1.994 0.0 66911289 chr15:66911292 33.8 8.5 1.994 -6.736 chr15:66911289:- 0.79 Subtype1 NaN 0.0 8593 chr15 66910782 66911782 . Oct4 507 Nanog chr15 66911478 Nanog/1 1.966 0.0 66911478 chr15:66911478 33.5 8.6 1.966 -6.742 chr15:66911478:- 0.65 Subtype1 NaN 0.0 chr15 66910782 66911782 . Oct4 696 Nanog 189 6691128966911478

2838 rows × 49 columns

Fetch the importance scores at the motifs

  • subset to relevant motifs
In [14]:
len(dfi_chexmix)
Out[14]:
232168
In [15]:
len(dfi_fimo)
Out[15]:
193274
In [16]:
len(dfi_peakxus)
Out[16]:
199514

Chexmix pairwise plot

In [ ]:
dfi_chexmix.head()

Generate motif pairs

  • [ ] Get the motif pairs of interest -> label them for each
  • [ ] annotate the table
    • [ ] figure out which column to use
    • [ ] create all possible motif pairs
  • [ ] generate the motif pair table
  • [ ] plot pairwise spacing plots
  • [ ] store plots to gdrive:
    • comparison/spacing/<method>/...
In [9]:
from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs, motif_pair_dfi
In [10]:
# TODO - for each table, annotate the motifs with human-readable motifs
motifs_chexmix = OrderedDict([
    ("Oct4-Sox2", 'Oct4/1'),
    ("Sox2", 'Sox2/1'),
    ("Nanog", 'Nanog/1'),
    ("Klf4", 'Klf4/1'),
])

motifs_fimo = OrderedDict([
    ("Oct4-Sox2", 'Oct4/2'),
    ("Sox2", 'Sox2/1'),
    #("Nanog", 'Nanog/1'),
    ("Klf4", 'Klf4/1'),
])
In [11]:
motifs_bpnet = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    ("Oct4", 'Oct4/m0_p1'),
    # ("Strange-sym-motif", 'Oct4/m0_p5'),
    ("Sox2", 'Sox2/m0_p1'),
    ("Nanog", 'Nanog/m0_p1'),
    ("Zic3", 'Nanog/m0_p2'),
    ("Nanog-partner", 'Nanog/m0_p4'),
    ("Klf4", 'Klf4/m0_p0'),
])
In [12]:
pairs = get_motif_pairs(motifs_bpnet)

# ordered names
pair_names = ["<>".join(x) for x in pairs]
In [ ]:
# create motif pairs
dfab_w_te = pd.concat([motif_pair_dfi(dfi_subset, pair).assign(motif_pair="<>".join(pair)) for pair in pairs], axis=0)
In [259]:
plotnine.options.figure_size = get_figsize(.5, aspect=4)# (10, 10)
max_dist = 100
min_dist = 6
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab[(dfab.center_diff <= max_dist) & 
                                                                    (dfab.center_diff > min_dist)]) + 
 geom_histogram(bins=max_dist-min_dist+1) + 
 facet_grid("motif_pair~ .", scales='free_y') + 
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 # ylim([0, 1000]) +
 xlab("Pairwise distance") + 
 scale_fill_brewer(type='qual', palette=3))
# fig.save(fdir / 'histogram.center_diff.all.imp>.2,match>.0.pdf')
fig 
Out[259]:
<ggplot: (-9223363281172737115)>
In [267]:
# Generate all spacing plots
individual_plots = {}
for motif_pair_name in pair_names:
    plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
    max_dist = 100
    min_dist = 0
    df = dfab[(dfab.center_diff <= max_dist) & 
              (dfab.motif_pair == motif_pair_name)]
    if len(df) == 0:
        continue
    fig = (ggplot(aes(x='center_diff', fill='strand_combination'), df) + 
     # Spacing vertical lines
     geom_vline(xintercept=10, alpha=0.1) + 
     geom_vline(xintercept=20, alpha=0.1) + 
     geom_vline(xintercept=30, alpha=0.1) + 
     geom_vline(xintercept=40, alpha=0.1) + 
     # plot
     geom_histogram(bins=max_dist - min_dist + 1) + facet_grid("strand_combination~.") + 
     # Theme, labels, colors
     theme_classic(base_size=10, base_family='Arial') + 
     theme(strip_text = element_text(rotation=0), legend_position='top') + 
     xlim([min_dist, max_dist]) + 
     xlab("Pairwise distance") +
     ggtitle(motif_pair_name) + 
     scale_fill_brewer(type='qual', palette=3))
    
    # Save
    # fig.save(fdir_individual / f'{motif_pair_name}.pdf')
    individual_plots[motif_pair_name] = fig
In [113]:
# NOTE: this was the exact same code snippet as before
In [121]:
dfi_peakxus2.head()
Out[121]:
chromosome start end id signal p-value score FDR width pattern_center task chrom example_idx example_chrom example_start example_end example_strand example_interval_from_task pattern_center_abs
0 chr2 98667118 98667157 98667118.0-98667157.0 447986.0 0.0 1.0674e+12 0.0 39.0 NaN Oct4 chr2 -1 NaN NaN NaN NaN NaN 98667137
1 chr2 98662829 98662875 98662829.0-98662875.0 165444.0 0.0 1.8742e+11 0.0 46.0 NaN Oct4 chr2 -1 NaN NaN NaN NaN NaN 98662852
2 chr9 35305429 35305472 35305429.0-35305472.0 95560.0 0.0 2.4445e+10 0.0 43.0 NaN Oct4 chr9 -1 NaN NaN NaN NaN NaN 35305450
3 chr2 98662314 98662353 98662314.0-98662353.0 106645.0 0.0 2.0333e+10 0.0 39.0 NaN Oct4 chr2 -1 NaN NaN NaN NaN NaN 98662333
4 chr2 98666842 98666886 98666842.0-98666886.0 81642.0 0.0 1.8779e+10 0.0 44.0 NaN Oct4 chr2 -1 NaN NaN NaN NaN NaN 98666864

load the produced motifs

Chexmix

In [37]:
chexmix_motifs = flatten({tf: read_transfac(f"output/chexmix/{tf}/intermediate-results/{tf}.experiment.transfac")
                          for tf in tasks}, separator='/')
In [39]:
for name, motif in chexmix_motifs.items():
    motif.plotPWMInfo(figsize=(4, 1.2));
    plt.ylim([0, 2])
    sns.despine(top=True, right=True, bottom=True)
    plt.title(name)

Homer motifs

Peakxus peaks

In [24]:
from concise.utils.pwm import PWM, load_motif_db
In [29]:
homer_motifs = flatten({tf: {str(i+1): PWM(np.array(v) + 0.001, name=k.split("\t")[0])
                             for i, (k,v) in enumerate(load_motif_db(f"output/peakxus/{tf}/HOMER/len=12,size=200/homerMotifs.all.motifs").items())}
                        for tf in tasks}, separator='/')
In [34]:
for name, motif in homer_motifs.items():
    motif.plotPWMInfo(figsize=(4, 1.2));
    plt.ylim([0, 2])
    sns.despine(top=True, right=True, bottom=True)
    plt.title(name)

Macs2 peaks

In [40]:
homer_motifs = flatten({tf: {str(i+1): PWM(np.array(v) + 0.001, name=k.split("\t")[0])
                             for i, (k,v) in enumerate(load_motif_db(f"output/macs2/{tf}/HOMER/len=12,size=200/homerMotifs.all.motifs").items())}
                        for tf in tasks}, separator='/')
In [41]:
for name, motif in homer_motifs.items():
    motif.plotPWMInfo(figsize=(4, 1.2));
    plt.ylim([0, 2])
    sns.despine(top=True, right=True, bottom=True)
    plt.title(name)

MEME

Peakxus and Macs2 peaks (intersection)

Peakxus peaks overlapping

# Peakxus filter paramters
width_filter = [30, 60],
n_peaks = 1000,
seq_width = 200,

# MEME
meme {input.top_peak_seqs} \
  -oc output/{wildcards.peak_caller}/{wildcards.tf}/MEME/{wildcards.run} \
  -mod zoops -dna -revcomp \
  -p {threads} \
  -nmotifs 15 -evt 0.01 -maxw 20
In [35]:
peakxus_motifs = flatten({tf: read_meme_motifs(f'output/peakxus-in-macs2/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=20/meme.txt')
                          for tf in tasks}, separator='/')
In [36]:
for name, motif in peakxus_motifs.items():
    if int(motif.name) < 100:
        continue
    motif.plotPWMInfo(figsize=(5, 1.2));
    plt.ylim([0, 2])
    sns.despine(top=True, right=True, bottom=True)
    plt.title(name + f" ({motif.name})")

Macs2 top 1000 peaks

# MEME
meme {input.top_peak_seqs} \
  -oc output/{wildcards.peak_caller}/{wildcards.tf}/MEME/{wildcards.run} \
  -mod zoops -dna -revcomp \
  -p {threads} \
  -nmotifs 15 -evt 0.01 -maxw 20
In [42]:
peakxus_motifs = flatten({tf: read_meme_motifs(f'output/macs2/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=20/meme.txt')
                          for tf in tasks}, separator='/')
In [44]:
for name, motif in peakxus_motifs.items():
    if int(motif.name) < 100:
        continue
    motif.plotPWMInfo(figsize=(5, 1.5));
    plt.ylim([0, 2])
    sns.despine(top=True, right=True, bottom=True)
    plt.title(name + f" ({motif.name})")

Peakxus top 1000 peaks

In [48]:
peakxus_motifs = flatten({tf: read_meme_motifs(f'output/peakxus/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=20/meme.txt')
                          for tf in tasks}, separator='/')
In [49]:
for name, motif in peakxus_motifs.items():
    if int(motif.name) < 100:
        continue
    motif.plotPWMInfo(figsize=(5, 1.2));
    plt.ylim([0, 2])
    sns.despine(top=True, right=True, bottom=True)
    plt.title(name + f" ({motif.name})")

Unify the motif names

In [23]:
melanie_motifs = {"nanog": "Nanog",
                  "sox2": "Sox2",
                  "oct4": "Oct4-Sox2",
                  "klf4": "Klf4"}
In [26]:
dff = pd.concat([pd.read_csv(f"ftp://ftp.stowers.org/pub/mw2098/for_ziga/FIMO_results/{k}_top1000peaks_window201bp/fimo.tsv",
                             sep='\t', comment='#').assign(motif=v)
                 for k,v in melanie_motifs.items()])
In [27]:
dff
Out[27]:
motif_id motif_alt_id sequence_name start stop strand score p-value q-value matched_sequence motif
0 CAMHWGRGCCATCMA MEME-8 chr8 4765939 4765953 + 21.1184 7.8500e-10 0.338 CAACAGGGCCATCAA Nanog
1 CAMHWGRGCCATCMA MEME-8 chr16 22674029 22674043 + 21.1184 7.8500e-10 0.338 CAACAGGGCCATCAA Nanog
2 CAMHWGRGCCATCMA MEME-8 chr1 48169696 48169710 + 21.1184 7.8500e-10 0.338 caacagggccatcaa Nanog
... ... ... ... ... ... ... ... ... ... ... ...
96868 RCCMCACCCWRNBHH MEME-1 chr1 191707921 191707935 + 15.3770 3.0600e-06 0.167 CCCACACCCAGCCTA Klf4
96869 RCCMCACCCWRNBHH MEME-1 chr1 193035774 193035788 - 15.3770 3.0600e-06 0.167 ACCCCACCCCGCCGC Klf4
96870 RCCMCACCCWRNBHH MEME-1 chr1 195023826 195023840 + 15.3770 3.0600e-06 0.167 GCCCCACCCTTTGTC Klf4

390844 rows × 11 columns

In [119]:
dfi_peakxus.head()
Out[119]:
chromosome start end id signal p-value score FDR width pattern_center pattern_name
844054 chr2 98667118 98667157 98667118.0-98667157.0 447986.0 0.0 1.0674e+12 0.0 39.0 98667137 Oct4
844024 chr2 98662829 98662875 98662829.0-98662875.0 165444.0 0.0 1.8742e+11 0.0 46.0 98662852 Oct4
1569729 chr9 35305429 35305472 35305429.0-35305472.0 95560.0 0.0 2.4445e+10 0.0 43.0 35305450 Oct4
844014 chr2 98662314 98662353 98662314.0-98662353.0 106645.0 0.0 2.0333e+10 0.0 39.0 98662333 Oct4
844051 chr2 98666842 98666886 98666842.0-98666886.0 81642.0 0.0 1.8779e+10 0.0 44.0 98666864 Oct4
In [22]:
dfi.chromosome.unique()
Out[22]:
array(['chr2', 'chr9', 'chr14', 'chr6', 'chr4', 'chr12', 'chr15', 'chr3', 'chr13', 'chr19',
       'chr11', 'chr17', 'chr18', 'chr8', 'chr5', 'chr1', 'chr7', 'chr10', 'chr16'], dtype=object)
In [ ]:
dfi_columns = [
    'pattern_name',
    'pattern_short'.
    'pattern_start_abs',
    'pattern_end_abs',
    
]

Required columns for construct_motif_pairs

  • example_idx
  • pattern_name
  • pattern_center
  • strand

Chexmix

In [112]:
dfi['score'].plot.hist(50)
plt.xlabel("Score (experiment_log2Fold)");
In [113]:
dfi.SubtypeMotifScore.plot.hist(50);
plt.xlabel("Motif score");

Pretty plotting

In [409]:
from basepair.exp.chipnexus.motif_clustering import similarity_matrix
from basepair.modisco.motif_clustering import create_pattern_table, align_clustered_patterns
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, cut_tree, leaves_list

def cluster_align_patterns(patterns, n_clusters=9):
    # get the similarity matrix
    sim_seq = similarity_matrix(patterns, track='seq_ic')
    
    # cluster
    iu = np.triu_indices(len(sim_seq), 1)
    lm_seq = linkage(1-sim_seq[iu], 'ward', optimal_ordering=True)
    
    # determine clusters and the cluster order
    cluster = cut_tree(lm_seq, n_clusters=n_clusters)[:,0]
    cluster_order = np.argsort(leaves_list(lm_seq))

    # align patterns 
    patterns_clustered = align_clustered_patterns(patterns, cluster_order, cluster, 
                                                      align_track='seq_ic',
                                                      metric='continousjaccard',
                                                      # don't shit the major patterns 
                                                      # by more than 15 when aligning
                                                      trials=20,
                                                      max_shift=15)
    return patterns_clustered
In [410]:
short_patterns = [p for p in patterns if p.seq_info_content < 30]
In [411]:
short_patterns_clustered = cluster_align_patterns(short_patterns, n_clusters=len(short_patterns))
100%|██████████| 13/13 [00:00<00:00, 18.44it/s]
100%|██████████| 13/13 [00:00<00:00, 242.40it/s]
In [412]:
pinfo = []
for p in short_patterns_clustered:
    d = {}
    d['name'] = p.name
    d['n_seqlets'] = p.attrs['n_seqlets']
    d['TF'] = p.attrs['TF']
    
    d_footprint = {t + "_max": p.profile[t].max()
                   for t in tasks}
    d = {**d, **d_footprint}
    pinfo.append(d)
df_info = pd.DataFrame(pinfo)
In [414]:
from basepair.plot.tracks import plot_track, plot_tracks
from concise.utils.plot import seqlogo
In [417]:
# fp_slice = slice(25, 175)
fp_slice = slice(10, 190)

max_vals = {t: df_info.max()[t + "_max"] for t in tasks}

fig, axes = plt.subplots(len(short_patterns_clustered), 6, figsize=get_figsize(2, aspect=1))
fig.subplots_adjust(hspace=0, wspace=0)
for i, p in enumerate(short_patterns_clustered):
    
    
    # p = p.trim_seq_ic(0.3)  # typical cut-offf was 0.08
    ax = axes[i, 0]
    p = p.trim(24, 41)
    seqlogo(p.get_seq_ic(), ax=ax)
    
    ax.set_ylim([0, 2])  # all plots have the same shape
    strip_axis(ax)
    ax.axison = False
    pos1 = ax.get_position() # get the original position 
    pos2 = [pos1.x0, pos1.y0 + pos1.height * 0.4,  pos1.width*1.2, pos1.height * .5] 
    ax.set_position(pos2) # set a new position

    ax.text(22, 1, p.attrs["n_seqlets"], fontsize=12, horizontalalignment='right')
    ax.text(28, 1, p.attrs["TF"], fontsize=12, horizontalalignment='right')
    
    ax_empty = axes[i, 1]
    strip_axis(ax_empty)
    ax_empty.axison = False
    
    for j, task in enumerate(tasks):
        ax = axes[i, 2 + j]
        fp = p.profile[task]
        
        ax.plot(fp[fp_slice, 0], color=tf_colors[task])
        ax.plot(-fp[fp_slice, 1], color=tf_colors[task], alpha=0.5)  # negative strand
        
        ax.set_ylim([-max_vals[task], max_vals[task]])  # all plots have the same shape
        strip_axis(ax)
        ax.axison = False
        
        if i == 0:
            ax.set_title(task)
fig.savefig(figures / 'per-tf.short-patterns.pdf')
fig.savefig(figures / 'per-tf.short-patterns.png')