In [26]:
from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    ("Oct4", 'Oct4/m0_p1'),
    # ("Strange-sym-motif", 'Oct4/m0_p5'),
    ("Sox2", 'Sox2/m0_p1'),
    ("Nanog", 'Nanog/m0_p1'),
    ("Zic3", 'Nanog/m0_p2'),
    ("Nanog-partner", 'Nanog/m0_p4'),
    ("Klf4", 'Klf4/m0_p0'),
])

motifs_chexmix = OrderedDict([
    ("Oct4-Sox2", 'Oct4/1'),
    ("Sox2", 'Sox2/1'),
    ("Nanog", 'Nanog/1'),
    ("Klf4", 'Klf4/1'),
])

main_motifs = ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']

Goal

  • compare the profiles of top-ranked sites (according to sequence)
  • compare the profiles of weak affinity sites
  • compare the spacing between motifs

Conclusions

  • Weak affinity sites have stronger footprints when they are highlighted as important
  • Potential issue: the important poor matches might actually come from other motifs (e.g. Sox2 might have Oct4 flanks etc)
  • Nanog-Nanog 10bp spacing can be only seen with our motif instances
  • 63.5% of Chexmix peaks (138,098) are in our MACS2 peaks (98,428) and 74.1% of Peakxus peaks (175,259)

TODO

  • What fraction of the peaks can we explain? E.g. compute the overlap between

    • peakxus peaks
    • seqlets (un-clustered)

      • Q: How to get this number?

        • Can we take just modisco's unclustered seqlet locations?
        • There could be a possible problem when excluding seqlets...

          • we could also tighten the bin-size to say 10nt and then call peaks
        • What are the two nearest peakxus peaks?

    • seqlets (clustered)
    • Q: What's the right cut-off?
In [2]:
# Imports
from basepair.imports import *
from basepair.exp.paper.config import tasks
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix, dfi_filter_valid, dfi_add_ranges, annotate_profile
from basepair.cli.imp_score import ImpScoreFile
from basepair.modisco.core import Pattern
from basepair.preproc import dfint_no_intersection
hv.extension('bokeh')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
paper_config()

interval_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']
Using TensorFlow backend.

Peakxus/MEME and chexmix

  1. Load peakxus
    • dfi
  2. Load chexmix
    • dfi
    • motifs
  3. Load MEME
    • dfi (fimo)
    • motifs
In [3]:
# load all the required data
# mr = ModiscoResult(modisco_dir / "modisco.h5")
# mr.open()

# imp_scores = ImpScoreFile.from_modisco_dir(modisco_dir)
# imp_scores.cache()  # load all the scores
# ranges = imp_scores.get_ranges()
# ranges.columns = ["example_" + v for v in ranges.columns]
# seqs = imp_scores.get_seq()
# profiles = imp_scores.get_profiles()
In [4]:
from basepair.exp.paper.config import models_dir
In [5]:
# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/modisco/{exp}/spacing/')
fdir_individual = fdir / 'individual'
fdir_individual_sim = fdir / 'individual-simulation'
In [6]:
%config InlineBackend.figure_format = 'retina'
In [7]:
paper_config()
In [8]:
# Common paths
# output_dir = modisco_dir / "pwm_comparison"  # this notebook's output directory
# output_dir.mkdir(exist_ok=True)
figures_dir = Path(f"{ddir}/figures/modisco/{exp}/pwm_comparison")
figures_dir.mkdir(exist_ok=True)
In [9]:
profile_width = 70
seqlen = 1000
In [10]:
instance_parq_paths = {t: model_dir / f'deeplift/{t}/out/{imp_score}/instances.parq' 
                       for t in tasks}
In [11]:
from basepair.modisco.pattern_instances import (multiple_load_instances, load_instances, filter_nonoverlapping_intervals, 
                                                plot_coocurence_matrix, align_instance_center, dfi2seqlets, annotate_profile)
In [ ]:
dfi = multiple_load_instances(instance_parq_paths, motifs)
  0%|          | 0/4 [00:00<?, ?it/s]
In [ ]:
# Subset the motifs
dfi_subset = dfi.query('match_weighted_p > .2').query('imp_weighted_p > .01')
In [ ]:
# append example_idx + make pattern_start absolute
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5', default_imp_score=imp_score)
isf.cache() # load everything into memory
ranges = isf.get_ranges()
ranges.columns = ['example_' + c for c in ranges.columns]
seqs = isf.get_seq()
profiles = isf.get_profiles()
In [ ]:
# # get transposable element locations
# te_patterns = pd.read_csv(modisco_dir / "motif_clustering/patterns_long.csv").pattern.map(longer_pattern).unique()
# dfi_te = load_instances(dfi_full[dfi_full.pattern.isin(te_patterns)], None)
# dfi_te = dfi_te[(dfi_te.match_weighted_p > 0.1) & (dfi_te.seq_match > 20)]
In [12]:
mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
                           for t in tasks})
In [ ]:
# Exclude TEs
def shorten_te_pattern(s):
    tf, p = s.split("/", 1)
    return tf + "/" + shorten_pattern(p)

motifs_te = [p.name
         for p in mr.get_all_patterns()
         if p.seq_info_content > 30 and mr.n_seqlets(p.name) > 100]
motifs_te_d = OrderedDict([(shorten_te_pattern(x), shorten_te_pattern(x)) for x in motifs_te])

# # get transposable element locations
dfi_te = multiple_load_instances(instance_parq_paths, motifs_te_d)
dfi_te = dfi_te[(dfi_te.match_weighted_p > 0.1) & (dfi_te.seq_match > 20)]
In [ ]:
from basepair.exp.paper.config import tasks
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac

def prepend(df, col, name, sep="/"):
    df[col] = name + sep + df[col]
    return df

peakxus_motifs = flatten({tf: read_meme_motifs(f'../../chipnexus/comparison/output/peakxus/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=40/meme.txt')
                          for tf in tasks}, separator='/')

chexmix_motifs = flatten({tf: read_transfac(f"../../chipnexus/comparison/output/chexmix/{tf}/intermediate-results/{tf}.experiment.transfac")
                          for tf in tasks}, separator='/')

from basepair.modisco.core import Pattern
chexmix_patterns = {k: Pattern(v, chexmix_motifs[v].pwm, dict(), dict()) 
                    for k,v in motifs_chexmix.items()}
In [ ]:
from basepair.exp.paper.config import tasks
from basepair.preproc import interval_center
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac

def prepend(df, col, name, sep="/"):
    df[col] = name + sep + df[col]
    return df

dfi_peakxus = pd.concat([read_peakxus_dfi(f'../../chipnexus/comparison/output/peakxus/{tf}/all_transition_points.igv').assign(task=tf)
                         for tf in tasks], axis=0)

dfi_chexmix = pd.concat([pd_first_cols(read_chexmix_dfi(f"../../chipnexus/comparison/output/chexmix/{tf}/{tf}_experiment.events").
                                       pipe(prepend, col='pattern_name', name=tf),
                                       ['chrom', 'pattern_center_abs', 'pattern_name', 'profile_score', 'motif_score', 'strand'])
                         for tf in tasks], axis=0)
In [2]:
dfi_append_example_idx??
Signature: dfi_append_example_idx(dfa, ranges)
Source:   
def dfi_append_example_idx(dfa, ranges):
    """
    Args:
      dfa [pd.DataFrame]: containes columns: chrom, pattern_center_abs
      ranges [pd.DataFrame]: containes columns: `example_chrom`, `example_start`, `example_end`

    Note: This assumes that dfa will contain the following 
    """
    dfa = dfa.copy()
    example_idx = dfint_overlap_idx(dfa[['chrom',
                                         'pattern_center_abs',
                                         'pattern_center_abs']],
                                    ranges[['example_chrom',
                                            'example_start',
                                            'example_end']])

    dfa['example_idx'] = example_idx
    dfa = dfa[dfa['example_idx'] != -1]
    dfa = pd.merge(dfa, ranges, on='example_idx', how='left')

    # Add the pattern center
    dfa['pattern_center'] = (dfa['pattern_center_abs'] - dfa['example_start']).astype(int)
    return dfa
File:      ~/workspace/bpnet-manuscript/basepair/exp/paper/comparison.py
Type:      function
In [ ]:
# append `example_idx` and `pattern_center`
from basepair.exp.paper.comparison import dfint_overlap_idx, dfi_append_example_idx
dfi_chexmix = dfi_append_example_idx(dfi_chexmix, ranges)
# dfi_fimo = dfi_append_example_idx(dfi_fimo, ranges)
dfi_peakxus = dfi_append_example_idx(dfi_peakxus, ranges)
In [ ]:
# Subset to only the main patterns
motif_chemix_inv = {v:k for k,v in motifs_chexmix.items()}
dfi_chexmix['pattern'] = dfi_chexmix['pattern_name']
dfi_chexmix['pattern_name'] = dfi_chexmix['pattern'].map(motif_chemix_inv)
dfi_chexmix = dfi_chexmix[~dfi_chexmix.pattern_name.isnull()]
assert not dfi_chexmix.SubtypeSequence.isnull().any()

# Add pattern_start and pattern_end
lens = dfi_chexmix['SubtypeSequence'].str.len()
assert (lens.mod(2) == 0).all()
neg_strand = dfi_chexmix.strand == "-"
dfi_chexmix['pattern_start'] = dfi_chexmix['pattern_center'] - lens // 2 - neg_strand
dfi_chexmix['pattern_end'] = dfi_chexmix['pattern_center'] + lens // 2 - neg_strand

In [6]:
from concise.utils.fasta import read_fasta
In [207]:
fa17 = read_fasta("/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/data/fastas/chr17.fa")
In [210]:
c17b = fa17['chr17']
In [94]:
seqs = encodeDNA(list(dfi_chexmix[dfi_chexmix.pattern_name == 'Nanog'].SubtypeSequence.str.upper()))
In [191]:
from concise.utils.pwm import PWM
In [96]:
PWM(seqs.mean(0)).plotPWMInfo();
In [7]:
from genomelake.extractors import FastaExtractor
from basepair.preproc import parse_interval
In [8]:
from basepair.exp.paper.config import fasta_file
In [9]:
from pyfaidx import Fasta
In [166]:
fa = Fasta(fasta_file)
In [10]:
from basepair.preproc import rc_seq
In [229]:
p.plot('seq_ic');
In [ ]:
fe = FastaExtractor(fasta_file, use_strand=Tr)
In [151]:
dfi_chexmix['pattern_end'] - dfi_chexmix['pattern_start']
Out[151]:
0         8
1         8
2         8
         ..
176330    8
176331    8
176332    8
Length: 169377, dtype: int64

TODO

  • double check the PWM

Modisco data

  1. Load instances
  2. Load profile + importance scores
  3. Get TE instances
  4. Get patterns

TODO

  • do we have to exclude TEs?
In [97]:
# Get rows without intersecting transposable elements
non_te_idx = dfint_no_intersection(dfi_subset[interval_cols], dfi_te[interval_cols])
print(f"Not overlapping te's: {non_te_idx.mean()}")

dfi_subset['is_te'] = True
dfi_subset.loc[non_te_idx, 'is_te'] = True
Not overlapping te's: 0.9453173320023004
In [ ]:
dfi_subset

Considered motifs

In [13]:
# plot them
patterns = {m: mr.get_pattern(longer_pattern(p)).trim_seq_ic(0.08)
            for m, p in motifs.items()}
for tf, p in patterns.items():
    p.plot("seq");
    plt.title(tf)
TF-MoDISco is using the TensorFlow backend.
In [14]:
def dfint_intersect_name(dfa, dfb):
    """Provide the name of the interval b overlapping a

    Args:
      dfa,dfb: each data frame has to contain three columns
        with the following entries: chr, start and end
        
    Returns:
      dfa with 
    """
    from pybedtools import BedTool
    assert len(dfa.columns) == 3
    assert len(dfb.columns) == 4
    dfa = dfa.copy()
    dfa['id'] = np.arange(len(dfa))
    bta = BedTool.from_dataframe(dfa)
    btb = BedTool.from_dataframe(dfb)
    df_int = bta.intersect(btb, wa=True, loj=True).to_dataframe()
    # return df_int
    intersected_id = df_int.name
    other_id = df_int.thickEnd
    keep = ~intersected_id.duplicated()
    return pd.Series(other_id[keep], index=intersected_id[keep])

TODO

  • annotate instances with the overlapping idx
In [159]:
len(dfi_chexmix)
Out[159]:
169377

Support the following operations:

  • dfi_filter_valid
    • pattern_center
  • extract_dfi (dfi_row2seqlet)
    • example_idx
    • pattern_start
    • pattern_end
    • pattern
    • strand

Extract profiles and sequences

Importance score instances

In [ ]:
dfis = dfi.query("imp_weighted_p > 0")
dfis = dfi_filter_valid(dfis, profile_width, seqlen=seqlen)
ps_motifs = {m: isf.extract_dfi(dfis[dfis.pattern_name == m], profile_width=profile_width, verbose=True) for m in motifs}
assert sum([len(x) for x in ps_motifs.values()]) == len(dfis)
In [94]:
sum([len(x) for x in ps_motifs.values()])
Out[94]:
2357838
In [95]:
len(dfis)
Out[95]:
2357838

Chemix instances

In [ ]:
dfi_chexmix
In [116]:
dfi_chexmix.head()
Out[116]:
chrom pattern_center_abs pattern_name profile_score motif_score strand #Point experiment_Sig experiment_Ctrl experiment_log2Fold experiment_log2P SubtypePoint Tau SubtypeName SubtypeSequence SubtypeMotifScore pattern example_idx example_chrom example_start example_end example_strand example_interval_from_task pattern_center
369 chr14 51063900 Oct4-Sox2 4.691 0.0 - chr14:51063900 1237.4 47.9 4.691 -676.423 chr14:51063900:- 1.0 Subtype1 NaN 0.0 Oct4/1 166 chr14 51063428 51064428 . Oct4 472
522 chr13 23401109 Oct4-Sox2 5.220 0.0 - chr13:23401109 1104.8 29.6 5.220 -650.108 chr13:23401109:- 1.0 Subtype1 NaN 0.0 Oct4/1 15 chr13 23400367 23401367 . Oct4 742
544 chr6 128803128 Oct4-Sox2 4.740 0.0 - chr6:128803128 1091.9 40.9 4.740 -601.090 chr6:128803128:- 1.0 Subtype1 NaN 0.0 Oct4/1 173 chr6 128802523 128803523 . Oct4 605
677 chr3 96497807 Oct4-Sox2 4.705 0.0 - chr3:96497807 1013.5 38.8 4.705 -554.894 chr3:96497807:- 1.0 Subtype1 NaN 0.0 Oct4/1 290 chr3 96497319 96498319 . Oct4 488
714 chr13 66711229 Oct4-Sox2 5.902 0.0 + chr13:66711229 992.8 16.6 5.902 -625.607 chr13:66711229:+ 1.0 Subtype1 NaN 0.0 Oct4/1 36 chr13 66710696 66711696 . Oct4 533
In [ ]:
dfi_chexmix = dfi_filter_valid(dfi_chexmix, profile_width, seqlen=seqlen)
ps_chexmix_motifs = {m: isf.extract_dfi(dfi_chexmix[dfi_chexmix.pattern_name == m], 
                                        profile_width=profile_width, 
                                        verbose=True) 
                     for m in motifs_chexmix}
assert sum([len(x) for x in ps_chexmix_motifs.values()]) == len(dfi_chexmix)
In [228]:
p = ps_chexmix_motifs['Nanog'][ps_chexmix_motifs['Nanog'].dfi.strand == '+'].aggregate()
In [230]:
p2 = ps_chexmix_motifs['Nanog'][ps_chexmix_motifs['Nanog'].dfi.strand == '-'].aggregate()
In [230]:
p2 = ps_chexmix_motifs['Nanog'][ps_chexmix_motifs['Nanog'].dfi.strand == '-'].aggregate()
In [127]:
a=1
In [246]:
p2 = ps_chexmix_motifs['Oct4-Sox2'][ps_chexmix_motifs['Oct4-Sox2'].dfi.strand == '-'].aggregate()
In [100]:
PWM(encodeDNA(list(ps_chexmix_motifs['Oct4-Sox2'].dfi.SubtypeSequence.str.upper())).mean(0)).plotPWMInfo();
In [102]:
ps_chexmix_motifs['Oct4-Sox2'][ps_chexmix_motifs['Oct4-Sox2'].dfi.strand == '+'].aggregate().plot("seq_ic");
In [103]:
ps_chexmix_motifs['Oct4-Sox2'][ps_chexmix_motifs['Oct4-Sox2'].dfi.strand == '+'].aggregate().plot("profile");
In [104]:
for p in chexmix_patterns.values():
    p.plot("seq_ic");
In [226]:
p.plot('profile', rotate_y=0);

Sequence (PWM) instances

In [74]:
%tqdm_restart

In [126]:
# sequences
p_seq = {m: isf.extract_dfi(dfi_filter_valid(dfi_add_ranges(chexmix_patterns[m].get_instances_seq_scan(seqs), ranges, dedup=True),
                                                    profile_width=profile_width, seqlen=seqlen), 
                                   profile_width=profile_width, verbose=True)
         for m in motifs_chexmix}
dfi_seq = pd.concat([p_seq[m].dfi.assign(motif=m) for m in motifs_chexmix])
100%|██████████| 147974/147974 [00:26<00:00, 5501.97it/s]
100%|██████████| 147974/147974 [00:25<00:00, 5811.92it/s]
  0%|          | 226/562241 [00:00<04:09, 2254.67it/s]
number of de-duplicated instances: 411963 (38.622388184237835%)
100%|██████████| 562241/562241 [04:11<00:00, 2236.06it/s]
100%|██████████| 147974/147974 [00:22<00:00, 6648.76it/s]
100%|██████████| 147974/147974 [00:21<00:00, 6859.68it/s]
number of de-duplicated instances: 859231 (38.07061773906263%)
100%|██████████| 1195990/1195990 [08:46<00:00, 2272.78it/s]
100%|██████████| 147974/147974 [00:38<00:00, 3842.29it/s]
100%|██████████| 147974/147974 [00:22<00:00, 6444.03it/s]
number of de-duplicated instances: 841896 (37.186810390977946%)
100%|██████████| 1223338/1223338 [09:09<00:00, 2226.05it/s]
100%|██████████| 147974/147974 [00:42<00:00, 3500.47it/s]
100%|██████████| 147974/147974 [00:25<00:00, 5803.21it/s]
number of de-duplicated instances: 992210 (36.51286180378049%)
100%|██████████| 1510955/1510955 [11:15<00:00, 2235.79it/s]

Annotate dfi and

In [ ]:
# # exclude locations overlaping TE's
# keep_nonte = dfint_no_intersection(dfis[interval_cols], dfi_te[interval_cols])
# dfis['is_te'] = ~keep_nonte
# print("not overlapping TE", keep_nonte.mean())
# dfis = dfis[keep_nonte]
In [138]:
from basepair.modisco.pattern_instances import profile_features
In [ ]:
dfis = annotate_profile(dfis, mr, profiles, profile_width=70, trim_frac=0.08)
  0%|          | 0/7 [00:00<?, ?it/s]
In [ ]:
dfi_chexmix = annotate_profile(dfi_chexmix, mr, profiles, profile_width=70, trim_frac=0.08,
                               pattern_map={v:motifs[k] for k,v in motifs_chexmix.items()})
  0%|          | 0/4 [00:00<?, ?it/s]
In [ ]:
dfi_seq = annotate_profile(dfi_seq, mr, profiles, profile_width=70, trim_frac=0.08,
                           pattern_map={v:motifs[k] for k,v in motifs_chexmix.items()})

Save

In [15]:
# Store the results
chexmix_output_dir = Path("../../chipnexus/comparison/output/chexmix/")
In [ ]:
# save the instances
dfi_seq.to_csv(chexmix_output_dir / "pwm_instances.csv.gz", index=False, compression='gzip')

write_pkl(p_seq, chexmix_output_dir / 'pwm_instances.stacked_seqlets.pkl')
In [ ]:
dfi_chexmix.to_csv(chexmix_output_dir / "chexmix.csv.gz", index=False, compression='gzip')
write_pkl(ps_chexmix_motifs, chexmix_output_dir / 'chexmix.stacked_seqlets.pkl')
In [ ]:
!mkdir {model_dir}/benchmark
In [ ]:
dfis.to_csv(model_dir / "benchmark/instances.csv.gz", index=False, compression='gzip')
write_pkl(ps_motifs, model_dir / 'benchmark/instances.stacked_seqlets.pkl')
In [165]:
!du -sh {chexmix_output_dir}/*
516M	../../chipnexus/comparison/output/chexmix/Klf4
454M	../../chipnexus/comparison/output/chexmix/Nanog
241M	../../chipnexus/comparison/output/chexmix/Oct4
100M	../../chipnexus/comparison/output/chexmix/Sox2
12G	../../chipnexus/comparison/output/chexmix/benchmark
28M	../../chipnexus/comparison/output/chexmix/chexmix.csv.gz
884M	../../chipnexus/comparison/output/chexmix/chexmix.stacked_seqlets.pkl
517M	../../chipnexus/comparison/output/chexmix/pwm_instances.csv.gz
22G	../../chipnexus/comparison/output/chexmix/pwm_instances.stacked_seqlets.pkl