from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    ("Oct4", 'Oct4/m0_p1'),
    # ("Strange-sym-motif", 'Oct4/m0_p5'),
    ("Sox2", 'Sox2/m0_p1'),
    ("Nanog", 'Nanog/m0_p1'),
    ("Zic3", 'Nanog/m0_p2'),
    ("Nanog-partner", 'Nanog/m0_p4'),
    ("Klf4", 'Klf4/m0_p0'),
])

motifs_chexmix = OrderedDict([
    ("Oct4-Sox2", 'Oct4/1'),
    ("Sox2", 'Sox2/1'),
    ("Nanog", 'Nanog/1'),
    ("Klf4", 'Klf4/1'),
])

main_motifs = ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']

Goal¶

compare the profiles of top-ranked sites (according to sequence)
compare the profiles of weak affinity sites
compare the spacing between motifs

Conclusions¶

Weak affinity sites have stronger footprints when they are highlighted as important
Potential issue: the important poor matches might actually come from other motifs (e.g. Sox2 might have Oct4 flanks etc)
Nanog-Nanog 10bp spacing can be only seen with our motif instances
63.5% of Chexmix peaks (138,098) are in our MACS2 peaks (98,428) and 74.1% of Peakxus peaks (175,259)

TODO¶

What fraction of the peaks can we explain? E.g. compute the overlap between
- peakxus peaks
- seqlets (un-clustered)
  - Q: How to get this number?
    - Can we take just modisco's unclustered seqlet locations?
    - There could be a possible problem when excluding seqlets...
      - we could also tighten the bin-size to say 10nt and then call peaks
    - What are the two nearest peakxus peaks?
- seqlets (clustered)
- Q: What's the right cut-off?

# Imports
from basepair.imports import *
from basepair.exp.paper.config import tasks
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix, dfi_filter_valid, dfi_add_ranges, annotate_profile
from basepair.cli.imp_score import ImpScoreFile
from basepair.modisco.core import Pattern
from basepair.preproc import dfint_no_intersection
hv.extension('bokeh')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
paper_config()

interval_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']

Using TensorFlow backend.

Peakxus/MEME and chexmix¶

Load peakxus
- dfi
Load chexmix
- dfi
- motifs
Load MEME
- dfi (fimo)
- motifs

# load all the required data
# mr = ModiscoResult(modisco_dir / "modisco.h5")
# mr.open()

# imp_scores = ImpScoreFile.from_modisco_dir(modisco_dir)
# imp_scores.cache()  # load all the scores
# ranges = imp_scores.get_ranges()
# ranges.columns = ["example_" + v for v in ranges.columns]
# seqs = imp_scores.get_seq()
# profiles = imp_scores.get_profiles()

from basepair.exp.paper.config import models_dir

# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/modisco/{exp}/spacing/')
fdir_individual = fdir / 'individual'
fdir_individual_sim = fdir / 'individual-simulation'

%config InlineBackend.figure_format = 'retina'

paper_config()

# Common paths
# output_dir = modisco_dir / "pwm_comparison"  # this notebook's output directory
# output_dir.mkdir(exist_ok=True)
figures_dir = Path(f"{ddir}/figures/modisco/{exp}/pwm_comparison")
figures_dir.mkdir(exist_ok=True)

profile_width = 70
seqlen = 1000

instance_parq_paths = {t: model_dir / f'deeplift/{t}/out/{imp_score}/instances.parq' 
                       for t in tasks}

from basepair.modisco.pattern_instances import (multiple_load_instances, load_instances, filter_nonoverlapping_intervals, 
                                                plot_coocurence_matrix, align_instance_center, dfi2seqlets, annotate_profile)

dfi = multiple_load_instances(instance_parq_paths, motifs)

  0%|          | 0/4 [00:00<?, ?it/s]

# Subset the motifs
dfi_subset = dfi.query('match_weighted_p > .2').query('imp_weighted_p > .01')

# append example_idx + make pattern_start absolute
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5', default_imp_score=imp_score)
isf.cache() # load everything into memory
ranges = isf.get_ranges()
ranges.columns = ['example_' + c for c in ranges.columns]
seqs = isf.get_seq()
profiles = isf.get_profiles()

# # get transposable element locations
# te_patterns = pd.read_csv(modisco_dir / "motif_clustering/patterns_long.csv").pattern.map(longer_pattern).unique()
# dfi_te = load_instances(dfi_full[dfi_full.pattern.isin(te_patterns)], None)
# dfi_te = dfi_te[(dfi_te.match_weighted_p > 0.1) & (dfi_te.seq_match > 20)]

mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
                           for t in tasks})

# Exclude TEs
def shorten_te_pattern(s):
    tf, p = s.split("/", 1)
    return tf + "/" + shorten_pattern(p)

motifs_te = [p.name
         for p in mr.get_all_patterns()
         if p.seq_info_content > 30 and mr.n_seqlets(p.name) > 100]
motifs_te_d = OrderedDict([(shorten_te_pattern(x), shorten_te_pattern(x)) for x in motifs_te])

# # get transposable element locations
dfi_te = multiple_load_instances(instance_parq_paths, motifs_te_d)
dfi_te = dfi_te[(dfi_te.match_weighted_p > 0.1) & (dfi_te.seq_match > 20)]

from basepair.exp.paper.config import tasks
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac

def prepend(df, col, name, sep="/"):
    df[col] = name + sep + df[col]
    return df

peakxus_motifs = flatten({tf: read_meme_motifs(f'../../chipnexus/comparison/output/peakxus/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=40/meme.txt')
                          for tf in tasks}, separator='/')

chexmix_motifs = flatten({tf: read_transfac(f"../../chipnexus/comparison/output/chexmix/{tf}/intermediate-results/{tf}.experiment.transfac")
                          for tf in tasks}, separator='/')

from basepair.modisco.core import Pattern
chexmix_patterns = {k: Pattern(v, chexmix_motifs[v].pwm, dict(), dict()) 
                    for k,v in motifs_chexmix.items()}

from basepair.exp.paper.config import tasks
from basepair.preproc import interval_center
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac

def prepend(df, col, name, sep="/"):
    df[col] = name + sep + df[col]
    return df

dfi_peakxus = pd.concat([read_peakxus_dfi(f'../../chipnexus/comparison/output/peakxus/{tf}/all_transition_points.igv').assign(task=tf)
                         for tf in tasks], axis=0)

dfi_chexmix = pd.concat([pd_first_cols(read_chexmix_dfi(f"../../chipnexus/comparison/output/chexmix/{tf}/{tf}_experiment.events").
                                       pipe(prepend, col='pattern_name', name=tf),
                                       ['chrom', 'pattern_center_abs', 'pattern_name', 'profile_score', 'motif_score', 'strand'])
                         for tf in tasks], axis=0)

dfi_append_example_idx??

Signature: dfi_append_example_idx(dfa, ranges)
Source:   
def dfi_append_example_idx(dfa, ranges):
    """
    Args:
      dfa [pd.DataFrame]: containes columns: chrom, pattern_center_abs
      ranges [pd.DataFrame]: containes columns: `example_chrom`, `example_start`, `example_end`

    Note: This assumes that dfa will contain the following 
    """
    dfa = dfa.copy()
    example_idx = dfint_overlap_idx(dfa[['chrom',
                                         'pattern_center_abs',
                                         'pattern_center_abs']],
                                    ranges[['example_chrom',
                                            'example_start',
                                            'example_end']])

    dfa['example_idx'] = example_idx
    dfa = dfa[dfa['example_idx'] != -1]
    dfa = pd.merge(dfa, ranges, on='example_idx', how='left')

    # Add the pattern center
    dfa['pattern_center'] = (dfa['pattern_center_abs'] - dfa['example_start']).astype(int)
    return dfa
File:      ~/workspace/bpnet-manuscript/basepair/exp/paper/comparison.py
Type:      function

# append `example_idx` and `pattern_center`
from basepair.exp.paper.comparison import dfint_overlap_idx, dfi_append_example_idx
dfi_chexmix = dfi_append_example_idx(dfi_chexmix, ranges)
# dfi_fimo = dfi_append_example_idx(dfi_fimo, ranges)
dfi_peakxus = dfi_append_example_idx(dfi_peakxus, ranges)

# Subset to only the main patterns
motif_chemix_inv = {v:k for k,v in motifs_chexmix.items()}
dfi_chexmix['pattern'] = dfi_chexmix['pattern_name']
dfi_chexmix['pattern_name'] = dfi_chexmix['pattern'].map(motif_chemix_inv)
dfi_chexmix = dfi_chexmix[~dfi_chexmix.pattern_name.isnull()]
assert not dfi_chexmix.SubtypeSequence.isnull().any()

# Add pattern_start and pattern_end
lens = dfi_chexmix['SubtypeSequence'].str.len()
assert (lens.mod(2) == 0).all()
neg_strand = dfi_chexmix.strand == "-"
dfi_chexmix['pattern_start'] = dfi_chexmix['pattern_center'] - lens // 2 - neg_strand
dfi_chexmix['pattern_end'] = dfi_chexmix['pattern_center'] + lens // 2 - neg_strand

from concise.utils.fasta import read_fasta

fa17 = read_fasta("/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/data/fastas/chr17.fa")

c17b = fa17['chr17']

seqs = encodeDNA(list(dfi_chexmix[dfi_chexmix.pattern_name == 'Nanog'].SubtypeSequence.str.upper()))

from concise.utils.pwm import PWM

PWM(seqs.mean(0)).plotPWMInfo();

from genomelake.extractors import FastaExtractor
from basepair.preproc import parse_interval

from basepair.exp.paper.config import fasta_file

from pyfaidx import Fasta

fa = Fasta(fasta_file)

from basepair.preproc import rc_seq

p.plot('seq_ic');

fe = FastaExtractor(fasta_file, use_strand=Tr)

dfi_chexmix['pattern_end'] - dfi_chexmix['pattern_start']

0         8
1         8
2         8
         ..
176330    8
176331    8
176332    8
Length: 169377, dtype: int64

TODO¶

double check the PWM

Modisco data¶

Load instances
Load profile + importance scores
Get TE instances
Get patterns

TODO¶

do we have to exclude TEs?

# Get rows without intersecting transposable elements
non_te_idx = dfint_no_intersection(dfi_subset[interval_cols], dfi_te[interval_cols])
print(f"Not overlapping te's: {non_te_idx.mean()}")

dfi_subset['is_te'] = True
dfi_subset.loc[non_te_idx, 'is_te'] = True

Not overlapping te's: 0.9453173320023004

dfi_subset

Considered motifs¶

# plot them
patterns = {m: mr.get_pattern(longer_pattern(p)).trim_seq_ic(0.08)
            for m, p in motifs.items()}
for tf, p in patterns.items():
    p.plot("seq");
    plt.title(tf)

TF-MoDISco is using the TensorFlow backend.

def dfint_intersect_name(dfa, dfb):
    """Provide the name of the interval b overlapping a

    Args:
      dfa,dfb: each data frame has to contain three columns
        with the following entries: chr, start and end
        
    Returns:
      dfa with 
    """
    from pybedtools import BedTool
    assert len(dfa.columns) == 3
    assert len(dfb.columns) == 4
    dfa = dfa.copy()
    dfa['id'] = np.arange(len(dfa))
    bta = BedTool.from_dataframe(dfa)
    btb = BedTool.from_dataframe(dfb)
    df_int = bta.intersect(btb, wa=True, loj=True).to_dataframe()
    # return df_int
    intersected_id = df_int.name
    other_id = df_int.thickEnd
    keep = ~intersected_id.duplicated()
    return pd.Series(other_id[keep], index=intersected_id[keep])

TODO¶

annotate instances with the overlapping idx

len(dfi_chexmix)

169377

Support the following operations:

dfi_filter_valid
- pattern_center
extract_dfi (dfi_row2seqlet)
- example_idx
- pattern_start
- pattern_end
- pattern
- strand

Extract profiles and sequences¶

Importance score instances¶

dfis = dfi.query("imp_weighted_p > 0")
dfis = dfi_filter_valid(dfis, profile_width, seqlen=seqlen)
ps_motifs = {m: isf.extract_dfi(dfis[dfis.pattern_name == m], profile_width=profile_width, verbose=True) for m in motifs}
assert sum([len(x) for x in ps_motifs.values()]) == len(dfis)

sum([len(x) for x in ps_motifs.values()])

2357838

len(dfis)

2357838

Chemix instances¶

dfi_chexmix

dfi_chexmix.head()

dfi_chexmix = dfi_filter_valid(dfi_chexmix, profile_width, seqlen=seqlen)
ps_chexmix_motifs = {m: isf.extract_dfi(dfi_chexmix[dfi_chexmix.pattern_name == m], 
                                        profile_width=profile_width, 
                                        verbose=True) 
                     for m in motifs_chexmix}
assert sum([len(x) for x in ps_chexmix_motifs.values()]) == len(dfi_chexmix)

p = ps_chexmix_motifs['Nanog'][ps_chexmix_motifs['Nanog'].dfi.strand == '+'].aggregate()

p2 = ps_chexmix_motifs['Nanog'][ps_chexmix_motifs['Nanog'].dfi.strand == '-'].aggregate()

p2 = ps_chexmix_motifs['Nanog'][ps_chexmix_motifs['Nanog'].dfi.strand == '-'].aggregate()

a=1

p2 = ps_chexmix_motifs['Oct4-Sox2'][ps_chexmix_motifs['Oct4-Sox2'].dfi.strand == '-'].aggregate()

PWM(encodeDNA(list(ps_chexmix_motifs['Oct4-Sox2'].dfi.SubtypeSequence.str.upper())).mean(0)).plotPWMInfo();

ps_chexmix_motifs['Oct4-Sox2'][ps_chexmix_motifs['Oct4-Sox2'].dfi.strand == '+'].aggregate().plot("seq_ic");

ps_chexmix_motifs['Oct4-Sox2'][ps_chexmix_motifs['Oct4-Sox2'].dfi.strand == '+'].aggregate().plot("profile");

for p in chexmix_patterns.values():
    p.plot("seq_ic");

p.plot('profile', rotate_y=0);

Sequence (PWM) instances¶

%tqdm_restart

# sequences
p_seq = {m: isf.extract_dfi(dfi_filter_valid(dfi_add_ranges(chexmix_patterns[m].get_instances_seq_scan(seqs), ranges, dedup=True),
                                                    profile_width=profile_width, seqlen=seqlen), 
                                   profile_width=profile_width, verbose=True)
         for m in motifs_chexmix}
dfi_seq = pd.concat([p_seq[m].dfi.assign(motif=m) for m in motifs_chexmix])

100%|██████████| 147974/147974 [00:26<00:00, 5501.97it/s]
100%|██████████| 147974/147974 [00:25<00:00, 5811.92it/s]
  0%|          | 226/562241 [00:00<04:09, 2254.67it/s]

number of de-duplicated instances: 411963 (38.622388184237835%)

100%|██████████| 562241/562241 [04:11<00:00, 2236.06it/s]
100%|██████████| 147974/147974 [00:22<00:00, 6648.76it/s]
100%|██████████| 147974/147974 [00:21<00:00, 6859.68it/s]

number of de-duplicated instances: 859231 (38.07061773906263%)

100%|██████████| 1195990/1195990 [08:46<00:00, 2272.78it/s]
100%|██████████| 147974/147974 [00:38<00:00, 3842.29it/s]
100%|██████████| 147974/147974 [00:22<00:00, 6444.03it/s]

number of de-duplicated instances: 841896 (37.186810390977946%)

100%|██████████| 1223338/1223338 [09:09<00:00, 2226.05it/s]
100%|██████████| 147974/147974 [00:42<00:00, 3500.47it/s]
100%|██████████| 147974/147974 [00:25<00:00, 5803.21it/s]

number of de-duplicated instances: 992210 (36.51286180378049%)

100%|██████████| 1510955/1510955 [11:15<00:00, 2235.79it/s]

Annotate dfi and¶

# # exclude locations overlaping TE's
# keep_nonte = dfint_no_intersection(dfis[interval_cols], dfi_te[interval_cols])
# dfis['is_te'] = ~keep_nonte
# print("not overlapping TE", keep_nonte.mean())
# dfis = dfis[keep_nonte]

from basepair.modisco.pattern_instances import profile_features

dfis = annotate_profile(dfis, mr, profiles, profile_width=70, trim_frac=0.08)

  0%|          | 0/7 [00:00<?, ?it/s]

dfi_chexmix = annotate_profile(dfi_chexmix, mr, profiles, profile_width=70, trim_frac=0.08,
                               pattern_map={v:motifs[k] for k,v in motifs_chexmix.items()})

  0%|          | 0/4 [00:00<?, ?it/s]

dfi_seq = annotate_profile(dfi_seq, mr, profiles, profile_width=70, trim_frac=0.08,
                           pattern_map={v:motifs[k] for k,v in motifs_chexmix.items()})

Save¶

# Store the results
chexmix_output_dir = Path("../../chipnexus/comparison/output/chexmix/")

# save the instances
dfi_seq.to_csv(chexmix_output_dir / "pwm_instances.csv.gz", index=False, compression='gzip')

write_pkl(p_seq, chexmix_output_dir / 'pwm_instances.stacked_seqlets.pkl')

dfi_chexmix.to_csv(chexmix_output_dir / "chexmix.csv.gz", index=False, compression='gzip')
write_pkl(ps_chexmix_motifs, chexmix_output_dir / 'chexmix.stacked_seqlets.pkl')

!mkdir {model_dir}/benchmark

dfis.to_csv(model_dir / "benchmark/instances.csv.gz", index=False, compression='gzip')
write_pkl(ps_motifs, model_dir / 'benchmark/instances.stacked_seqlets.pkl')

!du -sh {chexmix_output_dir}/*

516M	../../chipnexus/comparison/output/chexmix/Klf4
454M	../../chipnexus/comparison/output/chexmix/Nanog
241M	../../chipnexus/comparison/output/chexmix/Oct4
100M	../../chipnexus/comparison/output/chexmix/Sox2
12G	../../chipnexus/comparison/output/chexmix/benchmark
28M	../../chipnexus/comparison/output/chexmix/chexmix.csv.gz
884M	../../chipnexus/comparison/output/chexmix/chexmix.stacked_seqlets.pkl
517M	../../chipnexus/comparison/output/chexmix/pwm_instances.csv.gz
22G	../../chipnexus/comparison/output/chexmix/pwm_instances.stacked_seqlets.pkl

	chrom	pattern_center_abs	pattern_name	profile_score	strand	#Point	experiment_Sig	experiment_Ctrl	experiment_log2Fold	experiment_log2P	SubtypePoint	Tau	SubtypeName	SubtypeSequence	pattern	example_idx	example_chrom	example_start	example_end	example_strand	example_interval_from_task	pattern_center
369	chr14	51063900	Oct4-Sox2	4.691	-	chr14:51063900	1237.4	47.9	4.691	-676.423	chr14:51063900:-	1.0	Subtype1	NaN	Oct4/1	166	chr14	51063428	51064428	.	Oct4	472
522	chr13	23401109	Oct4-Sox2	5.220	-	chr13:23401109	1104.8	29.6	5.220	-650.108	chr13:23401109:-	1.0	Subtype1	NaN	Oct4/1	15	chr13	23400367	23401367	.	Oct4	742
544	chr6	128803128	Oct4-Sox2	4.740	-	chr6:128803128	1091.9	40.9	4.740	-601.090	chr6:128803128:-	1.0	Subtype1	NaN	Oct4/1	173	chr6	128802523	128803523	.	Oct4	605
677	chr3	96497807	Oct4-Sox2	4.705	-	chr3:96497807	1013.5	38.8	4.705	-554.894	chr3:96497807:-	1.0	Subtype1	NaN	Oct4/1	290	chr3	96497319	96498319	.	Oct4	488
714	chr13	66711229	Oct4-Sox2	5.902	+	chr13:66711229	992.8	16.6	5.902	-625.607	chr13:66711229:+	1.0	Subtype1	NaN	Oct4/1	36	chr13	66710696	66711696	.	Oct4	533