from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'
motifs = OrderedDict([
("Oct4-Sox2", 'Oct4/m0_p0'),
("Oct4", 'Oct4/m0_p1'),
# ("Strange-sym-motif", 'Oct4/m0_p5'),
("Sox2", 'Sox2/m0_p1'),
("Nanog", 'Nanog/m0_p1'),
("Zic3", 'Nanog/m0_p2'),
("Nanog-partner", 'Nanog/m0_p4'),
("Klf4", 'Klf4/m0_p0'),
])
motifs_chexmix = OrderedDict([
("Oct4-Sox2", 'Oct4/1'),
("Sox2", 'Sox2/1'),
("Nanog", 'Nanog/1'),
("Klf4", 'Klf4/1'),
])
main_motifs = ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']
What fraction of the peaks can we explain? E.g. compute the overlap between
seqlets (un-clustered)
Q: How to get this number?
There could be a possible problem when excluding seqlets...
What are the two nearest peakxus peaks?
# Imports
from basepair.imports import *
from basepair.exp.paper.config import tasks
from basepair.modisco.pattern_instances import load_instances, filter_nonoverlapping_intervals, plot_coocurence_matrix, dfi_filter_valid, dfi_add_ranges, annotate_profile
from basepair.cli.imp_score import ImpScoreFile
from basepair.modisco.core import Pattern
from basepair.preproc import dfint_no_intersection
hv.extension('bokeh')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
paper_config()
interval_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']
# load all the required data
# mr = ModiscoResult(modisco_dir / "modisco.h5")
# mr.open()
# imp_scores = ImpScoreFile.from_modisco_dir(modisco_dir)
# imp_scores.cache() # load all the scores
# ranges = imp_scores.get_ranges()
# ranges.columns = ["example_" + v for v in ranges.columns]
# seqs = imp_scores.get_seq()
# profiles = imp_scores.get_profiles()
from basepair.exp.paper.config import models_dir
# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/modisco/{exp}/spacing/')
fdir_individual = fdir / 'individual'
fdir_individual_sim = fdir / 'individual-simulation'
%config InlineBackend.figure_format = 'retina'
paper_config()
# Common paths
# output_dir = modisco_dir / "pwm_comparison" # this notebook's output directory
# output_dir.mkdir(exist_ok=True)
figures_dir = Path(f"{ddir}/figures/modisco/{exp}/pwm_comparison")
figures_dir.mkdir(exist_ok=True)
profile_width = 70
seqlen = 1000
instance_parq_paths = {t: model_dir / f'deeplift/{t}/out/{imp_score}/instances.parq'
for t in tasks}
from basepair.modisco.pattern_instances import (multiple_load_instances, load_instances, filter_nonoverlapping_intervals,
plot_coocurence_matrix, align_instance_center, dfi2seqlets, annotate_profile)
dfi = multiple_load_instances(instance_parq_paths, motifs)
# Subset the motifs
dfi_subset = dfi.query('match_weighted_p > .2').query('imp_weighted_p > .01')
# append example_idx + make pattern_start absolute
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5', default_imp_score=imp_score)
isf.cache() # load everything into memory
ranges = isf.get_ranges()
ranges.columns = ['example_' + c for c in ranges.columns]
seqs = isf.get_seq()
profiles = isf.get_profiles()
# # get transposable element locations
# te_patterns = pd.read_csv(modisco_dir / "motif_clustering/patterns_long.csv").pattern.map(longer_pattern).unique()
# dfi_te = load_instances(dfi_full[dfi_full.pattern.isin(te_patterns)], None)
# dfi_te = dfi_te[(dfi_te.match_weighted_p > 0.1) & (dfi_te.seq_match > 20)]
mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
for t in tasks})
# Exclude TEs
def shorten_te_pattern(s):
tf, p = s.split("/", 1)
return tf + "/" + shorten_pattern(p)
motifs_te = [p.name
for p in mr.get_all_patterns()
if p.seq_info_content > 30 and mr.n_seqlets(p.name) > 100]
motifs_te_d = OrderedDict([(shorten_te_pattern(x), shorten_te_pattern(x)) for x in motifs_te])
# # get transposable element locations
dfi_te = multiple_load_instances(instance_parq_paths, motifs_te_d)
dfi_te = dfi_te[(dfi_te.match_weighted_p > 0.1) & (dfi_te.seq_match > 20)]
from basepair.exp.paper.config import tasks
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac
def prepend(df, col, name, sep="/"):
df[col] = name + sep + df[col]
return df
peakxus_motifs = flatten({tf: read_meme_motifs(f'../../chipnexus/comparison/output/peakxus/{tf}/MEME/w=200,n=1000,nmotifs=15,evt=0.01,maxw=40/meme.txt')
for tf in tasks}, separator='/')
chexmix_motifs = flatten({tf: read_transfac(f"../../chipnexus/comparison/output/chexmix/{tf}/intermediate-results/{tf}.experiment.transfac")
for tf in tasks}, separator='/')
from basepair.modisco.core import Pattern
chexmix_patterns = {k: Pattern(v, chexmix_motifs[v].pwm, dict(), dict())
for k,v in motifs_chexmix.items()}
from basepair.exp.paper.config import tasks
from basepair.preproc import interval_center
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac
def prepend(df, col, name, sep="/"):
df[col] = name + sep + df[col]
return df
dfi_peakxus = pd.concat([read_peakxus_dfi(f'../../chipnexus/comparison/output/peakxus/{tf}/all_transition_points.igv').assign(task=tf)
for tf in tasks], axis=0)
dfi_chexmix = pd.concat([pd_first_cols(read_chexmix_dfi(f"../../chipnexus/comparison/output/chexmix/{tf}/{tf}_experiment.events").
pipe(prepend, col='pattern_name', name=tf),
['chrom', 'pattern_center_abs', 'pattern_name', 'profile_score', 'motif_score', 'strand'])
for tf in tasks], axis=0)
dfi_append_example_idx??
# append `example_idx` and `pattern_center`
from basepair.exp.paper.comparison import dfint_overlap_idx, dfi_append_example_idx
dfi_chexmix = dfi_append_example_idx(dfi_chexmix, ranges)
# dfi_fimo = dfi_append_example_idx(dfi_fimo, ranges)
dfi_peakxus = dfi_append_example_idx(dfi_peakxus, ranges)
# Subset to only the main patterns
motif_chemix_inv = {v:k for k,v in motifs_chexmix.items()}
dfi_chexmix['pattern'] = dfi_chexmix['pattern_name']
dfi_chexmix['pattern_name'] = dfi_chexmix['pattern'].map(motif_chemix_inv)
dfi_chexmix = dfi_chexmix[~dfi_chexmix.pattern_name.isnull()]
assert not dfi_chexmix.SubtypeSequence.isnull().any()
# Add pattern_start and pattern_end
lens = dfi_chexmix['SubtypeSequence'].str.len()
assert (lens.mod(2) == 0).all()
neg_strand = dfi_chexmix.strand == "-"
dfi_chexmix['pattern_start'] = dfi_chexmix['pattern_center'] - lens // 2 - neg_strand
dfi_chexmix['pattern_end'] = dfi_chexmix['pattern_center'] + lens // 2 - neg_strand
from concise.utils.fasta import read_fasta
fa17 = read_fasta("/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/data/fastas/chr17.fa")
c17b = fa17['chr17']
seqs = encodeDNA(list(dfi_chexmix[dfi_chexmix.pattern_name == 'Nanog'].SubtypeSequence.str.upper()))
from concise.utils.pwm import PWM
PWM(seqs.mean(0)).plotPWMInfo();
from genomelake.extractors import FastaExtractor
from basepair.preproc import parse_interval
from basepair.exp.paper.config import fasta_file
from pyfaidx import Fasta
fa = Fasta(fasta_file)
from basepair.preproc import rc_seq
p.plot('seq_ic');
fe = FastaExtractor(fasta_file, use_strand=Tr)
dfi_chexmix['pattern_end'] - dfi_chexmix['pattern_start']
# Get rows without intersecting transposable elements
non_te_idx = dfint_no_intersection(dfi_subset[interval_cols], dfi_te[interval_cols])
print(f"Not overlapping te's: {non_te_idx.mean()}")
dfi_subset['is_te'] = True
dfi_subset.loc[non_te_idx, 'is_te'] = True
dfi_subset
# plot them
patterns = {m: mr.get_pattern(longer_pattern(p)).trim_seq_ic(0.08)
for m, p in motifs.items()}
for tf, p in patterns.items():
p.plot("seq");
plt.title(tf)
def dfint_intersect_name(dfa, dfb):
"""Provide the name of the interval b overlapping a
Args:
dfa,dfb: each data frame has to contain three columns
with the following entries: chr, start and end
Returns:
dfa with
"""
from pybedtools import BedTool
assert len(dfa.columns) == 3
assert len(dfb.columns) == 4
dfa = dfa.copy()
dfa['id'] = np.arange(len(dfa))
bta = BedTool.from_dataframe(dfa)
btb = BedTool.from_dataframe(dfb)
df_int = bta.intersect(btb, wa=True, loj=True).to_dataframe()
# return df_int
intersected_id = df_int.name
other_id = df_int.thickEnd
keep = ~intersected_id.duplicated()
return pd.Series(other_id[keep], index=intersected_id[keep])
len(dfi_chexmix)
Support the following operations:
dfi_filter_validpattern_centerextract_dfi (dfi_row2seqlet)example_idxpattern_startpattern_endpatternstranddfis = dfi.query("imp_weighted_p > 0")
dfis = dfi_filter_valid(dfis, profile_width, seqlen=seqlen)
ps_motifs = {m: isf.extract_dfi(dfis[dfis.pattern_name == m], profile_width=profile_width, verbose=True) for m in motifs}
assert sum([len(x) for x in ps_motifs.values()]) == len(dfis)
sum([len(x) for x in ps_motifs.values()])
len(dfis)
dfi_chexmix
dfi_chexmix.head()
dfi_chexmix = dfi_filter_valid(dfi_chexmix, profile_width, seqlen=seqlen)
ps_chexmix_motifs = {m: isf.extract_dfi(dfi_chexmix[dfi_chexmix.pattern_name == m],
profile_width=profile_width,
verbose=True)
for m in motifs_chexmix}
assert sum([len(x) for x in ps_chexmix_motifs.values()]) == len(dfi_chexmix)
p = ps_chexmix_motifs['Nanog'][ps_chexmix_motifs['Nanog'].dfi.strand == '+'].aggregate()
p2 = ps_chexmix_motifs['Nanog'][ps_chexmix_motifs['Nanog'].dfi.strand == '-'].aggregate()
p2 = ps_chexmix_motifs['Nanog'][ps_chexmix_motifs['Nanog'].dfi.strand == '-'].aggregate()
a=1
p2 = ps_chexmix_motifs['Oct4-Sox2'][ps_chexmix_motifs['Oct4-Sox2'].dfi.strand == '-'].aggregate()
PWM(encodeDNA(list(ps_chexmix_motifs['Oct4-Sox2'].dfi.SubtypeSequence.str.upper())).mean(0)).plotPWMInfo();
ps_chexmix_motifs['Oct4-Sox2'][ps_chexmix_motifs['Oct4-Sox2'].dfi.strand == '+'].aggregate().plot("seq_ic");
ps_chexmix_motifs['Oct4-Sox2'][ps_chexmix_motifs['Oct4-Sox2'].dfi.strand == '+'].aggregate().plot("profile");
for p in chexmix_patterns.values():
p.plot("seq_ic");
p.plot('profile', rotate_y=0);
%tqdm_restart
# sequences
p_seq = {m: isf.extract_dfi(dfi_filter_valid(dfi_add_ranges(chexmix_patterns[m].get_instances_seq_scan(seqs), ranges, dedup=True),
profile_width=profile_width, seqlen=seqlen),
profile_width=profile_width, verbose=True)
for m in motifs_chexmix}
dfi_seq = pd.concat([p_seq[m].dfi.assign(motif=m) for m in motifs_chexmix])
# # exclude locations overlaping TE's
# keep_nonte = dfint_no_intersection(dfis[interval_cols], dfi_te[interval_cols])
# dfis['is_te'] = ~keep_nonte
# print("not overlapping TE", keep_nonte.mean())
# dfis = dfis[keep_nonte]
from basepair.modisco.pattern_instances import profile_features
dfis = annotate_profile(dfis, mr, profiles, profile_width=70, trim_frac=0.08)
dfi_chexmix = annotate_profile(dfi_chexmix, mr, profiles, profile_width=70, trim_frac=0.08,
pattern_map={v:motifs[k] for k,v in motifs_chexmix.items()})
dfi_seq = annotate_profile(dfi_seq, mr, profiles, profile_width=70, trim_frac=0.08,
pattern_map={v:motifs[k] for k,v in motifs_chexmix.items()})
# Store the results
chexmix_output_dir = Path("../../chipnexus/comparison/output/chexmix/")
# save the instances
dfi_seq.to_csv(chexmix_output_dir / "pwm_instances.csv.gz", index=False, compression='gzip')
write_pkl(p_seq, chexmix_output_dir / 'pwm_instances.stacked_seqlets.pkl')
dfi_chexmix.to_csv(chexmix_output_dir / "chexmix.csv.gz", index=False, compression='gzip')
write_pkl(ps_chexmix_motifs, chexmix_output_dir / 'chexmix.stacked_seqlets.pkl')
!mkdir {model_dir}/benchmark
dfis.to_csv(model_dir / "benchmark/instances.csv.gz", index=False, compression='gzip')
write_pkl(ps_motifs, model_dir / 'benchmark/instances.stacked_seqlets.pkl')
!du -sh {chexmix_output_dir}/*