exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'
task = "Nanog"
pattern = "metacluster_0/pattern_6"
#seqlets# PFM genome-wide# PFM peaks# CWM peaks# all peaks# Imports
from basepair.imports import *
from basepair.exp.paper.config import *
import pyranges as pr
hv.extension('bokeh')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
ddir = get_data_dir()
model_dir = models_dir / exp
patterns_dict = [(task, read_pkl(model_dir / f"deeplift/{task}/out/{imp_score}/patterns.pkl"))
for task in tasks]
p_meme = None
for task, patterns in patterns_dict:
mr = ModiscoResult(model_dir / f'deeplift/{task}/out/{imp_score}/modisco.h5')
for pname in mr.patterns():
untrimmed_pattern = mr.get_pattern(pname)
pattern = untrimmed_pattern.trim_seq_ic(0.08)
if task == 'Nanog' and pattern.name == 'metacluster_0/pattern_6':
print(len(pattern))
p_meme = untrimmed_pattern
motif_dir = model_dir / f"deeplift/{task}/out/{imp_score}/PFM/{pattern.name}"
motif_dir.mkdir(parents=True, exist_ok=True)
pattern.write_meme_file([0.25, 0.25, 0.25, 0.25],
motif_dir /"meme.txt")
# RUN
# snakemake -s fimo.smk -j 10
# Load stats about the motif
mr = ModiscoResult(model_dir / f'deeplift/{task}/out/{imp_score}/modisco.h5')
# get TE's
motifs_te = [pattern
for pattern in mr.patterns()
if mr.get_pattern(pattern).seq_info_content > 30 and mr.n_seqlets(*pattern.split('/')) > 100]
is_te = pattern in motifs_te
# motifs_te_d = OrderedDict([(shorten_pattern(x), shorten_pattern(x)) for x in motifs_te])
n_seqlets = mr.n_seqlets(*pattern.split("/"))
p = mr.get_pattern(pattern).trim_seq_ic(0.08)
load_instances should load the full thingload_instances to filter the instances only to the major motifs# TOOD - there is a discrepancy between the intervals
from basepair.modisco.pattern_instances import load_instances, multiple_load_instances
def abs_coord(seqlet, ranges):
"""Create absolute seqlet coordinates
"""
row = ranges.loc[seqlet.seqname]
return {"Chromosome": row.chrom,
"Start": row.start + seqlet.start - 15, # make it 70 bp long
"End": row.start + seqlet.end + 15,
}
def dfi2pr(dfi):
dfi['Chromosome'] = dfi.example_chrom
dfi['Start'] = dfi.pattern_start_abs
dfi['End'] = dfi.pattern_end_abs
dfi['Strand'] = dfi.strand
dfi_pr = pr.PyRanges(dfi)
return dfi_pr
df_subset = pd.read_csv("https://docs.google.com/spreadsheets/d/1MVBUbMs_F0UPgZxR8-v9eMM6DCFhVYc7kdH-C6WJnYU/export?gid=0&format=csv")
df_subset.name = df_subset.name.str.replace('"', '')
df_subset.task = df_subset.task.str.replace('"', '')
df_info = pd.read_csv("https://docs.google.com/spreadsheets/d/1uAtJZJR_dmlWwer5Afcd2q72Uuj7A4Ln/export?gid=826411817&format=csv")
len(dfi_pr_all.intersect(dfs, how='containment', strand=None))
len(dfi_pr_all)
len(dfi_pr_all.drop_duplicate_positions(strand=False))
len(dfi_pr_main.intersect(dfs, how='containment', strand=None).drop_duplicate_positions(strand=False))
len(dfi_pr_main)
len(dfs.drop_duplicate_positions(strand=False))
len(dfs.overlap(dfi_pr_all).drop_duplicate_positions(strand=False))
len(dfs.overlap(dfi_pr_main).drop_duplicate_positions(strand=False))
# TODO - remove motif with low ic
for task in tasks:
print("Loading ranges")
# Load seqlets
mr = ModiscoResult(model_dir / f'deeplift/{task}/out/{imp_score}/modisco.h5')
seqlets = mr.all_seqlets()
ranges = mr.load_ranges()
ranges.set_index('idx', inplace=True)
assert np.all(ranges.index == np.arange(len(ranges)))
dfs = pd.DataFrame([abs_coord(s, ranges) for s in seqlets])
dfs = pr.PyRanges(dfs).drop_duplicate_positions(strand=False)
print("loading dfi")
# Load CWM instances
print("load instances")
dfi = load_instances(model_dir / f'deeplift/{task}/out/{imp_score}/instances.parq')
print("filter")
dfi = dfi.query('match_weighted_p > .2').query('imp_weighted_p > .01')
print("filter")
# That's only subsetting the main motifs for a particular task
motifs = {k:k for k in df_subset[df_subset.task == task].pattern}
all_motifs = {k:k for k in df_info[(df_info.IC > 4) & (df_info.task == task)].pattern.map(shorten_pattern)}
dfi_main_motifs = load_instances(dfi, motifs)
dfi_pr_main = dfi2pr(dfi_main_motifs).drop_duplicate_positions(strand=False)
dfi_all_motifs = load_instances(dfi, all_motifs)
dfi_pr_all = dfi2pr(dfi_all_motifs).drop_duplicate_positions(strand=False)
print("-"*40)
print(f"Task: {task}")
s = len(dfs)
sm = len(dfs.overlap(dfi_pr_main).drop_duplicate_positions(strand=False))
sa = len(dfs.overlap(dfi_pr_all).drop_duplicate_positions(strand=False))
print(f"seqlets overlaps main motifs: {sm}/{s} ({sm/s:.2f})")
print(f"seqlets overlaps all motifs: {sa}/{s} ({sa/s:.2f})")
print("seqlets overlaps main motifs: {}".format(len(dfs.overlap(dfi_pr_main))))
print("")
cm = len(dfi_pr_main)
cmo = len(dfi_pr_main.intersect(dfs, how='containment', strand=None).drop_duplicate_positions(strand=False))
ca = len(dfi_pr_all)
cao = len(dfi_pr_all.intersect(dfs, how='containment', strand=None).drop_duplicate_positions(strand=False))
print(f"CWM (main motifs) overlap seqlets: {cmo}/{cm} ({cmo/cm:.2f})")
print(f"CWM (all motifs) overlap seqlets: {cao}/{ca} ({cao/ca:.2f})")
print("-"*40)
# total seqlets
s = 42995+17665+101304+103514
s
#overlap main motifs
mo = 28767 + 6377 + 51361 + 59243
ao = 37159 + 13144 + 87802 + 99336
print(f"total seqlets: {mo}")
print(f"main motif overlaps: {mo} / {s} ({mo/s:.2f})")
print(f"all motif overlaps: {ao} / {s} ({ao/s:.2f})")
p.plot();
p_meme.plot();
# Load CWM instances
from basepair.modisco.pattern_instances import load_instances
dfi = load_instances(model_dir / f'deeplift/{task}/out/{imp_score}/instances.parq',
{shorten_pattern(pattern): shorten_pattern(pattern)})
# filter only to the relevant ones
dfi = dfi.query('match_weighted_p > .2').query('imp_weighted_p > .01')
dfi['Chromosome'] = dfi.example_chrom
dfi['Start'] = dfi.pattern_start_abs
dfi['End'] = dfi.pattern_end_abs
dfi['Strand'] = dfi.strand
dfi_pr = pr.PyRanges(dfi)
# Load PWM instances
from basepair.exp.chipnexus.comparison import read_fimo_dfi
dfi_fimo = read_fimo_dfi(model_dir / f'deeplift/{task}/out/{imp_score}/PFM/{pattern}/fimo.tsv')
dfi_fimo['pattern'] = f"{task}/{pattern}"
dfi_fimo['Chromosome'] = dfi_fimo.chrom
dfi_fimo['Start'] = dfi_fimo.start - 1 # intervals are 1-based
dfi_fimo['End'] = dfi_fimo.end
dfi_fimo['Strand'] = dfi_fimo.strand
dfi_fimo = pr.PyRanges(dfi_fimo)
# Load the peak intervals
from basepair.cli.modisco import load_ranges
ranges = load_ranges(model_dir / f'deeplift/{task}/out/{imp_score}')
ranges['Chromosome'] = ranges.chrom
ranges['Start'] = ranges.start
ranges['End'] = ranges.end
ranges = pr.PyRanges(ranges)
ranges
dfi_fimo_peaks = pr.PyRanges(dfi_fimo.intersect(ranges, how='containment').as_df().reset_index())
stats = {}
stats['pattern'] = pattern
stats['task'] = task
stats['imp_score'] = imp_score
stats['is_te'] = is_te
stats['n_seqlets'] = n_seqlets
stats['IC'] = p.seq_info_content
stats['length'] = len(p)
stats['n_pwm_gw'] = len(dfi_fimo)
stats['n_pwm_peaks'] = len(dfi_fimo_peaks)
stats['n_cwm_peaks'] = len(dfi_pr)
stats['n_cwm_and_pwm_peaks'] = len(dfi_pr.intersect(dfi_fimo_peaks, how='containment'))
stats['frac_cwm_in_pwm_peaks'] = stats['n_cwm_and_pwm_peaks'] / stats['n_cwm_peaks']
stats['frac_pwm_in_cwm_peaks'] = stats['n_cwm_and_pwm_peaks'] / stats['n_pwm_peaks']
stats
len(dfi_pr.overlap(dfi_fimo_peaks))
dfi_pr.overlap(dfi_fimo_peaks).df
dfi_pr['chr1', '+', 45358193:45358263].df
dfi_fimo_peaks['chr1', '+', 45358193:45358263].df
45358263 - 45358197
Why do they differ?
len(p)
!cat {model_dir}/deeplift/{task}/out/{imp_score}/PFM/{pattern}/meme.txt
dfi_pr.sort().df.head()
dfi_fimo_peaks.sort().df.head()
# why is n_cwm_and_pwm_peaks 0?
# write to the json file
from basepair.utils import write_json
# write_json(stats, model_dir / f'deeplift/{task}/out/{imp_score}/PFM/{pattern}/overlaps.json')
dfr = pd.read_csv(model_dir / 'deeplift/overlap_cwm_pwm.csv')
dfr.head()
dict(dfr.mean())
dict(dfr[dfr.is_te].mean())
!cp {model_dir}/deeplift/overlap_cwm_pwm.csv figures/modisco/{exp}/overlap_cwm_pwm.csv
dfr.to_excel(f'figures/modisco/{exp}/overlap_cwm_pwm.xlsx')
!ls -latrh figures/modisco/{exp}/overlap_cwm_pwm.xlsx