from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'
# motifs = OrderedDict([
# ("Oct4-Sox2", 'Oct4/m0_p0'),
# ("Oct4", 'Oct4/m0_p1'),
# # ("Strange-sym-motif", 'Oct4/m0_p5'),
# ("Sox2", 'Sox2/m0_p1'),
# ("Nanog", 'Nanog/m0_p1'),
# ("Zic3", 'Nanog/m0_p2'),
# ("Nanog-partner", 'Nanog/m0_p4'),
# ("Klf4", 'Klf4/m0_p0'),
# ])
motifs = OrderedDict([
("Oct4-Sox2", 'Oct4/m0_p0'),
("Oct4", "Oct4/m0_p1"),
("Oct4-Oct4", "Oct4/m0_p6"),
("Sox2", "Sox2/m0_p1"),
("Nanog", "Nanog/m0_p1"),
("Nanog-partner", "Nanog/m0_p4"),
("Klf4", "Klf4/m0_p0"),
("Klf4-Klf4", "Klf4/m0_p5"),
("B-Box", "Oct4/m0_p5"),
("Zic3", "Nanog/m0_p2"),
("Essrb", "Oct4/m0_p16"),
])
# Imports
from basepair.imports import *
import pyranges as pr
from basepair.modisco.pattern_instances import (multiple_load_instances, load_instances, filter_nonoverlapping_intervals,
plot_coocurence_matrix, align_instance_center, dfi2seqlets, annotate_profile)
from basepair.exp.paper.config import *
from basepair.cli.modisco import load_profiles
from basepair.preproc import rc_seq, dfint_no_intersection
from plotnine import *
import plotnine
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
# interval columns in dfi
interval_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']
# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/modisco/{exp}/spacing/')
!mkdir -p {fdir}
from basepair.cli.imp_score import ImpScoreFile
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5')
peaks_per_task = pd.value_counts(isf.data["/metadata/interval_from_task"][:])
task_map = {"O": "Oct4", "S": "Sox2", "N": "Nanog", "K": "Klf4"}
# Figure out the tasks from the name
tasks = [task_map[s] for s in exp.split(",")[2]]
dfi_subset = pd.read_parquet(f"{model_dir}/deeplift/dfi_subset.parq", engine='fastparquet')
dfi_subset['row_idx'] = np.arange(len(dfi_subset))
dfi_subset['Chromosome'] = dfi_subset.example_chrom
dfi_subset['Start'] = dfi_subset.pattern_start_abs
dfi_subset['End'] = dfi_subset.pattern_end_abs
dfi_subset_pr = pr.PyRanges(dfi_subset)
assert len(dfi_subset.pattern.unique()) == len(motifs)
dfi_subset_pr = pr.PyRanges(dfi_subset)
# duplicated number of instances
len(dfi_subset_pr)
# non-duplicated number of instances
len(dfi_subset_pr.drop_duplicate_positions())
dfi_subset.head()
dict(dfi_subset.pattern_name.value_counts())
fig, ax = plt.subplots(figsize=get_figsize(0.23))
dfi_subset.pattern_name.value_counts().plot.bar(width=.9)
plt.ylabel("Motif instances")
plt.xlabel("Motif")
fig.savefig(fdir / '../n-instances-per-motif.pdf')
fig, ax = plt.subplots(figsize=get_figsize(0.23))
dfi_subset_pr.drop_duplicate_positions().df.pattern_name.value_counts().plot.bar(width=.9)
plt.ylabel("Motif instances")
plt.xlabel("Motif")
fig.savefig(fdir / '../n-instances-per-motif.unique.pdf')
# Number of regions with a motif (total = 150k. Only 14k lack a motif)
len(dfi_subset.groupby('example_idx').size())
dfi_subset.groupby('example_idx').size().value_counts()
total_regions = len(ranges)
fig, ax = plt.subplots(figsize=get_figsize(0.4))
vec = dfi_subset.groupby('example_idx').size()
vec = pd.concat([vec, pd.Series(np.zeros(total_regions - len(vec)))])
vec.plot.hist(bins=np.arange(11), rwidth=0.9, align='left')
plt.xlabel("Number of motifs per region")
fig.savefig(fdir / '../n-motifs-per-region.pdf')
dfi_subset.head()
19.535 / (212.63-196.32) * 0.3
3.505 / (212.63-196.32) * 0.3
dfi_subset.groupby(['pattern_name', 'example_idx']).size().reset_index()
dfi_subset.head()
fig, axes = plt.subplots(len(tasks), 1,
figsize=get_figsize(0.23*1.4, aspect=len(tasks)* 0.618/1.4),
sharey=True,
gridspec_kw=dict(hspace=0),
sharex=True)
for i, task in enumerate(tasks):
ax = axes[i]
vec = dfi_subset[dfi_subset.example_interval_from_task == task].groupby('example_idx').size()
vec = pd.concat([vec, pd.Series(np.zeros(peaks_per_task[task] - vec.index.nunique()))])
vec.plot.hist(bins=np.arange(10), rwidth=0.9, align='left', ax=ax)
ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
ax.set_xlabel("Number of motifs per region")
fig.savefig(fdir / '../n-motifs-per-region.per-task.pdf')
fig, axes = plt.subplots(len(tasks), 1,
figsize=get_figsize(0.23, aspect=len(tasks)* 0.618),
sharey=True,
gridspec_kw=dict(hspace=0),
sharex=True)
pattern_names = dfi_subset.pattern_name.unique()
for i, task in enumerate(tasks):
ax = axes[i]
if i == 0:
ax.set_title("Total number of motifs")
vec = dfi_subset[dfi_subset.example_interval_from_task == task].pattern_name.value_counts().reindex(pattern_names, fill_value=0)
vec.plot.bar(width=.9, ax=ax)
ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
ax.set_xlabel("Motif")
fig.savefig(fdir / '../n-instances-per-motif.per-task.pdf')
fig, axes = plt.subplots(len(tasks), 1,
figsize=get_figsize(0.23, aspect=len(tasks)* 0.618),
sharey=True,
gridspec_kw=dict(hspace=0),
sharex=True)
pattern_names = dfi_subset.pattern_name.unique()
for i, task in enumerate(tasks):
ax = axes[i]
if i == 0:
ax.set_title("Average number of motifs per peak")
vec = dfi_subset[dfi_subset.example_interval_from_task == task].pattern_name.value_counts().reindex(pattern_names, fill_value=0)
vec = vec / peaks_per_task[task]
vec.plot.bar(width=.9, ax=ax)
ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
ax.set_xlabel("Motif")
fig.savefig(fdir / '../n-instances-per-motif.per-task.avg-motifs-per-peak.pdf')
np.sum(dfi_subset.groupby('example_idx').size() >= 3)
np.sum(dfi_subset.groupby('example_idx').size() >= 3) / 150908
np.sum(dfi_subset.groupby('example_idx').size() >= 5)
72696 / 150000
dfi_subset.groupby('example_idx').size()
patterns = [p for p in mr.get_all_patterns()]
ic = [p.seq_info_content for p in mr.get_all_patterns() if mr.n_seqlets(p.name) > 100]
n_seqlets = [mr.n_seqlets(p.name) for p in mr.get_all_patterns()]
len(ic)
ic = [p.seq_info_content for p in mr.get_all_patterns()]
fig, ax = plt.subplots(figsize=get_figsize(0.4))
plt.hist(ic, bins=np.arange(0, 110, step=8), rwidth=0.9);
# plt.hist(ic, bins=15, rwidth=0.9, align='left');
plt.xlabel("IC")
plt.ylabel("Motifs")
fig.savefig(fdir / '../seq-IC.hist.all.pdf')
p = patterns[0]
p.seq_info_content
tracks_dir = Path(ddir) / 'www/paper/data/tracks'
!mkdir -p {tracks_dir}
bed_cols = ['Chromosome', 'Start', 'End', 'pattern_name', 'match_weighted_p', 'strand', 'imp_weighted_p', 'seq_match_p']
duplicates = dfi_subset[bed_cols[:4]].duplicated() # Drop duplicates
dfi_subset[~duplicates][bed_cols].sort_values(bed_cols[:2]).to_csv(tracks_dir / "motif-instances.bed", sep='\t', header=False, index=False)
!head {tracks_dir}/motif-instances.bed
!wc -l {tracks_dir}/motif-instances.bed
# write out the region
ranges = isf.get_ranges()
ranges[['chrom', 'start', 'end', 'interval_from_task']].to_csv(tracks_dir / "regions.bed", sep='\t', header=False, index=False)
!wc -l {tracks_dir}/regions.bed
ls {model_dir}/
model_dir
ls {model_dir}/log