from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'
# motifs = OrderedDict([
# ("Oct4-Sox2", 'Oct4/m0_p0'),
# ("Oct4", 'Oct4/m0_p1'),
# # ("Strange-sym-motif", 'Oct4/m0_p5'),
# ("Sox2", 'Sox2/m0_p1'),
# ("Nanog", 'Nanog/m0_p1'),
# ("Zic3", 'Nanog/m0_p2'),
# ("Nanog-partner", 'Nanog/m0_p4'),
# ("Klf4", 'Klf4/m0_p0'),
# ])
motifs = OrderedDict([
("Oct4-Sox2", 'Oct4/m0_p0'),
("Oct4", "Oct4/m0_p1"),
("Oct4-Oct4", "Oct4/m0_p6"),
("Sox2", "Sox2/m0_p1"),
("Nanog", "Nanog/m0_p1"),
("Nanog-partner", "Nanog/m0_p4"),
("Klf4", "Klf4/m0_p0"),
("Klf4-Klf4", "Klf4/m0_p5"),
("B-Box", "Oct4/m0_p5"),
("Zic3", "Nanog/m0_p2"),
("Essrb", "Oct4/m0_p16"),
])
# dfi_subset.pattern_name.unique()
# Imports
from basepair.imports import *
from basepair.math import softmax
import pyranges as pr
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
from basepair.plot.profiles import plot_stranded_profile, multiple_plot_stranded_profile, extract_signal
from basepair.plot.tracks import plot_tracks, filter_tracks
from basepair.modisco.results import MultipleModiscoResult, Seqlet, resize_seqlets
from basepair.modisco.pattern_instances import (multiple_load_instances, load_instances, filter_nonoverlapping_intervals,
plot_coocurence_matrix, align_instance_center, dfi2seqlets, annotate_profile)
from basepair.exp.chipnexus.perturb.vdom import vdom_motif_pair, plot_spacing_hist
from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs, motif_pair_dfi
from basepair.exp.chipnexus.simulate import (insert_motif, generate_sim, plot_sim, generate_seq,
model2tasks, motif_coords, interactive_tracks, plot_motif_table,
plot_sim_motif_col)
from basepair.exp.paper.config import *
from basepair.cli.modisco import load_profiles
from basepair.cli.imp_score import ImpScoreFile
from basepair.preproc import rc_seq, dfint_no_intersection
from copy import deepcopy
from scipy.fftpack import fft, ifft
from plotnine import *
import plotnine
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
# interval columns in dfi
interval_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']
# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/modisco/{exp}/spacing/')
fdir_individual = fdir / 'individual'
fdir_individual_sim = fdir / 'individual-simulation'
from basepair.cli.imp_score import ImpScoreFile
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5')
peaks_per_task = pd.value_counts(isf.data["/metadata/interval_from_task"][:])
!mkdir -p {fdir_individual}
!mkdir -p {fdir_individual_sim}
!mkdir -p {fdir}
# Generate motif pairs
pairs = get_motif_pairs(motifs)
# ordered names
pair_names = ["<>".join(x) for x in pairs]
# define the global set of distances
dist_subsets = ['center_diff<=35',
'(center_diff>35)&(center_diff<=70)',
'(center_diff>70)&(center_diff<=150)',
'center_diff>150']
dist_subset_labels = ['dist < 35',
'35 < dist <= 70',
'70 < dist <= 150',
'150 < dist',
]
task_map = {"O": "Oct4", "S": "Sox2", "N": "Nanog", "K": "Klf4"}
# Figure out the tasks from the name
tasks = [task_map[s] for s in exp.split(",")[2]]
mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
for t in tasks})
main_motifs = [mr.get_pattern(pattern_name).rename(name)
for name, pattern_name in motifs.items()]
len(main_motifs)
from basepair.exp.paper.fig4 import similarity_matrix, cluster_align_patterns
sim_seq = similarity_matrix(main_motifs, track='seq_ic')
np.fill_diagonal(sim_seq, 0)
motif_names = [m.name for m in main_motifs]
df_sim_seq = pd.DataFrame(sim_seq, index=motif_names, columns=motif_names)
main_motifs[4].align(main_motifs[5], 'contrib/Nanog').attrs
main_motifs[4].align(main_motifs[5], 'contrib/Nanog').plot('contrib')
main_motifs[5].plot("contrib");
main_motifs[5].plot("seq_ic");
main_motifs[1].align(main_motifs[4], 'seq_ic').plot('seq_ic')
main_motifs[4].plot("seq_ic");
main_motifs[-2].align(main_motifs[-4], 'seq_ic').plot('seq_ic')
main_motifs[-4].plot("seq_ic");
for p in main_motifs:
p.trim_seq_ic(0.08).plot("seq_ic");
dfrm.name.str.match("ERV").sum()
dfrm.name.str.match("ERV").mean()
dfi_subset = pd.read_parquet(f"{model_dir}/deeplift/dfi_subset.parq", engine='fastparquet')
dfi_subset['row_idx'] = np.arange(len(dfi_subset))
dfi_subset['Chromosome'] = dfi_subset.example_chrom
dfi_subset['Start'] = dfi_subset.pattern_start_abs
dfi_subset['End'] = dfi_subset.pattern_end_abs
dfi_subset_pr = pr.PyRanges(dfi_subset)
assert len(dfi_subset.pattern.unique()) == len(main_motifs)
len(dfi_subset)
# exclude erv's
import pyranges as pr
# load repeat masker file
from basepair.exp.paper.repeat_masker import download_repeat_masker, read_repeat_masker, intersect_repeat_masker
dfrm = read_repeat_masker(download_repeat_masker())
dfrm_erv = dfrm[dfrm.name.str.contains("ERV")]
dfrm_erv['Start'] = dfrm_erv['start']
dfrm_erv['End'] = dfrm_erv['end']
dfrm_erv['Chromosome'] = dfrm_erv['chrom']
dfrm_erv = pr.PyRanges(dfrm_erv)
# exclude erv's
exclude = dfi_subset_pr.overlap(dfrm_erv).df.row_idx.unique()
exclude_rows = dfi_subset.row_idx.isin(exclude)
print(f"Excluding: {len(exclude_rows)} / {len(dfi_subset)} rows")
dfi_subset['is_erv'] = exclude_rows
pd.crosstab(dfi_subset.is_erv, dfi_subset.is_te)
a = pr.PyRanges(chromosomes=['chr1'], starts=[10], ends=[20])
b = pr.PyRanges(chromosomes=['chr1'], starts=[19], ends=[22])
a.overlap(b)
len(dfi_subset)
# many still remain
dfi_subset.is_te.sum()
dfi_subset_pr = pr.PyRanges(dfi_subset)
# duplicated number of instances
len(dfi_subset_pr)
# non-duplicated number of instances
len(dfi_subset_pr.drop_duplicate_positions())
dfi_subset.head()
dict(dfi_subset.pattern_name.value_counts())
fig, ax = plt.subplots(figsize=get_figsize(0.23))
dfi_subset.pattern_name.value_counts().plot.bar(width=.9)
plt.ylabel("Motif instances")
plt.xlabel("Motif")
fig.savefig(fdir / '../n-instances-per-motif.pdf')
# Number of regions with a motif (total = 150k. Only 14k lack a motif)
len(dfi_subset.groupby('example_idx').size())
dfi_subset.groupby('example_idx').size().value_counts()
total_regions = len(ranges)
fig, ax = plt.subplots(figsize=get_figsize(0.4))
vec = dfi_subset.groupby('example_idx').size()
vec = pd.concat([vec, pd.Series(np.zeros(total_regions - len(vec)))])
vec.plot.hist(bins=np.arange(11), rwidth=0.9, align='left')
plt.xlabel("Number of motifs per region")
fig.savefig(fdir / '../n-motifs-per-region.pdf')
dfi_subset.head()
19.535 / (212.63-196.32) * 0.3
3.505 / (212.63-196.32) * 0.3
dfi_subset.groupby(['pattern_name', 'example_idx']).size().reset_index()
dfi_subset.head()
fig, axes = plt.subplots(len(tasks), 1,
figsize=get_figsize(0.23*1.4, aspect=len(tasks)* 0.618/1.4),
sharey=True,
gridspec_kw=dict(hspace=0),
sharex=True)
for i, task in enumerate(tasks):
ax = axes[i]
vec = dfi_subset[dfi_subset.example_interval_from_task == task].groupby('example_idx').size()
vec = pd.concat([vec, pd.Series(np.zeros(peaks_per_task[task] - vec.index.nunique()))])
vec.plot.hist(bins=np.arange(10), rwidth=0.9, align='left', ax=ax)
ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
ax.set_xlabel("Number of motifs per region")
fig.savefig(fdir / '../n-motifs-per-region.per-task.pdf')
fig, axes = plt.subplots(len(tasks), 1,
figsize=get_figsize(0.23, aspect=len(tasks)* 0.618),
sharey=True,
gridspec_kw=dict(hspace=0),
sharex=True)
pattern_names = dfi_subset.pattern_name.unique()
for i, task in enumerate(tasks):
ax = axes[i]
if i == 0:
ax.set_title("Total number of motifs")
vec = dfi_subset[dfi_subset.example_interval_from_task == task].pattern_name.value_counts().reindex(pattern_names, fill_value=0)
vec.plot.bar(width=.9, ax=ax)
ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
ax.set_xlabel("Motif")
fig.savefig(fdir / '../n-instances-per-motif.per-task.pdf')
fig, axes = plt.subplots(len(tasks), 1,
figsize=get_figsize(0.23, aspect=len(tasks)* 0.618),
sharey=True,
gridspec_kw=dict(hspace=0),
sharex=True)
pattern_names = dfi_subset.pattern_name.unique()
for i, task in enumerate(tasks):
ax = axes[i]
if i == 0:
ax.set_title("Average number of motifs per peak")
vec = dfi_subset[dfi_subset.example_interval_from_task == task].pattern_name.value_counts().reindex(pattern_names, fill_value=0)
vec = vec / peaks_per_task[task]
vec.plot.bar(width=.9, ax=ax)
ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
ax.set_xlabel("Motif")
fig.savefig(fdir / '../n-instances-per-motif.per-task.avg-motifs-per-peak.pdf')
np.sum(dfi_subset.groupby('example_idx').size() >= 3)
np.sum(dfi_subset.groupby('example_idx').size() >= 3) / 150908
np.sum(dfi_subset.groupby('example_idx').size() >= 5)
72696 / 150000
dfi_subset.groupby('example_idx').size()
patterns = [p for p in mr.get_all_patterns()]
ic = [p.seq_info_content for p in mr.get_all_patterns() if mr.n_seqlets(p.name) > 100]
n_seqlets = [mr.n_seqlets(p.name) for p in mr.get_all_patterns()]
len(ic)
ic = [p.seq_info_content for p in mr.get_all_patterns()]
fig, ax = plt.subplots(figsize=get_figsize(0.4))
plt.hist(ic, bins=np.arange(0, 110, step=8), rwidth=0.9);
# plt.hist(ic, bins=15, rwidth=0.9, align='left');
plt.xlabel("IC")
plt.ylabel("Motifs")
fig.savefig(fdir / '../seq-IC.hist.all.pdf')
p = patterns[0]
p.seq_info_content
# create motif pairs
# dfab_w_te = pd.concat([motif_pair_dfi(dfi_subset, pair).assign(motif_pair="<>".join(pair)) for pair in pairs], axis=0)
# create motif pairs
dfab = pd.concat([motif_pair_dfi(dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)], pair).assign(motif_pair="<>".join(pair)) for pair in pairs], axis=0)
# Remove self matches
dfab = dfab.query('~((pattern_center_aln_x == pattern_center_aln_y) & (pattern_strand_aln_x == pattern_strand_aln_x))')
exclude_sox2 = dfab[(dfab.motif_pair == 'Oct4-Sox2<>Sox2') &
(dfab['center_diff_aln'] == 0)].row_idx_y.values
exclude_oct4 = dfab[(dfab.motif_pair == 'Oct4-Sox2<>Oct4') &
(dfab['center_diff_aln'] == 0)].row_idx_y.values
exclude_oct4_v2 = dfab[(dfab.motif_pair == 'Oct4<>Oct4-Oct4') &
(dfab['center_diff_aln'] == 0)].row_idx_y.values
old_len = len(dfab)
# Exclude the overlapping row
dfab = dfab[(dfab.pattern_name_x != 'Oct4') | (~dfab.row_idx_x.isin(exclude_oct4))]
dfab = dfab[(dfab.pattern_name_y != 'Oct4') | (~dfab.row_idx_y.isin(exclude_oct4))]
dfab = dfab[(dfab.pattern_name_x != 'Oct4') | (~dfab.row_idx_x.isin(exclude_oct4_v2))]
dfab = dfab[(dfab.pattern_name_y != 'Oct4') | (~dfab.row_idx_y.isin(exclude_oct4_v2))]
dfab = dfab[(dfab.pattern_name_x != 'Sox2') | (~dfab.row_idx_x.isin(exclude_sox2))]
dfab = dfab[(dfab.pattern_name_y != 'Sox2') | (~dfab.row_idx_y.isin(exclude_sox2))]
nd = len(dfab) - old_len
print(f"Removed {nd}/{len(dfab)} instances")
# dfab.to_csv(f"{model_dir}/deeplift/dfab.csv.gz", index=False, compression='gzip')
motif_subset = OrderedDict[(k,v )
# seqlets = dfi2seqlets(dfi_subset[dfi_subset.pattern_name == 'Oct4-Sox2'])
# seqlets = resize_seqlets(seqlets, 70, seqlen=1000)
# seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}
# sort_idx = np.argsort(-seqlet_profiles['Oct4'].sum(axis=(-1,-2)))[:1000]
sort_idx = np.arange(1000)
# multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
# multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx, figsize=(10,10));
from basepair.exp.chipnexus.spacing import coocurrence_plot
from basepair.exp.chipnexus.spacing import co_occurence_matrix
dfi_subset.pattern_name.unique()
def co_occurence_matrix(dfi_subset, query_string=""):
"""Returns the fraction of times pattern x (row) overlaps pattern y (column)
"""
from basepair.stats import norm_matrix
total_number = dfi_subset.groupby(['pattern']).size()
norm_counts = norm_matrix(total_number)
# normalization: minimum number of counts
total_number = dfi_subset.groupby(['pattern_name']).size()
norm_counts = norm_matrix(total_number)
# cross-product
dfi_filt_crossp = pd.merge(dfi_subset[['pattern_name', 'pattern_center_aln',
'pattern_strand_aln', 'pattern_center', 'example_idx']].set_index('example_idx'),
dfi_subset[['pattern_name', 'pattern_center_aln',
'pattern_strand_aln', 'pattern_center', 'example_idx']].set_index('example_idx'),
how='outer', left_index=True, right_index=True).reset_index()
# remove self-matches
dfi_filt_crossp = dfi_filt_crossp.query('~((pattern_name_x == pattern_name_y) & '
'(pattern_center_aln_x == pattern_center_aln_y) & '
'(pattern_strand_aln_x == pattern_strand_aln_x))')
dfi_filt_crossp['center_diff'] = dfi_filt_crossp.eval("abs(pattern_center_x- pattern_center_y)")
dfi_filt_crossp['center_diff_aln'] = dfi_filt_crossp.eval("abs(pattern_center_aln_x- pattern_center_aln_y)")
if query_string:
dfi_filt_crossp = dfi_filt_crossp.query(query_string)
match_sizes = dfi_filt_crossp.groupby(['pattern_name_x', 'pattern_name_y']).size()
count_matrix = match_sizes.unstack(fill_value=0)
norm_count_matrix = count_matrix / norm_counts # .truediv(min_counts, axis='columns').truediv(total_number, axis='index')
norm_count_matrix = norm_count_matrix.fillna(0) # these examples didn't have any paired pattern
return count_matrix, norm_count_matrix, norm_counts
def chi2_test_coc(random_coocurrence_counts, random_coocurrence,
random_coocurrence_norm, coocurrence_counts, coocurrence, coocurrence_norm):
from scipy.stats import chi2_contingency
cols = list(coocurrence_norm.columns)
n = len(random_coocurrence_counts)
o = np.zeros((n, n))
op = np.zeros((n, n))
for i in range(n):
for j in range(n):
# [[# not randomly found together , # not found together],
# [# randomly found together , # found together]]
ct = [[random_coocurrence_norm.iloc[i, j] - random_coocurrence_counts.iloc[i, j],
coocurrence_norm.iloc[i, j] - coocurrence_counts.iloc[i, j]],
[random_coocurrence_counts.iloc[i, j],
coocurrence_counts.iloc[i, j]]]
ct = np.array(ct)
# TODO - make this an actual fisher exact test from
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html
# or the chi-square contingency table:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html#scipy.stats.chi2_contingency
chi2, p, dof, ex = chi2_contingency(ct, correction=False)
# t22 = sm.stats.contingency_tables.Table2x2(np.array(ct))
o[i, j] = (ct[1, 1] / ct[0, 1]) / (ct[1, 0] / ct[0, 0 ])
op[i, j] = p
return pd.DataFrame(o, columns=cols, index=cols), pd.DataFrame(op, columns=cols, index=cols)
def coocurrence_plot(dfi_subset, motif_list, query_string="(abs(pattern_center_aln_x- pattern_center_aln_y) <= 150)",
signif_threshold=1e-5, ax=None, **kwargs):
"""Test for co-occurence
Args:
dfi_subset: desired subset of dfi
motif_list: list of motifs used to order the heatmap
query_string: string used with df_cross.query() to detering the valid motif pairs
signif_threshold: significance threshold for Fisher's exact test
"""
import seaborn as sns
if ax is None:
ax = plt.gca()
c_counts, c, c_norm = co_occurence_matrix(dfi_subset, query_string=query_string)
# Generate the NULL
dfi_subset_random = dfi_subset.copy()
np.random.seed(42)
# TODO - shall we draw the sample indices completely at random?
dfi_subset_random['example_idx'] = dfi_subset_random['example_idx'].sample(frac=1).values
rc_counts, rc, rc_norm = co_occurence_matrix(dfi_subset_random, query_string=query_string)
# test for significance
o, op = chi2_test_coc(rc_counts, rc, rc_norm, c_counts, c, c_norm)
# re-order
o = o[motif_list].loc[motif_list]
op = op[motif_list].loc[motif_list]
signif = op < signif_threshold
a = np.zeros_like(signif).astype(str)
a[signif] = "*"
a[~signif] = ""
sns.heatmap(o, annot=a, fmt="", vmin=0, vmax=2,
cmap='RdBu_r', ax=ax, **kwargs)
ax.set_title(f"odds-ratio (proximal / non-proximal) (*: p<{signif_threshold})")
# count_matrix, norm_count_matrix, norm_counts = co_occurence_matrix(dfi_subset,
# query_string=f"(abs(pattern_center_aln_x- pattern_center_aln_y) <= {dist}) & center_diff_aln > 5")
# co-occurence
fig, ax = plt.subplots(figsize=get_figsize(.4, aspect=1))
dist = 100
coocurrence_plot(dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)], list(motifs),
query_string=f"(abs(pattern_center_aln_x- pattern_center_aln_y) <= {dist}) & center_diff_aln > 5")
ax.set_ylabel("Motif of interest")
ax.set_xlabel("Motif partner");
fig.savefig(fdir / f'coocurrence.test.center_diff<{dist}.motifs=10.exclude-te.pdf')
# subsets = ['center_diff <= 35', 'center_diff > 35', 'center_diff > 70']
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.4*len(subsets), .8/len(subsets)),
sharey=True)
# dfs = dfi_subset.query('match_weighted_p > .2').query('imp_weighted_p > .2')
dfs = dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)]
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
if i == len(subsets) - 1:
cbar = True
else:
cbar = False
coocurrence_plot(dfs, list(motifs), query_string=f"({subset}) & (center_diff_aln > 5)" , ax=ax, cbar=cbar)
if i == 0:
ax.set_ylabel("Motif of interest")
ax.set_xlabel("Motif partner");
ax.set_title(subset_label)
# plt.tight_layout()
fig.savefig(fdir / f'coocurrence.test.all-dist.motifs=10.exclude-TE.pdf')
main_motifs = OrderedDict([(k,v)
for k,v in motifs.items()
if k in ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']
])
# co-occurence
fig, ax = plt.subplots(figsize=get_figsize(.25, aspect=1))
dist = 100
coocurrence_plot(dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)], list(main_motifs),
query_string=f"(abs(pattern_center_aln_x- pattern_center_aln_y) <= {dist}) & center_diff_aln > 5")
ax.set_ylabel("Motif of interest")
ax.set_xlabel("Motif partner");
fig.savefig(fdir / f'coocurrence.test.center_diff<{dist}.main-motifs.pdf')
# subsets = ['center_diff <= 35', 'center_diff > 35', 'center_diff > 70']
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets),
figsize=get_figsize(0.25*len(subsets), .8/len(subsets)),
sharey=True)
# dfs = dfi_subset.query('match_weighted_p > .2').query('imp_weighted_p > .2')
dfs = dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)]
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
if i == len(subsets) - 1:
cbar = True
else:
cbar = False
coocurrence_plot(dfs, list(main_motifs), query_string=f"({subset}) & (center_diff_aln > 5)" , ax=ax, cbar=cbar)
if i == 0:
ax.set_ylabel("Motif of interest")
ax.set_xlabel("Motif partner");
ax.set_title(subset_label)
# plt.tight_layout()
fig.savefig(fdir / f'coocurrence.test.all-dist.main-motifs.pdf')
main_motif_pairs = [k + "<>" + v for k,v in get_motif_pairs(['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4'])]
dfab_main = dfab[dfab.motif_pair.isin(main_motif_pairs)]
dfab_main['motif_pair'] = pd.Categorical(dfab_main['motif_pair'],
categories=main_motif_pairs)
plotnine.options.figure_size = get_figsize(.4, aspect=3.1875)# (10, 10)
max_dist = 100
min_dist = 6
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab_main[(dfab_main.center_diff <= max_dist) &
(dfab_main.center_diff > min_dist)]) +
geom_histogram(breaks=np.arange(min_dist, max_dist + 1)) +
facet_grid("motif_pair~ .") +
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([min_dist, max_dist]) +
# ylim([0, 1000]) +
xlab("Pairwise distance") +
scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / 'histogram.center_diff.main-4-motifs.imp>.2,match>.0.pdf')
fig
df_instances = pd.read_csv("https://docs.google.com/spreadsheets/d/1uAtJZJR_dmlWwer5Afcd2q72Uuj7A4Ln/export?gid=826411817&format=csv")
df_instances
df_instances[df_instances.IC > 4].n_cwm_peaks.sum()
df_instances[(df_instances.IC > 4) & (df_instances.is_te==False)].n_cwm_peaks.sum()
main_motif_names = [longer_pattern(v) for k,v in motifs.items()] + ["Oct4/metacluster_0/pattern_16"]
df_instances['pattern']
df_instances['pattern']
df_instances['pattern_name'] = df_instances['task'] + '/' + df_instances['pattern']
dfi_subset.pattern_name.unique()
dfi_subset.
df_instances[(df_instances.IC > 4) & (df_instances.is_te==False)].n_cwm_peaks.sum()
df_instances.head()
# Generate all spacing plots
individual_plots = {}
for motif_pair_name in pair_names:
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) &
(dfab.motif_pair == motif_pair_name)]
if len(df) == 0:
continue
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), df) +
# Spacing vertical lines
geom_vline(xintercept=10, alpha=0.1) +
geom_vline(xintercept=20, alpha=0.1) +
geom_vline(xintercept=30, alpha=0.1) +
geom_vline(xintercept=40, alpha=0.1) +
# plot
geom_histogram(breaks=np.arange(min_dist, max_dist + 1)) +
facet_grid("strand_combination~.") +
# Theme, labels, colors
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([min_dist, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_fill_brewer(type='qual', palette=3))
# Save
fig.save(fdir_individual / f'{motif_pair_name}.pdf')
individual_plots[motif_pair_name] = fig
individual_plots['Sox2<>Nanog']
individual_plots['Nanog<>Nanog']
Considered TF's:
for motif_pair_name in ['Nanog<>Nanog', 'Sox2<>Nanog', 'Nanog<>Zic3']:
plotnine.options.figure_size = get_figsize(2, aspect=2/10*4 / 2)
xmin = 5
xmax = 50
df = dfab[(dfab.center_diff <= xmax) &
(dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', fill='strand_combination'), df) +
# plot
geom_histogram(breaks=np.arange(xmin, xmax+1)) + facet_grid("strand_combination~.") +
# Theme, labels, colors
theme_bw(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_x_continuous(breaks=np.arange(xmin, xmax, step=5),
minor_breaks=np.arange(xmin, xmax, step=1)) +
scale_fill_brewer(type='qual', palette=3))
# axis_ticks_major_x()
display(fig)
fig.save(fdir_individual / f'{motif_pair_name}.large.pdf')
ImpScoreFilerdir = Path('../../../')
ls {rdir}
dataspec_file = rdir / "src/chipnexus/train/seqmodel/ChIP-nexus.dataspec.yml"
ds = DataSpec.load(dataspec_file)
# Load all files into the page cache
ds.touch_all_files()
from basepair.extractors import Interval
from basepair.exp.paper.config import fasta_file
from genomelake.extractors import FastaExtractor
from basepair.utils import halve
from basepair.preproc import resize_interval
fe = FastaExtractor(fasta_file, use_strand=True)
def plot_pwm(pwm, ax):
from matplotlib import ticker
from concise.utils.pwm import seqlogo
from basepair.modisco.utils import ic_scale
seqlogo(ic_scale(pwm), ax=ax)
# strip_axis(ax)
ax.set_xlabel(None)
# ax.get_xaxis().set_visible(False)
ax.set_ylim([0, 2.0])
ax.set_ylabel("IC")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.xaxis.set_major_locator(ticker.AutoLocator())
def create_intervals(dfab, ax=None):
from basepair.extractors import Interval
intervals = []
rev_strand = {"+": "-", "-": "+"}
for i,row in dfab.iterrows():
diff = row.pattern_center_y - row.pattern_center_x
width = np.abs(diff) + 30
# if diff > 0:
# start = min(row.pattern_start_x, row.pattern_end_x) - 10
# end = start + width
# strand = row.strand_x
# else:
# end = max(row.pattern_start_x, row.pattern_end_x) + 10 + 1
# start = end - width
# strand = row.strand_x
if diff > 0:
start = row.pattern_center_x - 15
end = start + width
strand = row.strand_x
else:
end = row.pattern_center_x + 15
start = end - width
strand = row.strand_x
intervals.append(Interval(chrom=row.example_chrom_x,
start=row.example_start_x + start,
stop=row.example_start_x + end,
name=row.motif_pair,
strand=strand,
# strand="+" if row.pattern_center_x > row.pattern_center_y else "-"
))
return intervals
It's nice that the discovered motif pairs are not comming from a TE
mrn = mr.mr_dict['Nanog']
ls /oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/deeplift/Nanog/out/profile/wn/
patterns = read_pkl("/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/deeplift/Nanog/out/profile/wn/patterns.pkl")
p = {p.name:p for p in patterns}['metacluster_0/pattern_1']
p = p.shift(4).resize(50).resize_profile(50)
p.rc().plot("seq_ic")
dfab[dfab.motif_pair == 'Nanog<>Nanog-partner'].groupby(['strand_combination', 'center_diff']).size().sort_values()
np.sum(dfi_subset.pattern_name == 'Nanog-partner')
278/7066
from basepair.BPNet import BPNetSeqModel
# create_tf_session(0)
bpnet = BPNetSeqModel.from_mdir(model_dir)
to_plot = [
("Nanog<>Nanog-partner", "-+", 6),
("Nanog<>Nanog-partner", "++", 9),
("Nanog<>Nanog-partner", "++", 10),
# ("Sox2<>Nanog", "--", 10),
# ("Sox2<>Nanog", "--", 20),
# ("Sox2<>Nanog", "--", 30),
# ("Sox2<>Nanog", "++", 13),
# ("Sox2<>Nanog", "++", 22),
# ("Sox2<>Nanog", "++", 23),
# ("Nanog<>Zic3", "++", 12),
# ("Nanog<>Zic3", "+-", 11),
# ("Nanog<>Zic3", "+-", 21),
# ("Nanog<>Zic3", "+-", 31),
# ("Oct4<>Sox2", "++", 10),
]
for motif_pair, strand_combination, dist in to_plot:
# dist = 10
# strand_combination = '++'
# motif_pair = 'Nanog<>Nanog'
fig, axes = plt.subplots(2,1,
gridspec_kw={'height_ratios':[1/7, 1/2]},
sharex=True,
figsize=get_figsize(.5, 1/7+1/2))
dfp = (dfab
.query(f'strand_combination=="{strand_combination}"')
.query(f'motif_pair=="{motif_pair}"')
.query(f'center_diff=={dist}'))
intervals = create_intervals(dfp, ax=ax)
seqs = fe(intervals)
counts = ds.load_counts(intervals)
ax = axes[0]
plot_pwm(seqs.mean(0), ax=ax)
ax.set_title(f"{motif_pair} {strand_combination}, dist = {dist}, ({len(dfp)})");
cmap = plt.get_cmap("tab10")
cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
ao = .7
# plot aggregate profile
agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
ax = axes[1]
# fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
ax.plot(agg_profile[:, 0], alpha=ao, label='Obs', color=cycle[0])
ax.plot(-agg_profile[:, 1], alpha=ao, color=cycle[0])
ax.set_ylabel("Counts")
# ax.set_title("Observed")
sns.despine(top=True, bottom=True, right=True)
# params
seqlen = seqs.shape[1]
# Plot model
agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
# fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
seqs_1kb = fe([resize_interval(interval, 1000) for interval in intervals])
preds = bpnet.predict(seqs_1kb)
agg_preds = preds['Nanog'].mean(0)
upstream_seq, downstream_seq = halve(seqlen)
xs = slice(500-upstream_seq, 500+downstream_seq)
a = agg_profile.sum() / agg_preds[xs].sum()
ax.plot(a*agg_preds[xs, 0], label='BPNet', alpha=ao, color=cycle[1])
ax.plot(-a*agg_preds[xs, 1], alpha=ao, color=cycle[1])
ax.set_ylabel("Counts")
# ax.set_title("BPNet")
sns.despine(top=True, bottom=True, right=True)
# pshifted = p.rc().shift(-3).resize(seqlen).resize_profile(seqlen)
# upstream, downstream = halve(dist)
# double_profile = pshifted.shift(-upstream).profile['Nanog'] + pshifted.shift(downstream).profile['Nanog']
# # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
# a = agg_profile.sum() / double_profile.sum()
# ax.plot(a*double_profile[:, 0], label='Mixture', alpha=ao, color=cycle[2])
# ax.plot(-a*double_profile[:, 1], alpha=ao, color=cycle[2])
# ax.set_ylabel("Counts")
# ax.set_title("Mixture")
sns.despine(top=True, bottom=True, right=True)
ax.legend()#labels=['Obs', 'BPNet', 'Mixure'])
fig.align_ylabels()
fig.savefig(fdir_individual / f'{motif_pair}.{strand_combination}.{dist}.pwm+profile.pdf')
I looked at all the motif pairs of Nanog and Nanog-alt and then visualized the aggregate footprints for the following scenarios:
# Nanog
from basepair.extractors import Interval
to_plot = [
("Nanog<>Nanog-partner", "-+", 6),
("Nanog<>Nanog-partner", "-+", -6),
("Nanog<>Nanog-partner", None, -15),
("Nanog<>Nanog-partner", "-+", None),
]
for motif_pair, strand_combination, dist in to_plot:
# dist = 10
# strand_combination = '++'
# motif_pair = 'Nanog<>Nanog'
fig, axes = plt.subplots(2,1,
gridspec_kw={'height_ratios':[1/7, 1/2]},
sharex=True,
figsize=get_figsize(.5, 1/7+1/2))
dfp = (dfab
.query(f'motif_pair=="{motif_pair}"'))
if dist is not None:
if dist > 0 :
dfp = (dfp
.query(f'((strand_combination=="{strand_combination}") & (center_diff=={dist}))'))
else:
if strand_combination is not None:
dfp = (dfp
.query(f'~((strand_combination=="{strand_combination}") & (center_diff==-{dist}))'))
else:
dfp = (dfp
.query(f'center_diff>-{dist}'))
intervals = [Interval(chrom=row.Chromosome_y,
start=row.Start_y - 20,
stop=row.End_y + 20,
strand=row.strand_y)
for i, row in dfp.iterrows()]
# intervals = create_intervals(dfp, ax=ax)
seqs = fe(intervals)
counts = ds.load_counts(intervals)
ax = axes[0]
plot_pwm(seqs.mean(0), ax=ax)
ax.set_title(f"{motif_pair} {strand_combination}, dist = {dist}, ({len(dfp)})");
cmap = plt.get_cmap("tab10")
cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
ao = .7
# plot aggregate profile
agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
ax = axes[1]
# fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
ax.plot(agg_profile[:, 0], alpha=ao, label='Obs', color=cycle[0])
ax.plot(-agg_profile[:, 1], alpha=ao, color=cycle[0])
ax.set_ylabel("Counts")
# ax.set_title("Observed")
sns.despine(top=True, bottom=True, right=True)
# params
seqlen = seqs.shape[1]
# Plot model
agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
# fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
seqs_1kb = fe([resize_interval(interval, 1000) for interval in intervals])
# preds = bpnet.predict(seqs_1kb)
# agg_preds = preds['Nanog'].mean(0)
upstream_seq, downstream_seq = halve(seqlen)
xs = slice(500-upstream_seq, 500+downstream_seq)
# a = agg_profile.sum() / agg_preds[xs].sum()
# ax.plot(a*agg_preds[xs, 0], label='BPNet', alpha=ao, color=cycle[1])
# ax.plot(-a*agg_preds[xs, 1], alpha=ao, color=cycle[1])
ax.set_ylabel("Counts")
ax.set_ylim([-11, 11])
# ax.set_title("BPNet")
sns.despine(top=True, bottom=True, right=True)
# pshifted = p.rc().shift(-3).resize(seqlen).resize_profile(seqlen)
# upstream, downstream = halve(dist)
# double_profile = pshifted.shift(-upstream).profile['Nanog'] + pshifted.shift(downstream).profile['Nanog']
# # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
# a = agg_profile.sum() / double_profile.sum()
# ax.plot(a*double_profile[:, 0], label='Mixture', alpha=ao, color=cycle[2])
# ax.plot(-a*double_profile[:, 1], alpha=ao, color=cycle[2])
# ax.set_ylabel("Counts")
# ax.set_title("Mixture")
sns.despine(top=True, bottom=True, right=True)
ax.legend()#labels=['Obs', 'BPNet', 'Mixure'])
fig.align_ylabels()
fig.savefig(fdir_individual / f'{motif_pair}.{strand_combination}.{dist}.pwm+profile.Nanog-mix.pdf')
nanog_patterns = read_pkl("/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/deeplift/Nanog/out/profile/wn/patterns.pkl")
nanog_patterns = {p.name: p for p in nanog_patterns}
nanog_patterns
p_nanog = nanog_patterns[longer_pattern(motifs['Nanog'].split("/")[1])]
p_nanog_alt = longer_pattern(motifs['Nanog-partner'].split("/")[1])
from basepair.exp.paper.fig4 import *
pattern_names = ['Nanog', 'Nanog-partner']
for pname in pattern_names:
p = nanog_patterns[longer_pattern(motifs[pname].split("/")[1])]
agg_profile = p.profile['Nanog'][70:130]
pfm = p.seq[5:65]
fig, axes = plt.subplots(2,1,
gridspec_kw={'height_ratios':[1/7, 1/2]},
sharex=True,
figsize=get_figsize(.5, 1/7+1/2))
cmap = plt.get_cmap("tab10")
cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
ax = axes[0]
plot_pwm(pfm, ax=ax)
ax.set_title(f"Nanog");
ao = .7
# plot aggregate profile
ax = axes[1]
ax.plot(agg_profile[:, 0], alpha=ao, label='Obs', color=cycle[0])
ax.plot(-agg_profile[:, 1], alpha=ao, color=cycle[0])
ax.set_ylabel("Counts")
ax.set_ylim([-11, 11])
# ax.set_title("BPNet")
sns.despine(top=True, bottom=True, right=True)
ax.legend()
fig.align_ylabels()
fig.savefig(fdir_individual / f'only-{pname}.pdf')
p_nanog.plot(['seq_ic', 'profile'])
to_plot = [
("Nanog<>Nanog", "++", 10),
("Nanog<>Nanog", "++", 20),
("Nanog<>Nanog", "++", 21),
("Nanog<>Nanog", "++", 30),
("Nanog<>Nanog", "++", 31),
("Nanog<>Nanog", "++", 40),
("Nanog<>Nanog", "++", 41),
# ("Sox2<>Nanog", "--", 10),
# ("Sox2<>Nanog", "--", 20),
# ("Sox2<>Nanog", "--", 30),
# ("Sox2<>Nanog", "++", 13),
# ("Sox2<>Nanog", "++", 22),
# ("Sox2<>Nanog", "++", 23),
# ("Nanog<>Zic3", "++", 12),
# ("Nanog<>Zic3", "+-", 11),
# ("Nanog<>Zic3", "+-", 21),
# ("Nanog<>Zic3", "+-", 31),
# ("Oct4<>Sox2", "++", 10),
]
for motif_pair, strand_combination, dist in to_plot:
# dist = 10
# strand_combination = '++'
# motif_pair = 'Nanog<>Nanog'
fig, axes = plt.subplots(2,1,
gridspec_kw={'height_ratios':[1/7, 1/2]},
sharex=True,
figsize=get_figsize(.5, 1/7+1/2))
dfp = (dfab
.query(f'strand_combination=="{strand_combination}"')
.query(f'motif_pair=="{motif_pair}"')
.query(f'center_diff=={dist}'))
intervals = create_intervals(dfp, ax=ax)
seqs = fe(intervals)
counts = ds.load_counts(intervals)
ax = axes[0]
plot_pwm(seqs.mean(0), ax=ax)
ax.set_title(f"{motif_pair} {strand_combination}, dist = {dist}, ({len(dfp)})");
cmap = plt.get_cmap("tab10")
cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
ao = .7
# plot aggregate profile
agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
ax = axes[1]
# fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
ax.plot(agg_profile[:, 0], alpha=ao, label='Obs', color=cycle[0])
ax.plot(-agg_profile[:, 1], alpha=ao, color=cycle[0])
ax.set_ylabel("Counts")
# ax.set_title("Observed")
sns.despine(top=True, bottom=True, right=True)
# params
seqlen = seqs.shape[1]
# Plot model
agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
# fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
seqs_1kb = fe([resize_interval(interval, 1000) for interval in intervals])
preds = bpnet.predict(seqs_1kb)
agg_preds = preds['Nanog'].mean(0)
upstream_seq, downstream_seq = halve(seqlen)
xs = slice(500-upstream_seq, 500+downstream_seq)
a = agg_profile.sum() / agg_preds[xs].sum()
ax.plot(a*agg_preds[xs, 0], label='BPNet', alpha=ao, color=cycle[1])
ax.plot(-a*agg_preds[xs, 1], alpha=ao, color=cycle[1])
ax.set_ylabel("Counts")
# ax.set_title("BPNet")
sns.despine(top=True, bottom=True, right=True)
pshifted = p.rc().shift(-3).resize(seqlen).resize_profile(seqlen)
upstream, downstream = halve(dist)
double_profile = pshifted.shift(-upstream).profile['Nanog'] + pshifted.shift(downstream).profile['Nanog']
# fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
a = agg_profile.sum() / double_profile.sum()
ax.plot(a*double_profile[:, 0], label='Mixture', alpha=ao, color=cycle[2])
ax.plot(-a*double_profile[:, 1], alpha=ao, color=cycle[2])
ax.set_ylabel("Counts")
# ax.set_title("Mixture")
sns.despine(top=True, bottom=True, right=True)
ax.legend()#labels=['Obs', 'BPNet', 'Mixure'])
fig.align_ylabels()
fig.savefig(fdir_individual / f'{motif_pair}.{strand_combination}.{dist}.pwm+profile.pdf')
def extract_stacked_seqlets(dfab, isf):
from matplotlib import ticker
from concise.utils.pwm import seqlogo
from basepair.modisco.utils import ic_scale
from basepair.extractors import Interval
seqlets = []
rev_strand = {"+": "-", "-": "+"}
for i,row in dfab.iterrows():
diff = row.pattern_center_y - row.pattern_center_x
width = np.abs(diff) + 30
if diff > 0:
start = row.pattern_center_x - 15
end = start + width
strand = row.strand_x
else:
end = row.pattern_center_x + 15
start = end - width
strand = row.strand_x
seqlets.append(Seqlet(seqname=row.example_idx,
start=start,
end=end,
name=row.motif_pair,
strand=strand,
# strand="+" if row.pattern_center_x > row.pattern_center_y else "-"
))
sts = isf.extract(seqlets)
return sts
# to_plot = [
# ("Nanog<>Nanog", "++", 10),
# ("Nanog<>Nanog", "++", 20),
# ("Nanog<>Nanog", "++", 21),
# ("Nanog<>Nanog", "++", 30),
# ("Nanog<>Nanog", "++", 31),
# ("Nanog<>Nanog", "++", 40),
# ("Nanog<>Nanog", "++", 41),
# ("Sox2<>Nanog", "--", 10),
# ("Sox2<>Nanog", "--", 20),
# ("Sox2<>Nanog", "--", 30),
# ("Sox2<>Nanog", "++", 13),
# ("Sox2<>Nanog", "++", 22),
# ("Sox2<>Nanog", "++", 23),
# ("Nanog<>Zic3", "++", 12),
# ("Nanog<>Zic3", "+-", 11),
# ("Nanog<>Zic3", "+-", 21),
# ("Nanog<>Zic3", "+-", 31),
# ]
# for motif_pair, strand_combination, dist in to_plot:
# # dist = 10
# # strand_combination = '++'
# # motif_pair = 'Nanog<>Nanog'
# fig, ax = plt.subplots(1,1, figsize=get_figsize(.5, 1/7))
# dfp = (dfab
# .query(f'strand_combination=="{strand_combination}"')
# .query(f'motif_pair=="{motif_pair}"')
# .query(f'center_diff=={dist}'))
# sts = extract_stacked_seqlets(dfp, isf)
# plot_pwm(sts.seq.mean(0), ax=ax)
# ax.set_title(f"{motif_pair} {strand_combination}, dist = {dist}, ({len(dfp)})");
dfi_subset with profile scores_get_seqlets(pattern, trim_frac=trim_frac) to MultipleModiscoResultmr.get_pattern(dfi.pattern)# isf.cache()
motif_pair_name = 'Oct4-Sox2<>Nanog'
dfab['Nanog/profile_counts_max_ref_y_log'] = np.log10(1+dfab['Nanog/profile_counts_max_ref_y'])
dfab['Sox2/profile_counts_max_ref_x_log'] = np.log10(1+dfab['Sox2/profile_counts_max_ref_x'])
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) &
(dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff',
y='Nanog/profile_counts_max_ref_y_log', # or 'profile_counts_max_ref_p'
color='strand_combination'
),
df) +
# plot
geom_point(alpha=0.5, size=.05, shape='.') +
facet_grid("strand_combination~.") +
# scale_y_log10() +
# geom_smooth() +
# ylim([1, None]) +
# Theme, labels, colors
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([min_dist, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_color_brewer(type='qual', palette=3)
)
fig
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) &
(dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff',
y='Nanog/profile_counts_max_ref_y_log', # or 'profile_counts_max_ref_p'
color='strand_combination'),
df.groupby(["center_diff", 'strand_combination'])['Nanog/profile_counts_max_ref_y_log'].mean().reset_index()) +
# plot
geom_line() +
facet_grid("strand_combination~.") +
# geom_smooth() +
# ylim([1, None]) +
# Theme, labels, colors
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([min_dist, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_color_brewer(type='qual', palette=3)
)
fig
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) &
(dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff',
y='Sox2/profile_counts_max_ref_x_log', # or 'profile_counts_max_ref_p'
color='strand_combination'
),
df) +
# plot
geom_jitter(alpha=0.4, size=.05, shape='.', height=0, width=1) +
facet_grid("strand_combination~.") +
# scale_y_log10() +
# geom_smooth() +
# ylim([1, None]) +
# Theme, labels, colors
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([min_dist, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_color_brewer(type='qual', palette=3)
)
fig
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) &
(dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff',
y='Sox2/profile_counts_max_ref_x_log', # or 'profile_counts_max_ref_p'
color='strand_combination'),
df.groupby(["center_diff", 'strand_combination'])['Sox2/profile_counts_max_ref_x_log'].mean().reset_index()) +
# plot
geom_line() +
facet_grid("strand_combination~.") +
# geom_smooth() +
# ylim([1, None]) +
# Theme, labels, colors
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([min_dist, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_color_brewer(type='qual', palette=3)
)
fig
motif_pair_name = 'Sox2<>Nanog'
dfab['Nanog/profile_counts_max_ref_y_log'] = np.log10(1+dfab['Nanog/profile_counts_max_ref_y'])
dfab['Sox2/profile_counts_max_ref_x_log'] = np.log10(1+dfab['Sox2/profile_counts_max_ref_x'])
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) &
(dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff',
y='Sox2/profile_counts_max_ref_x_log', # or 'profile_counts_max_ref_p'
color='strand_combination'
),
df) +
# plot
geom_point(alpha=0.5, size=.05, shape='.') +
facet_grid("strand_combination~.") +
# scale_y_log10() +
# geom_smooth() +
# ylim([1, None]) +
# Theme, labels, colors
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([min_dist, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_color_brewer(type='qual', palette=3)
)
fig
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) &
(dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff',
y='Nanog/profile_counts_max_ref_y_log', # or 'profile_counts_max_ref_p'
color='strand_combination'
),
df) +
# plot
geom_point(alpha=0.5, size=.05, shape='.') +
facet_grid("strand_combination~.") +
# scale_y_log10() +
# geom_smooth() +
# ylim([1, None]) +
# Theme, labels, colors
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([min_dist, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_color_brewer(type='qual', palette=3)
)
fig
dfab.head()
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) &
(dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff',
y='Nanog/profile_counts_max_ref_x_log', # or 'profile_counts_max_ref_p'
color='strand_combination'
),
df) +
# plot
geom_point(alpha=0.2, size=.05, shape='.') +
facet_grid("strand_combination~.") +
# scale_y_log10() +
# geom_smooth() +
# ylim([1, None]) +
# Theme, labels, colors
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([min_dist, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_color_brewer(type='qual', palette=3)
)
fig
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) &
(dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff',
y='Nanog/profile_counts_max_ref_x_log', # or 'profile_counts_max_ref_p'
color='strand_combination'
),
df) +
# plot
geom_point(alpha=0.2, size=.05, shape='.') +
facet_grid("strand_combination~.") +
# scale_y_log10() +
# geom_smooth() +
# ylim([1, None]) +
# Theme, labels, colors
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([min_dist, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_color_brewer(type='qual', palette=3)
)
fig
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) &
(dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff',
y='Nanog/profile_counts_max_ref_x_log', # or 'profile_counts_max_ref_p'
color='strand_combination'),
df.groupby(["center_diff", 'strand_combination'])['Nanog/profile_counts_max_ref_x_log'].quantile(0.95).reset_index()) +
# plot
geom_line() +
facet_grid("strand_combination~.") +
# geom_smooth() +
# ylim([1, None]) +
# Theme, labels, colors
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([min_dist, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_color_brewer(type='qual', palette=3)
)
fig
# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) &
(dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff',
y='Nanog/profile_counts_max_ref_x_log', # or 'profile_counts_max_ref_p'
color='strand_combination'),
df.groupby(["center_diff", 'strand_combination'])['Nanog/profile_counts_max_ref_x_log'].quantile(.5).reset_index()) +
# plot
geom_line() +
facet_grid("strand_combination~.") +
# geom_smooth() +
# ylim([1, None]) +
# Theme, labels, colors
theme_classic(base_size=10, base_family='Arial') +
theme(strip_text = element_text(rotation=0), legend_position='top') +
xlim([min_dist, max_dist]) +
xlab("Pairwise distance") +
ggtitle(motif_pair_name) +
scale_color_brewer(type='qual', palette=3)
)
fig