# filtered instances
dfi_filtered = (dfi.query('match_weighted_p > 0.1').query('imp_weighted_p > 0'))
keep_nonte = dfint_no_intersection(dfi_filtered[interval_cols], dfi_te[interval_cols])
print("not overlapping TE", keep_nonte.mean()) # almost all were kept
dfi_filtered = dfi_filtered[keep_nonte]
# filtered
dfi_filtered.pattern_name.value_counts()
# original
dfi.pattern_name.value_counts()
dfab = motif_pair_dfi(dfi_filtered, ['Oct4-Sox2', 'Sox2'])
fig = plot_spacing(dfab, alpha_scatter=0.05, y_feature='profile_counts', figsize=get_figsize(.4, aspect=2))
!mkdir -p {figures}/'individual'
for motif_pair in tqdm(pairs):
dfab = motif_pair_dfi(dfi_filtered, motif_pair)
for yf in ['profile_counts', 'imp_weighted']:
fig = plot_spacing(dfab, alpha_scatter=0.05, y_feature=yf, figsize=get_figsize(.4, aspect=2))
mp_name = "<>".join(motif_pair)
fig.savefig(figures/ f'individual/{mp_name}.{yf}.pdf')
# plt.style.use('default')
in_silico = read_pkl(f"{ddir}/processed/chipnexus/simulation/spacing.pkl")
sim_df_d, sim_res_dict_d = in_silico
sim_df_d.keys()
def swap_orientation(o):
if o=='+':
return "-"
if o=="-":
return "+"
raise ValueError("")
# Very much a hacked function to map simulation to in vivo data
def get_xy_sim_single(sim_df_d, motif_pair, feature, orientation):
# For Nanog, always explicilty swap the orientation
orientation_pair = [orientation[0], orientation[1]]
# HACK! Nanog orientation didn't properly match the orientation
if motif_pair[0] == "Nanog":
orientation_pair[0] = swap_orientation(orientation_pair[0])
if motif_pair[1] == "Nanog":
orientation_pair[1] = swap_orientation(orientation_pair[1])
mp = list(deepcopy(motif_pair))
if orientation_pair[0] == "-":
mp[0] = mp[0] + "/rc"
if orientation_pair[1] == "-":
mp[1] = mp[1] + "/rc"
df = sim_df_d[mp[0]] # choose the central motif
df = df[df.motif == mp[1]] # choose the side motif
df = df[df.distance < 150]
# select the task
df = df[df.task == profile_mapping[motif_pair[0]]]
return df.distance.values, df[feature].values
def get_xy_sim(sim_df_d, motif_pair, feature, orientation):
x1,y1 = get_xy_sim_single(sim_df_d, motif_pair, feature, orientation)
x2,y2 = get_xy_sim_single(sim_df_d, list(reversed(motif_pair)), feature, comp_strand_compbination[orientation])
assert np.all(x1 == x2)
return x1, y1, y2
sim_df_d['Oct4-Sox2'].head()
dfi_filtered = (dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)]
.query('match_weighted_p > 0.2')
.query('imp_weighted_p > 0'))
keep_nonte = dfint_no_intersection(dfi_filtered[interval_cols], dfi_te[interval_cols])
print("not overlapping TE", keep_nonte.mean()) # almost all were kept
dfi_filtered = dfi_filtered[keep_nonte]
d_dfi_filtered=[]
for motif_pair in tqdm(pairs):
dftw_filt = construct_motif_pairs(dfi_filtered, motif_pair,
features=['match_weighted_p','imp_weighted_p', 'imp_weighted'] + \
[f for f in dfi if "profile" in f])
# assure the right strand combination
dftw_filt[dftw_filt.center_diff < 0]['strand_combination'] = dftw_filt[dftw_filt.center_diff < 0]['strand_combination'].map(comp_strand_compbination)
dftw_filt.center_diff = np.abs(dftw_filt.center_diff)
if motif_pair[0] == motif_pair[1]:
motif_pair = [f"{m}{i}" for i,m in enumerate(motif_pair)]
d_dfi_filtered.append((motif_pair, dftw_filt))
fig_profile, all_axes = plt.subplots(3*len(strand_combinations), len(d_dfi_filtered), figsize=(25,15), sharex=True, sharey='row')
for j, (motif_pair, dftw_filt) in enumerate(d_dfi_filtered):
axes = all_axes[:,j]
if motif_pair[0][:-1] == motif_pair[1][:-1]:
axes[0].set_title("{f}<>{f}".format(f=motif_pair[0][:-1]), fontsize=7)
motif_pair_c = [mp[:-1] for mp in motif_pair]
else:
motif_pair_c = motif_pair
axes[0].set_title("<>".join(motif_pair), fontsize=7)
dftw_filt = dftw_filt[(dftw_filt.center_diff < 150) & (dftw_filt.imp_weighted_p.max(1) > 0.3)]
ymax = max([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).max().max() for mp in motif_pair_c])
ymin = min([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).min().min() for mp in motif_pair_c])
#ymax = dftw_filt.imp_weighted.max().max()
#ymin = dftw_filt.imp_weighted.min().min()
for i, sc in enumerate(strand_combinations):
y1 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[0]] + "/profile_counts"][motif_pair[0]])
y2 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[1]] + "/profile_counts"][motif_pair[1]])
# y1 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[0]]
# y2 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[1]]
x = dftw_filt[dftw_filt.strand_combination==sc]['center_diff']
#dm,ym,confi = average_distance(x,y, window=5)
dm1,ym1,confi1 = smooth_lowess(x,y1, frac=0.15)
dm2,ym2,confi2 = smooth_lowess(x,y2, frac=0.15)
#dm,ym, confi = smooth_gam(x,y, 140, 20)
ax = axes[3*i]
ax.hist(dftw_filt[dftw_filt.strand_combination==sc]['center_diff'], np.arange(10, 150, 1));
if j == 0:
ax.set_ylabel(sc)
# second plot
ax = axes[3*i+1]
ax.scatter(x,y1, alpha=0.05, s=8)
if confi1 is not None:
ax.fill_between(dm1, confi1[:,0], confi1[:,1], alpha=0.2)
ax.plot(dm1, ym1, linewidth=2, alpha=0.8)
ax.scatter(x,y2, alpha=0.05, s=8)
if confi2 is not None:
ax.fill_between(dm2, confi2[:,0], confi2[:,1], alpha=0.2)
ax.plot(dm2, ym2, linewidth=2, alpha=0.8)
if j == 0:
ax.set_ylabel(sc)
# third plot, simulated
ax = axes[3*i+2]
sim_x, sim_y1, sim_y2 = get_xy_sim(sim_df_d, motif_pair_c, 'profile/counts_frac', sc)
ax.axhline(1, linestyle="--", color='grey', alpha=0.2)
ax.plot(sim_x, sim_y1, linewidth=1, alpha=0.8)
ax.plot(sim_x, sim_y2, linewidth=1, alpha=0.8)
ax.xaxis.set_minor_locator(plt.MultipleLocator(10))
if j == 0:
ax.set_ylabel(sc)
fig_profile.subplots_adjust(wspace=0, hspace=0)
fig_imp, all_axes = plt.subplots(3*len(strand_combinations), len(d_dfi_filtered), figsize=(25,15), sharex=True, sharey='row')
for j, (motif_pair, dftw_filt) in enumerate(d_dfi_filtered):
axes = all_axes[:,j]
if motif_pair[0][:-1] == motif_pair[1][:-1]:
axes[0].set_title("{f}<>{f}".format(f=motif_pair[0][:-1]), fontsize=7)
motif_pair_c = [mp[:-1] for mp in motif_pair]
else:
motif_pair_c = motif_pair
axes[0].set_title("<>".join(motif_pair), fontsize=7)
dftw_filt = dftw_filt[(dftw_filt.center_diff < 150) & (dftw_filt.imp_weighted_p.max(1) > 0.3)]
#ymax = max([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).max().max() for mp in motif_pair_c])
#ymin = min([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).min().min() for mp in motif_pair_c])
ymax = dftw_filt.imp_weighted.max().max()
ymin = dftw_filt.imp_weighted.min().min()
for i, sc in enumerate(strand_combinations):
#y1 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[0]] + "/profile_counts"][motif_pair[0]])
#y2 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[1]] + "/profile_counts"][motif_pair[1]])
y1 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[0]]
y2 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[1]]
x = dftw_filt[dftw_filt.strand_combination==sc]['center_diff']
#dm,ym,confi = average_distance(x,y, window=5)
dm1,ym1,confi1 = smooth_lowess(x,y1, frac=0.15)
dm2,ym2,confi2 = smooth_lowess(x,y2, frac=0.15)
#dm,ym, confi = smooth_gam(x,y, 140, 20)
ax = axes[3*i]
ax.hist(dftw_filt[dftw_filt.strand_combination==sc]['center_diff'], np.arange(10, 150, 1));
if j == 0:
ax.set_ylabel(sc)
# second plot
ax = axes[3*i+1]
ax.scatter(x,y1, alpha=0.05, s=8)
if confi1 is not None:
ax.fill_between(dm1, confi1[:,0], confi1[:,1], alpha=0.2)
ax.plot(dm1, ym1, linewidth=2, alpha=0.8)
ax.scatter(x,y2, alpha=0.05, s=8)
if confi2 is not None:
ax.fill_between(dm2, confi2[:,0], confi2[:,1], alpha=0.2)
ax.plot(dm2, ym2, linewidth=2, alpha=0.8)
if j == 0:
ax.set_ylabel(sc)
# third plot, simulated
ax = axes[3*i+2]
sim_x, sim_y1, sim_y2 = get_xy_sim(sim_df_d, motif_pair_c, 'imp/weighted_frac', sc)
ax.axhline(1, linestyle="--", color='grey', alpha=0.2)
ax.plot(sim_x, sim_y1, linewidth=1, alpha=0.8)
ax.plot(sim_x, sim_y2, linewidth=1, alpha=0.8)
ax.xaxis.set_minor_locator(plt.MultipleLocator(10))
if j == 0:
ax.set_ylabel(sc)
fig_imp.subplots_adjust(wspace=0, hspace=0)
fig_profile.savefig(figures / "non-TE.profile_counts.pdf")
fig_imp.savefig(figures / "non-TE.importance_weighted.pdf")