In [ ]:
 

Large pairwise plot

Re-implement spacing

In [47]:
# filtered instances
dfi_filtered = (dfi.query('match_weighted_p > 0.1').query('imp_weighted_p > 0'))
keep_nonte = dfint_no_intersection(dfi_filtered[interval_cols], dfi_te[interval_cols])
print("not overlapping TE", keep_nonte.mean())  # almost all were kept
dfi_filtered = dfi_filtered[keep_nonte]
not overlapping TE 0.9622691422132954
In [42]:
# filtered
dfi_filtered.pattern_name.value_counts()
Out[42]:
Nanog        30601
Klf4         18633
Oct4-Sox2    10364
Sox2          6944
Name: pattern_name, dtype: int64
In [43]:
# original
dfi.pattern_name.value_counts()
Out[43]:
Nanog        899892
Oct4-Sox2    294321
Klf4         285485
Sox2         250038
Name: pattern_name, dtype: int64
In [48]:
dfab = motif_pair_dfi(dfi_filtered, ['Oct4-Sox2', 'Sox2'])
In [49]:
fig = plot_spacing(dfab, alpha_scatter=0.05, y_feature='profile_counts', figsize=get_figsize(.4, aspect=2))
In [235]:
!mkdir -p {figures}/'individual'

Save all pairs to pdf

In [ ]:
for motif_pair in tqdm(pairs):
    dfab = motif_pair_dfi(dfi_filtered, motif_pair)
    for yf in ['profile_counts', 'imp_weighted']:
        fig = plot_spacing(dfab, alpha_scatter=0.05, y_feature=yf, figsize=get_figsize(.4, aspect=2))
        mp_name = "<>".join(motif_pair)
        fig.savefig(figures/ f'individual/{mp_name}.{yf}.pdf')

Simulation data

In [78]:
# plt.style.use('default')
In [50]:
in_silico = read_pkl(f"{ddir}/processed/chipnexus/simulation/spacing.pkl")
In [51]:
sim_df_d, sim_res_dict_d = in_silico
In [52]:
sim_df_d.keys()
Out[52]:
dict_keys(['Oct4-Sox2', 'Sox2', 'Essrb', 'Nanog', 'Klf4', 'Oct4-Sox2/rc', 'Sox2/rc', 'Essrb/rc', 'Nanog/rc', 'Klf4/rc'])
In [53]:
def swap_orientation(o):
    if o=='+':
        return "-"
    if o=="-":
        return "+"
    raise ValueError("")

# Very much a hacked function to map simulation to in vivo data
def get_xy_sim_single(sim_df_d, motif_pair, feature, orientation):
    # For Nanog, always explicilty swap the orientation
    orientation_pair = [orientation[0], orientation[1]]
    # HACK! Nanog orientation didn't properly match the orientation
    if motif_pair[0] == "Nanog":
        orientation_pair[0] = swap_orientation(orientation_pair[0])
    if motif_pair[1] == "Nanog":
        orientation_pair[1] = swap_orientation(orientation_pair[1])

    mp = list(deepcopy(motif_pair))
    if orientation_pair[0] == "-":
        mp[0] = mp[0] + "/rc"
    if orientation_pair[1] == "-":
        mp[1] = mp[1] + "/rc"
        
    df = sim_df_d[mp[0]]  # choose the central motif
    df = df[df.motif == mp[1]]  # choose the side motif
    
    df = df[df.distance < 150]
    
    # select the task
    df = df[df.task == profile_mapping[motif_pair[0]]]        
    return df.distance.values, df[feature].values
    
def get_xy_sim(sim_df_d, motif_pair, feature, orientation):
    x1,y1 = get_xy_sim_single(sim_df_d, motif_pair, feature, orientation)
    x2,y2 = get_xy_sim_single(sim_df_d, list(reversed(motif_pair)), feature, comp_strand_compbination[orientation])
    assert np.all(x1 == x2)
    return x1, y1, y2
In [54]:
sim_df_d['Oct4-Sox2'].head()
Out[54]:
central_motif distance imp/count imp/count_frac imp/weighted imp/weighted_frac position profile/counts profile/counts_frac profile/max profile/max_frac profile/simmetric_kl side_motif task motif
0 TTTGCATAACAA 11 0.9609 0.6405 0.9823 0.8519 511 185.7749 2.3054 12.2156 2.2955 0.1719 TTTGCATAACAA Oct4 Oct4-Sox2
1 TTTGCATAACAA 11 0.5739 0.6229 0.6065 0.7806 511 56.8762 1.9200 5.4221 2.1760 0.3281 TTTGCATAACAA Sox2 Oct4-Sox2
2 TTTGCATAACAA 11 0.6132 0.6085 0.3001 0.6910 511 37.6371 1.3918 1.5615 1.9475 0.1569 TTTGCATAACAA Nanog Oct4-Sox2
3 TTTGCATAACAA 11 0.2450 0.5783 0.2800 0.6075 511 14.7951 1.0222 0.7564 1.4350 0.1079 TTTGCATAACAA Klf4 Oct4-Sox2
4 TTTGCATAACAA 12 0.5861 0.3907 0.4954 0.4297 512 111.2185 1.3802 4.5952 0.8635 0.1345 TTTGCATAACAA Oct4 Oct4-Sox2
In [85]:
dfi_filtered = (dfi[(dfi.pattern_center > 400) & (dfi.pattern_center < 600)]
                .query('match_weighted_p > 0.2')
                .query('imp_weighted_p > 0'))
keep_nonte = dfint_no_intersection(dfi_filtered[interval_cols], dfi_te[interval_cols])
print("not overlapping TE", keep_nonte.mean())  # almost all were kept
dfi_filtered = dfi_filtered[keep_nonte]
d_dfi_filtered=[]
for motif_pair in tqdm(pairs):
    dftw_filt = construct_motif_pairs(dfi_filtered, motif_pair,
                                     features=['match_weighted_p','imp_weighted_p', 'imp_weighted'] + \
                                      [f for f in dfi if "profile" in f])
    
    # assure the right strand combination
    dftw_filt[dftw_filt.center_diff < 0]['strand_combination'] = dftw_filt[dftw_filt.center_diff < 0]['strand_combination'].map(comp_strand_compbination)
    dftw_filt.center_diff = np.abs(dftw_filt.center_diff)
    
    if motif_pair[0] == motif_pair[1]:
        motif_pair = [f"{m}{i}" for i,m in enumerate(motif_pair)]
    d_dfi_filtered.append((motif_pair, dftw_filt))
  0%|          | 0/10 [00:00<?, ?it/s]
not overlapping TE 0.9445685825817741
100%|██████████| 10/10 [00:11<00:00,  1.19s/it]

Response = Profile

In [86]:
fig_profile, all_axes = plt.subplots(3*len(strand_combinations), len(d_dfi_filtered), figsize=(25,15), sharex=True, sharey='row')

for j, (motif_pair, dftw_filt) in enumerate(d_dfi_filtered):
    axes = all_axes[:,j]
    if motif_pair[0][:-1] == motif_pair[1][:-1]:
        axes[0].set_title("{f}<>{f}".format(f=motif_pair[0][:-1]), fontsize=7)
        motif_pair_c = [mp[:-1] for mp in motif_pair]
    else:
        motif_pair_c = motif_pair
        axes[0].set_title("<>".join(motif_pair), fontsize=7)
    dftw_filt = dftw_filt[(dftw_filt.center_diff < 150) & (dftw_filt.imp_weighted_p.max(1) > 0.3)]

    
    ymax = max([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).max().max() for mp in motif_pair_c])
    ymin = min([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).min().min() for mp in motif_pair_c])
    #ymax = dftw_filt.imp_weighted.max().max()
    #ymin = dftw_filt.imp_weighted.min().min()
    for i, sc in enumerate(strand_combinations):
        y1 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[0]] + "/profile_counts"][motif_pair[0]])
        y2 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[1]] + "/profile_counts"][motif_pair[1]])
        # y1 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[0]]
        # y2 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[1]]
        x = dftw_filt[dftw_filt.strand_combination==sc]['center_diff']

        #dm,ym,confi = average_distance(x,y, window=5)
        dm1,ym1,confi1 = smooth_lowess(x,y1, frac=0.15)
        dm2,ym2,confi2 = smooth_lowess(x,y2, frac=0.15)
        #dm,ym, confi = smooth_gam(x,y, 140, 20)

        ax = axes[3*i]
        ax.hist(dftw_filt[dftw_filt.strand_combination==sc]['center_diff'], np.arange(10, 150, 1));
        if j == 0:
            ax.set_ylabel(sc)

        # second plot
        ax = axes[3*i+1]
        ax.scatter(x,y1, alpha=0.05, s=8)
        if confi1 is not None:
            ax.fill_between(dm1, confi1[:,0], confi1[:,1], alpha=0.2)
        ax.plot(dm1, ym1, linewidth=2, alpha=0.8)
        ax.scatter(x,y2, alpha=0.05, s=8)
        if confi2 is not None:
            ax.fill_between(dm2, confi2[:,0], confi2[:,1], alpha=0.2)
        ax.plot(dm2, ym2, linewidth=2, alpha=0.8)
        if j == 0:
            ax.set_ylabel(sc)
        # third plot, simulated
        ax = axes[3*i+2]
        sim_x, sim_y1, sim_y2 = get_xy_sim(sim_df_d, motif_pair_c, 'profile/counts_frac', sc)
        ax.axhline(1, linestyle="--", color='grey', alpha=0.2)
        ax.plot(sim_x, sim_y1, linewidth=1, alpha=0.8)
        ax.plot(sim_x, sim_y2, linewidth=1, alpha=0.8)

        ax.xaxis.set_minor_locator(plt.MultipleLocator(10))
        if j == 0:
            ax.set_ylabel(sc)
    fig_profile.subplots_adjust(wspace=0, hspace=0)

Response=Importance

In [87]:
fig_imp, all_axes = plt.subplots(3*len(strand_combinations), len(d_dfi_filtered), figsize=(25,15), sharex=True, sharey='row')

for j, (motif_pair, dftw_filt) in enumerate(d_dfi_filtered):
    axes = all_axes[:,j]
    if motif_pair[0][:-1] == motif_pair[1][:-1]:
        axes[0].set_title("{f}<>{f}".format(f=motif_pair[0][:-1]), fontsize=7)
        motif_pair_c = [mp[:-1] for mp in motif_pair]
    else:
        motif_pair_c = motif_pair
        axes[0].set_title("<>".join(motif_pair), fontsize=7)
    dftw_filt = dftw_filt[(dftw_filt.center_diff < 150) & (dftw_filt.imp_weighted_p.max(1) > 0.3)]

    #ymax = max([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).max().max() for mp in motif_pair_c])
    #ymin = min([np.log10(1+dftw_filt[profile_mapping[mp] + "/profile_counts"]).min().min() for mp in motif_pair_c])
    ymax = dftw_filt.imp_weighted.max().max()
    ymin = dftw_filt.imp_weighted.min().min()
    for i, sc in enumerate(strand_combinations):
        #y1 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[0]] + "/profile_counts"][motif_pair[0]])
        #y2 = np.log10(1+ dftw_filt[dftw_filt.strand_combination==sc][profile_mapping[motif_pair_c[1]] + "/profile_counts"][motif_pair[1]])
        y1 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[0]]
        y2 = dftw_filt[dftw_filt.strand_combination==sc]['imp_weighted'][motif_pair[1]]
        x = dftw_filt[dftw_filt.strand_combination==sc]['center_diff']

        #dm,ym,confi = average_distance(x,y, window=5)
        dm1,ym1,confi1 = smooth_lowess(x,y1, frac=0.15)
        dm2,ym2,confi2 = smooth_lowess(x,y2, frac=0.15)
        #dm,ym, confi = smooth_gam(x,y, 140, 20)

        ax = axes[3*i]
        ax.hist(dftw_filt[dftw_filt.strand_combination==sc]['center_diff'], np.arange(10, 150, 1));
        if j == 0:
            ax.set_ylabel(sc)

        # second plot
        ax = axes[3*i+1]
        ax.scatter(x,y1, alpha=0.05, s=8)
        if confi1 is not None:
            ax.fill_between(dm1, confi1[:,0], confi1[:,1], alpha=0.2)
        ax.plot(dm1, ym1, linewidth=2, alpha=0.8)
        ax.scatter(x,y2, alpha=0.05, s=8)
        if confi2 is not None:
            ax.fill_between(dm2, confi2[:,0], confi2[:,1], alpha=0.2)
        ax.plot(dm2, ym2, linewidth=2, alpha=0.8)
        if j == 0:
            ax.set_ylabel(sc)
        # third plot, simulated
        ax = axes[3*i+2]
        sim_x, sim_y1, sim_y2 = get_xy_sim(sim_df_d, motif_pair_c, 'imp/weighted_frac', sc)
        ax.axhline(1, linestyle="--", color='grey', alpha=0.2)
        ax.plot(sim_x, sim_y1, linewidth=1, alpha=0.8)
        ax.plot(sim_x, sim_y2, linewidth=1, alpha=0.8)

        ax.xaxis.set_minor_locator(plt.MultipleLocator(10))
        if j == 0:
            ax.set_ylabel(sc)
fig_imp.subplots_adjust(wspace=0, hspace=0)
In [88]:
fig_profile.savefig(figures / "non-TE.profile_counts.pdf")
fig_imp.savefig(figures / "non-TE.importance_weighted.pdf")