In [1]:
from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    ("Oct4", 'Oct4/m0_p1'),
    # ("Strange-sym-motif", 'Oct4/m0_p5'),
    ("Sox2", 'Sox2/m0_p1'),
    ("Nanog", 'Nanog/m0_p1'),
    ("Zic3", 'Nanog/m0_p2'),
    ("Nanog-partner", 'Nanog/m0_p4'),
    ("Klf4", 'Klf4/m0_p0'),
])

Goal

  • inspect motif overlap and thereby test align_instance_center
In [2]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
Using TensorFlow backend.
In [5]:
from basepair.exp.paper.config import *
In [3]:
paper_config()

%matplotlib inline
In [15]:
model_dir = models_dir / exp
modisco_dir = model_dir / f'deeplift/Nanog/out/{imp_score}'
In [16]:
from basepair.modisco.pattern_instances import load_instances, align_instance_center, filter_nonoverlapping_intervals
In [ ]:
aligned_patterns = []
all_patterns = []
for task in tasks:
    patterns = read_pkl(model_dir / f'deeplift/{task}/out/{imp_score}/patterns.pkl')
    patterns = [p.rename(task + "/" + shorten_patten(p.name)) for p in patterns]
    all_patterns += patterns
In [17]:
aligned_patterns = read_pkl(modisco_dir / 'patterns.pkl')
In [21]:
aligned_patterns[3].plot('seq_ic');
In [27]:
mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()
In [28]:
from basepair.cli.imp_score import ImpScoreFile
from kipoi.data_utils import get_dataset_item
In [29]:
# load the contrib scores
imp = ImpScoreFile.from_modisco_dir(modisco_dir)

contrib_scores = imp.get_contrib()
In [30]:
orig_patterns = [mr.get_pattern(pname) for pname in mr.patterns()]
In [31]:
orig_patterns_d = {p.name: p for p in orig_patterns}
In [32]:
dfi = load_instances(modisco_dir / 'instances.parq', motifs=None, dedup=False)
In [33]:
dfi.head()
Out[33]:
pattern example_idx pattern_start pattern_end strand pattern_len pattern_center match_weighted match_weighted_p match_weighted_cat match_max match_max_task imp_weighted imp_weighted_p imp_weighted_cat imp_max imp_max_task seq_match seq_match_p seq_match_cat match/Klf4 match/Nanog match/Oct4 match/Sox2 imp/Klf4 imp/Nanog imp/Oct4 imp/Sox2 example_chrom example_start example_end example_strand example_interval_from_task pattern_short pattern_start_abs pattern_end_abs
0 metacluster_0/pattern_0 0 104 119 + 15 111 0.2469 0.0010 low 0.4484 Oct4 0.0365 NaN NaN 0.0524 Oct4 3.9825 0.0085 low 0.2654 -0.3593 0.4484 0.3725 0.0235 0.0246 0.0524 0.0313 chrX 143482544 143483544 * Oct4 m0_p0 143482648 143482663
1 metacluster_0/pattern_0 0 263 278 - 15 270 0.2518 0.0011 low 0.2893 Oct4 0.0217 NaN NaN 0.0342 Nanog 1.1066 0.0007 low 0.1618 0.2170 0.2893 0.2803 0.0143 0.0342 0.0200 0.0203 chrX 143482544 143483544 * Oct4 m0_p0 143482807 143482822
2 metacluster_0/pattern_0 0 282 297 + 15 289 0.3466 0.0069 low 0.3870 Sox2 0.0272 NaN NaN 0.0296 Sox2 5.0100 0.0188 low 0.2257 0.3454 0.3723 0.3870 0.0233 0.0258 0.0279 0.0296 chrX 143482544 143483544 * Oct4 m0_p0 143482826 143482841
3 metacluster_0/pattern_0 0 445 460 + 15 452 0.2399 0.0008 low 0.2469 Sox2 0.1852 0.0008 high 0.4145 Nanog 1.7775 0.0012 low 0.2458 0.2408 0.2314 0.2469 0.1234 0.4145 0.0949 0.1896 chrX 143482544 143483544 * Oct4 m0_p0 143482989 143483004
4 metacluster_0/pattern_0 0 475 490 - 15 482 0.2903 0.0025 low 0.3322 Oct4 0.2889 0.0179 high 0.5196 Nanog 3.3390 0.0057 low 0.2092 0.2681 0.3322 0.2991 0.1613 0.5196 0.2049 0.3241 chrX 143482544 143483544 * Oct4 m0_p0 143483019 143483034
In [34]:
dfi = align_instance_center(dfi, orig_patterns, aligned_patterns, trim_frac=0.08)
In [35]:
# remove all the overlapping intervals
dfi = filter_nonoverlapping_intervals(dfi)
In [36]:
dfi.eval("strand == pattern_strand_aln").mean()
Out[36]:
0.6135687009197928

compute the exact overlap between motifs (few heatmaps)

  • in each example, compute all the matches that have the same pattern_strand_aln and the same pattern_center_aln
  • for each pair, plot the number / fraction of such cases as a heatmap
In [37]:
sizes = dfi.groupby('example_idx').size()
In [38]:
sizes.plot.hist(30)
plt.xlabel("Number of instances per example")
Out[38]:
Text(0.5, 0, 'Number of instances per example')
In [39]:
dfi_filt = dfi[(dfi.imp_weighted_cat == 'high') & (dfi.match_weighted_cat != 'low')].set_index('example_idx')
In [40]:
dfi_filt.groupby('example_idx').size().plot.hist(30)
plt.xlabel("Number of instances per example");
In [55]:
def norm_matrix(s):
    """Create the normalization matrix
    
    Example:
    print(norm_matrix(pd.Series([1,3,5])).to_string())
       0  1  2
    0  1  1  1
    1  3  3  3
    2  5  5  5
    
    Args:
      s: pandas series
    """
    tnc = s.values[:, np.newaxis]
    vals_by_row = tnc * np.ones_like(tnc).T
    # np.fill_diagonal(vals_by_row,  1)
    return pd.DataFrame(vals_by_row, index=s.index, columns=s.index)    
In [44]:
# exclude noisy motifs
exclude = ['metacluster_2/pattern_22', 'metacluster_1/pattern_16', 'metacluster_6/pattern_9', 'metacluster_6/pattern_12']
dfi_filt = dfi_filt[~dfi_filt.pattern.isin(exclude)]

# normalization: minimum number of counts
total_number = dfi_filt.groupby(['pattern']).size()
norm_counts = norm_matrix(total_number)

# cross-product
dfi_filt_crossp = pd.merge(dfi_filt[['pattern', 'pattern_center_aln', 'pattern_strand_aln', 'pattern_center']], 
                           dfi_filt[['pattern', 'pattern_center_aln', 'pattern_strand_aln', 'pattern_center']], 
                           how='outer', left_index=True, right_index=True).reset_index()
# remove self-matches
dfi_filt_crossp = dfi_filt_crossp.query('~((pattern_x == pattern_y) & (pattern_center_aln_x == pattern_center_aln_y) & (pattern_strand_aln_x == pattern_strand_aln_x))')

# order the matrix by names
idx_order = [p.name for p in aligned_patterns if p.name in dfi_filt.pattern.unique()]
In [45]:
norm_counts.head()
Out[45]:
pattern metacluster_0/pattern_0 metacluster_0/pattern_1 metacluster_0/pattern_10 metacluster_0/pattern_11 metacluster_0/pattern_12 metacluster_0/pattern_13 metacluster_0/pattern_14 metacluster_0/pattern_15 metacluster_0/pattern_16 metacluster_0/pattern_17 metacluster_0/pattern_18 metacluster_0/pattern_19 metacluster_0/pattern_2 metacluster_0/pattern_20 metacluster_0/pattern_21 metacluster_0/pattern_22 metacluster_0/pattern_23 metacluster_0/pattern_24 metacluster_0/pattern_25 metacluster_0/pattern_26 metacluster_0/pattern_27 metacluster_0/pattern_28 metacluster_0/pattern_29 metacluster_0/pattern_3 metacluster_0/pattern_4 metacluster_0/pattern_5 metacluster_0/pattern_6 metacluster_0/pattern_7 metacluster_0/pattern_8 metacluster_0/pattern_9 metacluster_1/pattern_0 metacluster_1/pattern_1 metacluster_1/pattern_10 metacluster_1/pattern_11 metacluster_1/pattern_12 metacluster_1/pattern_13 metacluster_1/pattern_14 metacluster_1/pattern_15 metacluster_1/pattern_17 metacluster_1/pattern_2 metacluster_1/pattern_3 metacluster_1/pattern_4 metacluster_1/pattern_5 metacluster_1/pattern_6 metacluster_1/pattern_7 metacluster_1/pattern_8 metacluster_1/pattern_9 metacluster_2/pattern_0 metacluster_2/pattern_1 metacluster_2/pattern_10 metacluster_2/pattern_11 metacluster_2/pattern_12 metacluster_2/pattern_13 metacluster_2/pattern_14 metacluster_2/pattern_15 metacluster_2/pattern_16 metacluster_2/pattern_17 metacluster_2/pattern_18 metacluster_2/pattern_19 metacluster_2/pattern_2 metacluster_2/pattern_20 metacluster_2/pattern_21 metacluster_2/pattern_3 metacluster_2/pattern_4 metacluster_2/pattern_5 metacluster_2/pattern_6 metacluster_2/pattern_7 metacluster_2/pattern_8 metacluster_2/pattern_9 metacluster_3/pattern_0 metacluster_3/pattern_1 metacluster_3/pattern_10 metacluster_3/pattern_2 metacluster_3/pattern_3 metacluster_3/pattern_4 metacluster_3/pattern_5 metacluster_3/pattern_6 metacluster_3/pattern_7 metacluster_3/pattern_8 metacluster_3/pattern_9 metacluster_4/pattern_0 metacluster_4/pattern_1 metacluster_4/pattern_10 metacluster_4/pattern_11 metacluster_4/pattern_12 metacluster_4/pattern_2 metacluster_4/pattern_3 metacluster_4/pattern_4 metacluster_4/pattern_5 metacluster_4/pattern_6 metacluster_4/pattern_7 metacluster_4/pattern_8 metacluster_4/pattern_9 metacluster_5/pattern_0 metacluster_5/pattern_1 metacluster_5/pattern_10 metacluster_5/pattern_11 metacluster_5/pattern_2 metacluster_5/pattern_3 metacluster_5/pattern_4 metacluster_5/pattern_5 metacluster_5/pattern_6 metacluster_5/pattern_7 metacluster_5/pattern_8 metacluster_5/pattern_9 metacluster_6/pattern_0 metacluster_6/pattern_1 metacluster_6/pattern_10 metacluster_6/pattern_11 metacluster_6/pattern_2 metacluster_6/pattern_3 metacluster_6/pattern_4 metacluster_6/pattern_5 metacluster_6/pattern_6 metacluster_6/pattern_7 metacluster_6/pattern_8 metacluster_7/pattern_0 metacluster_7/pattern_1
pattern
metacluster_0/pattern_0 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250 4250
metacluster_0/pattern_1 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696 5696
metacluster_0/pattern_10 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64
metacluster_0/pattern_11 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189 189
metacluster_0/pattern_12 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275 275

Exact match

In [46]:
dfi_sp = dfi_filt_crossp.query('(abs(pattern_center_aln_x- pattern_center_aln_y) == 0) & (pattern_strand_aln_x == pattern_strand_aln_x)')
match_sizes = dfi_sp.groupby(['pattern_x', 'pattern_y']).size()
count_matrix = match_sizes.unstack(fill_value=0)

norm_count_matrix = count_matrix / norm_counts# .truediv(min_counts, axis='columns').truediv(total_number, axis='index')
norm_count_matrix = norm_count_matrix.fillna(0)  # these examples didn't have any paired pattern
In [60]:
%opts HeatMap [xrotation=90] (cmap='Blues')
norm_count_matrix[idx_order].loc[idx_order].stack().reset_index().hvplot.heatmap(x='level_1', y='level_0', C="0", width=1000, height=1000, colorbar=True)  # TODO add vmax
Out[60]:
In [523]:
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(norm_count_matrix[idx_order].loc[idx_order], ax=ax, cmap='Blues', cbar_kws={'label': 'Overlap fraction: #A==B / min(#A, #B)'}, vmax=1)
Out[523]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f82c8178550>

Match within 5bp

In [524]:
dfi_sp = dfi_filt_crossp.query('(abs(pattern_center_aln_x- pattern_center_aln_y) <= 5) & (abs(pattern_center_aln_x- pattern_center_aln_y) != 0) &(pattern_strand_aln_x == pattern_strand_aln_x)')
match_sizes = dfi_sp.groupby(['pattern_x', 'pattern_y']).size()
count_matrix = match_sizes.unstack(fill_value=0)

norm_count_matrix = count_matrix / norm_counts# .truediv(min_counts, axis='columns').truediv(total_number, axis='index')
norm_count_matrix = norm_count_matrix.fillna(0)  # these examples didn't have any paired pattern
In [525]:
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(norm_count_matrix[idx_order].loc[idx_order], ax=ax, cmap='Blues', cbar_kws={'label': 'Overlap fraction: #A==B / min(#A, #B)'}, vmax=1)
Out[525]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f82c40f3ba8>

Match [5-15]

In [72]:
dfi_sp = dfi_filt_crossp.query('(abs(pattern_center_aln_x- pattern_center_aln_y) < 15) & (abs(pattern_center_aln_x- pattern_center_aln_y) > 5) &(pattern_strand_aln_x == pattern_strand_aln_x)')
match_sizes = dfi_sp.groupby(['pattern_x', 'pattern_y']).size()
count_matrix = match_sizes.unstack(fill_value=0)

norm_count_matrix = count_matrix / norm_counts# .truediv(min_counts, axis='columns').truediv(total_number, axis='index')
norm_count_matrix = norm_count_matrix.fillna(0)  # these examples didn't have any paired pattern
In [73]:
%opts HeatMap [xrotation=90] (cmap='Blues')
norm_count_matrix[idx_order].loc[idx_order].stack().reset_index().hvplot.heatmap(x='level_1', y='level_0', C="0", width=1000, height=1000, colorbar=True)  # TODO add vmax
Out[73]:

Match [15-50]

In [80]:
dfi_sp = dfi_filt_crossp.query('(abs(pattern_center_aln_x- pattern_center_aln_y) < 50) & (abs(pattern_center_aln_x- pattern_center_aln_y) > 16) &(pattern_strand_aln_x == pattern_strand_aln_x)')
match_sizes = dfi_sp.groupby(['pattern_x', 'pattern_y']).size()
count_matrix = match_sizes.unstack(fill_value=0)

norm_count_matrix = count_matrix / norm_counts# .truediv(min_counts, axis='columns').truediv(total_number, axis='index')
norm_count_matrix = norm_count_matrix.fillna(0)  # these examples didn't have any paired pattern
In [84]:
%opts HeatMap [xrotation=90] (cmap='Blues')
norm_count_matrix[idx_order].loc[idx_order].stack().reset_index().hvplot.heatmap(x='pattern_y', y='pattern_x', C="0", width=1000, height=1000, colorbar=True)  # TODO add vmax
Out[84]:

Match within [50-100]

In [85]:
dfi_sp = dfi_filt_crossp.query('(abs(pattern_center_aln_x- pattern_center_aln_y) < 100) & (abs(pattern_center_aln_x- pattern_center_aln_y) > 50) &(pattern_strand_aln_x == pattern_strand_aln_x)')
match_sizes = dfi_sp.groupby(['pattern_x', 'pattern_y']).size()
count_matrix = match_sizes.unstack(fill_value=0)

norm_count_matrix = count_matrix / norm_counts# .truediv(min_counts, axis='columns').truediv(total_number, axis='index')
norm_count_matrix = norm_count_matrix.fillna(0)  # these examples didn't have any paired pattern
In [86]:
%opts HeatMap [xrotation=90] (cmap='Blues')
norm_count_matrix[idx_order].loc[idx_order].stack().reset_index().hvplot.heatmap(x='pattern_y', y='pattern_x', C="0", width=1000, height=1000, colorbar=True)  # TODO add vmax
Out[86]:
In [527]:
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(norm_count_matrix[idx_order].loc[idx_order], ax=ax, cmap='Blues', cbar_kws={'label': 'Overlap fraction: #A==B / min(#A, #B)'}, vmax=1)
Out[527]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f82c4110a90>

TODO

  • can we draw cluster-boundaries and label the clusters?
  • can we have some statistical tests for the co-occurence -> higlight only the significant ones

Supplementary

  • if we do abs_distance < 5 or abs_distance != 5 we get almost the same contraints
In [528]:
dfi_sp = dfi_filt_crossp.query('(abs(pattern_center_aln_x- pattern_center_aln_y) < 50) & (abs(pattern_center_aln_x- pattern_center_aln_y) != 0) &(pattern_strand_aln_x == pattern_strand_aln_x)')
match_sizes = dfi_sp.groupby(['pattern_x', 'pattern_y']).size()
count_matrix = match_sizes.unstack(fill_value=0)

norm_count_matrix = count_matrix / norm_counts# .truediv(min_counts, axis='columns').truediv(total_number, axis='index')
norm_count_matrix = norm_count_matrix.fillna(0)  # these examples didn't have any paired pattern
In [529]:
# Excluding exact matches
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(norm_count_matrix[idx_order].loc[idx_order], ax=ax, cmap='Blues', cbar_kws={'label': 'Overlap fraction: #A==B / min(#A, #B)'}, vmax=1)
Out[529]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f801eb2b6d8>

Variying the distance

In [530]:
out = []
for i in range(35):
    dfi_sp = dfi_filt_crossp.query(f'(abs(pattern_center_aln_x- pattern_center_aln_y) == {i})')
    match_sizes = dfi_sp.groupby(['pattern_x', 'pattern_y']).size()
    count_matrix = match_sizes.unstack(fill_value=0)

    norm_count_matrix = count_matrix / norm_counts# .truediv(min_counts, axis='columns').truediv(total_number, axis='index')
    norm_count_matrix = norm_count_matrix.fillna(0)  # these examples didn't have any paired pattern
    out.append(norm_count_matrix)
In [ ]:
# Make a gif
vmax = 0.1
for i in range(len(count_matrices)):
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(count_matrices[i][idx_order].loc[idx_order], ax=ax, cmap='Blues', vmax=1)
    ax.set_title(i)
    fig.savefig(f"{modisco_dir}/spacing/heatmap_gif/figures/{i:02d}.png", transparent=False)
    plt.close()

# make a gif
!convert -delay 100 -loop 0 {modisco_dir}/spacing/heatmap_gif/figures/*.png {modisco_dir}/spacing/heatmap_gif/heatmap.gif
In [386]:
def interactive_heatmap(count_matrices, idx_order, vmax=.5):
    def build_fn(count_matrices, idx_order, vmax):
        def fn(i):
            fig, ax = plt.subplots(figsize=(10, 10))
            sns.heatmap(count_matrices[i][idx_order].loc[idx_order]*1000, ax=ax, cmap='Blues', vmax=vmax)
            
        return fn
    from ipywidgets import interact, interactive, fixed, interact_manual
    import ipywidgets as widgets
    return interactive(build_fn(count_matrices, idx_order, vmax),
                       i=widgets.IntSlider(min=0,
                                           max=len(count_matrices) - 1,
                                           step=1,
                                           value=0))
In [390]:
mkdir -p {modisco_dir}/spacing/heatmap_gif/figures
In [405]:
from IPython.display import display
from vdom.helpers import img
In [ ]:
display)
In [388]:
interactive_heatmap(out, idx_order, vmax=.1)

Other

In [141]:
patterns_d = {p.name: p for p in aligned_patterns}

v = (norm_count_matrix*1000 > .1).stack()

for a,b in v[v].index:
    patterns_d[a].plot('seq_ic', letter_width=.1, height=.5)
    patterns_d[b].plot('seq_ic', letter_width=.1, height=.5)
    plt.xlabel("Position")
In [ ]:
## 
In [316]:
dfi_sp = dfi_filt_crossp.query('(abs(pattern_center_aln_x- pattern_center_aln_y) <= 2) & (pattern_strand_aln_x == pattern_strand_aln_x)')
match_sizes = dfi_sp.groupby(['pattern_x', 'pattern_y']).size()
count_matrix_2 = match_sizes.unstack(fill_value=0)
dfi_sp = dfi_filt_crossp.query('(abs(pattern_center_aln_x- pattern_center_aln_y) <= 0) & (pattern_strand_aln_x == pattern_strand_aln_x)')
match_sizes = dfi_sp.groupby(['pattern_x', 'pattern_y']).size()
count_matrix_0 = match_sizes.unstack(fill_value=0)
In [320]:
v = (count_matrix_2 - count_matrix_0).stack().sort_values().tail(10)
In [325]:
print(v.to_string())
pattern_x                 pattern_y               
metacluster_3/pattern_4   metacluster_3/pattern_3      220.0
metacluster_3/pattern_3   metacluster_3/pattern_4      220.0
metacluster_2/pattern_21  metacluster_3/pattern_3      239.0
metacluster_3/pattern_3   metacluster_2/pattern_21     239.0
metacluster_2/pattern_2   metacluster_2/pattern_2      338.0
metacluster_6/pattern_0   metacluster_6/pattern_0      372.0
metacluster_2/pattern_2   metacluster_2/pattern_1     1114.0
metacluster_2/pattern_1   metacluster_2/pattern_2     1114.0
metacluster_3/pattern_3   metacluster_2/pattern_2     1903.0
metacluster_2/pattern_2   metacluster_3/pattern_3     1903.0
In [323]:
for a,b in v.index:
    patterns_d[a].plot('seq_ic', letter_width=.1, height=.5)
    patterns_d[b].plot('seq_ic', letter_width=.1, height=.5)
    plt.xlabel("Position")
In [319]:
patterns_d['metacluster_2/pattern_2'].plot("seq_ic");
patterns_d['metacluster_3/pattern_3'].plot("seq_ic");
In [ ]:
metacluster_2/pattern_22
In [314]:
patterns_d['metacluster_2/pattern_22'].plot("seq_ic");
In [ ]:
exclude
In [243]:
dfi_sp = dfi_filt_crossp.query('(abs(pattern_center_aln_x- pattern_center_aln_y) <= 20) & (pattern_strand_aln_x == pattern_strand_aln_x)')
match_sizes = dfi_sp.groupby(['pattern_x', 'pattern_y']).size()
count_matrix = match_sizes.unstack(fill_value=0)

norm_count_matrix = count_matrix.truediv(total_number, axis='columns').truediv(total_number, axis='index')
norm_count_matrix = norm_count_matrix.fillna(0)  # these examples didn't have any paired pattern
In [244]:
# order the matrix by names
idx_order = [p.name for p in aligned_patterns]
In [245]:
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(norm_count_matrix[idx_order].loc[idx_order]*1000, ax=ax, cmap='Blues', vmax=.5)
Out[245]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7cd0f9b4a8>
In [ ]:
# TODO - remove the 

question -> why is there no Oct4-Sox2 and Sox2 match?

  • remove TE's from the analysis to not clutter it
  • order the matrix by pattern order
In [ ]:
# m0_p0
# m0_p1
In [188]:
dfi_filt_crossp.head()
Out[188]:
example_idx pattern_x pattern_center_aln_x pattern_strand_aln_x pattern_center_x pattern_y pattern_center_aln_y pattern_strand_aln_y pattern_center_y
2 1 metacluster_3/pattern_3 574 - 595 metacluster_4/pattern_2 493 + 468
3 1 metacluster_3/pattern_3 574 - 595 metacluster_4/pattern_3 446 + 443
4 1 metacluster_3/pattern_3 574 - 595 metacluster_7/pattern_0 440 + 444
5 1 metacluster_3/pattern_3 574 - 595 metacluster_7/pattern_0 461 + 465
6 1 metacluster_3/pattern_3 574 - 595 metacluster_7/pattern_0 502 + 506
In [339]:
dfi_filt_crossp.query('(pattern_x == "metacluster_0/pattern_0") & (pattern_y == "metacluster_0/pattern_1")')
Out[339]:
example_idx pattern_x pattern_center_aln_x pattern_strand_aln_x pattern_center_x pattern_y pattern_center_aln_y pattern_strand_aln_y pattern_center_y
28519 291 metacluster_0/pattern_0 608 + 604 metacluster_0/pattern_1 431 - 430
64051 704 metacluster_0/pattern_0 248 - 251 metacluster_0/pattern_1 461 - 460
65237 713 metacluster_0/pattern_0 400 - 403 metacluster_0/pattern_1 473 - 472
... ... ... ... ... ... ... ... ... ...
4655831 97276 metacluster_0/pattern_0 355 + 351 metacluster_0/pattern_1 286 - 285
4672817 97584 metacluster_0/pattern_0 141 + 137 metacluster_0/pattern_1 481 + 482
4672987 97586 metacluster_0/pattern_0 571 - 574 metacluster_0/pattern_1 519 - 518

445 rows × 9 columns

In [111]:
dfos = dfi_filt_crossp.query('(pattern_x == "metacluster_2/pattern_0") & (pattern_y == "metacluster_2/pattern_0")')
dfos['distance'] = dfos.pattern_center_aln_x - dfos.pattern_center_aln_y
dfos['old_distance'] = dfos.pattern_center_x - dfos.pattern_center_y
dfos['abs_distance'] = np.abs(dfos.distance)
In [112]:
dfos[dfos.abs_distance < 50]['distance'].plot.hist(100)
Out[112]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4cb2635b00>
In [113]:
dfos[dfos.abs_distance < 50]['old_distance'].plot.hist(100)
Out[113]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4cb22809b0>
In [346]:
dfos[dfos.abs_distance < 20].distance.value_counts()
Out[346]:
-14    2
 17    2
 0     1
 16    1
Name: distance, dtype: int64
In [333]:
dfos[dfos.distance==3]
Out[333]:
example_idx pattern_x pattern_center_aln_x pattern_strand_aln_x pattern_center_x pattern_y pattern_center_aln_y pattern_strand_aln_y pattern_center_y distance old_distance abs_distance
9979 103 metacluster_2/pattern_2 459 + 445 metacluster_2/pattern_2 456 + 442 3 3 3
119805 1269 metacluster_2/pattern_2 359 + 345 metacluster_2/pattern_2 356 + 342 3 3 3
135068 1431 metacluster_2/pattern_2 459 + 445 metacluster_2/pattern_2 456 + 442 3 3 3
... ... ... ... ... ... ... ... ... ... ... ... ...
4550609 95309 metacluster_2/pattern_2 523 + 509 metacluster_2/pattern_2 520 + 506 3 3 3
4606098 96330 metacluster_2/pattern_2 204 + 190 metacluster_2/pattern_2 201 + 187 3 3 3
4690215 97948 metacluster_2/pattern_2 455 + 441 metacluster_2/pattern_2 452 + 438 3 3 3

182 rows × 12 columns

In [334]:
dfos[dfos.distance==-3]
Out[334]:
example_idx pattern_x pattern_center_aln_x pattern_strand_aln_x pattern_center_x pattern_y pattern_center_aln_y pattern_strand_aln_y pattern_center_y distance old_distance abs_distance
9965 103 metacluster_2/pattern_2 456 + 442 metacluster_2/pattern_2 459 + 445 -3 -3 3
119797 1269 metacluster_2/pattern_2 356 + 342 metacluster_2/pattern_2 359 + 345 -3 -3 3
135064 1431 metacluster_2/pattern_2 456 + 442 metacluster_2/pattern_2 459 + 445 -3 -3 3
... ... ... ... ... ... ... ... ... ... ... ... ...
4550603 95309 metacluster_2/pattern_2 520 + 506 metacluster_2/pattern_2 523 + 509 -3 -3 3
4606097 96330 metacluster_2/pattern_2 201 + 187 metacluster_2/pattern_2 204 + 190 -3 -3 3
4690210 97948 metacluster_2/pattern_2 452 + 438 metacluster_2/pattern_2 455 + 441 -3 -3 3

182 rows × 12 columns

In [338]:
# The central shift is due to the repetative elements
In [335]:
plot_tracks(filter_tracks(get_dataset_item(contrib_scores, 103), [400, 600]));
In [336]:
patterns_d['metacluster_0/pattern_0'].plot("seq_ic");
In [337]:
patterns_d['metacluster_0/pattern_1'].plot("seq_ic");
In [171]:
orig_patterns_d['metacluster_1/pattern_0'].plot("seq_ic");
In [172]:
orig_patterns_d['metacluster_1/pattern_1'].plot("seq_ic");
In [170]:
orig_patterns_d['metacluster_1/pattern_0'].trim_seq_ic(trim_frac=0.08).plot("seq_ic");
In [173]:
orig_patterns_d['metacluster_1/pattern_1'].trim_seq_ic(trim_frac=0.08).plot("seq_ic");
In [200]:
len(orig_patterns_d['metacluster_1/pattern_1'].trim_seq_ic(trim_frac=0.08))
Out[200]:
11
In [201]:
orig_patterns_d['metacluster_1/pattern_1']._trim_seq_ic_ij(trim_frac=0.08)
Out[201]:
(28, 39)
In [228]:
orig_patterns_d['metacluster_1/pattern_1']._trim_center_shift(trim_frac=0.08)
Out[228]:
2
In [204]:
70//2
Out[204]:
35
In [202]:
(39 - 28) // 2
Out[202]:
5
In [168]:
patterns_d['metacluster_1/pattern_0'].attrs['align']
Out[168]:
{'use_rc': True, 'offset': -6}
In [174]:
patterns_d['metacluster_1/pattern_1'].attrs['align']
Out[174]:
{'use_rc': False, 'offset': 0}
In [74]:
fig ,ax = plt.subplots(figsize=(15,2))
dfos.distance.plot.hist(1000, ax=ax);
In [58]:
dfi_sp = dfi_filt_crossp.query('(abs(pattern_center_aln_x- pattern_center_aln_y) <= 20) & (pattern_strand_aln_x == pattern_strand_aln_x)')
match_sizes = dfi_sp.groupby(['pattern_x', 'pattern_y']).size()
count_matrix = match_sizes.unstack(fill_value=0)

norm_count_matrix = count_matrix.truediv(total_number, axis='columns').truediv(total_number, axis='index')
norm_count_matrix = norm_count_matrix.fillna(0)  # these examples didn't have any paired pattern

TODO

  • [x] filter the matrix by example_idx where oct4 and sox2 are in the vicinity
  • [x] checkout the example where two different TE's don't exactly overlap but match when they are shifted by 5
In [ ]:
v = (norm_count_matrix*1000 > .2).stack()

for a,b in v[v].index:
    patterns_d[a].plot('seq_ic', letter_width=.1, height=.5)
    patterns_d[b].plot('seq_ic', letter_width=.1, height=.5)
    plt.xlabel("Position")

add a function unique_instances

  • think about how to disambiguate these instances using metaclusters
  • unique_instances should allow only a single position to be occupied by motifs from the same model group (?)
  • check how to handle the cases with Oct4-Sox2 <-> Sox2 examples