from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

# motifs = OrderedDict([
#     ("Oct4-Sox2", 'Oct4/m0_p0'),
#     ("Oct4", 'Oct4/m0_p1'),
#     # ("Strange-sym-motif", 'Oct4/m0_p5'),
#     ("Sox2", 'Sox2/m0_p1'),
#     ("Nanog", 'Nanog/m0_p1'),
#     ("Zic3", 'Nanog/m0_p2'),
#     ("Nanog-partner", 'Nanog/m0_p4'),
#     ("Klf4", 'Klf4/m0_p0'),
# ])

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    ("Oct4", "Oct4/m0_p1"),
    ("Oct4-Oct4", "Oct4/m0_p6"),
    ("Sox2", "Sox2/m0_p1"),
    ("Nanog", "Nanog/m0_p1"),
    ("Nanog-partner", "Nanog/m0_p4"),
    ("Klf4", "Klf4/m0_p0"),
    ("Klf4-Klf4", "Klf4/m0_p5"),
    ("B-Box", "Oct4/m0_p5"),
    ("Zic3", "Nanog/m0_p2"),
    ("Essrb", "Oct4/m0_p16"),
])

# dfi_subset.pattern_name.unique()

Goal¶

make sub-plots for Figure 5

Open questions¶

should we make a larger co-occurence plot?
- plot more motifs (one from each cluster at least)

# Imports
from basepair.imports import *
from basepair.math import softmax
import pyranges as pr
from basepair.plot.heatmaps import heatmap_stranded_profile, multiple_heatmap_stranded_profile
from basepair.plot.profiles import plot_stranded_profile, multiple_plot_stranded_profile, extract_signal
from basepair.plot.tracks import plot_tracks, filter_tracks
from basepair.modisco.results import MultipleModiscoResult, Seqlet, resize_seqlets
from basepair.modisco.pattern_instances import (multiple_load_instances, load_instances, filter_nonoverlapping_intervals, 
                                                plot_coocurence_matrix, align_instance_center, dfi2seqlets, annotate_profile)
from basepair.exp.chipnexus.perturb.vdom import vdom_motif_pair, plot_spacing_hist
from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs, motif_pair_dfi
from basepair.exp.chipnexus.simulate import (insert_motif, generate_sim, plot_sim, generate_seq, 
                                             model2tasks, motif_coords, interactive_tracks, plot_motif_table,
                                             plot_sim_motif_col)
from basepair.exp.paper.config import *
from basepair.cli.modisco import load_profiles
from basepair.cli.imp_score import ImpScoreFile
from basepair.preproc import rc_seq, dfint_no_intersection
from copy import deepcopy
from scipy.fftpack import fft, ifft
from plotnine import *
import plotnine
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()

# interval columns in dfi
interval_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']

Using TensorFlow backend.

# figures dir
model_dir = models_dir / exp
fdir = Path(f'{ddir}/figures/modisco/{exp}/spacing/')
fdir_individual = fdir / 'individual'
fdir_individual_sim = fdir / 'individual-simulation'

from basepair.cli.imp_score import ImpScoreFile

isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5')

peaks_per_task = pd.value_counts(isf.data["/metadata/interval_from_task"][:])

!mkdir -p {fdir_individual}
!mkdir -p {fdir_individual_sim}
!mkdir -p {fdir}

# Generate motif pairs
pairs = get_motif_pairs(motifs)

# ordered names
pair_names = ["<>".join(x) for x in pairs]

# define the global set of distances
dist_subsets = ['center_diff<=35',
               '(center_diff>35)&(center_diff<=70)', 
               '(center_diff>70)&(center_diff<=150)', 
               'center_diff>150']
dist_subset_labels = ['dist < 35',
                      '35 < dist <= 70',
                      '70 < dist <= 150',
                      '150 < dist',
                     ]

task_map = {"O": "Oct4", "S": "Sox2", "N": "Nanog", "K": "Klf4"}
# Figure out the tasks from the name
tasks = [task_map[s] for s in exp.split(",")[2]]

mr = MultipleModiscoResult({t: model_dir / f'deeplift/{t}/out/{imp_score}/modisco.h5'
                           for t in tasks})

main_motifs = [mr.get_pattern(pattern_name).rename(name)
               for name, pattern_name in motifs.items()]

len(main_motifs)

11

from basepair.exp.paper.fig4 import similarity_matrix, cluster_align_patterns

sim_seq = similarity_matrix(main_motifs, track='seq_ic')
np.fill_diagonal(sim_seq, 0)

100%|██████████| 11/11 [00:00<00:00, 21.12it/s]

motif_names = [m.name for m in main_motifs]
df_sim_seq = pd.DataFrame(sim_seq, index=motif_names, columns=motif_names)

main_motifs[4].align(main_motifs[5], 'contrib/Nanog').attrs

{'align': {'use_rc': False, 'offset': -1}}

main_motifs[4].align(main_motifs[5], 'contrib/Nanog').plot('contrib')
main_motifs[5].plot("contrib");

main_motifs[5].plot("seq_ic");

main_motifs[1].align(main_motifs[4], 'seq_ic').plot('seq_ic')
main_motifs[4].plot("seq_ic");

Exception ignored in: <bound method ImpScoreFile.__del__ of <basepair.cli.imp_score.ImpScoreFile object at 0x7f807e4fb390>>
Traceback (most recent call last):
  File "/users/avsec/workspace/basepair/basepair/cli/imp_score.py", line 360, in __del__
    self.close()
  File "/users/avsec/workspace/basepair/basepair/cli/imp_score.py", line 357, in close
    self.f.close()
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/kipoi/readers.py", line 115, in close
    self.__exit__()
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/kipoi/readers.py", line 105, in __exit__
    self.f.close()
AttributeError: 'NoneType' object has no attribute 'close'

main_motifs[-2].align(main_motifs[-4], 'seq_ic').plot('seq_ic')
main_motifs[-4].plot("seq_ic");

for p in main_motifs:
    p.trim_seq_ic(0.08).plot("seq_ic");

TF-MoDISco is using the TensorFlow backend.

dfrm.name.str.match("ERV").sum()

12770

dfrm.name.str.match("ERV").mean()

0.002411897283017542

dfi_subset = pd.read_parquet(f"{model_dir}/deeplift/dfi_subset.parq", engine='fastparquet')
dfi_subset['row_idx'] = np.arange(len(dfi_subset))

dfi_subset['Chromosome'] = dfi_subset.example_chrom
dfi_subset['Start'] = dfi_subset.pattern_start_abs
dfi_subset['End'] = dfi_subset.pattern_end_abs
dfi_subset_pr = pr.PyRanges(dfi_subset)

assert len(dfi_subset.pattern.unique()) == len(main_motifs)

len(dfi_subset)

397924

Add repeat masker annotation¶

# exclude erv's
import pyranges as pr

# load repeat masker file
from basepair.exp.paper.repeat_masker import download_repeat_masker, read_repeat_masker, intersect_repeat_masker
dfrm = read_repeat_masker(download_repeat_masker())
dfrm_erv = dfrm[dfrm.name.str.contains("ERV")]
dfrm_erv['Start'] = dfrm_erv['start']
dfrm_erv['End'] = dfrm_erv['end']
dfrm_erv['Chromosome'] = dfrm_erv['chrom']
dfrm_erv = pr.PyRanges(dfrm_erv)

# exclude erv's
exclude = dfi_subset_pr.overlap(dfrm_erv).df.row_idx.unique()
exclude_rows = dfi_subset.row_idx.isin(exclude)
print(f"Excluding: {len(exclude_rows)} / {len(dfi_subset)} rows")
dfi_subset['is_erv'] = exclude_rows

Using downloaded and verified file: /users/avsec/workspace/basepair/data/raw/annotation/mm10/RepeatMasker/mm10.fa.out.gz
Excluding: 397924 / 397924 rows

pd.crosstab(dfi_subset.is_erv, dfi_subset.is_te)

a = pr.PyRanges(chromosomes=['chr1'], starts=[10], ends=[20])
b = pr.PyRanges(chromosomes=['chr1'], starts=[19], ends=[22])

a.overlap(b)

+--------------+-----------+-----------+
| Chromosome   |     Start |       End |
| (category)   |   (int32) |   (int32) |
|--------------+-----------+-----------|
| chr1         |        10 |        20 |
+--------------+-----------+-----------+
Unstranded PyRanges object has 1 rows and 3 columns from 1 chromosomes.

len(dfi_subset)

397924

# many still remain
dfi_subset.is_te.sum()

21274

Exclude motifs overlapping the repeat elements¶

dfi_subset_pr = pr.PyRanges(dfi_subset)

# duplicated number of instances
len(dfi_subset_pr)

397924

# non-duplicated number of instances
len(dfi_subset_pr.drop_duplicate_positions())

241005

dfi_subset.head()

dict(dfi_subset.pattern_name.value_counts())

{'Klf4': 103044,
 'Nanog': 76337,
 'Oct4': 74594,
 'Oct4-Sox2': 52475,
 'Sox2': 40193,
 'Zic3': 14927,
 'Oct4-Oct4': 12511,
 'B-Box': 7516,
 'Nanog-partner': 7066,
 'Essrb': 5353,
 'Klf4-Klf4': 3908}

fig, ax = plt.subplots(figsize=get_figsize(0.23))
dfi_subset.pattern_name.value_counts().plot.bar(width=.9)
plt.ylabel("Motif instances")
plt.xlabel("Motif")
fig.savefig(fdir / '../n-instances-per-motif.pdf')

# Number of regions with a motif (total = 150k. Only 14k lack a motif)
len(dfi_subset.groupby('example_idx').size())

136302

dfi_subset.groupby('example_idx').size().value_counts()

2     35091
3     31293
1     27308
      ...  
17        1
25        1
30        1
Length: 23, dtype: int64

total_regions = len(ranges)

fig, ax = plt.subplots(figsize=get_figsize(0.4))
vec = dfi_subset.groupby('example_idx').size()
vec = pd.concat([vec, pd.Series(np.zeros(total_regions - len(vec)))])
vec.plot.hist(bins=np.arange(11), rwidth=0.9, align='left')
plt.xlabel("Number of motifs per region")
fig.savefig(fdir / '../n-motifs-per-region.pdf')

dfi_subset.head()

19.535 / (212.63-196.32) * 0.3

0.3593194359288779

3.505 / (212.63-196.32) * 0.3

0.06446965052115265

dfi_subset.groupby(['pattern_name', 'example_idx']).size().reset_index()

dfi_subset.head()

fig, axes = plt.subplots(len(tasks), 1, 
                         figsize=get_figsize(0.23*1.4, aspect=len(tasks)* 0.618/1.4),
                         sharey=True,
                         gridspec_kw=dict(hspace=0),
                         sharex=True)
for i, task in enumerate(tasks):
    ax = axes[i]
    vec = dfi_subset[dfi_subset.example_interval_from_task == task].groupby('example_idx').size()
    vec = pd.concat([vec, pd.Series(np.zeros(peaks_per_task[task] - vec.index.nunique()))])
    vec.plot.hist(bins=np.arange(10), rwidth=0.9, align='left', ax=ax)
    ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
ax.set_xlabel("Number of motifs per region")
fig.savefig(fdir / '../n-motifs-per-region.per-task.pdf')

fig, axes = plt.subplots(len(tasks), 1, 
                         figsize=get_figsize(0.23, aspect=len(tasks)* 0.618),
                         sharey=True,
                         gridspec_kw=dict(hspace=0),
                         sharex=True)
pattern_names = dfi_subset.pattern_name.unique()
for i, task in enumerate(tasks):
    ax = axes[i]
    if i == 0:
        ax.set_title("Total number of motifs")
    vec = dfi_subset[dfi_subset.example_interval_from_task == task].pattern_name.value_counts().reindex(pattern_names, fill_value=0)
    vec.plot.bar(width=.9, ax=ax)
    ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
ax.set_xlabel("Motif")
fig.savefig(fdir / '../n-instances-per-motif.per-task.pdf')

fig, axes = plt.subplots(len(tasks), 1, 
                         figsize=get_figsize(0.23, aspect=len(tasks)* 0.618),
                         sharey=True,
                         gridspec_kw=dict(hspace=0),
                         sharex=True)
pattern_names = dfi_subset.pattern_name.unique()
for i, task in enumerate(tasks):
    ax = axes[i]
    if i == 0:
        ax.set_title("Average number of motifs per peak")
    vec = dfi_subset[dfi_subset.example_interval_from_task == task].pattern_name.value_counts().reindex(pattern_names, fill_value=0)
    vec = vec / peaks_per_task[task]
    vec.plot.bar(width=.9, ax=ax)
    ax.set_ylabel(task, rotation=0, labelpad=10, ha='right')
ax.set_xlabel("Motif")

fig.savefig(fdir / '../n-instances-per-motif.per-task.avg-motifs-per-peak.pdf')

np.sum(dfi_subset.groupby('example_idx').size() >= 3)

72696

np.sum(dfi_subset.groupby('example_idx').size() >= 3) / 150908

0.4817239642696212

np.sum(dfi_subset.groupby('example_idx').size() >= 5)

20352

72696 / 150000

0.48464

dfi_subset.groupby('example_idx').size()

example_idx
1         8
2         2
3         3
         ..
147971    1
147972    3
147973    4
Length: 135913, dtype: int64

patterns = [p for p in mr.get_all_patterns()]

ic = [p.seq_info_content for p in mr.get_all_patterns() if mr.n_seqlets(p.name) > 100]

n_seqlets = [mr.n_seqlets(p.name) for p in mr.get_all_patterns()]

len(ic)

56

ic = [p.seq_info_content for p in mr.get_all_patterns()]

fig, ax = plt.subplots(figsize=get_figsize(0.4))
plt.hist(ic, bins=np.arange(0, 110, step=8), rwidth=0.9);
# plt.hist(ic, bins=15, rwidth=0.9, align='left');
plt.xlabel("IC")
plt.ylabel("Motifs")
fig.savefig(fdir / '../seq-IC.hist.all.pdf')

p = patterns[0]

p.seq_info_content

TF-MoDISco is using the TensorFlow backend.

14.260665650571447

Check if they occur at the same location¶

Create motif pairs¶

# create motif pairs
# dfab_w_te = pd.concat([motif_pair_dfi(dfi_subset, pair).assign(motif_pair="<>".join(pair)) for pair in pairs], axis=0)

# create motif pairs
dfab = pd.concat([motif_pair_dfi(dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)], pair).assign(motif_pair="<>".join(pair)) for pair in pairs], axis=0)

# Remove self matches
dfab = dfab.query('~((pattern_center_aln_x == pattern_center_aln_y) & (pattern_strand_aln_x == pattern_strand_aln_x))')

Remove Sox2 and Oct4 matching exactly to Oct4-Sox2¶

exclude_sox2 = dfab[(dfab.motif_pair == 'Oct4-Sox2<>Sox2') & 
                    (dfab['center_diff_aln'] == 0)].row_idx_y.values
exclude_oct4 = dfab[(dfab.motif_pair == 'Oct4-Sox2<>Oct4') & 
                    (dfab['center_diff_aln'] == 0)].row_idx_y.values
exclude_oct4_v2 = dfab[(dfab.motif_pair == 'Oct4<>Oct4-Oct4') & 
                    (dfab['center_diff_aln'] == 0)].row_idx_y.values


old_len = len(dfab)
# Exclude the overlapping row
dfab = dfab[(dfab.pattern_name_x != 'Oct4') | (~dfab.row_idx_x.isin(exclude_oct4))]
dfab = dfab[(dfab.pattern_name_y != 'Oct4') | (~dfab.row_idx_y.isin(exclude_oct4))]
dfab = dfab[(dfab.pattern_name_x != 'Oct4') | (~dfab.row_idx_x.isin(exclude_oct4_v2))]
dfab = dfab[(dfab.pattern_name_y != 'Oct4') | (~dfab.row_idx_y.isin(exclude_oct4_v2))]
dfab = dfab[(dfab.pattern_name_x != 'Sox2') | (~dfab.row_idx_x.isin(exclude_sox2))]
dfab = dfab[(dfab.pattern_name_y != 'Sox2') | (~dfab.row_idx_y.isin(exclude_sox2))]
nd = len(dfab) - old_len
print(f"Removed {nd}/{len(dfab)} instances")

Removed 0/353922 instances

# dfab.to_csv(f"{model_dir}/deeplift/dfab.csv.gz", index=False, compression='gzip')

Sanity-check -> plot the meta profile¶

motif_subset = OrderedDict[(k,v )

# seqlets = dfi2seqlets(dfi_subset[dfi_subset.pattern_name == 'Oct4-Sox2'])
# seqlets = resize_seqlets(seqlets, 70, seqlen=1000)
# seqlet_profiles = {k: extract_signal(v, seqlets) for k,v in profiles.items()}

# sort_idx = np.argsort(-seqlet_profiles['Oct4'].sum(axis=(-1,-2)))[:1000]
sort_idx = np.arange(1000)

# multiple_plot_stranded_profile({p:v for p,v in seqlet_profiles.items()}, figsize_tmpl=(2.55,2))
# multiple_heatmap_stranded_profile(seqlet_profiles, sort_idx=sort_idx, figsize=(10,10));

Co-occurence test¶

[ ] TODO - fix the test
- [ ] use fisher exact
- [ ] remove TE's?

<100bp¶

from basepair.exp.chipnexus.spacing import coocurrence_plot
from basepair.exp.chipnexus.spacing import co_occurence_matrix

dfi_subset.pattern_name.unique()

array(['Oct4-Sox2', 'Oct4', 'B-Box', 'Oct4-Oct4', 'Essrb', 'Sox2', 'Nanog', 'Zic3',
       'Nanog-partner', 'Klf4', 'Klf4-Klf4'], dtype=object)

def co_occurence_matrix(dfi_subset, query_string=""):
    """Returns the fraction of times pattern x (row) overlaps pattern y (column)
    """
    from basepair.stats import norm_matrix
    total_number = dfi_subset.groupby(['pattern']).size()
    norm_counts = norm_matrix(total_number)

    # normalization: minimum number of counts
    total_number = dfi_subset.groupby(['pattern_name']).size()
    norm_counts = norm_matrix(total_number)

    # cross-product
    dfi_filt_crossp = pd.merge(dfi_subset[['pattern_name', 'pattern_center_aln',
                                           'pattern_strand_aln', 'pattern_center', 'example_idx']].set_index('example_idx'),
                               dfi_subset[['pattern_name', 'pattern_center_aln',
                                           'pattern_strand_aln', 'pattern_center', 'example_idx']].set_index('example_idx'),
                               how='outer', left_index=True, right_index=True).reset_index()
    # remove self-matches
    dfi_filt_crossp = dfi_filt_crossp.query('~((pattern_name_x == pattern_name_y) & '
                                            '(pattern_center_aln_x == pattern_center_aln_y) & '
                                            '(pattern_strand_aln_x == pattern_strand_aln_x))')
    dfi_filt_crossp['center_diff'] = dfi_filt_crossp.eval("abs(pattern_center_x- pattern_center_y)")
    dfi_filt_crossp['center_diff_aln'] = dfi_filt_crossp.eval("abs(pattern_center_aln_x- pattern_center_aln_y)")

    if query_string:
        dfi_filt_crossp = dfi_filt_crossp.query(query_string)
    match_sizes = dfi_filt_crossp.groupby(['pattern_name_x', 'pattern_name_y']).size()
    count_matrix = match_sizes.unstack(fill_value=0)

    norm_count_matrix = count_matrix / norm_counts  # .truediv(min_counts, axis='columns').truediv(total_number, axis='index')
    norm_count_matrix = norm_count_matrix.fillna(0)  # these examples didn't have any paired pattern

    return count_matrix, norm_count_matrix, norm_counts


def chi2_test_coc(random_coocurrence_counts, random_coocurrence,
                    random_coocurrence_norm, coocurrence_counts, coocurrence, coocurrence_norm):
    from scipy.stats import chi2_contingency
    cols = list(coocurrence_norm.columns)
    n = len(random_coocurrence_counts)
    o = np.zeros((n, n))
    op = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            
            # [[# not randomly found together , # not found together],
            #  [# randomly found together     , # found together]]
            ct = [[random_coocurrence_norm.iloc[i, j] - random_coocurrence_counts.iloc[i, j],
                   coocurrence_norm.iloc[i, j] - coocurrence_counts.iloc[i, j]],
                  [random_coocurrence_counts.iloc[i, j],
                   coocurrence_counts.iloc[i, j]]]
            ct = np.array(ct)
            # TODO - make this an actual fisher exact test from
            # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html
            # or the chi-square contingency table:
            # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html#scipy.stats.chi2_contingency
            chi2, p, dof, ex = chi2_contingency(ct, correction=False)
            # t22 = sm.stats.contingency_tables.Table2x2(np.array(ct))
            
            o[i, j] = (ct[1, 1] / ct[0, 1]) / (ct[1, 0] / ct[0, 0 ])
            op[i, j] = p
    return pd.DataFrame(o, columns=cols, index=cols), pd.DataFrame(op, columns=cols, index=cols)


def coocurrence_plot(dfi_subset, motif_list, query_string="(abs(pattern_center_aln_x- pattern_center_aln_y) <= 150)",
                     signif_threshold=1e-5, ax=None, **kwargs):
    """Test for co-occurence

    Args:
      dfi_subset: desired subset of dfi
      motif_list: list of motifs used to order the heatmap
      query_string: string used with df_cross.query() to detering the valid motif pairs
      signif_threshold: significance threshold for Fisher's exact test
    """
    import seaborn as sns
    if ax is None:
        ax = plt.gca()
    c_counts, c, c_norm = co_occurence_matrix(dfi_subset, query_string=query_string)

    # Generate the NULL
    dfi_subset_random = dfi_subset.copy()
    np.random.seed(42)
    # TODO - shall we draw the sample indices completely at random?
    dfi_subset_random['example_idx'] = dfi_subset_random['example_idx'].sample(frac=1).values
    rc_counts, rc, rc_norm = co_occurence_matrix(dfi_subset_random, query_string=query_string)

    # test for significance
    o, op = chi2_test_coc(rc_counts, rc, rc_norm, c_counts, c, c_norm)

    # re-order
    o = o[motif_list].loc[motif_list]
    op = op[motif_list].loc[motif_list]

    signif = op < signif_threshold
    a = np.zeros_like(signif).astype(str)
    a[signif] = "*"
    a[~signif] = ""

    sns.heatmap(o, annot=a, fmt="", vmin=0, vmax=2,
                cmap='RdBu_r', ax=ax, **kwargs)
    ax.set_title(f"odds-ratio (proximal / non-proximal) (*: p<{signif_threshold})")

# count_matrix, norm_count_matrix, norm_counts = co_occurence_matrix(dfi_subset, 
#                  query_string=f"(abs(pattern_center_aln_x- pattern_center_aln_y) <= {dist}) & center_diff_aln > 5")

# co-occurence
fig, ax = plt.subplots(figsize=get_figsize(.4, aspect=1))
dist = 100
coocurrence_plot(dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)], list(motifs),
                 query_string=f"(abs(pattern_center_aln_x- pattern_center_aln_y) <= {dist}) & center_diff_aln > 5")
ax.set_ylabel("Motif of interest")
ax.set_xlabel("Motif partner");
fig.savefig(fdir / f'coocurrence.test.center_diff<{dist}.motifs=10.exclude-te.pdf')

All distance ranges¶

# subsets = ['center_diff <= 35', 'center_diff > 35', 'center_diff > 70']
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.4*len(subsets), .8/len(subsets)), 
                        sharey=True)
# dfs = dfi_subset.query('match_weighted_p > .2').query('imp_weighted_p > .2')
dfs = dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)]
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    if i == len(subsets) - 1:
        cbar = True
    else:
        cbar = False
    coocurrence_plot(dfs, list(motifs), query_string=f"({subset}) & (center_diff_aln > 5)" , ax=ax, cbar=cbar)
    if i == 0:
        ax.set_ylabel("Motif of interest")
        ax.set_xlabel("Motif partner");
    
    ax.set_title(subset_label)
# plt.tight_layout()
fig.savefig(fdir / f'coocurrence.test.all-dist.motifs=10.exclude-TE.pdf')

Only the major motifs¶

main_motifs = OrderedDict([(k,v)
                            for k,v in motifs.items()
                            if k in ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']
                           ])

# co-occurence
fig, ax = plt.subplots(figsize=get_figsize(.25, aspect=1))
dist = 100
coocurrence_plot(dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)], list(main_motifs),
                 query_string=f"(abs(pattern_center_aln_x- pattern_center_aln_y) <= {dist}) & center_diff_aln > 5")
ax.set_ylabel("Motif of interest")
ax.set_xlabel("Motif partner");
fig.savefig(fdir / f'coocurrence.test.center_diff<{dist}.main-motifs.pdf')

All distance ranges¶

# subsets = ['center_diff <= 35', 'center_diff > 35', 'center_diff > 70']
subsets = dist_subsets
fig,axes = plt.subplots(1, len(subsets), 
                        figsize=get_figsize(0.25*len(subsets), .8/len(subsets)), 
                        sharey=True)
# dfs = dfi_subset.query('match_weighted_p > .2').query('imp_weighted_p > .2')
dfs = dfi_subset[(~dfi_subset.is_te) & (~dfi_subset.is_erv)]
for i, (subset,ax, subset_label) in enumerate(zip(subsets, axes, dist_subset_labels)):
    if i == len(subsets) - 1:
        cbar = True
    else:
        cbar = False
    coocurrence_plot(dfs, list(main_motifs), query_string=f"({subset}) & (center_diff_aln > 5)" , ax=ax, cbar=cbar)
    if i == 0:
        ax.set_ylabel("Motif of interest")
        ax.set_xlabel("Motif partner");
    
    ax.set_title(subset_label)
# plt.tight_layout()
fig.savefig(fdir / f'coocurrence.test.all-dist.main-motifs.pdf')

Pairwise distance distribution¶

main_motif_pairs = [k + "<>" + v for k,v in get_motif_pairs(['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4'])]

dfab_main = dfab[dfab.motif_pair.isin(main_motif_pairs)]
dfab_main['motif_pair'] = pd.Categorical(dfab_main['motif_pair'],
                                         categories=main_motif_pairs)

plotnine.options.figure_size = get_figsize(.4, aspect=3.1875)# (10, 10)
max_dist = 100
min_dist = 6

fig = (ggplot(aes(x='center_diff', fill='strand_combination'), dfab_main[(dfab_main.center_diff <= max_dist) & 
                                                                    (dfab_main.center_diff > min_dist)]) + 
 geom_histogram(breaks=np.arange(min_dist, max_dist + 1)) + 
 facet_grid("motif_pair~ .") +
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 # ylim([0, 1000]) +
 xlab("Pairwise distance") + 
 scale_fill_brewer(type='qual', palette=3))
fig.save(fdir / 'histogram.center_diff.main-4-motifs.imp>.2,match>.0.pdf')
fig

<ggplot: (-9223363275141050620)>

df_instances = pd.read_csv("https://docs.google.com/spreadsheets/d/1uAtJZJR_dmlWwer5Afcd2q72Uuj7A4Ln/export?gid=826411817&format=csv")

df_instances

df_instances[df_instances.IC > 4].n_cwm_peaks.sum()

965844.0

df_instances[(df_instances.IC > 4) & (df_instances.is_te==False)].n_cwm_peaks.sum()

956134.0

main_motif_names = [longer_pattern(v) for k,v in motifs.items()] + ["Oct4/metacluster_0/pattern_16"]

df_instances['pattern']

0      metacluster_0/pattern_9
1     metacluster_0/pattern_13
2     metacluster_0/pattern_14
                ...           
59     metacluster_0/pattern_9
60    metacluster_0/pattern_11
61                         NaN
Name: pattern, Length: 62, dtype: object

df_instances['pattern']

0      metacluster_0/pattern_9
1     metacluster_0/pattern_13
2     metacluster_0/pattern_14
                ...           
59     metacluster_0/pattern_9
60    metacluster_0/pattern_11
61                         NaN
Name: pattern, Length: 62, dtype: object

df_instances['pattern_name'] = df_instances['task'] + '/' + df_instances['pattern']

dfi_subset.pattern_name.unique()

array(['Oct4-Sox2', 'Oct4', 'B-Box', 'Oct4-Oct4', 'Sox2', 'Nanog', 'Zic3', 'Nanog-partner', 'Klf4',
       'Klf4-Klf4'], dtype=object)

dfi_subset.

df_instances[(df_instances.IC > 4) & (df_instances.is_te==False)].n_cwm_peaks.sum()

956134.0

df_instances.head()

# Generate all spacing plots
individual_plots = {}
for motif_pair_name in pair_names:
    plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
    max_dist = 100
    min_dist = 0
    df = dfab[(dfab.center_diff <= max_dist) & 
              (dfab.motif_pair == motif_pair_name)]
    if len(df) == 0:
        continue
    fig = (ggplot(aes(x='center_diff', fill='strand_combination'), df) + 
     # Spacing vertical lines
     geom_vline(xintercept=10, alpha=0.1) + 
     geom_vline(xintercept=20, alpha=0.1) + 
     geom_vline(xintercept=30, alpha=0.1) + 
     geom_vline(xintercept=40, alpha=0.1) + 
     # plot
     geom_histogram(breaks=np.arange(min_dist, max_dist + 1)) + 
     facet_grid("strand_combination~.") + 
     # Theme, labels, colors
     theme_classic(base_size=10, base_family='Arial') + 
     theme(strip_text = element_text(rotation=0), legend_position='top') + 
     xlim([min_dist, max_dist]) + 
     xlab("Pairwise distance") +
     ggtitle(motif_pair_name) + 
     scale_fill_brewer(type='qual', palette=3))
    
    # Save
    fig.save(fdir_individual / f'{motif_pair_name}.pdf')
    individual_plots[motif_pair_name] = fig

individual_plots['Sox2<>Nanog']

<ggplot: (-9223363287224432780)>

individual_plots['Nanog<>Nanog']

<ggplot: (-9223363277389710634)>

Detailed individual plots¶

[X] TODO - include a detailed major and minor tiks (Minor = 1bp, Major = 5bp)

Considered TF's:

Nanog-Nanog
Nanog-Sox2
Zic3-Nanog

for motif_pair_name in ['Nanog<>Nanog', 'Sox2<>Nanog', 'Nanog<>Zic3']:
    plotnine.options.figure_size = get_figsize(2, aspect=2/10*4 / 2)
    xmin = 5
    xmax = 50
    df = dfab[(dfab.center_diff <= xmax) & 
              (dfab.motif_pair == motif_pair_name)]
    fig = (ggplot(aes(x='center_diff', fill='strand_combination'), df) + 
     # plot
     geom_histogram(breaks=np.arange(xmin, xmax+1)) + facet_grid("strand_combination~.") + 
     # Theme, labels, colors
     theme_bw(base_size=10, base_family='Arial') + 
     theme(strip_text = element_text(rotation=0), legend_position='top') +  
     xlab("Pairwise distance") +
     ggtitle(motif_pair_name) + 
     scale_x_continuous(breaks=np.arange(xmin, xmax, step=5),
                        minor_breaks=np.arange(xmin, xmax, step=1)) +
     scale_fill_brewer(type='qual', palette=3))
     # axis_ticks_major_x()
    display(fig)
    fig.save(fdir_individual / f'{motif_pair_name}.large.pdf')

<ggplot: (8749611385327)>

<ggplot: (8749628652068)>

<ggplot: (-9223363287240838428)>

PWM for the main motifs¶

[ ] Plot a PWM of all the sequences overlapping a certain region
- [ ] Query the motif pairs -> construct intervals
- [ ] Create seqlets
- [ ] Fetch the sequence from ImpScoreFile
[ ] Consider each orientation
[ ] Flank the sequence
[ ] Say the peak at 10bp or peak at 20 bp

TODO - take into account the strand¶

rdir = Path('../../../')

ls {rdir}

basepair/           conda-env-gpu.yml  error.afmat-compute.log  setup.py*
basepair.egg-info/  conda-env.yml      example/                 Snakefile
chipnexus-gdrive@   data@              py27-conda-env.yml       src/
conda-env-cpu.yml   docs/              README.md                tests/

dataspec_file = rdir / "src/chipnexus/train/seqmodel/ChIP-nexus.dataspec.yml"
ds = DataSpec.load(dataspec_file)

# Load all files into the page cache
ds.touch_all_files()

from basepair.extractors import Interval
from basepair.exp.paper.config import fasta_file
from genomelake.extractors import FastaExtractor

from basepair.utils import halve

from basepair.preproc import resize_interval

fe = FastaExtractor(fasta_file, use_strand=True)

def plot_pwm(pwm, ax):
    from matplotlib import ticker
    from concise.utils.pwm import seqlogo
    from basepair.modisco.utils import ic_scale
    seqlogo(ic_scale(pwm), ax=ax)
    # strip_axis(ax)
    ax.set_xlabel(None)
    # ax.get_xaxis().set_visible(False)
    ax.set_ylim([0, 2.0])
    ax.set_ylabel("IC")
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    
    ax.xaxis.set_major_locator(ticker.AutoLocator())


def create_intervals(dfab, ax=None):
    from basepair.extractors import Interval
    intervals = []
    rev_strand = {"+": "-", "-": "+"}
    for i,row in dfab.iterrows():
        diff = row.pattern_center_y - row.pattern_center_x
        width = np.abs(diff) + 30
#         if diff > 0:
#             start = min(row.pattern_start_x, row.pattern_end_x) - 10
#             end = start + width
#             strand = row.strand_x
#         else:
#             end = max(row.pattern_start_x, row.pattern_end_x) + 10 + 1
#             start = end - width
#             strand = row.strand_x
        if diff > 0:
            start = row.pattern_center_x - 15
            end = start + width
            strand = row.strand_x
        else:
            end = row.pattern_center_x + 15
            start = end - width
            strand = row.strand_x
        intervals.append(Interval(chrom=row.example_chrom_x, 
                     start=row.example_start_x + start, 
                     stop=row.example_start_x + end,
                     name=row.motif_pair, 
                     strand=strand,
                     # strand="+" if row.pattern_center_x > row.pattern_center_y else "-"
                    ))
    return intervals

Notes¶

It's nice that the discovered motif pairs are not comming from a TE

mrn = mr.mr_dict['Nanog']

ls /oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/deeplift/Nanog/out/profile/wn/

patterns = read_pkl("/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/deeplift/Nanog/out/profile/wn/patterns.pkl")

p = {p.name:p for p in patterns}['metacluster_0/pattern_1']

p = p.shift(4).resize(50).resize_profile(50)

TODO¶

can you come up with a way to nicely visualize this?
- overplot the two tracks?

p.rc().plot("seq_ic")

dfab[dfab.motif_pair == 'Nanog<>Nanog-partner'].groupby(['strand_combination', 'center_diff']).size().sort_values()

strand_combination  center_diff
++                  3.0              1
+-                  306.0            1
                    302.0            1
                                  ... 
++                  10.0            25
                    9.0             28
-+                  6.0            278
Length: 726, dtype: int64

np.sum(dfi_subset.pattern_name == 'Nanog-partner')

7066

278/7066

0.03934333427681857

from basepair.BPNet import BPNetSeqModel

# create_tf_session(0)

bpnet = BPNetSeqModel.from_mdir(model_dir)

WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead

2019-07-13 14:00:47,770 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead

WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

2019-07-13 14:01:01,067 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

to_plot = [
    ("Nanog<>Nanog-partner", "-+", 6),
    ("Nanog<>Nanog-partner", "++", 9),
    ("Nanog<>Nanog-partner", "++", 10),
#     ("Sox2<>Nanog", "--", 10),
#     ("Sox2<>Nanog", "--", 20),
#     ("Sox2<>Nanog", "--", 30),
#     ("Sox2<>Nanog", "++", 13),
#     ("Sox2<>Nanog", "++", 22),
#     ("Sox2<>Nanog", "++", 23),
#     ("Nanog<>Zic3", "++", 12),
#     ("Nanog<>Zic3", "+-", 11),
#     ("Nanog<>Zic3", "+-", 21),
#     ("Nanog<>Zic3", "+-", 31),
#     ("Oct4<>Sox2", "++", 10),
]

for motif_pair, strand_combination, dist in to_plot:
#     dist = 10
#     strand_combination = '++'
#     motif_pair = 'Nanog<>Nanog'
    fig, axes = plt.subplots(2,1, 
                           gridspec_kw={'height_ratios':[1/7, 1/2]},
                           sharex=True,
                           figsize=get_figsize(.5, 1/7+1/2))
    dfp = (dfab
           .query(f'strand_combination=="{strand_combination}"')
           .query(f'motif_pair=="{motif_pair}"')
           .query(f'center_diff=={dist}'))
    intervals = create_intervals(dfp, ax=ax)
    seqs = fe(intervals)
    
    counts = ds.load_counts(intervals)
    ax = axes[0]
    plot_pwm(seqs.mean(0), ax=ax)
    ax.set_title(f"{motif_pair} {strand_combination}, dist = {dist}, ({len(dfp)})");
    
    
    cmap = plt.get_cmap("tab10")
    cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
    
    ao = .7
    # plot aggregate profile
    agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
    ax = axes[1]
    # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
    ax.plot(agg_profile[:, 0], alpha=ao, label='Obs', color=cycle[0])
    ax.plot(-agg_profile[:, 1], alpha=ao, color=cycle[0])
    ax.set_ylabel("Counts")
    # ax.set_title("Observed")
    sns.despine(top=True, bottom=True, right=True)
    
    # params
    seqlen = seqs.shape[1]
    
    # Plot model
    agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
    # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
    seqs_1kb = fe([resize_interval(interval, 1000) for interval in intervals])
    preds = bpnet.predict(seqs_1kb)
    agg_preds = preds['Nanog'].mean(0)
    upstream_seq, downstream_seq = halve(seqlen)
    xs = slice(500-upstream_seq, 500+downstream_seq)
    a = agg_profile.sum() / agg_preds[xs].sum()
    ax.plot(a*agg_preds[xs, 0], label='BPNet', alpha=ao, color=cycle[1])
    ax.plot(-a*agg_preds[xs, 1], alpha=ao, color=cycle[1])
    ax.set_ylabel("Counts")
    # ax.set_title("BPNet")
    sns.despine(top=True, bottom=True, right=True)
    
    
#     pshifted = p.rc().shift(-3).resize(seqlen).resize_profile(seqlen)
#     upstream, downstream = halve(dist)
#     double_profile = pshifted.shift(-upstream).profile['Nanog'] + pshifted.shift(downstream).profile['Nanog']
#     # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
#     a = agg_profile.sum() / double_profile.sum()
#     ax.plot(a*double_profile[:, 0], label='Mixture', alpha=ao, color=cycle[2])
#     ax.plot(-a*double_profile[:, 1], alpha=ao, color=cycle[2])
#     ax.set_ylabel("Counts")
    # ax.set_title("Mixture")
    sns.despine(top=True, bottom=True, right=True)
    ax.legend()#labels=['Obs', 'BPNet', 'Mixure'])
    fig.align_ylabels()
    
    fig.savefig(fdir_individual / f'{motif_pair}.{strand_combination}.{dist}.pwm+profile.pdf')

I looked at all the motif pairs of Nanog and Nanog-alt and then visualized the aggregate footprints for the following scenarios:

only those where Nanog is 6 bp away at -+ orientation
All those where Nanog is not 6 bp away at -+ orientation (e.g. the opposite of 1.)
All those where Nanog is > 15 bp away
All

# Nanog
from basepair.extractors import Interval
to_plot = [
    ("Nanog<>Nanog-partner", "-+", 6),
    ("Nanog<>Nanog-partner", "-+", -6),
    ("Nanog<>Nanog-partner", None, -15),
    ("Nanog<>Nanog-partner", "-+", None),
]

for motif_pair, strand_combination, dist in to_plot:
#     dist = 10
#     strand_combination = '++'
#     motif_pair = 'Nanog<>Nanog'
    fig, axes = plt.subplots(2,1, 
                           gridspec_kw={'height_ratios':[1/7, 1/2]},
                           sharex=True,
                           figsize=get_figsize(.5, 1/7+1/2))
    dfp = (dfab
           .query(f'motif_pair=="{motif_pair}"'))
    if dist is not None:
        if dist > 0 :
            dfp = (dfp
                   .query(f'((strand_combination=="{strand_combination}") &  (center_diff=={dist}))'))
        else:
            if strand_combination is not None:
                dfp = (dfp
                       .query(f'~((strand_combination=="{strand_combination}") &  (center_diff==-{dist}))'))
            else:
                dfp = (dfp
                       .query(f'center_diff>-{dist}'))
    intervals = [Interval(chrom=row.Chromosome_y,
                          start=row.Start_y - 20,
                          stop=row.End_y + 20,
                          strand=row.strand_y)
                 for i, row in dfp.iterrows()]
    # intervals = create_intervals(dfp, ax=ax)
    seqs = fe(intervals)
    
    counts = ds.load_counts(intervals)
    ax = axes[0]
    plot_pwm(seqs.mean(0), ax=ax)
    ax.set_title(f"{motif_pair} {strand_combination}, dist = {dist}, ({len(dfp)})");
    
    
    cmap = plt.get_cmap("tab10")
    cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
    
    ao = .7
    # plot aggregate profile
    agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
    ax = axes[1]
    # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
    ax.plot(agg_profile[:, 0], alpha=ao, label='Obs', color=cycle[0])
    ax.plot(-agg_profile[:, 1], alpha=ao, color=cycle[0])
    ax.set_ylabel("Counts")
    # ax.set_title("Observed")
    sns.despine(top=True, bottom=True, right=True)
    
    # params
    seqlen = seqs.shape[1]
    
    # Plot model
    agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
    # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
    seqs_1kb = fe([resize_interval(interval, 1000) for interval in intervals])
    # preds = bpnet.predict(seqs_1kb)
    # agg_preds = preds['Nanog'].mean(0)
    upstream_seq, downstream_seq = halve(seqlen)
    xs = slice(500-upstream_seq, 500+downstream_seq)
    # a = agg_profile.sum() / agg_preds[xs].sum()
    # ax.plot(a*agg_preds[xs, 0], label='BPNet', alpha=ao, color=cycle[1])
    # ax.plot(-a*agg_preds[xs, 1], alpha=ao, color=cycle[1])
    ax.set_ylabel("Counts")
    ax.set_ylim([-11, 11])
    # ax.set_title("BPNet")
    sns.despine(top=True, bottom=True, right=True)
    
    
#     pshifted = p.rc().shift(-3).resize(seqlen).resize_profile(seqlen)
#     upstream, downstream = halve(dist)
#     double_profile = pshifted.shift(-upstream).profile['Nanog'] + pshifted.shift(downstream).profile['Nanog']
#     # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
#     a = agg_profile.sum() / double_profile.sum()
#     ax.plot(a*double_profile[:, 0], label='Mixture', alpha=ao, color=cycle[2])
#     ax.plot(-a*double_profile[:, 1], alpha=ao, color=cycle[2])
#     ax.set_ylabel("Counts")
    # ax.set_title("Mixture")
    sns.despine(top=True, bottom=True, right=True)
    ax.legend()#labels=['Obs', 'BPNet', 'Mixure'])
    fig.align_ylabels()
    
    fig.savefig(fdir_individual / f'{motif_pair}.{strand_combination}.{dist}.pwm+profile.Nanog-mix.pdf')

nanog_patterns = read_pkl("/oak/stanford/groups/akundaje/avsec/basepair/data/processed/comparison/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/deeplift/Nanog/out/profile/wn/patterns.pkl")
nanog_patterns = {p.name: p for p in nanog_patterns}

nanog_patterns

{'metacluster_0/pattern_13': Pattern('metacluster_0/pattern_13', 'TAGCCAGTCTCACCCAAACAAGGGAATACACAATGGCCTAGCGAATAGGTGGGTTTACCATGAAAGAGAA' ...),
 'metacluster_0/pattern_12': Pattern('metacluster_0/pattern_12', 'GTTACCTATTCAGTAACTAATTAAGCATTACAAAAGACGGAGCCAGCAGCTCAGAGAAAAAAAAAAAAAA' ...),
 'metacluster_0/pattern_7': Pattern('metacluster_0/pattern_7', 'AGTAGAGACAGTGAAACTAATTAGGCATTACAAAGAACAGAGCTAGAATAGCTCAGGGAAAAAAAAAAAA' ...),
 'metacluster_0/pattern_18': Pattern('metacluster_0/pattern_18', 'TAAACCTGCTTCCCACCTTAATTTGTAAAATAAAGGAGAGCTGATGACTGGGCAGATAAAAGGGAAGGTG' ...),
 'metacluster_0/pattern_15': Pattern('metacluster_0/pattern_15', 'CAAAGAACAGAGCTAGAATAGCTCAGGGCTAGTCTGCAGAGTGATTACTTAGGACAGTAGCCAGTCTCAA' ...),
 'metacluster_0/pattern_11': Pattern('metacluster_0/pattern_11', 'AGGGCTAGTCTGCATAGTAATTACTTAGGACAATAGCCGAGTCACACCCAAACACAGAAAAAAAAAAAAA' ...),
 'metacluster_0/pattern_17': Pattern('metacluster_0/pattern_17', 'ACAACTGGCTGTTATCATGTAGAAGAATGCAAATTGATCCATTCCTATCTCCTTGTAAAAAAAAAAAAAA' ...),
 'metacluster_0/pattern_10': Pattern('metacluster_0/pattern_10', 'GGTCAGAGTCTGAGCTTGATCGTTGCTAATTTGCAATTGAATGGAGTACCTGGACTTCCCCAAAGAAAAA' ...),
 'metacluster_0/pattern_16': Pattern('metacluster_0/pattern_16', 'AAACCAAAGCCCAGTTTTTTATTTACATATCAATTCAGTCATTTACTCCAGGTTCCCTTTTGATTATCCA' ...),
 'metacluster_0/pattern_14': Pattern('metacluster_0/pattern_14', 'AAAAATCCTGGGAAAGCAGAATTAGCATCAAAATACAAACAGCACCAGGGCCATCCACAACACTTACCCA' ...),
 'metacluster_0/pattern_6': Pattern('metacluster_0/pattern_6', 'TCTTGAAGAGAAAAAGGGCAAGTGGCAGAACAATAGAGCCATCTAGGTAGGAAGGCTCCACCCTAGAAAA' ...),
 'metacluster_0/pattern_0': Pattern('metacluster_0/pattern_0', 'AAAGAATTTTAAATATGGTAATTTGCATAACAATGGAAACAATAGGATCATAAATTTTAAAAAAAAAAAA' ...),
 'metacluster_0/pattern_9': Pattern('metacluster_0/pattern_9', 'AGGTTGGGCATCGGGCAGGTAATGGCACATCAAAGGAGACTGATGGAAGGCAGCAAGGCATAAGGAGAAA' ...),
 'metacluster_0/pattern_8': Pattern('metacluster_0/pattern_8', 'AAAAAAAAAAAAAATGGATTTAAGGTAGAACAAAGGAGTCGTTATGGAAGTCTTGATAATCCTTTTAAAC' ...),
 'metacluster_0/pattern_3': Pattern('metacluster_0/pattern_3', 'ATTGTTAATATAAATAGTAATTTCCCAAAACAATGGCCAATTTTAATCGGACAATGAAATTAACAATGAA' ...),
 'metacluster_0/pattern_4': Pattern('metacluster_0/pattern_4', 'AAAAAAAAAAAAAAAAATAATAAAGTGCAGGAAATGGGCCATCAAAAAAACAATTAGAAATTATTAAAAA' ...),
 'metacluster_0/pattern_5': Pattern('metacluster_0/pattern_5', 'AAAAAAAAAAATATTCCATATTAGTACTTTGATGGCCCATTAAGTCCCACTTAAAGCATTTTTTTGCTTT' ...),
 'metacluster_0/pattern_1': Pattern('metacluster_0/pattern_1', 'TTTTTTTAAAAATTTTTTTATTATTGTCTTGATGGCTTTTTTTTTTTTTTTTTATAAATTTTTTTAAAAA' ...),
 'metacluster_0/pattern_2': Pattern('metacluster_0/pattern_2', 'TAGTCTCCGGAATAGCTCAAGGGCCTCTCAGCAGGTAGCAGTGTCTTGGAGAAGAACAGCGGCAAAAAAA' ...)}

p_nanog = nanog_patterns[longer_pattern(motifs['Nanog'].split("/")[1])]

p_nanog_alt = longer_pattern(motifs['Nanog-partner'].split("/")[1])

from basepair.exp.paper.fig4 import *

pattern_names = ['Nanog', 'Nanog-partner']
for pname in pattern_names:
    p = nanog_patterns[longer_pattern(motifs[pname].split("/")[1])]
    agg_profile = p.profile['Nanog'][70:130]
    pfm = p.seq[5:65]
    fig, axes = plt.subplots(2,1, 
                               gridspec_kw={'height_ratios':[1/7, 1/2]},
                               sharex=True,
                               figsize=get_figsize(.5, 1/7+1/2))
    cmap = plt.get_cmap("tab10")
    cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']

    ax = axes[0]
    plot_pwm(pfm, ax=ax)
    ax.set_title(f"Nanog");


    ao = .7
    # plot aggregate profile
    ax = axes[1]
    ax.plot(agg_profile[:, 0], alpha=ao, label='Obs', color=cycle[0])
    ax.plot(-agg_profile[:, 1], alpha=ao, color=cycle[0])

    ax.set_ylabel("Counts")
    ax.set_ylim([-11, 11])
    # ax.set_title("BPNet")
    sns.despine(top=True, bottom=True, right=True)
    ax.legend()
    fig.align_ylabels()

    fig.savefig(fdir_individual / f'only-{pname}.pdf')

p_nanog.plot(['seq_ic', 'profile'])

to_plot = [
    ("Nanog<>Nanog", "++", 10),
    ("Nanog<>Nanog", "++", 20),
    ("Nanog<>Nanog", "++", 21),
    ("Nanog<>Nanog", "++", 30),
    ("Nanog<>Nanog", "++", 31),
    ("Nanog<>Nanog", "++", 40),
    ("Nanog<>Nanog", "++", 41),
#     ("Sox2<>Nanog", "--", 10),
#     ("Sox2<>Nanog", "--", 20),
#     ("Sox2<>Nanog", "--", 30),
#     ("Sox2<>Nanog", "++", 13),
#     ("Sox2<>Nanog", "++", 22),
#     ("Sox2<>Nanog", "++", 23),
#     ("Nanog<>Zic3", "++", 12),
#     ("Nanog<>Zic3", "+-", 11),
#     ("Nanog<>Zic3", "+-", 21),
#     ("Nanog<>Zic3", "+-", 31),
#     ("Oct4<>Sox2", "++", 10),
]

for motif_pair, strand_combination, dist in to_plot:
#     dist = 10
#     strand_combination = '++'
#     motif_pair = 'Nanog<>Nanog'
    fig, axes = plt.subplots(2,1, 
                           gridspec_kw={'height_ratios':[1/7, 1/2]},
                           sharex=True,
                           figsize=get_figsize(.5, 1/7+1/2))
    dfp = (dfab
           .query(f'strand_combination=="{strand_combination}"')
           .query(f'motif_pair=="{motif_pair}"')
           .query(f'center_diff=={dist}'))
    intervals = create_intervals(dfp, ax=ax)
    seqs = fe(intervals)
    
    counts = ds.load_counts(intervals)
    ax = axes[0]
    plot_pwm(seqs.mean(0), ax=ax)
    ax.set_title(f"{motif_pair} {strand_combination}, dist = {dist}, ({len(dfp)})");
    
    
    cmap = plt.get_cmap("tab10")
    cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
    
    ao = .7
    # plot aggregate profile
    agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
    ax = axes[1]
    # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
    ax.plot(agg_profile[:, 0], alpha=ao, label='Obs', color=cycle[0])
    ax.plot(-agg_profile[:, 1], alpha=ao, color=cycle[0])
    ax.set_ylabel("Counts")
    # ax.set_title("Observed")
    sns.despine(top=True, bottom=True, right=True)
    
    # params
    seqlen = seqs.shape[1]
    
    # Plot model
    agg_profile = counts[motif_pair.split("<>")[0]].mean(0)
    # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
    seqs_1kb = fe([resize_interval(interval, 1000) for interval in intervals])
    preds = bpnet.predict(seqs_1kb)
    agg_preds = preds['Nanog'].mean(0)
    upstream_seq, downstream_seq = halve(seqlen)
    xs = slice(500-upstream_seq, 500+downstream_seq)
    a = agg_profile.sum() / agg_preds[xs].sum()
    ax.plot(a*agg_preds[xs, 0], label='BPNet', alpha=ao, color=cycle[1])
    ax.plot(-a*agg_preds[xs, 1], alpha=ao, color=cycle[1])
    ax.set_ylabel("Counts")
    # ax.set_title("BPNet")
    sns.despine(top=True, bottom=True, right=True)
    
    
    pshifted = p.rc().shift(-3).resize(seqlen).resize_profile(seqlen)
    upstream, downstream = halve(dist)
    double_profile = pshifted.shift(-upstream).profile['Nanog'] + pshifted.shift(downstream).profile['Nanog']
    # fig, ax = plt.subplots(1, 1, figsize=get_figsize(.5, 1/2))
    a = agg_profile.sum() / double_profile.sum()
    ax.plot(a*double_profile[:, 0], label='Mixture', alpha=ao, color=cycle[2])
    ax.plot(-a*double_profile[:, 1], alpha=ao, color=cycle[2])
    ax.set_ylabel("Counts")
    # ax.set_title("Mixture")
    sns.despine(top=True, bottom=True, right=True)
    ax.legend()#labels=['Obs', 'BPNet', 'Mixure'])
    fig.align_ylabels()
    
    fig.savefig(fdir_individual / f'{motif_pair}.{strand_combination}.{dist}.pwm+profile.pdf')

def extract_stacked_seqlets(dfab, isf):
    from matplotlib import ticker
    from concise.utils.pwm import seqlogo
    from basepair.modisco.utils import ic_scale
    from basepair.extractors import Interval
    seqlets = []
    rev_strand = {"+": "-", "-": "+"}
    for i,row in dfab.iterrows():
        diff = row.pattern_center_y - row.pattern_center_x
        width = np.abs(diff) + 30
        if diff > 0:
            start = row.pattern_center_x - 15
            end = start + width
            strand = row.strand_x
        else:
            end = row.pattern_center_x + 15
            start = end - width
            strand = row.strand_x
        seqlets.append(Seqlet(seqname=row.example_idx, 
                     start=start, 
                     end=end,
                     name=row.motif_pair, 
                     strand=strand,
                     # strand="+" if row.pattern_center_x > row.pattern_center_y else "-"
                    ))
    sts = isf.extract(seqlets)
    return sts

# to_plot = [
#     ("Nanog<>Nanog", "++", 10),
#     ("Nanog<>Nanog", "++", 20),
#     ("Nanog<>Nanog", "++", 21),
#     ("Nanog<>Nanog", "++", 30),
#     ("Nanog<>Nanog", "++", 31),
#     ("Nanog<>Nanog", "++", 40),
#     ("Nanog<>Nanog", "++", 41),
#     ("Sox2<>Nanog", "--", 10),
#     ("Sox2<>Nanog", "--", 20),
#     ("Sox2<>Nanog", "--", 30),
#     ("Sox2<>Nanog", "++", 13),
#     ("Sox2<>Nanog", "++", 22),
#     ("Sox2<>Nanog", "++", 23),
#     ("Nanog<>Zic3", "++", 12),
#     ("Nanog<>Zic3", "+-", 11),
#     ("Nanog<>Zic3", "+-", 21),
#     ("Nanog<>Zic3", "+-", 31),
# ]

# for motif_pair, strand_combination, dist in to_plot:
# #     dist = 10
# #     strand_combination = '++'
# #     motif_pair = 'Nanog<>Nanog'
#     fig, ax = plt.subplots(1,1, figsize=get_figsize(.5, 1/7))
#     dfp = (dfab
#            .query(f'strand_combination=="{strand_combination}"')
#            .query(f'motif_pair=="{motif_pair}"')
#            .query(f'center_diff=={dist}'))
#     sts = extract_stacked_seqlets(dfp, isf)
#     plot_pwm(sts.seq.mean(0), ax=ax)
#     ax.set_title(f"{motif_pair} {strand_combination}, dist = {dist}, ({len(dfp)})");

TODO¶

[ ] plot the aggregate strand plot (for each distance)
[ ] Plot the spacing for Oct4-Sox2 -> is that the cannonical spacing?
[X] Annotate dfi_subset with profile scores
- [X] Implement _get_seqlets(pattern, trim_frac=trim_frac) to MultipleModiscoResult
- [X] is dfi.pattern the same as in mr.get_pattern(dfi.pattern)
- [X] move the code up & re-run dfab
[ ] Generate the pairwise spacing plots showing differnet profile properties

# isf.cache()

<basepair.cli.imp_score.ImpScoreFile at 0x7f51fc2f3a58>

Pairwise scatter-plot¶

Oct4-Sox2 <-> Nanog¶

motif_pair_name = 'Oct4-Sox2<>Nanog'

dfab['Nanog/profile_counts_max_ref_y_log'] = np.log10(1+dfab['Nanog/profile_counts_max_ref_y'])
dfab['Sox2/profile_counts_max_ref_x_log'] = np.log10(1+dfab['Sox2/profile_counts_max_ref_x'])

Nanog site (Nanog counts)¶

# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Nanog/profile_counts_max_ref_y_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'
                 ), 
              df) + 
 # plot
 geom_point(alpha=0.5, size=.05, shape='.') +
 facet_grid("strand_combination~.") + 
 # scale_y_log10() + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig

<ggplot: (8749623594574)>

# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(.3, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Nanog/profile_counts_max_ref_y_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'), 
              df.groupby(["center_diff", 'strand_combination'])['Nanog/profile_counts_max_ref_y_log'].mean().reset_index()) + 
 # plot
 geom_line() +
 facet_grid("strand_combination~.") + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig

<ggplot: (-9223363287230738495)>

Oct4-Sox2 site (Sox2 counts)¶

# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Sox2/profile_counts_max_ref_x_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'
                 ), 
              df) + 
 # plot
 geom_jitter(alpha=0.4, size=.05, shape='.', height=0, width=1) +
 facet_grid("strand_combination~.") + 
 # scale_y_log10() + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig

<ggplot: (-9223363287223337121)>

# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Sox2/profile_counts_max_ref_x_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'), 
              df.groupby(["center_diff", 'strand_combination'])['Sox2/profile_counts_max_ref_x_log'].mean().reset_index()) + 
 # plot
 geom_line() +
 facet_grid("strand_combination~.") + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig

<ggplot: (8749630463275)>

Sox2 <-> Nanog¶

motif_pair_name = 'Sox2<>Nanog'

dfab['Nanog/profile_counts_max_ref_y_log'] = np.log10(1+dfab['Nanog/profile_counts_max_ref_y'])
dfab['Sox2/profile_counts_max_ref_x_log'] = np.log10(1+dfab['Sox2/profile_counts_max_ref_x'])

# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Sox2/profile_counts_max_ref_x_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'
                 ), 
              df) + 
 # plot
 geom_point(alpha=0.5, size=.05, shape='.') +
 facet_grid("strand_combination~.") + 
 # scale_y_log10() + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig

<ggplot: (-9223363287305922237)>

# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Nanog/profile_counts_max_ref_y_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'
                 ), 
              df) + 
 # plot
 geom_point(alpha=0.5, size=.05, shape='.') +
 facet_grid("strand_combination~.") + 
 # scale_y_log10() + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig

<ggplot: (-9223363287261764271)>

dfab.head()

# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Nanog/profile_counts_max_ref_x_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'
                 ), 
              df) + 
 # plot
 geom_point(alpha=0.2, size=.05, shape='.') +
 facet_grid("strand_combination~.") + 
 # scale_y_log10() + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig

<ggplot: (8749453687928)>

# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Nanog/profile_counts_max_ref_x_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'
                 ), 
              df) + 
 # plot
 geom_point(alpha=0.2, size=.05, shape='.') +
 facet_grid("strand_combination~.") + 
 # scale_y_log10() + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig

<ggplot: (-9223363287241288546)>

# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Nanog/profile_counts_max_ref_x_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'), 
              df.groupby(["center_diff", 'strand_combination'])['Nanog/profile_counts_max_ref_x_log'].quantile(0.95).reset_index()) + 
 # plot
 geom_line() +
 facet_grid("strand_combination~.") + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig

<ggplot: (8749453634375)>

# Test the ab scatterplot
plotnine.options.figure_size = get_figsize(1, aspect=2/10*4)
max_dist = 100
min_dist = 0
df = dfab[(dfab.center_diff <= max_dist) & 
          (dfab.motif_pair == motif_pair_name)]
fig = (ggplot(aes(x='center_diff', 
                  y='Nanog/profile_counts_max_ref_x_log',  # or 'profile_counts_max_ref_p'
                  color='strand_combination'), 
              df.groupby(["center_diff", 'strand_combination'])['Nanog/profile_counts_max_ref_x_log'].quantile(.5).reset_index()) + 
 # plot
 geom_line() +
 facet_grid("strand_combination~.") + 
 # geom_smooth() + 
 # ylim([1, None]) + 
 # Theme, labels, colors
 theme_classic(base_size=10, base_family='Arial') + 
 theme(strip_text = element_text(rotation=0), legend_position='top') + 
 xlim([min_dist, max_dist]) + 
 xlab("Pairwise distance") +
 ggtitle(motif_pair_name) + 
 scale_color_brewer(type='qual', palette=3)
)
fig

<ggplot: (8749549577634)>

	example_chrom	example_end	example_idx	example_interval_from_task	example_start	example_strand	imp/Klf4	imp/Nanog	imp/Oct4	imp/Sox2	imp_max	imp_max_task	imp_weighted	imp_weighted_cat	imp_weighted_p	match/Klf4	match/Nanog	match/Oct4	match/Sox2	match_max	match_max_task	match_weighted	match_weighted_cat	match_weighted_p	pattern	pattern_center	pattern_end	pattern_end_abs	pattern_len	pattern_name	pattern_short	pattern_start	pattern_start_abs	seq_match	seq_match_cat	seq_match_p	strand	tf	is_te	pattern_center_aln	pattern_strand_aln	row_idx	Chromosome	Start	End	is_erv
0	chr3	122146077	1	Oct4	122145077	.	NaN	NaN	0.9374	NaN	0.9374	Oct4	0.9374	high	0.8462	NaN	NaN	0.5077	NaN	0.5077	Oct4	0.5077	high	0.6886	Oct4/metacluster_0/pa...	430	438	122145515	16	Oct4-Sox2	Oct4/m0_p0	422	122145499	10.2722	high	0.6902	-	Oct4	False	429	-	0	chr3	122145499	122145515	False
1	chr3	122146077	1	Oct4	122145077	.	NaN	NaN	1.5355	NaN	1.5355	Oct4	1.5355	high	0.9920	NaN	NaN	0.5287	NaN	0.5287	Oct4	0.5287	high	0.7992	Oct4/metacluster_0/pa...	451	459	122145536	16	Oct4-Sox2	Oct4/m0_p0	443	122145520	11.5528	high	0.9044	-	Oct4	False	450	-	1	chr3	122145520	122145536	False
2	chr3	122146077	1	Oct4	122145077	.	NaN	NaN	1.2758	NaN	1.2758	Oct4	1.2758	high	0.9670	NaN	NaN	0.4219	NaN	0.4219	Oct4	0.4219	low	0.2234	Oct4/metacluster_0/pa...	492	500	122145577	16	Oct4-Sox2	Oct4/m0_p0	484	122145561	10.7122	high	0.7777	-	Oct4	False	491	-	2	chr3	122145561	122145577	False
3	chr3	122146077	1	Oct4	122145077	.	NaN	NaN	1.0455	NaN	1.0455	Oct4	1.0455	high	0.9050	NaN	NaN	0.4155	NaN	0.4155	Oct4	0.4155	low	0.2005	Oct4/metacluster_0/pa...	534	542	122145619	16	Oct4-Sox2	Oct4/m0_p0	526	122145603	10.2722	high	0.6902	-	Oct4	False	533	-	3	chr3	122145603	122145619	False
4	chr2	52072742	5	Oct4	52071742	.	NaN	NaN	1.4742	NaN	1.4742	Oct4	1.4742	high	0.9890	NaN	NaN	0.5071	NaN	0.5071	Oct4	0.5071	high	0.6851	Oct4/metacluster_0/pa...	516	524	52072266	16	Oct4-Sox2	Oct4/m0_p0	508	52072250	9.2183	medium	0.4731	+	Oct4	False	517	+	4	chr2	52072250	52072266	False

	example_chrom	example_end	example_idx	example_interval_from_task	example_start	example_strand	imp/Klf4	imp/Nanog	imp/Oct4	imp/Sox2	imp_max	imp_max_task	imp_weighted	imp_weighted_cat	imp_weighted_p	match/Klf4	match/Nanog	match/Oct4	match/Sox2	match_max	match_max_task	match_weighted	match_weighted_cat	match_weighted_p	pattern	pattern_center	pattern_end	pattern_end_abs	pattern_len	pattern_name	pattern_short	pattern_start	pattern_start_abs	seq_match	seq_match_cat	seq_match_p	strand	tf	is_te	pattern_center_aln	pattern_strand_aln
0	chr3	122146077	1	Oct4	122145077	.	NaN	NaN	0.9374	NaN	0.9374	Oct4	0.9374	high	0.8462	NaN	NaN	0.5077	NaN	0.5077	Oct4	0.5077	high	0.6886	Oct4/metacluster_0/pa...	430	438	122145515	16	Oct4-Sox2	Oct4/m0_p0	422	122145499	10.2722	high	0.6902	-	Oct4	False	428	-
1	chr3	122146077	1	Oct4	122145077	.	NaN	NaN	1.5355	NaN	1.5355	Oct4	1.5355	high	0.9920	NaN	NaN	0.5287	NaN	0.5287	Oct4	0.5287	high	0.7992	Oct4/metacluster_0/pa...	451	459	122145536	16	Oct4-Sox2	Oct4/m0_p0	443	122145520	11.5528	high	0.9044	-	Oct4	False	449	-
2	chr3	122146077	1	Oct4	122145077	.	NaN	NaN	1.2758	NaN	1.2758	Oct4	1.2758	high	0.9670	NaN	NaN	0.4219	NaN	0.4219	Oct4	0.4219	low	0.2234	Oct4/metacluster_0/pa...	492	500	122145577	16	Oct4-Sox2	Oct4/m0_p0	484	122145561	10.7122	high	0.7777	-	Oct4	False	490	-
3	chr3	122146077	1	Oct4	122145077	.	NaN	NaN	1.0455	NaN	1.0455	Oct4	1.0455	high	0.9050	NaN	NaN	0.4155	NaN	0.4155	Oct4	0.4155	low	0.2005	Oct4/metacluster_0/pa...	534	542	122145619	16	Oct4-Sox2	Oct4/m0_p0	526	122145603	10.2722	high	0.6902	-	Oct4	False	532	-
4	chr2	52072742	5	Oct4	52071742	.	NaN	NaN	1.4742	NaN	1.4742	Oct4	1.4742	high	0.9890	NaN	NaN	0.5071	NaN	0.5071	Oct4	0.5071	high	0.6851	Oct4/metacluster_0/pa...	516	524	52072266	16	Oct4-Sox2	Oct4/m0_p0	508	52072250	9.2183	medium	0.4731	+	Oct4	False	518	+

	task	pattern	imp_score	is_te	length	IC	n_pwm_gw	n_pwm_peaks	n_cwm_peaks	n_seqlets	n_cwm_and_pwm_peaks	frac_cwm_in_pwm_peaks	frac_pwm_in_cwm_peaks	is_short_motif_>100 seqlets_and_ic<30_and_ic>4	Unnamed: 14	Unnamed: 15
0	Oct4	metacluster_0/pattern_9	profile/wn	True	68.0	52.2238	76989.0	1446.0	914.0	355.0	309.0	0.3381	0.2137	FALSE	NaN	NaN
1	Oct4	metacluster_0/pattern_13	profile/wn	True	70.0	62.8465	94727.0	1179.0	138.0	138.0	97.0	0.7029	0.0823	FALSE	NaN	NaN
2	Oct4	metacluster_0/pattern_14	profile/wn	True	70.0	108.1187	90990.0	1538.0	541.0	125.0	373.0	0.6895	0.2425	FALSE	NaN	NaN
3	Oct4	metacluster_0/pattern_15	profile/wn	True	69.0	63.6217	93066.0	1205.0	145.0	142.0	98.0	0.6759	0.0813	FALSE	NaN	NaN
4	Sox2	metacluster_0/pattern_6	profile/wn	True	60.0	34.9974	64904.0	550.0	1731.0	104.0	217.0	0.1254	0.3945	FALSE	NaN	NaN

	example_chrom_x	example_end_x	example_idx	example_interval_from_task_x	example_start_x	example_strand_x	imp/Klf4_x	imp/Nanog_x	imp/Oct4_x	imp/Sox2_x	imp_max_x	imp_max_task_x	imp_weighted_x	imp_weighted_cat_x	imp_weighted_p_x	match/Klf4_x	match/Nanog_x	match/Oct4_x	match/Sox2_x	match_max_x	match_max_task_x	match_weighted_x	match_weighted_cat_x	match_weighted_p_x	pattern_x	pattern_center_x	pattern_end_x	pattern_end_abs_x	pattern_len_x	pattern_name_x	pattern_short_x	pattern_start_x	pattern_start_abs_x	seq_match_x	seq_match_cat_x	seq_match_p_x	strand_x	tf_x	pattern_center_aln_x	pattern_strand_aln_x	row_idx_x	id_x	Klf4/profile_match_x	Klf4/profile_match_p_x	Klf4/profile_counts_x	Klf4/profile_counts_p_x	Klf4/profile_max_x	Klf4/profile_max_p_x	Klf4/profile_counts_max_ref_x	Klf4/profile_counts_max_ref_p_x	Nanog/profile_match_x	Nanog/profile_match_p_x	Nanog/profile_counts_x	Nanog/profile_counts_p_x	Nanog/profile_max_x	Nanog/profile_max_p_x	Nanog/profile_counts_max_ref_x	Nanog/profile_counts_max_ref_p_x	Oct4/profile_match_x	Oct4/profile_match_p_x	Oct4/profile_counts_x	Oct4/profile_counts_p_x	Oct4/profile_max_x	Oct4/profile_max_p_x	Oct4/profile_counts_max_ref_x	Oct4/profile_counts_max_ref_p_x	Sox2/profile_match_x	Sox2/profile_match_p_x	Sox2/profile_counts_x	Sox2/profile_counts_p_x	Sox2/profile_max_x	Sox2/profile_max_p_x	Sox2/profile_counts_max_ref_x	Sox2/profile_counts_max_ref_p_x	example_chrom_y	example_end_y	example_interval_from_task_y	example_start_y	example_strand_y	imp/Klf4_y	imp/Nanog_y	imp/Oct4_y	imp/Sox2_y	imp_max_y	imp_max_task_y	imp_weighted_y	imp_weighted_cat_y	imp_weighted_p_y	match/Klf4_y	match/Nanog_y	match/Oct4_y	match/Sox2_y	match_max_y	match_max_task_y	match_weighted_y	match_weighted_cat_y	match_weighted_p_y	pattern_y	pattern_center_y	pattern_end_y	pattern_end_abs_y	pattern_len_y	pattern_name_y	pattern_short_y	pattern_start_y	pattern_start_abs_y	seq_match_y	seq_match_cat_y	seq_match_p_y	strand_y	tf_y	pattern_center_aln_y	pattern_strand_aln_y	row_idx_y	id_y	Klf4/profile_match_y	Klf4/profile_match_p_y	Klf4/profile_counts_y	Klf4/profile_counts_p_y	Klf4/profile_max_y	Klf4/profile_max_p_y	Klf4/profile_counts_max_ref_y	Klf4/profile_counts_max_ref_p_y	Nanog/profile_match_y	Nanog/profile_match_p_y	Nanog/profile_counts_y	Nanog/profile_counts_p_y	Nanog/profile_max_y	Nanog/profile_max_p_y	Nanog/profile_counts_max_ref_y	Nanog/profile_counts_max_ref_p_y	Oct4/profile_match_y	Oct4/profile_match_p_y	Oct4/profile_counts_y	Oct4/profile_counts_p_y	Oct4/profile_max_y	Oct4/profile_max_p_y	Oct4/profile_counts_max_ref_y	Oct4/profile_counts_max_ref_p_y	Sox2/profile_match_y	Sox2/profile_match_p_y	Sox2/profile_counts_y	Sox2/profile_counts_p_y	Sox2/profile_max_y	Sox2/profile_max_p_y	Sox2/profile_counts_max_ref_y	Sox2/profile_counts_max_ref_p_y	center_diff	center_diff_aln	strand_combination	motif_pair	Nanog/profile_counts_max_ref_x_log	Nanog/profile_counts_max_ref_y_log	Sox2/profile_counts_max_ref_x_log
1	chr3	1.2215e+08	1	Oct4	1.2215e+08	.	NaN	NaN	0.9374	NaN	0.9374	Oct4	0.9374	high	0.8462	NaN	NaN	0.5077	NaN	0.5077	Oct4	0.5077	high	0.6886	Oct4/metacluster_0/pa...	430.0	438.0	1.2215e+08	16.0	Oct4-Sox2	Oct4/m0_p0	422.0	1.2215e+08	10.2722	high	0.6902	-	Oct4	429.0	-	0.0	0.0	5.3935	0.4228	44.0001	0.6462	4.0	0.8385	2.0000e-06	0.7681	3.1092	0.3125	2726.0	0.9965	281.0	0.9993	48.0	0.9884	1.5704	0.0673	2310.0	0.9998	93.0	0.9994	60.0	0.9934	2.4112	0.0222	443.0001	0.9982	22.0	0.9934	9.0000e+00	0.9520	chr3	1.2215e+08	Oct4	1.2215e+08	.	NaN	NaN	1.5355	NaN	1.5355	Oct4	1.5355	high	0.992	NaN	NaN	0.5287	NaN	0.5287	Oct4	0.5287	high	0.7992	Oct4/metacluster_0/pa...	451.0	459.0	1.2215e+08	16.0	Oct4-Sox2	Oct4/m0_p0	443.0	1.2215e+08	11.5528	high	0.9044	-	Oct4	450.0	-	1.0	1.0	4.9558	0.3114	59.0001	0.7495	4.0	0.8385	2.0000e-06	0.7681	3.8785	0.4346	3366.0	0.9981	281.0	0.9993	1.0	0.5260	3.5481	0.3387	2339.0	0.9999	104.0	0.9996	1.0	0.3723	4.3891	0.1790	537.0001	0.9988	27.0	0.9960	2.0000e-06	0.6075	21.0	21.0	++	Oct4-Sox2<>Oct4-Sox2	1.6902	0.3010	1.0000e+00
2	chr3	1.2215e+08	1	Oct4	1.2215e+08	.	NaN	NaN	0.9374	NaN	0.9374	Oct4	0.9374	high	0.8462	NaN	NaN	0.5077	NaN	0.5077	Oct4	0.5077	high	0.6886	Oct4/metacluster_0/pa...	430.0	438.0	1.2215e+08	16.0	Oct4-Sox2	Oct4/m0_p0	422.0	1.2215e+08	10.2722	high	0.6902	-	Oct4	429.0	-	0.0	0.0	5.3935	0.4228	44.0001	0.6462	4.0	0.8385	2.0000e-06	0.7681	3.1092	0.3125	2726.0	0.9965	281.0	0.9993	48.0	0.9884	1.5704	0.0673	2310.0	0.9998	93.0	0.9994	60.0	0.9934	2.4112	0.0222	443.0001	0.9982	22.0	0.9934	9.0000e+00	0.9520	chr3	1.2215e+08	Oct4	1.2215e+08	.	NaN	NaN	1.2758	NaN	1.2758	Oct4	1.2758	high	0.967	NaN	NaN	0.4219	NaN	0.4219	Oct4	0.4219	low	0.2234	Oct4/metacluster_0/pa...	492.0	500.0	1.2215e+08	16.0	Oct4-Sox2	Oct4/m0_p0	484.0	1.2215e+08	10.7122	high	0.7777	-	Oct4	491.0	-	2.0	2.0	4.5805	0.2406	66.0001	0.7776	4.0	0.8385	2.0000e-06	0.7681	1.8579	0.1491	4406.0	0.9993	281.0	0.9993	56.0	0.9910	1.4478	0.0585	2964.0	1.0000	104.0	0.9996	166.0	0.9998	1.6449	0.0099	866.0001	0.9998	29.0	0.9968	3.3000e+01	0.9967	62.0	62.0	++	Oct4-Sox2<>Oct4-Sox2	1.6902	1.7559	1.0000e+00
3	chr3	1.2215e+08	1	Oct4	1.2215e+08	.	NaN	NaN	0.9374	NaN	0.9374	Oct4	0.9374	high	0.8462	NaN	NaN	0.5077	NaN	0.5077	Oct4	0.5077	high	0.6886	Oct4/metacluster_0/pa...	430.0	438.0	1.2215e+08	16.0	Oct4-Sox2	Oct4/m0_p0	422.0	1.2215e+08	10.2722	high	0.6902	-	Oct4	429.0	-	0.0	0.0	5.3935	0.4228	44.0001	0.6462	4.0	0.8385	2.0000e-06	0.7681	3.1092	0.3125	2726.0	0.9965	281.0	0.9993	48.0	0.9884	1.5704	0.0673	2310.0	0.9998	93.0	0.9994	60.0	0.9934	2.4112	0.0222	443.0001	0.9982	22.0	0.9934	9.0000e+00	0.9520	chr3	1.2215e+08	Oct4	1.2215e+08	.	NaN	NaN	1.0455	NaN	1.0455	Oct4	1.0455	high	0.905	NaN	NaN	0.4155	NaN	0.4155	Oct4	0.4155	low	0.2005	Oct4/metacluster_0/pa...	534.0	542.0	1.2215e+08	16.0	Oct4-Sox2	Oct4/m0_p0	526.0	1.2215e+08	10.2722	high	0.6902	-	Oct4	533.0	-	3.0	3.0	3.8271	0.1487	98.0001	0.8662	4.0	0.8385	4.0000e+00	0.9512	0.3850	0.0107	5699.0	0.9995	248.0	0.9989	91.0	0.9964	0.2652	0.0049	3446.0	1.0000	98.0	0.9995	132.0	0.9994	0.5203	0.0060	1039.0000	1.0000	33.0	0.9979	2.4000e+01	0.9919	104.0	104.0	++	Oct4-Sox2<>Oct4-Sox2	1.6902	1.9638	1.0000e+00
6	chr3	1.2215e+08	1	Oct4	1.2215e+08	.	NaN	NaN	1.5355	NaN	1.5355	Oct4	1.5355	high	0.9920	NaN	NaN	0.5287	NaN	0.5287	Oct4	0.5287	high	0.7992	Oct4/metacluster_0/pa...	451.0	459.0	1.2215e+08	16.0	Oct4-Sox2	Oct4/m0_p0	443.0	1.2215e+08	11.5528	high	0.9044	-	Oct4	450.0	-	1.0	1.0	4.9558	0.3114	59.0001	0.7495	4.0	0.8385	2.0000e-06	0.7681	3.8785	0.4346	3366.0	0.9981	281.0	0.9993	1.0	0.5260	3.5481	0.3387	2339.0	0.9999	104.0	0.9996	1.0	0.3723	4.3891	0.1790	537.0001	0.9988	27.0	0.9960	2.0000e-06	0.6075	chr3	1.2215e+08	Oct4	1.2215e+08	.	NaN	NaN	1.2758	NaN	1.2758	Oct4	1.2758	high	0.967	NaN	NaN	0.4219	NaN	0.4219	Oct4	0.4219	low	0.2234	Oct4/metacluster_0/pa...	492.0	500.0	1.2215e+08	16.0	Oct4-Sox2	Oct4/m0_p0	484.0	1.2215e+08	10.7122	high	0.7777	-	Oct4	491.0	-	2.0	2.0	4.5805	0.2406	66.0001	0.7776	4.0	0.8385	2.0000e-06	0.7681	1.8579	0.1491	4406.0	0.9993	281.0	0.9993	56.0	0.9910	1.4478	0.0585	2964.0	1.0000	104.0	0.9996	166.0	0.9998	1.6449	0.0099	866.0001	0.9998	29.0	0.9968	3.3000e+01	0.9967	41.0	41.0	++	Oct4-Sox2<>Oct4-Sox2	0.3010	1.7559	8.6859e-07
7	chr3	1.2215e+08	1	Oct4	1.2215e+08	.	NaN	NaN	1.5355	NaN	1.5355	Oct4	1.5355	high	0.9920	NaN	NaN	0.5287	NaN	0.5287	Oct4	0.5287	high	0.7992	Oct4/metacluster_0/pa...	451.0	459.0	1.2215e+08	16.0	Oct4-Sox2	Oct4/m0_p0	443.0	1.2215e+08	11.5528	high	0.9044	-	Oct4	450.0	-	1.0	1.0	4.9558	0.3114	59.0001	0.7495	4.0	0.8385	2.0000e-06	0.7681	3.8785	0.4346	3366.0	0.9981	281.0	0.9993	1.0	0.5260	3.5481	0.3387	2339.0	0.9999	104.0	0.9996	1.0	0.3723	4.3891	0.1790	537.0001	0.9988	27.0	0.9960	2.0000e-06	0.6075	chr3	1.2215e+08	Oct4	1.2215e+08	.	NaN	NaN	1.0455	NaN	1.0455	Oct4	1.0455	high	0.905	NaN	NaN	0.4155	NaN	0.4155	Oct4	0.4155	low	0.2005	Oct4/metacluster_0/pa...	534.0	542.0	1.2215e+08	16.0	Oct4-Sox2	Oct4/m0_p0	526.0	1.2215e+08	10.2722	high	0.6902	-	Oct4	533.0	-	3.0	3.0	3.8271	0.1487	98.0001	0.8662	4.0	0.8385	4.0000e+00	0.9512	0.3850	0.0107	5699.0	0.9995	248.0	0.9989	91.0	0.9964	0.2652	0.0049	3446.0	1.0000	98.0	0.9995	132.0	0.9994	0.5203	0.0060	1039.0000	1.0000	33.0	0.9979	2.4000e+01	0.9919	83.0	83.0	++	Oct4-Sox2<>Oct4-Sox2	0.3010	1.9638	8.6859e-07

	pattern_name	example_idx	0
0	B-Box	2	2
1	B-Box	4	2
2	B-Box	7	4
...	...	...	...
311598	Zic3	147928	1
311599	Zic3	147950	1
311600	Zic3	147966	1