Goal

  • analyze instance locations

Conclusions / questions

  • does Klf4 compete with other factors for the site?
    • there are many sites where the model suggests that if Klf4 binds, binding of all other factors is depleted
  • Oct4-Sox2 is relatively infrequent in the dataset. Is this
  • 3/2, 4/1 -> Nanog motif in 2 different contexts: one occurring in the middle of the region, the other on the side
  • m1/p0 and m1/p1 are frequently together
  • instance location assignment could be improved
  • how to distinguish co-binding vs. nearby binding?
    • idea: have a look at the specific profile

Ideas

Shall we just run modisco for each peak-set separately?

  • conditioned on which factor we ChIP-ed, what are the other factors in the vicinity doing?

Improvements

  • Aggregation plot
    • plot negative strand as -
    • always start at 0
    • ylim = [0, max(abs(pos), abs(neg))]
  • calculate entropy for each region and the final aggregated distribution

    • show a histogram
    • or a scatterplot with the total counts in the particular region or in the total region
  • threshold all the computations w.r.t. the total importance score in the region

  • [ ] compare the PWM, and the aggregated profile across all pattern locations (not just those identified by clustering with modisco)

    • can you improve the results by filtering either by seqlet_score or percnormed_score?
  • [ ] scatterplot the spread or bi-modality of the distribution within the sequence vs. seqlet_score

    • are high seqlet scores mostly in the center?
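The per-region entropy idea from the improvements list above can be sketched as follows. This is a minimal sketch; the function name `positional_entropy` and the choice of bits as the unit are mine, not from the notebook.

```python
import numpy as np

def positional_entropy(counts):
    """Entropy (in bits) of a positional importance/count distribution.

    Low entropy -> importance concentrated at a few positions;
    high entropy -> importance spread across the whole region.
    """
    counts = np.asarray(counts, dtype=float)
    p = counts / counts.sum()
    p = p[p > 0]  # treat 0 * log(0) as 0
    return float(-(p * np.log2(p)).sum())
```

The resulting per-region values could then go into the histogram, or into a scatterplot against the total counts in the region, as proposed above.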
In [4]:
from tqdm import tqdm

from basepair.config import get_data_dir
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from basepair.modisco.results import ModiscoResult
from basepair.modisco.score import append_pattern_loc
from basepair.cli.schemas import DataSpec
from basepair.utils import read_pkl

from kipoi.readers import HDF5Reader

Load the data

Modisco

In [5]:
ddir = Path(get_data_dir())
model_dir = ddir / "processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/"
modisco_dir = model_dir / "modisco/valid"

!ls {model_dir}

!ls {modisco_dir}  # modisco dir
Intervene_results  figures	   grad.valid.h5  modisco
clustering	   grad.all.h5	   history.csv	  preds.all.h5
cometml.json	   grad.test.2.h5  hparams.yaml   results.html
dataspec.yaml	   grad.test.h5    model.h5	  results.ipynb
all.instances.tsv	 kwargs.json  strand_distances.h5
all.instances_bed	 modisco.h5   test.instances.tsv
all.instances_bed.bak	 modisco.log  test.seqlets.pkl
all.seqlets.pkl		 new-hparams  weighted,count
head.test.instances.tsv  plots
hyper-param-search	 plots.bak

TODO: fix the filtering bug

In [6]:
# Modisco
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
tasks = mr.tasks()
In [7]:
ds = DataSpec.load(model_dir / "dataspec.yaml")
In [8]:
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()

id_hash = pd.DataFrame({"peak_id": d.f['/metadata/interval_from_task'][:], 
                        "example_idx": np.arange(d.f['/metadata/interval_from_task'].shape[0])})

Labelled instances

In [9]:
# load the instances data frame
df = pd.read_csv(modisco_dir / "all.instances.tsv", sep='\t')
df['pattern_id'] = "metacluster_" + df.metacluster.astype(str) + "/pattern_" + df.pattern.astype(str)
df['seqlet_center'] = (df.seqlet_start + df.seqlet_end) / 2
patterns = df.pattern_id.unique().tolist()
pattern_pssms = {pattern: mr.get_pssm(*pattern.split("/")) for pattern in patterns}
append_pattern_loc(df, pattern_pssms, trim_frac=0.08)
df['pattern_center'] = (df.pattern_start + df.pattern_end) / 2
df = df.merge(id_hash, on="example_idx")
In [10]:
# Use thresholded values from now on
df_unfiltered = df
df = df[df.percnormed_score > .2]
In [178]:
ls {outdir}
Klf4.npy  Nanog.npy  Oct4.npy  Sox2.npy
In [101]:
seqlets = read_pkl(modisco_dir / "all.seqlets.pkl")
In [102]:
sl = seqlets[638799]
In [ ]:
assert len(df) == len(seqlets)

# total number of mapped seqlets
len(seqlets)

Grads and inputs

In [11]:
n_examples = d.f['/metadata/range/chr'].shape[0]
In [12]:
# total number of peaks / examples
n_examples
Out[12]:
98428
In [92]:
# sanity check the raw peak file
for task, ts in ds.task_specs.items():
    n_peaks = !zcat {ts.peaks} | wc -l
    print(f"{task}: {n_peaks[0]}")
Oct4: 21841
Sox2: 9396
Nanog: 18017
Klf4: 49174
In [ ]:
df.head()
In [ ]:
# almost every peak has a seqlet
len(df.example_id.unique()) / n_examples

Write the interval_from_task vectors

In [175]:
d.f['/metadata/interval_from_task'][:4]
Out[175]:
array(['Oct4', 'Oct4', 'Oct4', 'Oct4'], dtype=object)
In [177]:
outdir = model_dir / "clustering/peak-set/"
outdir.mkdir(parents=True, exist_ok=True)

metadata_task = d.f['/metadata/interval_from_task'][:]

for task in tasks:
    mask = metadata_task == task
    np.save(outdir / f"{task}.npy", mask)
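The saved per-task masks can later be turned back into example indices. A minimal sketch with illustrative task labels standing in for the real `d.f['/metadata/interval_from_task'][:]` array:

```python
import numpy as np

# illustrative per-example task labels (not the real metadata)
metadata_task = np.array(["Oct4", "Oct4", "Sox2", "Klf4"], dtype=object)

mask = metadata_task == "Oct4"    # the boolean vector saved to Oct4.npy
oct4_idx = np.flatnonzero(mask)   # example indices in the Oct4 peak set
```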

Stats

Number of seqlets per example

In [21]:
from plotnine import *
import plotnine
In [22]:
plotnine.options.figure_size = (10, 1)
ggplot(aes(x="size"), df.groupby("example_id").pattern.agg(['size']).reset_index()) + \
  geom_bar() + theme_bw() + \
  xlab("Number of seqlets per example")
Out[22]:
<ggplot: (-9223363286210021102)>

Pattern frequency

In [23]:
# 30 most frequent patterns
fig = plt.figure(figsize=(3, 8))
df.pattern_id.value_counts().head(30).iloc[::-1].plot(kind='barh');
In [24]:
len(df.pattern_id.value_counts())
Out[24]:
70

Functions

In [14]:
from basepair.plot.vdom import fig2vdom
from vdom.helpers import *

def plot_n_per_example_ax(dfs, ax):
    dfs.groupby("example_id").size().value_counts().plot(kind="barh", ax=ax)
    ax.set_title("# per example")

def plot_col_hist_ax(dfs, ax, col='score', xlim=None):
    dfs[col].plot.hist(bins=100, ax=ax)
    #ax.set_xlabel("# per example")
    if xlim is not None:
        ax.set_xlim(xlim)
    ax.set_title(col)

def plot_all_dist(df, pattern_id, figsize=(12,2), add_title="", **kwargs):
    """Merge all plots into a single one 
    """
    dfs = df[df.pattern_id == pattern_id]
    fig, ax = plt.subplots(1, 5, figsize=figsize)
    plot_col_hist_ax(dfs, ax[0], "seqlet_center", xlim=[0, 1000])
    ax[0].set_title(f"seqlet_center {add_title}")
    plot_n_per_example_ax(dfs, ax[1])
    plot_col_hist_ax(dfs, ax[2], "percnormed_score")
    plot_col_hist_ax(dfs, ax[3], "score")
    plot_col_hist_ax(dfs, ax[4], "seqlet_score")
    plt.tight_layout()
    return fig2vdom(fig, **kwargs)

import matplotlib.pyplot as plt
from concise.utils.plot import seqlogo_fig, seqlogo
from basepair.math import mean
from basepair.plots import draw_box

# helper functions for plotting the results
def get_scores(d, example_idx, task, grad_type="weighted"):
    return mean([d.f[f'/grads/{task}/{grad_type}/0'][example_idx],
                 d.f[f'/grads/{task}/{grad_type}/1'][example_idx]])

def get_sequence(d, example_idx):
    return d.f['/inputs'][example_idx]

def relevant_patterns(df, example_idx, xlim):
    patterns = []
    for i,row in df[df.example_idx == example_idx].sort_values("pattern_start").iterrows():
        xmin = row.pattern_start - xlim[0] + 0.5
        xmax = row.pattern_end - xlim[0]+ 0.5
        if xmin < 0:
            continue
        if xmax > xlim[1] - xlim[0]:
            continue
        patterns.append((row.pattern_id, row.revcomp))
    return patterns

def draw_patterns(df, example_idx, xlim, ax, add_label=True):
    for i,row in df[df.example_idx == example_idx].iterrows():
        xmin = row.pattern_start - xlim[0] + 0.5
        xmax = row.pattern_end - xlim[0]+ 0.5
        if xmin < 0:
            continue
        if xmax > xlim[1] - xlim[0]:
            continue
        draw_box(xmin, xmax, ax, 'g')  # trimmed pattern location
        if add_label:
            y = ax.get_ylim()[1] + (ax.get_ylim()[1] - ax.get_ylim()[0]) * 0.1
            if row.revcomp:
                prefix = "-"
            else:
                prefix = ""
            ax.text(row.pattern_start - xlim[0] + 0.5, y, 
                    s=prefix + f"mc_{row.metacluster}/p_{row.pattern}")

def plot_region(df, d, tasks, example_idx, xlim=[400,600]):
    fig, axes = plt.subplots(len(tasks), 1, figsize=(20, 2* len(tasks)), sharex=True)
    seq = get_sequence(d, example_idx)
    for i, task in enumerate(tasks):
        ax = axes[i]
        seqlogo((get_scores(d, example_idx, task) * seq)[xlim[0]:xlim[1]], ax=ax)
        draw_patterns(df, example_idx, xlim, ax, add_label=i==0)
        ax.set_ylabel(task)
        ax.set_xticks(list(range(0, xlim[1] - xlim[0] + 1, 5)));    

def rc_prefix(rc):
    if rc:
        return "-"
    else:
        return ""
        
def vdom_patterns(mr, patterns_rc_vec, trim_frac=0.08):
    """Get vdom patterns in a single row
    
    Args:
      mr: ModiscoResults object
      patterns_rc_vec: list of tuples (pattern_id, rc). Example: ("metacluster_0/pattern_1", True)
      trim_frac: how much to trim the pssm
    """
    return p([fig2vdom(mr.plot_pssm(*pattern_idx.split("/"), rc=rc, trim_frac=trim_frac, title=rc_prefix(rc) + pattern_idx, 
                                    letter_width=0.35, height=1.7), height=75)
              for pattern_idx, rc in patterns_rc_vec])

def dist_pairwise_hist(df, a, b, ax=None):
    if a != b:
        dfc_ab = df[df.pattern_id.isin([a,b])]
        counts = dfc_ab.groupby("example_idx").pattern_id.value_counts().unstack(fill_value=0)
        relevant_idx = counts.index[(counts==1).all(axis=1)]
        distances = dfc_ab[dfc_ab.example_idx.isin(relevant_idx)].set_index(['example_idx', 'pattern_id']).pattern_center.unstack()
        diff = distances[a] - distances[b]
        title = f"{a} - {b}"
    if a == b:
        examples = df[df.pattern_id == a].groupby("example_idx").filter(lambda x: len(x) == 2)
        diff = examples.groupby("example_idx").apply(lambda x: x.pattern_center.iloc[1] - x.pattern_center.iloc[0])
        title = f"{a}"
    if ax is None:
        fig,ax = plt.subplots(figsize=(15,2))
    ax.hist(diff, bins=100);
    ax.set_title(title);
In [60]:
df.head()
Out[60]:
Unnamed: 0 example_idx seqlet_start seqlet_end seqlet_is_revcomp seqlet_score metacluster pattern percnormed_score score ... example_chr example_end example_id example_start example_strand pattern_id seqlet_center pattern_start pattern_end pattern_center
0 0 0 516 557 False 1.899616 0 2 0.323415 0.178182 ... chrX 143483544 0 143482544 * metacluster_0/pattern_2 536.5 522 531 526.5
1 1 0 476 517 False 1.073116 0 1 0.327721 0.254454 ... chrX 143483544 0 143482544 * metacluster_0/pattern_1 496.5 494 502 498.0
4 4 1 477 518 False 1.489748 0 0 0.211156 0.249695 ... chr3 122146063 1 122145063 * metacluster_0/pattern_0 497.5 477 491 484.0
5 5 1 507 548 False 1.381093 0 0 0.781497 0.335416 ... chr3 122146063 1 122145063 * metacluster_0/pattern_0 527.5 519 533 526.0
10 10 2 569 610 False 0.461497 0 2 0.676585 0.220768 ... chr17 35504551 2 35503551 * metacluster_0/pattern_2 589.5 581 590 585.5

5 rows × 22 columns

The plots below show the following:

  • positional distribution within the 1kb window
  • number of seqlets per example
  • PSSM
In [96]:
div(*[div(span(str(i), ". ", row['index'] + f" (n={row.pattern_id})", 
            fig2vdom(mr.plot_pssm(*row['index'].split("/"), height=0.5, letter_width=0.15, trim_frac=0.08))),
            br(),
            *[plot_all_dist(df[df.peak_id == task], row['index'], add_title=task,
                            figsize=(16,1.8), height=100) for task in tasks],
           )
      for i, row in df.pattern_id.value_counts().reset_index().iterrows()])
Out[96]:
0. metacluster_1/pattern_0 (n=59068)

1. metacluster_2/pattern_0 (n=52170)

2. metacluster_0/pattern_0 (n=40564)

3. metacluster_0/pattern_1 (n=35094)

4. metacluster_2/pattern_3 (n=31215)

5. metacluster_0/pattern_2 (n=25087)

6. metacluster_1/pattern_1 (n=20328)

7. metacluster_3/pattern_2 (n=19276)

8. metacluster_4/pattern_1 (n=16044)

9. metacluster_1/pattern_2 (n=15511)

10. metacluster_3/pattern_6 (n=13137)

11. metacluster_4/pattern_0 (n=12320)

12. metacluster_4/pattern_2 (n=11480)

13. metacluster_3/pattern_1 (n=10655)

14. metacluster_4/pattern_3 (n=10154)

15. metacluster_7/pattern_0 (n=9145)

16. metacluster_6/pattern_4 (n=6858)

17. metacluster_1/pattern_4 (n=6775)

18. metacluster_10/pattern_0 (n=6037)

19. metacluster_1/pattern_5 (n=5770)

20. metacluster_3/pattern_0 (n=4978)

21. metacluster_3/pattern_5 (n=4455)

22. metacluster_28/pattern_0 (n=4423)

23. metacluster_0/pattern_4 (n=4301)

24. metacluster_0/pattern_3 (n=4243)

25. metacluster_15/pattern_0 (n=3882)

26. metacluster_0/pattern_8 (n=3825)

27. metacluster_21/pattern_0 (n=3779)

28. metacluster_6/pattern_2 (n=3603)

29. metacluster_2/pattern_1 (n=3526)

30. metacluster_2/pattern_4 (n=3384)

31. metacluster_3/pattern_7 (n=3371)

32. metacluster_22/pattern_0 (n=3371)

33. metacluster_31/pattern_0 (n=2979)

34. metacluster_6/pattern_1 (n=2965)

35. metacluster_7/pattern_2 (n=2949)

36. metacluster_1/pattern_3 (n=2770)

37. metacluster_7/pattern_1 (n=2767)

38. metacluster_0/pattern_6 (n=2755)

39. metacluster_0/pattern_5 (n=2597)

40. metacluster_9/pattern_1 (n=2592)

41. metacluster_7/pattern_3 (n=2567)

42. metacluster_3/pattern_4 (n=2468)

43. metacluster_9/pattern_0 (n=2197)

44. metacluster_9/pattern_2 (n=1940)

45. metacluster_25/pattern_0 (n=1922)

46. metacluster_6/pattern_3 (n=1833)

47. metacluster_0/pattern_9 (n=1778)

48. metacluster_7/pattern_4 (n=1673)

49. metacluster_18/pattern_0 (n=1631)

50. metacluster_13/pattern_0 (n=1629)

51. metacluster_10/pattern_1 (n=1527)

52. metacluster_0/pattern_11 (n=1519)

53. metacluster_6/pattern_6 (n=1511)

54. metacluster_6/pattern_0 (n=1477)

55. metacluster_0/pattern_10 (n=1369)

56. metacluster_3/pattern_3 (n=1108)

57. metacluster_2/pattern_2 (n=1062)

58. metacluster_1/pattern_6 (n=974)

59. metacluster_10/pattern_2 (n=886)

60. metacluster_23/pattern_1 (n=857)

61. metacluster_9/pattern_3 (n=791)

62. metacluster_18/pattern_2 (n=789)

63. metacluster_10/pattern_4 (n=688)

64. metacluster_18/pattern_1 (n=570)

65. metacluster_6/pattern_5 (n=438)

66. metacluster_23/pattern_0 (n=431)

67. metacluster_10/pattern_3 (n=423)

68. metacluster_9/pattern_4 (n=381)

69. metacluster_0/pattern_7 (n=239)

Analysis

Co-occurrence stats

Compute the co-occurrence stats.

Filter

In [15]:
from upsetplot import plot as upsetplt
In [16]:
all_patterns = mr.patterns()
In [17]:
# restrict to the central 500bp
df_center = df[(df.pattern_center > 250) & (df.pattern_center < 750)]
In [17]:
len(df)
Out[17]:
516881
In [18]:
len(df_center)
Out[18]:
443810
In [19]:
dfp = pd.DataFrame.from_dict([dict(pd.Series(all_patterns, 
                                             index=all_patterns).
                                   isin(group.pattern_id.unique()))
                               for name, group in tqdm(df_center.groupby("example_id"))])
100%|██████████| 96822/96822 [02:05<00:00, 768.82it/s]
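The per-example membership matrix built above with an explicit groupby loop could also be computed without the Python loop, e.g. via `pd.crosstab`. A sketch on toy values (the instance table and pattern names here are illustrative, not the real data):

```python
import pandas as pd

# toy instance table standing in for df_center
df_center = pd.DataFrame({
    "example_id": [0, 0, 1, 2, 2],
    "pattern_id": ["m0/p0", "m0/p1", "m0/p0", "m0/p1", "m0/p1"],
})
all_patterns = ["m0/p0", "m0/p1", "m1/p0"]  # full pattern list, as from mr.patterns()

# boolean presence/absence matrix: one row per example, one column per pattern;
# reindex keeps patterns that never occur as all-False columns
dfp = (pd.crosstab(df_center.example_id, df_center.pattern_id)
         .reindex(columns=all_patterns, fill_value=0) > 0)
```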
In [97]:
# make the upsetplot
data_dict = dfp.groupby(all_patterns).size()
dfc = data_dict.reset_index().rename(columns={0:"n"})
dfcp = dfc.sort_values("n", ascending=False).head(40)  # take only 40 most frequent combinations
col_incl = dfcp.iloc[:, :-1].max()
dfcp_plot = dfcp.iloc[:, col_incl.tolist() + [True]]
dfcp_plot = dfcp_plot.set_index(list(dfcp_plot.columns[:-1]))

Plot

In [36]:
r = upsetplt(dfcp_plot.n)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/upsetplot/plotting.py:24: PerformanceWarning: indexing past lexsort depth may impact performance.
  totals.append(data.loc[idxslice].sum())
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [98]:
# note interesting pairs to plot pairwise distances
pairs = [
     ["metacluster_2/pattern_0", "metacluster_1/pattern_0"],
     ["metacluster_2/pattern_0", "metacluster_0/pattern_0"],
     ["metacluster_2/pattern_0", "metacluster_28/pattern_0"],
     ["metacluster_1/pattern_0", "metacluster_2/pattern_3"],
     ["metacluster_0/pattern_0", "metacluster_0/pattern_1"],
     ["metacluster_0/pattern_0", "metacluster_0/pattern_2"],
     ["metacluster_0/pattern_0", "metacluster_3/pattern_2"],
     ["metacluster_1/pattern_0", "metacluster_1/pattern_1"],
     ["metacluster_1/pattern_2", "metacluster_0/pattern_0"],
     ["metacluster_1/pattern_2", "metacluster_0/pattern_1"],
]
In [37]:
# Visualize motifs for the most frequent patterns from above
dfcp_plot_melt = pd.melt(dfcp_plot.reset_index().assign(idx=np.arange(len(dfcp_plot))), id_vars=["n","idx"])
patterns = dfcp_plot_melt.query('value').groupby("variable").n.sum().sort_values(ascending=False).index.tolist()

display(vdom_patterns(mr, zip(patterns, [False]*len(patterns)), trim_frac=0.08))

Pairwise distances with self

  • restrict to sites with exactly 2 instances of the same pattern and plot a pairwise distance
In [54]:
nrow = int(np.ceil(len(patterns)/2))
fig, axes = plt.subplots(nrow,2, figsize=(20, nrow * 1.5), sharex=True)
for pattern, ax in tqdm(zip(patterns, axes.flat)):
    dist_pairwise_hist(df[~df[['example_id', 'pattern_center']].duplicated()], pattern, pattern, ax)
    ax.set_xlim([-1000, 1000])
plt.tight_layout()
12it [00:43,  3.64s/it]

Pairwise distances with a pair

  • restrict to sites with exactly 1 instance for each motif and plot their pairwise distance
In [93]:
## TODO: stratify these plots by peak_id
In [95]:
nrow = int(np.ceil(len(pairs)/2))
fig, axes = plt.subplots(nrow, 2, figsize=(20, nrow * 1.5), sharex=True)
for pair, ax in tqdm(zip(pairs, axes.flat)):
    dist_pairwise_hist(df[~df[['example_id', 'pattern_center']].duplicated()], pair[0], pair[1], ax)
    ax.set_xlim([-1000, 1000])
plt.tight_layout()
10it [00:03,  3.03it/s]

Eyeballing

  • plots below this point are used to check how well the assignment of seqlets to patterns works
In [36]:
example_idx = 12
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, 12, xlim)

In [38]:
example_idx = 29934
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)

In [39]:
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)
44980
In [40]:
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)

In [41]:
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)

xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
11682

In [601]:
# This example doesn't make any sense to me
In [66]:
df[df.example_idx == 5472]
Out[66]:
Unnamed: 0 example_idx seqlet_start seqlet_end seqlet_is_revcomp seqlet_score metacluster pattern percnormed_score score ... example_id example_start example_strand pattern_id pattern_start pattern_end seqlet_center pattern_start_rel pattern_end_rel strand
13626 13626 5472 486 527 False 0.361572 0 0 0.113469 0.221166 ... 5472 21224842 * metacluster_0/pattern_0 501 515 506.5 21225343 21225357 -
162727 162727 5472 551 592 False 0.433490 1 0 0.188333 0.156902 ... 5472 21224842 * metacluster_1/pattern_0 575 583 571.5 21225417 21225425 +
162728 162728 5472 574 615 False 0.273384 1 0 0.076667 0.132894 ... 5472 21224842 * metacluster_1/pattern_0 594 602 594.5 21225436 21225444 -
318708 318708 5472 425 466 False 0.580759 2 0 0.744276 0.353541 ... 5472 21224842 * metacluster_2/pattern_0 451 459 445.5 21225293 21225301 +
527944 527944 5472 510 551 False 0.354864 4 3 0.927461 0.239710 ... 5472 21224842 * metacluster_4/pattern_3 521 529 530.5 21225363 21225371 +
638799 638799 5472 457 498 False 0.696911 9 0 0.224932 0.347820 ... 5472 21224842 * metacluster_9/pattern_0 465 480 477.5 21225307 21225322 +

6 rows × 24 columns

In [107]:
sl = seqlets[638799]
In [108]:
s = sl.track_name_to_snippet['Sox2_contrib_scores']
In [109]:
seqlogo_fig(s.fwd, figsize=(10,2));
In [42]:
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)
example_idx = 5472
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
83953

In [99]:
seqlogo_fig(s.fwd, figsize=(10,2));
In [101]:
mr.plot_pssm("metacluster_9","pattern_0");
In [43]:
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
28157

In [44]:
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)

xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
55674

In [45]:
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)

xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
47460

In [46]:
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)

xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
11844

In [47]:
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)

xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
20764

In [49]:
example_idx = df.example_idx.sample(1).iloc[0]
print(example_idx)

xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)
36709

In [30]:
df[df.example_idx == 36709].iloc[0][['example_chr', 'example_start', 'example_end']]
Out[30]:
example_chr         chr13
example_start    99015585
example_end      99016585
Name: 78128, dtype: object

Comparing models

Load old Oct-Sox BPNet

In [18]:
from basepair.BPNet import BPNetPredictor
from keras.models import load_model
from basepair.config import create_tf_session
from pybedtools import Interval
In [19]:
create_tf_session(0)
Out[19]:
<tensorflow.python.client.session.Session at 0x7fa2994c4208>
In [20]:
model = load_model("/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/exp/models/p-multi-task/seq_mutlitask_filters=32,conv1_kernel_size=21,tconv_kernel_size=25,n_dil_layers=6,use_profile=True,use_counts=True,c_task_weight=10,lr=0.004.5.h5")
preproc = read_pkl("/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/exp/models/p-multi-task/preproc.pkl")
bp = BPNetPredictor(model, 
                    "/mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta",
                    tasks = ["Sox2", "Oct4"],
                    preproc=preproc)
bws = {task: {s: f"/users/avsec/workspace/basepair-workflow/data/{task}/5prime.counts.{s}.bw" for s in ['pos', 'neg']} for task in ["Sox2", "Oct4"]}
genome_file = "/mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes"
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-09-01 13:42:13,786 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-09-01 13:42:19,181 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

Make predictions

In [23]:
df[(df.example_chr == "chr17") & (df.example_start < 35504982) & (df.example_end >35504982+201 )]
Out[23]:
Unnamed: 0 example_idx seqlet_start seqlet_end seqlet_is_revcomp seqlet_score metacluster pattern percnormed_score score ... example_end example_id example_start example_strand pattern_id seqlet_center pattern_start pattern_end pattern_center peak_id
59772 20803 8962 465 506 False 2.426015 0 2 0.235446 0.165443 ... 35505610 8962 35504610 * metacluster_0/pattern_2 485.5 470 479 474.5 Oct4
59773 20804 8962 490 531 False 1.540230 0 2 0.573092 0.206996 ... 35505610 8962 35504610 * metacluster_0/pattern_2 510.5 507 516 511.5 Oct4
59774 20805 8962 535 576 False 1.095303 0 1 0.515334 0.284714 ... 35505610 8962 35504610 * metacluster_0/pattern_1 555.5 560 568 564.0 Oct4
59777 321906 8962 634 675 False 0.203333 2 0 0.594324 0.342402 ... 35505610 8962 35504610 * metacluster_2/pattern_0 654.5 659 667 663.0 Oct4
59778 639257 8962 144 185 False 0.318200 9 2 0.592179 0.471742 ... 35505610 8962 35504610 * metacluster_9/pattern_2 164.5 158 172 165.0 Oct4
188224 62457 30961 421 462 False 2.295741 0 2 0.222510 0.163186 ... 35505654 30961 35504654 * metacluster_0/pattern_2 441.5 426 435 430.5 Sox2
188225 62458 30961 446 487 False 1.504524 0 2 0.580854 0.207997 ... 35505654 30961 35504654 * metacluster_0/pattern_2 466.5 463 472 467.5 Sox2
188226 62459 30961 491 532 False 1.204851 0 1 0.567048 0.293268 ... 35505654 30961 35504654 * metacluster_0/pattern_1 511.5 516 524 520.0 Sox2
188229 343049 30961 590 631 False 0.225735 2 0 0.595937 0.342498 ... 35505654 30961 35504654 * metacluster_2/pattern_0 610.5 615 623 619.0 Sox2
188230 641865 30961 100 141 False 0.283675 9 2 0.620112 0.478279 ... 35505654 30961 35504654 * metacluster_9/pattern_2 120.5 114 128 121.0 Sox2
192867 64532 31552 457 498 False 2.424403 0 2 0.227684 0.164677 ... 35505618 31552 35504618 * metacluster_0/pattern_2 477.5 462 471 466.5 Nanog
192868 64533 31552 482 523 False 1.548651 0 2 0.578266 0.207649 ... 35505618 31552 35504618 * metacluster_0/pattern_2 502.5 499 508 503.5 Nanog
192869 64534 31552 527 568 False 1.107850 0 1 0.527360 0.286755 ... 35505618 31552 35504618 * metacluster_0/pattern_1 547.5 552 560 556.0 Nanog
192872 343530 31552 626 667 False 0.209066 2 0 0.598194 0.342625 ... 35505618 31552 35504618 * metacluster_2/pattern_0 646.5 651 659 655.0 Nanog
192873 641975 31552 136 177 False 0.308004 9 2 0.597765 0.472062 ... 35505618 31552 35504618 * metacluster_9/pattern_2 156.5 150 164 157.0 Nanog
326291 108212 51651 452 493 False 2.413752 0 2 0.225097 0.163647 ... 35505623 51651 35504623 * metacluster_0/pattern_2 472.5 457 466 461.5 Klf4
326292 108213 51651 477 518 False 1.542795 0 2 0.569211 0.206666 ... 35505623 51651 35504623 * metacluster_0/pattern_2 497.5 494 503 498.5 Klf4
326293 108214 51651 522 563 False 1.119886 0 1 0.528563 0.286992 ... 35505623 51651 35504623 * metacluster_0/pattern_1 542.5 547 555 551.0 Klf4
326296 362582 51651 621 662 False 0.211476 2 0 0.601419 0.342887 ... 35505623 51651 35504623 * metacluster_2/pattern_0 641.5 646 654 650.0 Klf4
326297 644942 51651 131 172 False 0.304463 9 2 0.608939 0.473085 ... 35505623 51651 35504623 * metacluster_9/pattern_2 151.5 145 159 152.0 Klf4

20 rows × 23 columns

Core motif not recognized

In [27]:
# Filtered seqlets (percnormed_score > 0.2)
example_idx = 8962
xlim=[400,600]
display(vdom_patterns(mr, relevant_patterns(df, example_idx, xlim), trim_frac=0.08))
plot_region(df, d, tasks, example_idx, xlim)

In [25]:
# old bpnet prediction
bp.plot_predict_grad([Interval("chr17", 35504982, 35504982+201)], bws=bws, 
                      profile_grad='weighted',
                      figsize=(20, 10))