exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

task = "Nanog"
pattern = "metacluster_0/pattern_6"

Goal¶

get PFM instances genome wide

Tasks¶

TODO
- [x] export all patterns as MEME / FIMO motifs
  - check if you have some code ready
  - use the trimmed patterns (same as used for dfi scanning)
- [X] Do the FIMO scanning -> generates the bed files
  - do it from the notebook or write a Snakemake file?
    - fimo.smk
- [X] load the fimo files as pd.DataFrame
  - use your scripts from motif comparisons
- [X] overlap instances (join by chr start end strand)
  - [X] load dfi
  - is there a significant overlap?
  - which ones overlap?
  - which ones don't overlap?
- [X] Give the following columns for each motif (pd.DataFrame):
  - pattern
  - #seqlets
  - is_te
  - IC
  - # PFM genome-wide
  - # PFM peaks
  - % PFM peaks in CWM
  - # CWM peaks
  - % CWM peaks in PFM
  - # all peaks
- [ ] export as csv to gdrive. load with gsheet
- [ ] subset the gsheet to only TE's in Figure 3B
analysis
- compare with CWM and PWM in peaks
- give spreadsheet and or histogram

# Imports
from basepair.imports import *
from basepair.exp.paper.config import *
import pyranges as pr
hv.extension('bokeh')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()

Using TensorFlow backend.

ddir = get_data_dir()
model_dir = models_dir / exp

Export MEME motifs¶

patterns_dict = [(task, read_pkl(model_dir / f"deeplift/{task}/out/{imp_score}/patterns.pkl"))
                  for task in tasks]

p_meme = None
for task, patterns in patterns_dict:
    mr = ModiscoResult(model_dir / f'deeplift/{task}/out/{imp_score}/modisco.h5')
    for pname in mr.patterns():
        untrimmed_pattern = mr.get_pattern(pname)
        pattern = untrimmed_pattern.trim_seq_ic(0.08)
        if task == 'Nanog' and pattern.name == 'metacluster_0/pattern_6':
            print(len(pattern))
            p_meme = untrimmed_pattern
        motif_dir = model_dir / f"deeplift/{task}/out/{imp_score}/PFM/{pattern.name}"
        motif_dir.mkdir(parents=True, exist_ok=True)
        pattern.write_meme_file([0.25, 0.25, 0.25, 0.25],
                                motif_dir /"meme.txt")

70

Run FIMO¶

# RUN
# snakemake -s fimo.smk -j 10

TODO¶

[X] test the load time of a single motif vs full table
- fastparquet doesn't work properly :/ -> that's probably due to '/' in the category names
write a function to load dfi for each pattern
- read_instances('Klf4', 'metacluster_0/pattern_1')
write a function to perform the overlaps for each pattern separately
use snakemake to run in parallel

# Load stats about the motif
mr = ModiscoResult(model_dir / f'deeplift/{task}/out/{imp_score}/modisco.h5')

# get TE's
motifs_te = [pattern
             for pattern in mr.patterns()
             if mr.get_pattern(pattern).seq_info_content > 30 and mr.n_seqlets(*pattern.split('/')) > 100]
is_te = pattern in motifs_te
# motifs_te_d = OrderedDict([(shorten_pattern(x), shorten_pattern(x)) for x in motifs_te])

n_seqlets = mr.n_seqlets(*pattern.split("/"))

p = mr.get_pattern(pattern).trim_seq_ic(0.08)

TF-MoDISco is using the TensorFlow backend.

What fraction of the seqlets contains a CWM hit (and vice versa)?¶

[X] convert the cwm matches into intervals using seqname strand
[X] convert seqlets to pyranges
[ ] perform the overlaps for a single pattern
- What fraction of the seqlets contains a CWM hit?
[ ] perform the overlaps for multiple patterns
- all patterns
- major patterns
  - load_instances should load the full thing
  - load the major pattern names
  - use load_instances to filter the instances only to the major motifs
  - include also the TE's
- make sure you are only looking at 57 main motifs (e.g. 5 must be excluded...)

# TOOD - there is a discrepancy between the intervals

TODO - use the absolute coordinates¶

from basepair.modisco.pattern_instances import load_instances, multiple_load_instances

def abs_coord(seqlet, ranges):
    """Create absolute seqlet coordinates
    """
    row = ranges.loc[seqlet.seqname]
    return {"Chromosome": row.chrom,
            "Start": row.start + seqlet.start - 15,  # make it 70 bp long
            "End": row.start + seqlet.end + 15,
           }

def dfi2pr(dfi):
    dfi['Chromosome'] = dfi.example_chrom
    dfi['Start'] = dfi.pattern_start_abs
    dfi['End'] = dfi.pattern_end_abs
    dfi['Strand'] = dfi.strand
    dfi_pr = pr.PyRanges(dfi)
    return dfi_pr


df_subset = pd.read_csv("https://docs.google.com/spreadsheets/d/1MVBUbMs_F0UPgZxR8-v9eMM6DCFhVYc7kdH-C6WJnYU/export?gid=0&format=csv")
df_subset.name = df_subset.name.str.replace('"', '')
df_subset.task = df_subset.task.str.replace('"', '')

df_info = pd.read_csv("https://docs.google.com/spreadsheets/d/1uAtJZJR_dmlWwer5Afcd2q72Uuj7A4Ln/export?gid=826411817&format=csv")

{'m0_p6': 'm0_p6',
 'm0_p7': 'm0_p7',
 'm0_p10': 'm0_p10',
 'm0_p11': 'm0_p11',
 'm0_p12': 'm0_p12',
 'm0_p13': 'm0_p13',
 'm0_p14': 'm0_p14',
 'm0_p15': 'm0_p15',
 'm0_p18': 'm0_p18',
 'm0_p0': 'm0_p0',
 'm0_p1': 'm0_p1',
 'm0_p2': 'm0_p2',
 'm0_p3': 'm0_p3',
 'm0_p4': 'm0_p4',
 'm0_p5': 'm0_p5',
 'm0_p8': 'm0_p8',
 'm0_p9': 'm0_p9',
 'm0_p16': 'm0_p16',
 'm0_p17': 'm0_p17'}

len(dfi_pr_all.intersect(dfs, how='containment', strand=None))

157121

len(dfi_pr_all)

197667

len(dfi_pr_all.drop_duplicate_positions(strand=False))

197663

len(dfi_pr_main.intersect(dfs, how='containment', strand=None).drop_duplicate_positions(strand=False))

39454

len(dfi_pr_main)

59984

len(dfs.drop_duplicate_positions(strand=False))

101304

len(dfs.overlap(dfi_pr_all).drop_duplicate_positions(strand=False))

87802

len(dfs.overlap(dfi_pr_main).drop_duplicate_positions(strand=False))

51361

# TODO - remove motif with low ic

for task in tasks:
    print("Loading ranges")
    # Load seqlets
    mr = ModiscoResult(model_dir / f'deeplift/{task}/out/{imp_score}/modisco.h5')
    seqlets = mr.all_seqlets()
    ranges = mr.load_ranges()
    ranges.set_index('idx', inplace=True)
    assert np.all(ranges.index == np.arange(len(ranges)))
    dfs = pd.DataFrame([abs_coord(s, ranges) for s in seqlets])
    dfs = pr.PyRanges(dfs).drop_duplicate_positions(strand=False)
    
    print("loading dfi")
    
    # Load CWM instances
    print("load instances")
    dfi = load_instances(model_dir / f'deeplift/{task}/out/{imp_score}/instances.parq')
    print("filter")
    dfi = dfi.query('match_weighted_p > .2').query('imp_weighted_p > .01')
    
    print("filter")
    # That's only subsetting the main motifs for a particular task
    motifs = {k:k for k in df_subset[df_subset.task == task].pattern}
    all_motifs = {k:k for k in df_info[(df_info.IC > 4) & (df_info.task == task)].pattern.map(shorten_pattern)}

    
    
    dfi_main_motifs = load_instances(dfi, motifs)
    dfi_pr_main = dfi2pr(dfi_main_motifs).drop_duplicate_positions(strand=False)
    
    dfi_all_motifs = load_instances(dfi, all_motifs)
    dfi_pr_all = dfi2pr(dfi_all_motifs).drop_duplicate_positions(strand=False)
    
    print("-"*40)
    print(f"Task: {task}")
    s = len(dfs)
    sm = len(dfs.overlap(dfi_pr_main).drop_duplicate_positions(strand=False))
    sa = len(dfs.overlap(dfi_pr_all).drop_duplicate_positions(strand=False))
    print(f"seqlets overlaps main motifs: {sm}/{s} ({sm/s:.2f})")
    print(f"seqlets overlaps all motifs: {sa}/{s} ({sa/s:.2f})")
    print("seqlets overlaps main motifs: {}".format(len(dfs.overlap(dfi_pr_main))))
    print("")
    cm = len(dfi_pr_main)
    cmo = len(dfi_pr_main.intersect(dfs, how='containment', strand=None).drop_duplicate_positions(strand=False))
    ca = len(dfi_pr_all)
    cao = len(dfi_pr_all.intersect(dfs, how='containment', strand=None).drop_duplicate_positions(strand=False))
    print(f"CWM (main motifs) overlap seqlets: {cmo}/{cm} ({cmo/cm:.2f})")
    print(f"CWM (all motifs) overlap seqlets: {cao}/{ca} ({cao/ca:.2f})")
    print("-"*40)

Loading ranges
loading dfi
load instances
number of de-duplicated instances: 51782306 (34.84561268187079%)
filter
filter
number of de-duplicated instances: 0 (0.0%)
number of de-duplicated instances: 0 (0.0%)
----------------------------------------
Task: Oct4
seqlets overlaps main motifs: 28767/42995 (0.67)
seqlets overlaps all motifs: 37159/42995 (0.86)
seqlets overlaps main motifs: 49071

CWM (main motifs) overlap seqlets: 30499/79433 (0.38)
CWM (all motifs) overlap seqlets: 45641/198384 (0.23)
----------------------------------------
Loading ranges
loading dfi
load instances
number of de-duplicated instances: 976478 (37.701489367863246%)
filter
filter
number of de-duplicated instances: 0 (0.0%)
number of de-duplicated instances: 0 (0.0%)
----------------------------------------
Task: Sox2
seqlets overlaps main motifs: 6377/17665 (0.36)
seqlets overlaps all motifs: 13144/17665 (0.74)
seqlets overlaps main motifs: 6924

CWM (main motifs) overlap seqlets: 4761/21355 (0.22)
CWM (all motifs) overlap seqlets: 13849/85077 (0.16)
----------------------------------------
Loading ranges
loading dfi
load instances
number of de-duplicated instances: 4377763 (35.19495062591796%)
filter
filter
number of de-duplicated instances: 0 (0.0%)
number of de-duplicated instances: 0 (0.0%)
----------------------------------------
Task: Nanog
seqlets overlaps main motifs: 51361/101304 (0.51)
seqlets overlaps all motifs: 87802/101304 (0.87)
seqlets overlaps main motifs: 64411

CWM (main motifs) overlap seqlets: 39454/59982 (0.66)
CWM (all motifs) overlap seqlets: 123018/197663 (0.62)
----------------------------------------
Loading ranges
loading dfi
load instances
number of de-duplicated instances: 16307358 (35.678892813327785%)
filter
filter
number of de-duplicated instances: 0 (0.0%)
number of de-duplicated instances: 0 (0.0%)
----------------------------------------
Task: Klf4
seqlets overlaps main motifs: 59243/103514 (0.57)
seqlets overlaps all motifs: 99336/103514 (0.96)
seqlets overlaps main motifs: 66794

CWM (main motifs) overlap seqlets: 52005/67328 (0.77)
CWM (all motifs) overlap seqlets: 246825/448379 (0.55)
----------------------------------------

# total seqlets
s = 42995+17665+101304+103514
s

265478

#overlap main motifs
mo = 28767 + 6377 + 51361 + 59243
ao = 37159 + 13144 + 87802 + 99336

print(f"total seqlets: {mo}")
print(f"main motif overlaps: {mo} / {s} ({mo/s:.2f})")
print(f"all motif overlaps: {ao} / {s} ({ao/s:.2f})")

total seqlets: 145748
main motif overlaps: 145748 / 265478 (0.55)
all motif overlaps: 237441 / 265478 (0.89)

p.plot();

p_meme.plot();

# Load CWM instances
from basepair.modisco.pattern_instances import load_instances

dfi = load_instances(model_dir / f'deeplift/{task}/out/{imp_score}/instances.parq',
                     {shorten_pattern(pattern): shorten_pattern(pattern)})

# filter only to the relevant ones
dfi = dfi.query('match_weighted_p > .2').query('imp_weighted_p > .01')

dfi['Chromosome'] = dfi.example_chrom
dfi['Start'] = dfi.pattern_start_abs
dfi['End'] = dfi.pattern_end_abs
dfi['Strand'] = dfi.strand
dfi_pr = pr.PyRanges(dfi)

number of de-duplicated instances: 1826 (60.046037487668535%)

# Load PWM instances
from basepair.exp.chipnexus.comparison import read_fimo_dfi
dfi_fimo = read_fimo_dfi(model_dir / f'deeplift/{task}/out/{imp_score}/PFM/{pattern}/fimo.tsv')
dfi_fimo['pattern'] = f"{task}/{pattern}"
dfi_fimo['Chromosome'] = dfi_fimo.chrom
dfi_fimo['Start'] = dfi_fimo.start - 1  # intervals are 1-based
dfi_fimo['End'] = dfi_fimo.end
dfi_fimo['Strand'] = dfi_fimo.strand
dfi_fimo = pr.PyRanges(dfi_fimo)

# Load the peak intervals
from basepair.cli.modisco import load_ranges

ranges = load_ranges(model_dir / f'deeplift/{task}/out/{imp_score}')

ranges['Chromosome'] = ranges.chrom
ranges['Start'] = ranges.start
ranges['End'] = ranges.end
ranges = pr.PyRanges(ranges)

ranges

+--------------+-----------+-----------+------------+-----------+-----------+------------+-------+
| Chromosome   | Start     | End       | chrom      | start     | end       | strand     | ...   |
| (category)   | (int32)   | (int32)   | (object)   | (int64)   | (int64)   | (object)   |       |
|--------------+-----------+-----------+------------+-----------+-----------+------------+-------|
| chr1         | 134534722 | 134535722 | chr1       | 134534722 | 134535722 | .          | ...   |
| chr1         | 133340136 | 133341136 | chr1       | 133340136 | 133341136 | .          | ...   |
| chr1         | 36985569  | 36986569  | chr1       | 36985569  | 36986569  | .          | ...   |
| chr1         | 72792893  | 72793893  | chr1       | 72792893  | 72793893  | .          | ...   |
| ...          | ...       | ...       | ...        | ...       | ...       | ...        | ...   |
| chr19        | 43523647  | 43524647  | chr19      | 43523647  | 43524647  | .          | ...   |
| chr19        | 57097226  | 57098226  | chr19      | 57097226  | 57098226  | .          | ...   |
| chr19        | 44771651  | 44772651  | chr19      | 44771651  | 44772651  | .          | ...   |
| chr19        | 55705557  | 55706557  | chr19      | 55705557  | 55706557  | .          | ...   |
+--------------+-----------+-----------+------------+-----------+-----------+------------+-------+
Unstranded PyRanges object has 55,233 rows and 9 columns from 19 chromosomes.
Hidden columns: interval_from_task, idx, Chromosome, Start, End

Perform the overlaps¶

dfi_fimo_peaks = pr.PyRanges(dfi_fimo.intersect(ranges, how='containment').as_df().reset_index())

stats = {}

stats['pattern'] = pattern
stats['task'] = task
stats['imp_score'] = imp_score
stats['is_te'] = is_te
stats['n_seqlets'] = n_seqlets
stats['IC'] = p.seq_info_content
stats['length'] = len(p)

stats['n_pwm_gw'] = len(dfi_fimo)
stats['n_pwm_peaks'] = len(dfi_fimo_peaks)
stats['n_cwm_peaks'] = len(dfi_pr)
stats['n_cwm_and_pwm_peaks'] = len(dfi_pr.intersect(dfi_fimo_peaks, how='containment'))
stats['frac_cwm_in_pwm_peaks'] = stats['n_cwm_and_pwm_peaks'] / stats['n_cwm_peaks']
stats['frac_pwm_in_cwm_peaks'] = stats['n_cwm_and_pwm_peaks'] / stats['n_pwm_peaks']


stats

{'pattern': 'metacluster_0/pattern_6',
 'task': 'Nanog',
 'imp_score': 'profile/wn',
 'is_te': True,
 'n_seqlets': 571,
 'IC': 61.4639480136029,
 'length': 70,
 'n_pwm_gw': 62995,
 'n_pwm_peaks': 2631,
 'n_cwm_peaks': 709,
 'n_cwm_and_pwm_peaks': 0,
 'frac_cwm_in_pwm_peaks': 0.0,
 'frac_pwm_in_cwm_peaks': 0.0}

len(dfi_pr.overlap(dfi_fimo_peaks))

660

dfi_pr.overlap(dfi_fimo_peaks).df

dfi_pr['chr1', '+', 45358193:45358263].df

dfi_fimo_peaks['chr1', '+', 45358193:45358263].df

45358263 - 45358197

66

Why do they differ?

len(p)

70

!cat {model_dir}/deeplift/{task}/out/{imp_score}/PFM/{pattern}/meme.txt

MEME version 4

ALPHABET= ACGT

strands: + -

Background letter frequencies (from unknown source):
A 0.250 C 0.250 G 0.250 T 0.250

MOTIF 1 TEMP

letter-probability matrix: alength= 4 w= 66 nsites= 1 E= 0e+0
0.08757 0.30473 0.15762 0.45009
0.12084 0.43783 0.06830 0.37303
0.07005 0.19965 0.13485 0.59545
0.11208 0.11033 0.26270 0.51489
0.13485 0.07005 0.64448 0.15061
0.59019 0.07356 0.25919 0.07706
0.44658 0.06305 0.41681 0.07356
0.34151 0.05954 0.50963 0.08932
0.63047 0.04553 0.24869 0.07531
0.20841 0.03503 0.67075 0.08581
0.44133 0.08932 0.39580 0.07356
0.50438 0.04203 0.39054 0.06305
0.44658 0.40280 0.11559 0.03503
0.41856 0.41681 0.10158 0.06305
0.44133 0.09107 0.40806 0.05954
0.08932 0.20841 0.46060 0.24168
0.25744 0.27145 0.30298 0.16813
0.24518 0.03678 0.62172 0.09632
0.22417 0.35201 0.30123 0.12259
0.44483 0.41506 0.05779 0.08231
0.44483 0.04729 0.43608 0.07180
0.09457 0.05604 0.68126 0.16813
0.05254 0.04904 0.03152 0.86690
0.05079 0.01751 0.86690 0.06480
0.40105 0.04553 0.51664 0.03678
0.05604 0.87566 0.02627 0.04203
0.59545 0.02277 0.02627 0.35552
0.04553 0.05604 0.78984 0.10858
0.94046 0.02277 0.02627 0.01051
0.96497 0.01226 0.00876 0.01401
0.01926 0.91944 0.01401 0.04729
0.98249 0.00525 0.00175 0.01051
0.97548 0.00175 0.01401 0.00876
0.07005 0.01576 0.01401 0.90018
0.90893 0.01401 0.05079 0.02627
0.05779 0.04203 0.84588 0.05429
0.95271 0.00525 0.03853 0.00350
0.07706 0.06655 0.84939 0.00701
0.01051 0.55342 0.00000 0.43608
0.06130 0.52014 0.36077 0.05779
0.95271 0.02802 0.01051 0.00876
0.00350 0.00000 0.00876 0.98774
0.00350 0.97548 0.00876 0.01226
0.05079 0.05954 0.01051 0.87916
0.92469 0.01576 0.03327 0.02627
0.23117 0.02627 0.72329 0.01926
0.12259 0.03678 0.81436 0.02627
0.06655 0.11384 0.36252 0.45709
0.47461 0.32750 0.06830 0.12960
0.18039 0.05779 0.67601 0.08581
0.08757 0.04028 0.84588 0.02627
0.83012 0.10508 0.02627 0.03853
0.87215 0.02802 0.05079 0.04904
0.05429 0.05254 0.85814 0.03503
0.09107 0.07180 0.80560 0.03152
0.05254 0.82837 0.03678 0.08231
0.02277 0.07180 0.04203 0.86340
0.10683 0.79510 0.06480 0.03327
0.05954 0.85464 0.03853 0.04729
0.84238 0.04203 0.03678 0.07881
0.03678 0.89317 0.04203 0.02802
0.03327 0.85814 0.03853 0.07005
0.06830 0.80736 0.04378 0.08056
0.05254 0.34501 0.02977 0.57268
0.87040 0.03327 0.04553 0.05079
0.07881 0.02802 0.83187 0.06130

dfi_pr.sort().df.head()

dfi_fimo_peaks.sort().df.head()

# why is n_cwm_and_pwm_peaks 0?

# write to the json file

from basepair.utils import write_json

# write_json(stats, model_dir / f'deeplift/{task}/out/{imp_score}/PFM/{pattern}/overlaps.json')

Load the results¶

dfr = pd.read_csv(model_dir / 'deeplift/overlap_cwm_pwm.csv')

dfr.head()

dict(dfr.mean())

{'IC': 33.277750751885925,
 'frac_cwm_in_pwm_peaks': 0.1515292533356372,
 'frac_pwm_in_cwm_peaks': 0.33667156788830843,
 'is_te': 0.29508196721311475,
 'length': 70.0,
 'n_cwm_and_pwm_peaks': 1067.344262295082,
 'n_cwm_peaks': 1111020.9016393442,
 'n_pwm_gw': 69075.22950819672,
 'n_pwm_peaks': 2848.1803278688526,
 'n_seqlets': 928.0655737704918}

dict(dfr[dfr.is_te].mean())

{'IC': 61.375062696083056,
 'frac_cwm_in_pwm_peaks': 0.2646783997989644,
 'frac_pwm_in_cwm_peaks': 0.10715056169678877,
 'is_te': 1.0,
 'length': 70.0,
 'n_cwm_and_pwm_peaks': 173.05555555555554,
 'n_cwm_peaks': 539.4444444444445,
 'n_pwm_gw': 73009.44444444444,
 'n_pwm_peaks': 1915.9444444444443,
 'n_seqlets': 196.5}

!cp {model_dir}/deeplift/overlap_cwm_pwm.csv figures/modisco/{exp}/overlap_cwm_pwm.csv

dfr.to_excel(f'figures/modisco/{exp}/overlap_cwm_pwm.xlsx')

!ls -latrh figures/modisco/{exp}/overlap_cwm_pwm.xlsx

-rw-rw-r-- 1 avsec kundaje 11K Jul 10 03:49 figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/overlap_cwm_pwm.xlsx

	example_idx	pattern_start	pattern_end	strand	pattern_center	pattern_len	match_weighted	match_weighted_p	match_weighted_cat	match_max	match_max_task	imp_weighted	imp_weighted_p	imp_weighted_cat	imp_max	imp_max_task	seq_match	seq_match_p	seq_match_cat	match/Nanog	imp/Nanog	example_chrom	example_start	example_end	example_strand	example_interval_from_task	pattern	pattern_short	pattern_name	pattern_start_abs	pattern_end_abs	Chromosome	Start	End	Strand
0	2851	467	537	+	502	70	0.4785	0.6187	medium	0.4785	Nanog	3.4153	0.9689	high	3.4153	Nanog	48.4069	0.3502	medium	0.4785	3.4153	chr1	45357726	45358726	.	Oct4	metacluster_0/pattern_6	m0_p6	m0_p6	45358193	45358263	chr1	45358193	45358263	+
1	7401	478	548	+	513	70	0.4345	0.3735	medium	0.4345	Nanog	2.5525	0.8385	high	2.5525	Nanog	43.4033	0.2179	low	0.4345	2.5525	chr1	114849323	114850323	.	Oct4	metacluster_0/pattern_6	m0_p6	m0_p6	114849801	114849871	chr1	114849801	114849871	+
2	7928	520	590	+	555	70	0.4445	0.4144	medium	0.4445	Nanog	1.6450	0.3288	low	1.6450	Nanog	53.5323	0.5097	medium	0.4445	1.6450	chr1	60432173	60433173	.	Oct4	metacluster_0/pattern_6	m0_p6	m0_p6	60432693	60432763	chr1	60432693	60432763	+
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2	21848	475	545	-	510	70	0.4017	0.2588	low	0.4017	Nanog	3.1762	0.9494	high	3.1762	Nanog	44.8459	0.2412	low	0.4017	3.1762	chr19	38048410	38049410	.	Oct4	metacluster_0/pattern_6	m0_p6	m0_p6	38048885	38048955	chr19	38048885	38048955	-
3	35374	486	556	-	521	70	0.4636	0.5564	medium	0.4636	Nanog	1.6734	0.3424	medium	1.6734	Nanog	52.0025	0.4572	medium	0.4636	1.6734	chr19	40124748	40125748	.	Sox2	metacluster_0/pattern_6	m0_p6	m0_p6	40125234	40125304	chr19	40125234	40125304	-
4	85375	456	526	-	491	70	0.4591	0.5272	medium	0.4591	Nanog	1.3724	0.1498	low	1.3724	Nanog	49.1573	0.3755	medium	0.4591	1.3724	chr19	39752118	39753118	.	Nanog	metacluster_0/pattern_6	m0_p6	m0_p6	39752574	39752644	chr19	39752574	39752644	-

	example_idx	pattern_start	pattern_end	strand	pattern_center	pattern_len	match_weighted	match_weighted_p	match_weighted_cat	match_max	match_max_task	imp_weighted	imp_weighted_p	imp_weighted_cat	imp_max	imp_max_task	seq_match	seq_match_p	seq_match_cat	match/Nanog	imp/Nanog	example_chrom	example_start	example_end	example_strand	example_interval_from_task	pattern	pattern_short	pattern_name	pattern_start_abs	pattern_end_abs	Chromosome	Start	End	Strand
index
10572717	2851	467	537	+	502	70	0.4785	0.6187	medium	0.4785	Nanog	3.4153	0.9689	high	3.4153	Nanog	48.4069	0.3502	medium	0.4785	3.4153	chr1	45357726	45358726	.	Oct4	metacluster_0/pattern_6	m0_p6	m0_p6	45358193	45358263	chr1	45358193	45358263	+

	example_idx	pattern_start	pattern_end	strand	pattern_center	pattern_len	match_weighted	match_weighted_p	match_weighted_cat	match_max	match_max_task	imp_weighted	imp_weighted_p	imp_weighted_cat	imp_max	imp_max_task	seq_match	seq_match_p	seq_match_cat	match/Nanog	imp/Nanog	example_chrom	example_start	example_end	example_strand	example_interval_from_task	pattern	pattern_short	pattern_name	pattern_start_abs	pattern_end_abs	Chromosome	Start	End	Strand
0	17682	539	609	+	574	70	0.4243	0.3444	medium	0.4243	Nanog	2.9910	0.9261	high	2.9910	Nanog	53.7791	0.5214	medium	0.4243	2.9910	chr1	4150435	4151435	.	Oct4	metacluster_0/pattern_6	m0_p6	m0_p6	4150974	4151044	chr1	4150974	4151044	+
1	32742	553	623	+	588	70	0.5275	0.8580	high	0.5275	Nanog	1.9821	0.5759	medium	1.9821	Nanog	63.1465	0.9767	high	0.5275	1.9821	chr1	4659763	4660763	.	Sox2	metacluster_0/pattern_6	m0_p6	m0_p6	4660316	4660386	chr1	4660316	4660386	+
2	80221	458	528	+	493	70	0.4160	0.3171	low	0.4160	Nanog	2.1110	0.6323	medium	2.1110	Nanog	45.0386	0.2490	low	0.4160	2.1110	chr1	13518098	13519098	.	Nanog	metacluster_0/pattern_6	m0_p6	m0_p6	13518556	13518626	chr1	13518556	13518626	+
3	29880	480	550	+	515	70	0.4376	0.3774	medium	0.4376	Nanog	1.9362	0.5428	medium	1.9362	Nanog	58.4699	0.7860	high	0.4376	1.9362	chr1	14687733	14688733	.	Sox2	metacluster_0/pattern_6	m0_p6	m0_p6	14688213	14688283	chr1	14688213	14688283	+
4	133203	471	541	+	506	70	0.4367	0.3774	medium	0.4367	Nanog	1.1155	0.0564	low	1.1155	Nanog	61.2050	0.9105	high	0.4367	1.1155	chr1	16881261	16882261	.	Klf4	metacluster_0/pattern_6	m0_p6	m0_p6	16881732	16881802	chr1	16881732	16881802	+

	index	motif_id	motif_alt_id	sequence_name	start	stop	strand	score	p-value	q-value	matched_sequence	chrom	end	pattern_center_abs	pattern_name	pattern	Chromosome	Start	End	Strand
0	29	1	TEMP	chr1	3062933	3062998	+	61.3434	8.9300e-21	3.2200e-14	tgtcttggagaacagcaacag...	chr1	3062998	3062966	TEMP	Nanog/metacluster_0/p...	chr1	3062932	3062998	+
1	25	1	TEMP	chr1	3953731	3953796	+	63.6687	1.2400e-21	4.8800e-15	cagcaggtagcagcagtggca...	chr1	3953796	3953764	TEMP	Nanog/metacluster_0/p...	chr1	3953730	3953796	+
2	19	1	TEMP	chr1	4150979	4151044	+	74.4217	4.0400e-26	2.7600e-19	cttggagaagagcagcagcag...	chr1	4151044	4151012	TEMP	Nanog/metacluster_0/p...	chr1	4150978	4151044	+
3	53	1	TEMP	chr1	4612048	4612113	+	10.7349	8.6600e-08	1.1100e-01	ctccgagaagagcagggcaat...	chr1	4612113	4612081	TEMP	Nanog/metacluster_0/p...	chr1	4612047	4612113	+
4	0	1	TEMP	chr1	4660321	4660386	+	87.2229	1.2500e-33	3.1400e-25	cttggaagagagcagtggcag...	chr1	4660386	4660354	TEMP	Nanog/metacluster_0/p...	chr1	4660320	4660386	+

	IC	frac_cwm_in_pwm_peaks	frac_pwm_in_cwm_peaks	imp_score	is_te	length	n_cwm_and_pwm_peaks	n_cwm_peaks	n_pwm_gw	n_pwm_peaks	n_seqlets	pattern	task
0	14.2607	0.1506	0.9530	profile/wn	False	70	3957	26277	55725	4152	10742	metacluster_0/pattern_0	Oct4
1	10.8687	0.0414	0.8429	profile/wn	False	70	1733	41868	86576	2056	1517	metacluster_0/pattern_1	Oct4
2	12.1398	0.0099	0.8588	profile/wn	False	70	438	44355	20147	510	1297	metacluster_0/pattern_2	Oct4
3	10.7855	0.0074	0.7382	profile/wn	False	70	530	71260	46853	718	1052	metacluster_0/pattern_3	Oct4
4	13.7675	0.0249	0.4542	profile/wn	False	70	763	30684	60802	1680	970	metacluster_0/pattern_4	Oct4