In [1]:
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'
In [2]:
task = "Nanog"
pattern = "metacluster_0/pattern_6"

Goal

  • get PFM instances genome wide

Tasks

  • TODO
    • [x] export all patterns as MEME / FIMO motifs
      • check if you have some code ready
      • use the trimmed patterns (same as used for dfi scanning)
    • [X] Do the FIMO scanning -> generates the bed files
      • do it from the notebook or write a Snakemake file?
        • fimo.smk
    • [X] load the fimo files as pd.DataFrame
      • use your scripts from motif comparisons
    • [X] overlap instances (join by chr start end strand)
      • [X] load dfi
      • is there a significant overlap?
      • which ones overlap?
      • which ones don't overlap?
    • [X] Give the following columns for each motif (pd.DataFrame):
      • pattern
      • #seqlets
      • is_te
      • IC
      • # PFM genome-wide
      • # PFM peaks
      • % PFM peaks in CWM
      • # CWM peaks
      • % CWM peaks in PFM
      • # all peaks
    • [ ] export as csv to gdrive. load with gsheet
    • [ ] subset the gsheet to only TE's in Figure 3B
  • analysis
    • compare with CWM and PWM in peaks
    • give spreadsheet and or histogram
In [3]:
# Imports
from basepair.imports import *
from basepair.exp.paper.config import *
import pyranges as pr
hv.extension('bokeh')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
Using TensorFlow backend.
In [4]:
ddir = get_data_dir()
model_dir = models_dir / exp

Export MEME motifs

In [5]:
patterns_dict = [(task, read_pkl(model_dir / f"deeplift/{task}/out/{imp_score}/patterns.pkl"))
                  for task in tasks]
In [409]:
p_meme = None
for task, patterns in patterns_dict:
    mr = ModiscoResult(model_dir / f'deeplift/{task}/out/{imp_score}/modisco.h5')
    for pname in mr.patterns():
        untrimmed_pattern = mr.get_pattern(pname)
        pattern = untrimmed_pattern.trim_seq_ic(0.08)
        if task == 'Nanog' and pattern.name == 'metacluster_0/pattern_6':
            print(len(pattern))
            p_meme = untrimmed_pattern
        motif_dir = model_dir / f"deeplift/{task}/out/{imp_score}/PFM/{pattern.name}"
        motif_dir.mkdir(parents=True, exist_ok=True)
        pattern.write_meme_file([0.25, 0.25, 0.25, 0.25],
                                motif_dir /"meme.txt")
70

Run FIMO

In [ ]:
# RUN
# snakemake -s fimo.smk -j 10

TODO

  • [X] test the load time of a single motif vs full table
    • fastparquet doesn't work properly :/ -> that's probably due to '/' in the category names
  • write a function to load dfi for each pattern
    • read_instances('Klf4', 'metacluster_0/pattern_1')
  • write a function to perform the overlaps for each pattern separately
  • use snakemake to run in parallel
In [5]:
# Load stats about the motif
mr = ModiscoResult(model_dir / f'deeplift/{task}/out/{imp_score}/modisco.h5')

# get TE's
motifs_te = [pattern
             for pattern in mr.patterns()
             if mr.get_pattern(pattern).seq_info_content > 30 and mr.n_seqlets(*pattern.split('/')) > 100]
is_te = pattern in motifs_te
# motifs_te_d = OrderedDict([(shorten_pattern(x), shorten_pattern(x)) for x in motifs_te])

n_seqlets = mr.n_seqlets(*pattern.split("/"))

p = mr.get_pattern(pattern).trim_seq_ic(0.08)
TF-MoDISco is using the TensorFlow backend.

What fraction of the seqlets contains a CWM hit (and vice versa)?

  • [X] convert the cwm matches into intervals using seqname strand
  • [X] convert seqlets to pyranges
  • [ ] perform the overlaps for a single pattern
    • What fraction of the seqlets contains a CWM hit?
  • [ ] perform the overlaps for multiple patterns
    • all patterns
    • major patterns
      • load_instances should load the full thing
      • load the major pattern names
      • use load_instances to filter the instances only to the major motifs
      • include also the TE's
    • make sure you are only looking at 57 main motifs (e.g. 5 must be excluded...)
In [7]:
# TOOD - there is a discrepancy between the intervals

TODO - use the absolute coordinates

In [6]:
from basepair.modisco.pattern_instances import load_instances, multiple_load_instances

def abs_coord(seqlet, ranges):
    """Create absolute seqlet coordinates
    """
    row = ranges.loc[seqlet.seqname]
    return {"Chromosome": row.chrom,
            "Start": row.start + seqlet.start - 15,  # make it 70 bp long
            "End": row.start + seqlet.end + 15,
           }

def dfi2pr(dfi):
    dfi['Chromosome'] = dfi.example_chrom
    dfi['Start'] = dfi.pattern_start_abs
    dfi['End'] = dfi.pattern_end_abs
    dfi['Strand'] = dfi.strand
    dfi_pr = pr.PyRanges(dfi)
    return dfi_pr


df_subset = pd.read_csv("https://docs.google.com/spreadsheets/d/1MVBUbMs_F0UPgZxR8-v9eMM6DCFhVYc7kdH-C6WJnYU/export?gid=0&format=csv")
df_subset.name = df_subset.name.str.replace('"', '')
df_subset.task = df_subset.task.str.replace('"', '')
In [8]:
df_info = pd.read_csv("https://docs.google.com/spreadsheets/d/1uAtJZJR_dmlWwer5Afcd2q72Uuj7A4Ln/export?gid=826411817&format=csv")
In [16]:
 
Out[16]:
{'m0_p6': 'm0_p6',
 'm0_p7': 'm0_p7',
 'm0_p10': 'm0_p10',
 'm0_p11': 'm0_p11',
 'm0_p12': 'm0_p12',
 'm0_p13': 'm0_p13',
 'm0_p14': 'm0_p14',
 'm0_p15': 'm0_p15',
 'm0_p18': 'm0_p18',
 'm0_p0': 'm0_p0',
 'm0_p1': 'm0_p1',
 'm0_p2': 'm0_p2',
 'm0_p3': 'm0_p3',
 'm0_p4': 'm0_p4',
 'm0_p5': 'm0_p5',
 'm0_p8': 'm0_p8',
 'm0_p9': 'm0_p9',
 'm0_p16': 'm0_p16',
 'm0_p17': 'm0_p17'}
In [110]:
len(dfi_pr_all.intersect(dfs, how='containment', strand=None))
Out[110]:
157121
In [111]:
len(dfi_pr_all)
Out[111]:
197667
In [127]:
len(dfi_pr_all.drop_duplicate_positions(strand=False))
Out[127]:
197663
In [125]:
len(dfi_pr_main.intersect(dfs, how='containment', strand=None).drop_duplicate_positions(strand=False))
Out[125]:
39454
In [112]:
len(dfi_pr_main)
Out[112]:
59984
In [122]:
len(dfs.drop_duplicate_positions(strand=False))
Out[122]:
101304
In [123]:
len(dfs.overlap(dfi_pr_all).drop_duplicate_positions(strand=False))
Out[123]:
87802
In [129]:
len(dfs.overlap(dfi_pr_main).drop_duplicate_positions(strand=False))
Out[129]:
51361
In [ ]:
# TODO - remove motif with low ic
In [17]:
for task in tasks:
    print("Loading ranges")
    # Load seqlets
    mr = ModiscoResult(model_dir / f'deeplift/{task}/out/{imp_score}/modisco.h5')
    seqlets = mr.all_seqlets()
    ranges = mr.load_ranges()
    ranges.set_index('idx', inplace=True)
    assert np.all(ranges.index == np.arange(len(ranges)))
    dfs = pd.DataFrame([abs_coord(s, ranges) for s in seqlets])
    dfs = pr.PyRanges(dfs).drop_duplicate_positions(strand=False)
    
    print("loading dfi")
    
    # Load CWM instances
    print("load instances")
    dfi = load_instances(model_dir / f'deeplift/{task}/out/{imp_score}/instances.parq')
    print("filter")
    dfi = dfi.query('match_weighted_p > .2').query('imp_weighted_p > .01')
    
    print("filter")
    # That's only subsetting the main motifs for a particular task
    motifs = {k:k for k in df_subset[df_subset.task == task].pattern}
    all_motifs = {k:k for k in df_info[(df_info.IC > 4) & (df_info.task == task)].pattern.map(shorten_pattern)}

    
    
    dfi_main_motifs = load_instances(dfi, motifs)
    dfi_pr_main = dfi2pr(dfi_main_motifs).drop_duplicate_positions(strand=False)
    
    dfi_all_motifs = load_instances(dfi, all_motifs)
    dfi_pr_all = dfi2pr(dfi_all_motifs).drop_duplicate_positions(strand=False)
    
    print("-"*40)
    print(f"Task: {task}")
    s = len(dfs)
    sm = len(dfs.overlap(dfi_pr_main).drop_duplicate_positions(strand=False))
    sa = len(dfs.overlap(dfi_pr_all).drop_duplicate_positions(strand=False))
    print(f"seqlets overlaps main motifs: {sm}/{s} ({sm/s:.2f})")
    print(f"seqlets overlaps all motifs: {sa}/{s} ({sa/s:.2f})")
    print("seqlets overlaps main motifs: {}".format(len(dfs.overlap(dfi_pr_main))))
    print("")
    cm = len(dfi_pr_main)
    cmo = len(dfi_pr_main.intersect(dfs, how='containment', strand=None).drop_duplicate_positions(strand=False))
    ca = len(dfi_pr_all)
    cao = len(dfi_pr_all.intersect(dfs, how='containment', strand=None).drop_duplicate_positions(strand=False))
    print(f"CWM (main motifs) overlap seqlets: {cmo}/{cm} ({cmo/cm:.2f})")
    print(f"CWM (all motifs) overlap seqlets: {cao}/{ca} ({cao/ca:.2f})")
    print("-"*40)
Loading ranges
loading dfi
load instances
number of de-duplicated instances: 51782306 (34.84561268187079%)
filter
filter
number of de-duplicated instances: 0 (0.0%)
number of de-duplicated instances: 0 (0.0%)
----------------------------------------
Task: Oct4
seqlets overlaps main motifs: 28767/42995 (0.67)
seqlets overlaps all motifs: 37159/42995 (0.86)
seqlets overlaps main motifs: 49071

CWM (main motifs) overlap seqlets: 30499/79433 (0.38)
CWM (all motifs) overlap seqlets: 45641/198384 (0.23)
----------------------------------------
Loading ranges
loading dfi
load instances
number of de-duplicated instances: 976478 (37.701489367863246%)
filter
filter
number of de-duplicated instances: 0 (0.0%)
number of de-duplicated instances: 0 (0.0%)
----------------------------------------
Task: Sox2
seqlets overlaps main motifs: 6377/17665 (0.36)
seqlets overlaps all motifs: 13144/17665 (0.74)
seqlets overlaps main motifs: 6924

CWM (main motifs) overlap seqlets: 4761/21355 (0.22)
CWM (all motifs) overlap seqlets: 13849/85077 (0.16)
----------------------------------------
Loading ranges
loading dfi
load instances
number of de-duplicated instances: 4377763 (35.19495062591796%)
filter
filter
number of de-duplicated instances: 0 (0.0%)
number of de-duplicated instances: 0 (0.0%)
----------------------------------------
Task: Nanog
seqlets overlaps main motifs: 51361/101304 (0.51)
seqlets overlaps all motifs: 87802/101304 (0.87)
seqlets overlaps main motifs: 64411

CWM (main motifs) overlap seqlets: 39454/59982 (0.66)
CWM (all motifs) overlap seqlets: 123018/197663 (0.62)
----------------------------------------
Loading ranges
loading dfi
load instances
number of de-duplicated instances: 16307358 (35.678892813327785%)
filter
filter
number of de-duplicated instances: 0 (0.0%)
number of de-duplicated instances: 0 (0.0%)
----------------------------------------
Task: Klf4
seqlets overlaps main motifs: 59243/103514 (0.57)
seqlets overlaps all motifs: 99336/103514 (0.96)
seqlets overlaps main motifs: 66794

CWM (main motifs) overlap seqlets: 52005/67328 (0.77)
CWM (all motifs) overlap seqlets: 246825/448379 (0.55)
----------------------------------------
In [25]:
# total seqlets
s = 42995+17665+101304+103514
s
Out[25]:
265478
In [27]:
#overlap main motifs
mo = 28767 + 6377 + 51361 + 59243
ao = 37159 + 13144 + 87802 + 99336
In [28]:
print(f"total seqlets: {mo}")
print(f"main motif overlaps: {mo} / {s} ({mo/s:.2f})")
print(f"all motif overlaps: {ao} / {s} ({ao/s:.2f})")
total seqlets: 145748
main motif overlaps: 145748 / 265478 (0.55)
all motif overlaps: 237441 / 265478 (0.89)

In [407]:
p.plot();
In [408]:
p_meme.plot();
In [356]:
# Load CWM instances
from basepair.modisco.pattern_instances import load_instances

dfi = load_instances(model_dir / f'deeplift/{task}/out/{imp_score}/instances.parq',
                     {shorten_pattern(pattern): shorten_pattern(pattern)})

# filter only to the relevant ones
dfi = dfi.query('match_weighted_p > .2').query('imp_weighted_p > .01')

dfi['Chromosome'] = dfi.example_chrom
dfi['Start'] = dfi.pattern_start_abs
dfi['End'] = dfi.pattern_end_abs
dfi['Strand'] = dfi.strand
dfi_pr = pr.PyRanges(dfi)
number of de-duplicated instances: 1826 (60.046037487668535%)
In [357]:
# Load PWM instances
from basepair.exp.chipnexus.comparison import read_fimo_dfi
dfi_fimo = read_fimo_dfi(model_dir / f'deeplift/{task}/out/{imp_score}/PFM/{pattern}/fimo.tsv')
dfi_fimo['pattern'] = f"{task}/{pattern}"
dfi_fimo['Chromosome'] = dfi_fimo.chrom
dfi_fimo['Start'] = dfi_fimo.start - 1  # intervals are 1-based
dfi_fimo['End'] = dfi_fimo.end
dfi_fimo['Strand'] = dfi_fimo.strand
dfi_fimo = pr.PyRanges(dfi_fimo)
In [358]:
# Load the peak intervals
from basepair.cli.modisco import load_ranges

ranges = load_ranges(model_dir / f'deeplift/{task}/out/{imp_score}')

ranges['Chromosome'] = ranges.chrom
ranges['Start'] = ranges.start
ranges['End'] = ranges.end
ranges = pr.PyRanges(ranges)
In [359]:
ranges
Out[359]:
+--------------+-----------+-----------+------------+-----------+-----------+------------+-------+
| Chromosome   | Start     | End       | chrom      | start     | end       | strand     | ...   |
| (category)   | (int32)   | (int32)   | (object)   | (int64)   | (int64)   | (object)   |       |
|--------------+-----------+-----------+------------+-----------+-----------+------------+-------|
| chr1         | 134534722 | 134535722 | chr1       | 134534722 | 134535722 | .          | ...   |
| chr1         | 133340136 | 133341136 | chr1       | 133340136 | 133341136 | .          | ...   |
| chr1         | 36985569  | 36986569  | chr1       | 36985569  | 36986569  | .          | ...   |
| chr1         | 72792893  | 72793893  | chr1       | 72792893  | 72793893  | .          | ...   |
| ...          | ...       | ...       | ...        | ...       | ...       | ...        | ...   |
| chr19        | 43523647  | 43524647  | chr19      | 43523647  | 43524647  | .          | ...   |
| chr19        | 57097226  | 57098226  | chr19      | 57097226  | 57098226  | .          | ...   |
| chr19        | 44771651  | 44772651  | chr19      | 44771651  | 44772651  | .          | ...   |
| chr19        | 55705557  | 55706557  | chr19      | 55705557  | 55706557  | .          | ...   |
+--------------+-----------+-----------+------------+-----------+-----------+------------+-------+
Unstranded PyRanges object has 55,233 rows and 9 columns from 19 chromosomes.
Hidden columns: interval_from_task, idx, Chromosome, Start, End

Perform the overlaps

In [360]:
dfi_fimo_peaks = pr.PyRanges(dfi_fimo.intersect(ranges, how='containment').as_df().reset_index())
In [361]:
stats = {}

stats['pattern'] = pattern
stats['task'] = task
stats['imp_score'] = imp_score
stats['is_te'] = is_te
stats['n_seqlets'] = n_seqlets
stats['IC'] = p.seq_info_content
stats['length'] = len(p)

stats['n_pwm_gw'] = len(dfi_fimo)
stats['n_pwm_peaks'] = len(dfi_fimo_peaks)
stats['n_cwm_peaks'] = len(dfi_pr)
stats['n_cwm_and_pwm_peaks'] = len(dfi_pr.intersect(dfi_fimo_peaks, how='containment'))
stats['frac_cwm_in_pwm_peaks'] = stats['n_cwm_and_pwm_peaks'] / stats['n_cwm_peaks']
stats['frac_pwm_in_cwm_peaks'] = stats['n_cwm_and_pwm_peaks'] / stats['n_pwm_peaks']


stats
Out[361]:
{'pattern': 'metacluster_0/pattern_6',
 'task': 'Nanog',
 'imp_score': 'profile/wn',
 'is_te': True,
 'n_seqlets': 571,
 'IC': 61.4639480136029,
 'length': 70,
 'n_pwm_gw': 62995,
 'n_pwm_peaks': 2631,
 'n_cwm_peaks': 709,
 'n_cwm_and_pwm_peaks': 0,
 'frac_cwm_in_pwm_peaks': 0.0,
 'frac_pwm_in_cwm_peaks': 0.0}
In [371]:
len(dfi_pr.overlap(dfi_fimo_peaks))
Out[371]:
660
In [373]:
dfi_pr.overlap(dfi_fimo_peaks).df
Out[373]:
example_idx pattern_start pattern_end strand pattern_center pattern_len match_weighted match_weighted_p match_weighted_cat match_max match_max_task imp_weighted imp_weighted_p imp_weighted_cat imp_max imp_max_task seq_match seq_match_p seq_match_cat match/Nanog imp/Nanog example_chrom example_start example_end example_strand example_interval_from_task pattern pattern_short pattern_name pattern_start_abs pattern_end_abs Chromosome Start End Strand
0 2851 467 537 + 502 70 0.4785 0.6187 medium 0.4785 Nanog 3.4153 0.9689 high 3.4153 Nanog 48.4069 0.3502 medium 0.4785 3.4153 chr1 45357726 45358726 . Oct4 metacluster_0/pattern_6 m0_p6 m0_p6 45358193 45358263 chr1 45358193 45358263 +
1 7401 478 548 + 513 70 0.4345 0.3735 medium 0.4345 Nanog 2.5525 0.8385 high 2.5525 Nanog 43.4033 0.2179 low 0.4345 2.5525 chr1 114849323 114850323 . Oct4 metacluster_0/pattern_6 m0_p6 m0_p6 114849801 114849871 chr1 114849801 114849871 +
2 7928 520 590 + 555 70 0.4445 0.4144 medium 0.4445 Nanog 1.6450 0.3288 low 1.6450 Nanog 53.5323 0.5097 medium 0.4445 1.6450 chr1 60432173 60433173 . Oct4 metacluster_0/pattern_6 m0_p6 m0_p6 60432693 60432763 chr1 60432693 60432763 +
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2 21848 475 545 - 510 70 0.4017 0.2588 low 0.4017 Nanog 3.1762 0.9494 high 3.1762 Nanog 44.8459 0.2412 low 0.4017 3.1762 chr19 38048410 38049410 . Oct4 metacluster_0/pattern_6 m0_p6 m0_p6 38048885 38048955 chr19 38048885 38048955 -
3 35374 486 556 - 521 70 0.4636 0.5564 medium 0.4636 Nanog 1.6734 0.3424 medium 1.6734 Nanog 52.0025 0.4572 medium 0.4636 1.6734 chr19 40124748 40125748 . Sox2 metacluster_0/pattern_6 m0_p6 m0_p6 40125234 40125304 chr19 40125234 40125304 -
4 85375 456 526 - 491 70 0.4591 0.5272 medium 0.4591 Nanog 1.3724 0.1498 low 1.3724 Nanog 49.1573 0.3755 medium 0.4591 1.3724 chr19 39752118 39753118 . Nanog metacluster_0/pattern_6 m0_p6 m0_p6 39752574 39752644 chr19 39752574 39752644 -

660 rows × 35 columns

In [375]:
dfi_pr['chr1', '+', 45358193:45358263].df
Out[375]:
example_idx pattern_start pattern_end strand pattern_center pattern_len match_weighted match_weighted_p match_weighted_cat match_max match_max_task imp_weighted imp_weighted_p imp_weighted_cat imp_max imp_max_task seq_match seq_match_p seq_match_cat match/Nanog imp/Nanog example_chrom example_start example_end example_strand example_interval_from_task pattern pattern_short pattern_name pattern_start_abs pattern_end_abs Chromosome Start End Strand
index
10572717 2851 467 537 + 502 70 0.4785 0.6187 medium 0.4785 Nanog 3.4153 0.9689 high 3.4153 Nanog 48.4069 0.3502 medium 0.4785 3.4153 chr1 45357726 45358726 . Oct4 metacluster_0/pattern_6 m0_p6 m0_p6 45358193 45358263 chr1 45358193 45358263 +
In [376]:
dfi_fimo_peaks['chr1', '+', 45358193:45358263].df
Out[376]:
index motif_id motif_alt_id sequence_name start stop strand score p-value q-value matched_sequence chrom end pattern_center_abs pattern_name pattern Chromosome Start End Strand
21 21 1 TEMP chr1 45358198 45358263 + 67.012 6.3000e-23 2.8600e-16 cttggagaagagcagcagcag... chr1 45358263 45358231 TEMP Nanog/metacluster_0/p... chr1 45358197 45358263 +
In [378]:
45358263 - 45358197
Out[378]:
66

Why do they differ?

In [380]:
len(p)
Out[380]:
70
In [384]:
!cat {model_dir}/deeplift/{task}/out/{imp_score}/PFM/{pattern}/meme.txt
MEME version 4

ALPHABET= ACGT

strands: + -

Background letter frequencies (from unknown source):
A 0.250 C 0.250 G 0.250 T 0.250

MOTIF 1 TEMP

letter-probability matrix: alength= 4 w= 66 nsites= 1 E= 0e+0
0.08757 0.30473 0.15762 0.45009
0.12084 0.43783 0.06830 0.37303
0.07005 0.19965 0.13485 0.59545
0.11208 0.11033 0.26270 0.51489
0.13485 0.07005 0.64448 0.15061
0.59019 0.07356 0.25919 0.07706
0.44658 0.06305 0.41681 0.07356
0.34151 0.05954 0.50963 0.08932
0.63047 0.04553 0.24869 0.07531
0.20841 0.03503 0.67075 0.08581
0.44133 0.08932 0.39580 0.07356
0.50438 0.04203 0.39054 0.06305
0.44658 0.40280 0.11559 0.03503
0.41856 0.41681 0.10158 0.06305
0.44133 0.09107 0.40806 0.05954
0.08932 0.20841 0.46060 0.24168
0.25744 0.27145 0.30298 0.16813
0.24518 0.03678 0.62172 0.09632
0.22417 0.35201 0.30123 0.12259
0.44483 0.41506 0.05779 0.08231
0.44483 0.04729 0.43608 0.07180
0.09457 0.05604 0.68126 0.16813
0.05254 0.04904 0.03152 0.86690
0.05079 0.01751 0.86690 0.06480
0.40105 0.04553 0.51664 0.03678
0.05604 0.87566 0.02627 0.04203
0.59545 0.02277 0.02627 0.35552
0.04553 0.05604 0.78984 0.10858
0.94046 0.02277 0.02627 0.01051
0.96497 0.01226 0.00876 0.01401
0.01926 0.91944 0.01401 0.04729
0.98249 0.00525 0.00175 0.01051
0.97548 0.00175 0.01401 0.00876
0.07005 0.01576 0.01401 0.90018
0.90893 0.01401 0.05079 0.02627
0.05779 0.04203 0.84588 0.05429
0.95271 0.00525 0.03853 0.00350
0.07706 0.06655 0.84939 0.00701
0.01051 0.55342 0.00000 0.43608
0.06130 0.52014 0.36077 0.05779
0.95271 0.02802 0.01051 0.00876
0.00350 0.00000 0.00876 0.98774
0.00350 0.97548 0.00876 0.01226
0.05079 0.05954 0.01051 0.87916
0.92469 0.01576 0.03327 0.02627
0.23117 0.02627 0.72329 0.01926
0.12259 0.03678 0.81436 0.02627
0.06655 0.11384 0.36252 0.45709
0.47461 0.32750 0.06830 0.12960
0.18039 0.05779 0.67601 0.08581
0.08757 0.04028 0.84588 0.02627
0.83012 0.10508 0.02627 0.03853
0.87215 0.02802 0.05079 0.04904
0.05429 0.05254 0.85814 0.03503
0.09107 0.07180 0.80560 0.03152
0.05254 0.82837 0.03678 0.08231
0.02277 0.07180 0.04203 0.86340
0.10683 0.79510 0.06480 0.03327
0.05954 0.85464 0.03853 0.04729
0.84238 0.04203 0.03678 0.07881
0.03678 0.89317 0.04203 0.02802
0.03327 0.85814 0.03853 0.07005
0.06830 0.80736 0.04378 0.08056
0.05254 0.34501 0.02977 0.57268
0.87040 0.03327 0.04553 0.05079
0.07881 0.02802 0.83187 0.06130
In [366]:
dfi_pr.sort().df.head()
Out[366]:
example_idx pattern_start pattern_end strand pattern_center pattern_len match_weighted match_weighted_p match_weighted_cat match_max match_max_task imp_weighted imp_weighted_p imp_weighted_cat imp_max imp_max_task seq_match seq_match_p seq_match_cat match/Nanog imp/Nanog example_chrom example_start example_end example_strand example_interval_from_task pattern pattern_short pattern_name pattern_start_abs pattern_end_abs Chromosome Start End Strand
0 17682 539 609 + 574 70 0.4243 0.3444 medium 0.4243 Nanog 2.9910 0.9261 high 2.9910 Nanog 53.7791 0.5214 medium 0.4243 2.9910 chr1 4150435 4151435 . Oct4 metacluster_0/pattern_6 m0_p6 m0_p6 4150974 4151044 chr1 4150974 4151044 +
1 32742 553 623 + 588 70 0.5275 0.8580 high 0.5275 Nanog 1.9821 0.5759 medium 1.9821 Nanog 63.1465 0.9767 high 0.5275 1.9821 chr1 4659763 4660763 . Sox2 metacluster_0/pattern_6 m0_p6 m0_p6 4660316 4660386 chr1 4660316 4660386 +
2 80221 458 528 + 493 70 0.4160 0.3171 low 0.4160 Nanog 2.1110 0.6323 medium 2.1110 Nanog 45.0386 0.2490 low 0.4160 2.1110 chr1 13518098 13519098 . Nanog metacluster_0/pattern_6 m0_p6 m0_p6 13518556 13518626 chr1 13518556 13518626 +
3 29880 480 550 + 515 70 0.4376 0.3774 medium 0.4376 Nanog 1.9362 0.5428 medium 1.9362 Nanog 58.4699 0.7860 high 0.4376 1.9362 chr1 14687733 14688733 . Sox2 metacluster_0/pattern_6 m0_p6 m0_p6 14688213 14688283 chr1 14688213 14688283 +
4 133203 471 541 + 506 70 0.4367 0.3774 medium 0.4367 Nanog 1.1155 0.0564 low 1.1155 Nanog 61.2050 0.9105 high 0.4367 1.1155 chr1 16881261 16882261 . Klf4 metacluster_0/pattern_6 m0_p6 m0_p6 16881732 16881802 chr1 16881732 16881802 +
In [365]:
dfi_fimo_peaks.sort().df.head()
Out[365]:
index motif_id motif_alt_id sequence_name start stop strand score p-value q-value matched_sequence chrom end pattern_center_abs pattern_name pattern Chromosome Start End Strand
0 29 1 TEMP chr1 3062933 3062998 + 61.3434 8.9300e-21 3.2200e-14 tgtcttggagaacagcaacag... chr1 3062998 3062966 TEMP Nanog/metacluster_0/p... chr1 3062932 3062998 +
1 25 1 TEMP chr1 3953731 3953796 + 63.6687 1.2400e-21 4.8800e-15 cagcaggtagcagcagtggca... chr1 3953796 3953764 TEMP Nanog/metacluster_0/p... chr1 3953730 3953796 +
2 19 1 TEMP chr1 4150979 4151044 + 74.4217 4.0400e-26 2.7600e-19 cttggagaagagcagcagcag... chr1 4151044 4151012 TEMP Nanog/metacluster_0/p... chr1 4150978 4151044 +
3 53 1 TEMP chr1 4612048 4612113 + 10.7349 8.6600e-08 1.1100e-01 ctccgagaagagcagggcaat... chr1 4612113 4612081 TEMP Nanog/metacluster_0/p... chr1 4612047 4612113 +
4 0 1 TEMP chr1 4660321 4660386 + 87.2229 1.2500e-33 3.1400e-25 cttggaagagagcagtggcag... chr1 4660386 4660354 TEMP Nanog/metacluster_0/p... chr1 4660320 4660386 +
In [ ]:
# why is n_cwm_and_pwm_peaks 0?
In [315]:
# write to the json file
In [354]:
from basepair.utils import write_json

# write_json(stats, model_dir / f'deeplift/{task}/out/{imp_score}/PFM/{pattern}/overlaps.json')

Load the results

In [331]:
dfr = pd.read_csv(model_dir / 'deeplift/overlap_cwm_pwm.csv')
In [332]:
dfr.head()
Out[332]:
IC frac_cwm_in_pwm_peaks frac_pwm_in_cwm_peaks imp_score is_te length n_cwm_and_pwm_peaks n_cwm_peaks n_pwm_gw n_pwm_peaks n_seqlets pattern task
0 14.2607 0.1506 0.9530 profile/wn False 70 3957 26277 55725 4152 10742 metacluster_0/pattern_0 Oct4
1 10.8687 0.0414 0.8429 profile/wn False 70 1733 41868 86576 2056 1517 metacluster_0/pattern_1 Oct4
2 12.1398 0.0099 0.8588 profile/wn False 70 438 44355 20147 510 1297 metacluster_0/pattern_2 Oct4
3 10.7855 0.0074 0.7382 profile/wn False 70 530 71260 46853 718 1052 metacluster_0/pattern_3 Oct4
4 13.7675 0.0249 0.4542 profile/wn False 70 763 30684 60802 1680 970 metacluster_0/pattern_4 Oct4
In [336]:
dict(dfr.mean())
Out[336]:
{'IC': 33.277750751885925,
 'frac_cwm_in_pwm_peaks': 0.1515292533356372,
 'frac_pwm_in_cwm_peaks': 0.33667156788830843,
 'is_te': 0.29508196721311475,
 'length': 70.0,
 'n_cwm_and_pwm_peaks': 1067.344262295082,
 'n_cwm_peaks': 1111020.9016393442,
 'n_pwm_gw': 69075.22950819672,
 'n_pwm_peaks': 2848.1803278688526,
 'n_seqlets': 928.0655737704918}
In [334]:
dict(dfr[dfr.is_te].mean())
Out[334]:
{'IC': 61.375062696083056,
 'frac_cwm_in_pwm_peaks': 0.2646783997989644,
 'frac_pwm_in_cwm_peaks': 0.10715056169678877,
 'is_te': 1.0,
 'length': 70.0,
 'n_cwm_and_pwm_peaks': 173.05555555555554,
 'n_cwm_peaks': 539.4444444444445,
 'n_pwm_gw': 73009.44444444444,
 'n_pwm_peaks': 1915.9444444444443,
 'n_seqlets': 196.5}
In [341]:
!cp {model_dir}/deeplift/overlap_cwm_pwm.csv figures/modisco/{exp}/overlap_cwm_pwm.csv
In [343]:
dfr.to_excel(f'figures/modisco/{exp}/overlap_cwm_pwm.xlsx')
In [410]:
!ls -latrh figures/modisco/{exp}/overlap_cwm_pwm.xlsx
-rw-rw-r-- 1 avsec kundaje 11K Jul 10 03:49 figures/modisco/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/overlap_cwm_pwm.xlsx