Setup the data

Focus on the 200bp window around the peaks.

In [5]:
from basepair.config import get_data_dir

import pyBigWig
import matplotlib.pyplot as plt
from concise.utils.fasta import read_fasta
from pybedtools import BedTool
from tqdm import tqdm
import pandas as pd
import numpy as np
/users/avsec/bin/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
In [6]:
from basepair.data import get_sox2_data
In [7]:
ddir = get_data_dir()

Data

Peak sites

In [4]:
# data:
fasta_file_200 = f"{ddir}/raw/chipnexus/scratch_jzeitlinger_collab/meme_chip_analysis/Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.fasta"
bed_file_200 = f"{ddir}/raw/chipnexus/scratch_jzeitlinger_collab/meme_chip_analysis/Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.narrowPeak"
# Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.fasta

fas = read_fasta(fasta_file_200)
bed = BedTool(bed_file_200)
assert len(bed) == len(fas)

ChipNexus

In [17]:
dfc = get_sox2_data()
100%|██████████| 9396/9396 [00:02<00:00, 3966.58it/s]

Averaged data signal

In [21]:
plt.plot(np.arange(-100.5, 100.5), np.stack(dfc.cuts_pos).mean(axis=0), label='pos')
plt.plot(np.arange(-100.5, 100.5), np.stack(dfc.cuts_neg).mean(axis=0), label='neg')
plt.xlabel("Peak position")
plt.legend()
plt.title("Average signal");

Raw data at random sites

In [8]:
n=2*6
fig = plt.figure(figsize=(20, 5))
for i,idx in enumerate(dfc.sample(n).index):
    plt.subplot(n//6, 6, i+1)
    plt.bar(np.arange(-100.5, 100.5), dfc.iloc[idx].cuts_pos, label='pos')
    plt.bar(np.arange(-100.5, 100.5), dfc.iloc[idx].cuts_neg, label='neg')
    if i==0:
        plt.title("Average signal")
    plt.xlabel("Peak position")
    plt.legend()
    plt.tight_layout()    
In [9]:
dfc.sample(10)
Out[9]:
chr cuts_neg cuts_pos end start seq seq_id
2046 chr16 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 38090656 38090455 TAGCACCCTAACATAAAACAAAAGGAAGAAAAGGATTAAGGAAGGA... chr16:38090455-38090656
1954 chr5 [2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [2.0, 0.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0, 0.0, ... 75896891 75896690 CCAAGGGAGCTGCTGTGATCTCTTTGCCTTTGGTTCTTTGTTCAGA... chr5:75896690-75896891
6090 chr14 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0, ... [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ... 121219530 121219329 CAGACCCTTGCCCGTTCTTGCTGACATTCTGTCTCCGTTTGCTCTA... chr14:121219329-121219530
9231 chr11 [0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0, ... [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 104944995 104944794 TTTTTCTCCAGTCTGCTTATATACCATTCTAGGTGCATGTAAAGAA... chr11:104944794-104944995
2263 chr4 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, ... 146059478 146059277 ACAACAAGGAAAGCACAACAAGGACCACCGGAAATATACAGTGACA... chr4:146059277-146059478
4222 chr2 [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 130839162 130838961 GTCAGCCTGGTCTACAGTGAGACCCGGTCTCAATTAAAAAAAAAAA... chr2:130838961-130839162
3217 chr11 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ... 22520539 22520338 GAACCCACAGACCCATCCTTTGTTTCCGCTCGATTGTGTTGCAAAG... chr11:22520338-22520539
6113 chr1 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 4972687 4972486 AATCCTGTTTCTGTTCTTCCTTTGAATAACTAAAGAAATGTAAAAC... chr1:4972486-4972687
563 chr8 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, ... 20601030 20600829 CCTTGCACTCTGAGGTAGTGTCTGTTTTTTTTTCCCTGTGATGGAT... chr8:20600829-20601030
6389 chr12 [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 104247773 104247572 ACAAATGTTAGCTTTATCCCTCCTGGGGACATGTGGTCTGTGCAGG... chr12:104247572-104247773
In [24]:
len(dfc.iloc[0].cuts_pos)
Out[24]:
201
In [25]:
len(dfc.iloc[0].seq)
Out[25]:
201
In [27]:
len(dfc)
Out[27]:
9396

Motif sites

In [8]:
motif_sites = {"motif1": f"{ddir}/processed/chipnexus/motifs/sox2/homer_200bp/de-novo/motif1.peaks200.txt",
               "motif2": f"{ddir}/processed/chipnexus/motifs/sox2/homer_200bp/de-novo/motif2.peaks200.txt"}
In [12]:
motifs = {"motif1": f"{ddir}/processed/chipnexus/motifs/sox2/homer_200bp/de-novo/motif1.motif",
          "motif2": f"{ddir}/processed/chipnexus/motifs/sox2/homer_200bp/de-novo/motif2.motif"}
In [13]:
from pybedtools import BedTool
from pysam import Fastafile
import pandas as pd
from concise.utils.pwm import PWM,load_motif_db
import concise
import matplotlib.pyplot as plt
from plotnine import *
In [14]:
from basepair.motif.homer import load_motif, read_motif_hits

pwm_list = [load_motif(fname) for k,fname in motifs.items()]
In [16]:
for i,pwm in enumerate(pwm_list):
    pwm.plotPWM((5,1.5))
    plt.title(f"motif{i+1}")
In [17]:
for i,pwm in enumerate(pwm_list):
    pwm.plotPWMInfo((5,1.5))
    plt.title(f"motif{i+1}")
In [18]:
df = pd.concat([read_homer_motif_hits(fname).assign(motif_id=k) for k,fname in motif_sites.items()])
In [19]:
df.head()
Out[19]:
motif_name interval_id start end strand score seq fname center motif_id
0 1-TTTGCATRACAA,BestGuess:Pou5f1::Sox2/MA0142.1... chr8:111451951-111452152 14 25 + 10.932273 TTAGCATCACAA /users/avsec/workspace/chipnexus/chipnexus/../... 19.5 motif1
1 1-TTTGCATRACAA,BestGuess:Pou5f1::Sox2/MA0142.1... chr2:122370428-122370629 72 83 + 9.765420 TTTGAATGAGAA /users/avsec/workspace/chipnexus/chipnexus/../... 77.5 motif1
2 1-TTTGCATRACAA,BestGuess:Pou5f1::Sox2/MA0142.1... chr4:125545207-125545408 61 72 + 11.831128 TTAGCATAACAA /users/avsec/workspace/chipnexus/chipnexus/../... 66.5 motif1
3 1-TTTGCATRACAA,BestGuess:Pou5f1::Sox2/MA0142.1... chr13:23685082-23685283 84 95 - 10.031972 TTCAAATGCAAA /users/avsec/workspace/chipnexus/chipnexus/../... 89.5 motif1
4 1-TTTGCATRACAA,BestGuess:Pou5f1::Sox2/MA0142.1... chr13:23438829-23439030 42 53 + 9.900093 TTTACAAAACAA /users/avsec/workspace/chipnexus/chipnexus/../... 47.5 motif1
In [20]:
for i,k in enumerate(motif_sites):
    plt.hist(df.center[df.motif_id==k], bins=20, histtype="step", label=k)
    plt.xlabel("Relative position");
plt.legend()
plt.title("Relative position wrt sequence");

Signal distribution at motif sites

In [21]:
dfcm = df.merge(dfc, left_on='interval_id', right_on='seq_id', suffixes=('_motif', '_interval'))
In [22]:
dfcm.shape
Out[22]:
(8575, 17)
In [23]:
dfcm.head()
Out[23]:
motif_name interval_id start_motif end_motif strand score seq_motif fname center motif_id chr cuts_neg cuts_pos end_interval start_interval seq_interval seq_id
0 1-TTTGCATRACAA,BestGuess:Pou5f1::Sox2/MA0142.1... chr8:111451951-111452152 14 25 + 10.932273 TTAGCATCACAA /users/avsec/workspace/chipnexus/chipnexus/../... 19.5 motif1 chr8 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ... 111452152 111451951 TACTGTACAGAATTTAGCATCACAATACATTAGCAACAGATCAAAC... chr8:111451951-111452152
1 1-TTTGCATRACAA,BestGuess:Pou5f1::Sox2/MA0142.1... chr2:122370428-122370629 72 83 + 9.765420 TTTGAATGAGAA /users/avsec/workspace/chipnexus/chipnexus/../... 77.5 motif1 chr2 [0.0, 0.0, 2.0, 2.0, 1.0, 1.0, 1.0, 2.0, 1.0, ... [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ... 122370629 122370428 ACCACACTACTGGTATTTGCTTAAAAAATATAAAGTGATTCTGGTG... chr2:122370428-122370629
2 1-TTTGCATRACAA,BestGuess:Pou5f1::Sox2/MA0142.1... chr4:125545207-125545408 61 72 + 11.831128 TTAGCATAACAA /users/avsec/workspace/chipnexus/chipnexus/../... 66.5 motif1 chr4 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ... 125545408 125545207 TGTCAATGAACATACTTAATTTCTCTGGAAGTGCTATTTAAATTGC... chr4:125545207-125545408
3 1-TTTGCATRACAA,BestGuess:Pou5f1::Sox2/MA0142.1... chr13:23685082-23685283 84 95 - 10.031972 TTCAAATGCAAA /users/avsec/workspace/chipnexus/chipnexus/../... 89.5 motif1 chr13 [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 23685283 23685082 ATCTTGGTAGAGTATTCTAAAATATGACCTGGAATTGTCAAACTTA... chr13:23685082-23685283
4 1-TTTGCATRACAA,BestGuess:Pou5f1::Sox2/MA0142.1... chr13:23438829-23439030 42 53 + 9.900093 TTTACAAAACAA /users/avsec/workspace/chipnexus/chipnexus/../... 47.5 motif1 chr13 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ... 23439030 23438829 AAATTGGGATTTTTAGAATGTTTTAGGTTCTGCTTTATTTCTTTAC... chr13:23438829-23439030
In [24]:
cuts_arr_pos = np.stack(dfcm.cuts_pos)
cuts_arr_neg = np.stack(dfcm.cuts_neg)
In [25]:
from tqdm import trange
In [40]:
def extract_signal_pos(row, flip_strand=True, pad=10):
    out = row.cuts_pos[(row.start_motif-pad):(row.end_motif+pad)]
    if flip_strand and row.strand == "-":
        return out[::-1]
    else:
        return out
In [27]:
def skip_row(row, pad=10):
    return row.start_motif < pad or row.end_motif + pad > 200
In [41]:
def extract_signal_neg(row, flip_strand=True, pad=10):
    out = row.cuts_neg[(row.start_motif-pad):(row.end_motif+pad)]
    if flip_strand and row.strand == "-":
        return out[::-1]
    else:
        return out
In [49]:
def extract_signal(row, flip_strand=True, pad=10):
    out_pos = row.cuts_pos[(row.start_motif-pad):(row.end_motif+pad)]
    out_neg = row.cuts_neg[(row.start_motif-pad):(row.end_motif+pad)]
    if flip_strand and row.strand == "-":
        return np.array([out_neg[::-1], out_pos[::-1]])
    else:
        return np.array([out_pos, out_neg])
In [29]:
pad=20
In [52]:
cuts_motif = np.stack([extract_signal(dfcm.iloc[i], pad=pad) for i in range(len(dfcm)) if not skip_row(dfcm.iloc[i], pad=pad)])
In [42]:
cuts_motif_pos = np.stack([extract_signal_pos(dfcm.iloc[i], pad=pad) for i in trange(len(dfcm)) if not skip_row(dfcm.iloc[i], pad=pad)])
cuts_motif_neg = np.stack([extract_signal_neg(dfcm.iloc[i], pad=pad) for i in trange(len(dfcm)) if not skip_row(dfcm.iloc[i], pad=pad)])
100%|██████████| 8575/8575 [00:04<00:00, 1831.75it/s]
100%|██████████| 8575/8575 [00:04<00:00, 1866.70it/s]
In [31]:
len(cuts_motif_pos)
Out[31]:
7859
In [32]:
# We haven't lost more than 80% of the sequences
assert len(cuts_motif_pos) > 0.8*len(dfcm)
In [54]:
motif_id = np.array([dfcm.iloc[i].motif_id for i in range(len(dfcm)) if not skip_row(dfcm.iloc[i], pad=pad)])
In [55]:
cuts_motif_pos.shape
Out[55]:
(7859, 51)
In [58]:
# Strand flipped and reversed
plt.figure(figsize=(10,3))
plt.subplot(121)
plt.plot(cuts_motif[motif_id=='motif1',0].mean(axis=0), label='pos')
plt.plot(cuts_motif[motif_id=='motif1',1].mean(axis=0), label='neg')
plt.title("motif1")
plt.subplot(122)
plt.plot(cuts_motif[motif_id=='motif2',0].mean(axis=0), label='pos')
plt.plot(cuts_motif[motif_id=='motif2',1].mean(axis=0), label='neg')
plt.legend()
plt.title("motif2");
In [44]:
# Strand flipped
plt.figure(figsize=(10,3))
plt.subplot(121)
plt.plot(cuts_motif_pos[motif_id=='motif1'].mean(axis=0), label='pos')
plt.plot(cuts_motif_neg[motif_id=='motif1'].mean(axis=0), label='neg')
plt.title("motif1")
plt.subplot(122)
plt.plot(cuts_motif_pos[motif_id=='motif2'].mean(axis=0), label='pos')
plt.plot(cuts_motif_neg[motif_id=='motif2'].mean(axis=0), label='neg')
plt.legend()
plt.title("motif2");
In [35]:
# Strand not flipped
plt.figure(figsize=(10,3))
plt.subplot(121)
plt.plot(cuts_motif_pos[motif_id=='motif1'].mean(axis=0), label='pos')
plt.plot(cuts_motif_neg[motif_id=='motif1'].mean(axis=0), label='neg')
plt.title("motif1")
plt.subplot(122)
plt.plot(cuts_motif_pos[motif_id=='motif2'].mean(axis=0), label='pos')
plt.plot(cuts_motif_neg[motif_id=='motif2'].mean(axis=0), label='neg')
plt.legend()
plt.title("motif2");
In [36]:
for i,pwm in enumerate(pwm_list):
    pwm.plotPWMInfo((5,1.5))
    plt.title(f"motif{i+1}")