Focus on the 200bp window around the peaks.
from basepair.config import get_data_dir
import pyBigWig
import matplotlib.pyplot as plt
from concise.utils.fasta import read_fasta
from pybedtools import BedTool
from tqdm import tqdm
import pandas as pd
import numpy as np
from basepair.data import get_sox2_data
ddir = get_data_dir()
# data:
fasta_file_200 = f"{ddir}/raw/chipnexus/scratch_jzeitlinger_collab/meme_chip_analysis/Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.fasta"
bed_file_200 = f"{ddir}/raw/chipnexus/scratch_jzeitlinger_collab/meme_chip_analysis/Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.narrowPeak"
# Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.fasta
fas = read_fasta(fasta_file_200)
bed = BedTool(bed_file_200)
assert len(bed) == len(fas)
dfc = get_sox2_data()
plt.plot(np.arange(-100.5, 100.5), np.stack(dfc.cuts_pos).mean(axis=0), label='pos')
plt.plot(np.arange(-100.5, 100.5), np.stack(dfc.cuts_neg).mean(axis=0), label='neg')
plt.xlabel("Peak position")
plt.legend()
plt.title("Average signal");
n=2*6
fig = plt.figure(figsize=(20, 5))
for i,idx in enumerate(dfc.sample(n).index):
plt.subplot(n//6, 6, i+1)
plt.bar(np.arange(-100.5, 100.5), dfc.iloc[idx].cuts_pos, label='pos')
plt.bar(np.arange(-100.5, 100.5), dfc.iloc[idx].cuts_neg, label='neg')
if i==0:
plt.title("Average signal")
plt.xlabel("Peak position")
plt.legend()
plt.tight_layout()
dfc.sample(10)
len(dfc.iloc[0].cuts_pos)
len(dfc.iloc[0].seq)
len(dfc)
motif_sites = {"motif1": f"{ddir}/processed/chipnexus/motifs/sox2/homer_200bp/de-novo/motif1.peaks200.txt",
"motif2": f"{ddir}/processed/chipnexus/motifs/sox2/homer_200bp/de-novo/motif2.peaks200.txt"}
motifs = {"motif1": f"{ddir}/processed/chipnexus/motifs/sox2/homer_200bp/de-novo/motif1.motif",
"motif2": f"{ddir}/processed/chipnexus/motifs/sox2/homer_200bp/de-novo/motif2.motif"}
from pybedtools import BedTool
from pysam import Fastafile
import pandas as pd
from concise.utils.pwm import PWM,load_motif_db
import concise
import matplotlib.pyplot as plt
from plotnine import *
from basepair.motif.homer import load_motif, read_motif_hits
pwm_list = [load_motif(fname) for k,fname in motifs.items()]
for i,pwm in enumerate(pwm_list):
pwm.plotPWM((5,1.5))
plt.title(f"motif{i+1}")
for i,pwm in enumerate(pwm_list):
pwm.plotPWMInfo((5,1.5))
plt.title(f"motif{i+1}")
df = pd.concat([read_homer_motif_hits(fname).assign(motif_id=k) for k,fname in motif_sites.items()])
df.head()
for i,k in enumerate(motif_sites):
plt.hist(df.center[df.motif_id==k], bins=20, histtype="step", label=k)
plt.xlabel("Relative position");
plt.legend()
plt.title("Relative position wrt sequence");
dfcm = df.merge(dfc, left_on='interval_id', right_on='seq_id', suffixes=('_motif', '_interval'))
dfcm.shape
dfcm.head()
cuts_arr_pos = np.stack(dfcm.cuts_pos)
cuts_arr_neg = np.stack(dfcm.cuts_neg)
from tqdm import trange
def extract_signal_pos(row, flip_strand=True, pad=10):
out = row.cuts_pos[(row.start_motif-pad):(row.end_motif+pad)]
if flip_strand and row.strand == "-":
return out[::-1]
else:
return out
def skip_row(row, pad=10):
return row.start_motif < pad or row.end_motif + pad > 200
def extract_signal_neg(row, flip_strand=True, pad=10):
out = row.cuts_neg[(row.start_motif-pad):(row.end_motif+pad)]
if flip_strand and row.strand == "-":
return out[::-1]
else:
return out
def extract_signal(row, flip_strand=True, pad=10):
out_pos = row.cuts_pos[(row.start_motif-pad):(row.end_motif+pad)]
out_neg = row.cuts_neg[(row.start_motif-pad):(row.end_motif+pad)]
if flip_strand and row.strand == "-":
return np.array([out_neg[::-1], out_pos[::-1]])
else:
return np.array([out_pos, out_neg])
pad=20
cuts_motif = np.stack([extract_signal(dfcm.iloc[i], pad=pad) for i in range(len(dfcm)) if not skip_row(dfcm.iloc[i], pad=pad)])
cuts_motif_pos = np.stack([extract_signal_pos(dfcm.iloc[i], pad=pad) for i in trange(len(dfcm)) if not skip_row(dfcm.iloc[i], pad=pad)])
cuts_motif_neg = np.stack([extract_signal_neg(dfcm.iloc[i], pad=pad) for i in trange(len(dfcm)) if not skip_row(dfcm.iloc[i], pad=pad)])
len(cuts_motif_pos)
# We haven't lost more than 80% of the sequences
assert len(cuts_motif_pos) > 0.8*len(dfcm)
motif_id = np.array([dfcm.iloc[i].motif_id for i in range(len(dfcm)) if not skip_row(dfcm.iloc[i], pad=pad)])
cuts_motif_pos.shape
# Strand flipped and reversed
plt.figure(figsize=(10,3))
plt.subplot(121)
plt.plot(cuts_motif[motif_id=='motif1',0].mean(axis=0), label='pos')
plt.plot(cuts_motif[motif_id=='motif1',1].mean(axis=0), label='neg')
plt.title("motif1")
plt.subplot(122)
plt.plot(cuts_motif[motif_id=='motif2',0].mean(axis=0), label='pos')
plt.plot(cuts_motif[motif_id=='motif2',1].mean(axis=0), label='neg')
plt.legend()
plt.title("motif2");
# Strand flipped
plt.figure(figsize=(10,3))
plt.subplot(121)
plt.plot(cuts_motif_pos[motif_id=='motif1'].mean(axis=0), label='pos')
plt.plot(cuts_motif_neg[motif_id=='motif1'].mean(axis=0), label='neg')
plt.title("motif1")
plt.subplot(122)
plt.plot(cuts_motif_pos[motif_id=='motif2'].mean(axis=0), label='pos')
plt.plot(cuts_motif_neg[motif_id=='motif2'].mean(axis=0), label='neg')
plt.legend()
plt.title("motif2");
# Strand not flipped
plt.figure(figsize=(10,3))
plt.subplot(121)
plt.plot(cuts_motif_pos[motif_id=='motif1'].mean(axis=0), label='pos')
plt.plot(cuts_motif_neg[motif_id=='motif1'].mean(axis=0), label='neg')
plt.title("motif1")
plt.subplot(122)
plt.plot(cuts_motif_pos[motif_id=='motif2'].mean(axis=0), label='pos')
plt.plot(cuts_motif_neg[motif_id=='motif2'].mean(axis=0), label='neg')
plt.legend()
plt.title("motif2");
for i,pwm in enumerate(pwm_list):
pwm.plotPWMInfo((5,1.5))
plt.title(f"motif{i+1}")