import pandas as pd
import pysam
import pyBigWig
import numpy as np
def get_dhs_signal(EXPERIMENT):
overlappingpeaks_df = pd.read_csv("/mnt/lab_data2/vir/tf_chr_atlas/02-24-2021/shap/{}/dhs_overlapping.bed".format(EXPERIMENT), sep='\t', header=None,
names=['Chrom', 'Start', 'End', 'name', 'score',
'strand', 'signal', 'p', 'q', 'summit'])
overlappingpeaks_df['label'] = 'overlapping'
nonoverlappingpeaks_df = pd.read_csv("/mnt/lab_data2/vir/tf_chr_atlas/02-24-2021/shap/{}/dhs_nonoverlapping.bed".format(EXPERIMENT), sep='\t', header=None,
names=['Chrom', 'Start', 'End', 'name', 'score',
'strand', 'signal', 'p', 'q', 'summit'])
nonoverlappingpeaks_df['label'] = 'non-overlapping'
peaks_df = overlappingpeaks_df.append(nonoverlappingpeaks_df)
peaks_df = peaks_df.reset_index()
d = {'sample': ['HepG2'], 'factor': ['DNAse'], 'bw_path' : ['/oak/stanford/groups/akundaje/projects/atlas/dnase_processed/dnase/39e50d95-1423-4dca-acd1-4b685ab94c4c/call-macs2_signal_track/shard-0/execution/ENCSR149XIL.merged.nodup.fc.signal.bigwig']}
bw_df = pd.DataFrame(d)
bw = pyBigWig.open(bw_df['bw_path'][0])
dhs_scores = np.empty(len(peaks_df))
for i, row in peaks_df.iterrows():
try:
dhs_scores[i] = np.asarray(bw.values(row['Chrom'], row['Start'], row['End'])).mean()
except:
dhs_scores[i] = 0
dhs_scores = pd.Series(dhs_scores).fillna(0)
peaks_df['dhs_scores'] = dhs_scores
return peaks_df
EXPERIMENT = "ENCSR000EDZ"
peaks_df = get_dhs_signal(EXPERIMENT)
from plotnine import *
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png', 'pdf')
%matplotlib inline
(
ggplot(peaks_df)
+ geom_boxplot(aes(x='factor(label)', y='dhs_scores'))
+ theme(axis_line=element_line(size=1),
axis_line_x=element_line(color='black'),
axis_line_y=element_line(color='black'),
axis_text=element_text(margin={'t': 5, 'r': 5}),
axis_text_x=element_text(family="calibri",color='black'),
axis_text_y=element_text(family="calibri",color='black'),
panel_border=element_blank(),
panel_background=element_blank(),
panel_grid_major = element_blank())
+ xlab("group")
+ ylab("DNAse signal\n(fold change)")
+ ggtitle(EXPERIMENT)
)
findfont: Font family ['calibri'] not found. Falling back to DejaVu Sans.
<ggplot: (8768273343861)>
EXPERIMENT = "ENCSR000EEB"
peaks_df = get_dhs_signal(EXPERIMENT)
from plotnine import *
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png', 'pdf')
%matplotlib inline
(
ggplot(peaks_df)
+ geom_boxplot(aes(x='factor(label)', y='dhs_scores'))
+ theme(axis_line=element_line(size=1),
axis_line_x=element_line(color='black'),
axis_line_y=element_line(color='black'),
axis_text=element_text(margin={'t': 5, 'r': 5}),
axis_text_x=element_text(family="calibri",color='black'),
axis_text_y=element_text(family="calibri",color='black'),
panel_border=element_blank(),
panel_background=element_blank(),
panel_grid_major = element_blank())
+ xlab("group")
+ ylab("DNAse signal\n(fold change)")
+ ggtitle(EXPERIMENT)
)
<ggplot: (8768171681445)>
EXPERIMENT = "ENCSR000EEC"
peaks_df = get_dhs_signal(EXPERIMENT)
from plotnine import *
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png', 'pdf')
%matplotlib inline
(
ggplot(peaks_df)
+ geom_boxplot(aes(x='factor(label)', y='dhs_scores'))
+ theme(axis_line=element_line(size=1),
axis_line_x=element_line(color='black'),
axis_line_y=element_line(color='black'),
axis_text=element_text(margin={'t': 5, 'r': 5}),
axis_text_x=element_text(family="calibri",color='black'),
axis_text_y=element_text(family="calibri",color='black'),
panel_border=element_blank(),
panel_background=element_blank(),
panel_grid_major = element_blank())
+ xlab("group")
+ ylab("DNAse signal\n(fold change)")
+ ggtitle(EXPERIMENT)
)
<ggplot: (8768167300109)>