In [1]:
import pandas as pd 
import pysam
import pyBigWig
import numpy as np
In [2]:
def get_dhs_signal(EXPERIMENT):
    overlappingpeaks_df = pd.read_csv("/mnt/lab_data2/vir/tf_chr_atlas/02-24-2021/shap/{}/dhs_overlapping.bed".format(EXPERIMENT), sep='\t', header=None, 
                               names=['Chrom', 'Start', 'End', 'name', 'score',
                                  'strand', 'signal', 'p', 'q', 'summit'])
    overlappingpeaks_df['label'] = 'overlapping'

    nonoverlappingpeaks_df = pd.read_csv("/mnt/lab_data2/vir/tf_chr_atlas/02-24-2021/shap/{}/dhs_nonoverlapping.bed".format(EXPERIMENT), sep='\t', header=None, 
                               names=['Chrom', 'Start', 'End', 'name', 'score',
                                  'strand', 'signal', 'p', 'q', 'summit'])
    nonoverlappingpeaks_df['label'] = 'non-overlapping'

    peaks_df = overlappingpeaks_df.append(nonoverlappingpeaks_df)
    peaks_df = peaks_df.reset_index()


    d = {'sample': ['HepG2'], 'factor': ['DNAse'], 'bw_path' : ['/oak/stanford/groups/akundaje/projects/atlas/dnase_processed/dnase/39e50d95-1423-4dca-acd1-4b685ab94c4c/call-macs2_signal_track/shard-0/execution/ENCSR149XIL.merged.nodup.fc.signal.bigwig']}
    bw_df = pd.DataFrame(d)

    bw = pyBigWig.open(bw_df['bw_path'][0])

    dhs_scores = np.empty(len(peaks_df))
    for i, row in peaks_df.iterrows():
            try:
                dhs_scores[i] = np.asarray(bw.values(row['Chrom'], row['Start'], row['End'])).mean()
            except:
                dhs_scores[i] = 0

    dhs_scores = pd.Series(dhs_scores).fillna(0)

    peaks_df['dhs_scores'] = dhs_scores
    return peaks_df
In [3]:
EXPERIMENT = "ENCSR000EDZ"
peaks_df = get_dhs_signal(EXPERIMENT)

from plotnine import *
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png', 'pdf')

%matplotlib inline

(
    ggplot(peaks_df)
    + geom_boxplot(aes(x='factor(label)', y='dhs_scores'))
    + theme(axis_line=element_line(size=1),
    axis_line_x=element_line(color='black'),
    axis_line_y=element_line(color='black'),

    axis_text=element_text(margin={'t': 5, 'r': 5}),
    axis_text_x=element_text(family="calibri",color='black'),
    axis_text_y=element_text(family="calibri",color='black'),
    panel_border=element_blank(),
    panel_background=element_blank(),
    panel_grid_major = element_blank())
    + xlab("group")
    + ylab("DNAse signal\n(fold change)")
    + ggtitle(EXPERIMENT)

)
findfont: Font family ['calibri'] not found. Falling back to DejaVu Sans.
Out[3]:
<ggplot: (8768273343861)>
In [4]:
EXPERIMENT = "ENCSR000EEB"
peaks_df = get_dhs_signal(EXPERIMENT)

from plotnine import *
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png', 'pdf')

%matplotlib inline

(
    ggplot(peaks_df)
    + geom_boxplot(aes(x='factor(label)', y='dhs_scores'))
    + theme(axis_line=element_line(size=1),
    axis_line_x=element_line(color='black'),
    axis_line_y=element_line(color='black'),

    axis_text=element_text(margin={'t': 5, 'r': 5}),
    axis_text_x=element_text(family="calibri",color='black'),
    axis_text_y=element_text(family="calibri",color='black'),
    panel_border=element_blank(),
    panel_background=element_blank(),
    panel_grid_major = element_blank())
    + xlab("group")
    + ylab("DNAse signal\n(fold change)")
    + ggtitle(EXPERIMENT)

)
Out[4]:
<ggplot: (8768171681445)>
In [5]:
EXPERIMENT = "ENCSR000EEC"
peaks_df = get_dhs_signal(EXPERIMENT)

from plotnine import *
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png', 'pdf')

%matplotlib inline

(
    ggplot(peaks_df)
    + geom_boxplot(aes(x='factor(label)', y='dhs_scores'))
    + theme(axis_line=element_line(size=1),
    axis_line_x=element_line(color='black'),
    axis_line_y=element_line(color='black'),

    axis_text=element_text(margin={'t': 5, 'r': 5}),
    axis_text_x=element_text(family="calibri",color='black'),
    axis_text_y=element_text(family="calibri",color='black'),
    panel_border=element_blank(),
    panel_background=element_blank(),
    panel_grid_major = element_blank())
    + xlab("group")
    + ylab("DNAse signal\n(fold change)")
    + ggtitle(EXPERIMENT)

)
Out[5]:
<ggplot: (8768167300109)>
In [ ]: