Goal¶

analyze workflow data

Conclusions¶

model fit is not as good as it was

Open questions¶

[ ] why is the model fit shitty?
[ ] for Sox2, how much of the IDR peaks are reproduced with previous results?
[ ] how do the results look like if you restrict to only the Sox2 peaks?
[ ] Why are so many Oct4 peaks in the training set?

Browser screenshot: https://epigenomegateway.wustl.edu/browser/?genome=mm10&session=KuA8LTv0cU&statusId=714272935

from basepair.config import get_data_dir, create_tf_session

create_tf_session(1)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-2a497f7352b8> in <module>()
----> 1 create_tf_session(1)

NameError: name 'create_tf_session' is not defined

mdir = "/users/avsec/workspace/basepair-workflow/models/0/"

ls {mdir}

bw/       dataspec.yaml  history.csv   model.h5  preprocessor.pkl
data.pkl  eval/          hparams.yaml  modisco/

ls {mdir}/modisco/test/Sox2/profile

distances.npy  included_samples.npy  kwargs.json  modisco.h5  profile/

import numpy as np

import matplotlib.pyplot as plt

for protein in ['Sox2', 'Oct4']:
    for task in ['profile', 'counts']:
        for split in ['valid', 'test']:
            fig = plt.figure(figsize=(4, 1))
            distances = np.load(f"{mdir}/modisco/{split}/{protein}/{task}/distances.npy")
            plt.hist(distances, bins=50);
            plt.title(f"{protein} {split} {task}")

from basepair.utils import write_pkl, read_pkl

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

train, valid, test = read_pkl(f"{mdir}/data.pkl")

len(train[0])

16915

train[2].task.value_counts()

Oct4    13557
Sox2     3358
Name: task, dtype: int64

valid[2].task.value_counts()

Oct4    4292
Sox2    1036
Name: task, dtype: int64

test[2].task.value_counts()

Oct4    3992
Sox2    1014
Name: task, dtype: int64

from keras.models import load_model

model = load_model(f"{mdir}/model.h5")

WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead

2018-06-17 13:07:51,280 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead

WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

2018-06-17 13:07:58,851 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

from basepair.plots import Seq2Nexus

from basepair.math import softmax

from basepair.plots import *

class Seq2Nexus:
    
    def __init__(self, x, y, df, model, tasks=['sox2', 'oct4']):
        self.x = x
        self.y = y
        self.df = df
        self.labels = df.chr + ":" + df.start.astype(str) + "-" + df.end.astype(str)
        self.model = model
        # Make the prediction
        self.tasks = tasks
        
        self.y_pred = [softmax(p) for p in model.predict(x)]
        self.seq_len = self.y_pred[0].shape[1]
        self.t2i = {k:i for i,k in enumerate(self.tasks)}
        
        
    def input_grad(self, x, strand='pos', task_id=0, seq_grad='max'):
        strand_id = {"pos":0, "neg":1}[strand]
        
        if seq_grad == 'count':
            inp = self.model.inputs[0]
            fn = K.function([inp], K.gradients(self.model.outputs[len(self.tasks)+task_id][:,strand_id], inp))
            return fn([x])[0]
        
        
        if seq_grad =='max':
            sfn = K.max
        elif seq_grad == 'mean':
            sfn = K.mean
        else:
            raise ValueError(f"seq_grad={seq_grad} couldn't be interpreted")
        inp = self.model.inputs[0]
        fn = K.function([inp], K.gradients(sfn(self.model.outputs[task_id][:,:,strand_id], axis=-1), inp))
        return fn([x])[0]
        
    
        
    def plot(self, n=10, kind='test', sort='random', 
             seq_grad='max', figsize=(20,6)):
        import matplotlib.pyplot as plt
         
        if sort=='random':
            idx_list = samplers.random(self.x, n)
        elif "_" in sort:
            kind, task = sort.split("_")
            task_id = self.t2i[task]
            if kind == "max":
                idx_list = samplers.top_max_count(self.y["profile/" + task], n)
            elif kind == "sum":
                idx_list = samplers.top_sum_count(self.y["profile/" + task], n)
        else:
            raise ValueError(f"sort={sort} couldn't be interpreted")
               

        # compute grads
        grads = [[self.input_grad(self.x[idx_list], 'pos', i, seq_grad) * self.x[idx_list],
                  self.input_grad(self.x[idx_list], 'neg', i, seq_grad) * self.x[idx_list],
                  self.input_grad(self.x[idx_list], 'pos', i, "count") * self.x[idx_list],
                  self.input_grad(self.x[idx_list], 'neg', i, "count") * self.x[idx_list]]
                  for i in range(len(self.tasks))]

        for i,idx in enumerate(idx_list):
            n = 6
            fig, axes = plt.subplots(n*len(self.tasks), 1, sharex=True, figsize=figsize)

            for tid, task in enumerate(self.tasks):
                axes[0 + n*tid].plot(np.arange(1,self.seq_len+1), self.y["profile/" + task][idx,:,0], label="pos")#
                axes[0 + n*tid].plot(np.arange(1,self.seq_len+1), self.y["profile/" + task][idx,:,1], label="neg")
                axes[0 + n*tid].set_ylabel("Observed")
                axes[0 + n*tid].legend()
                axes[0 + n*tid].set_title('{} {}'.format(task, self.labels.iloc[idx]))
                axes[1 + n*tid].plot(np.arange(1,self.seq_len+1), self.y_pred[tid][idx,:,0], label="pos")#
                axes[1 + n*tid].plot(np.arange(1,self.seq_len+1), self.y_pred[tid][idx,:,1], label="neg")
                axes[1 + n*tid].set_ylabel("Predicted")
                axes[1 + n*tid].legend()
                # ------------------
                seqlogo(grads[tid][0][i], ax=axes[2 + n*tid]);
                axes[2 + n*tid].set_ylabel("Pos. strand")
                seqlogo(grads[tid][1][i], ax=axes[3 + n*tid]);
                axes[3 + n*tid].set_ylabel("Neg. strand")
                seqlogo(grads[tid][2][i], ax=axes[4 + n*tid]);
                axes[4 + n*tid].set_ylabel("Counts Pos")
                seqlogo(grads[tid][3][i], ax=axes[5 + n*tid]);
                axes[5 + n*tid].set_ylabel("Counts Neg")
                x_range = [1, self.seq_len]
                axes[5 + n*tid].set_xticks(list(range(0, self.seq_len, 5)));

sn = Seq2Nexus(test[0], test[1], test[2], model, ["Sox2", "Oct4"])

sn.plot(sort='max_Oct4', figsize=(20,12))

sn.plot(sort='max_Sox2', figsize=(20,12))

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-75-1f8a688cae5d> in <module>()
----> 1 model

NameError: name 'model' is not defined

from basepair.BPNet import BPNetPredictor

bp = BPNetPredictor(model, 
                    fasta_file="/mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta",
                    tasks=["Sox2", "Oct4"],
                    preproc=preproc)

from pysam import FastaFile

fa = FastaFile("/mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta")

fa.close()

import pandas as pd

dfm = pd.concat([train[2], valid[2], test[2]])

bt = BedTool.from_dataframe(dfm.sort_values(["chr", 'start'])[['chr', 'start', 'end']])

dfm = dfm.sort_values(["chr", 'start'])

genome = [(c, l) for c, l in zip(fa.references, fa.lengths)
              if c in list(dfm['chr'].unique())]

list(dfm['chr'].unique())

['chr1',
 'chr10',
 'chr11',
 'chr12',
 'chr13',
 'chr14',
 'chr15',
 'chr16',
 'chr17',
 'chr18',
 'chr19',
 'chr2',
 'chr3',
 'chr4',
 'chr5',
 'chr6',
 'chr7',
 'chr8',
 'chr9',
 'chrX',
 'chrY']

from basepair.cli.export_bw import export_bigwigs

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

export_bigwigs(mdir, f"{mdir}/bw/", profile_grad='max', batch_size=32, gpu=3)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-a7fbfb3e617c> in <module>()
----> 1 export_bigwigs(mdir, f"{mdir}/bw/", profile_grad='max', batch_size=32, gpu=3)

NameError: name 'mdir' is not defined

list(zip(fa.references, fa.lengths))

[('chr1', 195471971),
 ('chr10', 130694993),
 ('chr11', 122082543),
 ('chr12', 120129022),
 ('chr13', 120421639),
 ('chr14', 124902244),
 ('chr15', 104043685),
 ('chr16', 98207768),
 ('chr17', 94987271),
 ('chr18', 90702639),
 ('chr19', 61431566),
 ('chr1_GL456210_random', 169725),
 ('chr1_GL456211_random', 241735),
 ('chr1_GL456212_random', 153618),
 ('chr1_GL456213_random', 39340),
 ('chr1_GL456221_random', 206961),
 ('chr2', 182113224),
 ('chr3', 160039680),
 ('chr4', 156508116),
 ('chr4_GL456216_random', 66673),
 ('chr4_GL456350_random', 227966),
 ('chr4_JH584292_random', 14945),
 ('chr4_JH584293_random', 207968),
 ('chr4_JH584294_random', 191905),
 ('chr4_JH584295_random', 1976),
 ('chr5', 151834684),
 ('chr5_GL456354_random', 195993),
 ('chr5_JH584296_random', 199368),
 ('chr5_JH584297_random', 205776),
 ('chr5_JH584298_random', 184189),
 ('chr5_JH584299_random', 953012),
 ('chr6', 149736546),
 ('chr7', 145441459),
 ('chr7_GL456219_random', 175968),
 ('chr8', 129401213),
 ('chr9', 124595110),
 ('chrM', 16299),
 ('chrUn_GL456239', 40056),
 ('chrUn_GL456359', 22974),
 ('chrUn_GL456360', 31704),
 ('chrUn_GL456366', 47073),
 ('chrUn_GL456367', 42057),
 ('chrUn_GL456368', 20208),
 ('chrUn_GL456370', 26764),
 ('chrUn_GL456372', 28664),
 ('chrUn_GL456378', 31602),
 ('chrUn_GL456379', 72385),
 ('chrUn_GL456381', 25871),
 ('chrUn_GL456382', 23158),
 ('chrUn_GL456383', 38659),
 ('chrUn_GL456385', 35240),
 ('chrUn_GL456387', 24685),
 ('chrUn_GL456389', 28772),
 ('chrUn_GL456390', 24668),
 ('chrUn_GL456392', 23629),
 ('chrUn_GL456393', 55711),
 ('chrUn_GL456394', 24323),
 ('chrUn_GL456396', 21240),
 ('chrUn_JH584304', 114452),
 ('chrX', 171031299),
 ('chrX_GL456233_random', 336933),
 ('chrY', 91744698),
 ('chrY_JH584300_random', 182347),
 ('chrY_JH584301_random', 259875),
 ('chrY_JH584302_random', 155838),
 ('chrY_JH584303_random', 158099)]

bt = BedTool.from_dataframe(train[2][['chr', 'start', 'end']])

bt[0]

Interval(chr5:28398844-28399044)

a = bp.predict([bt[0]])

a[0]['scale_profile']

{'Sox2': array([13.079355, 13.6663  ], dtype=float32),
 'Oct4': array([104.96374, 108.90778], dtype=float32)}

bp.predict_plot([bt[0]])

ls {mdir}

data.pkl       eval/        hparams.yaml  modisco/
dataspec.yaml  history.csv  model.h5      preprocessor.pkl

preproc = read_pkl(f"{mdir}/preprocessor.pkl")

scaler = preproc.objects['profile/Oct4'].steps[1][1]

scaler.mean_

array([4.659818, 4.66212 ], dtype=float32)

scaler.scale_

array([0.70687026, 0.71041757], dtype=float32)

Make the generic plot for the current pipeline¶

predictions
gradients

Overlap the bed files¶

from pybedtools import BedTool
from basepair.datasets import BED_DIR
from basepair.cli.schemas import DataSpec
ds = DataSpec.load(f"{mdir}/dataspec.yaml")
ddir = get_data_dir()

orig_peak = f"{BED_DIR}/Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.narrowPeak"

genome_file = "/mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes "

!mkdir -p {ddir}/processed/chipnexus/basepair-workflow

!bedtools slop -i {ds.task_specs['Sox2'].peaks} -g {genome_file} -l 100 -r 100 > \
    {ddir}/processed/chipnexus/basepair-workflow/sox2.peaks.bed

!head {ddir}/processed/chipnexus/basepair-workflow/sox2.peaks.bed

chrX	143482941	143483142
chr3	122145486	122145687
chr9	102205639	102205840
chr5	28209968	28210169
chr8	86393909	86394110
chr1	134535143	134535344
chr16	84769231	84769432
chr3	108433572	108433773
chr2	52072139	52072340
chrX	76598973	76599174

new_sox2 = BedTool(f"{ddir}/processed/chipnexus/basepair-workflow/sox2.peaks.bed")
old_sox2 = BedTool(orig_peak)

len(new_sox2)

5408

len(new_sox2.intersect(old_sox2, wa=True, u=True))

4584

len(old_sox2)

9396

len(old_sox2.intersect(new_sox2, wa=True, u=True))

4608

The old Sox2 had 2x the number of peaks. Intersection is pretty high.

!wc -l {ddir}/processed/chipnexus/basepair-workflow/sox2.peaks.bed

5408 /users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/basepair-workflow/sox2.peaks.bed

ls

ataqc/                 mm10_klab.tsv
bowtie2_index/         mm10_no_alt_analysis_set_ENCODE.fasta
bwa_index/             mm10_no_alt_analysis_set_ENCODE.fasta.fai
mm10.blacklist.bed.gz  mm10_no_alt_analysis_set_ENCODE.fasta.gz
mm10.chrom.sizes       seq/

!cat {mdir}/dataspec.yaml

task_specs:
  Oct4:
    pos_counts: /users/avsec/workspace/basepair-workflow/data/Oct4/5prime.counts.pos.bw
    neg_counts: /users/avsec/workspace/basepair-workflow/data/Oct4/5prime.counts.neg.bw
    peaks: /users/avsec/workspace/basepair-workflow/data/Oct4/peaks.bed.gz
  Sox2:
    pos_counts: /users/avsec/workspace/basepair-workflow/data/Sox2/5prime.counts.pos.bw
    neg_counts: /users/avsec/workspace/basepair-workflow/data/Sox2/5prime.counts.neg.bw
    peaks: /users/avsec/workspace/basepair-workflow/data/Sox2/peaks.bed.gz
fasta_file: /mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta
path: null

'/users/avsec/workspace/basepair-workflow/data/Sox2/peaks.bed.gz'

ds

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-26-6f4a4004dcca> in <module>()
----> 1 ds

NameError: name 'ds' is not defined

!head {orig_peak}

chr17	17408871	17409072	.	899	.	23.14292	-1.00000	-0.11892	168
chr5	110284652	110284853	.	567	.	23.18326	-1.00000	-0.11992	168
chr1	57780110	57780311	.	1000	.	23.32034	-1.00000	-0.12406	168
chr6	94183931	94184132	.	1000	.	23.35388	-1.00000	-0.12534	168
chr5	121537340	121537541	.	817	.	23.35671	-1.00000	-0.12541	168
chr8	67966522	67966723	.	585	.	23.41701	-1.00000	-0.12709	168
chr11	68348387	68348588	.	1000	.	23.53622	-1.00000	-0.13079	168
chr1	9955253	9955454	.	1000	.	23.66639	-1.00000	-0.13534	168
chr10	123087061	123087262	.	830	.	23.75985	-1.00000	-0.13821	168
chr2	18567730	18567931	.	1000	.	23.80058	-1.00000	-0.13973	168

Goal¶

Conclusions¶

Open questions¶

Make the generic plot for the current pipeline¶

Overlap the bed files¶

Question: is is the peak caller or the tracks?¶