In [17]:
#!/usr/bin/env python
"""Generate all the reqired data
"""
import os
import pandas as pd
import types
import numpy as np
from basepair.seqmodel import SeqModel
from collections import OrderedDict
from basepair.preproc import rc_seq
from basepair.exp.chipnexus.simulate import (insert_motif, generate_sim, plot_sim, generate_seq,
                                             model2tasks, motif_coords, interactive_tracks, plot_motif_table,
                                             plot_sim_motif_col)
from concise.preprocessing import encodeDNA
from basepair.exp.paper.config import models_dir
from basepair.config import get_data_dir, create_tf_session
from basepair.utils import write_pkl
import argparse
import logging
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


# --------------------------------------------
# Motif configuration
side_motifs = OrderedDict([
    ("Oct4-Sox2", ("m0_p0", "TTTGCATAACAA")),
    ("Sox2", ("m0_p2", "CCATTGTT")),
    #("Err2", ("m0_p1", "TCAAGGTCA")),
    ("Nanog", ("m0_p2", "TTGATGGC")),
    ("Klf4", ("m2_p0", "GGGTGTGG")),
])

# get also the rc motifs
rc_side_motifs = OrderedDict([
    (m + "/rc", (v[0], rc_seq(v[1])))
    for m, v in side_motifs.items()
])
all_side_motifs = OrderedDict(list(side_motifs.items()) + list(rc_side_motifs.items()))

center_coords = [485, 520]
central_motifs = [
    ("Oct4", "TTTGCATAACAA"),
    ("Sox2", "CCATTGTT"),
    ("Nanog", "TTGATGGC"),
    ("Klf4", "GGGTGTGG"),
]

# --------------------------------------------
In [3]:
ddir = get_data_dir()
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'
repeat = 128

create_tf_session(1)
# create the output path
cache_path = f"{ddir}/cache/chipnexus/{exp}/simulation/spacing.pkl"
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
In [4]:
# load the model
model_dir = models_dir / exp
model = SeqModel.from_mdir(model_dir)
# add the method
model.sim_pred = types.MethodType(sim_pred, model)
model.input_seqlen = types.MethodType(input_seqlen, model)

logger.info("Creating the output directory")

df_d = {}
res_dict_d = {}
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2019-03-23 09:46:17,477 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2019-03-23 09:46:25,001 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2019-03-23 09:46:29,895 [INFO] Creating the output directory
In [18]:
from basepair.BPNet import BPNetSeqModel
In [19]:
bpnet = BPNetSeqModel(model)
In [5]:
central_motif_name, central_motif = central_motifs[0]
In [8]:
motif, (pattern, side_motif) = list(all_side_motifs.items())[0]
In [9]:
%load_ext line_profiler
In [26]:
# get the motifs
%lprun -f bpnet.sim_pred generate_sim(bpnet, central_motif, side_motif, list(range(511, 511 + 5, 1)), center_coords=center_coords, repeat=repeat,importance=[])
100%|██████████| 5/5 [00:01<00:00,  3.38it/s]
Timer unit: 1e-06 s

Total time: 1.75906 s
File: /users/avsec/workspace/basepair/basepair/BPNet.py
Function: sim_pred at line 810

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
   810                                               def sim_pred(self, central_motif, side_motif=None, side_distances=[], repeat=128, importance=[]):
   811                                                   """
   812                                                   Args:
   813                                                     importance: list of importance scores
   814                                                   """
   815         6         82.0     13.7      0.0          from basepair.exp.chipnexus.simulate import generate_seq, average_profiles, flatten
   816         6          9.0      1.5      0.0          batch_size = repeat
   817         6         12.0      2.0      0.0          seqlen = self.seqmodel.seqlen
   818         6          9.0      1.5      0.0          tasks = self.seqmodel.tasks
   819                                           
   820                                                   # simulate sequence
   821         6         14.0      2.3      0.0          seqs = encodeDNA([generate_seq(central_motif, side_motif=side_motif,
   822                                                                                  side_distances=side_distances, seqlen=seqlen)
   823         6     853696.0 142282.7     48.5                            for i in range(repeat)])
   824                                           
   825                                                   # get predictions
   826         6     899545.0 149924.2     51.1          scaled_preds = self.predict(seqs, batch_size=batch_size)
   827                                           
   828         6         21.0      3.5      0.0          if importance:
   829                                                       # get the importance scores (compute only the profile and counts importance)
   830                                                       imp_scores_all = self.seqmodel.imp_score_all(seqs, intp_pattern=['*/profile/wn', '*/counts/pre-act'])
   831                                                       imp_scores = {t: {self._get_old_imp_score_name(imp_score_name): seqs * imp_scores_all[f'{t}/{imp_score_name}']
   832                                                                         for imp_score_name in importance}
   833                                                                     for t in tasks}
   834                                           
   835                                                       # merge and aggregate the profiles
   836                                                       out = {"imp": imp_scores, "profile": scaled_preds}
   837                                                   else:
   838         6         12.0      2.0      0.0              out = {"profile": scaled_preds}
   839         6       5664.0    944.0      0.3          return average_profiles(flatten(out, "/"))