In [4]:
# Experiment/model identifier; every cell below concatenates this into the
# hardcoded absolute cluster paths for importance scores and MoDISco results.
target = "max_hela_1"
In [2]:
import os
import re
import json
import gzip
import codecs
import math
import os
import sys
import pickle
import numpy as np
from math import exp
from scipy import stats
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from vizsequence.viz_sequence import plot_weights_given_ax
from scipy.special import softmax
import keras
import keras.losses
from keras.models import Model, Sequential, load_model
from keras import backend as K
import numpy.random as rng
import seaborn as sns
from collections import OrderedDict
from basepair.losses import twochannel_multinomial_nll
import modisco
import modisco.tfmodisco_workflow.workflow
import h5py
import modisco.util
from collections import Counter
from modisco.visualization import viz_sequence
import modisco.affinitymat.core
import modisco.cluster.phenograph.core
import modisco.cluster.phenograph.cluster
import modisco.cluster.core
import modisco.aggregator
%matplotlib inline
from math import log, ceil
import numpy as np
import modisco
import modisco.tfmodisco_workflow.workflow
from modisco.tfmodisco_workflow import workflow
import h5py
import pandas as pd
import modisco.util
import keras
import keras.layers as kl
from keras import backend as K 
import tensorflow as tf
import tensorflow_probability as tfp
from keras.models import load_model
import keras_genomics
from keras_genomics.layers.convolutional import RevCompConv1D
from keras.utils import CustomObjectScope
from collections import Counter
from modisco.visualization import viz_sequence
from deeplift.dinuc_shuffle import dinuc_shuffle
import modisco.affinitymat.core
import modisco.cluster.phenograph.core
import modisco.cluster.phenograph.cluster
import modisco.cluster.core
import modisco.aggregator
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr, pearsonr, gaussian_kde
# Global font settings applied to matplotlib figures via matplotlib.rc below.
font = dict(weight="bold", size=14)
Using TensorFlow backend.
/users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
/users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
/users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
/users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
/users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
/users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
/users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
/users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
/users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/concise/utils/plot.py:115: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  min_coords = np.vstack(data.min(0) for data in polygons_data).min(0)
/users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/concise/utils/plot.py:116: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  max_coords = np.vstack(data.max(0) for data in polygons_data).max(0)
WARNING:tensorflow:From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

2021-12-08 22:21:26,047 [WARNING] From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

TF-MoDISco is using the TensorFlow backend.

MoDISco

In [3]:
# Load the post-filtering profile-head importance scores (hypothetical and
# actual contributions) plus the one-hot sequences that the TF-MoDISco run
# used, then rebuild the track set so the saved HDF5 results can be
# re-attached to their underlying data.
imp_dir = "/oak/stanford/groups/akundaje/amr1/pho4_final/models/imp-scores/" + target + "/"
post_hypimps = np.load(imp_dir + "post_profile_hypimps.npy")
post_actualimps = np.load(imp_dir + "post_profile_actualimps.npy")
modisco_seqs = np.load(imp_dir + "seqs.npy")
grp = h5py.File("/oak/stanford/groups/akundaje/amr1/pho4_final/models/modisco/" + target + "/profile_results.hdf5", "r")
print(post_actualimps.shape, post_hypimps.shape, modisco_seqs.shape)
track_set = modisco.tfmodisco_workflow.workflow.prep_track_set(
    task_names=["task0_profile"],
    contrib_scores={"task0_profile": post_actualimps},
    hypothetical_contribs={"task0_profile": post_hypimps},
    one_hot=modisco_seqs)
loaded_tfmodisco_results = workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set)
(20438, 1346, 4) (20438, 1346, 4) (20438, 1346, 4)
In [4]:
# Visualize the profile-head MoDISco results: first a heatmap of the
# metaclustering attribute vectors with rows ordered by metacluster
# assignment, then, for every pattern in every metacluster, its hypothetical
# and actual importance tracks plus IC-scaled fwd/rev one-hot sequence logos.
profile_hdf5_results = grp
metacluster_indices = np.array(
    profile_hdf5_results['metaclustering_results']['metacluster_indices'])
# Stable argsort reproduces the original sorted(enumerate(...), key=...) order,
# including tie order, but is much easier to read.
row_order = np.argsort(metacluster_indices, kind="stable")
activity_patterns = np.array(
    profile_hdf5_results['metaclustering_results']['attribute_vectors'])[row_order]
plt.figure()
sns.heatmap(activity_patterns, center=0)
plt.show()
metacluster_names = [
    x.decode("utf-8") for x in
    list(profile_hdf5_results["metaclustering_results"]
         ["all_metacluster_names"][:])]
# Background base frequencies used for IC scaling (presumably A, C, G, T order
# -- TODO confirm against viz_sequence).  Hoisted out of the loop: it never
# changes between iterations.
background = np.array([0.27, 0.23, 0.23, 0.27])
all_patterns = []
for metacluster_name in metacluster_names:
    print("\n"+metacluster_name+"\n")
    metacluster_grp = (profile_hdf5_results["metacluster_idx_to_submetacluster_results"]
                                   [metacluster_name])
    all_pattern_names = [x.decode("utf-8") for x in
                         list(metacluster_grp["seqlets_to_patterns_result"]
                                             ["patterns"]["all_pattern_names"][:])]
    for pattern_name in all_pattern_names:
        all_patterns.append((metacluster_name, pattern_name))
        pattern = metacluster_grp["seqlets_to_patterns_result"]["patterns"][pattern_name]
        print("\n"+pattern_name+": "+str(len(pattern["seqlets_and_alnmts"]["seqlets"]))+" seqlets\n")
        fig, ax = plt.subplots(4, 1, figsize=(30, 20))
        plot_weights_given_ax(ax[0], pattern["task0_hypothetical_contribs"]["fwd"])
        ax[0].set_title("hypothetical scores")
        plot_weights_given_ax(ax[1], pattern["task0_contrib_scores"]["fwd"])
        ax[1].set_title("actual importance scores")
        plot_weights_given_ax(ax[2], viz_sequence.ic_scale(np.array(pattern["sequence"]["fwd"]),
                                                           background=background))
        ax[2].set_title("onehot fwd")
        plot_weights_given_ax(ax[3], viz_sequence.ic_scale(np.array(pattern["sequence"]["rev"]),
                                                           background=background))
        ax[3].set_title("onehot rev")
        plt.show()
        # Close each figure after display; this loop creates ~20 large
        # figures and leaving them open accumulates memory in the kernel.
        plt.close(fig)
metacluster_0


pattern_0: 1285 seqlets

pattern_1: 1112 seqlets

pattern_2: 962 seqlets

pattern_3: 573 seqlets

pattern_4: 444 seqlets

pattern_5: 371 seqlets

pattern_6: 225 seqlets

pattern_7: 240 seqlets

pattern_8: 193 seqlets

pattern_9: 189 seqlets

pattern_10: 199 seqlets

pattern_11: 167 seqlets

pattern_12: 131 seqlets

pattern_13: 126 seqlets

pattern_14: 98 seqlets

metacluster_1


pattern_0: 768 seqlets

pattern_1: 598 seqlets

pattern_2: 601 seqlets

pattern_3: 232 seqlets

pattern_4: 89 seqlets

pattern_5: 98 seqlets

In [5]:
# Trim selected profile-head metacluster_0 patterns to a manually chosen core
# motif window and save each trimmed hypothetical-contribution CWM as an EPS.
pattern_list = loaded_tfmodisco_results.metacluster_idx_to_submetacluster_results["metacluster_0"].seqlets_to_patterns_result.patterns
# Hand-picked (start, end) trim windows keyed by pattern index; patterns not
# listed are skipped.  NOTE(review): windows look hand-tuned from the logos
# above -- confirm if patterns change.
trim_windows = {2: (22, 32), 3: (16, 29)}
# Font config is loop-invariant; set it once instead of every iteration.
matplotlib.rc('font', **font)
for idx, untrimmed_pattern in enumerate(pattern_list):
    if idx not in trim_windows:
        continue
    start, end = trim_windows[idx]
    cwm = untrimmed_pattern["task0_profile_hypothetical_contribs"].fwd[start:end]
    fig = plt.figure(figsize=(8, 5), dpi=300)
    ax = fig.add_subplot(111)
    viz_sequence.plot_weights_given_ax(ax, cwm,
                                       height_padding_factor=0.2,
                                       length_padding=1.0,
                                       subticks_frequency=1.0,
                                       highlight={})
    plt.show()
    fig.savefig('profile_cwm_'+str(idx)+'.eps', dpi=300, format='eps')
    plt.close(fig)  # free the figure after saving
In [3]:
# Load the post-filtering counts-head importance scores (hypothetical and
# actual contributions) plus the one-hot sequences, then rebuild the track
# set so the saved counts-head HDF5 results can be re-attached to their data.
imp_dir = "/oak/stanford/groups/akundaje/amr1/pho4_final/models/imp-scores/" + target + "/"
post_hypimps = np.load(imp_dir + "post_counts_hypimps.npy")
post_actualimps = np.load(imp_dir + "post_counts_actualimps.npy")
modisco_seqs = np.load(imp_dir + "seqs.npy")
grp = h5py.File("/oak/stanford/groups/akundaje/amr1/pho4_final/models/modisco/" + target + "/counts_results.hdf5", "r")
print(post_actualimps.shape, post_hypimps.shape, modisco_seqs.shape)
track_set = modisco.tfmodisco_workflow.workflow.prep_track_set(
    task_names=["task0_counts"],
    contrib_scores={"task0_counts": post_actualimps},
    hypothetical_contribs={"task0_counts": post_hypimps},
    one_hot=modisco_seqs)
loaded_tfmodisco_results = workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set)
# Visualize the counts-head MoDISco results: metaclustering heatmap followed
# by per-pattern importance tracks and IC-scaled fwd/rev sequence logos.
# (Same procedure as the profile-head cell above, applied to counts_results.)
profile_hdf5_results = grp
metacluster_indices = np.array(
    profile_hdf5_results['metaclustering_results']['metacluster_indices'])
# Stable argsort reproduces the original sorted(enumerate(...), key=...) order,
# including tie order, but is much easier to read.
row_order = np.argsort(metacluster_indices, kind="stable")
activity_patterns = np.array(
    profile_hdf5_results['metaclustering_results']['attribute_vectors'])[row_order]
plt.figure()
sns.heatmap(activity_patterns, center=0)
plt.show()
metacluster_names = [
    x.decode("utf-8") for x in
    list(profile_hdf5_results["metaclustering_results"]
         ["all_metacluster_names"][:])]
# Background base frequencies used for IC scaling (presumably A, C, G, T order
# -- TODO confirm against viz_sequence).  Hoisted out of the loop: it never
# changes between iterations.
background = np.array([0.27, 0.23, 0.23, 0.27])
all_patterns = []
for metacluster_name in metacluster_names:
    print("\n"+metacluster_name+"\n")
    metacluster_grp = (profile_hdf5_results["metacluster_idx_to_submetacluster_results"]
                                   [metacluster_name])
    all_pattern_names = [x.decode("utf-8") for x in
                         list(metacluster_grp["seqlets_to_patterns_result"]
                                             ["patterns"]["all_pattern_names"][:])]
    for pattern_name in all_pattern_names:
        all_patterns.append((metacluster_name, pattern_name))
        pattern = metacluster_grp["seqlets_to_patterns_result"]["patterns"][pattern_name]
        print("\n"+pattern_name+": "+str(len(pattern["seqlets_and_alnmts"]["seqlets"]))+" seqlets\n")
        fig, ax = plt.subplots(4, 1, figsize=(30, 20))
        plot_weights_given_ax(ax[0], pattern["task0_hypothetical_contribs"]["fwd"])
        ax[0].set_title("hypothetical scores")
        plot_weights_given_ax(ax[1], pattern["task0_contrib_scores"]["fwd"])
        ax[1].set_title("actual importance scores")
        plot_weights_given_ax(ax[2], viz_sequence.ic_scale(np.array(pattern["sequence"]["fwd"]),
                                                           background=background))
        ax[2].set_title("onehot fwd")
        plot_weights_given_ax(ax[3], viz_sequence.ic_scale(np.array(pattern["sequence"]["rev"]),
                                                           background=background))
        ax[3].set_title("onehot rev")
        plt.show()
        # Close each figure after display; this loop creates ~20 large
        # figures and leaving them open accumulates memory in the kernel.
        plt.close(fig)
(20438, 1346, 4) (20438, 1346, 4) (20438, 1346, 4)
metacluster_0


pattern_0: 875 seqlets

pattern_1: 752 seqlets

pattern_2: 560 seqlets

pattern_3: 522 seqlets

pattern_4: 245 seqlets

pattern_5: 193 seqlets

pattern_6: 132 seqlets

pattern_7: 124 seqlets

pattern_8: 121 seqlets

pattern_9: 184 seqlets

pattern_10: 73 seqlets

pattern_11: 69 seqlets

metacluster_1


pattern_0: 2233 seqlets

pattern_1: 1936 seqlets

pattern_2: 1922 seqlets

pattern_3: 479 seqlets

pattern_4: 308 seqlets

pattern_5: 317 seqlets

pattern_6: 287 seqlets

pattern_7: 263 seqlets

pattern_8: 157 seqlets

pattern_9: 120 seqlets

In [7]:
# Trim selected counts-head metacluster_1 patterns to a manually chosen core
# motif window and save each trimmed hypothetical-contribution CWM as an EPS.
pattern_list = loaded_tfmodisco_results.metacluster_idx_to_submetacluster_results["metacluster_1"].seqlets_to_patterns_result.patterns
# Hand-picked (start, end) trim windows keyed by pattern index; patterns not
# listed are skipped.  NOTE(review): windows look hand-tuned from the logos
# above -- confirm if patterns change.
trim_windows = {0: (22, 32), 1: (22, 32), 2: (18, 28), 3: (21, 31)}
# Font config is loop-invariant; set it once instead of every iteration.
matplotlib.rc('font', **font)
for idx, untrimmed_pattern in enumerate(pattern_list):
    if idx not in trim_windows:
        continue
    start, end = trim_windows[idx]
    cwm = untrimmed_pattern["task0_counts_hypothetical_contribs"].fwd[start:end]
    fig = plt.figure(figsize=(8, 5), dpi=300)
    ax = fig.add_subplot(111)
    viz_sequence.plot_weights_given_ax(ax, cwm,
                                       height_padding_factor=0.2,
                                       length_padding=1.0,
                                       subticks_frequency=1.0,
                                       highlight={})
    plt.show()
    fig.savefig('counts_cwm_'+str(idx)+'.eps', dpi=300, format='eps')
    plt.close(fig)  # free the figure after saving