target = "max_hela_1"
import os
import re
import json
import gzip
import codecs
import math
import os
import sys
import pickle
import numpy as np
from math import exp
from scipy import stats
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from vizsequence.viz_sequence import plot_weights_given_ax
from scipy.special import softmax
import keras
import keras.losses
from keras.models import Model, Sequential, load_model
from keras import backend as K
import numpy.random as rng
import seaborn as sns
from collections import OrderedDict
from basepair.losses import twochannel_multinomial_nll
import modisco
import modisco.tfmodisco_workflow.workflow
import h5py
import modisco.util
from collections import Counter
from modisco.visualization import viz_sequence
import modisco.affinitymat.core
import modisco.cluster.phenograph.core
import modisco.cluster.phenograph.cluster
import modisco.cluster.core
import modisco.aggregator
%matplotlib inline
from math import log, ceil
import numpy as np
import modisco
import modisco.tfmodisco_workflow.workflow
from modisco.tfmodisco_workflow import workflow
import h5py
import pandas as pd
import modisco.util
import keras
import keras.layers as kl
from keras import backend as K
import tensorflow as tf
import tensorflow_probability as tfp
from keras.models import load_model
import keras_genomics
from keras_genomics.layers.convolutional import RevCompConv1D
from keras.utils import CustomObjectScope
from collections import Counter
from modisco.visualization import viz_sequence
from deeplift.dinuc_shuffle import dinuc_shuffle
import modisco.affinitymat.core
import modisco.cluster.phenograph.core
import modisco.cluster.phenograph.cluster
import modisco.cluster.core
import modisco.aggregator
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr, pearsonr, gaussian_kde
font = {'weight' : 'bold', 'size' : 14}
MoDISco
post_hypimps = np.load("/oak/stanford/groups/akundaje/amr1/pho4_final/models/imp-scores/"+target+"/post_profile_hypimps.npy")
post_actualimps = np.load("/oak/stanford/groups/akundaje/amr1/pho4_final/models/imp-scores/"+target+"/post_profile_actualimps.npy")
modisco_seqs = np.load("/oak/stanford/groups/akundaje/amr1/pho4_final/models/imp-scores/"+target+"/seqs.npy")
grp = h5py.File("/oak/stanford/groups/akundaje/amr1/pho4_final/models/modisco/"+target+"/profile_results.hdf5","r")
print(post_actualimps.shape, post_hypimps.shape, modisco_seqs.shape)
track_set = modisco.tfmodisco_workflow.workflow.prep_track_set(
task_names=["task0_profile"], contrib_scores={"task0_profile": post_actualimps},
hypothetical_contribs={"task0_profile": post_hypimps}, one_hot=modisco_seqs)
loaded_tfmodisco_results = workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set)
profile_hdf5_results = grp
activity_patterns = np.array(profile_hdf5_results['metaclustering_results']['attribute_vectors'])[
np.array(
[x[0] for x in sorted(
enumerate(profile_hdf5_results['metaclustering_results']['metacluster_indices']),
key=lambda x: x[1])])]
plt.figure()
sns.heatmap(activity_patterns, center=0)
plt.show()
metacluster_names = [
x.decode("utf-8") for x in
list(profile_hdf5_results["metaclustering_results"]
["all_metacluster_names"][:])]
all_patterns = []
for metacluster_name in metacluster_names:
print("\n"+metacluster_name+"\n")
metacluster_grp = (profile_hdf5_results["metacluster_idx_to_submetacluster_results"]
[metacluster_name])
all_pattern_names = [x.decode("utf-8") for x in
list(metacluster_grp["seqlets_to_patterns_result"]
["patterns"]["all_pattern_names"][:])]
for pattern_name in all_pattern_names:
all_patterns.append((metacluster_name, pattern_name))
pattern = metacluster_grp["seqlets_to_patterns_result"]["patterns"][pattern_name]
print("\n"+pattern_name+": "+str(len(pattern["seqlets_and_alnmts"]["seqlets"]))+" seqlets\n")
fig, ax = plt.subplots(4, 1, figsize=(30,20))
background = np.array([0.27, 0.23, 0.23, 0.27])
plot_weights_given_ax(ax[0], pattern["task0_hypothetical_contribs"]["fwd"])
ax[0].set_title("hypothetical scores")
plot_weights_given_ax(ax[1], pattern["task0_contrib_scores"]["fwd"])
ax[1].set_title("actual importance scores")
plot_weights_given_ax(ax[2], viz_sequence.ic_scale(np.array(pattern["sequence"]["fwd"]),
background=background))
ax[2].set_title("onehot fwd")
plot_weights_given_ax(ax[3], viz_sequence.ic_scale(np.array(pattern["sequence"]["rev"]),
background=background))
ax[3].set_title("onehot rev")
plt.show()
pattern_list = loaded_tfmodisco_results.metacluster_idx_to_submetacluster_results["metacluster_0"].seqlets_to_patterns_result.patterns
for idx, untrimmed_pattern in enumerate(pattern_list):
if idx == 2:
start = 22
end = 32
elif idx == 3:
start = 16
end = 29
else: continue
cwm = untrimmed_pattern["task0_profile_hypothetical_contribs"].fwd[start:end]
matplotlib.rc('font', **font)
fig = plt.figure(figsize=(8,5), dpi=300)
ax = fig.add_subplot(111)
viz_sequence.plot_weights_given_ax(ax, cwm,
height_padding_factor=0.2,
length_padding=1.0,
subticks_frequency=1.0,
highlight={})
plt.show()
fig.savefig('profile_cwm_'+str(idx)+'.eps', dpi=300, format='eps')
post_hypimps = np.load("/oak/stanford/groups/akundaje/amr1/pho4_final/models/imp-scores/"+target+"/post_counts_hypimps.npy")
post_actualimps = np.load("/oak/stanford/groups/akundaje/amr1/pho4_final/models/imp-scores/"+target+"/post_counts_actualimps.npy")
modisco_seqs = np.load("/oak/stanford/groups/akundaje/amr1/pho4_final/models/imp-scores/"+target+"/seqs.npy")
grp = h5py.File("/oak/stanford/groups/akundaje/amr1/pho4_final/models/modisco/"+target+"/counts_results.hdf5","r")
print(post_actualimps.shape, post_hypimps.shape, modisco_seqs.shape)
track_set = modisco.tfmodisco_workflow.workflow.prep_track_set(
task_names=["task0_counts"], contrib_scores={"task0_counts": post_actualimps},
hypothetical_contribs={"task0_counts": post_hypimps}, one_hot=modisco_seqs)
loaded_tfmodisco_results = workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set)
profile_hdf5_results = grp
activity_patterns = np.array(profile_hdf5_results['metaclustering_results']['attribute_vectors'])[
np.array(
[x[0] for x in sorted(
enumerate(profile_hdf5_results['metaclustering_results']['metacluster_indices']),
key=lambda x: x[1])])]
plt.figure()
sns.heatmap(activity_patterns, center=0)
plt.show()
metacluster_names = [
x.decode("utf-8") for x in
list(profile_hdf5_results["metaclustering_results"]
["all_metacluster_names"][:])]
all_patterns = []
for metacluster_name in metacluster_names:
print("\n"+metacluster_name+"\n")
metacluster_grp = (profile_hdf5_results["metacluster_idx_to_submetacluster_results"]
[metacluster_name])
all_pattern_names = [x.decode("utf-8") for x in
list(metacluster_grp["seqlets_to_patterns_result"]
["patterns"]["all_pattern_names"][:])]
for pattern_name in all_pattern_names:
all_patterns.append((metacluster_name, pattern_name))
pattern = metacluster_grp["seqlets_to_patterns_result"]["patterns"][pattern_name]
print("\n"+pattern_name+": "+str(len(pattern["seqlets_and_alnmts"]["seqlets"]))+" seqlets\n")
fig, ax = plt.subplots(4, 1, figsize=(30,20))
background = np.array([0.27, 0.23, 0.23, 0.27])
plot_weights_given_ax(ax[0], pattern["task0_hypothetical_contribs"]["fwd"])
ax[0].set_title("hypothetical scores")
plot_weights_given_ax(ax[1], pattern["task0_contrib_scores"]["fwd"])
ax[1].set_title("actual importance scores")
plot_weights_given_ax(ax[2], viz_sequence.ic_scale(np.array(pattern["sequence"]["fwd"]),
background=background))
ax[2].set_title("onehot fwd")
plot_weights_given_ax(ax[3], viz_sequence.ic_scale(np.array(pattern["sequence"]["rev"]),
background=background))
ax[3].set_title("onehot rev")
plt.show()
pattern_list = loaded_tfmodisco_results.metacluster_idx_to_submetacluster_results["metacluster_1"].seqlets_to_patterns_result.patterns
for idx, untrimmed_pattern in enumerate(pattern_list):
if idx == 0:
start = 22
end = 32
elif idx == 1:
start = 22
end = 32
elif idx == 2:
start = 18
end = 28
elif idx == 3:
start = 21
end = 31
else: continue
cwm = untrimmed_pattern["task0_counts_hypothetical_contribs"].fwd[start:end]
matplotlib.rc('font', **font)
fig = plt.figure(figsize=(8,5), dpi=300)
ax = fig.add_subplot(111)
viz_sequence.plot_weights_given_ax(ax, cwm,
height_padding_factor=0.2,
length_padding=1.0,
subticks_frequency=1.0,
highlight={})
plt.show()
fig.savefig('counts_cwm_'+str(idx)+'.eps', dpi=300, format='eps')