%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from collections import OrderedDict
from matlas.pwms import ic_scale, adjust_for_ic_scale
from matlas.matches import vdom_pssm
from modisco.visualization import viz_sequence
from matlas.matches import vdom_pssm
motif_name = 'GABPA'
top_k_filters = OrderedDict()
filt_infl_from_loss = OrderedDict()
# For each cross-validation fold, load the per-filter influence scores
# and record the 15 most influential filter indices (strongest first).
# NOTE: `aitacdir` deliberately stays bound after the loop (last fold);
# later cells read it.
for fold_no in range(1, 11):
    aitacdir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/alex_bpnets/{0}/{1}".format(motif_name, fold_no)
    influence = np.load(aitacdir + "/filt_infl_from_loss.npy")
    filt_infl_from_loss[fold_no] = influence
    top_k_filters[fold_no] = np.flip(np.argsort(influence))[:15]
    print(fold_no, top_k_filters[fold_no])
# import numpy as np
# import seaborn as sns
# motif_name = 'GABPA'
# filter_weights = np.load("/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/alex_bpnets/filters/{0}/{0}_filters.npy".format(
# motif_name))
# filter_weights.shape
from matplotlib import pyplot as plt
from matlas.pwms import ic_scale, adjust_for_ic_scale
from matlas.matches import vdom_pssm
from modisco.visualization import viz_sequence
from matlas.matches import vdom_pssm
def draw_filter_weights(start_filter, end_filter):
    """Draw heatmaps of convolutional filter weights on a 4x4 grid of axes.

    Plots filters in the half-open range [start_filter, end_filter) from the
    module-level `filter_weights` array (assumed indexed as
    [position, base, filter] — TODO confirm against the loading cell).
    Each panel shows the transposed 4 x length weight matrix with the
    argmax base per position as the x tick label.

    Parameters
    ----------
    start_filter, end_filter : int
        Filter index range to draw; at most 16 filters fit on the grid.

    Returns
    -------
    None
    """
    alphabets = np.array(['A', 'C', 'G', 'T'])
    fig, axes = plt.subplots(4, 4, figsize=(30, 10))
    for i in range(start_filter, end_filter):
        a = np.copy(filter_weights[:, :, i].T)
        # BUG FIX: the column was previously computed as `i % 4` while the
        # row used `(i - start_filter) // 4`, so panels were misplaced
        # whenever start_filter was not a multiple of 4.  Both coordinates
        # are now relative to start_filter.
        offset = i - start_filter
        r, c = offset // 4, offset % 4
        ax = sns.heatmap(a, yticklabels=alphabets,
                         xticklabels=alphabets[np.argmax(a, axis=0)],
                         ax=axes[r, c])
        ax.set_ylabel("filter{}".format(i))
    return None
from modisco.visualization import viz_sequence
# Visualise raw and mean-normalised weights of each fold's top filters.
# BUG FIX: the inner loop previously iterated `top_k_filters` directly,
# which walks the OrderedDict KEYS (fold numbers 1..10) and used those as
# filter indices into `filter_weights`; it now iterates the top filter
# indices of the current fold.  The trailing `break` (kept from the
# original) stops after the first fold.
for fold_no in range(1, 11):
    for i in top_k_filters[fold_no]:
        print('filter', i)
        mean_norm_weights = np.copy(filter_weights[:, :, i])
        # Centre each position's weights across the four bases.
        mean_norm_weights = mean_norm_weights - mean_norm_weights.mean(1, keepdims=True)
        print('raw weights')
        viz_sequence.plot_weights(filter_weights[:, :, i])
        print('mean normalized weights')
        viz_sequence.plot_weights(mean_norm_weights)
    break
from matlas.matches import vdom_pssm
from IPython.display import display
from vdom.helpers import (h1, p, li, img, div, b, br, ul, img, a,
                          details, summary,
                          table, thead, th, tr, tbody, td, ol)
# Build a collapsible vdom report: for each selected filter, show the raw
# weight logo and the mean-normalised (per-position centred) logo.
# NOTE(review): `for i in top_k_filters` walks the OrderedDict KEYS
# (fold numbers 1..10) and uses them as filter indices into
# `filter_weights` — presumably `top_k_filters[fold_no]` was intended;
# verify before trusting this report.
items_to_display = []
for i in top_k_filters:
    filter_name = 'filter{}'.format(i)
    mean_norm_weights = np.copy(filter_weights[:, :, i])
    # Centre each position's weights by subtracting the per-position mean.
    mean_norm_weights = mean_norm_weights - mean_norm_weights.mean(1, keepdims=True)
    un_query = vdom_pssm(filter_weights[:, :, i])
    n_query = vdom_pssm(mean_norm_weights)
    items_to_display.append(summary(b(filter_name), p('raw weights'), un_query, 'mean normalized', n_query))
    #break
display(details(summary(b("weights of selected filters")), summary(items_to_display)))
from matlas.pwms import ic_scale, adjust_for_ic_scale
from matlas.matches import vdom_pssm
from sklearn.preprocessing import MinMaxScaler
from collections import OrderedDict
def ic_scale_filter_weights(filter_weights, filter_count=64):
    """Convert raw filter weights into information-content-scaled PWMs.

    Pipeline: globally z-score all weights and clip to [-2, 2]; then per
    filter rescale each column to [0, 1] (MinMaxScaler), normalise each
    row (position) across the 4 bases to probabilities with a 0.01
    pseudo-count, and apply `ic_scale`.

    Parameters
    ----------
    filter_weights : np.ndarray
        Weight tensor, assumed indexed as [position, base, filter] —
        TODO confirm against the (commented-out) loading cell.
    filter_count : int
        Number of filters to process (default 64).

    Returns
    -------
    OrderedDict mapping 'filter<i>' -> IC-scaled matrix.
    NOTE(review): if `ic_scale` raises AssertionError the function instead
    returns the offending probability matrix directly (a debugging aid),
    so callers can receive an inconsistent return type.
    """
    norm_weights = np.copy(filter_weights)
    # Global z-score, clipped at +/-2 sd to damp outlier weights.
    norm_weights = (norm_weights - np.mean(norm_weights)) / np.std(norm_weights)
    norm_weights[norm_weights > 2] = 2.0
    norm_weights[norm_weights < -2] = -2.0
    ic_scaled_weights = OrderedDict()
    for i in range(filter_count):
        weights = np.copy(norm_weights[:, :, i])
        # Per-column min-max scaling to [0, 1].
        scaler = MinMaxScaler()
        arr = scaler.fit_transform(weights)
        # Convert to probability: normalise each row (position) across the
        # 4 bases, add pseudo-counts, then renormalise.
        # NOTE(review): if any row sums to 0 after scaling, this first
        # division yields NaNs before the pseudo-count is added — consider
        # adding the pseudo-count first.
        arr = arr / arr.sum(1, keepdims=True)
        arr = arr + 0.01  # add pseudo-counts
        probs = arr / arr.sum(1, keepdims=True)
        #probs = adjust_for_ic_scale(probs)
        try:
            ic_scaled_weights['filter{0}'.format(i)] = ic_scale(probs)
        except AssertionError:
            # Debugging escape hatch: report the failing filter and hand
            # back its probability matrix for inspection.
            print('filter{0}'.format(i))
            return probs
    return ic_scaled_weights
# NOTE(review): `filter_weights` is only defined in a commented-out cell
# above — this call raises NameError unless it was loaded in an earlier
# (unseen) cell.
ic_scaled_weights = ic_scale_filter_weights(filter_weights)
All filter weights are standardised globally; each filter's weights are then scaled to 0–1 and converted to probability scores before IC-scaling.
# from IPython.display import display
# from matlas.pwms import ic_scale
# from vdom.helpers import (h1, p, li, img, div, b, br, ul, img, a,
# details, summary,
# table, thead, th, tr, tbody, td, ol)
# items_to_display = []
# for i in range(64):
# filter_name = 'filter{}'.format(i)
# query = vdom_pssm(ic_scaled_weights[filter_name])
# items_to_display.append(summary(filter_name, query))
# display(details(summary(b("filter weights in IC scale")), summary(items_to_display)))
#display(details(summary(b("Click here to see the filter weights in IC scale")), summary(items_to_display)))
#0, 18, 23, 45, 56, 61
# Render the IC-scaled logos of the selected filters in a collapsible
# details/summary element.
# NOTE(review): `for i in top_k_filters` iterates the OrderedDict KEYS
# (fold numbers 1..10), not filter indices — presumably
# `top_k_filters[fold_no]` was intended; verify.
selected_items_to_display = []
for i in top_k_filters:
    filter_name = 'filter{}'.format(i)
    query = vdom_pssm(ic_scaled_weights[filter_name])
    selected_items_to_display.append(summary(filter_name, query))
display(details(summary(b("Click here to see the filter weights in IC scale")), summary(selected_items_to_display)))
# Load importance scores and filter-activation bookkeeping.
# NOTE(review): `aitacdir` is whatever the last fold loop left it as
# (fold 10) — confirm that is the intended fold.
imp_scores = np.load(aitacdir+"/imp_scores.npy")          # forward-strand importance scores
rev_imp_scores = np.load(aitacdir+"/rev_imp_scores.npy")  # reverse-strand importance scores
OCR_matrix = np.load(aitacdir+"/OCR_matrix_0.npy")        # filter x sequence activation indicator
activations2 = np.load(aitacdir+"/activations2_0.npy")
seq_indices_of_activation = np.load(aitacdir+"/seq_indices_of_activation_0.npy")
# Plot importance-score snippets (positions 600:746) for sequences that
# activate each top filter; the two `break`s limit output to one plot of
# one filter.
# NOTE(review): `already_visited` is never appended to, so the `continue`
# guard is dead code; it also tests the loop index `i`, not the sequence
# id `activated_seqs[i]` — confirm the intent.
# NOTE(review): `for filter_no in top_k_filters` iterates fold-number
# keys, not filter indices — verify.
already_visited = []
for filter_no in top_k_filters:
    filter_name = "filter{}".format(filter_no)
    print(filter_name)
    # Sequences flagged as activated by this filter.
    activated_seqs = np.argwhere(OCR_matrix[filter_no,:]==1)[:,0]; print(activated_seqs.shape)
    activated_scores = imp_scores[activated_seqs]; print(activated_scores.shape)
    for i in range(20):
        if i in already_visited: continue
        print('seq', activated_seqs[i])
        viz_sequence.plot_weights(activated_scores[i, 600:746,:],
                                  subticks_frequency=10, figsize=(20,1))
        break
    break
from matlas.sliding_similarities import aggregate_seqlets
seqlen = imp_scores.shape[1]
# For each filter of interest (hard-coded list — presumably the top
# filters of one fold; verify), collect the 21-bp importance-score
# "seqlets" around every activation on both strands, rank them by
# activation strength, and aggregate the strongest 100 into one logo.
for filter_no in [45, 56, 18, 4, 52, 24, 57, 43, 35, 61, 23, 47, 16, 62, 34]:
    # (sequence, position) pairs where this filter fired.
    activated_subseqs = np.argwhere(seq_indices_of_activation[filter_no]!=0); #print(activated_subseqs.shape)
    activated_subscores = []
    rev_activated_subscores = []
    sub_activations = []
    rev_sub_activations = []  # NOTE(review): collected nowhere — unused.
    # i = sequence index, j = window start, k = j + 21 = window end.
    for i,j,k in zip(activated_subseqs.T[0], activated_subseqs.T[1], activated_subseqs.T[1]+21):
        activated_subscores.append(imp_scores[i, j:k, :])
        # Mirror the [j, k) window onto the reverse-strand coordinates.
        rev_activated_subscores.append(rev_imp_scores[i, (seqlen-k):(seqlen-j), :])
        sub_activations.append(activations2[i, filter_no, j])
    activated_subscores = np.array(activated_subscores); #print(activated_subscores.shape)
    rev_activated_subscores = np.array(rev_activated_subscores); #print(rev_activated_subscores.shape)
    sub_activations = np.array(sub_activations)
    # Order seqlets by descending activation strength.
    sub_act_ind = np.flip(np.argsort(sub_activations))
    activated_subscores = activated_subscores[sub_act_ind]
    rev_activated_subscores = rev_activated_subscores[sub_act_ind]
    # Aggregate the strongest 100 fwd/rev seqlet pairs by direct sum.
    avg_activated_subscores = aggregate_seqlets(activated_subscores[:100],
                                                rev_activated_subscores[:100], 'sum'); #print(avg_activated_subscores.shape)
    print('Aggregated importance scores using direct sum for filter-{}'.format(filter_no))
    viz_sequence.plot_weights(avg_activated_subscores)
    # print("filter{}".format(filter_no))
    # print('running average')
    # viz_sequence.plot_weights(avg_activated_subscores)
    # viz_sequence.plot_weights(aggregate_seqlets(activated_subscores, rev_activated_subscores, 'old'))
    #break
from matlas.matches import DenovoAitac
motif_name = 'GABPA'
obs = OrderedDict()
# Run TomTom matching of each fold's de-novo filter motifs against the
# HOCOMOCO (non-redundant, annotated) database; reports are written into
# a per-fold tomtom output directory.
for fold_no in range(1,11):
    aitacdir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/alex_bpnets/{0}/{1}".format(motif_name, fold_no)
    ob = DenovoAitac(aitacdir, filt_infl_from_loss[fold_no], len(top_k_filters[fold_no]))
    ob.fetch_tomtom_matches(
        meme_db="/mnt/lab_data/kundaje/users/msharmin/annotations/HOCOMOCOv11_core_pwms_HUMAN_mono.renamed.nonredundant.annotated.meme",
        database_name="HOCOMOCO.nonredundant.annotated",
        save_report=True, tomtom_dir= "{0}/{1}_tomtomout".format(aitacdir, "HOCOMOCO.nonredundant.annotated"))
    # The downstream analysis steps are run in a later cell instead:
    # ob.load_matched_motifs(database_name="HOCOMOCO.nonredundant.annotated")
    # ob.get_motif_per_celltype(match_threshold=0.05, database_name="HOCOMOCO.nonredundant.annotated")
    # pattern_tab, pattern_dict = ob.visualize_pattern_table()
    # tf_tab, tf_dict = ob.visualize_tf_table("Aitac")
    # obs[fold_no] = ob
ith_influence_in_aitac = mean change in across-task correlation(label, prediction) between the original model and the leave-one-filter-out model
ith_influence_in_profile = mean change in negative log-likelihood (NLL) between the original model and the leave-one-filter-out model
ith_influence_in_binary = mean change in loss between the original model and the leave-one-filter-out model
from vdom.helpers import (b, summary, details)
from IPython.display import display
# For every fold: load the TomTom matches computed above, assign motifs
# to cell types, build pattern/TF summary tables, stash them per fold,
# and display the fold's de-novo pattern table in a collapsible element.
ob_tofold = OrderedDict()
pattern_tab_tofold, pattern_dict_tofold = OrderedDict(), OrderedDict()
tf_tab_tofold, tf_dict_tofold = OrderedDict(), OrderedDict()
for fold_no in range(1,11):
    aitacdir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/alex_bpnets/{0}/{1}".format(motif_name, fold_no)
    ob = DenovoAitac(aitacdir, filt_infl_from_loss[fold_no], len(top_k_filters[fold_no]))
    ob.load_matched_motifs(database_name="HOCOMOCO.nonredundant.annotated")
    # 0.05 match threshold — presumably a TomTom q-value cutoff; confirm
    # against DenovoAitac.get_motif_per_celltype.
    ob.get_motif_per_celltype(match_threshold=0.05, database_name="HOCOMOCO.nonredundant.annotated")
    pattern_tab, pattern_dict = ob.visualize_pattern_table()
    tf_tab, tf_dict = ob.visualize_tf_table("Aitac")
    ob_tofold[fold_no] = ob
    pattern_tab_tofold[fold_no], pattern_dict_tofold[fold_no] = pattern_tab, pattern_dict
    tf_tab_tofold[fold_no], tf_dict_tofold[fold_no] = tf_tab, tf_dict
    display(details(summary('Click here for ', b('Denovo Patterns'), ' by ', b('{}'.format('Aitac')),
                            ' in ', b(motif_name), 'for fold-', b(str(fold_no)),
                            ": #{}".format(len(pattern_dict)),
                            ), pattern_tab))
    #break
# Display each fold's TF-match table inside a collapsible details element.
for fold_no in range(1, 11):
    motif_count = len(tf_dict_tofold[fold_no])
    header = summary('Click here for ', b('Motifs'), ' by ', b('Aitac'),
                     ' in ', b(motif_name), 'for fold-', b(str(fold_no)),
                     ": #{}".format(motif_count))
    display(details(header, tf_tab_tofold[fold_no]))
motif_name = 'GABPA'
aitacdir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/{}".format(motif_name)
from matlas.pwms import get_motifDB_id_maps, reduce_pwm_redundancy
# Merge redundant filter PWMs (fold-a model) by hierarchically clustering
# the MEME-format filter motifs; the merged PWMs go to *.hclust_pwms.tmp.
memefile = "{0}/filter_motifs_pwm.meme".format(aitacdir)
motif_id_maps = get_motifDB_id_maps(database_name=memefile)
file_prefix = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/{}/aitac".format(motif_name)
# NOTE(review): PSEUDOCOUNT and IC_THRESHOLD are not defined anywhere in
# this file — this cell raises NameError unless they were set in an
# earlier (unseen) cell; define them explicitly before running.
reduce_pwm_redundancy(
    motif_id_maps,
    out_pwm_file="{}.hclust_pwms.tmp".format(file_prefix),
    pwmtype="probabilities", #probabilities
    tmp_prefix=file_prefix,
    pseudocount=PSEUDOCOUNT,
    ic_thresh=IC_THRESHOLD,
    cor_thresh=0.6,
    ncor_thresh=0.4,
    num_threads=28)
from matlas.reports import generate_merged_pwm_plots
from matlas.pwms import read_pwm_file, load_cisbp_maps
# Load the merged (clustered) PWMs produced by reduce_pwm_redundancy
# above and build the summary table/plots of the merged motifs.
mergeddict = read_pwm_file("{}.hclust_pwms.tmp".format(file_prefix))
tab, tabledict = generate_merged_pwm_plots(mergeddict, motif_id_maps)
#display(tab)