%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matlas.pwms import ic_scale, adjust_for_ic_scale
from matlas.matches import vdom_pssm
from modisco.visualization import viz_sequence
from matlas.matches import vdom_pssm
# Motif / TF name for this analysis run (converted notebook; re-set per cell).
motif_name = 'GABPA'
# Per-motif AI-TAC output directory for fold "a".
aitacdir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/{}".format(motif_name)
# Per-filter influence scores derived from the change in loss when each
# filter is left out (see filename) — higher presumably means more influential.
filt_infl_from_loss = np.load(aitacdir+"/filt_infl_from_loss.npy")
# Indices of the 15 most influential filters, highest influence first.
top_k_filters = np.flip(np.argsort(filt_infl_from_loss))[:15]
top_k_filters  # notebook cell echo: displays the selected filter indices
import numpy as np
import seaborn as sns
# Re-declared so this cell runs standalone in the converted notebook.
motif_name = 'GABPA'
# First-layer convolutional filter weights for this motif's model.
# NOTE(review): downstream code slices filter_weights[:,:,i], so the last
# axis indexes filters; confirm which of the first two axes is position
# vs. base against the training code.
filter_weights = np.load("/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/filters/{0}/{0}_filters.npy".format(
motif_name))
filter_weights.shape  # notebook cell echo
from matplotlib import pyplot as plt
from matlas.pwms import ic_scale, adjust_for_ic_scale
from matlas.matches import vdom_pssm
from modisco.visualization import viz_sequence
from matlas.matches import vdom_pssm
def draw_filter_weights(start_filter, end_filter):
    """Draw heatmaps of filter weight matrices on a 4x4 grid of axes.

    Each panel shows one filter's transposed weight matrix from the
    module-level ``filter_weights`` array (sliced ``[:, :, i]``), with
    row labels A/C/G/T and column labels set to the max-weight base at
    each position.

    Parameters
    ----------
    start_filter, end_filter : int
        Half-open range of filter indices to draw. At most 16 filters
        fit on the 4x4 grid.

    Returns
    -------
    None
    """
    alphabets = np.array(['A', 'C', 'G', 'T'])
    fig, axes = plt.subplots(4, 4, figsize=(30, 10))
    for i in range(start_filter, end_filter):
        a = np.copy(filter_weights[:, :, i].T)
        # Grid position must be relative to start_filter for BOTH row and
        # column; the original used ``i % 4`` for the column, which misplaces
        # panels whenever start_filter is not a multiple of 4.
        r, c = (i - start_filter) // 4, (i - start_filter) % 4
        ax = sns.heatmap(a, yticklabels=alphabets,
                         xticklabels=alphabets[np.argmax(a, axis=0)],
                         ax=axes[r, c])
        ax.set_ylabel("filter{}".format(i))
    return None
from modisco.visualization import viz_sequence
# Preview the top filters' weights: for each influential filter, plot the
# raw weight matrix and a per-position mean-normalised version. Only the
# first filter is shown because of the trailing ``break``.
for filt_idx in top_k_filters:
    print('filter', filt_idx)
    raw_weights = filter_weights[:, :, filt_idx]
    mean_norm_weights = np.copy(raw_weights)
    mean_norm_weights = mean_norm_weights - mean_norm_weights.mean(1, keepdims=True)
    print('raw weights')
    viz_sequence.plot_weights(raw_weights)
    print('mean normalized weights')
    viz_sequence.plot_weights(mean_norm_weights)
    break
from matlas.matches import vdom_pssm
from IPython.display import display
from vdom.helpers import (h1, p, li, img, div, b, br, ul, img, a,
details, summary,
table, thead, th, tr, tbody, td, ol)
# Build a collapsible VDOM report: one <summary> entry per top filter,
# showing the raw filter logo next to the mean-normalised one.
items_to_display = []
for filt_idx in top_k_filters:
    filter_name = 'filter{}'.format(filt_idx)
    mean_norm_weights = np.copy(filter_weights[:, :, filt_idx])
    mean_norm_weights = mean_norm_weights - mean_norm_weights.mean(1, keepdims=True)
    raw_logo = vdom_pssm(filter_weights[:, :, filt_idx])
    norm_logo = vdom_pssm(mean_norm_weights)
    entry = summary(b(filter_name), p('raw weights'), raw_logo, 'mean normalized', norm_logo)
    items_to_display.append(entry)
display(details(summary(b("weights of selected filters")), summary(items_to_display)))
from matlas.pwms import ic_scale, adjust_for_ic_scale
from matlas.matches import vdom_pssm
from sklearn.preprocessing import MinMaxScaler
from collections import OrderedDict
def ic_scale_filter_weights(filter_weights, filter_count=64):
# Turn raw conv-filter weights into information-content (IC) scaled matrices:
#   1) z-score ALL filter weights jointly and clip to [-2, 2];
#   2) per filter: min-max scale to [0, 1] (MinMaxScaler scales each column
#      independently), normalise rows, add a 0.01 pseudo-count, and
#      re-normalise so every row is a probability distribution;
#   3) IC-scale each probability matrix with matlas.pwms.ic_scale.
# Returns an OrderedDict mapping 'filter<i>' -> IC-scaled matrix.
# NOTE(review): the row-wise sums (axis=1) assume rows are positions and
# columns are the 4 bases — confirm against filter_weights' layout.
norm_weights = np.copy(filter_weights)
norm_weights = (norm_weights-np.mean(norm_weights))/np.std(norm_weights)
norm_weights[norm_weights>2] = 2.0
norm_weights[norm_weights<-2] = -2.0
ic_scaled_weights = OrderedDict()
for i in range(filter_count):
weights = np.copy(norm_weights[:,:,i])
scaler = MinMaxScaler()
arr = scaler.fit_transform(weights)
#convert to probability
arr = arr / arr.sum(1, keepdims=True) #divide by row sum (axis=1)
arr = arr + 0.01 # add pseudo-counts
probs = arr / arr.sum(1, keepdims=True)
#probs = adjust_for_ic_scale(probs)
try:
ic_scaled_weights['filter{0}'.format(i)] = ic_scale(probs)
except AssertionError:
# Debugging escape hatch: if ic_scale rejects this matrix, print the
# filter name and return the offending probabilities immediately —
# this aborts the loop, so the caller gets an ndarray, not the dict.
print('filter{0}'.format(i))
return probs
return ic_scaled_weights
# Build the IC-scaled dict for all 64 filters of this model.
ic_scaled_weights = ic_scale_filter_weights(filter_weights)
All filter weights are standardised; each filter's weights are then scaled to 0-1 and converted to probability scores before IC-scaling
# from IPython.display import display
# from matlas.pwms import ic_scale
# from vdom.helpers import (h1, p, li, img, div, b, br, ul, img, a,
# details, summary,
# table, thead, th, tr, tbody, td, ol)
# items_to_display = []
# for i in range(64):
# filter_name = 'filter{}'.format(i)
# query = vdom_pssm(ic_scaled_weights[filter_name])
# items_to_display.append(summary(filter_name, query))
# display(details(summary(b("filter weights in IC scale")), summary(items_to_display)))
#display(details(summary(b("Click here to see the filter weights in IC scale")), summary(items_to_display)))
#0, 18, 23, 45, 56, 61
# Collapsible report of the IC-scaled weight logos for the top filters only.
selected_items_to_display = []
for filt_idx in top_k_filters:
    filter_name = 'filter{}'.format(filt_idx)
    logo = vdom_pssm(ic_scaled_weights[filter_name])
    selected_items_to_display.append(summary(filter_name, logo))
display(details(summary(b("Click here to see the filter weights in IC scale")), summary(selected_items_to_display)))
# Per-sequence, per-position importance scores (forward strand).
imp_scores = np.load(aitacdir+"/imp_scores.npy")
# Importance scores for the reverse strand.
rev_imp_scores = np.load(aitacdir+"/rev_imp_scores.npy")
# Binary matrix indexed [filter, sequence]; 1 presumably means the filter
# activates on that sequence — confirm against the AI-TAC export code.
OCR_matrix = np.load(aitacdir+"/OCR_matrix.npy")
# NOTE(review): already_visited is read but never appended to, so the
# ``continue`` below can never fire.
already_visited = []
# For each top filter, plot the importance window 600:746 of the first
# activated sequence; both ``break``s stop after a single iteration.
for filter_no in top_k_filters:
filter_name = "filter{}".format(filter_no)
print(filter_name)
activated_seqs = np.argwhere(OCR_matrix[filter_no,:]==1)[:,0]; print(activated_seqs.shape)
activated_scores = imp_scores[activated_seqs]; print(activated_scores.shape)
for i in range(20):
if i in already_visited: continue
print('seq', activated_seqs[i])
viz_sequence.plot_weights(activated_scores[i, 600:746,:],
subticks_frequency=10, figsize=(20,1))
break
break
def window_similarities(seq_1, seq_2):
    """
    Takes two windows (W x 4 arrays) and computes a similarity between them,
    using a continuous Jaccard metric.

    For each position (row), the intersection is the sign-aware minimum of
    magnitudes and the union is the maximum of magnitudes; the per-position
    Jaccard ratios are summed into a single score.

    Positions where both windows are all-zero would give 0/0; they are
    treated as contributing 0 similarity instead of NaN (the original
    implementation produced NaN and a divide warning there).
    """
    ab_1, ab_2 = np.abs(seq_1), np.abs(seq_2)
    # Sign factors zero out (or negate) positions where the two windows disagree.
    inter = np.minimum(ab_1, ab_2) * np.sign(seq_1) * np.sign(seq_2)
    union = np.maximum(ab_1, ab_2)
    inter_sums = np.sum(inter, axis=1)
    union_sums = np.sum(union, axis=1)
    # Safe divide: rows with an empty union contribute 0, not NaN.
    cont_jaccard = np.divide(inter_sums, union_sums,
                             out=np.zeros(union_sums.shape, dtype=float),
                             where=union_sums != 0)
    return np.sum(cont_jaccard)
def max_seqlet_similarity(seq_1, seq_2, window_size=8):
    """
    Takes two seqlets (S x 4 arrays) and computes the maximum similarity over
    all possible pairwise windows. Returns the starting indices of the best
    window for each sequence, and the resulting score.
    """
    n_starts_1 = seq_1.shape[0] - window_size + 1
    n_starts_2 = seq_2.shape[0] - window_size + 1
    best_window_1, best_window_2, best_score = None, None, -float("inf")
    for start_1 in range(n_starts_1):
        # Hoist the seq_1 window; it is invariant over the inner loop.
        window_1 = seq_1[start_1 : start_1 + window_size]
        for start_2 in range(n_starts_2):
            score = window_similarities(window_1, seq_2[start_2 : start_2 + window_size])
            # Strictly-greater keeps the earliest best pair on ties.
            if score > best_score:
                best_window_1, best_window_2, best_score = start_1, start_2, score
    return best_window_1, best_window_2, best_score
def aggregate_seqlets(seqlets, rev_seqlets, aggregation='sum'):
    """
    From the set of seqlets (a N x L x 4 array), aggregates them into a
    single motif by successively merging (averaging the signal each time)
    starting from the seqlet with the highest total gradient magnitudes.

    Parameters
    ----------
    seqlets, rev_seqlets : np.ndarray
        Forward and reverse-complement seqlets (N x L x 4 each), aligned
        index-for-index.
    aggregation : str
        'sum' accumulates raw sums; any other value keeps a running average.

    Returns
    -------
    np.ndarray
        The L x 4 aggregated motif.
    """
    # Process seqlets in order of decreasing total gradient magnitude.
    grad_mags = np.sum(np.abs(seqlets), axis=(1, 2))
    sorted_inds = np.flip(np.argsort(grad_mags))
    seqlets_sorted = seqlets[sorted_inds]
    rev_seqlets_sorted = rev_seqlets[sorted_inds]
    agg_seqlet_avg = seqlets_sorted[0]
    for i in range(1, len(seqlets_sorted)):
        # Align the next seqlet — or its reverse complement, whichever
        # scores higher — against the current aggregate.
        agg_ind1, new_ind1, best_score1 = max_seqlet_similarity(agg_seqlet_avg, seqlets_sorted[i], window_size=12)
        agg_ind2, new_ind2, best_score2 = max_seqlet_similarity(agg_seqlet_avg, rev_seqlets_sorted[i], window_size=12)
        if best_score1 >= best_score2:
            agg_ind, new_ind, new_seqlet = agg_ind1, new_ind1, seqlets_sorted[i]
        else:
            agg_ind, new_ind, new_seqlet = agg_ind2, new_ind2, rev_seqlets_sorted[i]
        # Shift the new seqlet so its best window lines up with the
        # aggregate's, zero-padding to preserve length L.
        if new_ind > agg_ind:
            diff = new_ind - agg_ind
            new_seqlet = np.concatenate([new_seqlet[diff:], np.zeros((diff, 4))], axis=0)
        elif new_ind < agg_ind:
            diff = agg_ind - new_ind
            new_seqlet = np.concatenate([np.zeros((diff, 4)), new_seqlet[:-diff]], axis=0)
        if aggregation == 'sum':
            agg_seqlet_avg = agg_seqlet_avg + new_seqlet
        else:
            # Incremental mean: the new observation must be weighted 1/(i+1).
            # The original weighted it i/(i+1) as well, so the weights summed
            # to 2i/(i+1) != 1 for i >= 2, inflating the running average.
            agg_seqlet_avg = (agg_seqlet_avg * i / (i + 1)) + (new_seqlet / (i + 1))
        # Cap the amount of merging work for very large seqlet sets.
        if i >= 200:
            break
    return agg_seqlet_avg
#from matlas.sliding_similarities import aggregate_seqlets
# Length of each input sequence (positions axis of the importance scores).
seqlen = imp_scores.shape[1]
# Aggregate activated seqlets into a consensus motif for a few hand-picked
# filters (list reversed, first 3 taken; the trailing break stops after one).
# NOTE(review): activated_subscores / rev_activated_subscores /
# alt_activated_subscores are built only by the commented-out lines below, so
# as written this loop raises NameError unless those variables survive from an
# earlier notebook session.
for filter_no in [0, 43, 61, 47, 35, 23, 52, 24, 57, 4, 18, 56, 45][::-1][:3]:
# activated_subseqs = np.argwhere(seq_indices_of_activation[filter_no]==1); #print(activated_subseqs.shape)
# activated_subscores = []
# rev_activated_subscores = []
# alt_activated_subscores = []
# for i,j,k in zip(activated_subseqs.T[0], activated_subseqs.T[1], activated_subseqs.T[1]+19):
# activated_subscores.append(imp_scores[i, j:k, :])
# rev_activated_subscores.append(rev_imp_scores[i, (seqlen-k):(seqlen-j), :])
# alt_activated_subscores.append(alt_imp_scores[i, (seqlen-k):(seqlen-j), :])
# activated_subscores = np.array(activated_subscores); print(activated_subscores.shape)
# rev_activated_subscores = np.array(rev_activated_subscores); print(rev_activated_subscores.shape)
# alt_activated_subscores = np.array(alt_activated_subscores); print(alt_activated_subscores.shape)
avg_activated_subscores = aggregate_seqlets(activated_subscores, alt_activated_subscores, 'old'); #print(avg_activated_subscores.shape)
print("filter{}".format(filter_no))
print('direct sum')
viz_sequence.plot_weights(avg_activated_subscores)
print('running average')
viz_sequence.plot_weights(aggregate_seqlets(activated_subscores, rev_activated_subscores, 'old'))
break
from matlas.matches import DenovoAitac
motif_name = 'GABPA'
aitacdir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/{}".format(motif_name)
# Wrap the per-motif AI-TAC outputs; reports on the top-k influential filters.
ob = DenovoAitac(aitacdir, filt_infl_from_loss, len(top_k_filters))
# TomTom matching against HOCOMOCO was run once (kept below for reference);
# here only the saved matches are reloaded.
# ob.fetch_tomtom_matches(
# meme_db="/mnt/lab_data/kundaje/users/msharmin/annotations/HOCOMOCOv11_core_pwms_HUMAN_mono.renamed.nonredundant.annotated.meme",
# database_name="HOCOMOCO.nonredundant.annotated",
# save_report=True, tomtom_dir= "{0}/{1}_tomtomout".format(aitacdir, "HOCOMOCO.nonredundant.annotated"))
ob.load_matched_motifs(database_name="HOCOMOCO.nonredundant.annotated")
# NOTE(review): the exact semantics of match_threshold=0.05 live inside
# matlas (q-value vs. p-value cutoff) — confirm there.
ob.get_motif_per_celltype(match_threshold=0.05, database_name="HOCOMOCO.nonredundant.annotated")
# VDOM tables for the de-novo patterns and the matched TFs.
pattern_tab, pattern_dict = ob.visualize_pattern_table()
tf_tab, tf_dict = ob.visualize_tf_table("Aitac")
ith_influence_in_aitac = mean change in the across-task correlation(label, prediction) between the original model and the leave-one-out model
ith_influence_in_profile = mean change in the negative log-likelihood (NLL) between the original model and the leave-one-out model
ith_influence_in_binary = mean change in the loss between the original model and the leave-one-out model
from vdom.helpers import (b, summary, details)
from IPython.display import display
# Collapsible display of the de-novo pattern table for this motif.
display(details(summary('Click here for ', b('Denovo Patterns'), ' by ', b('{}'.format('Aitac')),
' in ', b(motif_name),
": #{}".format(len(pattern_dict)),
), pattern_tab))
# The matched-TF table display is kept for reference but disabled.
# display(details(summary('Click here for ', b('Motifs'), ' by ', b('{}'.format('Aitac')),
# ' in ', b(motif_name),
# ": #{}".format(len(tf_dict)),
# ), tf_tab))
motif_name = 'GABPA'
aitacdir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/{}".format(motif_name)
from matlas.pwms import get_motifDB_id_maps, reduce_pwm_redundancy
# MEME-format PWM file of the filter motifs produced earlier in the pipeline.
memefile = "{0}/filter_motifs_pwm.meme".format(aitacdir)
motif_id_maps = get_motifDB_id_maps(database_name=memefile)
file_prefix = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/{}/aitac".format(motif_name)
# Collapse near-duplicate PWMs via hierarchical clustering.
# NOTE(review): PSEUDOCOUNT and IC_THRESHOLD are not defined anywhere in this
# file — they must come from an earlier notebook cell/session, otherwise this
# call raises NameError.
reduce_pwm_redundancy(
motif_id_maps,
out_pwm_file="{}.hclust_pwms.tmp".format(file_prefix),
pwmtype="probabilities", #probabilities
tmp_prefix=file_prefix,
pseudocount=PSEUDOCOUNT,
ic_thresh=IC_THRESHOLD,
cor_thresh=0.6,
ncor_thresh=0.4,
num_threads=28)
from matlas.reports import generate_merged_pwm_plots
from matlas.pwms import read_pwm_file, load_cisbp_maps
# Reload the merged PWMs and render comparison plots (display disabled below).
mergeddict = read_pwm_file("{}.hclust_pwms.tmp".format(file_prefix))
tab, tabledict = generate_merged_pwm_plots(mergeddict, motif_id_maps)
#display(tab)