In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
In [3]:
from matlas.pwms import ic_scale, adjust_for_ic_scale
from matlas.matches import vdom_pssm
from modisco.visualization import viz_sequence
from matlas.matches import vdom_pssm
In [4]:
# Load the leave-one-out influence of each first-layer filter on the model
# loss, and keep the 15 most influential filters (most influential first).
motif_name = 'GABPA'
aitacdir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/{}".format(motif_name)
filt_infl_from_loss = np.load(aitacdir+"/filt_infl_from_loss.npy")
top_k_filters = np.flip(np.argsort(filt_infl_from_loss))[:15]
top_k_filters
Out[4]:
array([45, 56, 18,  4, 57, 24, 52, 23, 35, 47, 61, 43,  0, 16,  3])
In [5]:
import numpy as np
import seaborn as sns

# First-layer convolutional filter weights for this motif's model:
# shape (21, 4, 64) = (filter width, nucleotides A/C/G/T, number of filters).
motif_name = 'GABPA'
filter_weights = np.load("/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/filters/{0}/{0}_filters.npy".format(
    motif_name))
filter_weights.shape
Out[5]:
(21, 4, 64)
In [6]:
from matplotlib import pyplot as plt

from matlas.pwms import ic_scale, adjust_for_ic_scale
from matlas.matches import vdom_pssm
from modisco.visualization import viz_sequence
from matlas.matches import vdom_pssm

def draw_filter_weights(start_filter, end_filter):
    """Draw heatmaps of filter weights for filters [start_filter, end_filter).

    Each filter's (4 x width) weight matrix is shown as a heatmap on a
    4 x 4 grid of axes (so at most 16 filters fit per call).  Row labels
    are the nucleotides; column labels show the argmax nucleotide at each
    position.  Reads the module-level `filter_weights` array of shape
    (width, 4, n_filters).

    Parameters
    ----------
    start_filter, end_filter : int
        Half-open range of filter indices to draw.

    Returns
    -------
    None
    """
    alphabets = np.array(['A', 'C', 'G', 'T'])

    fig, axes = plt.subplots(4, 4, figsize=(30, 10))

    for i in range(start_filter, end_filter):
        a = np.copy(filter_weights[:, :, i].T)
        # Bug fix: the column index must be relative to start_filter, like
        # the row index.  The original used `i % 4`, which places panels
        # incorrectly whenever start_filter is not a multiple of 4.
        r, c = (i - start_filter) // 4, (i - start_filter) % 4
        ax = sns.heatmap(a, yticklabels=alphabets,
                         xticklabels=alphabets[np.argmax(a, axis=0)],
                         ax=axes[r, c])
        ax.set_ylabel("filter{}".format(i))

    return None

Raw and mean normalized filter weights

In [7]:
from modisco.visualization import viz_sequence

for i in top_k_filters:
    print('filter', i)
    # Zero-center each position's weights across the 4 nucleotides so the
    # logo shows relative preferences rather than raw magnitudes.
    mean_norm_weights = np.copy(filter_weights[:,:,i])
    mean_norm_weights = mean_norm_weights - mean_norm_weights.mean(1, keepdims=True)
    print('raw weights')
    viz_sequence.plot_weights(filter_weights[:,:,i])
    print('mean normalized weights')
    viz_sequence.plot_weights(mean_norm_weights)
    # NOTE(review): `break` limits this cell to the first (top) filter only —
    # remove it to render logos for all top-k filters.
    break
filter 45
raw weights
mean normalized weights
In [8]:
from matlas.matches import vdom_pssm
from IPython.display import display

from vdom.helpers import (h1, p, li, img, div, b, br, ul, img, a, 
                          details, summary,
                          table, thead, th, tr, tbody, td, ol)

# Build a collapsible VDOM report: one <summary> entry per top filter,
# showing its raw and mean-normalized weight logos.
items_to_display = []
for i in top_k_filters:
    filter_name = 'filter{}'.format(i)
    # Mean-normalize per position (subtract the mean over A/C/G/T).
    mean_norm_weights = np.copy(filter_weights[:,:,i])
    mean_norm_weights = mean_norm_weights - mean_norm_weights.mean(1, keepdims=True)
    un_query = vdom_pssm(filter_weights[:,:,i])  # logo of the raw weights
    n_query = vdom_pssm(mean_norm_weights)       # logo of the normalized weights
    items_to_display.append(summary(b(filter_name), p('raw weights'), un_query, 'mean normalized', n_query))
    #break
display(details(summary(b("weights of selected filters")), summary(items_to_display)))
weights of selected filtersfilter45

raw weights

mean normalized
filter56

raw weights

mean normalized
filter18

raw weights

mean normalized
filter4

raw weights

mean normalized
filter57

raw weights

mean normalized
filter24

raw weights

mean normalized
filter52

raw weights

mean normalized
filter23

raw weights

mean normalized
filter35

raw weights

mean normalized
filter47

raw weights

mean normalized
filter61

raw weights

mean normalized
filter43

raw weights

mean normalized
filter0

raw weights

mean normalized
filter16

raw weights

mean normalized
filter3

raw weights

mean normalized
In [9]:
from matlas.pwms import ic_scale, adjust_for_ic_scale
from matlas.matches import vdom_pssm
from sklearn.preprocessing import MinMaxScaler
from collections import OrderedDict

def ic_scale_filter_weights(filter_weights, filter_count=64):
    """Convert raw convolutional filter weights to IC-scaled PWMs.

    All weights are first standardised jointly (z-scored over the whole
    array) and clipped to [-2, 2].  Each filter is then min-max scaled to
    [0, 1] per column, normalised into per-position probabilities (with a
    small pseudo-count), and passed through `ic_scale`.

    Parameters
    ----------
    filter_weights : np.ndarray
        Array of shape (width, 4, n_filters).
    filter_count : int
        Number of filters to process (default 64).

    Returns
    -------
    OrderedDict
        Maps 'filter{i}' -> IC-scaled matrix.  NOTE: if `ic_scale` raises
        an AssertionError for some filter, the offending probability matrix
        is returned instead (a debugging sentinel kept for backward
        compatibility — callers should check the return type).
    """
    norm_weights = np.copy(filter_weights)
    norm_weights = (norm_weights - np.mean(norm_weights)) / np.std(norm_weights)
    # Clip extreme z-scores so a single large weight cannot dominate.
    norm_weights = np.clip(norm_weights, -2.0, 2.0)

    ic_scaled_weights = OrderedDict()
    for i in range(filter_count):
        weights = np.copy(norm_weights[:, :, i])
        scaler = MinMaxScaler()
        arr = scaler.fit_transform(weights)

        # Convert to per-position probabilities: divide each row (position)
        # by its sum over the 4 nucleotides.  (The original comment said
        # "colsum", but `sum(1, ...)` is the row sum here.)
        arr = arr / arr.sum(1, keepdims=True)
        arr = arr + 0.01  # pseudo-counts avoid zero probabilities
        probs = arr / arr.sum(1, keepdims=True)
        try:
            ic_scaled_weights['filter{0}'.format(i)] = ic_scale(probs)
        except AssertionError:
            # Report which filter failed IC scaling and hand back its
            # probability matrix for inspection.
            print('filter{0}'.format(i))
            return probs

    return ic_scaled_weights


ic_scaled_weights = ic_scale_filter_weights(filter_weights)

filter weights in IC scale

All filter weights are standardised, then each filter's weights are scaled to 0–1 and converted to probability scores before IC-scaling

In [10]:
# from IPython.display import display
# from matlas.pwms import ic_scale

# from vdom.helpers import (h1, p, li, img, div, b, br, ul, img, a, 
#                           details, summary,
#                           table, thead, th, tr, tbody, td, ol)

# items_to_display = []
# for i in range(64):
#     filter_name = 'filter{}'.format(i)
#     query = vdom_pssm(ic_scaled_weights[filter_name])
#     items_to_display.append(summary(filter_name, query))

# display(details(summary(b("filter weights in IC scale")), summary(items_to_display)))
In [11]:
#display(details(summary(b("Click here to see the filter weights in IC scale")), summary(items_to_display)))
#0, 18, 23, 45, 56, 61

# One collapsible entry per selected filter, showing its IC-scaled logo.
selected_items_to_display = [
    summary('filter{}'.format(idx),
            vdom_pssm(ic_scaled_weights['filter{}'.format(idx)]))
    for idx in top_k_filters
]

display(details(summary(b("Click here to see the filter weights in IC scale")), summary(selected_items_to_display)))
Click here to see the filter weights in IC scalefilter45filter56filter18filter4filter57filter24filter52filter23filter35filter47filter61filter43filter0filter16filter3

importance score of the filtered sequences

In [17]:
# Load per-sequence importance scores (forward and reverse strand) and the
# filter-by-sequence activation matrix for this fold.
imp_scores = np.load(aitacdir+"/imp_scores.npy")
rev_imp_scores = np.load(aitacdir+"/rev_imp_scores.npy")
OCR_matrix = np.load(aitacdir+"/OCR_matrix.npy")
already_visited = []  # NOTE(review): never appended to, so the `continue` below is dead
for filter_no in top_k_filters:
    filter_name = "filter{}".format(filter_no)
    print(filter_name)
    # Sequences in which this filter is active (row of OCR_matrix == 1).
    activated_seqs = np.argwhere(OCR_matrix[filter_no,:]==1)[:,0]; print(activated_seqs.shape)
    activated_scores = imp_scores[activated_seqs]; print(activated_scores.shape)
    for i in range(20):
        if i in already_visited: continue
        print('seq', activated_seqs[i])
        # Plot a central 146-bp window of the importance scores.
        viz_sequence.plot_weights(activated_scores[i, 600:746,:],
                                  subticks_frequency=10, figsize=(20,1))
        break
    # Both `break`s keep this cell to a single example sequence of a single filter.
    break
filter45
(7064,)
(7064, 1346, 4)
seq 0
In [ ]:
def window_similarities(seq_1, seq_2):
    """
    Takes two windows (W x 4 arrays) and computes a similarity between them,
    using a continuous Jaccard metric.

    Per position, the intersection is the element-wise minimum of the
    magnitudes, signed by whether the two windows agree in sign; the union
    is the element-wise maximum of the magnitudes.  The per-position
    intersection/union ratios are summed into a single score.

    Parameters
    ----------
    seq_1, seq_2 : np.ndarray
        Arrays of identical shape (W, 4).

    Returns
    -------
    float
        Sum over positions of intersection/union.  Positions whose union is
        zero (both windows all-zero there) contribute 0 — the original code
        produced NaN from 0/0, which propagated into the total.
    """
    ab_1, ab_2 = np.abs(seq_1), np.abs(seq_2)
    inter = np.minimum(ab_1, ab_2) * np.sign(seq_1) * np.sign(seq_2)
    union = np.maximum(ab_1, ab_2)
    inter_sums = np.sum(inter, axis=1)
    union_sums = np.sum(union, axis=1)
    # Safe division: skip positions with a zero union instead of emitting NaN.
    cont_jaccard = np.divide(inter_sums, union_sums,
                             out=np.zeros_like(inter_sums, dtype=float),
                             where=union_sums != 0)
    return np.sum(cont_jaccard)

def max_seqlet_similarity(seq_1, seq_2, window_size=8):
    """
    Takes two seqlets (S x 4 arrays) and computes the maximum similarity over
    all possible pairwise windows. Returns the starting indices of the best window
    for each sequence, and the resulting score.
    """
    # Accumulate the best (start_1, start_2, score) triple seen so far;
    # strict `>` keeps the earliest pair among ties, as before.
    best = (None, None, -float("inf"))
    n_starts_1 = seq_1.shape[0] - window_size + 1
    n_starts_2 = seq_2.shape[0] - window_size + 1
    for start_1 in range(n_starts_1):
        window_1 = seq_1[start_1 : start_1 + window_size]
        for start_2 in range(n_starts_2):
            score = window_similarities(window_1, seq_2[start_2 : start_2 + window_size])
            if score > best[2]:
                best = (start_1, start_2, score)
    return best

def aggregate_seqlets(seqlets, rev_seqlets, aggregation='sum'):
    """
    From the set of seqlets (a N x L x 4 array), aggregates them into a
    single motif by successively merging (averaging the signal each time)
    starting from the seqlet with the highest total gradient magnitudes.

    Parameters
    ----------
    seqlets : np.ndarray
        N x L x 4 array of seqlet importance scores.
    rev_seqlets : np.ndarray
        N x L x 4 array of the corresponding reverse-orientation seqlets;
        at each merge step, whichever orientation aligns better with the
        current aggregate is used.
    aggregation : str
        'sum' accumulates aligned seqlets directly; any other value uses
        the weighted-update branch below.

    Returns
    -------
    np.ndarray
        The L x 4 aggregated motif.
    """
    # Process seqlets in decreasing order of total absolute importance.
    grad_mags = np.sum(np.abs(seqlets), axis=(1, 2))
    sorted_inds = np.flip(np.argsort(grad_mags))
    seqlets_sorted = seqlets[sorted_inds]
    rev_seqlets_sorted = rev_seqlets[sorted_inds]
    
    agg_seqlet_avg = seqlets_sorted[0]
    for i in range(1, len(seqlets_sorted)):                                             
        # Align the next seqlet to the current aggregated average, in both
        # forward and reverse orientation, and keep the better alignment.
        agg_ind1, new_ind1, best_score1 = max_seqlet_similarity(agg_seqlet_avg, seqlets_sorted[i], window_size=12)
        agg_ind2, new_ind2, best_score2 = max_seqlet_similarity(agg_seqlet_avg, rev_seqlets_sorted[i], window_size=12)
        
        if best_score1 >= best_score2:
            agg_ind, new_ind, best_score, new_seqlet = (agg_ind1, new_ind1, best_score1, seqlets_sorted[i])
        else:
            agg_ind, new_ind, best_score, new_seqlet = (agg_ind2, new_ind2, best_score2, rev_seqlets_sorted[i])
        
#         print(i, new_ind1, agg_ind1, best_score1, new_ind2, agg_ind2, best_score2)
#         viz_sequence.plot_weights(seqlets_sorted[i])
#         viz_sequence.plot_weights(rev_seqlets_sorted[i])
#         viz_sequence.plot_weights(agg_seqlet_avg)
#         print(new_ind, agg_ind, best_score)
        # Shift the new seqlet so its best window lines up with the
        # aggregate's best window, zero-padding the exposed end.
        if new_ind > agg_ind:                                          
            diff = new_ind - agg_ind                                   
            new_seqlet = np.concatenate([new_seqlet[diff:], np.zeros((diff, 4))], axis=0)
        elif new_ind < agg_ind:                                        
            diff = agg_ind - new_ind                                   
            new_seqlet = np.concatenate([np.zeros((diff, 4)), new_seqlet[:-diff]], axis=0)
        # Update the average seqlet                                    
        #agg_seqlet_avg = (agg_seqlet_avg * i / (i + 1)) + (new_seqlet * i / (i + 1))
        if aggregation=='sum': 
            agg_seqlet_avg = (agg_seqlet_avg + new_seqlet)
        else:
            # NOTE(review): both terms here are weighted by i/(i+1); a true
            # running mean would weight the new seqlet by 1/(i+1).  Preserved
            # as-is — confirm whether this weighting is intentional.
            agg_seqlet_avg = (agg_seqlet_avg * i / (i + 1)) + (new_seqlet * i / (i + 1))
        
#         viz_sequence.plot_weights(new_seqlet)
#         viz_sequence.plot_weights(agg_seqlet_avg)
        # Cap the merge at the 200 highest-magnitude seqlets.
        if i>=200:
            break
    #if aggregation=='sum': 
    #    agg_seqlet_avg = agg_seqlet_avg/(1.0*len(seqlets_sorted))
    
    return agg_seqlet_avg
#from matlas.sliding_similarities import aggregate_seqlets

# NOTE(review): the variables used below (`activated_subscores`,
# `rev_activated_subscores`, `alt_activated_subscores`) are only built in the
# commented-out block inside the loop, so running this cell on a fresh kernel
# raises NameError — re-enable that block (and define
# `seq_indices_of_activation` / `alt_imp_scores`) first.
seqlen = imp_scores.shape[1]
for filter_no in [0, 43, 61, 47, 35, 23, 52, 24, 57,  4, 18, 56, 45][::-1][:3]:
#     activated_subseqs = np.argwhere(seq_indices_of_activation[filter_no]==1); #print(activated_subseqs.shape)
#     activated_subscores = []
#     rev_activated_subscores = []
#     alt_activated_subscores = []
#     for i,j,k in zip(activated_subseqs.T[0], activated_subseqs.T[1], activated_subseqs.T[1]+19):
#         activated_subscores.append(imp_scores[i, j:k, :])
#         rev_activated_subscores.append(rev_imp_scores[i, (seqlen-k):(seqlen-j), :])
#         alt_activated_subscores.append(alt_imp_scores[i, (seqlen-k):(seqlen-j), :])

#     activated_subscores = np.array(activated_subscores); print(activated_subscores.shape) 
#     rev_activated_subscores = np.array(rev_activated_subscores); print(rev_activated_subscores.shape)
#     alt_activated_subscores = np.array(alt_activated_subscores); print(alt_activated_subscores.shape)
    
    # NOTE(review): this call uses alt_activated_subscores while the call
    # below uses rev_activated_subscores — confirm which orientation set is
    # intended for the 'direct sum' aggregation.
    avg_activated_subscores = aggregate_seqlets(activated_subscores, alt_activated_subscores, 'old'); #print(avg_activated_subscores.shape)
    print("filter{}".format(filter_no))
    print('direct sum')
    viz_sequence.plot_weights(avg_activated_subscores)
    print('running average')
    viz_sequence.plot_weights(aggregate_seqlets(activated_subscores, rev_activated_subscores, 'old'))
    # `break` restricts the cell to the first filter only.
    break                   

aitac motifs

In [ ]:
from matlas.matches import DenovoAitac
motif_name = 'GABPA'
aitacdir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/{}".format(motif_name)

# Build the de-novo motif report object for the top filters, load the
# previously computed TomTom matches against HOCOMOCO, and render the
# pattern and TF summary tables.
ob = DenovoAitac(aitacdir, filt_infl_from_loss, len(top_k_filters))
# The fetch step is commented out because the TomTom report has already been
# generated; re-enable to regenerate it.
# ob.fetch_tomtom_matches(
#             meme_db="/mnt/lab_data/kundaje/users/msharmin/annotations/HOCOMOCOv11_core_pwms_HUMAN_mono.renamed.nonredundant.annotated.meme",
#             database_name="HOCOMOCO.nonredundant.annotated",
#             save_report=True, tomtom_dir= "{0}/{1}_tomtomout".format(aitacdir, "HOCOMOCO.nonredundant.annotated"))
ob.load_matched_motifs(database_name="HOCOMOCO.nonredundant.annotated")
ob.get_motif_per_celltype(match_threshold=0.05, database_name="HOCOMOCO.nonredundant.annotated")
pattern_tab, pattern_dict = ob.visualize_pattern_table()
tf_tab, tf_dict = ob.visualize_tf_table("Aitac")

ith_influence_in_aitac = mean(change in the across-task correlation(label, prediction) between the original model and the model with filter i left out)

ith_influence_in_profile = mean(change in NLL between the original model and the leave-one-out model)

ith_influence_in_binary = mean(change in loss between the original model and the leave-one-out model)

In [14]:
from vdom.helpers import (b, summary, details)
from IPython.display import display

# Collapsible table of the de-novo patterns discovered for this motif.
header = summary(
    'Click here for ', b('Denovo Patterns'), ' by ', b('{}'.format('Aitac')),
    ' in ', b(motif_name),
    ": #{}".format(len(pattern_dict)),
)
display(details(header, pattern_tab))
Click here for Denovo Patterns by Aitac in GABPA: #15
Pattern NameTF Name(s)AitacInfluence
filter4521337.74807322463
filter5611534.497079847606
filter189198.245794270415
filter43271.616548979369
filter571683.1552190306925
filter241490.7094019139436
filter521276.1927264368628
filter231074.9423718667676
filter35882.5017111741323
filter47632.4030013710853
filter61HCLUST-121_AR.UNK.0.A610.3824608959138
filter43HCLUST-185_EGR1.UNK.0.A, HCLUST-68_ZNF341.UNK.0.A603.4364124707539
filter0456.38312183208626
filter16427.25683703285654
filter3386.5203047709338
In [15]:
# display(details(summary('Click here for ', b('Motifs'), ' by ', b('{}'.format('Aitac')),
#                         ' in ', b(motif_name),
#                         ": #{}".format(len(tf_dict)),
#                        ), tf_tab))

Rsat motif clustering

In [52]:
motif_name = 'GABPA'
aitacdir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/{}".format(motif_name)
from matlas.pwms import get_motifDB_id_maps, reduce_pwm_redundancy

# Map filter names to their PWMs from the exported MEME file.
memefile = "{0}/filter_motifs_pwm.meme".format(aitacdir)
motif_id_maps = get_motifDB_id_maps(database_name=memefile)

# Cluster and merge redundant filter PWMs (RSAT-style), writing the merged
# PWMs to `<file_prefix>.hclust_pwms.tmp`.
file_prefix = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/{}/aitac".format(motif_name)
# NOTE(review): PSEUDOCOUNT and IC_THRESHOLD are not defined anywhere in this
# notebook — define or import them before running this cell.
reduce_pwm_redundancy(
        motif_id_maps,
        out_pwm_file="{}.hclust_pwms.tmp".format(file_prefix),
        pwmtype="probabilities", #probabilities
        tmp_prefix=file_prefix,
        pseudocount=PSEUDOCOUNT,
        ic_thresh=IC_THRESHOLD,
        cor_thresh=0.6,
        ncor_thresh=0.4,
        num_threads=28)
filter4 1
filter35 1
filter54 3
/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/GABPA/aitac.cor.motifs.mat.txt /mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/GABPA/aitac.ncor.motifs.mat.txt
0
16 filter24 (4, 10) 0.9999999999999998
27 filter41 (4, 21) 0.9999999999999998
40 filter61 (4, 12) 0.9999999999999996
(43, 43)
saving out filter14
saving out filter41
filter16;filter7 0.7837669610240231 1.0
filter26;filter62 0.6663147530321762 1.0
saving out filter3
saving out filter31
saving out filter63
filter0;filter8 0.9176371035680113 1.0
filter30;filter44 0.71088530682848 1.0
filter16;filter7;filter47 0.8672186778963747 1.0
filter0;filter8;filter22 0.8694045301896409 1.0
filter34;filter61 0.6329088766488 1.0
filter24;filter56 0.9874143409593419 1.0
saving out filter1
saving out filter42
filter17;filter25
/users/msharmin/code/mouse-atlas/matlas/pwms.py:449: FutureWarning:

read_table is deprecated, use read_csv instead, passing sep='\t'.

/users/msharmin/code/mouse-atlas/matlas/pwms.py:463: ClusterWarning:

scipy.cluster: The symmetric non-negative hollow observation matrix looks suspiciously like an uncondensed distance matrix

 0.8572632235836714 1.0
saving out filter45
filter2;filter9 0.8703432438339376 1.0
saving out filter16;filter7;filter47
saving out filter57
filter18;filter36 0.87593417548071 1.0
saving out filter53
saving out filter51
filter5;filter52 0.6681436387630173 1.0
saving out filter20
saving out filter34;filter61
filter0;filter8;filter22;filter32 0.6354558578356242 1.0
filter37;filter43 0.6558788717381621 1.0
saving out filter26;filter62
saving out filter46
filter18;filter36;filter59 0.6344360846319104 1.0
filter2;filter9;filter5;filter52 0.6217235786771977 0.6217235786771977
saving out filter30;filter44
filter0;filter8;filter22;filter32;filter24;filter56 0.8333762758783456 0.9722723218580699
filter13;filter27 0.6318553445003164 1.0
saving out filter23
saving out filter37;filter43
saving out filter18;filter36;filter59
filter2;filter9;filter5;filter52;filter40 0.8015078563633807 1.0
saving out filter13;filter27
saving out filter17;filter25
saving out filter0;filter8;filter22;filter32;filter24;filter56
saving out filter2;filter9;filter5;filter52;filter40
In [60]:
from matlas.reports import generate_merged_pwm_plots
from matlas.pwms import read_pwm_file, load_cisbp_maps

# Load the merged (clustered) PWMs written by the cell above and render one
# plot per merged motif; `tabledict` maps merged-motif names to table rows.
mergeddict = read_pwm_file("{}.hclust_pwms.tmp".format(file_prefix))
tab, tabledict = generate_merged_pwm_plots(mergeddict, motif_id_maps)
In [16]:
#display(tab)
In [ ]: