In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
In [4]:
from matlas.pwms import ic_scale, adjust_for_ic_scale
from matlas.matches import vdom_pssm
from modisco.visualization import viz_sequence
from matlas.matches import vdom_pssm
In [5]:
motif_name = 'CEBPB'
aitacdir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/{}".format(motif_name)

# Per-filter influence scores derived from the loss.
# NOTE(review): assumed 1-D with one entry per conv filter — confirm.
filt_infl_from_loss = np.load("{}/filt_infl_from_loss.npy".format(aitacdir))

# Indices of the 15 filters with the largest influence values,
# largest first (argsort ascending, then reversed).
top_k_filters = np.flip(np.argsort(filt_infl_from_loss))[:15]
top_k_filters
Out[5]:
array([54,  8, 32,  0,  7, 45, 16, 57,  4, 60,  3, 15, 50, 40, 43])
In [6]:
import numpy as np
import seaborn as sns

# NOTE(review): np and sns are already imported in the first cell;
# re-imported here, presumably so this cell can run standalone — confirm.
motif_name = 'CEBPB'

# Conv-filter weight tensor for this motif; shape (21, 4, 64) per Out[6]
# = (position, base, filter index).
filter_weights = np.load(
    "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/filters"
    "/{0}/{0}_filters.npy".format(motif_name))
filter_weights.shape
Out[6]:
(21, 4, 64)
In [3]:
from matplotlib import pyplot as plt

def draw_filter_weights(start_filter, end_filter):
    """Draw heatmaps of raw filter weights for filters [start_filter, end_filter).

    Lays the filters out on a fixed 4x4 grid of axes (so at most 16 filters
    per call). Each heatmap shows one filter's transposed weight matrix
    (4 bases x positions); x tick labels show the highest-weight base at
    each position.

    Reads the module-level ``filter_weights`` array, indexed as
    ``filter_weights[:, :, i]`` with axis 1 being the 4 bases.

    Parameters
    ----------
    start_filter, end_filter : int
        Half-open range of filter indices to plot (at most 16).

    Returns
    -------
    None — draws on a new matplotlib figure.
    """
    n_filters = end_filter - start_filter
    assert 0 < n_filters <= 16, "axes grid is fixed at 4x4"

    alphabets = np.array(['A', 'C', 'G', 'T'])

    fig, axes = plt.subplots(4, 4, figsize=(30, 10))

    for i in range(start_filter, end_filter):
        a = np.copy(filter_weights[:, :, i].T)  # (4, positions)
        # BUGFIX: the column index previously used `i % 4`, which is only
        # correct when start_filter is a multiple of 4. Index both row and
        # column relative to start_filter.
        offset = i - start_filter
        r, c = offset // 4, offset % 4
        ax = sns.heatmap(a, yticklabels=alphabets,
                         xticklabels=alphabets[np.argmax(a, axis=0)],
                         ax=axes[r, c])
        ax.set_ylabel("filter{}".format(i))

    return None

Raw and mean normalized filter weights

In [7]:
from modisco.visualization import viz_sequence

# Preview raw vs. mean-normalised weights for the top filters.
# NOTE(review): the trailing `break` limits this to the first (most
# influential) filter only — remove it to plot all of top_k_filters.
for i in top_k_filters:
    print('filter', i)
    # Centre each position by subtracting its mean over axis 1 (the 4 bases).
    mean_norm_weights = np.copy(filter_weights[:,:,i])
    mean_norm_weights = mean_norm_weights - mean_norm_weights.mean(1, keepdims=True)
    print('raw weights')
    viz_sequence.plot_weights(filter_weights[:,:,i])
    print('mean normalized weights')
    viz_sequence.plot_weights(mean_norm_weights)
    break
filter 54
raw weights
mean normalized weights
In [8]:
from matlas.matches import vdom_pssm
from IPython.display import display

from vdom.helpers import (h1, p, li, img, div, b, br, ul, img, a, 
                          details, summary,
                          table, thead, th, tr, tbody, td, ol)

# One collapsible entry per top filter: the raw weights next to the
# per-position mean-centred version, rendered as PSSM images.
items_to_display = []
for idx in top_k_filters:
    label = 'filter{}'.format(idx)
    raw = filter_weights[:, :, idx]
    # Centre each position by subtracting its mean over the 4 bases (axis 1).
    centred = raw - raw.mean(1, keepdims=True)
    entry = summary(b(label), p('raw weights'), vdom_pssm(raw),
                    'mean normalized', vdom_pssm(centred))
    items_to_display.append(entry)

display(details(summary(b("weights of selected filters")), summary(items_to_display)))
weights of selected filtersfilter54

raw weights

mean normalized
filter8

raw weights

mean normalized
filter32

raw weights

mean normalized
filter0

raw weights

mean normalized
filter7

raw weights

mean normalized
filter45

raw weights

mean normalized
filter16

raw weights

mean normalized
filter57

raw weights

mean normalized
filter4

raw weights

mean normalized
filter60

raw weights

mean normalized
filter3

raw weights

mean normalized
filter15

raw weights

mean normalized
filter50

raw weights

mean normalized
filter40

raw weights

mean normalized
filter43

raw weights

mean normalized
In [9]:
from matlas.pwms import ic_scale, adjust_for_ic_scale
from matlas.matches import vdom_pssm
from sklearn.preprocessing import MinMaxScaler
from collections import OrderedDict

def ic_scale_filter_weights(filter_weights, filter_count=64):
    """Standardise, rescale, and IC-scale every conv filter's weight matrix.

    All weights are z-scored jointly (one mean/std over the whole tensor)
    and clipped to [-2, 2]. Each filter slice is then min-max scaled per
    column, converted to per-row probabilities with a 0.01 pseudo-count,
    and passed through ``ic_scale``.

    Parameters
    ----------
    filter_weights : ndarray indexed as [:, :, filter]
    filter_count : int, number of filters to process (default 64)

    Returns
    -------
    OrderedDict mapping 'filter<i>' -> IC-scaled matrix.
    NOTE(review): on an AssertionError inside ``ic_scale`` this instead
    prints the filter name and returns the bare probability matrix — an
    inconsistent return type kept for debugging.
    """
    z_scores = (filter_weights - np.mean(filter_weights)) / np.std(filter_weights)
    clipped = np.clip(z_scores, -2.0, 2.0)

    scaled = OrderedDict()
    for idx in range(filter_count):
        # Min-max scale each column of this filter's (position x base) slice.
        per_filter = MinMaxScaler().fit_transform(clipped[:, :, idx])

        # Convert rows (positions) to probabilities.
        # NOTE(review): an all-zero row would divide by zero here and yield
        # NaNs — TODO confirm inputs avoid this.
        rows = per_filter / per_filter.sum(1, keepdims=True)
        rows = rows + 0.01  # pseudo-counts
        probs = rows / rows.sum(1, keepdims=True)
        try:
            scaled[f'filter{idx}'] = ic_scale(probs)
        except AssertionError:
            print(f'filter{idx}')
            return probs

    return scaled


ic_scaled_weights = ic_scale_filter_weights(filter_weights)

filter weights in IC scale

All filter weights are standardised; each filter's weights are then scaled to 0–1 and converted to probability scores before IC-scaling

In [10]:
# from IPython.display import display
# from matlas.pwms import ic_scale

# from vdom.helpers import (h1, p, li, img, div, b, br, ul, img, a, 
#                           details, summary,
#                           table, thead, th, tr, tbody, td, ol)

# items_to_display = []
# for i in range(64):
#     filter_name = 'filter{}'.format(i)
#     query = vdom_pssm(ic_scaled_weights[filter_name])
#     items_to_display.append(summary(filter_name, query))

# display(details(summary(b("filter weights in IC scale")), summary(items_to_display)))
In [11]:
#4,7,57

# One collapsible entry per top filter: label plus its IC-scaled PSSM image.
selected_items_to_display = [
    summary('filter{}'.format(idx), vdom_pssm(ic_scaled_weights['filter{}'.format(idx)]))
    for idx in top_k_filters
]

display(details(summary(b("Click here to see the filter weights in IC scale")),
                summary(selected_items_to_display)))
Click here to see the filter weights in IC scalefilter54filter8filter32filter0filter7filter45filter16filter57filter4filter60filter3filter15filter50filter40filter43

Importance scores of the filtered sequences

In [13]:
# Importance scores per sequence; indexing below implies shape
# (n_seqs, seq_len, 4) — activated subset prints as (8464, 1346, 4).
imp_scores = np.load(aitacdir+"/imp_scores.npy")
# NOTE(review): presumably reverse-complement importance scores — confirm.
rev_imp_scores = np.load(aitacdir+"/rev_imp_scores.npy")
# Indexed as OCR_matrix[filter, seq]; entries equal to 1 appear to mark
# sequences on which the filter activated.
OCR_matrix = np.load(aitacdir+"/OCR_matrix_0.npy")
activations2 = np.load(aitacdir+"/activations2_0.npy")
seq_indices_of_activation = np.load(aitacdir+"/seq_indices_of_activation_0.npy")

# NOTE(review): `already_visited` is never appended to, so the `continue`
# below can never trigger; the two `break`s limit this preview to the first
# sequence of the first (top) filter only.
already_visited = []
for filter_no in top_k_filters:
    filter_name = "filter{}".format(filter_no)
    print(filter_name)
    # Sequences on which this filter activated.
    activated_seqs = np.argwhere(OCR_matrix[filter_no,:]==1)[:,0]; print(activated_seqs.shape)
    activated_scores = imp_scores[activated_seqs]; print(activated_scores.shape)
    for i in range(20):
        if i in already_visited: continue
        print('seq', activated_seqs[i])
        # Plot a 146-bp window (positions 600:746) of the importance track.
        viz_sequence.plot_weights(activated_scores[i, 600:746,:],
                                  subticks_frequency=10, figsize=(20,1))
        break
    break
filter54
(8464,)
(8464, 1346, 4)
seq 0
In [15]:
from matlas.sliding_similarities import aggregate_seqlets


seqlen = imp_scores.shape[1]
for filter_no in top_k_filters:
    # (sequence index, window start) pairs where this filter activated.
    activated_subseqs = np.argwhere(seq_indices_of_activation[filter_no] != 0)

    activated_subscores = []      # forward importance, one 21-bp window each
    rev_activated_subscores = []  # same windows mirrored onto the reverse track
    sub_activations = []          # filter activation value at each window start
    # 21 = conv filter width (filter_weights has 21 positions per filter).
    # NOTE: the unused `rev_sub_activations` list from the original cell
    # was removed — it was never appended to or read.
    for i, j, k in zip(activated_subseqs.T[0], activated_subseqs.T[1], activated_subseqs.T[1] + 21):
        activated_subscores.append(imp_scores[i, j:k, :])
        # Window [j, k) on the forward strand maps to [seqlen-k, seqlen-j)
        # on the reversed track.
        rev_activated_subscores.append(rev_imp_scores[i, (seqlen-k):(seqlen-j), :])
        sub_activations.append(activations2[i, filter_no, j])

    activated_subscores = np.array(activated_subscores)
    rev_activated_subscores = np.array(rev_activated_subscores)
    sub_activations = np.array(sub_activations)

    # Order windows by activation strength, strongest first, and keep 100.
    sub_act_ind = np.flip(np.argsort(sub_activations))
    activated_subscores = activated_subscores[sub_act_ind]
    rev_activated_subscores = rev_activated_subscores[sub_act_ind]

    avg_activated_subscores = aggregate_seqlets(activated_subscores[:100],
                                                rev_activated_subscores[:100], 'sum')
    print('Aggregated importance scores using direct sum for filter-{}'.format(filter_no))
    viz_sequence.plot_weights(avg_activated_subscores)
Aggregated number of sequences is 68
Aggregated importance scores using direct sum for filter-54
Aggregated number of sequences is 70
Aggregated importance scores using direct sum for filter-8
Aggregated number of sequences is 68
Aggregated importance scores using direct sum for filter-32
Aggregated number of sequences is 75
Aggregated importance scores using direct sum for filter-0
Aggregated number of sequences is 82
Aggregated importance scores using direct sum for filter-7
Aggregated number of sequences is 66
Aggregated importance scores using direct sum for filter-45
Aggregated number of sequences is 66
Aggregated importance scores using direct sum for filter-16
Aggregated number of sequences is 67
Aggregated importance scores using direct sum for filter-57
Aggregated number of sequences is 77
Aggregated importance scores using direct sum for filter-4
Aggregated number of sequences is 72
Aggregated importance scores using direct sum for filter-60
Aggregated number of sequences is 69
Aggregated importance scores using direct sum for filter-3
Aggregated number of sequences is 71
Aggregated importance scores using direct sum for filter-15
Aggregated number of sequences is 74
Aggregated importance scores using direct sum for filter-50
Aggregated number of sequences is 71
Aggregated importance scores using direct sum for filter-40
Aggregated number of sequences is 70
Aggregated importance scores using direct sum for filter-43

AI-TAC motifs

In [17]:
from matlas.matches import DenovoAitac
# Build the de-novo motif report for this TF from the AI-TAC filters.
motif_name = 'CEBPB'
aitacdir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/{}".format(motif_name)

ob = DenovoAitac(aitacdir, filt_infl_from_loss, len(top_k_filters))
# NOTE(review): fetch_tomtom_matches appears to have been run previously and
# its output cached on disk; load_matched_motifs below reads the matches.
# Re-enable to regenerate the TomTom report.
# ob.fetch_tomtom_matches(
#             meme_db="/mnt/lab_data/kundaje/users/msharmin/annotations/HOCOMOCOv11_core_pwms_HUMAN_mono.renamed.nonredundant.annotated.meme",
#             database_name="HOCOMOCO.nonredundant.annotated",
#             save_report=True, tomtom_dir= "{0}/{1}_tomtomout".format(aitacdir, "HOCOMOCO.nonredundant.annotated"))
ob.load_matched_motifs(database_name="HOCOMOCO.nonredundant.annotated")
ob.get_motif_per_celltype(match_threshold=0.05, database_name="HOCOMOCO.nonredundant.annotated")
pattern_tab, pattern_dict = ob.visualize_pattern_table()
tf_tab, tf_dict = ob.visualize_tf_table("Aitac")
In [18]:
from vdom.helpers import (b, summary, details)
from IPython.display import display

# Collapsible table: de-novo patterns (filters) with matched TF names and
# influence scores, as produced by visualize_pattern_table above.
display(details(summary('Click here for ', b('Denovo Patterns'), ' by ', b('{}'.format('Aitac')),
                        ' in ', b(motif_name),
                        ": #{}".format(len(pattern_dict)),
                       ), pattern_tab))
Click here for Denovo Patterns by Aitac in CEBPB: #15
Pattern NameTF Name(s)AitacInfluence
filter540.7728701414398944
filter8HCLUST-185_EGR1.UNK.0.A0.014370349729647172
filter320.004895535478607343
filter00.004231883591772969
filter70.0037567814797779897
filter450.003061682194380346
filter16HCLUST-176_CBFB.UNK.0.A, HCLUST-121_AR.UNK.0.A0.001199900779109973
filter570.0010434280145358488
filter40.0009882113168206406
filter600.0008521652854643476
filter30.0006908377935115507
filter15HCLUST-127_FOXJ2.UNK.0.A0.0006621855519941209
filter500.0006470899068144286
filter400.0005823330687212506
filter430.00048183436839832757
In [19]:
# Collapsible table: matched TFs with their associated patterns and
# significance values, as produced by visualize_tf_table above.
display(details(summary('Click here for ', b('Motifs'), ' by ', b('{}'.format('Aitac')),
                        ' in ', b(motif_name),
                        ": #{}".format(len(tf_dict)),
                       ), tf_tab))
Click here for Motifs by Aitac in CEBPB: #4
TF NamePattern(s)
HCLUST-185_EGR1.UNK.0.A
Pattern NameAitacSignificance
filter80.026976
filter400.00021119099999999998
HCLUST-176_CBFB.UNK.0.A
Pattern NameAitacSignificance
filter160.014642700000000002
HCLUST-121_AR.UNK.0.A
Pattern NameAitacSignificance
filter160.0371034
HCLUST-127_FOXJ2.UNK.0.A
Pattern NameAitacSignificance
filter150.026797900000000003

RSAT clustering on AI-TAC motifs

In [16]:
motif_name = 'CEBPB'
aitacdir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/{}".format(motif_name)
from matlas.pwms import get_motifDB_id_maps, reduce_pwm_redundancy
# NOTE(review): wildcard import — it supplies PSEUDOCOUNT and IC_THRESHOLD
# used below; consider importing them explicitly.
from matlas.genome_data import *

# The AI-TAC filter PWMs (MEME format) act as the motif "database" here.
memefile = "{0}/filter_motifs_pwm.meme".format(aitacdir)
motif_id_maps = get_motifDB_id_maps(database_name=memefile)

# Cluster and merge redundant filter PWMs (see section heading: RSAT-style
# clustering); merged PWMs are written to <file_prefix>.hclust_pwms.tmp.
file_prefix = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/{}/aitac".format(motif_name)
reduce_pwm_redundancy(
        motif_id_maps,
        out_pwm_file="{}.hclust_pwms.tmp".format(file_prefix),
        pwmtype="probabilities", #probabilities
        tmp_prefix=file_prefix,
        pseudocount=PSEUDOCOUNT,
        ic_thresh=IC_THRESHOLD,
        cor_thresh=0.6,   # NOTE(review): merge thresholds hard-coded here —
        ncor_thresh=0.4,  # see reduce_pwm_redundancy for their exact meaning
        num_threads=28)
filter4 2
filter8 0
filter15 0
filter16 0
filter32 0
filter40 1
filter54 2
/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/CEBPB/aitac.cor.motifs.mat.txt /mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_a_alex/CEBPB/aitac.ncor.motifs.mat.txt
0
4 filter5 (4, 12) 0.9999999999999996
7 filter12 (4, 21) 0.9999999999999998
(26, 26)
filter0;filter19 0.7078714398899135 1.0
filter2;filter41 0.7297810025907647 1.0
saving out filter34
saving out filter42
saving out filter57
filter43;filter61 0.6862553250824988 1.0
filter1;filter3 0.7906007147545048 0.8894258040988179
filter36;filter52 0.6600176379662774 1.0
saving out filter60
saving out filter11
filter44;filter50 0.6855557454701704 1.0
filter26;filter39 0.7108863479037282 1.0
saving out filter0;filter19
saving out filter35
saving out filter2;filter41
saving out filter45
saving out filter27
saving out filter5
saving out filter12
saving out filter7
saving out filter1;filter3
saving out filter26;filter39
saving out filter43;filter61
saving out filter36;filter52
saving out filter17
saving out filter44;filter50
/users/msharmin/code/mouse-atlas/matlas/pwms.py:449: FutureWarning:

read_table is deprecated, use read_csv instead, passing sep='\t'.

/users/msharmin/code/mouse-atlas/matlas/pwms.py:463: ClusterWarning:

scipy.cluster: The symmetric non-negative hollow observation matrix looks suspiciously like an uncondensed distance matrix

In [17]:
from matlas.reports import generate_merged_pwm_plots
from matlas.pwms import read_pwm_file, load_cisbp_maps

# Load the merged (clustered) PWMs written by reduce_pwm_redundancy and
# render a summary table of per-cluster PWM plots.
mergeddict = read_pwm_file("{}.hclust_pwms.tmp".format(file_prefix))
tab, tabledict = generate_merged_pwm_plots(mergeddict, motif_id_maps)
In [3]:
#display(tab)
In [ ]: