In [1]:
%load_ext autoreload
%autoreload 2

import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
import pickle
import h5py
import glob

from matlas.genome_data import *
from matlas.pwms import shorten_tfname, get_motifDB_id_maps
from matlas.matches import fig2vdom, vdom_pssm
from matlas.modisco_results import ModiscoResult
from vdom.helpers import (h1, p, li, img, div, b, br, ul, a,
                          details, summary,
                          table, thead, th, tr, tbody, td, ol)
from IPython.display import display


project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
modisco_root = "{0}/{1}/Naive_modisco2019".format(srv_root, data_name)

homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)
basset_root = "{0}/{1}/Naive_basset_motifs".format(srv_root, data_name)
reportfile= "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"
load data from labcluster
Using TensorFlow backend.
TF-MoDISco is using the TensorFlow backend.
In [2]:
from matlas.matches import DenovoModisco

task_idx = 273
modisco_dir = "{0}/task_{1}-naivegw".format(modisco_root, task_idx)
summit_file = "{0}/{1}/fineFactorized/task_{2}-naivegw/predictions/task_{2}-{3}Summit.calibrated.{1}.tab.gz".format(project_root, data_name, task_idx, "Naive")
perf_file = "{0}/task_{1}-naivegw/NaiveauPRC.txt".format(model_root, task_idx)
deeplift_hdf = "{0}/{1}/Naive_deeplift2019/task_{2}-naivegw/summit.h5".format(srv_root, data_name, task_idx)
ob = DenovoModisco(modisco_dir)
pattern_name = None
pattern_names = list(ob.denovo_pwms.keys()) if pattern_name is None else [pattern_name]
query = ob.load_chomped_pattern(pattern_names[0], plot=False)
#loads all the important scores   
ob.initialize_with_importance_scores(summit_file, perf_file, deeplift_hdf, query.keys(), use_all_keys=True) 
ob.per_base_imp
Out[2]:
0.05627951420283896
In [3]:
ob.keys_for_scoring = ['sum_score'] 
ob.keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']

method_to_null_scores = OrderedDict()
method_to_seqlet_scores = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
    query = ob.load_chomped_pattern(pattern_name, rc=False)
    method_to_null_scores[pattern_name] = ob.compute_null_scores_in_seqlets(pattern_name, query, 
                                                                       use_all_seqlets=True, debug=True)
    
    method_to_seqlet_scores[pattern_name] = ob.compute_scores_in_seqlets(pattern_name, query, 
                                                                       use_all_seqlets=True, debug=True)
    #new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits)
In [7]:
# from matlas.match_utils import get_any_high_scoring_instances
# seqlet_hits = OrderedDict()

# for pattern_name in ob.denovo_pwms.keys():
#     query = ob.load_chomped_pattern(pattern_name, rc=False)
#     seqlet_hits[pattern_name] = get_any_high_scoring_instances(method_to_seqlet_scores[pattern_name], 
#                                                     min_tot_imp=ob.per_base_imp*len(query['ppm']), 
#                                                      keys=ob.keys_for_scoring)
#     seqlet_hits[pattern_name]['sum_score'] = seqlet_hits[pattern_name]['sum_score']/len(query['ppm'])
          
seqlet_hits = method_to_seqlet_scores
In [8]:
# null_hits = OrderedDict()
# for pattern_name in ob.denovo_pwms.keys():
#     query = ob.load_chomped_pattern(pattern_name, rc=False)
#     null_hits[pattern_name] = get_any_high_scoring_instances(method_to_null_scores[pattern_name], 
#                                                     min_tot_imp=ob.per_base_imp*len(query['ppm']), 
#                                                      keys=ob.keys_for_scoring)
#     null_hits[pattern_name]['sum_score'] = null_hits[pattern_name]['sum_score']/len(query['ppm'])
null_hits = method_to_null_scores
In [9]:
from sklearn.preprocessing import StandardScaler

def scale_data(_hits, scaler=None):
    if scaler is None:
        # should be used only during training
        scaler = StandardScaler()
        scaler.fit(_hits)
        
    # Apply transform to both the training set and testing set
    _hits = scaler.transform(_hits)
    
    return scaler, _hits

features = ['sum_score', 'pwm_scan',
            'hyp_cwm_scan', 'cwm_scan',
            'hyp_cwm_jaccard', 'cwm_jaccard',
            'hyp_cwm_masked_convolution'
           ]

scaled_seqlet_hits = OrderedDict()
scaled_null_hits = OrderedDict()
scalers = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
    scalers[pattern_name], scaled_seqlet_hits[pattern_name] = scale_data(seqlet_hits[pattern_name][features])
    _, scaled_null_hits[pattern_name] = scale_data(null_hits[pattern_name][features], scalers[pattern_name])
    
for pattern_name in ob.denovo_pwms.keys():
    scaled_seqlet_hits[pattern_name] = pd.DataFrame(scaled_seqlet_hits[pattern_name], columns=features)
    #scaled_seqlet_hits[pattern_name].columns = seqlet_hits[pattern_name][features].columns
    
for pattern_name in ob.denovo_pwms.keys():
    scaled_null_hits[pattern_name] = pd.DataFrame(scaled_null_hits[pattern_name], columns=features)
    #scaled_null_hits[pattern_name].columns = null_hits[pattern_name][features].columns
In [10]:
from sklearn.decomposition import PCA

def reduce_data(_hits, pca=None):
    if pca is None:
        # Make an instance of the Model
        pca = PCA(.98)
        pca.fit(_hits)
        
    #Apply the mapping (transform) to both the training set and the test set.
    _hits = pca.transform(_hits)
    
    return pca, _hits


features = ['sum_score', 'pwm_scan',
            'hyp_cwm_scan', 'cwm_scan',
            'hyp_cwm_jaccard', 'cwm_jaccard',
            'hyp_cwm_masked_convolution'
           ]

pca_seqlet_hits = OrderedDict()
pca_null_hits = OrderedDict()
pcas = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
    pcas[pattern_name], pca_seqlet_hits[pattern_name] = reduce_data(scaled_seqlet_hits[pattern_name][features])
    _, pca_null_hits[pattern_name] = reduce_data(scaled_null_hits[pattern_name][features], pcas[pattern_name])
    
for pattern_name in ob.denovo_pwms.keys():
    pca_seqlet_hits[pattern_name] = pd.DataFrame(pca_seqlet_hits[pattern_name])
    pca_seqlet_hits[pattern_name].columns = ["PC{}".format(i+1) for i in range(pca_seqlet_hits[pattern_name].shape[1])]
    
for pattern_name in ob.denovo_pwms.keys():
    pca_null_hits[pattern_name] = pd.DataFrame(pca_null_hits[pattern_name])
    pca_null_hits[pattern_name].columns = ["PC{}".format(i+1) for i in range(pca_null_hits[pattern_name].shape[1])]
In [11]:
outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/instances2019/task_273"
_df_dict = OrderedDict()
for i, pattern_name in enumerate(ob.denovo_pwms.keys()):
    print(pattern_name)
    fname = "{}/{}.gw.gz".format(outdir, i)
    _df_dict[pattern_name] = pd.read_csv(fname, header=0, index_col=0, sep="\t")
    print(_df_dict[pattern_name].shape)
metacluster_1/pattern_0
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/numpy/lib/arraysetops.py:522: FutureWarning:

elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison

(3260164, 27)
metacluster_1/pattern_1
(3392147, 27)
metacluster_1/pattern_2
(3390929, 27)
metacluster_1/pattern_3
(3392147, 27)
metacluster_1/pattern_4
(3390929, 27)
metacluster_1/pattern_5
(3392147, 27)
metacluster_1/pattern_6
(3393254, 27)
metacluster_1/pattern_7
(3397481, 27)
metacluster_1/pattern_8
(3380835, 27)
metacluster_1/pattern_9
(3260164, 27)
metacluster_1/pattern_10
(3384625, 27)
metacluster_1/pattern_11
(3390929, 27)
metacluster_1/pattern_12
(3393254, 27)
metacluster_1/pattern_13
(3391553, 27)
metacluster_1/pattern_14
(3391553, 27)
metacluster_1/pattern_15
(3384625, 27)
metacluster_1/pattern_16
(3393254, 27)
metacluster_1/pattern_17
(2716324, 27)
metacluster_1/pattern_18
(3196262, 27)
metacluster_1/pattern_19
(3329663, 27)
In [12]:
_reduced_dict = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
    _, _reduced_dict[pattern_name] = scale_data(_df_dict[pattern_name][features], scalers[pattern_name])
    _, _reduced_dict[pattern_name] = reduce_data(_reduced_dict[pattern_name], pcas[pattern_name])
    _reduced_dict[pattern_name] = pd.DataFrame(_reduced_dict[pattern_name])
    _reduced_dict[pattern_name].columns = ["PC{}".format(i+1) for i in range(_reduced_dict[pattern_name].shape[1])]
In [20]:
import matplotlib.pyplot as plt 
from sklearn.cluster import Birch
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.metrics import davies_bouldin_score
import warnings
warnings.filterwarnings("ignore")

keys = ['PC1', 'PC2']
gmms = OrderedDict()
birchs = OrderedDict()
gmm_cno = OrderedDict()
brc_cno = OrderedDict()
everything = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
    info = OrderedDict()
    #pattern_name = "metacluster_1/pattern_19"
    key_ks = []
    for i, key in enumerate(keys):
        a = pca_seqlet_hits[pattern_name][key].values
        b = pca_null_hits[pattern_name][key].values
        the_k = np.percentile(b, 90)
        key_ks.append(the_k)
        info['PC1_k'] = the_k
        break
    print(pattern_name, key_ks)
    
    fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(22,10))
    ax_all = axes[0, 0]
    ax_allBrc = axes[0, 1]
    ax_gwBrc = axes[0, 2]
    ax_gmm = axes[0, 3]
    
    ax_sq = axes[1, 0]
    ax_sqbrc = axes[1, 1]
    ax_dnbrc = axes[1, 2]
    ax_cmp = axes[1, 3]
    
    df = pca_seqlet_hits[pattern_name]
    #df = df[df['PC1']>=the_k][['PC1', 'PC2']]
    small_df = df[df['PC1']>=the_k][['PC1', 'PC2']]
    df = df[df['PC1']>=the_k][['PC1', 'PC2']]
    print(df.shape)
    clustering = Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True).fit(
        small_df.values)
    means = []
    for i in [0,1,2]:
        means.append(np.mean(small_df[clustering.labels_==i], axis=0))
    means = np.array(means)
    sq_cno = np.argmax(np.array(means[:,0])) # use only 1st column or PC1 to selelct desired cluster
    
    ax_sq.scatter(small_df['PC1'], small_df['PC2'], c=clustering.labels_, s=1, cmap='plasma');#
    ax_sq.set_xlabel('PC1')
    ax_sq.set_ylabel('PC2')
    ax_sq.set_title(pattern_name + " (Birch, Seqlets), {}".format(sq_cno))
    info['sq_cno'] = sq_cno
    info['sq_brc'] = clustering
    
    #anything
    all_df = _reduced_dict[pattern_name][['PC1', 'PC2']]
    df = all_df
    ax_all.scatter(df['PC1'], df['PC2'], s=1, cmap='plasma');
    ax_all.set_xlabel('PC1')
    ax_all.set_ylabel('PC2')
    ax_all.set_title(pattern_name+ " (All)")
    
    #all birch
    all_clustering = Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True).fit(
        df.values)
    all_means = []
    for i in [0,1,2]:
        all_means.append(np.mean(df[all_clustering.labels_==i], axis=0))
    all_means = np.array(all_means)
    all_cno = np.argmax(np.array(all_means[:,0]))
    
    ax_allBrc.scatter(df['PC1'], df['PC2'], c=all_clustering.labels_, s=1, cmap='plasma');
    ax_allBrc.set_xlabel('PC1')
    ax_allBrc.set_ylabel('PC2')
    ax_allBrc.set_title(pattern_name+ " (allBirch), {}".format(all_cno))
    info['all_cno'] = all_cno
    info['all_brc'] = all_clustering
    
    #reduced and all birch
    df = df[df['PC1']>=the_k][['PC1', 'PC2']]
    all_labels = all_clustering.labels_[all_df['PC1']>=the_k]
    ax_gwBrc.scatter(df['PC1'], df['PC2'], c=all_labels, s=1, cmap='plasma');
    ax_gwBrc.set_xlabel('PC1')
    ax_gwBrc.set_ylabel('PC2')
    ax_gwBrc.set_title(pattern_name+ " (allBirch, GW), {}".format(all_cno))
    
    #gw gmm
    gmm = GaussianMixture(n_components=3, means_init= means,
                              covariance_type='full', 
                              init_params='kmeans').fit(df[['PC1', 'PC2']])
    gmm_labels = gmm.predict(df)
    means = []
    for i in [0,1,2]:
        means.append(np.mean(df[gmm_labels==i], axis=0))
    means = np.array(means)
    gmm_cno[pattern_name] = np.argmax(np.array(means[:,0]))
    
    ax_gmm.scatter(df['PC1'], df['PC2'], c=gmm_labels, s=1, cmap='plasma');
    ax_gmm.set_xlabel('PC1')
    ax_gmm.set_ylabel('PC2')
    ax_gmm.set_title(pattern_name+ " (sqGMM, GW), {}".format(gmm_cno[pattern_name]))
    info['gmm_cno'] = gmm_cno[pattern_name]
    info['gmm'] = gmm
    
    #gw birch prediction
    brc_labels = clustering.predict(df.values)
    ax_sqbrc.scatter(df['PC1'], df['PC2'], c=brc_labels, s=1, cmap='plasma');#
    ax_sqbrc.set_xlabel('PC1')
    ax_sqbrc.set_ylabel('PC2')
    ax_sqbrc.set_title(pattern_name + " (sqBirch, GW), {}".format(sq_cno))
    
    
    #gw birch new
    brc = Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True).fit(
        df.values)
    means = []
    for i in [0,1,2]:
        means.append(np.mean(df[brc.labels_==i], axis=0))
    means = np.array(means)
    brc_cno[pattern_name] = np.argmax(np.array(means[:,0]))
    
    ax_dnbrc.scatter(df['PC1'], df['PC2'], c=brc.labels_, s=1, cmap='plasma');
    ax_dnbrc.set_xlabel('PC1')
    ax_dnbrc.set_ylabel('PC2')
    ax_dnbrc.set_title(pattern_name+ " (dnBirch, GW), {}".format(brc_cno[pattern_name]))
    
    info['brc_cno'] = brc_cno[pattern_name]
    info['reduced_brc'] = brc
    
    #fitness
    #seq_brc_db = davies_bouldin_score(small_df.values, clustering.labels_)
    #gw_brc_db = davies_bouldin_score(df.values, brc_labels)
    #brc_db = davies_bouldin_score(df.values, brc.labels_)
    #gmm_db = davies_bouldin_score(df.values, gmm_labels)
    
    new_all_brc_labels = np.array([0]*df.shape[0]); new_all_brc_labels[all_labels==all_cno] = 1
    new_seq_brc_labels = np.array([0]*small_df.shape[0]); new_seq_brc_labels[clustering.labels_==sq_cno] = 1
    new_gw_brc_labels = np.array([0]*df.shape[0]); new_gw_brc_labels[brc.labels_==sq_cno] = 1
    new_brc_labels = np.array([0]*df.shape[0]); new_brc_labels[brc.labels_==brc_cno[pattern_name]] = 1
    new_gmm_labels = np.array([0]*df.shape[0]); new_gmm_labels[gmm_labels==gmm_cno[pattern_name]] = 1
    
    new_all_brc_db = davies_bouldin_score(df.values, new_all_brc_labels)
    new_seq_brc_db = davies_bouldin_score(small_df.values, new_seq_brc_labels)
    new_gw_brc_db = davies_bouldin_score(df.values, new_gw_brc_labels)
    new_brc_db = davies_bouldin_score(df.values, new_brc_labels)
    new_gmm_db = davies_bouldin_score(df.values, new_gmm_labels)

    barWidth = 0.25
    bars = [new_all_brc_db, new_seq_brc_db, new_gmm_db, new_brc_db, new_gw_brc_db]
    
    r1 = np.arange(len(bars))
    ax_cmp.bar(r1, bars, color='#557f2d', width=barWidth, edgecolor='white')
    ax_cmp.set_xticks([r+barWidth/2  for r in range(len(bars))])
    ax_cmp.set_xticklabels(['Brc-All, {}'.format(all_cno), 
                            'Brc-Seqlets, {}'.format(sq_cno),
                            'GMM-GW, {}'.format(gmm_cno[pattern_name]), 
                            'dnBrc-GW, {}'.format(brc_cno[pattern_name]), 
                            'sqBrc-GW, {}'.format(sq_cno)], 
                           fontdict={'fontsize':12},
                           rotation='vertical')
    #ax_cmp.legend()
    ax_cmp.set_title('Davies Bouldin Scores')
    plt.tight_layout()
    plt.show()
    
    everything[pattern_name] = info
    #break
metacluster_1/pattern_0 [1.7198816939104558]
(107441, 2)
metacluster_1/pattern_1 [2.001332244271657]
(82131, 2)
metacluster_1/pattern_2 [1.7919665738371213]
(111991, 2)
metacluster_1/pattern_3 [2.5691248798406874]
(61988, 2)
metacluster_1/pattern_4 [2.042705031642133]
(97221, 2)
metacluster_1/pattern_5 [2.653934558669765]
(59385, 2)
metacluster_1/pattern_6 [2.0944077927033]
(78905, 2)
metacluster_1/pattern_7 [2.755896027557707]
(47493, 2)
metacluster_1/pattern_8 [2.3784424903039803]
(80330, 2)
metacluster_1/pattern_9 [2.1775024079472414]
(89982, 2)
metacluster_1/pattern_10 [2.6478769297127034]
(65735, 2)
metacluster_1/pattern_11 [1.9798633736837061]
(85283, 2)
metacluster_1/pattern_12 [2.5839614943313443]
(69323, 2)
metacluster_1/pattern_13 [2.4314707472089023]
(71213, 2)
metacluster_1/pattern_14 [2.474028973986961]
(73077, 2)
metacluster_1/pattern_15 [2.2377079082875437]
(78386, 2)
metacluster_1/pattern_16 [2.4442487378969266]
(76927, 2)
metacluster_1/pattern_17 [2.469670415261553]
(2092, 2)
metacluster_1/pattern_18 [2.0568376639611095]
(86920, 2)
metacluster_1/pattern_19 [2.089742549655602]
(103251, 2)
In [35]:
import matplotlib.pyplot as plt    
    
keys = ['PC1', 'PC2']
for pattern_name in ob.denovo_pwms.keys():
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,5))
    
    for i, key in enumerate(keys):
        a = pca_seqlet_hits[pattern_name][key].values
        b = pca_null_hits[pattern_name][key].values
    
        n, bins, patches = axes[i].hist(a, 200, color='red', histtype='step')
        n, bins, patches = axes[i].hist(b, 200, color='green', histtype='step')
        axes[i].axvline(x=np.percentile(b, 95))

    
    plt.title(pattern_name)
    plt.show()
In [46]:
features = ['sum_score', 'pwm_scan',
            'hyp_cwm_scan', 'cwm_scan',
            'hyp_cwm_jaccard', 'cwm_jaccard',
            'hyp_cwm_masked_convolution'
           ]

ob.keys_for_scoring = features
for pattern_name in ob.denovo_pwms.keys():
    #pattern_name = "metacluster_1/pattern_7"
    query = ob.load_chomped_pattern(pattern_name, rc=False)
    #seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=True)
    new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits[pattern_name])
(647226, 5)
(647226, 9) 2 7457
(551397, 6)
(551397, 10) 0 9621
(560822, 5)
(560822, 9) 2 140575
(551397, 5)
(551397, 9) 2 102653
(560822, 5)
(560822, 9) 0 123336
(551397, 6)
(551397, 10) 0 110981
(531787, 5)
(531787, 9) 2 122509
(522529, 6)
(522529, 10) 1 149055
(590518, 6)
(590518, 10) 0 177247
(647226, 6)
(647226, 10) 1 157308
(580526, 6)
(580526, 10) 0 83470
(560822, 6)
(560822, 10) 2 117869
(531787, 6)
(531787, 10) 1 134077
(541264, 5)
(541264, 9) 1 123664
(541264, 5)
(541264, 9) 2 97134
(580526, 6)
(580526, 10) 1 120227
(531787, 5)
(531787, 9) 1 106299
(22749, 6)
(22749, 10) 0 2553
(599333, 6)
(599333, 10) 2 100045
(645129, 6)
(645129, 10) 1 297792
In [30]:
import matplotlib.pyplot as plt

for pattern_name in ob.denovo_pwms.keys():
    a = method_to_seqlet_scores[pattern_name]['sum_score'].flatten()
    b = method_to_null_scores[pattern_name]['sum_score'].flatten()

    c = method_to_seqlet_scores[pattern_name]['hyp_cwm_masked_convolution'].flatten()
    d = method_to_null_scores[pattern_name]['hyp_cwm_masked_convolution'].flatten()

    #fig = plt.figure(figsize=(10, 8))
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,5))
    n, bins, patches = axes[0].hist(a, 200, color='red', histtype='step')
    n, bins, patches = axes[0].hist(b, 200, color='green', histtype='step')
    axes[0].axvline(x=np.percentile(b, 99.95)/len(query['ppm']))

    n, bins, patches = axes[1].hist(c, 200, color='red', histtype='step')
    n, bins, patches = axes[1].hist(d, 200, color='green', histtype='step')
    axes[1].axvline(x=np.percentile(d, 95))
    plt.title(pattern_name)
    plt.show()
    
In [32]:
import matplotlib.pyplot as plt

keys = ['hyp_cwm_jaccard', 'hyp_cwm_scan', 'hyp_cwm_masked_convolution']
for pattern_name in ob.denovo_pwms.keys():
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20,5))
    
    for i, key in enumerate(keys):
        a = seqlet_hits[pattern_name][key].values
        b = null_hits[pattern_name][key].values
    
        n, bins, patches = axes[i].hist(a, 200, color='red', histtype='step')
        n, bins, patches = axes[i].hist(b, 200, color='green', histtype='step')
        axes[i].axvline(x=np.percentile(b, 95))

    
    plt.title(pattern_name)
    plt.show()
In [37]:
#remove instances based on all match scores
filtered_seqlet_hits = OrderedDict()
filtered_null_hits = OrderedDict()
filtering_features = ['pwm_scan',
            'hyp_cwm_scan', 'cwm_scan',
            'hyp_cwm_jaccard', 'cwm_jaccard',
            'hyp_cwm_masked_convolution'
           ]

for pattern_name in ob.denovo_pwms.keys():
    for key in filtering_features:
        d = method_to_null_scores[pattern_name][key].flatten()
        the_k = np.percentile(d, 90)
        df = seqlet_hits[pattern_name]
        df = df[df[key]>the_k]
    filtered_seqlet_hits[pattern_name] = df
In [41]:
features = ['sum_score', 'pwm_scan',
            'hyp_cwm_scan', 'cwm_scan',
            'hyp_cwm_jaccard', 'cwm_jaccard',
            'hyp_cwm_masked_convolution'
           ]

ob.keys_for_scoring = features
for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
    query = ob.load_chomped_pattern(pattern_name, rc=False)
    #seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=True)
    new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, filtered_seqlet_hits[pattern_name])
(76154, 5)
(76154, 9) 1 7475
(49266, 5)
(49266, 9) 2 9380
(69300, 5)
(69300, 9) 2 3690
(45218, 5)
(45218, 9) 2 1637
(53022, 6)
(53022, 10) 2 1849
(35398, 5)
(35398, 9) 2 1166
(50899, 5)
(50899, 9) 1 10585
(24021, 5)
(24021, 9) 1 1472
(51483, 5)
(51483, 9) 1 696
(60627, 6)
(60627, 10) 1 519
(44698, 6)
(44698, 10) 1 416
(54805, 5)
(54805, 9) 1 9240
(41592, 6)
(41592, 10) 2 1877
(43871, 5)
(43871, 9) 0 1530
(42815, 5)
(42815, 9) 1 1433
(60000, 5)
(60000, 9) 1 8015
(48099, 6)
(48099, 10) 1 6962
(913, 6)
(913, 10) 1 24
(54953, 6)
(54953, 10) 1 11647
(56902, 6)
(56902, 10) 2 8647
In [ ]:
for pattern_name in ob.denovo_pwms.keys():
In [86]:
ob.keys_for_scoring = ['sum_score'] 
ob.keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']

for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
    query = ob.load_chomped_pattern(pattern_name, rc=False)
    seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=False)
    new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits)
(260431, 8)
(260431, 12) 2
(120889, 8)
(120889, 12) 0
(52636, 8)
(52636, 12) 1
(21865, 7)
(21865, 11) 1
(18675, 7)
(18675, 11) 2
(18124, 7)
(18124, 11) 1
(18098, 7)
(18098, 11) 1
(14878, 8)
(14878, 12) 0
(12659, 8)
(12659, 12) 1
(13967, 8)
(13967, 12) 2
(6659, 7)
(6659, 11) 2
(7712, 8)
(7712, 12) 2
(6500, 7)
(6500, 11) 2
(4958, 7)
(4958, 11) 1
(4715, 6)
(4715, 10) 1
(3343, 7)
(6686, 11) 1
(3161, 8)
(6322, 12) 1
(49, 4)
(98, 8) 2
(1151, 8)
(2302, 12) 1
(1269, 7)
(2538, 11) 1
In [87]:
features = ['sum_score', 'pssm_scan',
            'hyp_cwm_scan', 'cwm_scan',
            'hyp_cwm_jaccard', 'cwm_jaccard','pssm_jaccard',
            'hyp_cwm_masked_convolution'
           ]
ob.keys_for_scoring = features
for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
    query = ob.load_chomped_pattern(pattern_name, rc=False)
    seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=False)
    new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits)
(260431, 5)
(260431, 9) 2
(120889, 5)
(120889, 9) 1
(52636, 5)
(52636, 9) 2
(21865, 4)
(21865, 8) 2
(18675, 4)
(18675, 8) 1
(18124, 4)
(18124, 8) 1
(18098, 4)
(18098, 8) 1
(14878, 4)
(14878, 8) 2
(12659, 4)
(12659, 8) 1
(13967, 4)
(13967, 8) 1
(6659, 4)
(6659, 8) 1
(7712, 5)
(7712, 9) 1
(6500, 5)
(6500, 9) 1
(4958, 4)
(4958, 8) 0
(4715, 4)
(4715, 8) 2
(3343, 4)
(6686, 8) 0
(3161, 5)
(6322, 9) 2
(49, 3)
(98, 7) 2
(1151, 4)
(2302, 8) 1
(1269, 4)
(2538, 8) 1
In [94]:
features = ['sum_score', 'pwm_scan',
            'hyp_cwm_scan', 'cwm_scan',
            'hyp_cwm_jaccard', 'cwm_jaccard',
            'hyp_cwm_masked_convolution'
           ]
ob.keys_for_scoring = features
for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
    query = ob.load_chomped_pattern(pattern_name, rc=False)
    seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=False)
    new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits)
(260431, 5)
(260431, 9) 2
(120889, 5)
(120889, 9) 1
(52636, 5)
(52636, 9) 1
(21865, 4)
(21865, 8) 1
(18675, 4)
(18675, 8) 0
(18124, 4)
(18124, 8) 1
(18098, 4)
(18098, 8) 2
(14878, 5)
(14878, 9) 1
(12659, 5)
(12659, 9) 0
(13967, 5)
(13967, 9) 1
(6659, 4)
(6659, 8) 2
(7712, 5)
(7712, 9) 0
(6500, 5)
(6500, 9) 1
(4958, 4)
(4958, 8) 2
(4715, 4)
(4715, 8) 1
(3343, 4)
(3343, 8) 1
(3161, 5)
(3161, 9) 1
(49, 2)
(49, 6) 0
(1151, 4)
(1151, 8) 1
(1269, 4)
(1269, 8) 1
In [ ]:
 
In [143]:
imp1 = None
imp2 = None
imp3 = None
ratio = None
for pattern_name in ob.denovo_pwms.keys():
    pca = ob.pca[pattern_name]
    #print(pattern_name)
    #print(pca.components_[:2,:])
    if imp1 is None:
        imp1 = pca.components_[0,:]
        imp2 = pca.components_[1,:]
        imp3 = pca.components_[2,:]
        ratio = pca.explained_variance_ratio_[:5]
    else:
        imp1 = np.vstack((imp1, pca.components_[0,:]))
        imp2 = np.vstack((imp2, pca.components_[1,:]))
        imp3 = np.vstack((imp3, pca.components_[2,:]))
        ratio = np.vstack((ratio, pca.explained_variance_ratio_[:5]))

imp1 = pd.DataFrame(imp1)
imp2 = pd.DataFrame(imp2)
imp3 = pd.DataFrame(imp3)
ratio = pd.DataFrame(ratio)
imp1.columns = features
imp2.columns = features
imp3.columns = features
print(imp1.shape, imp2.shape, imp3.shape)
ratio
(20, 7) (20, 7) (20, 7)
Out[143]:
0 1 2 3 4
0 0.680397 0.146271 0.105012 0.031013 0.019098
1 0.695268 0.142840 0.088085 0.034545 0.018544
2 0.697211 0.146268 0.086186 0.035164 0.018235
3 0.673356 0.144904 0.103110 0.041832 0.021371
4 0.685658 0.145955 0.094533 0.031906 0.022454
5 0.663303 0.147897 0.099756 0.045577 0.021188
6 0.711601 0.144336 0.078308 0.032313 0.016996
7 0.673456 0.146031 0.094547 0.042293 0.020838
8 0.656520 0.145385 0.123580 0.031199 0.022544
9 0.601955 0.151064 0.135308 0.055252 0.027652
10 0.667463 0.147296 0.104211 0.039380 0.019349
11 0.659383 0.143655 0.104919 0.046710 0.022555
12 0.700259 0.146478 0.080744 0.034838 0.017609
13 0.684291 0.146765 0.091768 0.038186 0.020123
14 0.698584 0.146546 0.086478 0.035270 0.018308
15 0.670124 0.146916 0.101797 0.040485 0.020208
16 0.693542 0.146399 0.087641 0.034317 0.019797
17 0.568312 0.150927 0.129471 0.071336 0.046031
18 0.574736 0.154696 0.138851 0.069306 0.033694
19 0.615117 0.150938 0.133129 0.045515 0.029377
In [149]:
fig = plt.figure()
ax = sns.heatmap(imp1, 
                 vmin=np.min(imp1.values), 
                 vmax=np.max(imp1.values))
ax.set_title('PC1')
ax.set_ylabel('pattern #')
fig = plt.figure()
ax = sns.heatmap(imp2, 
                 vmin=np.min(imp2.values), 
                 vmax=np.max(imp2.values))
ax.set_title('PC2')
ax.set_ylabel('pattern #')
fig = plt.figure()
ax = sns.heatmap(imp3, 
                 vmin=np.min(imp3.values), 
                 vmax=np.max(imp3.values))
ax.set_title('PC3')
ax.set_ylabel('pattern #')
fig = plt.figure()
ax = sns.heatmap(ratio, 
                 vmin=np.min(ratio.values), 
                 vmax=np.max(ratio.values))
ax.set_title('explained ratio')
ax.set_ylabel('pattern #')
Out[149]:
Text(30.5, 0.5, 'pattern #')
In [142]:
import seaborn as sns
fig = plt.figure()
ax = sns.heatmap(imp1, 
                 vmin=np.min(pd.concat([imp1, imp2, imp3]).values), 
                 vmax=np.max(pd.concat([imp1, imp2, imp3]).values))
fig = plt.figure()
ax = sns.heatmap(imp2, 
                 vmin=np.min(pd.concat([imp1, imp2, imp3]).values), 
                 vmax=np.max(pd.concat([imp1, imp2, imp3]).values))

fig = plt.figure()
ax = sns.heatmap(imp3, 
                 vmin=np.min(pd.concat([imp1, imp2, imp3]).values), 
                 vmax=np.max(pd.concat([imp1, imp2, imp3]).values))
fig = plt.figure()
ax = sns.heatmap(ratio, 
                 vmin=np.min(ratio.values), 
                 vmax=np.max(ratio.values))
In [79]:
ob.keys_for_scoring = ['sum_score'] 
ob.keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']

pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, rc=False)
seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=False)
seqlet_hits.shape
Out[79]:
(522529, 27)
In [ ]:
 
In [33]:
#ob.deeplift_data['shuffled_scores']
# from matlas.sliding_similarities import sliding_sum
# a = sliding_sum(query['ppm'], ob.deeplift_data['shuffled_scores'])
# a = a.flatten()
# b = sliding_sum(query['ppm'], ob.deeplift_data['scores'])
# b = b.flatten()
# a = a/len(query['ppm'])
# b = b/len(query['ppm'])

average_motif_len = 15
a = sliding_sum(np.ones((average_motif_len, 4)), ob.deeplift_data['shuffled_scores'])
a = a.flatten()/average_motif_len

b = sliding_sum(np.ones((average_motif_len, 4)), ob.deeplift_data['scores'])
b = b.flatten()/average_motif_len

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10, 8))
n, bins, patches = plt.hist(a, 200, facecolor='red', histtype='step')
n, bins, patches = plt.hist(b, 200, facecolor='green', histtype='step')
plt.axvline(x=np.percentile(a, 99.95)/len(query['ppm']))
plt.show()
In [42]:
ob.per_base_imp
Out[42]:
0.003751967613522597
In [5]:
pattern_name = "metacluster_1/pattern_7"
outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/instances2019"

query = ob.load_chomped_pattern(pattern_name, rc=False)
display_name = ob.shorten_pattern_name(pattern_name)
outprefix = "{}/{}".format(outdir, display_name)
ob.find_pattern_instances_in_seqlets(pattern_name, query,
                                     fname="{}.seqlet.gmm.pdf".format(outprefix))
(13907, 9)
(13907, 4)
(13907, 7)
INFO:2019-10-22 08:02:32,076:test.log] trained metacluster_1/pattern_7. Used 13907 instances out of (13907, 9)
INFO:2019-10-22 08:02:32,076:test.log] trained metacluster_1/pattern_7. Used 13907 instances out of (13907, 9)
INFO:2019-10-22 08:02:32,076:test.log] trained metacluster_1/pattern_7. Used 13907 instances out of (13907, 9)
In [6]:
df = ob._predict_pattern_instances_in_sequences(pattern_name, query,
                                                cluster_plot=True,
                                               fname="{}.GW.gmm.pdf".format(outprefix)
                                              )
(3134133, 7)
INFO:2019-10-22 08:25:12,271:test.log] predicted, metacluster_1/pattern_7
INFO:2019-10-22 08:25:12,271:test.log] predicted, metacluster_1/pattern_7
INFO:2019-10-22 08:25:12,271:test.log] predicted, metacluster_1/pattern_7
In [9]:
fname = "{}.candidates.bed.gz".format(outprefix)
df.to_csv(fname, header=True, index=True, sep="\t", compression='gzip')
In [10]:
df[:5]
Out[10]:
sum_score seqno start_idx pssm_scan hyp_cwm_scan cwm_scan pssm_jaccard hyp_cwm_jaccard hyp_cwm_masked_convolution PC1 PC2 PC3 PC4 labels probs_0 probs_1 affinity
0 0.506014 0 457 4.630333 0.008418 0.576000 0.186843 -0.018382 -0.041447 0.186457 -1.008803 0.342027 -0.493843 0 1.0 1.081909e-75 -9.21034
1 0.692963 0 459 3.730682 0.039855 0.347557 0.139370 0.049612 0.324870 0.412132 -0.876732 -0.827821 0.223457 0 1.0 2.458299e-71 -9.21034
2 0.828139 0 460 2.220063 -0.238699 0.186014 0.085936 -0.015740 -0.321238 -1.427511 -0.329545 -0.174973 -0.542189 0 1.0 7.645040e-119 -9.21034
3 1.070684 0 461 2.534312 0.012432 0.432785 0.008754 -0.081084 -0.290332 -1.291234 0.012149 0.719207 0.270713 0 1.0 5.294447e-117 -9.21034
4 1.108170 0 462 0.376822 -0.106884 0.025820 0.016013 -0.034818 -0.648415 -2.352710 0.247801 0.060116 -0.541629 0 1.0 5.358193e-149 -9.21034
In [11]:
pos_df = df[df['labels']==ob.cluster_no[pattern_name]]
fname = "{}.candidates1.bed.gz".format(outprefix)
pos_df.to_csv(fname, header=True, index=True, sep="\t", compression='gzip')
pos_df.shape
Out[11]:
(9718, 17)
In [12]:
rev_query = ob.load_chomped_pattern(pattern_name, rc=True, plot=True)
df = ob._predict_pattern_instances_in_sequences(pattern_name, rev_query,
                                                cluster_plot=True,
                                               fname="{}.rev.GW.gmm.pdf".format(outprefix)
                                              )
fname = "{}.rev_candidates.bed.gz".format(outprefix)
df.to_csv(fname, header=True, index=True, sep="\t", compression='gzip')
pos_df = df[df['labels']==ob.cluster_no[pattern_name]]
fname = "{}.rev_candidates1.bed.gz".format(outprefix)
pos_df.to_csv(fname, header=True, index=True, sep="\t", compression='gzip')
ppmchomped ppmchomped pwmchomped log10_pwmchomped pssmchomped cwmchomped hyp_cwmconsensusaGATAAga
(3134133, 7)
INFO:2019-10-22 08:56:51,948:test.log] predicted, metacluster_1/pattern_7
INFO:2019-10-22 08:56:51,948:test.log] predicted, metacluster_1/pattern_7
INFO:2019-10-22 08:56:51,948:test.log] predicted, metacluster_1/pattern_7
INFO:2019-10-22 08:56:51,948:test.log] predicted, metacluster_1/pattern_7
In [13]:
#now combine the forward abd reverse coords and check for redundancy...
fname = "{}.candidates1.bed.gz".format(outprefix)
for_df = pd.read_csv(fname, header=0, index_col=0, sep="\t")
fname = "{}.rev_candidates1.bed.gz".format(outprefix)
rev_df = pd.read_csv(fname, header=0, index_col=0, sep="\t")
In [46]:
#get the sequences common between
common_seqs = np.array(list(set(list(for_df.seqno.values)).intersection(set(list(rev_df.seqno.values)))))
common_seqs = np.sort(common_seqs)
print(common_seqs.shape)
fore_common = np.array([seqno in common_seqs for seqno in for_df.seqno.values])
rev_common = np.array([seqno in common_seqs for seqno in rev_df.seqno.values])
(1014,)
In [21]:
for_df[fore_common][:5]
Out[21]:
sum_score seqno start_idx pssm_scan hyp_cwm_scan cwm_scan pssm_jaccard hyp_cwm_jaccard hyp_cwm_masked_convolution PC1 PC2 PC3 PC4 labels probs_0 probs_1 affinity
7087 0.719578 354 469 10.566150 1.108264 1.461974 0.545713 0.296834 0.980013 6.972319 -1.177662 -0.288694 -0.445434 1 1.754782e-06 0.999998 9.192943
8869 1.916423 454 492 10.946750 1.190199 1.507266 0.659304 0.281202 0.973364 7.552993 1.023540 -0.356206 -0.547166 1 7.413490e-09 1.000000 9.210266
16789 1.362010 904 514 10.263988 1.124887 1.414121 0.612824 0.266297 0.989716 7.006805 0.021724 -0.461967 -0.491541 1 3.295677e-07 1.000000 9.207050
16817 0.596663 907 492 8.788265 0.909073 1.277441 0.552744 0.238311 0.897645 5.879004 -1.290189 -0.611978 -0.669720 1 6.776545e-03 0.993223 4.972840
19983 1.162510 1099 820 10.946750 1.190199 1.507266 0.660087 0.286183 0.969329 7.477639 -0.319253 -0.308069 -0.807862 1 9.786088e-08 1.000000 9.209362
In [23]:
rev_df[rev_common][:5]
Out[23]:
sum_score seqno start_idx pssm_scan hyp_cwm_scan cwm_scan pssm_jaccard hyp_cwm_jaccard hyp_cwm_masked_convolution PC1 PC2 PC3 PC4 labels probs_0 probs_1 affinity
7094 0.794498 354 476 10.853761 1.095423 1.488945 0.546977 0.272352 0.900610 6.848125 -0.998914 -0.000308 -0.508452 1 2.396958e-06 0.999998 9.186651
8862 0.501731 454 439 10.659139 1.203041 1.480295 0.541482 0.269257 0.889027 6.841486 -1.535023 0.095035 -0.529422 1 4.906370e-06 0.999995 9.162437
16773 0.496409 904 299 10.349956 1.032806 1.417799 0.625997 0.255205 0.968811 6.772597 -1.482428 -0.379742 -0.893444 1 6.214006e-06 0.999994 9.150048
16824 0.657944 907 528 10.853761 1.095423 1.488945 0.562632 0.236953 0.934201 6.729680 -1.227934 0.081656 -0.505726 1 5.750581e-06 0.999994 9.154422
19973 0.752838 1099 775 10.893723 1.151066 1.494347 0.729961 0.254498 0.932956 7.351972 -0.965525 -0.274610 -1.299071 1 4.350850e-07 1.000000 9.205999
In [47]:
#for each sequence get the distances from each other
def compute_distances(common_seqs, for_df, rev_df):
    count = 0
    distances = []
    exemplify = False
    for seqno in common_seqs:
        fore = for_df[for_df.seqno==seqno]
        rev = rev_df[rev_df.seqno==seqno]
        res = np.subtract.outer(fore.start_idx.values, rev.start_idx.values).T
        
        if res.shape[0]==res.shape[1]:
            iu1 = np.triu_indices(res.shape[0])
            res = np.abs(res[iu1])
        else:
            print(fore.seqno, res.shape)
            print(res)
            res = np.abs(res.flatten())
            
        if exemplify==False and fore.shape[0]>1 and rev.shape[0]>1:
            print(fore.seqno)
            
            exemplify = True
            #return res
        #res = np.abs(res.flatten())
        count += len(res)
        distances.append(res)
    distances = np.sort(np.concatenate(distances))
    return distances
distances = compute_distances(common_seqs, for_df, rev_df)
62098    3402
62103    3402
Name: seqno, dtype: int64 (1, 2)
[[ 25 146]]
113935    6379
113972    6379
Name: seqno, dtype: int64
113977    6380
Name: seqno, dtype: int64 (2, 1)
[[-29]
 [-34]]
185333    10475
Name: seqno, dtype: int64 (2, 1)
[[-42]
 [-87]]
228602    12683
228607    12683
Name: seqno, dtype: int64 (1, 2)
[[-80 -75]]
249987    13780
249996    13780
Name: seqno, dtype: int64 (1, 2)
[[101 142]]
287311    15523
287320    15523
Name: seqno, dtype: int64 (1, 2)
[[-42 -21]]
307195    16319
307222    16319
Name: seqno, dtype: int64 (1, 2)
[[-150   52]]
315012    16692
Name: seqno, dtype: int64 (2, 1)
[[ -64]
 [-215]]
323483    17175
Name: seqno, dtype: int64 (2, 1)
[[126]
 [ 71]]
333091    17680
Name: seqno, dtype: int64 (2, 1)
[[ 132]
 [-150]]
341205    18078
Name: seqno, dtype: int64 (3, 1)
[[  69]
 [  40]
 [-154]]
361668    19047
361673    19047
Name: seqno, dtype: int64 (1, 2)
[[-77 -55]]
377172    19771
377182    19771
Name: seqno, dtype: int64 (1, 2)
[[138 179]]
377873    19807
Name: seqno, dtype: int64 (2, 1)
[[27]
 [22]]
400683    20994
400691    20994
Name: seqno, dtype: int64 (1, 2)
[[-183 -147]]
425480    22166
425500    22166
Name: seqno, dtype: int64 (1, 2)
[[-123  -19]]
430737    22411
Name: seqno, dtype: int64 (2, 1)
[[307]
 [-44]]
497494    25661
Name: seqno, dtype: int64 (2, 1)
[[-147]
 [-357]]
554973    28523
554981    28523
Name: seqno, dtype: int64 (1, 2)
[[-303  -89]]
555607    28552
Name: seqno, dtype: int64 (2, 1)
[[40]
 [10]]
561087    28821
561107    28821
Name: seqno, dtype: int64 (1, 2)
[[-73 141]]
605665    30987
Name: seqno, dtype: int64 (2, 1)
[[  56]
 [-112]]
612967    31325
612974    31325
Name: seqno, dtype: int64 (1, 2)
[[45 81]]
681257    34910
681262    34910
Name: seqno, dtype: int64 (1, 2)
[[-384 -379]]
692235    35451
692263    35451
Name: seqno, dtype: int64 (1, 2)
[[-162  202]]
708427    36270
708447    36270
Name: seqno, dtype: int64 (1, 2)
[[-53  -8]]
721824    36894
Name: seqno, dtype: int64 (2, 1)
[[405]
 [304]]
753103    38471
Name: seqno, dtype: int64 (2, 1)
[[-41]
 [-63]]
762536    38968
Name: seqno, dtype: int64 (2, 1)
[[  38]
 [-156]]
779477    39869
Name: seqno, dtype: int64 (2, 1)
[[-49]
 [-64]]
889488    45467
Name: seqno, dtype: int64 (2, 1)
[[ 94]
 [-30]]
903372    46146
Name: seqno, dtype: int64 (2, 1)
[[ -49]
 [-106]]
904286    46193
904294    46193
Name: seqno, dtype: int64 (1, 2)
[[-65 -39]]
916606    46823
Name: seqno, dtype: int64 (2, 1)
[[81]
 [67]]
1016228    51920
Name: seqno, dtype: int64 (2, 1)
[[32]
 [27]]
1026728    52480
1026739    52480
Name: seqno, dtype: int64 (1, 2)
[[-64  35]]
1047736    53607
1047775    53607
Name: seqno, dtype: int64 (1, 2)
[[-84  11]]
1050730    53771
1050738    53771
Name: seqno, dtype: int64 (1, 2)
[[37 54]]
1057151    54128
1057168    54128
Name: seqno, dtype: int64 (1, 2)
[[-99 -55]]
1065797    54544
Name: seqno, dtype: int64 (2, 1)
[[ 41]
 [-28]]
1068544    54672
Name: seqno, dtype: int64 (2, 1)
[[ -87]
 [-167]]
1099613    56174
1099620    56174
Name: seqno, dtype: int64 (1, 2)
[[-322 -263]]
1132445    57825
Name: seqno, dtype: int64 (2, 1)
[[87]
 [ 9]]
1161751    59209
Name: seqno, dtype: int64 (2, 1)
[[146]
 [ 90]]
1177002    60022
1177011    60022
Name: seqno, dtype: int64 (1, 2)
[[-94 -30]]
1179874    60172
Name: seqno, dtype: int64 (4, 1)
[[  15]
 [-410]
 [-529]
 [-560]]
1186227    60509
1186232    60509
1186244    60509
Name: seqno, dtype: int64 (1, 3)
[[-209  -52   57]]
1240900    63172
Name: seqno, dtype: int64 (2, 1)
[[57]
 [ 6]]
1246988    63460
Name: seqno, dtype: int64 (2, 1)
[[ 37]
 [-44]]
1249155    63573
Name: seqno, dtype: int64 (2, 1)
[[ 49]
 [-24]]
1252602    63752
Name: seqno, dtype: int64 (2, 1)
[[ -99]
 [-124]]
1252642    63753
Name: seqno, dtype: int64 (2, 1)
[[ -99]
 [-124]]
1286261    65357
Name: seqno, dtype: int64 (2, 1)
[[  -8]
 [-167]]
1301637    66081
Name: seqno, dtype: int64 (2, 1)
[[172]
 [ 96]]
1307448    66321
Name: seqno, dtype: int64 (2, 1)
[[38]
 [ 6]]
1313271    66586
1313283    66586
Name: seqno, dtype: int64 (1, 2)
[[-147  -98]]
1356018    68571
1356028    68571
Name: seqno, dtype: int64 (1, 2)
[[54 97]]
1407191    71173
1407217    71173
Name: seqno, dtype: int64 (1, 2)
[[-54  27]]
1411491    71367
Name: seqno, dtype: int64 (2, 1)
[[-11]
 [-66]]
1442467    72931
Name: seqno, dtype: int64 (2, 1)
[[130]
 [ 46]]
1461171    73869
Name: seqno, dtype: int64 (2, 1)
[[ -50]
 [-458]]
1490641    75269
1490664    75269
Name: seqno, dtype: int64 (1, 2)
[[-60  29]]
1518843    76750
1518864    76750
Name: seqno, dtype: int64 (1, 2)
[[-65 -23]]
1523084    77034
Name: seqno, dtype: int64 (2, 1)
[[ -89]
 [-180]]
1543849    78222
1543864    78222
Name: seqno, dtype: int64 (1, 2)
[[130 175]]
1550544    78604
Name: seqno, dtype: int64 (2, 1)
[[83]
 [43]]
1565522    79473
1565545    79473
Name: seqno, dtype: int64 (1, 2)
[[-50  48]]
1587057    80689
1587067    80689
Name: seqno, dtype: int64 (1, 2)
[[-99 -69]]
1594691    81146
Name: seqno, dtype: int64 (2, 1)
[[93]
 [58]]
1605243    81714
1605249    81714
Name: seqno, dtype: int64 (1, 2)
[[-195 -157]]
1622263    82566
Name: seqno, dtype: int64 (2, 1)
[[-58]
 [-68]]
1645640    83996
1645659    83996
Name: seqno, dtype: int64 (1, 2)
[[-167   47]]
1665830    85158
1665836    85158
Name: seqno, dtype: int64 (1, 2)
[[-292 -211]]
1667494    85241
1667510    85241
Name: seqno, dtype: int64 (1, 2)
[[50 90]]
1673893    85628
Name: seqno, dtype: int64 (2, 1)
[[ -46]
 [-123]]
1715803    87995
Name: seqno, dtype: int64 (2, 1)
[[-23]
 [-44]]
1752529    89900
Name: seqno, dtype: int64 (3, 1)
[[   8]
 [ -88]
 [-184]]
1755722    90075
1755735    90075
Name: seqno, dtype: int64 (1, 2)
[[-304    6]]
1757166    90148
Name: seqno, dtype: int64 (2, 1)
[[ -90]
 [-236]]
1781548    91359
1781570    91359
Name: seqno, dtype: int64 (1, 2)
[[43 84]]
1781597    91360
1781618    91360
Name: seqno, dtype: int64 (1, 2)
[[43 84]]
1792157    91877
Name: seqno, dtype: int64 (2, 1)
[[ -8]
 [-79]]
1837750    94411
Name: seqno, dtype: int64 (2, 1)
[[ -96]
 [-223]]
1927929    99603
1927945    99603
Name: seqno, dtype: int64 (1, 2)
[[-120   23]]
1957792    101197
1957800    101197
Name: seqno, dtype: int64 (1, 2)
[[42 70]]
2001786    103373
Name: seqno, dtype: int64 (2, 1)
[[ -64]
 [-114]]
2050032    105824
Name: seqno, dtype: int64 (2, 1)
[[118]
 [ 74]]
2071871    106884
2071878    106884
Name: seqno, dtype: int64 (1, 2)
[[ 68 181]]
2089865    107729
2089873    107729
Name: seqno, dtype: int64 (1, 2)
[[-682  -40]]
2095652    108004
2095693    108004
Name: seqno, dtype: int64 (1, 2)
[[-71 -10]]
2102907    108339
2102929    108339
Name: seqno, dtype: int64 (1, 2)
[[-53  57]]
2123196    109303
Name: seqno, dtype: int64 (3, 1)
[[113]
 [-63]
 [-91]]
2196818    113097
2196837    113097
Name: seqno, dtype: int64 (1, 2)
[[-145  -88]]
2200233    113287
Name: seqno, dtype: int64 (2, 1)
[[ 39]
 [-55]]
2216569    114107
2216598    114107
Name: seqno, dtype: int64 (1, 2)
[[-62  11]]
2222293    114391
2222302    114391
Name: seqno, dtype: int64 (1, 2)
[[-576 -234]]
2229746    114804
2229751    114804
Name: seqno, dtype: int64 (1, 2)
[[150 155]]
2241759    115381
2241770    115381
Name: seqno, dtype: int64 (1, 2)
[[-84  10]]
2248603    115680
2248625    115680
Name: seqno, dtype: int64 (1, 2)
[[-75  27]]
2252302    115852
Name: seqno, dtype: int64 (2, 1)
[[-33]
 [-81]]
2253138    115893
Name: seqno, dtype: int64 (2, 1)
[[187]
 [ 63]]
2298838    117886
2298843    117886
Name: seqno, dtype: int64 (1, 2)
[[45 50]]
2334153    119574
2334180    119574
2334185    119574
Name: seqno, dtype: int64 (1, 3)
[[-48 154 159]]
2340820    119904
2340841    119904
Name: seqno, dtype: int64 (1, 2)
[[365 414]]
2356256    120658
Name: seqno, dtype: int64 (2, 1)
[[108]
 [ 55]]
2356533    120668
Name: seqno, dtype: int64 (2, 1)
[[418]
 [ 52]]
2358389    120782
Name: seqno, dtype: int64 (2, 1)
[[-66]
 [-71]]
2358553    120790
Name: seqno, dtype: int64 (2, 1)
[[-26]
 [-57]]
2358581    120791
Name: seqno, dtype: int64 (2, 1)
[[-26]
 [-57]]
2436240    124649
2436246    124649
Name: seqno, dtype: int64 (1, 2)
[[-59 -53]]
2440924    124889
Name: seqno, dtype: int64 (2, 1)
[[ -60]
 [-107]]
2453097    125503
Name: seqno, dtype: int64 (2, 1)
[[ -69]
 [-126]]
2460474    125848
2460509    125848
Name: seqno, dtype: int64 (1, 2)
[[-253   10]]
2507712    128027
2507728    128027
Name: seqno, dtype: int64 (1, 2)
[[31 96]]
2513262    128244
Name: seqno, dtype: int64 (3, 1)
[[215]
 [179]
 [-37]]
2580277    131059
Name: seqno, dtype: int64 (2, 1)
[[ 172]
 [-371]]
2580807    131091
2580832    131091
Name: seqno, dtype: int64 (1, 2)
[[-64   6]]
2647795    134400
Name: seqno, dtype: int64 (2, 1)
[[ -61]
 [-155]]
2677003    135803
2677020    135803
Name: seqno, dtype: int64 (1, 2)
[[-55  27]]
2690795    136448
Name: seqno, dtype: int64 (2, 1)
[[ 45]
 [-21]]
2690810    136449
2690832    136449
Name: seqno, dtype: int64 (1, 2)
[[-21 289]]
2746886    139084
Name: seqno, dtype: int64 (2, 1)
[[ 54]
 [-56]]
2762553    139765
Name: seqno, dtype: int64 (2, 1)
[[-41]
 [-49]]
2763806    139833
2763826    139833
Name: seqno, dtype: int64 (1, 2)
[[-246   38]]
2775086    140358
2775091    140358
Name: seqno, dtype: int64 (1, 2)
[[19 69]]
2821297    142562
2821324    142562
2821334    142562
Name: seqno, dtype: int64 (1, 3)
[[-17  41  88]]
2827185    142821
Name: seqno, dtype: int64 (2, 1)
[[271]
 [ 10]]
2827333    142829
2827342    142829
2827362    142829
Name: seqno, dtype: int64 (1, 3)
[[ 38  63 105]]
2898156    146261
2898162    146261
Name: seqno, dtype: int64 (1, 2)
[[-89 -27]]
2898537    146284
Name: seqno, dtype: int64 (2, 1)
[[ -57]
 [-108]]
2914566    147092
Name: seqno, dtype: int64 (2, 1)
[[-235]
 [-353]]
2946648    148706
Name: seqno, dtype: int64 (2, 1)
[[-137]
 [-198]]
3008594    151729
3008624    151729
Name: seqno, dtype: int64 (1, 2)
[[-75  33]]
3008630    151730
3008669    151730
Name: seqno, dtype: int64 (1, 2)
[[-75  33]]
3009052    151744
Name: seqno, dtype: int64 (2, 1)
[[-153]
 [-236]]
3015209    152011
3015228    152011
Name: seqno, dtype: int64 (1, 2)
[[-54  13]]
3075762    154945
3075770    154945
Name: seqno, dtype: int64 (1, 2)
[[-86 -62]]
3112538    156921
3112545    156921
Name: seqno, dtype: int64 (1, 2)
[[37 74]]
3114465    157004
Name: seqno, dtype: int64 (2, 1)
[[96]
 [84]]
3123627    157454
3123638    157454
Name: seqno, dtype: int64 (1, 2)
[[50 73]]
In [48]:
import matplotlib.pyplot as plt
#plt.scatter(np.array(range(distances.shape[0])), distances, s=1);
plt.hist(distances, bins=100)
#curren_ax.hist(data, 50, facecolor='blue', alpha=0.5)
Out[48]:
(array([ 52.,  32.,  42.,  86.,  79., 121.,  93.,  85.,  60.,  65.,  35.,
         56.,  27.,  24.,  21.,   9.,  13.,  20.,  10.,   9.,  17.,  16.,
          6.,  11.,   7.,  10.,   6.,   5.,   4.,  11.,  10.,   2.,   5.,
          5.,   5.,   6.,   5.,   2.,   7.,   4.,   2.,   8.,   1.,   6.,
          1.,   4.,   3.,   1.,   2.,   1.,   2.,   2.,   1.,   3.,   5.,
          1.,   5.,   5.,   2.,   5.,   1.,   2.,   0.,   1.,   1.,   4.,
          1.,   1.,   1.,   2.,   0.,   0.,   1.,   0.,   1.,   3.,   0.,
          0.,   0.,   2.,   0.,   0.,   1.,   1.,   1.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   2.,   0.,
          1.]),
 array([  6.  ,  12.93,  19.86,  26.79,  33.72,  40.65,  47.58,  54.51,
         61.44,  68.37,  75.3 ,  82.23,  89.16,  96.09, 103.02, 109.95,
        116.88, 123.81, 130.74, 137.67, 144.6 , 151.53, 158.46, 165.39,
        172.32, 179.25, 186.18, 193.11, 200.04, 206.97, 213.9 , 220.83,
        227.76, 234.69, 241.62, 248.55, 255.48, 262.41, 269.34, 276.27,
        283.2 , 290.13, 297.06, 303.99, 310.92, 317.85, 324.78, 331.71,
        338.64, 345.57, 352.5 , 359.43, 366.36, 373.29, 380.22, 387.15,
        394.08, 401.01, 407.94, 414.87, 421.8 , 428.73, 435.66, 442.59,
        449.52, 456.45, 463.38, 470.31, 477.24, 484.17, 491.1 , 498.03,
        504.96, 511.89, 518.82, 525.75, 532.68, 539.61, 546.54, 553.47,
        560.4 , 567.33, 574.26, 581.19, 588.12, 595.05, 601.98, 608.91,
        615.84, 622.77, 629.7 , 636.63, 643.56, 650.49, 657.42, 664.35,
        671.28, 678.21, 685.14, 692.07, 699.  ]),
 <a list of 100 Patch objects>)
In [31]:
fore
Out[31]:
sum_score seqno start_idx pssm_scan hyp_cwm_scan cwm_scan pssm_jaccard hyp_cwm_jaccard hyp_cwm_masked_convolution PC1 PC2 PC3 PC4 labels probs_0 probs_1 affinity
113935 0.724063 6379 560 10.71255 1.190025 1.48021 0.521462 0.249506 0.930102 6.754057 -1.156876 0.139244 -0.263930 1 0.000005 0.999995 9.164708
113972 0.539283 6379 656 10.71255 1.190025 1.48021 0.664372 0.274725 0.964736 7.275955 -1.417457 -0.283314 -1.016604 1 0.000001 0.999999 9.198278
In [32]:
rev
Out[32]:
sum_score seqno start_idx pssm_scan hyp_cwm_scan cwm_scan pssm_jaccard hyp_cwm_jaccard hyp_cwm_masked_convolution PC1 PC2 PC3 PC4 labels probs_0 probs_1 affinity
113948 1.583670 6379 589 11.000161 1.177183 1.507181 0.752419 0.282170 0.965384 7.752105 0.497587 -0.502023 -1.130626 1 1.226194e-08 1.000000 9.210218
113969 0.563157 6379 638 8.484707 0.967039 1.257896 0.513487 0.225346 0.929413 5.725680 -1.391008 -0.562599 -0.383389 1 3.737137e-02 0.962629 3.246094
In [38]:
from modisco.visualization import viz_sequence
#[(560, 560+8), (656, 656+8), (589,589+8), (638, 638+8)]
highlights = [(60, 60+8), (156, 156+8), (89,89+8), (138, 138+8)]
viz_sequence.plot_weights(ob.deeplift_data["scores"][6379, 500:700,:],
                          highlight={"black": highlights},
                          subticks_frequency=10, figsize=(20,1)
                         )
In [36]:
len(query['ppm'])
Out[36]:
8
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [2]:
from matlas.matches import DenovoModisco

task_idx = 273
modisco_dir = "{0}/task_{1}-naivegw".format(modisco_root, task_idx)
summit_file = "{0}/{1}/fineFactorized/task_{2}-naivegw/predictions/task_{2}-{3}Summit.calibrated.{1}.tab.gz".format(project_root, data_name, task_idx, "Naive")
perf_file = "{0}/task_{1}-naivegw/NaiveauPRC.txt".format(model_root, task_idx)
deeplift_hdf = "{0}/{1}/Naive_deeplift2019/task_{2}-naivegw/summit.h5".format(srv_root, data_name, task_idx)
ob = DenovoModisco(modisco_dir)
pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, plot=True)
ppmchomped ppmchomped pwmchomped log10_pwmchomped pssmchomped cwmchomped hyp_cwmconsensustcTTATCt
In [3]:
ob.initialize_with_importance_scores(summit_file, perf_file, deeplift_hdf, query.keys()) #loads all the important scores
deeplift_data = ob.deeplift_data
In [4]:
from matlas.utils import get_calibrated_peaks

calibrated_data = get_calibrated_peaks(summit_file, perf_file)
selected_summits = calibrated_data['selected_summits']
indices = calibrated_data['indices']

#from basepair.plot.profiles import extract_signal
from matlas.tracks import extract_signal
pattern_name = 'metacluster_1/pattern_7'

seqlet_scores = ob.contrib_per_pattern[pattern_name]
seqlet_onehot = ob.seqs_per_pattern[pattern_name]

# seqlets_ = ob.seqlets_per_pattern[pattern_name]
# shuffled_onehot = extract_signal(ob.deeplift_data['shuffled_onehot'][indices], seqlets_)
# shuffled_scores = extract_signal(ob.deeplift_data['shuffled_scores'][indices], seqlets_)

# shuffled_onehot.shape, shuffled_scores.shape
In [5]:
from matlas.matches import collect_scoring_measures

method_to_seqlet_scores = collect_scoring_measures(query, seqlet_scores, seqlet_onehot)
print(query.keys(), np.sort(np.array(list(method_to_seqlet_scores.keys()))))


scores = deeplift_data['scores']
one_hot = deeplift_data['one_hot']

method_to_scores = collect_scoring_measures(query, scores, one_hot)
dict_keys(['ppm', 'pwm', 'log10pwm', 'pssm', 'cwm', 'hyp_cwm', 'consensus']) ['cwm_convolution' 'cwm_jaccard' 'cwm_masked_convolution' 'cwm_scan'
 'hyp_cwm_convolution' 'hyp_cwm_jaccard' 'hyp_cwm_masked_convolution'
 'hyp_cwm_scan' 'log10pwm_convolution' 'log10pwm_jaccard'
 'log10pwm_masked_convolution' 'log10pwm_scan' 'ppm_convolution'
 'ppm_jaccard' 'ppm_masked_convolution' 'ppm_scan' 'pssm_convolution'
 'pssm_jaccard' 'pssm_masked_convolution' 'pssm_scan' 'pwm_convolution'
 'pwm_jaccard' 'pwm_masked_convolution' 'pwm_scan' 'sum_score']
In [6]:
from matlas.matches import get_any_high_scoring_instances
per_base_imp=0.0625

keys_for_scoring = ['sum_score'] 
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
        
seqlet_hits = get_any_high_scoring_instances(method_to_seqlet_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)

_hits = get_any_high_scoring_instances(method_to_scores,
                                       min_tot_imp=per_base_imp*len(query['ppm']),
                                       keys=keys_for_scoring)
In [11]:
keys_for_scoring=['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in ['pssm', 'hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_jaccard".format(key) for key in ['pssm', 'hyp_cwm']]
#keys_for_scoring += ["{}_convolution".format(key) for key in ['hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm']]
In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def transform_data(_hits, scaler=None, pca=None):
    if scaler is None:
        # should be used only during training
        scaler = StandardScaler()
        scaler.fit(_hits)
        
    # Apply transform to both the training set and testing set
    _hits = scaler.transform(_hits)

    if pca is None:
        # Make an instance of the Model
        pca = PCA(.98)
        #fitting PCA on the training set only.
        pca.fit(_hits)
        
    #Apply the mapping (transform) to both the training set and the test set.
    _hits = pca.transform(_hits)
    
    return scaler, pca, _hits

scaler, pca, train_hits = transform_data(seqlet_hits[keys_for_scoring])
pca.explained_variance_ratio_
Out[7]:
array([0.73421992, 0.09118347, 0.07766985, 0.02654744, 0.022959  ,
       0.01230255, 0.00980085, 0.00861208])
In [8]:
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
In [9]:
def load_true_positives(tp_file):
    coordinates = []
    weak_coordinates = []
    with open(tp_file, "r") as fp:
        for line in fp:
            line = line.strip().split(",")
            seqno = line[0]
            pos = int(line[1])
            if "-" in seqno:
                seqno = seqno.split("-")
                for i in range(int(seqno[0]), int(seqno[1])+1):
                    coordinates.append((i, pos))
            else:
                if len(line)>2:
                    #if 'weak' in line[2]:
                    weak_coordinates.append((int(seqno), pos))
                    #else:
                        #coordinates.append((int(seqno), pos))
                else:
                    coordinates.append((int(seqno), pos))
    
    
    return {'strong':coordinates, 'weak':weak_coordinates}

def get_tp_indices(seqlet_hits, coordinates, verbose=True):
    keep = OrderedDict()
    for key in coordinates.keys():
        keep[key] = []
        for cord in coordinates[key]:
            seqno, pos = cord
            df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
            if df.shape[0]>0:
                keep[key].append(df.index[0])
            else:
                if verbose: print(cord)

    return keep

tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp_new.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, coordinates, False)
len(true_indices['strong']), len(true_indices['weak']), seqlet_hits.shape#, sum(labels==1)
Out[9]:
(910, 11, (13587, 27))
In [10]:
import matplotlib.pyplot as plt
def plot_pca(df, true_ids=None):
    print(df.shape)
    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1) 
    ax.set_xlabel('PC1', fontsize = 15)
    ax.set_ylabel('PC2', fontsize = 15)
    ax.set_title('2 component PCA', fontsize = 20)
    if true_ids is not None:
        strong_ids = true_ids['strong']
        strong_df = df.loc[np.array(strong_ids),:]
        ax.scatter(strong_df['PC1'], strong_df['PC2'], s = 1, c='tomato')
        
        weak_ids = true_ids['weak']
        weak_df = df.loc[np.array(weak_ids),:]
        ax.scatter(weak_df['PC1'], weak_df['PC2'], s = 5, c='darkred')
        
        df = df.loc[np.array(list(set(list(df.index)).difference(
            set(strong_ids).union(set(weak_ids))
        ))),:]
    ax.scatter(df['PC1'], df['PC2'], s = 1, c='slateblue')
    #ax.axvline(9)
    #ax.axhline(-1)
    ax.grid()
    ax.legend(['strong match', 'weak match', 'no match'], loc="upper right")
    return None
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, coordinates, False)
plot_pca(pc_df, true_indices)
(13587, 8)
In [17]:
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
#import sklearn.mixture.GaussianMixture
gmm = GaussianMixture(n_components=2, covariance_type='full', init_params='kmeans').fit(pc_df[['PC1', 'PC2']])
labels = gmm.predict(pc_df[['PC1', 'PC2']])
print(np.sum(labels==1), np.sum(labels==0))
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=labels, s=1, cmap='seismic');
probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
920 12667
In [12]:
pcv1 = pca.explained_variance_ratio_[0]
pcv2 = pca.explained_variance_ratio_[2]
pc_maxs = pc_df.max()
affinity = (pc_df['PC1']*pcv1/(pcv1+pcv2)+pc_df['PC2']*pcv2/(pcv1+pcv2))/(pc_maxs['PC1']*pcv1/(pcv1+pcv2)+pc_maxs['PC2']*pcv2/(pcv1+pcv2))
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=affinity, s=1, cmap='seismic');
In [13]:
labels = np.array([0]*pc_df.shape[0])
labels[np.array(true_indices['strong'])]=2
labels[np.array(true_indices['weak'])]=1
pc_df['label'] = labels
pc_df['index'] = pc_df.index
pc_df['seqno'] = seqlet_hits.seqno
pc_df['pos'] = seqlet_hits.start_idx
pc_df['affinity'] = affinity
pc_df[:5]
Out[13]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 label index seqno pos affinity
0 -1.757138 -1.105117 -1.560638 0.357107 0.277466 0.376950 -0.193923 0.091057 0 0 0 16 -0.089076
1 2.880820 -0.113733 -0.338334 0.840248 0.175452 0.578696 0.093965 -0.772116 0 1 0 17 0.136358
2 0.438450 2.069480 -0.599759 0.185653 -0.684141 0.460374 -0.527480 -0.043134 0 2 0 18 0.031246
3 -3.361249 4.008285 0.283835 -0.326203 -0.736401 0.243396 -0.191256 0.144796 0 3 0 19 -0.139611
4 16.268981 3.032388 0.694093 1.201489 2.725356 0.261409 -0.523800 0.170852 2 4 0 20 0.788538
In [14]:
import plotly.express as px
fig = px.scatter(pc_df, x="PC1", y="PC2", hover_data=['index', 'seqno', 'pos'], color='affinity',
                 color_continuous_scale=px.colors.sequential.Magma
                )
fig
In [18]:
#seqlets pca, seqlet gmm 
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])

_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
9116 3041038
In [11]:
dlhits[:5]
Out[11]:
0 1 2 3 4 5 6 7 8 9
0 chr1 3400299 3400307 M1_P7/tcTTATCt 0.999995 . 0.999995 -1 0.999995 -1
1 chr1 3451802 3451810 M1_P7/tcTTATCt 0.994065 . 0.994065 -1 0.994065 -1
2 chr1 6223145 6223153 M1_P7/tcTTATCt 0.969597 . 0.969597 -1 0.969597 -1
3 chr1 8134664 8134672 M1_P7/tcTTATCt 0.999999 . 0.999999 -1 0.999999 -1
4 chr1 8134664 8134672 M1_P7/tcTTATCt 0.999999 . 0.999999 -1 0.999999 -1
In [ ]:
 
In [19]:
from matlas.verification import compute_chip_matrix

compute_chip_matrix("MEL", "GATA", out_prefix, execute=True)
computeMatrix reference-point --referencePoint center --skipZeros -S /mnt/lab_data2/msharmin/mouse_chipseq_bigwigs/ENCFF686HPH.bigWig -R /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m17_gmm.bed -a 500 -b 500 -bs 10 -o /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m17_gmm.matrix.gz --outFileSortedRegions /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m17_gmm.matrix.bed 
In [20]:
from matlas.verification import plot_chip_heatmap
plot_chip_heatmap("MEL", "GATA", out_prefix, execute=True)
plotHeatmap -m /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m17_gmm.matrix.gz -out /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/MEL-GATA-heatmap.png  --colorMap Blues --xAxisLabel Distance --missingDataColor 1 --zMin 0 --heatmapHeight 28 --heatmapWidth 8 --legendLocation none
In [ ]:
 
In [8]:
from matlas.matches import collect_scoring_measures
seqlet_scores = ob.contrib_per_pattern[pattern_name]
seqlet_onehot = ob.seqs_per_pattern[pattern_name]
method_to_seqlet_scores = collect_scoring_measures(query, seqlet_scores, seqlet_onehot)
query.keys(), np.sort(np.array(list(method_to_seqlet_scores.keys())))
Out[8]:
(dict_keys(['ppm', 'pwm', 'log10pwm', 'pssm', 'cwm', 'hyp_cwm', 'consensus']),
 array(['cwm_convolution', 'cwm_jaccard', 'cwm_masked_convolution',
        'cwm_scan', 'hyp_cwm_convolution', 'hyp_cwm_jaccard',
        'hyp_cwm_masked_convolution', 'hyp_cwm_scan',
        'log10pwm_convolution', 'log10pwm_jaccard',
        'log10pwm_masked_convolution', 'log10pwm_scan', 'ppm_convolution',
        'ppm_jaccard', 'ppm_masked_convolution', 'ppm_scan',
        'pssm_convolution', 'pssm_jaccard', 'pssm_masked_convolution',
        'pssm_scan', 'pwm_convolution', 'pwm_jaccard',
        'pwm_masked_convolution', 'pwm_scan', 'sum_score'], dtype='<U27'))
In [24]:
method_to_shuffled_scores = collect_scoring_measures(query, shuffled_scores, shuffled_onehot)
method_to_seq_null_scores = collect_scoring_measures(query, seq_null_scores, seq_null_onehot)
method_to_weight_null_scores = collect_scoring_measures(query, weight_null_scores, weight_null_onehot)
method_to_weightedseq_null_scores = collect_scoring_measures(query, weightedseq_null_scores, weightedseq_null_onehot)
In [10]:
from matlas.matches import get_any_high_scoring_instances
per_base_imp=0.0625

keys_for_scoring = ['sum_score'] 
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
        
seqlet_hits = get_any_high_scoring_instances(method_to_seqlet_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
In [11]:
def get_background_instances(fg, method_to_null_scores, keys=None):
    # given the fg coords, get the df of bg
    # appy scaling and pca as in fg, then apply log-likelihood
    if keys is None:
        keys = list(method_to_scores.keys())
    sum_scores = method_to_null_scores['sum_score']
    matches = [(sum_scores[seqno, pos], seqno, pos) for seqno, pos in zip(fg['seqno'].values, fg['start_idx'].values)]
    match_df = pd.DataFrame(np.array(matches))
    match_df.columns = ['sum_score', 'seqno', 'start_idx']
    match_df.seqno = match_df.seqno.values.astype(int)
    match_df.start_idx = match_df.start_idx.values.astype(int)
    for key in keys:
        if(key=='sum_score'):
            continue
        match_df[key] = [method_to_null_scores[key][idx1, idx2] for 
                         (idx1, idx2) in zip(fg['seqno'].values, fg['start_idx'].values)]
    
    return match_df

shuffled_paired_hits = get_background_instances(seqlet_hits, method_to_shuffled_scores, keys=keys_for_scoring)
seq_null_paired_hits = get_background_instances(seqlet_hits, method_to_seq_null_scores, keys=keys_for_scoring)
weight_null_paired_hits = get_background_instances(seqlet_hits, method_to_weight_null_scores, keys=keys_for_scoring)
weightedseq_null_paired_hits = get_background_instances(seqlet_hits, method_to_weightedseq_null_scores, keys=keys_for_scoring)
seqlet_hits.shape, shuffled_paired_hits.shape, seq_null_paired_hits.shape, weight_null_paired_hits.shape, weightedseq_null_paired_hits.shape
Out[11]:
((13587, 27), (13587, 27), (13587, 27), (13587, 27), (13587, 27))
In [36]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def transform_data(_hits, scaler=None, pca=None):
    if scaler is None:
        # should be used only during training
        scaler = StandardScaler()
        scaler.fit(_hits)
        
    # Apply transform to both the training set and testing set
    _hits = scaler.transform(_hits)

    if pca is None:
        # Make an instance of the Model
        pca = PCA(.98)
        #fitting PCA on the training set only.
        pca.fit(_hits)
        
    #Apply the mapping (transform) to both the training set and the test set.
    _hits = pca.transform(_hits)
    
    return scaler, pca, _hits

scaler, pca, train_hits = transform_data(seqlet_hits[keys_for_scoring])
pca.explained_variance_ratio_
Out[36]:
array([0.73421992, 0.09118347, 0.07766985, 0.02654744, 0.022959  ,
       0.01230255, 0.00980085, 0.00861208])
In [25]:
shuffled_seqlet_hits = get_any_high_scoring_instances(method_to_shuffled_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
seq_null_seqlet_hits = get_any_high_scoring_instances(method_to_seq_null_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
weight_null_seqlet_hits = get_any_high_scoring_instances(method_to_weight_null_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
weightedseq_null_seqlet_hits = get_any_high_scoring_instances(method_to_weightedseq_null_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
In [26]:
seqlet_hits['marker'] = 'foreground'
shuffled_seqlet_hits['marker'] = 'suffled'
seq_null_seqlet_hits['marker'] = 'seq_null'
weight_null_seqlet_hits['marker'] = 'weight_null'
weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
test_hits = pd.concat([seqlet_hits,
                       shuffled_seqlet_hits,
                       seq_null_seqlet_hits,
                       weight_null_seqlet_hits,
                       weightedseq_null_seqlet_hits
                      ])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/ipykernel_launcher.py:10: FutureWarning:

Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.


In [15]:
#read the real truth
def load_true_positives(tp_file):
    coordinates = []
    with open(tp_file, "r") as fp:
        for line in fp:
            line = line.strip().split(",")
            seqno = line[0]
            pos = int(line[1])
            if "-" in seqno:
                seqno = seqno.split("-")
                for i in range(int(seqno[0]), int(seqno[1])+1):
                    coordinates.append((i, pos))
            else:
                coordinates.append((int(seqno), pos))
    return coordinates
def get_tp_indices(seqlet_hits, verbose=True):
    keep = []
    for cord in coordinates:
        seqno, pos = cord
        df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
        if df.shape[0]>0:
            keep.append(df.index[0])
        else:
            if verbose: print(cord)

    return keep

tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, False)
In [85]:
import matplotlib.pyplot as plt
def plot_pca(df, true_ids=None):
    print(df.shape)
    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1) 
    ax.set_xlabel('PC1', fontsize = 15)
    ax.set_ylabel('PC2', fontsize = 15)
    ax.set_title('2 component PCA', fontsize = 20)
    if true_ids is not None:
        true_df = df.loc[np.array(true_ids),:]
        df = df.loc[np.array(list(set(list(df.index)).difference(set(true_ids)))),:]
        ax.scatter(true_df['PC1'], true_df['PC2'], s = 20, c='r', marker='+')
    ax.scatter(df['PC1'], df['PC2'], s = 1, c='b')
    #ax.axvline(9)
    #ax.axhline(-1)
    ax.grid()
    return None
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
plot_pca(pc_df, true_indices)
(13587, 8)
In [102]:
from sklearn.mixture import GaussianMixture
#import sklearn.mixture.GaussianMixture
gmm = GaussianMixture(n_components=2).fit(pc_df[['PC1', 'PC2']])
labels = gmm.predict(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=labels, s=1, cmap='seismic');
In [101]:
probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
In [121]:
conditions = np.copy(labels)
conditions[np.logical_and(pc_df['PC1'].values<10, pc_df['PC2'].values<-0.5)] = 0
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=conditions, s=1, cmap='seismic');
In [4]:
# selected = np.array(list(set(list(np.argwhere(conditions==0).T[0])).difference(set(list(np.argwhere(labels==0).T[0])))))
# pc_df.loc[selected,:]
# seqlet_hits.loc[selected,:]
# probs[:,1]

from matlas.render_report import jupyter_nbconvert
import os
def render_test_plots(nbname="global_report"):
    rendered_ipynb = "/mnt/lab_data/kundaje/msharmin/mouse_hem/code/mouse_hem/{}.ipynb".format(nbname)
    jupyter_nbconvert(rendered_ipynb)

    os.system("cp /mnt/lab_data/kundaje/msharmin/mouse_hem/code/mouse_hem/{}.html /srv/www/kundaje/msharmin/report/global_htmls/".format(nbname))
    os.system("chmod +755 /srv/www/kundaje/msharmin/mouse_hem/report/global_htmls/{}.html".format(nbname))

    return None

render_test_plots('testSpi1Scan')
In [ ]:
 
In [27]:
_, _, test_hits = transform_data(shuffled_seqlet_hits[keys_for_scoring], scaler, pca)
shuffled_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
#plot_pca(shuffled_pc_df, true_ids=None)

probs = gmm.predict_proba(shuffled_pc_df[['PC1', 'PC2']])
plt.scatter(shuffled_pc_df['PC1'], shuffled_pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
In [67]:
marker = np.array(['F']*llr_df.shape[0])
marker[np.array(true_indices)] = 'TP'
llr_df['marker'] = marker
In [ ]:
 
In [47]:
#dimension reduction
len(keys_for_scoring)
keys_for_scoring = ['sum_score'] 
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm', 'log10pwm', 'pwm']]
In [48]:
import umap
trans = umap.UMAP(n_neighbors=5, random_state=42).fit(seqlet_hits[keys_for_scoring])
print(trans.embedding_.shape)
plt.scatter(trans.embedding_[:, 0], trans.embedding_[:, 1], s=1, cmap='Spectral')
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/umap/spectral.py:229: UserWarning:

Embedding a total of 3 separate connected components using meta-embedding (experimental)

(13587, 2)
Out[48]:
<matplotlib.collections.PathCollection at 0x7f5f8aadbba8>
In [54]:
test_embedding = trans.transform(seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
(13587, 2)
Out[54]:
<matplotlib.collections.PathCollection at 0x7f5f8ab2ed30>
In [49]:
test_embedding = trans.transform(shuffled_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
(2846, 2)
Out[49]:
<matplotlib.collections.PathCollection at 0x7f5f89ce7358>
In [50]:
test_embedding = trans.transform(seq_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
(13587, 2)
Out[50]:
<matplotlib.collections.PathCollection at 0x7f5f8a2b3048>
In [51]:
test_embedding = trans.transform(weight_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
(12928, 2)
Out[51]:
<matplotlib.collections.PathCollection at 0x7f5f8b1cc240>
In [53]:
test_embedding = trans.transform(weightedseq_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
(13422, 2)
Out[53]:
<matplotlib.collections.PathCollection at 0x7f5f8a8647b8>
In [55]:
type(test_embedding)
Out[55]:
numpy.ndarray
In [56]:
umap_seqlet_hits = pd.DataFrame(trans.transform(seqlet_hits[keys_for_scoring]))
umap_shuffled_seqlet_hits = pd.DataFrame(trans.transform(shuffled_seqlet_hits[keys_for_scoring]))
umap_seq_null_seqlet_hits = pd.DataFrame(trans.transform(seq_null_seqlet_hits[keys_for_scoring]))
umap_weight_null_seqlet_hits = pd.DataFrame(trans.transform(weight_null_seqlet_hits[keys_for_scoring]))
umap_weightedseq_null_seqlet_hits = pd.DataFrame(trans.transform(weightedseq_null_seqlet_hits[keys_for_scoring]))
umap_seqlet_hits['marker'] = 'foreground'
umap_shuffled_seqlet_hits['marker'] = 'suffled'
umap_seq_null_seqlet_hits['marker'] = 'seq_null'
umap_weight_null_seqlet_hits['marker'] = 'weight_null'
umap_weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
umap_test_hits = pd.concat([umap_seqlet_hits,
                       umap_shuffled_seqlet_hits,
                       umap_seq_null_seqlet_hits,
                       umap_weight_null_seqlet_hits,
                       umap_weightedseq_null_seqlet_hits
                      ])
In [58]:
sns.pairplot(umap_test_hits, 
             diag_kind="kde", kind='scatter', hue="marker")
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[58]:
<seaborn.axisgrid.PairGrid at 0x7f5f8a689a58>
In [ ]:
 

PPM in forground vs. various null scores

In [60]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})
In [66]:
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

In [67]:
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})
In [68]:
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})
In [69]:
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})

PWM in forground vs. various null scores

In [59]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})
In [70]:
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})
In [71]:
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})
In [72]:
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})
In [75]:
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})

log10pwm in forground vs. various null scores

In [62]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})
In [76]:
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})
In [77]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})
In [78]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})
In [79]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})

PSSM in forground vs. various null scores

In [63]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})
In [80]:
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})
In [81]:
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})
In [82]:
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})
In [83]:
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})

cwm in forground vs. various null scores

In [64]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})
In [84]:
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})
In [85]:
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})
In [86]:
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})
In [87]:
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})

hyp_cwm in forground vs. various null scores

In [65]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})
In [88]:
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})
In [89]:
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})
In [90]:
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})
In [91]:
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})
In [ ]:
 
In [183]:
len(keys_for_scoring)
keys_for_scoring = ['sum_score'] 
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
#keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm', 'log10pwm', 'pwm']]
In [184]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(seqlet_hits[keys_for_scoring])
# Apply transform to both the training set and the test set.
train_hits = scaler.transform(seqlet_hits[keys_for_scoring])

from sklearn.decomposition import PCA
# Make an instance of the Model
pca = PCA(.98)
#fitting PCA on the training set only.
pca.fit(train_hits)
#Apply the mapping (transform) to both the training set and the test set.
train_hits = pca.transform(train_hits)
#test_hits = pca.transform(test_hits)
In [256]:
#seqlet_hits[seqlet_hits['seqno']==684]
pca.explained_variance_ratio_
Out[256]:
array([0.73421992, 0.09118347, 0.07766985, 0.02654744, 0.022959  ,
       0.01230255, 0.00980085, 0.00861208])
In [273]:
#read the real truth
def load_true_positives(tp_file):
    coordinates = []
    with open(tp_file, "r") as fp:
        for line in fp:
            line = line.strip().split(",")
            seqno = line[0]
            pos = int(line[1])
            if "-" in seqno:
                seqno = seqno.split("-")
                for i in range(int(seqno[0]), int(seqno[1])+1):
                    coordinates.append((i, pos))
            else:
                coordinates.append((int(seqno), pos))
    return coordinates
def get_tp_indices(seqlet_hits):
    keep = []
    for cord in coordinates:
        seqno, pos = cord
        df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
        if df.shape[0]>0:
            keep.append(df.index[0])
        else:
            print(cord)

    return keep

tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits)
(684, 49)
In [288]:
def plot_pca(df, true_ids=None):
    print(df.shape)
    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1) 
    ax.set_xlabel('PC1', fontsize = 15)
    ax.set_ylabel('PC2', fontsize = 15)
    ax.set_title('2 component PCA', fontsize = 20)
    if true_ids is not None:
        true_df = df.loc[np.array(true_ids),:]
        df = df.loc[np.array(list(set(list(df.index)).difference(set(true_ids)))),:]
        ax.scatter(true_df['PC1'], true_df['PC2'], s = 20, c='r', marker='+')
    ax.scatter(df['PC1'], df['PC2'], s = 1, c='b')
    #ax.axvline(9)
    #ax.axhline(-1)
    ax.grid()
    return None
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
plot_pca(pc_df, )
(13587, 8)
In [291]:
#test_df = pc_df.loc[np.array(list(set(list(pc_df.index)).difference(set(true_indices)))),:]
#test_df = test_df.loc[(test_df['PC1']>9) & (test_df['PC2']<0),:]
#seqlet_hits.loc[test_df.index, :]
seqlet_hits[seqlet_hits['seqno']==724]
Out[291]:
sum_score seqno start_idx ppm_scan pwm_scan log10pwm_scan pssm_scan cwm_scan hyp_cwm_scan ppm_jaccard ... pssm_convolution cwm_convolution hyp_cwm_convolution ppm_masked_convolution pwm_masked_convolution log10pwm_masked_convolution pssm_masked_convolution cwm_masked_convolution hyp_cwm_masked_convolution marker
12013 0.510935 724 16 2.173273 -16.685898 -11.565783 2.807995 0.271431 -0.129788 0.093749 ... 0.155122 0.014721 -0.012018 0.409525 -0.668292 -0.668292 0.344445 0.320698 -0.240834 foreground
12014 0.726112 724 17 2.955832 -9.491440 -6.578965 4.307728 0.489950 0.071016 0.157605 ... 0.343455 0.040122 0.003713 0.496489 -0.393983 -0.393983 0.428561 0.400553 0.038883 foreground
12015 0.918835 724 18 1.895810 -16.006632 -11.094952 2.776762 0.348214 -0.168935 0.129040 ... 0.351885 0.042564 -0.020289 0.580346 -0.552284 -0.552284 0.457278 0.387848 -0.180441 foreground
12016 1.216095 724 19 2.241223 -16.770756 -11.624602 2.795168 0.296356 -0.195357 0.087665 ... 0.231063 0.023784 -0.075097 0.231076 -0.803851 -0.803851 0.216105 0.210806 -0.522732 foreground
12017 1.241818 724 20 6.648924 12.798364 8.871150 10.946750 1.507266 1.190199 0.566732 ... 2.063255 0.314700 0.245513 0.901531 0.905861 0.905861 0.904396 0.937620 0.938175 foreground
12018 1.150841 724 21 1.979615 -17.255894 -11.960874 2.454263 0.213059 -0.089824 0.056313 ... 0.174882 0.015555 -0.031489 0.201595 -0.785020 -0.785020 0.188620 0.192204 -0.275004 foreground
12019 1.059108 724 22 2.545866 -14.645376 -10.151401 4.199507 0.512827 0.098539 0.146221 ... 0.573835 0.064573 -0.016732 0.439639 -0.466749 -0.466749 0.405790 0.358829 -0.106877 foreground
12020 1.016291 724 23 2.822197 -11.335154 -7.856930 4.201037 0.479085 -0.014662 0.111055 ... 0.314115 0.021815 -0.046603 0.319340 -0.526418 -0.526418 0.240512 0.133503 -0.285399 foreground
12021 0.894742 724 24 1.146093 -22.607031 -15.670000 2.113748 0.202143 -0.273157 0.044927 ... 0.174305 0.013882 -0.017559 0.214520 -0.388620 -0.388620 0.181476 0.139714 -0.138538 foreground
12022 0.670116 724 25 3.510759 -7.332713 -5.082649 4.744608 0.467600 0.199800 0.104182 ... 0.289349 0.032979 0.030203 0.540020 0.121739 0.121739 0.247456 0.286472 0.288089 foreground
12023 0.503941 724 26 2.680634 -11.520928 -7.985699 4.159111 0.579430 0.235952 0.061523 ... 0.092858 0.020784 0.024588 0.264452 -0.003100 -0.003100 0.091842 0.128490 0.168628 foreground
12024 0.560634 724 61 2.759909 -11.651375 -8.076117 4.703845 0.547412 0.042536 0.138787 ... 0.360025 0.044303 0.006862 0.609668 -0.272037 -0.272037 0.491439 0.481183 0.083735 foreground
12025 0.712192 724 62 2.736127 -15.611526 -10.821085 4.265700 0.423533 -0.155905 0.127224 ... 0.268559 0.028056 -0.034304 0.344609 -0.707176 -0.707176 0.337684 0.357184 -0.377643 foreground

13 rows × 28 columns

In [29]:
test_hits = scaler.transform(shuffled_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
shuffled_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(shuffled_pc_df, true_ids=None)
(2846, 4)
In [30]:
test_hits = scaler.transform(seq_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
seq_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(seq_null_pc_df)
(13587, 4)
In [31]:
test_hits = scaler.transform(weight_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
weight_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(weight_null_pc_df)
(13714, 4)
In [32]:
test_hits = scaler.transform(weightedseq_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
weightedseq_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(weightedseq_null_pc_df)
(12597, 4)
In [190]:
pc_df[:5]
Out[190]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
0 -1.757138 -1.105117 -1.560638 0.357107 0.277466 0.376950 -0.193923 0.091057
1 2.880820 -0.113733 -0.338334 0.840248 0.175452 0.578696 0.093965 -0.772116
2 0.438450 2.069480 -0.599759 0.185653 -0.684141 0.460374 -0.527480 -0.043134
3 -3.361249 4.008285 0.283835 -0.326203 -0.736401 0.243396 -0.191256 0.144796
4 16.268981 3.032388 0.694093 1.201489 2.725356 0.261409 -0.523800 0.170852
In [34]:
pca_seqlet_hits = pc_df[['PC1', 'PC2', 'PC3']]
pca_shuffled_seqlet_hits = shuffled_pc_df[['PC1', 'PC2', 'PC3']]
pca_seq_null_seqlet_hits = seq_null_pc_df[['PC1', 'PC2', 'PC3']]
pca_weight_null_seqlet_hits = weight_null_pc_df[['PC1', 'PC2', 'PC3']]
pca_weightedseq_null_seqlet_hits = weightedseq_null_pc_df[['PC1', 'PC2', 'PC3']]
pca_seqlet_hits['marker'] = 'foreground'
pca_shuffled_seqlet_hits['marker'] = 'shuffled'
pca_seq_null_seqlet_hits['marker'] = 'seq_null'
pca_weight_null_seqlet_hits['marker'] = 'weight_null'
pca_weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
pca_test_hits = pd.concat([pca_seqlet_hits,
                       pca_shuffled_seqlet_hits,
                       pca_seq_null_seqlet_hits,
                       pca_weight_null_seqlet_hits,
                       pca_weightedseq_null_seqlet_hits
                      ])
In [36]:
import seaborn as sns
sns.pairplot(pd.concat([pca_seqlet_hits, pca_shuffled_seqlet_hits]), 
             diag_kind="kde", kind='scatter', hue="marker")
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[36]:
<seaborn.axisgrid.PairGrid at 0x7f229c385630>
In [37]:
sns.pairplot(pd.concat([pca_seqlet_hits, pca_seq_null_seqlet_hits]), 
             diag_kind="kde", kind='scatter', hue="marker")
Out[37]:
<seaborn.axisgrid.PairGrid at 0x7f22a5fa9470>
In [38]:
sns.pairplot(pd.concat([pca_seqlet_hits, pca_weight_null_seqlet_hits]), 
             diag_kind="kde", kind='scatter', hue="marker")
Out[38]:
<seaborn.axisgrid.PairGrid at 0x7f180f5002e8>
In [39]:
sns.pairplot(pd.concat([pca_seqlet_hits, pca_weightedseq_null_seqlet_hits]), 
             diag_kind="kde", kind='scatter', hue="marker")
Out[39]:
<seaborn.axisgrid.PairGrid at 0x7f229d74c9e8>
In [ ]:
 
In [6]:
import seaborn as sns
sns.pairplot(test_hits[['sum_score', 
                   'weight_convolution', 'pfm_convolution', 'pwm_convolution']], 
             diag_kind="kde", kind='scatter')
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[6]:
<seaborn.axisgrid.PairGrid at 0x7f0ba03f0278>
In [7]:
sns.pairplot(test_hits[['sum_score', 
                   'weight_scan', 'pfm_scan', 'pwm_scan']], 
             diag_kind="kde", kind='scatter')
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[7]:
<seaborn.axisgrid.PairGrid at 0x7f0b9c4b1128>
In [8]:
sns.pairplot(test_hits[['sum_score', 
                   'weight_jaccard', 'pfm_jaccard', 'pwm_jaccard']], 
             diag_kind="kde", kind='scatter')
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[8]:
<seaborn.axisgrid.PairGrid at 0x7f0b946eca20>
In [9]:
sns.pairplot(test_hits[['sum_score', 
                   'weight_masked_convolution', 'pfm_masked_convolution', 'pwm_masked_convolution']], 
             diag_kind="kde", kind='scatter')
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[9]:
<seaborn.axisgrid.PairGrid at 0x7f0b944dc400>
In [10]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(test_hits, metrics = {0:["pwm_convolution", "pfm_convolution"], 
                                            1:["pwm_convolution", "weight_convolution"], 
                                            2:["pwm_convolution", "pwm_scan"], 
                                            3:["pwm_convolution", "pwm_jaccard"]})
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

In [11]:
plots_seqlet_measures(test_hits, metrics = {0:["pwm_convolution", "pfm_jaccard"], 
                                            1:["pwm_convolution", "weight_jaccard"], 
                                            2:["pwm_convolution", "pwm_masked_convolution"], 
                                            3:["pwm_convolution", "pfm_masked_convolution"]})
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.