%load_ext autoreload
%autoreload 2

import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
import pickle
import h5py
import glob

from matlas.genome_data import *
from matlas.pwms import shorten_tfname, get_motifDB_id_maps
from matlas.matches import fig2vdom, vdom_pssm
from matlas.modisco_results import ModiscoResult
from vdom.helpers import (h1, p, li, img, div, b, br, ul, a,
                          details, summary,
                          table, thead, th, tr, tbody, td, ol)
from IPython.display import display


project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
modisco_root = "{0}/{1}/Naive_modisco2019".format(srv_root, data_name)

homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)
basset_root = "{0}/{1}/Naive_basset_motifs".format(srv_root, data_name)
reportfile= "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"

load data from labcluster

Using TensorFlow backend.
TF-MoDISco is using the TensorFlow backend.

from matlas.matches import DenovoModisco

task_idx = 273
modisco_dir = "{0}/task_{1}-naivegw".format(modisco_root, task_idx)
summit_file = "{0}/{1}/fineFactorized/task_{2}-naivegw/predictions/task_{2}-{3}Summit.calibrated.{1}.tab.gz".format(project_root, data_name, task_idx, "Naive")
perf_file = "{0}/task_{1}-naivegw/NaiveauPRC.txt".format(model_root, task_idx)
deeplift_hdf = "{0}/{1}/Naive_deeplift2019/task_{2}-naivegw/summit.h5".format(srv_root, data_name, task_idx)
ob = DenovoModisco(modisco_dir)
pattern_name = None
pattern_names = list(ob.denovo_pwms.keys()) if pattern_name is None else [pattern_name]
query = ob.load_chomped_pattern(pattern_names[0], plot=False)
#loads all the important scores   
ob.initialize_with_importance_scores(summit_file, perf_file, deeplift_hdf, query.keys(), use_all_keys=True) 
ob.per_base_imp

0.05627951420283896

ob.keys_for_scoring = ['sum_score'] 
ob.keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']

method_to_null_scores = OrderedDict()
method_to_seqlet_scores = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
    query = ob.load_chomped_pattern(pattern_name, rc=False)
    method_to_null_scores[pattern_name] = ob.compute_null_scores_in_seqlets(pattern_name, query, 
                                                                       use_all_seqlets=True, debug=True)
    
    method_to_seqlet_scores[pattern_name] = ob.compute_scores_in_seqlets(pattern_name, query, 
                                                                       use_all_seqlets=True, debug=True)
    #new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits)

# from matlas.match_utils import get_any_high_scoring_instances
# seqlet_hits = OrderedDict()

# for pattern_name in ob.denovo_pwms.keys():
#     query = ob.load_chomped_pattern(pattern_name, rc=False)
#     seqlet_hits[pattern_name] = get_any_high_scoring_instances(method_to_seqlet_scores[pattern_name], 
#                                                     min_tot_imp=ob.per_base_imp*len(query['ppm']), 
#                                                      keys=ob.keys_for_scoring)
#     seqlet_hits[pattern_name]['sum_score'] = seqlet_hits[pattern_name]['sum_score']/len(query['ppm'])
          
seqlet_hits = method_to_seqlet_scores

# null_hits = OrderedDict()
# for pattern_name in ob.denovo_pwms.keys():
#     query = ob.load_chomped_pattern(pattern_name, rc=False)
#     null_hits[pattern_name] = get_any_high_scoring_instances(method_to_null_scores[pattern_name], 
#                                                     min_tot_imp=ob.per_base_imp*len(query['ppm']), 
#                                                      keys=ob.keys_for_scoring)
#     null_hits[pattern_name]['sum_score'] = null_hits[pattern_name]['sum_score']/len(query['ppm'])
null_hits = method_to_null_scores

from sklearn.preprocessing import StandardScaler

def scale_data(_hits, scaler=None):
    if scaler is None:
        # should be used only during training
        scaler = StandardScaler()
        scaler.fit(_hits)
        
    # Apply transform to both the training set and testing set
    _hits = scaler.transform(_hits)
    
    return scaler, _hits

features = ['sum_score', 'pwm_scan',
            'hyp_cwm_scan', 'cwm_scan',
            'hyp_cwm_jaccard', 'cwm_jaccard',
            'hyp_cwm_masked_convolution'
           ]

scaled_seqlet_hits = OrderedDict()
scaled_null_hits = OrderedDict()
scalers = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
    scalers[pattern_name], scaled_seqlet_hits[pattern_name] = scale_data(seqlet_hits[pattern_name][features])
    _, scaled_null_hits[pattern_name] = scale_data(null_hits[pattern_name][features], scalers[pattern_name])
    
for pattern_name in ob.denovo_pwms.keys():
    scaled_seqlet_hits[pattern_name] = pd.DataFrame(scaled_seqlet_hits[pattern_name], columns=features)
    #scaled_seqlet_hits[pattern_name].columns = seqlet_hits[pattern_name][features].columns
    
for pattern_name in ob.denovo_pwms.keys():
    scaled_null_hits[pattern_name] = pd.DataFrame(scaled_null_hits[pattern_name], columns=features)
    #scaled_null_hits[pattern_name].columns = null_hits[pattern_name][features].columns

from sklearn.decomposition import PCA

def reduce_data(_hits, pca=None):
    if pca is None:
        # Make an instance of the Model
        pca = PCA(.98)
        pca.fit(_hits)
        
    #Apply the mapping (transform) to both the training set and the test set.
    _hits = pca.transform(_hits)
    
    return pca, _hits


features = ['sum_score', 'pwm_scan',
            'hyp_cwm_scan', 'cwm_scan',
            'hyp_cwm_jaccard', 'cwm_jaccard',
            'hyp_cwm_masked_convolution'
           ]

pca_seqlet_hits = OrderedDict()
pca_null_hits = OrderedDict()
pcas = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
    pcas[pattern_name], pca_seqlet_hits[pattern_name] = reduce_data(scaled_seqlet_hits[pattern_name][features])
    _, pca_null_hits[pattern_name] = reduce_data(scaled_null_hits[pattern_name][features], pcas[pattern_name])
    
for pattern_name in ob.denovo_pwms.keys():
    pca_seqlet_hits[pattern_name] = pd.DataFrame(pca_seqlet_hits[pattern_name])
    pca_seqlet_hits[pattern_name].columns = ["PC{}".format(i+1) for i in range(pca_seqlet_hits[pattern_name].shape[1])]
    
for pattern_name in ob.denovo_pwms.keys():
    pca_null_hits[pattern_name] = pd.DataFrame(pca_null_hits[pattern_name])
    pca_null_hits[pattern_name].columns = ["PC{}".format(i+1) for i in range(pca_null_hits[pattern_name].shape[1])]

outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/instances2019/task_273"
_df_dict = OrderedDict()
for i, pattern_name in enumerate(ob.denovo_pwms.keys()):
    print(pattern_name)
    fname = "{}/{}.gw.gz".format(outdir, i)
    _df_dict[pattern_name] = pd.read_csv(fname, header=0, index_col=0, sep="\t")
    print(_df_dict[pattern_name].shape)

metacluster_1/pattern_0

/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/numpy/lib/arraysetops.py:522: FutureWarning:

elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison

(3260164, 27)
metacluster_1/pattern_1
(3392147, 27)
metacluster_1/pattern_2
(3390929, 27)
metacluster_1/pattern_3
(3392147, 27)
metacluster_1/pattern_4
(3390929, 27)
metacluster_1/pattern_5
(3392147, 27)
metacluster_1/pattern_6
(3393254, 27)
metacluster_1/pattern_7
(3397481, 27)
metacluster_1/pattern_8
(3380835, 27)
metacluster_1/pattern_9
(3260164, 27)
metacluster_1/pattern_10
(3384625, 27)
metacluster_1/pattern_11
(3390929, 27)
metacluster_1/pattern_12
(3393254, 27)
metacluster_1/pattern_13
(3391553, 27)
metacluster_1/pattern_14
(3391553, 27)
metacluster_1/pattern_15
(3384625, 27)
metacluster_1/pattern_16
(3393254, 27)
metacluster_1/pattern_17
(2716324, 27)
metacluster_1/pattern_18
(3196262, 27)
metacluster_1/pattern_19
(3329663, 27)

_reduced_dict = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
    _, _reduced_dict[pattern_name] = scale_data(_df_dict[pattern_name][features], scalers[pattern_name])
    _, _reduced_dict[pattern_name] = reduce_data(_reduced_dict[pattern_name], pcas[pattern_name])
    _reduced_dict[pattern_name] = pd.DataFrame(_reduced_dict[pattern_name])
    _reduced_dict[pattern_name].columns = ["PC{}".format(i+1) for i in range(_reduced_dict[pattern_name].shape[1])]

import matplotlib.pyplot as plt 
from sklearn.cluster import Birch
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.metrics import davies_bouldin_score
import warnings
warnings.filterwarnings("ignore")

keys = ['PC1', 'PC2']
gmms = OrderedDict()
birchs = OrderedDict()
gmm_cno = OrderedDict()
brc_cno = OrderedDict()
everything = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
    info = OrderedDict()
    #pattern_name = "metacluster_1/pattern_19"
    key_ks = []
    for i, key in enumerate(keys):
        a = pca_seqlet_hits[pattern_name][key].values
        b = pca_null_hits[pattern_name][key].values
        the_k = np.percentile(b, 90)
        key_ks.append(the_k)
        info['PC1_k'] = the_k
        break
    print(pattern_name, key_ks)
    
    fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(22,10))
    ax_all = axes[0, 0]
    ax_allBrc = axes[0, 1]
    ax_gwBrc = axes[0, 2]
    ax_gmm = axes[0, 3]
    
    ax_sq = axes[1, 0]
    ax_sqbrc = axes[1, 1]
    ax_dnbrc = axes[1, 2]
    ax_cmp = axes[1, 3]
    
    df = pca_seqlet_hits[pattern_name]
    #df = df[df['PC1']>=the_k][['PC1', 'PC2']]
    small_df = df[df['PC1']>=the_k][['PC1', 'PC2']]
    df = df[df['PC1']>=the_k][['PC1', 'PC2']]
    print(df.shape)
    clustering = Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True).fit(
        small_df.values)
    means = []
    for i in [0,1,2]:
        means.append(np.mean(small_df[clustering.labels_==i], axis=0))
    means = np.array(means)
    sq_cno = np.argmax(np.array(means[:,0])) # use only 1st column or PC1 to selelct desired cluster
    
    ax_sq.scatter(small_df['PC1'], small_df['PC2'], c=clustering.labels_, s=1, cmap='plasma');#
    ax_sq.set_xlabel('PC1')
    ax_sq.set_ylabel('PC2')
    ax_sq.set_title(pattern_name + " (Birch, Seqlets), {}".format(sq_cno))
    info['sq_cno'] = sq_cno
    info['sq_brc'] = clustering
    
    #anything
    all_df = _reduced_dict[pattern_name][['PC1', 'PC2']]
    df = all_df
    ax_all.scatter(df['PC1'], df['PC2'], s=1, cmap='plasma');
    ax_all.set_xlabel('PC1')
    ax_all.set_ylabel('PC2')
    ax_all.set_title(pattern_name+ " (All)")
    
    #all birch
    all_clustering = Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True).fit(
        df.values)
    all_means = []
    for i in [0,1,2]:
        all_means.append(np.mean(df[all_clustering.labels_==i], axis=0))
    all_means = np.array(all_means)
    all_cno = np.argmax(np.array(all_means[:,0]))
    
    ax_allBrc.scatter(df['PC1'], df['PC2'], c=all_clustering.labels_, s=1, cmap='plasma');
    ax_allBrc.set_xlabel('PC1')
    ax_allBrc.set_ylabel('PC2')
    ax_allBrc.set_title(pattern_name+ " (allBirch), {}".format(all_cno))
    info['all_cno'] = all_cno
    info['all_brc'] = all_clustering
    
    #reduced and all birch
    df = df[df['PC1']>=the_k][['PC1', 'PC2']]
    all_labels = all_clustering.labels_[all_df['PC1']>=the_k]
    ax_gwBrc.scatter(df['PC1'], df['PC2'], c=all_labels, s=1, cmap='plasma');
    ax_gwBrc.set_xlabel('PC1')
    ax_gwBrc.set_ylabel('PC2')
    ax_gwBrc.set_title(pattern_name+ " (allBirch, GW), {}".format(all_cno))
    
    #gw gmm
    gmm = GaussianMixture(n_components=3, means_init= means,
                              covariance_type='full', 
                              init_params='kmeans').fit(df[['PC1', 'PC2']])
    gmm_labels = gmm.predict(df)
    means = []
    for i in [0,1,2]:
        means.append(np.mean(df[gmm_labels==i], axis=0))
    means = np.array(means)
    gmm_cno[pattern_name] = np.argmax(np.array(means[:,0]))
    
    ax_gmm.scatter(df['PC1'], df['PC2'], c=gmm_labels, s=1, cmap='plasma');
    ax_gmm.set_xlabel('PC1')
    ax_gmm.set_ylabel('PC2')
    ax_gmm.set_title(pattern_name+ " (sqGMM, GW), {}".format(gmm_cno[pattern_name]))
    info['gmm_cno'] = gmm_cno[pattern_name]
    info['gmm'] = gmm
    
    #gw birch prediction
    brc_labels = clustering.predict(df.values)
    ax_sqbrc.scatter(df['PC1'], df['PC2'], c=brc_labels, s=1, cmap='plasma');#
    ax_sqbrc.set_xlabel('PC1')
    ax_sqbrc.set_ylabel('PC2')
    ax_sqbrc.set_title(pattern_name + " (sqBirch, GW), {}".format(sq_cno))
    
    
    #gw birch new
    brc = Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True).fit(
        df.values)
    means = []
    for i in [0,1,2]:
        means.append(np.mean(df[brc.labels_==i], axis=0))
    means = np.array(means)
    brc_cno[pattern_name] = np.argmax(np.array(means[:,0]))
    
    ax_dnbrc.scatter(df['PC1'], df['PC2'], c=brc.labels_, s=1, cmap='plasma');
    ax_dnbrc.set_xlabel('PC1')
    ax_dnbrc.set_ylabel('PC2')
    ax_dnbrc.set_title(pattern_name+ " (dnBirch, GW), {}".format(brc_cno[pattern_name]))
    
    info['brc_cno'] = brc_cno[pattern_name]
    info['reduced_brc'] = brc
    
    #fitness
    #seq_brc_db = davies_bouldin_score(small_df.values, clustering.labels_)
    #gw_brc_db = davies_bouldin_score(df.values, brc_labels)
    #brc_db = davies_bouldin_score(df.values, brc.labels_)
    #gmm_db = davies_bouldin_score(df.values, gmm_labels)
    
    new_all_brc_labels = np.array([0]*df.shape[0]); new_all_brc_labels[all_labels==all_cno] = 1
    new_seq_brc_labels = np.array([0]*small_df.shape[0]); new_seq_brc_labels[clustering.labels_==sq_cno] = 1
    new_gw_brc_labels = np.array([0]*df.shape[0]); new_gw_brc_labels[brc.labels_==sq_cno] = 1
    new_brc_labels = np.array([0]*df.shape[0]); new_brc_labels[brc.labels_==brc_cno[pattern_name]] = 1
    new_gmm_labels = np.array([0]*df.shape[0]); new_gmm_labels[gmm_labels==gmm_cno[pattern_name]] = 1
    
    new_all_brc_db = davies_bouldin_score(df.values, new_all_brc_labels)
    new_seq_brc_db = davies_bouldin_score(small_df.values, new_seq_brc_labels)
    new_gw_brc_db = davies_bouldin_score(df.values, new_gw_brc_labels)
    new_brc_db = davies_bouldin_score(df.values, new_brc_labels)
    new_gmm_db = davies_bouldin_score(df.values, new_gmm_labels)

    barWidth = 0.25
    bars = [new_all_brc_db, new_seq_brc_db, new_gmm_db, new_brc_db, new_gw_brc_db]
    
    r1 = np.arange(len(bars))
    ax_cmp.bar(r1, bars, color='#557f2d', width=barWidth, edgecolor='white')
    ax_cmp.set_xticks([r+barWidth/2  for r in range(len(bars))])
    ax_cmp.set_xticklabels(['Brc-All, {}'.format(all_cno), 
                            'Brc-Seqlets, {}'.format(sq_cno),
                            'GMM-GW, {}'.format(gmm_cno[pattern_name]), 
                            'dnBrc-GW, {}'.format(brc_cno[pattern_name]), 
                            'sqBrc-GW, {}'.format(sq_cno)], 
                           fontdict={'fontsize':12},
                           rotation='vertical')
    #ax_cmp.legend()
    ax_cmp.set_title('Davies Bouldin Scores')
    plt.tight_layout()
    plt.show()
    
    everything[pattern_name] = info
    #break

metacluster_1/pattern_0 [1.7198816939104558]
(107441, 2)

metacluster_1/pattern_1 [2.001332244271657]
(82131, 2)

metacluster_1/pattern_2 [1.7919665738371213]
(111991, 2)

metacluster_1/pattern_3 [2.5691248798406874]
(61988, 2)

metacluster_1/pattern_4 [2.042705031642133]
(97221, 2)

metacluster_1/pattern_5 [2.653934558669765]
(59385, 2)

metacluster_1/pattern_6 [2.0944077927033]
(78905, 2)

metacluster_1/pattern_7 [2.755896027557707]
(47493, 2)

metacluster_1/pattern_8 [2.3784424903039803]
(80330, 2)

metacluster_1/pattern_9 [2.1775024079472414]
(89982, 2)

import matplotlib.pyplot as plt    
    
keys = ['PC1', 'PC2']
for pattern_name in ob.denovo_pwms.keys():
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,5))
    
    for i, key in enumerate(keys):
        a = pca_seqlet_hits[pattern_name][key].values
        b = pca_null_hits[pattern_name][key].values
    
        n, bins, patches = axes[i].hist(a, 200, color='red', histtype='step')
        n, bins, patches = axes[i].hist(b, 200, color='green', histtype='step')
        axes[i].axvline(x=np.percentile(b, 95))

    
    plt.title(pattern_name)
    plt.show()

features = ['sum_score', 'pwm_scan',
            'hyp_cwm_scan', 'cwm_scan',
            'hyp_cwm_jaccard', 'cwm_jaccard',
            'hyp_cwm_masked_convolution'
           ]

ob.keys_for_scoring = features
for pattern_name in ob.denovo_pwms.keys():
    #pattern_name = "metacluster_1/pattern_7"
    query = ob.load_chomped_pattern(pattern_name, rc=False)
    #seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=True)
    new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits[pattern_name])

(647226, 5)
(647226, 9) 2 7457

(551397, 6)
(551397, 10) 0 9621

(560822, 5)
(560822, 9) 2 140575

(551397, 5)
(551397, 9) 2 102653

(560822, 5)
(560822, 9) 0 123336

(551397, 6)
(551397, 10) 0 110981

(531787, 5)
(531787, 9) 2 122509

(522529, 6)
(522529, 10) 1 149055

(590518, 6)
(590518, 10) 0 177247

(647226, 6)
(647226, 10) 1 157308

import matplotlib.pyplot as plt

for pattern_name in ob.denovo_pwms.keys():
    a = method_to_seqlet_scores[pattern_name]['sum_score'].flatten()
    b = method_to_null_scores[pattern_name]['sum_score'].flatten()

    c = method_to_seqlet_scores[pattern_name]['hyp_cwm_masked_convolution'].flatten()
    d = method_to_null_scores[pattern_name]['hyp_cwm_masked_convolution'].flatten()

    #fig = plt.figure(figsize=(10, 8))
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,5))
    n, bins, patches = axes[0].hist(a, 200, color='red', histtype='step')
    n, bins, patches = axes[0].hist(b, 200, color='green', histtype='step')
    axes[0].axvline(x=np.percentile(b, 99.95)/len(query['ppm']))

    n, bins, patches = axes[1].hist(c, 200, color='red', histtype='step')
    n, bins, patches = axes[1].hist(d, 200, color='green', histtype='step')
    axes[1].axvline(x=np.percentile(d, 95))
    plt.title(pattern_name)
    plt.show()

import matplotlib.pyplot as plt

keys = ['hyp_cwm_jaccard', 'hyp_cwm_scan', 'hyp_cwm_masked_convolution']
for pattern_name in ob.denovo_pwms.keys():
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20,5))
    
    for i, key in enumerate(keys):
        a = seqlet_hits[pattern_name][key].values
        b = null_hits[pattern_name][key].values
    
        n, bins, patches = axes[i].hist(a, 200, color='red', histtype='step')
        n, bins, patches = axes[i].hist(b, 200, color='green', histtype='step')
        axes[i].axvline(x=np.percentile(b, 95))

    
    plt.title(pattern_name)
    plt.show()

#remove instances based on all match scores
filtered_seqlet_hits = OrderedDict()
filtered_null_hits = OrderedDict()
filtering_features = ['pwm_scan',
            'hyp_cwm_scan', 'cwm_scan',
            'hyp_cwm_jaccard', 'cwm_jaccard',
            'hyp_cwm_masked_convolution'
           ]

for pattern_name in ob.denovo_pwms.keys():
    for key in filtering_features:
        d = method_to_null_scores[pattern_name][key].flatten()
        the_k = np.percentile(d, 90)
        df = seqlet_hits[pattern_name]
        df = df[df[key]>the_k]
    filtered_seqlet_hits[pattern_name] = df

features = ['sum_score', 'pwm_scan',
            'hyp_cwm_scan', 'cwm_scan',
            'hyp_cwm_jaccard', 'cwm_jaccard',
            'hyp_cwm_masked_convolution'
           ]

ob.keys_for_scoring = features
for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
    query = ob.load_chomped_pattern(pattern_name, rc=False)
    #seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=True)
    new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, filtered_seqlet_hits[pattern_name])

(76154, 5)
(76154, 9) 1 7475

(49266, 5)
(49266, 9) 2 9380

(69300, 5)
(69300, 9) 2 3690

(45218, 5)
(45218, 9) 2 1637

(53022, 6)
(53022, 10) 2 1849

(35398, 5)
(35398, 9) 2 1166

(50899, 5)
(50899, 9) 1 10585

(24021, 5)
(24021, 9) 1 1472

(51483, 5)
(51483, 9) 1 696

(60627, 6)
(60627, 10) 1 519

for pattern_name in ob.denovo_pwms.keys():

ob.keys_for_scoring = ['sum_score'] 
ob.keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']

for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
    query = ob.load_chomped_pattern(pattern_name, rc=False)
    seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=False)
    new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits)

(260431, 8)
(260431, 12) 2

(120889, 8)
(120889, 12) 0

(52636, 8)
(52636, 12) 1

(21865, 7)
(21865, 11) 1

(18675, 7)
(18675, 11) 2

(18124, 7)
(18124, 11) 1

(18098, 7)
(18098, 11) 1

(14878, 8)
(14878, 12) 0

(12659, 8)
(12659, 12) 1

(13967, 8)
(13967, 12) 2

features = ['sum_score', 'pssm_scan',
            'hyp_cwm_scan', 'cwm_scan',
            'hyp_cwm_jaccard', 'cwm_jaccard','pssm_jaccard',
            'hyp_cwm_masked_convolution'
           ]
ob.keys_for_scoring = features
for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
    query = ob.load_chomped_pattern(pattern_name, rc=False)
    seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=False)
    new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits)

(260431, 5)
(260431, 9) 2

(120889, 5)
(120889, 9) 1

(52636, 5)
(52636, 9) 2

(21865, 4)
(21865, 8) 2

(18675, 4)
(18675, 8) 1

(18124, 4)
(18124, 8) 1

(18098, 4)
(18098, 8) 1

(14878, 4)
(14878, 8) 2

(12659, 4)
(12659, 8) 1

(13967, 4)
(13967, 8) 1

features = ['sum_score', 'pwm_scan',
            'hyp_cwm_scan', 'cwm_scan',
            'hyp_cwm_jaccard', 'cwm_jaccard',
            'hyp_cwm_masked_convolution'
           ]
ob.keys_for_scoring = features
for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
    query = ob.load_chomped_pattern(pattern_name, rc=False)
    seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=False)
    new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits)

(260431, 5)
(260431, 9) 2

(120889, 5)
(120889, 9) 1

(52636, 5)
(52636, 9) 1

(21865, 4)
(21865, 8) 1

(18675, 4)
(18675, 8) 0

(18124, 4)
(18124, 8) 1

(18098, 4)
(18098, 8) 2

(14878, 5)
(14878, 9) 1

(12659, 5)
(12659, 9) 0

(13967, 5)
(13967, 9) 1

imp1 = None
imp2 = None
imp3 = None
ratio = None
for pattern_name in ob.denovo_pwms.keys():
    pca = ob.pca[pattern_name]
    #print(pattern_name)
    #print(pca.components_[:2,:])
    if imp1 is None:
        imp1 = pca.components_[0,:]
        imp2 = pca.components_[1,:]
        imp3 = pca.components_[2,:]
        ratio = pca.explained_variance_ratio_[:5]
    else:
        imp1 = np.vstack((imp1, pca.components_[0,:]))
        imp2 = np.vstack((imp2, pca.components_[1,:]))
        imp3 = np.vstack((imp3, pca.components_[2,:]))
        ratio = np.vstack((ratio, pca.explained_variance_ratio_[:5]))

imp1 = pd.DataFrame(imp1)
imp2 = pd.DataFrame(imp2)
imp3 = pd.DataFrame(imp3)
ratio = pd.DataFrame(ratio)
imp1.columns = features
imp2.columns = features
imp3.columns = features
print(imp1.shape, imp2.shape, imp3.shape)
ratio

(20, 7) (20, 7) (20, 7)

fig = plt.figure()
ax = sns.heatmap(imp1, 
                 vmin=np.min(imp1.values), 
                 vmax=np.max(imp1.values))
ax.set_title('PC1')
ax.set_ylabel('pattern #')
fig = plt.figure()
ax = sns.heatmap(imp2, 
                 vmin=np.min(imp2.values), 
                 vmax=np.max(imp2.values))
ax.set_title('PC2')
ax.set_ylabel('pattern #')
fig = plt.figure()
ax = sns.heatmap(imp3, 
                 vmin=np.min(imp3.values), 
                 vmax=np.max(imp3.values))
ax.set_title('PC3')
ax.set_ylabel('pattern #')
fig = plt.figure()
ax = sns.heatmap(ratio, 
                 vmin=np.min(ratio.values), 
                 vmax=np.max(ratio.values))
ax.set_title('explained ratio')
ax.set_ylabel('pattern #')

Text(30.5, 0.5, 'pattern #')

import seaborn as sns
fig = plt.figure()
ax = sns.heatmap(imp1, 
                 vmin=np.min(pd.concat([imp1, imp2, imp3]).values), 
                 vmax=np.max(pd.concat([imp1, imp2, imp3]).values))
fig = plt.figure()
ax = sns.heatmap(imp2, 
                 vmin=np.min(pd.concat([imp1, imp2, imp3]).values), 
                 vmax=np.max(pd.concat([imp1, imp2, imp3]).values))

fig = plt.figure()
ax = sns.heatmap(imp3, 
                 vmin=np.min(pd.concat([imp1, imp2, imp3]).values), 
                 vmax=np.max(pd.concat([imp1, imp2, imp3]).values))
fig = plt.figure()
ax = sns.heatmap(ratio, 
                 vmin=np.min(ratio.values), 
                 vmax=np.max(ratio.values))

ob.keys_for_scoring = ['sum_score'] 
ob.keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']

pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, rc=False)
seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=False)
seqlet_hits.shape

(522529, 27)

#ob.deeplift_data['shuffled_scores']
# from matlas.sliding_similarities import sliding_sum
# a = sliding_sum(query['ppm'], ob.deeplift_data['shuffled_scores'])
# a = a.flatten()
# b = sliding_sum(query['ppm'], ob.deeplift_data['scores'])
# b = b.flatten()
# a = a/len(query['ppm'])
# b = b/len(query['ppm'])

average_motif_len = 15
a = sliding_sum(np.ones((average_motif_len, 4)), ob.deeplift_data['shuffled_scores'])
a = a.flatten()/average_motif_len

b = sliding_sum(np.ones((average_motif_len, 4)), ob.deeplift_data['scores'])
b = b.flatten()/average_motif_len

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10, 8))
n, bins, patches = plt.hist(a, 200, facecolor='red', histtype='step')
n, bins, patches = plt.hist(b, 200, facecolor='green', histtype='step')
plt.axvline(x=np.percentile(a, 99.95)/len(query['ppm']))
plt.show()

ob.per_base_imp

0.003751967613522597

pattern_name = "metacluster_1/pattern_7"
outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/instances2019"

query = ob.load_chomped_pattern(pattern_name, rc=False)
display_name = ob.shorten_pattern_name(pattern_name)
outprefix = "{}/{}".format(outdir, display_name)
ob.find_pattern_instances_in_seqlets(pattern_name, query,
                                     fname="{}.seqlet.gmm.pdf".format(outprefix))

(13907, 9)
(13907, 4)
(13907, 7)

INFO:2019-10-22 08:02:32,076:test.log] trained metacluster_1/pattern_7. Used 13907 instances out of (13907, 9)
INFO:2019-10-22 08:02:32,076:test.log] trained metacluster_1/pattern_7. Used 13907 instances out of (13907, 9)
INFO:2019-10-22 08:02:32,076:test.log] trained metacluster_1/pattern_7. Used 13907 instances out of (13907, 9)

df = ob._predict_pattern_instances_in_sequences(pattern_name, query,
                                                cluster_plot=True,
                                               fname="{}.GW.gmm.pdf".format(outprefix)
                                              )

(3134133, 7)

INFO:2019-10-22 08:25:12,271:test.log] predicted, metacluster_1/pattern_7
INFO:2019-10-22 08:25:12,271:test.log] predicted, metacluster_1/pattern_7
INFO:2019-10-22 08:25:12,271:test.log] predicted, metacluster_1/pattern_7

fname = "{}.candidates.bed.gz".format(outprefix)
df.to_csv(fname, header=True, index=True, sep="\t", compression='gzip')

df[:5]

pos_df = df[df['labels']==ob.cluster_no[pattern_name]]
fname = "{}.candidates1.bed.gz".format(outprefix)
pos_df.to_csv(fname, header=True, index=True, sep="\t", compression='gzip')
pos_df.shape

(9718, 17)

rev_query = ob.load_chomped_pattern(pattern_name, rc=True, plot=True)
df = ob._predict_pattern_instances_in_sequences(pattern_name, rev_query,
                                                cluster_plot=True,
                                               fname="{}.rev.GW.gmm.pdf".format(outprefix)
                                              )
fname = "{}.rev_candidates.bed.gz".format(outprefix)
df.to_csv(fname, header=True, index=True, sep="\t", compression='gzip')
pos_df = df[df['labels']==ob.cluster_no[pattern_name]]
fname = "{}.rev_candidates1.bed.gz".format(outprefix)
pos_df.to_csv(fname, header=True, index=True, sep="\t", compression='gzip')

(3134133, 7)

INFO:2019-10-22 08:56:51,948:test.log] predicted, metacluster_1/pattern_7
INFO:2019-10-22 08:56:51,948:test.log] predicted, metacluster_1/pattern_7
INFO:2019-10-22 08:56:51,948:test.log] predicted, metacluster_1/pattern_7
INFO:2019-10-22 08:56:51,948:test.log] predicted, metacluster_1/pattern_7

#now combine the forward abd reverse coords and check for redundancy...
fname = "{}.candidates1.bed.gz".format(outprefix)
for_df = pd.read_csv(fname, header=0, index_col=0, sep="\t")
fname = "{}.rev_candidates1.bed.gz".format(outprefix)
rev_df = pd.read_csv(fname, header=0, index_col=0, sep="\t")

#get the sequences common between
common_seqs = np.array(list(set(list(for_df.seqno.values)).intersection(set(list(rev_df.seqno.values)))))
common_seqs = np.sort(common_seqs)
print(common_seqs.shape)
fore_common = np.array([seqno in common_seqs for seqno in for_df.seqno.values])
rev_common = np.array([seqno in common_seqs for seqno in rev_df.seqno.values])

(1014,)

for_df[fore_common][:5]

rev_df[rev_common][:5]

#for each sequence get the distances from each other
def compute_distances(common_seqs, for_df, rev_df):
    count = 0
    distances = []
    exemplify = False
    for seqno in common_seqs:
        fore = for_df[for_df.seqno==seqno]
        rev = rev_df[rev_df.seqno==seqno]
        res = np.subtract.outer(fore.start_idx.values, rev.start_idx.values).T
        
        if res.shape[0]==res.shape[1]:
            iu1 = np.triu_indices(res.shape[0])
            res = np.abs(res[iu1])
        else:
            print(fore.seqno, res.shape)
            print(res)
            res = np.abs(res.flatten())
            
        if exemplify==False and fore.shape[0]>1 and rev.shape[0]>1:
            print(fore.seqno)
            
            exemplify = True
            #return res
        #res = np.abs(res.flatten())
        count += len(res)
        distances.append(res)
    distances = np.sort(np.concatenate(distances))
    return distances
distances = compute_distances(common_seqs, for_df, rev_df)

62098    3402
62103    3402
Name: seqno, dtype: int64 (1, 2)
[[ 25 146]]
113935    6379
113972    6379
Name: seqno, dtype: int64
113977    6380
Name: seqno, dtype: int64 (2, 1)
[[-29]
 [-34]]
185333    10475
Name: seqno, dtype: int64 (2, 1)
[[-42]
 [-87]]
228602    12683
228607    12683
Name: seqno, dtype: int64 (1, 2)
[[-80 -75]]
249987    13780
249996    13780
Name: seqno, dtype: int64 (1, 2)
[[101 142]]
287311    15523
287320    15523
Name: seqno, dtype: int64 (1, 2)
[[-42 -21]]
307195    16319
307222    16319
Name: seqno, dtype: int64 (1, 2)
[[-150   52]]
315012    16692
Name: seqno, dtype: int64 (2, 1)
[[ -64]
 [-215]]
323483    17175
Name: seqno, dtype: int64 (2, 1)
[[126]
 [ 71]]
333091    17680
Name: seqno, dtype: int64 (2, 1)
[[ 132]
 [-150]]
341205    18078
Name: seqno, dtype: int64 (3, 1)
[[  69]
 [  40]
 [-154]]
361668    19047
361673    19047
Name: seqno, dtype: int64 (1, 2)
[[-77 -55]]
377172    19771
377182    19771
Name: seqno, dtype: int64 (1, 2)
[[138 179]]
377873    19807
Name: seqno, dtype: int64 (2, 1)
[[27]
 [22]]
400683    20994
400691    20994
Name: seqno, dtype: int64 (1, 2)
[[-183 -147]]
425480    22166
425500    22166
Name: seqno, dtype: int64 (1, 2)
[[-123  -19]]
430737    22411
Name: seqno, dtype: int64 (2, 1)
[[307]
 [-44]]
497494    25661
Name: seqno, dtype: int64 (2, 1)
[[-147]
 [-357]]
554973    28523
554981    28523
Name: seqno, dtype: int64 (1, 2)
[[-303  -89]]
555607    28552
Name: seqno, dtype: int64 (2, 1)
[[40]
 [10]]
561087    28821
561107    28821
Name: seqno, dtype: int64 (1, 2)
[[-73 141]]
605665    30987
Name: seqno, dtype: int64 (2, 1)
[[  56]
 [-112]]
612967    31325
612974    31325
Name: seqno, dtype: int64 (1, 2)
[[45 81]]
681257    34910
681262    34910
Name: seqno, dtype: int64 (1, 2)
[[-384 -379]]
692235    35451
692263    35451
Name: seqno, dtype: int64 (1, 2)
[[-162  202]]
708427    36270
708447    36270
Name: seqno, dtype: int64 (1, 2)
[[-53  -8]]
721824    36894
Name: seqno, dtype: int64 (2, 1)
[[405]
 [304]]
753103    38471
Name: seqno, dtype: int64 (2, 1)
[[-41]
 [-63]]
762536    38968
Name: seqno, dtype: int64 (2, 1)
[[  38]
 [-156]]
779477    39869
Name: seqno, dtype: int64 (2, 1)
[[-49]
 [-64]]
889488    45467
Name: seqno, dtype: int64 (2, 1)
[[ 94]
 [-30]]
903372    46146
Name: seqno, dtype: int64 (2, 1)
[[ -49]
 [-106]]
904286    46193
904294    46193
Name: seqno, dtype: int64 (1, 2)
[[-65 -39]]
916606    46823
Name: seqno, dtype: int64 (2, 1)
[[81]
 [67]]
1016228    51920
Name: seqno, dtype: int64 (2, 1)
[[32]
 [27]]
1026728    52480
1026739    52480
Name: seqno, dtype: int64 (1, 2)
[[-64  35]]
1047736    53607
1047775    53607
Name: seqno, dtype: int64 (1, 2)
[[-84  11]]
1050730    53771
1050738    53771
Name: seqno, dtype: int64 (1, 2)
[[37 54]]
1057151    54128
1057168    54128
Name: seqno, dtype: int64 (1, 2)
[[-99 -55]]
1065797    54544
Name: seqno, dtype: int64 (2, 1)
[[ 41]
 [-28]]
1068544    54672
Name: seqno, dtype: int64 (2, 1)
[[ -87]
 [-167]]
1099613    56174
1099620    56174
Name: seqno, dtype: int64 (1, 2)
[[-322 -263]]
1132445    57825
Name: seqno, dtype: int64 (2, 1)
[[87]
 [ 9]]
1161751    59209
Name: seqno, dtype: int64 (2, 1)
[[146]
 [ 90]]
1177002    60022
1177011    60022
Name: seqno, dtype: int64 (1, 2)
[[-94 -30]]
1179874    60172
Name: seqno, dtype: int64 (4, 1)
[[  15]
 [-410]
 [-529]
 [-560]]
1186227    60509
1186232    60509
1186244    60509
Name: seqno, dtype: int64 (1, 3)
[[-209  -52   57]]
1240900    63172
Name: seqno, dtype: int64 (2, 1)
[[57]
 [ 6]]
1246988    63460
Name: seqno, dtype: int64 (2, 1)
[[ 37]
 [-44]]
1249155    63573
Name: seqno, dtype: int64 (2, 1)
[[ 49]
 [-24]]
1252602    63752
Name: seqno, dtype: int64 (2, 1)
[[ -99]
 [-124]]
1252642    63753
Name: seqno, dtype: int64 (2, 1)
[[ -99]
 [-124]]
1286261    65357
Name: seqno, dtype: int64 (2, 1)
[[  -8]
 [-167]]
1301637    66081
Name: seqno, dtype: int64 (2, 1)
[[172]
 [ 96]]
1307448    66321
Name: seqno, dtype: int64 (2, 1)
[[38]
 [ 6]]
1313271    66586
1313283    66586
Name: seqno, dtype: int64 (1, 2)
[[-147  -98]]
1356018    68571
1356028    68571
Name: seqno, dtype: int64 (1, 2)
[[54 97]]
1407191    71173
1407217    71173
Name: seqno, dtype: int64 (1, 2)
[[-54  27]]
1411491    71367
Name: seqno, dtype: int64 (2, 1)
[[-11]
 [-66]]
1442467    72931
Name: seqno, dtype: int64 (2, 1)
[[130]
 [ 46]]
1461171    73869
Name: seqno, dtype: int64 (2, 1)
[[ -50]
 [-458]]
1490641    75269
1490664    75269
Name: seqno, dtype: int64 (1, 2)
[[-60  29]]
1518843    76750
1518864    76750
Name: seqno, dtype: int64 (1, 2)
[[-65 -23]]
1523084    77034
Name: seqno, dtype: int64 (2, 1)
[[ -89]
 [-180]]
1543849    78222
1543864    78222
Name: seqno, dtype: int64 (1, 2)
[[130 175]]
1550544    78604
Name: seqno, dtype: int64 (2, 1)
[[83]
 [43]]
1565522    79473
1565545    79473
Name: seqno, dtype: int64 (1, 2)
[[-50  48]]
1587057    80689
1587067    80689
Name: seqno, dtype: int64 (1, 2)
[[-99 -69]]
1594691    81146
Name: seqno, dtype: int64 (2, 1)
[[93]
 [58]]
1605243    81714
1605249    81714
Name: seqno, dtype: int64 (1, 2)
[[-195 -157]]
1622263    82566
Name: seqno, dtype: int64 (2, 1)
[[-58]
 [-68]]
1645640    83996
1645659    83996
Name: seqno, dtype: int64 (1, 2)
[[-167   47]]
1665830    85158
1665836    85158
Name: seqno, dtype: int64 (1, 2)
[[-292 -211]]
1667494    85241
1667510    85241
Name: seqno, dtype: int64 (1, 2)
[[50 90]]
1673893    85628
Name: seqno, dtype: int64 (2, 1)
[[ -46]
 [-123]]
1715803    87995
Name: seqno, dtype: int64 (2, 1)
[[-23]
 [-44]]
1752529    89900
Name: seqno, dtype: int64 (3, 1)
[[   8]
 [ -88]
 [-184]]
1755722    90075
1755735    90075
Name: seqno, dtype: int64 (1, 2)
[[-304    6]]
1757166    90148
Name: seqno, dtype: int64 (2, 1)
[[ -90]
 [-236]]
1781548    91359
1781570    91359
Name: seqno, dtype: int64 (1, 2)
[[43 84]]
1781597    91360
1781618    91360
Name: seqno, dtype: int64 (1, 2)
[[43 84]]
1792157    91877
Name: seqno, dtype: int64 (2, 1)
[[ -8]
 [-79]]
1837750    94411
Name: seqno, dtype: int64 (2, 1)
[[ -96]
 [-223]]
1927929    99603
1927945    99603
Name: seqno, dtype: int64 (1, 2)
[[-120   23]]
1957792    101197
1957800    101197
Name: seqno, dtype: int64 (1, 2)
[[42 70]]
2001786    103373
Name: seqno, dtype: int64 (2, 1)
[[ -64]
 [-114]]
2050032    105824
Name: seqno, dtype: int64 (2, 1)
[[118]
 [ 74]]
2071871    106884
2071878    106884
Name: seqno, dtype: int64 (1, 2)
[[ 68 181]]
2089865    107729
2089873    107729
Name: seqno, dtype: int64 (1, 2)
[[-682  -40]]
2095652    108004
2095693    108004
Name: seqno, dtype: int64 (1, 2)
[[-71 -10]]
2102907    108339
2102929    108339
Name: seqno, dtype: int64 (1, 2)
[[-53  57]]
2123196    109303
Name: seqno, dtype: int64 (3, 1)
[[113]
 [-63]
 [-91]]
2196818    113097
2196837    113097
Name: seqno, dtype: int64 (1, 2)
[[-145  -88]]
2200233    113287
Name: seqno, dtype: int64 (2, 1)
[[ 39]
 [-55]]
2216569    114107
2216598    114107
Name: seqno, dtype: int64 (1, 2)
[[-62  11]]
2222293    114391
2222302    114391
Name: seqno, dtype: int64 (1, 2)
[[-576 -234]]
2229746    114804
2229751    114804
Name: seqno, dtype: int64 (1, 2)
[[150 155]]
2241759    115381
2241770    115381
Name: seqno, dtype: int64 (1, 2)
[[-84  10]]
2248603    115680
2248625    115680
Name: seqno, dtype: int64 (1, 2)
[[-75  27]]
2252302    115852
Name: seqno, dtype: int64 (2, 1)
[[-33]
 [-81]]
2253138    115893
Name: seqno, dtype: int64 (2, 1)
[[187]
 [ 63]]
2298838    117886
2298843    117886
Name: seqno, dtype: int64 (1, 2)
[[45 50]]
2334153    119574
2334180    119574
2334185    119574
Name: seqno, dtype: int64 (1, 3)
[[-48 154 159]]
2340820    119904
2340841    119904
Name: seqno, dtype: int64 (1, 2)
[[365 414]]
2356256    120658
Name: seqno, dtype: int64 (2, 1)
[[108]
 [ 55]]
2356533    120668
Name: seqno, dtype: int64 (2, 1)
[[418]
 [ 52]]
2358389    120782
Name: seqno, dtype: int64 (2, 1)
[[-66]
 [-71]]
2358553    120790
Name: seqno, dtype: int64 (2, 1)
[[-26]
 [-57]]
2358581    120791
Name: seqno, dtype: int64 (2, 1)
[[-26]
 [-57]]
2436240    124649
2436246    124649
Name: seqno, dtype: int64 (1, 2)
[[-59 -53]]
2440924    124889
Name: seqno, dtype: int64 (2, 1)
[[ -60]
 [-107]]
2453097    125503
Name: seqno, dtype: int64 (2, 1)
[[ -69]
 [-126]]
2460474    125848
2460509    125848
Name: seqno, dtype: int64 (1, 2)
[[-253   10]]
2507712    128027
2507728    128027
Name: seqno, dtype: int64 (1, 2)
[[31 96]]
2513262    128244
Name: seqno, dtype: int64 (3, 1)
[[215]
 [179]
 [-37]]
2580277    131059
Name: seqno, dtype: int64 (2, 1)
[[ 172]
 [-371]]
2580807    131091
2580832    131091
Name: seqno, dtype: int64 (1, 2)
[[-64   6]]
2647795    134400
Name: seqno, dtype: int64 (2, 1)
[[ -61]
 [-155]]
2677003    135803
2677020    135803
Name: seqno, dtype: int64 (1, 2)
[[-55  27]]
2690795    136448
Name: seqno, dtype: int64 (2, 1)
[[ 45]
 [-21]]
2690810    136449
2690832    136449
Name: seqno, dtype: int64 (1, 2)
[[-21 289]]
2746886    139084
Name: seqno, dtype: int64 (2, 1)
[[ 54]
 [-56]]
2762553    139765
Name: seqno, dtype: int64 (2, 1)
[[-41]
 [-49]]
2763806    139833
2763826    139833
Name: seqno, dtype: int64 (1, 2)
[[-246   38]]
2775086    140358
2775091    140358
Name: seqno, dtype: int64 (1, 2)
[[19 69]]
2821297    142562
2821324    142562
2821334    142562
Name: seqno, dtype: int64 (1, 3)
[[-17  41  88]]
2827185    142821
Name: seqno, dtype: int64 (2, 1)
[[271]
 [ 10]]
2827333    142829
2827342    142829
2827362    142829
Name: seqno, dtype: int64 (1, 3)
[[ 38  63 105]]
2898156    146261
2898162    146261
Name: seqno, dtype: int64 (1, 2)
[[-89 -27]]
2898537    146284
Name: seqno, dtype: int64 (2, 1)
[[ -57]
 [-108]]
2914566    147092
Name: seqno, dtype: int64 (2, 1)
[[-235]
 [-353]]
2946648    148706
Name: seqno, dtype: int64 (2, 1)
[[-137]
 [-198]]
3008594    151729
3008624    151729
Name: seqno, dtype: int64 (1, 2)
[[-75  33]]
3008630    151730
3008669    151730
Name: seqno, dtype: int64 (1, 2)
[[-75  33]]
3009052    151744
Name: seqno, dtype: int64 (2, 1)
[[-153]
 [-236]]
3015209    152011
3015228    152011
Name: seqno, dtype: int64 (1, 2)
[[-54  13]]
3075762    154945
3075770    154945
Name: seqno, dtype: int64 (1, 2)
[[-86 -62]]
3112538    156921
3112545    156921
Name: seqno, dtype: int64 (1, 2)
[[37 74]]
3114465    157004
Name: seqno, dtype: int64 (2, 1)
[[96]
 [84]]
3123627    157454
3123638    157454
Name: seqno, dtype: int64 (1, 2)
[[50 73]]

import matplotlib.pyplot as plt
#plt.scatter(np.array(range(distances.shape[0])), distances, s=1);
plt.hist(distances, bins=100)
#curren_ax.hist(data, 50, facecolor='blue', alpha=0.5)

(array([ 52.,  32.,  42.,  86.,  79., 121.,  93.,  85.,  60.,  65.,  35.,
         56.,  27.,  24.,  21.,   9.,  13.,  20.,  10.,   9.,  17.,  16.,
          6.,  11.,   7.,  10.,   6.,   5.,   4.,  11.,  10.,   2.,   5.,
          5.,   5.,   6.,   5.,   2.,   7.,   4.,   2.,   8.,   1.,   6.,
          1.,   4.,   3.,   1.,   2.,   1.,   2.,   2.,   1.,   3.,   5.,
          1.,   5.,   5.,   2.,   5.,   1.,   2.,   0.,   1.,   1.,   4.,
          1.,   1.,   1.,   2.,   0.,   0.,   1.,   0.,   1.,   3.,   0.,
          0.,   0.,   2.,   0.,   0.,   1.,   1.,   1.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   2.,   0.,
          1.]),
 array([  6.  ,  12.93,  19.86,  26.79,  33.72,  40.65,  47.58,  54.51,
         61.44,  68.37,  75.3 ,  82.23,  89.16,  96.09, 103.02, 109.95,
        116.88, 123.81, 130.74, 137.67, 144.6 , 151.53, 158.46, 165.39,
        172.32, 179.25, 186.18, 193.11, 200.04, 206.97, 213.9 , 220.83,
        227.76, 234.69, 241.62, 248.55, 255.48, 262.41, 269.34, 276.27,
        283.2 , 290.13, 297.06, 303.99, 310.92, 317.85, 324.78, 331.71,
        338.64, 345.57, 352.5 , 359.43, 366.36, 373.29, 380.22, 387.15,
        394.08, 401.01, 407.94, 414.87, 421.8 , 428.73, 435.66, 442.59,
        449.52, 456.45, 463.38, 470.31, 477.24, 484.17, 491.1 , 498.03,
        504.96, 511.89, 518.82, 525.75, 532.68, 539.61, 546.54, 553.47,
        560.4 , 567.33, 574.26, 581.19, 588.12, 595.05, 601.98, 608.91,
        615.84, 622.77, 629.7 , 636.63, 643.56, 650.49, 657.42, 664.35,
        671.28, 678.21, 685.14, 692.07, 699.  ]),
 <a list of 100 Patch objects>)

fore

rev

from modisco.visualization import viz_sequence
#[(560, 560+8), (656, 656+8), (589,589+8), (638, 638+8)]
highlights = [(60, 60+8), (156, 156+8), (89,89+8), (138, 138+8)]
viz_sequence.plot_weights(ob.deeplift_data["scores"][6379, 500:700,:],
                          highlight={"black": highlights},
                          subticks_frequency=10, figsize=(20,1)
                         )

len(query['ppm'])

8

from matlas.matches import DenovoModisco

task_idx = 273
modisco_dir = "{0}/task_{1}-naivegw".format(modisco_root, task_idx)
summit_file = "{0}/{1}/fineFactorized/task_{2}-naivegw/predictions/task_{2}-{3}Summit.calibrated.{1}.tab.gz".format(project_root, data_name, task_idx, "Naive")
perf_file = "{0}/task_{1}-naivegw/NaiveauPRC.txt".format(model_root, task_idx)
deeplift_hdf = "{0}/{1}/Naive_deeplift2019/task_{2}-naivegw/summit.h5".format(srv_root, data_name, task_idx)
ob = DenovoModisco(modisco_dir)
pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, plot=True)

ob.initialize_with_importance_scores(summit_file, perf_file, deeplift_hdf, query.keys()) #loads all the important scores
deeplift_data = ob.deeplift_data

from matlas.utils import get_calibrated_peaks

calibrated_data = get_calibrated_peaks(summit_file, perf_file)
selected_summits = calibrated_data['selected_summits']
indices = calibrated_data['indices']

#from basepair.plot.profiles import extract_signal
from matlas.tracks import extract_signal
pattern_name = 'metacluster_1/pattern_7'

seqlet_scores = ob.contrib_per_pattern[pattern_name]
seqlet_onehot = ob.seqs_per_pattern[pattern_name]

# seqlets_ = ob.seqlets_per_pattern[pattern_name]
# shuffled_onehot = extract_signal(ob.deeplift_data['shuffled_onehot'][indices], seqlets_)
# shuffled_scores = extract_signal(ob.deeplift_data['shuffled_scores'][indices], seqlets_)

# shuffled_onehot.shape, shuffled_scores.shape

from matlas.matches import collect_scoring_measures

method_to_seqlet_scores = collect_scoring_measures(query, seqlet_scores, seqlet_onehot)
print(query.keys(), np.sort(np.array(list(method_to_seqlet_scores.keys()))))


scores = deeplift_data['scores']
one_hot = deeplift_data['one_hot']

method_to_scores = collect_scoring_measures(query, scores, one_hot)

dict_keys(['ppm', 'pwm', 'log10pwm', 'pssm', 'cwm', 'hyp_cwm', 'consensus']) ['cwm_convolution' 'cwm_jaccard' 'cwm_masked_convolution' 'cwm_scan'
 'hyp_cwm_convolution' 'hyp_cwm_jaccard' 'hyp_cwm_masked_convolution'
 'hyp_cwm_scan' 'log10pwm_convolution' 'log10pwm_jaccard'
 'log10pwm_masked_convolution' 'log10pwm_scan' 'ppm_convolution'
 'ppm_jaccard' 'ppm_masked_convolution' 'ppm_scan' 'pssm_convolution'
 'pssm_jaccard' 'pssm_masked_convolution' 'pssm_scan' 'pwm_convolution'
 'pwm_jaccard' 'pwm_masked_convolution' 'pwm_scan' 'sum_score']

from matlas.matches import get_any_high_scoring_instances
per_base_imp=0.0625

keys_for_scoring = ['sum_score'] 
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
        
seqlet_hits = get_any_high_scoring_instances(method_to_seqlet_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)

_hits = get_any_high_scoring_instances(method_to_scores,
                                       min_tot_imp=per_base_imp*len(query['ppm']),
                                       keys=keys_for_scoring)

keys_for_scoring=['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in ['pssm', 'hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_jaccard".format(key) for key in ['pssm', 'hyp_cwm']]
#keys_for_scoring += ["{}_convolution".format(key) for key in ['hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm']]

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def transform_data(_hits, scaler=None, pca=None):
    if scaler is None:
        # should be used only during training
        scaler = StandardScaler()
        scaler.fit(_hits)
        
    # Apply transform to both the training set and testing set
    _hits = scaler.transform(_hits)

    if pca is None:
        # Make an instance of the Model
        pca = PCA(.98)
        #fitting PCA on the training set only.
        pca.fit(_hits)
        
    #Apply the mapping (transform) to both the training set and the test set.
    _hits = pca.transform(_hits)
    
    return scaler, pca, _hits

scaler, pca, train_hits = transform_data(seqlet_hits[keys_for_scoring])
pca.explained_variance_ratio_

array([0.73421992, 0.09118347, 0.07766985, 0.02654744, 0.022959  ,
       0.01230255, 0.00980085, 0.00861208])

pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])

def load_true_positives(tp_file):
    coordinates = []
    weak_coordinates = []
    with open(tp_file, "r") as fp:
        for line in fp:
            line = line.strip().split(",")
            seqno = line[0]
            pos = int(line[1])
            if "-" in seqno:
                seqno = seqno.split("-")
                for i in range(int(seqno[0]), int(seqno[1])+1):
                    coordinates.append((i, pos))
            else:
                if len(line)>2:
                    #if 'weak' in line[2]:
                    weak_coordinates.append((int(seqno), pos))
                    #else:
                        #coordinates.append((int(seqno), pos))
                else:
                    coordinates.append((int(seqno), pos))
    
    
    return {'strong':coordinates, 'weak':weak_coordinates}

def get_tp_indices(seqlet_hits, coordinates, verbose=True):
    keep = OrderedDict()
    for key in coordinates.keys():
        keep[key] = []
        for cord in coordinates[key]:
            seqno, pos = cord
            df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
            if df.shape[0]>0:
                keep[key].append(df.index[0])
            else:
                if verbose: print(cord)

    return keep

tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp_new.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, coordinates, False)
len(true_indices['strong']), len(true_indices['weak']), seqlet_hits.shape#, sum(labels==1)

(910, 11, (13587, 27))

import matplotlib.pyplot as plt
def plot_pca(df, true_ids=None):
    print(df.shape)
    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1) 
    ax.set_xlabel('PC1', fontsize = 15)
    ax.set_ylabel('PC2', fontsize = 15)
    ax.set_title('2 component PCA', fontsize = 20)
    if true_ids is not None:
        strong_ids = true_ids['strong']
        strong_df = df.loc[np.array(strong_ids),:]
        ax.scatter(strong_df['PC1'], strong_df['PC2'], s = 1, c='tomato')
        
        weak_ids = true_ids['weak']
        weak_df = df.loc[np.array(weak_ids),:]
        ax.scatter(weak_df['PC1'], weak_df['PC2'], s = 5, c='darkred')
        
        df = df.loc[np.array(list(set(list(df.index)).difference(
            set(strong_ids).union(set(weak_ids))
        ))),:]
    ax.scatter(df['PC1'], df['PC2'], s = 1, c='slateblue')
    #ax.axvline(9)
    #ax.axhline(-1)
    ax.grid()
    ax.legend(['strong match', 'weak match', 'no match'], loc="upper right")
    return None
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, coordinates, False)
plot_pca(pc_df, true_indices)

(13587, 8)

import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
#import sklearn.mixture.GaussianMixture
gmm = GaussianMixture(n_components=2, covariance_type='full', init_params='kmeans').fit(pc_df[['PC1', 'PC2']])
labels = gmm.predict(pc_df[['PC1', 'PC2']])
print(np.sum(labels==1), np.sum(labels==0))
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=labels, s=1, cmap='seismic');
probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');

920 12667

pcv1 = pca.explained_variance_ratio_[0]
pcv2 = pca.explained_variance_ratio_[2]
pc_maxs = pc_df.max()
affinity = (pc_df['PC1']*pcv1/(pcv1+pcv2)+pc_df['PC2']*pcv2/(pcv1+pcv2))/(pc_maxs['PC1']*pcv1/(pcv1+pcv2)+pc_maxs['PC2']*pcv2/(pcv1+pcv2))
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=affinity, s=1, cmap='seismic');

labels = np.array([0]*pc_df.shape[0])
labels[np.array(true_indices['strong'])]=2
labels[np.array(true_indices['weak'])]=1
pc_df['label'] = labels
pc_df['index'] = pc_df.index
pc_df['seqno'] = seqlet_hits.seqno
pc_df['pos'] = seqlet_hits.start_idx
pc_df['affinity'] = affinity
pc_df[:5]

import plotly.express as px
fig = px.scatter(pc_df, x="PC1", y="PC2", hover_data=['index', 'seqno', 'pos'], color='affinity',
                 color_continuous_scale=px.colors.sequential.Magma
                )
fig

#seqlets pca, seqlet gmm 
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])

_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');

9116 3041038

dlhits[:5]

from matlas.verification import compute_chip_matrix

compute_chip_matrix("MEL", "GATA", out_prefix, execute=True)

computeMatrix reference-point --referencePoint center --skipZeros -S /mnt/lab_data2/msharmin/mouse_chipseq_bigwigs/ENCFF686HPH.bigWig -R /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m17_gmm.bed -a 500 -b 500 -bs 10 -o /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m17_gmm.matrix.gz --outFileSortedRegions /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m17_gmm.matrix.bed

from matlas.verification import plot_chip_heatmap
plot_chip_heatmap("MEL", "GATA", out_prefix, execute=True)

plotHeatmap -m /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m17_gmm.matrix.gz -out /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/MEL-GATA-heatmap.png  --colorMap Blues --xAxisLabel Distance --missingDataColor 1 --zMin 0 --heatmapHeight 28 --heatmapWidth 8 --legendLocation none

from matlas.matches import collect_scoring_measures
seqlet_scores = ob.contrib_per_pattern[pattern_name]
seqlet_onehot = ob.seqs_per_pattern[pattern_name]
method_to_seqlet_scores = collect_scoring_measures(query, seqlet_scores, seqlet_onehot)
query.keys(), np.sort(np.array(list(method_to_seqlet_scores.keys())))

(dict_keys(['ppm', 'pwm', 'log10pwm', 'pssm', 'cwm', 'hyp_cwm', 'consensus']),
 array(['cwm_convolution', 'cwm_jaccard', 'cwm_masked_convolution',
        'cwm_scan', 'hyp_cwm_convolution', 'hyp_cwm_jaccard',
        'hyp_cwm_masked_convolution', 'hyp_cwm_scan',
        'log10pwm_convolution', 'log10pwm_jaccard',
        'log10pwm_masked_convolution', 'log10pwm_scan', 'ppm_convolution',
        'ppm_jaccard', 'ppm_masked_convolution', 'ppm_scan',
        'pssm_convolution', 'pssm_jaccard', 'pssm_masked_convolution',
        'pssm_scan', 'pwm_convolution', 'pwm_jaccard',
        'pwm_masked_convolution', 'pwm_scan', 'sum_score'], dtype='<U27'))

method_to_shuffled_scores = collect_scoring_measures(query, shuffled_scores, shuffled_onehot)
method_to_seq_null_scores = collect_scoring_measures(query, seq_null_scores, seq_null_onehot)
method_to_weight_null_scores = collect_scoring_measures(query, weight_null_scores, weight_null_onehot)
method_to_weightedseq_null_scores = collect_scoring_measures(query, weightedseq_null_scores, weightedseq_null_onehot)

from matlas.matches import get_any_high_scoring_instances
per_base_imp=0.0625

keys_for_scoring = ['sum_score'] 
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
        
seqlet_hits = get_any_high_scoring_instances(method_to_seqlet_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)

def get_background_instances(fg, method_to_null_scores, keys=None):
    # given the fg coords, get the df of bg
    # appy scaling and pca as in fg, then apply log-likelihood
    if keys is None:
        keys = list(method_to_scores.keys())
    sum_scores = method_to_null_scores['sum_score']
    matches = [(sum_scores[seqno, pos], seqno, pos) for seqno, pos in zip(fg['seqno'].values, fg['start_idx'].values)]
    match_df = pd.DataFrame(np.array(matches))
    match_df.columns = ['sum_score', 'seqno', 'start_idx']
    match_df.seqno = match_df.seqno.values.astype(int)
    match_df.start_idx = match_df.start_idx.values.astype(int)
    for key in keys:
        if(key=='sum_score'):
            continue
        match_df[key] = [method_to_null_scores[key][idx1, idx2] for 
                         (idx1, idx2) in zip(fg['seqno'].values, fg['start_idx'].values)]
    
    return match_df

shuffled_paired_hits = get_background_instances(seqlet_hits, method_to_shuffled_scores, keys=keys_for_scoring)
seq_null_paired_hits = get_background_instances(seqlet_hits, method_to_seq_null_scores, keys=keys_for_scoring)
weight_null_paired_hits = get_background_instances(seqlet_hits, method_to_weight_null_scores, keys=keys_for_scoring)
weightedseq_null_paired_hits = get_background_instances(seqlet_hits, method_to_weightedseq_null_scores, keys=keys_for_scoring)
seqlet_hits.shape, shuffled_paired_hits.shape, seq_null_paired_hits.shape, weight_null_paired_hits.shape, weightedseq_null_paired_hits.shape

((13587, 27), (13587, 27), (13587, 27), (13587, 27), (13587, 27))

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def transform_data(_hits, scaler=None, pca=None):
    if scaler is None:
        # should be used only during training
        scaler = StandardScaler()
        scaler.fit(_hits)
        
    # Apply transform to both the training set and testing set
    _hits = scaler.transform(_hits)

    if pca is None:
        # Make an instance of the Model
        pca = PCA(.98)
        #fitting PCA on the training set only.
        pca.fit(_hits)
        
    #Apply the mapping (transform) to both the training set and the test set.
    _hits = pca.transform(_hits)
    
    return scaler, pca, _hits

scaler, pca, train_hits = transform_data(seqlet_hits[keys_for_scoring])
pca.explained_variance_ratio_

array([0.73421992, 0.09118347, 0.07766985, 0.02654744, 0.022959  ,
       0.01230255, 0.00980085, 0.00861208])

shuffled_seqlet_hits = get_any_high_scoring_instances(method_to_shuffled_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
seq_null_seqlet_hits = get_any_high_scoring_instances(method_to_seq_null_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
weight_null_seqlet_hits = get_any_high_scoring_instances(method_to_weight_null_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
weightedseq_null_seqlet_hits = get_any_high_scoring_instances(method_to_weightedseq_null_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)

seqlet_hits['marker'] = 'foreground'
shuffled_seqlet_hits['marker'] = 'suffled'
seq_null_seqlet_hits['marker'] = 'seq_null'
weight_null_seqlet_hits['marker'] = 'weight_null'
weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
test_hits = pd.concat([seqlet_hits,
                       shuffled_seqlet_hits,
                       seq_null_seqlet_hits,
                       weight_null_seqlet_hits,
                       weightedseq_null_seqlet_hits
                      ])

/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/ipykernel_launcher.py:10: FutureWarning:

Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

#read the real truth
def load_true_positives(tp_file):
    coordinates = []
    with open(tp_file, "r") as fp:
        for line in fp:
            line = line.strip().split(",")
            seqno = line[0]
            pos = int(line[1])
            if "-" in seqno:
                seqno = seqno.split("-")
                for i in range(int(seqno[0]), int(seqno[1])+1):
                    coordinates.append((i, pos))
            else:
                coordinates.append((int(seqno), pos))
    return coordinates
def get_tp_indices(seqlet_hits, verbose=True):
    keep = []
    for cord in coordinates:
        seqno, pos = cord
        df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
        if df.shape[0]>0:
            keep.append(df.index[0])
        else:
            if verbose: print(cord)

    return keep

tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, False)

import matplotlib.pyplot as plt
def plot_pca(df, true_ids=None):
    print(df.shape)
    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1) 
    ax.set_xlabel('PC1', fontsize = 15)
    ax.set_ylabel('PC2', fontsize = 15)
    ax.set_title('2 component PCA', fontsize = 20)
    if true_ids is not None:
        true_df = df.loc[np.array(true_ids),:]
        df = df.loc[np.array(list(set(list(df.index)).difference(set(true_ids)))),:]
        ax.scatter(true_df['PC1'], true_df['PC2'], s = 20, c='r', marker='+')
    ax.scatter(df['PC1'], df['PC2'], s = 1, c='b')
    #ax.axvline(9)
    #ax.axhline(-1)
    ax.grid()
    return None
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
plot_pca(pc_df, true_indices)

(13587, 8)

from sklearn.mixture import GaussianMixture
#import sklearn.mixture.GaussianMixture
gmm = GaussianMixture(n_components=2).fit(pc_df[['PC1', 'PC2']])
labels = gmm.predict(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=labels, s=1, cmap='seismic');

probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');

conditions = np.copy(labels)
conditions[np.logical_and(pc_df['PC1'].values<10, pc_df['PC2'].values<-0.5)] = 0
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=conditions, s=1, cmap='seismic');

# selected = np.array(list(set(list(np.argwhere(conditions==0).T[0])).difference(set(list(np.argwhere(labels==0).T[0])))))
# pc_df.loc[selected,:]
# seqlet_hits.loc[selected,:]
# probs[:,1]

from matlas.render_report import jupyter_nbconvert
import os
def render_test_plots(nbname="global_report"):
    rendered_ipynb = "/mnt/lab_data/kundaje/msharmin/mouse_hem/code/mouse_hem/{}.ipynb".format(nbname)
    jupyter_nbconvert(rendered_ipynb)

    os.system("cp /mnt/lab_data/kundaje/msharmin/mouse_hem/code/mouse_hem/{}.html /srv/www/kundaje/msharmin/report/global_htmls/".format(nbname))
    os.system("chmod +755 /srv/www/kundaje/msharmin/mouse_hem/report/global_htmls/{}.html".format(nbname))

    return None

render_test_plots('testSpi1Scan')

_, _, test_hits = transform_data(shuffled_seqlet_hits[keys_for_scoring], scaler, pca)
shuffled_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
#plot_pca(shuffled_pc_df, true_ids=None)

probs = gmm.predict_proba(shuffled_pc_df[['PC1', 'PC2']])
plt.scatter(shuffled_pc_df['PC1'], shuffled_pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');

marker = np.array(['F']*llr_df.shape[0])
marker[np.array(true_indices)] = 'TP'
llr_df['marker'] = marker

#dimension reduction
len(keys_for_scoring)
keys_for_scoring = ['sum_score'] 
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm', 'log10pwm', 'pwm']]

import umap
trans = umap.UMAP(n_neighbors=5, random_state=42).fit(seqlet_hits[keys_for_scoring])
print(trans.embedding_.shape)
plt.scatter(trans.embedding_[:, 0], trans.embedding_[:, 1], s=1, cmap='Spectral')

/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/umap/spectral.py:229: UserWarning:

Embedding a total of 3 separate connected components using meta-embedding (experimental)

(13587, 2)

<matplotlib.collections.PathCollection at 0x7f5f8aadbba8>

test_embedding = trans.transform(seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')

(13587, 2)

<matplotlib.collections.PathCollection at 0x7f5f8ab2ed30>

test_embedding = trans.transform(shuffled_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')

(2846, 2)

<matplotlib.collections.PathCollection at 0x7f5f89ce7358>

test_embedding = trans.transform(seq_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')

(13587, 2)

<matplotlib.collections.PathCollection at 0x7f5f8a2b3048>

test_embedding = trans.transform(weight_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')

(12928, 2)

<matplotlib.collections.PathCollection at 0x7f5f8b1cc240>

test_embedding = trans.transform(weightedseq_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')

(13422, 2)

<matplotlib.collections.PathCollection at 0x7f5f8a8647b8>

type(test_embedding)

numpy.ndarray

umap_seqlet_hits = pd.DataFrame(trans.transform(seqlet_hits[keys_for_scoring]))
umap_shuffled_seqlet_hits = pd.DataFrame(trans.transform(shuffled_seqlet_hits[keys_for_scoring]))
umap_seq_null_seqlet_hits = pd.DataFrame(trans.transform(seq_null_seqlet_hits[keys_for_scoring]))
umap_weight_null_seqlet_hits = pd.DataFrame(trans.transform(weight_null_seqlet_hits[keys_for_scoring]))
umap_weightedseq_null_seqlet_hits = pd.DataFrame(trans.transform(weightedseq_null_seqlet_hits[keys_for_scoring]))
umap_seqlet_hits['marker'] = 'foreground'
umap_shuffled_seqlet_hits['marker'] = 'suffled'
umap_seq_null_seqlet_hits['marker'] = 'seq_null'
umap_weight_null_seqlet_hits['marker'] = 'weight_null'
umap_weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
umap_test_hits = pd.concat([umap_seqlet_hits,
                       umap_shuffled_seqlet_hits,
                       umap_seq_null_seqlet_hits,
                       umap_weight_null_seqlet_hits,
                       umap_weightedseq_null_seqlet_hits
                      ])

sns.pairplot(umap_test_hits, 
             diag_kind="kde", kind='scatter', hue="marker")

/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

<seaborn.axisgrid.PairGrid at 0x7f5f8a689a58>

PPM in forground vs. various null scores¶

from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})

plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})

/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})

plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})

plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})

PWM in forground vs. various null scores¶

from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})

plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})

plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})

plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})

plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})

log10pwm in forground vs. various null scores¶

from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})

plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})

from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})

from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})

from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})

PSSM in forground vs. various null scores¶

from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})

plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})

plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})

plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})

plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})

cwm in forground vs. various null scores¶

from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})

plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})

plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})

plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})

plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})

hyp_cwm in forground vs. various null scores¶

from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})

plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})

plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})

plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})

plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})

len(keys_for_scoring)
keys_for_scoring = ['sum_score'] 
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
#keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm', 'log10pwm', 'pwm']]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(seqlet_hits[keys_for_scoring])
# Apply transform to both the training set and the test set.
train_hits = scaler.transform(seqlet_hits[keys_for_scoring])

from sklearn.decomposition import PCA
# Make an instance of the Model
pca = PCA(.98)
#fitting PCA on the training set only.
pca.fit(train_hits)
#Apply the mapping (transform) to both the training set and the test set.
train_hits = pca.transform(train_hits)
#test_hits = pca.transform(test_hits)

#seqlet_hits[seqlet_hits['seqno']==684]
pca.explained_variance_ratio_

array([0.73421992, 0.09118347, 0.07766985, 0.02654744, 0.022959  ,
       0.01230255, 0.00980085, 0.00861208])

#read the real truth
def load_true_positives(tp_file):
    coordinates = []
    with open(tp_file, "r") as fp:
        for line in fp:
            line = line.strip().split(",")
            seqno = line[0]
            pos = int(line[1])
            if "-" in seqno:
                seqno = seqno.split("-")
                for i in range(int(seqno[0]), int(seqno[1])+1):
                    coordinates.append((i, pos))
            else:
                coordinates.append((int(seqno), pos))
    return coordinates
def get_tp_indices(seqlet_hits):
    keep = []
    for cord in coordinates:
        seqno, pos = cord
        df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
        if df.shape[0]>0:
            keep.append(df.index[0])
        else:
            print(cord)

    return keep

tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits)

(684, 49)

def plot_pca(df, true_ids=None):
    print(df.shape)
    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1) 
    ax.set_xlabel('PC1', fontsize = 15)
    ax.set_ylabel('PC2', fontsize = 15)
    ax.set_title('2 component PCA', fontsize = 20)
    if true_ids is not None:
        true_df = df.loc[np.array(true_ids),:]
        df = df.loc[np.array(list(set(list(df.index)).difference(set(true_ids)))),:]
        ax.scatter(true_df['PC1'], true_df['PC2'], s = 20, c='r', marker='+')
    ax.scatter(df['PC1'], df['PC2'], s = 1, c='b')
    #ax.axvline(9)
    #ax.axhline(-1)
    ax.grid()
    return None
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
plot_pca(pc_df, )

(13587, 8)

#test_df = pc_df.loc[np.array(list(set(list(pc_df.index)).difference(set(true_indices)))),:]
#test_df = test_df.loc[(test_df['PC1']>9) & (test_df['PC2']<0),:]
#seqlet_hits.loc[test_df.index, :]
seqlet_hits[seqlet_hits['seqno']==724]

test_hits = scaler.transform(shuffled_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
shuffled_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(shuffled_pc_df, true_ids=None)

(2846, 4)

test_hits = scaler.transform(seq_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
seq_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(seq_null_pc_df)

(13587, 4)

test_hits = scaler.transform(weight_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
weight_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(weight_null_pc_df)

(13714, 4)

test_hits = scaler.transform(weightedseq_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
weightedseq_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(weightedseq_null_pc_df)

(12597, 4)

pc_df[:5]

pca_seqlet_hits = pc_df[['PC1', 'PC2', 'PC3']]
pca_shuffled_seqlet_hits = shuffled_pc_df[['PC1', 'PC2', 'PC3']]
pca_seq_null_seqlet_hits = seq_null_pc_df[['PC1', 'PC2', 'PC3']]
pca_weight_null_seqlet_hits = weight_null_pc_df[['PC1', 'PC2', 'PC3']]
pca_weightedseq_null_seqlet_hits = weightedseq_null_pc_df[['PC1', 'PC2', 'PC3']]
pca_seqlet_hits['marker'] = 'foreground'
pca_shuffled_seqlet_hits['marker'] = 'shuffled'
pca_seq_null_seqlet_hits['marker'] = 'seq_null'
pca_weight_null_seqlet_hits['marker'] = 'weight_null'
pca_weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
pca_test_hits = pd.concat([pca_seqlet_hits,
                       pca_shuffled_seqlet_hits,
                       pca_seq_null_seqlet_hits,
                       pca_weight_null_seqlet_hits,
                       pca_weightedseq_null_seqlet_hits
                      ])

import seaborn as sns
sns.pairplot(pd.concat([pca_seqlet_hits, pca_shuffled_seqlet_hits]), 
             diag_kind="kde", kind='scatter', hue="marker")

/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

<seaborn.axisgrid.PairGrid at 0x7f229c385630>

sns.pairplot(pd.concat([pca_seqlet_hits, pca_seq_null_seqlet_hits]), 
             diag_kind="kde", kind='scatter', hue="marker")

<seaborn.axisgrid.PairGrid at 0x7f22a5fa9470>

sns.pairplot(pd.concat([pca_seqlet_hits, pca_weight_null_seqlet_hits]), 
             diag_kind="kde", kind='scatter', hue="marker")

<seaborn.axisgrid.PairGrid at 0x7f180f5002e8>

sns.pairplot(pd.concat([pca_seqlet_hits, pca_weightedseq_null_seqlet_hits]), 
             diag_kind="kde", kind='scatter', hue="marker")

<seaborn.axisgrid.PairGrid at 0x7f229d74c9e8>

import seaborn as sns
sns.pairplot(test_hits[['sum_score', 
                   'weight_convolution', 'pfm_convolution', 'pwm_convolution']], 
             diag_kind="kde", kind='scatter')

/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

<seaborn.axisgrid.PairGrid at 0x7f0ba03f0278>

sns.pairplot(test_hits[['sum_score', 
                   'weight_scan', 'pfm_scan', 'pwm_scan']], 
             diag_kind="kde", kind='scatter')

/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

<seaborn.axisgrid.PairGrid at 0x7f0b9c4b1128>

sns.pairplot(test_hits[['sum_score', 
                   'weight_jaccard', 'pfm_jaccard', 'pwm_jaccard']], 
             diag_kind="kde", kind='scatter')

/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

<seaborn.axisgrid.PairGrid at 0x7f0b946eca20>

sns.pairplot(test_hits[['sum_score', 
                   'weight_masked_convolution', 'pfm_masked_convolution', 'pwm_masked_convolution']], 
             diag_kind="kde", kind='scatter')

/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

<seaborn.axisgrid.PairGrid at 0x7f0b944dc400>

from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(test_hits, metrics = {0:["pwm_convolution", "pfm_convolution"], 
                                            1:["pwm_convolution", "weight_convolution"], 
                                            2:["pwm_convolution", "pwm_scan"], 
                                            3:["pwm_convolution", "pwm_jaccard"]})

/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

plots_seqlet_measures(test_hits, metrics = {0:["pwm_convolution", "pfm_jaccard"], 
                                            1:["pwm_convolution", "weight_jaccard"], 
                                            2:["pwm_convolution", "pwm_masked_convolution"], 
                                            3:["pwm_convolution", "pfm_masked_convolution"]})

/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

	0	1	2	3	4
0	0.680397	0.146271	0.105012	0.031013	0.019098
1	0.695268	0.142840	0.088085	0.034545	0.018544
2	0.697211	0.146268	0.086186	0.035164	0.018235
3	0.673356	0.144904	0.103110	0.041832	0.021371
4	0.685658	0.145955	0.094533	0.031906	0.022454
5	0.663303	0.147897	0.099756	0.045577	0.021188
6	0.711601	0.144336	0.078308	0.032313	0.016996
7	0.673456	0.146031	0.094547	0.042293	0.020838
8	0.656520	0.145385	0.123580	0.031199	0.022544
9	0.601955	0.151064	0.135308	0.055252	0.027652
10	0.667463	0.147296	0.104211	0.039380	0.019349
11	0.659383	0.143655	0.104919	0.046710	0.022555
12	0.700259	0.146478	0.080744	0.034838	0.017609
13	0.684291	0.146765	0.091768	0.038186	0.020123
14	0.698584	0.146546	0.086478	0.035270	0.018308
15	0.670124	0.146916	0.101797	0.040485	0.020208
16	0.693542	0.146399	0.087641	0.034317	0.019797
17	0.568312	0.150927	0.129471	0.071336	0.046031
18	0.574736	0.154696	0.138851	0.069306	0.033694
19	0.615117	0.150938	0.133129	0.045515	0.029377

	sum_score	start_idx	pssm_scan	hyp_cwm_scan	cwm_scan	pssm_jaccard	hyp_cwm_jaccard	hyp_cwm_masked_convolution	PC1	PC2	PC3	PC4	probs_0	probs_1	affinity
0	0.506014	457	4.630333	0.008418	0.576000	0.186843	-0.018382	-0.041447	0.186457	-1.008803	0.342027	-0.493843	1.0	1.081909e-75	-9.21034
1	0.692963	459	3.730682	0.039855	0.347557	0.139370	0.049612	0.324870	0.412132	-0.876732	-0.827821	0.223457	1.0	2.458299e-71	-9.21034
2	0.828139	460	2.220063	-0.238699	0.186014	0.085936	-0.015740	-0.321238	-1.427511	-0.329545	-0.174973	-0.542189	1.0	7.645040e-119	-9.21034
3	1.070684	461	2.534312	0.012432	0.432785	0.008754	-0.081084	-0.290332	-1.291234	0.012149	0.719207	0.270713	1.0	5.294447e-117	-9.21034
4	1.108170	462	0.376822	-0.106884	0.025820	0.016013	-0.034818	-0.648415	-2.352710	0.247801	0.060116	-0.541629	1.0	5.358193e-149	-9.21034

	sum_score	seqno	start_idx	pssm_scan	hyp_cwm_scan	cwm_scan	pssm_jaccard	hyp_cwm_jaccard	hyp_cwm_masked_convolution	PC1	PC2	PC3	PC4	labels	probs_0	probs_1	affinity
7087	0.719578	354	469	10.566150	1.108264	1.461974	0.545713	0.296834	0.980013	6.972319	-1.177662	-0.288694	-0.445434	1	1.754782e-06	0.999998	9.192943
8869	1.916423	454	492	10.946750	1.190199	1.507266	0.659304	0.281202	0.973364	7.552993	1.023540	-0.356206	-0.547166	1	7.413490e-09	1.000000	9.210266
16789	1.362010	904	514	10.263988	1.124887	1.414121	0.612824	0.266297	0.989716	7.006805	0.021724	-0.461967	-0.491541	1	3.295677e-07	1.000000	9.207050
16817	0.596663	907	492	8.788265	0.909073	1.277441	0.552744	0.238311	0.897645	5.879004	-1.290189	-0.611978	-0.669720	1	6.776545e-03	0.993223	4.972840
19983	1.162510	1099	820	10.946750	1.190199	1.507266	0.660087	0.286183	0.969329	7.477639	-0.319253	-0.308069	-0.807862	1	9.786088e-08	1.000000	9.209362

	sum_score	seqno	start_idx	pssm_scan	hyp_cwm_scan	cwm_scan	pssm_jaccard	hyp_cwm_jaccard	hyp_cwm_masked_convolution	PC1	PC2	PC3	PC4	labels	probs_0	probs_1	affinity
7094	0.794498	354	476	10.853761	1.095423	1.488945	0.546977	0.272352	0.900610	6.848125	-0.998914	-0.000308	-0.508452	1	2.396958e-06	0.999998	9.186651
8862	0.501731	454	439	10.659139	1.203041	1.480295	0.541482	0.269257	0.889027	6.841486	-1.535023	0.095035	-0.529422	1	4.906370e-06	0.999995	9.162437
16773	0.496409	904	299	10.349956	1.032806	1.417799	0.625997	0.255205	0.968811	6.772597	-1.482428	-0.379742	-0.893444	1	6.214006e-06	0.999994	9.150048
16824	0.657944	907	528	10.853761	1.095423	1.488945	0.562632	0.236953	0.934201	6.729680	-1.227934	0.081656	-0.505726	1	5.750581e-06	0.999994	9.154422
19973	0.752838	1099	775	10.893723	1.151066	1.494347	0.729961	0.254498	0.932956	7.351972	-0.965525	-0.274610	-1.299071	1	4.350850e-07	1.000000	9.205999

	sum_score	seqno	start_idx	pssm_scan	hyp_cwm_scan	cwm_scan	pssm_jaccard	hyp_cwm_jaccard	hyp_cwm_masked_convolution	PC1	PC2	PC3	PC4	labels	probs_0	probs_1	affinity
113935	0.724063	6379	560	10.71255	1.190025	1.48021	0.521462	0.249506	0.930102	6.754057	-1.156876	0.139244	-0.263930	1	0.000005	0.999995	9.164708
113972	0.539283	6379	656	10.71255	1.190025	1.48021	0.664372	0.274725	0.964736	7.275955	-1.417457	-0.283314	-1.016604	1	0.000001	0.999999	9.198278

	sum_score	seqno	start_idx	pssm_scan	hyp_cwm_scan	cwm_scan	pssm_jaccard	hyp_cwm_jaccard	hyp_cwm_masked_convolution	PC1	PC2	PC3	PC4	labels	probs_0	probs_1	affinity
113948	1.583670	6379	589	11.000161	1.177183	1.507181	0.752419	0.282170	0.965384	7.752105	0.497587	-0.502023	-1.130626	1	1.226194e-08	1.000000	9.210218
113969	0.563157	6379	638	8.484707	0.967039	1.257896	0.513487	0.225346	0.929413	5.725680	-1.391008	-0.562599	-0.383389	1	3.737137e-02	0.962629	3.246094

	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	label	index	pos	affinity
0	-1.757138	-1.105117	-1.560638	0.357107	0.277466	0.376950	-0.193923	0.091057	0	0	16	-0.089076
1	2.880820	-0.113733	-0.338334	0.840248	0.175452	0.578696	0.093965	-0.772116	0	1	17	0.136358
2	0.438450	2.069480	-0.599759	0.185653	-0.684141	0.460374	-0.527480	-0.043134	0	2	18	0.031246
3	-3.361249	4.008285	0.283835	-0.326203	-0.736401	0.243396	-0.191256	0.144796	0	3	19	-0.139611
4	16.268981	3.032388	0.694093	1.201489	2.725356	0.261409	-0.523800	0.170852	2	4	20	0.788538

	0	1	2	3	4	5	6	7	8	9
0	chr1	3400299	3400307	M1_P7/tcTTATCt	0.999995	.	0.999995	-1	0.999995	-1
1	chr1	3451802	3451810	M1_P7/tcTTATCt	0.994065	.	0.994065	-1	0.994065	-1
2	chr1	6223145	6223153	M1_P7/tcTTATCt	0.969597	.	0.969597	-1	0.969597	-1
3	chr1	8134664	8134672	M1_P7/tcTTATCt	0.999999	.	0.999999	-1	0.999999	-1
4	chr1	8134664	8134672	M1_P7/tcTTATCt	0.999999	.	0.999999	-1	0.999999	-1

	sum_score	seqno	start_idx	ppm_scan	pwm_scan	log10pwm_scan	pssm_scan	cwm_scan	hyp_cwm_scan	ppm_jaccard	...	pssm_convolution	cwm_convolution	hyp_cwm_convolution	ppm_masked_convolution	pwm_masked_convolution	log10pwm_masked_convolution	pssm_masked_convolution	cwm_masked_convolution	hyp_cwm_masked_convolution	marker
12013	0.510935	724	16	2.173273	-16.685898	-11.565783	2.807995	0.271431	-0.129788	0.093749	...	0.155122	0.014721	-0.012018	0.409525	-0.668292	-0.668292	0.344445	0.320698	-0.240834	foreground
12014	0.726112	724	17	2.955832	-9.491440	-6.578965	4.307728	0.489950	0.071016	0.157605	...	0.343455	0.040122	0.003713	0.496489	-0.393983	-0.393983	0.428561	0.400553	0.038883	foreground
12015	0.918835	724	18	1.895810	-16.006632	-11.094952	2.776762	0.348214	-0.168935	0.129040	...	0.351885	0.042564	-0.020289	0.580346	-0.552284	-0.552284	0.457278	0.387848	-0.180441	foreground
12016	1.216095	724	19	2.241223	-16.770756	-11.624602	2.795168	0.296356	-0.195357	0.087665	...	0.231063	0.023784	-0.075097	0.231076	-0.803851	-0.803851	0.216105	0.210806	-0.522732	foreground
12017	1.241818	724	20	6.648924	12.798364	8.871150	10.946750	1.507266	1.190199	0.566732	...	2.063255	0.314700	0.245513	0.901531	0.905861	0.905861	0.904396	0.937620	0.938175	foreground
12018	1.150841	724	21	1.979615	-17.255894	-11.960874	2.454263	0.213059	-0.089824	0.056313	...	0.174882	0.015555	-0.031489	0.201595	-0.785020	-0.785020	0.188620	0.192204	-0.275004	foreground
12019	1.059108	724	22	2.545866	-14.645376	-10.151401	4.199507	0.512827	0.098539	0.146221	...	0.573835	0.064573	-0.016732	0.439639	-0.466749	-0.466749	0.405790	0.358829	-0.106877	foreground
12020	1.016291	724	23	2.822197	-11.335154	-7.856930	4.201037	0.479085	-0.014662	0.111055	...	0.314115	0.021815	-0.046603	0.319340	-0.526418	-0.526418	0.240512	0.133503	-0.285399	foreground
12021	0.894742	724	24	1.146093	-22.607031	-15.670000	2.113748	0.202143	-0.273157	0.044927	...	0.174305	0.013882	-0.017559	0.214520	-0.388620	-0.388620	0.181476	0.139714	-0.138538	foreground
12022	0.670116	724	25	3.510759	-7.332713	-5.082649	4.744608	0.467600	0.199800	0.104182	...	0.289349	0.032979	0.030203	0.540020	0.121739	0.121739	0.247456	0.286472	0.288089	foreground
12023	0.503941	724	26	2.680634	-11.520928	-7.985699	4.159111	0.579430	0.235952	0.061523	...	0.092858	0.020784	0.024588	0.264452	-0.003100	-0.003100	0.091842	0.128490	0.168628	foreground
12024	0.560634	724	61	2.759909	-11.651375	-8.076117	4.703845	0.547412	0.042536	0.138787	...	0.360025	0.044303	0.006862	0.609668	-0.272037	-0.272037	0.491439	0.481183	0.083735	foreground
12025	0.712192	724	62	2.736127	-15.611526	-10.821085	4.265700	0.423533	-0.155905	0.127224	...	0.268559	0.028056	-0.034304	0.344609	-0.707176	-0.707176	0.337684	0.357184	-0.377643	foreground