In [1]:
import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
import pickle
import h5py
import glob

from matlas.genome_data import *
from matlas.pwms import shorten_tfname, get_motifDB_id_maps
from matlas.matches import fig2vdom, vdom_pssm
from matlas.modisco_results import ModiscoResult
from vdom.helpers import (h1, p, li, img, div, b, br, ul, a,
                          details, summary,
                          table, thead, th, tr, tbody, td, ol)
from IPython.display import display


project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
modisco_root = "{0}/{1}/Naive_modisco2019".format(srv_root, data_name)

homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)
basset_root = "{0}/{1}/Naive_basset_motifs".format(srv_root, data_name)
reportfile= "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"
load data from labcluster
Using TensorFlow backend.
TF-MoDISco is using the TensorFlow backend.
In [2]:
from matlas.matches import DenovoModisco

task_idx = 273
modisco_dir = "{0}/task_{1}-naivegw".format(modisco_root, task_idx)
summit_file = "{0}/{1}/fineFactorized/task_{2}-naivegw/predictions/task_{2}-{3}Summit.calibrated.{1}.tab.gz".format(project_root, data_name, task_idx, "Naive")
perf_file = "{0}/task_{1}-naivegw/NaiveauPRC.txt".format(model_root, task_idx)
deeplift_hdf = "{0}/{1}/Naive_deeplift2019/task_{2}-naivegw/summit.h5".format(srv_root, data_name, task_idx)
ob = DenovoModisco(modisco_dir)
pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, plot=True)
ppmchomped ppmchomped pwmchomped log10_pwmchomped pssmchomped cwmchomped hyp_cwmconsensustcTTATCt
In [3]:
ob.initialize_with_importance_scores(summit_file, perf_file, deeplift_hdf, query.keys()) #loads all the important scores
deeplift_data = ob.deeplift_data
In [4]:
from matlas.utils import get_calibrated_peaks

calibrated_data = get_calibrated_peaks(summit_file, perf_file)
selected_summits = calibrated_data['selected_summits']
indices = calibrated_data['indices']

#from basepair.plot.profiles import extract_signal
from matlas.tracks import extract_signal
pattern_name = 'metacluster_1/pattern_7'

seqlet_scores = ob.contrib_per_pattern[pattern_name]
seqlet_onehot = ob.seqs_per_pattern[pattern_name]

seqlets_ = ob.seqlets_per_pattern[pattern_name]
shuffled_onehot = extract_signal(ob.deeplift_data['shuffled_onehot'][indices], seqlets_)
shuffled_scores = extract_signal(ob.deeplift_data['shuffled_scores'][indices], seqlets_)

shuffled_onehot.shape, shuffled_scores.shape
Out[4]:
((883, 70, 4), (883, 70, 4))
In [5]:
from matlas.matches import collect_scoring_measures

method_to_seqlet_scores = collect_scoring_measures(query, seqlet_scores, seqlet_onehot)
print(query.keys(), np.sort(np.array(list(method_to_seqlet_scores.keys()))))


scores = deeplift_data['scores']
one_hot = deeplift_data['one_hot']

method_to_scores = collect_scoring_measures(query, scores, one_hot)
dict_keys(['ppm', 'pwm', 'log10pwm', 'pssm', 'cwm', 'hyp_cwm', 'consensus']) ['cwm_convolution' 'cwm_jaccard' 'cwm_masked_convolution' 'cwm_scan'
 'hyp_cwm_convolution' 'hyp_cwm_jaccard' 'hyp_cwm_masked_convolution'
 'hyp_cwm_scan' 'log10pwm_convolution' 'log10pwm_jaccard'
 'log10pwm_masked_convolution' 'log10pwm_scan' 'ppm_convolution'
 'ppm_jaccard' 'ppm_masked_convolution' 'ppm_scan' 'pssm_convolution'
 'pssm_jaccard' 'pssm_masked_convolution' 'pssm_scan' 'pwm_convolution'
 'pwm_jaccard' 'pwm_masked_convolution' 'pwm_scan' 'sum_score']
In [6]:
from matlas.matches import get_any_high_scoring_instances
per_base_imp=0.0625

keys_for_scoring = ['sum_score'] 
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
        
seqlet_hits = get_any_high_scoring_instances(method_to_seqlet_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)

_hits = get_any_high_scoring_instances(method_to_scores,
                                       min_tot_imp=per_base_imp*len(query['ppm']),
                                       keys=keys_for_scoring)
In [11]:
keys_for_scoring=['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in ['pssm', 'hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_jaccard".format(key) for key in ['pssm', 'hyp_cwm']]
#keys_for_scoring += ["{}_convolution".format(key) for key in ['hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm']]
In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def transform_data(_hits, scaler=None, pca=None):
    if scaler is None:
        # should be used only during training
        scaler = StandardScaler()
        scaler.fit(_hits)
        
    # Apply transform to both the training set and testing set
    _hits = scaler.transform(_hits)

    if pca is None:
        # Make an instance of the Model
        pca = PCA(.98)
        #fitting PCA on the training set only.
        pca.fit(_hits)
        
    #Apply the mapping (transform) to both the training set and the test set.
    _hits = pca.transform(_hits)
    
    return scaler, pca, _hits

scaler, pca, train_hits = transform_data(seqlet_hits[keys_for_scoring])
pca.explained_variance_ratio_
Out[7]:
array([0.73421992, 0.09118347, 0.07766985, 0.02654744, 0.022959  ,
       0.01230255, 0.00980085, 0.00861208])
In [8]:
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
In [9]:
def load_true_positives(tp_file):
    coordinates = []
    weak_coordinates = []
    with open(tp_file, "r") as fp:
        for line in fp:
            line = line.strip().split(",")
            seqno = line[0]
            pos = int(line[1])
            if "-" in seqno:
                seqno = seqno.split("-")
                for i in range(int(seqno[0]), int(seqno[1])+1):
                    coordinates.append((i, pos))
            else:
                if len(line)>2:
                    #if 'weak' in line[2]:
                    weak_coordinates.append((int(seqno), pos))
                    #else:
                        #coordinates.append((int(seqno), pos))
                else:
                    coordinates.append((int(seqno), pos))
    
    
    return {'strong':coordinates, 'weak':weak_coordinates}

def get_tp_indices(seqlet_hits, coordinates, verbose=True):
    keep = OrderedDict()
    for key in coordinates.keys():
        keep[key] = []
        for cord in coordinates[key]:
            seqno, pos = cord
            df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
            if df.shape[0]>0:
                keep[key].append(df.index[0])
            else:
                if verbose: print(cord)

    return keep

tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp_new.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, coordinates, False)
len(true_indices['strong']), len(true_indices['weak']), seqlet_hits.shape#, sum(labels==1)
Out[9]:
(910, 11, (13587, 27))
In [10]:
import matplotlib.pyplot as plt
def plot_pca(df, true_ids=None):
    print(df.shape)
    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1) 
    ax.set_xlabel('PC1', fontsize = 15)
    ax.set_ylabel('PC2', fontsize = 15)
    ax.set_title('2 component PCA', fontsize = 20)
    if true_ids is not None:
        strong_ids = true_ids['strong']
        strong_df = df.loc[np.array(strong_ids),:]
        ax.scatter(strong_df['PC1'], strong_df['PC2'], s = 1, c='tomato')
        
        weak_ids = true_ids['weak']
        weak_df = df.loc[np.array(weak_ids),:]
        ax.scatter(weak_df['PC1'], weak_df['PC2'], s = 5, c='darkred')
        
        df = df.loc[np.array(list(set(list(df.index)).difference(
            set(strong_ids).union(set(weak_ids))
        ))),:]
    ax.scatter(df['PC1'], df['PC2'], s = 1, c='slateblue')
    #ax.axvline(9)
    #ax.axhline(-1)
    ax.grid()
    ax.legend(['strong match', 'weak match', 'no match'], loc="upper right")
    return None
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, coordinates, False)
plot_pca(pc_df, true_indices)
(13587, 8)
In [17]:
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
#import sklearn.mixture.GaussianMixture
gmm = GaussianMixture(n_components=2, covariance_type='full', init_params='kmeans').fit(pc_df[['PC1', 'PC2']])
labels = gmm.predict(pc_df[['PC1', 'PC2']])
print(np.sum(labels==1), np.sum(labels==0))
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=labels, s=1, cmap='seismic');
probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
920 12667
In [12]:
pcv1 = pca.explained_variance_ratio_[0]
pcv2 = pca.explained_variance_ratio_[2]
pc_maxs = pc_df.max()
affinity = (pc_df['PC1']*pcv1/(pcv1+pcv2)+pc_df['PC2']*pcv2/(pcv1+pcv2))/(pc_maxs['PC1']*pcv1/(pcv1+pcv2)+pc_maxs['PC2']*pcv2/(pcv1+pcv2))
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=affinity, s=1, cmap='seismic');
In [13]:
labels = np.array([0]*pc_df.shape[0])
labels[np.array(true_indices['strong'])]=2
labels[np.array(true_indices['weak'])]=1
pc_df['label'] = labels
pc_df['index'] = pc_df.index
pc_df['seqno'] = seqlet_hits.seqno
pc_df['pos'] = seqlet_hits.start_idx
pc_df['affinity'] = affinity
pc_df[:5]
Out[13]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 label index seqno pos affinity
0 -1.757138 -1.105117 -1.560638 0.357107 0.277466 0.376950 -0.193923 0.091057 0 0 0 16 -0.089076
1 2.880820 -0.113733 -0.338334 0.840248 0.175452 0.578696 0.093965 -0.772116 0 1 0 17 0.136358
2 0.438450 2.069480 -0.599759 0.185653 -0.684141 0.460374 -0.527480 -0.043134 0 2 0 18 0.031246
3 -3.361249 4.008285 0.283835 -0.326203 -0.736401 0.243396 -0.191256 0.144796 0 3 0 19 -0.139611
4 16.268981 3.032388 0.694093 1.201489 2.725356 0.261409 -0.523800 0.170852 2 4 0 20 0.788538
In [14]:
import plotly.express as px
fig = px.scatter(pc_df, x="PC1", y="PC2", hover_data=['index', 'seqno', 'pos'], color='affinity',
                 color_continuous_scale=px.colors.sequential.Magma
                )
fig
In [18]:
#seqlets pca, seqlet gmm 
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])

_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
9116 3041038
In [16]:
pc_maxs = _pc_df.max()
affinity = (_pc_df['PC1']*pcv1/(pcv1+pcv2)+_pc_df['PC2']*pcv2/(pcv1+pcv2))/(pc_maxs['PC1']*pcv1/(pcv1+pcv2)+pc_maxs['PC2']*pcv2/(pcv1+pcv2))
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=affinity, s=1, cmap='seismic');
In [14]:
#independent pca, seqlet gmm
_scaler, _pca, _train_hits = transform_data(_hits[keys_for_scoring])
print(_pca.explained_variance_ratio_)
_pc_df = pd.DataFrame(data = _train_hits, columns=["PC{}".format(i+1) for i in range(_train_hits.shape[1])])

_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
[0.56338578 0.15092858 0.11137449 0.05474907 0.03781689 0.02420523
 0.01660753 0.01146892 0.00811075 0.00629166]
3039233 10921
In [15]:
# seqlets pca, independent gmm
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])

_gmm = GaussianMixture(n_components=2).fit(_pc_df[['PC1', 'PC2']])
_labels = _gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = _gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
712368 2337786
In [16]:
#independent pca, independent gmm
_scaler, _pca, _train_hits = transform_data(_hits[keys_for_scoring])
print(_pca.explained_variance_ratio_)
_pc_df = pd.DataFrame(data = _train_hits, columns=["PC{}".format(i+1) for i in range(_train_hits.shape[1])])
_gmm = GaussianMixture(n_components=2).fit(_pc_df[['PC1', 'PC2']])
_labels = _gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = _gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
[0.56338578 0.15092858 0.11137449 0.05474907 0.03781689 0.02420523
 0.01660753 0.01146892 0.00811075 0.00629166]
2362275 687879
In [20]:
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
sorted_hits = _hits
sorted_hits['probs'] = _probs[:,1]
sorted_hits['labels'] = _labels
sorted_hits = sorted_hits[sorted_hits['labels']==1]
print(sorted_hits.shape)

outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw"
filename = "m17_gmm.gz"
instancefile = "{0}/{1}".format(outdir, filename)
sorted_hits.to_csv(instancefile, sep='\t')
(9340, 29)
In [21]:
def predict_pattern(df, probs=None):
    if probs is not None:
        df = df[df['probs']>=probs,:]
    
    consensus = query['consensus']
    summits = ob.deeplift_data['peaks']
    display_name = ob.shorten_pattern_name(pattern_name)
    
    peak_no = df.shape[0]
    _indices = df.seqno.values
    chroms = summits.chr.values[_indices]
    starts = summits.start.values[_indices] + df.start_idx.values
    ends = starts + len(consensus)
    names = np.array(["{0}/{1}".format(display_name, consensus)]*peak_no)
    scores = df.probs.values
    strands = np.array(['.']*peak_no)
    
    df = pd.DataFrame.from_dict({0:chroms, 1:starts, 2:ends, 3:names, 4:scores, 5:strands})
    df = df.sort_values([0, 1, 4])
    #df = df.drop_duplicates(subset=[0,1,2,3,5])
    
    return df

df = predict_pattern(sorted_hits)

from matlas.utils import send_to_mitra
outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw"
filename = "m17_gmm"
out_prefix = "{0}/{1}".format(outdir, filename)
send_to_mitra(df, out_prefix, mitra=outdir, hammock=True)
writing...(9340, 6)
In [6]:
root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw"
dlhits = pd.read_csv("{}/m17_gmm.bed.gz".format(root), header=None, sep="\t")
ismhits = pd.read_csv("{}/m17_gmm_ism.bed.gz".format(root), header=None, sep="\t")
dlhits.shape, ismhits.shape 
Out[6]:
((9116, 6), (16967, 6))
In [9]:
dlhits[6] = dlhits[4]
dlhits[7] = -1
dlhits[8] = dlhits[4]
dlhits[9] = -1

ismhits[6] = ismhits[4]
ismhits[7] = -1
ismhits[8] = ismhits[4]
ismhits[9] = -1
In [10]:
dlhits.to_csv("{}/m17_gmm_idr.bed".format(root), header=False, index=False, sep="\t")
ismhits.to_csv("{}/m17_gmm_ism_idr.bed".format(root), header=False, index=False, sep="\t")
In [11]:
dlhits[:5]
Out[11]:
0 1 2 3 4 5 6 7 8 9
0 chr1 3400299 3400307 M1_P7/tcTTATCt 0.999995 . 0.999995 -1 0.999995 -1
1 chr1 3451802 3451810 M1_P7/tcTTATCt 0.994065 . 0.994065 -1 0.994065 -1
2 chr1 6223145 6223153 M1_P7/tcTTATCt 0.969597 . 0.969597 -1 0.969597 -1
3 chr1 8134664 8134672 M1_P7/tcTTATCt 0.999999 . 0.999999 -1 0.999999 -1
4 chr1 8134664 8134672 M1_P7/tcTTATCt 0.999999 . 0.999999 -1 0.999999 -1
In [19]:
from matlas.verification import compute_chip_matrix

compute_chip_matrix("MEL", "GATA", out_prefix, execute=True)
computeMatrix reference-point --referencePoint center --skipZeros -S /mnt/lab_data2/msharmin/mouse_chipseq_bigwigs/ENCFF686HPH.bigWig -R /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m17_gmm.bed -a 500 -b 500 -bs 10 -o /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m17_gmm.matrix.gz --outFileSortedRegions /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m17_gmm.matrix.bed 
In [20]:
from matlas.verification import plot_chip_heatmap
plot_chip_heatmap("MEL", "GATA", out_prefix, execute=True)
plotHeatmap -m /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m17_gmm.matrix.gz -out /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/MEL-GATA-heatmap.png  --colorMap Blues --xAxisLabel Distance --missingDataColor 1 --zMin 0 --heatmapHeight 28 --heatmapWidth 8 --legendLocation none
In [22]:
#null - 1 - shuffled deeplift score: shuffled one-hot*hyp scores
#null - 2 - keep the one-hot/sequence, change the importance of deeplift score: shuffle the values along bases 
#null - 3 - keep the importance values; randomize the bits in each base: 

from matlas.matches import get_null_sequences

seqno = 0
null_onehot, null_score = get_null_sequences(one_hot[seqno], scores[seqno],
                                            #total importance is same but shuffled the one-hot
                                            shuffle_seq=True, shuffle_weight=False
                                            )

null_onehot.shape, null_score.shape

from modisco.visualization import viz_sequence
print('real') 
viz_sequence.plot_weights(scores[seqno], subticks_frequency=10, figsize=(20,1))
print('same weight, different seq') 
viz_sequence.plot_weights(null_score, subticks_frequency=10, figsize=(20,1))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-22-d89f730a7461> in <module>
      6 
      7 seqno = 0
----> 8 null_onehot, null_score = get_null_sequence(one_hot[seqno], scores[seqno],
      9                                             #total importance is same but shuffled the one-hot
     10                                             shuffle_seq=True, shuffle_weight=False

NameError: name 'get_null_sequence' is not defined
In [9]:
seqno = 0
null_onehot, null_score = get_null_sequences(one_hot[seqno], scores[seqno],
                                            #weights are different but keep the one-hot
                                            shuffle_seq=False, shuffle_weight=True
                                            )

null_onehot.shape, null_score.shape

from modisco.visualization import viz_sequence
print('real') 
viz_sequence.plot_weights(scores[seqno], subticks_frequency=10, figsize=(20,1))
print('same seq, different weights') 
viz_sequence.plot_weights(null_score, subticks_frequency=10, figsize=(20,1))
real
same seq, different weights
In [10]:
seqno = 0
null_onehot, null_score = get_null_sequences(one_hot[seqno], scores[seqno],
                                            #change seq, change weight
                                            shuffle_seq=True, shuffle_weight=True
                                            )

null_onehot.shape, null_score.shape

from modisco.visualization import viz_sequence
print('real') 
viz_sequence.plot_weights(scores[seqno], subticks_frequency=10, figsize=(20,1))
print('different weight, different seq') 
viz_sequence.plot_weights(null_score, subticks_frequency=10, figsize=(20,1))
real
different weight, different seq
In [23]:
from matlas.matches import get_null_sequences
seq_null_onehot, seq_null_scores = get_null_sequences(seqlet_onehot, seqlet_scores,
                                                shuffle_seq=True, shuffle_weight=False
                                                     )

weight_null_onehot, weight_null_scores = get_null_sequences(seqlet_onehot, seqlet_scores,
                                                shuffle_seq=False, shuffle_weight=True
                                                     )

weightedseq_null_onehot, weightedseq_null_scores = get_null_sequences(seqlet_onehot, seqlet_scores,
                                                shuffle_seq=True, shuffle_weight=True
                                                     )
done multiprocessing
(883, 70, 4)
(883, 70, 4)
done multiprocessing
(883, 70, 4)
(883, 70, 4)
done multiprocessing
(883, 70, 4)
(883, 70, 4)
In [8]:
from matlas.matches import collect_scoring_measures
seqlet_scores = ob.contrib_per_pattern[pattern_name]
seqlet_onehot = ob.seqs_per_pattern[pattern_name]
method_to_seqlet_scores = collect_scoring_measures(query, seqlet_scores, seqlet_onehot)
query.keys(), np.sort(np.array(list(method_to_seqlet_scores.keys())))
Out[8]:
(dict_keys(['ppm', 'pwm', 'log10pwm', 'pssm', 'cwm', 'hyp_cwm', 'consensus']),
 array(['cwm_convolution', 'cwm_jaccard', 'cwm_masked_convolution',
        'cwm_scan', 'hyp_cwm_convolution', 'hyp_cwm_jaccard',
        'hyp_cwm_masked_convolution', 'hyp_cwm_scan',
        'log10pwm_convolution', 'log10pwm_jaccard',
        'log10pwm_masked_convolution', 'log10pwm_scan', 'ppm_convolution',
        'ppm_jaccard', 'ppm_masked_convolution', 'ppm_scan',
        'pssm_convolution', 'pssm_jaccard', 'pssm_masked_convolution',
        'pssm_scan', 'pwm_convolution', 'pwm_jaccard',
        'pwm_masked_convolution', 'pwm_scan', 'sum_score'], dtype='<U27'))
In [24]:
method_to_shuffled_scores = collect_scoring_measures(query, shuffled_scores, shuffled_onehot)
method_to_seq_null_scores = collect_scoring_measures(query, seq_null_scores, seq_null_onehot)
method_to_weight_null_scores = collect_scoring_measures(query, weight_null_scores, weight_null_onehot)
method_to_weightedseq_null_scores = collect_scoring_measures(query, weightedseq_null_scores, weightedseq_null_onehot)
In [10]:
from matlas.matches import get_any_high_scoring_instances
per_base_imp=0.0625

keys_for_scoring = ['sum_score'] 
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
        
seqlet_hits = get_any_high_scoring_instances(method_to_seqlet_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
In [11]:
def get_background_instances(fg, method_to_null_scores, keys=None):
    # given the fg coords, get the df of bg
    # appy scaling and pca as in fg, then apply log-likelihood
    if keys is None:
        keys = list(method_to_scores.keys())
    sum_scores = method_to_null_scores['sum_score']
    matches = [(sum_scores[seqno, pos], seqno, pos) for seqno, pos in zip(fg['seqno'].values, fg['start_idx'].values)]
    match_df = pd.DataFrame(np.array(matches))
    match_df.columns = ['sum_score', 'seqno', 'start_idx']
    match_df.seqno = match_df.seqno.values.astype(int)
    match_df.start_idx = match_df.start_idx.values.astype(int)
    for key in keys:
        if(key=='sum_score'):
            continue
        match_df[key] = [method_to_null_scores[key][idx1, idx2] for 
                         (idx1, idx2) in zip(fg['seqno'].values, fg['start_idx'].values)]
    
    return match_df

shuffled_paired_hits = get_background_instances(seqlet_hits, method_to_shuffled_scores, keys=keys_for_scoring)
seq_null_paired_hits = get_background_instances(seqlet_hits, method_to_seq_null_scores, keys=keys_for_scoring)
weight_null_paired_hits = get_background_instances(seqlet_hits, method_to_weight_null_scores, keys=keys_for_scoring)
weightedseq_null_paired_hits = get_background_instances(seqlet_hits, method_to_weightedseq_null_scores, keys=keys_for_scoring)
seqlet_hits.shape, shuffled_paired_hits.shape, seq_null_paired_hits.shape, weight_null_paired_hits.shape, weightedseq_null_paired_hits.shape
Out[11]:
((13587, 27), (13587, 27), (13587, 27), (13587, 27), (13587, 27))
In [36]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def transform_data(_hits, scaler=None, pca=None):
    if scaler is None:
        # should be used only during training
        scaler = StandardScaler()
        scaler.fit(_hits)
        
    # Apply transform to both the training set and testing set
    _hits = scaler.transform(_hits)

    if pca is None:
        # Make an instance of the Model
        pca = PCA(.98)
        #fitting PCA on the training set only.
        pca.fit(_hits)
        
    #Apply the mapping (transform) to both the training set and the test set.
    _hits = pca.transform(_hits)
    
    return scaler, pca, _hits

scaler, pca, train_hits = transform_data(seqlet_hits[keys_for_scoring])
pca.explained_variance_ratio_
Out[36]:
array([0.73421992, 0.09118347, 0.07766985, 0.02654744, 0.022959  ,
       0.01230255, 0.00980085, 0.00861208])
In [25]:
shuffled_seqlet_hits = get_any_high_scoring_instances(method_to_shuffled_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
seq_null_seqlet_hits = get_any_high_scoring_instances(method_to_seq_null_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
weight_null_seqlet_hits = get_any_high_scoring_instances(method_to_weight_null_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
weightedseq_null_seqlet_hits = get_any_high_scoring_instances(method_to_weightedseq_null_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
In [26]:
seqlet_hits['marker'] = 'foreground'
shuffled_seqlet_hits['marker'] = 'suffled'
seq_null_seqlet_hits['marker'] = 'seq_null'
weight_null_seqlet_hits['marker'] = 'weight_null'
weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
test_hits = pd.concat([seqlet_hits,
                       shuffled_seqlet_hits,
                       seq_null_seqlet_hits,
                       weight_null_seqlet_hits,
                       weightedseq_null_seqlet_hits
                      ])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/ipykernel_launcher.py:10: FutureWarning:

Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.


In [15]:
#read the real truth
def load_true_positives(tp_file):
    coordinates = []
    with open(tp_file, "r") as fp:
        for line in fp:
            line = line.strip().split(",")
            seqno = line[0]
            pos = int(line[1])
            if "-" in seqno:
                seqno = seqno.split("-")
                for i in range(int(seqno[0]), int(seqno[1])+1):
                    coordinates.append((i, pos))
            else:
                coordinates.append((int(seqno), pos))
    return coordinates
def get_tp_indices(seqlet_hits, verbose=True):
    keep = []
    for cord in coordinates:
        seqno, pos = cord
        df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
        if df.shape[0]>0:
            keep.append(df.index[0])
        else:
            if verbose: print(cord)

    return keep

tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, False)
In [85]:
import matplotlib.pyplot as plt
def plot_pca(df, true_ids=None):
    print(df.shape)
    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1) 
    ax.set_xlabel('PC1', fontsize = 15)
    ax.set_ylabel('PC2', fontsize = 15)
    ax.set_title('2 component PCA', fontsize = 20)
    if true_ids is not None:
        true_df = df.loc[np.array(true_ids),:]
        df = df.loc[np.array(list(set(list(df.index)).difference(set(true_ids)))),:]
        ax.scatter(true_df['PC1'], true_df['PC2'], s = 20, c='r', marker='+')
    ax.scatter(df['PC1'], df['PC2'], s = 1, c='b')
    #ax.axvline(9)
    #ax.axhline(-1)
    ax.grid()
    return None
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
plot_pca(pc_df, true_indices)
(13587, 8)
In [102]:
from sklearn.mixture import GaussianMixture
#import sklearn.mixture.GaussianMixture
gmm = GaussianMixture(n_components=2).fit(pc_df[['PC1', 'PC2']])
labels = gmm.predict(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=labels, s=1, cmap='seismic');
In [101]:
probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
In [121]:
conditions = np.copy(labels)
conditions[np.logical_and(pc_df['PC1'].values<10, pc_df['PC2'].values<-0.5)] = 0
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=conditions, s=1, cmap='seismic');
In [4]:
# selected = np.array(list(set(list(np.argwhere(conditions==0).T[0])).difference(set(list(np.argwhere(labels==0).T[0])))))
# pc_df.loc[selected,:]
# seqlet_hits.loc[selected,:]
# probs[:,1]

from matlas.render_report import jupyter_nbconvert
import os
def render_test_plots(nbname="global_report"):
    rendered_ipynb = "/mnt/lab_data/kundaje/msharmin/mouse_hem/code/mouse_hem/{}.ipynb".format(nbname)
    jupyter_nbconvert(rendered_ipynb)

    os.system("cp /mnt/lab_data/kundaje/msharmin/mouse_hem/code/mouse_hem/{}.html /srv/www/kundaje/msharmin/report/global_htmls/".format(nbname))
    os.system("chmod +755 /srv/www/kundaje/msharmin/mouse_hem/report/global_htmls/{}.html".format(nbname))

    return None

render_test_plots('testSpi1Scan')
In [27]:
_, _, test_hits = transform_data(shuffled_seqlet_hits[keys_for_scoring], scaler, pca)
shuffled_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
#plot_pca(shuffled_pc_df, true_ids=None)

probs = gmm.predict_proba(shuffled_pc_df[['PC1', 'PC2']])
plt.scatter(shuffled_pc_df['PC1'], shuffled_pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
In [58]:
import sys
def get_LLRs(fg_df, bg_df, keys=['PC1', 'PC2']):
    
    llr_df = OrderedDict()
    for key in keys:
        fg = fg_df[key].values
        bg = bg_df[key].values
        _min = abs(min(np.min(fg), np.min(bg)))+0.0001
        
        ratio = (fg+_min)/(bg+_min)
        llrs = -2 * np.log(ratio)
        llr_df[key] = llrs
        #break
        
    llr_df = pd.DataFrame(llr_df)
    
    return llr_df

llr_df = get_LLRs(pc_df, shuffled_paired_pc_df, keys=['PC1', 'PC2'])
#df[:5]
plot_pca(llr_df, true_ids=true_indices)
(13587, 2)
In [67]:
marker = np.array(['F']*llr_df.shape[0])
marker[np.array(true_indices)] = 'TP'
llr_df['marker'] = marker
In [69]:
import seaborn as sns
sns.pairplot(llr_df, 
             diag_kind="kde", kind='scatter', hue="marker")
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[69]:
<seaborn.axisgrid.PairGrid at 0x7fb087eb6dd8>
In [71]:
from scipy import stats
def get_z_scores(df):
    z_df = OrderedDict()
    for key in df.columns:
        if key == 'marker':
            z_df[key] = df[key].values
            continue
        z_df[key] = stats.zscore(df[key].values)
    
    z_df = pd.DataFrame(z_df) 
    
    return z_df

z_df = get_z_scores(llr_df)
plot_pca(z_df, true_ids=true_indices)
(13587, 3)
In [72]:
import seaborn as sns
sns.pairplot(z_df, 
             diag_kind="kde", kind='scatter', hue="marker")
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[72]:
<seaborn.axisgrid.PairGrid at 0x7fb087eaeb00>
In [84]:
import scipy
def get_p_values(df):
    z_scores = df['PC1'].values
    #p_values = 1 - scipy.special.ndtr(z_scores)
    #2 * (1 - scipy.stats.multivariate_normal.cdf(z_scores, mean=mu, cov=np.diag(std)))
    #p_values = scipy.stats.norm.sf(abs(z_scores)) #one-sided
    #p_values = scipy.stats.norm.sf(abs(z_scores))*2 #twosided
    return p_values

def plot_data(p_df):
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1, axisbg="1.0")

    for data, color, group in zip(data, colors, groups):
        x, y = data
        ax.scatter(x, y, alpha=0.8, c=color, edgecolors='none', s=30, label=group)

    plt.title('Matplot scatter plot')
    plt.legend(loc=2)
    plt.show()

    return None

p_values = get_p_values(z_df)
p_df = pd.DataFrame({'p':p_values, 'log10p':-np.log10(p_values), 'marker': z_df['marker'].values})
p_df
Out[84]:
p log10p marker
0 0.339183 0.469566 F
1 0.719697 0.142850 F
2 0.390245 0.408663 F
3 0.911230 0.040372 F
4 0.700160 0.154803 T
5 0.145487 0.837175 F
6 0.951901 0.021408 F
7 0.346973 0.459704 F
8 0.780225 0.107780 F
9 0.268504 0.571050 F
10 0.017346 1.760809 F
11 0.768715 0.114235 F
12 0.294894 0.530334 F
13 0.038762 1.411589 F
14 0.766050 0.115743 F
15 0.543136 0.265091 F
16 0.491931 0.308095 F
17 0.759094 0.119705 F
18 0.373265 0.427983 F
19 0.563059 0.249446 F
20 0.838812 0.076335 F
21 0.444612 0.352019 F
22 0.561260 0.250836 F
23 0.966984 0.014581 F
24 0.611047 0.213925 F
25 0.967128 0.014516 F
26 0.748800 0.125634 F
27 0.603392 0.219400 F
28 0.554871 0.255808 F
29 0.534545 0.272016 F
... ... ... ...
13557 0.444668 0.351964 F
13558 0.154117 0.812149 F
13559 0.242292 0.615661 F
13560 0.173225 0.761390 F
13561 0.513153 0.289753 F
13562 0.356413 0.448047 F
13563 0.123291 0.909069 F
13564 0.727890 0.137935 T
13565 0.195246 0.709418 F
13566 0.340368 0.468051 F
13567 0.269086 0.570108 F
13568 0.344548 0.462750 F
13569 0.512104 0.290642 F
13570 0.798295 0.097836 F
13571 0.597403 0.223732 F
13572 0.355927 0.448639 F
13573 0.891977 0.049646 T
13574 0.194900 0.710187 F
13575 0.376943 0.423724 F
13576 0.264820 0.577049 F
13577 0.360948 0.442555 F
13578 0.665133 0.177091 F
13579 0.544177 0.264260 F
13580 0.899827 0.045841 F
13581 0.201243 0.696279 F
13582 0.880268 0.055385 T
13583 0.332491 0.478220 F
13584 0.436855 0.359662 F
13585 0.880313 0.055363 F
13586 0.185081 0.732639 F

13587 rows × 3 columns

In [22]:
#from matlas.matches import view_test_case
def view_test_case(scores, one_hot, df, pwm, seqno, start=400, end=600, itemno=None):
    df = df[df.seqno==seqno]
    if(df.shape[0]==0):
        print('Not part of the hits')
        viz_sequence.plot_weights(scores[seqno, start:end,:],
                                  highlight={"black": [(21, 29)]},
                                  subticks_frequency=10, figsize=(20,1))

        viz_sequence.plot_weights(one_hot[seqno, start:end, :], 
                                  highlight={"black": [(21, 29)]},
                                  subticks_frequency=10, figsize=(20,1))
        return None
    
    keys = set(df.columns).difference(set(['seqno', 'start_idx', 'marker']))
    debug = 0
    for i, row in df.iterrows():
        if itemno is not None:
            if debug!=itemno:
                debug += 1 
                continue
            else:
                debug += 1
            
        printables = ["{}: {}".format(key, round(row[key], 3)) for key in keys]
        seqno = int(row['seqno'])
        start_idx = int(row['start_idx'])
        print(i,
              printables,
              "coordinates:",(seqno, start_idx))
        start_motif = start_idx - start
        end_motif = start_motif+len(pwm)
        print(start_idx, start_motif, end_motif)
        if(end_motif > (end-start) or end_motif<0):
            continue
        viz_sequence.plot_weights(scores[seqno, start:end,:],
                                  highlight={"black": [(start_motif, end_motif)]},
                                  subticks_frequency=10, figsize=(20,1))

        viz_sequence.plot_weights(one_hot[seqno, start:end, :], 
                                  highlight={"black": [(start_motif, end_motif)]},
                                  subticks_frequency=10, figsize=(20,1))
    return None
view_test_case(seqlet_scores, seqlet_onehot, seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=17)
view_test_case(shuffled_scores, shuffled_onehot, shuffled_seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=16)
17 ['log10pwm_scan: -8.482', 'cwm_convolution: 0.204', 'log10pwm_masked_convolution: -0.473', 'pssm_jaccard: 0.185', 'hyp_cwm_scan: -0.009', 'hyp_cwm_masked_convolution: -0.08', 'cwm_jaccard: 0.152', 'sum_score: 3.044', 'pssm_convolution: 2.108', 'pssm_scan: 4.412', 'hyp_cwm_convolution: -0.035', 'pwm_convolution: -7.125', 'cwm_scan: 0.442', 'pwm_jaccard: -0.058', 'ppm_jaccard: 0.148', 'hyp_cwm_jaccard: -0.001', 'pwm_scan: -12.237', 'pwm_masked_convolution: -0.473', 'ppm_scan: 3.168', 'log10pwm_convolution: -4.938', 'cwm_masked_convolution: 0.5', 'log10pwm_jaccard: -0.058', 'ppm_convolution: 1.108', 'pssm_masked_convolution: 0.511', 'ppm_masked_convolution: 0.438'] coordinates: (0, 43)
43 43 51
16 ['log10pwm_scan: -17.251', 'cwm_convolution: 0.096', 'log10pwm_masked_convolution: -0.535', 'pssm_jaccard: 0.092', 'hyp_cwm_scan: -0.24', 'hyp_cwm_masked_convolution: -0.059', 'cwm_jaccard: 0.07', 'sum_score: 2.158', 'pssm_convolution: 0.941', 'pssm_scan: 2.045', 'hyp_cwm_convolution: -0.016', 'pwm_convolution: -6.257', 'cwm_scan: 0.201', 'pwm_jaccard: -0.098', 'ppm_jaccard: 0.068', 'hyp_cwm_jaccard: -0.025', 'pwm_scan: -24.888', 'pwm_masked_convolution: -0.535', 'ppm_scan: 1.097', 'log10pwm_convolution: -4.337', 'cwm_masked_convolution: 0.428', 'log10pwm_jaccard: -0.098', 'ppm_convolution: 0.484', 'pssm_masked_convolution: 0.432', 'ppm_masked_convolution: 0.44'] coordinates: (0, 43)
43 43 51
In [23]:
view_test_case(seq_null_scores, seq_null_onehot, seq_null_seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=17)
view_test_case(weight_null_scores, weight_null_onehot, weight_null_seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=25)
view_test_case(weightedseq_null_scores, weightedseq_null_onehot, weightedseq_null_seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=27)
17 ['log10pwm_scan: -13.35', 'cwm_convolution: 0.104', 'log10pwm_masked_convolution: -0.571', 'pssm_jaccard: 0.1', 'hyp_cwm_scan: -0.039', 'hyp_cwm_masked_convolution: 0.075', 'cwm_jaccard: 0.062', 'sum_score: 3.044', 'pssm_convolution: 1.197', 'pssm_scan: 2.169', 'hyp_cwm_convolution: 0.021', 'pwm_convolution: -8.7', 'cwm_scan: 0.178', 'pwm_jaccard: -0.062', 'ppm_jaccard: 0.099', 'hyp_cwm_jaccard: 0.026', 'pwm_scan: -19.26', 'pwm_masked_convolution: -0.571', 'ppm_scan: 1.525', 'log10pwm_convolution: -6.031', 'cwm_masked_convolution: 0.452', 'log10pwm_jaccard: -0.062', 'ppm_convolution: 0.728', 'pssm_masked_convolution: 0.451', 'ppm_masked_convolution: 0.454'] coordinates: (0, 43)
43 43 51
25 ['log10pwm_scan: -8.482', 'cwm_convolution: 0.116', 'log10pwm_masked_convolution: 0.236', 'pssm_jaccard: 0.181', 'hyp_cwm_scan: -0.009', 'hyp_cwm_masked_convolution: 0.593', 'cwm_jaccard: 0.152', 'sum_score: 0.736', 'pssm_convolution: 1.126', 'pssm_scan: 4.412', 'hyp_cwm_convolution: 0.085', 'pwm_convolution: 1.17', 'cwm_scan: 0.442', 'pwm_jaccard: 0.033', 'ppm_jaccard: 0.212', 'hyp_cwm_jaccard: 0.08', 'pwm_scan: -12.237', 'pwm_masked_convolution: 0.236', 'ppm_scan: 3.168', 'log10pwm_convolution: 0.811', 'cwm_masked_convolution: 0.865', 'log10pwm_jaccard: 0.033', 'ppm_convolution: 0.649', 'pssm_masked_convolution: 0.83', 'ppm_masked_convolution: 0.78'] coordinates: (0, 43)
43 43 51
27 ['log10pwm_scan: -13.35', 'cwm_convolution: 0.011', 'log10pwm_masked_convolution: -0.774', 'pssm_jaccard: 0.018', 'hyp_cwm_scan: -0.039', 'hyp_cwm_masked_convolution: -0.171', 'cwm_jaccard: 0.026', 'sum_score: 1.074', 'pssm_convolution: 0.129', 'pssm_scan: 2.169', 'hyp_cwm_convolution: -0.019', 'pwm_convolution: -4.821', 'cwm_scan: 0.178', 'pwm_jaccard: -0.086', 'ppm_jaccard: 0.008', 'hyp_cwm_jaccard: -0.018', 'pwm_scan: -19.26', 'pwm_masked_convolution: -0.774', 'ppm_scan: 1.525', 'log10pwm_convolution: -3.342', 'cwm_masked_convolution: 0.12', 'log10pwm_jaccard: -0.086', 'ppm_convolution: 0.058', 'pssm_masked_convolution: 0.119', 'ppm_masked_convolution: 0.089'] coordinates: (0, 43)
43 43 51
In [24]:
import numpy as np
import matplotlib.pyplot as plt
def compare_test_case(series1, series2, filter_='convolution'):
    keys = set(series1.keys()).difference(set(['seqno', 'start_idx', 'marker']))
    keys = ['sum_score'] + [key for key in keys if filter_ in key]
    # set width of bar
    barWidth = 0.25

    # set height of bar
    bars1 = [series1[key] for key in keys]
    bars2 = [series2[key] for key in keys]

    # Set position of bar on X axis
    r1 = np.arange(len(bars1))
    r2 = [x + barWidth for x in r1]

    # Make the plot
    plt.bar(r1, bars1, color='#557f2d', width=barWidth, edgecolor='white', label='seqlet')
    plt.bar(r2, bars2, color='#2d7f5e', width=barWidth, edgecolor='white', label='shuffled')

    # Add xticks on the middle of the group bars
    plt.xlabel('group', fontweight='bold')
    plt.xticks([r + barWidth for r in range(len(bars1))], keys, rotation='vertical')

    # Create legend & Show graphic
    plt.legend()
    plt.show()
    
    return None

a=4; b=17
# compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'masked')

a=17; b=16
# compare_test_case(seqlet_hits.loc[a,:], shuffled_seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], shuffled_seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], shuffled_seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], shuffled_seqlet_hits.loc[b,:], 'masked')

a=17; b=17
# compare_test_case(seqlet_hits.loc[a,:], seq_null_seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], seq_null_seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], seq_null_seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], seq_null_seqlet_hits.loc[b,:], 'masked')

a=17; b=25
# compare_test_case(seqlet_hits.loc[a,:], weight_null_seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], weight_null_seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], weight_null_seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], weight_null_seqlet_hits.loc[b,:], 'masked')

a=17; b=27
# compare_test_case(seqlet_hits.loc[a,:], weightedseq_null_seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], weightedseq_null_seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], weightedseq_null_seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], weightedseq_null_seqlet_hits.loc[b,:], 'masked')
In [25]:
import seaborn as sns
sns.pairplot(test_hits[['sum_score', 
                        'ppm_scan', 'pwm_scan', 'log10pwm_scan', 'marker']], 
             diag_kind="kde", kind='scatter', hue="marker")
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[25]:
<seaborn.axisgrid.PairGrid at 0x7f621e198400>
In [26]:
import seaborn as sns
sns.pairplot(test_hits[['sum_score', 
                   'cwm_scan', 'hyp_cwm_scan', 'pssm_scan', 'marker']], 
             diag_kind="kde", kind='scatter', hue="marker")
Out[26]:
<seaborn.axisgrid.PairGrid at 0x7f621deb8940>
In [27]:
sns.pairplot(test_hits[['sum_score', 
                   'ppm_jaccard', 'pwm_jaccard', 'log10pwm_jaccard', 'marker']], 
             diag_kind="kde", kind='scatter', hue="marker")
Out[27]:
<seaborn.axisgrid.PairGrid at 0x7f621c1beb70>
In [28]:
sns.pairplot(test_hits[['sum_score', 
                   'cwm_jaccard', 'hyp_cwm_jaccard', 'pssm_jaccard', 'marker']], 
             diag_kind="kde", kind='scatter', hue="marker")
Out[28]:
<seaborn.axisgrid.PairGrid at 0x7f620ef3e358>
In [29]:
sns.pairplot(test_hits[['sum_score', 
                   'ppm_convolution', 'pwm_convolution', 'log10pwm_convolution', 'marker']], 
             diag_kind="kde", kind='scatter', hue="marker")
Out[29]:
<seaborn.axisgrid.PairGrid at 0x7f620cf79c18>
In [30]:
sns.pairplot(test_hits[['sum_score', 
                   'cwm_convolution', 'hyp_cwm_convolution', 'pssm_convolution', 'marker']], 
             diag_kind="kde", kind='scatter', hue="marker")
Out[30]:
<seaborn.axisgrid.PairGrid at 0x7f620cf79240>
In [31]:
sns.pairplot(test_hits[['sum_score', 
                   'ppm_masked_convolution', 'pwm_masked_convolution', 'log10pwm_masked_convolution', 'marker']], 
             diag_kind="kde", kind='scatter', hue="marker")
Out[31]:
<seaborn.axisgrid.PairGrid at 0x7f620c2216d8>
In [32]:
sns.pairplot(test_hits[['sum_score', 
                   'cwm_masked_convolution', 'hyp_cwm_masked_convolution', 'pssm_masked_convolution', 'marker']], 
             diag_kind="kde", kind='scatter', hue="marker")
Out[32]:
<seaborn.axisgrid.PairGrid at 0x7f5f9c7eec50>
In [47]:
#dimension reduction
len(keys_for_scoring)
keys_for_scoring = ['sum_score'] 
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm', 'log10pwm', 'pwm']]
In [48]:
import umap
trans = umap.UMAP(n_neighbors=5, random_state=42).fit(seqlet_hits[keys_for_scoring])
print(trans.embedding_.shape)
plt.scatter(trans.embedding_[:, 0], trans.embedding_[:, 1], s=1, cmap='Spectral')
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/umap/spectral.py:229: UserWarning:

Embedding a total of 3 separate connected components using meta-embedding (experimental)

(13587, 2)
Out[48]:
<matplotlib.collections.PathCollection at 0x7f5f8aadbba8>
In [54]:
test_embedding = trans.transform(seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
(13587, 2)
Out[54]:
<matplotlib.collections.PathCollection at 0x7f5f8ab2ed30>
In [49]:
test_embedding = trans.transform(shuffled_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
(2846, 2)
Out[49]:
<matplotlib.collections.PathCollection at 0x7f5f89ce7358>
In [50]:
test_embedding = trans.transform(seq_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
(13587, 2)
Out[50]:
<matplotlib.collections.PathCollection at 0x7f5f8a2b3048>
In [51]:
test_embedding = trans.transform(weight_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
(12928, 2)
Out[51]:
<matplotlib.collections.PathCollection at 0x7f5f8b1cc240>
In [53]:
test_embedding = trans.transform(weightedseq_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
(13422, 2)
Out[53]:
<matplotlib.collections.PathCollection at 0x7f5f8a8647b8>
In [55]:
type(test_embedding)
Out[55]:
numpy.ndarray
In [56]:
umap_seqlet_hits = pd.DataFrame(trans.transform(seqlet_hits[keys_for_scoring]))
umap_shuffled_seqlet_hits = pd.DataFrame(trans.transform(shuffled_seqlet_hits[keys_for_scoring]))
umap_seq_null_seqlet_hits = pd.DataFrame(trans.transform(seq_null_seqlet_hits[keys_for_scoring]))
umap_weight_null_seqlet_hits = pd.DataFrame(trans.transform(weight_null_seqlet_hits[keys_for_scoring]))
umap_weightedseq_null_seqlet_hits = pd.DataFrame(trans.transform(weightedseq_null_seqlet_hits[keys_for_scoring]))
umap_seqlet_hits['marker'] = 'foreground'
umap_shuffled_seqlet_hits['marker'] = 'suffled'
umap_seq_null_seqlet_hits['marker'] = 'seq_null'
umap_weight_null_seqlet_hits['marker'] = 'weight_null'
umap_weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
umap_test_hits = pd.concat([umap_seqlet_hits,
                       umap_shuffled_seqlet_hits,
                       umap_seq_null_seqlet_hits,
                       umap_weight_null_seqlet_hits,
                       umap_weightedseq_null_seqlet_hits
                      ])
In [58]:
sns.pairplot(umap_test_hits, 
             diag_kind="kde", kind='scatter', hue="marker")
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[58]:
<seaborn.axisgrid.PairGrid at 0x7f5f8a689a58>

PPM in forground vs. various null scores

In [60]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})
In [66]:
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

In [67]:
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})
In [68]:
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})
In [69]:
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"], 
                                            1:["sum_score", "ppm_convolution"], 
                                            2:["sum_score", "ppm_masked_convolution"], 
                                            3:["sum_score", "ppm_jaccard"]})

PWM in forground vs. various null scores

In [59]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})
In [70]:
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})
In [71]:
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})
In [72]:
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})
In [75]:
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"], 
                                            1:["sum_score", "pwm_convolution"], 
                                            2:["sum_score", "pwm_masked_convolution"], 
                                            3:["sum_score", "pwm_jaccard"]})

log10pwm in forground vs. various null scores

In [62]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})
In [76]:
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})
In [77]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})
In [78]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})
In [79]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"], 
                                            1:["sum_score", "log10pwm_convolution"], 
                                            2:["sum_score", "log10pwm_masked_convolution"], 
                                            3:["sum_score", "log10pwm_jaccard"]})

PSSM in forground vs. various null scores

In [63]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})
In [80]:
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})
In [81]:
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})
In [82]:
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})
In [83]:
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"], 
                                            1:["sum_score", "pssm_convolution"], 
                                            2:["sum_score", "pssm_masked_convolution"], 
                                            3:["sum_score", "pssm_jaccard"]})

cwm in forground vs. various null scores

In [64]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})
In [84]:
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})
In [85]:
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})
In [86]:
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})
In [87]:
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"], 
                                            1:["sum_score", "cwm_convolution"], 
                                            2:["sum_score", "cwm_masked_convolution"], 
                                            3:["sum_score", "cwm_jaccard"]})

hyp_cwm in forground vs. various null scores

In [65]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})
In [88]:
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})
In [89]:
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})
In [90]:
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})
In [91]:
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"], 
                                            1:["sum_score", "hyp_cwm_convolution"], 
                                            2:["sum_score", "hyp_cwm_masked_convolution"], 
                                            3:["sum_score", "hyp_cwm_jaccard"]})
In [183]:
len(keys_for_scoring)
keys_for_scoring = ['sum_score'] 
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
#keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm', 'log10pwm', 'pwm']]
In [184]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(seqlet_hits[keys_for_scoring])
# Apply transform to both the training set and the test set.
train_hits = scaler.transform(seqlet_hits[keys_for_scoring])

from sklearn.decomposition import PCA
# Make an instance of the Model
pca = PCA(.98)
#fitting PCA on the training set only.
pca.fit(train_hits)
#Apply the mapping (transform) to both the training set and the test set.
train_hits = pca.transform(train_hits)
#test_hits = pca.transform(test_hits)
In [256]:
#seqlet_hits[seqlet_hits['seqno']==684]
pca.explained_variance_ratio_
Out[256]:
array([0.73421992, 0.09118347, 0.07766985, 0.02654744, 0.022959  ,
       0.01230255, 0.00980085, 0.00861208])
In [273]:
#read the real truth
def load_true_positives(tp_file):
    coordinates = []
    with open(tp_file, "r") as fp:
        for line in fp:
            line = line.strip().split(",")
            seqno = line[0]
            pos = int(line[1])
            if "-" in seqno:
                seqno = seqno.split("-")
                for i in range(int(seqno[0]), int(seqno[1])+1):
                    coordinates.append((i, pos))
            else:
                coordinates.append((int(seqno), pos))
    return coordinates
def get_tp_indices(seqlet_hits):
    keep = []
    for cord in coordinates:
        seqno, pos = cord
        df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
        if df.shape[0]>0:
            keep.append(df.index[0])
        else:
            print(cord)

    return keep

tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits)
(684, 49)
In [288]:
def plot_pca(df, true_ids=None):
    print(df.shape)
    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1) 
    ax.set_xlabel('PC1', fontsize = 15)
    ax.set_ylabel('PC2', fontsize = 15)
    ax.set_title('2 component PCA', fontsize = 20)
    if true_ids is not None:
        true_df = df.loc[np.array(true_ids),:]
        df = df.loc[np.array(list(set(list(df.index)).difference(set(true_ids)))),:]
        ax.scatter(true_df['PC1'], true_df['PC2'], s = 20, c='r', marker='+')
    ax.scatter(df['PC1'], df['PC2'], s = 1, c='b')
    #ax.axvline(9)
    #ax.axhline(-1)
    ax.grid()
    return None
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
plot_pca(pc_df, )
(13587, 8)
In [291]:
#test_df = pc_df.loc[np.array(list(set(list(pc_df.index)).difference(set(true_indices)))),:]
#test_df = test_df.loc[(test_df['PC1']>9) & (test_df['PC2']<0),:]
#seqlet_hits.loc[test_df.index, :]
seqlet_hits[seqlet_hits['seqno']==724]
Out[291]:
sum_score seqno start_idx ppm_scan pwm_scan log10pwm_scan pssm_scan cwm_scan hyp_cwm_scan ppm_jaccard ... pssm_convolution cwm_convolution hyp_cwm_convolution ppm_masked_convolution pwm_masked_convolution log10pwm_masked_convolution pssm_masked_convolution cwm_masked_convolution hyp_cwm_masked_convolution marker
12013 0.510935 724 16 2.173273 -16.685898 -11.565783 2.807995 0.271431 -0.129788 0.093749 ... 0.155122 0.014721 -0.012018 0.409525 -0.668292 -0.668292 0.344445 0.320698 -0.240834 foreground
12014 0.726112 724 17 2.955832 -9.491440 -6.578965 4.307728 0.489950 0.071016 0.157605 ... 0.343455 0.040122 0.003713 0.496489 -0.393983 -0.393983 0.428561 0.400553 0.038883 foreground
12015 0.918835 724 18 1.895810 -16.006632 -11.094952 2.776762 0.348214 -0.168935 0.129040 ... 0.351885 0.042564 -0.020289 0.580346 -0.552284 -0.552284 0.457278 0.387848 -0.180441 foreground
12016 1.216095 724 19 2.241223 -16.770756 -11.624602 2.795168 0.296356 -0.195357 0.087665 ... 0.231063 0.023784 -0.075097 0.231076 -0.803851 -0.803851 0.216105 0.210806 -0.522732 foreground
12017 1.241818 724 20 6.648924 12.798364 8.871150 10.946750 1.507266 1.190199 0.566732 ... 2.063255 0.314700 0.245513 0.901531 0.905861 0.905861 0.904396 0.937620 0.938175 foreground
12018 1.150841 724 21 1.979615 -17.255894 -11.960874 2.454263 0.213059 -0.089824 0.056313 ... 0.174882 0.015555 -0.031489 0.201595 -0.785020 -0.785020 0.188620 0.192204 -0.275004 foreground
12019 1.059108 724 22 2.545866 -14.645376 -10.151401 4.199507 0.512827 0.098539 0.146221 ... 0.573835 0.064573 -0.016732 0.439639 -0.466749 -0.466749 0.405790 0.358829 -0.106877 foreground
12020 1.016291 724 23 2.822197 -11.335154 -7.856930 4.201037 0.479085 -0.014662 0.111055 ... 0.314115 0.021815 -0.046603 0.319340 -0.526418 -0.526418 0.240512 0.133503 -0.285399 foreground
12021 0.894742 724 24 1.146093 -22.607031 -15.670000 2.113748 0.202143 -0.273157 0.044927 ... 0.174305 0.013882 -0.017559 0.214520 -0.388620 -0.388620 0.181476 0.139714 -0.138538 foreground
12022 0.670116 724 25 3.510759 -7.332713 -5.082649 4.744608 0.467600 0.199800 0.104182 ... 0.289349 0.032979 0.030203 0.540020 0.121739 0.121739 0.247456 0.286472 0.288089 foreground
12023 0.503941 724 26 2.680634 -11.520928 -7.985699 4.159111 0.579430 0.235952 0.061523 ... 0.092858 0.020784 0.024588 0.264452 -0.003100 -0.003100 0.091842 0.128490 0.168628 foreground
12024 0.560634 724 61 2.759909 -11.651375 -8.076117 4.703845 0.547412 0.042536 0.138787 ... 0.360025 0.044303 0.006862 0.609668 -0.272037 -0.272037 0.491439 0.481183 0.083735 foreground
12025 0.712192 724 62 2.736127 -15.611526 -10.821085 4.265700 0.423533 -0.155905 0.127224 ... 0.268559 0.028056 -0.034304 0.344609 -0.707176 -0.707176 0.337684 0.357184 -0.377643 foreground

13 rows × 28 columns

In [29]:
test_hits = scaler.transform(shuffled_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
shuffled_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(shuffled_pc_df, true_ids=None)
(2846, 4)
In [30]:
test_hits = scaler.transform(seq_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
seq_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(seq_null_pc_df)
(13587, 4)
In [31]:
test_hits = scaler.transform(weight_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
weight_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(weight_null_pc_df)
(13714, 4)
In [32]:
test_hits = scaler.transform(weightedseq_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
weightedseq_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(weightedseq_null_pc_df)
(12597, 4)
In [190]:
pc_df[:5]
Out[190]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
0 -1.757138 -1.105117 -1.560638 0.357107 0.277466 0.376950 -0.193923 0.091057
1 2.880820 -0.113733 -0.338334 0.840248 0.175452 0.578696 0.093965 -0.772116
2 0.438450 2.069480 -0.599759 0.185653 -0.684141 0.460374 -0.527480 -0.043134
3 -3.361249 4.008285 0.283835 -0.326203 -0.736401 0.243396 -0.191256 0.144796
4 16.268981 3.032388 0.694093 1.201489 2.725356 0.261409 -0.523800 0.170852
In [34]:
pca_seqlet_hits = pc_df[['PC1', 'PC2', 'PC3']]
pca_shuffled_seqlet_hits = shuffled_pc_df[['PC1', 'PC2', 'PC3']]
pca_seq_null_seqlet_hits = seq_null_pc_df[['PC1', 'PC2', 'PC3']]
pca_weight_null_seqlet_hits = weight_null_pc_df[['PC1', 'PC2', 'PC3']]
pca_weightedseq_null_seqlet_hits = weightedseq_null_pc_df[['PC1', 'PC2', 'PC3']]
pca_seqlet_hits['marker'] = 'foreground'
pca_shuffled_seqlet_hits['marker'] = 'shuffled'
pca_seq_null_seqlet_hits['marker'] = 'seq_null'
pca_weight_null_seqlet_hits['marker'] = 'weight_null'
pca_weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
pca_test_hits = pd.concat([pca_seqlet_hits,
                       pca_shuffled_seqlet_hits,
                       pca_seq_null_seqlet_hits,
                       pca_weight_null_seqlet_hits,
                       pca_weightedseq_null_seqlet_hits
                      ])
In [292]:
sns.pairplot(pca_test_hits, 
             diag_kind="kde", kind='scatter', hue="marker")
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[292]:
<seaborn.axisgrid.PairGrid at 0x7f5ec13d7518>
In [36]:
import seaborn as sns
sns.pairplot(pd.concat([pca_seqlet_hits, pca_shuffled_seqlet_hits]), 
             diag_kind="kde", kind='scatter', hue="marker")
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[36]:
<seaborn.axisgrid.PairGrid at 0x7f229c385630>
In [37]:
sns.pairplot(pd.concat([pca_seqlet_hits, pca_seq_null_seqlet_hits]), 
             diag_kind="kde", kind='scatter', hue="marker")
Out[37]:
<seaborn.axisgrid.PairGrid at 0x7f22a5fa9470>
In [38]:
sns.pairplot(pd.concat([pca_seqlet_hits, pca_weight_null_seqlet_hits]), 
             diag_kind="kde", kind='scatter', hue="marker")
Out[38]:
<seaborn.axisgrid.PairGrid at 0x7f180f5002e8>
In [39]:
sns.pairplot(pd.concat([pca_seqlet_hits, pca_weightedseq_null_seqlet_hits]), 
             diag_kind="kde", kind='scatter', hue="marker")
Out[39]:
<seaborn.axisgrid.PairGrid at 0x7f229d74c9e8>
In [ ]:
ob = DenovoModisco(modisco_dir)
ob.initialize_with_importance_scores(summit_file, perf_file, deeplift_hdf) #loads all the important scores
#ob.find_instances_in_seqlets() #set up the clustering of seqlets for patterns
#ob.save_instances(out_prefix) #predict+collect the instances of sequences for all patterns
In [3]:
pattern_name = "metacluster_1/pattern_7"
pwm_query, _, _, _= ob.load_chomped_pattern(pattern_name, rc=False, plot=True)
modisco_pattern_weightsmodisco_pattern_probabilitychomped modisco_pattern_weightschomped modisco_pattern_probabilitychomped modisco_pattern_pwmconsensustcTTATCt
In [5]:
test_hits = ob.find_pattern_instances_in_seqlets(pattern_name, per_base_imp=0.125, test_mode=True)
In [6]:
import seaborn as sns
sns.pairplot(test_hits[['sum_score', 
                   'weight_convolution', 'pfm_convolution', 'pwm_convolution']], 
             diag_kind="kde", kind='scatter')
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[6]:
<seaborn.axisgrid.PairGrid at 0x7f0ba03f0278>
In [7]:
sns.pairplot(test_hits[['sum_score', 
                   'weight_scan', 'pfm_scan', 'pwm_scan']], 
             diag_kind="kde", kind='scatter')
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[7]:
<seaborn.axisgrid.PairGrid at 0x7f0b9c4b1128>
In [8]:
sns.pairplot(test_hits[['sum_score', 
                   'weight_jaccard', 'pfm_jaccard', 'pwm_jaccard']], 
             diag_kind="kde", kind='scatter')
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[8]:
<seaborn.axisgrid.PairGrid at 0x7f0b946eca20>
In [9]:
sns.pairplot(test_hits[['sum_score', 
                   'weight_masked_convolution', 'pfm_masked_convolution', 'pwm_masked_convolution']], 
             diag_kind="kde", kind='scatter')
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[9]:
<seaborn.axisgrid.PairGrid at 0x7f0b944dc400>
In [10]:
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(test_hits, metrics = {0:["pwm_convolution", "pfm_convolution"], 
                                            1:["pwm_convolution", "weight_convolution"], 
                                            2:["pwm_convolution", "pwm_scan"], 
                                            3:["pwm_convolution", "pwm_jaccard"]})
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

In [11]:
plots_seqlet_measures(test_hits, metrics = {0:["pwm_convolution", "pfm_jaccard"], 
                                            1:["pwm_convolution", "weight_jaccard"], 
                                            2:["pwm_convolution", "pwm_masked_convolution"], 
                                            3:["pwm_convolution", "pfm_masked_convolution"]})
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

In [4]:
ob.find_pattern_instances_in_seqlets(pattern_name, per_base_imp=0.0625, plot=True)
4 ['pwm_scan: 9.189', 'pwm_jaccard: 0.073', 'pfm_jaccard: 0.547', 'sum_score: 2.382', 'pwm_convolution: 3.044', 'pwm_masked_convolution: 0.927', 'weight_scan: 10.975', 'weight_convolution: 4.215', 'pfm_masked_convolution: 0.933', 'pfm_convolution: 2.205', 'pfm_scan: 6.621', 'weight_jaccard: 0.748', 'weight_masked_convolution: 0.974'] coordinates: (0, 20)
20 20 28
24 ['pwm_scan: 1.773', 'pwm_jaccard: 0.009', 'pfm_jaccard: 0.232', 'sum_score: 1.325', 'pwm_convolution: -0.108', 'pwm_masked_convolution: -0.041', 'weight_scan: 8.39', 'weight_convolution: 1.331', 'pfm_masked_convolution: 0.641', 'pfm_convolution: 0.758', 'pfm_scan: 4.862', 'weight_jaccard: 0.26', 'weight_masked_convolution: 0.595'] coordinates: (0, 50)
50 50 58
33 ['pwm_scan: 9.189', 'pwm_jaccard: 0.073', 'pfm_jaccard: 0.543', 'sum_score: 3.144', 'pwm_convolution: 4.006', 'pwm_masked_convolution: 0.891', 'weight_scan: 10.975', 'weight_convolution: 5.405', 'pfm_masked_convolution: 0.89', 'pfm_convolution: 2.879', 'pfm_scan: 6.621', 'weight_jaccard: 0.635', 'weight_masked_convolution: 0.912'] coordinates: (1, 20)
20 20 28
62 ['pwm_scan: 8.58', 'pwm_jaccard: 0.068', 'pfm_jaccard: 0.552', 'sum_score: 3.177', 'pwm_convolution: 4.004', 'pwm_masked_convolution: 0.927', 'weight_scan: 10.689', 'weight_convolution: 5.591', 'pfm_masked_convolution: 0.926', 'pfm_convolution: 2.901', 'pfm_scan: 6.335', 'weight_jaccard: 0.708', 'weight_masked_convolution: 0.958'] coordinates: (2, 20)
20 20 28
100 ['pwm_scan: 8.871', 'pwm_jaccard: 0.071', 'pfm_jaccard: 0.532', 'sum_score: 4.044', 'pwm_convolution: 5.181', 'pwm_masked_convolution: 0.9', 'weight_scan: 10.923', 'weight_convolution: 7.166', 'pfm_masked_convolution: 0.899', 'pfm_convolution: 3.73', 'pfm_scan: 6.47', 'weight_jaccard: 0.642', 'weight_masked_convolution: 0.934'] coordinates: (3, 20)
20 20 28
131 ['pwm_scan: 8.871', 'pwm_jaccard: 0.071', 'pfm_jaccard: 0.564', 'sum_score: 2.248', 'pwm_convolution: 2.824', 'pwm_masked_convolution: 0.933', 'weight_scan: 10.923', 'weight_convolution: 3.916', 'pfm_masked_convolution: 0.939', 'pfm_convolution: 2.051', 'pfm_scan: 6.47', 'weight_jaccard: 0.745', 'weight_masked_convolution: 0.97'] coordinates: (4, 20)
20 20 28
174 ['pwm_scan: 8.263', 'pwm_jaccard: 0.065', 'pfm_jaccard: 0.522', 'sum_score: 3.775', 'pwm_convolution: 4.362', 'pwm_masked_convolution: 0.881', 'weight_scan: 10.636', 'weight_convolution: 6.193', 'pfm_masked_convolution: 0.901', 'pfm_convolution: 3.244', 'pfm_scan: 6.184', 'weight_jaccard: 0.61', 'weight_masked_convolution: 0.913'] coordinates: (5, 20)
20 20 28
196 ['pwm_scan: 7.616', 'pwm_jaccard: 0.06', 'pfm_jaccard: 0.467', 'sum_score: 2.184', 'pwm_convolution: 2.69', 'pwm_masked_convolution: 0.841', 'weight_scan: 10.584', 'weight_convolution: 3.782', 'pfm_masked_convolution: 0.853', 'pfm_convolution: 1.964', 'pfm_scan: 6.033', 'weight_jaccard: 0.514', 'weight_masked_convolution: 0.866'] coordinates: (6, 20)
20 20 28
235 ['pwm_scan: 8.58', 'pwm_jaccard: 0.068', 'pfm_jaccard: 0.473', 'sum_score: 3.503', 'pwm_convolution: 4.523', 'pwm_masked_convolution: 0.904', 'weight_scan: 10.689', 'weight_convolution: 6.409', 'pfm_masked_convolution: 0.902', 'pfm_convolution: 3.276', 'pfm_scan: 6.335', 'weight_jaccard: 0.642', 'weight_masked_convolution: 0.947'] coordinates: (7, 20)
20 20 28
257 ['pwm_scan: 8.225', 'pwm_jaccard: 0.065', 'pfm_jaccard: 0.535', 'sum_score: 1.957', 'pwm_convolution: 2.51', 'pwm_masked_convolution: 0.918', 'weight_scan: 10.87', 'weight_convolution: 3.529', 'pfm_masked_convolution: 0.932', 'pfm_convolution: 1.822', 'pfm_scan: 6.32', 'weight_jaccard: 0.711', 'weight_masked_convolution: 0.971'] coordinates: (8, 20)
20 20 28
281 ['pwm_scan: 8.263', 'pwm_jaccard: 0.065', 'pfm_jaccard: 0.542', 'sum_score: 2.149', 'pwm_convolution: 2.572', 'pwm_masked_convolution: 0.927', 'weight_scan: 10.636', 'weight_convolution: 3.605', 'pfm_masked_convolution: 0.937', 'pfm_convolution: 1.891', 'pfm_scan: 6.184', 'weight_jaccard: 0.692', 'weight_masked_convolution: 0.948'] coordinates: (9, 20)
20 20 28
INFO:2019-09-09 18:20:43,900:test.log] trained metacluster_1/pattern_7. Used 13587 instances out of (13587, 15)
Number of instances (917, 15)
Metrics used Index(['sum_score', 'seqno', 'start_idx', 'pwm_scan', 'pfm_scan',
       'weight_scan', 'pwm_jaccard', 'pfm_jaccard', 'weight_jaccard',
       'pwm_convolution', 'pfm_convolution', 'weight_convolution',
       'pwm_masked_convolution', 'pfm_masked_convolution',
       'weight_masked_convolution'],
      dtype='object')
Minimum of sum_score 0.5294555804808624
Minimum of seqno 0
Minimum of start_idx 2
Minimum of pwm_scan 1.335839857548609
Minimum of pfm_scan 4.629802247582542
Minimum of weight_scan 8.267396133835705
Minimum of pwm_jaccard 0.009196646206622246
Minimum of pfm_jaccard 0.2071279446085702
Minimum of weight_jaccard 0.21956038132732028
Minimum of pwm_convolution -0.10803838253418951
Minimum of pfm_convolution 0.49349497595970093
Minimum of weight_convolution 0.6961941237791255
Minimum of pwm_masked_convolution -0.04106434081587958
Minimum of pfm_masked_convolution 0.4390628312454056
Minimum of weight_masked_convolution 0.3645870793961588
Missed seqlets set()
In [39]:
ob.train_seqlet_hits[pattern_name][:5]#.shape
# from random import sample

# random_seqlet_hits = sample(list(ob.train_seqlet_hits[pattern_name].index), 3)
#list(ob.train_seqlet_hits[pattern_name].index)


pwm_query, pfm_query, weigh_query, consensus = ob.load_chomped_pattern(pattern_name, rc=False, plot=True)
modisco_pattern_weightsmodisco_pattern_probabilitychomped modisco_pattern_weightschomped modisco_pattern_probabilitychomped modisco_pattern_pwmconsensusTCTTATCT
In [41]:
 
Out[41]:
'tcTTATCt'
In [6]:
#ob.deeplift_data['peaks'][(ob.deeplift_data['peaks'].chr=='chr2')&(ob.deeplift_data['peaks'].start>=3163300)]
from modisco.visualization import viz_sequence
viz_sequence.plot_weights(ob.deeplift_data['scores'][0, 400:600,:],
                                  #highlight={"black": [(start_motif, end_motif)]},
                                  subticks_frequency=10, figsize=(20,1))

viz_sequence.plot_weights(ob.deeplift_data['one_hot'][0, 400:600, :], 
                          #highlight={"black": [(start_motif, end_motif)]},
                          subticks_frequency=10, figsize=(20,1))
In [14]:
from matlas.matches import check_summary_instances
#seqlet_onehot = ob.seqs_per_pattern[pattern_name]
#filtered_seqlet_hits = ob.final_seqlet_hits[pattern_name]
check_summary_instances(ob.final_seqlet_hits[pattern_name], 
                        range(ob.seqs_per_pattern[pattern_name].shape[0]))
Number of instances (917, 15)
Metrics used Index(['sum_score', 'seqno', 'start_idx', 'pwm_scan', 'pfm_scan',
       'weight_scan', 'pwm_jaccard', 'pfm_jaccard', 'weight_jaccard',
       'pwm_convolution', 'pfm_convolution', 'weight_convolution',
       'pwm_masked_convolution', 'pfm_masked_convolution',
       'weight_masked_convolution'],
      dtype='object')
Minimum of sum_score 0.5294555804808624
Minimum of seqno 0
Minimum of start_idx 2
Minimum of pwm_scan 1.335839857548609
Minimum of pfm_scan 4.629802247582542
Minimum of weight_scan 8.267396133835705
Minimum of pwm_jaccard 0.009196646206622246
Minimum of pfm_jaccard 0.2071279446085702
Minimum of weight_jaccard 0.21956038132732028
Minimum of pwm_convolution -0.10803838253418951
Minimum of pfm_convolution 0.49349497595970093
Minimum of weight_convolution 0.6961941237791255
Minimum of pwm_masked_convolution -0.04106434081587958
Minimum of pfm_masked_convolution 0.4390628312454056
Minimum of weight_masked_convolution 0.3645870793961588
Missed seqlets set()
In [16]:
from matlas.matches import view_test_case

view_test_case(ob.contrib_per_pattern[pattern_name], 
               ob.seqs_per_pattern[pattern_name], 
               ob.final_seqlet_hits[pattern_name], 
               pwm_query, 
               seqno = 714,
               start=0, end=70
              )
            

# for seqno in np.sort(np.array([5055, 1635, 2437, 4616, 1353, 2185, 6253, 1550, 2446, 2257, 2353, 2325, 2038, 1945, 5019, 2303])):
#     view_test_case(seqlet_scores, seqlet_onehot, filtered_seqlet_hits2, pwm_query, seqno=seqno, start=0, end=70)
11909 ['weight_convolution: 0.914', 'pfm_jaccard: 0.474', 'pfm_convolution: 0.51', 'weight_masked_convolution: 0.864', 'pwm_scan: 7.616', 'pfm_scan: 6.033', 'pfm_masked_convolution: 0.913', 'pwm_jaccard: 0.06', 'weight_jaccard: 0.548', 'pwm_masked_convolution: 0.863', 'sum_score: 0.613', 'pwm_convolution: 0.669', 'weight_scan: 10.584'] coordinates: (714, 28)
28 28 36
In [18]:
from matlas.matches import view_test_cases

view_test_cases(ob.contrib_per_pattern[pattern_name], 
               ob.seqs_per_pattern[pattern_name], 
               ob.final_seqlet_hits[pattern_name], 
               pwm_query, 
               start=0, end=70, 
               max_examples_to_view=10)
4 ['weight_convolution: 4.215', 'pfm_jaccard: 0.547', 'pfm_convolution: 2.205', 'weight_masked_convolution: 0.974', 'pwm_scan: 9.189', 'pfm_scan: 6.621', 'pfm_masked_convolution: 0.933', 'pwm_jaccard: 0.073', 'weight_jaccard: 0.748', 'pwm_masked_convolution: 0.927', 'sum_score: 2.382', 'pwm_convolution: 3.044', 'weight_scan: 10.975'] coordinates: (0, 20)
20 20 28
24 ['weight_convolution: 1.331', 'pfm_jaccard: 0.232', 'pfm_convolution: 0.758', 'weight_masked_convolution: 0.595', 'pwm_scan: 1.773', 'pfm_scan: 4.862', 'pfm_masked_convolution: 0.641', 'pwm_jaccard: 0.009', 'weight_jaccard: 0.26', 'pwm_masked_convolution: -0.041', 'sum_score: 1.325', 'pwm_convolution: -0.108', 'weight_scan: 8.39'] coordinates: (0, 50)
50 50 58
33 ['weight_convolution: 5.405', 'pfm_jaccard: 0.543', 'pfm_convolution: 2.879', 'weight_masked_convolution: 0.912', 'pwm_scan: 9.189', 'pfm_scan: 6.621', 'pfm_masked_convolution: 0.89', 'pwm_jaccard: 0.073', 'weight_jaccard: 0.635', 'pwm_masked_convolution: 0.891', 'sum_score: 3.144', 'pwm_convolution: 4.006', 'weight_scan: 10.975'] coordinates: (1, 20)
20 20 28
62 ['weight_convolution: 5.591', 'pfm_jaccard: 0.552', 'pfm_convolution: 2.901', 'weight_masked_convolution: 0.958', 'pwm_scan: 8.58', 'pfm_scan: 6.335', 'pfm_masked_convolution: 0.926', 'pwm_jaccard: 0.068', 'weight_jaccard: 0.708', 'pwm_masked_convolution: 0.927', 'sum_score: 3.177', 'pwm_convolution: 4.004', 'weight_scan: 10.689'] coordinates: (2, 20)
20 20 28
100 ['weight_convolution: 7.166', 'pfm_jaccard: 0.532', 'pfm_convolution: 3.73', 'weight_masked_convolution: 0.934', 'pwm_scan: 8.871', 'pfm_scan: 6.47', 'pfm_masked_convolution: 0.899', 'pwm_jaccard: 0.071', 'weight_jaccard: 0.642', 'pwm_masked_convolution: 0.9', 'sum_score: 4.044', 'pwm_convolution: 5.181', 'weight_scan: 10.923'] coordinates: (3, 20)
20 20 28
131 ['weight_convolution: 3.916', 'pfm_jaccard: 0.564', 'pfm_convolution: 2.051', 'weight_masked_convolution: 0.97', 'pwm_scan: 8.871', 'pfm_scan: 6.47', 'pfm_masked_convolution: 0.939', 'pwm_jaccard: 0.071', 'weight_jaccard: 0.745', 'pwm_masked_convolution: 0.933', 'sum_score: 2.248', 'pwm_convolution: 2.824', 'weight_scan: 10.923'] coordinates: (4, 20)
20 20 28
174 ['weight_convolution: 6.193', 'pfm_jaccard: 0.522', 'pfm_convolution: 3.244', 'weight_masked_convolution: 0.913', 'pwm_scan: 8.263', 'pfm_scan: 6.184', 'pfm_masked_convolution: 0.901', 'pwm_jaccard: 0.065', 'weight_jaccard: 0.61', 'pwm_masked_convolution: 0.881', 'sum_score: 3.775', 'pwm_convolution: 4.362', 'weight_scan: 10.636'] coordinates: (5, 20)
20 20 28
196 ['weight_convolution: 3.782', 'pfm_jaccard: 0.467', 'pfm_convolution: 1.964', 'weight_masked_convolution: 0.866', 'pwm_scan: 7.616', 'pfm_scan: 6.033', 'pfm_masked_convolution: 0.853', 'pwm_jaccard: 0.06', 'weight_jaccard: 0.514', 'pwm_masked_convolution: 0.841', 'sum_score: 2.184', 'pwm_convolution: 2.69', 'weight_scan: 10.584'] coordinates: (6, 20)
20 20 28
235 ['weight_convolution: 6.409', 'pfm_jaccard: 0.473', 'pfm_convolution: 3.276', 'weight_masked_convolution: 0.947', 'pwm_scan: 8.58', 'pfm_scan: 6.335', 'pfm_masked_convolution: 0.902', 'pwm_jaccard: 0.068', 'weight_jaccard: 0.642', 'pwm_masked_convolution: 0.904', 'sum_score: 3.503', 'pwm_convolution: 4.523', 'weight_scan: 10.689'] coordinates: (7, 20)
20 20 28
257 ['weight_convolution: 3.529', 'pfm_jaccard: 0.535', 'pfm_convolution: 1.822', 'weight_masked_convolution: 0.971', 'pwm_scan: 8.225', 'pfm_scan: 6.32', 'pfm_masked_convolution: 0.932', 'pwm_jaccard: 0.065', 'weight_jaccard: 0.711', 'pwm_masked_convolution: 0.918', 'sum_score: 1.957', 'pwm_convolution: 2.51', 'weight_scan: 10.87'] coordinates: (8, 20)
20 20 28
281 ['weight_convolution: 3.605', 'pfm_jaccard: 0.542', 'pfm_convolution: 1.891', 'weight_masked_convolution: 0.948', 'pwm_scan: 8.263', 'pfm_scan: 6.184', 'pfm_masked_convolution: 0.937', 'pwm_jaccard: 0.065', 'weight_jaccard: 0.692', 'pwm_masked_convolution: 0.927', 'sum_score: 2.149', 'pwm_convolution: 2.572', 'weight_scan: 10.636'] coordinates: (9, 20)
20 20 28
In [7]:
#only forward pattern
df, consensus = ob._predict_pattern_instances_in_sequences("metacluster_1/pattern_7")
df.shape
INFO:2019-09-09 19:03:04,314:test.log] predicted, metacluster_1/pattern_7
Out[7]:
(15326, 15)
In [20]:
# both forward and backward pattern
df = ob.predict_pattern_instances_in_sequences("metacluster_1/pattern_7")
df.shape
Out[20]:
(27281, 6)
In [26]:
df.shape
Out[26]:
(27281, 6)
In [11]:
from matlas.matches import view_test_cases
view_test_cases(ob.deeplift_data['scores'], 
               ob.deeplift_data['one_hot'], 
               df, 
               pwm_query, 
               start=400, end=600, 
               max_examples_to_view=10)
19 ['pwm_scan: 9.189', 'pwm_jaccard: 0.073', 'pfm_jaccard: 0.645', 'sum_score: 0.569', 'pwm_convolution: 0.705', 'pwm_masked_convolution: 0.98', 'weight_scan: 10.975', 'weight_convolution: 0.912', 'pfm_masked_convolution: 0.977', 'pfm_convolution: 0.507', 'pfm_scan: 6.621', 'weight_jaccard: 0.738', 'weight_masked_convolution: 0.961'] coordinates: (1, 431)
431 31 39
126 ['pwm_scan: 3.334', 'pwm_jaccard: 0.081', 'pfm_jaccard: 0.376', 'sum_score: 0.504', 'pwm_convolution: 0.897', 'pwm_masked_convolution: 0.773', 'weight_scan: 8.915', 'weight_convolution: 0.955', 'pfm_masked_convolution: 0.866', 'pfm_convolution: 0.504', 'pfm_scan: 5.112', 'weight_jaccard: 0.421', 'weight_masked_convolution: 0.857'] coordinates: (5, 503)
503 103 111
923 ['pwm_scan: 3.685', 'pwm_jaccard: 0.017', 'pfm_jaccard: 0.323', 'sum_score: 1.056', 'pwm_convolution: 0.359', 'pwm_masked_convolution: 0.181', 'weight_scan: 8.893', 'weight_convolution: 0.916', 'pfm_masked_convolution: 0.631', 'pfm_convolution: 0.61', 'pfm_scan: 5.359', 'weight_jaccard: 0.29', 'weight_masked_convolution: 0.52'] coordinates: (39, 575)
575 175 183
1029 ['pwm_scan: 9.189', 'pwm_jaccard: 0.073', 'pfm_jaccard: 0.447', 'sum_score: 0.537', 'pwm_convolution: 0.577', 'pwm_masked_convolution: 0.776', 'weight_scan: 10.975', 'weight_convolution: 0.591', 'pfm_masked_convolution: 0.77', 'pfm_convolution: 0.412', 'pfm_scan: 6.621', 'weight_jaccard: 0.398', 'weight_masked_convolution: 0.603'] coordinates: (48, 525)
525 125 133
1600 ['pwm_scan: 9.189', 'pwm_jaccard: 0.073', 'pfm_jaccard: 0.618', 'sum_score: 0.885', 'pwm_convolution: 1.121', 'pwm_masked_convolution: 0.963', 'weight_scan: 10.975', 'weight_convolution: 1.505', 'pfm_masked_convolution: 0.963', 'pfm_convolution: 0.807', 'pfm_scan: 6.621', 'weight_jaccard: 0.775', 'weight_masked_convolution: 0.981'] coordinates: (66, 548)
548 148 156
1612 ['pwm_scan: 2.454', 'pwm_jaccard: 0.018', 'pfm_jaccard: 0.306', 'sum_score: 1.322', 'pwm_convolution: -0.04', 'pwm_masked_convolution: -0.016', 'weight_scan: 8.839', 'weight_convolution: 1.264', 'pfm_masked_convolution: 0.683', 'pfm_convolution: 0.777', 'pfm_scan: 5.245', 'weight_jaccard: 0.318', 'weight_masked_convolution: 0.609'] coordinates: (66, 608)
608 208 216
1621 ['pwm_scan: 2.454', 'pwm_jaccard: 0.017', 'pfm_jaccard: 0.287', 'sum_score: 0.682', 'pwm_convolution: 0.004', 'pwm_masked_convolution: 0.003', 'weight_scan: 8.839', 'weight_convolution: 0.774', 'pfm_masked_convolution: 0.654', 'pfm_convolution: 0.422', 'pfm_scan: 5.245', 'weight_jaccard: 0.308', 'weight_masked_convolution: 0.657'] coordinates: (66, 738)
738 338 346
1627 ['pwm_scan: 9.189', 'pwm_jaccard: 0.073', 'pfm_jaccard: 0.514', 'sum_score: 0.698', 'pwm_convolution: 0.884', 'pwm_masked_convolution: 0.914', 'weight_scan: 10.975', 'weight_convolution: 1.226', 'pfm_masked_convolution: 0.92', 'pfm_convolution: 0.641', 'pfm_scan: 6.621', 'weight_jaccard: 0.72', 'weight_masked_convolution: 0.961'] coordinates: (67, 421)
421 21 29
1638 ['pwm_scan: 2.454', 'pwm_jaccard: 0.006', 'pfm_jaccard: 0.286', 'sum_score: 1.219', 'pwm_convolution: -0.261', 'pwm_masked_convolution: -0.107', 'weight_scan: 8.839', 'weight_convolution: 0.924', 'pfm_masked_convolution: 0.554', 'pfm_convolution: 0.618', 'pfm_scan: 5.245', 'weight_jaccard: 0.254', 'weight_masked_convolution: 0.454'] coordinates: (67, 481)
481 81 89
1646 ['pwm_scan: 9.189', 'pwm_jaccard: 0.073', 'pfm_jaccard: 0.527', 'sum_score: 1.444', 'pwm_convolution: 1.856', 'pwm_masked_convolution: 0.919', 'weight_scan: 10.975', 'weight_convolution: 2.588', 'pfm_masked_convolution: 0.924', 'pfm_convolution: 1.343', 'pfm_scan: 6.621', 'weight_jaccard: 0.735', 'weight_masked_convolution: 0.973'] coordinates: (67, 551)
551 151 159
1655 ['pwm_scan: 5.229', 'pwm_jaccard: 0.042', 'pfm_jaccard: 0.485', 'sum_score: 0.553', 'pwm_convolution: 0.422', 'pwm_masked_convolution: 0.594', 'weight_scan: 9.257', 'weight_convolution: 0.711', 'pfm_masked_convolution: 0.911', 'pfm_convolution: 0.402', 'pfm_scan: 5.429', 'weight_jaccard: 0.49', 'weight_masked_convolution: 0.86'] coordinates: (67, 596)
596 196 204
1668 ['pwm_scan: 9.189', 'pwm_jaccard: 0.073', 'pfm_jaccard: 0.447', 'sum_score: 0.669', 'pwm_convolution: 0.87', 'pwm_masked_convolution: 0.876', 'weight_scan: 10.975', 'weight_convolution: 1.237', 'pfm_masked_convolution: 0.88', 'pfm_convolution: 0.629', 'pfm_scan: 6.621', 'weight_jaccard: 0.655', 'weight_masked_convolution: 0.945'] coordinates: (68, 206)
206 -194 -186
1681 ['pwm_scan: 9.189', 'pwm_jaccard: 0.073', 'pfm_jaccard: 0.509', 'sum_score: 0.591', 'pwm_convolution: 0.753', 'pwm_masked_convolution: 0.892', 'weight_scan: 10.975', 'weight_convolution: 1.056', 'pfm_masked_convolution: 0.902', 'pfm_convolution: 0.549', 'pfm_scan: 6.621', 'weight_jaccard: 0.692', 'weight_masked_convolution: 0.949'] coordinates: (68, 336)
336 -64 -56
2472 ['pwm_scan: 8.58', 'pwm_jaccard: 0.064', 'pfm_jaccard: 0.46', 'sum_score: 0.524', 'pwm_convolution: 0.686', 'pwm_masked_convolution: 0.917', 'weight_scan: 10.689', 'weight_convolution: 0.923', 'pfm_masked_convolution: 0.902', 'pfm_convolution: 0.49', 'pfm_scan: 6.335', 'weight_jaccard: 0.566', 'weight_masked_convolution: 0.912'] coordinates: (115, 461)
461 61 69
2563 ['pwm_scan: 8.225', 'pwm_jaccard: 0.065', 'pfm_jaccard: 0.524', 'sum_score: 0.7', 'pwm_convolution: 0.893', 'pwm_masked_convolution: 0.929', 'weight_scan: 10.87', 'weight_convolution: 1.238', 'pfm_masked_convolution: 0.941', 'pfm_convolution: 0.646', 'pfm_scan: 6.32', 'weight_jaccard: 0.711', 'weight_masked_convolution: 0.969'] coordinates: (118, 545)
545 145 153
2610 ['pwm_scan: 3.611', 'pwm_jaccard: 0.012', 'pfm_jaccard: 0.293', 'sum_score: 0.547', 'pwm_convolution: -0.09', 'pwm_masked_convolution: -0.084', 'weight_scan: 8.879', 'weight_convolution: 0.527', 'pfm_masked_convolution: 0.602', 'pfm_convolution: 0.309', 'pfm_scan: 5.358', 'weight_jaccard: 0.285', 'weight_masked_convolution: 0.564'] coordinates: (120, 548)
548 148 156
In [12]:
ob.deeplift_data['peaks'].loc[67,:]
Out[12]:
chr          chr1
end       8135243
start     8134243
strand       b'*'
Name: 67, dtype: object
In [28]:
from matlas.utils import send_to_mitra
outdir = "{0}/web_mouse_heme_data/modisco_tracks/binary_model/task_{1}-naivegw".format(project_root, task_idx)

if not os.path.exists(outdir):
    os.makedirs(outdir)
out_prefix = "{0}/mm17_hits_spectral_1_7".format(outdir)
send_to_mitra(df, out_prefix, mitra=outdir, hammock=True)
writing...(27281, 6)
In [31]:
data_file = "{0}/spectral.p".format(ob.data_dir)
pickle.dump(ob.clustering, open(data_file, "wb"))
In [32]:
pickle.load(open(data_file, "rb"))
Out[32]:
OrderedDict([('metacluster_1/pattern_7',
              SpectralClustering(affinity='rbf', assign_labels='discretize', coef0=1,
                        degree=3, eigen_solver=None, eigen_tol=0.0, gamma=1.0,
                        kernel_params=None, n_clusters=2, n_init=10, n_jobs=1,
                        n_neighbors=10, random_state=0))])
In [22]:
def print_hits(seqno, df):
    df = df[df.seqno==seqno]
    return df

seqno = 411
#print_hits(seqno, seqlet_hits)
#print_hits(seqno, filtered_seqlet_hits)
#print_hits(seqno, filtered_seqlet_hits2)
In [23]:
def print_positional_metadata(seqno, position, method_to_seqlet_scores):
    print("seqno: ", seqno, ", position: ", position)
    for key in method_to_seqlet_scores.keys():
        print(key, ": ", method_to_seqlet_scores[key][seqno, position])
    return None

#print_positional_metadata(411, 50)
In [24]:
#deeplift_data['peaks'].loc[67,:]#chr1:8135243:8134243, #67th in deeplift_scores
#np.argwhere(indices==67) #40, #40th in modisco_imp_scores

#seqlets_per_pattern = ob.mr.seqlets()
#seqlets_ = seqlets_per_pattern['metacluster_1/pattern_7']
# for seqlet in seqlets_:
#     if(seqlet.seqname==40):
#         break
# print(seqlet)#Seqlet(seqname=40, start=531, end=601, name='', strand='+')
#selected_summits.loc[40,:] #chr1:8134243:8135243