import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
import h5py
import glob
from matlas.genome_data import *
from matlas.pwms import shorten_tfname, get_motifDB_id_maps
from matlas.matches import fig2vdom, vdom_pssm
from matlas.modisco_results import ModiscoResult
from vdom.helpers import (h1, p, li, img, div, b, br, ul, a,
details, summary,
table, thead, th, tr, tbody, td, ol)
from IPython.display import display
project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
modisco_root = "{0}/{1}/Naive_modisco2019".format(srv_root, data_name)
homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)
basset_root = "{0}/{1}/Naive_basset_motifs".format(srv_root, data_name)
reportfile= "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"
from matlas.matches import DenovoModisco
task_idx = 273
modisco_dir = "{0}/task_{1}-naivegw".format(modisco_root, task_idx)
summit_file = "{0}/{1}/fineFactorized/task_{2}-naivegw/predictions/task_{2}-{3}Summit.calibrated.{1}.tab.gz".format(project_root, data_name, task_idx, "Naive")
perf_file = "{0}/task_{1}-naivegw/NaiveauPRC.txt".format(model_root, task_idx)
deeplift_hdf = "{0}/{1}/Naive_deeplift2019/task_{2}-naivegw/summit.h5".format(srv_root, data_name, task_idx)
ob = DenovoModisco(modisco_dir)
pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, plot=True)
ob.initialize_with_importance_scores(summit_file, perf_file, deeplift_hdf, query.keys())  # loads all the importance scores
deeplift_data = ob.deeplift_data
from matlas.utils import get_calibrated_peaks
calibrated_data = get_calibrated_peaks(summit_file, perf_file)
selected_summits = calibrated_data['selected_summits']
indices = calibrated_data['indices']
#from basepair.plot.profiles import extract_signal
from matlas.tracks import extract_signal
pattern_name = 'metacluster_1/pattern_7'
seqlet_scores = ob.contrib_per_pattern[pattern_name]
seqlet_onehot = ob.seqs_per_pattern[pattern_name]
seqlets_ = ob.seqlets_per_pattern[pattern_name]
shuffled_onehot = extract_signal(ob.deeplift_data['shuffled_onehot'][indices], seqlets_)
shuffled_scores = extract_signal(ob.deeplift_data['shuffled_scores'][indices], seqlets_)
shuffled_onehot.shape, shuffled_scores.shape
from matlas.matches import collect_scoring_measures
method_to_seqlet_scores = collect_scoring_measures(query, seqlet_scores, seqlet_onehot)
print(query.keys(), np.sort(np.array(list(method_to_seqlet_scores.keys()))))
scores = deeplift_data['scores']
one_hot = deeplift_data['one_hot']
method_to_scores = collect_scoring_measures(query, scores, one_hot)
from matlas.matches import get_any_high_scoring_instances
per_base_imp = 0.0625
keys_for_scoring = ['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key != 'consensus']
seqlet_hits = get_any_high_scoring_instances(method_to_seqlet_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
_hits = get_any_high_scoring_instances(method_to_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
keys_for_scoring = ['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in ['pssm', 'hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_jaccard".format(key) for key in ['pssm', 'hyp_cwm']]
#keys_for_scoring += ["{}_convolution".format(key) for key in ['hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm']]
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
def transform_data(_hits, scaler=None, pca=None):
if scaler is None:
# should be used only during training
scaler = StandardScaler()
scaler.fit(_hits)
# Apply transform to both the training set and testing set
_hits = scaler.transform(_hits)
if pca is None:
# Make an instance of the Model
pca = PCA(.98)
#fitting PCA on the training set only.
pca.fit(_hits)
#Apply the mapping (transform) to both the training set and the test set.
_hits = pca.transform(_hits)
return scaler, pca, _hits
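# Design note: the scaler and PCA are fit only when passed as None (the
# training call on the foreground seqlet hits) and the fitted objects are
# reused for every null/test set, so foreground and background land in the
# same PC coordinate system. PCA(.98) keeps as many components as needed to
# explain 98% of the variance.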
scaler, pca, train_hits = transform_data(seqlet_hits[keys_for_scoring])
pca.explained_variance_ratio_
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
def load_true_positives(tp_file):
coordinates = []
weak_coordinates = []
with open(tp_file, "r") as fp:
for line in fp:
line = line.strip().split(",")
seqno = line[0]
pos = int(line[1])
if "-" in seqno:
seqno = seqno.split("-")
for i in range(int(seqno[0]), int(seqno[1])+1):
coordinates.append((i, pos))
else:
if len(line)>2:
#if 'weak' in line[2]:
weak_coordinates.append((int(seqno), pos))
#else:
#coordinates.append((int(seqno), pos))
else:
coordinates.append((int(seqno), pos))
return {'strong':coordinates, 'weak':weak_coordinates}
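# A usage sketch for load_true_positives, assuming the hand-curated format
# implied by the parser above: one "seqno,pos[,flag]" entry per line, where
# seqno may be a "start-end" range. The example file contents below are
# hypothetical:
#
#   12,50         -> strong match at seqlet 12, offset 50
#   30-32,41      -> strong matches at seqlets 30, 31 and 32, offset 41
#   77,18,weak    -> weak match at seqlet 77, offset 18
#
# load_true_positives on that file would return
# {'strong': [(12, 50), (30, 41), (31, 41), (32, 41)], 'weak': [(77, 18)]}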
def get_tp_indices(seqlet_hits, coordinates, verbose=True):
keep = OrderedDict()
for key in coordinates.keys():
keep[key] = []
for cord in coordinates[key]:
seqno, pos = cord
df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
if df.shape[0]>0:
keep[key].append(df.index[0])
else:
if verbose: print(cord)
return keep
tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp_new.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, coordinates, False)
len(true_indices['strong']), len(true_indices['weak']), seqlet_hits.shape#, sum(labels==1)
import matplotlib.pyplot as plt
def plot_pca(df, true_ids=None):
print(df.shape)
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('PC1', fontsize = 15)
ax.set_ylabel('PC2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
if true_ids is not None:
strong_ids = true_ids['strong']
strong_df = df.loc[np.array(strong_ids),:]
ax.scatter(strong_df['PC1'], strong_df['PC2'], s = 1, c='tomato')
weak_ids = true_ids['weak']
weak_df = df.loc[np.array(weak_ids),:]
ax.scatter(weak_df['PC1'], weak_df['PC2'], s = 5, c='darkred')
df = df.loc[np.array(list(set(list(df.index)).difference(
set(strong_ids).union(set(weak_ids))
))),:]
ax.scatter(df['PC1'], df['PC2'], s = 1, c='slateblue')
#ax.axvline(9)
#ax.axhline(-1)
ax.grid()
ax.legend(['strong match', 'weak match', 'no match'], loc="upper right")
return None
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, coordinates, False)
plot_pca(pc_df, true_indices)
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=2, covariance_type='full', init_params='kmeans').fit(pc_df[['PC1', 'PC2']])
labels = gmm.predict(pc_df[['PC1', 'PC2']])
print(np.sum(labels==1), np.sum(labels==0))
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=labels, s=1, cmap='seismic');
probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
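# Note: GaussianMixture assigns component indices arbitrarily, so probs[:, 1]
# is the posterior of whichever component happened to be labeled 1; the label
# counts printed above are a quick way to check which component is the
# putative "match" cluster before reading probs[:, 1] as a match probability.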
pcv1 = pca.explained_variance_ratio_[0]
pcv2 = pca.explained_variance_ratio_[1]  # PC2's variance ratio is at index 1
pc_maxs = pc_df.max()
w1, w2 = pcv1 / (pcv1 + pcv2), pcv2 / (pcv1 + pcv2)
affinity = (pc_df['PC1'] * w1 + pc_df['PC2'] * w2) / (pc_maxs['PC1'] * w1 + pc_maxs['PC2'] * w2)
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=affinity, s=1, cmap='seismic');
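# A hypothetical helper (not part of matlas) making the affinity computation
# above reusable: project onto the chosen PCs with variance-ratio weights,
# then normalize by the same projection of the per-PC maxima.
def compute_affinity(df, pca, pcs=('PC1', 'PC2')):
    w = pca.explained_variance_ratio_[:len(pcs)]
    w = w / w.sum()  # renormalize the selected PCs' variance ratios
    proj = sum(df[pc] * wi for pc, wi in zip(pcs, w))
    proj_max = sum(df[pc].max() * wi for pc, wi in zip(pcs, w))
    return proj / proj_max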
labels = np.array([0]*pc_df.shape[0])
labels[np.array(true_indices['strong'])]=2
labels[np.array(true_indices['weak'])]=1
pc_df['label'] = labels
pc_df['index'] = pc_df.index
pc_df['seqno'] = seqlet_hits.seqno.values  # .values avoids index alignment
pc_df['pos'] = seqlet_hits.start_idx.values
pc_df['affinity'] = affinity
pc_df[:5]
import plotly.express as px
fig = px.scatter(pc_df, x="PC1", y="PC2", hover_data=['index', 'seqno', 'pos'], color='affinity',
color_continuous_scale=px.colors.sequential.Magma
)
fig
#seqlets pca, seqlet gmm
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
pc_maxs = _pc_df.max()
affinity = (_pc_df['PC1'] * w1 + _pc_df['PC2'] * w2) / (pc_maxs['PC1'] * w1 + pc_maxs['PC2'] * w2)
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=affinity, s=1, cmap='seismic');
#independent pca, seqlet gmm
_scaler, _pca, _train_hits = transform_data(_hits[keys_for_scoring])
print(_pca.explained_variance_ratio_)
_pc_df = pd.DataFrame(data = _train_hits, columns=["PC{}".format(i+1) for i in range(_train_hits.shape[1])])
_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
# seqlets pca, independent gmm
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
_gmm = GaussianMixture(n_components=2).fit(_pc_df[['PC1', 'PC2']])
_labels = _gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = _gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
#independent pca, independent gmm
_scaler, _pca, _train_hits = transform_data(_hits[keys_for_scoring])
print(_pca.explained_variance_ratio_)
_pc_df = pd.DataFrame(data = _train_hits, columns=["PC{}".format(i+1) for i in range(_train_hits.shape[1])])
_gmm = GaussianMixture(n_components=2).fit(_pc_df[['PC1', 'PC2']])
_labels = _gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = _gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
sorted_hits = _hits.copy()  # copy so the added columns don't mutate _hits
sorted_hits['probs'] = _probs[:,1]
sorted_hits['labels'] = _labels
sorted_hits = sorted_hits[sorted_hits['labels']==1]
print(sorted_hits.shape)
outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw"
filename = "m17_gmm.gz"
instancefile = "{0}/{1}".format(outdir, filename)
sorted_hits.to_csv(instancefile, sep='\t')
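# Note: recent pandas versions infer gzip compression from the ".gz" suffix,
# so the table above is written compressed without an explicit compression=
# argument.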
def predict_pattern(df, probs=None):
if probs is not None:
        df = df.loc[df['probs'] >= probs, :]
consensus = query['consensus']
summits = ob.deeplift_data['peaks']
display_name = ob.shorten_pattern_name(pattern_name)
peak_no = df.shape[0]
_indices = df.seqno.values
chroms = summits.chr.values[_indices]
starts = summits.start.values[_indices] + df.start_idx.values
ends = starts + len(consensus)
names = np.array(["{0}/{1}".format(display_name, consensus)]*peak_no)
scores = df.probs.values
strands = np.array(['.']*peak_no)
df = pd.DataFrame.from_dict({0:chroms, 1:starts, 2:ends, 3:names, 4:scores, 5:strands})
df = df.sort_values([0, 1, 4])
#df = df.drop_duplicates(subset=[0,1,2,3,5])
return df
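# predict_pattern emits a BED6-style frame: columns 0-5 hold chrom, start,
# end, name ("pattern/consensus"), score (the GMM posterior probability),
# and strand ('.' for unstranded hits), sorted by chrom, start and score.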
df = predict_pattern(sorted_hits)
from matlas.utils import send_to_mitra
outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw"
filename = "m17_gmm"
out_prefix = "{0}/{1}".format(outdir, filename)
send_to_mitra(df, out_prefix, mitra=outdir, hammock=True)
root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw"
dlhits = pd.read_csv("{}/m17_gmm.bed.gz".format(root), header=None, sep="\t")
ismhits = pd.read_csv("{}/m17_gmm_ism.bed.gz".format(root), header=None, sep="\t")
dlhits.shape, ismhits.shape
# pad both hit tables to the 10-column IDR-style input expected downstream
# (extra columns: score, -1, score, -1 placeholders)
for hits in (dlhits, ismhits):
    hits[6] = hits[4]
    hits[7] = -1
    hits[8] = hits[4]
    hits[9] = -1
dlhits.to_csv("{}/m17_gmm_idr.bed".format(root), header=False, index=False, sep="\t")
ismhits.to_csv("{}/m17_gmm_ism_idr.bed".format(root), header=False, index=False, sep="\t")
dlhits[:5]
from matlas.verification import compute_chip_matrix
compute_chip_matrix("MEL", "GATA", out_prefix, execute=True)
from matlas.verification import plot_chip_heatmap
plot_chip_heatmap("MEL", "GATA", out_prefix, execute=True)
#null 1 - shuffled deeplift scores: shuffled one-hot * hypothetical scores
#null 2 - keep the one-hot/sequence; shuffle the deeplift importance values along the positions
#null 3 - keep the importance values; randomize which base is set at each position
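# A minimal numpy sketch of the three nulls described above, assuming
# get_null_sequences shuffles along the position axis; toy_null below is
# hypothetical and the actual matlas implementation may differ.
def toy_null(one_hot, scores, shuffle_seq, shuffle_weight, rng=np.random):
    # one_hot, scores: (L, 4) arrays for a single sequence
    perm = rng.permutation(one_hot.shape[0])
    null_onehot = one_hot[perm] if shuffle_seq else one_hot.copy()
    per_base = scores.sum(axis=1)  # total importance at each position
    if shuffle_weight:
        per_base = per_base[rng.permutation(len(per_base))]
    # re-attach the per-position importance to whichever base is set there
    null_scores = null_onehot * per_base[:, None]
    return null_onehot, null_scores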
from matlas.matches import get_null_sequences
seqno = 0
null_onehot, null_score = get_null_sequences(one_hot[seqno], scores[seqno],
#total importance is same but shuffled the one-hot
shuffle_seq=True, shuffle_weight=False
)
null_onehot.shape, null_score.shape
from modisco.visualization import viz_sequence
print('real')
viz_sequence.plot_weights(scores[seqno], subticks_frequency=10, figsize=(20,1))
print('same weight, different seq')
viz_sequence.plot_weights(null_score, subticks_frequency=10, figsize=(20,1))
seqno = 0
null_onehot, null_score = get_null_sequences(one_hot[seqno], scores[seqno],
#weights are different but keep the one-hot
shuffle_seq=False, shuffle_weight=True
)
null_onehot.shape, null_score.shape
print('real')
viz_sequence.plot_weights(scores[seqno], subticks_frequency=10, figsize=(20,1))
print('same seq, different weights')
viz_sequence.plot_weights(null_score, subticks_frequency=10, figsize=(20,1))
seqno = 0
null_onehot, null_score = get_null_sequences(one_hot[seqno], scores[seqno],
#change seq, change weight
shuffle_seq=True, shuffle_weight=True
)
null_onehot.shape, null_score.shape
print('real')
viz_sequence.plot_weights(scores[seqno], subticks_frequency=10, figsize=(20,1))
print('different weight, different seq')
viz_sequence.plot_weights(null_score, subticks_frequency=10, figsize=(20,1))
from matlas.matches import get_null_sequences
seq_null_onehot, seq_null_scores = get_null_sequences(seqlet_onehot, seqlet_scores,
shuffle_seq=True, shuffle_weight=False
)
weight_null_onehot, weight_null_scores = get_null_sequences(seqlet_onehot, seqlet_scores,
shuffle_seq=False, shuffle_weight=True
)
weightedseq_null_onehot, weightedseq_null_scores = get_null_sequences(seqlet_onehot, seqlet_scores,
shuffle_seq=True, shuffle_weight=True
)
from matlas.matches import collect_scoring_measures
seqlet_scores = ob.contrib_per_pattern[pattern_name]
seqlet_onehot = ob.seqs_per_pattern[pattern_name]
method_to_seqlet_scores = collect_scoring_measures(query, seqlet_scores, seqlet_onehot)
query.keys(), np.sort(np.array(list(method_to_seqlet_scores.keys())))
method_to_shuffled_scores = collect_scoring_measures(query, shuffled_scores, shuffled_onehot)
method_to_seq_null_scores = collect_scoring_measures(query, seq_null_scores, seq_null_onehot)
method_to_weight_null_scores = collect_scoring_measures(query, weight_null_scores, weight_null_onehot)
method_to_weightedseq_null_scores = collect_scoring_measures(query, weightedseq_null_scores, weightedseq_null_onehot)
from matlas.matches import get_any_high_scoring_instances
per_base_imp = 0.0625
keys_for_scoring = ['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key != 'consensus']
seqlet_hits = get_any_high_scoring_instances(method_to_seqlet_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
def get_background_instances(fg, method_to_null_scores, keys=None):
    # given the fg coords, build the matching df of bg scores
    # apply the same scaling and PCA as the fg, then apply the log-likelihood contrast
    if keys is None:
        keys = list(method_to_null_scores.keys())
sum_scores = method_to_null_scores['sum_score']
matches = [(sum_scores[seqno, pos], seqno, pos) for seqno, pos in zip(fg['seqno'].values, fg['start_idx'].values)]
match_df = pd.DataFrame(np.array(matches))
match_df.columns = ['sum_score', 'seqno', 'start_idx']
match_df.seqno = match_df.seqno.values.astype(int)
match_df.start_idx = match_df.start_idx.values.astype(int)
for key in keys:
if(key=='sum_score'):
continue
match_df[key] = [method_to_null_scores[key][idx1, idx2] for
(idx1, idx2) in zip(fg['seqno'].values, fg['start_idx'].values)]
return match_df
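# Design note: get_background_instances scores the null tracks at the *same*
# (seqno, start_idx) coordinates as the foreground hits, giving one paired
# background row per foreground instance rather than an independently
# thresholded background set.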
shuffled_paired_hits = get_background_instances(seqlet_hits, method_to_shuffled_scores, keys=keys_for_scoring)
seq_null_paired_hits = get_background_instances(seqlet_hits, method_to_seq_null_scores, keys=keys_for_scoring)
weight_null_paired_hits = get_background_instances(seqlet_hits, method_to_weight_null_scores, keys=keys_for_scoring)
weightedseq_null_paired_hits = get_background_instances(seqlet_hits, method_to_weightedseq_null_scores, keys=keys_for_scoring)
seqlet_hits.shape, shuffled_paired_hits.shape, seq_null_paired_hits.shape, weight_null_paired_hits.shape, weightedseq_null_paired_hits.shape
scaler, pca, train_hits = transform_data(seqlet_hits[keys_for_scoring])
pca.explained_variance_ratio_
shuffled_seqlet_hits = get_any_high_scoring_instances(method_to_shuffled_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
seq_null_seqlet_hits = get_any_high_scoring_instances(method_to_seq_null_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
weight_null_seqlet_hits = get_any_high_scoring_instances(method_to_weight_null_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
weightedseq_null_seqlet_hits = get_any_high_scoring_instances(method_to_weightedseq_null_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
seqlet_hits['marker'] = 'foreground'
shuffled_seqlet_hits['marker'] = 'shuffled'
seq_null_seqlet_hits['marker'] = 'seq_null'
weight_null_seqlet_hits['marker'] = 'weight_null'
weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
test_hits = pd.concat([seqlet_hits,
shuffled_seqlet_hits,
seq_null_seqlet_hits,
weight_null_seqlet_hits,
weightedseq_null_seqlet_hits
])
#read the real truth
def load_true_positives(tp_file):
coordinates = []
with open(tp_file, "r") as fp:
for line in fp:
line = line.strip().split(",")
seqno = line[0]
pos = int(line[1])
if "-" in seqno:
seqno = seqno.split("-")
for i in range(int(seqno[0]), int(seqno[1])+1):
coordinates.append((i, pos))
else:
coordinates.append((int(seqno), pos))
return coordinates
def get_tp_indices(seqlet_hits, verbose=True):
keep = []
for cord in coordinates:
seqno, pos = cord
df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
if df.shape[0]>0:
keep.append(df.index[0])
else:
if verbose: print(cord)
return keep
tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, False)
import matplotlib.pyplot as plt
def plot_pca(df, true_ids=None):
print(df.shape)
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('PC1', fontsize = 15)
ax.set_ylabel('PC2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
if true_ids is not None:
true_df = df.loc[np.array(true_ids),:]
df = df.loc[np.array(list(set(list(df.index)).difference(set(true_ids)))),:]
ax.scatter(true_df['PC1'], true_df['PC2'], s = 20, c='r', marker='+')
ax.scatter(df['PC1'], df['PC2'], s = 1, c='b')
#ax.axvline(9)
#ax.axhline(-1)
ax.grid()
return None
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
plot_pca(pc_df, true_indices)
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=2).fit(pc_df[['PC1', 'PC2']])
labels = gmm.predict(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=labels, s=1, cmap='seismic');
probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
conditions = np.copy(labels)
conditions[np.logical_and(pc_df['PC1'].values<10, pc_df['PC2'].values<-0.5)] = 0
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=conditions, s=1, cmap='seismic');
# selected = np.array(list(set(list(np.argwhere(conditions==0).T[0])).difference(set(list(np.argwhere(labels==0).T[0])))))
# pc_df.loc[selected,:]
# seqlet_hits.loc[selected,:]
# probs[:,1]
from matlas.render_report import jupyter_nbconvert
import os
def render_test_plots(nbname="global_report"):
rendered_ipynb = "/mnt/lab_data/kundaje/msharmin/mouse_hem/code/mouse_hem/{}.ipynb".format(nbname)
jupyter_nbconvert(rendered_ipynb)
os.system("cp /mnt/lab_data/kundaje/msharmin/mouse_hem/code/mouse_hem/{}.html /srv/www/kundaje/msharmin/report/global_htmls/".format(nbname))
os.system("chmod +755 /srv/www/kundaje/msharmin/mouse_hem/report/global_htmls/{}.html".format(nbname))
return None
render_test_plots('testSpi1Scan')
_, _, test_hits = transform_data(shuffled_seqlet_hits[keys_for_scoring], scaler, pca)
shuffled_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
#plot_pca(shuffled_pc_df, true_ids=None)
probs = gmm.predict_proba(shuffled_pc_df[['PC1', 'PC2']])
plt.scatter(shuffled_pc_df['PC1'], shuffled_pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
import sys
def get_LLRs(fg_df, bg_df, keys=['PC1', 'PC2']):
llr_df = OrderedDict()
for key in keys:
fg = fg_df[key].values
bg = bg_df[key].values
_min = abs(min(np.min(fg), np.min(bg)))+0.0001
ratio = (fg+_min)/(bg+_min)
llrs = -2 * np.log(ratio)
llr_df[key] = llrs
llr_df = pd.DataFrame(llr_df)
return llr_df
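# Worked example (hypothetical values): with fg = 3.0, bg = 1.0 and a small
# offset _min, the ratio is ~3 and llrs = -2 * log(3) ~= -2.2, so stronger
# foreground signal maps to more negative values. Despite the name, this is
# a heuristic contrast on PC coordinates, not a true log-likelihood ratio.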
# project the paired shuffled hits into the same PC space before contrasting
_, _, _paired = transform_data(shuffled_paired_hits[keys_for_scoring], scaler, pca)
shuffled_paired_pc_df = pd.DataFrame(data=_paired, columns=["PC{}".format(i+1) for i in range(_paired.shape[1])])
llr_df = get_LLRs(pc_df, shuffled_paired_pc_df, keys=['PC1', 'PC2'])
#df[:5]
plot_pca(llr_df, true_ids=true_indices)
marker = np.array(['F'] * llr_df.shape[0], dtype=object)  # object dtype so 'TP' isn't truncated to 'T'
marker[np.array(true_indices)] = 'TP'
llr_df['marker'] = marker
import seaborn as sns
sns.pairplot(llr_df,
diag_kind="kde", kind='scatter', hue="marker")
from scipy import stats
def get_z_scores(df):
z_df = OrderedDict()
for key in df.columns:
if key == 'marker':
z_df[key] = df[key].values
continue
z_df[key] = stats.zscore(df[key].values)
z_df = pd.DataFrame(z_df)
return z_df
z_df = get_z_scores(llr_df)
plot_pca(z_df, true_ids=true_indices)
sns.pairplot(z_df,
diag_kind="kde", kind='scatter', hue="marker")
import scipy.stats
def get_p_values(df):
    z_scores = df['PC1'].values
    #p_values = 1 - scipy.special.ndtr(z_scores)
    #p_values = scipy.stats.norm.sf(abs(z_scores)) #one-sided
    p_values = scipy.stats.norm.sf(np.abs(z_scores)) * 2  # two-sided
    return p_values
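# Sanity check: a z-score of 1.96 gives scipy.stats.norm.sf(1.96) * 2 ~= 0.05,
# the familiar two-sided 5% level.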
def plot_data(p_df):
    # scatter -log10(p) for each marker group in p_df
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1, facecolor="1.0")
    for group, color in zip(p_df['marker'].unique(), ['slateblue', 'tomato']):
        sub = p_df[p_df['marker'] == group]
        ax.scatter(sub.index, sub['log10p'], alpha=0.8, c=color, edgecolors='none', s=30, label=group)
    plt.title('-log10(p) per instance')
    plt.legend(loc=2)
    plt.show()
    return None
p_values = get_p_values(z_df)
p_df = pd.DataFrame({'p':p_values, 'log10p':-np.log10(p_values), 'marker': z_df['marker'].values})
p_df
#from matlas.matches import view_test_case
def view_test_case(scores, one_hot, df, pwm, seqno, start=400, end=600, itemno=None):
df = df[df.seqno==seqno]
if(df.shape[0]==0):
print('Not part of the hits')
viz_sequence.plot_weights(scores[seqno, start:end,:],
highlight={"black": [(21, 29)]},
subticks_frequency=10, figsize=(20,1))
viz_sequence.plot_weights(one_hot[seqno, start:end, :],
highlight={"black": [(21, 29)]},
subticks_frequency=10, figsize=(20,1))
return None
keys = set(df.columns).difference(set(['seqno', 'start_idx', 'marker']))
debug = 0
for i, row in df.iterrows():
if itemno is not None:
if debug!=itemno:
debug += 1
continue
else:
debug += 1
printables = ["{}: {}".format(key, round(row[key], 3)) for key in keys]
seqno = int(row['seqno'])
start_idx = int(row['start_idx'])
print(i,
printables,
"coordinates:",(seqno, start_idx))
start_motif = start_idx - start
end_motif = start_motif+len(pwm)
print(start_idx, start_motif, end_motif)
if(end_motif > (end-start) or end_motif<0):
continue
viz_sequence.plot_weights(scores[seqno, start:end,:],
highlight={"black": [(start_motif, end_motif)]},
subticks_frequency=10, figsize=(20,1))
viz_sequence.plot_weights(one_hot[seqno, start:end, :],
highlight={"black": [(start_motif, end_motif)]},
subticks_frequency=10, figsize=(20,1))
return None
view_test_case(seqlet_scores, seqlet_onehot, seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=17)
view_test_case(shuffled_scores, shuffled_onehot, shuffled_seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=16)
view_test_case(seq_null_scores, seq_null_onehot, seq_null_seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=17)
view_test_case(weight_null_scores, weight_null_onehot, weight_null_seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=25)
view_test_case(weightedseq_null_scores, weightedseq_null_onehot, weightedseq_null_seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=27)
def compare_test_case(series1, series2, filter_='convolution'):
keys = set(series1.keys()).difference(set(['seqno', 'start_idx', 'marker']))
keys = ['sum_score'] + [key for key in keys if filter_ in key]
# set width of bar
barWidth = 0.25
# set height of bar
bars1 = [series1[key] for key in keys]
bars2 = [series2[key] for key in keys]
# Set position of bar on X axis
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]
# Make the plot
plt.bar(r1, bars1, color='#557f2d', width=barWidth, edgecolor='white', label='seqlet')
plt.bar(r2, bars2, color='#2d7f5e', width=barWidth, edgecolor='white', label='shuffled')
# Add xticks on the middle of the group bars
plt.xlabel('group', fontweight='bold')
plt.xticks([r + barWidth for r in range(len(bars1))], keys, rotation='vertical')
# Create legend & Show graphic
plt.legend()
plt.show()
return None
a=4; b=17
# compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'masked')
a=17; b=16
# compare_test_case(seqlet_hits.loc[a,:], shuffled_seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], shuffled_seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], shuffled_seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], shuffled_seqlet_hits.loc[b,:], 'masked')
a=17; b=17
# compare_test_case(seqlet_hits.loc[a,:], seq_null_seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], seq_null_seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], seq_null_seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], seq_null_seqlet_hits.loc[b,:], 'masked')
a=17; b=25
# compare_test_case(seqlet_hits.loc[a,:], weight_null_seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], weight_null_seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], weight_null_seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], weight_null_seqlet_hits.loc[b,:], 'masked')
a=17; b=27
# compare_test_case(seqlet_hits.loc[a,:], weightedseq_null_seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], weightedseq_null_seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], weightedseq_null_seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], weightedseq_null_seqlet_hits.loc[b,:], 'masked')
import seaborn as sns
sns.pairplot(test_hits[['sum_score',
'ppm_scan', 'pwm_scan', 'log10pwm_scan', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(test_hits[['sum_score',
'cwm_scan', 'hyp_cwm_scan', 'pssm_scan', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(test_hits[['sum_score',
'ppm_jaccard', 'pwm_jaccard', 'log10pwm_jaccard', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(test_hits[['sum_score',
'cwm_jaccard', 'hyp_cwm_jaccard', 'pssm_jaccard', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(test_hits[['sum_score',
'ppm_convolution', 'pwm_convolution', 'log10pwm_convolution', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(test_hits[['sum_score',
'cwm_convolution', 'hyp_cwm_convolution', 'pssm_convolution', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(test_hits[['sum_score',
'ppm_masked_convolution', 'pwm_masked_convolution', 'log10pwm_masked_convolution', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(test_hits[['sum_score',
'cwm_masked_convolution', 'hyp_cwm_masked_convolution', 'pssm_masked_convolution', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
#dimension reduction
len(keys_for_scoring)
keys_for_scoring = ['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm', 'log10pwm', 'pwm']]
import umap
trans = umap.UMAP(n_neighbors=5, random_state=42).fit(seqlet_hits[keys_for_scoring])
print(trans.embedding_.shape)
plt.scatter(trans.embedding_[:, 0], trans.embedding_[:, 1], s=1, cmap='Spectral')
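# umap-learn supports out-of-sample projection: once fit() has run,
# .transform() embeds new rows into the learned space, mirroring the
# scaler/PCA pattern used above for the null sets.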
test_embedding = trans.transform(seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
test_embedding = trans.transform(shuffled_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
test_embedding = trans.transform(seq_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
test_embedding = trans.transform(weight_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
test_embedding = trans.transform(weightedseq_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
type(test_embedding)
umap_seqlet_hits = pd.DataFrame(trans.transform(seqlet_hits[keys_for_scoring]))
umap_shuffled_seqlet_hits = pd.DataFrame(trans.transform(shuffled_seqlet_hits[keys_for_scoring]))
umap_seq_null_seqlet_hits = pd.DataFrame(trans.transform(seq_null_seqlet_hits[keys_for_scoring]))
umap_weight_null_seqlet_hits = pd.DataFrame(trans.transform(weight_null_seqlet_hits[keys_for_scoring]))
umap_weightedseq_null_seqlet_hits = pd.DataFrame(trans.transform(weightedseq_null_seqlet_hits[keys_for_scoring]))
umap_seqlet_hits['marker'] = 'foreground'
umap_shuffled_seqlet_hits['marker'] = 'shuffled'
umap_seq_null_seqlet_hits['marker'] = 'seq_null'
umap_weight_null_seqlet_hits['marker'] = 'weight_null'
umap_weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
umap_test_hits = pd.concat([umap_seqlet_hits,
umap_shuffled_seqlet_hits,
umap_seq_null_seqlet_hits,
umap_weight_null_seqlet_hits,
umap_weightedseq_null_seqlet_hits
])
sns.pairplot(umap_test_hits,
diag_kind="kde", kind='scatter', hue="marker")
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
len(keys_for_scoring)
keys_for_scoring = ['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
#keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm', 'log10pwm', 'pwm']]
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(seqlet_hits[keys_for_scoring])
# Apply transform to both the training set and the test set.
train_hits = scaler.transform(seqlet_hits[keys_for_scoring])
from sklearn.decomposition import PCA
# Make an instance of the Model
pca = PCA(.98)
#fitting PCA on the training set only.
pca.fit(train_hits)
#Apply the mapping (transform) to both the training set and the test set.
train_hits = pca.transform(train_hits)
#test_hits = pca.transform(test_hits)
#seqlet_hits[seqlet_hits['seqno']==684]
pca.explained_variance_ratio_
#read the real truth
def get_tp_indices(seqlet_hits):
keep = []
for cord in coordinates:
seqno, pos = cord
df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
if df.shape[0]>0:
keep.append(df.index[0])
else:
print(cord)
return keep
tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits)
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
plot_pca(pc_df)
#test_df = pc_df.loc[np.array(list(set(list(pc_df.index)).difference(set(true_indices)))),:]
#test_df = test_df.loc[(test_df['PC1']>9) & (test_df['PC2']<0),:]
#seqlet_hits.loc[test_df.index, :]
seqlet_hits[seqlet_hits['seqno']==724]
test_hits = scaler.transform(shuffled_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
shuffled_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(shuffled_pc_df, true_ids=None)
test_hits = scaler.transform(seq_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
seq_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(seq_null_pc_df)
test_hits = scaler.transform(weight_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
weight_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(weight_null_pc_df)
test_hits = scaler.transform(weightedseq_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
weightedseq_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(weightedseq_null_pc_df)
pc_df[:5]
# .copy() avoids pandas SettingWithCopyWarning when adding the marker column
pca_seqlet_hits = pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_shuffled_seqlet_hits = shuffled_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_seq_null_seqlet_hits = seq_null_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_weight_null_seqlet_hits = weight_null_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_weightedseq_null_seqlet_hits = weightedseq_null_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_seqlet_hits['marker'] = 'foreground'
pca_shuffled_seqlet_hits['marker'] = 'shuffled'
pca_seq_null_seqlet_hits['marker'] = 'seq_null'
pca_weight_null_seqlet_hits['marker'] = 'weight_null'
pca_weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
pca_test_hits = pd.concat([pca_seqlet_hits,
pca_shuffled_seqlet_hits,
pca_seq_null_seqlet_hits,
pca_weight_null_seqlet_hits,
pca_weightedseq_null_seqlet_hits
])
sns.pairplot(pca_test_hits,
diag_kind="kde", kind='scatter', hue="marker")
import seaborn as sns
sns.pairplot(pd.concat([pca_seqlet_hits, pca_shuffled_seqlet_hits]),
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(pd.concat([pca_seqlet_hits, pca_seq_null_seqlet_hits]),
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(pd.concat([pca_seqlet_hits, pca_weight_null_seqlet_hits]),
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(pd.concat([pca_seqlet_hits, pca_weightedseq_null_seqlet_hits]),
diag_kind="kde", kind='scatter', hue="marker")
ob = DenovoModisco(modisco_dir)
ob.initialize_with_importance_scores(summit_file, perf_file, deeplift_hdf)  # loads all the importance scores
#ob.find_instances_in_seqlets() #set up the clustering of seqlets for patterns
#ob.save_instances(out_prefix) #predict+collect the instances of sequences for all patterns
pattern_name = "metacluster_1/pattern_7"
pwm_query, _, _, _ = ob.load_chomped_pattern(pattern_name, rc=False, plot=True)
test_hits = ob.find_pattern_instances_in_seqlets(pattern_name, per_base_imp=0.125, test_mode=True)
import seaborn as sns
sns.pairplot(test_hits[['sum_score',
'weight_convolution', 'pfm_convolution', 'pwm_convolution']],
diag_kind="kde", kind='scatter')
sns.pairplot(test_hits[['sum_score',
'weight_scan', 'pfm_scan', 'pwm_scan']],
diag_kind="kde", kind='scatter')
sns.pairplot(test_hits[['sum_score',
'weight_jaccard', 'pfm_jaccard', 'pwm_jaccard']],
diag_kind="kde", kind='scatter')
sns.pairplot(test_hits[['sum_score',
'weight_masked_convolution', 'pfm_masked_convolution', 'pwm_masked_convolution']],
diag_kind="kde", kind='scatter')
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(test_hits, metrics = {0:["pwm_convolution", "pfm_convolution"],
1:["pwm_convolution", "weight_convolution"],
2:["pwm_convolution", "pwm_scan"],
3:["pwm_convolution", "pwm_jaccard"]})
plots_seqlet_measures(test_hits, metrics = {0:["pwm_convolution", "pfm_jaccard"],
1:["pwm_convolution", "weight_jaccard"],
2:["pwm_convolution", "pwm_masked_convolution"],
3:["pwm_convolution", "pfm_masked_convolution"]})
ob.find_pattern_instances_in_seqlets(pattern_name, per_base_imp=0.0625, plot=True)
ob.train_seqlet_hits[pattern_name][:5]#.shape
# from random import sample
# random_seqlet_hits = sample(list(ob.train_seqlet_hits[pattern_name].index), 3)
#list(ob.train_seqlet_hits[pattern_name].index)
pwm_query, pfm_query, weight_query, consensus = ob.load_chomped_pattern(pattern_name, rc=False, plot=True)
#ob.deeplift_data['peaks'][(ob.deeplift_data['peaks'].chr=='chr2')&(ob.deeplift_data['peaks'].start>=3163300)]
from modisco.visualization import viz_sequence
viz_sequence.plot_weights(ob.deeplift_data['scores'][0, 400:600,:],
#highlight={"black": [(start_motif, end_motif)]},
subticks_frequency=10, figsize=(20,1))
viz_sequence.plot_weights(ob.deeplift_data['one_hot'][0, 400:600, :],
#highlight={"black": [(start_motif, end_motif)]},
subticks_frequency=10, figsize=(20,1))
from matlas.matches import check_summary_instances
#seqlet_onehot = ob.seqs_per_pattern[pattern_name]
#filtered_seqlet_hits = ob.final_seqlet_hits[pattern_name]
check_summary_instances(ob.final_seqlet_hits[pattern_name],
range(ob.seqs_per_pattern[pattern_name].shape[0]))
from matlas.matches import view_test_case
view_test_case(ob.contrib_per_pattern[pattern_name],
ob.seqs_per_pattern[pattern_name],
ob.final_seqlet_hits[pattern_name],
pwm_query,
seqno = 714,
start=0, end=70
)
# for seqno in np.sort(np.array([5055, 1635, 2437, 4616, 1353, 2185, 6253, 1550, 2446, 2257, 2353, 2325, 2038, 1945, 5019, 2303])):
# view_test_case(seqlet_scores, seqlet_onehot, filtered_seqlet_hits2, pwm_query, seqno=seqno, start=0, end=70)
from matlas.matches import view_test_cases
view_test_cases(ob.contrib_per_pattern[pattern_name],
ob.seqs_per_pattern[pattern_name],
ob.final_seqlet_hits[pattern_name],
pwm_query,
start=0, end=70,
max_examples_to_view=10)
#only forward pattern
df, consensus = ob._predict_pattern_instances_in_sequences("metacluster_1/pattern_7")
df.shape
# both forward and backward pattern
df = ob.predict_pattern_instances_in_sequences("metacluster_1/pattern_7")
df.shape
from matlas.matches import view_test_cases
view_test_cases(ob.deeplift_data['scores'],
ob.deeplift_data['one_hot'],
df,
pwm_query,
start=400, end=600,
max_examples_to_view=10)
ob.deeplift_data['peaks'].loc[67,:]
from matlas.utils import send_to_mitra
outdir = "{0}/web_mouse_heme_data/modisco_tracks/binary_model/task_{1}-naivegw".format(project_root, task_idx)
if not os.path.exists(outdir):
os.makedirs(outdir)
out_prefix = "{0}/mm17_hits_spectral_1_7".format(outdir)
send_to_mitra(df, out_prefix, mitra=outdir, hammock=True)
data_file = "{0}/spectral.p".format(ob.data_dir)
pickle.dump(ob.clustering, open(data_file, "wb"))
pickle.load(open(data_file, "rb"))
def print_hits(seqno, df):
df = df[df.seqno==seqno]
return df
seqno = 411
#print_hits(seqno, seqlet_hits)
#print_hits(seqno, filtered_seqlet_hits)
#print_hits(seqno, filtered_seqlet_hits2)
def print_positional_metadata(seqno, position, method_to_seqlet_scores):
print("seqno: ", seqno, ", position: ", position)
for key in method_to_seqlet_scores.keys():
print(key, ": ", method_to_seqlet_scores[key][seqno, position])
return None
#print_positional_metadata(411, 50)
#deeplift_data['peaks'].loc[67,:]#chr1:8135243:8134243, #67th in deeplift_scores
#np.argwhere(indices==67) #40, #40th in modisco_imp_scores
#seqlets_per_pattern = ob.mr.seqlets()
#seqlets_ = seqlets_per_pattern['metacluster_1/pattern_7']
# for seqlet in seqlets_:
# if(seqlet.seqname==40):
# break
# print(seqlet)#Seqlet(seqname=40, start=531, end=601, name='', strand='+')
#selected_summits.loc[40,:] #chr1:8134243:8135243