In [1]:
import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
import pickle
import h5py
import glob

from matlas.genome_data import *
from matlas.pwms import shorten_tfname, get_motifDB_id_maps
from matlas.matches import fig2vdom, vdom_pssm
#from basepair.modisco.results import ModiscoResult
from matlas.modisco_results import ModiscoResult
from vdom.helpers import (h1, p, li, img, div, b, br, ul, a,
                          details, summary,
                          table, thead, th, tr, tbody, td, ol)
from IPython.display import display


project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
modisco_root = "{0}/{1}/Naive_modisco2019".format(srv_root, data_name)

homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)
basset_root = "{0}/{1}/Naive_basset_motifs".format(srv_root, data_name)
reportfile= "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"
load data from labcluster
Using TensorFlow backend.
TF-MoDISco is using the TensorFlow backend.
In [2]:
from matlas.matches import DenovoModisco

task_idx = 273
modisco_dir = "{0}/task_{1}-naivegw".format(modisco_root, task_idx)
summit_file = "{0}/{1}/fineFactorized/task_{2}-naivegw/predictions/task_{2}-{3}Summit.calibrated.{1}.tab.gz".format(project_root, data_name, task_idx, "Naive")
perf_file = "{0}/task_{1}-naivegw/NaiveauPRC.txt".format(model_root, task_idx)
deeplift_hdf = "{0}/{1}/Naive_deeplift2019/task_{2}-naivegw/summit.h5".format(srv_root, data_name, task_idx)
ob = DenovoModisco(modisco_dir)
pattern_name = "metacluster_1/pattern_3"
query = ob.load_chomped_pattern(pattern_name, plot=True)
ppmchomped ppmchomped pwmchomped log10_pwmchomped pssmchomped cwmchomped hyp_cwmconsensustgcTGAGTCAt
In [3]:
ob.initialize_with_importance_scores(summit_file, perf_file, deeplift_hdf, query.keys()) #loads all the important scores
deeplift_data = ob.deeplift_data
In [4]:
from matlas.utils import get_calibrated_peaks

calibrated_data = get_calibrated_peaks(summit_file, perf_file)
selected_summits = calibrated_data['selected_summits']
indices = calibrated_data['indices']

#from basepair.plot.profiles import extract_signal
from matlas.tracks import extract_signal
pattern_name = 'metacluster_1/pattern_3'

seqlet_scores = ob.contrib_per_pattern[pattern_name]
seqlet_onehot = ob.seqs_per_pattern[pattern_name]

seqlets_ = ob.seqlets_per_pattern[pattern_name]
shuffled_onehot = extract_signal(ob.deeplift_data['shuffled_onehot'][indices], seqlets_)
shuffled_scores = extract_signal(ob.deeplift_data['shuffled_scores'][indices], seqlets_)

shuffled_onehot.shape, shuffled_scores.shape
Out[4]:
((1077, 70, 4), (1077, 70, 4))
In [5]:
from matlas.matches import collect_scoring_measures

method_to_seqlet_scores = collect_scoring_measures(query, seqlet_scores, seqlet_onehot)
print(query.keys(), np.sort(np.array(list(method_to_seqlet_scores.keys()))))


scores = deeplift_data['scores']
one_hot = deeplift_data['one_hot']

method_to_scores = collect_scoring_measures(query, scores, one_hot)
dict_keys(['ppm', 'pwm', 'log10pwm', 'pssm', 'cwm', 'hyp_cwm', 'consensus']) ['cwm_convolution' 'cwm_jaccard' 'cwm_masked_convolution' 'cwm_scan'
 'hyp_cwm_convolution' 'hyp_cwm_jaccard' 'hyp_cwm_masked_convolution'
 'hyp_cwm_scan' 'log10pwm_convolution' 'log10pwm_jaccard'
 'log10pwm_masked_convolution' 'log10pwm_scan' 'ppm_convolution'
 'ppm_jaccard' 'ppm_masked_convolution' 'ppm_scan' 'pssm_convolution'
 'pssm_jaccard' 'pssm_masked_convolution' 'pssm_scan' 'pwm_convolution'
 'pwm_jaccard' 'pwm_masked_convolution' 'pwm_scan' 'sum_score']
In [6]:
from matlas.matches import get_any_high_scoring_instances
per_base_imp=0.0625

keys_for_scoring = ['sum_score'] 
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
        
seqlet_hits = get_any_high_scoring_instances(method_to_seqlet_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)

_hits = get_any_high_scoring_instances(method_to_scores,
                                       min_tot_imp=per_base_imp*len(query['ppm']),
                                       keys=keys_for_scoring)
In [7]:
keys_for_scoring=['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in ['pssm', 'hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_jaccard".format(key) for key in ['pssm', 'hyp_cwm']]
keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm']]
In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import Isomap

def reduce_data_dimension(_hits, scaler=None, pca=None):
    if scaler is None:
        # should be used only during training
        scaler = StandardScaler()
        scaler.fit(_hits)
        
    # Apply transform to both the training set and testing set
    _hits = scaler.transform(_hits)

    if pca is None:
        # Make an instance of the Model
        pca = PCA(.98)
        #pca = KernelPCA(n_components=2, kernel='linear', n_jobs=10)
        #pca = KernelPCA(n_components=2, kernel='linear', n_jobs=10)
        #isomap = Isomap(n_components=2, n_jobs = 10, n_neighbors = 5)
        #fitting PCA on the training set only.
        pca.fit(_hits)
        
    #Apply the mapping (transform) to both the training set and the test set.
    _hits = pca.transform(_hits)
    
    return scaler, pca, _hits

scaler, pca, train_hits = reduce_data_dimension(seqlet_hits[keys_for_scoring])
pca.explained_variance_ratio_, train_hits.shape
Out[7]:
(array([0.75931202, 0.09685279, 0.06073672, 0.02525954, 0.02161293,
        0.01377333, 0.00679918]), (20522, 7))
In [8]:
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
In [9]:
from matlas.match_utils import load_true_positives, get_tp_indices

tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm13_tp.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, coordinates, False)
len(true_indices['strong']), len(true_indices['weak']), seqlet_hits.shape#, sum(labels==1)
Out[9]:
(1113, 47, (20522, 27))
In [10]:
import matplotlib.pyplot as plt
from matlas.match_utils plot_pca

coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, coordinates, False)
plot_pca(pc_df, true_indices)
(20522, 7)
In [11]:
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
#import sklearn.mixture.GaussianMixture
gmm = GaussianMixture(n_components=2, covariance_type='full', init_params='kmeans').fit(pc_df[['PC1', 'PC2']])
labels = gmm.predict(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=labels, s=1, cmap='seismic');
probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
In [12]:
pcv1 = pca.explained_variance_ratio_[0]
pcv2 = pca.explained_variance_ratio_[2]
pc_maxs = pc_df.max()
affinity = (pc_df['PC1']*pcv1/(pcv1+pcv2)+pc_df['PC2']*pcv2/(pcv1+pcv2))/(pc_maxs['PC1']*pcv1/(pcv1+pcv2)+pc_maxs['PC2']*pcv2/(pcv1+pcv2))
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=affinity, s=1, cmap='seismic');
In [13]:
labels = np.array([0]*pc_df.shape[0])
labels[np.array(true_indices['strong'])]=2
labels[np.array(true_indices['weak'])]=1.2
labels[3415] = 0.8
pc_df['label'] = labels
pc_df['index'] = pc_df.index
pc_df['seqno'] = seqlet_hits.seqno
pc_df['pos'] = seqlet_hits.start_idx
pc_df['affinity'] = affinity
pc_df[:5]
Out[13]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 label index seqno pos affinity
0 -2.918475 -2.318132 -1.050799 1.053675 0.237982 0.077093 -0.601777 0 0 0 20 -0.145744
1 1.867978 -1.788217 -1.439439 -0.142089 -0.532090 -0.353145 -0.028351 0 1 0 21 0.080995
2 -2.168122 -0.648587 -1.683943 -0.125806 -0.119410 0.165422 -0.509741 0 2 0 22 -0.104241
3 -2.426212 1.489661 1.173652 0.016627 -0.438543 -0.098798 -0.342505 0 3 0 23 -0.108328
4 3.058589 1.940800 0.683437 -0.008237 0.003923 -1.233440 -0.670710 0 4 0 24 0.150906
In [14]:
import plotly.express as px
fig = px.scatter(pc_df, x="PC1", y="PC2", hover_data=['index', 'seqno', 'pos'], color='affinity',
                 color_continuous_scale=px.colors.sequential.Magma
                )
fig
In [15]:
#seqlets pca, seqlet gmm 
_, _, test_hits = reduce_data_dimension(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])

_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
9898 3021638
In [16]:
pc_maxs = _pc_df.max()
affinity = (_pc_df['PC1']*pcv1/(pcv1+pcv2)+_pc_df['PC2']*pcv2/(pcv1+pcv2))/(pc_maxs['PC1']*pcv1/(pcv1+pcv2)+pc_maxs['PC2']*pcv2/(pcv1+pcv2))
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=affinity, s=1, cmap='seismic');
In [75]:
pc_df[(pc_df.seqno==122) & (pc_df.pos==41)]
Out[75]:
PC1 PC2 PC3 PC4 label index seqno pos
3415 5.329909 -0.988796 0.834296 -0.06379 1 3415 122 41
In [70]:
pd.DataFrame(seqlet_hits.loc[a,:]).T
#pd.DataFrame(seqlet_hits.loc[b,:]).T
Out[70]:
sum_score seqno start_idx ppm_scan pwm_scan log10pwm_scan pssm_scan cwm_scan hyp_cwm_scan ppm_jaccard ... log10pwm_convolution pssm_convolution cwm_convolution hyp_cwm_convolution ppm_masked_convolution pwm_masked_convolution log10pwm_masked_convolution pssm_masked_convolution cwm_masked_convolution hyp_cwm_masked_convolution
19758 1.194493 1012.0 28.0 7.884865 12.168408 8.434497 12.75469 2.076318 1.618032 0.460596 ... 1.242754 1.626674 0.261575 0.208945 0.891619 0.777796 0.777796 0.848675 0.843954 0.862352

1 rows × 27 columns

In [37]:
from matlas.matches import compare_test_case
#19758, 19300
a=19758; b=19673
compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'scan')
compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'convolution')
compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'jaccard')
compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'masked')
In [99]:
np.sum(_new_labels==0), np.sum(_labels==0)
Out[99]:
(9998, 9898)
In [107]:
from sklearn.semi_supervised import LabelSpreading
from random import sample

def get_high_scoring_instances(_hits, clustering=None, test_df=None):
    if clustering is None:
        # should be used only during training
        label_prop_model = LabelSpreading()
        label_prop_model.fit(_hits, labels)
        
    # Apply transform to both the training set and testing set
    _hits = scaler.transform(_hits)

    
    return _hits, label_prop_model

#label prop after 
#label prop after spectral clustering
prop_labels = np.array(list(labels) + [-1]*(_hits.shape[0]))
prop_df = pd.concat([pc_df[['PC1', 'PC2']], _pc_df[['PC1', 'PC2']]], axis=0)
max_labeled_instances_to_cluster = 20000
max_unlabeled_instances_to_cluster = 40000
labeled_indices = sample(list(seqlet_hits.index), max_labeled_instances_to_cluster)
unlabeled_indices = sample(list(_hits.index), max_unlabeled_instances_to_cluster)
labeled_hits = pc_df[['PC1', 'PC2']].loc[labeled_indices,:]
unlabeled_hits = _pc_df[['PC1', 'PC2']].loc[unlabeled_indices,:]
array_to_cluster = np.array(pd.concat([labeled_hits, unlabeled_hits]))
labels_to_cluster = np.array(list(labels[labeled_indices])+[-1]*unlabeled_hits.shape[0])
print(array_to_cluster.shape, len(labels_to_cluster))
label_prop_model = LabelSpreading(kernel="knn", n_neighbors=7, alpha=0.2, n_jobs=10)
label_prop_model.fit(array_to_cluster, labels_to_cluster)
print(np.sum(prop_labels==0), np.sum(prop_labels==1), np.sum(prop_labels==-1))
print(np.sum(labels_to_cluster==0), np.sum(labels_to_cluster==1), np.sum(labels_to_cluster==-1))

batch_size = 40000
batch_count = int(np.ceil(1.0*_pc_df.shape[0]/batch_size))
new_labels = []
new_probs = []
array_to_predict = np.array(_pc_df[['PC1', 'PC2']])
for si in range(batch_count):
    #print(si)
    if si == batch_count-1:
        array_to_predict = np.array(_pc_df[['PC1', 'PC2']])[si*batch_size:_pc_df.shape[0], :]
    else:
        array_to_predict = np.array(_pc_df[['PC1', 'PC2']])[si*batch_size:(si+1)*batch_size, :]
    _new_labels = label_prop_model.predict(array_to_predict)
    _new_probs = label_prop_model.predict_proba(array_to_predict)
    new_labels.append(_new_labels)
    new_probs.append(_new_probs)
_new_labels = np.concatenate(new_labels)
_new_probs = np.concatenate(new_probs)
print(_new_labels.shape, _new_probs.shape)
print(np.sum(_new_labels==0), np.sum(_new_labels==1), np.sum(_new_labels==-1))
# new_labels = label_prop_model.predict(array_to_cluster)
# new_probs = label_prop_model.predict_proba(array_to_cluster)
# print(new_probs.shape, new_labels.shape)
print(np.sum(_new_labels==0), np.sum(_new_labels==1), np.sum(_new_labels==-11), np.sum(_labels==0))
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_new_probs[:,0], s=1, cmap='seismic');
(60000, 2) 60000
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/semi_supervised/label_propagation.py:297: RuntimeWarning:

divide by zero encountered in true_divide

/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/semi_supervised/label_propagation.py:297: RuntimeWarning:

invalid value encountered in true_divide

1135 19387 3031536
1096 18904 40000
(3031536,) (3031536, 2)
19239 3012297 0
19239 3012297 9898
In [108]:
np.sum(_new_labels==-1)
Out[108]:
0
In [89]:
_new_labels = np.concatenate(new_labels)
_new_probs = np.concatenate(new_probs)
_new_labels.shape, _new_probs.shape
Out[89]:
((3031536,), (3031536, 2))
In [64]:
 
1135 19387 3031536
1135 19387 20522
1201 39843 0
In [13]:
#independent pca, seqlet gmm
_scaler, _pca, _train_hits = reduce_data_dimension(_hits[keys_for_scoring])
print(_pca.explained_variance_ratio_)
_pc_df = pd.DataFrame(data = _train_hits, columns=["PC{}".format(i+1) for i in range(_train_hits.shape[1])])

_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
[0.54780746 0.16908955 0.11908488 0.0654433  0.03190427 0.01992017
 0.01441596 0.01048189 0.00770887]
3019821 11715
In [16]:
# seqlets pca, independent gmm
_, _, test_hits = reduce_data_dimension(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])

_gmm = GaussianMixture(n_components=2, covariance_type='tied').fit(_pc_df[['PC1', 'PC2']])
_labels = _gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = _gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
542969 2488567
In [20]:
#independent pca, independent gmm
_scaler, _pca, _train_hits = reduce_data_dimension(_hits[keys_for_scoring])
print(_pca.explained_variance_ratio_)
_pc_df = pd.DataFrame(data = _train_hits, columns=["PC{}".format(i+1) for i in range(_train_hits.shape[1])])
_gmm = GaussianMixture(n_components=2, covariance_type='full').fit(_pc_df[['PC1', 'PC2']])
_labels = _gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = _gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
[0.54780746 0.16908955 0.11908488 0.0654433  0.03190427 0.01992017
 0.01441596 0.01048189 0.00770887]
567844 2463692
In [68]:
_, _, test_hits = reduce_data_dimension(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
sorted_hits = _hits
sorted_hits['probs'] = _probs[:,0]
sorted_hits['labels'] = _labels
sorted_hits = sorted_hits[sorted_hits['labels']==0]
print(sorted_hits.shape)

outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw"
filename = "m13_gmm.gz"
instancefile = "{0}/{1}".format(outdir, filename)
sorted_hits.to_csv(instancefile, sep='\t')
(9939, 29)
In [69]:
def predict_pattern(df, probs=None):
    if probs is not None:
        df = df[df['probs']>=probs,:]
    
    consensus = query['consensus']
    summits = ob.deeplift_data['peaks']
    display_name = ob.shorten_pattern_name(pattern_name)
    
    peak_no = df.shape[0]
    _indices = df.seqno.values
    chroms = summits.chr.values[_indices]
    starts = summits.start.values[_indices] + df.start_idx.values
    ends = starts + len(consensus)
    names = np.array(["{0}/{1}".format(display_name, consensus)]*peak_no)
    scores = df.probs.values
    strands = np.array(['.']*peak_no)
    
    df = pd.DataFrame.from_dict({0:chroms, 1:starts, 2:ends, 3:names, 4:scores, 5:strands})
    df = df.sort_values([0, 1, 4])
    #df = df.drop_duplicates(subset=[0,1,2,3,5])
    
    return df

df = predict_pattern(sorted_hits)

from matlas.utils import send_to_mitra
outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw"
filename = "m13_gmm"
out_prefix = "{0}/{1}".format(outdir, filename)
send_to_mitra(df, out_prefix, mitra=outdir, hammock=True)
writing...(9939, 6)
In [24]:
from matlas.verification import compute_chip_matrix

compute_chip_matrix("MEL", "JUND", out_prefix, execute=True)
computeMatrix reference-point --referencePoint center --skipZeros -S /mnt/lab_data2/msharmin/mouse_chipseq_bigwigs/ENCFF302XAM.bigWig -R /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m13_gmm.bed -a 500 -b 500 -bs 10 -o /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m13_gmm.matrix.gz --outFileSortedRegions /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m13_gmm.matrix.bed 
In [26]:
from matlas.verification import plot_chip_heatmap
plot_chip_heatmap("MEL", "JUND", out_prefix, execute=True)
plotHeatmap -m /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m13_gmm.matrix.gz -out /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/MEL-JUND-heatmap.png  --colorMap Blues --xAxisLabel Distance --missingDataColor 1 --zMin 0 --heatmapHeight 28 --heatmapWidth 8 --legendLocation none
In [1]:
from matlas.matches import view_test_case
from modisco.visualization import viz_sequence
#from matlas.matches import view_test_cases

def view_test_cases(scores, one_hot, df, pwm, start=400, end=600, examples_to_view=None):
    
    seqnos = df.seqno.values
    positions = df.start_idx.values
    if examples_to_view is not None:
        seqnos = seqnos[examples_to_view]
        positions = positions[examples_to_view]
    
    for seqno, pos in zip(seqnos, positions):
        view_test_case(scores, one_hot, df, pwm, seqno, start=start, end=end, itemno=pos)
        
    return None
#tmp_df = seqlet_hits.loc[np.sort((np.array(true_indices))),:]
# view_test_cases(seqlet_scores, 
#                seqlet_onehot, 
#                seqlet_hits, 
#                query['pwm'], 
#                start=0, end=70, 
#                examples_to_view=pc_df['PC1'].values>6)

# l = np.array([0]*pc_df.shape[0])
# l[pc_df['PC1']>6] = 1
# plt.scatter(pc_df['PC1'], pc_df['PC2'], c=l, s=1, cmap='seismic');
#np.sum(pc_df['PC1'].values>6)
load data from labcluster
Using TensorFlow backend.
TF-MoDISco is using the TensorFlow backend.