In [1]:
import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
import pickle
import h5py
import glob

from matlas.genome_data import *
from matlas.pwms import shorten_tfname, get_motifDB_id_maps
from matlas.matches import fig2vdom, vdom_pssm
#from basepair.modisco.results import ModiscoResult
from matlas.modisco_results import ModiscoResult
from vdom.helpers import (h1, p, li, img, div, b, br, ul, a,
                          details, summary,
                          table, thead, th, tr, tbody, td, ol)
from IPython.display import display


# Filesystem roots for the project on the three storage systems in use.
project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
# NOTE(review): this first assignment is dead — it is immediately overwritten
# by the srv_root-based path on the next line; only the second value is used.
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
modisco_root = "{0}/{1}/Naive_modisco2019".format(srv_root, data_name)

homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)
basset_root = "{0}/{1}/Naive_basset_motifs".format(srv_root, data_name)
# NOTE(review): path contains a literal space ("filtering samples_MS2.xlsx")
# — confirm it matches the actual file name on disk.
reportfile= "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"
Load data from the lab cluster.
Using TensorFlow backend.
TF-MoDISco is using the TensorFlow backend.
In [2]:
from matlas.matches import DenovoModisco

task_idx = 273  # which task (cell type / model head) this analysis targets
# Per-task inputs: modisco output dir, calibrated summit peaks, model auPRC,
# and per-base DeepLIFT importance scores for the summits.
modisco_dir = "{0}/task_{1}-naivegw".format(modisco_root, task_idx)
summit_file = "{0}/{1}/fineFactorized/task_{2}-naivegw/predictions/task_{2}-{3}Summit.calibrated.{1}.tab.gz".format(project_root, data_name, task_idx, "Naive")
perf_file = "{0}/task_{1}-naivegw/NaiveauPRC.txt".format(model_root, task_idx)
deeplift_hdf = "{0}/{1}/Naive_deeplift2019/task_{2}-naivegw/summit.h5".format(srv_root, data_name, task_idx)
ob = DenovoModisco(modisco_dir)
# Pattern under study; load_chomped_pattern returns the trimmed motif in
# several representations (ppm/pwm/log10pwm/pssm/cwm/hyp_cwm/consensus),
# as shown by query.keys() in the later scoring cell.
pattern_name = "metacluster_1/pattern_1"
query = ob.load_chomped_pattern(pattern_name, plot=True)
ppmchomped ppmchomped pwmchomped log10_pwmchomped pssmchomped cwmchomped hyp_cwmconsensusaCTTCCTcttt
In [3]:
ob.initialize_with_importance_scores(summit_file, perf_file, deeplift_hdf, query.keys()) #loads all the important scores
deeplift_data = ob.deeplift_data
In [4]:
from matlas.utils import get_calibrated_peaks

calibrated_data = get_calibrated_peaks(summit_file, perf_file)
selected_summits = calibrated_data['selected_summits']
indices = calibrated_data['indices']

#from basepair.plot.profiles import extract_signal
from matlas.tracks import extract_signal
pattern_name = 'metacluster_1/pattern_1'

seqlet_scores = ob.contrib_per_pattern[pattern_name]
seqlet_onehot = ob.seqs_per_pattern[pattern_name]

seqlets_ = ob.seqlets_per_pattern[pattern_name]
shuffled_onehot = extract_signal(ob.deeplift_data['shuffled_onehot'][indices], seqlets_)
shuffled_scores = extract_signal(ob.deeplift_data['shuffled_scores'][indices], seqlets_)

shuffled_onehot.shape, shuffled_scores.shape
Out[4]:
((6372, 70, 4), (6372, 70, 4))
In [5]:
from matlas.matches import collect_scoring_measures

method_to_seqlet_scores = collect_scoring_measures(query, seqlet_scores, seqlet_onehot)
print(query.keys(), np.sort(np.array(list(method_to_seqlet_scores.keys()))))


scores = deeplift_data['scores']
one_hot = deeplift_data['one_hot']

method_to_scores = collect_scoring_measures(query, scores, one_hot)
dict_keys(['ppm', 'pwm', 'log10pwm', 'pssm', 'cwm', 'hyp_cwm', 'consensus']) ['cwm_convolution' 'cwm_jaccard' 'cwm_masked_convolution' 'cwm_scan'
 'hyp_cwm_convolution' 'hyp_cwm_jaccard' 'hyp_cwm_masked_convolution'
 'hyp_cwm_scan' 'log10pwm_convolution' 'log10pwm_jaccard'
 'log10pwm_masked_convolution' 'log10pwm_scan' 'ppm_convolution'
 'ppm_jaccard' 'ppm_masked_convolution' 'ppm_scan' 'pssm_convolution'
 'pssm_jaccard' 'pssm_masked_convolution' 'pssm_scan' 'pwm_convolution'
 'pwm_jaccard' 'pwm_masked_convolution' 'pwm_scan' 'sum_score']
In [6]:
from matlas.matches import get_any_high_scoring_instances
# Minimum mean importance per base; the per-instance cutoff below scales this
# by the motif length (len of the query PPM).
per_base_imp=0.0625

# Score every matrix representation of the query (ppm/pwm/.../hyp_cwm) under
# every matching method (scan/jaccard/convolution/masked_convolution), plus
# the raw importance sum. 'consensus' is a string, not a matrix, so skip it.
keys_for_scoring = ['sum_score'] 
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
        
# Candidate instances within the modisco seqlets (used as the training set)...
seqlet_hits = get_any_high_scoring_instances(method_to_seqlet_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)

# ...and within the full set of peaks (used as the test set).
_hits = get_any_high_scoring_instances(method_to_scores,
                                       min_tot_imp=per_base_imp*len(query['ppm']),
                                       keys=keys_for_scoring)
In [7]:
keys_for_scoring=['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in ['pssm', 'hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_jaccard".format(key) for key in ['pssm', 'hyp_cwm']]
#keys_for_scoring += ["{}_convolution".format(key) for key in ['hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm']]
In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def transform_data(_hits, scaler=None, pca=None):
    """Standardize the hit-score matrix and project it onto principal components.

    When ``scaler``/``pca`` are None (training), they are fitted on ``_hits``;
    otherwise the previously fitted objects are applied unchanged (testing).

    Returns (scaler, pca, transformed_hits) so the fitted objects can be
    reused on held-out data.
    """
    if scaler is None:
        # Fit only when no scaler is supplied, i.e. on the training data.
        scaler = StandardScaler().fit(_hits)
    standardized = scaler.transform(_hits)

    if pca is None:
        # Keep enough components to explain 98% of the variance; fitted on
        # the (standardized) training data only.
        pca = PCA(.98).fit(standardized)
    projected = pca.transform(standardized)

    return scaler, pca, projected

scaler, pca, train_hits = transform_data(seqlet_hits[keys_for_scoring])
pca.explained_variance_ratio_
Out[8]:
array([0.7435782 , 0.13957096, 0.06764074, 0.02793941, 0.01247329])
In [9]:
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
In [11]:
#read the real truth
def load_true_positives(tp_file):
    """Parse a CSV of manually curated true-positive motif coordinates.

    Each line is ``seqno,pos`` (strong match), ``seqno,pos,<anything>``
    (weak match), or ``lo-hi,pos`` (a strong match for every seqno in the
    inclusive range [lo, hi]).

    Returns {'strong': [(seqno, pos), ...], 'weak': [(seqno, pos), ...]}.
    """
    strong, weak = [], []
    with open(tp_file, "r") as fp:
        for raw in fp:
            fields = raw.strip().split(",")
            seq_spec = fields[0]
            pos = int(fields[1])
            if "-" in seq_spec:
                # A "lo-hi" range expands to one strong coordinate per seqno.
                parts = seq_spec.split("-")
                strong.extend((i, pos) for i in range(int(parts[0]), int(parts[1]) + 1))
            elif len(fields) > 2:
                # Any third field marks the coordinate as a weak match.
                weak.append((int(seq_spec), pos))
            else:
                strong.append((int(seq_spec), pos))

    return {'strong': strong, 'weak': weak}

def get_tp_indices(seqlet_hits, coordinates, verbose=True):
    """Map curated (seqno, pos) coordinates to index labels of ``seqlet_hits``.

    For each coordinate group ('strong'/'weak'), record the index label of the
    first row whose 'seqno' and 'start_idx' match; if no row matches, the
    coordinate is printed when ``verbose``.

    Returns an OrderedDict mirroring the keys of ``coordinates``.
    """
    keep = OrderedDict()
    for group in coordinates.keys():
        found = []
        for cord in coordinates[group]:
            seqno, pos = cord
            matches = seqlet_hits[(seqlet_hits['seqno'] == seqno) & (seqlet_hits['start_idx'] == pos)]
            if matches.shape[0] > 0:
                found.append(matches.index[0])
            elif verbose:
                print(cord)
        keep[group] = found

    return keep

tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm11_tp.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, coordinates, False)
len(true_indices['strong']), len(true_indices['weak']), seqlet_hits.shape#, sum(labels==1)
Out[11]:
(7159, 127, (114461, 27))
In [12]:
import matplotlib.pyplot as plt
def plot_pca(df, true_ids=None):
    """Scatter the first two principal components, highlighting curated hits.

    Parameters
    ----------
    df : DataFrame with at least 'PC1' and 'PC2' columns.
    true_ids : optional dict with 'strong' and 'weak' lists of df index
        labels to draw in red shades; remaining rows are drawn as background.
    """
    print(df.shape)  # quick sanity check of how many points will be plotted
    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1) 
    ax.set_xlabel('PC1', fontsize = 15)
    ax.set_ylabel('PC2', fontsize = 15)
    ax.set_title('2 component PCA', fontsize = 20)
    if true_ids is not None:
        strong_ids = true_ids['strong']
        strong_df = df.loc[np.array(strong_ids),:]
        ax.scatter(strong_df['PC1'], strong_df['PC2'], s = 1, c='tomato',
                   label='strong match')

        weak_ids = true_ids['weak']
        weak_df = df.loc[np.array(weak_ids),:]
        ax.scatter(weak_df['PC1'], weak_df['PC2'], s = 5, c='darkred',
                   label='weak match')

        # Remove the highlighted rows so the background scatter does not
        # overdraw them.
        remaining = set(df.index).difference(set(strong_ids).union(set(weak_ids)))
        df = df.loc[np.array(list(remaining)),:]
    ax.scatter(df['PC1'], df['PC2'], s = 1, c='slateblue', label='no match')
    ax.grid()
    # BUGFIX: the legend was a hard-coded 3-label list regardless of how many
    # scatters were drawn, so with true_ids=None the only scatter was
    # mislabeled 'strong match'. Labels now travel with their scatter calls.
    ax.legend(loc="upper right")
    return None
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
plot_pca(pc_df, true_indices)
(114461, 5)
In [12]:
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
#import sklearn.mixture.GaussianMixture
gmm = GaussianMixture(n_components=2, covariance_type='full', init_params='kmeans').fit(pc_df[['PC1', 'PC2']])
labels = gmm.predict(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=labels, s=1, cmap='seismic');
probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
In [13]:
# Build a 3-way label per hit for plotting: 0 = no match, 1 = weak, 2 = strong.
labels = np.array([0]*pc_df.shape[0])
# NOTE(review): the index labels from true_indices are used here as positional
# offsets — valid only because pc_df/seqlet_hits carry a default RangeIndex;
# confirm if either frame is ever re-indexed upstream.
labels[np.array(true_indices['strong'])]=2
labels[np.array(true_indices['weak'])]=1
pc_df['label'] = labels
pc_df['index'] = pc_df.index
# Carry hit coordinates over so interactive hover tooltips can identify points.
pc_df['seqno'] = seqlet_hits.seqno
pc_df['pos'] = seqlet_hits.start_idx
pc_df[:5]
Out[13]:
PC1 PC2 PC3 PC4 PC5 label index seqno pos
0 0.340055 -0.938979 0.173631 0.693774 -0.267273 0 0 0 15
1 0.269587 -0.337240 0.461613 0.842198 0.163663 0 1 0 16
2 0.687375 0.413401 0.295652 0.732613 -0.099157 0 2 0 17
3 -1.359701 1.221290 -0.049403 0.259099 -0.247482 0 3 0 18
4 -1.603538 1.506033 -0.601422 -0.279105 0.196468 0 4 0 19
In [14]:
import plotly.express as px
fig = px.scatter(pc_df, x="PC1", y="PC2", hover_data=['index', 'seqno', 'pos'], color='label')
fig
In [13]:
#seqlets pca, seqlet gmm 
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])

_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
51818 2979718
In [17]:
#independent pca, seqlet gmm
_scaler, _pca, _train_hits = transform_data(_hits[keys_for_scoring])
print(_pca.explained_variance_ratio_)
_pc_df = pd.DataFrame(data = _train_hits, columns=["PC{}".format(i+1) for i in range(_train_hits.shape[1])])

_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
[0.61358277 0.13522865 0.09585911 0.04568591 0.03387325 0.02366706
 0.0166104  0.01167596 0.0062378 ]
2967950 63586
In [ ]:
# seqlets pca, independent gmm
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])

_gmm = GaussianMixture(n_components=2, covariance_type='full', init_params='kmeans').fit(_pc_df[['PC1', 'PC2']])
_labels = _gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = _gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
In [19]:
#independent pca, independent gmm
_scaler, _pca, _train_hits = transform_data(_hits[keys_for_scoring])
print(_pca.explained_variance_ratio_)
_pc_df = pd.DataFrame(data = _train_hits, columns=["PC{}".format(i+1) for i in range(_train_hits.shape[1])])
_gmm = GaussianMixture(n_components=2).fit(_pc_df[['PC1', 'PC2']])
_labels = _gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = _gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
[0.61358277 0.13522865 0.09585911 0.04568591 0.03387325 0.02366706
 0.0166104  0.01167596 0.0062378 ]
2546222 485314
In [14]:
# Independent PCA on the genome-wide hits, classified with the seqlet-trained GMM,
# then export the positive-cluster hits.
_scaler, _pca, _train_hits = transform_data(_hits[keys_for_scoring])
_pc_df = pd.DataFrame(data = _train_hits, columns=["PC{}".format(i+1) for i in range(_train_hits.shape[1])])

_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
# BUGFIX: ``sorted_hits = _hits`` aliased the frame, so adding the
# probs/labels columns silently mutated ``_hits`` for every later cell.
# Work on an explicit copy instead.
sorted_hits = _hits.copy()
sorted_hits['probs'] = _probs[:,1]
sorted_hits['labels'] = _labels
sorted_hits = sorted_hits[sorted_hits['labels']==1]
print(sorted_hits.shape)

outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw"
filename = "m11_gmm_noisypca.gz"
instancefile = "{0}/{1}".format(outdir, filename)
sorted_hits.to_csv(instancefile, sep='\t')
(66440, 29)
In [16]:
# Project genome-wide hits with the seqlet-trained scaler/PCA, classify with the
# seqlet-trained GMM, then export the positive-cluster hits.
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
# BUGFIX: ``sorted_hits = _hits`` aliased the frame, so adding the
# probs/labels columns silently mutated ``_hits`` in place (and stacked on top
# of the columns added by the previous export cell). Use an explicit copy.
sorted_hits = _hits.copy()
sorted_hits['probs'] = _probs[:,1]
sorted_hits['labels'] = _labels
sorted_hits = sorted_hits[sorted_hits['labels']==1]
print(sorted_hits.shape)

outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw"
filename = "m11_gmm.gz"
instancefile = "{0}/{1}".format(outdir, filename)
sorted_hits.to_csv(instancefile, sep='\t')
(51818, 29)
In [17]:
#df = (probs[:,1]>0.2)[:5]

def predict_pattern(df, probs=None):
    """Convert hit rows into a BED-like 6-column DataFrame
    (chrom, start, end, name, score, strand).

    Relies on the notebook globals ``query`` (for the consensus string),
    ``ob`` (for peak coordinates) and ``pattern_name``.

    Parameters
    ----------
    df : DataFrame of hits with 'seqno', 'start_idx' and 'probs' columns.
    probs : optional float; keep only rows with probability >= this value.
    """
    if probs is not None:
        # BUGFIX: was ``df[df['probs']>=probs,:]`` which passes a tuple key
        # and raises a TypeError; boolean row selection needs .loc (or plain
        # []). Latent until now because callers pass probs=None.
        df = df.loc[df['probs'] >= probs]

    consensus = query['consensus']
    summits = ob.deeplift_data['peaks']
    display_name = ob.shorten_pattern_name(pattern_name)

    peak_no = df.shape[0]
    _indices = df.seqno.values
    # Genomic coordinates = peak start + offset of the hit within the peak.
    chroms = summits.chr.values[_indices]
    starts = summits.start.values[_indices] + df.start_idx.values
    ends = starts + len(consensus)
    names = np.array(["{0}/{1}".format(display_name, consensus)]*peak_no)
    scores = df.probs.values
    strands = np.array(['.']*peak_no)

    df = pd.DataFrame.from_dict({0:chroms, 1:starts, 2:ends, 3:names, 4:scores, 5:strands})
    df = df.sort_values([0, 1, 4])
    #df = df.drop_duplicates(subset=[0,1,2,3,5])

    return df


df = predict_pattern(sorted_hits)
from matlas.utils import send_to_mitra
outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw"
filename = "m11_gmm"
out_prefix = "{0}/{1}".format(outdir, filename)
send_to_mitra(df, out_prefix, mitra=outdir, hammock=True)
writing...(51818, 6)
In [31]:
instancefile = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m11_gmm.gz"

sorted_hits = pd.read_csv(instancefile, index_col=0, sep="\t")
In [32]:
sorted_hits.shape
Out[32]:
(48859, 29)
In [22]:
import seaborn as sns
probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
prob_df = pd.DataFrame(probs)
sns.distplot(prob_df[1], hist=True, kde=True, 
             bins=int(180/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f29f04c4860>
In [35]:
from matlas.verification import compute_chip_matrix

compute_chip_matrix("MEL", "ETS", out_prefix, execute=False)
computeMatrix reference-point --referencePoint center --skipZeros -S /mnt/lab_data2/msharmin/mouse_chipseq_bigwigs/ENCFF439KTC.bigWig -R /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m11_gmm.bed -a 500 -b 500 -bs 10 -o /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m11_gmm.matrix.gz --outFileSortedRegions /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m11_gmm.matrix.bed 
In [62]:
from matlas.verification import plot_chip_heatmap
plot_chip_heatmap("MEL", "ETS", out_prefix, execute=False)
plotHeatmap -m /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/m11_gmm.matrix.gz -out /mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw/MEL-ETS-heatmap.png  --colorMap Blues --xAxisLabel Distance --missingDataColor 1 --zMin 0 --heatmapHeight 28 --heatmapWidth 8 --legendLocation none
In [5]:
# from matlas.matches import get_null_sequences
# seq_null_onehot, seq_null_scores = get_null_sequences(one_hot, scores,
#                                                 shuffle_seq=True, shuffle_weight=False
#                                                      )

# weight_null_onehot, weight_null_scores = get_null_sequences(one_hot, scores,
#                                                 shuffle_seq=False, shuffle_weight=True
#                                                      )

# weightedseq_null_onehot, weightedseq_null_scores = get_null_sequences(one_hot, scores,
#                                                 shuffle_seq=True, shuffle_weight=True
#                                                      )
done multiprocessing
(6372, 70, 4)
(6372, 70, 4)
done multiprocessing
(6372, 70, 4)
(6372, 70, 4)
done multiprocessing
(6372, 70, 4)
(6372, 70, 4)
In [7]:
method_to_shuffled_scores = collect_scoring_measures(query, shuffled_scores, shuffled_onehot)
method_to_seq_null_scores = collect_scoring_measures(query, seq_null_scores, seq_null_onehot)
method_to_weight_null_scores = collect_scoring_measures(query, weight_null_scores, weight_null_onehot)
method_to_weightedseq_null_scores = collect_scoring_measures(query, weightedseq_null_scores, weightedseq_null_onehot)
In [9]:
shuffled_seqlet_hits = get_any_high_scoring_instances(method_to_shuffled_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
seq_null_seqlet_hits = get_any_high_scoring_instances(method_to_seq_null_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
weight_null_seqlet_hits = get_any_high_scoring_instances(method_to_weight_null_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
weightedseq_null_seqlet_hits = get_any_high_scoring_instances(method_to_weightedseq_null_scores,
                                                min_tot_imp=per_base_imp*len(query['ppm']),
                                                keys=keys_for_scoring)
In [10]:
# Tag each hit table with its provenance before pooling, so the combined frame
# can be split/colored by origin downstream.
seqlet_hits['marker'] = 'foreground'
# NOTE(review): 'suffled' looks like a typo for 'shuffled'; kept as-is because
# it is a runtime label value — fix here and in the PCA-marker cell together.
shuffled_seqlet_hits['marker'] = 'suffled'
seq_null_seqlet_hits['marker'] = 'seq_null'
weight_null_seqlet_hits['marker'] = 'weight_null'
weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
test_hits = pd.concat([seqlet_hits,
                       shuffled_seqlet_hits,
                       seq_null_seqlet_hits,
                       weight_null_seqlet_hits,
                       weightedseq_null_seqlet_hits
                      ])
In [12]:
#read the real truth
def load_true_positives(tp_file):
    """Parse a CSV of true-positive motif coordinates into a flat list.

    Each line is ``seqno,pos`` or ``lo-hi,pos``; the latter expands to one
    (seqno, pos) tuple per seqno in the inclusive range [lo, hi].

    Note: this redefines the dict-returning version above for the
    "*_spectral" coordinate file, which has no weak-match column.
    """
    coords = []
    with open(tp_file, "r") as fp:
        for raw in fp:
            fields = raw.strip().split(",")
            seq_spec = fields[0]
            pos = int(fields[1])
            if "-" in seq_spec:
                parts = seq_spec.split("-")
                coords.extend((i, pos) for i in range(int(parts[0]), int(parts[1]) + 1))
            else:
                coords.append((int(seq_spec), pos))
    return coords

def get_tp_indices(seqlet_hits, verbose=True, coords=None):
    """Map (seqno, pos) coordinates to index labels of ``seqlet_hits``.

    Parameters
    ----------
    seqlet_hits : DataFrame with 'seqno' and 'start_idx' columns.
    verbose : if True, print coordinates that match no row.
    coords : optional list of (seqno, pos) tuples. Defaults to the
        notebook-global ``coordinates`` for backward compatibility with the
        existing call site, but passing it explicitly is preferred — the
        hidden global dependency was easy to miss.

    Returns
    -------
    list of the first matching index label per coordinate.
    """
    if coords is None:
        coords = coordinates  # fall back to the module-level global
    keep = []
    for cord in coords:
        seqno, pos = cord
        df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
        if df.shape[0]>0:
            keep.append(df.index[0])
        else:
            if verbose: print(cord)

    return keep

tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm11_tp_spectral.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, False)
In [13]:
import matplotlib.pyplot as plt
def plot_pca(df, true_ids=None):
    """Scatter PC1 vs PC2; rows listed in ``true_ids`` (index labels) are
    drawn as red '+' markers on top of the blue background points."""
    print(df.shape)
    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1) 
    ax.set_xlabel('PC1', fontsize = 15)
    ax.set_ylabel('PC2', fontsize = 15)
    ax.set_title('2 component PCA', fontsize = 20)
    if true_ids is not None:
        true_df = df.loc[np.array(true_ids),:]
        # Exclude highlighted rows from the background so they are not overdrawn.
        background = set(df.index).difference(set(true_ids))
        df = df.loc[np.array(list(background)),:]
        ax.scatter(true_df['PC1'], true_df['PC2'], s = 1, c='r', marker='+')
    ax.scatter(df['PC1'], df['PC2'], s = 1, c='b')
    ax.grid()
    return None
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
plot_pca(pc_df, true_indices)
(114461, 8)
In [14]:
len(true_indices)
Out[14]:
7312
In [19]:
from matlas.matches import view_test_case
from modisco.visualization import viz_sequence
#from matlas.matches import view_test_cases

def view_test_cases(scores, one_hot, df, pwm, start=400, end=600, examples_to_view=None):
    """Render each (seqno, start position) hit in ``df`` via ``view_test_case``.

    ``examples_to_view`` may be a boolean or integer mask over the rows of
    ``df`` to restrict which hits are displayed; ``start``/``end`` bound the
    sequence window passed through to the viewer.
    """
    seq_ids = df.seqno.values
    start_positions = df.start_idx.values
    if examples_to_view is not None:
        seq_ids = seq_ids[examples_to_view]
        start_positions = start_positions[examples_to_view]

    for seq_id, position in zip(seq_ids, start_positions):
        view_test_case(scores, one_hot, df, pwm, seq_id, start=start, end=end, itemno=position)

    return None

tmp_df = seqlet_hits.loc[np.sort((np.array(list(set(true_indices['strong']).union(true_indices['weak']))))),:]
# view_test_cases(seqlet_scores, 
#                seqlet_onehot, 
#                tmp_df, 
#                query['pwm'], 
#                start=0, end=70, 
#                examples_to_view=tmp_df.start_idx.values!=37)
In [21]:
# view_test_cases(seqlet_scores, 
#                seqlet_onehot, 
#                tmp_df, 
#                query['pwm'], 
#                start=0, end=70, 
#                examples_to_view=tmp_df.seqno.values<=1000 
#                )
In [1]:
# view_test_cases(seqlet_scores, 
#                seqlet_onehot, 
#                tmp_df, 
#                query['pwm'], 
#                start=0, end=70, 
#                examples_to_view=np.logical_and(tmp_df.seqno.values>1000, tmp_df.seqno.values<=2000) 
#                )
In [17]:
# view_test_cases(seqlet_scores, 
#                seqlet_onehot, 
#                tmp_df, 
#                query['pwm'], 
#                start=0, end=70, 
#                examples_to_view=np.logical_and(tmp_df.seqno.values>2000, tmp_df.seqno.values<=3000) 
#                )
In [19]:
# view_test_cases(seqlet_scores, 
#                seqlet_onehot, 
#                tmp_df, 
#                query['pwm'], 
#                start=0, end=70, 
#                examples_to_view=np.logical_and(tmp_df.seqno.values>3000, tmp_df.seqno.values<=4000) 
#                )
In [22]:
# view_test_cases(seqlet_scores, 
#                seqlet_onehot, 
#                tmp_df, 
#                query['pwm'], 
#                start=0, end=70, 
#                examples_to_view=np.logical_and(tmp_df.seqno.values>4000, tmp_df.seqno.values<=5000) 
#                )
In [24]:
# view_test_cases(seqlet_scores, 
#                seqlet_onehot, 
#                tmp_df, 
#                query['pwm'], 
#                start=0, end=70, 
#                examples_to_view=np.logical_and(tmp_df.seqno.values>5000, tmp_df.seqno.values<=6000) 
#                )
In [15]:
# view_test_cases(seqlet_scores, 
#                seqlet_onehot, 
#                tmp_df, 
#                query['pwm'], 
#                start=0, end=70, 
#                examples_to_view=tmp_df.seqno.values>6000 
#                )
In [ ]:
 
In [26]:
probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
In [27]:
_, _, test_hits = transform_data(shuffled_seqlet_hits[keys_for_scoring], scaler, pca)
shuffled_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
#plot_pca(shuffled_pc_df, true_ids=None)

probs = gmm.predict_proba(shuffled_pc_df[['PC1', 'PC2']])
plt.scatter(shuffled_pc_df['PC1'], shuffled_pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
In [28]:
_, _, test_hits = transform_data(seq_null_seqlet_hits[keys_for_scoring], scaler, pca)
seq_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
#plot_pca(shuffled_pc_df, true_ids=None)

probs = gmm.predict_proba(seq_null_pc_df[['PC1', 'PC2']])
plt.scatter(seq_null_pc_df['PC1'], seq_null_pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
In [29]:
_, _, test_hits = transform_data(weight_null_seqlet_hits[keys_for_scoring], scaler, pca)
weight_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
#plot_pca(weight_null_pc_df, true_ids=None)

probs = gmm.predict_proba(weight_null_pc_df[['PC1', 'PC2']])
plt.scatter(weight_null_pc_df['PC1'], weight_null_pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
In [30]:
_, _, test_hits = transform_data(weightedseq_null_seqlet_hits[keys_for_scoring], scaler, pca)
weightedseq_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
#plot_pca(weightedseq_null_pc_df, true_ids=None)

probs = gmm.predict_proba(weightedseq_null_pc_df[['PC1', 'PC2']])
plt.scatter(weightedseq_null_pc_df['PC1'], weightedseq_null_pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
In [62]:
# Keep only the first three PCs of each hit group and tag provenance before
# pooling for the pairplots below.
# BUGFIX: the column slices returned by pc_df[[...]] may be views; assigning a
# 'marker' column to a view raises SettingWithCopyWarning and the assignment
# may not stick. Take explicit copies instead.
pca_seqlet_hits = pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_shuffled_seqlet_hits = shuffled_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_seq_null_seqlet_hits = seq_null_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_weight_null_seqlet_hits = weight_null_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_weightedseq_null_seqlet_hits = weightedseq_null_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_seqlet_hits['marker'] = 'foreground'
# NOTE(review): 'suffled' looks like a typo for 'shuffled' but is kept to stay
# consistent with the marker value used in the earlier concat cell.
pca_shuffled_seqlet_hits['marker'] = 'suffled'
pca_seq_null_seqlet_hits['marker'] = 'seq_null'
pca_weight_null_seqlet_hits['marker'] = 'weight_null'
pca_weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
pca_test_hits = pd.concat([pca_seqlet_hits,
                       pca_shuffled_seqlet_hits,
                       pca_seq_null_seqlet_hits,
                       pca_weight_null_seqlet_hits,
                       pca_weightedseq_null_seqlet_hits
                      ])
In [21]:
# import seaborn as sns
# sns.pairplot(pca_test_hits, 
#              diag_kind="kde", kind='scatter', hue="marker")
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[21]:
<seaborn.axisgrid.PairGrid at 0x7f0669a8beb8>
In [22]:
# sns.pairplot(pd.concat([pca_seqlet_hits, pca_shuffled_seqlet_hits]), 
#              diag_kind="kde", kind='scatter', hue="marker")
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[22]:
<seaborn.axisgrid.PairGrid at 0x7f0683430780>
In [23]:
# sns.pairplot(pd.concat([pca_seqlet_hits, pca_seq_null_seqlet_hits]), 
#              diag_kind="kde", kind='scatter', hue="marker")
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[23]:
<seaborn.axisgrid.PairGrid at 0x7f013adc8278>
In [24]:
# sns.pairplot(pd.concat([pca_seqlet_hits, pca_weight_null_seqlet_hits]), 
#              diag_kind="kde", kind='scatter', hue="marker")
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[24]:
<seaborn.axisgrid.PairGrid at 0x7f06833e0978>
In [25]:
# sns.pairplot(pd.concat([pca_seqlet_hits, pca_weightedseq_null_seqlet_hits]), 
#              diag_kind="kde", kind='scatter', hue="marker")
/users/msharmin/anaconda2/envs/basepair/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[25]:
<seaborn.axisgrid.PairGrid at 0x7f0680a80cf8>
In [49]:
from sklearn.cluster import SpectralClustering
from sklearn.svm import LinearSVC, SVC
from sklearn.cluster import AffinityPropagation
def get_spectral_high_scoring_instances(train_df, keys_to_cluster, test_df=None, clustering=None, cno=None):
    """Select the "high scoring" subset of seqlet hits via 2-way spectral clustering.

    Operates in one of two modes:

    * Fit mode (``clustering is None``): spectral-cluster ``train_df`` on the
      ``keys_to_cluster`` columns into two groups, and keep the group whose
      mean of the FIRST key in ``keys_to_cluster`` is larger.
    * Predict mode (``clustering`` given): train an SVC on the stored cluster
      labels, predict a label for every row of ``test_df``, and keep the rows
      predicted as cluster ``cno``.

    Parameters
    ----------
    train_df : pd.DataFrame
        Must contain all columns in ``keys_to_cluster``.
    keys_to_cluster : list of str
        Feature columns; the first key determines which cluster is "high scoring".
    test_df : pd.DataFrame, optional
        Rows to label in predict mode.
    clustering : fitted sklearn SpectralClustering, optional
        State returned by a previous fit-mode call.
    cno : int, optional
        Label (0 or 1) of the high-scoring cluster from the fit-mode call.

    Returns
    -------
    (df, (clustering, cno))
        The selected rows, plus the reusable clustering state so later calls
        can run in predict mode.
    """
    train = train_df[keys_to_cluster]

    if clustering is None:
        clustering = SpectralClustering(
            n_clusters=2, assign_labels="discretize", random_state=0
        ).fit(train.values)
        # The high-scoring cluster is the one with the larger mean of the
        # first clustering key.
        mean0 = np.mean(train_df[keys_to_cluster[0]][clustering.labels_ == 0])
        mean1 = np.mean(train_df[keys_to_cluster[0]][clustering.labels_ == 1])
        cno = 0 if mean0 > mean1 else 1
        df = train_df[clustering.labels_ == cno]
    else:
        # NOTE(review): assumes clustering.labels_ aligns row-for-row with
        # train_df (i.e. the same frame used in the fit-mode call) — confirm
        # at call sites.
        test = test_df[keys_to_cluster]
        #clf = LinearSVC(dual=False).fit(train.values, clustering.labels_)
        clf = SVC().fit(train.values, clustering.labels_)
        # SVC.predict already returns an ndarray covering every row of `test`;
        # the former manual batching scaffolding (batch_size/batch_count) was
        # dead code and has been removed.
        predicted_labels = clf.predict(test.values)
        assert len(predicted_labels) == test_df.shape[0], "not all the values have been predicted"
        df = test_df.loc[predicted_labels == cno, :]

    #df = df[df['pwm_jaccard']>0]

    return df, (clustering, cno)
In [50]:
# import sys
# from matlas.matches import clustering_method_to_functions#, get_spectral_high_scoring_instances
# from random import sample

# max_instances_to_cluster = min(seqlet_hits.shape[0], 20000) #similar size as the gata motif
# pca_seqlet_hits = pc_df[['PC1', 'PC2']]
# train_hits_forcluster = pca_seqlet_hits.loc[sample(list(seqlet_hits.index), max_instances_to_cluster),:]
# #train_hits_forcluster = pc_df[['PC1', 'PC2']]

# clustering_method = 'spectral'
# thismodule = sys.modules[__name__]
# method_to_call = getattr(thismodule, clustering_method_to_functions[clustering_method])

# #use a subset of the hits to build the cluster
# clustered_seqlet_hits, (clustering, cno) = method_to_call(train_hits_forcluster,
#                                                           keys_to_cluster=['PC1', 'PC2']
#                                                          )
In [86]:
# #use the cluster to predict on the full set of seqlets
# clustered_seqlet_hits2, (_, _) = method_to_call(train_hits_forcluster, #train_df
#                                         ['PC1', 'PC2'], #keys_to_cluster
#                                         pca_seqlet_hits, #test_df
#                                         clustering, #clustering
#                                         cno)
In [87]:
# clustered_seqlet_hits.shape, clustered_seqlet_hits2.shape, clustered_seqlet_hits3.shape
Out[87]:
((1275, 2), (7307, 4), (605, 4))
In [89]:
# clustered_seqlet_hits3, (_, _) = method_to_call(train_hits_forcluster, #train_df
#                                         ['PC1', 'PC2'], #keys_to_cluster
#                                         pca_shuffled_seqlet_hits, #test_df
#                                         clustering, #clustering
#                                         cno)

# tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm11_tp_random.txt".format(project_root, task_idx)
# write_tp(shuffled_seqlet_hits.loc[clustered_seqlet_hits3.index,:], tp_file)
# coordinates = load_true_positives(tp_file)
# true_indices = get_tp_indices(shuffled_seqlet_hits, verbose=False)
# plot_pca(shuffled_pc_df, true_indices)
(48842, 8)
In [90]:
# clustered_seqlet_hits3, (_, _) = method_to_call(train_hits_forcluster, #train_df
#                                         ['PC1', 'PC2'], #keys_to_cluster
#                                         pca_seq_null_seqlet_hits, #test_df
#                                         clustering, #clustering
#                                         cno)

# tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm11_tp_random.txt".format(project_root, task_idx)
# write_tp(seqlet_hits.loc[clustered_seqlet_hits3.index,:], tp_file)
# coordinates = load_true_positives(tp_file)
# true_indices = get_tp_indices(seq_null_seqlet_hits, verbose=False)
# plot_pca(seq_null_pc_df, true_indices)
(114461, 8)
In [91]:
# clustered_seqlet_hits3, (_, _) = method_to_call(train_hits_forcluster, #train_df
#                                         ['PC1', 'PC2'], #keys_to_cluster
#                                         pca_weight_null_seqlet_hits, #test_df
#                                         clustering, #clustering
#                                         cno)

# tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm11_tp_random.txt".format(project_root, task_idx)
# write_tp(weight_null_seqlet_hits.loc[clustered_seqlet_hits3.index,:], tp_file)
# coordinates = load_true_positives(tp_file)
# true_indices = get_tp_indices(weight_null_seqlet_hits, verbose=False)
# plot_pca(weight_null_pc_df, true_indices)
(124266, 8)
In [83]:
# clustered_seqlet_hits3, (_, _) = method_to_call(train_hits_forcluster, #train_df
#                                         ['PC1', 'PC2'], #keys_to_cluster
#                                         pca_weightedseq_null_seqlet_hits, #test_df
#                                         clustering, #clustering
#                                         cno)

# tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm11_tp_random.txt".format(project_root, task_idx)
# write_tp(weightedseq_null_seqlet_hits.loc[clustered_seqlet_hits3.index,:], tp_file)
# coordinates = load_true_positives(tp_file)
# true_indices = get_tp_indices(weightedseq_null_seqlet_hits, verbose=False)
# plot_pca(weightedseq_null_pc_df, true_indices)
(123596, 8)
In [ ]: