import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
import h5py
import glob
from matlas.genome_data import *
from matlas.pwms import shorten_tfname, get_motifDB_id_maps
from matlas.matches import fig2vdom, vdom_pssm
from matlas.modisco_results import ModiscoResult
from vdom.helpers import (h1, p, li, img, div, b, br, ul, a,
details, summary,
table, thead, th, tr, tbody, td, ol)
from IPython.display import display
project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
modisco_root = "{0}/{1}/Naive_modisco2019".format(srv_root, data_name)
homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)
basset_root = "{0}/{1}/Naive_basset_motifs".format(srv_root, data_name)
reportfile= "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"
from matlas.matches import DenovoModisco
task_idx = 273
modisco_dir = "{0}/task_{1}-naivegw".format(modisco_root, task_idx)
summit_file = "{0}/{1}/fineFactorized/task_{2}-naivegw/predictions/task_{2}-{3}Summit.calibrated.{1}.tab.gz".format(project_root, data_name, task_idx, "Naive")
perf_file = "{0}/task_{1}-naivegw/NaiveauPRC.txt".format(model_root, task_idx)
deeplift_hdf = "{0}/{1}/Naive_deeplift2019/task_{2}-naivegw/summit.h5".format(srv_root, data_name, task_idx)
ob = DenovoModisco(modisco_dir)
pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, plot=True)
ob.initialize_with_importance_scores(summit_file, perf_file, deeplift_hdf, query.keys())  # loads all the importance scores
deeplift_data = ob.deeplift_data
from matlas.utils import get_calibrated_peaks
calibrated_data = get_calibrated_peaks(summit_file, perf_file)
selected_summits = calibrated_data['selected_summits']
indices = calibrated_data['indices']
#from basepair.plot.profiles import extract_signal
from matlas.tracks import extract_signal
pattern_name = 'metacluster_1/pattern_7'
seqlet_scores = ob.contrib_per_pattern[pattern_name]
seqlet_onehot = ob.seqs_per_pattern[pattern_name]
seqlets_ = ob.seqlets_per_pattern[pattern_name]
shuffled_onehot = extract_signal(ob.deeplift_data['shuffled_onehot'][indices], seqlets_)
shuffled_scores = extract_signal(ob.deeplift_data['shuffled_scores'][indices], seqlets_)
shuffled_onehot.shape, shuffled_scores.shape
from matlas.matches import collect_scoring_measures
method_to_seqlet_scores = collect_scoring_measures(query, seqlet_scores, seqlet_onehot)
print(query.keys(), np.sort(np.array(list(method_to_seqlet_scores.keys()))))
scores = deeplift_data['scores']
one_hot = deeplift_data['one_hot']
method_to_scores = collect_scoring_measures(query, scores, one_hot)
from matlas.matches import get_any_high_scoring_instances
per_base_imp = 0.0625
keys_for_scoring = ['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key != 'consensus']
seqlet_hits = get_any_high_scoring_instances(method_to_seqlet_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
_hits = get_any_high_scoring_instances(method_to_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
keys_for_scoring = ['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in ['pssm', 'hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_jaccard".format(key) for key in ['pssm', 'hyp_cwm']]
#keys_for_scoring += ["{}_convolution".format(key) for key in ['hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm']]
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
def transform_data(_hits, scaler=None, pca=None):
if scaler is None:
# should be used only during training
scaler = StandardScaler()
scaler.fit(_hits)
# Apply transform to both the training set and testing set
_hits = scaler.transform(_hits)
if pca is None:
# Make an instance of the Model
pca = PCA(.98)
#fitting PCA on the training set only.
pca.fit(_hits)
#Apply the mapping (transform) to both the training set and the test set.
_hits = pca.transform(_hits)
return scaler, pca, _hits
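# Design note: the scaler and PCA are fit only when passed as None (the
# training call on the foreground seqlet hits) and the fitted objects are
# reused for every null/test set, so foreground and background land in the
# same PC coordinate system. PCA(.98) keeps as many components as needed to
# explain 98% of the variance.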
scaler, pca, train_hits = transform_data(seqlet_hits[keys_for_scoring])
pca.explained_variance_ratio_
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
def load_true_positives(tp_file):
coordinates = []
weak_coordinates = []
with open(tp_file, "r") as fp:
for line in fp:
line = line.strip().split(",")
seqno = line[0]
pos = int(line[1])
if "-" in seqno:
seqno = seqno.split("-")
for i in range(int(seqno[0]), int(seqno[1])+1):
coordinates.append((i, pos))
else:
if len(line)>2:
#if 'weak' in line[2]:
weak_coordinates.append((int(seqno), pos))
#else:
#coordinates.append((int(seqno), pos))
else:
coordinates.append((int(seqno), pos))
return {'strong':coordinates, 'weak':weak_coordinates}
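# A usage sketch for load_true_positives, assuming the hand-curated format
# implied by the parser above: one "seqno,pos[,flag]" entry per line, where
# seqno may be a "start-end" range. The example file contents below are
# hypothetical:
#
#   12,50         -> strong match at seqlet 12, offset 50
#   30-32,41      -> strong matches at seqlets 30, 31 and 32, offset 41
#   77,18,weak    -> weak match at seqlet 77, offset 18
#
# load_true_positives on that file would return
# {'strong': [(12, 50), (30, 41), (31, 41), (32, 41)], 'weak': [(77, 18)]}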
def get_tp_indices(seqlet_hits, coordinates, verbose=True):
keep = OrderedDict()
for key in coordinates.keys():
keep[key] = []
for cord in coordinates[key]:
seqno, pos = cord
df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
if df.shape[0]>0:
keep[key].append(df.index[0])
else:
if verbose: print(cord)
return keep
tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp_new.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, coordinates, False)
len(true_indices['strong']), len(true_indices['weak']), seqlet_hits.shape#, sum(labels==1)
import matplotlib.pyplot as plt
def plot_pca(df, true_ids=None):
print(df.shape)
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('PC1', fontsize = 15)
ax.set_ylabel('PC2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
if true_ids is not None:
strong_ids = true_ids['strong']
strong_df = df.loc[np.array(strong_ids),:]
ax.scatter(strong_df['PC1'], strong_df['PC2'], s = 1, c='tomato')
weak_ids = true_ids['weak']
weak_df = df.loc[np.array(weak_ids),:]
ax.scatter(weak_df['PC1'], weak_df['PC2'], s = 5, c='darkred')
df = df.loc[np.array(list(set(list(df.index)).difference(
set(strong_ids).union(set(weak_ids))
))),:]
ax.scatter(df['PC1'], df['PC2'], s = 1, c='slateblue')
#ax.axvline(9)
#ax.axhline(-1)
ax.grid()
ax.legend(['strong match', 'weak match', 'no match'], loc="upper right")
return None
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, coordinates, False)
plot_pca(pc_df, true_indices)
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=2, covariance_type='full', init_params='kmeans').fit(pc_df[['PC1', 'PC2']])
labels = gmm.predict(pc_df[['PC1', 'PC2']])
print(np.sum(labels==1), np.sum(labels==0))
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=labels, s=1, cmap='seismic');
probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
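# Note: GaussianMixture assigns component indices arbitrarily, so probs[:, 1]
# is the posterior of whichever component happened to be labeled 1; the label
# counts printed above are a quick way to check which component is the
# putative "match" cluster before reading probs[:, 1] as a match probability.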
pcv1 = pca.explained_variance_ratio_[0]
pcv2 = pca.explained_variance_ratio_[1]  # PC2's variance ratio is at index 1
pc_maxs = pc_df.max()
w1, w2 = pcv1 / (pcv1 + pcv2), pcv2 / (pcv1 + pcv2)
affinity = (pc_df['PC1'] * w1 + pc_df['PC2'] * w2) / (pc_maxs['PC1'] * w1 + pc_maxs['PC2'] * w2)
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=affinity, s=1, cmap='seismic');
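# A hypothetical helper (not part of matlas) making the affinity computation
# above reusable: project onto the chosen PCs with variance-ratio weights,
# then normalize by the same projection of the per-PC maxima.
def compute_affinity(df, pca, pcs=('PC1', 'PC2')):
    w = pca.explained_variance_ratio_[:len(pcs)]
    w = w / w.sum()  # renormalize the selected PCs' variance ratios
    proj = sum(df[pc] * wi for pc, wi in zip(pcs, w))
    proj_max = sum(df[pc].max() * wi for pc, wi in zip(pcs, w))
    return proj / proj_max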
labels = np.array([0]*pc_df.shape[0])
labels[np.array(true_indices['strong'])]=2
labels[np.array(true_indices['weak'])]=1
pc_df['label'] = labels
pc_df['index'] = pc_df.index
pc_df['seqno'] = seqlet_hits.seqno.values  # .values avoids index alignment
pc_df['pos'] = seqlet_hits.start_idx.values
pc_df['affinity'] = affinity
pc_df[:5]
import plotly.express as px
fig = px.scatter(pc_df, x="PC1", y="PC2", hover_data=['index', 'seqno', 'pos'], color='affinity',
color_continuous_scale=px.colors.sequential.Magma
)
fig
#seqlets pca, seqlet gmm
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
pc_maxs = _pc_df.max()
affinity = (_pc_df['PC1'] * w1 + _pc_df['PC2'] * w2) / (pc_maxs['PC1'] * w1 + pc_maxs['PC2'] * w2)
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=affinity, s=1, cmap='seismic');
#independent pca, seqlet gmm
_scaler, _pca, _train_hits = transform_data(_hits[keys_for_scoring])
print(_pca.explained_variance_ratio_)
_pc_df = pd.DataFrame(data = _train_hits, columns=["PC{}".format(i+1) for i in range(_train_hits.shape[1])])
_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
# seqlets pca, independent gmm
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
_gmm = GaussianMixture(n_components=2).fit(_pc_df[['PC1', 'PC2']])
_labels = _gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = _gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
#independent pca, independent gmm
_scaler, _pca, _train_hits = transform_data(_hits[keys_for_scoring])
print(_pca.explained_variance_ratio_)
_pc_df = pd.DataFrame(data = _train_hits, columns=["PC{}".format(i+1) for i in range(_train_hits.shape[1])])
_gmm = GaussianMixture(n_components=2).fit(_pc_df[['PC1', 'PC2']])
_labels = _gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = _gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
sorted_hits = _hits.copy()  # copy so the added columns don't mutate _hits
sorted_hits['probs'] = _probs[:,1]
sorted_hits['labels'] = _labels
sorted_hits = sorted_hits[sorted_hits['labels']==1]
print(sorted_hits.shape)
outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw"
filename = "m17_gmm.gz"
instancefile = "{0}/{1}".format(outdir, filename)
sorted_hits.to_csv(instancefile, sep='\t')
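# Note: recent pandas versions infer gzip compression from the ".gz" suffix,
# so the table above is written compressed without an explicit compression=
# argument.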
def predict_pattern(df, probs=None):
if probs is not None:
        df = df.loc[df['probs'] >= probs, :]
consensus = query['consensus']
summits = ob.deeplift_data['peaks']
display_name = ob.shorten_pattern_name(pattern_name)
peak_no = df.shape[0]
_indices = df.seqno.values
chroms = summits.chr.values[_indices]
starts = summits.start.values[_indices] + df.start_idx.values
ends = starts + len(consensus)
names = np.array(["{0}/{1}".format(display_name, consensus)]*peak_no)
scores = df.probs.values
strands = np.array(['.']*peak_no)
df = pd.DataFrame.from_dict({0:chroms, 1:starts, 2:ends, 3:names, 4:scores, 5:strands})
df = df.sort_values([0, 1, 4])
#df = df.drop_duplicates(subset=[0,1,2,3,5])
return df
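# predict_pattern emits a BED6-style frame: columns 0-5 hold chrom, start,
# end, name ("pattern/consensus"), score (the GMM posterior probability),
# and strand ('.' for unstranded hits), sorted by chrom, start and score.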
df = predict_pattern(sorted_hits)
from matlas.utils import send_to_mitra
outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw"
filename = "m17_gmm"
out_prefix = "{0}/{1}".format(outdir, filename)
send_to_mitra(df, out_prefix, mitra=outdir, hammock=True)
root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/web_mouse_heme_data/modisco_tracks/binary_model/task_273-naivegw"
dlhits = pd.read_csv("{}/m17_gmm.bed.gz".format(root), header=None, sep="\t")
ismhits = pd.read_csv("{}/m17_gmm_ism.bed.gz".format(root), header=None, sep="\t")
dlhits.shape, ismhits.shape
# pad both hit tables to the 10-column IDR-style input expected downstream
# (extra columns: score, -1, score, -1 placeholders)
for hits in (dlhits, ismhits):
    hits[6] = hits[4]
    hits[7] = -1
    hits[8] = hits[4]
    hits[9] = -1
dlhits.to_csv("{}/m17_gmm_idr.bed".format(root), header=False, index=False, sep="\t")
ismhits.to_csv("{}/m17_gmm_ism_idr.bed".format(root), header=False, index=False, sep="\t")
dlhits[:5]
from matlas.verification import compute_chip_matrix
compute_chip_matrix("MEL", "GATA", out_prefix, execute=True)
from matlas.verification import plot_chip_heatmap
plot_chip_heatmap("MEL", "GATA", out_prefix, execute=True)
#null 1 - shuffled deeplift scores: shuffled one-hot * hypothetical scores
#null 2 - keep the one-hot/sequence; shuffle the deeplift importance values along the positions
#null 3 - keep the importance values; randomize which base is set at each position
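# A minimal numpy sketch of the three nulls described above, assuming
# get_null_sequences shuffles along the position axis; toy_null below is
# hypothetical and the actual matlas implementation may differ.
def toy_null(one_hot, scores, shuffle_seq, shuffle_weight, rng=np.random):
    # one_hot, scores: (L, 4) arrays for a single sequence
    perm = rng.permutation(one_hot.shape[0])
    null_onehot = one_hot[perm] if shuffle_seq else one_hot.copy()
    per_base = scores.sum(axis=1)  # total importance at each position
    if shuffle_weight:
        per_base = per_base[rng.permutation(len(per_base))]
    # re-attach the per-position importance to whichever base is set there
    null_scores = null_onehot * per_base[:, None]
    return null_onehot, null_scores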
from matlas.matches import get_null_sequences
seqno = 0
null_onehot, null_score = get_null_sequences(one_hot[seqno], scores[seqno],
#total importance is same but shuffled the one-hot
shuffle_seq=True, shuffle_weight=False
)
null_onehot.shape, null_score.shape
from modisco.visualization import viz_sequence
print('real')
viz_sequence.plot_weights(scores[seqno], subticks_frequency=10, figsize=(20,1))
print('same weight, different seq')
viz_sequence.plot_weights(null_score, subticks_frequency=10, figsize=(20,1))
seqno = 0
null_onehot, null_score = get_null_sequences(one_hot[seqno], scores[seqno],
#weights are different but keep the one-hot
shuffle_seq=False, shuffle_weight=True
)
null_onehot.shape, null_score.shape
print('real')
viz_sequence.plot_weights(scores[seqno], subticks_frequency=10, figsize=(20,1))
print('same seq, different weights')
viz_sequence.plot_weights(null_score, subticks_frequency=10, figsize=(20,1))
seqno = 0
null_onehot, null_score = get_null_sequences(one_hot[seqno], scores[seqno],
#change seq, change weight
shuffle_seq=True, shuffle_weight=True
)
null_onehot.shape, null_score.shape
print('real')
viz_sequence.plot_weights(scores[seqno], subticks_frequency=10, figsize=(20,1))
print('different weight, different seq')
viz_sequence.plot_weights(null_score, subticks_frequency=10, figsize=(20,1))
from matlas.matches import get_null_sequences
seq_null_onehot, seq_null_scores = get_null_sequences(seqlet_onehot, seqlet_scores,
shuffle_seq=True, shuffle_weight=False
)
weight_null_onehot, weight_null_scores = get_null_sequences(seqlet_onehot, seqlet_scores,
shuffle_seq=False, shuffle_weight=True
)
weightedseq_null_onehot, weightedseq_null_scores = get_null_sequences(seqlet_onehot, seqlet_scores,
shuffle_seq=True, shuffle_weight=True
)
from matlas.matches import collect_scoring_measures
seqlet_scores = ob.contrib_per_pattern[pattern_name]
seqlet_onehot = ob.seqs_per_pattern[pattern_name]
method_to_seqlet_scores = collect_scoring_measures(query, seqlet_scores, seqlet_onehot)
query.keys(), np.sort(np.array(list(method_to_seqlet_scores.keys())))
method_to_shuffled_scores = collect_scoring_measures(query, shuffled_scores, shuffled_onehot)
method_to_seq_null_scores = collect_scoring_measures(query, seq_null_scores, seq_null_onehot)
method_to_weight_null_scores = collect_scoring_measures(query, weight_null_scores, weight_null_onehot)
method_to_weightedseq_null_scores = collect_scoring_measures(query, weightedseq_null_scores, weightedseq_null_onehot)
from matlas.matches import get_any_high_scoring_instances
per_base_imp = 0.0625
keys_for_scoring = ['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key != 'consensus']
seqlet_hits = get_any_high_scoring_instances(method_to_seqlet_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
def get_background_instances(fg, method_to_null_scores, keys=None):
    # given the fg coords, build the matching df of bg scores
    # apply the same scaling and PCA as the fg, then apply the log-likelihood contrast
    if keys is None:
        keys = list(method_to_null_scores.keys())
sum_scores = method_to_null_scores['sum_score']
matches = [(sum_scores[seqno, pos], seqno, pos) for seqno, pos in zip(fg['seqno'].values, fg['start_idx'].values)]
match_df = pd.DataFrame(np.array(matches))
match_df.columns = ['sum_score', 'seqno', 'start_idx']
match_df.seqno = match_df.seqno.values.astype(int)
match_df.start_idx = match_df.start_idx.values.astype(int)
for key in keys:
if(key=='sum_score'):
continue
match_df[key] = [method_to_null_scores[key][idx1, idx2] for
(idx1, idx2) in zip(fg['seqno'].values, fg['start_idx'].values)]
return match_df
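# Design note: get_background_instances scores the null tracks at the *same*
# (seqno, start_idx) coordinates as the foreground hits, giving one paired
# background row per foreground instance rather than an independently
# thresholded background set.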
shuffled_paired_hits = get_background_instances(seqlet_hits, method_to_shuffled_scores, keys=keys_for_scoring)
seq_null_paired_hits = get_background_instances(seqlet_hits, method_to_seq_null_scores, keys=keys_for_scoring)
weight_null_paired_hits = get_background_instances(seqlet_hits, method_to_weight_null_scores, keys=keys_for_scoring)
weightedseq_null_paired_hits = get_background_instances(seqlet_hits, method_to_weightedseq_null_scores, keys=keys_for_scoring)
seqlet_hits.shape, shuffled_paired_hits.shape, seq_null_paired_hits.shape, weight_null_paired_hits.shape, weightedseq_null_paired_hits.shape
scaler, pca, train_hits = transform_data(seqlet_hits[keys_for_scoring])
pca.explained_variance_ratio_
shuffled_seqlet_hits = get_any_high_scoring_instances(method_to_shuffled_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
seq_null_seqlet_hits = get_any_high_scoring_instances(method_to_seq_null_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
weight_null_seqlet_hits = get_any_high_scoring_instances(method_to_weight_null_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
weightedseq_null_seqlet_hits = get_any_high_scoring_instances(method_to_weightedseq_null_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
seqlet_hits['marker'] = 'foreground'
shuffled_seqlet_hits['marker'] = 'shuffled'
seq_null_seqlet_hits['marker'] = 'seq_null'
weight_null_seqlet_hits['marker'] = 'weight_null'
weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
test_hits = pd.concat([seqlet_hits,
shuffled_seqlet_hits,
seq_null_seqlet_hits,
weight_null_seqlet_hits,
weightedseq_null_seqlet_hits
])
#read the real truth
def load_true_positives(tp_file):
coordinates = []
with open(tp_file, "r") as fp:
for line in fp:
line = line.strip().split(",")
seqno = line[0]
pos = int(line[1])
if "-" in seqno:
seqno = seqno.split("-")
for i in range(int(seqno[0]), int(seqno[1])+1):
coordinates.append((i, pos))
else:
coordinates.append((int(seqno), pos))
return coordinates
def get_tp_indices(seqlet_hits, verbose=True):
keep = []
for cord in coordinates:
seqno, pos = cord
df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
if df.shape[0]>0:
keep.append(df.index[0])
else:
if verbose: print(cord)
return keep
tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, False)
import matplotlib.pyplot as plt
def plot_pca(df, true_ids=None):
print(df.shape)
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('PC1', fontsize = 15)
ax.set_ylabel('PC2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
if true_ids is not None:
true_df = df.loc[np.array(true_ids),:]
df = df.loc[np.array(list(set(list(df.index)).difference(set(true_ids)))),:]
ax.scatter(true_df['PC1'], true_df['PC2'], s = 20, c='r', marker='+')
ax.scatter(df['PC1'], df['PC2'], s = 1, c='b')
#ax.axvline(9)
#ax.axhline(-1)
ax.grid()
return None
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
plot_pca(pc_df, true_indices)
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=2).fit(pc_df[['PC1', 'PC2']])
labels = gmm.predict(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=labels, s=1, cmap='seismic');
probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
conditions = np.copy(labels)
conditions[np.logical_and(pc_df['PC1'].values<10, pc_df['PC2'].values<-0.5)] = 0
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=conditions, s=1, cmap='seismic');
# selected = np.array(list(set(list(np.argwhere(conditions==0).T[0])).difference(set(list(np.argwhere(labels==0).T[0])))))
# pc_df.loc[selected,:]
# seqlet_hits.loc[selected,:]
# probs[:,1]
from matlas.render_report import jupyter_nbconvert
import os
def render_test_plots(nbname="global_report"):
rendered_ipynb = "/mnt/lab_data/kundaje/msharmin/mouse_hem/code/mouse_hem/{}.ipynb".format(nbname)
jupyter_nbconvert(rendered_ipynb)
os.system("cp /mnt/lab_data/kundaje/msharmin/mouse_hem/code/mouse_hem/{}.html /srv/www/kundaje/msharmin/report/global_htmls/".format(nbname))
os.system("chmod +755 /srv/www/kundaje/msharmin/mouse_hem/report/global_htmls/{}.html".format(nbname))
return None
render_test_plots('testSpi1Scan')
_, _, test_hits = transform_data(shuffled_seqlet_hits[keys_for_scoring], scaler, pca)
shuffled_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
#plot_pca(shuffled_pc_df, true_ids=None)
probs = gmm.predict_proba(shuffled_pc_df[['PC1', 'PC2']])
plt.scatter(shuffled_pc_df['PC1'], shuffled_pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
import sys
def get_LLRs(fg_df, bg_df, keys=['PC1', 'PC2']):
llr_df = OrderedDict()
for key in keys:
fg = fg_df[key].values
bg = bg_df[key].values
_min = abs(min(np.min(fg), np.min(bg)))+0.0001
ratio = (fg+_min)/(bg+_min)
llrs = -2 * np.log(ratio)
llr_df[key] = llrs
llr_df = pd.DataFrame(llr_df)
return llr_df
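# Worked example (hypothetical values): with fg = 3.0, bg = 1.0 and a small
# offset _min, the ratio is ~3 and llrs = -2 * log(3) ~= -2.2, so stronger
# foreground signal maps to more negative values. Despite the name, this is
# a heuristic contrast on PC coordinates, not a true log-likelihood ratio.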
# project the paired shuffled hits into the same PC space before contrasting
_, _, _paired = transform_data(shuffled_paired_hits[keys_for_scoring], scaler, pca)
shuffled_paired_pc_df = pd.DataFrame(data=_paired, columns=["PC{}".format(i+1) for i in range(_paired.shape[1])])
llr_df = get_LLRs(pc_df, shuffled_paired_pc_df, keys=['PC1', 'PC2'])
#df[:5]
plot_pca(llr_df, true_ids=true_indices)
marker = np.array(['F'] * llr_df.shape[0], dtype=object)  # object dtype so 'TP' isn't truncated to 'T'
marker[np.array(true_indices)] = 'TP'
llr_df['marker'] = marker
import seaborn as sns
sns.pairplot(llr_df,
diag_kind="kde", kind='scatter', hue="marker")
from scipy import stats
def get_z_scores(df):
z_df = OrderedDict()
for key in df.columns:
if key == 'marker':
z_df[key] = df[key].values
continue
z_df[key] = stats.zscore(df[key].values)
z_df = pd.DataFrame(z_df)
return z_df
z_df = get_z_scores(llr_df)
plot_pca(z_df, true_ids=true_indices)
sns.pairplot(z_df,
diag_kind="kde", kind='scatter', hue="marker")
import scipy.stats
def get_p_values(df):
    z_scores = df['PC1'].values
    #p_values = 1 - scipy.special.ndtr(z_scores)
    #p_values = scipy.stats.norm.sf(abs(z_scores)) #one-sided
    p_values = scipy.stats.norm.sf(np.abs(z_scores)) * 2  # two-sided
    return p_values
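# Sanity check: a z-score of 1.96 gives scipy.stats.norm.sf(1.96) * 2 ~= 0.05,
# the familiar two-sided 5% level.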
def plot_data(p_df):
    # scatter -log10(p) for each marker group in p_df
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1, facecolor="1.0")
    for group, color in zip(p_df['marker'].unique(), ['slateblue', 'tomato']):
        sub = p_df[p_df['marker'] == group]
        ax.scatter(sub.index, sub['log10p'], alpha=0.8, c=color, edgecolors='none', s=30, label=group)
    plt.title('-log10(p) per instance')
    plt.legend(loc=2)
    plt.show()
    return None
p_values = get_p_values(z_df)
p_df = pd.DataFrame({'p':p_values, 'log10p':-np.log10(p_values), 'marker': z_df['marker'].values})
p_df
#from matlas.matches import view_test_case
def view_test_case(scores, one_hot, df, pwm, seqno, start=400, end=600, itemno=None):
df = df[df.seqno==seqno]
if(df.shape[0]==0):
print('Not part of the hits')
viz_sequence.plot_weights(scores[seqno, start:end,:],
highlight={"black": [(21, 29)]},
subticks_frequency=10, figsize=(20,1))
viz_sequence.plot_weights(one_hot[seqno, start:end, :],
highlight={"black": [(21, 29)]},
subticks_frequency=10, figsize=(20,1))
return None
keys = set(df.columns).difference(set(['seqno', 'start_idx', 'marker']))
debug = 0
for i, row in df.iterrows():
if itemno is not None:
if debug!=itemno:
debug += 1
continue
else:
debug += 1
printables = ["{}: {}".format(key, round(row[key], 3)) for key in keys]
seqno = int(row['seqno'])
start_idx = int(row['start_idx'])
print(i,
printables,
"coordinates:",(seqno, start_idx))
start_motif = start_idx - start
end_motif = start_motif+len(pwm)
print(start_idx, start_motif, end_motif)
if(end_motif > (end-start) or end_motif<0):
continue
viz_sequence.plot_weights(scores[seqno, start:end,:],
highlight={"black": [(start_motif, end_motif)]},
subticks_frequency=10, figsize=(20,1))
viz_sequence.plot_weights(one_hot[seqno, start:end, :],
highlight={"black": [(start_motif, end_motif)]},
subticks_frequency=10, figsize=(20,1))
return None
view_test_case(seqlet_scores, seqlet_onehot, seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=17)
view_test_case(shuffled_scores, shuffled_onehot, shuffled_seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=16)
view_test_case(seq_null_scores, seq_null_onehot, seq_null_seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=17)
view_test_case(weight_null_scores, weight_null_onehot, weight_null_seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=25)
view_test_case(weightedseq_null_scores, weightedseq_null_onehot, weightedseq_null_seqlet_hits, query['ppm'], 0, start=0, end=200, itemno=27)
def compare_test_case(series1, series2, filter_='convolution'):
keys = set(series1.keys()).difference(set(['seqno', 'start_idx', 'marker']))
keys = ['sum_score'] + [key for key in keys if filter_ in key]
# set width of bar
barWidth = 0.25
# set height of bar
bars1 = [series1[key] for key in keys]
bars2 = [series2[key] for key in keys]
# Set position of bar on X axis
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]
# Make the plot
plt.bar(r1, bars1, color='#557f2d', width=barWidth, edgecolor='white', label='seqlet')
plt.bar(r2, bars2, color='#2d7f5e', width=barWidth, edgecolor='white', label='shuffled')
# Add xticks on the middle of the group bars
plt.xlabel('group', fontweight='bold')
plt.xticks([r + barWidth for r in range(len(bars1))], keys, rotation='vertical')
# Create legend & Show graphic
plt.legend()
plt.show()
return None
a=4; b=17
# compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], seqlet_hits.loc[b,:], 'masked')
a=17; b=16
# compare_test_case(seqlet_hits.loc[a,:], shuffled_seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], shuffled_seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], shuffled_seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], shuffled_seqlet_hits.loc[b,:], 'masked')
a=17; b=17
# compare_test_case(seqlet_hits.loc[a,:], seq_null_seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], seq_null_seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], seq_null_seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], seq_null_seqlet_hits.loc[b,:], 'masked')
a=17; b=25
# compare_test_case(seqlet_hits.loc[a,:], weight_null_seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], weight_null_seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], weight_null_seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], weight_null_seqlet_hits.loc[b,:], 'masked')
a=17; b=27
# compare_test_case(seqlet_hits.loc[a,:], weightedseq_null_seqlet_hits.loc[b,:], 'scan')
# compare_test_case(seqlet_hits.loc[a,:], weightedseq_null_seqlet_hits.loc[b,:], 'convolution')
# compare_test_case(seqlet_hits.loc[a,:], weightedseq_null_seqlet_hits.loc[b,:], 'jaccard')
# compare_test_case(seqlet_hits.loc[a,:], weightedseq_null_seqlet_hits.loc[b,:], 'masked')
import seaborn as sns
sns.pairplot(test_hits[['sum_score',
'ppm_scan', 'pwm_scan', 'log10pwm_scan', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(test_hits[['sum_score',
'cwm_scan', 'hyp_cwm_scan', 'pssm_scan', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(test_hits[['sum_score',
'ppm_jaccard', 'pwm_jaccard', 'log10pwm_jaccard', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(test_hits[['sum_score',
'cwm_jaccard', 'hyp_cwm_jaccard', 'pssm_jaccard', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(test_hits[['sum_score',
'ppm_convolution', 'pwm_convolution', 'log10pwm_convolution', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(test_hits[['sum_score',
'cwm_convolution', 'hyp_cwm_convolution', 'pssm_convolution', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(test_hits[['sum_score',
'ppm_masked_convolution', 'pwm_masked_convolution', 'log10pwm_masked_convolution', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(test_hits[['sum_score',
'cwm_masked_convolution', 'hyp_cwm_masked_convolution', 'pssm_masked_convolution', 'marker']],
diag_kind="kde", kind='scatter', hue="marker")
#dimension reduction
len(keys_for_scoring)
keys_for_scoring = ['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm', 'log10pwm', 'pwm']]
import umap
trans = umap.UMAP(n_neighbors=5, random_state=42).fit(seqlet_hits[keys_for_scoring])
print(trans.embedding_.shape)
plt.scatter(trans.embedding_[:, 0], trans.embedding_[:, 1], s=1, cmap='Spectral')
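# umap-learn supports out-of-sample projection: once fit() has run,
# .transform() embeds new rows into the learned space, mirroring the
# scaler/PCA pattern used above for the null sets.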
test_embedding = trans.transform(seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
test_embedding = trans.transform(shuffled_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
test_embedding = trans.transform(seq_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
test_embedding = trans.transform(weight_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
test_embedding = trans.transform(weightedseq_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
type(test_embedding)
umap_seqlet_hits = pd.DataFrame(trans.transform(seqlet_hits[keys_for_scoring]))
umap_shuffled_seqlet_hits = pd.DataFrame(trans.transform(shuffled_seqlet_hits[keys_for_scoring]))
umap_seq_null_seqlet_hits = pd.DataFrame(trans.transform(seq_null_seqlet_hits[keys_for_scoring]))
umap_weight_null_seqlet_hits = pd.DataFrame(trans.transform(weight_null_seqlet_hits[keys_for_scoring]))
umap_weightedseq_null_seqlet_hits = pd.DataFrame(trans.transform(weightedseq_null_seqlet_hits[keys_for_scoring]))
umap_seqlet_hits['marker'] = 'foreground'
umap_shuffled_seqlet_hits['marker'] = 'shuffled'
umap_seq_null_seqlet_hits['marker'] = 'seq_null'
umap_weight_null_seqlet_hits['marker'] = 'weight_null'
umap_weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
umap_test_hits = pd.concat([umap_seqlet_hits,
umap_shuffled_seqlet_hits,
umap_seq_null_seqlet_hits,
umap_weight_null_seqlet_hits,
umap_weightedseq_null_seqlet_hits
])
sns.pairplot(umap_test_hits,
diag_kind="kde", kind='scatter', hue="marker")
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
len(keys_for_scoring)
keys_for_scoring = ['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
#keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm', 'log10pwm', 'pwm']]
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(seqlet_hits[keys_for_scoring])
# Apply transform to both the training set and the test set.
train_hits = scaler.transform(seqlet_hits[keys_for_scoring])
from sklearn.decomposition import PCA
# Make an instance of the Model
pca = PCA(.98)
#fitting PCA on the training set only.
pca.fit(train_hits)
#Apply the mapping (transform) to both the training set and the test set.
train_hits = pca.transform(train_hits)
#test_hits = pca.transform(test_hits)
#seqlet_hits[seqlet_hits['seqno']==684]
pca.explained_variance_ratio_
#read the real truth
def get_tp_indices(seqlet_hits):
keep = []
for cord in coordinates:
seqno, pos = cord
df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
if df.shape[0]>0:
keep.append(df.index[0])
else:
print(cord)
return keep
tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits)
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
plot_pca(pc_df)
#test_df = pc_df.loc[np.array(list(set(list(pc_df.index)).difference(set(true_indices)))),:]
#test_df = test_df.loc[(test_df['PC1']>9) & (test_df['PC2']<0),:]
#seqlet_hits.loc[test_df.index, :]
seqlet_hits[seqlet_hits['seqno']==724]
test_hits = scaler.transform(shuffled_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
shuffled_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(shuffled_pc_df, true_ids=None)
test_hits = scaler.transform(seq_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
seq_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(seq_null_pc_df)
test_hits = scaler.transform(weight_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
weight_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(weight_null_pc_df)
test_hits = scaler.transform(weightedseq_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
weightedseq_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(weightedseq_null_pc_df)
pc_df[:5]
# .copy() avoids pandas SettingWithCopyWarning when adding the marker column
pca_seqlet_hits = pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_shuffled_seqlet_hits = shuffled_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_seq_null_seqlet_hits = seq_null_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_weight_null_seqlet_hits = weight_null_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_weightedseq_null_seqlet_hits = weightedseq_null_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_seqlet_hits['marker'] = 'foreground'
pca_shuffled_seqlet_hits['marker'] = 'shuffled'
pca_seq_null_seqlet_hits['marker'] = 'seq_null'
pca_weight_null_seqlet_hits['marker'] = 'weight_null'
pca_weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
pca_test_hits = pd.concat([pca_seqlet_hits,
pca_shuffled_seqlet_hits,
pca_seq_null_seqlet_hits,
pca_weight_null_seqlet_hits,
pca_weightedseq_null_seqlet_hits
])
sns.pairplot(pca_test_hits,
diag_kind="kde", kind='scatter', hue="marker")
import seaborn as sns
sns.pairplot(pd.concat([pca_seqlet_hits, pca_shuffled_seqlet_hits]),
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(pd.concat([pca_seqlet_hits, pca_seq_null_seqlet_hits]),
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(pd.concat([pca_seqlet_hits, pca_weight_null_seqlet_hits]),
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(pd.concat([pca_seqlet_hits, pca_weightedseq_null_seqlet_hits]),
diag_kind="kde", kind='scatter', hue="marker")
ob = DenovoModisco(modisco_dir)
ob.initialize_with_importance_scores(summit_file, perf_file, deeplift_hdf)  # loads all the importance scores
#ob.find_instances_in_seqlets() #set up the clustering of seqlets for patterns
#ob.save_instances(out_prefix) #predict+collect the instances of sequences for all patterns
pattern_name = "metacluster_1/pattern_7"
pwm_query, _, _, _ = ob.load_chomped_pattern(pattern_name, rc=False, plot=True)
test_hits = ob.find_pattern_instances_in_seqlets(pattern_name, per_base_imp=0.125, test_mode=True)
import seaborn as sns
sns.pairplot(test_hits[['sum_score',
'weight_convolution', 'pfm_convolution', 'pwm_convolution']],
diag_kind="kde", kind='scatter')
sns.pairplot(test_hits[['sum_score',
'weight_scan', 'pfm_scan', 'pwm_scan']],
diag_kind="kde", kind='scatter')
sns.pairplot(test_hits[['sum_score',
'weight_jaccard', 'pfm_jaccard', 'pwm_jaccard']],
diag_kind="kde", kind='scatter')
sns.pairplot(test_hits[['sum_score',
'weight_masked_convolution', 'pfm_masked_convolution', 'pwm_masked_convolution']],
diag_kind="kde", kind='scatter')
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(test_hits, metrics = {0:["pwm_convolution", "pfm_convolution"],
1:["pwm_convolution", "weight_convolution"],
2:["pwm_convolution", "pwm_scan"],
3:["pwm_convolution", "pwm_jaccard"]})
plots_seqlet_measures(test_hits, metrics = {0:["pwm_convolution", "pfm_jaccard"],
1:["pwm_convolution", "weight_jaccard"],
2:["pwm_convolution", "pwm_masked_convolution"],
3:["pwm_convolution", "pfm_masked_convolution"]})
ob.find_pattern_instances_in_seqlets(pattern_name, per_base_imp=0.0625, plot=True)
ob.train_seqlet_hits[pattern_name][:5]#.shape
# from random import sample
# random_seqlet_hits = sample(list(ob.train_seqlet_hits[pattern_name].index), 3)
#list(ob.train_seqlet_hits[pattern_name].index)
pwm_query, pfm_query, weight_query, consensus = ob.load_chomped_pattern(pattern_name, rc=False, plot=True)
#ob.deeplift_data['peaks'][(ob.deeplift_data['peaks'].chr=='chr2')&(ob.deeplift_data['peaks'].start>=3163300)]
from modisco.visualization import viz_sequence
viz_sequence.plot_weights(ob.deeplift_data['scores'][0, 400:600,:],
#highlight={"black": [(start_motif, end_motif)]},
subticks_frequency=10, figsize=(20,1))
viz_sequence.plot_weights(ob.deeplift_data['one_hot'][0, 400:600, :],
#highlight={"black": [(start_motif, end_motif)]},
subticks_frequency=10, figsize=(20,1))
from matlas.matches import check_summary_instances
#seqlet_onehot = ob.seqs_per_pattern[pattern_name]
#filtered_seqlet_hits = ob.final_seqlet_hits[pattern_name]
check_summary_instances(ob.final_seqlet_hits[pattern_name],
range(ob.seqs_per_pattern[pattern_name].shape[0]))
from matlas.matches import view_test_case
view_test_case(ob.contrib_per_pattern[pattern_name],
ob.seqs_per_pattern[pattern_name],
ob.final_seqlet_hits[pattern_name],
pwm_query,
seqno = 714,
start=0, end=70
)
# for seqno in np.sort(np.array([5055, 1635, 2437, 4616, 1353, 2185, 6253, 1550, 2446, 2257, 2353, 2325, 2038, 1945, 5019, 2303])):
# view_test_case(seqlet_scores, seqlet_onehot, filtered_seqlet_hits2, pwm_query, seqno=seqno, start=0, end=70)
from matlas.matches import view_test_cases
view_test_cases(ob.contrib_per_pattern[pattern_name],
ob.seqs_per_pattern[pattern_name],
ob.final_seqlet_hits[pattern_name],
pwm_query,
start=0, end=70,
max_examples_to_view=10)
#only forward pattern
df, consensus = ob._predict_pattern_instances_in_sequences("metacluster_1/pattern_7")
df.shape
# both forward and backward pattern
df = ob.predict_pattern_instances_in_sequences("metacluster_1/pattern_7")
df.shape
from matlas.matches import view_test_cases
view_test_cases(ob.deeplift_data['scores'],
ob.deeplift_data['one_hot'],
df,
pwm_query,
start=400, end=600,
max_examples_to_view=10)
ob.deeplift_data['peaks'].loc[67,:]
from matlas.utils import send_to_mitra
outdir = "{0}/web_mouse_heme_data/modisco_tracks/binary_model/task_{1}-naivegw".format(project_root, task_idx)
if not os.path.exists(outdir):
os.makedirs(outdir)
out_prefix = "{0}/mm17_hits_spectral_1_7".format(outdir)
send_to_mitra(df, out_prefix, mitra=outdir, hammock=True)
data_file = "{0}/spectral.p".format(ob.data_dir)
pickle.dump(ob.clustering, open(data_file, "wb"))
pickle.load(open(data_file, "rb"))
def print_hits(seqno, df):
df = df[df.seqno==seqno]
return df
seqno = 411
#print_hits(seqno, seqlet_hits)
#print_hits(seqno, filtered_seqlet_hits)
#print_hits(seqno, filtered_seqlet_hits2)
def print_positional_metadata(seqno, position, method_to_seqlet_scores):
print("seqno: ", seqno, ", position: ", position)
for key in method_to_seqlet_scores.keys():
print(key, ": ", method_to_seqlet_scores[key][seqno, position])
return None
#print_positional_metadata(411, 50)
#deeplift_data['peaks'].loc[67,:]#chr1:8135243:8134243, #67th in deeplift_scores
#np.argwhere(indices==67) #40, #40th in modisco_imp_scores
#seqlets_per_pattern = ob.mr.seqlets()
#seqlets_ = seqlets_per_pattern['metacluster_1/pattern_7']
# for seqlet in seqlets_:
# if(seqlet.seqname==40):
# break
# print(seqlet)#Seqlet(seqname=40, start=531, end=601, name='', strand='+')
#selected_summits.loc[40,:] #chr1:8134243:8135243