%load_ext autoreload
%autoreload 2
import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
import h5py
import glob
from matlas.genome_data import *
from matlas.pwms import shorten_tfname, get_motifDB_id_maps
from matlas.matches import fig2vdom, vdom_pssm
from matlas.modisco_results import ModiscoResult
from vdom.helpers import (h1, p, li, img, div, b, br, ul, a,
details, summary,
table, thead, th, tr, tbody, td, ol)
from IPython.display import display
project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
modisco_root = "{0}/{1}/Naive_modisco2019".format(srv_root, data_name)
homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)
basset_root = "{0}/{1}/Naive_basset_motifs".format(srv_root, data_name)
reportfile= "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"
from matlas.matches import DenovoModisco
task_idx = 273
modisco_dir = "{0}/task_{1}-naivegw".format(modisco_root, task_idx)
summit_file = "{0}/{1}/fineFactorized/task_{2}-naivegw/predictions/task_{2}-{3}Summit.calibrated.{1}.tab.gz".format(project_root, data_name, task_idx, "Naive")
perf_file = "{0}/task_{1}-naivegw/NaiveauPRC.txt".format(model_root, task_idx)
deeplift_hdf = "{0}/{1}/Naive_deeplift2019/task_{2}-naivegw/summit.h5".format(srv_root, data_name, task_idx)
ob = DenovoModisco(modisco_dir)
pattern_name = None
pattern_names = list(ob.denovo_pwms.keys()) if pattern_name is None else [pattern_name]
query = ob.load_chomped_pattern(pattern_names[0], plot=False)
#loads all the important scores
ob.initialize_with_importance_scores(summit_file, perf_file, deeplift_hdf, query.keys(), use_all_keys=True)
ob.per_base_imp
ob.keys_for_scoring = ['sum_score']
ob.keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
method_to_null_scores = OrderedDict()
method_to_seqlet_scores = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, rc=False)
method_to_null_scores[pattern_name] = ob.compute_null_scores_in_seqlets(pattern_name, query,
use_all_seqlets=True, debug=True)
method_to_seqlet_scores[pattern_name] = ob.compute_scores_in_seqlets(pattern_name, query,
use_all_seqlets=True, debug=True)
#new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits)
# from matlas.match_utils import get_any_high_scoring_instances
# seqlet_hits = OrderedDict()
# for pattern_name in ob.denovo_pwms.keys():
# query = ob.load_chomped_pattern(pattern_name, rc=False)
# seqlet_hits[pattern_name] = get_any_high_scoring_instances(method_to_seqlet_scores[pattern_name],
# min_tot_imp=ob.per_base_imp*len(query['ppm']),
# keys=ob.keys_for_scoring)
# seqlet_hits[pattern_name]['sum_score'] = seqlet_hits[pattern_name]['sum_score']/len(query['ppm'])
seqlet_hits = method_to_seqlet_scores
# null_hits = OrderedDict()
# for pattern_name in ob.denovo_pwms.keys():
# query = ob.load_chomped_pattern(pattern_name, rc=False)
# null_hits[pattern_name] = get_any_high_scoring_instances(method_to_null_scores[pattern_name],
# min_tot_imp=ob.per_base_imp*len(query['ppm']),
# keys=ob.keys_for_scoring)
# null_hits[pattern_name]['sum_score'] = null_hits[pattern_name]['sum_score']/len(query['ppm'])
null_hits = method_to_null_scores
from sklearn.preprocessing import StandardScaler
def scale_data(_hits, scaler=None):
if scaler is None:
# should be used only during training
scaler = StandardScaler()
scaler.fit(_hits)
# Apply transform to both the training set and testing set
_hits = scaler.transform(_hits)
return scaler, _hits
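# Minimal sketch of scale_data's fit/reuse contract (toy arrays; the '_toy_*'
# names are illustrative only, not part of the pipeline):
_toy_train = np.random.randn(100, 3)
_toy_test = np.random.randn(20, 3)
_toy_scaler, _toy_train_scaled = scale_data(_toy_train)    # fit + transform
_, _toy_test_scaled = scale_data(_toy_test, _toy_scaler)   # reuse the fitted scaler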
features = ['sum_score', 'pwm_scan',
'hyp_cwm_scan', 'cwm_scan',
'hyp_cwm_jaccard', 'cwm_jaccard',
'hyp_cwm_masked_convolution'
]
scaled_seqlet_hits = OrderedDict()
scaled_null_hits = OrderedDict()
scalers = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
scalers[pattern_name], scaled_seqlet_hits[pattern_name] = scale_data(seqlet_hits[pattern_name][features])
_, scaled_null_hits[pattern_name] = scale_data(null_hits[pattern_name][features], scalers[pattern_name])
for pattern_name in ob.denovo_pwms.keys():
scaled_seqlet_hits[pattern_name] = pd.DataFrame(scaled_seqlet_hits[pattern_name], columns=features)
#scaled_seqlet_hits[pattern_name].columns = seqlet_hits[pattern_name][features].columns
for pattern_name in ob.denovo_pwms.keys():
scaled_null_hits[pattern_name] = pd.DataFrame(scaled_null_hits[pattern_name], columns=features)
#scaled_null_hits[pattern_name].columns = null_hits[pattern_name][features].columns
from sklearn.decomposition import PCA
def reduce_data(_hits, pca=None):
if pca is None:
# Make an instance of the Model
pca = PCA(.98)
pca.fit(_hits)
#Apply the mapping (transform) to both the training set and the test set.
_hits = pca.transform(_hits)
return pca, _hits
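# PCA(.98) keeps the smallest number of components that explain >=98% of the
# variance. Fit/reuse sketch, continuing the illustrative '_toy_*' arrays above:
_toy_pca, _toy_train_pcs = reduce_data(_toy_train_scaled)   # fit + transform
_, _toy_test_pcs = reduce_data(_toy_test_scaled, _toy_pca)  # reuse the fitted PCA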
features = ['sum_score', 'pwm_scan',
'hyp_cwm_scan', 'cwm_scan',
'hyp_cwm_jaccard', 'cwm_jaccard',
'hyp_cwm_masked_convolution'
]
pca_seqlet_hits = OrderedDict()
pca_null_hits = OrderedDict()
pcas = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
pcas[pattern_name], pca_seqlet_hits[pattern_name] = reduce_data(scaled_seqlet_hits[pattern_name][features])
_, pca_null_hits[pattern_name] = reduce_data(scaled_null_hits[pattern_name][features], pcas[pattern_name])
for pattern_name in ob.denovo_pwms.keys():
pca_seqlet_hits[pattern_name] = pd.DataFrame(pca_seqlet_hits[pattern_name])
pca_seqlet_hits[pattern_name].columns = ["PC{}".format(i+1) for i in range(pca_seqlet_hits[pattern_name].shape[1])]
for pattern_name in ob.denovo_pwms.keys():
pca_null_hits[pattern_name] = pd.DataFrame(pca_null_hits[pattern_name])
pca_null_hits[pattern_name].columns = ["PC{}".format(i+1) for i in range(pca_null_hits[pattern_name].shape[1])]
outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/instances2019/task_273"
_df_dict = OrderedDict()
for i, pattern_name in enumerate(ob.denovo_pwms.keys()):
print(pattern_name)
fname = "{}/{}.gw.gz".format(outdir, i)
_df_dict[pattern_name] = pd.read_csv(fname, header=0, index_col=0, sep="\t")
print(_df_dict[pattern_name].shape)
_reduced_dict = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
_, _reduced_dict[pattern_name] = scale_data(_df_dict[pattern_name][features], scalers[pattern_name])
_, _reduced_dict[pattern_name] = reduce_data(_reduced_dict[pattern_name], pcas[pattern_name])
_reduced_dict[pattern_name] = pd.DataFrame(_reduced_dict[pattern_name])
_reduced_dict[pattern_name].columns = ["PC{}".format(i+1) for i in range(_reduced_dict[pattern_name].shape[1])]
import matplotlib.pyplot as plt
from sklearn.cluster import Birch
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.metrics import davies_bouldin_score
import warnings
warnings.filterwarnings("ignore")
keys = ['PC1', 'PC2']
gmms = OrderedDict()
birchs = OrderedDict()
gmm_cno = OrderedDict()
brc_cno = OrderedDict()
everything = OrderedDict()
for pattern_name in ob.denovo_pwms.keys():
info = OrderedDict()
#pattern_name = "metacluster_1/pattern_19"
    # only PC1 is used to set the cutoff: the 90th percentile of the null PC1 scores
    b = pca_null_hits[pattern_name]['PC1'].values
    the_k = np.percentile(b, 90)
    key_ks = [the_k]
    info['PC1_k'] = the_k
print(pattern_name, key_ks)
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(22,10))
ax_all = axes[0, 0]
ax_allBrc = axes[0, 1]
ax_gwBrc = axes[0, 2]
ax_gmm = axes[0, 3]
ax_sq = axes[1, 0]
ax_sqbrc = axes[1, 1]
ax_dnbrc = axes[1, 2]
ax_cmp = axes[1, 3]
df = pca_seqlet_hits[pattern_name]
    small_df = df[df['PC1']>=the_k][['PC1', 'PC2']]
    df = small_df
print(df.shape)
clustering = Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True).fit(
small_df.values)
means = []
for i in [0,1,2]:
means.append(np.mean(small_df[clustering.labels_==i], axis=0))
means = np.array(means)
    sq_cno = np.argmax(np.array(means[:,0]))  # use only the 1st column (PC1) to select the desired cluster
    ax_sq.scatter(small_df['PC1'], small_df['PC2'], c=clustering.labels_, s=1, cmap='plasma');
ax_sq.set_xlabel('PC1')
ax_sq.set_ylabel('PC2')
ax_sq.set_title(pattern_name + " (Birch, Seqlets), {}".format(sq_cno))
info['sq_cno'] = sq_cno
info['sq_brc'] = clustering
#anything
all_df = _reduced_dict[pattern_name][['PC1', 'PC2']]
df = all_df
ax_all.scatter(df['PC1'], df['PC2'], s=1, cmap='plasma');
ax_all.set_xlabel('PC1')
ax_all.set_ylabel('PC2')
ax_all.set_title(pattern_name+ " (All)")
#all birch
all_clustering = Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True).fit(
df.values)
all_means = []
for i in [0,1,2]:
all_means.append(np.mean(df[all_clustering.labels_==i], axis=0))
all_means = np.array(all_means)
all_cno = np.argmax(np.array(all_means[:,0]))
ax_allBrc.scatter(df['PC1'], df['PC2'], c=all_clustering.labels_, s=1, cmap='plasma');
ax_allBrc.set_xlabel('PC1')
ax_allBrc.set_ylabel('PC2')
ax_allBrc.set_title(pattern_name+ " (allBirch), {}".format(all_cno))
info['all_cno'] = all_cno
info['all_brc'] = all_clustering
#reduced and all birch
df = df[df['PC1']>=the_k][['PC1', 'PC2']]
all_labels = all_clustering.labels_[all_df['PC1']>=the_k]
ax_gwBrc.scatter(df['PC1'], df['PC2'], c=all_labels, s=1, cmap='plasma');
ax_gwBrc.set_xlabel('PC1')
ax_gwBrc.set_ylabel('PC2')
ax_gwBrc.set_title(pattern_name+ " (allBirch, GW), {}".format(all_cno))
#gw gmm
    gmm = GaussianMixture(n_components=3, means_init=means,
                          covariance_type='full',
                          init_params='kmeans').fit(df[['PC1', 'PC2']])
gmm_labels = gmm.predict(df)
means = []
for i in [0,1,2]:
means.append(np.mean(df[gmm_labels==i], axis=0))
means = np.array(means)
gmm_cno[pattern_name] = np.argmax(np.array(means[:,0]))
ax_gmm.scatter(df['PC1'], df['PC2'], c=gmm_labels, s=1, cmap='plasma');
ax_gmm.set_xlabel('PC1')
ax_gmm.set_ylabel('PC2')
ax_gmm.set_title(pattern_name+ " (sqGMM, GW), {}".format(gmm_cno[pattern_name]))
info['gmm_cno'] = gmm_cno[pattern_name]
info['gmm'] = gmm
#gw birch prediction
brc_labels = clustering.predict(df.values)
    ax_sqbrc.scatter(df['PC1'], df['PC2'], c=brc_labels, s=1, cmap='plasma');
ax_sqbrc.set_xlabel('PC1')
ax_sqbrc.set_ylabel('PC2')
ax_sqbrc.set_title(pattern_name + " (sqBirch, GW), {}".format(sq_cno))
#gw birch new
brc = Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True).fit(
df.values)
means = []
for i in [0,1,2]:
means.append(np.mean(df[brc.labels_==i], axis=0))
means = np.array(means)
brc_cno[pattern_name] = np.argmax(np.array(means[:,0]))
ax_dnbrc.scatter(df['PC1'], df['PC2'], c=brc.labels_, s=1, cmap='plasma');
ax_dnbrc.set_xlabel('PC1')
ax_dnbrc.set_ylabel('PC2')
ax_dnbrc.set_title(pattern_name+ " (dnBirch, GW), {}".format(brc_cno[pattern_name]))
info['brc_cno'] = brc_cno[pattern_name]
info['reduced_brc'] = brc
#fitness
#seq_brc_db = davies_bouldin_score(small_df.values, clustering.labels_)
#gw_brc_db = davies_bouldin_score(df.values, brc_labels)
#brc_db = davies_bouldin_score(df.values, brc.labels_)
#gmm_db = davies_bouldin_score(df.values, gmm_labels)
    # binarize each clustering into "selected cluster vs rest" before scoring
    new_all_brc_labels = np.zeros(df.shape[0], dtype=int); new_all_brc_labels[all_labels==all_cno] = 1
    new_seq_brc_labels = np.zeros(small_df.shape[0], dtype=int); new_seq_brc_labels[clustering.labels_==sq_cno] = 1
    new_gw_brc_labels = np.zeros(df.shape[0], dtype=int); new_gw_brc_labels[brc_labels==sq_cno] = 1
    new_brc_labels = np.zeros(df.shape[0], dtype=int); new_brc_labels[brc.labels_==brc_cno[pattern_name]] = 1
    new_gmm_labels = np.zeros(df.shape[0], dtype=int); new_gmm_labels[gmm_labels==gmm_cno[pattern_name]] = 1
new_all_brc_db = davies_bouldin_score(df.values, new_all_brc_labels)
new_seq_brc_db = davies_bouldin_score(small_df.values, new_seq_brc_labels)
new_gw_brc_db = davies_bouldin_score(df.values, new_gw_brc_labels)
new_brc_db = davies_bouldin_score(df.values, new_brc_labels)
new_gmm_db = davies_bouldin_score(df.values, new_gmm_labels)
barWidth = 0.25
bars = [new_all_brc_db, new_seq_brc_db, new_gmm_db, new_brc_db, new_gw_brc_db]
r1 = np.arange(len(bars))
ax_cmp.bar(r1, bars, color='#557f2d', width=barWidth, edgecolor='white')
    ax_cmp.set_xticks(r1)
ax_cmp.set_xticklabels(['Brc-All, {}'.format(all_cno),
'Brc-Seqlets, {}'.format(sq_cno),
'GMM-GW, {}'.format(gmm_cno[pattern_name]),
'dnBrc-GW, {}'.format(brc_cno[pattern_name]),
'sqBrc-GW, {}'.format(sq_cno)],
fontdict={'fontsize':12},
rotation='vertical')
#ax_cmp.legend()
ax_cmp.set_title('Davies Bouldin Scores')
plt.tight_layout()
plt.show()
everything[pattern_name] = info
#break
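# note: the Davies-Bouldin comparison above binarizes each clustering into
# "selected cluster vs rest", so a lower bar indicates a cleaner separation of
# the putative motif-instance cluster from the background.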
import matplotlib.pyplot as plt
keys = ['PC1', 'PC2']
for pattern_name in ob.denovo_pwms.keys():
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,5))
for i, key in enumerate(keys):
a = pca_seqlet_hits[pattern_name][key].values
b = pca_null_hits[pattern_name][key].values
n, bins, patches = axes[i].hist(a, 200, color='red', histtype='step')
n, bins, patches = axes[i].hist(b, 200, color='green', histtype='step')
axes[i].axvline(x=np.percentile(b, 95))
plt.title(pattern_name)
plt.show()
features = ['sum_score', 'pwm_scan',
'hyp_cwm_scan', 'cwm_scan',
'hyp_cwm_jaccard', 'cwm_jaccard',
'hyp_cwm_masked_convolution'
]
ob.keys_for_scoring = features
for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, rc=False)
#seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=True)
new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits[pattern_name])
import matplotlib.pyplot as plt
for pattern_name in ob.denovo_pwms.keys():
    query = ob.load_chomped_pattern(pattern_name, rc=False)  # reload so len(query['ppm']) matches this pattern
    a = method_to_seqlet_scores[pattern_name]['sum_score'].flatten()
b = method_to_null_scores[pattern_name]['sum_score'].flatten()
c = method_to_seqlet_scores[pattern_name]['hyp_cwm_masked_convolution'].flatten()
d = method_to_null_scores[pattern_name]['hyp_cwm_masked_convolution'].flatten()
#fig = plt.figure(figsize=(10, 8))
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,5))
n, bins, patches = axes[0].hist(a, 200, color='red', histtype='step')
n, bins, patches = axes[0].hist(b, 200, color='green', histtype='step')
axes[0].axvline(x=np.percentile(b, 99.95)/len(query['ppm']))
n, bins, patches = axes[1].hist(c, 200, color='red', histtype='step')
n, bins, patches = axes[1].hist(d, 200, color='green', histtype='step')
axes[1].axvline(x=np.percentile(d, 95))
plt.title(pattern_name)
plt.show()
import matplotlib.pyplot as plt
keys = ['hyp_cwm_jaccard', 'hyp_cwm_scan', 'hyp_cwm_masked_convolution']
for pattern_name in ob.denovo_pwms.keys():
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20,5))
for i, key in enumerate(keys):
a = seqlet_hits[pattern_name][key].values
b = null_hits[pattern_name][key].values
n, bins, patches = axes[i].hist(a, 200, color='red', histtype='step')
n, bins, patches = axes[i].hist(b, 200, color='green', histtype='step')
axes[i].axvline(x=np.percentile(b, 95))
plt.title(pattern_name)
plt.show()
#remove instances based on all match scores
filtered_seqlet_hits = OrderedDict()
filtered_null_hits = OrderedDict()
filtering_features = ['pwm_scan',
'hyp_cwm_scan', 'cwm_scan',
'hyp_cwm_jaccard', 'cwm_jaccard',
'hyp_cwm_masked_convolution'
]
for pattern_name in ob.denovo_pwms.keys():
    df = seqlet_hits[pattern_name]
    # filter cumulatively: an instance must clear the 90th-percentile null cutoff for every score
    for key in filtering_features:
        d = method_to_null_scores[pattern_name][key].flatten()
        the_k = np.percentile(d, 90)
        df = df[df[key]>the_k]
    filtered_seqlet_hits[pattern_name] = df
features = ['sum_score', 'pwm_scan',
'hyp_cwm_scan', 'cwm_scan',
'hyp_cwm_jaccard', 'cwm_jaccard',
'hyp_cwm_masked_convolution'
]
ob.keys_for_scoring = features
for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, rc=False)
#seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=True)
new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, filtered_seqlet_hits[pattern_name])
# the scoring keys are identical for every pattern, so set them once
ob.keys_for_scoring = ['sum_score']
ob.keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key != 'consensus']
ob.keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key != 'consensus']
ob.keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key != 'consensus']
ob.keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key != 'consensus']
for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, rc=False)
seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=False)
new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits)
features = ['sum_score', 'pssm_scan',
'hyp_cwm_scan', 'cwm_scan',
'hyp_cwm_jaccard', 'cwm_jaccard','pssm_jaccard',
'hyp_cwm_masked_convolution'
]
ob.keys_for_scoring = features
for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, rc=False)
seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=False)
new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits)
features = ['sum_score', 'pwm_scan',
'hyp_cwm_scan', 'cwm_scan',
'hyp_cwm_jaccard', 'cwm_jaccard',
'hyp_cwm_masked_convolution'
]
ob.keys_for_scoring = features
for pattern_name in ob.denovo_pwms.keys():
#pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, rc=False)
seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=False)
new_seqlet_hits = ob.prediction_in_seqlets(pattern_name, seqlet_hits)
imp1 = None
imp2 = None
imp3 = None
ratio = None
for pattern_name in ob.denovo_pwms.keys():
pca = ob.pca[pattern_name]
#print(pattern_name)
#print(pca.components_[:2,:])
if imp1 is None:
imp1 = pca.components_[0,:]
imp2 = pca.components_[1,:]
imp3 = pca.components_[2,:]
ratio = pca.explained_variance_ratio_[:5]
else:
imp1 = np.vstack((imp1, pca.components_[0,:]))
imp2 = np.vstack((imp2, pca.components_[1,:]))
imp3 = np.vstack((imp3, pca.components_[2,:]))
ratio = np.vstack((ratio, pca.explained_variance_ratio_[:5]))
imp1 = pd.DataFrame(imp1)
imp2 = pd.DataFrame(imp2)
imp3 = pd.DataFrame(imp3)
ratio = pd.DataFrame(ratio, columns=["PC{}".format(i+1) for i in range(ratio.shape[1])])
imp1.columns = features
imp2.columns = features
imp3.columns = features
print(imp1.shape, imp2.shape, imp3.shape)
ratio
import seaborn as sns
fig = plt.figure()
ax = sns.heatmap(imp1,
vmin=np.min(imp1.values),
vmax=np.max(imp1.values))
ax.set_title('PC1')
ax.set_ylabel('pattern #')
fig = plt.figure()
ax = sns.heatmap(imp2,
vmin=np.min(imp2.values),
vmax=np.max(imp2.values))
ax.set_title('PC2')
ax.set_ylabel('pattern #')
fig = plt.figure()
ax = sns.heatmap(imp3,
vmin=np.min(imp3.values),
vmax=np.max(imp3.values))
ax.set_title('PC3')
ax.set_ylabel('pattern #')
fig = plt.figure()
ax = sns.heatmap(ratio,
vmin=np.min(ratio.values),
vmax=np.max(ratio.values))
ax.set_title('explained ratio')
ax.set_ylabel('pattern #')
fig = plt.figure()
ax = sns.heatmap(imp1,
vmin=np.min(pd.concat([imp1, imp2, imp3]).values),
vmax=np.max(pd.concat([imp1, imp2, imp3]).values))
fig = plt.figure()
ax = sns.heatmap(imp2,
vmin=np.min(pd.concat([imp1, imp2, imp3]).values),
vmax=np.max(pd.concat([imp1, imp2, imp3]).values))
fig = plt.figure()
ax = sns.heatmap(imp3,
vmin=np.min(pd.concat([imp1, imp2, imp3]).values),
vmax=np.max(pd.concat([imp1, imp2, imp3]).values))
fig = plt.figure()
ax = sns.heatmap(ratio,
vmin=np.min(ratio.values),
vmax=np.max(ratio.values))
ob.keys_for_scoring = ['sum_score']
ob.keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
ob.keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, rc=False)
seqlet_hits = ob.compute_scores_in_seqlets(pattern_name, query, use_all_seqlets=False)
seqlet_hits.shape
#ob.deeplift_data['shuffled_scores']
# from matlas.sliding_similarities import sliding_sum
# a = sliding_sum(query['ppm'], ob.deeplift_data['shuffled_scores'])
# a = a.flatten()
# b = sliding_sum(query['ppm'], ob.deeplift_data['scores'])
# b = b.flatten()
# a = a/len(query['ppm'])
# b = b/len(query['ppm'])
from matlas.sliding_similarities import sliding_sum
average_motif_len = 15
a = sliding_sum(np.ones((average_motif_len, 4)), ob.deeplift_data['shuffled_scores'])
a = a.flatten()/average_motif_len
b = sliding_sum(np.ones((average_motif_len, 4)), ob.deeplift_data['scores'])
b = b.flatten()/average_motif_len
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10, 8))
n, bins, patches = plt.hist(a, 200, color='red', histtype='step')
n, bins, patches = plt.hist(b, 200, color='green', histtype='step')
plt.axvline(x=np.percentile(a, 99.95))  # a is already normalized per base above
plt.show()
ob.per_base_imp
pattern_name = "metacluster_1/pattern_7"
outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/instances2019"
query = ob.load_chomped_pattern(pattern_name, rc=False)
display_name = ob.shorten_pattern_name(pattern_name)
outprefix = "{}/{}".format(outdir, display_name)
ob.find_pattern_instances_in_seqlets(pattern_name, query,
fname="{}.seqlet.gmm.pdf".format(outprefix))
df = ob._predict_pattern_instances_in_sequences(pattern_name, query,
cluster_plot=True,
fname="{}.GW.gmm.pdf".format(outprefix)
)
fname = "{}.candidates.bed.gz".format(outprefix)
df.to_csv(fname, header=True, index=True, sep="\t", compression='gzip')
df[:5]
pos_df = df[df['labels']==ob.cluster_no[pattern_name]]
fname = "{}.candidates1.bed.gz".format(outprefix)
pos_df.to_csv(fname, header=True, index=True, sep="\t", compression='gzip')
pos_df.shape
rev_query = ob.load_chomped_pattern(pattern_name, rc=True, plot=True)
df = ob._predict_pattern_instances_in_sequences(pattern_name, rev_query,
cluster_plot=True,
fname="{}.rev.GW.gmm.pdf".format(outprefix)
)
fname = "{}.rev_candidates.bed.gz".format(outprefix)
df.to_csv(fname, header=True, index=True, sep="\t", compression='gzip')
pos_df = df[df['labels']==ob.cluster_no[pattern_name]]
fname = "{}.rev_candidates1.bed.gz".format(outprefix)
pos_df.to_csv(fname, header=True, index=True, sep="\t", compression='gzip')
#now combine the forward and reverse coords and check for redundancy...
fname = "{}.candidates1.bed.gz".format(outprefix)
for_df = pd.read_csv(fname, header=0, index_col=0, sep="\t")
fname = "{}.rev_candidates1.bed.gz".format(outprefix)
rev_df = pd.read_csv(fname, header=0, index_col=0, sep="\t")
#get the sequences common between
common_seqs = np.array(list(set(list(for_df.seqno.values)).intersection(set(list(rev_df.seqno.values)))))
common_seqs = np.sort(common_seqs)
print(common_seqs.shape)
fore_common = np.array([seqno in common_seqs for seqno in for_df.seqno.values])
rev_common = np.array([seqno in common_seqs for seqno in rev_df.seqno.values])
for_df[fore_common][:5]
rev_df[rev_common][:5]
#for each sequence get the distances from each other
def compute_distances(common_seqs, for_df, rev_df):
count = 0
distances = []
exemplify = False
for seqno in common_seqs:
fore = for_df[for_df.seqno==seqno]
rev = rev_df[rev_df.seqno==seqno]
res = np.subtract.outer(fore.start_idx.values, rev.start_idx.values).T
if res.shape[0]==res.shape[1]:
iu1 = np.triu_indices(res.shape[0])
res = np.abs(res[iu1])
else:
print(fore.seqno, res.shape)
print(res)
res = np.abs(res.flatten())
        if not exemplify and fore.shape[0]>1 and rev.shape[0]>1:
print(fore.seqno)
exemplify = True
#return res
#res = np.abs(res.flatten())
count += len(res)
distances.append(res)
distances = np.sort(np.concatenate(distances))
return distances
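# compute_distances returns the sorted absolute offsets between forward- and
# reverse-strand hit start positions within each shared sequence; small offsets
# suggest both strands are calling the same underlying instance.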
distances = compute_distances(common_seqs, for_df, rev_df)
import matplotlib.pyplot as plt
#plt.scatter(np.array(range(distances.shape[0])), distances, s=1);
plt.hist(distances, bins=100)
#curren_ax.hist(data, 50, facecolor='blue', alpha=0.5)
fore
rev
from modisco.visualization import viz_sequence
#[(560, 560+8), (656, 656+8), (589,589+8), (638, 638+8)]
highlights = [(60, 60+8), (156, 156+8), (89,89+8), (138, 138+8)]
viz_sequence.plot_weights(ob.deeplift_data["scores"][6379, 500:700,:],
highlight={"black": highlights},
subticks_frequency=10, figsize=(20,1)
)
len(query['ppm'])
from matlas.matches import DenovoModisco
task_idx = 273
modisco_dir = "{0}/task_{1}-naivegw".format(modisco_root, task_idx)
summit_file = "{0}/{1}/fineFactorized/task_{2}-naivegw/predictions/task_{2}-{3}Summit.calibrated.{1}.tab.gz".format(project_root, data_name, task_idx, "Naive")
perf_file = "{0}/task_{1}-naivegw/NaiveauPRC.txt".format(model_root, task_idx)
deeplift_hdf = "{0}/{1}/Naive_deeplift2019/task_{2}-naivegw/summit.h5".format(srv_root, data_name, task_idx)
ob = DenovoModisco(modisco_dir)
pattern_name = "metacluster_1/pattern_7"
query = ob.load_chomped_pattern(pattern_name, plot=True)
ob.initialize_with_importance_scores(summit_file, perf_file, deeplift_hdf, query.keys()) #loads all the important scores
deeplift_data = ob.deeplift_data
from matlas.utils import get_calibrated_peaks
calibrated_data = get_calibrated_peaks(summit_file, perf_file)
selected_summits = calibrated_data['selected_summits']
indices = calibrated_data['indices']
#from basepair.plot.profiles import extract_signal
from matlas.tracks import extract_signal
pattern_name = 'metacluster_1/pattern_7'
seqlet_scores = ob.contrib_per_pattern[pattern_name]
seqlet_onehot = ob.seqs_per_pattern[pattern_name]
# seqlets_ = ob.seqlets_per_pattern[pattern_name]
# shuffled_onehot = extract_signal(ob.deeplift_data['shuffled_onehot'][indices], seqlets_)
# shuffled_scores = extract_signal(ob.deeplift_data['shuffled_scores'][indices], seqlets_)
# shuffled_onehot.shape, shuffled_scores.shape
from matlas.matches import collect_scoring_measures
method_to_seqlet_scores = collect_scoring_measures(query, seqlet_scores, seqlet_onehot)
print(query.keys(), np.sort(np.array(list(method_to_seqlet_scores.keys()))))
scores = deeplift_data['scores']
one_hot = deeplift_data['one_hot']
method_to_scores = collect_scoring_measures(query, scores, one_hot)
from matlas.matches import get_any_high_scoring_instances
per_base_imp = 0.0625
keys_for_scoring = ['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key != 'consensus']
seqlet_hits = get_any_high_scoring_instances(method_to_seqlet_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
_hits = get_any_high_scoring_instances(method_to_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
keys_for_scoring=['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in ['pssm', 'hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_jaccard".format(key) for key in ['pssm', 'hyp_cwm']]
#keys_for_scoring += ["{}_convolution".format(key) for key in ['hyp_cwm', 'cwm']]
keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm']]
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
def transform_data(_hits, scaler=None, pca=None):
if scaler is None:
# should be used only during training
scaler = StandardScaler()
scaler.fit(_hits)
# Apply transform to both the training set and testing set
_hits = scaler.transform(_hits)
if pca is None:
# Make an instance of the Model
pca = PCA(.98)
#fitting PCA on the training set only.
pca.fit(_hits)
#Apply the mapping (transform) to both the training set and the test set.
_hits = pca.transform(_hits)
return scaler, pca, _hits
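# transform_data composes the StandardScaler and PCA(.98) steps into a single
# fit-or-reuse helper: omit scaler/pca to fit on training data, or pass the
# fitted objects to apply the same mapping to held-out data.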
scaler, pca, train_hits = transform_data(seqlet_hits[keys_for_scoring])
pca.explained_variance_ratio_
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
def load_true_positives(tp_file):
coordinates = []
weak_coordinates = []
with open(tp_file, "r") as fp:
for line in fp:
line = line.strip().split(",")
seqno = line[0]
pos = int(line[1])
if "-" in seqno:
seqno = seqno.split("-")
for i in range(int(seqno[0]), int(seqno[1])+1):
coordinates.append((i, pos))
else:
if len(line)>2:
#if 'weak' in line[2]:
weak_coordinates.append((int(seqno), pos))
#else:
#coordinates.append((int(seqno), pos))
else:
coordinates.append((int(seqno), pos))
return {'strong':coordinates, 'weak':weak_coordinates}
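# Expected tp_file format, inferred from the parser above (one entry per line):
#   "12,345"      -> strong match in sequence 12 at start position 345
#   "3-7,100"     -> strong matches at start 100 in each of sequences 3..7
#   "9,210,weak"  -> any third field marks the entry as a weak match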
def get_tp_indices(seqlet_hits, coordinates, verbose=True):
keep = OrderedDict()
for key in coordinates.keys():
keep[key] = []
for cord in coordinates[key]:
seqno, pos = cord
df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
if df.shape[0]>0:
keep[key].append(df.index[0])
else:
if verbose: print(cord)
return keep
tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp_new.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, coordinates, False)
len(true_indices['strong']), len(true_indices['weak']), seqlet_hits.shape#, sum(labels==1)
import matplotlib.pyplot as plt
def plot_pca(df, true_ids=None):
print(df.shape)
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('PC1', fontsize = 15)
ax.set_ylabel('PC2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
if true_ids is not None:
strong_ids = true_ids['strong']
strong_df = df.loc[np.array(strong_ids),:]
ax.scatter(strong_df['PC1'], strong_df['PC2'], s = 1, c='tomato')
weak_ids = true_ids['weak']
weak_df = df.loc[np.array(weak_ids),:]
ax.scatter(weak_df['PC1'], weak_df['PC2'], s = 5, c='darkred')
df = df.loc[np.array(list(set(list(df.index)).difference(
set(strong_ids).union(set(weak_ids))
))),:]
ax.scatter(df['PC1'], df['PC2'], s = 1, c='slateblue')
#ax.axvline(9)
#ax.axhline(-1)
ax.grid()
ax.legend(['strong match', 'weak match', 'no match'], loc="upper right")
return None
plot_pca(pc_df, true_indices)
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
#import sklearn.mixture.GaussianMixture
gmm = GaussianMixture(n_components=2, covariance_type='full', init_params='kmeans').fit(pc_df[['PC1', 'PC2']])
labels = gmm.predict(pc_df[['PC1', 'PC2']])
print(np.sum(labels==1), np.sum(labels==0))
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=labels, s=1, cmap='seismic');
probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
pcv1 = pca.explained_variance_ratio_[0]
pcv2 = pca.explained_variance_ratio_[1]
pc_maxs = pc_df.max()
affinity = (pc_df['PC1']*pcv1/(pcv1+pcv2)+pc_df['PC2']*pcv2/(pcv1+pcv2))/(pc_maxs['PC1']*pcv1/(pcv1+pcv2)+pc_maxs['PC2']*pcv2/(pcv1+pcv2))
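# 'affinity' is a variance-ratio-weighted combination of PC1 and PC2,
# normalized by the same combination of the per-PC maxima, so values are <= 1.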
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=affinity, s=1, cmap='seismic');
labels = np.array([0]*pc_df.shape[0])
labels[np.array(true_indices['strong'])]=2
labels[np.array(true_indices['weak'])]=1
pc_df['label'] = labels
pc_df['index'] = pc_df.index
pc_df['seqno'] = seqlet_hits.seqno
pc_df['pos'] = seqlet_hits.start_idx
pc_df['affinity'] = affinity
pc_df[:5]
import plotly.express as px
fig = px.scatter(pc_df, x="PC1", y="PC2", hover_data=['index', 'seqno', 'pos'], color='affinity',
color_continuous_scale=px.colors.sequential.Magma
)
fig
#seqlets pca, seqlet gmm
_, _, test_hits = transform_data(_hits[keys_for_scoring], scaler, pca)
_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
_labels = gmm.predict(_pc_df[['PC1', 'PC2']])
print(np.sum(_labels==1), np.sum(_labels==0))
_probs = gmm.predict_proba(_pc_df[['PC1', 'PC2']])
plt.scatter(_pc_df['PC1'], _pc_df['PC2'], c=_probs[:,1], s=1, cmap='seismic');
dlhits[:5]
from matlas.verification import compute_chip_matrix
compute_chip_matrix("MEL", "GATA", out_prefix, execute=True)
from matlas.verification import plot_chip_heatmap
plot_chip_heatmap("MEL", "GATA", out_prefix, execute=True)
from matlas.matches import collect_scoring_measures
seqlet_scores = ob.contrib_per_pattern[pattern_name]
seqlet_onehot = ob.seqs_per_pattern[pattern_name]
method_to_seqlet_scores = collect_scoring_measures(query, seqlet_scores, seqlet_onehot)
query.keys(), np.sort(np.array(list(method_to_seqlet_scores.keys())))
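# the shuffled_*/seq_null_*/weight_null_*/weightedseq_null_* score and one-hot
# tracks used below are assumed to be extracted beforehand, e.g. via the
# commented extract_signal calls earlier for the shuffled case.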
method_to_shuffled_scores = collect_scoring_measures(query, shuffled_scores, shuffled_onehot)
method_to_seq_null_scores = collect_scoring_measures(query, seq_null_scores, seq_null_onehot)
method_to_weight_null_scores = collect_scoring_measures(query, weight_null_scores, weight_null_onehot)
method_to_weightedseq_null_scores = collect_scoring_measures(query, weightedseq_null_scores, weightedseq_null_onehot)
from matlas.matches import get_any_high_scoring_instances
per_base_imp = 0.0625
keys_for_scoring = ['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key != 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key != 'consensus']
seqlet_hits = get_any_high_scoring_instances(method_to_seqlet_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
def get_background_instances(fg, method_to_null_scores, keys=None):
# given the fg coords, get the df of bg
    # apply scaling and pca as in fg, then apply log-likelihood
    if keys is None:
        keys = list(method_to_null_scores.keys())
sum_scores = method_to_null_scores['sum_score']
matches = [(sum_scores[seqno, pos], seqno, pos) for seqno, pos in zip(fg['seqno'].values, fg['start_idx'].values)]
match_df = pd.DataFrame(np.array(matches))
match_df.columns = ['sum_score', 'seqno', 'start_idx']
match_df.seqno = match_df.seqno.values.astype(int)
match_df.start_idx = match_df.start_idx.values.astype(int)
for key in keys:
if(key=='sum_score'):
continue
match_df[key] = [method_to_null_scores[key][idx1, idx2] for
(idx1, idx2) in zip(fg['seqno'].values, fg['start_idx'].values)]
return match_df
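# get_background_instances pairs each foreground hit coordinate (seqno, start_idx)
# with the corresponding value from a null score track, yielding a background
# score matched one-to-one to every foreground instance.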
shuffled_paired_hits = get_background_instances(seqlet_hits, method_to_shuffled_scores, keys=keys_for_scoring)
seq_null_paired_hits = get_background_instances(seqlet_hits, method_to_seq_null_scores, keys=keys_for_scoring)
weight_null_paired_hits = get_background_instances(seqlet_hits, method_to_weight_null_scores, keys=keys_for_scoring)
weightedseq_null_paired_hits = get_background_instances(seqlet_hits, method_to_weightedseq_null_scores, keys=keys_for_scoring)
seqlet_hits.shape, shuffled_paired_hits.shape, seq_null_paired_hits.shape, weight_null_paired_hits.shape, weightedseq_null_paired_hits.shape
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
def transform_data(_hits, scaler=None, pca=None):
if scaler is None:
# should be used only during training
scaler = StandardScaler()
scaler.fit(_hits)
# Apply transform to both the training set and testing set
_hits = scaler.transform(_hits)
if pca is None:
# Make an instance of the Model
pca = PCA(.98)
#fitting PCA on the training set only.
pca.fit(_hits)
#Apply the mapping (transform) to both the training set and the test set.
_hits = pca.transform(_hits)
return scaler, pca, _hits
scaler, pca, train_hits = transform_data(seqlet_hits[keys_for_scoring])
pca.explained_variance_ratio_
shuffled_seqlet_hits = get_any_high_scoring_instances(method_to_shuffled_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
seq_null_seqlet_hits = get_any_high_scoring_instances(method_to_seq_null_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
weight_null_seqlet_hits = get_any_high_scoring_instances(method_to_weight_null_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
weightedseq_null_seqlet_hits = get_any_high_scoring_instances(method_to_weightedseq_null_scores,
min_tot_imp=per_base_imp*len(query['ppm']),
keys=keys_for_scoring)
seqlet_hits['marker'] = 'foreground'
shuffled_seqlet_hits['marker'] = 'shuffled'
seq_null_seqlet_hits['marker'] = 'seq_null'
weight_null_seqlet_hits['marker'] = 'weight_null'
weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
test_hits = pd.concat([seqlet_hits,
shuffled_seqlet_hits,
seq_null_seqlet_hits,
weight_null_seqlet_hits,
weightedseq_null_seqlet_hits
])
#read the real truth
def load_true_positives(tp_file):
coordinates = []
with open(tp_file, "r") as fp:
for line in fp:
line = line.strip().split(",")
seqno = line[0]
pos = int(line[1])
if "-" in seqno:
seqno = seqno.split("-")
for i in range(int(seqno[0]), int(seqno[1])+1):
coordinates.append((i, pos))
else:
coordinates.append((int(seqno), pos))
return coordinates
def get_tp_indices(seqlet_hits, verbose=True):
keep = []
for cord in coordinates:
seqno, pos = cord
df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
if df.shape[0]>0:
keep.append(df.index[0])
else:
if verbose: print(cord)
return keep
tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits, False)
import matplotlib.pyplot as plt
def plot_pca(df, true_ids=None):
print(df.shape)
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('PC1', fontsize = 15)
ax.set_ylabel('PC2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
if true_ids is not None:
true_df = df.loc[np.array(true_ids),:]
df = df.loc[np.array(list(set(list(df.index)).difference(set(true_ids)))),:]
ax.scatter(true_df['PC1'], true_df['PC2'], s = 20, c='r', marker='+')
ax.scatter(df['PC1'], df['PC2'], s = 1, c='b')
#ax.axvline(9)
#ax.axhline(-1)
ax.grid()
return None
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
plot_pca(pc_df, true_indices)
from sklearn.mixture import GaussianMixture
#import sklearn.mixture.GaussianMixture
gmm = GaussianMixture(n_components=2).fit(pc_df[['PC1', 'PC2']])
labels = gmm.predict(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=labels, s=1, cmap='seismic');
probs = gmm.predict_proba(pc_df[['PC1', 'PC2']])
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
conditions = np.copy(labels)
conditions[np.logical_and(pc_df['PC1'].values<10, pc_df['PC2'].values<-0.5)] = 0
plt.scatter(pc_df['PC1'], pc_df['PC2'], c=conditions, s=1, cmap='seismic');
# selected = np.array(list(set(list(np.argwhere(conditions==0).T[0])).difference(set(list(np.argwhere(labels==0).T[0])))))
# pc_df.loc[selected,:]
# seqlet_hits.loc[selected,:]
# probs[:,1]
from matlas.render_report import jupyter_nbconvert
import os
def render_test_plots(nbname="global_report"):
rendered_ipynb = "/mnt/lab_data/kundaje/msharmin/mouse_hem/code/mouse_hem/{}.ipynb".format(nbname)
jupyter_nbconvert(rendered_ipynb)
os.system("cp /mnt/lab_data/kundaje/msharmin/mouse_hem/code/mouse_hem/{}.html /srv/www/kundaje/msharmin/report/global_htmls/".format(nbname))
os.system("chmod +755 /srv/www/kundaje/msharmin/mouse_hem/report/global_htmls/{}.html".format(nbname))
return None
render_test_plots('testSpi1Scan')
_, _, test_hits = transform_data(shuffled_seqlet_hits[keys_for_scoring], scaler, pca)
shuffled_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
#plot_pca(shuffled_pc_df, true_ids=None)
probs = gmm.predict_proba(shuffled_pc_df[['PC1', 'PC2']])
plt.scatter(shuffled_pc_df['PC1'], shuffled_pc_df['PC2'], c=probs[:,1], s=1, cmap='seismic');
marker = np.array(['F']*llr_df.shape[0])
marker[np.array(true_indices)] = 'TP'
llr_df['marker'] = marker
#dimension reduction
len(keys_for_scoring)
keys_for_scoring = ['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm', 'log10pwm', 'pwm']]
import umap
trans = umap.UMAP(n_neighbors=5, random_state=42).fit(seqlet_hits[keys_for_scoring])
print(trans.embedding_.shape)
plt.scatter(trans.embedding_[:, 0], trans.embedding_[:, 1], s=1, cmap='Spectral')
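# trans.transform() projects new points into the embedding learned on the
# foreground hits; the null hit sets below are mapped with this same transform
# so their embeddings are directly comparable.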
test_embedding = trans.transform(seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
test_embedding = trans.transform(shuffled_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
test_embedding = trans.transform(seq_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
test_embedding = trans.transform(weight_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
test_embedding = trans.transform(weightedseq_null_seqlet_hits[keys_for_scoring])
print(test_embedding.shape)
plt.scatter(test_embedding[:, 0], test_embedding[:, 1], s=1, cmap='Spectral')
type(test_embedding)
umap_seqlet_hits = pd.DataFrame(trans.transform(seqlet_hits[keys_for_scoring]))
umap_shuffled_seqlet_hits = pd.DataFrame(trans.transform(shuffled_seqlet_hits[keys_for_scoring]))
umap_seq_null_seqlet_hits = pd.DataFrame(trans.transform(seq_null_seqlet_hits[keys_for_scoring]))
umap_weight_null_seqlet_hits = pd.DataFrame(trans.transform(weight_null_seqlet_hits[keys_for_scoring]))
umap_weightedseq_null_seqlet_hits = pd.DataFrame(trans.transform(weightedseq_null_seqlet_hits[keys_for_scoring]))
umap_seqlet_hits['marker'] = 'foreground'
umap_shuffled_seqlet_hits['marker'] = 'shuffled'
umap_seq_null_seqlet_hits['marker'] = 'seq_null'
umap_weight_null_seqlet_hits['marker'] = 'weight_null'
umap_weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
umap_test_hits = pd.concat([umap_seqlet_hits,
umap_shuffled_seqlet_hits,
umap_seq_null_seqlet_hits,
umap_weight_null_seqlet_hits,
umap_weightedseq_null_seqlet_hits
])
sns.pairplot(umap_test_hits,
diag_kind="kde", kind='scatter', hue="marker")
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "ppm_scan"],
1:["sum_score", "ppm_convolution"],
2:["sum_score", "ppm_masked_convolution"],
3:["sum_score", "ppm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "pwm_scan"],
1:["sum_score", "pwm_convolution"],
2:["sum_score", "pwm_masked_convolution"],
3:["sum_score", "pwm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "log10pwm_scan"],
1:["sum_score", "log10pwm_convolution"],
2:["sum_score", "log10pwm_masked_convolution"],
3:["sum_score", "log10pwm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "pssm_scan"],
1:["sum_score", "pssm_convolution"],
2:["sum_score", "pssm_masked_convolution"],
3:["sum_score", "pssm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "cwm_scan"],
1:["sum_score", "cwm_convolution"],
2:["sum_score", "cwm_masked_convolution"],
3:["sum_score", "cwm_jaccard"]})
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
plots_seqlet_measures(shuffled_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
plots_seqlet_measures(seq_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
plots_seqlet_measures(weight_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
plots_seqlet_measures(weightedseq_null_seqlet_hits, metrics = {0:["sum_score", "hyp_cwm_scan"],
1:["sum_score", "hyp_cwm_convolution"],
2:["sum_score", "hyp_cwm_masked_convolution"],
3:["sum_score", "hyp_cwm_jaccard"]})
len(keys_for_scoring)
keys_for_scoring = ['sum_score']
keys_for_scoring += ["{}_scan".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_jaccard".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_convolution".format(key) for key in query.keys() if key!= 'consensus']
keys_for_scoring += ["{}_masked_convolution".format(key) for key in query.keys() if key!= 'consensus']
#keys_for_scoring += ["{}_masked_convolution".format(key) for key in ['hyp_cwm', 'log10pwm', 'pwm']]
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(seqlet_hits[keys_for_scoring])
# Apply transform to both the training set and the test set.
train_hits = scaler.transform(seqlet_hits[keys_for_scoring])
from sklearn.decomposition import PCA
# Make an instance of the Model
pca = PCA(.98)
#fitting PCA on the training set only.
pca.fit(train_hits)
#Apply the mapping (transform) to both the training set and the test set.
train_hits = pca.transform(train_hits)
#test_hits = pca.transform(test_hits)
#seqlet_hits[seqlet_hits['seqno']==684]
pca.explained_variance_ratio_
#read the real truth
def load_true_positives(tp_file):
coordinates = []
with open(tp_file, "r") as fp:
for line in fp:
line = line.strip().split(",")
seqno = line[0]
pos = int(line[1])
if "-" in seqno:
seqno = seqno.split("-")
for i in range(int(seqno[0]), int(seqno[1])+1):
coordinates.append((i, pos))
else:
coordinates.append((int(seqno), pos))
return coordinates
def get_tp_indices(seqlet_hits):
keep = []
for cord in coordinates:
seqno, pos = cord
df = seqlet_hits[(seqlet_hits['seqno']==seqno) & (seqlet_hits['start_idx']==pos)]
if df.shape[0]>0:
keep.append(df.index[0])
else:
print(cord)
return keep
tp_file = "{}/web_mouse_heme_data/modisco_tracks/binary_model/task_{}-naivegw/mm17_tp.txt".format(project_root, task_idx)
coordinates = load_true_positives(tp_file)
true_indices = get_tp_indices(seqlet_hits)
def plot_pca(df, true_ids=None):
print(df.shape)
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('PC1', fontsize = 15)
ax.set_ylabel('PC2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
if true_ids is not None:
true_df = df.loc[np.array(true_ids),:]
df = df.loc[np.array(list(set(list(df.index)).difference(set(true_ids)))),:]
ax.scatter(true_df['PC1'], true_df['PC2'], s = 20, c='r', marker='+')
ax.scatter(df['PC1'], df['PC2'], s = 1, c='b')
#ax.axvline(9)
#ax.axhline(-1)
ax.grid()
return None
pc_df = pd.DataFrame(data = train_hits, columns=["PC{}".format(i+1) for i in range(train_hits.shape[1])])
plot_pca(pc_df)
#test_df = pc_df.loc[np.array(list(set(list(pc_df.index)).difference(set(true_indices)))),:]
#test_df = test_df.loc[(test_df['PC1']>9) & (test_df['PC2']<0),:]
#seqlet_hits.loc[test_df.index, :]
seqlet_hits[seqlet_hits['seqno']==724]
test_hits = scaler.transform(shuffled_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
shuffled_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(shuffled_pc_df, true_ids=None)
test_hits = scaler.transform(seq_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
seq_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(seq_null_pc_df)
test_hits = scaler.transform(weight_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
weight_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(weight_null_pc_df)
test_hits = scaler.transform(weightedseq_null_seqlet_hits[keys_for_scoring])
test_hits = pca.transform(test_hits)
weightedseq_null_pc_df = pd.DataFrame(data = test_hits, columns=["PC{}".format(i+1) for i in range(test_hits.shape[1])])
plot_pca(weightedseq_null_pc_df)
pc_df[:5]
# .copy() avoids SettingWithCopyWarning when adding the 'marker' column below
pca_seqlet_hits = pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_shuffled_seqlet_hits = shuffled_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_seq_null_seqlet_hits = seq_null_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_weight_null_seqlet_hits = weight_null_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_weightedseq_null_seqlet_hits = weightedseq_null_pc_df[['PC1', 'PC2', 'PC3']].copy()
pca_seqlet_hits['marker'] = 'foreground'
pca_shuffled_seqlet_hits['marker'] = 'shuffled'
pca_seq_null_seqlet_hits['marker'] = 'seq_null'
pca_weight_null_seqlet_hits['marker'] = 'weight_null'
pca_weightedseq_null_seqlet_hits['marker'] = 'weightedseq'
pca_test_hits = pd.concat([pca_seqlet_hits,
pca_shuffled_seqlet_hits,
pca_seq_null_seqlet_hits,
pca_weight_null_seqlet_hits,
pca_weightedseq_null_seqlet_hits
])
import seaborn as sns
sns.pairplot(pd.concat([pca_seqlet_hits, pca_shuffled_seqlet_hits]),
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(pd.concat([pca_seqlet_hits, pca_seq_null_seqlet_hits]),
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(pd.concat([pca_seqlet_hits, pca_weight_null_seqlet_hits]),
diag_kind="kde", kind='scatter', hue="marker")
sns.pairplot(pd.concat([pca_seqlet_hits, pca_weightedseq_null_seqlet_hits]),
diag_kind="kde", kind='scatter', hue="marker")
import seaborn as sns
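# note: this cell assumes test_hits is the concatenated hits DataFrame built
# earlier (pd.concat of foreground and null seqlet hits with a 'marker' column),
# not the PCA-transformed numpy array assigned to test_hits in the cells above.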
sns.pairplot(test_hits[['sum_score',
'weight_convolution', 'pfm_convolution', 'pwm_convolution']],
diag_kind="kde", kind='scatter')
sns.pairplot(test_hits[['sum_score',
'weight_scan', 'pfm_scan', 'pwm_scan']],
diag_kind="kde", kind='scatter')
sns.pairplot(test_hits[['sum_score',
'weight_jaccard', 'pfm_jaccard', 'pwm_jaccard']],
diag_kind="kde", kind='scatter')
sns.pairplot(test_hits[['sum_score',
'weight_masked_convolution', 'pfm_masked_convolution', 'pwm_masked_convolution']],
diag_kind="kde", kind='scatter')
from matlas.matches import plots_seqlet_measures
plots_seqlet_measures(test_hits, metrics = {0:["pwm_convolution", "pfm_convolution"],
1:["pwm_convolution", "weight_convolution"],
2:["pwm_convolution", "pwm_scan"],
3:["pwm_convolution", "pwm_jaccard"]})
plots_seqlet_measures(test_hits, metrics = {0:["pwm_convolution", "pfm_jaccard"],
1:["pwm_convolution", "weight_jaccard"],
2:["pwm_convolution", "pwm_masked_convolution"],
3:["pwm_convolution", "pfm_masked_convolution"]})