%load_ext autoreload
%autoreload 2
import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
import pickle
import h5py
import glob
import time
from matlas.genome_data import *
from matlas.pwms import shorten_tfname, get_motifDB_id_maps
from matlas.matches import fig2vdom, vdom_pssm
#from basepair.modisco.results import ModiscoResult
from matlas.modisco_results import ModiscoResult
project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
modisco_root = "{0}/{1}/Naive_modisco2019".format(srv_root, data_name)
homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)
basset_root = "{0}/{1}/Naive_basset_motifs".format(srv_root, data_name)
reportfile= "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"
from matlas.fimo_hits import load_deeplift_data
task_idx = 140
oc_root="/mnt/lab_data2/msharmin/oc-atlas"
deeplift_hdf = "{0}/deeplift2019_h5/task_{1}-naivegw/summit.h5".format(
oc_root, task_idx)
deeplift_data = load_deeplift_data(deeplift_hdf, keys=['one_hot', 'scores', 'shuffled_scores'])
hyp_scores = np.copy(deeplift_data['shuffled_scores'][:, 400:600, :])
b = np.sum(hyp_scores, axis=2)
# scores = np.copy(deeplift_data['scores'])
# a = np.sum(scores, axis=2)
c = deeplift_data['one_hot']
#for each seqno set a bin-number-gc calculation
from collections import Counter
c = np.copy(deeplift_data['one_hot'])[:, 400:600, :]
gc = lambda t: int(round(np.sum(t[:, 1:3]==1)/t.shape[0]*100))
bin_nos = np.array([gc(x) for x in c])
bins = sorted(Counter(bin_nos).items())
bins
start = time.time()
b_more = None
for i in range(10):
another_data = load_deeplift_data(deeplift_hdf, keys=['shuffled_scores'], shuffled_sets=[i])
b_1 = np.sum(another_data['shuffled_scores'][:, 400:600, :], axis=2)
if b_more is None:
b_more = np.expand_dims(b_1, 1)
else:
b_more = np.concatenate((b_more, np.expand_dims(b_1, 1)), axis=1)
#print(b_more.shape)
end = time.time()
print((end-start)/60)
np.array_equal(b_more, b_1)
dist_fns = OrderedDict()
for i, bin_ in enumerate(bins):
print(bin_)
x = np.squeeze(b_more[np.argwhere(bin_nos==bin_[0])]).flatten()
ecdf = ECDF(x, side='right')
inverted_edf = interp1d(ecdf.y, ecdf.x)
dist_fns[bin_[0]] = x
# dist_fns[bin_[0]]['ecdf'] = ecdf
# dist_fns[bin_[0]]['inverse_ecdf'] = inverted_edf
#dist_fns
start = time.time()
bin_nos, dist_fns = pickle.load(open("/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats/bin_fns_140", 'rb'))
#tmp2 = pickle.load(open("/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats/bin_fns_test2.p", 'rb'))
#pickle.dump([bin_nos, dist_fns], open("/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats/bin_fns_test2.p", "wb"))
end = time.time()
print((end-start)/60)
bin_nos.shape
#compute the distribution under a bin: extract the shuffled seqs, extract the middle 200bps and get the distribution
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.interpolate import interp1d
from scipy.stats import *
from scipy import stats
from random import sample
from matplotlib import pyplot as plt
debug = 4
for i, bin_ in enumerate(bins):
#x = b[np.argwhere(bin_nos==bin_[0])]
x = np.squeeze(b_more[np.argwhere(bin_nos==bin_[0])]).flatten()
#x = sample(list(x), 5000)
ecdf = ECDF(x, side='right') #fit
inverted_edf = interp1d(ecdf.y, ecdf.x)
print(bin_, x.shape, np.percentile(x, 99.95),
np.percentile(sample(list(x), min(5000, len(x))), 99.95),
inverted_edf(0.9995)
)
#k_val_in_bg = np.percentile(x, my_k)#my_k is the k I want to optimize, k_val_in_bg is k to be cut into high and low
continue
if i%2==0:
fig, axes = plt.subplots(1, 2, figsize=(16,6))
ax_ = axes[0]
else:
ax_ = axes[1]
ax_.plot(ecdf.x, ecdf.y, c='b', label='ecdf', linestyle=':')
p = np.linspace(0, 1)
ax_.plot(inverted_edf(p), p, c='r', label='interpolated', linestyle='--')
ax_.plot([np.percentile(x, (i*0.1)) for i in range(0,1001,1)],
[(i*0.001) for i in range(0,1001,1)], c='k', label='percentile', linestyle='-.')
ax_.set_title("{}, {}, {}, {}, {}".format(bin_[0], bin_[1],
np.percentile(x, 99.95),
np.percentile(sample(list(x), min(5000, len(x))), 99.95),
inverted_edf(0.9995)
))
#[(i*0.001) for i in range(0,1001,1)]
debug -= 1
if debug==0:
break
#break
# x = np.linspace(0.1, 1)
# y = inverted_edf(x)
# plt.plot(x, y, 'ro', x, y, 'b-')
# plt.show()
#[(i*0.001) for i in range(0,1001,1)]
#ecdf.y.shape, ecdf.x.shape, x.shape
np.percentile(x, 99.5), inverted_edf(0.995)
# [np.percentile(x, (i*0.1)) for i in range(0,1001,1)]
# np.sort(x)
from matlas.fimo_hits import collect_motif_df
fimo_df = collect_motif_df("/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_specific_hits/task_140/Ctcf.bed.gz")
bin_nos, dist_values = pickle.load(open("/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats/bin_fns_140", 'rb'))
bins = sorted(Counter(bin_nos).items())
dist_fns = OrderedDict()
for bin_, x in dist_values.items():
ecdf = ECDF(x, side='right')
dist_fns[bin_] = ecdf
p_vals = []
for i, instance in fimo_df.iterrows():
seqno = instance.seqno
start = instance.seqstart #to get the deeplift score
end = instance.seqend
bin_ = bin_nos[seqno]
mean_deeplift = instance.mean_deeplift
p_val = dist_fns[bin_](mean_deeplift)
p_vals.append(p_val)
from matlas.dlutils import load_deeplift_data
from matlas.fimo_hits import (dichotomize_chipseq_for_gold_standard,
#assign_significance_on_fimo_hits,
retain_fimo_hits_within_nb,
get_deeplift_passed_fimo_hits,
compute_instance_calling_performance,
optimize_performance_info,
check_performance_competition,
collect_motif_df,
plot_various_instance_scores,
decide_optimum_threshold,
collect_distance_info,
compute_distances_from_peaks,
plot_positional_information,
PEAK_TYPES
)
def prepare_verification_data_per_cell(
task_idx=273,
tf_biosample = "MEL cell line",
the_k=0.05627951420283896,
atac_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_specific_hits",
instance_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits",
deeplift_root="/mnt/lab_data2/msharmin/oc-atlas/deeplift2019_h5",
background_files_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats",
agnostic_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/agnostic_hits"
):
tf_targets = sorted(set(CHIP_PEAK_BEDS[tf_biosample].keys()).intersection(set(CHIP_TO_MOTIF.keys())))
motifs = [CHIP_TO_MOTIF[tf] for tf in tf_targets]
# divide chipseq peaks into gold_standard and nongold for each tf under a cell type
fimodir = "{0}/task_{1}".format(atac_root, task_idx)
atacpeaks = "{}/peaks.bed".format(fimodir)
#print(fimodir)
#print(atacpeaks)
for tf, motif in zip(tf_targets, motifs):
print(tf_biosample, tf, motif)
chippeaks = "{0}/{1}".format(CHIP_PEAK_BED_ROOT, CHIP_PEAK_BEDS[tf_biosample][tf])
instancedir="{}/task_{}/{}".format(instance_root, task_idx, motif)
#print(chippeaks)
print(instancedir)
#print()
# the_k = 0.25
# instance_bed_dl = "Dl_k{}.bed".format(the_k)
# instance_bed_nondl = "nonDl_k{}.bed".format(the_k)
# perform_df, truth_df = compute_instance_calling_performance(
# bedname='Fimo50.bed',
# key_to_optimize='1-cdf',
# instancedir=instancedir,
# atac_bed=atacpeaks,
# verbose=True
# )
# tmp = np.load("{}/pval_pval.npz".format(instancedir), allow_pickle=True)
# check_performance_competition(tmp.f.a.tolist(), tmp.f.b.tolist(), tmp.f.c.tolist(),
# info="{}, {}, {}".format(task_idx, tf_biosample, motif),
# selected_key=None
# )
best_key = decide_optimum_threshold(instancedir)
tmp = np.load("{}/pval_pval.npz".format(instancedir), allow_pickle=True)
check_performance_competition(tmp.f.a.tolist(), tmp.f.b.tolist(), tmp.f.c.tolist(),
info="{}, {}, {}".format(task_idx, tf_biosample, motif),
selected_key=best_key
)
plot_various_instance_scores(instancedir, fimodir, motif, selected_key=best_key)
#plot_various_instance_scores(instancedir, fimodir, motif)
return None
from ast import literal_eval
def prepare_verification_data():
total_zeros = 0
for key, tf_biosamples in OC_TO_TF_SAMPLES.items():
task_idx, sample_id, oc_sample = literal_eval(key)
#if task_idx in [140, 142]:
# continue
print(task_idx, oc_sample, tf_biosamples, len(tf_biosamples))
total_zeros = prepare_verification_data_per_cell(task_idx=task_idx,
tf_biosample=tf_biosamples[0],
the_k=PER_BASE_BACKGROUND_IMPORTANCES[task_idx]
)
#break
return None
start = time.time()
prepare_verification_data()
end = time.time()
print((end-start)/60)
dlk = fimo_df[fimo_df['deeplift_cdf']>0.9]
nondlk = fimo_df[fimo_df['deeplift_cdf']<=0.9]
from matplotlib import pyplot as plt
fig, axes = plt.subplots(1, 2, figsize=(16,6))
ax_pdf = axes[0]
ax_cdf = axes[1]
key_to_show = 'sum_deeplift_0'
ax_pdf.hist(dlk[key_to_show].values, density=True, histtype='stepfilled',
alpha=0.6, bins=60, label='high')
# plot the scores for an example sequence of a as well
ax_pdf.hist(nondlk[key_to_show].values, density=True, histtype='stepfilled',
alpha=0.6, bins=60, label='low')
ax_pdf.legend(loc='best', frameon=False)
ax_pdf.legend(loc='best', frameon=False)
ax_cdf.hist(dlk[key_to_show].values, cumulative=True, density=True, histtype='bar',
alpha=0.6, bins=60, label='high')
# plot the scores for an example sequence of a as well
ax_cdf.hist(nondlk[key_to_show].values, cumulative=True, density=True, histtype='bar',
alpha=0.6, bins=60, label='low')
ax_cdf.legend(loc='best', frameon=False)
plt.show()
# revise performance measurements with changing the probability metric
# revise fimo50
# revise positional distribution and cdfs for a bed
# zcat chippeaks.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a ../../../atac_specific_hits/task_140/peaks.bed -b stdin -wb -wa -v > nongold3.bed
# zcat chippeaks.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a stdin -b ../../../atac_specific_hits/task_140/peaks.bed -wb -wa > gold.bed
# zcat /mnt/lab_data2/msharmin/tf-mouse-atlas/chip_idrs/ENCFF806PDR.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a stdin -b atac_specific_hits/task_140/peaks.bed -wb > atac_various_hits/task_140/Ctcf/gold.bed
b = np.sum(null_scores, axis=2)
a = np.sum(deeplift_scores, axis=2)
a.shape, b.shape
np.argmin(cdf_scores[:,5]), np.min(cdf_scores[:,5])
i = [0, 1]
c = b[0].flatten()
print(c.shape)
shape, mean, stdev = t.fit(c)
kstat, kval = stats.kstest(c, 't', args=[shape, mean, stdev])
# mean, stdev = norm.fit(c)
# kstat, kval = stats.kstest(c, 'norm', args=[ mean, stdev])
# shape, mean, stdev = gamma.fit(c)
# kstat, kval = stats.kstest(c, 'gamma', args=[shape, mean, stdev])
print(kstat, kval)
from matlas.fimo_hits import plot_fitted_dist_for_importance
plot_fitted_dist_for_importance(c, a[0], shape, mean, stdev)
# scores_ = [np.sum(j) for j in scores[0][300:700]]
# shape, mean, stdev = t.fit(scores_)
# kstat, kval = stats.kstest(scores_, 't', args=[shape, mean, stdev])
# print('kstat:', kstat, ' kval:', kval)
from scipy import stats
from scipy.stats import *
shape, mean, stdev = t.fit(b)
print(shape, mean, stdev)
kstat, kval = stats.kstest(b, 't', args=[shape, mean, stdev])
print('t sitribution', kstat, kval)
kstat, kval = stats.kstest(b, 'norm')
print('normal sitribution', kstat, kval)
#get the scores for all the windows, take per_base_imp distribution
# nonagnostic_out = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/Ctcf.bed.gz"
# df = pd.read_csv(nonagnostic_out, sep="\t")
# selected_peaks = np.sort(np.array(list(set(df['seqno'].values))))
# selected_null_scores = hyp_scores[selected_peaks,:,:]
# selected_null_scores.shape
# atacdf = pd.read_csv("/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/peaks.bed", header=None, sep="\t")
# #selected_atacs = atacdf.loc[selected_peaks,:]
# #selected_atacs.to_csv("/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/peaks_Ctcf.bed",
# #header=None, index=None, sep="\t")
from matlas.render_report import render_cell_tf_reports_in_parallel
render_cell_tf_reports_in_parallel()
# call dichotomize_chipseq_for_gold_standard for each tf under MEL
# call get_deeplift_passed_fimo_hits for each tf under MEL
# call get_peaks_within_nb for each tf under MEL
# for a tf: collect_distance_info, plot_distances_from_peaks
# for a tf: collect_performance_info, plot_performance_info
# for a tf: compute_joint_chip_matrix, plot_joint_chip_heatmap
# for a tf: load the png file
from matlas.fimo_hits import combine_motif_dfs
logdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf"
outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_{}".format(273)
nonagnostic_out = "{0}/{1}.bed.gz".format(outdir, "Adnp2")
df_dict = combine_motif_dfs(logdir, outdir,
combine_func="collect_motif_df",
)
df = pd.concat([df_dict[key] for key in df_dict.keys()])
#nonz_dfs = combine_motif_dfs(logdir, outdir, zscored=False)
from matlas.fimo_hits import plot_motif_scores
motifs = ['Spi1', 'Ctcf', 'Gata1']
turn = 0
fig, axes = None, None
for key in motifs:
fig, axes = plot_motif_scores(df_dict[key], title=key, turn=turn, axes=axes, fig=fig, kind='scatter')
turn = (turn+1)%4
import seaborn as sns
combined_df = pd.concat([df_dict[key] for key in df_dict.keys()])
sns.jointplot(x="avg_deeplift", y="fimo", data=combined_df, s=1, marginal_kws=dict(bins=100));
df_dict['Spi1'][:5]
from matlas.render_report import render_test_plots
render_test_plots(nbname="FimoVsDeepliftReport")
gencode_file = '/mnt/data/annotations/by_release/mm10/GENCODE_ann/gencode.vM6.annotation.gtf'
gencode = pd.read_table(gencode_file, sep='\t', header=None, comment='#')
genes = gencode[(gencode[0]!='chrM') & (gencode[2]=='gene')&(np.array(['protein_coding' in i for i in gencode[8].values]))]
gene_df = genes.loc[:, [0,3,4,8,5,6]]
gene_df = gene_df.sort_values([0,3])
gene_df.to_csv("{0}/genecode.bed.gz".format(modisco_dir), header=False, index=False, compression='gzip', sep="\t")
genes[:5]