%load_ext autoreload
%autoreload 2

import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
import pickle
import h5py
import glob
import time

from matlas.genome_data import *
from matlas.pwms import shorten_tfname, get_motifDB_id_maps
from matlas.matches import fig2vdom, vdom_pssm
#from basepair.modisco.results import ModiscoResult
from matlas.modisco_results import ModiscoResult


project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
modisco_root = "{0}/{1}/Naive_modisco2019".format(srv_root, data_name)

homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)
basset_root = "{0}/{1}/Naive_basset_motifs".format(srv_root, data_name)
reportfile= "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

from matlas.fimo_hits import load_deeplift_data
task_idx = 140
oc_root="/mnt/lab_data2/msharmin/oc-atlas"
deeplift_hdf = "{0}/deeplift2019_h5/task_{1}-naivegw/summit.h5".format(
    oc_root, task_idx)
deeplift_data = load_deeplift_data(deeplift_hdf, keys=['one_hot', 'scores', 'shuffled_scores'])
hyp_scores = np.copy(deeplift_data['shuffled_scores'][:, 400:600, :])
b = np.sum(hyp_scores, axis=2)

# scores = np.copy(deeplift_data['scores'])
# a = np.sum(scores, axis=2)

c = deeplift_data['one_hot']

#for each seqno set a bin-number-gc calculation
from collections import Counter
c = np.copy(deeplift_data['one_hot'])[:, 400:600, :]
gc = lambda t: int(round(np.sum(t[:, 1:3]==1)/t.shape[0]*100))
bin_nos = np.array([gc(x) for x in c])
bins = sorted(Counter(bin_nos).items())

bins

[(22, 2),
 (24, 8),
 (25, 5),
 (26, 24),
 (27, 7),
 (28, 91),
 (29, 41),
 (30, 187),
 (31, 106),
 (32, 497),
 (33, 225),
 (34, 874),
 (35, 414),
 (36, 1601),
 (37, 693),
 (38, 2761),
 (39, 1122),
 (40, 4088),
 (41, 1736),
 (42, 6152),
 (43, 2378),
 (44, 7936),
 (45, 3074),
 (46, 9830),
 (47, 3580),
 (48, 11203),
 (49, 3824),
 (50, 11678),
 (51, 3843),
 (52, 11098),
 (53, 3563),
 (54, 6475),
 (55, 5933),
 (56, 7872),
 (57, 4338),
 (58, 3818),
 (59, 1690),
 (60, 4472),
 (61, 1302),
 (62, 3617),
 (63, 1044),
 (64, 2977),
 (65, 958),
 (66, 2688),
 (67, 888),
 (68, 2507),
 (69, 820),
 (70, 2362),
 (71, 769),
 (72, 2099),
 (73, 625),
 (74, 1871),
 (75, 561),
 (76, 1498),
 (77, 430),
 (78, 1137),
 (79, 304),
 (80, 839),
 (81, 221),
 (82, 481),
 (83, 123),
 (84, 234),
 (85, 51),
 (86, 100),
 (87, 13),
 (88, 29),
 (89, 5),
 (90, 5),
 (92, 1)]

start = time.time()
b_more = None
for i in range(10):
    another_data = load_deeplift_data(deeplift_hdf, keys=['shuffled_scores'], shuffled_sets=[i])
    b_1 =  np.sum(another_data['shuffled_scores'][:, 400:600, :], axis=2)
    if b_more is None:
        b_more = np.expand_dims(b_1, 1)
    else:
        b_more = np.concatenate((b_more, np.expand_dims(b_1, 1)), axis=1)
    #print(b_more.shape)
end = time.time()
print((end-start)/60)

2.268065563837687

np.array_equal(b_more, b_1)

True

dist_fns = OrderedDict()
for i, bin_ in enumerate(bins):
    print(bin_)
    x = np.squeeze(b_more[np.argwhere(bin_nos==bin_[0])]).flatten()

    ecdf = ECDF(x, side='right')
    inverted_edf = interp1d(ecdf.y, ecdf.x)
    dist_fns[bin_[0]] = x
#     dist_fns[bin_[0]]['ecdf'] = ecdf
#     dist_fns[bin_[0]]['inverse_ecdf'] = inverted_edf
#dist_fns

(22, 2)
(24, 8)
(25, 5)
(26, 24)
(27, 7)
(28, 91)
(29, 41)
(30, 187)
(31, 106)
(32, 497)
(33, 225)
(34, 874)
(35, 414)
(36, 1601)
(37, 693)
(38, 2761)
(39, 1122)
(40, 4088)
(41, 1736)
(42, 6152)
(43, 2378)
(44, 7936)
(45, 3074)
(46, 9830)
(47, 3580)
(48, 11203)
(49, 3824)
(50, 11678)
(51, 3843)
(52, 11098)
(53, 3563)
(54, 6475)
(55, 5933)
(56, 7872)
(57, 4338)
(58, 3818)
(59, 1690)
(60, 4472)
(61, 1302)
(62, 3617)
(63, 1044)
(64, 2977)
(65, 958)
(66, 2688)
(67, 888)
(68, 2507)
(69, 820)
(70, 2362)
(71, 769)
(72, 2099)
(73, 625)
(74, 1871)
(75, 561)
(76, 1498)
(77, 430)
(78, 1137)
(79, 304)
(80, 839)
(81, 221)
(82, 481)
(83, 123)
(84, 234)
(85, 51)
(86, 100)
(87, 13)
(88, 29)
(89, 5)
(90, 5)
(92, 1)

start = time.time()
bin_nos, dist_fns = pickle.load(open("/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats/bin_fns_140", 'rb'))
#tmp2 = pickle.load(open("/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats/bin_fns_test2.p", 'rb'))
#pickle.dump([bin_nos, dist_fns], open("/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats/bin_fns_test2.p", "wb"))
end = time.time()
print((end-start)/60)

0.057411026954650876

bin_nos.shape

(157798,)

#compute the distribution under a bin: extract the shuffled seqs, extract the middle 200bps and get the distribution
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.interpolate import interp1d
from scipy.stats import *
from scipy import stats
from random import sample
from matplotlib import pyplot as plt

debug = 4
for i, bin_ in enumerate(bins):
    #x = b[np.argwhere(bin_nos==bin_[0])]
    x = np.squeeze(b_more[np.argwhere(bin_nos==bin_[0])]).flatten()
    #x = sample(list(x), 5000)
    ecdf = ECDF(x, side='right') #fit
    inverted_edf = interp1d(ecdf.y, ecdf.x)
    print(bin_, x.shape, np.percentile(x, 99.95), 
          np.percentile(sample(list(x), min(5000, len(x))), 99.95),
          inverted_edf(0.9995)
          
         )
    #k_val_in_bg = np.percentile(x, my_k)#my_k is the k I want to optimize, k_val_in_bg is k to be cut into high and low
    continue
    if i%2==0:
        fig, axes = plt.subplots(1, 2, figsize=(16,6))
        ax_ = axes[0]
    else:
        ax_ = axes[1]
    ax_.plot(ecdf.x, ecdf.y, c='b', label='ecdf', linestyle=':')
    p = np.linspace(0, 1)
    ax_.plot(inverted_edf(p), p, c='r', label='interpolated', linestyle='--')
    ax_.plot([np.percentile(x, (i*0.1)) for i in range(0,1001,1)], 
             [(i*0.001) for i in range(0,1001,1)], c='k', label='percentile', linestyle='-.')
    ax_.set_title("{}, {}, {}, {}, {}".format(bin_[0], bin_[1], 
                                              np.percentile(x, 99.95),
                                              np.percentile(sample(list(x), min(5000, len(x))), 99.95),
                                              inverted_edf(0.9995)
                                             ))
    #[(i*0.001) for i in range(0,1001,1)]
    debug -= 1
    if debug==0:
        break
    #break
    

# x = np.linspace(0.1, 1)
# y = inverted_edf(x)
# plt.plot(x, y, 'ro', x, y, 'b-')
# plt.show()

#[(i*0.001) for i in range(0,1001,1)]
#ecdf.y.shape, ecdf.x.shape, x.shape
np.percentile(x, 99.5), inverted_edf(0.995)

(0.047659593642922084, array(0.04762004))

# [np.percentile(x, (i*0.1)) for i in range(0,1001,1)]

# np.sort(x)

from matlas.fimo_hits import collect_motif_df
fimo_df = collect_motif_df("/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_specific_hits/task_140/Ctcf.bed.gz")
bin_nos, dist_values = pickle.load(open("/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats/bin_fns_140", 'rb'))
bins = sorted(Counter(bin_nos).items())
dist_fns = OrderedDict()
for bin_, x in dist_values.items():
    ecdf = ECDF(x, side='right')
    dist_fns[bin_] = ecdf

p_vals = []
for i, instance in fimo_df.iterrows():
    seqno = instance.seqno
    start = instance.seqstart #to get the deeplift score
    end = instance.seqend
    bin_ = bin_nos[seqno]
    mean_deeplift = instance.mean_deeplift
    p_val = dist_fns[bin_](mean_deeplift)
    p_vals.append(p_val)

from matlas.dlutils import load_deeplift_data
from matlas.fimo_hits import (dichotomize_chipseq_for_gold_standard, 
                              #assign_significance_on_fimo_hits,
                              retain_fimo_hits_within_nb,
                              get_deeplift_passed_fimo_hits,
                              compute_instance_calling_performance,
                              optimize_performance_info,
                              check_performance_competition,
                              collect_motif_df,
                              plot_various_instance_scores,
                              decide_optimum_threshold,
                              collect_distance_info,
                              compute_distances_from_peaks,
                              plot_positional_information,
                              PEAK_TYPES
                             )



def prepare_verification_data_per_cell(
    task_idx=273,
    tf_biosample = "MEL cell line",
    the_k=0.05627951420283896,
    atac_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_specific_hits",
    instance_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits",
    deeplift_root="/mnt/lab_data2/msharmin/oc-atlas/deeplift2019_h5",
    background_files_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats",
    agnostic_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/agnostic_hits"
):

    tf_targets = sorted(set(CHIP_PEAK_BEDS[tf_biosample].keys()).intersection(set(CHIP_TO_MOTIF.keys())))
    motifs = [CHIP_TO_MOTIF[tf] for tf in tf_targets]
    
    # divide chipseq peaks into gold_standard and nongold for each tf under a cell type
    fimodir = "{0}/task_{1}".format(atac_root, task_idx)
    atacpeaks = "{}/peaks.bed".format(fimodir)
    #print(fimodir)
    #print(atacpeaks)
    for tf, motif in zip(tf_targets, motifs):
        print(tf_biosample, tf, motif)
        chippeaks = "{0}/{1}".format(CHIP_PEAK_BED_ROOT, CHIP_PEAK_BEDS[tf_biosample][tf])
        instancedir="{}/task_{}/{}".format(instance_root, task_idx, motif)
        #print(chippeaks)
        print(instancedir)
        #print()
        
#         the_k = 0.25
#         instance_bed_dl = "Dl_k{}.bed".format(the_k)
#         instance_bed_nondl = "nonDl_k{}.bed".format(the_k)
#         perform_df, truth_df = compute_instance_calling_performance(
#             bedname='Fimo50.bed',
#             key_to_optimize='1-cdf',
#             instancedir=instancedir,
#             atac_bed=atacpeaks,
#             verbose=True
#         )


#         tmp = np.load("{}/pval_pval.npz".format(instancedir), allow_pickle=True)
#         check_performance_competition(tmp.f.a.tolist(), tmp.f.b.tolist(), tmp.f.c.tolist(),
#                                       info="{}, {}, {}".format(task_idx, tf_biosample, motif),
#                                       selected_key=None
#                                      )
        
        best_key = decide_optimum_threshold(instancedir)  
        tmp = np.load("{}/pval_pval.npz".format(instancedir), allow_pickle=True)
        check_performance_competition(tmp.f.a.tolist(), tmp.f.b.tolist(), tmp.f.c.tolist(),
                                     info="{}, {}, {}".format(task_idx, tf_biosample, motif),
                                      selected_key=best_key
                                    )
        plot_various_instance_scores(instancedir, fimodir, motif, selected_key=best_key)

        #plot_various_instance_scores(instancedir, fimodir, motif)
        
    return None
from ast import literal_eval


def prepare_verification_data():
    total_zeros = 0
    for key, tf_biosamples in OC_TO_TF_SAMPLES.items():
        task_idx, sample_id, oc_sample = literal_eval(key)
        #if task_idx in [140, 142]:
        #    continue
        print(task_idx, oc_sample, tf_biosamples, len(tf_biosamples))
        total_zeros = prepare_verification_data_per_cell(task_idx=task_idx, 
                                           tf_biosample=tf_biosamples[0],
                                           the_k=PER_BASE_BACKGROUND_IMPORTANCES[task_idx]
                                          )
        #break
    return None

start = time.time()
prepare_verification_data()
end = time.time()
print((end-start)/60)

140 Bone marrow derived pre-B cell ['bone marrow'] 1
bone marrow CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_140/Ctcf

142 bone marrow-derived macrophages (Unstimulated) ['bone marrow macrophage'] 1
bone marrow macrophage CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_142/Ctcf

143 bone marrow-derived macrophages(Lipid A stimulation 120min) ['bone marrow macrophage'] 1
bone marrow macrophage CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_143/Ctcf

177 Cortical excitatory neurons ['cortical plate'] 1
cortical plate CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_177/Ctcf

180 Thymus MECs (Aire+/+) ['thymus'] 1
thymus CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_180/Ctcf

189 Megakaryocyte primary cell ['megakaryocyte'] 1
megakaryocyte FLI1-mouse Fli1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_189/Fli1

192 Erythroblast primary cell ['erythroblast'] 1
erythroblast GATA1-mouse Gata1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_192/Gata1

193 Megakaryocyte progenitor cell ['megakaryocyte'] 1
megakaryocyte FLI1-mouse Fli1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_193/Fli1

202 Liver (Adult 8 weeks) ['liver'] 1
liver CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_202/Ctcf

liver EP300-mouse Ep300
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_202/Ep300

dlk = fimo_df[fimo_df['deeplift_cdf']>0.9]
nondlk = fimo_df[fimo_df['deeplift_cdf']<=0.9]

from matplotlib import pyplot as plt
fig, axes = plt.subplots(1, 2, figsize=(16,6))
ax_pdf = axes[0]
ax_cdf = axes[1]

key_to_show = 'sum_deeplift_0'
ax_pdf.hist(dlk[key_to_show].values, density=True, histtype='stepfilled',
        alpha=0.6, bins=60, label='high')
# plot the scores for an example sequence of a as well
ax_pdf.hist(nondlk[key_to_show].values, density=True, histtype='stepfilled',
        alpha=0.6, bins=60, label='low')
ax_pdf.legend(loc='best', frameon=False)
ax_pdf.legend(loc='best', frameon=False)

ax_cdf.hist(dlk[key_to_show].values,  cumulative=True, density=True, histtype='bar',
        alpha=0.6, bins=60, label='high')
# plot the scores for an example sequence of a as well
ax_cdf.hist(nondlk[key_to_show].values,  cumulative=True, density=True, histtype='bar',
        alpha=0.6, bins=60, label='low')
ax_cdf.legend(loc='best', frameon=False)
plt.show()

# revise performance measurements with changing the probability metric

# revise fimo50
# revise positional distribution and cdfs for a bed


# zcat chippeaks.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a ../../../atac_specific_hits/task_140/peaks.bed -b stdin -wb -wa -v > nongold3.bed



# zcat chippeaks.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a stdin -b ../../../atac_specific_hits/task_140/peaks.bed -wb -wa > gold.bed
# zcat /mnt/lab_data2/msharmin/tf-mouse-atlas/chip_idrs/ENCFF806PDR.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a stdin -b atac_specific_hits/task_140/peaks.bed -wb > atac_various_hits/task_140/Ctcf/gold.bed

b = np.sum(null_scores, axis=2)
a = np.sum(deeplift_scores, axis=2)
a.shape, b.shape

((174993, 1000), (174993, 200))

np.argmin(cdf_scores[:,5]), np.min(cdf_scores[:,5])

(171195, 5.979713769975963e-05)

i = [0, 1]
c = b[0].flatten()
print(c.shape)
shape, mean, stdev = t.fit(c)
kstat, kval = stats.kstest(c, 't', args=[shape, mean, stdev])
# mean, stdev = norm.fit(c)
# kstat, kval = stats.kstest(c, 'norm', args=[ mean, stdev])
# shape, mean, stdev = gamma.fit(c)
# kstat, kval = stats.kstest(c, 'gamma', args=[shape,  mean, stdev])

print(kstat, kval)

(200,)
0.045556525408073756 0.8008351657723536

from matlas.fimo_hits import plot_fitted_dist_for_importance
plot_fitted_dist_for_importance(c, a[0], shape, mean, stdev)

# scores_ = [np.sum(j) for j in scores[0][300:700]]
# shape, mean, stdev = t.fit(scores_)
# kstat, kval = stats.kstest(scores_, 't', args=[shape, mean, stdev])
# print('kstat:', kstat, ' kval:', kval)

from scipy import stats

from scipy.stats import *
shape, mean, stdev = t.fit(b)
print(shape, mean, stdev)
kstat, kval = stats.kstest(b, 't', args=[shape, mean, stdev])
print('t sitribution', kstat, kval)
kstat, kval = stats.kstest(b, 'norm')
print('normal sitribution', kstat, kval)

1.5123294107168623 0.0008743328517088086 0.005880160282513114
t sitribution 0.06499847605991883 0.0
normal sitribution 0.48375382933815286 0.0

#get the scores for all the windows, take per_base_imp distribution
# nonagnostic_out = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/Ctcf.bed.gz"
# df = pd.read_csv(nonagnostic_out, sep="\t")
# selected_peaks = np.sort(np.array(list(set(df['seqno'].values))))
# selected_null_scores = hyp_scores[selected_peaks,:,:]
# selected_null_scores.shape
# atacdf = pd.read_csv("/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/peaks.bed", header=None, sep="\t")
# #selected_atacs = atacdf.loc[selected_peaks,:]
# #selected_atacs.to_csv("/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/peaks_Ctcf.bed", 
#                       #header=None, index=None, sep="\t")

from matlas.render_report import render_cell_tf_reports_in_parallel

render_cell_tf_reports_in_parallel()

# call dichotomize_chipseq_for_gold_standard for each tf under MEL
# call get_deeplift_passed_fimo_hits for each tf under MEL
# call get_peaks_within_nb for each tf under MEL
# for a tf: collect_distance_info, plot_distances_from_peaks
# for a tf: collect_performance_info, plot_performance_info
# for a tf: compute_joint_chip_matrix, plot_joint_chip_heatmap
# for a tf: load the png file

from matlas.fimo_hits import combine_motif_dfs

logdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf"

outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_{}".format(273)
nonagnostic_out = "{0}/{1}.bed.gz".format(outdir, "Adnp2")
df_dict = combine_motif_dfs(logdir, outdir, 
                      combine_func="collect_motif_df",
                      )
df = pd.concat([df_dict[key] for key in df_dict.keys()])

#nonz_dfs = combine_motif_dfs(logdir, outdir, zscored=False)

from matlas.fimo_hits import plot_motif_scores

motifs = ['Spi1', 'Ctcf', 'Gata1']
turn = 0
fig, axes = None, None
for key in motifs:
    fig, axes = plot_motif_scores(df_dict[key], title=key, turn=turn, axes=axes, fig=fig, kind='scatter')
    turn = (turn+1)%4

import seaborn as sns
combined_df = pd.concat([df_dict[key] for key in df_dict.keys()])
sns.jointplot(x="avg_deeplift", y="fimo", data=combined_df, s=1, marginal_kws=dict(bins=100));

df_dict['Spi1'][:5]

from matlas.render_report import render_test_plots
render_test_plots(nbname="FimoVsDeepliftReport")

gencode_file = '/mnt/data/annotations/by_release/mm10/GENCODE_ann/gencode.vM6.annotation.gtf'
gencode = pd.read_table(gencode_file, sep='\t', header=None, comment='#')
genes = gencode[(gencode[0]!='chrM') & (gencode[2]=='gene')&(np.array(['protein_coding' in i for i in gencode[8].values]))]
gene_df = genes.loc[:, [0,3,4,8,5,6]]
gene_df = gene_df.sort_values([0,3])
gene_df.to_csv("{0}/genecode.bed.gz".format(modisco_dir), header=False, index=False, compression='gzip', sep="\t")
genes[:5]

	motif	fimo	deeplift	avg_deeplift
0	Spi1	13.9878	0.587939	0.053449
1	Spi1	15.0000	1.235800	0.065042
2	Spi1	14.7753	0.065585	0.003858
3	Spi1	14.2121	0.042971	0.002686
4	Spi1	15.8902	0.103061	0.009369

	0	1	2	3	4	5	6	7	8
6	chr1	HAVANA	gene	3205901	3671498	.	-	.	gene_id "ENSMUSG00000051951.5"; gene_type "pro...
74	chr1	HAVANA	gene	4290846	4409241	.	-	.	gene_id "ENSMUSG00000025900.8"; gene_type "pro...
104	chr1	HAVANA	gene	4490931	4497354	.	-	.	gene_id "ENSMUSG00000025902.11"; gene_type "pr...
227	chr1	HAVANA	gene	4773206	4785739	.	-	.	gene_id "ENSMUSG00000033845.11"; gene_type "pr...
298	chr1	HAVANA	gene	4807788	4848410	.	+	.	gene_id "ENSMUSG00000025903.12"; gene_type "pr...