In [6]:
%load_ext autoreload
%autoreload 2

import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
import pickle
import h5py
import glob
import time

from matlas.genome_data import *
from matlas.pwms import shorten_tfname, get_motifDB_id_maps
from matlas.matches import fig2vdom, vdom_pssm
#from basepair.modisco.results import ModiscoResult
from matlas.modisco_results import ModiscoResult


project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
modisco_root = "{0}/{1}/Naive_modisco2019".format(srv_root, data_name)

homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)
basset_root = "{0}/{1}/Naive_basset_motifs".format(srv_root, data_name)
reportfile= "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
In [2]:
from matlas.fimo_hits import load_deeplift_data
task_idx = 140
oc_root="/mnt/lab_data2/msharmin/oc-atlas"
deeplift_hdf = "{0}/deeplift2019_h5/task_{1}-naivegw/summit.h5".format(
    oc_root, task_idx)
deeplift_data = load_deeplift_data(deeplift_hdf, keys=['one_hot', 'scores', 'shuffled_scores'])
hyp_scores = np.copy(deeplift_data['shuffled_scores'][:, 400:600, :])
b = np.sum(hyp_scores, axis=2)

# scores = np.copy(deeplift_data['scores'])
# a = np.sum(scores, axis=2)

c = deeplift_data['one_hot']
In [3]:
#for each seqno set a bin-number-gc calculation
from collections import Counter
c = np.copy(deeplift_data['one_hot'])[:, 400:600, :]
gc = lambda t: int(round(np.sum(t[:, 1:3]==1)/t.shape[0]*100))
bin_nos = np.array([gc(x) for x in c])
bins = sorted(Counter(bin_nos).items())
In [4]:
bins
Out[4]:
[(22, 2),
 (24, 8),
 (25, 5),
 (26, 24),
 (27, 7),
 (28, 91),
 (29, 41),
 (30, 187),
 (31, 106),
 (32, 497),
 (33, 225),
 (34, 874),
 (35, 414),
 (36, 1601),
 (37, 693),
 (38, 2761),
 (39, 1122),
 (40, 4088),
 (41, 1736),
 (42, 6152),
 (43, 2378),
 (44, 7936),
 (45, 3074),
 (46, 9830),
 (47, 3580),
 (48, 11203),
 (49, 3824),
 (50, 11678),
 (51, 3843),
 (52, 11098),
 (53, 3563),
 (54, 6475),
 (55, 5933),
 (56, 7872),
 (57, 4338),
 (58, 3818),
 (59, 1690),
 (60, 4472),
 (61, 1302),
 (62, 3617),
 (63, 1044),
 (64, 2977),
 (65, 958),
 (66, 2688),
 (67, 888),
 (68, 2507),
 (69, 820),
 (70, 2362),
 (71, 769),
 (72, 2099),
 (73, 625),
 (74, 1871),
 (75, 561),
 (76, 1498),
 (77, 430),
 (78, 1137),
 (79, 304),
 (80, 839),
 (81, 221),
 (82, 481),
 (83, 123),
 (84, 234),
 (85, 51),
 (86, 100),
 (87, 13),
 (88, 29),
 (89, 5),
 (90, 5),
 (92, 1)]
In [7]:
start = time.time()
b_more = None
for i in range(10):
    another_data = load_deeplift_data(deeplift_hdf, keys=['shuffled_scores'], shuffled_sets=[i])
    b_1 =  np.sum(another_data['shuffled_scores'][:, 400:600, :], axis=2)
    if b_more is None:
        b_more = np.expand_dims(b_1, 1)
    else:
        b_more = np.concatenate((b_more, np.expand_dims(b_1, 1)), axis=1)
    #print(b_more.shape)
end = time.time()
print((end-start)/60)
2.268065563837687
In [211]:
np.array_equal(b_more, b_1)
Out[211]:
True
In [16]:
dist_fns = OrderedDict()
for i, bin_ in enumerate(bins):
    print(bin_)
    x = np.squeeze(b_more[np.argwhere(bin_nos==bin_[0])]).flatten()

    ecdf = ECDF(x, side='right')
    inverted_edf = interp1d(ecdf.y, ecdf.x)
    dist_fns[bin_[0]] = x
#     dist_fns[bin_[0]]['ecdf'] = ecdf
#     dist_fns[bin_[0]]['inverse_ecdf'] = inverted_edf
#dist_fns
(22, 2)
(24, 8)
(25, 5)
(26, 24)
(27, 7)
(28, 91)
(29, 41)
(30, 187)
(31, 106)
(32, 497)
(33, 225)
(34, 874)
(35, 414)
(36, 1601)
(37, 693)
(38, 2761)
(39, 1122)
(40, 4088)
(41, 1736)
(42, 6152)
(43, 2378)
(44, 7936)
(45, 3074)
(46, 9830)
(47, 3580)
(48, 11203)
(49, 3824)
(50, 11678)
(51, 3843)
(52, 11098)
(53, 3563)
(54, 6475)
(55, 5933)
(56, 7872)
(57, 4338)
(58, 3818)
(59, 1690)
(60, 4472)
(61, 1302)
(62, 3617)
(63, 1044)
(64, 2977)
(65, 958)
(66, 2688)
(67, 888)
(68, 2507)
(69, 820)
(70, 2362)
(71, 769)
(72, 2099)
(73, 625)
(74, 1871)
(75, 561)
(76, 1498)
(77, 430)
(78, 1137)
(79, 304)
(80, 839)
(81, 221)
(82, 481)
(83, 123)
(84, 234)
(85, 51)
(86, 100)
(87, 13)
(88, 29)
(89, 5)
(90, 5)
(92, 1)
In [38]:
start = time.time()
bin_nos, dist_fns = pickle.load(open("/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats/bin_fns_140", 'rb'))
#tmp2 = pickle.load(open("/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats/bin_fns_test2.p", 'rb'))
#pickle.dump([bin_nos, dist_fns], open("/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats/bin_fns_test2.p", "wb"))
end = time.time()
print((end-start)/60)
0.057411026954650876
In [46]:
bin_nos.shape
Out[46]:
(157798,)
In [11]:
#compute the distribution under a bin: extract the shuffled seqs, extract the middle 200bps and get the distribution
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.interpolate import interp1d
from scipy.stats import *
from scipy import stats
from random import sample
from matplotlib import pyplot as plt

debug = 4
for i, bin_ in enumerate(bins):
    #x = b[np.argwhere(bin_nos==bin_[0])]
    x = np.squeeze(b_more[np.argwhere(bin_nos==bin_[0])]).flatten()
    #x = sample(list(x), 5000)
    ecdf = ECDF(x, side='right') #fit
    inverted_edf = interp1d(ecdf.y, ecdf.x)
    print(bin_, x.shape, np.percentile(x, 99.95), 
          np.percentile(sample(list(x), min(5000, len(x))), 99.95),
          inverted_edf(0.9995)
          
         )
    #k_val_in_bg = np.percentile(x, my_k)#my_k is the k I want to optimize, k_val_in_bg is k to be cut into high and low
    continue
    if i%2==0:
        fig, axes = plt.subplots(1, 2, figsize=(16,6))
        ax_ = axes[0]
    else:
        ax_ = axes[1]
    ax_.plot(ecdf.x, ecdf.y, c='b', label='ecdf', linestyle=':')
    p = np.linspace(0, 1)
    ax_.plot(inverted_edf(p), p, c='r', label='interpolated', linestyle='--')
    ax_.plot([np.percentile(x, (i*0.1)) for i in range(0,1001,1)], 
             [(i*0.001) for i in range(0,1001,1)], c='k', label='percentile', linestyle='-.')
    ax_.set_title("{}, {}, {}, {}, {}".format(bin_[0], bin_[1], 
                                              np.percentile(x, 99.95),
                                              np.percentile(sample(list(x), min(5000, len(x))), 99.95),
                                              inverted_edf(0.9995)
                                             ))
    #[(i*0.001) for i in range(0,1001,1)]
    debug -= 1
    if debug==0:
        break
    #break
    

# x = np.linspace(0.1, 1)
# y = inverted_edf(x)
# plt.plot(x, y, 'ro', x, y, 'b-')
# plt.show()
In [161]:
#[(i*0.001) for i in range(0,1001,1)]
#ecdf.y.shape, ecdf.x.shape, x.shape
np.percentile(x, 99.5), inverted_edf(0.995)
Out[161]:
(0.047659593642922084, array(0.04762004))
In [229]:
# [np.percentile(x, (i*0.1)) for i in range(0,1001,1)]
In [230]:
# np.sort(x)
In [50]:
from matlas.fimo_hits import collect_motif_df
fimo_df = collect_motif_df("/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_specific_hits/task_140/Ctcf.bed.gz")
bin_nos, dist_values = pickle.load(open("/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats/bin_fns_140", 'rb'))
bins = sorted(Counter(bin_nos).items())
dist_fns = OrderedDict()
for bin_, x in dist_values.items():
    ecdf = ECDF(x, side='right')
    dist_fns[bin_] = ecdf
In [51]:
p_vals = []
for i, instance in fimo_df.iterrows():
    seqno = instance.seqno
    start = instance.seqstart #to get the deeplift score
    end = instance.seqend
    bin_ = bin_nos[seqno]
    mean_deeplift = instance.mean_deeplift
    p_val = dist_fns[bin_](mean_deeplift)
    p_vals.append(p_val)
In [140]:
from matlas.dlutils import load_deeplift_data
from matlas.fimo_hits import (dichotomize_chipseq_for_gold_standard, 
                              #assign_significance_on_fimo_hits,
                              retain_fimo_hits_within_nb,
                              get_deeplift_passed_fimo_hits,
                              compute_instance_calling_performance,
                              optimize_performance_info,
                              check_performance_competition,
                              collect_motif_df,
                              plot_various_instance_scores,
                              decide_optimum_threshold,
                              collect_distance_info,
                              compute_distances_from_peaks,
                              plot_positional_information,
                              PEAK_TYPES
                             )



def prepare_verification_data_per_cell(
    task_idx=273,
    tf_biosample = "MEL cell line",
    the_k=0.05627951420283896,
    atac_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_specific_hits",
    instance_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits",
    deeplift_root="/mnt/lab_data2/msharmin/oc-atlas/deeplift2019_h5",
    background_files_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats",
    agnostic_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/agnostic_hits"
):

    tf_targets = sorted(set(CHIP_PEAK_BEDS[tf_biosample].keys()).intersection(set(CHIP_TO_MOTIF.keys())))
    motifs = [CHIP_TO_MOTIF[tf] for tf in tf_targets]
    
    # divide chipseq peaks into gold_standard and nongold for each tf under a cell type
    fimodir = "{0}/task_{1}".format(atac_root, task_idx)
    atacpeaks = "{}/peaks.bed".format(fimodir)
    #print(fimodir)
    #print(atacpeaks)
    for tf, motif in zip(tf_targets, motifs):
        print(tf_biosample, tf, motif)
        chippeaks = "{0}/{1}".format(CHIP_PEAK_BED_ROOT, CHIP_PEAK_BEDS[tf_biosample][tf])
        instancedir="{}/task_{}/{}".format(instance_root, task_idx, motif)
        #print(chippeaks)
        print(instancedir)
        #print()
        
#         the_k = 0.25
#         instance_bed_dl = "Dl_k{}.bed".format(the_k)
#         instance_bed_nondl = "nonDl_k{}.bed".format(the_k)
#         perform_df, truth_df = compute_instance_calling_performance(
#             bedname='Fimo50.bed',
#             key_to_optimize='1-cdf',
#             instancedir=instancedir,
#             atac_bed=atacpeaks,
#             verbose=True
#         )


#         tmp = np.load("{}/pval_pval.npz".format(instancedir), allow_pickle=True)
#         check_performance_competition(tmp.f.a.tolist(), tmp.f.b.tolist(), tmp.f.c.tolist(),
#                                       info="{}, {}, {}".format(task_idx, tf_biosample, motif),
#                                       selected_key=None
#                                      )
        
        best_key = decide_optimum_threshold(instancedir)  
        tmp = np.load("{}/pval_pval.npz".format(instancedir), allow_pickle=True)
        check_performance_competition(tmp.f.a.tolist(), tmp.f.b.tolist(), tmp.f.c.tolist(),
                                     info="{}, {}, {}".format(task_idx, tf_biosample, motif),
                                      selected_key=best_key
                                    )
        plot_various_instance_scores(instancedir, fimodir, motif, selected_key=best_key)

        #plot_various_instance_scores(instancedir, fimodir, motif)
        
    return None
from ast import literal_eval


def prepare_verification_data():
    total_zeros = 0
    for key, tf_biosamples in OC_TO_TF_SAMPLES.items():
        task_idx, sample_id, oc_sample = literal_eval(key)
        #if task_idx in [140, 142]:
        #    continue
        print(task_idx, oc_sample, tf_biosamples, len(tf_biosamples))
        total_zeros = prepare_verification_data_per_cell(task_idx=task_idx, 
                                           tf_biosample=tf_biosamples[0],
                                           the_k=PER_BASE_BACKGROUND_IMPORTANCES[task_idx]
                                          )
        #break
    return None

start = time.time()
prepare_verification_data()
end = time.time()
print((end-start)/60)
140 Bone marrow derived pre-B cell ['bone marrow'] 1
bone marrow CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_140/Ctcf
142 bone marrow-derived macrophages (Unstimulated) ['bone marrow macrophage'] 1
bone marrow macrophage CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_142/Ctcf
143 bone marrow-derived macrophages(Lipid A stimulation 120min) ['bone marrow macrophage'] 1
bone marrow macrophage CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_143/Ctcf
177 Cortical excitatory neurons ['cortical plate'] 1
cortical plate CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_177/Ctcf
180 Thymus MECs (Aire+/+) ['thymus'] 1
thymus CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_180/Ctcf
189 Megakaryocyte primary cell ['megakaryocyte'] 1
megakaryocyte FLI1-mouse Fli1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_189/Fli1
192 Erythroblast primary cell ['erythroblast'] 1
erythroblast GATA1-mouse Gata1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_192/Gata1
193 Megakaryocyte progenitor cell ['megakaryocyte'] 1
megakaryocyte FLI1-mouse Fli1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_193/Fli1
202 Liver (Adult 8 weeks) ['liver'] 1
liver CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_202/Ctcf
liver EP300-mouse Ep300
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_202/Ep300
liver GATA4-mouse Gata4
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_202/Gata4
203 Lung (adult-8wks) ['lung'] 1
lung CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_203/Ctcf
lung EP300-mouse Ep300
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_203/Ep300
210 mouse spleen B cells, CD43-,CD11b-         (immature B cells1) ['spleen'] 1
220 Fibroblast (adult-8wks) ['embryonic fibroblast'] 1
embryonic fibroblast CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_220/Ctcf
223 Kidney(adult-8wks) ['kidney'] 1
kidney CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_223/Ctcf
224 Liver(adult-8wks) ['liver'] 1
liver CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_224/Ctcf
liver EP300-mouse Ep300
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_224/Ep300
liver GATA4-mouse Gata4
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_224/Gata4
225 Liver(E14.5) ['liver'] 1
liver CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_225/Ctcf
liver EP300-mouse Ep300
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_225/Ep300
liver GATA4-mouse Gata4
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_225/Gata4
227 Lung(adult-8wks) ['lung'] 1
lung CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_227/Ctcf
lung EP300-mouse Ep300
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_227/Ep300
229 Mouse lung (E14.5) ['lung'] 1
lung CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_229/Ctcf
lung EP300-mouse Ep300
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_229/Ep300
230 Heart (adult-8wks) ['heart'] 1
heart CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_230/Ctcf
heart EP300-mouse Ep300
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_230/Ep300
235 mixed embryonic mouse retina(E14.5) ['embryonic fibroblast'] 1
embryonic fibroblast CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_235/Ctcf
236 Cerebellum ['cerebellum'] 1
cerebellum CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_236/Ctcf
245 mixed embryonic mouse forebrain(E14.5) ['forebrain'] 1
forebrain CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_245/Ctcf
248 mixed embryonic mouse hindbrain(E11.5) ['hindbrain'] 1
hindbrain CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_248/Ctcf
249 hindbrain embryo(E14.5) ['hindbrain'] 1
hindbrain CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_249/Ctcf
250 mixed embryonic mouse midbrain(E14.5) ['midbrain'] 1
midbrain CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_250/Ctcf
251 midbrain embryo(E11.5) ['midbrain'] 1
midbrain CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_251/Ctcf
263 Large Intestine (adult-8wks) ['intestine'] 1
intestine CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_263/Ctcf
266 spleen(adult-8wks) ['spleen'] 1
271 CH12.LX cell line ['CH12.LX'] 1
CH12.LX BHLHE40-mouse Bhlhe40
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Bhlhe40
CH12.LX CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Ctcf
CH12.LX E2F4-mouse E2f4
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/E2f4
CH12.LX ELF1-mouse Elf1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Elf1
CH12.LX EP300-mouse Ep300
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Ep300
CH12.LX ETS1-mouse Ets1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Ets1
CH12.LX GABPA-mouse Gabpa
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Gabpa
CH12.LX IRF4-mouse Irf4
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Irf4
CH12.LX JUN-mouse Jun
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Jun
CH12.LX JUND-mouse Jund
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Jund
CH12.LX MAFK-mouse Mafk
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Mafk
CH12.LX MAX-mouse Max
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Max
CH12.LX MAZ-mouse Maz
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Maz
CH12.LX MEF2A-mouse Mef2a
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Mef2a
CH12.LX MXI1-mouse Mxi1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Mxi1
CH12.LX MYC-mouse Myc
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Myc
CH12.LX NRF1-mouse Nrf1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Nrf1
CH12.LX PAX5-mouse Pax5
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Pax5
CH12.LX TBP-mouse Tbp
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Tbp
CH12.LX TCF12-mouse Tcf12
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Tcf12
CH12.LX USF1-mouse Usf1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Usf1
CH12.LX USF2-mouse Usf2
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Usf2
CH12.LX ZKSCAN1-mouse Zkscan1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_271/Zkscan1
273 MEL cell line ['MEL cell line'] 1
MEL cell line BHLHE40-mouse Bhlhe40
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Bhlhe40
MEL cell line CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Ctcf
MEL cell line E2F4-mouse E2f4
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/E2f4
MEL cell line ELF1-mouse Elf1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Elf1
MEL cell line EP300-mouse Ep300
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Ep300
MEL cell line ETS1-mouse Ets1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Ets1
MEL cell line GABPA-mouse Gabpa
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Gabpa
MEL cell line GATA1-mouse Gata1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Gata1
MEL cell line JUND-mouse Jund
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Jund
MEL cell line MAFK-mouse Mafk
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Mafk
MEL cell line MAX-mouse Max
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Max
MEL cell line MAZ-mouse Maz
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Maz
MEL cell line MEF2A-mouse Mef2a
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Mef2a
MEL cell line MXI1-mouse Mxi1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Mxi1
MEL cell line MYB-mouse Myb
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Myb
MEL cell line MYC-mouse Myc
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Myc
MEL cell line NRF1-mouse Nrf1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Nrf1
MEL cell line TAL1-mouse Tal1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Tal1
MEL cell line TBP-mouse Tbp
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Tbp
MEL cell line TCF12-mouse Tcf12
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Tcf12
MEL cell line USF1-mouse Usf1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Usf1
MEL cell line USF2-mouse Usf2
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Usf2
MEL cell line ZKSCAN1-mouse Zkscan1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_273/Zkscan1
274 MEL cell line treated with 125 uM zinc dichloride for 24 hours ['MEL cell line'] 1
MEL cell line BHLHE40-mouse Bhlhe40
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Bhlhe40
MEL cell line CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Ctcf
MEL cell line E2F4-mouse E2f4
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/E2f4
MEL cell line ELF1-mouse Elf1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Elf1
MEL cell line EP300-mouse Ep300
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Ep300
MEL cell line ETS1-mouse Ets1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Ets1
MEL cell line GABPA-mouse Gabpa
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Gabpa
MEL cell line GATA1-mouse Gata1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Gata1
MEL cell line JUND-mouse Jund
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Jund
MEL cell line MAFK-mouse Mafk
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Mafk
MEL cell line MAX-mouse Max
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Max
MEL cell line MAZ-mouse Maz
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Maz
MEL cell line MEF2A-mouse Mef2a
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Mef2a
MEL cell line MXI1-mouse Mxi1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Mxi1
MEL cell line MYB-mouse Myb
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Myb
MEL cell line MYC-mouse Myc
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Myc
MEL cell line NRF1-mouse Nrf1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Nrf1
MEL cell line TAL1-mouse Tal1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Tal1
MEL cell line TBP-mouse Tbp
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Tbp
MEL cell line TCF12-mouse Tcf12
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Tcf12
MEL cell line USF1-mouse Usf1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Usf1
MEL cell line USF2-mouse Usf2
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Usf2
MEL cell line ZKSCAN1-mouse Zkscan1
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_274/Zkscan1
44 macrophage  ['bone marrow macrophage'] 1
bone marrow macrophage CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_44/Ctcf
52 Small Intestine ['small intestine'] 1
small intestine CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_52/Ctcf
53 Large Intestine ['intestine'] 1
intestine CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_53/Ctcf
54 Lung ['lung'] 1
lung CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_54/Ctcf
lung EP300-mouse Ep300
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_54/Ep300
56 Heart ['heart'] 1
heart CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_56/Ctcf
heart EP300-mouse Ep300
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_56/Ep300
60 testis ['testis'] 1
testis CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_60/Ctcf
63 stomach ['stomach'] 1
stomach CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_63/Ctcf
stomach EP300-mouse Ep300
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_63/Ep300
67 Brain- Olfactory Bulb ['olfactory bulb'] 1
olfactory bulb CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_67/Ctcf
72 Brain- Cerebellum ['cerebellum'] 1
cerebellum CTCF-mouse Ctcf
/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_pval_hits/task_72/Ctcf
15.203626275062561
In [19]:
dlk = fimo_df[fimo_df['deeplift_cdf']>0.9]
nondlk = fimo_df[fimo_df['deeplift_cdf']<=0.9]

from matplotlib import pyplot as plt
fig, axes = plt.subplots(1, 2, figsize=(16,6))
ax_pdf = axes[0]
ax_cdf = axes[1]

key_to_show = 'sum_deeplift_0'
ax_pdf.hist(dlk[key_to_show].values, density=True, histtype='stepfilled',
        alpha=0.6, bins=60, label='high')
# plot the scores for an example sequence of a as well
ax_pdf.hist(nondlk[key_to_show].values, density=True, histtype='stepfilled',
        alpha=0.6, bins=60, label='low')
ax_pdf.legend(loc='best', frameon=False)
ax_pdf.legend(loc='best', frameon=False)

ax_cdf.hist(dlk[key_to_show].values,  cumulative=True, density=True, histtype='bar',
        alpha=0.6, bins=60, label='high')
# plot the scores for an example sequence of a as well
ax_cdf.hist(nondlk[key_to_show].values,  cumulative=True, density=True, histtype='bar',
        alpha=0.6, bins=60, label='low')
ax_cdf.legend(loc='best', frameon=False)
plt.show()
In [ ]:
 
In [ ]:
 
In [ ]:
# revise performance measurements with changing the probability metric

# revise fimo50
# revise positional distribution and cdfs for a bed


# zcat chippeaks.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a ../../../atac_specific_hits/task_140/peaks.bed -b stdin -wb -wa -v > nongold3.bed



# zcat chippeaks.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a stdin -b ../../../atac_specific_hits/task_140/peaks.bed -wb -wa > gold.bed
# zcat /mnt/lab_data2/msharmin/tf-mouse-atlas/chip_idrs/ENCFF806PDR.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a stdin -b atac_specific_hits/task_140/peaks.bed -wb > atac_various_hits/task_140/Ctcf/gold.bed
In [117]:
b = np.sum(null_scores, axis=2)
a = np.sum(deeplift_scores, axis=2)
a.shape, b.shape
Out[117]:
((174993, 1000), (174993, 200))
In [118]:
np.argmin(cdf_scores[:,5]), np.min(cdf_scores[:,5])
Out[118]:
(171195, 5.979713769975963e-05)
In [150]:
i = [0, 1]
c = b[0].flatten()
print(c.shape)
shape, mean, stdev = t.fit(c)
kstat, kval = stats.kstest(c, 't', args=[shape, mean, stdev])
# mean, stdev = norm.fit(c)
# kstat, kval = stats.kstest(c, 'norm', args=[ mean, stdev])
# shape, mean, stdev = gamma.fit(c)
# kstat, kval = stats.kstest(c, 'gamma', args=[shape,  mean, stdev])

print(kstat, kval)
(200,)
0.045556525408073756 0.8008351657723536
In [191]:
from matlas.fimo_hits import plot_fitted_dist_for_importance
plot_fitted_dist_for_importance(c, a[0], shape, mean, stdev)
In [11]:
# scores_ = [np.sum(j) for j in scores[0][300:700]]
# shape, mean, stdev = t.fit(scores_)
# kstat, kval = stats.kstest(scores_, 't', args=[shape, mean, stdev])
# print('kstat:', kstat, ' kval:', kval)

from scipy import stats

from scipy.stats import *
shape, mean, stdev = t.fit(b)
print(shape, mean, stdev)
kstat, kval = stats.kstest(b, 't', args=[shape, mean, stdev])
print('t sitribution', kstat, kval)
kstat, kval = stats.kstest(b, 'norm')
print('normal sitribution', kstat, kval)
1.5123294107168623 0.0008743328517088086 0.005880160282513114
t sitribution 0.06499847605991883 0.0
normal sitribution 0.48375382933815286 0.0
In [ ]:
 
In [28]:
#get the scores for all the windows, take per_base_imp distribution
# nonagnostic_out = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/Ctcf.bed.gz"
# df = pd.read_csv(nonagnostic_out, sep="\t")
# selected_peaks = np.sort(np.array(list(set(df['seqno'].values))))
# selected_null_scores = hyp_scores[selected_peaks,:,:]
# selected_null_scores.shape
# atacdf = pd.read_csv("/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/peaks.bed", header=None, sep="\t")
# #selected_atacs = atacdf.loc[selected_peaks,:]
# #selected_atacs.to_csv("/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/peaks_Ctcf.bed", 
#                       #header=None, index=None, sep="\t")
In [ ]:
 
In [42]:
from matlas.render_report import render_cell_tf_reports_in_parallel

render_cell_tf_reports_in_parallel()
In [2]:
# call dichotomize_chipseq_for_gold_standard for each tf under MEL
# call get_deeplift_passed_fimo_hits for each tf under MEL
# call get_peaks_within_nb for each tf under MEL
# for a tf: collect_distance_info, plot_distances_from_peaks
# for a tf: collect_performance_info, plot_performance_info
# for a tf: compute_joint_chip_matrix, plot_joint_chip_heatmap
# for a tf: load the png file
In [2]:
from matlas.fimo_hits import combine_motif_dfs

logdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf"

outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_{}".format(273)
nonagnostic_out = "{0}/{1}.bed.gz".format(outdir, "Adnp2")
df_dict = combine_motif_dfs(logdir, outdir, 
                      combine_func="collect_motif_df",
                      )
df = pd.concat([df_dict[key] for key in df_dict.keys()])

#nonz_dfs = combine_motif_dfs(logdir, outdir, zscored=False)
In [93]:
from matlas.fimo_hits import plot_motif_scores

motifs = ['Spi1', 'Ctcf', 'Gata1']
turn = 0
fig, axes = None, None
for key in motifs:
    fig, axes = plot_motif_scores(df_dict[key], title=key, turn=turn, axes=axes, fig=fig, kind='scatter')
    turn = (turn+1)%4
In [87]:
import seaborn as sns
combined_df = pd.concat([df_dict[key] for key in df_dict.keys()])
sns.jointplot(x="avg_deeplift", y="fimo", data=combined_df, s=1, marginal_kws=dict(bins=100));
In [19]:
df_dict['Spi1'][:5]
Out[19]:
motif fimo deeplift avg_deeplift
0 Spi1 13.9878 0.587939 0.053449
1 Spi1 15.0000 1.235800 0.065042
2 Spi1 14.7753 0.065585 0.003858
3 Spi1 14.2121 0.042971 0.002686
4 Spi1 15.8902 0.103061 0.009369
In [3]:
from matlas.render_report import render_test_plots
render_test_plots(nbname="FimoVsDeepliftReport")
In [71]:
gencode_file = '/mnt/data/annotations/by_release/mm10/GENCODE_ann/gencode.vM6.annotation.gtf'
gencode = pd.read_table(gencode_file, sep='\t', header=None, comment='#')
genes = gencode[(gencode[0]!='chrM') & (gencode[2]=='gene')&(np.array(['protein_coding' in i for i in gencode[8].values]))]
gene_df = genes.loc[:, [0,3,4,8,5,6]]
gene_df = gene_df.sort_values([0,3])
gene_df.to_csv("{0}/genecode.bed.gz".format(modisco_dir), header=False, index=False, compression='gzip', sep="\t")
genes[:5]
Out[71]:
0 1 2 3 4 5 6 7 8
6 chr1 HAVANA gene 3205901 3671498 . - . gene_id "ENSMUSG00000051951.5"; gene_type "pro...
74 chr1 HAVANA gene 4290846 4409241 . - . gene_id "ENSMUSG00000025900.8"; gene_type "pro...
104 chr1 HAVANA gene 4490931 4497354 . - . gene_id "ENSMUSG00000025902.11"; gene_type "pr...
227 chr1 HAVANA gene 4773206 4785739 . - . gene_id "ENSMUSG00000033845.11"; gene_type "pr...
298 chr1 HAVANA gene 4807788 4848410 . + . gene_id "ENSMUSG00000025903.12"; gene_type "pr...
In [ ]: