In [1]:
%load_ext autoreload
%autoreload 2

import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
import pickle
import h5py
import glob

from matlas.genome_data import *
from matlas.pwms import shorten_tfname, get_motifDB_id_maps
from matlas.matches import fig2vdom, vdom_pssm
#from basepair.modisco.results import ModiscoResult
from matlas.modisco_results import ModiscoResult


project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
modisco_root = "{0}/{1}/Naive_modisco2019".format(srv_root, data_name)

homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)
basset_root = "{0}/{1}/Naive_basset_motifs".format(srv_root, data_name)
reportfile= "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"
load data from labcluster
Using TensorFlow backend.
TF-MoDISco is using the TensorFlow backend.
In [2]:
from matlas.dlutils import load_deeplift_data
from matlas.fimo_hits import (dichotomize_chipseq_for_gold_standard, 
                              assign_significance_on_fimo_hits,
                              retain_fimo_hits_within_nb,
                              optimize_performance_info,
                              check_performance_competition,
                              collect_motif_df,
                              plot_various_instance_scores,
                              decide_optimum_threshold,
                              collect_distance_info,
                              compute_distances_from_peaks,
                              plot_positional_information
                             )



def prepare_verification_data_per_cell(
    task_idx=273,
    tf_biosample = "MEL cell line",
    the_k=0.05627951420283896,
    atac_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_specific_hits",
    instance_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits",
    deeplift_root="/mnt/lab_data2/msharmin/oc-atlas/deeplift2019_h5",
    background_files_root="/mnt/lab_data2/msharmin/oc-atlas/mtf/t_stats"
):

    tf_targets = sorted(set(CHIP_PEAK_BEDS[tf_biosample].keys()).intersection(set(CHIP_TO_MOTIF.keys())))
    motifs = [CHIP_TO_MOTIF[tf] for tf in tf_targets]
    
    # divide chipseq peaks into gold_standard and nongold for each tf under a cell type
    fimodir = "{0}/task_{1}".format(atac_root, task_idx)
    atacpeaks = "{}/peaks.bed".format(fimodir)
    #print(fimodir)
    #print(atacpeaks)
    for tf, motif in zip(tf_targets, motifs):
        print(tf_biosample, tf, motif)
        chippeaks = "{0}/{1}".format(CHIP_PEAK_BED_ROOT, CHIP_PEAK_BEDS[tf_biosample][tf])
        instancedir="{}/task_{}/{}".format(instance_root, task_idx, motif)
        #print(chippeaks)
        #print(instancedir)
        #print()

        best_key = decide_optimum_threshold(instancedir)
        distances = collect_distance_info(best_key=best_key, instancedir=instancedir,
                                          null_key='deeplift_cdf', key_to_split='deeplift_cdf',
                                          fimobed="{}/{}.bed.gz".format(fimodir, motif),
                                          verbose=False
                                         )
        plot_positional_information(distances, motif)

    
        # compute_joint_chip_matrix and plot_joint_chip_matrix
        
        
    return None
from ast import literal_eval

def prepare_verification_data():
    total_zeros = 0
    for key, tf_biosamples in OC_TO_TF_SAMPLES.items():
        task_idx, sample_id, oc_sample = literal_eval(key)
        
        print(task_idx, oc_sample, tf_biosamples, len(tf_biosamples))
        total_zeros = prepare_verification_data_per_cell(task_idx=task_idx, 
                                           tf_biosample=tf_biosamples[0],
                                           the_k=PER_BASE_BACKGROUND_IMPORTANCES[task_idx]
                                          )
        #break
    return None


prepare_verification_data()
140 Bone marrow derived pre-B cell ['bone marrow'] 1
bone marrow CTCF-mouse Ctcf
142 bone marrow-derived macrophages (Unstimulated) ['bone marrow macrophage'] 1
bone marrow macrophage CTCF-mouse Ctcf
143 bone marrow-derived macrophages(Lipid A stimulation 120min) ['bone marrow macrophage'] 1
bone marrow macrophage CTCF-mouse Ctcf
177 Cortical excitatory neurons ['cortical plate'] 1
cortical plate CTCF-mouse Ctcf
180 Thymus MECs (Aire+/+) ['thymus'] 1
thymus CTCF-mouse Ctcf
189 Megakaryocyte primary cell ['megakaryocyte'] 1
megakaryocyte FLI1-mouse Fli1
192 Erythroblast primary cell ['erythroblast'] 1
erythroblast GATA1-mouse Gata1
193 Megakaryocyte progenitor cell ['megakaryocyte'] 1
megakaryocyte FLI1-mouse Fli1
202 Liver (Adult 8 weeks) ['liver'] 1
liver CTCF-mouse Ctcf
liver EP300-mouse Ep300
liver GATA4-mouse Gata4
203 Lung (adult-8wks) ['lung'] 1
lung CTCF-mouse Ctcf
lung EP300-mouse Ep300
210 mouse spleen B cells, CD43-,CD11b-         (immature B cells1) ['spleen'] 1
220 Fibroblast (adult-8wks) ['embryonic fibroblast'] 1
embryonic fibroblast CTCF-mouse Ctcf
223 Kidney(adult-8wks) ['kidney'] 1
kidney CTCF-mouse Ctcf
224 Liver(adult-8wks) ['liver'] 1
liver CTCF-mouse Ctcf
liver EP300-mouse Ep300
liver GATA4-mouse Gata4
225 Liver(E14.5) ['liver'] 1
liver CTCF-mouse Ctcf
liver EP300-mouse Ep300
liver GATA4-mouse Gata4
227 Lung(adult-8wks) ['lung'] 1
lung CTCF-mouse Ctcf
lung EP300-mouse Ep300
229 Mouse lung (E14.5) ['lung'] 1
lung CTCF-mouse Ctcf
lung EP300-mouse Ep300
230 Heart (adult-8wks) ['heart'] 1
heart CTCF-mouse Ctcf
heart EP300-mouse Ep300
235 mixed embryonic mouse retina(E14.5) ['embryonic fibroblast'] 1
embryonic fibroblast CTCF-mouse Ctcf
236 Cerebellum ['cerebellum'] 1
cerebellum CTCF-mouse Ctcf
245 mixed embryonic mouse forebrain(E14.5) ['forebrain'] 1
forebrain CTCF-mouse Ctcf
248 mixed embryonic mouse hindbrain(E11.5) ['hindbrain'] 1
hindbrain CTCF-mouse Ctcf
249 hindbrain embryo(E14.5) ['hindbrain'] 1
hindbrain CTCF-mouse Ctcf
250 mixed embryonic mouse midbrain(E14.5) ['midbrain'] 1
midbrain CTCF-mouse Ctcf
251 midbrain embryo(E11.5) ['midbrain'] 1
midbrain CTCF-mouse Ctcf
263 Large Intestine (adult-8wks) ['intestine'] 1
intestine CTCF-mouse Ctcf
266 spleen(adult-8wks) ['spleen'] 1
271 CH12.LX cell line ['CH12.LX'] 1
CH12.LX BHLHE40-mouse Bhlhe40
CH12.LX CTCF-mouse Ctcf
CH12.LX E2F4-mouse E2f4
CH12.LX ELF1-mouse Elf1
CH12.LX EP300-mouse Ep300
CH12.LX ETS1-mouse Ets1
CH12.LX GABPA-mouse Gabpa
CH12.LX IRF4-mouse Irf4
CH12.LX JUN-mouse Jun
CH12.LX JUND-mouse Jund
CH12.LX MAFK-mouse Mafk
CH12.LX MAX-mouse Max
CH12.LX MAZ-mouse Maz
CH12.LX MEF2A-mouse Mef2a
CH12.LX MXI1-mouse Mxi1
CH12.LX MYC-mouse Myc
CH12.LX NRF1-mouse Nrf1
CH12.LX PAX5-mouse Pax5
CH12.LX TBP-mouse Tbp
CH12.LX TCF12-mouse Tcf12
CH12.LX USF1-mouse Usf1
CH12.LX USF2-mouse Usf2
CH12.LX ZKSCAN1-mouse Zkscan1
273 MEL cell line ['MEL cell line'] 1
MEL cell line BHLHE40-mouse Bhlhe40
MEL cell line CTCF-mouse Ctcf
MEL cell line E2F4-mouse E2f4
MEL cell line ELF1-mouse Elf1
MEL cell line EP300-mouse Ep300
MEL cell line ETS1-mouse Ets1
MEL cell line GABPA-mouse Gabpa
MEL cell line GATA1-mouse Gata1
MEL cell line JUND-mouse Jund
MEL cell line MAFK-mouse Mafk
MEL cell line MAX-mouse Max
MEL cell line MAZ-mouse Maz
MEL cell line MEF2A-mouse Mef2a
MEL cell line MXI1-mouse Mxi1
MEL cell line MYB-mouse Myb
MEL cell line MYC-mouse Myc
MEL cell line NRF1-mouse Nrf1
MEL cell line TAL1-mouse Tal1
MEL cell line TBP-mouse Tbp
MEL cell line TCF12-mouse Tcf12
MEL cell line USF1-mouse Usf1
MEL cell line USF2-mouse Usf2
MEL cell line ZKSCAN1-mouse Zkscan1
274 MEL cell line treated with 125 uM zinc dichloride for 24 hours ['MEL cell line'] 1
MEL cell line BHLHE40-mouse Bhlhe40
MEL cell line CTCF-mouse Ctcf
MEL cell line E2F4-mouse E2f4
MEL cell line ELF1-mouse Elf1
MEL cell line EP300-mouse Ep300
MEL cell line ETS1-mouse Ets1
MEL cell line GABPA-mouse Gabpa
MEL cell line GATA1-mouse Gata1
MEL cell line JUND-mouse Jund
MEL cell line MAFK-mouse Mafk
MEL cell line MAX-mouse Max
MEL cell line MAZ-mouse Maz
MEL cell line MEF2A-mouse Mef2a
MEL cell line MXI1-mouse Mxi1
MEL cell line MYB-mouse Myb
MEL cell line MYC-mouse Myc
MEL cell line NRF1-mouse Nrf1
MEL cell line TAL1-mouse Tal1
MEL cell line TBP-mouse Tbp
MEL cell line TCF12-mouse Tcf12
MEL cell line USF1-mouse Usf1
MEL cell line USF2-mouse Usf2
MEL cell line ZKSCAN1-mouse Zkscan1
44 macrophage  ['bone marrow macrophage'] 1
bone marrow macrophage CTCF-mouse Ctcf
52 Small Intestine ['small intestine'] 1
small intestine CTCF-mouse Ctcf
53 Large Intestine ['intestine'] 1
intestine CTCF-mouse Ctcf
54 Lung ['lung'] 1
lung CTCF-mouse Ctcf
lung EP300-mouse Ep300
56 Heart ['heart'] 1
heart CTCF-mouse Ctcf
heart EP300-mouse Ep300
60 testis ['testis'] 1
testis CTCF-mouse Ctcf
63 stomach ['stomach'] 1
stomach CTCF-mouse Ctcf
stomach EP300-mouse Ep300
67 Brain- Olfactory Bulb ['olfactory bulb'] 1
olfactory bulb CTCF-mouse Ctcf
72 Brain- Cerebellum ['cerebellum'] 1
cerebellum CTCF-mouse Ctcf
In [ ]:
# def prepare_verification_data():
#     total_zeros = 0
#     for key, tf_biosamples in OC_TO_TF_SAMPLES.items():
#         task_idx, sample_id, oc_sample = literal_eval(key)
        
#         print(task_idx, oc_sample, tf_biosamples, len(tf_biosamples))
#         total_zeros = prepare_verification_data_per_cell(task_idx=task_idx, 
#                                            tf_biosample=tf_biosamples[0],
#                                            the_k=PER_BASE_BACKGROUND_IMPORTANCES[task_idx]
#                                           )
#         #break
#     return None


# prepare_verification_data()
In [12]:
from matlas.fimo_hits import compute_distances_from_peaks
#mdf = compute_distances_from_peaks()
mdf
bedtools intersect -a /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/gold.bed -b /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Fimo.bed -wb > /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Fimo.tp.r.bed
Out[12]:
peakchr peakstart peakend n point chr istart iend iname iscore istrand mean_deeplift mean_deeplift_0 deeplift_cdf imidpoint summit distance
0 chr19 3353359 3353360 13 69 chr19 3353359 3353360 138701 14.0000 - 0.000009 0.000009 0.506270 3353359 3353428 -69
1 chr19 3353359 3353361 13 69 chr19 3353359 3353361 138702 15.2241 + 0.000020 0.000020 0.506379 3353359 3353428 -69
2 chr9 92211224 92211235 92 180 chr9 92211224 92211235 281510 15.0690 - 0.061769 0.062867 0.947707 92211224 92211404 -180
3 chr9 92211225 92211235 92 180 chr9 92211225 92211235 281511 14.0000 + 0.069153 0.069153 0.963151 92211225 92211405 -180
4 chr3 97876403 97876413 133 180 chr3 97876403 97876413 178395 13.7121 - 0.018780 0.021136 0.890439 97876403 97876583 -180
5 chr3 97876403 97876414 133 180 chr3 97876403 97876414 178396 15.2069 + 0.015547 0.019214 0.807427 97876403 97876583 -180
6 chr4 57839659 57839671 143 180 chr4 57839659 57839671 192577 13.8991 + -0.006044 0.003068 0.483946 57839659 57839839 -180
7 chr2 64070549 64070561 155 180 chr2 64070549 64070561 152834 13.9358 - -0.001180 0.007046 0.504524 64070549 64070729 -180
8 chr2 64070730 64070742 155 180 chr2 64070730 64070742 152835 15.6239 + -0.003105 0.000162 0.421774 64070730 64070910 -180
9 chr2 36236976 36236986 164 180 chr2 36236976 36236986 149381 13.5455 - 0.006439 0.011360 0.501202 36236976 36237156 -180
10 chr5 142402937 142402938 170 180 chr5 142402937 142402948 219735 14.8966 + -0.001932 0.002020 0.373960 142402937 142403117 -180
11 chr9 63717308 63717320 188 180 chr9 63717308 63717320 278183 14.1009 - -0.000969 0.005634 0.519866 63717308 63717488 -180
12 chr1 138328353 138328363 220 180 chr1 138328353 138328363 16653 13.6515 + 0.025300 0.033887 0.708505 138328353 138328533 -180
13 chr7 89368032 89368044 228 180 chr7 89368032 89368044 249373 15.6239 + -0.007330 0.002958 0.286235 89368032 89368212 -180
14 chr14 31433515 31433526 229 0 chr14 31433515 31433526 83360 14.5690 + -0.001226 0.002116 0.498010 31433515 31433515 0
15 chr11 88719079 88719083 237 180 chr11 88719071 88719083 48572 14.9450 + 0.010161 0.015953 0.451415 88719071 88719259 -188
16 chr17 71204789 71204801 249 180 chr17 71204789 71204801 125370 14.9633 + -0.005869 0.001158 0.305810 71204789 71204969 -180
17 chr17 87055304 87055316 284 180 chr17 87055304 87055316 127151 14.6330 + -0.003392 0.002511 0.414718 87055304 87055484 -180
18 chr4 40723434 40723446 295 180 chr4 40723434 40723446 190625 14.3486 - 0.004248 0.009394 0.424002 40723434 40723614 -180
19 chrX 13347156 13347164 324 180 chrX 13347154 13347164 286367 13.6515 + 0.020639 0.027710 0.625309 13347154 13347336 -182
20 chr18 84914855 84914866 458 180 chr18 84914855 84914866 137955 14.5690 - 0.043586 0.046236 0.912073 84914855 84915035 -180
21 chr18 84914856 84914866 458 180 chr18 84914856 84914866 137956 13.6515 + 0.042969 0.045883 0.855519 84914856 84915036 -180
22 chr9 118506345 118506357 510 180 chr9 118506345 118506357 284463 14.0550 + 0.010347 0.012425 0.657760 118506345 118506525 -180
23 chr4 33189448 33189452 512 180 chr4 33189442 33189452 189632 13.6515 - 0.059905 0.059905 0.861427 33189442 33189628 -186
24 chr4 33189448 33189453 512 180 chr4 33189442 33189453 189633 14.5690 + 0.065274 0.065274 0.881431 33189442 33189628 -186
25 chr19 31891617 31891628 520 180 chr19 31891617 31891628 142109 14.6034 - -0.007655 0.000175 0.431695 31891617 31891797 -180
26 chr19 31891618 31891628 520 180 chr19 31891618 31891628 142110 13.7121 + -0.007866 0.000193 0.429193 31891618 31891798 -180
27 chr1 138328353 138328363 521 180 chr1 138328353 138328363 16653 13.6515 + 0.025300 0.033887 0.708505 138328353 138328533 -180
28 chr5 129002962 129002972 533 180 chr5 129002962 129002972 218381 14.5909 - 0.033325 0.037000 0.891563 129002962 129003142 -180
29 chr5 129002962 129002973 533 180 chr5 129002962 129002973 218382 14.8276 + 0.030694 0.034035 0.872494 129002962 129003142 -180
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1985 chr12 54460378 54460388 23988 165 chr12 54460378 54460388 58019 14.6364 + 0.041227 0.043190 0.883674 54460378 54460543 -165
1986 chr2 4917864 4917875 24004 141 chr2 4917864 4917875 145812 14.4483 - 0.026546 0.031428 0.898278 4917864 4918005 -141
1987 chr16 95861399 95861410 24007 191 chr16 95861399 95861410 117106 15.1552 + 0.090691 0.090691 0.969027 95861399 95861590 -191
1988 chr16 95861569 95861581 24007 191 chr16 95861569 95861581 117107 13.9358 + -0.006067 0.009372 0.381054 95861569 95861760 -191
1989 chr19 21914271 21914283 24008 141 chr19 21914271 21914283 140970 14.9633 - -0.008817 0.003724 0.572142 21914271 21914412 -141
1990 chr14 31404144 31404155 24010 133 chr14 31404144 31404155 83354 14.4483 + 0.136148 0.137778 0.974815 31404144 31404277 -133
1991 chr3 96580268 96580278 24011 141 chr3 96580268 96580278 178259 13.5455 - 0.029698 0.033939 0.717987 96580268 96580409 -141
1992 chr18 74740003 74740014 24018 134 chr18 74740003 74740014 136858 14.5172 - 0.082407 0.087873 0.957729 74740003 74740137 -134
1993 chr3 145798887 145798897 24020 167 chr3 145798887 145798897 184009 14.5909 - 0.015289 0.019580 0.802202 145798887 145799054 -167
1994 chr3 145798887 145798898 24020 167 chr3 145798887 145798898 184010 15.2759 + 0.014038 0.017939 0.773085 145798887 145799054 -167
1995 chr3 101507498 101507509 24028 174 chr3 101507498 101507509 178842 14.4483 + 0.035862 0.036364 0.894372 101507498 101507672 -174
1996 chr2 119251032 119251043 24029 148 chr2 119251032 119251043 159427 15.1552 + -0.000419 0.012172 0.530666 119251032 119251180 -148
1997 chr15 85774617 85774627 24045 161 chr15 85774617 85774627 104220 14.6364 - 0.022734 0.022762 0.858381 85774617 85774778 -161
1998 chr15 85774617 85774628 24045 161 chr15 85774617 85774628 104221 15.5862 + 0.022096 0.022122 0.798481 85774617 85774778 -161
1999 chr16 95636821 95636831 24048 118 chr16 95636821 95636831 117091 13.7576 + 0.132582 0.135230 0.963767 95636821 95636939 -118
2000 chr11 117336375 117336385 24049 139 chr11 117336375 117336385 51614 14.0455 - 0.057277 0.057277 0.856385 117336375 117336514 -139
2001 chr11 117336375 117336386 24049 139 chr11 117336375 117336386 51615 15.9828 + 0.050633 0.052070 0.833818 117336375 117336514 -139
2002 chr11 117336420 117336431 24049 139 chr11 117336420 117336431 51616 15.3793 - 0.030477 0.034232 0.811284 117336420 117336559 -139
2003 chr11 117336421 117336431 24049 139 chr11 117336421 117336431 51617 14.0455 + 0.032955 0.037086 0.861280 117336421 117336560 -139
2004 chr9 21265174 21265185 24050 129 chr9 21265174 21265185 273094 14.4483 - 0.029732 0.038658 0.703107 21265174 21265303 -129
2005 chrX 103287148 103287160 24053 137 chrX 103287148 103287160 298244 14.9633 - 0.028596 0.031282 0.796116 103287148 103287285 -137
2006 chr9 23453429 23453440 24057 154 chr9 23453429 23453440 273328 14.6034 - 0.021790 0.025639 0.663244 23453429 23453583 -154
2007 chr9 23453430 23453440 24057 154 chr9 23453430 23453440 273329 13.7121 + 0.025294 0.028203 0.685594 23453430 23453584 -154
2008 chr11 86962179 86962189 24059 136 chr11 86962179 86962189 48397 13.5455 - 0.043214 0.045640 0.796347 86962179 86962315 -136
2009 chr15 89476382 89476392 24067 187 chr15 89476382 89476392 104606 13.7576 - 0.032280 0.032305 0.688963 89476382 89476569 -187
2010 chr15 89476382 89476393 24067 187 chr15 89476382 89476393 104607 15.0690 + 0.033723 0.033746 0.705416 89476382 89476569 -187
2011 chr18 38670122 38670133 24070 223 chr18 38670122 38670133 132619 14.4483 - 0.013610 0.015090 0.613286 38670122 38670345 -223
2012 chr19 40297635 40297645 24080 179 chr19 40297635 40297645 143127 13.7576 + 0.091819 0.091819 0.970569 40297635 40297814 -179
2013 chr10 69219198 69219209 24081 160 chr10 69219198 69219209 31563 15.6724 - 0.078809 0.078850 0.911700 69219198 69219358 -160
2014 chr10 69219199 69219209 24081 160 chr10 69219199 69219209 31564 14.0000 + 0.085234 0.085279 0.928227 69219199 69219359 -160

2015 rows × 17 columns

In [5]:
# determine the threshold and send it to both display as params

instancedir = "/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_140/Ctcf"
tmp = np.load("{}/cdf_cdf.npz".format(instancedir), allow_pickle=True)
dl_performdict, nondl_performdict, performdict = tmp.f.a.tolist(), tmp.f.b.tolist(), tmp.f.c.tolist()
performdict
Out[5]:
OrderedDict([('Fimo',
                 Balanced Accuracy  F1-score  Precision    Recall  auPRC_labeled  \
              0            0.74047  0.424785   0.284553  0.837535       0.534617   
              
                 auPRC_scored  N_instance_passing_k  N_atacs_passing_k  Imbalance  
              0      0.523323                200814              73877   0.144816  ),
             ('Fimo50',
                 Balanced Accuracy  F1-score  Precision    Recall  auPRC_labeled  \
              0           0.713836  0.494412   0.460747  0.533384       0.485627   
              
                 auPRC_scored  N_instance_passing_k  N_atacs_passing_k  Imbalance  
              0      0.411968                 83013              31050   0.144816  )])
In [13]:
precisions = [(v)['Precision'].values[0] for v in dl_performdict.values()]
df = pd.DataFrame({'keys': list(dl_performdict.keys()),
              'Precision': [(v)['Precision'].values[0] for v in dl_performdict.values()],
              'Recall': [(v)['Recall'].values[0] for v in dl_performdict.values()],
                   'auPRC_scored': [(v)['auPRC_scored'].values[0] for v in dl_performdict.values()],
                   'F1-score': [(v)['F1-score'].values[0] for v in dl_performdict.values()],
                   'Balanced Accuracy': [(v)['Balanced Accuracy'].values[0] for v in dl_performdict.values()]
              
             })
nondf = pd.DataFrame({'keys': list(nondl_performdict.keys()),
              'Precision': [(v)['Precision'].values[0] for v in nondl_performdict.values()],
              'Recall': [(v)['Recall'].values[0] for v in nondl_performdict.values()],
                      'auPRC_scored': [(v)['auPRC_scored'].values[0] for v in nondl_performdict.values()],
                      'F1-score': [(v)['F1-score'].values[0] for v in nondl_performdict.values()],
                   'Balanced Accuracy': [(v)['Balanced Accuracy'].values[0] for v in nondl_performdict.values()]
              
             })
precision_cond = df['Precision']>=performdict['Fimo50']['Precision'].values[0]
df = df[precision_cond]
nondf = nondf[precision_cond]

recall_cond = df['Recall']>=nondf['Recall']
df = df[recall_cond]
nondf = nondf[recall_cond]
pd.concat([df, nondf], axis=1)
recall_cond
Out[13]:
0    True
1    True
2    True
3    True
4    True
5    True
Name: Recall, dtype: bool
In [36]:
from scipy.stats import *
from scipy import stats
from matlas.fimo_hits import collect_motif_df, PEAK_TYPES

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot as plt


from matlas.fimo_hits import compute_instance_calling_performance

perform_df, truth_df = compute_instance_calling_performance(
            bedname="Dl_k50.0.bed",
            key_to_optimize="deeplift_cdf",
            instancedir="/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4",
            atac_bed="/mnt/lab_data2/msharmin/oc-atlas/mtf/atac_specific_hits/task_224/peaks.bed",
            verbose=True
        )

# fimo_df, truth_df = compute_peak_calling_performance(key_to_optimize='mean_deeplift_0')
# fimo_df.T
bedtools intersect -a /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/gold.bed -b /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.bed -wb > /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.tp.r.bed
bedtools intersect -a /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/nongold.bed -b /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.bed -wb > /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.fp.r.bed
bedtools intersect -a /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/gold.bed -b /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.bed -v > /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.fn.bed
bedtools intersect -a /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/nongold.bed -b /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.bed -v > /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.tn.bed
possible bug set()
bedtools intersect -a /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.bed -b /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_specific_hits/task_224/peaks.bed -wb > /mnt/lab_data2/msharmin/oc-atlas/mtf/atac_various_hits/task_224/Gata4/Dl_k50.0.tp.atac.bed
sanity checks for atac overlap 8430 8430
sanity checks for area 0.1400144203157756 0.11710181851089989 0.1400144203157756
saniti checks for f1 0.08964103388672268 0.08964103388672268
saniti checks for recall 0.06303352365877099 [5.65323082e-05]
            true peaks  false peaks   total
pred_true         1115         6073    7188
pred_false       16574       142907  159481
total            17689       148980  166669
In [19]:
dlk = fimo_df[fimo_df['deeplift_cdf']>0.9]
nondlk = fimo_df[fimo_df['deeplift_cdf']<=0.9]

from matplotlib import pyplot as plt
fig, axes = plt.subplots(1, 2, figsize=(16,6))
ax_pdf = axes[0]
ax_cdf = axes[1]

key_to_show = 'sum_deeplift_0'
ax_pdf.hist(dlk[key_to_show].values, density=True, histtype='stepfilled',
        alpha=0.6, bins=60, label='high')
# plot the scores for an example sequence of a as well
ax_pdf.hist(nondlk[key_to_show].values, density=True, histtype='stepfilled',
        alpha=0.6, bins=60, label='low')
ax_pdf.legend(loc='best', frameon=False)
ax_pdf.legend(loc='best', frameon=False)

ax_cdf.hist(dlk[key_to_show].values,  cumulative=True, density=True, histtype='bar',
        alpha=0.6, bins=60, label='high')
# plot the scores for an example sequence of a as well
ax_cdf.hist(nondlk[key_to_show].values,  cumulative=True, density=True, histtype='bar',
        alpha=0.6, bins=60, label='low')
ax_cdf.legend(loc='best', frameon=False)
plt.show()
In [ ]:
 
In [ ]:
 
In [ ]:
# revise performance measurements with changing the probability metric

# revise fimo50
# revise positional distribution and cdfs for a bed


# zcat chippeaks.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a ../../../atac_specific_hits/task_140/peaks.bed -b stdin -wb -wa -v > nongold3.bed



# zcat chippeaks.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a stdin -b ../../../atac_specific_hits/task_140/peaks.bed -wb -wa > gold.bed
# zcat /mnt/lab_data2/msharmin/tf-mouse-atlas/chip_idrs/ENCFF806PDR.bed.gz | sort -k1,1 -k2,2n | bedtools intersect -a stdin -b atac_specific_hits/task_140/peaks.bed -wb > atac_various_hits/task_140/Ctcf/gold.bed
In [117]:
b = np.sum(null_scores, axis=2)
a = np.sum(deeplift_scores, axis=2)
a.shape, b.shape
Out[117]:
((174993, 1000), (174993, 200))
In [118]:
np.argmin(cdf_scores[:,5]), np.min(cdf_scores[:,5])
Out[118]:
(171195, 5.979713769975963e-05)
In [150]:
i = [0, 1]
c = b[0].flatten()
print(c.shape)
shape, mean, stdev = t.fit(c)
kstat, kval = stats.kstest(c, 't', args=[shape, mean, stdev])
# mean, stdev = norm.fit(c)
# kstat, kval = stats.kstest(c, 'norm', args=[ mean, stdev])
# shape, mean, stdev = gamma.fit(c)
# kstat, kval = stats.kstest(c, 'gamma', args=[shape,  mean, stdev])

print(kstat, kval)
(200,)
0.045556525408073756 0.8008351657723536
In [191]:
from matlas.fimo_hits import plot_fitted_dist_for_importance
plot_fitted_dist_for_importance(c, a[0], shape, mean, stdev)
In [11]:
# scores_ = [np.sum(j) for j in scores[0][300:700]]
# shape, mean, stdev = t.fit(scores_)
# kstat, kval = stats.kstest(scores_, 't', args=[shape, mean, stdev])
# print('kstat:', kstat, ' kval:', kval)

from scipy import stats

from scipy.stats import *
shape, mean, stdev = t.fit(b)
print(shape, mean, stdev)
kstat, kval = stats.kstest(b, 't', args=[shape, mean, stdev])
print('t sitribution', kstat, kval)
kstat, kval = stats.kstest(b, 'norm')
print('normal sitribution', kstat, kval)
1.5123294107168623 0.0008743328517088086 0.005880160282513114
t sitribution 0.06499847605991883 0.0
normal sitribution 0.48375382933815286 0.0
In [ ]:
 
In [28]:
#get the scores for all the windows, take per_base_imp distribution
# nonagnostic_out = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/Ctcf.bed.gz"
# df = pd.read_csv(nonagnostic_out, sep="\t")
# selected_peaks = np.sort(np.array(list(set(df['seqno'].values))))
# selected_null_scores = hyp_scores[selected_peaks,:,:]
# selected_null_scores.shape
# atacdf = pd.read_csv("/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/peaks.bed", header=None, sep="\t")
# #selected_atacs = atacdf.loc[selected_peaks,:]
# #selected_atacs.to_csv("/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_224/peaks_Ctcf.bed", 
#                       #header=None, index=None, sep="\t")
In [ ]:
 
In [42]:
# from matlas.render_report import render_cell_tf_reports_in_parallel

# render_cell_tf_reports_in_parallel()
In [2]:
# call dichotomize_chipseq_for_gold_standard for each tf under MEL
# call get_deeplift_passed_fimo_hits for each tf under MEL
# call get_peaks_within_nb for each tf under MEL
# for a tf: collect_distance_info, plot_distances_from_peaks
# for a tf: collect_performance_info, plot_performance_info
# for a tf: compute_joint_chip_matrix, plot_joint_chip_heatmap
# for a tf: load the png file
In [2]:
from matlas.fimo_hits import combine_motif_dfs

logdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf"

outdir = "/mnt/lab_data/kundaje/msharmin/mouse_hem/mtf/specific_hits/task_{}".format(273)
nonagnostic_out = "{0}/{1}.bed.gz".format(outdir, "Adnp2")
df_dict = combine_motif_dfs(logdir, outdir, 
                      combine_func="collect_motif_df",
                      )
df = pd.concat([df_dict[key] for key in df_dict.keys()])

#nonz_dfs = combine_motif_dfs(logdir, outdir, zscored=False)
In [93]:
from matlas.fimo_hits import plot_motif_scores

motifs = ['Spi1', 'Ctcf', 'Gata1']
turn = 0
fig, axes = None, None
for key in motifs:
    fig, axes = plot_motif_scores(df_dict[key], title=key, turn=turn, axes=axes, fig=fig, kind='scatter')
    turn = (turn+1)%4
In [87]:
import seaborn as sns
combined_df = pd.concat([df_dict[key] for key in df_dict.keys()])
sns.jointplot(x="avg_deeplift", y="fimo", data=combined_df, s=1, marginal_kws=dict(bins=100));
In [19]:
df_dict['Spi1'][:5]
Out[19]:
motif fimo deeplift avg_deeplift
0 Spi1 13.9878 0.587939 0.053449
1 Spi1 15.0000 1.235800 0.065042
2 Spi1 14.7753 0.065585 0.003858
3 Spi1 14.2121 0.042971 0.002686
4 Spi1 15.8902 0.103061 0.009369
In [3]:
from matlas.render_report import render_test_plots
render_test_plots(nbname="FimoVsDeepliftReport")
In [71]:
gencode_file = '/mnt/data/annotations/by_release/mm10/GENCODE_ann/gencode.vM6.annotation.gtf'
gencode = pd.read_table(gencode_file, sep='\t', header=None, comment='#')
genes = gencode[(gencode[0]!='chrM') & (gencode[2]=='gene')&(np.array(['protein_coding' in i for i in gencode[8].values]))]
gene_df = genes.loc[:, [0,3,4,8,5,6]]
gene_df = gene_df.sort_values([0,3])
gene_df.to_csv("{0}/genecode.bed.gz".format(modisco_dir), header=False, index=False, compression='gzip', sep="\t")
genes[:5]
Out[71]:
0 1 2 3 4 5 6 7 8
6 chr1 HAVANA gene 3205901 3671498 . - . gene_id "ENSMUSG00000051951.5"; gene_type "pro...
74 chr1 HAVANA gene 4290846 4409241 . - . gene_id "ENSMUSG00000025900.8"; gene_type "pro...
104 chr1 HAVANA gene 4490931 4497354 . - . gene_id "ENSMUSG00000025902.11"; gene_type "pr...
227 chr1 HAVANA gene 4773206 4785739 . - . gene_id "ENSMUSG00000033845.11"; gene_type "pr...
298 chr1 HAVANA gene 4807788 4848410 . + . gene_id "ENSMUSG00000025903.12"; gene_type "pr...
In [ ]: