In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np 
import glob
import os
from collections import OrderedDict
import pickle
import h5py
In [7]:
from matlas.matches import DenovoModisco
from matlas.genome_data import *
task_dir = "/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_modisco/multi_tasks/task_0.0"
# `ob` was constructed (and the TomTom report saved) in an earlier run of this cell,
# so the expensive matching step is left commented out here.
# ob = DenovoModisco(task_dir)
# ob.fetch_tomtom_matches(save_report=True,
#                         tomtom_dir="{0}/{1}_tomtomout".format(task_dir,
#                                                               DEFAULT_DATABASE))
In [6]:
ob.load_matched_motifs()
ob.get_motif_per_celltype(match_threshold=0.05)
ob.display_individual_table()
The following two tables list the de novo patterns discovered by TF-MoDISco and the corresponding matched motifs.
Denovo Patterns by TF-MoDISco: 21 patterns
Pattern Name (# seqlets) | Matched TF Name(s)   [Modisco Sequence / Contrib Scores / Hyp_Contrib Scores logos not rendered here]
metacluster_1/pattern_0 (3761 seqlets): Ctcf, Ctcfl
metacluster_1/pattern_1 (2098 seqlets): Sp2, Sp1, Sp3, Klf3, Sp4, Sp5, Klf6, Wt1, Maz, Zbtb17, Klf1, Klf15, Egr1, Zfp281, Klf5, E2f4, Klf4, Klf7, Egr2, Klf8, Klf12, Zfx, Rela, E2f1, Fli1, Mxi1, E2f7, E2f3, Ptf1a
metacluster_1/pattern_2 (1954 seqlets): Nfib, Nfic, Nfia
metacluster_1/pattern_3 (1365 seqlets): Nrf1
metacluster_1/pattern_4 (1047 seqlets): Nfyb, Nfyc, Foxi1, Nfya, Pbx3
metacluster_1/pattern_5 (935 seqlets): Gabpa, Ets1, Elk4, Erg, Elk3, Etv4, Elf1, Elk1, Etv1, Etv5, Etv2, Fev, Etv3, Etv6, XP_911724.4, Elf4, Ehf, Elf2, Spi1, Elf5, Spdef, Elf3, Spib
metacluster_1/pattern_6 (753 seqlets): Creb1, Atf7, Jdp2, Creb5, Atf2, Crem, Mafb, Atf1, Atf3, Fosl2, Fosl1, Nfil3, Tef, Tfe3, Jun, Hlf, Dbp
metacluster_1/pattern_7 (700 seqlets): Vsx2, Lhx8, Prrx2, Lhx2, En2, Shox2, Rhox6, Alx1, Rax, Arx, Emx2, Lhx9, Pou1f1, Uncx, Pax6, Lhx3, Lbx2, Vsx1, Lhx4, Noto, Alx4, Hoxa2, Gbx1, Alx3, Sebox, Nkx1-2, Gbx2, Gsx2, Prop1, Msx3, Pax4, Phox2b, Hoxd3, Evx1, Phox2a, Nkx1-1, Vax2, Esx1, Prrx1, Lhx6, Otp, En1, Hoxa1, Vax1, Nobox, Lmx1b, Msx1, Pax7, Pou2f3, Evx2, Pou2f2, Isx, Mnx1, Pdx1, Hoxc5, Hoxd1, Hoxb3, Hoxa3, Pou3f1, Lhx1, Pou3f2, Lhx5, Pou2f1, Msx2, Dlx3, Lmx1a, Barhl1, Hoxa5, Isl1, Barhl2, Hoxc6, Dlx5, Hlx, Dbx1, Dbx2, Hoxb5, Tlx2, Pou3f4, Isl2, Meox2, Pou4f3, Meox1, Pou6f1, Nkx6-3, Bsx, Hnf1b, Nkx6-1, Hoxb4
metacluster_1/pattern_8 (682 seqlets): Rfx2, Rfx3, Rfx1, Rfx4, Rfx6, Rfx7
metacluster_1/pattern_9 (575 seqlets): Irf2, Irf9, Irf1, Prdm1, Stat2, Irf7, Irf8, Irf4
metacluster_1/pattern_10 (541 seqlets): Sox9, Sox3, Sox10, Sox4, Sox2, Sox6, Sox17
metacluster_1/pattern_11 (537 seqlets): Usf2, Usf1, Tfeb, Arntl, Mitf, Tfec, Arnt, Clock, Srebf1, Bhlhe40, Mlx, Npas2, Mycn, Myc, Nr1h4, Max, Srebf2, Rxrg, Tcfl5, Creb3l2, Rxrb
metacluster_1/pattern_12 (498 seqlets): Thap11, Zfp143, Tbx2
metacluster_1/pattern_13 (316 seqlets): Cebpb, Cebpa, Cebpd, Cebpg, Atf4, Ddit3, Cebpe, Pou5f1, Pou3f3
metacluster_1/pattern_14 (201 seqlets): Zbtb33
metacluster_1/pattern_15 (147 seqlets): Yy1, Taf1, Zfp42
metacluster_1/pattern_16 (78 seqlets): Rest, Mafk
metacluster_1/pattern_17 (66 seqlets): Zfp335, Mafa, Bach2
metacluster_1/pattern_18 (63 seqlets): Fosb, Junb, Fos, Bach1, Batf, Jund, Tal1, Pknox1, Nfe2l2
metacluster_1/pattern_19 (38 seqlets): (none listed)
metacluster_1/pattern_20 (33 seqlets): (none listed)
Motifs by TF-MoDISco: 240 TFs
TF Name | Pattern(s) with match significance
Ctcf
Pattern NameModiscoSignificance
metacluster_1/pattern_02.21277e-13
Ctcfl
Pattern NameModiscoSignificance
metacluster_1/pattern_03.28261e-08
metacluster_1/pattern_10.00920411
metacluster_1/pattern_160.043392400000000005
Sp2
Pattern NameModiscoSignificance
metacluster_1/pattern_12.0133500000000002e-19
metacluster_1/pattern_30.00014194
metacluster_1/pattern_110.00392311
metacluster_1/pattern_140.0005045690000000001
metacluster_1/pattern_151.92848e-05
metacluster_1/pattern_166.05329e-05
metacluster_1/pattern_170.0111054
metacluster_1/pattern_180.00186434
metacluster_1/pattern_190.00535902
metacluster_1/pattern_200.00572945
Sp1
Pattern NameModiscoSignificance
metacluster_1/pattern_10.00041713199999999996
metacluster_1/pattern_30.00488916
metacluster_1/pattern_110.00663296
metacluster_1/pattern_140.0135784
metacluster_1/pattern_157.07842e-05
metacluster_1/pattern_160.00012533
metacluster_1/pattern_180.00302235
metacluster_1/pattern_200.006936499999999999
Sp3
Pattern NameModiscoSignificance
metacluster_1/pattern_13.62761e-15
metacluster_1/pattern_30.00108113
metacluster_1/pattern_110.021166499999999998
metacluster_1/pattern_140.0045324
metacluster_1/pattern_150.000255547
metacluster_1/pattern_167.73299e-05
metacluster_1/pattern_180.00615288
metacluster_1/pattern_190.0123889
metacluster_1/pattern_200.006936499999999999
Klf3
Pattern NameModiscoSignificance
metacluster_1/pattern_18.014960000000001e-10
metacluster_1/pattern_30.035037900000000004
metacluster_1/pattern_110.047038300000000005
metacluster_1/pattern_150.0251455
metacluster_1/pattern_160.00090607
metacluster_1/pattern_190.010247
metacluster_1/pattern_200.00832281
Sp4
Pattern NameModiscoSignificance
metacluster_1/pattern_10.00162733
Sp5
Pattern NameModiscoSignificance
metacluster_1/pattern_14.95864e-09
Klf6
Pattern NameModiscoSignificance
metacluster_1/pattern_19.59307e-08
metacluster_1/pattern_110.0329134
Wt1
Pattern NameModiscoSignificance
metacluster_1/pattern_11.07231e-07
metacluster_1/pattern_150.0094157
Maz
Pattern NameModiscoSignificance
metacluster_1/pattern_11.7219200000000001e-06
metacluster_1/pattern_150.0251455
Zbtb17
Pattern NameModiscoSignificance
metacluster_1/pattern_12.84178e-06
metacluster_1/pattern_150.0094157
metacluster_1/pattern_160.013859100000000001
metacluster_1/pattern_200.0182055
Klf1
Pattern NameModiscoSignificance
metacluster_1/pattern_12.49584e-05
Klf15
Pattern NameModiscoSignificance
metacluster_1/pattern_10.000100802
Egr1
Pattern NameModiscoSignificance
metacluster_1/pattern_10.00560533
metacluster_1/pattern_160.021183
Zfp281
Pattern NameModiscoSignificance
metacluster_1/pattern_10.000355485
Klf5
Pattern NameModiscoSignificance
metacluster_1/pattern_10.00041713199999999996
E2f4
Pattern NameModiscoSignificance
metacluster_1/pattern_10.000631057
Klf4
Pattern NameModiscoSignificance
metacluster_1/pattern_10.00242295
Klf7
Pattern NameModiscoSignificance
metacluster_1/pattern_10.00158309
Egr2
Pattern NameModiscoSignificance
metacluster_1/pattern_10.0023488000000000003
Klf8
Pattern NameModiscoSignificance
metacluster_1/pattern_10.00322845
Klf12
Pattern NameModiscoSignificance
metacluster_1/pattern_10.00425497
Zfx
Pattern NameModiscoSignificance
metacluster_1/pattern_10.00802029
metacluster_1/pattern_30.020768900000000003
metacluster_1/pattern_110.0166585
metacluster_1/pattern_140.0005965359999999999
metacluster_1/pattern_150.00904632
metacluster_1/pattern_160.00534176
metacluster_1/pattern_170.0226431
metacluster_1/pattern_180.0379626
metacluster_1/pattern_190.044659
Rela
Pattern NameModiscoSignificance
metacluster_1/pattern_10.0106026
E2f1
Pattern NameModiscoSignificance
metacluster_1/pattern_10.019803599999999998
Fli1
Pattern NameModiscoSignificance
metacluster_1/pattern_10.0210644
metacluster_1/pattern_50.00010769100000000001
Mxi1
Pattern NameModiscoSignificance
metacluster_1/pattern_10.027659899999999998
metacluster_1/pattern_110.000552055
metacluster_1/pattern_160.0168266
E2f7
Pattern NameModiscoSignificance
metacluster_1/pattern_10.0288358
E2f3
Pattern NameModiscoSignificance
metacluster_1/pattern_10.0407463
Ptf1a
Pattern NameModiscoSignificance
metacluster_1/pattern_10.0448586
Nfib
Pattern NameModiscoSignificance
metacluster_1/pattern_22.18813e-06
Nfic
Pattern NameModiscoSignificance
metacluster_1/pattern_20.00728978
Nfia
Pattern NameModiscoSignificance
metacluster_1/pattern_20.010015600000000001
Nrf1
Pattern NameModiscoSignificance
metacluster_1/pattern_31.14632e-09
Nfyb
Pattern NameModiscoSignificance
metacluster_1/pattern_49.07769e-11
Nfyc
Pattern NameModiscoSignificance
metacluster_1/pattern_44.64203e-06
Foxi1
Pattern NameModiscoSignificance
metacluster_1/pattern_44.64203e-06
Nfya
Pattern NameModiscoSignificance
metacluster_1/pattern_44.64203e-06
Pbx3
Pattern NameModiscoSignificance
metacluster_1/pattern_40.000159467
Gabpa
Pattern NameModiscoSignificance
metacluster_1/pattern_50.00045244199999999995
Ets1
Pattern NameModiscoSignificance
metacluster_1/pattern_50.000118101
Elk4
Pattern NameModiscoSignificance
metacluster_1/pattern_58.41147e-05
Erg
Pattern NameModiscoSignificance
metacluster_1/pattern_58.41147e-05
Elk3
Pattern NameModiscoSignificance
metacluster_1/pattern_50.000192003
Etv4
Pattern NameModiscoSignificance
metacluster_1/pattern_50.00353191
Elf1
Pattern NameModiscoSignificance
metacluster_1/pattern_53.14047e-05
Elk1
Pattern NameModiscoSignificance
metacluster_1/pattern_55.2556099999999996e-05
Etv1
Pattern NameModiscoSignificance
metacluster_1/pattern_53.14047e-05
Etv5
Pattern NameModiscoSignificance
metacluster_1/pattern_50.000152341
Etv2
Pattern NameModiscoSignificance
metacluster_1/pattern_50.000267516
Fev
Pattern NameModiscoSignificance
metacluster_1/pattern_50.000298175
Etv3
Pattern NameModiscoSignificance
metacluster_1/pattern_50.00045244199999999995
Etv6
Pattern NameModiscoSignificance
metacluster_1/pattern_50.014317400000000001
XP_911724.4
Pattern NameModiscoSignificance
metacluster_1/pattern_50.000561479
Elf4
Pattern NameModiscoSignificance
metacluster_1/pattern_50.00313489
Ehf
Pattern NameModiscoSignificance
metacluster_1/pattern_50.010368299999999999
Elf2
Pattern NameModiscoSignificance
metacluster_1/pattern_50.00543676
Spi1
Pattern NameModiscoSignificance
metacluster_1/pattern_50.0177581
Elf5
Pattern NameModiscoSignificance
metacluster_1/pattern_50.030081599999999997
Spdef
Pattern NameModiscoSignificance
metacluster_1/pattern_50.0488034
Elf3
Pattern NameModiscoSignificance
metacluster_1/pattern_50.037399699999999994
Spib
Pattern NameModiscoSignificance
metacluster_1/pattern_50.0375335
Creb1
Pattern NameModiscoSignificance
metacluster_1/pattern_60.00102214
Atf7
Pattern NameModiscoSignificance
metacluster_1/pattern_60.00018110900000000002
metacluster_1/pattern_110.0292682
metacluster_1/pattern_130.0428801
Jdp2
Pattern NameModiscoSignificance
metacluster_1/pattern_60.000492477
metacluster_1/pattern_130.0133428
Creb5
Pattern NameModiscoSignificance
metacluster_1/pattern_60.000162757
Atf2
Pattern NameModiscoSignificance
metacluster_1/pattern_60.000492477
metacluster_1/pattern_110.0474626
metacluster_1/pattern_130.0428801
Crem
Pattern NameModiscoSignificance
metacluster_1/pattern_60.00538215
Mafb
Pattern NameModiscoSignificance
metacluster_1/pattern_60.0008153460000000001
metacluster_1/pattern_170.0279878
metacluster_1/pattern_180.00347484
Atf1
Pattern NameModiscoSignificance
metacluster_1/pattern_60.0262524
Atf3
Pattern NameModiscoSignificance
metacluster_1/pattern_60.012846600000000001
metacluster_1/pattern_180.00186434
metacluster_1/pattern_200.0162968
Fosl2
Pattern NameModiscoSignificance
metacluster_1/pattern_60.012350700000000001
metacluster_1/pattern_180.00615288
metacluster_1/pattern_200.024879
Fosl1
Pattern NameModiscoSignificance
metacluster_1/pattern_60.017017400000000002
metacluster_1/pattern_160.049966500000000004
metacluster_1/pattern_180.020052200000000003
metacluster_1/pattern_200.023524700000000003
Nfil3
Pattern NameModiscoSignificance
metacluster_1/pattern_60.018165999999999998
metacluster_1/pattern_130.00540046
Tef
Pattern NameModiscoSignificance
metacluster_1/pattern_60.022365799999999998
metacluster_1/pattern_130.00110685
Tfe3
Pattern NameModiscoSignificance
metacluster_1/pattern_60.024303599999999998
metacluster_1/pattern_110.0005410419999999999
Jun
Pattern NameModiscoSignificance
metacluster_1/pattern_60.0262524
metacluster_1/pattern_180.00165401
metacluster_1/pattern_200.0111244
Hlf
Pattern NameModiscoSignificance
metacluster_1/pattern_60.0429291
metacluster_1/pattern_130.00110685
Dbp
Pattern NameModiscoSignificance
metacluster_1/pattern_60.0494014
metacluster_1/pattern_130.00124156
Vsx2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00271846
Lhx8
Pattern NameModiscoSignificance
metacluster_1/pattern_70.008982500000000001
Prrx2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00721902
Lhx2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00403594
En2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00721902
Shox2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00271846
Rhox6
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00271846
Alx1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.034777300000000004
Rax
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00271846
Arx
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0114321
Emx2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00271846
Lhx9
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00271846
Pou1f1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0264526
Uncx
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00341182
Pax6
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00271846
Lhx3
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0107791
Lbx2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00301495
Vsx1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00412507
Lhx4
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00397673
Noto
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00329871
Alx4
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0264526
Hoxa2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00797275
Gbx1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00482725
Alx3
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00339974
Sebox
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00345727
Nkx1-2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00353469
Gbx2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00482725
Gsx2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00375283
Prop1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0300776
Msx3
Pattern NameModiscoSignificance
metacluster_1/pattern_70.012860499999999999
Pax4
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00412507
Phox2b
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00412507
Hoxd3
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0102895
Evx1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00436254
Phox2a
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00454138
Nkx1-1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00436254
Vax2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00451041
Esx1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00454138
Prrx1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00454138
Lhx6
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0264526
Otp
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0047009999999999994
En1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00492472
Hoxa1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00538095
Vax1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00566123
Nobox
Pattern NameModiscoSignificance
metacluster_1/pattern_70.014119399999999999
Lmx1b
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00571092
Msx1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00571092
Pax7
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0163168
Pou2f3
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00595734
Evx2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00625445
Pou2f2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00700053
Isx
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00721902
Mnx1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.00797275
Pdx1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0355166
Hoxc5
Pattern NameModiscoSignificance
metacluster_1/pattern_70.011459100000000002
Hoxd1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.012860499999999999
Hoxb3
Pattern NameModiscoSignificance
metacluster_1/pattern_70.013076599999999999
Hoxa3
Pattern NameModiscoSignificance
metacluster_1/pattern_70.014849700000000002
Pou3f1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0355166
Lhx1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0136906
Pou3f2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.013838999999999999
Lhx5
Pattern NameModiscoSignificance
metacluster_1/pattern_70.014119399999999999
Pou2f1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.024578
Msx2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0147736
Dlx3
Pattern NameModiscoSignificance
metacluster_1/pattern_70.014849700000000002
Lmx1a
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0163168
Barhl1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.024926900000000002
Hoxa5
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0189358
Isl1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0284929
Barhl2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0226262
Hoxc6
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0226262
Dlx5
Pattern NameModiscoSignificance
metacluster_1/pattern_70.023463099999999997
Hlx
Pattern NameModiscoSignificance
metacluster_1/pattern_70.023463099999999997
Dbx1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0265553
Dbx2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0274869
Hoxb5
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0284929
Tlx2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0288847
Pou3f4
Pattern NameModiscoSignificance
metacluster_1/pattern_70.029105400000000003
Isl2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0301673
Meox2
Pattern NameModiscoSignificance
metacluster_1/pattern_70.031143
Pou4f3
Pattern NameModiscoSignificance
metacluster_1/pattern_70.032695
Meox1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.034203199999999996
Pou6f1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.042656
Nkx6-3
Pattern NameModiscoSignificance
metacluster_1/pattern_70.036727800000000005
Bsx
Pattern NameModiscoSignificance
metacluster_1/pattern_70.038165199999999996
Hnf1b
Pattern NameModiscoSignificance
metacluster_1/pattern_70.039832599999999996
Nkx6-1
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0434904
Hoxb4
Pattern NameModiscoSignificance
metacluster_1/pattern_70.0457184
Rfx2
Pattern NameModiscoSignificance
metacluster_1/pattern_80.00697054
Rfx3
Pattern NameModiscoSignificance
metacluster_1/pattern_80.00177942
Rfx1
Pattern NameModiscoSignificance
metacluster_1/pattern_82.93659e-07
metacluster_1/pattern_110.00622844
metacluster_1/pattern_150.0094157
metacluster_1/pattern_160.03473180000000001
Rfx4
Pattern NameModiscoSignificance
metacluster_1/pattern_83.95044e-05
Rfx6
Pattern NameModiscoSignificance
metacluster_1/pattern_85.513810000000001e-05
Rfx7
Pattern NameModiscoSignificance
metacluster_1/pattern_80.0007294789999999999
Irf2
Pattern NameModiscoSignificance
metacluster_1/pattern_90.0011079
Irf9
Pattern NameModiscoSignificance
metacluster_1/pattern_90.000628536
Irf1
Pattern NameModiscoSignificance
metacluster_1/pattern_90.00116084
Prdm1
Pattern NameModiscoSignificance
metacluster_1/pattern_90.0121909
Stat2
Pattern NameModiscoSignificance
metacluster_1/pattern_90.0121909
Irf7
Pattern NameModiscoSignificance
metacluster_1/pattern_90.018856
Irf8
Pattern NameModiscoSignificance
metacluster_1/pattern_90.022936099999999997
Irf4
Pattern NameModiscoSignificance
metacluster_1/pattern_90.035331699999999994
Sox9
Pattern NameModiscoSignificance
metacluster_1/pattern_105.60264e-09
Sox3
Pattern NameModiscoSignificance
metacluster_1/pattern_100.00586208
Sox10
Pattern NameModiscoSignificance
metacluster_1/pattern_100.00385978
Sox4
Pattern NameModiscoSignificance
metacluster_1/pattern_100.00749509
Sox2
Pattern NameModiscoSignificance
metacluster_1/pattern_100.00493344
Sox6
Pattern NameModiscoSignificance
metacluster_1/pattern_100.025627800000000003
Sox17
Pattern NameModiscoSignificance
metacluster_1/pattern_100.026498900000000002
Usf2
Pattern NameModiscoSignificance
metacluster_1/pattern_110.00481113
metacluster_1/pattern_140.0135784
metacluster_1/pattern_160.013859100000000001
metacluster_1/pattern_180.0107957
metacluster_1/pattern_200.0106866
Usf1
Pattern NameModiscoSignificance
metacluster_1/pattern_110.0170826
Tfeb
Pattern NameModiscoSignificance
metacluster_1/pattern_110.00032956099999999997
Arntl
Pattern NameModiscoSignificance
metacluster_1/pattern_110.0166585
Mitf
Pattern NameModiscoSignificance
metacluster_1/pattern_110.00167709
Tfec
Pattern NameModiscoSignificance
metacluster_1/pattern_110.00060443
Arnt
Pattern NameModiscoSignificance
metacluster_1/pattern_110.00114255
Clock
Pattern NameModiscoSignificance
metacluster_1/pattern_110.0368431
Srebf1
Pattern NameModiscoSignificance
metacluster_1/pattern_110.0170826
Bhlhe40
Pattern NameModiscoSignificance
metacluster_1/pattern_110.041068099999999996
Mlx
Pattern NameModiscoSignificance
metacluster_1/pattern_110.00503065
Npas2
Pattern NameModiscoSignificance
metacluster_1/pattern_110.00414488
Mycn
Pattern NameModiscoSignificance
metacluster_1/pattern_110.021437099999999997
Myc
Pattern NameModiscoSignificance
metacluster_1/pattern_110.019431999999999998
Nr1h4
Pattern NameModiscoSignificance
metacluster_1/pattern_110.0110712
Max
Pattern NameModiscoSignificance
metacluster_1/pattern_110.0329134
Srebf2
Pattern NameModiscoSignificance
metacluster_1/pattern_110.0166585
Rxrg
Pattern NameModiscoSignificance
metacluster_1/pattern_110.0170826
Tcfl5
Pattern NameModiscoSignificance
metacluster_1/pattern_110.0170826
Creb3l2
Pattern NameModiscoSignificance
metacluster_1/pattern_110.022497
Rxrb
Pattern NameModiscoSignificance
metacluster_1/pattern_110.026301299999999996
Thap11
Pattern NameModiscoSignificance
metacluster_1/pattern_123.59675e-22
Zfp143
Pattern NameModiscoSignificance
metacluster_1/pattern_127.093510000000001e-22
Tbx2
Pattern NameModiscoSignificance
metacluster_1/pattern_121.45131e-12
Cebpb
Pattern NameModiscoSignificance
metacluster_1/pattern_130.026428500000000004
Cebpa
Pattern NameModiscoSignificance
metacluster_1/pattern_130.0225652
Cebpd
Pattern NameModiscoSignificance
metacluster_1/pattern_130.013482299999999997
Cebpg
Pattern NameModiscoSignificance
metacluster_1/pattern_130.017916400000000002
Atf4
Pattern NameModiscoSignificance
metacluster_1/pattern_130.0363353
Ddit3
Pattern NameModiscoSignificance
metacluster_1/pattern_130.0220364
Cebpe
Pattern NameModiscoSignificance
metacluster_1/pattern_130.0162449
Pou5f1
Pattern NameModiscoSignificance
metacluster_1/pattern_130.0473187
Pou3f3
Pattern NameModiscoSignificance
metacluster_1/pattern_130.0472054
Zbtb33
Pattern NameModiscoSignificance
metacluster_1/pattern_140.00038930800000000005
Yy1
Pattern NameModiscoSignificance
metacluster_1/pattern_155.27568e-06
Taf1
Pattern NameModiscoSignificance
metacluster_1/pattern_155.27568e-06
Zfp42
Pattern NameModiscoSignificance
metacluster_1/pattern_150.0094157
Rest
Pattern NameModiscoSignificance
metacluster_1/pattern_160.000275876
metacluster_1/pattern_172.6083e-09
Mafk
Pattern NameModiscoSignificance
metacluster_1/pattern_160.049966500000000004
metacluster_1/pattern_170.00371464
metacluster_1/pattern_180.00165401
metacluster_1/pattern_200.0199945
Zfp335
Pattern NameModiscoSignificance
metacluster_1/pattern_170.0111054
Mafa
Pattern NameModiscoSignificance
metacluster_1/pattern_170.013862000000000001
metacluster_1/pattern_180.00165401
Bach2
Pattern NameModiscoSignificance
metacluster_1/pattern_170.0240659
metacluster_1/pattern_180.00165401
metacluster_1/pattern_200.014226499999999998
Fosb
Pattern NameModiscoSignificance
metacluster_1/pattern_180.00615288
metacluster_1/pattern_200.024879
Junb
Pattern NameModiscoSignificance
metacluster_1/pattern_180.00704906
metacluster_1/pattern_200.0199945
Fos
Pattern NameModiscoSignificance
metacluster_1/pattern_180.00769162
metacluster_1/pattern_200.023524700000000003
Bach1
Pattern NameModiscoSignificance
metacluster_1/pattern_180.00908684
metacluster_1/pattern_200.0199945
Batf
Pattern NameModiscoSignificance
metacluster_1/pattern_180.011156999999999999
Jund
Pattern NameModiscoSignificance
metacluster_1/pattern_180.0282966
metacluster_1/pattern_200.0319936
Tal1
Pattern NameModiscoSignificance
metacluster_1/pattern_180.0248946
Pknox1
Pattern NameModiscoSignificance
metacluster_1/pattern_180.0253437
Nfe2l2
Pattern NameModiscoSignificance
metacluster_1/pattern_180.0379626
metacluster_1/pattern_200.0199945
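For reference, a minimal sketch (not the matlas implementation) of filtering a MEME-suite TomTom report at the 0.05 threshold used by get_motif_per_celltype above, assuming a tomtom.tsv whose significance column is named 'q-value':
import pandas as pd

def load_tomtom_matches(tomtom_tsv, threshold=0.05):
    # TomTom's report is tab-separated with a trailing '#' comment block.
    matches = pd.read_csv(tomtom_tsv, sep="\t", comment="#")
    return matches[matches["q-value"] <= threshold]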
In [3]:
[[j] for j in range(10)]
Out[3]:
[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]
In [69]:
from matlas.performance_metrics.performance_metrics import compute_performance

# Evaluate only split 0 for now (the `break` exits after the first iteration);
# task_indices=[[0]] scores the single task 0 rather than all 10 tasks.
for i in range(10):
    cur_metrics = compute_performance(
        root="/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share",
        splitno=i,  # or taskno
        model_class="basset_classification",
        task_indices=[[0]],  # [[j for j in range(10)]] would score all 10 tasks
        tuned_task=1
    )
    break
cur_metrics
sample_N
chunk_size
predictions_pickle_to_load
performance_metrics_classification_file
performance_metrics_regression_file
performance_metrics_profile_file
tasks
task_indices
labels_hdf5
predictions_hdf5
number of datasets 1
current tasks.. [0]
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/single_tasks_0/test_basset_classification.1.labels.0
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/single_tasks_0/test_basset_classification.1.predictions.0
data shape (3909420, 1) (3909420, 1)
No class threshold can give requested fdr <=:0.5
No class threshold can give requested fdr <=:0.2
No class threshold can give requested fdr <=:0.1
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/single_tasks_0/accuracy.basset_classification.1.tsv
Out[69]:
{'unbalanced_accuracy': [0.9985278811779204],
 'positive_accuracy': [0.0],
 'negative_accuracy': [1.0],
 'balanced_accuracy': [0.5],
 'Imbalance ratio': [0.0014721188220796833],
 'num_positives': [5751.0],
 'num_negatives': [3900863.0],
 'auprc': [0.0017251146150757402],
 'auroc': [0.49759167363517115],
 'recall_at_fdr_50': [nan],
 'recall_at_fdr_20': [nan],
 'recall_at_fdr_10': [nan]}
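The nan recall_at_fdr values above occur because no score threshold reaches the requested FDR; a minimal sklearn sketch of that metric (an assumption about how matlas defines it, not the matlas code):
import numpy as np
from sklearn.metrics import precision_recall_curve

def recall_at_fdr(y_true, y_score, fdr=0.1):
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    # Drop the (precision=1, recall=0) endpoint sklearn always appends.
    ok = precision[:-1] >= 1.0 - fdr
    return recall[:-1][ok].max() if ok.any() else np.nan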
In [129]:
import glob

# Recenter each naive-overlap peak on its summit (narrowPeak column 9 is the summit
# offset from the peak start) and expand it to a fixed 1 kb window, writing one BED
# file per task.
narrowdirs = np.sort(glob.glob("/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_PEAK_FILES/*"))
for task_idx, narrowdir in enumerate(narrowdirs):
    narrowfile = glob.glob("{}/overlap/optimal_set/*.narrowPeak".format(narrowdir))[0]
    print(task_idx, os.path.basename(narrowdir))
    df = pd.read_csv(narrowfile, header=None, sep="\t")
    df_sorted = df.sort_values([0, 1, 2])
    _df = df_sorted.copy(deep=True)
    _df[1] = df_sorted[1] + df_sorted[9] - 500   # summit - 500
    _df[2] = _df[1] + 1000                       # summit + 500
    _df = _df.sort_values([0, 1, 2])
    _df.loc[:, :3].to_csv(
        "/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_peaks/task_{}.bed.gz".format(task_idx),
        sep="\t", header=False, index=False, compression='gzip'
    )
    #break

#df[:5]
0 O_Astrocyte_macs2
1 O_Endo_macs2
2 O_NPC_macs2
3 O_aNSC_macs2
4 O_qNSC_macs2
5 Y_Astrocyte_macs2
6 Y_Endo_macs2
7 Y_NPC_macs2
8 Y_aNSC_macs2
9 Y_qNSC_macs2
In [127]:
df2[:5]  # presumably the original (pre-recentering) sorted peak table from an earlier run; compare with _df below
Out[127]:
0 1 2 3 4 5 6 7 8 9
16167 chr1 3670531 3672087 Peak_16427 340 . 5.85709 34.04338 31.13434 1256
16168 chr1 3670531 3672087 Peak_21725 241 . 4.85323 24.12262 21.34323 1050
16169 chr1 3670531 3672087 Peak_33291 138 . 3.68295 13.83506 11.25262 536
16170 chr1 3670531 3672087 Peak_67276 58 . 2.47901 5.83442 3.54887 112
16303 chr1 3722770 3723025 Peak_69951 53 . 2.70398 5.37837 3.11009 116
In [128]:
_df[:5]
Out[128]:
0 1 2 3 4 5 6 7 8 9
16170 chr1 3670143 3671143 Peak_67276 58 . 2.47901 5.83442 3.54887 112
16169 chr1 3670567 3671567 Peak_33291 138 . 3.68295 13.83506 11.25262 536
16168 chr1 3671081 3672081 Peak_21725 241 . 4.85323 24.12262 21.34323 1050
16167 chr1 3671287 3672287 Peak_16427 340 . 5.85709 34.04338 31.13434 1256
16303 chr1 3722386 3723386 Peak_69951 53 . 2.70398 5.37837 3.11009 116
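A quick sanity check of the recentering (a sketch using df_sorted and _df from the loop above): every rewritten interval should be exactly 1 kb and centered on the original narrowPeak summit.
# Each recentered interval spans exactly 1 kb...
assert (_df[2] - _df[1] == 1000).all()
# ...and sits symmetrically around the summit (start + column-9 offset).
assert (_df[1] + 500 == df_sorted[1] + df_sorted[9]).all()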
In [61]:
from matlas.model_layer import retrieve_sequences, one_hot_encode_along_col_axis#, setup_keras_session, get_keras_model
from matlas.dlutils import write_deeplift_track, get_shuffled_seqs, get_given_seq_ref_function
from deeplift.dinuc_shuffle import dinuc_shuffle

summitfile="/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_peaks/test.bed.gz"
fasta_file="/srv/scratch/msharmin/metadata/mm10.genome.fa"
sequences, intervals_wo_flanks = retrieve_sequences(summitfile, fasta_file=fasta_file, flank_size=0)
input_data_list, input_references_list = get_shuffled_seqs(sequences, 10, shuffle_func=dinuc_shuffle,
                                                                one_hot_func=lambda x: np.array([one_hot_encode_along_col_axis(seq) for seq in x]),
                                                                progress_update=10000)
One hot encoding sequences...
One hot encoding done...
In [64]:
#input_data_list[0][0].shape, input_data_list[0][1].shape, len(sequences[0])
input_data_list[0].shape
Out[64]:
(800, 1000, 4)
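one_hot_encode_along_col_axis is a matlas helper that is not shown in this notebook; a minimal sketch of the kind of encoder it is assumed to be (the A/C/G/T column order is an assumption), producing the (length, 4) arrays stacked into the (800, 1000, 4) batch above:
import numpy as np

BASE_TO_COL = {"A": 0, "C": 1, "G": 2, "T": 3}

def one_hot_sketch(seq):
    # (sequence length, 4) matrix; ambiguous bases such as N stay all-zero
    arr = np.zeros((len(seq), 4), dtype=np.float32)
    for i, base in enumerate(seq.upper()):
        col = BASE_TO_COL.get(base)
        if col is not None:
            arr[i, col] = 1.0
    return arr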
In [12]:
pd.read_hdf("/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/single_tasks_0/basset_classification.1.predictions.0")
Out[12]:
0
CHR START END
chr1 600000 601000 0.000015
600050 601050 0.000015
600100 601100 0.000015
600150 601150 0.000015
600200 601200 0.000015
600250 601250 0.000015
600300 601300 0.000015
600350 601350 0.000015
600400 601400 0.000015
600450 601450 0.000015
600500 601500 0.000015
600550 601550 0.000015
600600 601600 0.000015
600650 601650 0.000015
600700 601700 0.000015
600750 601750 0.000015
600800 601800 0.000015
600850 601850 0.000015
600900 601900 0.000015
600950 601950 0.000015
601000 602000 0.000015
601050 602050 0.000015
601100 602100 0.000015
601150 602150 0.000015
601200 602200 0.000015
601250 602250 0.000015
601300 602300 0.000015
601350 602350 0.000015
601400 602400 0.000015
601450 602450 0.000015
... ... ...
195148500 195149500 0.000055
195148550 195149550 0.000074
195148600 195149600 0.000068
195148650 195149650 0.000057
195148700 195149700 0.000059
195148750 195149750 0.000034
195148800 195149800 0.000044
195148850 195149850 0.000023
195148900 195149900 0.000040
195148950 195149950 0.000047
195149000 195150000 0.000057
195149050 195150050 0.000060
195149100 195150100 0.000045
195149150 195150150 0.000034
195149200 195150200 0.000041
195149250 195150250 0.000041
195149300 195150300 0.000039
195149350 195150350 0.000030
195149400 195150400 0.000030
195149450 195150450 0.000044
195149500 195150500 0.000044
195149550 195150550 0.000041
195149600 195150600 0.000041
195149650 195150650 0.000039
195149700 195150700 0.000039
195149750 195150750 0.000036
195149800 195150800 0.000036
195149850 195150850 0.000039
195149900 195150900 0.000052
195149950 195150950 0.000030

3909420 rows × 1 columns

In [ ]:
 
In [4]:
from matlas.aitac_motifs import get_task_cors

labels = np.load("/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/task_labels_on_mel_peaks.npy")
logits = np.load("/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_mel/model_predictions.npy")
print(labels.shape, logits.shape)
correlations = get_task_cors(labels, logits, verbose=True)
idx = np.argwhere(np.asarray(correlations)>0.7).squeeze()
print(len(idx))
(88319, 279) (88319, 279)
weighted_cor is nan
number of NaN values: 284
19927
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/ipykernel_launcher.py:7: RuntimeWarning: invalid value encountered in greater
  import sys
In [3]:
#x2 = cur_seqs[idx, :, :]
y2 = labels[idx, :]
pred_full_model2 = logits[idx,:]
correlations2 = get_task_cors(y2, pred_full_model2, verbose=True)
print(y2.shape, pred_full_model2.shape, len(correlations2))
weighted_cor is 0.7687390717177731
number of NaN values: 0
(19927, 279) (19927, 279) 19927
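Judging by the shapes above, get_task_cors returns one correlation per peak, computed across the 279 tasks, with NaNs wherever a row has zero variance; a minimal numpy sketch of that kind of row-wise Pearson correlation (an assumption, not the matlas code):
import numpy as np

def rowwise_pearson(y, yhat):
    # y, yhat: (n_peaks, n_tasks); returns one Pearson r per peak
    yc = y - y.mean(axis=1, keepdims=True)
    pc = yhat - yhat.mean(axis=1, keepdims=True)
    num = (yc * pc).sum(axis=1)
    den = np.sqrt((yc ** 2).sum(axis=1) * (pc ** 2).sum(axis=1))
    return num / den   # NaN (with a runtime warning) where a row is constant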
In [5]:
# Load the per-filter ablated logits (one HDF5 file per filter) and stack them into a
# single (n_OCRs, n_filters, n_tasks) array.
filter_predictions = []
for i in range(300):
    if i % 10 == 0:
        print(i)
    f = h5py.File("/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_mel/filter_preds/filter_{}.h5".format(i), "r")
    ablated_logits = f['logits'][:]  # [idx]
    #print(ablated_logits.shape)
    f.close()
    filter_predictions.append(ablated_logits)
filter_predictions = np.asarray(filter_predictions)
# Note: this assumes the stored logits are laid out so that a plain reshape yields
# (n_OCRs, 300, n_tasks); if each file instead holds an (n_OCRs, n_tasks) matrix,
# np.transpose(filter_predictions, (1, 0, 2)) is the safe way to reorder the axes.
filter_predictions = filter_predictions.reshape((len(idx), 300, -1))
np.save("/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_mel/filter_predictions.npy", filter_predictions)
#filter_predictions = np.load("/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/filter_predictions.npy")
filter_predictions.shape
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
Out[5]:
(19927, 300, 279)
In [6]:
y2.shape, len(correlations2), filter_predictions.shape
Out[6]:
((19927, 279), 19927, (19927, 300, 279))
In [7]:
from matlas.aitac_motifs import get_filt_corr
filt_corr, filt_infl, ave_filt_infl = get_filt_corr(filter_predictions, y2, correlations2, verbose=True)
print(filt_corr.shape, filt_infl.shape, ave_filt_infl.shape)
Shape of filter-wise correlations:
(19927, 300)
Shape of filter influence:
(19927, 300)
(19927, 300) (19927, 300) (300,)
In [8]:
from matlas.aitac_motifs import get_filt_infl
infl, infl_by_OCR = get_filt_infl(pred_full_model2, filter_predictions, verbose=True)
print(infl.shape, infl_by_OCR.shape)
(300, 279) (19927, 300, 279)
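A sketch consistent with the output shapes of get_filt_infl, assuming the AI-TAC definition of filter influence (full-model logits minus filter-ablated logits, averaged over OCRs); an illustration, not necessarily the matlas implementation:
import numpy as np

def filter_influence(pred_full, filter_preds):
    # pred_full: (n_ocr, n_task); filter_preds: (n_ocr, n_filt, n_task)
    diff = pred_full[:, None, :] - filter_preds   # per-OCR influence, (n_ocr, n_filt, n_task)
    return diff.mean(axis=0), diff                # (n_filt, n_task), (n_ocr, n_filt, n_task)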
In [9]:
# print("loading activations....")
# activations = np.load("/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_mel/activations.npy")
# print(activations.shape)
# activations2 = activations[idx]
# activations2.shape
# print("saving activations....")
# np.save("/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_mel/activations_7_cor.npy", activations2)
activations = np.load("/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_mel/activations_7_cor.npy")
activations.shape, y2.shape
Out[9]:
((19927, 982, 300), (19927, 279))
In [10]:
#collect x2 (one-hot sequences) for the mel peak set
from matlas.generators import *
input_bed_file="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/task_273_mel.bed.gz"
ref_fasta="/srv/scratch/msharmin/metadata/mm10.genome.fa"
data_generator=DataGenerator(input_bed_file, ref_fasta, batch_size=1000, center_on_summit=False,
                             flank=0, expand_dims=False)
for batch in data_generator:   # pull a single batch as a quick smoke test
    break
# Concatenate every batch into one (n_peaks, seq_len, 4) array.
seqs = None
for i in range(len(data_generator)):
    batch = data_generator.get_batch(i)
    if seqs is None:
        seqs = batch[0]
    else:
        seqs = np.vstack((seqs, batch[0]))

seqs.shape
Out[10]:
(88319, 1000, 4)
In [11]:
x2 = seqs[idx]
x2.shape
Out[11]:
(19927, 1000, 4)
In [12]:
from matlas.aitac_motifs import get_memes
pwm, act_ind, nseqs, activated_OCRs, n_activated_OCRs, OCR_matrix = get_memes(
    activations, x2, y2, 
    output_file_path="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_mel/")
In [13]:
pwm.shape, act_ind.shape, nseqs.shape, activated_OCRs.shape, n_activated_OCRs.shape, OCR_matrix.shape
Out[13]:
((300, 19, 4), (7375276,), (300,), (300, 279), (300,), (300, 19927))
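A sketch of how a (300, 19, 4) PWM array can be built from the first-layer activations in the AI-TAC style; the 19-bp window and the half-maximum activation cutoff are assumptions that match the shapes above, not necessarily what get_memes does internally:
import numpy as np

def pwms_from_activations(acts, seqs, filt_width=19, frac=0.5):
    # acts: (n_seq, n_positions, n_filters) first-layer activations (valid convolution,
    # so activation position p covers seqs[:, p:p+filt_width]); seqs: (n_seq, seq_len, 4)
    n_filt = acts.shape[2]
    pwms = np.zeros((n_filt, filt_width, 4))
    nseqs = np.zeros(n_filt, dtype=int)
    for f in range(n_filt):
        thresh = frac * acts[:, :, f].max()
        seq_idx, pos_idx = np.where(acts[:, :, f] > thresh)
        windows = [seqs[s, p:p + filt_width] for s, p in zip(seq_idx, pos_idx)
                   if p + filt_width <= seqs.shape[1]]
        if windows:
            pwms[f] = np.mean(windows, axis=0)   # average one-hot windows -> frequency matrix
            nseqs[f] = len(windows)
    return pwms, nseqs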
In [ ]:
 
In [7]:
def make_seqdataloader_task_tsv():  # with IDR and naive-overlap narrow peaks
    """Build seqdataloader/labelgen task TSVs: IDR peaks as positives, fold-change
    bigwigs as signal, and (naive-overlap minus IDR) regions as the ambiguous set."""
    task_metadata = OrderedDict()
    root = "/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share"
    datasets = ["O_Ast", "O_aNSC", "O_Endo", "O_NPC", "O_qNSC", "Y_aNSC", "Y_Ast", "Y_Endo", "Y_NPC", "Y_qNSC"]
    task_metadata["dataset"] = np.array(np.sort(datasets))
    
    fc_files = np.sort(glob.glob("{0}/FINAL_SIGNAL_TRACKS/*".format(root)))
    task_metadata['fc_bigwig'] = fc_files
    
    idr_files = np.sort(glob.glob("{0}/FINAL_PEAK_FILES/*/idr/optimal_set/*.filt.narrowPeak".format(root)))
    task_metadata['idr_peak'] = idr_files
    
    naive_files = np.sort(glob.glob("{0}/FINAL_PEAK_FILES/*/overlap/optimal_set/*.filt.narrowPeak".format(root)))
    task_metadata['overlap_peak'] = naive_files
    
    task_metadata = pd.DataFrame(task_metadata)
    ambigroot = "{0}/FINAL_AMBIG_FILES".format(root)
    ambig_files = []
    for task_id, idr_file, naive_file in zip(task_metadata['dataset'].values, 
                                   task_metadata['idr_peak'].values, 
                                   task_metadata['overlap_peak'].values):
        idrname = "{0}/{1}".format(ambigroot, os.path.basename(idr_file))
        naivename = "{0}/{1}".format(ambigroot, os.path.basename(naive_file))
        ambigname = "{0}/{1}.ambig.gz".format(ambigroot, task_id)
        ambig_files.append(ambigname)
        
        if not os.path.exists(ambigname):
            idr_peaks = pd.read_csv(idr_file, header=None, sep="\t")
            idr_peaks = idr_peaks.sort_values([0,1])
            idr_peaks.to_csv(idrname, header=False, index=False, sep="\t", compression='gzip')

            naive_peaks = pd.read_csv(naive_file, header=None, sep="\t")
            naive_peaks = naive_peaks.sort_values([0,1])
            naive_peaks.to_csv(naivename, header=False, index=False, sep="\t", compression='gzip')

            cmd = "bedtools intersect -v -a {} -b {} | gzip -c > {}".format(naivename, idrname, ambigname)
            print(task_id)
            print(cmd)
            os.system(cmd)
        if not task_id in naivename:
            print(task_id, "not in", naivename)
        if not task_id in idrname:
            print(task_id, "not in", idrname)
            
            
    task_metadata['ambig_peak'] = np.array(ambig_files)
    cols = ['dataset', 'idr_peak', 'fc_bigwig', 'ambig_peak']
    task_tiledb_gen = task_metadata[cols]
    task_tiledb_gen.to_csv("{}/seqdataout/task.tiledb.tsv".format(root), header=True, index=False, sep="\t")
    
    task_label_gen = task_metadata[cols]
    task_label_gen.columns = ['task', 'narrowPeak', 'bigwig', 'ambig']
    task_label_gen.to_csv("{}/seqdataout/task.labelgen.tsv".format(root), header=True, index=False, sep="\t")
    
    task_metadata['idr_peak'] = np.array([os.path.basename(i) for i in task_metadata['idr_peak']])
    task_metadata['fc_bigwig'] = np.array([os.path.basename(i) for i in task_metadata['fc_bigwig']])
    task_metadata['ambig_peak'] = np.array([os.path.basename(i) for i in task_metadata['ambig_peak']])
    return task_metadata

make_seqdataloader_task_tsv()
O_Ast
bedtools intersect -v -a /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_Astrocyte_ppr.naive_overlap.filt.narrowPeak -b /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_Astrocyte_ppr.IDR0.1.filt.narrowPeak | gzip -c > /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_Ast.ambig.gz
O_Endo
bedtools intersect -v -a /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_Endo_ppr.naive_overlap.filt.narrowPeak -b /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_Endo_ppr.IDR0.1.filt.narrowPeak | gzip -c > /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_Endo.ambig.gz
O_NPC
bedtools intersect -v -a /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_NPC_ppr.naive_overlap.filt.narrowPeak -b /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_NPC_rep1-rep2.IDR0.1.filt.narrowPeak | gzip -c > /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_NPC.ambig.gz
O_aNSC
bedtools intersect -v -a /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_aNSC_ppr.naive_overlap.filt.narrowPeak -b /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_aNSC_ppr.IDR0.1.filt.narrowPeak | gzip -c > /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_aNSC.ambig.gz
O_qNSC
bedtools intersect -v -a /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_qNSC_rep1-rep3.naive_overlap.filt.narrowPeak -b /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_qNSC_ppr.IDR0.1.filt.narrowPeak | gzip -c > /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/O_qNSC.ambig.gz
Y_Ast
bedtools intersect -v -a /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_Astrocyte_ppr.naive_overlap.filt.narrowPeak -b /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_Astrocyte_ppr.IDR0.1.filt.narrowPeak | gzip -c > /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_Ast.ambig.gz
Y_Endo
bedtools intersect -v -a /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_Endo_rep1-rep2.naive_overlap.filt.narrowPeak -b /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_Endo_rep1-rep2.IDR0.1.filt.narrowPeak | gzip -c > /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_Endo.ambig.gz
Y_NPC
bedtools intersect -v -a /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_NPC_ppr.naive_overlap.filt.narrowPeak -b /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_NPC_ppr.IDR0.1.filt.narrowPeak | gzip -c > /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_NPC.ambig.gz
Y_aNSC
bedtools intersect -v -a /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_aNSC_ppr.naive_overlap.filt.narrowPeak -b /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_aNSC_ppr.IDR0.1.filt.narrowPeak | gzip -c > /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_aNSC.ambig.gz
Y_qNSC
bedtools intersect -v -a /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_qNSC_ppr.naive_overlap.filt.narrowPeak -b /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_qNSC_ppr.IDR0.1.filt.narrowPeak | gzip -c > /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share/FINAL_AMBIG_FILES/Y_qNSC.ambig.gz
Out[7]:
dataset fc_bigwig idr_peak overlap_peak ambig_peak
0 O_Ast O_Ast.trim.PE2SE.nodup.tn5_pooled.pf.fc.signal... O_Astrocyte_ppr.IDR0.1.filt.narrowPeak /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_... O_Ast.ambig.gz
1 O_Endo O_Endo.trim.PE2SE.nodup.tn5_pooled.pf.fc.signa... O_Endo_ppr.IDR0.1.filt.narrowPeak /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_... O_Endo.ambig.gz
2 O_NPC O_NPC.trim.PE2SE.nodup.tn5_pooled.pf.fc.signal... O_NPC_rep1-rep2.IDR0.1.filt.narrowPeak /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_... O_NPC.ambig.gz
3 O_aNSC O_aNSC.trim.PE2SE.nodup.tn5_pooled.pf.fc.signa... O_aNSC_ppr.IDR0.1.filt.narrowPeak /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_... O_aNSC.ambig.gz
4 O_qNSC O_qNSC.trim.PE2SE.nodup.tn5_pooled.pf.fc.signa... O_qNSC_ppr.IDR0.1.filt.narrowPeak /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_... O_qNSC.ambig.gz
5 Y_Ast Y_Ast.trim.PE2SE.nodup.tn5_pooled.pf.fc.signal... Y_Astrocyte_ppr.IDR0.1.filt.narrowPeak /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_... Y_Ast.ambig.gz
6 Y_Endo Y_Endo.trim.PE2SE.nodup.tn5_pooled.pf.fc.signa... Y_Endo_rep1-rep2.IDR0.1.filt.narrowPeak /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_... Y_Endo.ambig.gz
7 Y_NPC Y_NPC.trim.PE2SE.nodup.tn5_pooled.pf.fc.signal... Y_NPC_ppr.IDR0.1.filt.narrowPeak /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_... Y_NPC.ambig.gz
8 Y_aNSC Y_aNSC.trim.PE2SE.nodup.tn5_pooled.pf.fc.signa... Y_aNSC_ppr.IDR0.1.filt.narrowPeak /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_... Y_aNSC.ambig.gz
9 Y_qNSC Y_qNSC.trim.PE2SE.nodup.tn5_pooled.pf.fc.signa... Y_qNSC_ppr.IDR0.1.filt.narrowPeak /mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_... Y_qNSC.ambig.gz
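The os.system call above only reports failures through its return value, which the function ignores; a small sketch of a stricter alternative (assumes bedtools and bash are available on PATH):
import subprocess

def run_ambig(naivename, idrname, ambigname):
    # Ambiguous regions = naive-overlap peaks with no overlapping IDR peak.
    # pipefail makes a bedtools failure (not just a gzip failure) raise via check=True.
    cmd = "set -o pipefail; bedtools intersect -v -a {} -b {} | gzip -c > {}".format(
        naivename, idrname, ambigname)
    subprocess.run(cmd, shell=True, check=True, executable="/bin/bash")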
In [18]:
def make_seqdataloader_task_tsv():  # variant: naive-overlap narrow peaks and blacklist
    """Variant of the function above: naive-overlap peaks serve as the positive set
    (still written to the 'idr_peak' column) and the mm10 blacklist as the ambiguous set."""
    task_metadata = OrderedDict()
    root = "/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share"
    datasets = ["O_Ast","O_aNSC", "O_Endo", "O_NPC", "O_qNSC", "Y_aNSC", "Y_Ast", "Y_Endo", "Y_NPC", "Y_qNSC"]
    task_metadata["dataset"] = np.array(np.sort(datasets))
    
    naive_files = np.sort(glob.glob("{0}/FINAL_PEAK_FILES/*/overlap/optimal_set/*.filt.narrowPeak".format(root)))
    task_metadata['idr_peak'] = naive_files
    
    fc_files = np.sort(glob.glob("{0}/FINAL_SIGNAL_TRACKS/*".format(root)))
    task_metadata['fc_bigwig'] = fc_files
    
    ambig_files = ["/mnt/data/pipeline_genome_data/mm10/mm10.blacklist.bed.gz"]*10       
    task_metadata['ambig_peak'] = np.array(ambig_files)
    task_metadata = pd.DataFrame(task_metadata)
    
    cols = ['dataset', 'idr_peak', 'fc_bigwig', 'ambig_peak']
    task_tiledb_gen = task_metadata[cols]
    task_tiledb_gen.to_csv("{}/narrow_seqdataout/task.tiledb.tsv".format(root), header=True, index=False, sep="\t")
    
    task_label_gen = task_metadata[cols]
    task_label_gen.columns = ['task', 'narrowPeak', 'bigwig', 'ambig']
    task_label_gen.to_csv("{}/narrow_seqdataout/task.labelgen.tsv".format(root), header=True, index=False, sep="\t")
    
    task_metadata['idr_peak'] = np.array([os.path.basename(i) for i in task_metadata['idr_peak']])
    task_metadata['fc_bigwig'] = np.array([os.path.basename(i) for i in task_metadata['fc_bigwig']])
    task_metadata['ambig_peak'] = np.array([os.path.basename(i) for i in task_metadata['ambig_peak']])
    return task_metadata

make_seqdataloader_task_tsv()
Out[18]:
dataset idr_peak fc_bigwig ambig_peak
0 O_Ast O_Astrocyte_ppr.naive_overlap.filt.narrowPeak O_Ast.trim.PE2SE.nodup.tn5_pooled.pf.fc.signal... mm10.blacklist.bed.gz
1 O_Endo O_Endo_ppr.naive_overlap.filt.narrowPeak O_Endo.trim.PE2SE.nodup.tn5_pooled.pf.fc.signa... mm10.blacklist.bed.gz
2 O_NPC O_NPC_ppr.naive_overlap.filt.narrowPeak O_NPC.trim.PE2SE.nodup.tn5_pooled.pf.fc.signal... mm10.blacklist.bed.gz
3 O_aNSC O_aNSC_ppr.naive_overlap.filt.narrowPeak O_aNSC.trim.PE2SE.nodup.tn5_pooled.pf.fc.signa... mm10.blacklist.bed.gz
4 O_qNSC O_qNSC_rep1-rep3.naive_overlap.filt.narrowPeak O_qNSC.trim.PE2SE.nodup.tn5_pooled.pf.fc.signa... mm10.blacklist.bed.gz
5 Y_Ast Y_Astrocyte_ppr.naive_overlap.filt.narrowPeak Y_Ast.trim.PE2SE.nodup.tn5_pooled.pf.fc.signal... mm10.blacklist.bed.gz
6 Y_Endo Y_Endo_rep1-rep2.naive_overlap.filt.narrowPeak Y_Endo.trim.PE2SE.nodup.tn5_pooled.pf.fc.signa... mm10.blacklist.bed.gz
7 Y_NPC Y_NPC_ppr.naive_overlap.filt.narrowPeak Y_NPC.trim.PE2SE.nodup.tn5_pooled.pf.fc.signal... mm10.blacklist.bed.gz
8 Y_aNSC Y_aNSC_ppr.naive_overlap.filt.narrowPeak Y_aNSC.trim.PE2SE.nodup.tn5_pooled.pf.fc.signa... mm10.blacklist.bed.gz
9 Y_qNSC Y_qNSC_ppr.naive_overlap.filt.narrowPeak Y_qNSC.trim.PE2SE.nodup.tn5_pooled.pf.fc.signa... mm10.blacklist.bed.gz
In [10]:
!cat /mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/scripts/dbingest.py
#!python /mnt/lab_data2/msharmin/oc-atlas/scripts/dbingest.py
from seqdataloader.dbingest import *

import warnings
warnings.filterwarnings('ignore')
root = "/mnt/lab_data/kundaje/msharmin/NSC_ATAC_PEAKS_to_share"
args={"tiledb_metadata": "{}/seqdataout/task.tiledb.tsv".format(root),
      "tiledb_group": "{}/seqdataout/NSC_tiledb".format(root),
      "overwrite": True,
      "chrom_sizes": "/mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes",
      "chrom_threads": 25,
      "task_threads": 1,
      "write_threads": 1,
      "store_summits": True,
      "summit_indicator": 2
     }

ingest(args)
In [20]:
import pandas as pd
def make_kerasAC_task_tsv():
    root = "/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share"
    tiledb_taskfile = "{0}/narrow_seqdataout/task.tiledb.tsv".format(root)
    tiledb_metadata = pd.read_csv(tiledb_taskfile, header=0, sep="\t")
    tiledb_group = "{0}/narrow_seqdataout/NSC_tiledb".format(root)
    # One line per task: <tiledb group>/<dataset name>
    with open("{0}/narrow_seqdataout/task.tiledb.kerasAC.tsv".format(root), "w") as fp:
        for task_id in tiledb_metadata['dataset'].values:
            fp.write("{0}/{1}\n".format(tiledb_group, task_id))
    return None

make_kerasAC_task_tsv()
In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np 
import glob
import os
from collections import OrderedDict
import pickle
import h5py
In [7]:
labels_h5="/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/seqdataout/regressionlabels.GW.all_genome_bins_regression.hdf5"

f = h5py.File(labels_h5, 'r')
f
f.close()
In [5]:
# tmp = f['data']['table'][:]
# tmp.shape
f['data']['table']
Out[5]:
<HDF5 dataset "table": shape (54576400,), type "|V94">
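A small sketch for inspecting the compound table without materializing all ~54.6 million rows (labels_h5 as defined above):
with h5py.File(labels_h5, "r") as f5:
    tbl = f5["data"]["table"]
    print(tbl.dtype.names)   # compound field names (index block plus values block)
    print(tbl[:5])           # first few records only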
In [6]:
tmp[0]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-8b27aabe4211> in <module>
----> 1 tmp[0]

NameError: name 'tmp' is not defined
In [8]:
f = h5py.File("/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/basset_classification.0.predictions.0", "r")
preds = f['data']['table'][:]
f = h5py.File("/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/basset_classification.0.labels.0", "r")
labels = f['data']['table'][:]
In [9]:
# Each row of the table is a structured record; the second field (indexed with [1])
# holds the 10 task values, the first carries the genomic bin index.
extracted_labels = []
extracted_preds = []
for i in range(labels.shape[0]):
    extracted_labels.append(labels[i][1])
    extracted_preds.append(preds[i][1])
extracted_labels = np.asarray(extracted_labels)
extracted_preds = np.asarray(extracted_preds)
print(extracted_labels.shape, extracted_preds.shape)
(3909420, 10) (3909420, 10)
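The row-by-row loop above is correct but slow on ~3.9M rows; a vectorized sketch, assuming the second compound field (the one reached with [1] above) is the values block holding the 10 task columns:
label_field = labels.dtype.names[1]   # e.g. 'values_block_0' in pandas/PyTables layouts
pred_field = preds.dtype.names[1]
extracted_labels = np.asarray(labels[label_field])
extracted_preds = np.asarray(preds[pred_field])
print(extracted_labels.shape, extracted_preds.shape)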
In [28]:
#cur_metrics['spearmanr'][0]#.correlation or .pvalue
#cur_metrics['pearsonr'][0]
cur_metrics
#cur_metrics['spearmanr'][0]
Out[28]:
{'spearmanr': [0.4410455638221149,
  0.3487488645879994,
  0.45306054074352065,
  0.47275218862526397,
  0.4296973903470311,
  0.41053163135822945,
  0.33458336173852476,
  0.4936518914515799,
  0.3559798326655384,
  0.4606049947853931],
 'pearsonr': [0.4064456867189462,
  0.36127962892725884,
  0.3437656936692318,
  0.4072179989865825,
  0.36630756898247385,
  0.3521546423385434,
  0.27079383846132854,
  0.38370842923235327,
  0.2577950639002984,
  0.3973807049853072],
 'spearmanr_nonzerobins': [0.34525055724436526,
  0.22795586252809402,
  0.35589672517620785,
  0.3855147248474218,
  0.3282102401893469,
  0.2892236913048848,
  0.20952296914608898,
  0.41245032697907136,
  0.23902360007853785,
  0.36833284259170335],
 'pearsonr_nonzerobins': [0.28402862254453703,
  0.17336023608472312,
  0.20942555677673497,
  0.3088963030219127,
  0.24211722850104386,
  0.24621552296220006,
  0.12759460048981267,
  0.24633068013949852,
  0.16791070025725116,
  0.25487474784006253],
 'spearman.pvalue': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'pearson.pvalue': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'spearman_nonzerobins.pvalue': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'pearson_nonzerobins.pvalue': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0]}
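A sketch of what the *_nonzerobins variants presumably compute (an assumption, not the matlas implementation): per task, restrict to bins with a nonzero label before correlating.
from scipy.stats import spearmanr

def spearman_nonzero_bins(y_task, yhat_task):
    mask = y_task != 0
    return spearmanr(y_task[mask], yhat_task[mask]).correlation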
In [ ]:
from matlas.performance_metrics.performance_metrics import compute_performance

for i in range(0,10):
    compute_performance(
        root="/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share",
        splitno=i,
        model_class="clb_basset_classification",
        task_indices=[[j for j in range(10)]])
    #break
sample_N
chunk_size
predictions_pickle_to_load
performance_metrics_classification_file
performance_metrics_regression_file
performance_metrics_profile_file
tasks
task_indices
labels_hdf5
predictions_hdf5
number of datasets 1
current tasks.. [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/clb_basset_classification.0.labels.0
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/clb_basset_classification.0.predictions.0
data shape (3909420, 10) (3909420, 10)
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/accuracy.clb_basset_classification.0.tsv
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/accuracy.clb_basset_classification.0.tsv
sample_N
chunk_size
predictions_pickle_to_load
performance_metrics_classification_file
performance_metrics_regression_file
performance_metrics_profile_file
tasks
task_indices
labels_hdf5
predictions_hdf5
number of datasets 1
current tasks.. [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/clb_basset_classification.1.labels.0
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/clb_basset_classification.1.predictions.0
data shape (4870857, 10) (4870857, 10)
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/accuracy.clb_basset_classification.1.tsv
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/accuracy.clb_basset_classification.1.tsv
sample_N
chunk_size
predictions_pickle_to_load
performance_metrics_classification_file
performance_metrics_regression_file
performance_metrics_profile_file
tasks
task_indices
labels_hdf5
predictions_hdf5
number of datasets 1
current tasks.. [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/clb_basset_classification.2.labels.0
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/clb_basset_classification.2.predictions.0
data shape (3200774, 10) (3200774, 10)
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/accuracy.clb_basset_classification.2.tsv
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/accuracy.clb_basset_classification.2.tsv
sample_N
chunk_size
predictions_pickle_to_load
performance_metrics_classification_file
performance_metrics_regression_file
performance_metrics_profile_file
tasks
task_indices
labels_hdf5
predictions_hdf5
number of datasets 1
current tasks.. [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/clb_basset_classification.3.labels.0
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/clb_basset_classification.3.predictions.0
data shape (5403124, 10) (5403124, 10)
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/accuracy.clb_basset_classification.3.tsv
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/accuracy.clb_basset_classification.3.tsv
sample_N
chunk_size
predictions_pickle_to_load
performance_metrics_classification_file
performance_metrics_regression_file
performance_metrics_profile_file
tasks
task_indices
labels_hdf5
predictions_hdf5
number of datasets 1
current tasks.. [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/clb_basset_classification.4.labels.0
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/clb_basset_classification.4.predictions.0
data shape (6835684, 10) (6835684, 10)
No class threshold can give requested fdr <=:0.1
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/accuracy.clb_basset_classification.4.tsv
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/accuracy.clb_basset_classification.4.tsv
sample_N
chunk_size
predictions_pickle_to_load
performance_metrics_classification_file
performance_metrics_regression_file
performance_metrics_profile_file
tasks
task_indices
labels_hdf5
predictions_hdf5
number of datasets 1
current tasks.. [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/clb_basset_classification.5.labels.0
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/clb_basset_classification.5.predictions.0
data shape (5210997, 10) (5210997, 10)
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/accuracy.clb_basset_classification.5.tsv
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/accuracy.clb_basset_classification.5.tsv
sample_N
chunk_size
predictions_pickle_to_load
performance_metrics_classification_file
performance_metrics_regression_file
performance_metrics_profile_file
tasks
task_indices
labels_hdf5
predictions_hdf5
number of datasets 1
current tasks.. [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/clb_basset_classification.6.labels.0
/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share/gw_predictions/multi_tasks/clb_basset_classification.6.predictions.0
data shape (7220868, 10) (7220868, 10)
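The "No class threshold can give requested fdr <=:0.1" message above means that, for that fold/task, no probability cutoff reaches the requested precision (FDR = 1 - precision = 0.1). A minimal sketch of this kind of recall-at-FDR threshold search, with labels and probs as placeholder names for 1-D arrays of binary labels and predicted probabilities; this is an illustration, not the pipeline's own implementation:

import numpy as np
from sklearn.metrics import precision_recall_curve

def threshold_at_fdr(labels, probs, fdr=0.1):
    # precision/recall have one more entry than thresholds
    precision, recall, thresholds = precision_recall_curve(labels, probs)
    ok = np.where(precision[:-1] >= 1.0 - fdr)[0]
    if len(ok) == 0:
        print("No class threshold can give requested fdr <=:", fdr)
        return None
    first = ok[0]  # lowest threshold whose precision satisfies the FDR constraint
    return thresholds[first], recall[first]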
In [ ]:
from matlas.performance_metrics.performance_metrics import compute_calibration

for fold in range(0,10):
    for ti in range(0, 10):
        compute_calibration(
            root="/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share",
            splitno=fold,
            task_idx=ti,
            model_class="clb_basset_classification",
            visiblegpus='4'
        )
    #break
Using TensorFlow backend.
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.121271046472429 ; intercept: -1.2085032348145983
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
69/69 [==============================] - 61s 891ms/step
break? 100000 69000 68576
got embeddings
got region labels
created data generator from 0
69/69 [==============================] - 57s 820ms/step
break? 100000 69000 68576
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.4174898733727166 ; intercept: -0.9923843362210041
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
21/21 [==============================] - 17s 814ms/step
break? 100000 21000 20237
got embeddings
got region labels
created data generator from 0
21/21 [==============================] - 17s 808ms/step
break? 100000 21000 20237
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.2171880946948614 ; intercept: -0.9459838935658006
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
70/70 [==============================] - 58s 833ms/step
break? 100000 70000 69025
got embeddings
got region labels
created data generator from 0
70/70 [==============================] - 57s 821ms/step
break? 100000 70000 69025
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.2219649751618566 ; intercept: -1.1208110846744441
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
89/89 [==============================] - 74s 833ms/step
break? 100000 89000 88383
got embeddings
got region labels
created data generator from 0
89/89 [==============================] - 73s 822ms/step
break? 100000 89000 88383
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.1420844213210177 ; intercept: -1.3721454263625397
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
70/70 [==============================] - 59s 842ms/step
break? 100000 70000 69525
got embeddings
got region labels
created data generator from 0
70/70 [==============================] - 57s 818ms/step
break? 100000 70000 69525
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.1697864702505727 ; intercept: -1.372736228379726
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
72/72 [==============================] - 60s 838ms/step
break? 100000 72000 71458
got embeddings
got region labels
created data generator from 0
72/72 [==============================] - 59s 819ms/step
break? 100000 72000 71458
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.2511964903893593 ; intercept: -1.5349071149479925
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
29/29 [==============================] - 24s 831ms/step
break? 100000 29000 28280
got embeddings
got region labels
created data generator from 0
29/29 [==============================] - 24s 828ms/step
break? 100000 29000 28280
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.193829201042929 ; intercept: -1.40429267157284
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
92/92 [==============================] - 75s 818ms/step
break? 100000 92000 91406
got embeddings
got region labels
created data generator from 0
92/92 [==============================] - 75s 817ms/step
break? 100000 92000 91406
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.1630629283240554 ; intercept: -1.1265870309572672
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
75/75 [==============================] - 64s 847ms/step
break? 100000 75000 74840
got embeddings
got region labels
created data generator from 0
75/75 [==============================] - 62s 826ms/step
break? 100000 75000 74840
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.2640025336555025 ; intercept: -1.2451713381729304
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
71/71 [==============================] - 60s 839ms/step
break? 100000 71000 70705
got embeddings
got region labels
created data generator from 0
71/71 [==============================] - 59s 827ms/step
break? 100000 71000 70705
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.3275014785743666 ; intercept: -0.7275394084297472
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
69/69 [==============================] - 57s 832ms/step
break? 100000 69000 68576
got embeddings
got region labels
created data generator from 0
69/69 [==============================] - 57s 828ms/step
break? 100000 69000 68576
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.4672692683413568 ; intercept: -0.8435644509545636
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
21/21 [==============================] - 18s 837ms/step
break? 100000 21000 20237
got embeddings
got region labels
created data generator from 0
21/21 [==============================] - 18s 834ms/step
break? 100000 21000 20237
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.3003715497817927 ; intercept: -0.5156272232912695
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
70/70 [==============================] - 57s 820ms/step
break? 100000 70000 69025
got embeddings
got region labels
created data generator from 0
70/70 [==============================] - 57s 819ms/step
break? 100000 70000 69025
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.3496015397903216 ; intercept: -0.5537164560259934
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
89/89 [==============================] - 73s 824ms/step
break? 100000 89000 88383
got embeddings
got region labels
created data generator from 0
89/89 [==============================] - 74s 828ms/step
break? 100000 89000 88383
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.303882569245345 ; intercept: -0.76673688152483
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
70/70 [==============================] - 58s 826ms/step
break? 100000 70000 69525
got embeddings
got region labels
created data generator from 0
70/70 [==============================] - 58s 826ms/step
break? 100000 70000 69525
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.4041731762374405 ; intercept: -0.7699644573780745
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
72/72 [==============================] - 59s 826ms/step
break? 100000 72000 71458
got embeddings
got region labels
created data generator from 0
72/72 [==============================] - 60s 831ms/step
break? 100000 72000 71458
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.3060030543385184 ; intercept: -1.0344009827507235
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
29/29 [==============================] - 25s 845ms/step
break? 100000 29000 28280
got embeddings
got region labels
created data generator from 0
29/29 [==============================] - 24s 839ms/step
break? 100000 29000 28280
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.2575246465170207 ; intercept: -0.7804520194177546
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
92/92 [==============================] - 76s 824ms/step
break? 100000 92000 91406
got embeddings
got region labels
created data generator from 0
92/92 [==============================] - 76s 825ms/step
break? 100000 92000 91406
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.236979757773599 ; intercept: -0.5080807961298984
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
75/75 [==============================] - 63s 836ms/step
break? 100000 75000 74840
got embeddings
got region labels
created data generator from 0
75/75 [==============================] - 63s 839ms/step
break? 100000 75000 74840
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.496646485008547 ; intercept: -0.5335662785143973
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
71/71 [==============================] - 59s 833ms/step
break? 100000 71000 70705
got embeddings
got region labels
created data generator from 0
71/71 [==============================] - 60s 838ms/step
break? 100000 71000 70705
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.2361770433512158 ; intercept: -0.7238622535001847
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
69/69 [==============================] - 58s 841ms/step
break? 100000 69000 68576
got embeddings
got region labels
created data generator from 0
69/69 [==============================] - 58s 836ms/step
break? 100000 69000 68576
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.3687618276909113 ; intercept: -0.31983214600962845
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
21/21 [==============================] - 19s 887ms/step
break? 100000 21000 20237
got embeddings
got region labels
created data generator from 0
21/21 [==============================] - 18s 868ms/step
break? 100000 21000 20237
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.3302395711684432 ; intercept: -0.5364984449050153
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
70/70 [==============================] - 58s 832ms/step
break? 100000 70000 69025
got embeddings
got region labels
created data generator from 0
70/70 [==============================] - 58s 835ms/step
break? 100000 70000 69025
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.3601312342160434 ; intercept: -0.8183738022947286
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
89/89 [==============================] - 75s 841ms/step
break? 100000 89000 88383
got embeddings
got region labels
created data generator from 0
89/89 [==============================] - 74s 832ms/step
break? 100000 89000 88383
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.3240848700633492 ; intercept: -0.7150127219973831
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
70/70 [==============================] - 59s 844ms/step
break? 100000 70000 69525
got embeddings
got region labels
created data generator from 0
70/70 [==============================] - 59s 838ms/step
break? 100000 70000 69525
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.3644826547728204 ; intercept: -0.9217086883222038
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
72/72 [==============================] - 61s 848ms/step
break? 100000 72000 71458
got embeddings
got region labels
created data generator from 0
72/72 [==============================] - 60s 840ms/step
break? 100000 72000 71458
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.4409996881234006 ; intercept: -0.7335805004332382
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
29/29 [==============================] - 25s 858ms/step
break? 100000 29000 28280
got embeddings
got region labels
created data generator from 0
29/29 [==============================] - 25s 863ms/step
break? 100000 29000 28280
got embeddings
got region labels
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Platt scaling coef: 1.2703785936811487 ; intercept: -0.9036099614784485
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
created data generator from 0
61/92 [==================>...........] - ETA: 28s
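The Platt scaling coefficients and intercepts logged above come from fitting a one-dimensional logistic regression to held-out pre-activations (the repeated sklearn FutureWarning points at LogisticRegression). A minimal sketch of that mapping, with preacts and labels as placeholder names for 1-D NumPy arrays; this is an illustration, not the compute_calibration implementation itself:

import numpy as np
from sklearn.linear_model import LogisticRegression

def fit_platt(preacts, labels):
    # Learn sigmoid(coef * z + intercept) on held-out pre-activations z
    lr = LogisticRegression(solver='lbfgs').fit(preacts.reshape(-1, 1), labels)
    coef, intercept = lr.coef_[0, 0], lr.intercept_[0]
    print("Platt scaling coef:", coef, "; intercept:", intercept)
    return lambda z: 1.0 / (1.0 + np.exp(-(coef * z + intercept)))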
In [42]:
root = "/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share"
# for j in range(10):
#     labs = pd.read_hdf("{0}/gw_predictions/multi_tasks/clb_basset_classification.{1}.labels.0".format(root, j))
#     preacts = pd.read_hdf("{0}/gw_predictions/multi_tasks/clb_basset_classification.{1}.predictions.0".format(root, j))
#     break
    
# labs / preacts are assumed to come from the commented read_hdf block above (run earlier in the session)
for i in range(10):
    indices = np.logical_not(labs[i].isnull().values)  # keep only rows with a defined label for task i
    labs_ = np.array(labs.loc[indices, i])
    preacts_ = np.array(preacts.loc[indices, i])
    break  # debug break: only task 0 is processed here
In [45]:
from abstention.calibration import PlattScaling
calibration_func = PlattScaling()(valid_preacts=preacts_, valid_labels=labs_)
calibrated_predictions = calibration_func(preacts_)
print(calibrated_predictions.shape, preacts_.shape)

import collections
counter = collections.Counter(labs_)
print(counter)
Counter({0.0: 3882174, 1.0: 18482})
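The Counter above shows how imbalanced the task is: roughly 0.47% of the ~3.9M genome-wide regions are labeled positive, which is why the cells below work with auPRC and recall-at-FDR thresholds rather than accuracy. A quick check using the counter printed above:

n_pos, n_neg = counter[1.0], counter[0.0]
print(n_pos / (n_pos + n_neg))  # ~0.0047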
In [49]:
from matlas.model_test import get_keras_model
# model_h5 = "{0}/gw_models/multi_tasks/basset_classification.{1}.hdf5".format(root, 0)
# visiblegpus = '6'
# model = get_keras_model(model_h5, visiblegpus, loss_function=None)
# model
from keras.models import Model
# Expose the layer before the final sigmoid so calibration can be applied to pre-activations;
# `model` is assumed to be loaded by the commented get_keras_model call above.
preact_model = Model(inputs=model.input, outputs=model.layers[-2].output)
In [60]:
from matlas.aitac_motifs import args_object_from_args_dict, get_embeddings
i=0
input_bed_file = "{0}/gw_peaks/task_{1}.bed.gz".format(root, i)
args = args_object_from_args_dict(
        {'input_bed_file': input_bed_file,
         'ref_fasta': "/srv/scratch/msharmin/metadata/mm10.genome.fa",
         'batch_size': 1000, 
         'expand_dims': False,  # use True for a conv2d model
         'center_on_summit': False,  # use True for direct pipeline output of a narrowPeak file
         'flank': 0,  # flank size added around each interval
         'max_queue_size': 100,
         'threads':1,
         'num_rows': 100000
        })

bed_entries, model_predictions = get_embeddings(args, model)
bed_entries, preact_predictions = get_embeddings(args, preact_model)
preact_predictions.shape, model_predictions.shape
Out[60]:
((68576, 10), (68576, 10))
In [55]:
# predict the peak regions and apply the calibration function for task i
calibrated_test_predictions = calibration_func(model_predictions[:,i])
calibrated_peak_predictions = calibration_func(preact_predictions[:,i])
calibrated_peak_predictions.shape, preact_predictions.shape, np.expand_dims(calibrated_peak_predictions,axis=1).shape
Out[55]:
((68576,), (68576, 10), (68576, 1))
In [57]:
calibrated_predictions[:5], preact_predictions[:5], model_predictions[:5]
Out[57]:
(array([0.88629781, 0.70279411, 0.19890942, 0.03892881, 0.00202688]),
 array([[ 2.9091749e+00, -5.0181645e-01,  2.3916907e+00,  3.2057817e+00,
          2.6579347e+00,  2.7888513e+00,  1.3204409e-01,  2.9382834e+00,
          3.1644275e+00,  2.6841795e+00],
        [ 1.8453541e+00, -7.6490635e-01,  5.1553845e-01,  1.6436713e+00,
          1.6717912e+00,  1.9018229e+00, -2.4845228e-03,  7.3504663e-01,
          1.3388907e+00,  1.4956279e+00],
        [-1.6465351e-01, -4.5397019e+00, -2.6135215e-01,  7.0207953e-01,
         -2.9866147e-01, -6.0516030e-01, -3.5275211e+00,  6.0555738e-01,
          2.5613332e-01, -5.0707042e-01],
        [-1.7817374e+00, -5.0030932e+00, -6.8107897e-01, -3.8478777e-01,
         -1.7622745e+00, -2.3677373e+00, -4.5118046e+00, -5.7332642e-02,
         -1.0893991e+00, -1.6730957e+00],
        [-4.4509544e+00, -3.9125521e+00, -4.6628041e+00, -3.9018474e+00,
         -4.4397964e+00, -3.8568933e+00, -3.7874746e+00, -4.5909920e+00,
         -5.5414820e+00, -4.0989108e+00]], dtype=float32),
 array([[0.9482981 , 0.3771139 , 0.91619146, 0.9610513 , 0.93449837,
         0.94207036, 0.53296316, 0.9497068 , 0.95947343, 0.9360866 ],
        [0.8635807 , 0.31758198, 0.62610394, 0.83803385, 0.84181446,
         0.87009776, 0.49937886, 0.6759117 , 0.7923075 , 0.8169215 ],
        [0.45892936, 0.0105638 , 0.43503135, 0.66864866, 0.42588472,
         0.353164  , 0.02853923, 0.6469267 , 0.56368554, 0.37588054],
        [0.14408873, 0.00667232, 0.33602053, 0.40497267, 0.1465057 ,
         0.08566621, 0.01085941, 0.48567075, 0.25173146, 0.15801188],
        [0.01153287, 0.01959768, 0.00935168, 0.01980441, 0.01166076,
         0.02069617, 0.02215096, 0.01004095, 0.0039054 , 0.01631998]],
       dtype=float32))
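As a sanity check, the last two arrays above are related by the final sigmoid: applying 1/(1 + exp(-z)) to the pre-activations reproduces the model predictions (e.g. sigmoid(2.909) ≈ 0.948). A quick verification using the variables from In [60], assuming the model's output layer is a sigmoid (which the numbers above support):

recovered = 1.0 / (1.0 + np.exp(-preact_predictions))
print(np.allclose(recovered, model_predictions, atol=1e-4))  # expected: True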
In [85]:
from matlas.model_layer import get_accuracy_threshold
# The auPRC from get_accuracy_threshold should match the auPRC computed on the raw predictions.
# calibrated_threshold, calibrated_recall, auPRC = get_accuracy_threshold(labs_.astype(int), preacts_)
# calibrated_threshold, calibrated_recall, auPRC  (the values written below come from this call)
calibration_info_file = "{0}/gw_peaks/calibration_info_task_{1}_{2}.txt".format(root, j, i)
with open(calibration_info_file, 'w') as fp:
    fp.write("calibrated_threshold\tcalibrated_recall\tauprc")
    fp.write("\n")
    fp.write("{}\t{}\t{}".format(calibrated_threshold[0], calibrated_recall[0], auPRC))
In [80]:
# summit_df = pd.read_csv(input_bed_file, header=None, sep="\t")  # the summit_df used below comes from this load
# bed_df = pd.DataFrame(list(map(list, bed_entries)))
# bed_df.equals(summit_df.loc[:,:2])
bed_entries.shape, summit_df.shape, preact_predictions.shape, calibrated_predictions.shape

# summit_df[4] = preact_predictions[:,i]
# summit_df[5] = calibrated_predictions
calibrated_bed_file = "{0}/gw_peaks/calibrated_task_{1}_{2}.bed.gz".format(root, j, i)
summit_df.to_csv(calibrated_bed_file, header=False, index=False, sep="\t", compression='gzip')
In [3]:
from matlas.utils import get_calibrated_peaks
root = "/mnt/lab_data/kundaje/users/msharmin/NSC_ATAC_PEAKS_to_share"
# calibrated_data = get_calibrated_peaks(calibrated_bed_file, calibration_info_file)
j = 0; i=0
deeplift_hdf = "{0}/gw_deeplift/multi_tasks/task_{1}.{2}/summit.h5".format(root, j, i)
# with h5py.File(deeplift_hdf, "r") as fp:
#     print(list(fp))
#     nhyp_scores = fp['deeplift_scores'][:] #NX1000X4
#     one_hot = fp['inputs'][:] #NX1000X4
#     shuffled_onehot = fp['shuffled_inputs'][:]
    
# nhyp_scores / one_hot / shuffled_onehot are assumed to come from the commented h5py block above
if len(nhyp_scores.shape) == 4:
    hyp_scores = nhyp_scores[:, i, :, :]  # per-task scores: select task i
else:
    hyp_scores = nhyp_scores
nhyp_scores.shape, hyp_scores.shape, one_hot.shape, shuffled_onehot.shape
Out[3]:
((68576, 1000, 4), (68576, 1000, 4), (68576, 1000, 4), (68576, 10, 1000, 4))
In [4]:
scores = np.multiply(hyp_scores, one_hot)
print('h', scores.shape)
h (68576, 1000, 4)
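To make the masking step concrete, here is a toy example (values invented for illustration): at each position, only the hypothetical score of the base that is actually present survives the elementwise product, giving the actual contribution scores plotted below.

hyp = np.array([[0.2, -0.1, 0.5, 0.0]])    # hypothetical scores for A, C, G, T at one position
onehot = np.array([[0.0, 0.0, 1.0, 0.0]])  # the observed base is G
print(np.multiply(hyp, onehot))            # [[0.  0.  0.5 0. ]] -- only G's score remains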
In [7]:
from modisco.visualization import viz_sequence
for i in [0,1,2]:
    print(i)
    viz_sequence.plot_weights(hyp_scores[i, 400:600,:], subticks_frequency=10, figsize=(20,1))
    viz_sequence.plot_weights(scores[i, 400:600,:], subticks_frequency=10, figsize=(20,1))
    viz_sequence.plot_weights(one_hot[i, 400:600,:], subticks_frequency=10, figsize=(20,1))
    
0
1
2
In [132]:
scores = np.multiply(hyp_scores, one_hot)
# Mean-normalize hypothetical scores across the four bases at each position
mean_hyp_scores = hyp_scores - np.mean(hyp_scores, axis=-1)[:, :, None]
# Per-position scores summed over the base axis of the shuffled (dinucleotide) references
dl_hyp_scores = np.sum(np.multiply(hyp_scores, shuffled_onehot), axis=2)
null_seq_no = 20000
# Sample 20,000 rows (with replacement) to build a null distribution of per-position importance
nulldist_perposimp = [dl_hyp_scores[i[0], :] for i in np.random.randint(dl_hyp_scores.shape[0], size=(null_seq_no, 1))]
mean_hyp_scores.shape, dl_hyp_scores.shape, len(nulldist_perposimp)
Out[132]:
((68576, 1000, 4), (68576, 1000), 20000)
In [ ]:
from matplotlib import pylab as plt
#plt.scatter(predictions.reshape((-1, 1)), logits.reshape((-1, 1)), marker='.')
# plt.hist(nulldist_perposimp)
print('aa')
In [109]:
(calibrated_data['indices'])
Out[109]:
array([    0,     1,     2, ..., 68572, 68573, 68575])
In [149]:
from matlas.model_layer import retrieve_sequences, one_hot_encode_along_col_axis
from matlas.dlutils import write_deeplift_track, get_shuffled_seqs, get_given_seq_ref_function
from deeplift.dinuc_shuffle import dinuc_shuffle
from matlas.deeplift_run import retrieve_func_from_model
# summitfile = "{}/gw_peaks/test.bed.gz".format(root)
# num_refs_per_seq = 10
# sequences, intervals_wo_flanks = retrieve_sequences(
#     summitfile, 
#     fasta_file="/srv/scratch/msharmin/metadata/mm10.genome.fa", 
#     flank_size=0)

# input_data_list, input_references_list = get_shuffled_seqs(
#     sequences, num_refs_per_seq, shuffle_func=dinuc_shuffle,
#     one_hot_func=lambda x: np.array([one_hot_encode_along_col_axis(seq) for seq in x]),
#     progress_update=10000)

# model_h5 = "{0}/gw_models/multi_tasks/basset_classification.{1}.hdf5".format(root, 0)
# from matlas.utils import get_logger
# logger = get_logger('run-deeplift')
# contrib_funcs, input_layer_shape = retrieve_func_from_model(
#     model_h5, algorithm="rescale_conv_revealcancel_fc", sequential=False, 
#     w0=None, w1=None, logger=logger)

# shuffled_score_funcs = {input_name: get_given_seq_ref_function(score_computation_function=score_func)
#                         for input_name, score_func in contrib_funcs.items()}

# batch_size=256
# for input_name, score_func in shuffled_score_funcs.items():
#     hyp_scores = None
#     b = 10000
#     c = int(np.ceil(1.0*len(input_data_list[0])/b))
#     for si in range(c):
#         if(si==c-1):
#             tmp = score_func(task_idx=0, input_data_list=[input_data_list[0][si*b:len(input_data_list[0])]],
#                                input_references_list=[input_references_list[0][si*b:len(input_data_list[0])]],
#                                num_refs_per_seq=num_refs_per_seq, batch_size=batch_size,
#                                progress_update=10000)
#         else:
#             #print('batch: ', si, si*b, (si+1)*b) 
#             tmp = score_func(task_idx=0, input_data_list=[input_data_list[0][si*b:(si+1)*b]],
#                                input_references_list=[input_references_list[0][si*b:(si+1)*b]],
#                                num_refs_per_seq=num_refs_per_seq, 
#                                batch_size=batch_size,
#                                progress_update=10000)

# hyp_scores = tmp
# input_data_list[0] = np.squeeze(input_data_list[0]); print(input_data_list[0].shape)
# input_references_list[0] = np.squeeze(input_references_list[0]); print(input_references_list[0].shape)
# one_hot = input_data_list[0][[range(0, len(input_data_list[0]), num_refs_per_seq)]]; print(one_hot.shape)
# shuffled_onehot = input_references_list[0].reshape((one_hot.shape[0], num_refs_per_seq, 
#                                                    input_references_list[0].shape[-2], #seq_len
#                                                    input_references_list[0].shape[-1]))#alphabet 

scores = np.multiply(hyp_scores, one_hot)
print(scores.shape)
(60, 1000, 4)
In [151]:
from modisco.visualization import viz_sequence
for i in calibrated_data['indices'][:2]:
    print(i)
    viz_sequence.plot_weights(hyp_scores[i, 400:600,:], subticks_frequency=10, figsize=(20,1))
    viz_sequence.plot_weights(scores[i, 400:600,:], subticks_frequency=10, figsize=(20,1))
    viz_sequence.plot_weights(one_hot[i, 400:600,:], subticks_frequency=10, figsize=(20,1))
    
0
1
In [34]:
import collections
from matlas.model_layer import get_accuracy_threshold
# counter = collections.Counter(labs[0].values)
# print(counter)
# df = pd.DataFrame({'Date': [0, 0, 0, 1, 1, np.nan], 'val': [0, 1, 2, 3, 4, 5]}, index=['a', 'b', 'c', 'd', 'e', 'f']); 
# thresholds, recalls, auprcs = [], [], []
for i in range(10):
    indices = np.logical_not(labs[i].isnull().values)  # keep only rows with a defined label for task i
    labs_ = labs.loc[indices, i]
    preacts_ = preacts.loc[indices, i]
    break  # debug break left in: only task 0 is processed; the lines below are skipped
    calibrated_threshold, calibrated_recall, auPRC = get_accuracy_threshold(labs_.astype(int), preacts_)
    thresholds.append(calibrated_threshold[0])
    recalls.append(calibrated_recall[0])
    auprcs.append(auPRC)
    #print(i, calibrated_threshold, calibrated_recall, auPRC)
    
# df = pd.DataFrame({'threshold': thresholds, 'recall': recalls, 'auprc': auprcs})
df
# preds is assumed to hold the genome-wide per-task predictions loaded earlier in the session
for i in range(10):
    print(i, np.sum(preds[i].values > df['threshold'].values[i]))
0 14765
1 2683
2 17064
3 23929
4 12333
5 14306
6 3086
7 23146
8 19327
9 11137
In [ ]:
print('....')
In [ ]:
from matplotlib import pylab as plt
#plt.scatter(predictions.reshape((-1, 1)), logits.reshape((-1, 1)), marker='.')
plt.scatter(preds, labels, marker='.')
plt.xlabel('prediction')
plt.ylabel('label')
In [ ]: