In [4]:
from basepair.modisco.results import ModiscoResult
from basepair.config import get_data_dir
from basepair.utils import read_json
from basepair.plot.vdom import vdom_modisco
from kipoi.readers import HDF5Reader
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from plotnine import *
import pickle
import warnings
warnings.filterwarnings("ignore")
from basepair.plot.vdom import fig2vdom, vdom_pssm
from vdom.helpers import (h1, p, li, img, div, b, br, ul, img,
                          details, summary,
                          table, thead, th, tr, tbody, td, ol)
from collections import OrderedDict
Using TensorFlow backend.
In [ ]:
modisco_dir = "/srv/scratch/msharmin/mouse_hem/with_tfd/full_mouse50/dlmodisco_old/task_0-naivegw"
mr = ModiscoResult(f"{modisco_dir}/results.hdf5")
mr.open()
mc_stat = mr.metacluster_stats()
print("Total Number of seqlets")
print(sum(mc_stat['n']))
#vdom_pattern

vdom_to_figs = collect_vdoms_pattern(mr)
In [34]:
def collect_vdoms_pattern(mr):
    patterns = mr.patterns()
    figs = OrderedDict()
    vdom_to_figs = OrderedDict()
    for metacluster in mr.metaclusters():
        meta_patterns = mr.patterns(metacluster=metacluster)
        for pattern in meta_patterns:
            fullpattern_name = "{0}/{1}".format(metacluster, pattern)
            figs[fullpattern_name] = mr.plot_pattern(metacluster, pattern)
            vdom_to_figs[fullpattern_name] = [fig2vdom(fig) for fig in figs[fullpattern_name]]
            #break
    return vdom_to_figs

def get_data(task_idx):
    root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/dlmodisco_old"
    tfd_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
    srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd/full_mouse50/dlmodisco_old"

    cisbpfile = "{0}/cisbp_id_tf_map.p".format(tfd_root)
    cisbp_map = pickle.load(open(cisbpfile, 'rb'))

    modisco_dir = "{0}/task_{1}-naivegw".format(srv_root, task_idx)
    mr = ModiscoResult(f"{modisco_dir}/results.hdf5")
    mr.open()
    
    mapfile = os.path.join(root, "task_{0}-naivegw".format(task_idx), "modisco_cisbp_match.p")
    modisco_map = pickle.load(open(mapfile, 'rb'))
    
    return mr, modisco_map, cisbp_map


def display_results(sample_name, mr, vdom_to_figs, modisco_map, cisbp_map, importance = True):
    
    patterns = mr.patterns()
    vdom_items = OrderedDict()
    metaclusters = mr.metaclusters()
    if(importance==True):
        metacluster = 'metacluster_0'
    else:
        metacluster = 'metacluster_1'
    #for metacluster in mr.metaclusters():
    vdom_items[metacluster] = []
    meta_patterns = mr.patterns(metacluster=metacluster)
    n_patterns = len(meta_patterns)
    n_seqlets = sum([mr.n_seqlets(metacluster, pattern)
                     for pattern in meta_patterns])
    for pattern in meta_patterns:
        fullpattern_name = "{0}/{1}".format(metacluster, pattern)
        tfnames = []
        for match in modisco_map[fullpattern_name]:
            if(match['q-value'] >= 0.01):
                continue
            tfname = cisbp_map[match['Target ID']].split('_')[0]
            if('(' in tfname):
                tfname = tfname.split('(')[1].split(')')[0]
            tfnames.append(tfname)
        #print(patterns[i], tfnames)
        n = mr.n_seqlets(metacluster, pattern)
        #mr.plot_pssm(metacluster, pattern, title="{0}, #seqlets: {1}, cisbp_match: {2}".format(patterns[i], n, tfnames))
        trimmed_motif = vdom_pssm(mr.get_pssm(metacluster, pattern, rc=False, trim_frac=0.08), letter_width=0.15, height=0.5)
        #full_motif = vdom_pssm(mr.get_pssm(metacluster, pattern, rc=False, trim_frac=0.0), letter_width=0.15, height=0.5)
        vdom_items[metacluster].append(details(summary(pattern, f": # seqlets: {n}" f" {tfnames}", trimmed_motif),
                                               details(summary("Sequence"), vdom_to_figs[fullpattern_name][0], ),
                                               details(summary("Contrib Scores"), vdom_to_figs[fullpattern_name][1], ),
                                               details(summary("Hyp_Contrib Scores"), vdom_to_figs[fullpattern_name][2], ),
                                               id=fullpattern_name
                                              )
                                      )
        #display(details(summary(pattern, f": # seqlets: {n}", trimmed_motif)))
        #break
    display(details(summary(b(metacluster), f", # patterns: {n_patterns},"
                            f" # seqlets: {n_seqlets}, "
                               "important for: ", b("{}".format(sample_name))),
                       ul([li(pattern) for pattern in vdom_items[metacluster]], start=0),
                       id=metacluster,
                       open=True))
    return None
In [21]:
#display_results(task_idx=0)
sample_name = 'LNG;A;CL'
task_idx = 54
mr, modisco_map, cisbp_map = get_data(task_idx)
vdom_to_figs = collect_vdoms_pattern(mr)
In [36]:
display_results(sample_name, mr, vdom_to_figs, modisco_map, cisbp_map, importance=True)
metacluster_0, # patterns: 26, # seqlets: 30196, important for: LNG;A;CL
  • pattern_0: # seqlets: 7873 ['Sp3', 'Sp5', 'Klf16', 'Sp2', 'Klf5', 'Klf14', 'Sp1', 'Klf2', 'Klf7', 'Wt1', 'Egr4', 'Sp8', 'Klf8', 'Klf4', 'Klf12', 'Sp4', 'Klf6', 'Klf13', 'Zfp148', 'Egr2', 'Klf1']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_1: # seqlets: 3771 ['Gabpa', 'Erf', 'Erg', 'Nr2c2', 'Ets1', 'Elf2', 'Etv5', 'Etv6', 'Ehf', 'Elf4', 'Ets2', 'Elk1', 'Elk3', 'Elf5', 'Gm5454', 'Etv1', 'Etv4', 'Elf1', 'Fli1', 'Etv3', 'Elk4', 'Gm4881', 'Elf3', 'Sfpi1', 'Fev', 'Etv2']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_2: # seqlets: 2627 ['Ctcf', 'Ctcfl']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_3: # seqlets: 2390 ['Pbx3', 'Foxi1', 'Nfya']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_4: # seqlets: 1786 []
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_5: # seqlets: 1428 ['Mbtps2', 'Yy1']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_6: # seqlets: 1278 ['Cebpg', 'Cebpb', 'Cebpd', 'Cebpe']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_7: # seqlets: 1134 []
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_8: # seqlets: 1081 []
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_9: # seqlets: 1003 ['Creb3', '9430076C15Rik', 'Jdp2', 'Atf7', 'Atf2', 'Batf3', 'Creb1', 'Crem', 'Fosl2']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_10: # seqlets: 937 ['Nfia', 'Nfix', 'Nfib']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_11: # seqlets: 927 []
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_12: # seqlets: 717 ['Smarcc2', 'Tbx2', 'Zfp143', 'Zfp523', 'Six5']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_13: # seqlets: 625 ['Tcfeb', 'Bach2', 'Tcfec', 'Mitf', 'Tcfe3', 'Usf2', 'Usf1', 'Arntl']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_14: # seqlets: 517 ['Bach1', 'Nfe2l2', 'Nfe2', 'Fosb', 'Jund']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_15: # seqlets: 404 ['Nkx2-6', 'Nkx2-4', 'Nkx2-3', 'Nkx2-1', 'Nkx3-1', 'Nkx2-9', 'Nkx3-2']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_16: # seqlets: 344 ['Sp3', 'Zbtb7a', 'Sp2', 'E2f1', 'Tcfap2d', 'Zfx']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_17: # seqlets: 228 ['Rfx1', 'Rfx2', 'Arid2', 'Rfx4', 'Rfx7']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_18: # seqlets: 211 []
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_19: # seqlets: 185 ['Sp3', 'Zbtb7a', 'Sp2', 'Tcfap2d', 'Zfx', 'E2f1']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_20: # seqlets: 149 ['Rfx8', 'Rfx3', 'Rfx5', 'Rfx7']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_21: # seqlets: 148 []
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_22: # seqlets: 155 ['Mef2d', 'Mef2b', 'Mef2c', 'Mef2a']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_23: # seqlets: 113 ['Grhl1']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_24: # seqlets: 98 ['Sp3', 'Zbtb7a', 'Mbd2', 'E2f1']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_25: # seqlets: 67 ['Zeb1', 'Sp3']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
In [35]:
display_results(sample_name, mr, vdom_to_figs, modisco_map, cisbp_map, importance=False)
metacluster_1, # patterns: 0, # seqlets: 0, important for: LNG;A;CL