In [4]:
from basepair.modisco.results import ModiscoResult
from basepair.config import get_data_dir
from basepair.utils import read_json
from basepair.plot.vdom import vdom_modisco
from kipoi.readers import HDF5Reader
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from plotnine import *
import pickle
import warnings
warnings.filterwarnings("ignore")
from basepair.plot.vdom import fig2vdom, vdom_pssm
from vdom.helpers import (h1, p, li, img, div, b, br, ul, img,
                          details, summary,
                          table, thead, th, tr, tbody, td, ol)
from collections import OrderedDict
Using TensorFlow backend.
In [ ]:
modisco_dir = "/srv/scratch/msharmin/mouse_hem/with_tfd/full_mouse50/dlmodisco_old/task_0-naivegw"
mr = ModiscoResult(f"{modisco_dir}/results.hdf5")
mr.open()
mc_stat = mr.metacluster_stats()
print("Total Number of seqlets")
print(sum(mc_stat['n']))
#vdom_pattern

vdom_to_figs = collect_vdoms_pattern(mr)
In [38]:
def collect_vdoms_pattern(mr):
    patterns = mr.patterns()
    figs = OrderedDict()
    vdom_to_figs = OrderedDict()
    for metacluster in mr.metaclusters():
        meta_patterns = mr.patterns(metacluster=metacluster)
        for pattern in meta_patterns:
            fullpattern_name = "{0}/{1}".format(metacluster, pattern)
            figs[fullpattern_name] = mr.plot_pattern(metacluster, pattern)
            vdom_to_figs[fullpattern_name] = [fig2vdom(fig) for fig in figs[fullpattern_name]]
            #break
    return vdom_to_figs

def get_data(task_idx):
    root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/dlmodisco_old"
    tfd_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd"
    srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd/full_mouse50/dlmodisco_old"

    cisbpfile = "{0}/cisbp_id_tf_map.p".format(tfd_root)
    cisbp_map = pickle.load(open(cisbpfile, 'rb'))

    modisco_dir = "{0}/task_{1}-naivegw".format(srv_root, task_idx)
    mr = ModiscoResult(f"{modisco_dir}/results.hdf5")
    mr.open()
    
    mapfile = os.path.join(root, "task_{0}-naivegw".format(task_idx), "modisco_cisbp_match.p")
    modisco_map = pickle.load(open(mapfile, 'rb'))
    
    return mr, modisco_map, cisbp_map


def display_results(sample_name, mr, vdom_to_figs, modisco_map, cisbp_map, importance = True):
    
    patterns = mr.patterns()
    vdom_items = OrderedDict()
    metaclusters = mr.metaclusters()
    if(importance==True):
        metacluster = 'metacluster_0'
        txt = 'positive scores'
    else:
        metacluster = 'metacluster_1'
        txt = 'negative scores'
    #for metacluster in mr.metaclusters():
    vdom_items[metacluster] = []
    meta_patterns = mr.patterns(metacluster=metacluster)
    n_patterns = len(meta_patterns)
    n_seqlets = sum([mr.n_seqlets(metacluster, pattern)
                     for pattern in meta_patterns])
    for pattern in meta_patterns:
        fullpattern_name = "{0}/{1}".format(metacluster, pattern)
        tfnames = []
        for match in modisco_map[fullpattern_name]:
            if(match['q-value'] >= 0.01):
                continue
            tfname = cisbp_map[match['Target ID']].split('_')[0]
            if('(' in tfname):
                tfname = tfname.split('(')[1].split(')')[0]
            tfnames.append(tfname)
        #print(patterns[i], tfnames)
        n = mr.n_seqlets(metacluster, pattern)
        #mr.plot_pssm(metacluster, pattern, title="{0}, #seqlets: {1}, cisbp_match: {2}".format(patterns[i], n, tfnames))
        trimmed_motif = vdom_pssm(mr.get_pssm(metacluster, pattern, rc=False, trim_frac=0.08), letter_width=0.15, height=0.5)
        #full_motif = vdom_pssm(mr.get_pssm(metacluster, pattern, rc=False, trim_frac=0.0), letter_width=0.15, height=0.5)
        vdom_items[metacluster].append(details(summary(pattern, f": # seqlets: {n}" f" {tfnames}", trimmed_motif),
                                               details(summary("Sequence"), vdom_to_figs[fullpattern_name][0], ),
                                               details(summary("Contrib Scores"), vdom_to_figs[fullpattern_name][1], ),
                                               details(summary("Hyp_Contrib Scores"), vdom_to_figs[fullpattern_name][2], ),
                                               id=fullpattern_name
                                              )
                                      )
        #display(details(summary(pattern, f": # seqlets: {n}", trimmed_motif)))
        #break
    display(details(summary(b(metacluster), f", # patterns: {n_patterns},"
                            f" # seqlets: {n_seqlets}, ",
                               txt, b("{}".format(sample_name))),
                       ul([li(pattern) for pattern in vdom_items[metacluster]], start=0),
                       id=metacluster,
                       open=True))
    return None
In [39]:
#display_results(task_idx=0)
sample_name = 'Muscle;A;CL'
task_idx = 55
mr, modisco_map, cisbp_map = get_data(task_idx)
vdom_to_figs = collect_vdoms_pattern(mr)
In [40]:
display_results(sample_name, mr, vdom_to_figs, modisco_map, cisbp_map, importance=True)
metacluster_0, # patterns: 23, # seqlets: 32162, positive scoresMuscle;A;CL
  • pattern_0: # seqlets: 6320 ['Sp3', 'Sp2', 'Klf16', 'Sp5', 'Klf5', 'Klf14', 'Sp1', 'Klf7', 'Egr4', 'Klf2', 'Wt1', 'Sp8', 'Klf4', 'Klf8', 'Klf12', 'Sp4', 'Klf6', 'Klf13', 'Zfp148']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_1: # seqlets: 3009 ['Ctcf', 'Ctcfl']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_2: # seqlets: 2658 []
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_3: # seqlets: 2472 ['Mef2d', 'Mef2b', 'Mef2a', 'Mef2c']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_4: # seqlets: 2400 ['Pbx3', 'Foxi1', 'Nfya', 'Ybx1']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_5: # seqlets: 2259 ['Gabpa', 'Erf', 'Nr2c2', 'Etv5', 'Elk1', 'Erg', 'Gm5454', 'Elk3', 'Etv4', 'Elf2', 'Etv1', 'Elf1', 'Etv6', 'Elf4', 'Ets2', 'Ets1', 'Ehf', 'Elk4', 'Fli1', 'Etv3', 'Gm4881', 'Etv2', 'Elf5']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_6: # seqlets: 2215 []
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_7: # seqlets: 1808 []
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_8: # seqlets: 1767 ['Jdp2', 'Creb3', 'Atf7', '9430076C15Rik', 'Atf2', 'Creb1', 'Batf3', 'Crem']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_9: # seqlets: 1010 ['Mbtps2', 'Yy1']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_10: # seqlets: 823 ['Dbp', 'Tef', 'Cebpb']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_11: # seqlets: 773 ['Nr3c1', 'Pgr']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_12: # seqlets: 739 ['Bach2', 'Tcfec', 'Tcfeb', 'Mitf', 'Usf2', 'Tcfe3', 'Srebf1', 'Arntl', 'Mlx', 'Usf1', 'Srebf2']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_13: # seqlets: 731 ['Myod1', 'Myog', 'Tcf12', 'Ascl2']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_14: # seqlets: 719 ['Bach1', 'Nfe2l2', 'Nfe2', 'Smarcc1']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_15: # seqlets: 692 ['Smarcc2', 'Zfp143', 'Tbx2', 'Six5', 'Zfp523']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_16: # seqlets: 468 ['Sp3', 'Sp2', 'Maz', 'Klf15', 'Klf5', 'Zfp148']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_17: # seqlets: 337 []
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_18: # seqlets: 279 []
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_19: # seqlets: 219 ['Irf1', 'Prdm1', 'Stat2', 'Irf4', 'Irf2', 'Irf5']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_20: # seqlets: 200 ['Rfx8', 'Rfx1', 'Rfx4', 'Rfx5', 'Rfx3', 'Rfx7', 'Rfx2', 'Arid2']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_21: # seqlets: 173 []
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_22: # seqlets: 91 ['Sp3', 'E2f1']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
In [41]:
display_results(sample_name, mr, vdom_to_figs, modisco_map, cisbp_map, importance=False)
metacluster_1, # patterns: 2, # seqlets: 632, negative scoresMuscle;A;CL
  • pattern_0: # seqlets: 344 ['Zeb1']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores
  • pattern_1: # seqlets: 288 ['Glis2']
    Sequence
    Contrib Scores
    Hyp_Contrib Scores