In [1]:
import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
import pickle

from matlas.genome_data import *


project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)

reportfile= "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"
load data from labcluster
In [2]:
import plotly.offline
import plotly.graph_objs as go
import plotly.figure_factory as ff
from scipy.cluster.hierarchy import linkage
import seaborn as sns
from matplotlib import pyplot as plt

from vdom.helpers import (h1, p, li, img, div, b, br, ul, a,
                          details, summary,
                          table, thead, th, tr, tbody, td, ol)
from IPython.display import display
from IPython.display import HTML, Javascript

from matlas.pwms import load_motifDB_id_maps
In [4]:
import numpy as np
import pandas as pd
import h5py
from collections import OrderedDict
import glob
import subprocess

from matlas.utils import fasta_writer
from matlas.genome_data import *
from matlas.utils import send_to_mitra
from basepair.modisco.sliding_similarities import pssm_scan

from matlas.pwms import get_motifDB_id_maps


class BassetMotif:

    def __init__(self, bassetdir, layers=[9, 12, 15]):
        self.denovo_motifs = OrderedDict()
        self.denovo_pwms = OrderedDict()

        self.layers = layers
        
        self.bassetdir = bassetdir
        
        self.memefiles = ["{0}/layer_{1}/filter_pwms.meme".format(bassetdir, layer_no) for layer_no in layers]

        for memefile, layer_no in zip(self.memefiles, layers):
            motif_id_maps = get_motifDB_id_maps(database_name=memefile)
            self.denovo_motifs[layer_no] = list(motif_id_maps.keys())
            for key, info in motif_id_maps.items():
                self.denovo_pwms["layer_{}/{}".format(layer_no, key)] = info['PPM']
    
    def get_pwm(self, motif_name, layer_no=15):
        if("/" in motif_name):
            return self.denovo_pwms[motif_name]
        else:
            return self.denovo_pwms["layer_{}/{}".format(layer_no, motif_name)]

    def reverse_pwm(self, pwm):
        return pwm[::-1, ::-1]
    
    def _add_pwm_to_meme(self, motif_name, pwm, motif_alt_name="", bg=[0.27, 0.23, 0.23, 0.27], meme_db=None):
        if(meme_db is None):
            meme_db = self.memefile
            
        if(os.path.exists(meme_db)):
            with open(meme_db, "r") as fp:
                for line in fp:
                    if(motif_name in line):
                        print("{} already exists".format(motif_name))
                        return None
        else:
            with open(meme_db, "w") as fp:
                fp.write("MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\nBackground letter frequencies\n\n")
                fp.write("A {0} C {1} G {2} T {3}\n\n".format(bg[0], bg[1], bg[2], bg[3]))

        pfm = pwm
        with open(meme_db, 'a') as fp:
            fp.write("MOTIF {0} {1}\n".format(motif_name, motif_alt_name))
            fp.write("letter-probability matrix: alength= 4 w= {} nsites= 20 E= 0e+0\n".format(pfm.shape[0]))
            for line in pfm:
                fp.write('%.5f %.5f %.5f %.5f\n' % tuple(line))
            fp.write("\n")
        return None
    
    def fetch_tomtom_matches(self, layer_no= 15, motifs_db='Mus_musculus.meme', 
                             bg=[0.26, 0.24, 0.24, 0.26], tomtom_exec_path='tomtom',
                             save_report=False, tomtom_dir='./'):
        
        pwms = self.denovo_pwms

        all_matched_motifs = OrderedDict()
        for alt_name, pwm in pwms.items():
            motif = alt_name.split("/")[1]
            report_dir = "{0}/{1}".format(tomtom_dir, motif)
            print(motif, alt_name)
            if(not os.path.exists(report_dir)):
                os.makedirs(report_dir)
            all_matched_motifs[motif] = self._fetch_tomtom_matches(motif, pwm, alt_motif=None, motifs_db=motifs_db, 
                                                                   bg=bg, tomtom_exec_path=tomtom_exec_path,
                                                                  save_report=save_report, report_dir=report_dir,
                                                                  temp_dir=tomtom_dir)
            
        return all_matched_motifs
        
    
    def _fetch_tomtom_matches(self, motif, pwm, alt_motif=None, motifs_db='Mus_musculus.meme', 
                              bg=[0.26, 0.24, 0.24, 0.26], tomtom_exec_path='tomtom',
                              save_report=False, report_dir='./', temp_dir='./'):
        """Fetches top matches from a motifs database using TomTom.
        Returns:
            list: a list of up to n results returned by tomtom, each entry is a
                dictionary with keys 'Target ID', 'p-value', 'E-value', 'q-value'  
        """
        fname = os.path.join(temp_dir, 'query_file')
        self._add_pwm_to_meme(motif, pwm, alt_motif, bg=bg, meme_db=fname)

        if(save_report):
            cmd = '{0} -no-ssc -oc {1} -verbosity 1 -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10.0 {2} {3}'.format(
                tomtom_exec_path, report_dir, fname, motifs_db)
            print(cmd)
            out = subprocess.check_output(cmd, shell=True)
            df = pd.read_table("{}/tomtom.tsv".format(report_dir))
            df = df[['Target_ID', 'p-value', 'E-value', 'q-value']]
            schema = list(df.columns)
            dat = df.get_values()
        else:
            cmd = "{0} -no-ssc -oc . -verbosity 1 -text -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10.0 {1} {2}".format(
                tomtom_exec_path, fname, motifs_db)
            print(cmd)
            out = subprocess.check_output(cmd, shell=True)
            dat = [x.split('\t') for x in out.strip().decode("utf-8").split('\n')]
            schema = dat[0]
            dat = dat[1:]

        tget_idx, pval_idx, eval_idx, qval_idx = schema.index('Target_ID'), schema.index('p-value'), schema.index('E-value'), schema.index('q-value')
        r = []
        for t in dat:
            if(len(t) < 4):
                break
            mtf = {}
            mtf['Target ID'] = t[tget_idx]
            mtf['p-value'] = float(t[pval_idx])
            mtf['E-value'] = float(t[eval_idx])
            mtf['q-value'] = float(t[qval_idx])
            # if(mtf['q-value']<0.001):
            #    break
            r.append(mtf)

        os.system('rm ' + fname)
        return r
Using TensorFlow backend.
In [5]:
from matlas.pwms import shorten_tfname
def get_motif_per_celltype(data_root, task_idx, denovo="homer",
                                      database_name=DEFAULT_DATABASE, importance=True):
    
    motifDB_data = load_motifDB_id_maps(database_name=database_name)
    task_dir = "{0}/task_{1}-naivegw".format(data_root, task_idx)

    tomtom_resultfile = "{0}/{2}_{1}_match.p".format(task_dir, database_name, denovo) #modisco_dir
    tomtom_out = pickle.load(open(tomtom_resultfile, 'rb'))
    if(denovo=="homer"):
        patterns = list(tomtom_out.keys())
        hr = HomerData("{}/homer".format(task_dir))
        result = hr
    elif(denovo=="basset"):
        patterns = list(tomtom_out.keys())
        bm = BassetMotif("{}".format(task_dir), [15])
        result = bm
    else:
        modisco_hdf = "{0}/results.hdf5".format(task_dir)
        mr = ModiscoResult(modisco_hdf)
        result = mr
        activity = mr.metacluster_activity()
        metacluster = ""
        if(importance==True):
            mcluster_no = np.argwhere((activity['task']==1).values)[0][0]
            metacluster = 'metacluster_{0}'.format(mcluster_no)
        elif(importance==False):
            mcluster_no = np.argwhere((activity['task']==-1).values)[0][0]
            metacluster = 'metacluster_{0}'.format(mcluster_no)
        patterns = mr.patterns()
        patterns = [pattern for pattern in patterns if metacluster in pattern]

    #short_pattern_to_tf = OrderedDict()
    pattern_to_tf = OrderedDict()
    tf_to_pattern = OrderedDict()
    for pattern_name in patterns:
        first_match = tomtom_out[pattern_name][0]
        if(not isinstance(first_match['Target ID'], str) and math.isnan(first_match['Target ID'])):
            tfname = "-"
            continue
        elif(math.isnan(first_match['q-value'])):
            tfname = "-"
            continue
        elif(first_match['q-value'] >= 0.1):
            tfname = "-"
            continue
        else:
            full_tf_name = motifDB_data['maps'][first_match['Target ID']]['TF Name']
            tfname = shorten_tfname(full_tf_name, database_name)
        #short_pattern_to_tf[long_to_short(pattern_name, denovo)] = tfname
        pattern_to_tf[pattern_name] = tfname
        if(tfname not in tf_to_pattern):
            tf_to_pattern[tfname] = OrderedDict()
        tf_to_pattern[tfname][pattern_name] = (first_match['Target ID'], #motifid
                                               first_match['q-value'], # qvalue
                                               motifDB_data['maps'][first_match['Target ID']]['URL']#url
                                              )
                
    return pattern_to_tf, tf_to_pattern, result

data_root = "/srv/scratch/msharmin/mouse_hem/with_tfd/full_mouse50/Naive_basset_motifs"
motif_matches = get_motif_per_celltype(data_root, '273', denovo="basset")
In [6]:
from basepair.plot.vdom import fig2vdom, vdom_pssm

def visualize_table(motif_matches):
    
    pattern_dict = motif_matches[0]
    bm = motif_matches[2]
    tabledict = OrderedDict()
    for pattern, motif in pattern_dict.items():
        tabledict[pattern] = OrderedDict()
        
        tabledict[pattern]['motif'] = motif
        tabledict[pattern]['basset'] = vdom_pssm(bm.get_pwm(pattern), letter_width=0.15, height=0.5)
            
    tab = table(thead(th('Pattern Name'), th('Motif Name'), th('Basset')),
              tbody([tr(td(b(name)), td(value['motif']), td(value['basset'])) 
                         for name, value in tabledict.items()]
                   )
                )
    
    return tab, tabledict


tab, tabledict = visualize_table(motif_matches)
2019-08-05 13:58:31,700 [WARNING] git-lfs not installed
In [14]:
#effective filters - len(tabledict)
motif_universe = list()
for key, info in tabledict.items():
    motif_universe.append(info['motif'])
motif_universe = list(set(motif_universe))
len(motif_universe)
Out[14]:
46
In [20]:
details(summary(b("Basset like motifs on layer 15".format()),
                ": # of filters active out of 300 is: {}".format(len(tabledict)),
                "and # of motifs found from database: {} ".format(len(motif_universe)),
                                     br(), 
                                    ), 
                             #br(), 
                             tab)
Out[20]:
Basset like motifs on layer 15: # of filters active out of 300 is: 77and # of motifs found from database: 46

Pattern NameMotif NameBasset
filter5Rxrb
filter16Trp53
filter17Gata6
filter19Atf3
filter22Tal1
filter25Nfia
filter27Spi1
filter28Spi1
filter33Gata6
filter35Rorc
filter40Nfia
filter41Etv4
filter45Foxb1
filter47Cebpg
filter50Foxb1
filter55Ctcf
filter58Ctcf
filter61Gata4
filter65Gata6
filter72Nfia
filter74Myog
filter75Pou2f1
filter76Spi1
filter79Klf3
filter81Ctcf
filter83Gata6
filter84Spic
filter86Sp3
filter93Gata3
filter99Mafk
filter104Klf6
filter112Crem
filter119Sp3
filter132Klf5
filter140Jun
filter149Jun
filter150Pknox1
filter161Gata6
filter162Gata6
filter164Rxrg
filter166Ctcf
filter168Clock
filter170Spi1
filter171Atf7
filter174Ctcf
filter181Gata4
filter190Hoxc12
filter191Gata4
filter202Cbfb
filter206Nfe2l1
filter215Nfia
filter218Nfia
filter221Hnf1b
filter223Myc
filter224Ctcf
filter233Etv4
filter234Tcf7l1
filter241Nobox
filter242Hoxd9
filter249Ctcf
filter254Nr5a1
filter257Spi1
filter259Ctcf
filter262Cebpd
filter264Meis1
filter265Fosl2
filter267Nfia
filter268Spi1
filter271Zeb1
filter277Fosl2
filter281Dbp
filter284Erg
filter288Tead4
filter291Tcf4
filter292Gata3
filter296Gata4
filter298Twist1