import pickle
from collections import OrderedDict
import numpy as np
import pandas as pd
import os
import subprocess
import math
import copy
from matlas.genome_data import *
project_root = "/mnt/lab_data/kundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
oak_root = "/oak/stanford/groups/akundaje/msharmin/mouse_hem/from_labcluster/with_tfd"
srv_root = "/srv/scratch/msharmin/mouse_hem/with_tfd"
data_name = "full_mouse50"
modisco_root = "{0}/{1}/Naive_modisco2019_msharmin".format(oak_root, data_name)
homer_root = "{0}/{1}/Naive_scans".format(srv_root, data_name)
model_root = "{0}/{1}/fineFactorized".format(srv_root, data_name)
reportfile = "/mnt/lab_data/kundaje/msharmin/mouse_hem/with_tfd/full_mouse50/filtering samples_MS2.xlsx"
sheetname = "filter23"
import plotly.offline
import plotly.graph_objs as go
import plotly.figure_factory as ff
from scipy.cluster.hierarchy import linkage
import seaborn as sns
from matplotlib import pyplot as plt
from vdom.helpers import (h1, p, li, img, div, b, br, ul, a,
details, summary,
table, thead, th, tr, tbody, td, ol)
from IPython.display import display
from IPython.display import HTML, Javascript
from matlas.pwms import load_motifDB_id_maps
import h5py
import glob
from matlas.utils import fasta_writer
from matlas.utils import send_to_mitra
from basepair.modisco.sliding_similarities import pssm_scan
from matlas.pwms import get_motifDB_id_maps
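# Workflow of this section: load the per-layer filter PWMs exported from a Basset model,
# match them against a reference motif database with TomTom, and summarize the confident
# hits (q-value < 0.1) per cell type as an HTML table of filter logos.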
class BassetMotif:
def __init__(self, bassetdir, layers=[9, 12, 15]):
self.denovo_motifs = OrderedDict()
self.denovo_pwms = OrderedDict()
self.layers = layers
self.bassetdir = bassetdir
self.memefiles = ["{0}/layer_{1}/filter_pwms.meme".format(bassetdir, layer_no) for layer_no in layers]
for memefile, layer_no in zip(self.memefiles, layers):
motif_id_maps = get_motifDB_id_maps(database_name=memefile)
self.denovo_motifs[layer_no] = list(motif_id_maps.keys())
for key, info in motif_id_maps.items():
self.denovo_pwms["layer_{}/{}".format(layer_no, key)] = info['PPM']
def get_pwm(self, motif_name, layer_no=15):
if("/" in motif_name):
return self.denovo_pwms[motif_name]
else:
return self.denovo_pwms["layer_{}/{}".format(layer_no, motif_name)]
    def reverse_pwm(self, pwm):
        # reverse complement of the PWM: reverse the positions and swap complementary
        # bases (columns are ordered A, C, G, T, so reversing them maps A<->T and C<->G)
        return pwm[::-1, ::-1]
    def _add_pwm_to_meme(self, motif_name, pwm, motif_alt_name="", bg=[0.27, 0.23, 0.23, 0.27], meme_db=None):
        if(meme_db is None):
            # default to the MEME file of the last requested layer
            meme_db = self.memefiles[-1]
if(os.path.exists(meme_db)):
with open(meme_db, "r") as fp:
for line in fp:
if(motif_name in line):
print("{} already exists".format(motif_name))
return None
else:
with open(meme_db, "w") as fp:
fp.write("MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\nBackground letter frequencies\n\n")
fp.write("A {0} C {1} G {2} T {3}\n\n".format(bg[0], bg[1], bg[2], bg[3]))
pfm = pwm
with open(meme_db, 'a') as fp:
fp.write("MOTIF {0} {1}\n".format(motif_name, motif_alt_name))
fp.write("letter-probability matrix: alength= 4 w= {} nsites= 20 E= 0e+0\n".format(pfm.shape[0]))
for line in pfm:
fp.write('%.5f %.5f %.5f %.5f\n' % tuple(line))
fp.write("\n")
return None
def fetch_tomtom_matches(self, layer_no= 15, motifs_db='Mus_musculus.meme',
bg=[0.26, 0.24, 0.24, 0.26], tomtom_exec_path='tomtom',
save_report=False, tomtom_dir='./'):
pwms = self.denovo_pwms
all_matched_motifs = OrderedDict()
for alt_name, pwm in pwms.items():
motif = alt_name.split("/")[1]
report_dir = "{0}/{1}".format(tomtom_dir, motif)
print(motif, alt_name)
if(not os.path.exists(report_dir)):
os.makedirs(report_dir)
all_matched_motifs[motif] = self._fetch_tomtom_matches(motif, pwm, alt_motif=None, motifs_db=motifs_db,
bg=bg, tomtom_exec_path=tomtom_exec_path,
save_report=save_report, report_dir=report_dir,
temp_dir=tomtom_dir)
return all_matched_motifs
def _fetch_tomtom_matches(self, motif, pwm, alt_motif=None, motifs_db='Mus_musculus.meme',
bg=[0.26, 0.24, 0.24, 0.26], tomtom_exec_path='tomtom',
save_report=False, report_dir='./', temp_dir='./'):
"""Fetches top matches from a motifs database using TomTom.
Returns:
list: a list of up to n results returned by tomtom, each entry is a
dictionary with keys 'Target ID', 'p-value', 'E-value', 'q-value'
"""
fname = os.path.join(temp_dir, 'query_file')
self._add_pwm_to_meme(motif, pwm, alt_motif, bg=bg, meme_db=fname)
if(save_report):
cmd = '{0} -no-ssc -oc {1} -verbosity 1 -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10.0 {2} {3}'.format(
tomtom_exec_path, report_dir, fname, motifs_db)
print(cmd)
out = subprocess.check_output(cmd, shell=True)
df = pd.read_table("{}/tomtom.tsv".format(report_dir))
df = df[['Target_ID', 'p-value', 'E-value', 'q-value']]
schema = list(df.columns)
            dat = df.values
else:
cmd = "{0} -no-ssc -oc . -verbosity 1 -text -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10.0 {1} {2}".format(
tomtom_exec_path, fname, motifs_db)
print(cmd)
out = subprocess.check_output(cmd, shell=True)
dat = [x.split('\t') for x in out.strip().decode("utf-8").split('\n')]
schema = dat[0]
dat = dat[1:]
tget_idx, pval_idx, eval_idx, qval_idx = schema.index('Target_ID'), schema.index('p-value'), schema.index('E-value'), schema.index('q-value')
r = []
for t in dat:
            if(len(t) < 4):
                # stop at trailing comment/blank lines in TomTom's -text output (rows with fewer than 4 fields)
                break
mtf = {}
mtf['Target ID'] = t[tget_idx]
mtf['p-value'] = float(t[pval_idx])
mtf['E-value'] = float(t[eval_idx])
mtf['q-value'] = float(t[qval_idx])
# if(mtf['q-value']<0.001):
# break
r.append(mtf)
        os.remove(fname)
return r
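# Illustrative usage sketch (the paths below are placeholders, not the ones used in this analysis):
# scan the layer-15 filter PWMs of one task directory against a MEME-format motif database.
# bm = BassetMotif("/path/to/task_dir", layers=[15])
# matches = bm.fetch_tomtom_matches(motifs_db="Mus_musculus.meme",
#                                   save_report=False, tomtom_dir="/tmp/tomtom_scratch")
# first_filter = next(iter(matches))
# matches[first_filter][:3]  # top TomTom hits for that filter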
from matlas.pwms import shorten_tfname
def get_motif_per_celltype(data_root, task_idx, denovo="homer",
database_name=DEFAULT_DATABASE, importance=True):
motifDB_data = load_motifDB_id_maps(database_name=database_name)
task_dir = "{0}/task_{1}-naivegw".format(data_root, task_idx)
tomtom_resultfile = "{0}/{2}_{1}_match.p".format(task_dir, database_name, denovo) #modisco_dir
tomtom_out = pickle.load(open(tomtom_resultfile, 'rb'))
if(denovo=="homer"):
patterns = list(tomtom_out.keys())
hr = HomerData("{}/homer".format(task_dir))
result = hr
elif(denovo=="basset"):
patterns = list(tomtom_out.keys())
bm = BassetMotif("{}".format(task_dir), [15])
result = bm
else:
modisco_hdf = "{0}/results.hdf5".format(task_dir)
mr = ModiscoResult(modisco_hdf)
result = mr
activity = mr.metacluster_activity()
metacluster = ""
if(importance==True):
mcluster_no = np.argwhere((activity['task']==1).values)[0][0]
metacluster = 'metacluster_{0}'.format(mcluster_no)
elif(importance==False):
mcluster_no = np.argwhere((activity['task']==-1).values)[0][0]
metacluster = 'metacluster_{0}'.format(mcluster_no)
patterns = mr.patterns()
patterns = [pattern for pattern in patterns if metacluster in pattern]
#short_pattern_to_tf = OrderedDict()
pattern_to_tf = OrderedDict()
tf_to_pattern = OrderedDict()
for pattern_name in patterns:
        first_match = tomtom_out[pattern_name][0]
        # keep only patterns whose best TomTom hit exists and is confident (q-value < 0.1)
        if(not isinstance(first_match['Target ID'], str) and math.isnan(first_match['Target ID'])):
            continue
        elif(math.isnan(first_match['q-value'])):
            continue
        elif(first_match['q-value'] >= 0.1):
            continue
else:
full_tf_name = motifDB_data['maps'][first_match['Target ID']]['TF Name']
tfname = shorten_tfname(full_tf_name, database_name)
#short_pattern_to_tf[long_to_short(pattern_name, denovo)] = tfname
pattern_to_tf[pattern_name] = tfname
if(tfname not in tf_to_pattern):
tf_to_pattern[tfname] = OrderedDict()
tf_to_pattern[tfname][pattern_name] = (first_match['Target ID'], #motifid
first_match['q-value'], # qvalue
motifDB_data['maps'][first_match['Target ID']]['URL']#url
)
return pattern_to_tf, tf_to_pattern, result
data_root = "/srv/scratch/msharmin/mouse_hem/with_tfd/full_mouse50/Naive_basset_motifs"
motif_matches = get_motif_per_celltype(data_root, '273', denovo="basset")
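# Quick illustrative check of what get_motif_per_celltype returned for this task:
# (pattern_to_tf, tf_to_pattern, BassetMotif); only filters whose best TomTom hit
# has q-value < 0.1 are kept.
pattern_to_tf, tf_to_pattern, basset_result = motif_matches
print("filters with a confident match:", len(pattern_to_tf))
print("distinct TFs matched:", len(tf_to_pattern))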
from basepair.plot.vdom import fig2vdom, vdom_pssm
def visualize_table(motif_matches):
pattern_dict = motif_matches[0]
bm = motif_matches[2]
tabledict = OrderedDict()
for pattern, motif in pattern_dict.items():
tabledict[pattern] = OrderedDict()
tabledict[pattern]['motif'] = motif
tabledict[pattern]['basset'] = vdom_pssm(bm.get_pwm(pattern), letter_width=0.15, height=0.5)
    tab = table(thead(tr(th('Pattern Name'), th('Motif Name'), th('Basset'))),
                tbody([tr(td(b(name)), td(value['motif']), td(value['basset']))
                       for name, value in tabledict.items()]))
return tab, tabledict
tab, tabledict = visualize_table(motif_matches)
# number of effective (active) filters on this layer = len(tabledict)
motif_universe = list()
for key, info in tabledict.items():
motif_universe.append(info['motif'])
motif_universe = list(set(motif_universe))
len(motif_universe)
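# Optional illustrative summary: group the active filters by the database motif they matched,
# to see which motifs were picked up by more than one filter.
filters_per_motif = OrderedDict()
for pattern, info in tabledict.items():
    filters_per_motif.setdefault(info['motif'], []).append(pattern)
sorted(filters_per_motif.items(), key=lambda kv: -len(kv[1]))[:5]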
details(summary(b("Basset-like motifs on layer 15"),
                ": # of filters active out of 300: {}".format(len(tabledict)),
                "; # of distinct motifs found in the database: {}".format(len(motif_universe)),
                br(),
                ),
        tab)